/*******************************************************************************
* This file is part of SWIFT.
* Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
* Matthieu Schaller (schaller@strw.leidenuniv.nl)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see .
*
******************************************************************************/
#ifndef SWIFT_KERNEL_HYDRO_H
#define SWIFT_KERNEL_HYDRO_H
/**
* @file kernel_hydro.h
* @brief Kernel functions for SPH (scalar and vector version).
*
* All constants and kernel coefficients are taken from table 1 of
* Dehnen & Aly, MNRAS, 425, pp. 1062-1082 (2012).
*/
/* Config parameters. */
#include
/* Some standard headers. */
#include
/* Local headers. */
#include "dimension.h"
#include "error.h"
#include "inline.h"
#include "minmax.h"
#include "vector.h"
/* ------------------------------------------------------------------------- */
#if defined(CUBIC_SPLINE_KERNEL)
/* Coefficients for the kernel. */
#define kernel_name "Cubic spline (M4)"
#define kernel_degree 3 /*!< Degree of the polynomial */
#define kernel_ivals 2 /*!< Number of branches */
#if defined(HYDRO_DIMENSION_3D)
#define kernel_gamma ((float)(1.825742))
#define kernel_constant ((float)(16. * M_1_PI))
#elif defined(HYDRO_DIMENSION_2D)
#define kernel_gamma ((float)(1.778002))
#define kernel_constant ((float)(80. * M_1_PI / 7.))
#elif defined(HYDRO_DIMENSION_1D)
#define kernel_gamma ((float)(1.732051))
#define kernel_constant ((float)(8. / 3.))
#endif
static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
__attribute__((aligned(16))) = {3.f, -3.f, 0.f, 0.5f, /* 0 < u < 0.5 */
-1.f, 3.f, -3.f, 1.f, /* 0.5 < u < 1 */
0.f, 0.f, 0.f, 0.f}; /* 1 < u */
/* ------------------------------------------------------------------------- */
#elif defined(QUARTIC_SPLINE_KERNEL)
/* Coefficients for the kernel. */
#define kernel_name "Quartic spline (M5)"
#define kernel_degree 4 /* Degree of the polynomial */
#define kernel_ivals 5 /* Number of branches */
#if defined(HYDRO_DIMENSION_3D)
#define kernel_gamma ((float)(2.018932))
#define kernel_constant ((float)(15625. * M_1_PI / 512.))
#elif defined(HYDRO_DIMENSION_2D)
#define kernel_gamma ((float)(1.977173))
#define kernel_constant ((float)(46875. * M_1_PI / 2398.))
#elif defined(HYDRO_DIMENSION_1D)
#define kernel_gamma ((float)(1.936492))
#define kernel_constant ((float)(3125. / 768.))
#endif
static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
__attribute__((aligned(16))) = {
6.f, 0.f, -2.4f, 0.f, 0.368f, /* 0 < u < 0.2 */
-4.f, 8.f, -4.8f, 0.32f, 0.352f, /* 0.2 < u < 0.4 */
-4.f, 8.f, -4.8f, 0.32f, 0.352f, /* 0.4 < u < 0.6 */
1.f, -4.f, 6.f, -4.f, 1.f, /* 0.6 < u < 0.8 */
1.f, -4.f, 6.f, -4.f, 1.f, /* 0.8 < u < 1 */
0.f, 0.f, 0.f, 0.f, 0.f}; /* 1 < u */
/* ------------------------------------------------------------------------- */
#elif defined(QUINTIC_SPLINE_KERNEL)
/* Coefficients for the kernel. */
#define kernel_name "Quintic spline (M6)"
#define kernel_degree 5 /* Degree of the polynomial */
#define kernel_ivals 3 /* Number of branches */
#if defined(HYDRO_DIMENSION_3D)
#define kernel_gamma ((float)(2.195775))
#define kernel_constant ((float)(2187. * M_1_PI / 40.))
#elif defined(HYDRO_DIMENSION_2D)
#define kernel_gamma ((float)(2.158131))
#define kernel_constant ((float)(15309. * M_1_PI / 478.))
#elif defined(HYDRO_DIMENSION_1D)
#define kernel_gamma ((float)(2.121321))
#define kernel_constant ((float)(243. / 40.))
#endif
static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
__attribute__((aligned(16))) = {
-10.f, 10.f, 0.f,
-2.2222222f, 0.f, 0.271604938f, /* 0 < u < 1/3 */
5.f, -15.f, 16.666667f,
-7.77777777f, 0.925925f, 0.209876543f, /* 1/3 < u < 2/3 */
-1.f, 5.f, -10.f,
10.f, -5.f, 1.f, /* 2/3 < u < 1. */
0.f, 0.f, 0.f,
0.f, 0.f, 0.f}; /* 1 < u */
/* ------------------------------------------------------------------------- */
#elif defined(WENDLAND_C2_KERNEL)
/* Coefficients for the kernel. */
#define kernel_name "Wendland C2"
#define kernel_degree 5 /* Degree of the polynomial */
#define kernel_ivals 1 /* Number of branches */
#if defined(HYDRO_DIMENSION_1D)
/* Wendland C* have different form in 1D than 2D/3D */
#define kernel_gamma ((float)(1.620185))
#define kernel_constant ((float)(5. / 4.))
static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
__attribute__((aligned(16))) = {
0.f, -3.f, 8.f, -6.f, 0.f, 1.f, /* 0 < u < 1 */
0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
#else
#if defined(HYDRO_DIMENSION_3D)
#define kernel_gamma ((float)(1.936492))
#define kernel_constant ((float)(21. * M_1_PI / 2.))
#elif defined(HYDRO_DIMENSION_2D)
#define kernel_gamma ((float)(1.897367))
#define kernel_constant ((float)(7. * M_1_PI))
#endif
static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
__attribute__((aligned(16))) = {
4.f, -15.f, 20.f, -10.f, 0.f, 1.f, /* 0 < u < 1 */
0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; /* 1 < u */
#endif
/* ------------------------------------------------------------------------- */
#elif defined(WENDLAND_C4_KERNEL)
/* Coefficients for the kernel. */
#define kernel_name "Wendland C4"
#define kernel_degree 8 /* Degree of the polynomial */
#define kernel_ivals 1 /* Number of branches */
#if defined(HYDRO_DIMENSION_3D)
#define kernel_gamma ((float)(2.207940))
#define kernel_constant ((float)(495. * M_1_PI / 32.))
#elif defined(HYDRO_DIMENSION_2D)
#define kernel_gamma ((float)(2.171239))
#define kernel_constant ((float)(9. * M_1_PI))
#elif defined(HYDRO_DIMENSION_1D)
#error "Wendland C4 kernel not defined in 1D."
#endif
static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
__attribute__((aligned(16))) = {
11.666667f, -64.f, 140.f, -149.333333f, 70.f,
0.f, -9.3333333f, 0.f, 1.f, /* 0 < u < 1 */
0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f}; /* 1 < u */
/* ------------------------------------------------------------------------- */
#elif defined(WENDLAND_C6_KERNEL)
/* Coefficients for the kernel. */
#define kernel_name "Wendland C6"
#define kernel_degree 11 /* Degree of the polynomial */
#define kernel_ivals 1 /* Number of branches */
#if defined(HYDRO_DIMENSION_3D)
#define kernel_gamma ((float)(2.449490))
#define kernel_constant ((float)(1365. * M_1_PI / 64.))
#elif defined(HYDRO_DIMENSION_2D)
#define kernel_gamma ((float)(2.415230))
#define kernel_constant ((float)(78. * M_1_PI / 7.))
#elif defined(HYDRO_DIMENSION_1D)
#error "Wendland C6 kernel not defined in 1D."
#endif
static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
__attribute__((aligned(16))) = {
32.f, -231.f, 704.f, -1155.f, 1056.f, -462.f,
0.f, 66.f, 0.f, -11.f, 0.f, 1.f, /* 0 < u < 1 */
0.f, 0.f, 0.f, 0.f, 0.f, 0.f,
0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; /* 1 < u */
/* ------------------------------------------------------------------------- */
#else
#error "A kernel function must be chosen at configure time !!"
/* ------------------------------------------------------------------------- */
#endif
/* Ok, now comes the real deal. */
/* First some powers of gamma = H/h */
#define kernel_gamma_inv ((float)(1. / kernel_gamma))
#define kernel_gamma2 ((float)(kernel_gamma * kernel_gamma))
/* define gamma^d, gamma^(d+1), 1/gamma^d and 1/gamma^(d+1) */
#if defined(HYDRO_DIMENSION_3D)
#define kernel_gamma_dim ((float)(kernel_gamma * kernel_gamma * kernel_gamma))
#define kernel_gamma_dim_plus_one \
((float)(kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma))
#define kernel_gamma_inv_dim \
((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma)))
#define kernel_gamma_inv_dim_plus_one \
((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma)))
#elif defined(HYDRO_DIMENSION_2D)
#define kernel_gamma_dim ((float)(kernel_gamma * kernel_gamma))
#define kernel_gamma_dim_plus_one \
((float)(kernel_gamma * kernel_gamma * kernel_gamma))
#define kernel_gamma_inv_dim ((float)(1. / (kernel_gamma * kernel_gamma)))
#define kernel_gamma_inv_dim_plus_one \
((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma)))
#elif defined(HYDRO_DIMENSION_1D)
#define kernel_gamma_dim ((float)(kernel_gamma))
#define kernel_gamma_dim_plus_one ((float)(kernel_gamma * kernel_gamma))
#define kernel_gamma_inv_dim ((float)(1. / (kernel_gamma)))
#define kernel_gamma_inv_dim_plus_one \
((float)(1. / (kernel_gamma * kernel_gamma)))
#endif
/* The number of branches (floating point conversion) */
#define kernel_ivals_f ((float)(kernel_ivals))
/* Kernel self contribution (i.e. W(0,h)) */
#define kernel_root \
((float)(kernel_coeffs[kernel_degree]) * kernel_constant * \
kernel_gamma_inv_dim)
/* Kernel normalisation constant (volume term) */
#define kernel_norm ((float)(hydro_dimension_unit_sphere * kernel_gamma_dim))
/* ------------------------------------------------------------------------- */
/**
* @brief Computes the kernel function and its derivative.
*
* The kernel function needs to be mutliplied by \f$h^{-d}\f$ and the gradient
* by \f$h^{-(d+1)}\f$, where \f$d\f$ is the dimensionality of the problem.
*
* Returns 0 if \f$u > \gamma = H/h\f$.
*
* @param u The ratio of the distance to the smoothing length \f$u = x/h\f$.
* @param W (return) The value of the kernel function \f$W(x,h)\f$.
* @param dW_dx (return) The norm of the gradient of \f$|\nabla W(x,h)|\f$.
*/
__attribute__((always_inline)) INLINE static void kernel_deval(
float u, float *restrict W, float *restrict dW_dx) {
/* Go to the range [0,1[ from [0,H[ */
const float x = u * kernel_gamma_inv;
/* Pick the correct branch of the kernel */
const int temp = (int)(x * kernel_ivals_f);
const int ind = temp > kernel_ivals ? kernel_ivals : temp;
const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
/* First two terms of the polynomial ... */
float w = coeffs[0] * x + coeffs[1];
float dw_dx = coeffs[0];
/* ... and the rest of them */
for (int k = 2; k <= kernel_degree; k++) {
dw_dx = dw_dx * x + w;
w = x * w + coeffs[k];
}
w = max(w, 0.f);
dw_dx = min(dw_dx, 0.f);
/* Return everything */
*W = w * kernel_constant * kernel_gamma_inv_dim;
*dW_dx = dw_dx * kernel_constant * kernel_gamma_inv_dim_plus_one;
}
/**
* @brief Computes the kernel function.
*
* The kernel function needs to be mutliplied by \f$h^{-d}\f$,
* where \f$d\f$ is the dimensionality of the problem.
*
* Returns 0 if \f$u > \gamma = H/h\f$
*
* @param u The ratio of the distance to the smoothing length \f$u = x/h\f$.
* @param W (return) The value of the kernel function \f$W(x,h)\f$.
*/
__attribute__((always_inline)) INLINE static void kernel_eval(
float u, float *restrict W) {
/* Go to the range [0,1[ from [0,H[ */
const float x = u * kernel_gamma_inv;
/* Pick the correct branch of the kernel */
const int temp = (int)(x * kernel_ivals_f);
const int ind = temp > kernel_ivals ? kernel_ivals : temp;
const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
/* First two terms of the polynomial ... */
float w = coeffs[0] * x + coeffs[1];
/* ... and the rest of them */
for (int k = 2; k <= kernel_degree; k++) w = x * w + coeffs[k];
w = max(w, 0.f);
/* Return everything */
*W = w * kernel_constant * kernel_gamma_inv_dim;
}
/**
* @brief Computes the kernel function in double precision.
*
* Required for computing the projected kernel because rounding
* error causes problems for the GSL integration function if
* we evaluate in single precision.
*
* The kernel function needs to be mutliplied by \f$h^{-d}\f$,
* where \f$d\f$ is the dimensionality of the problem.
*
* Returns 0 if \f$u > \gamma = H/h\f$
*
* @param u The ratio of the distance to the smoothing length \f$u = x/h\f$.
* @param W (return) The value of the kernel function \f$W(x,h)\f$.
*/
__attribute__((always_inline)) INLINE static void kernel_eval_double(
double u, double *restrict W) {
/* Go to the range [0,1[ from [0,H[ */
const double x = u * kernel_gamma_inv;
/* Pick the correct branch of the kernel */
const int temp = (int)(x * kernel_ivals_f);
const int ind = temp > kernel_ivals ? kernel_ivals : temp;
const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
/* First two terms of the polynomial ... */
double w = ((double)coeffs[0]) * x + ((double)coeffs[1]);
/* ... and the rest of them */
for (int k = 2; k <= kernel_degree; k++) w = x * w + ((double)coeffs[k]);
w = max(w, 0.);
/* Return everything */
*W = w * ((double)kernel_constant) * ((double)kernel_gamma_inv_dim);
}
/**
* @brief Computes the kernel function derivative.
*
* The kernel function needs to be mutliplied by \f$h^{-d}\f$ and the gradient
* by \f$h^{-(d+1)}\f$, where \f$d\f$ is the dimensionality of the problem.
*
* Returns 0 if \f$u > \gamma = H/h\f$.
*
* @param u The ratio of the distance to the smoothing length \f$u = x/h\f$.
* @param dW_dx (return) The norm of the gradient of \f$|\nabla W(x,h)|\f$.
*/
__attribute__((always_inline)) INLINE static void kernel_eval_dWdx(
float u, float *restrict dW_dx) {
/* Go to the range [0,1[ from [0,H[ */
const float x = u * kernel_gamma_inv;
/* Pick the correct branch of the kernel */
const int temp = (int)(x * kernel_ivals_f);
const int ind = temp > kernel_ivals ? kernel_ivals : temp;
const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
/* First two terms of the polynomial ... */
float dw_dx = ((float)kernel_degree * coeffs[0] * x) +
(float)(kernel_degree - 1) * coeffs[1];
/* ... and the rest of them */
for (int k = 2; k < kernel_degree; k++)
dw_dx = dw_dx * x + (float)(kernel_degree - k) * coeffs[k];
dw_dx = min(dw_dx, 0.f);
/* Return everything */
*dW_dx = dw_dx * kernel_constant * kernel_gamma_inv_dim_plus_one;
}
#ifdef WENDLAND_C2_KERNEL
/**
* Computes dphi/dh for the chosen kernel.
*
* This corresponds to Appendix A of Price & Monaghan 2007, MNRAS, 374, 4
* but for a Wendland-C2 kernel.
*
* Assumes r < H (i.e. u < kernel_gamma)
*
* @param u The ratio r / h.
* @param h_inv 1 / h.
*/
__attribute__((always_inline, const)) INLINE static float potential_dh(
const float u, const float h_inv) {
/* Ratio of r to kernel support
* Recall that in our definition the kernel edge is H = kernel_gamma * h. */
const float q = fminf(u * kernel_gamma_inv, 1.f);
/* -24 q^7 + 105 q^6 -168 q^5 + 105 q^4 - 21 q^2 */
float dphi_dh = -24.f * q + 105.f;
dphi_dh = dphi_dh * q - 168.f;
dphi_dh = dphi_dh * q + 105.f;
dphi_dh = dphi_dh * q;
dphi_dh = dphi_dh * q - 21.f;
dphi_dh = dphi_dh * q;
dphi_dh = dphi_dh * q;
return dphi_dh * h_inv * h_inv * kernel_gamma_inv * 0.25f;
}
#endif
/* -------------------------------------------------------------------------
*/
#ifdef WITH_OLD_VECTORIZATION
/**
* @brief Computes the kernel function and its derivative (Vectorised version).
*
* Return 0 if $u > \\gamma = H/h$
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
* @param w (return) The value of the kernel function $W(x,h)$.
* @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
*/
__attribute__((always_inline)) INLINE static void kernel_deval_vec(
vector *u, vector *w, vector *dw_dx) {
/* Go to the range [0,1[ from [0,H[ */
vector x;
x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
/* Load x and get the interval id. */
vector ind;
ind.m =
vec_ftoi(vec_fmin(vec_mul(x.v, kernel_ivals_vec.v), kernel_ivals_vec.v));
/* load the coefficients. */
vector c[kernel_degree + 1];
for (int k = 0; k < VEC_SIZE; k++)
for (int j = 0; j < kernel_degree + 1; j++)
c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j];
/* Init the iteration for Horner's scheme. */
w->v = vec_fma(c[0].v, x.v, c[1].v);
dw_dx->v = c[0].v;
/* And we're off! */
for (int k = 2; k <= kernel_degree; k++) {
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
w->v = vec_fma(x.v, w->v, c[k].v);
}
/* Return everything */
w->v =
vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
kernel_gamma_inv_dim_plus_one_vec.v));
}
#endif
#ifdef WITH_VECTORIZATION
static const vector kernel_gamma_inv_vec = FILL_VEC((float)kernel_gamma_inv);
static const vector kernel_ivals_vec = FILL_VEC((float)kernel_ivals);
static const vector kernel_constant_vec = FILL_VEC((float)kernel_constant);
static const vector kernel_gamma_inv_dim_vec =
FILL_VEC((float)kernel_gamma_inv_dim);
static const vector kernel_gamma_inv_dim_plus_one_vec =
FILL_VEC((float)kernel_gamma_inv_dim_plus_one);
/* Define constant vectors for the Wendland C2 and Cubic Spline kernel
* coefficients. */
#ifdef WENDLAND_C2_KERNEL
static const vector wendland_const_c0 = FILL_VEC(4.f);
static const vector wendland_const_c1 = FILL_VEC(-15.f);
static const vector wendland_const_c2 = FILL_VEC(20.f);
static const vector wendland_const_c3 = FILL_VEC(-10.f);
static const vector wendland_const_c4 = FILL_VEC(0.f);
static const vector wendland_const_c5 = FILL_VEC(1.f);
static const vector wendland_dwdx_const_c0 = FILL_VEC(20.f);
static const vector wendland_dwdx_const_c1 = FILL_VEC(-60.f);
static const vector wendland_dwdx_const_c2 = FILL_VEC(60.f);
static const vector wendland_dwdx_const_c3 = FILL_VEC(-20.f);
#elif defined(CUBIC_SPLINE_KERNEL)
/* First region 0 < u < 0.5 */
static const vector cubic_1_const_c0 = FILL_VEC(3.f);
static const vector cubic_1_const_c1 = FILL_VEC(-3.f);
static const vector cubic_1_const_c2 = FILL_VEC(0.f);
static const vector cubic_1_const_c3 = FILL_VEC(0.5f);
static const vector cubic_1_dwdx_const_c0 = FILL_VEC(9.f);
static const vector cubic_1_dwdx_const_c1 = FILL_VEC(-6.f);
static const vector cubic_1_dwdx_const_c2 = FILL_VEC(0.f);
/* Second region 0.5 <= u < 1 */
static const vector cubic_2_const_c0 = FILL_VEC(-1.f);
static const vector cubic_2_const_c1 = FILL_VEC(3.f);
static const vector cubic_2_const_c2 = FILL_VEC(-3.f);
static const vector cubic_2_const_c3 = FILL_VEC(1.f);
static const vector cubic_2_dwdx_const_c0 = FILL_VEC(-3.f);
static const vector cubic_2_dwdx_const_c1 = FILL_VEC(6.f);
static const vector cubic_2_dwdx_const_c2 = FILL_VEC(-3.f);
static const vector cond = FILL_VEC(0.5f);
#endif
/**
* @brief Computes the kernel function and its derivative for two particles
* using vectors. The return value is undefined if $u > \\gamma = H/h$.
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
* @param w (return) The value of the kernel function $W(x,h)$.
* @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
*/
__attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
vector *u, vector *w, vector *dw_dx) {
/* Go to the range [0,1[ from [0,H[ */
vector x;
x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
#ifdef WENDLAND_C2_KERNEL
/* Init the iteration for Horner's scheme. */
w->v = vec_fma(wendland_const_c0.v, x.v, wendland_const_c1.v);
dw_dx->v = wendland_const_c0.v;
/* Calculate the polynomial interleaving vector operations */
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
w->v = vec_fma(x.v, w->v, wendland_const_c2.v);
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
w->v = vec_fma(x.v, w->v, wendland_const_c3.v);
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
w->v = vec_mul(x.v, w->v); /* wendland_const_c4 is zero. */
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
#elif defined(CUBIC_SPLINE_KERNEL)
vector w2, dw_dx2;
mask_t mask_reg;
/* Form a mask for one part of the kernel. */
/* Only need the mask for one region as the vec_blend defaults to the vector
* when the mask is 0.*/
vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together
* using a mask. */
/* Init the iteration for Horner's scheme. */
w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v);
w2.v = vec_fma(cubic_2_const_c0.v, x.v, cubic_2_const_c1.v);
dw_dx->v = cubic_1_const_c0.v;
dw_dx2.v = cubic_2_const_c0.v;
/* Calculate the polynomial interleaving vector operations. */
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
dw_dx2.v = vec_fma(dw_dx2.v, x.v, w2.v);
w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero. */
w2.v = vec_fma(x.v, w2.v, cubic_2_const_c2.v);
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
dw_dx2.v = vec_fma(dw_dx2.v, x.v, w2.v);
w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v);
w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
/* Blend both kernel regions into one vector (mask out unneeded values). */
/* Only need the mask for one region as the vec_blend defaults to the vector
* when the mask is 0.*/
w->v = vec_blend(mask_reg, w->v, w2.v);
dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v);
#else
#error \
"Vectorisation not supported for this kernel!!! Choose a different one or configure with --disable-hand-vec."
#endif
/* Return everyting */
w->v =
vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
kernel_gamma_inv_dim_plus_one_vec.v));
}
/**
* @brief Computes the kernel function and its derivative for two particles
* using interleaved vectors. The return value is undefined if $u > \\gamma =
* H/h$.
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
* @param w (return) The value of the kernel function $W(x,h)$.
* @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
* @param u2 The ratio of the distance to the smoothing length $u = x/h$ for
* second particle.
* @param w2 (return) The value of the kernel function $W(x,h)$ for second
* particle.
* @param dw_dx2 (return) The norm of the gradient of $|\\nabla W(x,h)|$ for
* second particle.
*/
__attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
vector *u, vector *w, vector *dw_dx, vector *u2, vector *w2,
vector *dw_dx2) {
/* Go to the range [0,1[ from [0,H[ */
vector x, x2;
x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
x2.v = vec_mul(u2->v, kernel_gamma_inv_vec.v);
#ifdef WENDLAND_C2_KERNEL
/* Init the iteration for Horner's scheme. */
w->v = vec_fma(wendland_const_c0.v, x.v, wendland_const_c1.v);
w2->v = vec_fma(wendland_const_c0.v, x2.v, wendland_const_c1.v);
dw_dx->v = wendland_const_c0.v;
dw_dx2->v = wendland_const_c0.v;
/* Calculate the polynomial interleaving vector operations */
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
w->v = vec_fma(x.v, w->v, wendland_const_c2.v);
w2->v = vec_fma(x2.v, w2->v, wendland_const_c2.v);
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
w->v = vec_fma(x.v, w->v, wendland_const_c3.v);
w2->v = vec_fma(x2.v, w2->v, wendland_const_c3.v);
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
w->v = vec_mul(x.v, w->v); /* wendland_const_c4 is zero. */
w2->v = vec_mul(x2.v, w2->v); /* wendland_const_c4 is zero. */
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
w2->v = vec_fma(x2.v, w2->v, wendland_const_c5.v);
/* Return everything */
w->v =
vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
w2->v = vec_mul(w2->v,
vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
kernel_gamma_inv_dim_plus_one_vec.v));
dw_dx2->v = vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v,
kernel_gamma_inv_dim_plus_one_vec.v));
#elif defined(CUBIC_SPLINE_KERNEL)
vector w_2, dw_dx_2;
vector w2_2, dw_dx2_2;
mask_t mask_reg, mask_reg_v2;
/* Form a mask for one part of the kernel for each vector. */
/* Only need the mask for one region as the vec_blend defaults to the vector
* when the mask is 0.*/
vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
vec_create_mask(mask_reg_v2, vec_cmp_gte(x2.v, cond.v)); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together
* using masks. */
/* Init the iteration for Horner's scheme. */
w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v);
w2->v = vec_fma(cubic_1_const_c0.v, x2.v, cubic_1_const_c1.v);
w_2.v = vec_fma(cubic_2_const_c0.v, x.v, cubic_2_const_c1.v);
w2_2.v = vec_fma(cubic_2_const_c0.v, x2.v, cubic_2_const_c1.v);
dw_dx->v = cubic_1_const_c0.v;
dw_dx2->v = cubic_1_const_c0.v;
dw_dx_2.v = cubic_2_const_c0.v;
dw_dx2_2.v = cubic_2_const_c0.v;
/* Calculate the polynomial interleaving vector operations. */
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
dw_dx_2.v = vec_fma(dw_dx_2.v, x.v, w_2.v);
dw_dx2_2.v = vec_fma(dw_dx2_2.v, x2.v, w2_2.v);
w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero. */
w2->v = vec_mul(x2.v, w2->v); /* cubic_1_const_c2 is zero. */
w_2.v = vec_fma(x.v, w_2.v, cubic_2_const_c2.v);
w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c2.v);
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
dw_dx_2.v = vec_fma(dw_dx_2.v, x.v, w_2.v);
dw_dx2_2.v = vec_fma(dw_dx2_2.v, x2.v, w2_2.v);
w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v);
w2->v = vec_fma(x2.v, w2->v, cubic_1_const_c3.v);
w_2.v = vec_fma(x.v, w_2.v, cubic_2_const_c3.v);
w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c3.v);
/* Blend both kernel regions into one vector (mask out unneeded values). */
/* Only need the mask for one region as the vec_blend defaults to the vector
* when the mask is 0.*/
w->v = vec_blend(mask_reg, w->v, w_2.v);
w2->v = vec_blend(mask_reg_v2, w2->v, w2_2.v);
dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx_2.v);
dw_dx2->v = vec_blend(mask_reg_v2, dw_dx2->v, dw_dx2_2.v);
/* Return everything */
w->v =
vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
w2->v = vec_mul(w2->v,
vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
kernel_gamma_inv_dim_plus_one_vec.v));
dw_dx2->v = vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v,
kernel_gamma_inv_dim_plus_one_vec.v));
#endif
}
/**
* @brief Computes the kernel function for two particles
* using vectors. The return value is undefined if $u > \\gamma = H/h$.
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
* @param w (return) The value of the kernel function $W(x,h)$.
*/
__attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
vector *w) {
/* Go to the range [0,1[ from [0,H[ */
vector x;
x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
#ifdef WENDLAND_C2_KERNEL
/* Init the iteration for Horner's scheme. */
w->v = vec_fma(wendland_const_c0.v, x.v, wendland_const_c1.v);
/* Calculate the polynomial interleaving vector operations */
w->v = vec_fma(x.v, w->v, wendland_const_c2.v);
w->v = vec_fma(x.v, w->v, wendland_const_c3.v);
w->v = vec_mul(x.v, w->v); /* wendland_const_c4 is zero.*/
w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
#elif defined(CUBIC_SPLINE_KERNEL)
vector w2;
mask_t mask_reg;
/* Form a mask for each part of the kernel. */
/* Only need the mask for one region as the vec_blend defaults to the vector
* when the mask is 0.*/
vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together
* using masks. */
/* Init the iteration for Horner's scheme. */
w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v);
w2.v = vec_fma(cubic_2_const_c0.v, x.v, cubic_2_const_c1.v);
/* Calculate the polynomial interleaving vector operations. */
w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero */
w2.v = vec_fma(x.v, w2.v, cubic_2_const_c2.v);
w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v);
w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
/* Mask out unneeded values. */
/* Only need the mask for one region as the vec_blend defaults to the vector
* when the mask is 0.*/
w->v = vec_blend(mask_reg, w->v, w2.v);
#else
#error \
"Vectorisation not supported for this kernel!!! Choose a different one or configure with --disable-hand-vec."
#endif
/* Return everything */
w->v =
vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
}
/**
* @brief Computes the kernel function derivative for two particles
* using vectors. The return value is undefined if $u > \\gamma = H/h$.
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
* @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
*/
__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec(
vector *u, vector *dw_dx) {
/* Go to the range [0,1[ from [0,H[ */
vector x;
x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
#ifdef WENDLAND_C2_KERNEL
/* Init the iteration for Horner's scheme. */
dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v);
/* Calculate the polynomial interleaving vector operations */
dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v);
dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c3.v);
dw_dx->v = vec_mul(dw_dx->v, x.v);
#elif defined(CUBIC_SPLINE_KERNEL)
vector dw_dx2;
mask_t mask_reg1, mask_reg2;
/* Form a mask for each part of the kernel. */
vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v)); /* 0 < x < 0.5 */
vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together
* using masks. */
/* Init the iteration for Horner's scheme. */
dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v, x.v, cubic_1_dwdx_const_c1.v);
dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v, x.v, cubic_2_dwdx_const_c1.v);
/* Calculate the polynomial interleaving vector operations. */
dw_dx->v = vec_mul(dw_dx->v, x.v); /* cubic_1_dwdx_const_c2 is zero. */
dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);
/* Mask out unneeded values. */
dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1);
dw_dx2.v = vec_and_mask(dw_dx2.v, mask_reg2);
/* Added both dwdx and dwdx2 together to form complete result. */
dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
#else
#error \
"Vectorisation not supported for this kernel!!! Choose a different one or configure with --disable-hand-vec."
#endif
/* Return everything */
dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
kernel_gamma_inv_dim_plus_one_vec.v));
}
/**
* @brief Computes the kernel function derivative for two particles
* using vectors.
*
* Return 0 if $u > \\gamma = H/h$
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
* @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
*/
__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_vec(
vector *u, vector *dw_dx) {
/* Go to the range [0,1[ from [0,H[ */
vector x;
x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
#ifdef WENDLAND_C2_KERNEL
/* Init the iteration for Horner's scheme. */
dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v);
/* Calculate the polynomial interleaving vector operations */
dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v);
dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c3.v);
dw_dx->v = vec_mul(dw_dx->v, x.v);
#elif defined(CUBIC_SPLINE_KERNEL)
vector dw_dx2;
mask_t mask_reg;
/* Form a mask for each part of the kernel. */
/* Only need the mask for one region as the vec_blend defaults to the vector
* when the mask is 0.*/
vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together
* using masks. */
/* Init the iteration for Horner's scheme. */
dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v, x.v, cubic_1_dwdx_const_c1.v);
dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v, x.v, cubic_2_dwdx_const_c1.v);
/* Calculate the polynomial interleaving vector operations. */
dw_dx->v = vec_mul(dw_dx->v, x.v); /* cubic_1_dwdx_const_c2 is zero. */
dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);
/* Mask out unneeded values. */
/* Only need the mask for one region as the vec_blend defaults to the vector
* when the mask is 0.*/
dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v);
#else
#error \
"Vectorisation not supported for this kernel!!! Choose a different one or configure with --disable-hand-vec."
#endif
/* Mask out result for particles that lie outside of the kernel function. */
mask_t mask;
vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f))); /* x < 1 */
dw_dx->v = vec_and_mask(dw_dx->v, mask);
/* Return everything */
dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
kernel_gamma_inv_dim_plus_one_vec.v));
}
/**
* @brief Computes the kernel function derivative for two particles
* using interleaved vectors.
*
* Return 0 if $u > \\gamma = H/h$
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
* @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
* @param u_2 The ratio of the distance to the smoothing length $u = x/h$ for
* second particle.
* @param dw_dx_2 (return) The norm of the gradient of $|\\nabla W(x,h)|$ for
* second particle.
*/
__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec(
vector *u, vector *dw_dx, vector *u_2, vector *dw_dx_2) {
/* Go to the range [0,1[ from [0,H[ */
vector x, x_2;
x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
x_2.v = vec_mul(u_2->v, kernel_gamma_inv_vec.v);
#ifdef WENDLAND_C2_KERNEL
/* Init the iteration for Horner's scheme. */
dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v);
dw_dx_2->v =
vec_fma(wendland_dwdx_const_c0.v, x_2.v, wendland_dwdx_const_c1.v);
/* Calculate the polynomial interleaving vector operations */
dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v);
dw_dx_2->v = vec_fma(dw_dx_2->v, x_2.v, wendland_dwdx_const_c2.v);
dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c3.v);
dw_dx_2->v = vec_fma(dw_dx_2->v, x_2.v, wendland_dwdx_const_c3.v);
dw_dx->v = vec_mul(dw_dx->v, x.v);
dw_dx_2->v = vec_mul(dw_dx_2->v, x_2.v);
#elif defined(CUBIC_SPLINE_KERNEL)
vector dw_dx2, dw_dx2_2;
mask_t mask_reg;
mask_t mask_reg_v2;
/* Form a mask for one part of the kernel. */
/* Only need the mask for one region as the vec_blend defaults to the vector
* when the mask is 0.*/
vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
vec_create_mask(mask_reg_v2, vec_cmp_gte(x_2.v, cond.v)); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together
* using masks. */
/* Init the iteration for Horner's scheme. */
dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v, x.v, cubic_1_dwdx_const_c1.v);
dw_dx_2->v = vec_fma(cubic_1_dwdx_const_c0.v, x_2.v, cubic_1_dwdx_const_c1.v);
dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v, x.v, cubic_2_dwdx_const_c1.v);
dw_dx2_2.v = vec_fma(cubic_2_dwdx_const_c0.v, x_2.v, cubic_2_dwdx_const_c1.v);
/* Calculate the polynomial interleaving vector operations. */
dw_dx->v = vec_mul(dw_dx->v, x.v); /* cubic_1_dwdx_const_c2 is zero. */
dw_dx_2->v = vec_mul(dw_dx_2->v, x_2.v); /* cubic_1_dwdx_const_c2 is zero. */
dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);
dw_dx2_2.v = vec_fma(dw_dx2_2.v, x_2.v, cubic_2_dwdx_const_c2.v);
/* Mask out unneeded values. */
/* Only need the mask for one region as the vec_blend defaults to the vector
* when the mask is 0.*/
dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v);
dw_dx_2->v = vec_blend(mask_reg_v2, dw_dx_2->v, dw_dx2_2.v);
#else
#error \
"Vectorisation not supported for this kernel!!! Choose a different one or configure with --disable-hand-vec."
#endif
/* Mask out result for particles that lie outside of the kernel function. */
mask_t mask, mask_2;
vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f))); /* x < 1 */
vec_create_mask(mask_2, vec_cmp_lt(x_2.v, vec_set1(1.f))); /* x < 1 */
dw_dx->v = vec_and_mask(dw_dx->v, mask);
dw_dx_2->v = vec_and_mask(dw_dx_2->v, mask_2);
/* Return everything */
dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
kernel_gamma_inv_dim_plus_one_vec.v));
dw_dx_2->v = vec_mul(
dw_dx_2->v,
vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v));
}
#endif /* WITH_VECTORIZATION */
/* Some cross-check functions */
void hydro_kernel_dump(int N);
#endif // SWIFT_KERNEL_HYDRO_H