Commit 0d8a12f6 authored by James Willis's avatar James Willis
Browse files

Created functions to evaluate only W and dWdx separately along with vectorised...

Created functions to evaluate only W and dWdx separately along with vectorised versions. Also removed unneeded FMAs when kernel constants are zero.
parent bcfc6382
...@@ -304,6 +304,40 @@ __attribute__((always_inline)) INLINE static void kernel_eval( ...@@ -304,6 +304,40 @@ __attribute__((always_inline)) INLINE static void kernel_eval(
*W = w * kernel_constant * kernel_gamma_inv_dim; *W = w * kernel_constant * kernel_gamma_inv_dim;
} }
/**
* @brief Computes the kernel function derivative.
*
* The kernel function needs to be mutliplied by \f$h^{-d}\f$ and the gradient
* by \f$h^{-(d+1)}\f$, where \f$d\f$ is the dimensionality of the problem.
*
* Returns 0 if \f$u > \gamma = H/h\f$.
*
* @param u The ratio of the distance to the smoothing length \f$u = x/h\f$.
* @param dW_dx (return) The norm of the gradient of \f$|\nabla W(x,h)|\f$.
*/
__attribute__((always_inline)) INLINE static void kernel_eval_dWdx(
float u, float *restrict dW_dx) {
/* Go to the range [0,1[ from [0,H[ */
const float x = u * kernel_gamma_inv;
/* Pick the correct branch of the kernel */
const int temp = (int)(x * kernel_ivals_f);
const int ind = temp > kernel_ivals ? kernel_ivals : temp;
const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
/* First two terms of the polynomial ... */
float dw_dx = ((float)kernel_degree * coeffs[0] * x) + (float)(kernel_degree - 1) * coeffs[1];
/* ... and the rest of them */
for (int k = 2; k < kernel_degree; k++) {
dw_dx = dw_dx * x + (float)(kernel_degree - k) * coeffs[k];
}
/* Return everything */
*dW_dx = dw_dx * kernel_constant * kernel_gamma_inv_dim_plus_one;
}
/* ------------------------------------------------------------------------- */ /* ------------------------------------------------------------------------- */
#ifdef WITH_VECTORIZATION #ifdef WITH_VECTORIZATION
...@@ -370,24 +404,35 @@ static const vector wendland_const_c2 = FILL_VEC(20.f); ...@@ -370,24 +404,35 @@ static const vector wendland_const_c2 = FILL_VEC(20.f);
static const vector wendland_const_c3 = FILL_VEC(-10.f); static const vector wendland_const_c3 = FILL_VEC(-10.f);
static const vector wendland_const_c4 = FILL_VEC(0.f); static const vector wendland_const_c4 = FILL_VEC(0.f);
static const vector wendland_const_c5 = FILL_VEC(1.f); static const vector wendland_const_c5 = FILL_VEC(1.f);
static const vector wendland_dwdx_const_c0 = FILL_VEC(20.f);
static const vector wendland_dwdx_const_c1 = FILL_VEC(-60.f);
static const vector wendland_dwdx_const_c2 = FILL_VEC(60.f);
static const vector wendland_dwdx_const_c3 = FILL_VEC(-20.f);
#elif defined(CUBIC_SPLINE_KERNEL) #elif defined(CUBIC_SPLINE_KERNEL)
/* First region 0 < u < 0.5 */ /* First region 0 < u < 0.5 */
static const vector cubic_1_const_c0 = FILL_VEC(3.f); static const vector cubic_1_const_c0 = FILL_VEC(3.f);
static const vector cubic_1_const_c1 = FILL_VEC(-3.f); static const vector cubic_1_const_c1 = FILL_VEC(-3.f);
static const vector cubic_1_const_c2 = FILL_VEC(0.f); static const vector cubic_1_const_c2 = FILL_VEC(0.f);
static const vector cubic_1_const_c3 = FILL_VEC(0.5f); static const vector cubic_1_const_c3 = FILL_VEC(0.5f);
static const vector cubic_1_dwdx_const_c0 = FILL_VEC(9.f);
static const vector cubic_1_dwdx_const_c1 = FILL_VEC(-6.f);
static const vector cubic_1_dwdx_const_c2 = FILL_VEC(0.f);
/* Second region 0.5 <= u < 1 */ /* Second region 0.5 <= u < 1 */
static const vector cubic_2_const_c0 = FILL_VEC(-1.f); static const vector cubic_2_const_c0 = FILL_VEC(-1.f);
static const vector cubic_2_const_c1 = FILL_VEC(3.f); static const vector cubic_2_const_c1 = FILL_VEC(3.f);
static const vector cubic_2_const_c2 = FILL_VEC(-3.f); static const vector cubic_2_const_c2 = FILL_VEC(-3.f);
static const vector cubic_2_const_c3 = FILL_VEC(1.f); static const vector cubic_2_const_c3 = FILL_VEC(1.f);
static const vector cubic_2_dwdx_const_c0 = FILL_VEC(-3.f);
static const vector cubic_2_dwdx_const_c1 = FILL_VEC(6.f);
static const vector cubic_2_dwdx_const_c2 = FILL_VEC(-3.f);
static const vector cond = FILL_VEC(0.5f); static const vector cond = FILL_VEC(0.5f);
#endif #endif
/** /**
* @brief Computes the kernel function and its derivative for two particles * @brief Computes the kernel function and its derivative for two particles
* using interleaved vectors. * using vectors.
* *
* Return 0 if $u > \\gamma = H/h$ * Return 0 if $u > \\gamma = H/h$
* *
...@@ -415,7 +460,7 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec( ...@@ -415,7 +460,7 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
w->v = vec_fma(x.v, w->v, wendland_const_c3.v); w->v = vec_fma(x.v, w->v, wendland_const_c3.v);
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
w->v = vec_fma(x.v, w->v, wendland_const_c4.v); w->v = vec_mul(x.v, w->v); /* wendland_const_c4 is zero. */
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
w->v = vec_fma(x.v, w->v, wendland_const_c5.v); w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
...@@ -438,7 +483,7 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec( ...@@ -438,7 +483,7 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
/* Calculate the polynomial interleaving vector operations. */ /* Calculate the polynomial interleaving vector operations. */
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
dw_dx2.v = vec_fma(dw_dx2.v, x.v, w2.v); dw_dx2.v = vec_fma(dw_dx2.v, x.v, w2.v);
w->v = vec_fma(x.v, w->v, cubic_1_const_c2.v); w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero. */
w2.v = vec_fma(x.v, w2.v, cubic_2_const_c2.v); w2.v = vec_fma(x.v, w2.v, cubic_2_const_c2.v);
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
...@@ -511,8 +556,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec( ...@@ -511,8 +556,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v); dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
w->v = vec_fma(x.v, w->v, wendland_const_c4.v); w->v = vec_mul(x.v, w->v); /* wendland_const_c4 is zero. */
w2->v = vec_fma(x2.v, w2->v, wendland_const_c4.v); w2->v = vec_mul(x2.v, w2->v); /* wendland_const_c4 is zero. */
dw_dx->v = vec_fma(dw_dx->v, x.v, w->v); dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v); dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
...@@ -556,8 +601,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec( ...@@ -556,8 +601,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v); dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
dw_dx_2.v = vec_fma(dw_dx_2.v, x.v, w_2.v); dw_dx_2.v = vec_fma(dw_dx_2.v, x.v, w_2.v);
dw_dx2_2.v = vec_fma(dw_dx2_2.v, x2.v, w2_2.v); dw_dx2_2.v = vec_fma(dw_dx2_2.v, x2.v, w2_2.v);
w->v = vec_fma(x.v, w->v, cubic_1_const_c2.v); w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero. */
w2->v = vec_fma(x2.v, w2->v, cubic_1_const_c2.v); w2->v = vec_mul(x2.v, w2->v); /* cubic_1_const_c2 is zero. */
w_2.v = vec_fma(x.v, w_2.v, cubic_2_const_c2.v); w_2.v = vec_fma(x.v, w_2.v, cubic_2_const_c2.v);
w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c2.v); w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c2.v);
...@@ -597,8 +642,129 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec( ...@@ -597,8 +642,129 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
#endif #endif
} }
/**
* @brief Computes the kernel function for two particles
* using vectors.
*
* Return 0 if $u > \\gamma = H/h$
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
* @param w (return) The value of the kernel function $W(x,h)$.
*/
__attribute__((always_inline)) INLINE static void kernel_eval_W_vec(
vector *u, vector *w) {
/* Go to the range [0,1[ from [0,H[ */
vector x;
x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
#ifdef WENDLAND_C2_KERNEL
/* Init the iteration for Horner's scheme. */
w->v = vec_fma(wendland_const_c0.v, x.v, wendland_const_c1.v);
/* Calculate the polynomial interleaving vector operations */
w->v = vec_fma(x.v, w->v, wendland_const_c2.v);
w->v = vec_fma(x.v, w->v, wendland_const_c3.v);
w->v = vec_mul(x.v, w->v); /* wendland_const_c4 is zero.*/
w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
#elif defined(CUBIC_SPLINE_KERNEL)
vector w2;
vector mask1, mask2;
/* Form a mask for each part of the kernel. */
mask1.v = vec_cmp_lt(x.v,cond.v); /* 0 < x < 0.5 */
mask2.v = vec_cmp_gte(x.v,cond.v); /* 0.5 < x < 1 */;
/* Work out w for both regions of the kernel and combine the results together using masks. */
/* Init the iteration for Horner's scheme. */
w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v);
w2.v = vec_fma(cubic_2_const_c0.v, x.v, cubic_2_const_c1.v);
/* Calculate the polynomial interleaving vector operations. */
w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero */
w2.v = vec_fma(x.v, w2.v, cubic_2_const_c2.v);
w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v);
w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
/* Mask out unneeded values. */
w->v = vec_and(w->v,mask1.v);
w2.v = vec_and(w2.v,mask2.v);
/* Added both w and w2 together to form complete result. */
w->v = vec_add(w->v,w2.v);
#else
#error
#endif #endif
/* Return everything */
w->v =
vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
}
/**
* @brief Computes the kernel function derivative for two particles
* using vectors.
*
* Return 0 if $u > \\gamma = H/h$
*
* @param u The ratio of the distance to the smoothing length $u = x/h$.
* @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
*/
__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec(
vector *u, vector *dw_dx) {
/* Go to the range [0,1[ from [0,H[ */
vector x;
x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
#ifdef WENDLAND_C2_KERNEL
/* Init the iteration for Horner's scheme. */
dw_dx->v = vec_fma(wendland_dwdx_const_c0.v,x.v,wendland_dwdx_const_c1.v);
/* Calculate the polynomial interleaving vector operations */
dw_dx->v = vec_fma(dw_dx->v,x.v,wendland_dwdx_const_c2.v);
dw_dx->v = vec_fma(dw_dx->v,x.v,wendland_dwdx_const_c3.v);
dw_dx->v = vec_mul(dw_dx->v,x.v);
#elif defined(CUBIC_SPLINE_KERNEL)
vector dw_dx2;
vector mask1, mask2;
/* Form a mask for each part of the kernel. */
mask1.v = vec_cmp_lt(x.v,cond.v); /* 0 < x < 0.5 */
mask2.v = vec_cmp_gte(x.v,cond.v); /* 0.5 < x < 1 */;
/* Work out w for both regions of the kernel and combine the results together using masks. */
/* Init the iteration for Horner's scheme. */
dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v,x.v,cubic_1_dwdx_const_c1.v);
dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v,x.v,cubic_2_dwdx_const_c1.v);
/* Calculate the polynomial interleaving vector operations. */
dw_dx->v = vec_mul(dw_dx->v,x.v); /* cubic_1_dwdx_const_c2 is zero. */
dw_dx2.v = vec_fma(dw_dx2.v,x.v,cubic_2_dwdx_const_c2.v);
/* Mask out unneeded values. */
dw_dx->v = vec_and(dw_dx->v,mask1.v);
dw_dx2.v = vec_and(dw_dx2.v,mask2.v);
/* Added both dwdx and dwdx2 together to form complete result. */
dw_dx->v = vec_add(dw_dx->v,dw_dx2.v);
#else
#error
#endif
/* Return everything */
dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
kernel_gamma_inv_dim_plus_one_vec.v));
}
#endif /* WITH_VECTORIZATION */
/* Some cross-check functions */ /* Some cross-check functions */
void hydro_kernel_dump(int N); void hydro_kernel_dump(int N);
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment