Commit 0f4de63e authored by James Willis
Browse files

Create masks using macros to support AVX-512.

parent 62aecd22
...@@ -369,11 +369,11 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec( ...@@ -369,11 +369,11 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec(
/* Go to the range [0,1[ from [0,H[ */ /* Go to the range [0,1[ from [0,H[ */
vector x; vector x;
x.v = u->v * kernel_gamma_inv_vec.v; x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
/* Load x and get the interval id. */ /* Load x and get the interval id. */
vector ind; vector ind;
ind.m = vec_ftoi(vec_fmin(x.v * kernel_ivals_vec.v, kernel_ivals_vec.v)); ind.m = vec_ftoi(vec_fmin(vec_mul(x.v, kernel_ivals_vec.v), kernel_ivals_vec.v));
/* load the coefficients. */ /* load the coefficients. */
vector c[kernel_degree + 1]; vector c[kernel_degree + 1];
...@@ -382,19 +382,18 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec( ...@@ -382,19 +382,18 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec(
c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j]; c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j];
/* Init the iteration for Horner's scheme. */ /* Init the iteration for Horner's scheme. */
w->v = (c[0].v * x.v) + c[1].v; w->v = vec_fma(c[0].v, x.v, c[1].v);
dw_dx->v = c[0].v; dw_dx->v = c[0].v;
/* And we're off! */ /* And we're off! */
for (int k = 2; k <= kernel_degree; k++) { for (int k = 2; k <= kernel_degree; k++) {
dw_dx->v = (dw_dx->v * x.v) + w->v; dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
w->v = (x.v * w->v) + c[k].v; w->v = vec_fma(x.v, w->v, c[k].v);
} }
/* Return everything */ /* Return everything */
w->v = w->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v; w->v = vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
dw_dx->v = dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v));
dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
} }
/* Define constant vectors for the Wendland C2 and Cubic Spline kernel /* Define constant vectors for the Wendland C2 and Cubic Spline kernel
...@@ -505,7 +504,7 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec( ...@@ -505,7 +504,7 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
w->v = vec_add(w->v, w2.v); w->v = vec_add(w->v, w2.v);
dw_dx->v = vec_add(dw_dx->v, dw_dx2.v); dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
#else #else
#error #error "Vectorisation not supported for this kernel!!!"
#endif #endif
/* Return everything */ /* Return everything */
...@@ -673,11 +672,11 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u, ...@@ -673,11 +672,11 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
w->v = vec_fma(x.v, w->v, wendland_const_c5.v); w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
#elif defined(CUBIC_SPLINE_KERNEL) #elif defined(CUBIC_SPLINE_KERNEL)
vector w2; vector w2;
vector mask_reg1, mask_reg2; mask_t mask_reg1, mask_reg2;
/* Form a mask for each part of the kernel. */ /* Form a mask for each part of the kernel. */
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v)); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together /* Work out w for both regions of the kernel and combine the results together
* using masks. */ * using masks. */
...@@ -694,13 +693,13 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u, ...@@ -694,13 +693,13 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v); w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
/* Mask out unneeded values. */ /* Mask out unneeded values. */
w->v = vec_and(w->v, mask_reg1.v); w->v = vec_and_mask(w->v, mask_reg1);
w2.v = vec_and(w2.v, mask_reg2.v); w2.v = vec_and_mask(w2.v, mask_reg2);
/* Added both w and w2 together to form complete result. */ /* Added both w and w2 together to form complete result. */
w->v = vec_add(w->v, w2.v); w->v = vec_add(w->v, w2.v);
#else #else
#error #error "Vectorisation not supported for this kernel!!!"
#endif #endif
/* Return everything */ /* Return everything */
...@@ -736,11 +735,11 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec( ...@@ -736,11 +735,11 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec(
#elif defined(CUBIC_SPLINE_KERNEL) #elif defined(CUBIC_SPLINE_KERNEL)
vector dw_dx2; vector dw_dx2;
vector mask_reg1, mask_reg2; mask_t mask_reg1, mask_reg2;
/* Form a mask for each part of the kernel. */ /* Form a mask for each part of the kernel. */
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v)); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together /* Work out w for both regions of the kernel and combine the results together
* using masks. */ * using masks. */
...@@ -754,13 +753,13 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec( ...@@ -754,13 +753,13 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec(
dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v); dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);
/* Mask out unneeded values. */ /* Mask out unneeded values. */
dw_dx->v = vec_and(dw_dx->v, mask_reg1.v); dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1);
dw_dx2.v = vec_and(dw_dx2.v, mask_reg2.v); dw_dx2.v = vec_and_mask(dw_dx2.v, mask_reg2);
/* Added both dwdx and dwdx2 together to form complete result. */ /* Added both dwdx and dwdx2 together to form complete result. */
dw_dx->v = vec_add(dw_dx->v, dw_dx2.v); dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
#else #else
#error #error "Vectorisation not supported for this kernel!!!"
#endif #endif
/* Return everything */ /* Return everything */
...@@ -797,11 +796,11 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_vec( ...@@ -797,11 +796,11 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_vec(
#elif defined(CUBIC_SPLINE_KERNEL) #elif defined(CUBIC_SPLINE_KERNEL)
vector dw_dx2; vector dw_dx2;
vector mask_reg1, mask_reg2; mask_t mask_reg1, mask_reg2;
/* Form a mask for each part of the kernel. */ /* Form a mask for each part of the kernel. */
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v)); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together /* Work out w for both regions of the kernel and combine the results together
* using masks. */ * using masks. */
...@@ -815,20 +814,20 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_vec( ...@@ -815,20 +814,20 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_vec(
dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v); dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);
/* Mask out unneeded values. */ /* Mask out unneeded values. */
dw_dx->v = vec_and(dw_dx->v, mask_reg1.v); dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1);
dw_dx2.v = vec_and(dw_dx2.v, mask_reg2.v); dw_dx2.v = vec_and_mask(dw_dx2.v, mask_reg2);
/* Added both dwdx and dwdx2 together to form complete result. */ /* Added both dwdx and dwdx2 together to form complete result. */
dw_dx->v = vec_add(dw_dx->v, dw_dx2.v); dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
#else #else
#error #error "Vectorisation not supported for this kernel!!!"
#endif #endif
/* Mask out result for particles that lie outside of the kernel function. */ /* Mask out result for particles that lie outside of the kernel function. */
vector mask; mask_t mask;
mask.v = vec_cmp_lt(x.v, vec_set1(1.f)); vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f))); /* x < 1 */
dw_dx->v = vec_and(dw_dx->v, mask.v); dw_dx->v = vec_and_mask(dw_dx->v, mask);
/* Return everything */ /* Return everything */
dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
...@@ -870,14 +869,14 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec( ...@@ -870,14 +869,14 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec(
#elif defined(CUBIC_SPLINE_KERNEL) #elif defined(CUBIC_SPLINE_KERNEL)
vector dw_dx2, dw_dx2_2; vector dw_dx2, dw_dx2_2;
vector mask_reg1, mask_reg2; mask_t mask_reg1, mask_reg2;
vector mask_reg1_2, mask_reg2_2; mask_t mask_reg1_2, mask_reg2_2;
/* Form a mask for each part of the kernel. */ /* Form a mask for each part of the kernel. */
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */ vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v)); /* 0 < x < 0.5 */
mask_reg1_2.v = vec_cmp_lt(x_2.v, cond.v); /* 0 < x < 0.5 */ vec_create_mask(mask_reg1_2, vec_cmp_lt(x_2.v, cond.v)); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */ vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
mask_reg2_2.v = vec_cmp_gte(x_2.v, cond.v); /* 0.5 < x < 1 */ vec_create_mask(mask_reg2_2, vec_cmp_gte(x_2.v, cond.v)); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together /* Work out w for both regions of the kernel and combine the results together
* using masks. */ * using masks. */
...@@ -895,25 +894,25 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec( ...@@ -895,25 +894,25 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec(
dw_dx2_2.v = vec_fma(dw_dx2_2.v, x_2.v, cubic_2_dwdx_const_c2.v); dw_dx2_2.v = vec_fma(dw_dx2_2.v, x_2.v, cubic_2_dwdx_const_c2.v);
/* Mask out unneeded values. */ /* Mask out unneeded values. */
dw_dx->v = vec_and(dw_dx->v, mask_reg1.v); dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1);
dw_dx_2->v = vec_and(dw_dx_2->v, mask_reg1_2.v); dw_dx_2->v = vec_and_mask(dw_dx_2->v, mask_reg1_2);
dw_dx2.v = vec_and(dw_dx2.v, mask_reg2.v); dw_dx2.v = vec_and_mask(dw_dx2.v, mask_reg2);
dw_dx2_2.v = vec_and(dw_dx2_2.v, mask_reg2_2.v); dw_dx2_2.v = vec_and_mask(dw_dx2_2.v, mask_reg2_2);
/* Added both dwdx and dwdx2 together to form complete result. */ /* Added both dwdx and dwdx2 together to form complete result. */
dw_dx->v = vec_add(dw_dx->v, dw_dx2.v); dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
dw_dx_2->v = vec_add(dw_dx_2->v, dw_dx2_2.v); dw_dx_2->v = vec_add(dw_dx_2->v, dw_dx2_2.v);
#else #else
#error #error "Vectorisation not supported for this kernel!!!"
#endif #endif
/* Mask out result for particles that lie outside of the kernel function. */ /* Mask out result for particles that lie outside of the kernel function. */
vector mask, mask_2; mask_t mask, mask_2;
mask.v = vec_cmp_lt(x.v, vec_set1(1.f)); vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f))); /* x < 1 */
mask_2.v = vec_cmp_lt(x_2.v, vec_set1(1.f)); vec_create_mask(mask_2, vec_cmp_lt(x_2.v, vec_set1(1.f))); /* x < 1 */
dw_dx->v = vec_and(dw_dx->v, mask.v); dw_dx->v = vec_and_mask(dw_dx->v, mask);
dw_dx_2->v = vec_and(dw_dx_2->v, mask_2.v); dw_dx_2->v = vec_and_mask(dw_dx_2->v, mask_2);
/* Return everything */ /* Return everything */
dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment