Skip to content
Snippets Groups Projects
Commit 4a6c8662 authored by James Willis's avatar James Willis
Browse files

Use intrinsics for vector operations as arithmetic overloading is not...

Use intrinsics for vector operations as arithmetic overloading is not supported by the Intel compiler yet. Updated masking operations with new macros so that AVX-512 masking is supported.

Conflicts:
	src/kernel_hydro.h
parent 1b5cf348
No related branches found
No related tags found
1 merge request: !396 "Avx512 fixes"
......@@ -468,12 +468,11 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
#elif defined(CUBIC_SPLINE_KERNEL)
vector w2, dw_dx2;
vector mask_reg1, mask_reg2;
mask_t mask_reg1, mask_reg2;
/* Form a mask for each part of the kernel. */
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
;
vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v)); /* 0 < x < 0.5 */
vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together
* using masks. */
......@@ -496,10 +495,10 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
/* Mask out unneeded values. */
w->v = vec_and(w->v, mask_reg1.v);
w2.v = vec_and(w2.v, mask_reg2.v);
dw_dx->v = vec_and(dw_dx->v, mask_reg1.v);
dw_dx2.v = vec_and(dw_dx2.v, mask_reg2.v);
w->v = vec_and_mask(w->v, mask_reg1);
w2.v = vec_and_mask(w2.v, mask_reg2);
dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1);
dw_dx2.v = vec_and_mask(dw_dx2.v, mask_reg2);
/* Add w and w2 together to form the complete result. */
w->v = vec_add(w->v, w2.v);
......@@ -580,15 +579,13 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
#elif defined(CUBIC_SPLINE_KERNEL)
vector w_2, dw_dx_2;
vector w2_2, dw_dx2_2;
vector mask_reg1, mask_reg2, mask_reg1_v2, mask_reg2_v2;
mask_t mask_reg1, mask_reg2, mask_reg1_v2, mask_reg2_v2;
/* Form a mask for each part of the kernel. */
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask_reg1_v2.v = vec_cmp_lt(x2.v, cond.v); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
;
mask_reg2_v2.v = vec_cmp_gte(x2.v, cond.v); /* 0.5 < x < 1 */
;
vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v)); /* 0 < x < 0.5 */
vec_create_mask(mask_reg1_v2, vec_cmp_lt(x2.v, cond.v)); /* 0 < x < 0.5 */
vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
vec_create_mask(mask_reg2_v2, vec_cmp_gte(x2.v, cond.v)); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together
* using masks. */
......@@ -623,14 +620,14 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c3.v);
/* Mask out unneeded values. */
w->v = vec_and(w->v, mask_reg1.v);
w2->v = vec_and(w2->v, mask_reg1_v2.v);
w_2.v = vec_and(w_2.v, mask_reg2.v);
w2_2.v = vec_and(w2_2.v, mask_reg2_v2.v);
dw_dx->v = vec_and(dw_dx->v, mask_reg1.v);
dw_dx2->v = vec_and(dw_dx2->v, mask_reg1_v2.v);
dw_dx_2.v = vec_and(dw_dx_2.v, mask_reg2.v);
dw_dx2_2.v = vec_and(dw_dx2_2.v, mask_reg2_v2.v);
w->v = vec_and_mask(w->v, mask_reg1);
w2->v = vec_and_mask(w2->v, mask_reg1_v2);
w_2.v = vec_and_mask(w_2.v, mask_reg2);
w2_2.v = vec_and_mask(w2_2.v, mask_reg2_v2);
dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1);
dw_dx2->v = vec_and_mask(dw_dx2->v, mask_reg1_v2);
dw_dx_2.v = vec_and_mask(dw_dx_2.v, mask_reg2);
dw_dx2_2.v = vec_and_mask(dw_dx2_2.v, mask_reg2_v2);
/* Add the results from both kernel regions together to form the complete result. */
w->v = vec_add(w->v, w_2.v);
......@@ -639,12 +636,12 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
dw_dx2->v = vec_add(dw_dx2->v, dw_dx2_2.v);
/* Return everything */
w->v = w->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v;
w2->v = w2->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v;
w->v = vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
w2->v = vec_mul(w2->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
dw_dx->v =
dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v));
dw_dx2->v =
dw_dx2->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v));
#endif
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment