Commit 4a6c8662 authored by James Willis
Browse files

Use intrinsics for vector operations as arithmetic overloading is not supported by the Intel compiler yet

Use intrinsics for vector operations, because arithmetic operator overloading on vector types is not yet supported by the Intel compiler. Update the masking operations to use the new mask macros so that AVX-512 masking is supported.

Conflicts:
	src/kernel_hydro.h
parent 1b5cf348
......@@ -468,12 +468,11 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
#elif defined(CUBIC_SPLINE_KERNEL)
vector w2, dw_dx2;
vector mask_reg1, mask_reg2;
mask_t mask_reg1, mask_reg2;
/* Form a mask for each part of the kernel. */
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
;
vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v)); /* 0 < x < 0.5 */
vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together
* using masks. */
......@@ -496,10 +495,10 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
/* Mask out unneeded values. */
w->v = vec_and(w->v, mask_reg1.v);
w2.v = vec_and(w2.v, mask_reg2.v);
dw_dx->v = vec_and(dw_dx->v, mask_reg1.v);
dw_dx2.v = vec_and(dw_dx2.v, mask_reg2.v);
w->v = vec_and_mask(w->v, mask_reg1);
w2.v = vec_and_mask(w2.v, mask_reg2);
dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1);
dw_dx2.v = vec_and_mask(dw_dx2.v, mask_reg2);
/* Added both w and w2 together to form complete result. */
w->v = vec_add(w->v, w2.v);
......@@ -580,15 +579,13 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
#elif defined(CUBIC_SPLINE_KERNEL)
vector w_2, dw_dx_2;
vector w2_2, dw_dx2_2;
vector mask_reg1, mask_reg2, mask_reg1_v2, mask_reg2_v2;
mask_t mask_reg1, mask_reg2, mask_reg1_v2, mask_reg2_v2;
/* Form a mask for each part of the kernel. */
mask_reg1.v = vec_cmp_lt(x.v, cond.v); /* 0 < x < 0.5 */
mask_reg1_v2.v = vec_cmp_lt(x2.v, cond.v); /* 0 < x < 0.5 */
mask_reg2.v = vec_cmp_gte(x.v, cond.v); /* 0.5 < x < 1 */
;
mask_reg2_v2.v = vec_cmp_gte(x2.v, cond.v); /* 0.5 < x < 1 */
;
vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v)); /* 0 < x < 0.5 */
vec_create_mask(mask_reg1_v2, vec_cmp_lt(x2.v, cond.v)); /* 0 < x < 0.5 */
vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
vec_create_mask(mask_reg2_v2, vec_cmp_gte(x2.v, cond.v)); /* 0.5 < x < 1 */
/* Work out w for both regions of the kernel and combine the results together
* using masks. */
......@@ -623,14 +620,14 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c3.v);
/* Mask out unneeded values. */
w->v = vec_and(w->v, mask_reg1.v);
w2->v = vec_and(w2->v, mask_reg1_v2.v);
w_2.v = vec_and(w_2.v, mask_reg2.v);
w2_2.v = vec_and(w2_2.v, mask_reg2_v2.v);
dw_dx->v = vec_and(dw_dx->v, mask_reg1.v);
dw_dx2->v = vec_and(dw_dx2->v, mask_reg1_v2.v);
dw_dx_2.v = vec_and(dw_dx_2.v, mask_reg2.v);
dw_dx2_2.v = vec_and(dw_dx2_2.v, mask_reg2_v2.v);
w->v = vec_and_mask(w->v, mask_reg1);
w2->v = vec_and_mask(w2->v, mask_reg1_v2);
w_2.v = vec_and_mask(w_2.v, mask_reg2);
w2_2.v = vec_and_mask(w2_2.v, mask_reg2_v2);
dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1);
dw_dx2->v = vec_and_mask(dw_dx2->v, mask_reg1_v2);
dw_dx_2.v = vec_and_mask(dw_dx_2.v, mask_reg2);
dw_dx2_2.v = vec_and_mask(dw_dx2_2.v, mask_reg2_v2);
/* Added both w and w2 together to form complete result. */
w->v = vec_add(w->v, w_2.v);
......@@ -639,12 +636,12 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
dw_dx2->v = vec_add(dw_dx2->v, dw_dx2_2.v);
/* Return everything */
w->v = w->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v;
w2->v = w2->v * kernel_constant_vec.v * kernel_gamma_inv_dim_vec.v;
w->v = vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
w2->v = vec_mul(w2->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
dw_dx->v =
dw_dx->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v));
dw_dx2->v =
dw_dx2->v * kernel_constant_vec.v * kernel_gamma_inv_dim_plus_one_vec.v;
vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v));
#endif
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment