Commit 3ce7aab6 authored by James Willis's avatar James Willis

Support SSE vectorisation.

parent 5b055e8a
......@@ -327,7 +327,9 @@
#define vec_set(a, b, c, d) _mm_set_ps(d, c, b, a)
#define vec_dbl_set(a, b) _mm_set_pd(b, a)
#define vec_add(a, b) _mm_add_ps(a, b)
#define vec_mask_add(a, b, mask) vec_add(a, vec_and(b, mask.v))
#define vec_sub(a, b) _mm_sub_ps(a, b)
#define vec_mask_sub(a, b, mask) vec_sub(a, vec_and(b, mask.v))
#define vec_mul(a, b) _mm_mul_ps(a, b)
#define vec_div(a, b) _mm_div_ps(a, b)
#define vec_sqrt(a) _mm_sqrt_ps(a)
......@@ -341,8 +343,20 @@
#define vec_cmp_gt(a, b) _mm_cmpgt_ps(a, b)
#define vec_cmp_lt(a, b) _mm_cmplt_ps(a, b)
#define vec_cmp_lte(a, b) _mm_cmp_ps(a, b, _CMP_LE_OQ)
#define vec_cmp_gte(a, b) _mm_cmp_ps(a, b, _CMP_GE_OQ)
#define vec_cmp_result(a) _mm_movemask_ps(a)
#define vec_is_mask_true(a) _mm_movemask_ps(a.v)
#define vec_and(a, b) _mm_and_ps(a, b)
#define vec_mask_and(a, b) _mm_and_ps(a.v, b.v)
#define vec_and_mask(a, mask) _mm_and_ps(a, mask.v)
#define vec_init_mask_true(mask) mask.m = vec_setint1(0xFFFFFFFF)
#define vec_create_mask(mask, cond) mask.v = cond
#define vec_combine_masks(mask1, mask2) \
({ mask1.v = vec_mask_and(mask1, mask2); })
#define vec_zero_mask(mask) mask.v = vec_setzero()
#define vec_pad_mask(mask, pad) \
for (int i = VEC_SIZE - (pad); i < VEC_SIZE; i++) mask.i[i] = 0
#define vec_blend(mask, a, b) _mm_blendv_ps(a, b, mask.v)
#define vec_todbl_lo(a) _mm_cvtps_pd(a)
#define vec_todbl_hi(a) _mm_cvtps_pd(_mm_movehl_ps(a, a))
#define vec_dbl_tofloat(a, b) _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b))
......@@ -364,10 +378,23 @@
a.v = _mm_hadd_ps(a.v, a.v); \
b += a.f[0] + a.f[1];
/* Performs a horizontal maximum on the vector and takes the maximum of the
* result with a float, b. */
#define VEC_HMAX(a, b) \
{ \
for (int k = 0; k < VEC_SIZE; k++) b = max(b, a.f[k]); \
}
/* Create an FMA using vec_add and vec_mul if AVX2 is not present. */
#ifndef vec_fma
#define vec_fma(a, b, c) vec_add(vec_mul(a, b), c)
#endif
/* Create a negated FMA using vec_sub and vec_mul if AVX2 is not present. */
#ifndef vec_fnma
#define vec_fnma(a, b, c) vec_sub(c, vec_mul(a, b))
#endif
#else
#define VEC_SIZE 4
#endif /* HAVE_SSE2 */
......
# ID pos_x pos_y pos_z v_x v_y v_z h rho div_v S u P c a_x a_y a_z h_dt v_sig dS/dt du/dt
0 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4
0 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 2e-3 2e-3 2e-3 1e-4 1e-4 1e-4 1e-4
0 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 2.3e-3 2e-3 2e-3 1e-4 1e-4 1e-4 1e-4
0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2e-4 2e-4 2e-4 1e-6 1e-6 1e-6 1e-6
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment