diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h index 657dc3ddfcae5c37765bb85388091d9090c4b714..3662510fbcd3817d338d76bc89b1334e3c0cd395 100644 --- a/src/kernel_hydro.h +++ b/src/kernel_hydro.h @@ -236,18 +236,18 @@ static const vector kernel_igamma4_vec = FILL_VEC((float)kernel_igamma4); __attribute__((always_inline)) INLINE static void kernel_deval_vec(vector *u, vector *w, vector *dw_dx) { - vector ind, c[kernel_degree + 1], x; - int j, k; - /* Go to the range [0,1[ from [0,H[ */ + vector x; x.v = u->v * kernel_igamma_vec.v; /* Load x and get the interval id. */ + vector ind; ind.m = vec_ftoi(vec_fmin(x.v * kernel_ivals_vec.v, kernel_ivals_vec.v)); /* load the coefficients. */ - for (k = 0; k < VEC_SIZE; k++) - for (j = 0; j < kernel_degree + 1; j++) + vector c[kernel_degree + 1]; + for (int k = 0; k < VEC_SIZE; k++) + for (int j = 0; j < kernel_degree + 1; j++) c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j]; /* Init the iteration for Horner's scheme. */ diff --git a/src/vector.h b/src/vector.h index 7c70f79e5a3001d34b455de49b6ab7f26cc104d3..fa311f121f7b702f2288be0d561e520b52330457 100644 --- a/src/vector.h +++ b/src/vector.h @@ -79,22 +79,12 @@ _mm512_set1_epi64(ptrs[0])), \ 1) #define vec_gather(base, offsets) _mm512_i32gather_ps(offsets.m, base, 1) -#define FILL_VEC(constant) {.f[0] = constant, \ - .f[1] = constant, \ - .f[2] = constant, \ - .f[3] = constant, \ - .f[4] = constant, \ - .f[5] = constant, \ - .f[6] = constant, \ - .f[7] = constant, \ - .f[8] = constant, \ - .f[9] = constant, \ - .f[10] = constant, \ - .f[11] = constant, \ - .f[12] = constant, \ - .f[13] = constant, \ - .f[14] = constant, \ - .f[15] = constant} +#define FILL_VEC(a) \ + { \ + .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \ + .f[6] = a, .f[7] = a, .f[8] = a, .f[9] = a, .f[10] = a, .f[11] = a, \ + .f[12] = a, .f[13] = a, .f[14] = a, .f[15] = a \ + } #elif defined(NO__AVX__) #define VECTORIZE #define VEC_SIZE 8 @@ -123,14 +113,11 @@ #define vec_dbl_ftoi(a) _mm256_cvttpd_epi32(a) #define vec_dbl_fmin(a, b) _mm256_min_pd(a, b) #define vec_dbl_fmax(a, b) _mm256_max_pd(a, b) -#define FILL_VEC(constant) {.f[0] = constant, \ - .f[1] = constant, \ - .f[2] = constant, \ - .f[3] = constant, \ - .f[4] = constant, \ - .f[5] = constant, \ - .f[6] = constant, \ - .f[7] = constant} +#define FILL_VEC(a) \ + { \ + .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \ + .f[6] = a, .f[7] = a \ + } #ifdef __AVX2__ #define VEC_HAVE_GATHER #define vec_gather(base, offsets) _mm256_i32gather_ps(base, offsets.m, 1) @@ -163,10 +150,8 @@ #define vec_dbl_ftoi(a) _mm_cvttpd_epi32(a) #define vec_dbl_fmin(a, b) _mm_min_pd(a, b) #define vec_dbl_fmax(a, b) _mm_max_pd(a, b) -#define FILL_VEC(constant) {.f[0] = constant, \ - .f[1] = constant, \ - .f[2] = constant, \ - .f[3] = constant} +#define FILL_VEC(a) \ + { .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a } #else #define VEC_SIZE 4 #endif