vector.h 17.4 KB
Newer Older
Pedro Gonnet's avatar
Pedro Gonnet committed
1
2
/*******************************************************************************
 * This file is part of SWIFT.
3
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
4
 *               2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
5
 *
Pedro Gonnet's avatar
Pedro Gonnet committed
6
7
8
9
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
10
 *
Pedro Gonnet's avatar
Pedro Gonnet committed
11
12
13
14
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
15
 *
Pedro Gonnet's avatar
Pedro Gonnet committed
16
17
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18
 *
Pedro Gonnet's avatar
Pedro Gonnet committed
19
 ******************************************************************************/
20
21
#ifndef SWIFT_VECTOR_H
#define SWIFT_VECTOR_H
Pedro Gonnet's avatar
Pedro Gonnet committed
22
23
24
25

/* Have I already read this file? */
#ifndef VEC_MACRO

26
27
#include "../config.h"

28
29
#ifdef WITH_VECTORIZATION

30
31
32
/* Need to check whether the compiler supports this (IBM does not).
   This prevents the macros from being defined and switches off
   explicit vectorization if the compiler does not support it. */
33
#ifdef HAVE_IMMINTRIN_H
34
/* Include the header file with the intrinsics for Intel architecture. */
35
#include <immintrin.h>
36
#endif
Pedro Gonnet's avatar
Pedro Gonnet committed
37

38
39
40
/* Define the vector macro: expands to `type` decorated with the vector_size
   attribute, sized to hold `elcount` elements of `type`
   (i.e. elcount * sizeof(type) bytes). This header also tests VEC_MACRO to
   decide whether its contents have already been read. */
#define VEC_MACRO(elcount, type) \
  __attribute__((vector_size((elcount) * sizeof(type)))) type
Pedro Gonnet's avatar
Pedro Gonnet committed
41

42
/* So what will the vector size be? */
James Willis's avatar
James Willis committed
43
44

/* AVX-512 intrinsics*/
James Willis's avatar
James Willis committed
45
#ifdef HAVE_AVX512_F
46
47
48
49
50
#define VEC_HAVE_GATHER
#define VEC_SIZE 16
#define VEC_FLOAT __m512
#define VEC_DBL __m512d
#define VEC_INT __m512i
51
#define KNL_MASK_16 __mmask16
52
#define vec_load(a) _mm512_load_ps(a)
James Willis's avatar
James Willis committed
53
#define vec_store(a, addr) _mm512_store_ps(addr, a)
54
55
#define vec_setzero() _mm512_setzero_ps()
#define vec_setintzero() _mm512_setzero_epi32()
56
#define vec_set1(a) _mm512_set1_ps(a)
57
#define vec_setint1(a) _mm512_set1_epi32(a)
58
59
60
61
#define vec_set(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
  _mm512_set_ps(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a)
#define vec_dbl_set(a, b, c, d, e, f, g, h) \
  _mm512_set_pd(h, g, f, e, d, c, b, a)
62
#define vec_add(a, b) _mm512_add_ps(a, b)
63
#define vec_mask_add(a, b, mask) _mm512_mask_add_ps(a, mask, b, a) 
64
#define vec_sub(a, b) _mm512_sub_ps(a, b)
65
#define vec_mask_sub(a, b, mask) _mm512_mask_sub_ps(a, mask, a, b) 
66
67
#define vec_mul(a, b) _mm512_mul_ps(a, b)
#define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c)
68
#define vec_sqrt(a) _mm512_sqrt_ps(a)
69
70
#define vec_rcp(a) _mm512_rcp14_ps(a)
#define vec_rsqrt(a) _mm512_rsqrt14_ps(a)
71
72
73
74
#define vec_ftoi(a) _mm512_cvttps_epi32(a)
#define vec_fmin(a, b) _mm512_min_ps(a, b)
#define vec_fmax(a, b) _mm512_max_ps(a, b)
#define vec_fabs(a) _mm512_andnot_ps(_mm512_set1_ps(-0.f), a)
75
76
#define vec_floor(a) _mm512_floor_ps(a)
#define vec_cmp_gt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GT_OQ)
James Willis's avatar
James Willis committed
77
78
#define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ)
#define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ)
79
#define vec_cmp_gte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ)
80
#define vec_cmp_result(a) a
81
#define vec_form_int_mask(a) a
James Willis's avatar
James Willis committed
82
#define vec_and(a, b) _mm512_and_ps(a, b)
83
/* Bitwise AND of two integer lane masks. Both arguments and the whole
   expansion are parenthesised so the macro composes correctly inside larger
   expressions (e.g. next to ==, | or ?:). */
#define vec_mask_and(a, b) ((a) & (b))
84
85
86
87
/* Helpers for integer lane masks (one bit per lane, 16 lanes):
     vec_init_mask   - sets all 16 lane bits;
     vec_zero_mask   - clears every lane bit;
     vec_create_mask - stores the comparison result `cond` in `mask`;
     vec_pad_mask    - shifts `mask` right by `pad` bits.
   Each expands to one fully parenthesised assignment expression so that the
   arguments and the result bind correctly in any context. */
#define vec_init_mask(mask) ((mask) = 0xFFFF)
#define vec_zero_mask(mask) ((mask) = 0)
#define vec_create_mask(mask, cond) ((mask) = (cond))
#define vec_pad_mask(mask, pad) ((mask) = (mask) >> (pad))
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1)
#define vec_dbl_load(a) _mm512_load_pd(a)
#define vec_dbl_set1(a) _mm512_set1_pd(a)
#define vec_dbl_sqrt(a) _mm512_sqrt_pd(a)
#define vec_dbl_rcp(a) _mm512_rcp_pd(a)
#define vec_dbl_rsqrt(a) _mm512_rsqrt_pd(a)
#define vec_dbl_ftoi(a) _mm512_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm512_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm512_max_pd(a, b)
#define vec_getoffsets(ptrs)                                                \
  _mm512_insertf64x4(                                                       \
      _mm512_insertf64x4(_mm512_setzero_pd(),                               \
                         _mm512_cvtepi64_epi32(_mm512_load_epi64(ptrs) -    \
                                               _mm512_set1_epi64(ptrs[0])), \
                         0),                                                \
      _mm512_cvtepi64_epi32(_mm512_load_epi64(&ptrs[4]) -                   \
                            _mm512_set1_epi64(ptrs[0])),                    \
      1)
#define vec_gather(base, offsets) _mm512_i32gather_ps(offsets.m, base, 1)
James Willis's avatar
James Willis committed
109
110

/* Initialises a vector struct with a default value. */
Matthieu Schaller's avatar
Matthieu Schaller committed
111
#define FILL_VEC(a)                                                     \
James Willis's avatar
James Willis committed
112
113
114
115
116
  {                                                                     \
    .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a,   \
    .f[6] = a, .f[7] = a, .f[8] = a, .f[9] = a, .f[10] = a, .f[11] = a, \
    .f[12] = a, .f[13] = a, .f[14] = a, .f[15] = a                      \
  }
James Willis's avatar
James Willis committed
117
118

/* Performs a horizontal add on the vector and adds the result to a float. */
119
#ifdef __ICC
James Willis's avatar
James Willis committed
120
#define VEC_HADD(a, b) b += _mm512_reduce_add_ps(a.v)
Matthieu Schaller's avatar
Matthieu Schaller committed
121
122
123
124
125
126
#else /* _mm512_reduce_add_ps not present in GCC compiler. \
       TODO: Implement intrinsic version.*/
/* Scalar fallback: adds every float lane of `a` (a.f[0] .. a.f[VEC_SIZE-1],
   in that order) to `b`. The loop index has a macro-private name so that an
   `i` appearing in either argument is not captured by the loop. */
#define VEC_HADD(a, b)                                        \
  {                                                           \
    for (int vec_hadd_lane = 0; vec_hadd_lane < VEC_SIZE;     \
         vec_hadd_lane++)                                     \
      (b) += (a).f[vec_hadd_lane];                            \
  }
127
#endif
James Willis's avatar
James Willis committed
128
129
/* Calculates the number of set bits in the mask and adds the result to an int.
 */
James Willis's avatar
James Willis committed
130
131
#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \
  pack += __builtin_popcount(mask);
James Willis's avatar
James Willis committed
132
133

/* Performs a left-pack on a vector based upon a mask and returns the result. */
James Willis's avatar
James Willis committed
134
135
#define VEC_LEFT_PACK(a, mask, result) \
  _mm512_mask_compressstoreu_ps(result, mask, a)
James Willis's avatar
James Willis committed
136
137

/* AVX intrinsics */
138
#elif defined(HAVE_AVX)
139
140
141
142
143
#define VEC_SIZE 8
#define VEC_FLOAT __m256
#define VEC_DBL __m256d
#define VEC_INT __m256i
#define vec_load(a) _mm256_load_ps(a)
James Willis's avatar
James Willis committed
144
#define vec_unaligned_load(a) _mm256_loadu_ps(a)
James Willis's avatar
James Willis committed
145
#define vec_store(a, addr) _mm256_store_ps(addr, a)
146
#define vec_unaligned_store(a, addr) _mm256_storeu_ps(addr, a)
147
148
#define vec_setzero() _mm256_setzero_ps()
#define vec_setintzero() _mm256_setzero_si256()
149
#define vec_set1(a) _mm256_set1_ps(a)
150
#define vec_setint1(a) _mm256_set1_epi32(a)
151
152
#define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a)
#define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a)
153
#define vec_add(a, b) _mm256_add_ps(a, b)
154
#define vec_mask_add(a, b, mask) vec_add(a, vec_and(b,mask.v)) 
155
#define vec_sub(a, b) _mm256_sub_ps(a, b)
156
#define vec_mask_sub(a, b, mask) vec_sub(a, vec_and(b,mask.v)) 
157
#define vec_mul(a, b) _mm256_mul_ps(a, b)
158
159
160
161
162
163
164
#define vec_sqrt(a) _mm256_sqrt_ps(a)
#define vec_rcp(a) _mm256_rcp_ps(a)
#define vec_rsqrt(a) _mm256_rsqrt_ps(a)
#define vec_ftoi(a) _mm256_cvttps_epi32(a)
#define vec_fmin(a, b) _mm256_min_ps(a, b)
#define vec_fmax(a, b) _mm256_max_ps(a, b)
#define vec_fabs(a) _mm256_andnot_ps(_mm256_set1_ps(-0.f), a)
165
166
167
168
#define vec_floor(a) _mm256_floor_ps(a)
#define vec_cmp_lt(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
#define vec_cmp_gt(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
#define vec_cmp_lte(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
169
#define vec_cmp_gte(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
170
#define vec_cmp_result(a) _mm256_movemask_ps(a)
171
#define vec_form_int_mask(a) _mm256_movemask_ps(a.v) 
James Willis's avatar
James Willis committed
172
#define vec_and(a, b) _mm256_and_ps(a, b)
173
174
175
176
177
#define vec_mask_and(a, b) _mm256_and_ps(a.v, b.v)
#define vec_init_mask(mask) mask.m = vec_setint1(0xFFFFFFFF)
#define vec_create_mask(mask, cond) mask.v = cond
#define vec_zero_mask(mask) mask.v = vec_setzero()
/* Zeroes the last `pad` integer lanes of a vector mask (mask.i[VEC_SIZE - pad]
   up to mask.i[VEC_SIZE - 1]). The loop index has a macro-private name so that
   arguments mentioning `i` are not captured by the loop. */
#define vec_pad_mask(mask, pad)                                        \
  for (int vec_pad_lane = VEC_SIZE - (pad); vec_pad_lane < VEC_SIZE;   \
       vec_pad_lane++)                                                 \
  (mask).i[vec_pad_lane] = 0
178
179
180
181
182
183
184
185
186
187
188
/* Float <-> double conversions for 256-bit vectors.
     vec_todbl_lo / vec_todbl_hi widen the lower / upper four floats of `a`
     to four doubles.
     vec_dbl_tofloat narrows two double vectors to float and packs them into
     one float vector (a in the lower half, b in the upper half), matching the
     behaviour of the SSE variant of this macro.
   The 128-bit extract/insert intrinsics are spelled _mm256_extractf128_ps and
   _mm256_insertf128_ps in AVX. */
#define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extractf128_ps(a, 0))
#define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extractf128_ps(a, 1))
#define vec_dbl_tofloat(a, b)                                       \
  _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(a)),  \
                       _mm256_cvtpd_ps(b), 1)
#define vec_dbl_load(a) _mm256_load_pd(a)
#define vec_dbl_set1(a) _mm256_set1_pd(a)
#define vec_dbl_sqrt(a) _mm256_sqrt_pd(a)
#define vec_dbl_rcp(a) _mm256_rcp_pd(a)
#define vec_dbl_rsqrt(a) _mm256_rsqrt_pd(a)
#define vec_dbl_ftoi(a) _mm256_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm256_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm256_max_pd(a, b)
James Willis's avatar
James Willis committed
189
190

/* Initialises a vector struct with a default value. */
Matthieu Schaller's avatar
Matthieu Schaller committed
191
192
193
194
195
#define FILL_VEC(a)                                                   \
  {                                                                   \
    .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \
    .f[6] = a, .f[7] = a                                              \
  }
James Willis's avatar
James Willis committed
196
197

/* Performs a horizontal add on the vector and adds the result to a float.
 * Two pairwise adds leave the sum of each 128-bit half in lanes 0 and 4, which
 * are then added to `b`. Note that a.v is overwritten with the partial sums.
 * The body is braced so that the macro expands to a single statement. */
#define VEC_HADD(a, b)                      \
  {                                         \
    (a).v = _mm256_hadd_ps((a).v, (a).v);   \
    (a).v = _mm256_hadd_ps((a).v, (a).v);   \
    (b) += (a).f[0] + (a).f[4];             \
  }
James Willis's avatar
James Willis committed
202

203
204
205
206
207
208
209
210
211
212
/* Performs a horizontal maximum on the vector and takes the maximum of the
 * result with a float, b. Temporaries carry a vec_hmax_ prefix so that they
 * cannot shadow a caller variable passed as `b`. */
#define VEC_HMAX(a, b)                                                    \
  {                                                                       \
    /* Swap the 128-bit halves: swapped = [a.high, a.low]. */             \
    __m256 vec_hmax_swapped = _mm256_permute2f128_ps((a).v, (a).v, 1);    \
    /* m1[0] = max(a[0], a[4]), m1[1] = max(a[1], a[5]), etc. */          \
    __m256 vec_hmax_m1 = _mm256_max_ps((a).v, vec_hmax_swapped);          \
    /* Swap neighbours: m2[0] = m1[1], m2[1] = m1[0], m2[2] = m1[3]... */ \
    __m256 vec_hmax_m2 = _mm256_permute_ps(vec_hmax_m1, 177);             \
    /* Lanes 0 and 7 of m together cover all eight input lanes. */        \
    __m256 vec_hmax_m = _mm256_max_ps(vec_hmax_m1, vec_hmax_m2);          \
    (b) = fmaxf(fmaxf((b), vec_hmax_m[0]), vec_hmax_m[7]);                \
  }

James Willis's avatar
James Willis committed
213
/* Returns the lower 128-bits of the 256-bit vector. */
214
#define VEC_GET_LOW(a) _mm256_castps256_ps128(a)
James Willis's avatar
James Willis committed
215
216

/* Returns the higher 128-bits of the 256-bit vector. */
James Willis's avatar
James Willis committed
217
#define VEC_GET_HIGH(a) _mm256_extractf128_ps(a, 1)
James Willis's avatar
James Willis committed
218
219

/* Check if we have AVX2 intrinsics alongside AVX */
220
#ifdef HAVE_AVX2
221
#define vec_fma(a, b, c) _mm256_fmadd_ps(a, b, c)
James Willis's avatar
James Willis committed
222
223

/* Used in VEC_FORM_PACKED_MASK */
224
#define identity_indices 0x0706050403020100
225
226
#define VEC_HAVE_GATHER
#define vec_gather(base, offsets) _mm256_i32gather_ps(base, offsets.m, 1)
James Willis's avatar
James Willis committed
227

James Willis's avatar
James Willis committed
228
229
/* Takes an integer mask and forms a left-packed integer vector containing the
 * indices of the set bits in the integer mask; the vector is assigned to
 * `v_mask`. Also adds the number of bits set in the mask to `pack`.
 * The 64-bit intermediates are held in unsigned long long so they are not
 * truncated on targets where long is 32 bits wide. */
#define VEC_FORM_PACKED_MASK(mask, v_mask, pack)                              \
  {                                                                           \
    /* One byte per lane: 0xFF where the mask bit is set, 0x00 otherwise. */  \
    unsigned long long vec_fpm_bytes =                                        \
        _pdep_u64((mask), 0x0101010101010101ULL);                             \
    vec_fpm_bytes *= 0xFF;                                                    \
    /* Compress the lane numbers of the selected bytes to the low end. */     \
    unsigned long long vec_fpm_indices =                                      \
        _pext_u64(identity_indices, vec_fpm_bytes);                           \
    /* Widen the packed byte indices to eight 32-bit integers. */             \
    (v_mask) = _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(vec_fpm_indices));      \
    (pack) += __builtin_popcount(mask);                                       \
  }
James Willis's avatar
James Willis committed
240
241

/* Performs a left-pack on a vector based upon a mask and returns the result. */
James Willis's avatar
James Willis committed
242
#define VEC_LEFT_PACK(a, mask, result) \
Matthieu Schaller's avatar
Matthieu Schaller committed
243
  vec_unaligned_store(_mm256_permutevar8x32_ps(a, mask), result)
James Willis's avatar
James Willis committed
244
245
246
#endif /* HAVE_AVX2 */

/* Create an FMA using vec_add and vec_mul if AVX2 is not present. */
247
248
249
#ifndef vec_fma
#define vec_fma(a, b, c) vec_add(vec_mul(a, b), c)
#endif
James Willis's avatar
James Willis committed
250
251

/* Form a packed mask without intrinsics if AVX2 is not present. */
252
#ifndef VEC_FORM_PACKED_MASK
James Willis's avatar
James Willis committed
253

James Willis's avatar
James Willis committed
254
255
/* Takes an integer mask and forms a left-packed integer vector containing the
 * indices of the set bits in the mask (lowest bit first), written to
 * v_mask.i. `pack` is advanced once per stored index, so it grows by the
 * number of set bits. The mask argument is parenthesised and the loop index
 * has a macro-private name so that compound or `i`-dependent arguments expand
 * correctly. */
#define VEC_FORM_PACKED_MASK(mask, v_mask, pack)                      \
  {                                                                   \
    for (int vec_fpm_bit = 0; vec_fpm_bit < VEC_SIZE; vec_fpm_bit++)  \
      if ((mask) & (1 << vec_fpm_bit))                                \
        (v_mask).i[(pack)++] = vec_fpm_bit;                           \
  }
James Willis's avatar
James Willis committed
262

James Willis's avatar
James Willis committed
263
264
/* Takes two integer masks and forms two left-packed integer vectors
 * containing the indices of the set bits in each corresponding mask (lowest
 * bit first), written to v_mask.i and v_mask2.i. `pack` and `pack2` are each
 * advanced once per index stored in their vector. Both masks are handled in a
 * single pass over the lanes. The mask arguments are parenthesised and the
 * loop index has a macro-private name so that compound or `i`-dependent
 * arguments expand correctly. */
#define VEC_FORM_PACKED_MASK_2(mask, v_mask, pack, mask2, v_mask2, pack2) \
  {                                                                       \
    for (int vec_fpm_bit = 0; vec_fpm_bit < VEC_SIZE; vec_fpm_bit++) {    \
      if ((mask) & (1 << vec_fpm_bit))                                    \
        (v_mask).i[(pack)++] = vec_fpm_bit;                               \
      if ((mask2) & (1 << vec_fpm_bit))                                   \
        (v_mask2).i[(pack2)++] = vec_fpm_bit;                             \
    }                                                                     \
  }
273
#endif
James Willis's avatar
James Willis committed
274
275

/* Performs a left-pack on a vector based upon a mask and returns the result. */
James Willis's avatar
James Willis committed
276
/* This uses AVX intrinsics, but this is slower than performing the left-pack
James Willis's avatar
James Willis committed
277
 * manually by looping over the vectors. */
278
#ifndef VEC_LEFT_PACK
James Willis's avatar
James Willis committed
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
/* AVX-only left-pack: permutes the eight lanes of `a` by the indices held in
 * `mask` and writes the result to `result`.
 *  - t2 is `a` with its two 128-bit halves swapped.
 *  - r0 and r1 permute `a` and t2 respectively, using `mask` as the selector.
 *  - The indices are shifted left by 29 so that bit 2 of each index (which
 *    half the wanted element lives in) lands in the top bit of its lane; the
 *    indices of the upper half are XORed with 4 first, inverting that bit.
 *  - kk joins the two shifted halves and drives the blend between r0 and r1.
 * The result is stored through a (__m256 *) cast of `result`.
 * NOTE(review): the store dereferences a __m256 pointer, so `result` is
 * presumably expected to be suitably aligned - confirm against callers. */
#define VEC_LEFT_PACK(a, mask, result)                                     \
  {                                                                        \
    __m256 t1 = _mm256_castps128_ps256(_mm256_extractf128_ps(a, 1));       \
    __m256 t2 = _mm256_insertf128_ps(t1, _mm256_castps256_ps128(a), 1);    \
    __m256 r0 = _mm256_permutevar_ps(a, mask);                             \
    __m256 r1 = _mm256_permutevar_ps(t2, mask);                            \
    __m128i k1 = _mm_slli_epi32(                                           \
        (__m128i)(_mm_xor_si128((__m128i)VEC_GET_HIGH((__m256)mask),       \
                                (__m128i)_mm_set1_epi32(4))),              \
        29);                                                               \
    __m128i k0 = _mm_slli_epi32((__m128i)(VEC_GET_LOW((__m256)mask)), 29); \
    __m256 kk =                                                            \
        _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(k0)), \
                             _mm_castsi128_ps(k1), 1);                     \
    *((__m256 *)(result)) = _mm256_blendv_ps(r0, r1, kk);                  \
  }
James Willis's avatar
James Willis committed
295
296
297
#endif /* VEC_LEFT_PACK */

/* SSE intrinsics*/
298
#elif defined(HAVE_SSE2)
299
300
301
302
303
#define VEC_SIZE 4
#define VEC_FLOAT __m128
#define VEC_DBL __m128d
#define VEC_INT __m128i
#define vec_load(a) _mm_load_ps(a)
James Willis's avatar
James Willis committed
304
#define vec_store(a, addr) _mm_store_ps(addr, a)
305
306
#define vec_setzero() _mm_setzero_ps()
/* All-zero integer vector; the 128-bit form matches VEC_INT (__m128i) in this
   branch. */
#define vec_setintzero() _mm_setzero_si128()
307
#define vec_set1(a) _mm_set1_ps(a)
308
#define vec_setint1(a) _mm_set1_epi32(a)
309
310
#define vec_set(a, b, c, d) _mm_set_ps(d, c, b, a)
#define vec_dbl_set(a, b) _mm_set_pd(b, a)
311
312
313
#define vec_add(a, b) _mm_add_ps(a, b)
#define vec_sub(a, b) _mm_sub_ps(a, b)
#define vec_mul(a, b) _mm_mul_ps(a, b)
314
315
316
317
318
319
320
#define vec_sqrt(a) _mm_sqrt_ps(a)
#define vec_rcp(a) _mm_rcp_ps(a)
#define vec_rsqrt(a) _mm_rsqrt_ps(a)
#define vec_ftoi(a) _mm_cvttps_epi32(a)
#define vec_fmin(a, b) _mm_min_ps(a, b)
#define vec_fmax(a, b) _mm_max_ps(a, b)
#define vec_fabs(a) _mm_andnot_ps(_mm_set1_ps(-0.f), a)
321
#define vec_floor(a) _mm_floor_ps(a)
322
#define vec_cmp_gt(a, b) _mm_cmpgt_ps(a, b)
323
324
325
#define vec_cmp_lt(a, b) _mm_cmplt_ps(a, b)
/* Lane-wise a <= b using the SSE compare intrinsic, in line with vec_cmp_gt and
   vec_cmp_lt in this branch (_mm_cmp_ps with a _CMP_* predicate is only
   provided from AVX onwards). */
#define vec_cmp_lte(a, b) _mm_cmple_ps(a, b)
#define vec_cmp_result(a) _mm_movemask_ps(a)
326
#define vec_and(a, b) _mm_and_ps(a, b)
327
328
329
330
331
332
333
334
335
336
337
#define vec_todbl_lo(a) _mm_cvtps_pd(a)
#define vec_todbl_hi(a) _mm_cvtps_pd(_mm_movehl_ps(a, a))
#define vec_dbl_tofloat(a, b) _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b))
#define vec_dbl_load(a) _mm_load_pd(a)
#define vec_dbl_set1(a) _mm_set1_pd(a)
#define vec_dbl_sqrt(a) _mm_sqrt_pd(a)
#define vec_dbl_rcp(a) _mm_rcp_pd(a)
#define vec_dbl_rsqrt(a) _mm_rsqrt_pd(a)
#define vec_dbl_ftoi(a) _mm_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm_max_pd(a, b)
James Willis's avatar
James Willis committed
338
339

/* Initialises a vector struct with a default value. */
Matthieu Schaller's avatar
Matthieu Schaller committed
340
341
#define FILL_VEC(a) \
  { .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a }
James Willis's avatar
James Willis committed
342
343

/* Performs a horizontal add on the vector and adds the result to a float.
 * One pairwise add leaves a[0]+a[1] in lane 0 and a[2]+a[3] in lane 1, which
 * are then added to `b`. Note that a.v is overwritten with the partial sums.
 * The body is braced so that the macro expands to a single statement.
 * NOTE(review): _mm_hadd_ps is an SSE3 intrinsic while this branch is guarded
 * by HAVE_SSE2 - confirm the build enables SSE3 here. */
#define VEC_HADD(a, b)                   \
  {                                      \
    (a).v = _mm_hadd_ps((a).v, (a).v);   \
    (b) += (a).f[0] + (a).f[1];          \
  }
James Willis's avatar
James Willis committed
347
348

/* Create an FMA using vec_add and vec_mul if AVX2 is not present. */
349
350
351
#ifndef vec_fma
#define vec_fma(a, b, c) vec_add(vec_mul(a, b), c)
#endif
352
353
#else
#define VEC_SIZE 4
James Willis's avatar
James Willis committed
354
#endif /* HAVE_SSE2 */
Pedro Gonnet's avatar
Pedro Gonnet committed
355

356
357
358
359
360
361
362
363
364
365
/* Define the composite types for element access. All members of the union
   share the same storage, so a value can be handled either as a whole SIMD
   vector or lane by lane through the arrays. */
typedef union {
  VEC_FLOAT v;            /* Whole-vector view, single precision. */
  VEC_DBL vd;             /* Whole-vector view, double precision. */
  VEC_INT m;              /* Whole-vector view, integer. */
  float f[VEC_SIZE];      /* Per-lane float access. */
  double d[VEC_SIZE / 2]; /* Per-lane double access (half as many lanes). */
  int i[VEC_SIZE];        /* Per-lane int access. */
} vector;

366
367
368
369
/* Define the mask type depending on the instruction set used. */
#ifdef HAVE_AVX512_F
typedef __mmask16 mask_t;
#else
370
typedef vector mask_t;
371
372
#endif

373
/**
 * @brief Computes the reciprocal (\f$1/x\f$) of every lane of a vector.
 *
 * Starts from the approximate reciprocal intrinsic and refines it with one
 * Newton iteration to obtain the correct level of accuracy.
 *
 * @param x #vector to be inverted.
 * @return #vector holding the refined reciprocal of @c x.
 */
__attribute__((always_inline)) INLINE vector vec_reciprocal(vector x) {

  /* Initial estimate r ~ 1/x from the intrinsic. */
  vector recip;
  recip.v = vec_rcp(x.v);

  /* Residual of the estimate: x * r - 1. */
  vector residual;
  residual.v = vec_fma(x.v, recip.v, vec_set1(-1.0f));

  /* Newton step: r <- r - r * (x * r - 1). */
  recip.v = vec_sub(recip.v, vec_mul(recip.v, residual.v));

  return recip;
}

/**
 * @brief Computes the reciprocal square root (\f$1/\sqrt{x}\f$) of every lane
 * of a vector.
 *
 * Starts from the approximate reciprocal-square-root intrinsic and refines it
 * with one Newton iteration to obtain the correct level of accuracy.
 *
 * @param x #vector whose reciprocal square root is wanted.
 * @return #vector holding the refined reciprocal square root of @c x.
 */
__attribute__((always_inline)) INLINE vector vec_reciprocal_sqrt(vector x) {

  /* Initial estimate y ~ 1/sqrt(x) from the intrinsic. */
  vector inv_sqrt;
  inv_sqrt.v = vec_rsqrt(x.v);

  /* Half of the estimate: 0.5 * y. */
  vector half_est;
  half_est.v = vec_mul(vec_set1(0.5f), inv_sqrt.v);

  /* Residual of the estimate: x * y^2 - 1. */
  vector residual;
  residual.v =
      vec_fma(x.v, vec_mul(inv_sqrt.v, inv_sqrt.v), vec_set1(-1.0f));

  /* Newton step: y <- y - 0.5 * y * (x * y^2 - 1). */
  inv_sqrt.v = vec_sub(inv_sqrt.v, vec_mul(half_est.v, residual.v));

  return inv_sqrt;
}

412
413
414
415
416
417
#else
/* Needed for cache alignment. */
#define VEC_SIZE 16
#endif /* WITH_VECTORIZATION */

#endif /* VEC_MACRO */
418
419

#endif /* SWIFT_VECTOR_H */