vector.h 12.6 KB
Newer Older
Pedro Gonnet's avatar
Pedro Gonnet committed
1
2
/*******************************************************************************
 * This file is part of SWIFT.
3
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
4
 *               2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
5
 *
Pedro Gonnet's avatar
Pedro Gonnet committed
6
7
8
9
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
10
 *
Pedro Gonnet's avatar
Pedro Gonnet committed
11
12
13
14
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
15
 *
Pedro Gonnet's avatar
Pedro Gonnet committed
16
17
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18
 *
Pedro Gonnet's avatar
Pedro Gonnet committed
19
 ******************************************************************************/
20
21
#ifndef SWIFT_VECTOR_H
#define SWIFT_VECTOR_H
Pedro Gonnet's avatar
Pedro Gonnet committed
22
23
24
25

/* Have I already read this file? */
#ifndef VEC_MACRO

26
27
#include "../config.h"

28
29
#ifdef WITH_VECTORIZATION

30
31
32
/* Need to check whether compiler supports this (IBM does not)
   This will prevent the macros to be defined and switch off
   explicit vectorization if the compiled does not support it */
33
#ifdef HAVE_IMMINTRIN_H
34
/* Include the header file with the intrinsics for Intel architecture. */
35
#include <immintrin.h>
36
#endif
Pedro Gonnet's avatar
Pedro Gonnet committed
37

38
39
40
/* Define the vector macro. */
#define VEC_MACRO(elcount, type) \
  __attribute__((vector_size((elcount) * sizeof(type)))) type
Pedro Gonnet's avatar
Pedro Gonnet committed
41

42
/* So what will the vector size be? */
James Willis's avatar
James Willis committed
43
#ifdef HAVE_AVX512_F
44
45
46
47
48
#define VEC_HAVE_GATHER
#define VEC_SIZE 16
#define VEC_FLOAT __m512
#define VEC_DBL __m512d
#define VEC_INT __m512i
49
#define KNL_MASK_16 __mmask16
50
#define vec_load(a) _mm512_load_ps(a)
James Willis's avatar
James Willis committed
51
#define vec_store(a, addr) _mm512_store_ps(addr, a)
52
53
#define vec_setzero() _mm512_setzero_ps()
#define vec_setintzero() _mm512_setzero_epi32()
54
#define vec_set1(a) _mm512_set1_ps(a)
55
#define vec_setint1(a) _mm512_set1_epi32(a)
56
57
58
59
#define vec_set(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
  _mm512_set_ps(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a)
#define vec_dbl_set(a, b, c, d, e, f, g, h) \
  _mm512_set_pd(h, g, f, e, d, c, b, a)
60
61
62
63
#define vec_add(a, b) _mm512_add_ps(a, b)
#define vec_sub(a, b) _mm512_sub_ps(a, b)
#define vec_mul(a, b) _mm512_mul_ps(a, b)
#define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c)
64
#define vec_sqrt(a) _mm512_sqrt_ps(a)
65
66
#define vec_rcp(a) _mm512_rcp14_ps(a)
#define vec_rsqrt(a) _mm512_rsqrt14_ps(a)
67
68
69
70
#define vec_ftoi(a) _mm512_cvttps_epi32(a)
#define vec_fmin(a, b) _mm512_min_ps(a, b)
#define vec_fmax(a, b) _mm512_max_ps(a, b)
#define vec_fabs(a) _mm512_andnot_ps(_mm512_set1_ps(-0.f), a)
71
72
#define vec_floor(a) _mm512_floor_ps(a)
#define vec_cmp_gt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GT_OQ)
James Willis's avatar
James Willis committed
73
74
75
#define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ)
#define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ)
#define vec_and(a, b) _mm512_and_ps(a, b)
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1)
#define vec_dbl_load(a) _mm512_load_pd(a)
#define vec_dbl_set1(a) _mm512_set1_pd(a)
#define vec_dbl_sqrt(a) _mm512_sqrt_pd(a)
#define vec_dbl_rcp(a) _mm512_rcp_pd(a)
#define vec_dbl_rsqrt(a) _mm512_rsqrt_pd(a)
#define vec_dbl_ftoi(a) _mm512_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm512_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm512_max_pd(a, b)
#define vec_getoffsets(ptrs)                                                \
  _mm512_insertf64x4(                                                       \
      _mm512_insertf64x4(_mm512_setzero_pd(),                               \
                         _mm512_cvtepi64_epi32(_mm512_load_epi64(ptrs) -    \
                                               _mm512_set1_epi64(ptrs[0])), \
                         0),                                                \
      _mm512_cvtepi64_epi32(_mm512_load_epi64(&ptrs[4]) -                   \
                            _mm512_set1_epi64(ptrs[0])),                    \
      1)
#define vec_gather(base, offsets) _mm512_i32gather_ps(offsets.m, base, 1)
Matthieu Schaller's avatar
Matthieu Schaller committed
97
#define FILL_VEC(a)                                                     \
James Willis's avatar
James Willis committed
98
99
100
101
102
103
104
105
106
107
  {                                                                     \
    .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a,   \
    .f[6] = a, .f[7] = a, .f[8] = a, .f[9] = a, .f[10] = a, .f[11] = a, \
    .f[12] = a, .f[13] = a, .f[14] = a, .f[15] = a                      \
  }
#define VEC_HADD(a, b) b += _mm512_reduce_add_ps(a.v)
#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \
  pack += __builtin_popcount(mask);
#define VEC_LEFT_PACK(a, mask, result) \
  _mm512_mask_compressstoreu_ps(result, mask, a)
108
#elif defined(HAVE_AVX)
109
110
111
112
113
#define VEC_SIZE 8
#define VEC_FLOAT __m256
#define VEC_DBL __m256d
#define VEC_INT __m256i
#define vec_load(a) _mm256_load_ps(a)
James Willis's avatar
James Willis committed
114
#define vec_store(a, addr) _mm256_store_ps(addr, a)
115
116
#define vec_setzero() _mm256_setzero_ps()
#define vec_setintzero() _mm256_setzero_si256()
117
#define vec_set1(a) _mm256_set1_ps(a)
118
#define vec_setint1(a) _mm256_set1_epi32(a)
119
120
#define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a)
#define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a)
121
122
123
#define vec_add(a, b) _mm256_add_ps(a, b)
#define vec_sub(a, b) _mm256_sub_ps(a, b)
#define vec_mul(a, b) _mm256_mul_ps(a, b)
124
125
126
127
128
129
130
#define vec_sqrt(a) _mm256_sqrt_ps(a)
#define vec_rcp(a) _mm256_rcp_ps(a)
#define vec_rsqrt(a) _mm256_rsqrt_ps(a)
#define vec_ftoi(a) _mm256_cvttps_epi32(a)
#define vec_fmin(a, b) _mm256_min_ps(a, b)
#define vec_fmax(a, b) _mm256_max_ps(a, b)
#define vec_fabs(a) _mm256_andnot_ps(_mm256_set1_ps(-0.f), a)
131
132
133
134
135
#define vec_floor(a) _mm256_floor_ps(a)
#define vec_cmp_lt(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
#define vec_cmp_gt(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
#define vec_cmp_lte(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
#define vec_cmp_result(a) _mm256_movemask_ps(a)
James Willis's avatar
James Willis committed
136
#define vec_and(a, b) _mm256_and_ps(a, b)
137
138
139
140
141
142
143
144
145
146
147
#define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1)
#define vec_dbl_load(a) _mm256_load_pd(a)
#define vec_dbl_set1(a) _mm256_set1_pd(a)
#define vec_dbl_sqrt(a) _mm256_sqrt_pd(a)
#define vec_dbl_rcp(a) _mm256_rcp_pd(a)
#define vec_dbl_rsqrt(a) _mm256_rsqrt_pd(a)
#define vec_dbl_ftoi(a) _mm256_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm256_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm256_max_pd(a, b)
Matthieu Schaller's avatar
Matthieu Schaller committed
148
149
150
151
152
#define FILL_VEC(a)                                                   \
  {                                                                   \
    .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \
    .f[6] = a, .f[7] = a                                              \
  }
James Willis's avatar
James Willis committed
153
#define VEC_HADD(a, b)            \
154
155
156
157
  a.v = _mm256_hadd_ps(a.v, a.v); \
  a.v = _mm256_hadd_ps(a.v, a.v); \
  b += a.f[0] + a.f[4];
#define VEC_GET_LOW(a) _mm256_castps256_ps128(a)
James Willis's avatar
James Willis committed
158
#define VEC_GET_HIGH(a) _mm256_extractf128_ps(a, 1)
159
#ifdef HAVE_AVX2
160
161
#define vec_fma(a, b, c) _mm256_fmadd_ps(a, b, c)
#define identity_indices 0x0706050403020100
162
163
#define VEC_HAVE_GATHER
#define vec_gather(base, offsets) _mm256_i32gather_ps(base, offsets.m, 1)
James Willis's avatar
James Willis committed
164
165
166
167
168
169
170
171
172
173
174
#define VEC_FORM_PACKED_MASK(mask, v_mask, pack)                               \
  {                                                                            \
    unsigned long expanded_mask = _pdep_u64(mask, 0x0101010101010101);         \
    expanded_mask *= 0xFF;                                                     \
    unsigned long wanted_indices = _pext_u64(identity_indices, expanded_mask); \
    __m128i bytevec = _mm_cvtsi64_si128(wanted_indices);                       \
    v_mask = _mm256_cvtepu8_epi32(bytevec);                                    \
    pack += __builtin_popcount(mask);                                          \
  }
#define VEC_LEFT_PACK(a, mask, result) \
  *((__m256 *)(result)) = _mm256_permutevar8x32_ps(a, mask)
175
176
177
178
179
#endif
#ifndef vec_fma
#define vec_fma(a, b, c) vec_add(vec_mul(a, b), c)
#endif
#ifndef VEC_FORM_PACKED_MASK
James Willis's avatar
James Willis committed
180
181
182
183
184
185
186
187
188
189
190
191
#define VEC_FORM_PACKED_MASK(mask, v_mask, pack)   \
  {                                                \
    for (int i = 0; i < VEC_SIZE; i++)             \
      if ((mask & (1 << i))) v_mask.i[pack++] = i; \
  }
#define VEC_FORM_PACKED_MASK_2(mask, v_mask, pack, mask2, v_mask2, pack2) \
  {                                                                       \
    for (int i = 0; i < VEC_SIZE; i++) {                                  \
      if ((mask & (1 << i))) v_mask.i[pack++] = i;                        \
      if ((mask2 & (1 << i))) v_mask2.i[pack2++] = i;                     \
    }                                                                     \
  }
192
193
#endif
#ifndef VEC_LEFT_PACK
James Willis's avatar
James Willis committed
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#define VEC_LEFT_PACK(a, mask, result)                                     \
  {                                                                        \
    __m256 t1 = _mm256_castps128_ps256(_mm256_extractf128_ps(a, 1));       \
    __m256 t2 = _mm256_insertf128_ps(t1, _mm256_castps256_ps128(a), 1);    \
    __m256 r0 = _mm256_permutevar_ps(a, mask);                             \
    __m256 r1 = _mm256_permutevar_ps(t2, mask);                            \
    __m128i k1 = _mm_slli_epi32(                                           \
        (__m128i)(_mm_xor_si128((__m128i)VEC_GET_HIGH((__m256)mask),       \
                                (__m128i)_mm_set1_epi32(4))),              \
        29);                                                               \
    __m128i k0 = _mm_slli_epi32((__m128i)(VEC_GET_LOW((__m256)mask)), 29); \
    __m256 kk =                                                            \
        _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(k0)), \
                             _mm_castsi128_ps(k1), 1);                     \
    *((__m256 *)(result)) = _mm256_blendv_ps(r0, r1, kk);                  \
  }
210
#endif
211
#elif defined(HAVE_SSE2)
212
213
214
215
216
#define VEC_SIZE 4
#define VEC_FLOAT __m128
#define VEC_DBL __m128d
#define VEC_INT __m128i
#define vec_load(a) _mm_load_ps(a)
James Willis's avatar
James Willis committed
217
#define vec_store(a, addr) _mm_store_ps(addr, a)
218
219
220
221
222
223
224
225
226
227
#define vec_set1(a) _mm_set1_ps(a)
#define vec_set(a, b, c, d) _mm_set_ps(d, c, b, a)
#define vec_dbl_set(a, b) _mm_set_pd(b, a)
#define vec_sqrt(a) _mm_sqrt_ps(a)
#define vec_rcp(a) _mm_rcp_ps(a)
#define vec_rsqrt(a) _mm_rsqrt_ps(a)
#define vec_ftoi(a) _mm_cvttps_epi32(a)
#define vec_fmin(a, b) _mm_min_ps(a, b)
#define vec_fmax(a, b) _mm_max_ps(a, b)
#define vec_fabs(a) _mm_andnot_ps(_mm_set1_ps(-0.f), a)
228
229
230
231
#define vec_floor(a) _mm_floor_ps(a)
#define vec_cmp_lt(a, b) _mm_cmplt_ps(a, b)
#define vec_cmp_lte(a, b) _mm_cmp_ps(a, b, _CMP_LE_OQ)
#define vec_cmp_result(a) _mm_movemask_ps(a)
232
233
234
235
236
237
238
239
240
241
242
#define vec_todbl_lo(a) _mm_cvtps_pd(a)
#define vec_todbl_hi(a) _mm_cvtps_pd(_mm_movehl_ps(a, a))
#define vec_dbl_tofloat(a, b) _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b))
#define vec_dbl_load(a) _mm_load_pd(a)
#define vec_dbl_set1(a) _mm_set1_pd(a)
#define vec_dbl_sqrt(a) _mm_sqrt_pd(a)
#define vec_dbl_rcp(a) _mm_rcp_pd(a)
#define vec_dbl_rsqrt(a) _mm_rsqrt_pd(a)
#define vec_dbl_ftoi(a) _mm_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm_max_pd(a, b)
Matthieu Schaller's avatar
Matthieu Schaller committed
243
244
#define FILL_VEC(a) \
  { .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a }
245
246
247
#else
#define VEC_SIZE 4
#endif
Pedro Gonnet's avatar
Pedro Gonnet committed
248

249
250
251
252
253
254
255
256
257
258
/* Define the composite types for element access. */
typedef union {
  VEC_FLOAT v;
  VEC_DBL vd;
  VEC_INT m;
  float f[VEC_SIZE];
  double d[VEC_SIZE / 2];
  int i[VEC_SIZE];
} vector;

259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
/**
 * @brief Calculates the inverse ($1/x$) of a vector using intrinsics and a Newton iteration to obtain the correct level of accuracy.
 *
 * @param x #vector to be inverted.
 * @return x_inv #vector inverted x.
 */
__attribute__((always_inline)) INLINE vector vec_reciprocal(vector x) {

  vector x_inv;

  x_inv.v = vec_rcp(x.v);
  x_inv.v = vec_sub(x_inv.v, vec_mul(x_inv.v, (vec_fma(x.v, x_inv.v, vec_set1(-1.0f)))));

  return x_inv;
}

/**
 * @brief Calculates the inverse and square root ($1/\sqrt{x}$) of a vector using intrinsics and a Newton iteration to obtain the correct level of accuracy.
 *
 * @param x #vector to be inverted.
 * @return x_inv #vector inverted x.
 */
__attribute__((always_inline)) INLINE vector vec_reciprocal_sqrt(vector x) {

  vector x_inv;

  x_inv.v = vec_rsqrt(x.v);
  x_inv.v = vec_sub(x_inv.v, vec_mul(vec_mul(vec_set1(0.5f), x_inv.v), (vec_fma(x.v, vec_mul(x_inv.v, x_inv.v), vec_set1(-1.0f)))));
  
  return x_inv;
}

291
292
293
294
295
296
#else
/* Needed for cache alignment. */
#define VEC_SIZE 16
#endif /* WITH_VECTORIZATION */

#endif /* VEC_MACRO */
297
298

#endif /* SWIFT_VECTOR_H */