/*******************************************************************************
 * This file is part of SWIFT.
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
 *               2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/
#ifndef SWIFT_VECTOR_H
#define SWIFT_VECTOR_H

/* Have I already read this file? */
#ifndef VEC_MACRO

#include "../config.h"

#ifdef WITH_VECTORIZATION

/* Need to check whether the compiler supports this (IBM's does not).
   This prevents the macros from being defined and switches off
   explicit vectorization if the compiler does not support it. */
#ifdef HAVE_IMMINTRIN_H
/* Include the header file with the intrinsics for Intel architecture. */
#include <immintrin.h>
#endif

/* Define the vector macro. */
#define VEC_MACRO(elcount, type) \
  __attribute__((vector_size((elcount) * sizeof(type)))) type
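
/* Illustrative sketch (not part of the SWIFT API): with the GCC/Clang
 * vector_size extension the macro yields a native SIMD type supporting
 * element-wise arithmetic. The type name below is hypothetical.
 *
 *   typedef VEC_MACRO(4, float) float4;
 *   float4 a = {1.f, 2.f, 3.f, 4.f};
 *   float4 b = a + a;   // element-wise: {2.f, 4.f, 6.f, 8.f}
 */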

/* Define vector reciprocals. vec_rcp and vec_rsqrt do not have the level of
 * accuracy we need, so an extra Newton-Raphson iteration is performed to
 * refine the initial estimate. */
#define VEC_RECIPROCAL(x, x_inv)                                          \
  x_inv = vec_rcp(x);                                                     \
  x_inv = vec_sub(x_inv, vec_mul(x_inv, vec_fma(x, x_inv, vec_set1(-1.0f))))

#define VEC_RECIPROCAL_SQRT(x, x_inv)                                     \
  x_inv = vec_rsqrt(x);                                                   \
  x_inv = vec_sub(x_inv,                                                  \
                  vec_mul(vec_mul(vec_set1(0.5f), x_inv),                 \
                          vec_fma(x, vec_mul(x_inv, x_inv), vec_set1(-1.0f))))
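
/* Sketch of the refinement above: one Newton-Raphson step. For an estimate
 * x_n of 1/x,
 *
 *   x_{n+1} = x_n - x_n * (x * x_n - 1),
 *
 * and for an estimate x_n of 1/sqrt(x),
 *
 *   x_{n+1} = x_n - 0.5 * x_n * (x * x_n * x_n - 1),
 *
 * which is what the vec_fma/vec_sub chains above evaluate. */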
  
/* So what will the vector size be? */
#ifdef HAVE_AVX512_F
#define VEC_HAVE_GATHER
#define VEC_SIZE 16
#define VEC_FLOAT __m512
#define VEC_DBL __m512d
#define VEC_INT __m512i
#define KNL_MASK_16 __mmask16
#define vec_load(a) _mm512_load_ps(a)
#define vec_store(a,addr) _mm512_store_ps(addr,a)
#define vec_setzero() _mm512_setzero_ps()
#define vec_setintzero() _mm512_setzero_epi32()
#define vec_set1(a) _mm512_set1_ps(a)
#define vec_setint1(a) _mm512_set1_epi32(a)
#define vec_set(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
  _mm512_set_ps(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a)
#define vec_dbl_set(a, b, c, d, e, f, g, h) \
  _mm512_set_pd(h, g, f, e, d, c, b, a)
#define vec_add(a, b) _mm512_add_ps(a, b)
#define vec_sub(a, b) _mm512_sub_ps(a, b)
#define vec_mul(a, b) _mm512_mul_ps(a, b)
#define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c)
#define vec_sqrt(a) _mm512_sqrt_ps(a)
#define vec_rcp(a) _mm512_rcp14_ps(a)
#define vec_rsqrt(a) _mm512_rsqrt14_ps(a)
#define vec_ftoi(a) _mm512_cvttps_epi32(a)
#define vec_fmin(a, b) _mm512_min_ps(a, b)
#define vec_fmax(a, b) _mm512_max_ps(a, b)
#define vec_fabs(a) _mm512_andnot_ps(_mm512_set1_ps(-0.f), a)
#define vec_floor(a) _mm512_floor_ps(a)
#define vec_cmp_gt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GT_OQ)
#define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ)
#define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ)
#define vec_and(a, b) _mm512_and_ps(a, b)
#define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_castps512_ps256(a))
#define vec_todbl_hi(a) \
  _mm512_cvtps_pd(_mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), 1)))
#define vec_dbl_tofloat(a, b)                                            \
  _mm512_castpd_ps(_mm512_insertf64x4(                                   \
      _mm512_castps_pd(_mm512_castps256_ps512(_mm512_cvtpd_ps(a))),      \
      _mm256_castps_pd(_mm512_cvtpd_ps(b)), 1))
#define vec_dbl_load(a) _mm512_load_pd(a)
#define vec_dbl_set1(a) _mm512_set1_pd(a)
#define vec_dbl_sqrt(a) _mm512_sqrt_pd(a)
#define vec_dbl_rcp(a) _mm512_rcp14_pd(a)
#define vec_dbl_rsqrt(a) _mm512_rsqrt14_pd(a)
#define vec_dbl_ftoi(a) _mm512_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm512_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm512_max_pd(a, b)
/* Byte offsets of 16 pointers relative to ptrs[0]; assumes the offsets fit
 * in 32 bits. */
#define vec_getoffsets(ptrs)                                                 \
  _mm512_inserti64x4(                                                        \
      _mm512_castsi256_si512(_mm512_cvtepi64_epi32(_mm512_sub_epi64(         \
          _mm512_load_epi64(ptrs),                                           \
          _mm512_set1_epi64((long long)(ptrs)[0])))),                        \
      _mm512_cvtepi64_epi32(_mm512_sub_epi64(                                \
          _mm512_load_epi64(&(ptrs)[8]),                                     \
          _mm512_set1_epi64((long long)(ptrs)[0]))),                         \
      1)
#define vec_gather(base, offsets) _mm512_i32gather_ps(offsets.m, base, 1)
#define FILL_VEC(a)                                                     \
  {                                                                     \
    .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a,   \
    .f[6] = a, .f[7] = a, .f[8] = a, .f[9] = a, .f[10] = a, .f[11] = a, \
    .f[12] = a, .f[13] = a, .f[14] = a, .f[15] = a                      \
  }
#define VEC_HADD(a,b) b += _mm512_reduce_add_ps(a.v)
#define VEC_FORM_PACKED_MASK(mask,v_mask,pack) pack += __builtin_popcount(mask);
#define VEC_LEFT_PACK(a,mask,result) _mm512_mask_compressstoreu_ps(result, mask, a)
#elif defined(HAVE_AVX)
#define VEC_SIZE 8
#define VEC_FLOAT __m256
#define VEC_DBL __m256d
#define VEC_INT __m256i
#define vec_load(a) _mm256_load_ps(a)
#define vec_store(a,addr) _mm256_store_ps(addr,a)
#define vec_setzero() _mm256_setzero_ps()
#define vec_setintzero() _mm256_setzero_si256()
#define vec_set1(a) _mm256_set1_ps(a)
#define vec_setint1(a) _mm256_set1_epi32(a)
#define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a)
#define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a)
#define vec_add(a, b) _mm256_add_ps(a, b)
#define vec_sub(a, b) _mm256_sub_ps(a, b)
#define vec_mul(a, b) _mm256_mul_ps(a, b)
#define vec_sqrt(a) _mm256_sqrt_ps(a)
#define vec_rcp(a) _mm256_rcp_ps(a)
#define vec_rsqrt(a) _mm256_rsqrt_ps(a)
#define vec_ftoi(a) _mm256_cvttps_epi32(a)
#define vec_fmin(a, b) _mm256_min_ps(a, b)
#define vec_fmax(a, b) _mm256_max_ps(a, b)
#define vec_fabs(a) _mm256_andnot_ps(_mm256_set1_ps(-0.f), a)
#define vec_floor(a) _mm256_floor_ps(a)
#define vec_cmp_lt(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
#define vec_cmp_gt(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
#define vec_cmp_lte(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
#define vec_cmp_result(a) _mm256_movemask_ps(a)
#define vec_and(a,b) _mm256_and_ps(a, b)
#define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extractf128_ps(a, 0))
#define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extractf128_ps(a, 1))
#define vec_dbl_tofloat(a, b)                                      \
  _mm256_insertf128_ps(_mm256_castps128_ps256(_mm256_cvtpd_ps(a)), \
                       _mm256_cvtpd_ps(b), 1)
#define vec_dbl_load(a) _mm256_load_pd(a)
#define vec_dbl_set1(a) _mm256_set1_pd(a)
#define vec_dbl_sqrt(a) _mm256_sqrt_pd(a)
/* AVX has no double-precision rcp/rsqrt; fall back to exact division. */
#define vec_dbl_rcp(a) _mm256_div_pd(_mm256_set1_pd(1.0), a)
#define vec_dbl_rsqrt(a) _mm256_div_pd(_mm256_set1_pd(1.0), _mm256_sqrt_pd(a))
#define vec_dbl_ftoi(a) _mm256_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm256_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm256_max_pd(a, b)
#define FILL_VEC(a)                                                   \
  {                                                                   \
    .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \
    .f[6] = a, .f[7] = a                                              \
  }
#define VEC_HADD(a,b) \
  a.v = _mm256_hadd_ps(a.v, a.v); \
  a.v = _mm256_hadd_ps(a.v, a.v); \
  b += a.f[0] + a.f[4];
#define VEC_GET_LOW(a) _mm256_castps256_ps128(a)
#define VEC_GET_HIGH(a) _mm256_extractf128_ps(a,1)
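
/* Illustrative sketch (hypothetical values): the two in-lane hadd passes
 * above leave the partial sums in f[0] and f[4], so their sum is the total.
 *
 *   vector a = FILL_VEC(1.f);
 *   float sum = 0.f;
 *   VEC_HADD(a, sum);   // sum == 8.f for the 8 lanes of an AVX vector
 */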
#ifdef HAVE_AVX2
#define vec_fma(a, b, c) _mm256_fmadd_ps(a, b, c)
#define identity_indices 0x0706050403020100
#define VEC_HAVE_GATHER
#define vec_gather(base, offsets) _mm256_i32gather_ps(base, offsets.m, 1)
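
/* The left-pack below uses a BMI2 trick: _pdep_u64 expands each bit of the
 * comparison mask into a full byte, _pext_u64 then pulls the matching byte
 * indices out of identity_indices, and the packed bytes are widened into a
 * shuffle vector for _mm256_permutevar8x32_ps. For example, mask 0b00100101
 * leaves the indices 0, 2, 5 in the low lanes of v_mask. */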
#define VEC_FORM_PACKED_MASK(mask,v_mask,pack)                                \
{                                                                             \
  unsigned long expanded_mask = _pdep_u64(mask, 0x0101010101010101);          \
  expanded_mask *= 0xFF;                                                      \
  unsigned long wanted_indices = _pext_u64(identity_indices, expanded_mask);  \
  __m128i bytevec = _mm_cvtsi64_si128(wanted_indices);                        \
  v_mask = _mm256_cvtepu8_epi32(bytevec);                                     \
  pack += __builtin_popcount(mask);                                           \
}
#define VEC_LEFT_PACK(a,mask,result) *((__m256 *)(result)) = _mm256_permutevar8x32_ps(a,mask)
#endif
#ifndef vec_fma
#define vec_fma(a, b, c) vec_add(vec_mul(a, b), c)
#endif
#ifndef VEC_FORM_PACKED_MASK
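/* Scalar fallback: walk the bitmask and record the index of each set lane. */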
#define VEC_FORM_PACKED_MASK(mask, v_mask, pack)   \
  {                                                \
    for (int i = 0; i < VEC_SIZE; i++)             \
      if ((mask) & (1 << i)) v_mask.i[pack++] = i; \
  }
#define VEC_FORM_PACKED_MASK_2(mask, v_mask, pack, mask2, v_mask2, pack2) \
  {                                                                       \
    for (int i = 0; i < VEC_SIZE; i++) {                                  \
      if ((mask) & (1 << i)) v_mask.i[pack++] = i;                        \
      if ((mask2) & (1 << i)) v_mask2.i[pack2++] = i;                     \
    }                                                                     \
  }
#endif
#ifndef VEC_LEFT_PACK
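/* AVX1 fallback: _mm256_permutevar_ps only shuffles within 128-bit halves,
 * so permute both the original and the lane-swapped source, then use bit 2
 * of each index to blend the correct half into place. */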
#define VEC_LEFT_PACK(a,mask,result)                                                                                          \
{                                                                                                                             \
  __m256 t1 = _mm256_castps128_ps256(_mm256_extractf128_ps(a, 1));                                                            \
  __m256 t2 = _mm256_insertf128_ps(t1, _mm256_castps256_ps128(a), 1);                                                         \
  __m256 r0 = _mm256_permutevar_ps(a, mask);                                                                                  \
  __m256 r1 = _mm256_permutevar_ps(t2, mask);                                                                                 \
  __m128i k1 = _mm_slli_epi32((__m128i)(_mm_xor_si128((__m128i)VEC_GET_HIGH((__m256)mask),(__m128i)_mm_set1_epi32(4))), 29);  \
  __m128i k0 = _mm_slli_epi32((__m128i)(VEC_GET_LOW((__m256)mask)), 29);                                                      \
  __m256 kk = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(k0)),                                              \
                                                          _mm_castsi128_ps(k1), 1);                                           \
  *((__m256 *)(result)) = _mm256_blendv_ps(r0, r1, kk);                                                                       \
}
#endif
#elif defined(HAVE_SSE2)
#define VEC_SIZE 4
#define VEC_FLOAT __m128
#define VEC_DBL __m128d
#define VEC_INT __m128i
#define vec_load(a) _mm_load_ps(a)
#define vec_store(a,addr) _mm_store_ps(addr,a)
#define vec_set1(a) _mm_set1_ps(a)
#define vec_set(a, b, c, d) _mm_set_ps(d, c, b, a)
#define vec_dbl_set(a, b) _mm_set_pd(b, a)
#define vec_sqrt(a) _mm_sqrt_ps(a)
#define vec_rcp(a) _mm_rcp_ps(a)
#define vec_rsqrt(a) _mm_rsqrt_ps(a)
#define vec_ftoi(a) _mm_cvttps_epi32(a)
#define vec_fmin(a, b) _mm_min_ps(a, b)
#define vec_fmax(a, b) _mm_max_ps(a, b)
#define vec_fabs(a) _mm_andnot_ps(_mm_set1_ps(-0.f), a)
#define vec_floor(a) _mm_floor_ps(a) /* Note: requires SSE4.1. */
#define vec_cmp_lt(a, b) _mm_cmplt_ps(a, b)
#define vec_cmp_lte(a, b) _mm_cmple_ps(a, b)
#define vec_cmp_result(a) _mm_movemask_ps(a)
#define vec_todbl_lo(a) _mm_cvtps_pd(a)
#define vec_todbl_hi(a) _mm_cvtps_pd(_mm_movehl_ps(a, a))
#define vec_dbl_tofloat(a, b) _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b))
#define vec_dbl_load(a) _mm_load_pd(a)
#define vec_dbl_set1(a) _mm_set1_pd(a)
#define vec_dbl_sqrt(a) _mm_sqrt_pd(a)
/* SSE has no double-precision rcp/rsqrt; fall back to exact division. */
#define vec_dbl_rcp(a) _mm_div_pd(_mm_set1_pd(1.0), a)
#define vec_dbl_rsqrt(a) _mm_div_pd(_mm_set1_pd(1.0), _mm_sqrt_pd(a))
#define vec_dbl_ftoi(a) _mm_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm_max_pd(a, b)
#define FILL_VEC(a) \
  { .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a }
#else
#define VEC_SIZE 4
#endif

/* Define the composite types for element access. */
typedef union {
  VEC_FLOAT v;
  VEC_DBL vd;
  VEC_INT m;
  float f[VEC_SIZE];
  double d[VEC_SIZE / 2];
  int i[VEC_SIZE];
} vector;
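
/* Illustrative sketch (assumes an AVX build, VEC_SIZE == 8): the union gives
 * lane-by-lane access to SIMD results.
 *
 *   vector r;
 *   r.v = vec_set1(2.f);                                   // SIMD member
 *   float first = r.f[0];                                  // scalar view
 *   int bits = vec_cmp_result(vec_cmp_lt(r.v, vec_set1(3.f)));
 *   // bits == 0xFF: every lane satisfied the comparison
 */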

#else
/* Needed for cache alignment. */
#define VEC_SIZE 16
#endif /* WITH_VECTORIZATION */

#endif /* VEC_MACRO */

#endif /* SWIFT_VECTOR_H */