vector.h 7.48 KB
Newer Older
Pedro Gonnet's avatar
Pedro Gonnet committed
1
2
/*******************************************************************************
 * This file is part of SWIFT.
3
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
4
 *               2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
5
 *
Pedro Gonnet's avatar
Pedro Gonnet committed
6
7
8
9
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
10
 *
Pedro Gonnet's avatar
Pedro Gonnet committed
11
12
13
14
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
15
 *
Pedro Gonnet's avatar
Pedro Gonnet committed
16
17
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18
 *
Pedro Gonnet's avatar
Pedro Gonnet committed
19
 ******************************************************************************/
20
21
#ifndef SWIFT_VECTOR_H
#define SWIFT_VECTOR_H
Pedro Gonnet's avatar
Pedro Gonnet committed
22
23
24
25

/* Have I already read this file? */
#ifndef VEC_MACRO

26
27
#include "../config.h"

28
29
30
/* Need to check whether compiler supports this (IBM does not)
   This will prevent the macros to be defined and switch off
   explicit vectorization if the compiled does not support it */
31
#ifdef HAVE_IMMINTRIN_H
32
/* Include the header file with the intrinsics for Intel architecture. */
33
#include <immintrin.h>
34
#endif
Pedro Gonnet's avatar
Pedro Gonnet committed
35

36
37
38
/* Define the vector macro. */
#define VEC_MACRO(elcount, type) \
  __attribute__((vector_size((elcount) * sizeof(type)))) type
Pedro Gonnet's avatar
Pedro Gonnet committed
39

40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
/* So what will the vector size be? */
#ifdef __MIC__
#define VECTORIZE
#define VEC_HAVE_GATHER
#define VEC_SIZE 16
#define VEC_FLOAT __m512
#define VEC_DBL __m512d
#define VEC_INT __m512i
#define vec_load(a) _mm512_load_ps(a)
#define vec_set1(a) _mm512_set1_ps(a)
#define vec_set(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
  _mm512_set_ps(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a)
#define vec_dbl_set(a, b, c, d, e, f, g, h) \
  _mm512_set_pd(h, g, f, e, d, c, b, a)
#define vec_sqrt(a) _mm512_sqrt_ps(a)
#define vec_rcp(a) _mm512_rcp_ps(a)
#define vec_rsqrt(a) _mm512_rsqrt_ps(a)
#define vec_ftoi(a) _mm512_cvttps_epi32(a)
#define vec_fmin(a, b) _mm512_min_ps(a, b)
#define vec_fmax(a, b) _mm512_max_ps(a, b)
#define vec_fabs(a) _mm512_andnot_ps(_mm512_set1_ps(-0.f), a)
#define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1)
#define vec_dbl_load(a) _mm512_load_pd(a)
#define vec_dbl_set1(a) _mm512_set1_pd(a)
#define vec_dbl_sqrt(a) _mm512_sqrt_pd(a)
#define vec_dbl_rcp(a) _mm512_rcp_pd(a)
#define vec_dbl_rsqrt(a) _mm512_rsqrt_pd(a)
#define vec_dbl_ftoi(a) _mm512_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm512_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm512_max_pd(a, b)
#define vec_getoffsets(ptrs)                                                \
  _mm512_insertf64x4(                                                       \
      _mm512_insertf64x4(_mm512_setzero_pd(),                               \
                         _mm512_cvtepi64_epi32(_mm512_load_epi64(ptrs) -    \
                                               _mm512_set1_epi64(ptrs[0])), \
                         0),                                                \
      _mm512_cvtepi64_epi32(_mm512_load_epi64(&ptrs[4]) -                   \
                            _mm512_set1_epi64(ptrs[0])),                    \
      1)
#define vec_gather(base, offsets) _mm512_i32gather_ps(offsets.m, base, 1)
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#define FILL_VEC(constant) {.f[0] = constant, \
                           .f[1] = constant, \
                           .f[2] = constant, \
                           .f[3] = constant, \
                           .f[4] = constant, \
                           .f[5] = constant, \
                           .f[6] = constant, \
                           .f[7] = constant, \
                           .f[8] = constant, \
                           .f[9] = constant, \
                           .f[10] = constant, \
                           .f[11] = constant, \
                           .f[12] = constant, \
                           .f[13] = constant, \
                           .f[14] = constant, \
                           .f[15] = constant}
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#elif defined(NO__AVX__)
#define VECTORIZE
#define VEC_SIZE 8
#define VEC_FLOAT __m256
#define VEC_DBL __m256d
#define VEC_INT __m256i
#define vec_load(a) _mm256_load_ps(a)
#define vec_set1(a) _mm256_set1_ps(a)
#define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a)
#define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a)
#define vec_sqrt(a) _mm256_sqrt_ps(a)
#define vec_rcp(a) _mm256_rcp_ps(a)
#define vec_rsqrt(a) _mm256_rsqrt_ps(a)
#define vec_ftoi(a) _mm256_cvttps_epi32(a)
#define vec_fmin(a, b) _mm256_min_ps(a, b)
#define vec_fmax(a, b) _mm256_max_ps(a, b)
#define vec_fabs(a) _mm256_andnot_ps(_mm256_set1_ps(-0.f), a)
#define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1)
#define vec_dbl_load(a) _mm256_load_pd(a)
#define vec_dbl_set1(a) _mm256_set1_pd(a)
#define vec_dbl_sqrt(a) _mm256_sqrt_pd(a)
#define vec_dbl_rcp(a) _mm256_rcp_pd(a)
#define vec_dbl_rsqrt(a) _mm256_rsqrt_pd(a)
#define vec_dbl_ftoi(a) _mm256_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm256_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm256_max_pd(a, b)
126
127
128
129
130
131
132
133
#define FILL_VEC(constant) {.f[0] = constant, \
                           .f[1] = constant, \
                           .f[2] = constant, \
                           .f[3] = constant, \
                           .f[4] = constant, \
                           .f[5] = constant, \
                           .f[6] = constant, \
                           .f[7] = constant}
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#ifdef __AVX2__
#define VEC_HAVE_GATHER
#define vec_gather(base, offsets) _mm256_i32gather_ps(base, offsets.m, 1)
#endif
#elif defined(NO__SSE2__)
#define VECTORIZE
#define VEC_SIZE 4
#define VEC_FLOAT __m128
#define VEC_DBL __m128d
#define VEC_INT __m128i
#define vec_load(a) _mm_load_ps(a)
#define vec_set1(a) _mm_set1_ps(a)
#define vec_set(a, b, c, d) _mm_set_ps(d, c, b, a)
#define vec_dbl_set(a, b) _mm_set_pd(b, a)
#define vec_sqrt(a) _mm_sqrt_ps(a)
#define vec_rcp(a) _mm_rcp_ps(a)
#define vec_rsqrt(a) _mm_rsqrt_ps(a)
#define vec_ftoi(a) _mm_cvttps_epi32(a)
#define vec_fmin(a, b) _mm_min_ps(a, b)
#define vec_fmax(a, b) _mm_max_ps(a, b)
#define vec_fabs(a) _mm_andnot_ps(_mm_set1_ps(-0.f), a)
#define vec_todbl_lo(a) _mm_cvtps_pd(a)
#define vec_todbl_hi(a) _mm_cvtps_pd(_mm_movehl_ps(a, a))
#define vec_dbl_tofloat(a, b) _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b))
#define vec_dbl_load(a) _mm_load_pd(a)
#define vec_dbl_set1(a) _mm_set1_pd(a)
#define vec_dbl_sqrt(a) _mm_sqrt_pd(a)
#define vec_dbl_rcp(a) _mm_rcp_pd(a)
#define vec_dbl_rsqrt(a) _mm_rsqrt_pd(a)
#define vec_dbl_ftoi(a) _mm_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm_max_pd(a, b)
166
167
168
169
#define FILL_VEC(constant) {.f[0] = constant, \
                           .f[1] = constant, \
                           .f[2] = constant, \
                           .f[3] = constant}
170
171
172
#else
#define VEC_SIZE 4
#endif
Pedro Gonnet's avatar
Pedro Gonnet committed
173

174
175
176
177
178
179
180
181
182
183
/* Define the composite types for element access. */
#ifdef VECTORIZE
typedef union {
  VEC_FLOAT v;
  VEC_DBL vd;
  VEC_INT m;
  float f[VEC_SIZE];
  double d[VEC_SIZE / 2];
  int i[VEC_SIZE];
} vector;
Pedro Gonnet's avatar
Pedro Gonnet committed
184
#endif
185
186
187
188

#endif

#endif /* SWIFT_VECTOR_H */