vector.h 6.96 KB
Newer Older
Pedro Gonnet's avatar
Pedro Gonnet committed
1
2
/*******************************************************************************
 * This file is part of SWIFT.
3
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
4
 *               2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
5
 *
Pedro Gonnet's avatar
Pedro Gonnet committed
6
7
8
9
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
10
 *
Pedro Gonnet's avatar
Pedro Gonnet committed
11
12
13
14
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
15
 *
Pedro Gonnet's avatar
Pedro Gonnet committed
16
17
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18
 *
Pedro Gonnet's avatar
Pedro Gonnet committed
19
 ******************************************************************************/
20
21
#ifndef SWIFT_VECTOR_H
#define SWIFT_VECTOR_H
Pedro Gonnet's avatar
Pedro Gonnet committed
22
23
24
25

/* Have I already read this file? */
#ifndef VEC_MACRO

26
27
#include "../config.h"

28
29
#ifdef WITH_VECTORIZATION

30
31
32
/* Need to check whether compiler supports this (IBM does not)
   This will prevent the macros to be defined and switch off
   explicit vectorization if the compiled does not support it */
33
#ifdef HAVE_IMMINTRIN_H
34
/* Include the header file with the intrinsics for Intel architecture. */
35
#include <immintrin.h>
36
#endif
Pedro Gonnet's avatar
Pedro Gonnet committed
37

38
39
40
/* Define the vector macro. */
#define VEC_MACRO(elcount, type) \
  __attribute__((vector_size((elcount) * sizeof(type)))) type
Pedro Gonnet's avatar
Pedro Gonnet committed
41

42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
/* So what will the vector size be? */
#ifdef __MIC__
#define VEC_HAVE_GATHER
#define VEC_SIZE 16
#define VEC_FLOAT __m512
#define VEC_DBL __m512d
#define VEC_INT __m512i
#define vec_load(a) _mm512_load_ps(a)
#define vec_set1(a) _mm512_set1_ps(a)
#define vec_set(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
  _mm512_set_ps(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a)
#define vec_dbl_set(a, b, c, d, e, f, g, h) \
  _mm512_set_pd(h, g, f, e, d, c, b, a)
#define vec_sqrt(a) _mm512_sqrt_ps(a)
#define vec_rcp(a) _mm512_rcp_ps(a)
#define vec_rsqrt(a) _mm512_rsqrt_ps(a)
#define vec_ftoi(a) _mm512_cvttps_epi32(a)
#define vec_fmin(a, b) _mm512_min_ps(a, b)
#define vec_fmax(a, b) _mm512_max_ps(a, b)
#define vec_fabs(a) _mm512_andnot_ps(_mm512_set1_ps(-0.f), a)
#define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1)
#define vec_dbl_load(a) _mm512_load_pd(a)
#define vec_dbl_set1(a) _mm512_set1_pd(a)
#define vec_dbl_sqrt(a) _mm512_sqrt_pd(a)
#define vec_dbl_rcp(a) _mm512_rcp_pd(a)
#define vec_dbl_rsqrt(a) _mm512_rsqrt_pd(a)
#define vec_dbl_ftoi(a) _mm512_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm512_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm512_max_pd(a, b)
#define vec_getoffsets(ptrs)                                                \
  _mm512_insertf64x4(                                                       \
      _mm512_insertf64x4(_mm512_setzero_pd(),                               \
                         _mm512_cvtepi64_epi32(_mm512_load_epi64(ptrs) -    \
                                               _mm512_set1_epi64(ptrs[0])), \
                         0),                                                \
      _mm512_cvtepi64_epi32(_mm512_load_epi64(&ptrs[4]) -                   \
                            _mm512_set1_epi64(ptrs[0])),                    \
      1)
#define vec_gather(base, offsets) _mm512_i32gather_ps(offsets.m, base, 1)
Matthieu Schaller's avatar
Matthieu Schaller committed
83
84
85
86
87
88
#define FILL_VEC(a)                                                     \
  {                                                                     \
    .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a,   \
    .f[6] = a, .f[7] = a, .f[8] = a, .f[9] = a, .f[10] = a, .f[11] = a, \
    .f[12] = a, .f[13] = a, .f[14] = a, .f[15] = a                      \
  }
89
#elif defined(HAVE_AVX)
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#define VEC_SIZE 8
#define VEC_FLOAT __m256
#define VEC_DBL __m256d
#define VEC_INT __m256i
#define vec_load(a) _mm256_load_ps(a)
#define vec_set1(a) _mm256_set1_ps(a)
#define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a)
#define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a)
#define vec_sqrt(a) _mm256_sqrt_ps(a)
#define vec_rcp(a) _mm256_rcp_ps(a)
#define vec_rsqrt(a) _mm256_rsqrt_ps(a)
#define vec_ftoi(a) _mm256_cvttps_epi32(a)
#define vec_fmin(a, b) _mm256_min_ps(a, b)
#define vec_fmax(a, b) _mm256_max_ps(a, b)
#define vec_fabs(a) _mm256_andnot_ps(_mm256_set1_ps(-0.f), a)
#define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1)
#define vec_dbl_load(a) _mm256_load_pd(a)
#define vec_dbl_set1(a) _mm256_set1_pd(a)
#define vec_dbl_sqrt(a) _mm256_sqrt_pd(a)
#define vec_dbl_rcp(a) _mm256_rcp_pd(a)
#define vec_dbl_rsqrt(a) _mm256_rsqrt_pd(a)
#define vec_dbl_ftoi(a) _mm256_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm256_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm256_max_pd(a, b)
Matthieu Schaller's avatar
Matthieu Schaller committed
116
117
118
119
120
#define FILL_VEC(a)                                                   \
  {                                                                   \
    .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \
    .f[6] = a, .f[7] = a                                              \
  }
121
#ifdef HAVE_AVX2
122
123
124
#define VEC_HAVE_GATHER
#define vec_gather(base, offsets) _mm256_i32gather_ps(base, offsets.m, 1)
#endif
125
#elif defined(HAVE_SSE2)
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#define VEC_SIZE 4
#define VEC_FLOAT __m128
#define VEC_DBL __m128d
#define VEC_INT __m128i
#define vec_load(a) _mm_load_ps(a)
#define vec_set1(a) _mm_set1_ps(a)
#define vec_set(a, b, c, d) _mm_set_ps(d, c, b, a)
#define vec_dbl_set(a, b) _mm_set_pd(b, a)
#define vec_sqrt(a) _mm_sqrt_ps(a)
#define vec_rcp(a) _mm_rcp_ps(a)
#define vec_rsqrt(a) _mm_rsqrt_ps(a)
#define vec_ftoi(a) _mm_cvttps_epi32(a)
#define vec_fmin(a, b) _mm_min_ps(a, b)
#define vec_fmax(a, b) _mm_max_ps(a, b)
#define vec_fabs(a) _mm_andnot_ps(_mm_set1_ps(-0.f), a)
#define vec_todbl_lo(a) _mm_cvtps_pd(a)
#define vec_todbl_hi(a) _mm_cvtps_pd(_mm_movehl_ps(a, a))
#define vec_dbl_tofloat(a, b) _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b))
#define vec_dbl_load(a) _mm_load_pd(a)
#define vec_dbl_set1(a) _mm_set1_pd(a)
#define vec_dbl_sqrt(a) _mm_sqrt_pd(a)
#define vec_dbl_rcp(a) _mm_rcp_pd(a)
#define vec_dbl_rsqrt(a) _mm_rsqrt_pd(a)
#define vec_dbl_ftoi(a) _mm_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm_max_pd(a, b)
Matthieu Schaller's avatar
Matthieu Schaller committed
152
153
#define FILL_VEC(a) \
  { .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a }
154
155
156
#else
#define VEC_SIZE 4
#endif
Pedro Gonnet's avatar
Pedro Gonnet committed
157

158
159
160
161
162
163
164
165
166
167
/* Define the composite types for element access. */
typedef union {
  VEC_FLOAT v;
  VEC_DBL vd;
  VEC_INT m;
  float f[VEC_SIZE];
  double d[VEC_SIZE / 2];
  int i[VEC_SIZE];
} vector;

168
169
170
171
172
173
#else
/* Needed for cache alignment. */
#define VEC_SIZE 16
#endif /* WITH_VECTORIZATION */

#endif /* VEC_MACRO */
174
175

#endif /* SWIFT_VECTOR_H */