vector.h 6.13 KB
Newer Older
Pedro Gonnet's avatar
Pedro Gonnet committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/*******************************************************************************
 * This file is part of SWIFT.
 * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
 * 
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 * 
 ******************************************************************************/

/* Have I already read this file? */
#ifndef VEC_MACRO

    /* Include the header file with the intrinsics. */
    #include <immintrin.h>
    
    /* Define the vector macro. */
    #define VEC_MACRO(elcount, type)  __attribute__((vector_size((elcount)*sizeof(type)))) type

    /* So what will the vector size be? */
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
    #ifdef __MIC__
        #define VECTORIZE
        #define VEC_HAVE_GATHER
        #define VEC_SIZE 16
        #define VEC_FLOAT __m512
        #define VEC_DBL __m512d
        #define VEC_INT __m512i
        #define vec_load(a) _mm512_load_ps(a)
        #define vec_set1(a) _mm512_set1_ps(a)
        #define vec_set(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) _mm512_set_ps(p,o,n,m,l,k,j,i,h,g,f,e,d,c,b,a)
        #define vec_dbl_set(a,b,c,d,e,f,g,h) _mm512_set_pd(h,g,f,e,d,c,b,a)
        #define vec_sqrt(a) _mm512_sqrt_ps(a)
        #define vec_rcp(a) _mm512_rcp_ps(a)
        #define vec_rsqrt(a) _mm512_rsqrt_ps(a)
        #define vec_ftoi(a) _mm512_cvttps_epi32(a)
        #define vec_fmin(a,b) _mm512_min_ps(a,b)
        #define vec_fmax(a,b) _mm512_max_ps(a,b)
        #define vec_fabs(a) _mm512_andnot_ps(_mm512_set1_ps(-0.f), a)
        #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a,0))
        #define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a,1))
        #define vec_dbl_tofloat(a,b) _mm512_insertf128( _mm512_castps128_ps512(a) , b , 1 )
        #define vec_dbl_load(a) _mm512_load_pd(a)
        #define vec_dbl_set1(a) _mm512_set1_pd(a)
        #define vec_dbl_sqrt(a) _mm512_sqrt_pd(a)
        #define vec_dbl_rcp(a) _mm512_rcp_pd(a)
        #define vec_dbl_rsqrt(a) _mm512_rsqrt_pd(a)
        #define vec_dbl_ftoi(a) _mm512_cvttpd_epi32(a)
        #define vec_dbl_fmin(a,b) _mm512_min_pd(a,b)
        #define vec_dbl_fmax(a,b) _mm512_max_pd(a,b)
        #define vec_getoffsets(ptrs) _mm512_insertf64x4( _mm512_insertf64x4( _mm512_setzero_pd() , _mm512_cvtepi64_epi32( _mm512_load_epi64(ptrs) - _mm512_set1_epi64(ptrs[0]) ) , 0 ) , _mm512_cvtepi64_epi32( _mm512_load_epi64(&ptrs[4]) - _mm512_set1_epi64(ptrs[0]) ) , 1 ) 
        #define vec_gather(base,offsets) _mm512_i32gather_ps( offsets.m , base , 1 )
    #elif defined(__AVX__)
62
        #define VECTORIZE
Pedro Gonnet's avatar
Pedro Gonnet committed
63
        #define VEC_SIZE 8
64
        #define VEC_FLOAT __m256
65
        #define VEC_DBL __m256d
66
        #define VEC_INT __m256i
Pedro Gonnet's avatar
Pedro Gonnet committed
67
68
        #define vec_load(a) _mm256_load_ps(a)
        #define vec_set1(a) _mm256_set1_ps(a)
69
70
        #define vec_set(a,b,c,d,e,f,g,h) _mm256_set_ps(h,g,f,e,d,c,b,a)
        #define vec_dbl_set(a,b,c,d) _mm256_set_pd(d,c,b,a)
Pedro Gonnet's avatar
Pedro Gonnet committed
71
72
73
74
75
        #define vec_sqrt(a) _mm256_sqrt_ps(a)
        #define vec_rcp(a) _mm256_rcp_ps(a)
        #define vec_rsqrt(a) _mm256_rsqrt_ps(a)
        #define vec_ftoi(a) _mm256_cvttps_epi32(a)
        #define vec_fmin(a,b) _mm256_min_ps(a,b)
76
        #define vec_fmax(a,b) _mm256_max_ps(a,b)
77
        #define vec_fabs(a) _mm256_andnot_ps(_mm256_set1_ps(-0.f), a)
78
79
80
81
82
83
84
85
86
87
88
        #define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a,0))
        #define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a,1))
        #define vec_dbl_tofloat(a,b) _mm256_insertf128( _mm256_castps128_ps256(a) , b , 1 )
        #define vec_dbl_load(a) _mm256_load_pd(a)
        #define vec_dbl_set1(a) _mm256_set1_pd(a)
        #define vec_dbl_sqrt(a) _mm256_sqrt_pd(a)
        #define vec_dbl_rcp(a) _mm256_rcp_pd(a)
        #define vec_dbl_rsqrt(a) _mm256_rsqrt_pd(a)
        #define vec_dbl_ftoi(a) _mm256_cvttpd_epi32(a)
        #define vec_dbl_fmin(a,b) _mm256_min_pd(a,b)
        #define vec_dbl_fmax(a,b) _mm256_max_pd(a,b)
89
90
91
92
        #ifdef __AVX2__
            #define VEC_HAVE_GATHER
            #define vec_gather(base,offsets) _mm256_i32gather_ps( base , offsets.m , 1 )
        #endif
93
    #elif defined( __SSE2__ )
94
        #define VECTORIZE
Pedro Gonnet's avatar
Pedro Gonnet committed
95
        #define VEC_SIZE 4
96
        #define VEC_FLOAT __m128
97
        #define VEC_DBL __m128d
98
        #define VEC_INT __m128i
Pedro Gonnet's avatar
Pedro Gonnet committed
99
100
        #define vec_load(a) _mm_load_ps(a)
        #define vec_set1(a) _mm_set1_ps(a)
101
102
        #define vec_set(a,b,c,d) _mm_set_ps(d,c,b,a)
        #define vec_dbl_set(a,b) _mm_set_pd(b,a)
Pedro Gonnet's avatar
Pedro Gonnet committed
103
104
105
106
107
        #define vec_sqrt(a) _mm_sqrt_ps(a)
        #define vec_rcp(a) _mm_rcp_ps(a)
        #define vec_rsqrt(a) _mm_rsqrt_ps(a)
        #define vec_ftoi(a) _mm_cvttps_epi32(a)
        #define vec_fmin(a,b) _mm_min_ps(a,b)
108
        #define vec_fmax(a,b) _mm_max_ps(a,b)
109
        #define vec_fabs(a) _mm_andnot_ps(_mm_set1_ps(-0.f), a)
110
111
112
113
114
115
116
117
118
119
120
        #define vec_todbl_lo(a) _mm_cvtps_pd(a)
        #define vec_todbl_hi(a) _mm_cvtps_pd(_mm_movehl_ps(a,a))
        #define vec_dbl_tofloat(a,b) _mm_movelh_ps( _mm_cvtpd_ps(a) , _mm_cvtpd_ps(b) )
        #define vec_dbl_load(a) _mm_load_pd(a)
        #define vec_dbl_set1(a) _mm_set1_pd(a)
        #define vec_dbl_sqrt(a) _mm_sqrt_pd(a)
        #define vec_dbl_rcp(a) _mm_rcp_pd(a)
        #define vec_dbl_rsqrt(a) _mm_rsqrt_pd(a)
        #define vec_dbl_ftoi(a) _mm_cvttpd_epi32(a)
        #define vec_dbl_fmin(a,b) _mm_min_pd(a,b)
        #define vec_dbl_fmax(a,b) _mm_max_pd(a,b)
121
122
    #else
        #define VEC_SIZE 4
Pedro Gonnet's avatar
Pedro Gonnet committed
123
124
125
126
127
    #endif

    /* Define the composite types for element access. */
    #ifdef VECTORIZE
    typedef union {
128
        VEC_FLOAT v;
129
        VEC_DBL vd;
130
        VEC_INT m;
Pedro Gonnet's avatar
Pedro Gonnet committed
131
        float f[VEC_SIZE];
132
        double d[VEC_SIZE/2];
Pedro Gonnet's avatar
Pedro Gonnet committed
133
134
135
136
137
        int i[VEC_SIZE];
        } vector;
    #endif

#endif