/******************************************************************************* * This file is part of SWIFT. * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) * 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk) * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . * ******************************************************************************/ #ifndef SWIFT_VECTOR_H #define SWIFT_VECTOR_H /* Have I already read this file? */ #ifndef VEC_MACRO #include "../config.h" #ifdef WITH_VECTORIZATION /* Need to check whether compiler supports this (IBM does not) This will prevent the macros to be defined and switch off explicit vectorization if the compiled does not support it */ #ifdef HAVE_IMMINTRIN_H /* Include the header file with the intrinsics for Intel architecture. */ #include #endif /* Define the vector macro. */ #define VEC_MACRO(elcount, type) \ __attribute__((vector_size((elcount) * sizeof(type)))) type /* So what will the vector size be? */ /* AVX-512 intrinsics*/ #ifdef HAVE_AVX512_F #define VEC_HAVE_GATHER #define VEC_SIZE 16 #define VEC_FLOAT __m512 #define VEC_DBL __m512d #define VEC_INT __m512i #define KNL_MASK_16 __mmask16 #define vec_load(a) _mm512_load_ps(a) #define vec_store(a, addr) _mm512_store_ps(addr, a) #define vec_setzero() _mm512_setzero_ps() #define vec_setintzero() _mm512_setzero_epi32() #define vec_set1(a) _mm512_set1_ps(a) #define vec_setint1(a) _mm512_set1_epi32(a) #define vec_set(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ _mm512_set_ps(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a) #define vec_dbl_set(a, b, c, d, e, f, g, h) \ _mm512_set_pd(h, g, f, e, d, c, b, a) #define vec_add(a, b) _mm512_add_ps(a, b) #define vec_sub(a, b) _mm512_sub_ps(a, b) #define vec_mul(a, b) _mm512_mul_ps(a, b) #define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c) #define vec_sqrt(a) _mm512_sqrt_ps(a) #define vec_rcp(a) _mm512_rcp14_ps(a) #define vec_rsqrt(a) _mm512_rsqrt14_ps(a) #define vec_ftoi(a) _mm512_cvttps_epi32(a) #define vec_fmin(a, b) _mm512_min_ps(a, b) #define vec_fmax(a, b) _mm512_max_ps(a, b) #define vec_fabs(a) _mm512_andnot_ps(_mm512_set1_ps(-0.f), a) #define vec_floor(a) _mm512_floor_ps(a) #define vec_cmp_gt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GT_OQ) #define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ) #define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ) #define vec_cmp_gte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ) #define vec_and(a, b) _mm512_and_ps(a, b) #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0)) #define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1)) #define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1) #define vec_dbl_load(a) _mm512_load_pd(a) #define vec_dbl_set1(a) _mm512_set1_pd(a) #define vec_dbl_sqrt(a) _mm512_sqrt_pd(a) #define vec_dbl_rcp(a) _mm512_rcp_pd(a) #define vec_dbl_rsqrt(a) _mm512_rsqrt_pd(a) #define vec_dbl_ftoi(a) _mm512_cvttpd_epi32(a) #define vec_dbl_fmin(a, b) _mm512_min_pd(a, b) #define vec_dbl_fmax(a, b) _mm512_max_pd(a, b) #define vec_getoffsets(ptrs) \ _mm512_insertf64x4( \ _mm512_insertf64x4(_mm512_setzero_pd(), \ _mm512_cvtepi64_epi32(_mm512_load_epi64(ptrs) - \ _mm512_set1_epi64(ptrs[0])), \ 0), \ _mm512_cvtepi64_epi32(_mm512_load_epi64(&ptrs[4]) - \ _mm512_set1_epi64(ptrs[0])), \ 1) #define vec_gather(base, offsets) _mm512_i32gather_ps(offsets.m, base, 1) /* Initialises a vector struct with a default value. */ #define FILL_VEC(a) \ { \ .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \ .f[6] = a, .f[7] = a, .f[8] = a, .f[9] = a, .f[10] = a, .f[11] = a, \ .f[12] = a, .f[13] = a, .f[14] = a, .f[15] = a \ } /* Performs a horizontal add on the vector and adds the result to a float. */ #ifdef __ICC #define VEC_HADD(a, b) b += _mm512_reduce_add_ps(a.v) #else /* _mm512_reduce_add_ps not present in GCC compiler. TODO: Implement intrinsic version.*/ #define VEC_HADD(a, b) { \ for( int i=0; i