/*******************************************************************************
 * This file is part of SWIFT.
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
 *               2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/
#ifndef SWIFT_VECTOR_H
#define SWIFT_VECTOR_H

/* Have I already read this file? */
#ifndef VEC_MACRO

#include "../config.h"

#ifdef WITH_VECTORIZATION

/* Need to check whether the compiler supports this (IBM does not).
   This prevents the macros from being defined and switches off explicit
   vectorization if the compiler does not support it. */
#ifdef HAVE_IMMINTRIN_H
/* Include the header file with the intrinsics for Intel architecture. */
#include <immintrin.h>
#endif

/* Define the vector macro. */
#define VEC_MACRO(elcount, type) \
  __attribute__((vector_size((elcount) * sizeof(type)))) type

/* Define vector reciprocals. vec_rcp and vec_rsqrt do not have the level of
 * accuracy we need, so one Newton-Raphson refinement step is applied to each
 * hardware estimate. */
#define VEC_RECIPROCAL(x, x_inv) \
  x_inv = vec_rcp(x);            \
  x_inv = vec_sub(x_inv, vec_mul(x_inv, (vec_fma(x, x_inv, vec_set1(-1.0f)))))

#define VEC_RECIPROCAL_SQRT(x, x_inv)                               \
  x_inv = vec_rsqrt(x);                                             \
  x_inv = vec_sub(x_inv, vec_mul(vec_mul(vec_set1(0.5f), x_inv),    \
                                 (vec_fma(x, vec_mul(x_inv, x_inv), \
                                          vec_set1(-1.0f)))))
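/* For reference: each macro above applies one Newton-Raphson step to the
 * hardware estimate, roughly doubling its number of correct bits. For 1/x the
 * update is x_inv <- x_inv - x_inv * (x * x_inv - 1); for 1/sqrt(x) it is
 * x_inv <- x_inv - 0.5 * x_inv * (x * x_inv * x_inv - 1). A scalar sketch of
 * the same refinements (illustrative only; the guard and helpers below are
 * hypothetical and not part of SWIFT): */
#ifdef SWIFT_VECTOR_EXAMPLES
static inline float refined_rcp(float x, float est) {
  /* est approximates 1/x (vec_rcp in the SIMD case). */
  return est - est * (x * est - 1.0f);
}
static inline float refined_rsqrt(float x, float est) {
  /* est approximates 1/sqrt(x) (vec_rsqrt in the SIMD case). */
  return est - 0.5f * est * (x * est * est - 1.0f);
}
#endif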
/* So, what will the vector size be? */
#ifdef HAVE_AVX512_F
#define VEC_HAVE_GATHER
#define VEC_SIZE 16
#define VEC_FLOAT __m512
#define VEC_DBL __m512d
#define VEC_INT __m512i
#define KNL_MASK_16 __mmask16
#define vec_load(a) _mm512_load_ps(a)
#define vec_store(a, addr) _mm512_store_ps(addr, a)
#define vec_setzero() _mm512_setzero_ps()
#define vec_setintzero() _mm512_setzero_epi32()
#define vec_set1(a) _mm512_set1_ps(a)
#define vec_setint1(a) _mm512_set1_epi32(a)
#define vec_set(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
  _mm512_set_ps(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a)
#define vec_dbl_set(a, b, c, d, e, f, g, h) \
  _mm512_set_pd(h, g, f, e, d, c, b, a)
#define vec_add(a, b) _mm512_add_ps(a, b)
#define vec_sub(a, b) _mm512_sub_ps(a, b)
#define vec_mul(a, b) _mm512_mul_ps(a, b)
#define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c)
#define vec_sqrt(a) _mm512_sqrt_ps(a)
#define vec_rcp(a) _mm512_rcp14_ps(a)
#define vec_rsqrt(a) _mm512_rsqrt14_ps(a)
#define vec_ftoi(a) _mm512_cvttps_epi32(a)
#define vec_fmin(a, b) _mm512_min_ps(a, b)
#define vec_fmax(a, b) _mm512_max_ps(a, b)
#define vec_fabs(a) _mm512_andnot_ps(_mm512_set1_ps(-0.f), a)
#define vec_floor(a) _mm512_floor_ps(a)
#define vec_cmp_gt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GT_OQ)
#define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ)
#define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ)
#define vec_and(a, b) _mm512_and_ps(a, b)
#define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1)
#define vec_dbl_load(a) _mm512_load_pd(a)
#define vec_dbl_set1(a) _mm512_set1_pd(a)
#define vec_dbl_sqrt(a) _mm512_sqrt_pd(a)
#define vec_dbl_rcp(a) _mm512_rcp_pd(a)
#define vec_dbl_rsqrt(a) _mm512_rsqrt_pd(a)
#define vec_dbl_ftoi(a) _mm512_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm512_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm512_max_pd(a, b)
#define vec_getoffsets(ptrs)                                                \
  _mm512_insertf64x4(                                                       \
      _mm512_insertf64x4(_mm512_setzero_pd(),                               \
                         _mm512_cvtepi64_epi32(_mm512_load_epi64(ptrs) -    \
                                               _mm512_set1_epi64(ptrs[0])), \
                         0),                                                \
      _mm512_cvtepi64_epi32(_mm512_load_epi64(&ptrs[4]) -                   \
                            _mm512_set1_epi64(ptrs[0])),                    \
      1)
#define vec_gather(base, offsets) _mm512_i32gather_ps(offsets.m, base, 1)
#define FILL_VEC(a)                                                     \
  {                                                                     \
    .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a,   \
    .f[6] = a, .f[7] = a, .f[8] = a, .f[9] = a, .f[10] = a, .f[11] = a, \
    .f[12] = a, .f[13] = a, .f[14] = a, .f[15] = a                      \
  }
#define VEC_HADD(a, b) b += _mm512_reduce_add_ps(a.v)
#define VEC_FORM_PACKED_MASK(mask, v_mask, pack) \
  pack += __builtin_popcount(mask);
#define VEC_LEFT_PACK(a, mask, result) \
  _mm512_mask_compressstoreu_ps(result, mask, a)
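/* Usage sketch: kernels are written once against the generic vec_* names and
 * compile to full-width operations on whichever branch is active (512-bit
 * here). A minimal example computing a*in + b over one vector of floats
 * (illustrative only; the guard and function below are hypothetical and not
 * part of SWIFT, and `in`/`out` are assumed suitably aligned): */
#ifdef SWIFT_VECTOR_EXAMPLES
static inline void scale_and_shift(const float *in, float *out, float a,
                                   float b) {
  VEC_FLOAT v = vec_load(in);               /* load VEC_SIZE floats */
  v = vec_fma(vec_set1(a), v, vec_set1(b)); /* fused multiply-add: a*v + b */
  vec_store(v, out);                        /* store VEC_SIZE floats */
}
#endif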
#elif defined(HAVE_AVX)
#define VEC_SIZE 8
#define VEC_FLOAT __m256
#define VEC_DBL __m256d
#define VEC_INT __m256i
#define vec_load(a) _mm256_load_ps(a)
#define vec_store(a, addr) _mm256_store_ps(addr, a)
#define vec_setzero() _mm256_setzero_ps()
#define vec_setintzero() _mm256_setzero_si256()
#define vec_set1(a) _mm256_set1_ps(a)
#define vec_setint1(a) _mm256_set1_epi32(a)
#define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a)
#define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a)
#define vec_add(a, b) _mm256_add_ps(a, b)
#define vec_sub(a, b) _mm256_sub_ps(a, b)
#define vec_mul(a, b) _mm256_mul_ps(a, b)
#define vec_sqrt(a) _mm256_sqrt_ps(a)
#define vec_rcp(a) _mm256_rcp_ps(a)
#define vec_rsqrt(a) _mm256_rsqrt_ps(a)
#define vec_ftoi(a) _mm256_cvttps_epi32(a)
#define vec_fmin(a, b) _mm256_min_ps(a, b)
#define vec_fmax(a, b) _mm256_max_ps(a, b)
#define vec_fabs(a) _mm256_andnot_ps(_mm256_set1_ps(-0.f), a)
#define vec_floor(a) _mm256_floor_ps(a)
#define vec_cmp_lt(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
#define vec_cmp_gt(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
#define vec_cmp_lte(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
#define vec_cmp_result(a) _mm256_movemask_ps(a)
#define vec_and(a, b) _mm256_and_ps(a, b)
#define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1)
#define vec_dbl_load(a) _mm256_load_pd(a)
#define vec_dbl_set1(a) _mm256_set1_pd(a)
#define vec_dbl_sqrt(a) _mm256_sqrt_pd(a)
#define vec_dbl_rcp(a) _mm256_rcp_pd(a)
#define vec_dbl_rsqrt(a) _mm256_rsqrt_pd(a)
#define vec_dbl_ftoi(a) _mm256_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm256_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm256_max_pd(a, b)
#define FILL_VEC(a)                                                   \
  {                                                                   \
    .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \
    .f[6] = a, .f[7] = a                                              \
  }
#define VEC_HADD(a, b)            \
  a.v = _mm256_hadd_ps(a.v, a.v); \
  a.v = _mm256_hadd_ps(a.v, a.v); \
  b += a.f[0] + a.f[4];
#define VEC_GET_LOW(a) _mm256_castps256_ps128(a)
#define VEC_GET_HIGH(a) _mm256_extractf128_ps(a, 1)
#ifdef HAVE_AVX2
#define vec_fma(a, b, c) _mm256_fmadd_ps(a, b, c)
#define identity_indices 0x0706050403020100
#define VEC_HAVE_GATHER
#define vec_gather(base, offsets) _mm256_i32gather_ps(base, offsets.m, 1)
#define VEC_FORM_PACKED_MASK(mask, v_mask, pack)                               \
  {                                                                            \
    unsigned long expanded_mask = _pdep_u64(mask, 0x0101010101010101);         \
    expanded_mask *= 0xFF;                                                     \
    unsigned long wanted_indices = _pext_u64(identity_indices, expanded_mask); \
    __m128i bytevec = _mm_cvtsi64_si128(wanted_indices);                       \
    v_mask = _mm256_cvtepu8_epi32(bytevec);                                    \
    pack += __builtin_popcount(mask);                                          \
  }
#define VEC_LEFT_PACK(a, mask, result) \
  *((__m256 *)(result)) = _mm256_permutevar8x32_ps(a, mask)
#endif
#ifndef vec_fma
#define vec_fma(a, b, c) vec_add(vec_mul(a, b), c)
#endif
#ifndef VEC_FORM_PACKED_MASK
/* Generic fallback: walk the mask and record the indices of the set bits. */
#define VEC_FORM_PACKED_MASK(mask, v_mask, pack)   \
  {                                                \
    for (int i = 0; i < VEC_SIZE; i++)             \
      if ((mask & (1 << i))) v_mask.i[pack++] = i; \
  }
#endif
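/* For reference: VEC_FORM_PACKED_MASK plus VEC_LEFT_PACK implement a "left
 * pack": given a bitmask of passing lanes (e.g. from vec_cmp_result), the
 * passing elements are moved contiguously to the front of the output. A
 * scalar sketch of the same operation (illustrative only; the guard and
 * function below are hypothetical and not part of SWIFT): */
#ifdef SWIFT_VECTOR_EXAMPLES
static inline int scalar_left_pack(const float *in, int mask, float *out) {
  int pack = 0;
  for (int i = 0; i < VEC_SIZE; i++) /* keep lanes whose mask bit is set */
    if (mask & (1 << i)) out[pack++] = in[i];
  return pack; /* number of elements kept == popcount(mask) */
}
#endif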