/*******************************************************************************
* This file is part of SWIFT.
* Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
* 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see .
*
******************************************************************************/
#ifndef SWIFT_VECTOR_H
#define SWIFT_VECTOR_H
/* Have I already read this file? */
#ifndef VEC_MACRO
#include "../config.h"
#ifdef WITH_VECTORIZATION
/* Need to check whether compiler supports this (IBM does not)
This will prevent the macros to be defined and switch off
explicit vectorization if the compiled does not support it */
#ifdef HAVE_IMMINTRIN_H
/* Include the header file with the intrinsics for Intel architecture. */
#include
#endif
/* Define the vector macro. */
#define VEC_MACRO(elcount, type) \
__attribute__((vector_size((elcount) * sizeof(type)))) type
/* So what will the vector size be? */
/* AVX-512 intrinsics*/
#ifdef HAVE_AVX512_F
#define VEC_HAVE_GATHER
#define VEC_SIZE 16
#define VEC_FLOAT __m512
#define VEC_DBL __m512d
#define VEC_INT __m512i
#define KNL_MASK_16 __mmask16
#define vec_load(a) _mm512_load_ps(a)
#define vec_store(a, addr) _mm512_store_ps(addr, a)
#define vec_setzero() _mm512_setzero_ps()
#define vec_setintzero() _mm512_setzero_epi32()
#define vec_set1(a) _mm512_set1_ps(a)
#define vec_setint1(a) _mm512_set1_epi32(a)
#define vec_set(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
_mm512_set_ps(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a)
#define vec_dbl_set(a, b, c, d, e, f, g, h) \
_mm512_set_pd(h, g, f, e, d, c, b, a)
#define vec_add(a, b) _mm512_add_ps(a, b)
#define vec_sub(a, b) _mm512_sub_ps(a, b)
#define vec_mul(a, b) _mm512_mul_ps(a, b)
#define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c)
#define vec_sqrt(a) _mm512_sqrt_ps(a)
#define vec_rcp(a) _mm512_rcp14_ps(a)
#define vec_rsqrt(a) _mm512_rsqrt14_ps(a)
#define vec_ftoi(a) _mm512_cvttps_epi32(a)
#define vec_fmin(a, b) _mm512_min_ps(a, b)
#define vec_fmax(a, b) _mm512_max_ps(a, b)
#define vec_fabs(a) _mm512_andnot_ps(_mm512_set1_ps(-0.f), a)
#define vec_floor(a) _mm512_floor_ps(a)
#define vec_cmp_gt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GT_OQ)
#define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ)
#define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ)
#define vec_cmp_gte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ)
#define vec_and(a, b) _mm512_and_ps(a, b)
#define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0))
#define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1))
#define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1)
#define vec_dbl_load(a) _mm512_load_pd(a)
#define vec_dbl_set1(a) _mm512_set1_pd(a)
#define vec_dbl_sqrt(a) _mm512_sqrt_pd(a)
#define vec_dbl_rcp(a) _mm512_rcp_pd(a)
#define vec_dbl_rsqrt(a) _mm512_rsqrt_pd(a)
#define vec_dbl_ftoi(a) _mm512_cvttpd_epi32(a)
#define vec_dbl_fmin(a, b) _mm512_min_pd(a, b)
#define vec_dbl_fmax(a, b) _mm512_max_pd(a, b)
#define vec_getoffsets(ptrs) \
_mm512_insertf64x4( \
_mm512_insertf64x4(_mm512_setzero_pd(), \
_mm512_cvtepi64_epi32(_mm512_load_epi64(ptrs) - \
_mm512_set1_epi64(ptrs[0])), \
0), \
_mm512_cvtepi64_epi32(_mm512_load_epi64(&ptrs[4]) - \
_mm512_set1_epi64(ptrs[0])), \
1)
#define vec_gather(base, offsets) _mm512_i32gather_ps(offsets.m, base, 1)
/* Initialises a vector struct with a default value. */
#define FILL_VEC(a) \
{ \
.f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \
.f[6] = a, .f[7] = a, .f[8] = a, .f[9] = a, .f[10] = a, .f[11] = a, \
.f[12] = a, .f[13] = a, .f[14] = a, .f[15] = a \
}
/* Performs a horizontal add on the vector and adds the result to a float. */
#ifdef __ICC
#define VEC_HADD(a, b) b += _mm512_reduce_add_ps(a.v)
#else /* _mm512_reduce_add_ps not present in GCC compiler. TODO: Implement intrinsic version.*/
#define VEC_HADD(a, b) { \
for( int i=0; i