/*******************************************************************************
 * This file is part of SWIFT.
 * Copyright (c) 2016 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
 *               2018 STFC (author email aidan.chalk@stfc.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/
#ifndef SWIFT_MEMSWAP_H
#define SWIFT_MEMSWAP_H

/* Config parameters. Guarded with the standard autoconf convention so the
   header also compiles stand-alone. */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

/* System includes (size_t and the fixed/least-width integer types). */
#include <stddef.h>
#include <stdint.h>

#ifdef HAVE_IMMINTRIN_H
/* Include the header file with the intrinsics for Intel architecture. */
#include <immintrin.h>
#endif

#ifdef HAVE_ALTIVEC_H
/* Include the header file with the intrinsics for the POWER/AltiVec
   architecture. */
#include <altivec.h>
#endif

/**
 * @brief In-place swap of two buffers, one chunk of type @c type at a time.
 *
 * @c a and @c b are assumed to be of type int8_t* so that the byte-wise
 * pointer arithmetic works. Note that @c a, @c b and @c count are all
 * advanced/decremented by the macro (it consumes as many whole chunks of
 * sizeof(type) bytes as fit in @c count and leaves the remainder for the
 * next, smaller-typed call).
 */
#define swap_loop(type, a, b, count)        \
  do {                                      \
    while ((count) >= sizeof(type)) {       \
      register type temp = *(type *)(a);    \
      *(type *)(a) = *(type *)(b);          \
      *(type *)(b) = temp;                  \
      (a) += sizeof(type);                  \
      (b) += sizeof(type);                  \
      (count) -= sizeof(type);              \
    }                                       \
  } while (0)

/**
 * @brief Swap the contents of two elements in-place.
 *
 * Keep in mind that this function only works when the underlying data
 * is aligned to the vector length, e.g. with the @c
 * __attribute__((aligned(32))) syntax!
 * Furthermore, register re-labeling only seems to work when the code is
 * compiled with @c -funroll-loops.
 *
 * Note that GCC (at least until 7.3) produces incorrect AVX512 code here
 * by automatically assuming alignment.
 *
 * @param void_a Pointer to the first element.
 * @param void_b Pointer to the second element.
 * @param bytes Size, in bytes, of the data pointed to by @c a and @c b.
 */
__attribute__((always_inline)) static inline void memswap(
    void *restrict void_a, void *restrict void_b, size_t bytes) {
  int8_t *restrict a = (int8_t *)void_a, *restrict b = (int8_t *)void_b;

/* Only use the vector types when the corresponding intrinsics header was
 * actually found: the compiler pre-defines e.g. __SSE2__ on x86-64, but
 * __m128i is only declared by <immintrin.h>. */
#if defined(__AVX512F__) && defined(__INTEL_COMPILER) && \
    defined(HAVE_IMMINTRIN_H)
  swap_loop(__m512i, a, b, bytes);
#endif
#if defined(__AVX__) && defined(HAVE_IMMINTRIN_H)
  swap_loop(__m256i, a, b, bytes);
#endif
#if defined(__SSE2__) && defined(HAVE_IMMINTRIN_H)
  swap_loop(__m128i, a, b, bytes);
#endif
#if defined(__ALTIVEC__) && defined(HAVE_ALTIVEC_H)
  swap_loop(vector int, a, b, bytes);
#endif
  /* Mop up the tail with progressively smaller scalar chunks. */
  swap_loop(int_least64_t, a, b, bytes);
  swap_loop(int_least32_t, a, b, bytes);
  swap_loop(int_least16_t, a, b, bytes);
  swap_loop(int_least8_t, a, b, bytes);

/* This is a known bug for the current version of clang on ARM.
 * We add this synchronization as a temporary bug fix.
 * See https://bugs.llvm.org/show_bug.cgi?id=40051 */
#if defined(__clang__) && defined(__aarch64__)
  __sync_synchronize();
#endif
}

/**
 * @brief Swap the contents of two elements in-place.
 *
 * As opposed to #memswap, this function does not require the parameters
 * to be aligned in any specific way.
 * Furthermore, register re-labeling only seems to work when the code is
 * compiled with @c -funroll-loops.
 *
 * @param void_a Pointer to the first element.
 * @param void_b Pointer to the second element.
 * @param bytes Size, in bytes, of the data pointed to by @c a and @c b.
 */
__attribute__((always_inline)) static inline void memswap_unaligned(
    void *restrict void_a, void *restrict void_b, size_t bytes) {
  int8_t *restrict a = (int8_t *)void_a, *restrict b = (int8_t *)void_b;

/* Use the unaligned load/store intrinsics for the vector-sized chunks; see
 * the note in #memswap on why HAVE_IMMINTRIN_H is also required. */
#if defined(__AVX512F__) && defined(HAVE_IMMINTRIN_H)
  while (bytes >= sizeof(__m512i)) {
    register __m512i temp;
    temp = _mm512_loadu_si512((__m512i *)a);
    _mm512_storeu_si512((__m512i *)a, _mm512_loadu_si512((__m512i *)b));
    _mm512_storeu_si512((__m512i *)b, temp);
    a += sizeof(__m512i);
    b += sizeof(__m512i);
    bytes -= sizeof(__m512i);
  }
#endif
#if defined(__AVX__) && defined(HAVE_IMMINTRIN_H)
  while (bytes >= sizeof(__m256i)) {
    register __m256i temp;
    temp = _mm256_loadu_si256((__m256i *)a);
    _mm256_storeu_si256((__m256i *)a, _mm256_loadu_si256((__m256i *)b));
    _mm256_storeu_si256((__m256i *)b, temp);
    a += sizeof(__m256i);
    b += sizeof(__m256i);
    bytes -= sizeof(__m256i);
  }
#endif
#if defined(__SSE2__) && defined(HAVE_IMMINTRIN_H)
  while (bytes >= sizeof(__m128i)) {
    register __m128i temp;
    temp = _mm_loadu_si128((__m128i *)a);
    _mm_storeu_si128((__m128i *)a, _mm_loadu_si128((__m128i *)b));
    _mm_storeu_si128((__m128i *)b, temp);
    a += sizeof(__m128i);
    b += sizeof(__m128i);
    bytes -= sizeof(__m128i);
  }
#endif
#if defined(__ALTIVEC__) && defined(HAVE_ALTIVEC_H)
  // Power8 supports unaligned load/stores, but not sure what it will do here.
  swap_loop(vector int, a, b, bytes);
#endif
  /* Mop up the tail with progressively smaller scalar chunks.
   * NOTE(review): the scalar loops dereference through int_least64_t/32_t/16_t
   * pointers that may be unaligned here — works on the targeted platforms, but
   * is not guaranteed by the C standard. */
  swap_loop(int_least64_t, a, b, bytes);
  swap_loop(int_least32_t, a, b, bytes);
  swap_loop(int_least16_t, a, b, bytes);
  swap_loop(int_least8_t, a, b, bytes);

/* This is a known bug for the current version of clang on ARM.
 * We add this synchronization as a temporary bug fix.
 * See https://bugs.llvm.org/show_bug.cgi?id=40051 */
#if defined(__clang__) && defined(__aarch64__)
  __sync_synchronize();
#endif
}

#endif /* SWIFT_MEMSWAP_H */