/*******************************************************************************
* This file is part of SWIFT.
* Copyright (c) 2016 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
* 2018 STFC (author email aidan.chalk@stfc.ac.uk)
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
******************************************************************************/
#ifndef SWIFT_MEMSWAP_H
#define SWIFT_MEMSWAP_H
/* Config parameters. */
#include
/* System includes. */
#include
#ifdef HAVE_IMMINTRIN_H
/* Include the header file with the intrinsics for Intel architecture. */
#include
#endif
#ifdef HAVE_ALTIVEC_H
/* Include the header file with the AltiVec intrinsics for PowerPC. */
#include
#endif
/**
 * @brief In-place swap of two buffers, one chunk of type @c type at a time.
 *
 * While at least @c sizeof(type) bytes remain in @c count, one chunk is
 * exchanged between the memory at @c a and the memory at @c b, both
 * pointers are advanced past that chunk and @c count is reduced by the
 * chunk size. Any remainder smaller than @c sizeof(type) is left untouched
 * so that the caller can process it with a narrower type.
 *
 * The chunks are accessed by dereferencing the pointers cast to @c type*,
 * so the data must be suitably aligned for @c type.
 *
 * The expansion is wrapped in <tt>do { } while (0)</tt> so that
 * <tt>swap_loop(...);</tt> behaves as a single statement, e.g. in an
 * un-braced @c if / @c else.
 *
 * @param type Type used for each chunk that is swapped.
 * @param a Modifiable pointer to the first buffer. Must point to a
 *        byte-sized type (the callers in this file use @c int8_t*) so that
 *        the pointer arithmetic works. Advanced by the macro.
 * @param b Modifiable pointer to the second buffer, same requirements as
 *        @c a. Advanced by the macro.
 * @param count Modifiable lvalue holding the number of bytes left to swap.
 *        Decremented by the macro.
 */
#define swap_loop(type, a, b, count)            \
  do {                                          \
    while ((count) >= sizeof(type)) {           \
      type swap_loop_tmp_ = *(type *)(a);       \
      *(type *)(a) = *(type *)(b);              \
      *(type *)(b) = swap_loop_tmp_;            \
      (a) += sizeof(type);                      \
      (b) += sizeof(type);                      \
      (count) -= sizeof(type);                  \
    }                                           \
  } while (0)
/**
 * @brief Exchange the contents of two equally-sized memory regions in-place.
 *
 * The regions are swapped chunk by chunk, starting with the widest vector
 * type enabled at compile time (AVX512, AVX, SSE2, AltiVec) and then
 * falling back to 64-, 32-, 16- and 8-bit integers for whatever is left.
 *
 * Keep in mind that this function only works when the underlying data
 * is aligned to the vector length, e.g. with the @c
 * __attribute__((aligned(32))) syntax!
 * Furthermore, register re-labeling only seems to work when the code is
 * compiled with @c -funroll-loops.
 *
 * Note that GCC (at least until 7.3) produces incorrect AVX512 code here
 * by automatically assuming alignment, which is why the AVX512 path is
 * only enabled for the Intel compiler.
 *
 * @param void_a Pointer to the first element.
 * @param void_b Pointer to the second element.
 * @param bytes Size, in bytes, of the data pointed to by @c void_a and
 *        @c void_b.
 */
__attribute__((always_inline)) inline void memswap(void *restrict void_a,
                                                   void *restrict void_b,
                                                   size_t bytes) {

  /* Byte-wise cursors into both regions; advanced by each swap_loop. */
  int8_t *restrict lhs = (int8_t *)void_a;
  int8_t *restrict rhs = (int8_t *)void_b;

  /* Widest vector registers first... */
#if defined(__AVX512F__) && defined(__INTEL_COMPILER)
  swap_loop(__m512i, lhs, rhs, bytes);
#endif

#if defined(__AVX__)
  swap_loop(__m256i, lhs, rhs, bytes);
#endif

#if defined(__SSE2__)
  swap_loop(__m128i, lhs, rhs, bytes);
#endif

#if defined(__ALTIVEC__)
  swap_loop(vector int, lhs, rhs, bytes);
#endif

  /* ...then mop up the remainder with ever narrower integers. */
  swap_loop(int_least64_t, lhs, rhs, bytes);
  swap_loop(int_least32_t, lhs, rhs, bytes);
  swap_loop(int_least16_t, lhs, rhs, bytes);
  swap_loop(int_least8_t, lhs, rhs, bytes);

  /* This is a known bug for the current version of clang on ARM.
   * We add this synchronization as a temporary bug fix.
   * See https://bugs.llvm.org/show_bug.cgi?id=40051 */
#if defined(__clang__) && defined(__aarch64__)
  __sync_synchronize();
#endif
}
/* Helper for memswap_unaligned(): same contract as a chunk-wise swap loop
 * (advances a and b, decrements count, leaves any remainder smaller than
 * sizeof(type) untouched), but every chunk is moved through local
 * temporaries with fixed-size __builtin_memcpy calls. No pointer to the
 * caller's data is ever dereferenced as a wider type, so no alignment is
 * assumed. The macro is #undef'd right after the function. */
#define memswap_unaligned_loop(type, a, b, count)                 \
  do {                                                            \
    while ((count) >= sizeof(type)) {                             \
      type memswap_tmp_a_, memswap_tmp_b_;                        \
      __builtin_memcpy(&memswap_tmp_a_, (a), sizeof(type));       \
      __builtin_memcpy(&memswap_tmp_b_, (b), sizeof(type));       \
      __builtin_memcpy((a), &memswap_tmp_b_, sizeof(type));       \
      __builtin_memcpy((b), &memswap_tmp_a_, sizeof(type));       \
      (a) += sizeof(type);                                        \
      (b) += sizeof(type);                                        \
      (count) -= sizeof(type);                                    \
    }                                                             \
  } while (0)

/**
 * @brief Swap the contents of two elements in-place.
 *
 * As opposed to #memswap, this function does not require the parameters
 * to be aligned in any specific way: the x86 vector paths use the
 * unaligned load/store intrinsics and all other chunks are copied through
 * local temporaries instead of being accessed through cast pointers.
 * Furthermore, register re-labeling only seems to work when the code is
 * compiled with @c -funroll-loops.
 *
 * @param void_a Pointer to the first element.
 * @param void_b Pointer to the second element.
 * @param bytes Size, in bytes, of the data pointed to by @c void_a and
 *        @c void_b.
 */
__attribute__((always_inline)) inline void memswap_unaligned(
    void *restrict void_a, void *restrict void_b, size_t bytes) {

  /* Byte-wise cursors into both regions. */
  int8_t *restrict a = (int8_t *)void_a, *restrict b = (int8_t *)void_b;

#ifdef __AVX512F__
  while (bytes >= sizeof(__m512i)) {
    const __m512i temp = _mm512_loadu_si512((__m512i *)a);
    _mm512_storeu_si512((__m512i *)a, _mm512_loadu_si512((__m512i *)b));
    _mm512_storeu_si512((__m512i *)b, temp);
    a += sizeof(__m512i);
    b += sizeof(__m512i);
    bytes -= sizeof(__m512i);
  }
#endif

#ifdef __AVX__
  while (bytes >= sizeof(__m256i)) {
    const __m256i temp = _mm256_loadu_si256((__m256i *)a);
    _mm256_storeu_si256((__m256i *)a, _mm256_loadu_si256((__m256i *)b));
    _mm256_storeu_si256((__m256i *)b, temp);
    a += sizeof(__m256i);
    b += sizeof(__m256i);
    bytes -= sizeof(__m256i);
  }
#endif

#ifdef __SSE2__
  while (bytes >= sizeof(__m128i)) {
    const __m128i temp = _mm_loadu_si128((__m128i *)a);
    _mm_storeu_si128((__m128i *)a, _mm_loadu_si128((__m128i *)b));
    _mm_storeu_si128((__m128i *)b, temp);
    a += sizeof(__m128i);
    b += sizeof(__m128i);
    bytes -= sizeof(__m128i);
  }
#endif

#ifdef __ALTIVEC__
  /* Go through local vector temporaries rather than dereferencing a
   * (vector int *), so that nothing is assumed about the alignment of the
   * caller's data. */
  memswap_unaligned_loop(vector int, a, b, bytes);
#endif

  /* Remainder: ever narrower integers, again without alignment
   * assumptions. */
  memswap_unaligned_loop(int_least64_t, a, b, bytes);
  memswap_unaligned_loop(int_least32_t, a, b, bytes);
  memswap_unaligned_loop(int_least16_t, a, b, bytes);
  memswap_unaligned_loop(int_least8_t, a, b, bytes);

  /* This is a known bug for the current version of clang on ARM.
   * We add this synchronization as a temporary bug fix.
   * See https://bugs.llvm.org/show_bug.cgi?id=40051 */
#if defined(__clang__) && defined(__aarch64__)
  __sync_synchronize();
#endif
}

#undef memswap_unaligned_loop
#endif /* SWIFT_MEMSWAP_H */