diff --git a/src/memswap.h b/src/memswap.h
index 8c0e9ec2c0dc119b38358be5346174b6c1a1fe46..92c902eeb158978d4a606f5f2a9416d4113fae0b 100644
--- a/src/memswap.h
+++ b/src/memswap.h
@@ -79,4 +79,92 @@ __attribute__((always_inline)) inline void memswap(void *void_a, void *void_b,
   swap_loop(char, a, b, bytes);
 }
 
+/**
+ * @brief Chunked swap that never dereferences @c a or @c b as @c type.
+ *
+ * While at least @c sizeof(type) bytes remain, exchanges one chunk between
+ * @c a and @c b through two local temporaries filled and drained with
+ * @c __builtin_memcpy, then advances both pointers and shrinks @c count.
+ * Copying byte-wise keeps every access valid for arbitrarily aligned
+ * pointers. Takes the same arguments as @c swap_loop.
+ */
+#define swap_loop_unaligned(type, a, b, count)  \
+  while (count >= sizeof(type)) {               \
+    type temp_a, temp_b;                        \
+    __builtin_memcpy(&temp_a, a, sizeof(type)); \
+    __builtin_memcpy(&temp_b, b, sizeof(type)); \
+    __builtin_memcpy(a, &temp_b, sizeof(type)); \
+    __builtin_memcpy(b, &temp_a, sizeof(type)); \
+    a += sizeof(type);                          \
+    b += sizeof(type);                          \
+    count -= sizeof(type);                      \
+  }
+
+/**
+ * @brief Swap the contents of two elements in-place.
+ *
+ * As opposed to #memswap, this function does not require the parameters
+ * to be aligned in any specific way: the vector paths use the unaligned
+ * load/store intrinsics and the remaining bytes are exchanged through
+ * #swap_loop_unaligned.
+ * Furthermore, register re-labeling only seems to work when the code is
+ * compiled with @c -funroll-loops.
+ *
+ * @param void_a Pointer to the first element.
+ * @param void_b Pointer to the second element.
+ * @param bytes Size, in bytes, of the data pointed to by @c void_a and
+ *        @c void_b.
+ */
+__attribute__((always_inline)) inline void memswap_unaligned(void *void_a,
+                                                             void *void_b,
+                                                             size_t bytes) {
+  char *a = (char *)void_a, *b = (char *)void_b;
+#ifdef __AVX512F__
+  /* Widest chunks first; the loadu/storeu forms accept any address. */
+  while (bytes >= sizeof(__m512i)) {
+    register __m512i temp;
+    temp = _mm512_loadu_si512((__m512i *)a);
+    _mm512_storeu_si512((__m512i *)a, _mm512_loadu_si512((__m512i *)b));
+    _mm512_storeu_si512((__m512i *)b, temp);
+    a += sizeof(__m512i);
+    b += sizeof(__m512i);
+    bytes -= sizeof(__m512i);
+  }
+#endif
+#ifdef __AVX__
+  while (bytes >= sizeof(__m256i)) {
+    register __m256i temp;
+    temp = _mm256_loadu_si256((__m256i *)a);
+    _mm256_storeu_si256((__m256i *)a, _mm256_loadu_si256((__m256i *)b));
+    _mm256_storeu_si256((__m256i *)b, temp);
+    a += sizeof(__m256i);
+    b += sizeof(__m256i);
+    bytes -= sizeof(__m256i);
+  }
+#endif
+#ifdef __SSE2__
+  while (bytes >= sizeof(__m128i)) {
+    register __m128i temp;
+    temp = _mm_loadu_si128((__m128i *)a);
+    _mm_storeu_si128((__m128i *)a, _mm_loadu_si128((__m128i *)b));
+    _mm_storeu_si128((__m128i *)b, temp);
+    a += sizeof(__m128i);
+    b += sizeof(__m128i);
+    bytes -= sizeof(__m128i);
+  }
+#endif
+#ifdef __ALTIVEC__
+  /* Copy through a temporary instead of dereferencing a vector pointer, so the
+   * compiler is never told that a and b are 16-byte aligned. */
+  swap_loop_unaligned(vector int, a, b, bytes);
+#endif
+  /* Scalar tail, from the widest type down to single bytes. */
+  swap_loop_unaligned(size_t, a, b, bytes);
+  swap_loop_unaligned(int, a, b, bytes);
+  swap_loop_unaligned(short, a, b, bytes);
+  swap_loop_unaligned(char, a, b, bytes);
+}
+
+#undef swap_loop_unaligned
+
 #endif /* SWIFT_MEMSWAP_H */