Commit c97849f4 authored by James Willis
Browse files

Left-pack interactions into secondary cache with un-aligned writes instead of aligned writes.

parent d4e7f8b5
...@@ -112,6 +112,7 @@ ...@@ -112,6 +112,7 @@
#define VEC_INT __m256i #define VEC_INT __m256i
#define vec_load(a) _mm256_load_ps(a) #define vec_load(a) _mm256_load_ps(a)
#define vec_store(a, addr) _mm256_store_ps(addr, a) #define vec_store(a, addr) _mm256_store_ps(addr, a)
#define vec_unaligned_store(a, addr) _mm256_storeu_ps(addr, a)
#define vec_setzero() _mm256_setzero_ps() #define vec_setzero() _mm256_setzero_ps()
#define vec_setintzero() _mm256_setzero_si256() #define vec_setintzero() _mm256_setzero_si256()
#define vec_set1(a) _mm256_set1_ps(a) #define vec_set1(a) _mm256_set1_ps(a)
...@@ -171,7 +172,7 @@ ...@@ -171,7 +172,7 @@
pack += __builtin_popcount(mask); \ pack += __builtin_popcount(mask); \
} }
#define VEC_LEFT_PACK(a, mask, result) \ #define VEC_LEFT_PACK(a, mask, result) \
*((__m256 *)(result)) = _mm256_permutevar8x32_ps(a, mask) vec_unaligned_store(_mm256_permutevar8x32_ps(a, mask),result)
#endif #endif
#ifndef vec_fma #ifndef vec_fma
#define vec_fma(a, b, c) vec_add(vec_mul(a, b), c) #define vec_fma(a, b, c) vec_add(vec_mul(a, b), c)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment