diff --git a/src/cache.h b/src/cache.h
index e66534ebc4859317724cf5d65eada4e850af253c..49ed478f91c5cb0b028c19d052621a05c02ff0da 100644
--- a/src/cache.h
+++ b/src/cache.h
@@ -23,6 +23,7 @@
 #include "../config.h"
 
 /* Local headers */
+#include "align.h"
 #include "cell.h"
 #include "error.h"
 #include "part.h"
@@ -30,9 +31,7 @@
 #include "vector.h"
 
 #define NUM_VEC_PROC 2
-#define CACHE_ALIGN 64
 #define C2_CACHE_SIZE (NUM_VEC_PROC * VEC_SIZE * 6) + (NUM_VEC_PROC * VEC_SIZE)
-#define C2_CACHE_ALIGN sizeof(float) * VEC_SIZE
 
 #ifdef WITH_VECTORIZATION
 /* Cache struct to hold a local copy of a cells' particle
@@ -40,31 +39,31 @@
 struct cache {
 
   /* Particle x position. */
-  float *restrict x __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict x SWIFT_CACHE_ALIGN;
 
   /* Particle y position. */
-  float *restrict y __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict y SWIFT_CACHE_ALIGN;
 
   /* Particle z position. */
-  float *restrict z __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict z SWIFT_CACHE_ALIGN;
 
   /* Particle smoothing length. */
-  float *restrict h __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict h SWIFT_CACHE_ALIGN;
 
   /* Particle mass. */
-  float *restrict m __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict m SWIFT_CACHE_ALIGN;
 
   /* Particle x velocity. */
-  float *restrict vx __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict vx SWIFT_CACHE_ALIGN;
 
   /* Particle y velocity. */
-  float *restrict vy __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict vy SWIFT_CACHE_ALIGN;
 
   /* Particle z velocity. */
-  float *restrict vz __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict vz SWIFT_CACHE_ALIGN;
 
   /* Maximum index into neighbouring cell for particles that are in range. */
-  int *restrict max_index __attribute__((aligned(CACHE_ALIGN)));
+  int *restrict max_index SWIFT_CACHE_ALIGN;
 
   /* Cache size. */
   int count;
@@ -75,28 +74,28 @@
 struct c2_cache {
 
   /* Separation between two particles squared. */
-  float r2q[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float r2q[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* x separation between two particles. */
-  float dxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float dxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* y separation between two particles. */
-  float dyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float dyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* z separation between two particles. */
-  float dzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float dzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* Mass of particle pj. */
-  float mq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float mq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* x velocity of particle pj. */
-  float vxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float vxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* y velocity of particle pj. */
-  float vyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float vyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* z velocity of particle pj. */
-  float vzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float vzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 };
 
 /**
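Note: the SWIFT_CACHE_ALIGN and SWIFT_CACHE_ALIGNMENT names used throughout this patch come from the newly included align.h, replacing the per-file CACHE_ALIGN and C2_CACHE_ALIGN defines removed above. A minimal sketch of definitions consistent with how the patch uses them; the constant value and the exact expansion are assumptions, and the align.h in the tree is authoritative:

    /* Sketch only: plausible align.h contents, not the verbatim header. */
    #define SWIFT_CACHE_ALIGNMENT 64
    #define SWIFT_CACHE_ALIGN __attribute__((aligned(SWIFT_CACHE_ALIGNMENT)))

    /* Declare `name` as a restrict-qualified alias of `ptr` that the
     * compiler may assume is `alignment`-byte aligned. */
    #define swift_align_and_restrict_information(name, ptr, type, alignment) \
      type *restrict name = (type *)__builtin_assume_aligned(ptr, alignment)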
@@ -130,15 +129,15 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
     free(c->max_index);
   }
 
-  error += posix_memalign((void **)&c->x, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->y, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->z, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->m, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->vx, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->vy, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->vz, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->h, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->max_index, CACHE_ALIGN, sizeIntBytes);
+  error += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->y, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->z, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->m, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->vx, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->vy, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->vz, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->h, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->max_index, SWIFT_CACHE_ALIGNMENT, sizeIntBytes);
 
   if (error != 0)
     error("Couldn't allocate cache, no. of particles: %d", (int)count);
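Note: the allocation must actually deliver the alignment that the struct attributes and the load loops promise to the compiler. posix_memalign returns 0 on success and stores a pointer aligned to the requested boundary, which must be a power of two multiple of sizeof(void *). A self-contained sketch of that contract, with an assumed alignment value:

    #define _POSIX_C_SOURCE 200112L /* exposes posix_memalign in <stdlib.h> */
    #include <stdio.h>
    #include <stdlib.h>

    #define ALIGNMENT 64 /* assumed; SWIFT_CACHE_ALIGNMENT lives in align.h */

    int main(void) {
      float *x = NULL;
      /* A non-zero return value signals failure; errno is not set. */
      int err = posix_memalign((void **)&x, ALIGNMENT, 256 * sizeof(float));
      if (err != 0) {
        fprintf(stderr, "aligned allocation failed: %d\n", err);
        return 1;
      }
      /* ((uintptr_t)x % ALIGNMENT) == 0 holds here. */
      free(x);
      return 0;
    }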
@@ -152,25 +151,39 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
  * @param ci_cache The cache.
  */
 __attribute__((always_inline)) INLINE void cache_read_particles(
-    const struct cell *const ci, struct cache *const ci_cache) {
+    const struct cell *restrict const ci, struct cache *restrict const ci_cache) {
 
 #if defined(GADGET2_SPH)
 
+/* Let the compiler know that the data is aligned and create pointers to the
+ * arrays inside the cache. */
+swift_align_and_restrict_information(x, ci_cache->x, float, SWIFT_CACHE_ALIGNMENT);
+swift_align_and_restrict_information(y, ci_cache->y, float, SWIFT_CACHE_ALIGNMENT);
+swift_align_and_restrict_information(z, ci_cache->z, float, SWIFT_CACHE_ALIGNMENT);
+swift_align_and_restrict_information(h, ci_cache->h, float, SWIFT_CACHE_ALIGNMENT);
+swift_align_and_restrict_information(m, ci_cache->m, float, SWIFT_CACHE_ALIGNMENT);
+swift_align_and_restrict_information(vx, ci_cache->vx, float, SWIFT_CACHE_ALIGNMENT);
+swift_align_and_restrict_information(vy, ci_cache->vy, float, SWIFT_CACHE_ALIGNMENT);
+swift_align_and_restrict_information(vz, ci_cache->vz, float, SWIFT_CACHE_ALIGNMENT);
+
+const struct part *restrict parts = ci->parts;
+double loc[3];
+loc[0] = ci->loc[0];
+loc[1] = ci->loc[1];
+loc[2] = ci->loc[2];
+
   /* Shift the particles positions to a local frame so single precision can be
    * used instead of double precision. */
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma vector aligned
-#endif
   for (int i = 0; i < ci->count; i++) {
-    ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0];
-    ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1];
-    ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2];
-    ci_cache->h[i] = ci->parts[i].h;
-
-    ci_cache->m[i] = ci->parts[i].mass;
-    ci_cache->vx[i] = ci->parts[i].v[0];
-    ci_cache->vy[i] = ci->parts[i].v[1];
-    ci_cache->vz[i] = ci->parts[i].v[2];
+    x[i] = (float)(parts[i].x[0] - loc[0]);
+    y[i] = (float)(parts[i].x[1] - loc[1]);
+    z[i] = (float)(parts[i].x[2] - loc[2]);
+    h[i] = parts[i].h;
+
+    m[i] = parts[i].mass;
+    vx[i] = parts[i].v[0];
+    vy[i] = parts[i].v[1];
+    vz[i] = parts[i].v[2];
   }
 
 #endif
@@ -322,13 +335,13 @@ __attribute__((always_inline)) INLINE void cache_read_two_cells_sorted(
  * interaction.
  */
 __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
-    const struct cell *const ci, const struct cell *const cj,
-    struct cache *const ci_cache, struct cache *const cj_cache,
+    const struct cell *restrict const ci, const struct cell *restrict const cj,
+    struct cache *restrict const ci_cache, struct cache *restrict const cj_cache,
     const struct entry *restrict sort_i, const struct entry *restrict sort_j,
-    const double *const shift, int *first_pi, int *last_pj,
+    const double *restrict const shift, int *first_pi, int *last_pj,
     const int num_vec_proc) {
-  int idx, ci_cache_idx;
+  int idx;
 
   /* Pad number of particles read to the vector size. */
   int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE);
   if (rem != 0) {
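Note: the deleted "#pragma vector aligned" blocks were ICC-only hints, while the swift_align_and_restrict_information calls carry the same information portably, since GCC, Clang and ICC all honour __builtin_assume_aligned. A standalone illustration of the pattern, with invented names; the builtin is a promise rather than a check, so passing a misaligned pointer is undefined behaviour:

    #include <stddef.h>

    void scale(float *px, const float *py, size_t n) {
      /* Tell the compiler both arrays are 64-byte aligned and do not alias. */
      float *restrict x = (float *)__builtin_assume_aligned(px, 64);
      const float *restrict y = (const float *)__builtin_assume_aligned(py, 64);
      /* With those promises the loop can be vectorised with aligned
       * loads/stores and no runtime alignment peel. */
      for (size_t i = 0; i < n; i++) x[i] = 2.0f * y[i];
    }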
@@ -346,74 +359,95 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
   int first_pi_align = *first_pi;
   int last_pj_align = *last_pj;
-
-/* Shift the particles positions to a local frame (ci frame) so single precision
- * can be
- * used instead of double precision. Also shift the cell ci, particles positions
- * due to BCs but leave cell cj. */
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma vector aligned
-#endif
-  for (int i = first_pi_align; i < ci->count; i++) {
-    /* Make sure ci_cache is filled from the first element. */
-    ci_cache_idx = i - first_pi_align;
-    idx = sort_i[i].i;
-    ci_cache->x[ci_cache_idx] = ci->parts[idx].x[0] - ci->loc[0] - shift[0];
-    ci_cache->y[ci_cache_idx] = ci->parts[idx].x[1] - ci->loc[1] - shift[1];
-    ci_cache->z[ci_cache_idx] = ci->parts[idx].x[2] - ci->loc[2] - shift[2];
-    ci_cache->h[ci_cache_idx] = ci->parts[idx].h;
-
-    ci_cache->m[ci_cache_idx] = ci->parts[idx].mass;
-    ci_cache->vx[ci_cache_idx] = ci->parts[idx].v[0];
-    ci_cache->vy[ci_cache_idx] = ci->parts[idx].v[1];
-    ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2];
+  const struct part *restrict parts_i = ci->parts;
+  const struct part *restrict parts_j = cj->parts;
+  double loc[3];
+  loc[0] = ci->loc[0];
+  loc[1] = ci->loc[1];
+  loc[2] = ci->loc[2];
+
+/* Let the compiler know that the data is aligned and create pointers to the
+ * arrays inside the cache. */
+  swift_align_and_restrict_information(x, ci_cache->x, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(y, ci_cache->y, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(z, ci_cache->z, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(h, ci_cache->h, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(m, ci_cache->m, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vx, ci_cache->vx, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vy, ci_cache->vy, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vz, ci_cache->vz, float, SWIFT_CACHE_ALIGNMENT);
+
+  int ci_cache_count = ci->count - first_pi_align;
+  /* Shift the particles positions to a local frame (ci frame) so single
+   * precision can be used instead of double precision. Also shift the cell
+   * ci, particles positions due to BCs but leave cell cj. */
+  for (int i = 0; i < ci_cache_count; i++) {
+    idx = sort_i[i + first_pi_align].i;
+    x[i] = (float)(parts_i[idx].x[0] - loc[0] - shift[0]);
+    y[i] = (float)(parts_i[idx].x[1] - loc[1] - shift[1]);
+    z[i] = (float)(parts_i[idx].x[2] - loc[2] - shift[2]);
+    h[i] = parts_i[idx].h;
+
+    m[i] = parts_i[idx].mass;
+    vx[i] = parts_i[idx].v[0];
+    vy[i] = parts_i[idx].v[1];
+    vz[i] = parts_i[idx].v[2];
   }
 
   /* Pad cache with fake particles that exist outside the cell so will not
    * interact.*/
-  float fake_pix = 2.0f * ci->parts[sort_i[ci->count - 1].i].x[0];
+  float fake_pix = 2.0f * parts_i[sort_i[ci->count - 1].i].x[0];
   for (int i = ci->count - first_pi_align; i < ci->count - first_pi_align + VEC_SIZE; i++) {
-    ci_cache->x[i] = fake_pix;
-    ci_cache->y[i] = 1.f;
-    ci_cache->z[i] = 1.f;
-    ci_cache->h[i] = 1.f;
-
-    ci_cache->m[i] = 1.f;
-    ci_cache->vx[i] = 1.f;
-    ci_cache->vy[i] = 1.f;
-    ci_cache->vz[i] = 1.f;
+    x[i] = fake_pix;
+    y[i] = 1.f;
+    z[i] = 1.f;
+    h[i] = 1.f;
+
+    m[i] = 1.f;
+    vx[i] = 1.f;
+    vy[i] = 1.f;
+    vz[i] = 1.f;
   }
+
+/* Let the compiler know that the data is aligned and create pointers to the
+ * arrays inside the cache. */
+  swift_align_and_restrict_information(xj, cj_cache->x, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(yj, cj_cache->y, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(zj, cj_cache->z, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(hj, cj_cache->h, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(mj, cj_cache->m, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vxj, cj_cache->vx, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vyj, cj_cache->vy, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vzj, cj_cache->vz, float, SWIFT_CACHE_ALIGNMENT);
 
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma vector aligned
-#endif
   for (int i = 0; i <= last_pj_align; i++) {
     idx = sort_j[i].i;
-    cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0];
-    cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1];
-    cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2];
-    cj_cache->h[i] = cj->parts[idx].h;
-
-    cj_cache->m[i] = cj->parts[idx].mass;
-    cj_cache->vx[i] = cj->parts[idx].v[0];
-    cj_cache->vy[i] = cj->parts[idx].v[1];
-    cj_cache->vz[i] = cj->parts[idx].v[2];
+    xj[i] = (float)(parts_j[idx].x[0] - loc[0]);
+    yj[i] = (float)(parts_j[idx].x[1] - loc[1]);
+    zj[i] = (float)(parts_j[idx].x[2] - loc[2]);
+    hj[i] = parts_j[idx].h;
+
+    mj[i] = parts_j[idx].mass;
+    vxj[i] = parts_j[idx].v[0];
+    vyj[i] = parts_j[idx].v[1];
+    vzj[i] = parts_j[idx].v[2];
   }
 
   /* Pad cache with fake particles that exist outside the cell so will not
    * interact.*/
   float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0];
   for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
-    cj_cache->x[i] = fake_pjx;
-    cj_cache->y[i] = 1.f;
-    cj_cache->z[i] = 1.f;
-    cj_cache->h[i] = 1.f;
-
-    cj_cache->m[i] = 1.f;
-    cj_cache->vx[i] = 1.f;
-    cj_cache->vy[i] = 1.f;
-    cj_cache->vz[i] = 1.f;
+    xj[i] = fake_pjx;
+    yj[i] = 1.f;
+    zj[i] = 1.f;
+    hj[i] = 1.f;
+
+    mj[i] = 1.f;
+    vxj[i] = 1.f;
+    vyj[i] = 1.f;
+    vzj[i] = 1.f;
   }
 }
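Note on the padding kept (and renamed) in the two loops above: each cache is filled up to a multiple of the vector width, and the tail entries get an x position of twice the last sorted particle's unshifted coordinate, far outside the cell-local frame stored in the cache, so any separation computed against them fails the range test and they drop out of the interaction mask; the vectorised loops therefore never need a scalar remainder. A standalone sketch of the idea (VEC_SIZE and the names are assumptions, and the arrays must be allocated with room for the extra entries):

    #define VEC_SIZE 8 /* assumed vector width */

    /* Fill [count, count + VEC_SIZE) with out-of-range dummies. */
    void pad_with_fakes(float *x, float *y, float *z, int count, float max_x) {
      const float fake_x = 2.0f * max_x; /* beyond any real particle */
      for (int i = count; i < count + VEC_SIZE; i++) {
        x[i] = fake_x;
        y[i] = 1.f;
        z[i] = 1.f;
      }
    }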