Commit 280a6c95 authored by James Willis

Use generic SWIFT alignment and macros to allow auto-vectorisation of cache reads.

parent ec54107b
Merge request !404: Cache auto vec
@@ -23,6 +23,7 @@
 #include "../config.h"
 
 /* Local headers */
+#include "align.h"
 #include "cell.h"
 #include "error.h"
 #include "part.h"
@@ -30,9 +31,7 @@
 #include "vector.h"
 
 #define NUM_VEC_PROC 2
-#define CACHE_ALIGN 64
 #define C2_CACHE_SIZE (NUM_VEC_PROC * VEC_SIZE * 6) + (NUM_VEC_PROC * VEC_SIZE)
-#define C2_CACHE_ALIGN sizeof(float) * VEC_SIZE
 
 #ifdef WITH_VECTORIZATION
 
 /* Cache struct to hold a local copy of a cells' particle
@@ -40,31 +39,31 @@
 struct cache {
 
   /* Particle x position. */
-  float *restrict x __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict x SWIFT_CACHE_ALIGN;
 
   /* Particle y position. */
-  float *restrict y __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict y SWIFT_CACHE_ALIGN;
 
   /* Particle z position. */
-  float *restrict z __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict z SWIFT_CACHE_ALIGN;
 
   /* Particle smoothing length. */
-  float *restrict h __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict h SWIFT_CACHE_ALIGN;
 
   /* Particle mass. */
-  float *restrict m __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict m SWIFT_CACHE_ALIGN;
 
   /* Particle x velocity. */
-  float *restrict vx __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict vx SWIFT_CACHE_ALIGN;
 
   /* Particle y velocity. */
-  float *restrict vy __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict vy SWIFT_CACHE_ALIGN;
 
   /* Particle z velocity. */
-  float *restrict vz __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict vz SWIFT_CACHE_ALIGN;
 
   /* Maximum index into neighbouring cell for particles that are in range. */
-  int *restrict max_index __attribute__((aligned(CACHE_ALIGN)));
+  int *restrict max_index SWIFT_CACHE_ALIGN;
 
   /* Cache size. */
   int count;
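SWIFT_CACHE_ALIGN and the related macros come from the newly included align.h, which is not part of this diff. Judging by how they are used here, the definitions are presumably along these lines (a sketch under that assumption, not the actual header):

/* Hypothetical reconstruction of the align.h macros; the real header may
 * differ in names and values. */
#define SWIFT_CACHE_ALIGNMENT 64
#define SWIFT_CACHE_ALIGN __attribute__((aligned(SWIFT_CACHE_ALIGNMENT)))

/* Declare a local pointer `array` to `ptr`, promising the compiler it is
 * non-aliased (restrict) and aligned, so loops over it can be
 * auto-vectorised without runtime alignment checks or peeling. */
#define swift_align_and_restrict_information(array, ptr, type, alignment) \
  type *restrict array = (type *)__builtin_assume_aligned(ptr, alignment)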
@@ -75,28 +74,28 @@ struct cache {
 struct c2_cache {
 
   /* Separation between two particles squared. */
-  float r2q[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float r2q[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* x separation between two particles. */
-  float dxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float dxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* y separation between two particles. */
-  float dyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float dyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* z separation between two particles. */
-  float dzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float dzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* Mass of particle pj. */
-  float mq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float mq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* x velocity of particle pj. */
-  float vxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float vxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* y velocity of particle pj. */
-  float vyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float vyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* z velocity of particle pj. */
-  float vzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float vzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 };
 
 /**
@@ -130,15 +129,15 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
     free(c->max_index);
   }
 
-  error += posix_memalign((void **)&c->x, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->y, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->z, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->m, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->vx, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->vy, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->vz, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->h, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->max_index, CACHE_ALIGN, sizeIntBytes);
+  error += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->y, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->z, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->m, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->vx, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->vy, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->vz, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->h, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->max_index, SWIFT_CACHE_ALIGNMENT, sizeIntBytes);
 
   if (error != 0)
     error("Couldn't allocate cache, no. of particles: %d", (int)count);
@@ -152,25 +151,39 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
  * @param ci_cache The cache.
  */
 __attribute__((always_inline)) INLINE void cache_read_particles(
-    const struct cell *const ci, struct cache *const ci_cache) {
+    const struct cell *restrict const ci, struct cache *restrict const ci_cache) {
 
 #if defined(GADGET2_SPH)
 
+  /* Let the compiler know that the data is aligned and create pointers to the
+   * arrays inside the cache. */
+  swift_align_and_restrict_information(x, ci_cache->x, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(y, ci_cache->y, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(z, ci_cache->z, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(h, ci_cache->h, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(m, ci_cache->m, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vx, ci_cache->vx, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vy, ci_cache->vy, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vz, ci_cache->vz, float, SWIFT_CACHE_ALIGNMENT);
+
+  const struct part *restrict parts = ci->parts;
+  double loc[3];
+  loc[0] = ci->loc[0];
+  loc[1] = ci->loc[1];
+  loc[2] = ci->loc[2];
+
   /* Shift the particles positions to a local frame so single precision can be
    * used instead of double precision. */
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma vector aligned
-#endif
   for (int i = 0; i < ci->count; i++) {
-    ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0];
-    ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1];
-    ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2];
-    ci_cache->h[i] = ci->parts[i].h;
-    ci_cache->m[i] = ci->parts[i].mass;
-    ci_cache->vx[i] = ci->parts[i].v[0];
-    ci_cache->vy[i] = ci->parts[i].v[1];
-    ci_cache->vz[i] = ci->parts[i].v[2];
+    x[i] = (float)(parts[i].x[0] - loc[0]);
+    y[i] = (float)(parts[i].x[1] - loc[1]);
+    z[i] = (float)(parts[i].x[2] - loc[2]);
+    h[i] = parts[i].h;
+    m[i] = parts[i].mass;
+    vx[i] = parts[i].v[0];
+    vy[i] = parts[i].v[1];
+    vz[i] = parts[i].v[2];
   }
 
 #endif
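The removed `#pragma vector aligned` only helped ICC; the macro route conveys the same alignment and no-aliasing promises portably through restrict and __builtin_assume_aligned, which GCC, Clang and ICC all honour. A standalone sketch of the idiom (illustrative only, compile at -O3 to see vectorised code):

#define ALIGNMENT 64

/* With the aliasing and alignment promises below, the compiler can emit
 * aligned vector loads/stores for this loop without peeling and without
 * any vendor-specific pragma. */
void scale(float *src_in, float *dst_out, const int n) {
  float *restrict src = (float *)__builtin_assume_aligned(src_in, ALIGNMENT);
  float *restrict dst = (float *)__builtin_assume_aligned(dst_out, ALIGNMENT);

  for (int i = 0; i < n; i++) dst[i] = 2.0f * src[i];
}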
@@ -322,13 +335,13 @@ __attribute__((always_inline)) INLINE void cache_read_two_cells_sorted(
  * interaction.
  */
 __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
-    const struct cell *const ci, const struct cell *const cj,
-    struct cache *const ci_cache, struct cache *const cj_cache,
+    const struct cell *restrict const ci, const struct cell *restrict const cj,
+    struct cache *restrict const ci_cache, struct cache *restrict const cj_cache,
     const struct entry *restrict sort_i, const struct entry *restrict sort_j,
-    const double *const shift, int *first_pi, int *last_pj,
+    const double *restrict const shift, int *first_pi, int *last_pj,
     const int num_vec_proc) {
 
-  int idx, ci_cache_idx;
+  int idx;
 
   /* Pad number of particles read to the vector size. */
   int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE);
   if (rem != 0) {
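The `rem` computation pads the number of ci particles read down to a whole number of vectors by moving first_pi towards the cell start; the adjustment itself is elided from this hunk, but the intent is roughly the following (hypothetical helper, not the SWIFT code):

#include <stdio.h>

/* Hypothetical sketch: lower first_pi so that (count - first_pi) is a
 * whole number of vector widths. */
int pad_first_index(int count, int first_pi, int vec_width) {
  int rem = (count - first_pi) % vec_width;
  if (rem != 0) {
    int pad = vec_width - rem;
    /* Only shift down if there is room; otherwise start from 0. */
    first_pi = (first_pi - pad >= 0) ? first_pi - pad : 0;
  }
  return first_pi;
}

int main(void) {
  /* 100 particles, reading from index 23, 2 vectors of 8 floats = 16 lanes:
   * 100 - 23 = 77 = 4*16 + 13, so shift the start down by 3 to 20,
   * giving 80 = 5*16 particles to read. */
  printf("%d\n", pad_first_index(100, 23, 16));
  return 0;
}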
@@ -346,74 +359,95 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
   int first_pi_align = *first_pi;
   int last_pj_align = *last_pj;
 
+  const struct part *restrict parts_i = ci->parts;
+  const struct part *restrict parts_j = cj->parts;
+  double loc[3];
+  loc[0] = ci->loc[0];
+  loc[1] = ci->loc[1];
+  loc[2] = ci->loc[2];
+
+  /* Let the compiler know that the data is aligned and create pointers to the
+   * arrays inside the cache. */
+  swift_align_and_restrict_information(x, ci_cache->x, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(y, ci_cache->y, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(z, ci_cache->z, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(h, ci_cache->h, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(m, ci_cache->m, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vx, ci_cache->vx, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vy, ci_cache->vy, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vz, ci_cache->vz, float, SWIFT_CACHE_ALIGNMENT);
+
+  int ci_cache_count = ci->count - first_pi_align;
+
   /* Shift the particles positions to a local frame (ci frame) so single precision
    * can be
    * used instead of double precision. Also shift the cell ci, particles positions
    * due to BCs but leave cell cj. */
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma vector aligned
-#endif
-  for (int i = first_pi_align; i < ci->count; i++) {
-    /* Make sure ci_cache is filled from the first element. */
-    ci_cache_idx = i - first_pi_align;
-    idx = sort_i[i].i;
-    ci_cache->x[ci_cache_idx] = ci->parts[idx].x[0] - ci->loc[0] - shift[0];
-    ci_cache->y[ci_cache_idx] = ci->parts[idx].x[1] - ci->loc[1] - shift[1];
-    ci_cache->z[ci_cache_idx] = ci->parts[idx].x[2] - ci->loc[2] - shift[2];
-    ci_cache->h[ci_cache_idx] = ci->parts[idx].h;
-
-    ci_cache->m[ci_cache_idx] = ci->parts[idx].mass;
-    ci_cache->vx[ci_cache_idx] = ci->parts[idx].v[0];
-    ci_cache->vy[ci_cache_idx] = ci->parts[idx].v[1];
-    ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2];
+  for (int i = 0; i < ci_cache_count; i++) {
+    idx = sort_i[i + first_pi_align].i;
+    x[i] = (float)(parts_i[idx].x[0] - loc[0] - shift[0]);
+    y[i] = (float)(parts_i[idx].x[1] - loc[1] - shift[1]);
+    z[i] = (float)(parts_i[idx].x[2] - loc[2] - shift[2]);
+    h[i] = parts_i[idx].h;
+    m[i] = parts_i[idx].mass;
+    vx[i] = parts_i[idx].v[0];
+    vy[i] = parts_i[idx].v[1];
+    vz[i] = parts_i[idx].v[2];
   }
 
   /* Pad cache with fake particles that exist outside the cell so will not
    * interact.*/
-  float fake_pix = 2.0f * ci->parts[sort_i[ci->count - 1].i].x[0];
+  float fake_pix = 2.0f * parts_i[sort_i[ci->count - 1].i].x[0];
   for (int i = ci->count - first_pi_align;
        i < ci->count - first_pi_align + VEC_SIZE; i++) {
-    ci_cache->x[i] = fake_pix;
-    ci_cache->y[i] = 1.f;
-    ci_cache->z[i] = 1.f;
-    ci_cache->h[i] = 1.f;
-
-    ci_cache->m[i] = 1.f;
-    ci_cache->vx[i] = 1.f;
-    ci_cache->vy[i] = 1.f;
-    ci_cache->vz[i] = 1.f;
+    x[i] = fake_pix;
+    y[i] = 1.f;
+    z[i] = 1.f;
+    h[i] = 1.f;
+    m[i] = 1.f;
+    vx[i] = 1.f;
+    vy[i] = 1.f;
+    vz[i] = 1.f;
   }
 
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma vector aligned
-#endif
+  /* Let the compiler know that the data is aligned and create pointers to the
+   * arrays inside the cache. */
+  swift_align_and_restrict_information(xj, cj_cache->x, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(yj, cj_cache->y, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(zj, cj_cache->z, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(hj, cj_cache->h, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(mj, cj_cache->m, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vxj, cj_cache->vx, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vyj, cj_cache->vy, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vzj, cj_cache->vz, float, SWIFT_CACHE_ALIGNMENT);
+
   for (int i = 0; i <= last_pj_align; i++) {
     idx = sort_j[i].i;
-    cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0];
-    cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1];
-    cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2];
-    cj_cache->h[i] = cj->parts[idx].h;
-    cj_cache->m[i] = cj->parts[idx].mass;
-    cj_cache->vx[i] = cj->parts[idx].v[0];
-    cj_cache->vy[i] = cj->parts[idx].v[1];
-    cj_cache->vz[i] = cj->parts[idx].v[2];
+    xj[i] = (float)(parts_j[idx].x[0] - loc[0]);
+    yj[i] = (float)(parts_j[idx].x[1] - loc[1]);
+    zj[i] = (float)(parts_j[idx].x[2] - loc[2]);
+    hj[i] = parts_j[idx].h;
+    mj[i] = parts_j[idx].mass;
+    vxj[i] = parts_j[idx].v[0];
+    vyj[i] = parts_j[idx].v[1];
+    vzj[i] = parts_j[idx].v[2];
  }
 
   /* Pad cache with fake particles that exist outside the cell so will not
    * interact.*/
   float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0];
   for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
-    cj_cache->x[i] = fake_pjx;
-    cj_cache->y[i] = 1.f;
-    cj_cache->z[i] = 1.f;
-    cj_cache->h[i] = 1.f;
-    cj_cache->m[i] = 1.f;
-    cj_cache->vx[i] = 1.f;
-    cj_cache->vy[i] = 1.f;
-    cj_cache->vz[i] = 1.f;
+    xj[i] = fake_pjx;
+    yj[i] = 1.f;
+    zj[i] = 1.f;
+    hj[i] = 1.f;
+    mj[i] = 1.f;
+    vxj[i] = 1.f;
+    vyj[i] = 1.f;
+    vzj[i] = 1.f;
   }
 }
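The padding entries use an x position of twice the largest x in the sorted cell, so the distance test in the interaction kernel rejects them and the consuming loop can step over whole vectors, fake lanes included, with no scalar remainder. A toy demonstration of that invariant (all values made up):

#include <stdio.h>

#define VEC_SIZE 4 /* assumed vector width, illustration only */

int main(void) {
  /* Six real particles padded to eight; fake x = 2 * largest real x. */
  float x[8] = {0.1f, 0.4f, 0.9f, 1.3f, 1.7f, 2.0f, 4.0f, 4.0f};
  const float pix = 0.5f;   /* position of the particle we interact with */
  const float hig2 = 0.25f; /* squared interaction radius */

  int interactions = 0;
  /* Step in whole vectors: the fake lanes simply fail the distance test. */
  for (int i = 0; i < 8; i += VEC_SIZE)
    for (int j = 0; j < VEC_SIZE; j++) {
      const float dx = x[i + j] - pix;
      if (dx * dx < hig2) interactions++;
    }

  printf("%d interactions\n", interactions); /* prints 3 */
  return 0;
}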