Commit 280a6c95 authored by James Willis

Use generic SWIFT alignment and macros to allow auto-vectorisation of cache reads.

parent ec54107b
Merge request !404: Cache auto vec
@@ -23,6 +23,7 @@
 #include "../config.h"
 
 /* Local headers */
+#include "align.h"
 #include "cell.h"
 #include "error.h"
 #include "part.h"
@@ -30,9 +31,7 @@
 #include "vector.h"
 
 #define NUM_VEC_PROC 2
-#define CACHE_ALIGN 64
 #define C2_CACHE_SIZE (NUM_VEC_PROC * VEC_SIZE * 6) + (NUM_VEC_PROC * VEC_SIZE)
-#define C2_CACHE_ALIGN sizeof(float) * VEC_SIZE
 
 #ifdef WITH_VECTORIZATION
 
 /* Cache struct to hold a local copy of a cells' particle
@@ -40,31 +39,31 @@
 struct cache {
 
   /* Particle x position. */
-  float *restrict x __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict x SWIFT_CACHE_ALIGN;
 
   /* Particle y position. */
-  float *restrict y __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict y SWIFT_CACHE_ALIGN;
 
   /* Particle z position. */
-  float *restrict z __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict z SWIFT_CACHE_ALIGN;
 
   /* Particle smoothing length. */
-  float *restrict h __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict h SWIFT_CACHE_ALIGN;
 
   /* Particle mass. */
-  float *restrict m __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict m SWIFT_CACHE_ALIGN;
 
   /* Particle x velocity. */
-  float *restrict vx __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict vx SWIFT_CACHE_ALIGN;
 
   /* Particle y velocity. */
-  float *restrict vy __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict vy SWIFT_CACHE_ALIGN;
 
   /* Particle z velocity. */
-  float *restrict vz __attribute__((aligned(CACHE_ALIGN)));
+  float *restrict vz SWIFT_CACHE_ALIGN;
 
   /* Maximum index into neighbouring cell for particles that are in range. */
-  int *restrict max_index __attribute__((aligned(CACHE_ALIGN)));
+  int *restrict max_index SWIFT_CACHE_ALIGN;
 
   /* Cache size. */
   int count;
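SWIFT_CACHE_ALIGN and the related macros come from the newly included align.h, which is not part of this diff. Judging by how they are used here, the definitions are presumably along these lines (a sketch under that assumption, not the actual header):

/* Hypothetical reconstruction of the align.h macros; the real header may
 * differ in names and values. */
#define SWIFT_CACHE_ALIGNMENT 64
#define SWIFT_CACHE_ALIGN __attribute__((aligned(SWIFT_CACHE_ALIGNMENT)))

/* Declare a local pointer `array` to `ptr`, promising the compiler it is
 * non-aliased (restrict) and aligned, so loops over it can be
 * auto-vectorised without runtime alignment checks or peeling. */
#define swift_align_and_restrict_information(array, ptr, type, alignment) \
  type *restrict array = (type *)__builtin_assume_aligned(ptr, alignment)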
@@ -75,28 +74,28 @@ struct cache {
 struct c2_cache {
 
   /* Separation between two particles squared. */
-  float r2q[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float r2q[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* x separation between two particles. */
-  float dxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float dxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* y separation between two particles. */
-  float dyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float dyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* z separation between two particles. */
-  float dzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float dzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* Mass of particle pj. */
-  float mq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float mq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* x velocity of particle pj. */
-  float vxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float vxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* y velocity of particle pj. */
-  float vyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float vyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 
   /* z velocity of particle pj. */
-  float vzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
+  float vzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
 };
 
 /**
@@ -130,15 +129,15 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
     free(c->max_index);
   }
 
-  error += posix_memalign((void **)&c->x, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->y, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->z, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->m, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->vx, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->vy, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->vz, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->h, CACHE_ALIGN, sizeBytes);
-  error += posix_memalign((void **)&c->max_index, CACHE_ALIGN, sizeIntBytes);
+  error += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->y, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->z, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->m, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->vx, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->vy, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->vz, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->h, SWIFT_CACHE_ALIGNMENT, sizeBytes);
+  error += posix_memalign((void **)&c->max_index, SWIFT_CACHE_ALIGNMENT, sizeIntBytes);
 
   if (error != 0)
     error("Couldn't allocate cache, no. of particles: %d", (int)count);
@@ -152,25 +151,39 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
  * @param ci_cache The cache.
  */
 __attribute__((always_inline)) INLINE void cache_read_particles(
-    const struct cell *const ci, struct cache *const ci_cache) {
+    const struct cell *restrict const ci, struct cache *restrict const ci_cache) {
 
 #if defined(GADGET2_SPH)
 
+  /* Let the compiler know that the data is aligned and create pointers to the
+   * arrays inside the cache. */
+  swift_align_and_restrict_information(x, ci_cache->x, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(y, ci_cache->y, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(z, ci_cache->z, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(h, ci_cache->h, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(m, ci_cache->m, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vx, ci_cache->vx, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vy, ci_cache->vy, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vz, ci_cache->vz, float, SWIFT_CACHE_ALIGNMENT);
+
+  const struct part *restrict parts = ci->parts;
+  double loc[3];
+  loc[0] = ci->loc[0];
+  loc[1] = ci->loc[1];
+  loc[2] = ci->loc[2];
+
   /* Shift the particles positions to a local frame so single precision can be
    * used instead of double precision. */
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma vector aligned
-#endif
   for (int i = 0; i < ci->count; i++) {
-    ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0];
-    ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1];
-    ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2];
-    ci_cache->h[i] = ci->parts[i].h;
-    ci_cache->m[i] = ci->parts[i].mass;
-    ci_cache->vx[i] = ci->parts[i].v[0];
-    ci_cache->vy[i] = ci->parts[i].v[1];
-    ci_cache->vz[i] = ci->parts[i].v[2];
+    x[i] = (float)(parts[i].x[0] - loc[0]);
+    y[i] = (float)(parts[i].x[1] - loc[1]);
+    z[i] = (float)(parts[i].x[2] - loc[2]);
+    h[i] = parts[i].h;
+    m[i] = parts[i].mass;
+    vx[i] = parts[i].v[0];
+    vy[i] = parts[i].v[1];
+    vz[i] = parts[i].v[2];
   }
 
 #endif
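The removed `#pragma vector aligned` only helped ICC; the macro route conveys the same alignment and no-aliasing promises portably through restrict and __builtin_assume_aligned, which GCC, Clang and ICC all honour. A standalone sketch of the idiom (illustrative only, compile at -O3 to see vectorised code):

#define ALIGNMENT 64

/* With the aliasing and alignment promises below, the compiler can emit
 * aligned vector loads/stores for this loop without peeling and without
 * any vendor-specific pragma. */
void scale(float *src_in, float *dst_out, const int n) {
  float *restrict src = (float *)__builtin_assume_aligned(src_in, ALIGNMENT);
  float *restrict dst = (float *)__builtin_assume_aligned(dst_out, ALIGNMENT);

  for (int i = 0; i < n; i++) dst[i] = 2.0f * src[i];
}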
@@ -322,13 +335,13 @@ __attribute__((always_inline)) INLINE void cache_read_two_cells_sorted(
  * interaction.
  */
 __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
-    const struct cell *const ci, const struct cell *const cj,
-    struct cache *const ci_cache, struct cache *const cj_cache,
+    const struct cell *restrict const ci, const struct cell *restrict const cj,
+    struct cache *restrict const ci_cache, struct cache *restrict const cj_cache,
     const struct entry *restrict sort_i, const struct entry *restrict sort_j,
-    const double *const shift, int *first_pi, int *last_pj,
+    const double *restrict const shift, int *first_pi, int *last_pj,
     const int num_vec_proc) {
 
-  int idx, ci_cache_idx;
+  int idx;
 
   /* Pad number of particles read to the vector size. */
   int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE);
   if (rem != 0) {
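The `rem` computation pads the number of ci particles read down to a whole number of vectors by moving first_pi towards the cell start; the adjustment itself is elided from this hunk, but the intent is roughly the following (hypothetical helper, not the SWIFT code):

#include <stdio.h>

/* Hypothetical sketch: lower first_pi so that (count - first_pi) is a
 * whole number of vector widths. */
int pad_first_index(int count, int first_pi, int vec_width) {
  int rem = (count - first_pi) % vec_width;
  if (rem != 0) {
    int pad = vec_width - rem;
    /* Only shift down if there is room; otherwise start from 0. */
    first_pi = (first_pi - pad >= 0) ? first_pi - pad : 0;
  }
  return first_pi;
}

int main(void) {
  /* 100 particles, reading from index 23, 2 vectors of 8 floats = 16 lanes:
   * 100 - 23 = 77 = 4*16 + 13, so shift the start down by 3 to 20,
   * giving 80 = 5*16 particles to read. */
  printf("%d\n", pad_first_index(100, 23, 16));
  return 0;
}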
@@ -346,74 +359,95 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
   int first_pi_align = *first_pi;
   int last_pj_align = *last_pj;
 
+  const struct part *restrict parts_i = ci->parts;
+  const struct part *restrict parts_j = cj->parts;
+  double loc[3];
+  loc[0] = ci->loc[0];
+  loc[1] = ci->loc[1];
+  loc[2] = ci->loc[2];
+
+  /* Let the compiler know that the data is aligned and create pointers to the
+   * arrays inside the cache. */
+  swift_align_and_restrict_information(x, ci_cache->x, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(y, ci_cache->y, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(z, ci_cache->z, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(h, ci_cache->h, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(m, ci_cache->m, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vx, ci_cache->vx, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vy, ci_cache->vy, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vz, ci_cache->vz, float, SWIFT_CACHE_ALIGNMENT);
+
+  int ci_cache_count = ci->count - first_pi_align;
+
   /* Shift the particles positions to a local frame (ci frame) so single precision
    * can be
    * used instead of double precision. Also shift the cell ci, particles positions
    * due to BCs but leave cell cj. */
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma vector aligned
-#endif
-  for (int i = first_pi_align; i < ci->count; i++) {
-    /* Make sure ci_cache is filled from the first element. */
-    ci_cache_idx = i - first_pi_align;
-    idx = sort_i[i].i;
-    ci_cache->x[ci_cache_idx] = ci->parts[idx].x[0] - ci->loc[0] - shift[0];
-    ci_cache->y[ci_cache_idx] = ci->parts[idx].x[1] - ci->loc[1] - shift[1];
-    ci_cache->z[ci_cache_idx] = ci->parts[idx].x[2] - ci->loc[2] - shift[2];
-    ci_cache->h[ci_cache_idx] = ci->parts[idx].h;
-
-    ci_cache->m[ci_cache_idx] = ci->parts[idx].mass;
-    ci_cache->vx[ci_cache_idx] = ci->parts[idx].v[0];
-    ci_cache->vy[ci_cache_idx] = ci->parts[idx].v[1];
-    ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2];
+  for (int i = 0; i < ci_cache_count; i++) {
+    idx = sort_i[i + first_pi_align].i;
+    x[i] = (float)(parts_i[idx].x[0] - loc[0] - shift[0]);
+    y[i] = (float)(parts_i[idx].x[1] - loc[1] - shift[1]);
+    z[i] = (float)(parts_i[idx].x[2] - loc[2] - shift[2]);
+    h[i] = parts_i[idx].h;
+    m[i] = parts_i[idx].mass;
+    vx[i] = parts_i[idx].v[0];
+    vy[i] = parts_i[idx].v[1];
+    vz[i] = parts_i[idx].v[2];
   }
 
   /* Pad cache with fake particles that exist outside the cell so will not
    * interact.*/
-  float fake_pix = 2.0f * ci->parts[sort_i[ci->count - 1].i].x[0];
+  float fake_pix = 2.0f * parts_i[sort_i[ci->count - 1].i].x[0];
   for (int i = ci->count - first_pi_align;
        i < ci->count - first_pi_align + VEC_SIZE; i++) {
-    ci_cache->x[i] = fake_pix;
-    ci_cache->y[i] = 1.f;
-    ci_cache->z[i] = 1.f;
-    ci_cache->h[i] = 1.f;
-
-    ci_cache->m[i] = 1.f;
-    ci_cache->vx[i] = 1.f;
-    ci_cache->vy[i] = 1.f;
-    ci_cache->vz[i] = 1.f;
+    x[i] = fake_pix;
+    y[i] = 1.f;
+    z[i] = 1.f;
+    h[i] = 1.f;
+    m[i] = 1.f;
+    vx[i] = 1.f;
+    vy[i] = 1.f;
+    vz[i] = 1.f;
   }
 
-#if defined(WITH_VECTORIZATION) && defined(__ICC)
-#pragma vector aligned
-#endif
+  /* Let the compiler know that the data is aligned and create pointers to the
+   * arrays inside the cache. */
+  swift_align_and_restrict_information(xj, cj_cache->x, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(yj, cj_cache->y, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(zj, cj_cache->z, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(hj, cj_cache->h, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(mj, cj_cache->m, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vxj, cj_cache->vx, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vyj, cj_cache->vy, float, SWIFT_CACHE_ALIGNMENT);
+  swift_align_and_restrict_information(vzj, cj_cache->vz, float, SWIFT_CACHE_ALIGNMENT);
+
   for (int i = 0; i <= last_pj_align; i++) {
     idx = sort_j[i].i;
-    cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0];
-    cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1];
-    cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2];
-    cj_cache->h[i] = cj->parts[idx].h;
-    cj_cache->m[i] = cj->parts[idx].mass;
-    cj_cache->vx[i] = cj->parts[idx].v[0];
-    cj_cache->vy[i] = cj->parts[idx].v[1];
-    cj_cache->vz[i] = cj->parts[idx].v[2];
+    xj[i] = (float)(parts_j[idx].x[0] - loc[0]);
+    yj[i] = (float)(parts_j[idx].x[1] - loc[1]);
+    zj[i] = (float)(parts_j[idx].x[2] - loc[2]);
+    hj[i] = parts_j[idx].h;
+    mj[i] = parts_j[idx].mass;
+    vxj[i] = parts_j[idx].v[0];
+    vyj[i] = parts_j[idx].v[1];
+    vzj[i] = parts_j[idx].v[2];
  }
 
   /* Pad cache with fake particles that exist outside the cell so will not
    * interact.*/
   float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0];
   for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
-    cj_cache->x[i] = fake_pjx;
-    cj_cache->y[i] = 1.f;
-    cj_cache->z[i] = 1.f;
-    cj_cache->h[i] = 1.f;
-    cj_cache->m[i] = 1.f;
-    cj_cache->vx[i] = 1.f;
-    cj_cache->vy[i] = 1.f;
-    cj_cache->vz[i] = 1.f;
+    xj[i] = fake_pjx;
+    yj[i] = 1.f;
+    zj[i] = 1.f;
+    hj[i] = 1.f;
+    mj[i] = 1.f;
+    vxj[i] = 1.f;
+    vyj[i] = 1.f;
+    vzj[i] = 1.f;
   }
 }
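The padding entries use an x position of twice the largest x in the sorted cell, so the distance test in the interaction kernel rejects them and the consuming loop can step over whole vectors, fake lanes included, with no scalar remainder. A toy demonstration of that invariant (all values made up):

#include <stdio.h>

#define VEC_SIZE 4 /* assumed vector width, illustration only */

int main(void) {
  /* Six real particles padded to eight; fake x = 2 * largest real x. */
  float x[8] = {0.1f, 0.4f, 0.9f, 1.3f, 1.7f, 2.0f, 4.0f, 4.0f};
  const float pix = 0.5f;   /* position of the particle we interact with */
  const float hig2 = 0.25f; /* squared interaction radius */

  int interactions = 0;
  /* Step in whole vectors: the fake lanes simply fail the distance test. */
  for (int i = 0; i < 8; i += VEC_SIZE)
    for (int j = 0; j < VEC_SIZE; j++) {
      const float dx = x[i + j] - pix;
      if (dx * dx < hig2) interactions++;
    }

  printf("%d interactions\n", interactions); /* prints 3 */
  return 0;
}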