From 2aee3db73cec01cbaa793b6b8b810de39c33e7c5 Mon Sep 17 00:00:00 2001
From: James Willis <james.s.willis@durham.ac.uk>
Date: Wed, 15 Mar 2017 11:42:42 +0000
Subject: [PATCH] Pad cache for self-interactions when reading the cache
 instead of on the fly.

---
 src/cache.h             | 11 +++++++++++
 src/runner_doiact_vec.c |  7 -------
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/src/cache.h b/src/cache.h
index facbb1c4e7..db85216b7d 100644
--- a/src/cache.h
+++ b/src/cache.h
@@ -157,6 +157,9 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
 
   /* Shift the particles positions to a local frame so single precision can be
    * used instead of double precision. */
+#if defined(WITH_VECTORIZATION) && defined(__ICC)
+#pragma simd
+#endif
   for (int i = 0; i < ci->count; i++) {
     ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0];
     ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1];
@@ -169,6 +172,11 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
     ci_cache->vz[i] = ci->parts[i].v[2];
   }
 
+  /* Pad cache with fake particles that exist outside the cell so they will not interact. */
+  float fake_pix = 2.0f * ci->width[0] * ci->parts[ci->count - 1].x[0];
+  for (int i = ci->count; i < ci->count + (2 * VEC_SIZE); i++)
+    ci_cache->x[i] = fake_pix;
+
 #endif
 }
 
@@ -364,6 +372,8 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
     ci_cache->vy[ci_cache_idx] = ci->parts[idx].v[1];
     ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2];
   }
+
+  /* Pad cache with fake particles that exist outside the cell so they will not interact. */
   float fake_pix = 2.0f * ci_cache->x[ci->count - 1];
   for (int i = ci->count - first_pi_align;
        i < ci->count - first_pi_align + VEC_SIZE; i++)
@@ -385,6 +395,7 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
     cj_cache->vz[i] = cj->parts[idx].v[2];
   }
 
+  /* Pad cache with fake particles that exist outside the cell so they will not interact. */
   float fake_pjx = 2.0f * cj_cache->x[last_pj_align];
   for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++)
     cj_cache->x[i] = fake_pjx;
diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c
index df2a1074f8..c29de502c4 100644
--- a/src/runner_doiact_vec.c
+++ b/src/runner_doiact_vec.c
@@ -445,13 +445,6 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
       int pad = (num_vec_proc * VEC_SIZE) - rem;
 
       count_align += pad;
-      /* Set positions to the same as particle pi so when the r2 > 0 mask is
-       * applied these extra contributions are masked out.*/
-      for (int i = count; i < count_align; i++) {
-        cell_cache->x[i] = pix.f[0];
-        cell_cache->y[i] = piy.f[0];
-        cell_cache->z[i] = piz.f[0];
-      }
     }
 
     vector pjx, pjy, pjz;
-- 
GitLab