From 134540fbd8f7d418060b5435cdcef5e5dbb72fb1 Mon Sep 17 00:00:00 2001
From: James Willis <james.s.willis@durham.ac.uk>
Date: Tue, 14 Mar 2017 14:21:30 +0000
Subject: [PATCH] Fixed bug with finding the exit iteration of the inner loops,
 -1 and +1 to value. Replaced vec_load with vec_unaligned_load for GCC. Pad
 cache correctly when the number of iterations of the inner loop is not a
 multiple of the vector length.

---
 src/runner_doiact_vec.c | 58 +++++++++++++++++++++++++++++++----------
 1 file changed, 44 insertions(+), 14 deletions(-)

diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c
index 223bf3c625..b01facceff 100644
--- a/src/runner_doiact_vec.c
+++ b/src/runner_doiact_vec.c
@@ -1059,7 +1059,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
 
   int first_pi_align = first_pi;
   int last_pj_align = last_pj;
-  cache_read_two_cells_sorted_2(ci, cj, ci_cache, cj_cache, sort_i, sort_j, shift, &first_pi_align, &last_pj_align, num_vec_proc);
+  cache_read_two_partial_cells_sorted(ci, cj, ci_cache, cj_cache, sort_i, sort_j, shift, &first_pi_align, &last_pj_align, num_vec_proc);
 
   /* Loop over the parts in ci. */
   for (int pid = count_i - 1; pid >= first_pi && max_ind_j >= 0; pid--) {
@@ -1074,7 +1074,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
 
       dj = sort_j[max_ind_j].d;
     }
-    int exit_iteration = max_ind_j;    
+    int exit_iteration = max_ind_j + 1;    
 
     int ci_cache_idx = pid - first_pi_align; //sort_i[pid].i;
 
@@ -1121,8 +1121,18 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
     if (rem != 0) {
       int pad = (num_vec_proc * VEC_SIZE) - rem;
 
-      if (exit_iteration_align + pad <= last_pj_align + 1)
+      if (exit_iteration_align + pad <= last_pj_align + 1) {
+        exit_iteration_align += pad;
+      }
+      else {
         exit_iteration_align += pad;
+        for(int i=last_pj_align + 1; i<exit_iteration_align; i++) {
+          cj_cache->x[i] = pix.f[0] + 2.0f * hi * kernel_gamma;
+          cj_cache->y[i] = 0.f;
+          cj_cache->z[i] = 0.f;
+        }
+
+      }
     }
 
     vector pjx, pjy, pjz;
@@ -1136,9 +1146,9 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
       vector v_dx, v_dy, v_dz, v_r2;
 
       /* Load 2 sets of vectors from the particle cache. */
-      pjx.v = vec_load(&cj_cache->x[cj_cache_idx]);
-      pjy.v = vec_load(&cj_cache->y[cj_cache_idx]);
-      pjz.v = vec_load(&cj_cache->z[cj_cache_idx]);
+      pjx.v = vec_unaligned_load(&cj_cache->x[cj_cache_idx]);
+      pjy.v = vec_unaligned_load(&cj_cache->y[cj_cache_idx]);
+      pjz.v = vec_unaligned_load(&cj_cache->z[cj_cache_idx]);
 
       /* Compute the pairwise distance. */
       v_dx.v = vec_sub(pix.v, pjx.v);
@@ -1197,7 +1207,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
 
       di = sort_i[max_ind_i].d;
     }
-    int exit_iteration = max_ind_i;
+    int exit_iteration = max_ind_i - 1;
     
     int cj_cache_idx = pjd;
 
@@ -1245,8 +1255,16 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
     if (rem != 0) {
       int pad = (num_vec_proc * VEC_SIZE) - rem;
 
-      if (exit_iteration_align - pad >= first_pi_align)
+      if (exit_iteration_align - pad >= first_pi_align) {
         exit_iteration_align -= pad;
+      }
+      else {
+        for(int i=count_i - first_pi_align; i<count_i - first_pi_align + pad; i++) {
+            ci_cache->x[i] = pjx.f[0] + 2.0f * hj * kernel_gamma;
+            ci_cache->y[i] = 0.f;
+            ci_cache->z[i] = 0.f;
+        }
+      }
     }
 
     vector pix, piy, piz;
@@ -1260,9 +1278,9 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
       vector v_dx, v_dy, v_dz, v_r2;
 
       /* Load 2 sets of vectors from the particle cache. */
-      pix.v = vec_load(&ci_cache->x[ci_cache_idx]);
-      piy.v = vec_load(&ci_cache->y[ci_cache_idx]);
-      piz.v = vec_load(&ci_cache->z[ci_cache_idx]);
+      pix.v = vec_unaligned_load(&ci_cache->x[ci_cache_idx]);
+      piy.v = vec_unaligned_load(&ci_cache->y[ci_cache_idx]);
+      piz.v = vec_unaligned_load(&ci_cache->z[ci_cache_idx]);
     
       /* Compute the pairwise distance. */
       v_dx.v = vec_sub(pjx.v, pix.v);
@@ -1431,7 +1449,7 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell
 
       dj = sort_j[max_ind_j].d;
     }
-    int exit_iteration = max_ind_j;    
+    int exit_iteration = max_ind_j + 1;    
 
     int ci_cache_idx = pid; //sort_i[pid].i;
 
@@ -1622,7 +1640,7 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell
 
       di = sort_i[max_ind_i].d;
     }
-    int exit_iteration = max_ind_i;
+    int exit_iteration = max_ind_i - 1;
     
     int cj_cache_idx = pjd;
 
@@ -1772,6 +1790,18 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell
 
   } /* loop over the parts in ci. */
 
+  if(face) {
+    faceCtr++;
+    message("Total number of face interactions: %d, average per particle: %f, number tested: %d.", faceIntCount, ((float)faceIntCount) / ((float)numFaceTested), numFaceTested);
+  }
+  else if(edge) {
+    edgeCtr++;
+    message("Total number of edge interactions: %d, average per particle: %f, number tested: %d", edgeIntCount, ((float)edgeIntCount) / ((float)numEdgeTested), numEdgeTested);
+  }
+  else if(corner) {
+    cornerCtr++;
+    message("Total number of corner interactions: %d, average per particle: %f, number tested: %d", cornerIntCount, ((float)cornerIntCount) / ((float)numCornerTested), numCornerTested);
+  }
   TIMER_TOC(timer_dopair_density);
 
 #endif /* WITH_VECTORIZATION */
@@ -1868,7 +1898,7 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell
   last_pj = max(last_pj, max_ind_j); 
   first_pi = min(first_pi, max_ind_i);
 
-  cache_read_two_cells_sorted_2(ci, cj, ci_cache, cj_cache, sort_i, sort_j, shift, &first_pi, &last_pj, num_vec_proc);
+  cache_read_two_partial_cells_sorted(ci, cj, ci_cache, cj_cache, sort_i, sort_j, shift, &first_pi, &last_pj, num_vec_proc);
 
   /* Loop over the parts in ci. */
   for (int pid = count_i - 1; pid >= first_pi && max_ind_j >= 0; pid-=2) {
-- 
GitLab