From 930ed971d0626f1c15fb20f26d8a8d3c3b0b67ff Mon Sep 17 00:00:00 2001
From: Pedro Gonnet Anders <gonnet@localhost>
Date: Wed, 14 Jan 2015 11:42:44 +0100
Subject: [PATCH] merged-in changes made in the svn last night, i.e. added the
 single-precision code from test_bh.c and cleaned up the multipole interaction
 computation a bit.

---
 examples/test_bh_sorted.c | 209 ++++++++++++++++++++++----------------
 1 file changed, 124 insertions(+), 85 deletions(-)

diff --git a/examples/test_bh_sorted.c b/examples/test_bh_sorted.c
index 185377b..de3928d 100644
--- a/examples/test_bh_sorted.c
+++ b/examples/test_bh_sorted.c
@@ -25,6 +25,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <strings.h>
 #include <unistd.h>
 #include <math.h>
 #include <float.h>
@@ -37,7 +38,7 @@
 
 /* Some local constants. */
 #define cell_pool_grow 1000
-#define cell_maxparts 1
+#define cell_maxparts 100
 #define task_limit 1e8
 #define const_G 1    // 6.6738e-8
 #define dist_min 0.5 /* Used for legacy walk only */
@@ -61,6 +62,13 @@ struct part {
   int id;
 };  // __attribute__((aligned(64)));
 
+struct part_local {
+  float x[3];
+  float a[3];
+  float mass;
+  float d;
+} __attribute__((aligned(32)));
+
 /** Data structure for the sorted particle positions. */
 struct index {
   int ind;
@@ -121,31 +129,21 @@ struct multipole {
 };
 
 static inline void multipole_reset(struct multipole *d) {
-  d->mu[0] = 0.0;
-  d->mu[1] = 0.0;
-  d->mu[2] = 0.0;
-
-  d->sigma_00 = 0.0;
-  d->sigma_11 = 0.0;
-  d->sigma_22 = 0.0;
-  d->sigma_01 = 0.0;
-  d->sigma_02 = 0.0;
-  d->sigma_12 = 0.0;
-
-  d->mass = 0.0;
+  bzero(d, sizeof(struct multipole));
 }
 
 static inline void multipole_add(struct multipole *d, const float *x, float mass) {
-  d->mu[0] += x[0] * mass;
-  d->mu[1] += x[1] * mass;
-  d->mu[2] += x[2] * mass;
+  float xm[3] = { x[0] * mass, x[1] * mass, x[2] * mass };
+  d->mu[0] += xm[0];
+  d->mu[1] += xm[1];
+  d->mu[2] += xm[2];
   
-  d->sigma_00 += mass * x[0] * x[0];
-  d->sigma_11 += mass * x[1] * x[1];
-  d->sigma_22 += mass * x[2] * x[2];
-  d->sigma_01 += mass * x[0] * x[1];
-  d->sigma_02 += mass * x[0] * x[2];
-  d->sigma_12 += mass * x[1] * x[2];
+  d->sigma_00 += xm[0] * x[0];
+  d->sigma_11 += xm[1] * x[1];
+  d->sigma_22 += xm[2] * x[2];
+  d->sigma_01 += xm[0] * x[1];
+  d->sigma_02 += xm[0] * x[2];
+  d->sigma_12 += xm[1] * x[2];
 
   d->mass += mass;
 }
@@ -154,47 +152,48 @@ static inline void multipole_add(struct multipole *d, const float *x, float mass
 static inline void multipole_iact(struct multipole *d, const float *x, float* a) {
 
   int k;
-  float r2,ir,w, dx[3];
+  float r2,ir, dx[3];
 
   /* early abort */
-  if (d->mass == 0.) return;
+  if (d->mass == 0.0f) return;
 
-  float inv_mass = 1. / d->mass;
+  float inv_mass = 1.0f / d->mass;
+  float mu[3] = {d->mu[0], d->mu[1], d->mu[2]};
   
-  /* Temporary quantity */
-  dx[0] = d->sigma_00 - inv_mass*d->mu[0]*d->mu[0];  /* Abusing a so far unused */
-  dx[1] = d->sigma_11 - inv_mass*d->mu[1]*d->mu[1];  /* variable temporarily... */
-  dx[2] = d->sigma_22 - inv_mass*d->mu[2]*d->mu[2];
+  /* Temporary quantities. */
+  dx[0] = d->sigma_00 - inv_mass*mu[0]*mu[0];  /* Abusing a so far unused */
+  dx[1] = d->sigma_11 - inv_mass*mu[1]*mu[1];  /* variable temporarily... */
+  dx[2] = d->sigma_22 - inv_mass*mu[2]*mu[2];
 
   /* Construct quadrupole matrix */
   float Q_00 = 2.0f*dx[0] - dx[1] - dx[2]; 
   float Q_11 = 2.0f*dx[1] - dx[0] - dx[2]; 
-  float Q_22 = Q_00 + Q_11; /* Q_ij is traceless */
-  Q_22 *= -1.0f;
+  float Q_22 = -(Q_00 + Q_11); /* Q_ij is traceless */
+  // Q_22 *= -1.0f;
 
-  float Q_01 = d->sigma_01 - inv_mass*d->mu[0]*d->mu[1];
-  float Q_02 = d->sigma_02 - inv_mass*d->mu[0]*d->mu[2];
-  float Q_12 = d->sigma_12 - inv_mass*d->mu[1]*d->mu[2];
+  float Q_01 = d->sigma_01 - inv_mass*mu[0]*mu[1];
+  float Q_02 = d->sigma_02 - inv_mass*mu[0]*mu[2];
+  float Q_12 = d->sigma_12 - inv_mass*mu[1]*mu[2];
   Q_01 *= 3.0f;
   Q_02 *= 3.0f;
   Q_12 *= 3.0f;
 
   /* Compute the distance to the CoM. */
   for (r2 = 0.0f, k = 0; k < 3; k++) {
-    dx[k] = x[k] - d->mu[k] * inv_mass;
+    dx[k] = x[k] - mu[k] * inv_mass;
     r2 += dx[k] * dx[k];
   }
   ir = 1.0f / sqrtf(r2);
 
   /* Compute the acceleration from the monopole */
-  w = const_G * ir * ir * ir * d->mass;
-  for (k = 0; k < 3; k++) a[k] -= w * dx[k];
+  float ir2 = ir * ir;
+  float ir3G = ir * ir2 * const_G;
+  // float w0 = ir3G * d->mass;
 
 
   /* Compute some helpful terms */
-  float w1 = const_G * ir * ir * ir * ir * ir; 
-  float w2 = -2.5f * const_G * ir * ir * ir * ir * ir * ir * ir;
-  float xi = 0.;
+  float w1 = ir3G * ir2; 
+  float xi = 0.0f;
 
   xi += dx[0] * dx[0] * Q_00;
   xi += dx[1] * dx[1] * Q_11;
@@ -202,11 +201,13 @@ static inline void multipole_iact(struct multipole *d, const float *x, float* a)
   xi += 2.f * dx[0] * dx[1] * Q_01;
   xi += 2.f * dx[0] * dx[2] * Q_02;
   xi += 2.f * dx[1] * dx[2] * Q_12;
+  
+  float w2 = ir3G * (-2.5f * ir2 * ir2 * xi - d->mass);
 
   /* Compute the acceleration from the quadrupole */    
-  a[0] += w2 * xi * dx[0] + w1 * (dx[0]*Q_00 + dx[1]*Q_01 + dx[2]*Q_02);
-  a[1] += w2 * xi * dx[1] + w1 * (dx[0]*Q_01 + dx[1]*Q_11 + dx[2]*Q_12);
-  a[2] += w2 * xi * dx[2] + w1 * (dx[0]*Q_02 + dx[1]*Q_12 + dx[2]*Q_22);
+  a[0] += w2 * dx[0] + w1 * (dx[0]*Q_00 + dx[1]*Q_01 + dx[2]*Q_02);
+  a[1] += w2 * dx[1] + w1 * (dx[0]*Q_01 + dx[1]*Q_11 + dx[2]*Q_12);
+  a[2] += w2 * dx[2] + w1 * (dx[0]*Q_02 + dx[1]*Q_12 + dx[2]*Q_22);
 
 }
 
@@ -802,10 +803,11 @@ void comp_com(struct cell *c) {
 /**
  * @brief Interacts all particles in ci with the monopole in cj
  */
-static inline void make_interact_pc(struct cell *leaf, struct cell *cj) {
+static inline void make_interact_pc(struct part_local *parts, int count,
+                                    double *loc, struct cell *cj) {
 
   int j, k;
-  double com[3] = {0.0, 0.0, 0.0};
+  float com[3] = {0.0, 0.0, 0.0};
   float mcom, dx[3] = {0.0, 0.0, 0.0}, r2, ir, w;
 
 #ifdef SANITY_CHECKS
@@ -827,22 +829,22 @@ static inline void make_interact_pc(struct cell *leaf, struct cell *cj) {
 #endif
 
   /* Init the com's data. */
-  for (k = 0; k < 3; k++) com[k] = cj->new.com[k];
+  for (k = 0; k < 3; k++) com[k] = cj->new.com[k] - loc[k];
   mcom = cj->new.mass;
 
   /* Loop over every particle in leaf. */
-  for (j = 0; j < leaf->count; j++) {
+  for (j = 0; j < count; j++) {
 
     /* Compute the pairwise distance. */
     for (r2 = 0.0, k = 0; k < 3; k++) {
-      dx[k] = com[k] - leaf->parts[j].x[k];
+      dx[k] = com[k] - parts[j].x[k];
       r2 += dx[k] * dx[k];
     }
 
     /* Apply the gravitational acceleration. */
     ir = 1.0f / sqrtf(r2);
     w = mcom * const_G * ir * ir * ir;
-    for (k = 0; k < 3; k++) leaf->parts[j].a[k] += w * dx[k];
+    for (k = 0; k < 3; k++) parts[j].a[k] += w * dx[k];
 
 #if ICHECK >= 0
     if (leaf->parts[j].id == ICHECK)
@@ -873,21 +875,19 @@ static inline int is_inside(struct cell *leaf, struct cell *c) {
 static inline int are_neighbours_different_size(struct cell *ci,
                                                 struct cell *cj) {
 
-  int k;
-  float dx[3];
   double cih = ci->h, cjh = cj->h;
 
   /* Maximum allowed distance */
   float min_dist = 0.5 * (cih + cjh);
 
   /* (Manhattan) Distance between the cells */
-  for (k = 0; k < 3; k++) {
+  for (int k = 0; k < 3; k++) {
     float center_i = ci->loc[k] + 0.5 * cih;
     float center_j = cj->loc[k] + 0.5 * cjh;
-    dx[k] = fabs(center_i - center_j);
+    if (fabsf(center_i - center_j) > min_dist) return 0;
   }
 
-  return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
+  return 1;
 }
 
 /**
@@ -896,9 +896,6 @@ static inline int are_neighbours_different_size(struct cell *ci,
  */
 static inline int are_neighbours(struct cell *ci, struct cell *cj) {
 
-  int k;
-  float dx[3];
-
 #ifdef SANITY_CHECKS
   if (ci->h != cj->h)
     error(" Cells of different size in distance calculation.");
@@ -908,25 +905,26 @@ static inline int are_neighbours(struct cell *ci, struct cell *cj) {
   float min_dist = ci->h;
 
   /* (Manhattan) Distance between the cells */
-  for (k = 0; k < 3; k++) {
+  for (int k = 0; k < 3; k++) {
     float center_i = ci->loc[k];
     float center_j = cj->loc[k];
-    dx[k] = fabs(center_i - center_j);
+    if (fabsf(center_i - center_j) > min_dist) return 0;
   }
 
-  return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist);
+  return 1;
 }
 
 /**
  * @brief Compute the interactions between all particles in a cell leaf
  *        and the center of mass of all the cells in a part of the tree
-* described by ci and cj
+ * described by ci and cj
  *
  * @param ci The #cell containing the particle
  * @param cj The #cell containing the center of mass.
  */
 static inline void iact_pair_pc(struct cell *ci, struct cell *cj,
-                                struct cell *leaf) {
+                                struct cell *leaf, struct part_local *parts,
+                                int count, double *loc) {
 
   struct cell *cp, *cps;
 
@@ -968,11 +966,11 @@ static inline void iact_pair_pc(struct cell *ci, struct cell *cj,
 
         /* We only recurse if the children are split */
         if (cp->split && cps->split) {
-          iact_pair_pc(cp, cps, leaf);
+          iact_pair_pc(cp, cps, leaf, parts, count, loc);
         }
 
       } else {
-        make_interact_pc(leaf, cps);
+        make_interact_pc(parts, count, loc, cps);
       }
     }
   } else {
@@ -981,7 +979,7 @@ static inline void iact_pair_pc(struct cell *ci, struct cell *cj,
      * multipoles */
     for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) {
 
-      make_interact_pc(leaf, cps);
+      make_interact_pc(parts, count, loc, cps);
     }
   }
 }
@@ -993,9 +991,11 @@ static inline void iact_pair_pc(struct cell *ci, struct cell *cj,
  * @param c The #cell containing the monopoles
  * @param leaf The #cell containing the particles
  */
-static inline void iact_self_pc(struct cell *c, struct cell *leaf) {
+static inline void iact_self_pc(struct cell *c, struct cell *leaf,
+                                struct part_local *parts) {
 
   struct cell *cp, *cps;
+  int collect_part_data = 0;
 
 #ifdef SANITY_CHECKS
 
@@ -1006,6 +1006,23 @@ static inline void iact_self_pc(struct cell *c, struct cell *leaf) {
 
 #endif
 
+  /* Get local copies of the particle data. */
+  if (parts == NULL) {
+    int count = leaf->count;
+    if ((parts =
+             (struct part_local *)malloc(sizeof(struct part_local) * count)) ==
+        NULL)
+      error("Failed to allocate local parts.");
+    for (int k = 0; k < count; k++) {
+      for (int j = 0; j < 3; j++) {
+        parts[k].x[j] = leaf->parts[k].x[j] - leaf->loc[j];
+        parts[k].a[j] = 0.0f;
+      }
+      parts[k].mass = leaf->parts[k].mass;
+    }
+    collect_part_data = 1;
+  }
+
   /* Find in which subcell of c the leaf is */
   for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) {
 
@@ -1016,7 +1033,7 @@ static inline void iact_self_pc(struct cell *c, struct cell *leaf) {
   if (cp->split) {
 
     /* Recurse if the cell can be split */
-    iact_self_pc(cp, leaf);
+    iact_self_pc(cp, leaf, parts);
 
     /* Now, interact with every other subcell */
     for (cps = c->firstchild; cps != c->sibling; cps = cps->sibling) {
@@ -1024,8 +1041,17 @@ static inline void iact_self_pc(struct cell *c, struct cell *leaf) {
       /* Since cp and cps will be direct neighbours it is only worth recursing
        */
       /* if the cells can both be split */
-      if (cp != cps && cps->split) iact_pair_pc(cp, cps, leaf);
+      if (cp != cps && cps->split)
+        iact_pair_pc(cp, cps, leaf, parts, leaf->count, leaf->loc);
+    }
+  }
+
+  /* Clean up local parts? */
+  if (collect_part_data) {
+    for (int k = 0; k < leaf->count; k++) {
+      for (int j = 0; j < 3; j++) leaf->parts[k].a[j] += parts[k].a[j];
     }
+    free(parts);
   }
 }
 
@@ -1038,7 +1064,7 @@ void init_multipole_walk(struct cell *root, struct cell *leaf) {
   __builtin_prefetch(leaf->parts, 1, 3);
 
   /* Start the recursion */
-  iact_self_pc(root, leaf);
+  iact_self_pc(root, leaf, NULL);
 }
 
 /**
@@ -1125,12 +1151,6 @@ static inline void iact_pair_direct_unsorted(struct cell *ci, struct cell *cj) {
 static inline void iact_pair_direct_sorted_multipole(struct cell *ci,
 						     struct cell *cj) {
 
-  struct part_local {
-    float x[3];
-    float a[3];
-    float mass, d;
-  };
-
   int i, j, k;
   int count_i, count_j;
   struct part_local *parts_i, *parts_j;
@@ -1163,11 +1183,12 @@ static inline void iact_pair_direct_sorted_multipole(struct cell *ci,
            (struct part_local *)alloca(sizeof(struct part_local) * count_j)) ==
           NULL)
     error("Failed to allocate local part arrays.");
+  float midpoint[3] = { cj->loc[0] , cj->loc[1], cj->loc[2] };
   for (i = 0; i < count_i; i++) {
     int pid = ind_i[i].ind;
     parts_i[i].d = ind_i[i].d;
     for (k = 0; k < 3; k++) {
-      parts_i[i].x[k] = ci->parts[pid].x[k] - cj->loc[k];
+      parts_i[i].x[k] = ci->parts[pid].x[k] - midpoint[k];
       parts_i[i].a[k] = 0.0f;
     }
     parts_i[i].mass = ci->parts[pid].mass;
@@ -1176,7 +1197,7 @@ static inline void iact_pair_direct_sorted_multipole(struct cell *ci,
     int pjd = ind_j[j].ind;
     parts_j[j].d = ind_j[j].d;
     for (k = 0; k < 3; k++) {
-      parts_j[j].x[k] = cj->parts[pjd].x[k] - cj->loc[k];
+      parts_j[j].x[k] = cj->parts[pjd].x[k] - midpoint[k];
       parts_j[j].a[k] = 0.0f;
     }
     parts_j[j].mass = cj->parts[pjd].mass;
@@ -1204,7 +1225,7 @@ static inline void iact_pair_direct_sorted_multipole(struct cell *ci,
   for (i = count_i - 1; i >= 0; i--) {
 
     /* Get a local copy of the distance along the axis. */
-    float di = parts_i[i].d;
+    float di = parts_i[i].d + d_max;
 
     /* Init the ith particle's data. */
     for (k = 0; k < 3; k++) {
@@ -1214,7 +1235,7 @@ static inline void iact_pair_direct_sorted_multipole(struct cell *ci,
     mi = parts_i[i].mass;
 
     /* Loop over every following particle within d_max along the axis. */
-    for (j = 0; j < count_j && (parts_j[j].d - di) < d_max; j++) {
+    for (j = 0; j < count_j && parts_j[j].d < di; j++) {
 
       /* Compute the pairwise distance. */
       for (r2 = 0.0, k = 0; k < 3; k++) {
@@ -1292,10 +1313,10 @@ static inline void iact_pair_direct_sorted_multipole(struct cell *ci,
   for (j = 0; j < count_j; j++) {
 
     /* Get the sorted index. */
-    float dj = parts_j[j].d;
+    float dj = parts_j[j].d - d_max;
 
     /* Fill the COM with any new particles. */
-    for (i = last_i; i < count_i && (dj - parts_i[i].d) > d_max; i++) {
+    for (i = last_i; i < count_i && dj > parts_i[i].d; i++) {
       multipole_add(&multi, parts_i[i].x, parts_i[i].mass);
     }
 
@@ -1382,9 +1403,8 @@ void iact_pair(struct cell *ci, struct cell *cj) {
  */
 void iact_self_direct(struct cell *c) {
   int i, j, k, count = c->count;
-  double xi[3] = {0.0, 0.0, 0.0};
+  float xi[3] = {0.0, 0.0, 0.0};
   float ai[3] = {0.0, 0.0, 0.0}, mi, mj, dx[3] = {0.0, 0.0, 0.0}, r2, ir, w;
-  struct part *parts = c->parts;
   struct cell *cp, *cps;
 
 #ifdef SANITY_CHECKS
@@ -1406,6 +1426,20 @@ void iact_self_direct(struct cell *c) {
     /* Otherwise, compute the interactions directly. */
   } else {
 
+    /* Init the local copies of the particles. */
+    double loc[3];
+    struct part_local parts[count];
+    loc[0] = c->loc[0];
+    loc[1] = c->loc[1];
+    loc[2] = c->loc[2];
+    for (k = 0; k < count; k++) {
+      for (j = 0; j < 3; j++) {
+        parts[k].x[j] = c->parts[k].x[j] - loc[j];
+        parts[k].a[j] = 0.0f;
+      }
+      parts[k].mass = c->parts[k].mass;
+    }
+
     /* Loop over all particles... */
     for (i = 0; i < count; i++) {
 
@@ -1452,9 +1486,14 @@ void iact_self_direct(struct cell *c) {
 
     } /* loop over all particles. */
 
+    /* Copy the local particle data back. */
+    for (k = 0; k < count; k++) {
+      for (j = 0; j < 3; j++) c->parts[k].a[j] = parts[k].a[j];
+    }
   } /* otherwise, compute interactions directly. */
 }
 
+
 /**
  * @brief Create the tasks for the cell pair/self.
  *
@@ -1992,14 +2031,14 @@ void test_bh(int N, int nr_threads, int runs, char *fileName) {
       parts[i].a[2] = 0.0;
     }
 
-    struct cell *finger = root;
+    /* struct cell *finger = root;
     while (finger != NULL) {
       finger->sorted = 0;
       if (finger->split)
         finger = finger->firstchild;
       else
         finger = finger->sibling;
-    }
+    } */
 
     /* Execute the tasks. */
     tic = getticks();
-- 
GitLab