From 02b12046982a73bf8c4ce4f80219a949ec8f5c6f Mon Sep 17 00:00:00 2001
From: James Willis <james.s.willis@durham.ac.uk>
Date: Fri, 18 Aug 2017 12:58:06 +0100
Subject: [PATCH] Use intrinsics in arithmetic operations to support AVX-512.

---
 src/hydro/Gadget2/hydro_iact.h | 197 +++++++++++++++++----------------
 1 file changed, 104 insertions(+), 93 deletions(-)

diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h
index 81b6381f27..4492334e72 100644
--- a/src/hydro/Gadget2/hydro_iact.h
+++ b/src/hydro/Gadget2/hydro_iact.h
@@ -1171,7 +1171,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
 #ifdef WITH_VECTORIZATION
 __attribute__((always_inline)) INLINE static void
 runner_iact_nonsym_1_vec_force(
-    float *R2, float *Dx, float *Dy, float *Dz, vector vix, vector viy,
+    vector *r2, vector *dx, vector *dy, vector *dz, vector vix, vector viy,
     vector viz, vector pirho, vector grad_hi, vector piPOrho2,
     vector balsara_i, vector ci, float *Vjx, float *Vjy, float *Vjz,
     float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj,
@@ -1181,8 +1181,7 @@ runner_iact_nonsym_1_vec_force(
 
 #ifdef WITH_VECTORIZATION
 
-  vector r, r2, ri;
-  vector dx, dy, dz;
+  vector r, ri;
   vector vjx, vjy, vjz;
   vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj, hj_inv;
   vector xi, xj;
@@ -1195,11 +1194,6 @@ runner_iact_nonsym_1_vec_force(
   vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt;
 
   /* Fill vectors. */
-  r2.v = vec_load(R2);
-  dx.v = vec_load(Dx);
-  dy.v = vec_load(Dy);
-  dz.v = vec_load(Dz);
-
   vjx.v = vec_load(Vjx);
   vjy.v = vec_load(Vjy);
   vjz.v = vec_load(Vjz);
@@ -1218,8 +1212,8 @@ runner_iact_nonsym_1_vec_force(
   balsara.v = balsara_i.v + balsara_j.v;
 
   /* Get the radius and inverse radius. */
-  ri = vec_reciprocal_sqrt(r2);
-  r.v = r2.v * ri.v;
+  ri = vec_reciprocal_sqrt(*r2);
+  r.v = r2->v * ri.v;
 
   /* Get the kernel for hi. */
   hid_inv = pow_dimension_plus_one_vec(hi_inv);
@@ -1237,8 +1231,8 @@ runner_iact_nonsym_1_vec_force(
   wj_dr.v = hjd_inv.v * wj_dx.v;
 
   /* Compute dv dot r. */
-  dvdr.v = ((vix.v - vjx.v) * dx.v) + ((viy.v - vjy.v) * dy.v) +
-           ((viz.v - vjz.v) * dz.v);
+  dvdr.v = ((vix.v - vjx.v) * dx->v) + ((viy.v - vjy.v) * dy->v) +
+           ((viz.v - vjz.v) * dz->v);
 
   /* Compute the relative velocity. (This is 0 if the particles move away from
    * each other and negative otherwise) */
@@ -1263,9 +1257,9 @@ runner_iact_nonsym_1_vec_force(
   acc.v = visc_term.v + sph_term.v;
 
   /* Use the force, Luke! */
-  piax.v = mj.v * dx.v * acc.v;
-  piay.v = mj.v * dy.v * acc.v;
-  piaz.v = mj.v * dz.v * acc.v;
+  piax.v = mj.v * dx->v * acc.v;
+  piay.v = mj.v * dy->v * acc.v;
+  piaz.v = mj.v * dz->v * acc.v;
 
   /* Get the time derivative for h. */
   pih_dt.v = mj.v * dvdr.v * ri.v / pjrho.v * wi_dr.v;
@@ -1304,10 +1298,10 @@ runner_iact_nonsym_2_vec_force(
 #ifdef WITH_VECTORIZATION
 
   vector r, r2, ri;
-  vector dx, dy, dz;
+  vector dx, dy, dz, dvx, dvy, dvz;
   vector vjx, vjy, vjz;
   vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj, hj_inv;
-  vector xi, xj;
+  vector ui, uj;
   vector hid_inv, hjd_inv;
   vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
   vector piax, piay, piaz;
@@ -1317,10 +1311,10 @@ runner_iact_nonsym_2_vec_force(
   vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt;
 
   vector r_2, r2_2, ri_2;
-  vector dx_2, dy_2, dz_2;
+  vector dx_2, dy_2, dz_2, dvx_2, dvy_2, dvz_2;
   vector vjx_2, vjy_2, vjz_2;
   vector pjrho_2, grad_hj_2, pjPOrho2_2, balsara_j_2, cj_2, mj_2, hj_inv_2;
-  vector xi_2, xj_2;
+  vector ui_2, uj_2;
   vector hjd_inv_2;
   vector wi_dx_2, wj_dx_2, wi_dr_2, wj_dr_2, dvdr_2;
   vector piax_2, piay_2, piaz_2;
@@ -1330,128 +1324,145 @@ runner_iact_nonsym_2_vec_force(
   vector rho_ij_2, visc_2, visc_term_2, sph_term_2, acc_2, entropy_dt_2;
 
   /* Fill vectors. */
-  r2.v = vec_load(R2);
-  dx.v = vec_load(Dx);
-  dy.v = vec_load(Dy);
-  dz.v = vec_load(Dz);
-
+  mj.v = vec_load(Mj);
+  mj_2.v = vec_load(&Mj[VEC_SIZE]);
   vjx.v = vec_load(Vjx);
+  vjx_2.v = vec_load(&Vjx[VEC_SIZE]);
   vjy.v = vec_load(Vjy);
+  vjy_2.v = vec_load(&Vjy[VEC_SIZE]);
   vjz.v = vec_load(Vjz);
-  mj.v = vec_load(Mj);
-
-  pjrho.v = vec_load(Pjrho);
-  grad_hj.v = vec_load(Grad_hj);
-  pjPOrho2.v = vec_load(PjPOrho2);
-  balsara_j.v = vec_load(Balsara_j);
-  cj.v = vec_load(Cj);
-  hj_inv.v = vec_load(Hj_inv);
-
-  fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */
-
-  r2_2.v = vec_load(&R2[VEC_SIZE]);
+  vjz_2.v = vec_load(&Vjz[VEC_SIZE]);
+  dx.v = vec_load(Dx);
   dx_2.v = vec_load(&Dx[VEC_SIZE]);
+  dy.v = vec_load(Dy);
   dy_2.v = vec_load(&Dy[VEC_SIZE]);
+  dz.v = vec_load(Dz);
   dz_2.v = vec_load(&Dz[VEC_SIZE]);
 
-  vjx_2.v = vec_load(&Vjx[VEC_SIZE]);
-  vjy_2.v = vec_load(&Vjy[VEC_SIZE]);
-  vjz_2.v = vec_load(&Vjz[VEC_SIZE]);
-  mj_2.v = vec_load(&Mj[VEC_SIZE]);
+  /* Get the radius and inverse radius. */
+  r2.v = vec_load(R2);
+  r2_2.v = vec_load(&R2[VEC_SIZE]);
+  ri = vec_reciprocal_sqrt(r2);
+  ri_2 = vec_reciprocal_sqrt(r2_2);
+  r.v = vec_mul(r2.v, ri.v);
+  r_2.v = vec_mul(r2_2.v, ri_2.v);
 
+  /* Get remaining properties. */
+  pjrho.v = vec_load(Pjrho);
   pjrho_2.v = vec_load(&Pjrho[VEC_SIZE]);
+  grad_hj.v = vec_load(Grad_hj);
   grad_hj_2.v = vec_load(&Grad_hj[VEC_SIZE]);
+  pjPOrho2.v = vec_load(PjPOrho2);
   pjPOrho2_2.v = vec_load(&PjPOrho2[VEC_SIZE]);
+  balsara_j.v = vec_load(Balsara_j);
   balsara_j_2.v = vec_load(&Balsara_j[VEC_SIZE]);
+  cj.v = vec_load(Cj);
   cj_2.v = vec_load(&Cj[VEC_SIZE]);
+  hj_inv.v = vec_load(Hj_inv);
   hj_inv_2.v = vec_load(&Hj_inv[VEC_SIZE]);
 
-  /* Load stuff. */
-  balsara.v = balsara_i.v + balsara_j.v;
-  balsara_2.v = balsara_i.v + balsara_j_2.v;
+  fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */
 
-  /* Get the radius and inverse radius. */
-  ri = vec_reciprocal_sqrt(r2);
-  ri_2 = vec_reciprocal_sqrt(r2_2);
-  r.v = r2.v * ri.v;
-  r_2.v = r2_2.v * ri_2.v;
+  /* Find the balsara switch. */
+  balsara.v = vec_add(balsara_i.v, balsara_j.v);
+  balsara_2.v = vec_add(balsara_i.v, balsara_j_2.v);
 
   /* Get the kernel for hi. */
   hid_inv = pow_dimension_plus_one_vec(hi_inv);
-  xi.v = r.v * hi_inv.v;
-  xi_2.v = r_2.v * hi_inv.v;
-  kernel_eval_dWdx_force_vec(&xi, &wi_dx);
-  kernel_eval_dWdx_force_vec(&xi_2, &wi_dx_2);
-  wi_dr.v = hid_inv.v * wi_dx.v;
-  wi_dr_2.v = hid_inv.v * wi_dx_2.v;
+  ui.v = vec_mul(r.v, hi_inv.v);
+  ui_2.v = vec_mul(r_2.v, hi_inv.v);
+  kernel_eval_dWdx_force_vec(&ui, &wi_dx);
+  kernel_eval_dWdx_force_vec(&ui_2, &wi_dx_2);
+  wi_dr.v = vec_mul(hid_inv.v, wi_dx.v);
+  wi_dr_2.v = vec_mul(hid_inv.v, wi_dx_2.v);
 
   /* Get the kernel for hj. */
   hjd_inv = pow_dimension_plus_one_vec(hj_inv);
   hjd_inv_2 = pow_dimension_plus_one_vec(hj_inv_2);
-  xj.v = r.v * hj_inv.v;
-  xj_2.v = r_2.v * hj_inv_2.v;
+  uj.v = vec_mul(r.v, hj_inv.v);
+  uj_2.v = vec_mul(r_2.v, hj_inv_2.v);
 
   /* Calculate the kernel for two particles. */
-  kernel_eval_dWdx_force_vec(&xj, &wj_dx);
-  kernel_eval_dWdx_force_vec(&xj_2, &wj_dx_2);
+  kernel_eval_dWdx_force_vec(&uj, &wj_dx);
+  kernel_eval_dWdx_force_vec(&uj_2, &wj_dx_2);
 
-  wj_dr.v = hjd_inv.v * wj_dx.v;
-  wj_dr_2.v = hjd_inv_2.v * wj_dx_2.v;
+  wj_dr.v = vec_mul(hjd_inv.v, wj_dx.v);
+  wj_dr_2.v = vec_mul(hjd_inv_2.v, wj_dx_2.v);
 
+  /* Compute dv. */
+  dvx.v = vec_sub(vix.v, vjx.v);
+  dvx_2.v = vec_sub(vix.v, vjx_2.v);
+  dvy.v = vec_sub(viy.v, vjy.v);
+  dvy_2.v = vec_sub(viy.v, vjy_2.v);
+  dvz.v = vec_sub(viz.v, vjz.v);
+  dvz_2.v = vec_sub(viz.v, vjz_2.v);
+  
   /* Compute dv dot r. */
-  dvdr.v = ((vix.v - vjx.v) * dx.v) + ((viy.v - vjy.v) * dy.v) +
-           ((viz.v - vjz.v) * dz.v);
-  dvdr_2.v = ((vix.v - vjx_2.v) * dx_2.v) + ((viy.v - vjy_2.v) * dy_2.v) +
-             ((viz.v - vjz_2.v) * dz_2.v);
+  dvdr.v = vec_fma(dvx.v, dx.v, vec_fma(dvy.v, dy.v, vec_mul(dvz.v, dz.v)));
+  dvdr_2.v =
+      vec_fma(dvx_2.v, dx_2.v, vec_fma(dvy_2.v, dy_2.v, vec_mul(dvz_2.v, dz_2.v)));
 
   /* Compute the relative velocity. (This is 0 if the particles move away from
    * each other and negative otherwise) */
   omega_ij.v = vec_fmin(dvdr.v, vec_setzero());
   omega_ij_2.v = vec_fmin(dvdr_2.v, vec_setzero());
-  mu_ij.v = fac_mu.v * ri.v * omega_ij.v;       /* This is 0 or negative */
-  mu_ij_2.v = fac_mu.v * ri_2.v * omega_ij_2.v; /* This is 0 or negative */
+  mu_ij.v = vec_mul(fac_mu.v, vec_mul(ri.v, omega_ij.v));       /* This is 0 or negative */
+  mu_ij_2.v = vec_mul(fac_mu.v, vec_mul(ri_2.v, omega_ij_2.v)); /* This is 0 or negative */
 
   /* Compute signal velocity */
-  v_sig.v = ci.v + cj.v - vec_set1(3.0f) * mu_ij.v;
-  v_sig_2.v = ci.v + cj_2.v - vec_set1(3.0f) * mu_ij_2.v;
+  v_sig.v = vec_fnma(vec_set1(3.f), mu_ij.v, vec_add(ci.v, cj.v));
+  v_sig_2.v = vec_fnma(vec_set1(3.f), mu_ij_2.v, vec_add(ci.v, cj_2.v));
 
   /* Now construct the full viscosity term */
-  rho_ij.v = vec_set1(0.5f) * (pirho.v + pjrho.v);
-  rho_ij_2.v = vec_set1(0.5f) * (pirho.v + pjrho_2.v);
-  visc.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig.v *
-           mu_ij.v * balsara.v / rho_ij.v;
-  visc_2.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig_2.v *
-             mu_ij_2.v * balsara_2.v / rho_ij_2.v;
+  rho_ij.v = vec_mul(vec_set1(0.5f), vec_add(pirho.v, pjrho.v));
+  rho_ij_2.v = vec_mul(vec_set1(0.5f), vec_add(pirho.v, pjrho_2.v));
+
+  vector const_viscosity_alpha_fac;
+  const_viscosity_alpha_fac.v = vec_set1(-0.25f * const_viscosity_alpha);
+  
+  visc.v = vec_div(vec_mul(const_viscosity_alpha_fac.v, vec_mul(v_sig.v, vec_mul(mu_ij.v, balsara.v))), rho_ij.v);
+  visc_2.v = vec_div(vec_mul(const_viscosity_alpha_fac.v, vec_mul(v_sig_2.v, vec_mul(mu_ij_2.v, balsara_2.v))), rho_ij_2.v);
 
   /* Now, convolve with the kernel */
-  visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v;
-  visc_term_2.v = vec_set1(0.5f) * visc_2.v * (wi_dr_2.v + wj_dr_2.v) * ri_2.v;
+  visc_term.v = vec_mul(vec_set1(0.5f), vec_mul(visc.v, vec_mul(vec_add(wi_dr.v, wj_dr.v), ri.v)));
+  visc_term_2.v = vec_mul(vec_set1(0.5f), vec_mul(visc_2.v, vec_mul(vec_add(wi_dr_2.v, wj_dr_2.v), ri_2.v)));
+  
+  vector grad_hi_mul_piPOrho2;
+  grad_hi_mul_piPOrho2.v = vec_mul(grad_hi.v, piPOrho2.v);
+
   sph_term.v =
-      (grad_hi.v * piPOrho2.v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) *
-      ri.v;
-  sph_term_2.v = (grad_hi.v * piPOrho2.v * wi_dr_2.v +
-                  grad_hj_2.v * pjPOrho2_2.v * wj_dr_2.v) *
-                 ri_2.v;
+      vec_mul(vec_fma(grad_hi_mul_piPOrho2.v, wi_dr.v, vec_mul(grad_hj.v, vec_mul(pjPOrho2.v, wj_dr.v))), ri.v);
+  sph_term_2.v = vec_mul(vec_fma(grad_hi_mul_piPOrho2.v, wi_dr_2.v, vec_mul(grad_hj_2.v, vec_mul(pjPOrho2_2.v, wj_dr_2.v))), ri_2.v);
 
   /* Eventually get the acceleration */
-  acc.v = visc_term.v + sph_term.v;
-  acc_2.v = visc_term_2.v + sph_term_2.v;
+  acc.v = vec_add(visc_term.v, sph_term.v);
+  acc_2.v = vec_add(visc_term_2.v, sph_term_2.v);
 
   /* Use the force, Luke! */
-  piax.v = mj.v * dx.v * acc.v;
-  piax_2.v = mj_2.v * dx_2.v * acc_2.v;
-  piay.v = mj.v * dy.v * acc.v;
-  piay_2.v = mj_2.v * dy_2.v * acc_2.v;
-  piaz.v = mj.v * dz.v * acc.v;
-  piaz_2.v = mj_2.v * dz_2.v * acc_2.v;
+  piax.v = vec_mul(mj.v, vec_mul(dx.v, acc.v));
+  piax_2.v = vec_mul(mj_2.v, vec_mul(dx_2.v, acc_2.v));
+  piay.v = vec_mul(mj.v, vec_mul(dy.v, acc.v));
+  piay_2.v = vec_mul(mj_2.v, vec_mul(dy_2.v, acc_2.v));
+  piaz.v = vec_mul(mj.v, vec_mul(dz.v, acc.v));
+  piaz_2.v = vec_mul(mj_2.v, vec_mul(dz_2.v, acc_2.v));
+
+//  for(int i=0; i<VEC_SIZE; i++) {
+//    message("mj: %f",mj.f[i]);
+//    message("dvdr: %f",dvdr.f[i]);
+//    message("ri: %f",ri.f[i]);
+//    message("pjrho: %f",pjrho.f[i]);
+//    message("wi_dr: %f",wi_dr.f[i]);
+//    message("wi_dx: %f",wi_dx.f[i]);
+//    message("hid_inv: %f",hid_inv.f[i]);
+//  }
 
   /* Get the time derivative for h. */
-  pih_dt.v = mj.v * dvdr.v * ri.v / pjrho.v * wi_dr.v;
-  pih_dt_2.v = mj_2.v * dvdr_2.v * ri_2.v / pjrho_2.v * wi_dr_2.v;
+  pih_dt.v = vec_div(vec_mul(mj.v, vec_mul(dvdr.v, vec_mul(ri.v, wi_dr.v))), pjrho.v);
+  pih_dt_2.v = vec_div(vec_mul(mj_2.v, vec_mul(dvdr_2.v, vec_mul(ri_2.v, wi_dr_2.v))), pjrho_2.v);
 
   /* Change in entropy */
-  entropy_dt.v = mj.v * visc_term.v * dvdr.v;
-  entropy_dt_2.v = mj_2.v * visc_term_2.v * dvdr_2.v;
+  entropy_dt.v = vec_mul(mj.v, vec_mul(visc_term.v, dvdr.v));
+  entropy_dt_2.v = vec_mul(mj_2.v, vec_mul(visc_term_2.v, dvdr_2.v));
 
   /* Store the forces back on the particles. */
   if (mask_cond) {
-- 
GitLab