hydro_iact.h 53.8 KB
Newer Older
1
2
/*******************************************************************************
 * This file is part of SWIFT.
3
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
4
 *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
5
 *
6
7
8
9
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
10
 *
11
12
13
14
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
15
 *
16
17
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18
 *
19
 ******************************************************************************/
20
21
#ifndef SWIFT_GADGET2_HYDRO_IACT_H
#define SWIFT_GADGET2_HYDRO_IACT_H
22
23

/**
24
 * @file Gadget2/hydro_iact.h
25
26
 * @brief SPH interaction functions following the Gadget-2 version of SPH.
 *
27
 * The interactions computed here are the ones presented in the Gadget-2 paper
28
29
 * Springel, V., MNRAS, Volume 364, Issue 4, pp. 1105-1134.
 * We use the same numerical coefficients as the Gadget-2 code. When used with
30
31
32
 * the Spline-3 kernel, the results should be equivalent to the ones obtained
 * with Gadget-2 up to the rounding errors and interactions missed by the
 * Gadget-2 tree-code neighbours search.
33
34
 */

35
#include "cache.h"
James Willis's avatar
James Willis committed
36
#include "minmax.h"
37

38
39
40
/**
 * @brief Density loop
 */
41
42
43
__attribute__((always_inline)) INLINE static void runner_iact_density(
    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {

44
45
  float wi, wi_dx;
  float wj, wj_dx;
46
  float dv[3], curlvr[3];
47

48
  /* Get the masses. */
49
  const float mi = pi->mass;
50
51
52
53
54
55
56
57
58
59
60
61
62
  const float mj = pj->mass;

  /* Get r and r inverse. */
  const float r = sqrtf(r2);
  const float r_inv = 1.0f / r;

  /* Compute the kernel function for pi */
  const float hi_inv = 1.f / hi;
  const float ui = r * hi_inv;
  kernel_deval(ui, &wi, &wi_dx);

  /* Compute contribution to the density */
  pi->rho += mj * wi;
63
  pi->density.rho_dh -= mj * (hydro_dimension * wi + ui * wi_dx);
64

65
66
67
68
69
70
71
72
73
74
75
  /* Compute contribution to the number of neighbours */
  pi->density.wcount += wi;
  pi->density.wcount_dh -= ui * wi_dx;

  /* Compute the kernel function for pj */
  const float hj_inv = 1.f / hj;
  const float uj = r * hj_inv;
  kernel_deval(uj, &wj, &wj_dx);

  /* Compute contribution to the density */
  pj->rho += mi * wj;
76
  pj->density.rho_dh -= mi * (hydro_dimension * wj + uj * wj_dx);
77

78
79
80
  /* Compute contribution to the number of neighbours */
  pj->density.wcount += wj;
  pj->density.wcount_dh -= uj * wj_dx;
81

82
83
  const float faci = mj * wi_dx * r_inv;
  const float facj = mi * wj_dx * r_inv;
84

85
86
87
88
  /* Compute dv dot r */
  dv[0] = pi->v[0] - pj->v[0];
  dv[1] = pi->v[1] - pj->v[1];
  dv[2] = pi->v[2] - pj->v[2];
89
90
  const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];

91
92
  pi->density.div_v -= faci * dvdr;
  pj->density.div_v -= facj * dvdr;
93
94
95
96
97
98

  /* Compute dv cross r */
  curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
  curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
  curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];

99
100
101
  pi->density.rot_v[0] += faci * curlvr[0];
  pi->density.rot_v[1] += faci * curlvr[1];
  pi->density.rot_v[2] += faci * curlvr[2];
102

103
104
105
  pj->density.rot_v[0] += facj * curlvr[0];
  pj->density.rot_v[1] += facj * curlvr[1];
  pj->density.rot_v[2] += facj * curlvr[2];
106
107
}

108
109
110
111
112
113
/**
 * @brief Density loop (Vectorized version)
 */
__attribute__((always_inline)) INLINE static void runner_iact_vec_density(
    float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
    struct part **pj) {
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138

#ifdef WITH_VECTORIZATION

  vector r, ri, r2, xi, xj, hi, hj, hi_inv, hj_inv, wi, wj, wi_dx, wj_dx;
  vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh;
  vector mi, mj;
  vector dx[3], dv[3];
  vector vi[3], vj[3];
  vector dvdr, div_vi, div_vj;
  vector curlvr[3], curl_vi[3], curl_vj[3];
  int k, j;

#if VEC_SIZE == 8
  /* Get the masses. */
  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass,
                 pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass);
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
  /* Get each velocity component. */
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
                      pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
                      pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
  }
Matthieu Schaller's avatar
Matthieu Schaller committed
139
140
  /* Get each component of particle separation.
   * (Dx={dx1,dy1,dz1,dx2,dy2,dz2,...,dxn,dyn,dzn})*/
141
142
143
144
145
146
147
148
149
150
151
152
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
#elif VEC_SIZE == 4
  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass);
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
  }
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
153
154
#else
  error("Unknown vector size.")
155
156
157
158
#endif

  /* Get the radius and inverse radius. */
  r2.v = vec_load(R2);
159
  ri = vec_reciprocal_sqrt(r2);
160
161
162
  r.v = r2.v * ri.v;

  hi.v = vec_load(Hi);
163
  hi_inv = vec_reciprocal(hi);
164
165
166
  xi.v = r.v * hi_inv.v;

  hj.v = vec_load(Hj);
167
  hj_inv = vec_reciprocal(hj);
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
  xj.v = r.v * hj_inv.v;

  /* Compute the kernel function. */
  kernel_deval_vec(&xi, &wi, &wi_dx);
  kernel_deval_vec(&xj, &wj, &wj_dx);

  /* Compute dv. */
  dv[0].v = vi[0].v - vj[0].v;
  dv[1].v = vi[1].v - vj[1].v;
  dv[2].v = vi[2].v - vj[2].v;

  /* Compute dv dot r */
  dvdr.v = (dv[0].v * dx[0].v) + (dv[1].v * dx[1].v) + (dv[2].v * dx[2].v);
  dvdr.v = dvdr.v * ri.v;

  /* Compute dv cross r */
  curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v;
  curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v;
  curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v;
  for (k = 0; k < 3; k++) curlvr[k].v *= ri.v;

  /* Compute density of pi. */
  rhoi.v = mj.v * wi.v;
191
  rhoi_dh.v = mj.v * (vec_set1(hydro_dimension) * wi.v + xi.v * wi_dx.v);
192
193
194
195
196
197
198
  wcounti.v = wi.v;
  wcounti_dh.v = xi.v * wi_dx.v;
  div_vi.v = mj.v * dvdr.v * wi_dx.v;
  for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v;

  /* Compute density of pj. */
  rhoj.v = mi.v * wj.v;
199
  rhoj_dh.v = mi.v * (vec_set1(hydro_dimension) * wj.v + xj.v * wj_dx.v);
200
201
202
203
204
205
206
207
  wcountj.v = wj.v;
  wcountj_dh.v = xj.v * wj_dx.v;
  div_vj.v = mi.v * dvdr.v * wj_dx.v;
  for (k = 0; k < 3; k++) curl_vj[k].v = mi.v * curlvr[k].v * wj_dx.v;

  /* Update particles. */
  for (k = 0; k < VEC_SIZE; k++) {
    pi[k]->rho += rhoi.f[k];
208
    pi[k]->density.rho_dh -= rhoi_dh.f[k];
209
210
    pi[k]->density.wcount += wcounti.f[k];
    pi[k]->density.wcount_dh -= wcounti_dh.f[k];
211
    pi[k]->density.div_v -= div_vi.f[k];
212
213
    for (j = 0; j < 3; j++) pi[k]->density.rot_v[j] += curl_vi[j].f[k];
    pj[k]->rho += rhoj.f[k];
214
    pj[k]->density.rho_dh -= rhoj_dh.f[k];
215
216
    pj[k]->density.wcount += wcountj.f[k];
    pj[k]->density.wcount_dh -= wcountj_dh.f[k];
217
    pj[k]->density.div_v -= div_vj.f[k];
218
219
220
221
222
    for (j = 0; j < 3; j++) pj[k]->density.rot_v[j] += curl_vj[j].f[k];
  }

#else

Matthieu Schaller's avatar
Matthieu Schaller committed
223
224
  error(
      "The Gadget2 serial version of runner_iact_density was called when the "
225
      "vectorised version should have been used.");
226
227

#endif
228
229
}

230
231
232
/**
 * @brief Density loop (non-symmetric version)
 */
233
234
235
236
237
238
239
__attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {

  float wi, wi_dx;
  float dv[3], curlvr[3];

  /* Get the masses. */
240
  const float mj = pj->mass;
241
242

  /* Get r and r inverse. */
243
244
  const float r = sqrtf(r2);
  const float ri = 1.0f / r;
245

246
  /* Compute the kernel function */
247
248
249
  const float hi_inv = 1.0f / hi;
  const float ui = r * hi_inv;
  kernel_deval(ui, &wi, &wi_dx);
250
251
252

  /* Compute contribution to the density */
  pi->rho += mj * wi;
253
  pi->density.rho_dh -= mj * (hydro_dimension * wi + ui * wi_dx);
254
255
256

  /* Compute contribution to the number of neighbours */
  pi->density.wcount += wi;
257
  pi->density.wcount_dh -= ui * wi_dx;
258

259
  const float fac = mj * wi_dx * ri;
260

261
262
263
264
265
  /* Compute dv dot r */
  dv[0] = pi->v[0] - pj->v[0];
  dv[1] = pi->v[1] - pj->v[1];
  dv[2] = pi->v[2] - pj->v[2];
  const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];
266
  pi->density.div_v -= fac * dvdr;
267

268
269
270
271
272
  /* Compute dv cross r */
  curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
  curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
  curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];

273
274
275
  pi->density.rot_v[0] += fac * curlvr[0];
  pi->density.rot_v[1] += fac * curlvr[1];
  pi->density.rot_v[2] += fac * curlvr[2];
276
277
}

James Willis's avatar
James Willis committed
278
279
280
281
282
283
284
285
286
287
__attribute__((always_inline)) INLINE static void
runner_iact_nonsym_density_jsw(
    const float r2, const float hig2, const float dx, const float dy,
    const float dz, const float h_inv, const float hj, const float vi_x,
    const float vi_y, const float vi_z, const float vj_x, const float vj_y,
    const float vj_z, const float mj, float *const restrict rho,
    float *const restrict rho_dh, float *const restrict wcount,
    float *const restrict wcount_dh, float *const restrict div_v,
    float *const restrict curl_vx, float *const restrict curl_vy,
    float *const restrict curl_vz) {
288
289
290
291
292
293
294
295
296
297
298
299
300
301

  if (r2 < hig2) {

    float wi, wi_dx;

    /* Get r and r inverse. */
    const float r = sqrtf(r2);
    const float ri = 1.0f / r;

    /* Compute kernel function */
    const float u = r * h_inv;
    kernel_deval(u, &wi, &wi_dx);

    const float fac = mj * wi_dx * ri;
James Willis's avatar
James Willis committed
302

303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
    /* Compute dv dot r */
    const float dv_x = vi_x - vj_x;
    const float dv_y = vi_y - vj_y;
    const float dv_z = vi_z - vj_z;
    const float dvdr = dv_x * dx + dv_y * dy + dv_z * dz;
    *div_v -= fac * dvdr;

    /* Compute dv cross r */
    const float curlvr_x = dv_y * dz - dv_z * dy;
    const float curlvr_y = dv_z * dx - dv_x * dz;
    const float curlvr_z = dv_x * dy - dv_y * dx;

    /* Compute contribution to the density */
    *rho += mj * wi;
    *rho_dh -= mj * (3.0f * wi + u * wi_dx);

    /* Compute contribution to the number of neighbours */
    *wcount += wi;
    *wcount_dh -= u * wi_dx;
    *curl_vx += fac * curlvr_x;
    *curl_vy += fac * curlvr_y;
    *curl_vz += fac * curlvr_z;
  }
}

328
329
330
331
332
333
/**
 * @brief Density loop (non-symmetric vectorized version)
 */
__attribute__((always_inline)) INLINE static void
runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
                               struct part **pi, struct part **pj) {
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356

#ifdef WITH_VECTORIZATION

  vector r, ri, r2, xi, hi, hi_inv, wi, wi_dx;
  vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi;
  vector mj;
  vector dx[3], dv[3];
  vector vi[3], vj[3];
  vector dvdr;
  vector curlvr[3], curl_vi[3];
  int k, j;

#if VEC_SIZE == 8
  /* Get the masses. */
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
  /* Get each velocity component. */
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
                      pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
                      pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
  }
Matthieu Schaller's avatar
Matthieu Schaller committed
357
358
  /* Get each component of particle separation.
   * (Dx={dx1,dy1,dz1,dx2,dy2,dz2,...,dxn,dyn,dzn})*/
359
360
361
362
363
364
365
366
367
368
369
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
#elif VEC_SIZE == 4
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
  }
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
370
371
#else
  error("Unknown vector size.")
372
373
374
375
#endif

  /* Get the radius and inverse radius. */
  r2.v = vec_load(R2);
376
  ri = vec_reciprocal_sqrt(r2);
377
378
379
  r.v = r2.v * ri.v;

  hi.v = vec_load(Hi);
380
  hi_inv = vec_reciprocal(hi);
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
  xi.v = r.v * hi_inv.v;

  kernel_deval_vec(&xi, &wi, &wi_dx);

  /* Compute dv. */
  dv[0].v = vi[0].v - vj[0].v;
  dv[1].v = vi[1].v - vj[1].v;
  dv[2].v = vi[2].v - vj[2].v;

  /* Compute dv dot r */
  dvdr.v = (dv[0].v * dx[0].v) + (dv[1].v * dx[1].v) + (dv[2].v * dx[2].v);
  dvdr.v = dvdr.v * ri.v;

  /* Compute dv cross r */
  curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v;
  curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v;
  curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v;
  for (k = 0; k < 3; k++) curlvr[k].v *= ri.v;

  /* Compute density of pi. */
  rhoi.v = mj.v * wi.v;
402
  rhoi_dh.v = mj.v * (vec_set1(hydro_dimension) * wi.v + xi.v * wi_dx.v);
403
404
405
406
407
408
409
410
  wcounti.v = wi.v;
  wcounti_dh.v = xi.v * wi_dx.v;
  div_vi.v = mj.v * dvdr.v * wi_dx.v;
  for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v;

  /* Update particles. */
  for (k = 0; k < VEC_SIZE; k++) {
    pi[k]->rho += rhoi.f[k];
411
    pi[k]->density.rho_dh -= rhoi_dh.f[k];
412
413
    pi[k]->density.wcount += wcounti.f[k];
    pi[k]->density.wcount_dh -= wcounti_dh.f[k];
414
    pi[k]->density.div_v -= div_vi.f[k];
415
416
417
418
419
    for (j = 0; j < 3; j++) pi[k]->density.rot_v[j] += curl_vi[j].f[k];
  }

#else

Matthieu Schaller's avatar
Matthieu Schaller committed
420
421
  error(
      "The Gadget2 serial version of runner_iact_nonsym_density was called "
422
      "when the vectorised version should have been used.");
423
424

#endif
425
426
}

427
#ifdef WITH_VECTORIZATION
428
429
430
431
432
433
434
435
436
437
438
439
440
441
__attribute__((always_inline)) INLINE static void
runner_iact_nonsym_intrinsic_vec_density(
    vector *r2, vector *dx, vector *dy, vector *dz, vector hi_inv, vector vix,
    vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj,
    vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
    vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
    vector mask, int knlMask) {

  vector r, ri, xi, wi, wi_dx;
  vector mj;
  vector dvx, dvy, dvz;
  vector vjx, vjy, vjz;
  vector dvdr;
  vector curlvrx, curlvry, curlvrz;
James Willis's avatar
James Willis committed
442

443
  /* Fill the vectors. */
444
445
446
447
  mj.v = vec_unaligned_load(Mj);
  vjx.v = vec_unaligned_load(Vjx);
  vjy.v = vec_unaligned_load(Vjy);
  vjz.v = vec_unaligned_load(Vjz);
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466

  /* Get the radius and inverse radius. */
  ri = vec_reciprocal_sqrt(*r2);
  r.v = vec_mul(r2->v, ri.v);

  xi.v = vec_mul(r.v, hi_inv.v);

  /* Calculate the kernel for two particles. */
  kernel_deval_1_vec(&xi, &wi, &wi_dx);

  /* Compute dv. */
  dvx.v = vec_sub(vix.v, vjx.v);
  dvy.v = vec_sub(viy.v, vjy.v);
  dvz.v = vec_sub(viz.v, vjz.v);

  /* Compute dv dot r */
  dvdr.v = vec_fma(dvx.v, dx->v, vec_fma(dvy.v, dy->v, vec_mul(dvz.v, dz->v)));
  dvdr.v = vec_mul(dvdr.v, ri.v);

467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
  /* Compute dv cross r */
  curlvrx.v =
      vec_fma(dvy.v, dz->v, vec_mul(vec_set1(-1.0f), vec_mul(dvz.v, dy->v)));
  curlvry.v =
      vec_fma(dvz.v, dx->v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz->v)));
  curlvrz.v =
      vec_fma(dvx.v, dy->v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx->v)));
  curlvrx.v = vec_mul(curlvrx.v, ri.v);
  curlvry.v = vec_mul(curlvry.v, ri.v);
  curlvrz.v = vec_mul(curlvrz.v, ri.v);

/* Mask updates to intermediate vector sums for particle pi. */
#ifdef HAVE_AVX512_F
  rhoSum->v =
      _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);

  rho_dhSum->v =
      _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v,
                         vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
                                               vec_mul(xi.v, wi_dx.v))));

  wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v);

  wcount_dhSum->v = _mm512_mask_sub_ps(wcount_dhSum->v, knlMask,
                                       wcount_dhSum->v, vec_mul(xi.v, wi_dx.v));

  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v,
                                   vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));

  curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
                                    vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
                                    curlvxSum->v);
James Willis's avatar
James Willis committed
499

500
501
502
  curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
                                    vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
                                    curlvySum->v);
James Willis's avatar
James Willis committed
503

504
505
506
  curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
                                    vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
                                    curlvzSum->v);
James Willis's avatar
James Willis committed
507
#else
508
509
  rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
  rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
James Willis's avatar
James Willis committed
510
511
                                                vec_mul(xi.v, wi_dx.v))),
                          mask.v);
512
513
514
515
516
517
518
519
520
521
522
  wcountSum->v += vec_and(wi.v, mask.v);
  wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v), mask.v);
  div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
  curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
  curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
  curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
#endif
}

__attribute__((always_inline)) INLINE static void
runner_iact_nonsym_intrinsic_vec_2_density(
James Willis's avatar
James Willis committed
523
524
525
526
527
    const struct cache *const cj_cache, const int *const indices, vector *r2,
    vector *dx, vector *dy, vector *dz, vector hi_inv, vector vix, vector viy,
    vector viz, vector *rhoSum, vector *rho_dhSum, vector *wcountSum,
    vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum,
    vector *curlvySum, vector *curlvzSum, vector mask, int knlMask) {
528

James Willis's avatar
James Willis committed
529
  // vector r, ri, r2, xi, wi, wi_dx;
530
531
  vector r, ri, xi, wi, wi_dx;
  vector mj;
James Willis's avatar
James Willis committed
532
  // vector dx, dy, dz, dvx, dvy, dvz;
533
534
535
536
  vector dvx, dvy, dvz;
  vector vjx, vjy, vjz;
  vector dvdr;
  vector curlvrx, curlvry, curlvrz;
James Willis's avatar
James Willis committed
537

538
  /* Fill the vectors. */
James Willis's avatar
James Willis committed
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
  mj.v = vec_set(cj_cache->m[indices[0]], cj_cache->m[indices[1]],
                 cj_cache->m[indices[2]], cj_cache->m[indices[3]],
                 cj_cache->m[indices[4]], cj_cache->m[indices[5]],
                 cj_cache->m[indices[6]], cj_cache->m[indices[7]]);
  vjx.v = vec_set(cj_cache->vx[indices[0]], cj_cache->vx[indices[1]],
                  cj_cache->vx[indices[2]], cj_cache->vx[indices[3]],
                  cj_cache->vx[indices[4]], cj_cache->vx[indices[5]],
                  cj_cache->vx[indices[6]], cj_cache->vx[indices[7]]);
  vjy.v = vec_set(cj_cache->vy[indices[0]], cj_cache->vy[indices[1]],
                  cj_cache->vy[indices[2]], cj_cache->vy[indices[3]],
                  cj_cache->vy[indices[4]], cj_cache->vy[indices[5]],
                  cj_cache->vy[indices[6]], cj_cache->vy[indices[7]]);
  vjz.v = vec_set(cj_cache->vz[indices[0]], cj_cache->vz[indices[1]],
                  cj_cache->vz[indices[2]], cj_cache->vz[indices[3]],
                  cj_cache->vz[indices[4]], cj_cache->vz[indices[5]],
                  cj_cache->vz[indices[6]], cj_cache->vz[indices[7]]);
  // dx.v = vec_load(Dx);
  // dy.v = vec_load(Dy);
  // dz.v = vec_load(Dz);
558
559

  /* Get the radius and inverse radius. */
James Willis's avatar
James Willis committed
560
  // r2.v = vec_load(R2);
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
  ri = vec_reciprocal_sqrt(*r2);
  r.v = vec_mul(r2->v, ri.v);

  xi.v = vec_mul(r.v, hi_inv.v);

  /* Calculate the kernel for two particles. */
  kernel_deval_1_vec(&xi, &wi, &wi_dx);

  /* Compute dv. */
  dvx.v = vec_sub(vix.v, vjx.v);
  dvy.v = vec_sub(viy.v, vjy.v);
  dvz.v = vec_sub(viz.v, vjz.v);

  /* Compute dv dot r */
  dvdr.v = vec_fma(dvx.v, dx->v, vec_fma(dvy.v, dy->v, vec_mul(dvz.v, dz->v)));
  dvdr.v = vec_mul(dvdr.v, ri.v);

578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
  /* Compute dv cross r */
  curlvrx.v =
      vec_fma(dvy.v, dz->v, vec_mul(vec_set1(-1.0f), vec_mul(dvz.v, dy->v)));
  curlvry.v =
      vec_fma(dvz.v, dx->v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz->v)));
  curlvrz.v =
      vec_fma(dvx.v, dy->v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx->v)));
  curlvrx.v = vec_mul(curlvrx.v, ri.v);
  curlvry.v = vec_mul(curlvry.v, ri.v);
  curlvrz.v = vec_mul(curlvrz.v, ri.v);

/* Mask updates to intermediate vector sums for particle pi. */
#ifdef HAVE_AVX512_F
  rhoSum->v =
      _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);

  rho_dhSum->v =
      _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v,
                         vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
                                               vec_mul(xi.v, wi_dx.v))));

  wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v);

  wcount_dhSum->v = _mm512_mask_sub_ps(wcount_dhSum->v, knlMask,
                                       wcount_dhSum->v, vec_mul(xi.v, wi_dx.v));

  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v,
                                   vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));

  curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
                                    vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
                                    curlvxSum->v);
James Willis's avatar
James Willis committed
610

611
612
613
  curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
                                    vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
                                    curlvySum->v);
James Willis's avatar
James Willis committed
614

615
616
617
  curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
                                    vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
                                    curlvzSum->v);
James Willis's avatar
James Willis committed
618
#else
619
620
  rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
  rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
James Willis's avatar
James Willis committed
621
622
                                                vec_mul(xi.v, wi_dx.v))),
                          mask.v);
623
624
625
626
627
628
629
630
631
  wcountSum->v += vec_and(wi.v, mask.v);
  wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v), mask.v);
  div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
  curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
  curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
  curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
#endif
}

632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
/**
 * @brief Density interaction computed using 2 interleaved vectors
 * (non-symmetric vectorized version).
 */
__attribute__((always_inline)) INLINE static void
runner_iact_nonsym_1_vec_density(
    float *R2, float *Dx, float *Dy, float *Dz, vector hi_inv, vector vix,
    vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj,
    vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
    vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
    vector mask, vector mask2, int knlMask, int knlMask2) {

  vector r, ri, r2, xi, wi, wi_dx;
  vector mj;
  vector dx, dy, dz, dvx, dvy, dvz;
  vector vjx, vjy, vjz;
  vector dvdr;
  vector curlvrx, curlvry, curlvrz;
James Willis's avatar
James Willis committed
650

651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
  /* Fill the vectors. */
  mj.v = vec_load(Mj);
  vjx.v = vec_load(Vjx);
  vjy.v = vec_load(Vjy);
  vjz.v = vec_load(Vjz);
  dx.v = vec_load(Dx);
  dy.v = vec_load(Dy);
  dz.v = vec_load(Dz);

  /* Get the radius and inverse radius. */
  r2.v = vec_load(R2);
  ri = vec_reciprocal_sqrt(r2);
  r.v = vec_mul(r2.v, ri.v);

  xi.v = vec_mul(r.v, hi_inv.v);

  /* Calculate the kernel for two particles. */
  kernel_deval_1_vec(&xi, &wi, &wi_dx);

  /* Compute dv. */
  dvx.v = vec_sub(vix.v, vjx.v);
  dvy.v = vec_sub(viy.v, vjy.v);
  dvz.v = vec_sub(viz.v, vjz.v);

  /* Compute dv dot r */
  dvdr.v = vec_fma(dvx.v, dx.v, vec_fma(dvy.v, dy.v, vec_mul(dvz.v, dz.v)));
  dvdr.v = vec_mul(dvdr.v, ri.v);

  /* Compute dv cross r */
  curlvrx.v =
      vec_fma(dvy.v, dz.v, vec_mul(vec_set1(-1.0f), vec_mul(dvz.v, dy.v)));
  curlvry.v =
      vec_fma(dvz.v, dx.v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz.v)));
  curlvrz.v =
      vec_fma(dvx.v, dy.v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx.v)));
  curlvrx.v = vec_mul(curlvrx.v, ri.v);
  curlvry.v = vec_mul(curlvry.v, ri.v);
  curlvrz.v = vec_mul(curlvrz.v, ri.v);

/* Mask updates to intermediate vector sums for particle pi. */
#ifdef HAVE_AVX512_F
  rhoSum->v =
      _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);

  rho_dhSum->v =
      _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v,
                         vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
                                               vec_mul(xi.v, wi_dx.v))));

  wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v);

  wcount_dhSum->v = _mm512_mask_sub_ps(wcount_dhSum->v, knlMask,
                                       wcount_dhSum->v, vec_mul(xi.v, wi_dx.v));

  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v,
                                   vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));

  curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
                                    vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
                                    curlvxSum->v);
James Willis's avatar
James Willis committed
711

712
713
714
  curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
                                    vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
                                    curlvySum->v);
James Willis's avatar
James Willis committed
715

716
717
718
  curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
                                    vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
                                    curlvzSum->v);
James Willis's avatar
James Willis committed
719
#else
720
721
722
723
724
725
726
727
728
729
730
731
732
733
  rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
  rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
                                                vec_mul(xi.v, wi_dx.v))),
                          mask.v);
  wcountSum->v += vec_and(wi.v, mask.v);
  wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v), mask.v);
  div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
  curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
  curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
  curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
#endif
}
#endif

James Willis's avatar
James Willis committed
734
#ifdef WITH_VECTORIZATION
735
/**
James Willis's avatar
James Willis committed
736
737
 * @brief Density interaction computed using 2 interleaved vectors
 * (non-symmetric vectorized version).
738
739
 */
__attribute__((always_inline)) INLINE static void
James Willis's avatar
James Willis committed
740
741
742
743
744
745
runner_iact_nonsym_2_vec_density(
    float *R2, float *Dx, float *Dy, float *Dz, vector hi_inv, vector vix,
    vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj,
    vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
    vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
    vector mask, vector mask2, int knlMask, int knlMask2) {
746
747
748
749
750
751
752
753
754
755
756
757
758
759

  vector r, ri, r2, xi, wi, wi_dx;
  vector mj;
  vector dx, dy, dz, dvx, dvy, dvz;
  vector vjx, vjy, vjz;
  vector dvdr;
  vector curlvrx, curlvry, curlvrz;
  vector r_2, ri2, r2_2, xi2, wi2, wi_dx2;
  vector mj2;
  vector dx2, dy2, dz2, dvx2, dvy2, dvz2;
  vector vjx2, vjy2, vjz2;
  vector dvdr2;
  vector curlvrx2, curlvry2, curlvrz2;

James Willis's avatar
James Willis committed
760
  /* Fill the vectors. */
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
  mj.v = vec_load(Mj);
  mj2.v = vec_load(&Mj[VEC_SIZE]);
  vjx.v = vec_load(Vjx);
  vjx2.v = vec_load(&Vjx[VEC_SIZE]);
  vjy.v = vec_load(Vjy);
  vjy2.v = vec_load(&Vjy[VEC_SIZE]);
  vjz.v = vec_load(Vjz);
  vjz2.v = vec_load(&Vjz[VEC_SIZE]);
  dx.v = vec_load(Dx);
  dx2.v = vec_load(&Dx[VEC_SIZE]);
  dy.v = vec_load(Dy);
  dy2.v = vec_load(&Dy[VEC_SIZE]);
  dz.v = vec_load(Dz);
  dz2.v = vec_load(&Dz[VEC_SIZE]);

  /* Get the radius and inverse radius. */
  r2.v = vec_load(R2);
  r2_2.v = vec_load(&R2[VEC_SIZE]);
779
780
  ri = vec_reciprocal_sqrt(r2);
  ri2 = vec_reciprocal_sqrt(r2_2);
781
782
783
784
785
786
  r.v = vec_mul(r2.v, ri.v);
  r_2.v = vec_mul(r2_2.v, ri2.v);

  xi.v = vec_mul(r.v, hi_inv.v);
  xi2.v = vec_mul(r_2.v, hi_inv.v);

James Willis's avatar
James Willis committed
787
  /* Calculate the kernel for two particles. */
James Willis's avatar
James Willis committed
788
  kernel_deval_2_vec(&xi, &wi, &wi_dx, &xi2, &wi2, &wi_dx2);
789
790
791
792
793
794
795
796
797
798
799

  /* Compute dv. */
  dvx.v = vec_sub(vix.v, vjx.v);
  dvx2.v = vec_sub(vix.v, vjx2.v);
  dvy.v = vec_sub(viy.v, vjy.v);
  dvy2.v = vec_sub(viy.v, vjy2.v);
  dvz.v = vec_sub(viz.v, vjz.v);
  dvz2.v = vec_sub(viz.v, vjz2.v);

  /* Compute dv dot r */
  dvdr.v = vec_fma(dvx.v, dx.v, vec_fma(dvy.v, dy.v, vec_mul(dvz.v, dz.v)));
James Willis's avatar
James Willis committed
800
801
  dvdr2.v =
      vec_fma(dvx2.v, dx2.v, vec_fma(dvy2.v, dy2.v, vec_mul(dvz2.v, dz2.v)));
802
803
804
805
  dvdr.v = vec_mul(dvdr.v, ri.v);
  dvdr2.v = vec_mul(dvdr2.v, ri2.v);

  /* Compute dv cross r */
James Willis's avatar
James Willis committed
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
  curlvrx.v =
      vec_fma(dvy.v, dz.v, vec_mul(vec_set1(-1.0f), vec_mul(dvz.v, dy.v)));
  curlvrx2.v =
      vec_fma(dvy2.v, dz2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvz2.v, dy2.v)));
  curlvry.v =
      vec_fma(dvz.v, dx.v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz.v)));
  curlvry2.v =
      vec_fma(dvz2.v, dx2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvx2.v, dz2.v)));
  curlvrz.v =
      vec_fma(dvx.v, dy.v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx.v)));
  curlvrz2.v =
      vec_fma(dvx2.v, dy2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvy2.v, dx2.v)));
  curlvrx.v = vec_mul(curlvrx.v, ri.v);
  curlvrx2.v = vec_mul(curlvrx2.v, ri2.v);
  curlvry.v = vec_mul(curlvry.v, ri.v);
  curlvry2.v = vec_mul(curlvry2.v, ri2.v);
  curlvrz.v = vec_mul(curlvrz.v, ri.v);
  curlvrz2.v = vec_mul(curlvrz2.v, ri2.v);

/* Mask updates to intermediate vector sums for particle pi. */
826
#ifdef HAVE_AVX512_F
James Willis's avatar
James Willis committed
827
828
829
830
831
832
833
834
835
836
837
838
839
  rhoSum->v =
      _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);
  rhoSum->v =
      _mm512_mask_add_ps(rhoSum->v, knlMask2, vec_mul(mj2.v, wi2.v), rhoSum->v);

  rho_dhSum->v =
      _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v,
                         vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
                                               vec_mul(xi.v, wi_dx.v))));
  rho_dhSum->v = _mm512_mask_sub_ps(
      rho_dhSum->v, knlMask2, rho_dhSum->v,
      vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
                             vec_mul(xi2.v, wi_dx2.v))));
840
841

  wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v);
James Willis's avatar
James Willis committed
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
  wcountSum->v =
      _mm512_mask_add_ps(wcountSum->v, knlMask2, wi2.v, wcountSum->v);

  wcount_dhSum->v = _mm512_mask_sub_ps(wcount_dhSum->v, knlMask,
                                       wcount_dhSum->v, vec_mul(xi.v, wi_dx.v));
  wcount_dhSum->v = _mm512_mask_sub_ps(
      wcount_dhSum->v, knlMask2, wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v));

  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v,
                                   vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask2, div_vSum->v,
                                   vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)));

  curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
                                    vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
                                    curlvxSum->v);
  curlvxSum->v = _mm512_mask_add_ps(
      curlvxSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)),
      curlvxSum->v);

  curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
                                    vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
                                    curlvySum->v);
  curlvySum->v = _mm512_mask_add_ps(
      curlvySum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)),
      curlvySum->v);

  curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
                                    vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
                                    curlvzSum->v);
  curlvzSum->v = _mm512_mask_add_ps(
      curlvzSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)),
      curlvzSum->v);
875
#else
James Willis's avatar
James Willis committed
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
  rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
  rhoSum->v += vec_and(vec_mul(mj2.v, wi2.v), mask2.v);
  rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
                                                vec_mul(xi.v, wi_dx.v))),
                          mask.v);
  rho_dhSum->v -=
      vec_and(vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
                                     vec_mul(xi2.v, wi_dx2.v))),
              mask2.v);
  wcountSum->v += vec_and(wi.v, mask.v);
  wcountSum->v += vec_and(wi2.v, mask2.v);
  wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v), mask.v);
  wcount_dhSum->v -= vec_and(vec_mul(xi2.v, wi_dx2.v), mask2.v);
  div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
  div_vSum->v -= vec_and(vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2.v);
  curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
  curlvxSum->v +=
      vec_and(vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2.v);
  curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
  curlvySum->v +=
      vec_and(vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2.v);
  curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
  curlvzSum->v +=
      vec_and(vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2.v);
900
901
#endif
}
James Willis's avatar
James Willis committed
902
#endif
903

904
905
906
/**
 * @brief Force loop
 */
907
908
909
__attribute__((always_inline)) INLINE static void runner_iact_force(
    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {

910
911
912
  float wi, wj, wi_dx, wj_dx;

  const float fac_mu = 1.f; /* Will change with cosmological integration */
913

914
915
916
917
  const float r = sqrtf(r2);
  const float r_inv = 1.0f / r;

  /* Get some values in local variables. */
918
  const float mi = pi->mass;
919
920
921
922
923
924
  const float mj = pj->mass;
  const float rhoi = pi->rho;
  const float rhoj = pj->rho;

  /* Get the kernel for hi. */
  const float hi_inv = 1.0f / hi;
925
  const float hid_inv = pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */
926
927
  const float ui = r * hi_inv;
  kernel_deval(ui, &wi, &wi_dx);
928
  const float wi_dr = hid_inv * wi_dx;
929
930
931

  /* Get the kernel for hj. */
  const float hj_inv = 1.0f / hj;
932
  const float hjd_inv = pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */
933
934
  const float xj = r * hj_inv;
  kernel_deval(xj, &wj, &wj_dx);
935
  const float wj_dr = hjd_inv * wj_dx;
936

937
938
939
940
941
  /* Compute h-gradient terms */
  const float f_i = pi->force.f;
  const float f_j = pj->force.f;

  /* Compute pressure terms */
942
943
  const float P_over_rho2_i = pi->force.P_over_rho2;
  const float P_over_rho2_j = pj->force.P_over_rho2;
944
945

  /* Compute sound speeds */
946
947
  const float ci = pi->force.soundspeed;
  const float cj = pj->force.soundspeed;
948

949
  /* Compute dv dot r. */
950
951
952
  const float dvdr = (pi->v[0] - pj->v[0]) * dx[0] +
                     (pi->v[1] - pj->v[1]) * dx[1] +
                     (pi->v[2] - pj->v[2]) * dx[2];
953

954
  /* Balsara term */
955
956
  const float balsara_i = pi->force.balsara;
  const float balsara_j = pj->force.balsara;
Matthieu Schaller's avatar
Matthieu Schaller committed
957

958
  /* Are the particles moving towards each others ? */
959
  const float omega_ij = (dvdr < 0.f) ? dvdr : 0.f;
960
961
962
963
964
965
966
967
968
  const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */

  /* Signal velocity */
  const float v_sig = ci + cj - 3.f * mu_ij;

  /* Now construct the full viscosity term */
  const float rho_ij = 0.5f * (rhoi + rhoj);
  const float visc = -0.25f * const_viscosity_alpha * v_sig * mu_ij *
                     (balsara_i + balsara_j) / rho_ij;
969
970

  /* Now, convolve with the kernel */
971
  const float visc_term = 0.5f * visc * (wi_dr + wj_dr) * r_inv;
972
  const float sph_term =
973
      (f_i * P_over_rho2_i * wi_dr + f_j * P_over_rho2_j * wj_dr) * r_inv;
974
975
976
977
978

  /* Eventually got the acceleration */
  const float acc = visc_term + sph_term;

  /* Use the force Luke ! */
979
980
981
  pi->a_hydro[0] -= mj * acc * dx[0];
  pi->a_hydro[1] -= mj * acc * dx[1];
  pi->a_hydro[2] -= mj * acc * dx[2];
982

983
984
985
  pj->a_hydro[0] += mi * acc * dx[0];
  pj->a_hydro[1] += mi * acc * dx[1];
  pj->a_hydro[2] += mi * acc * dx[2];
986

987
  /* Get the time derivative for h. */
988
989
  pi->force.h_dt -= mj * dvdr * r_inv / rhoj * wi_dr;
  pj->force.h_dt -= mi * dvdr * r_inv / rhoi * wj_dr;
990

991
  /* Update the signal velocity. */
992
993
  pi->force.v_sig = (pi->force.v_sig > v_sig) ? pi->force.v_sig : v_sig;
  pj->force.v_sig = (pj->force.v_sig > v_sig) ? pj->force.v_sig : v_sig;
994

995
  /* Change in entropy */
996
997
  pi->entropy_dt += mj * visc_term * dvdr;
  pj->entropy_dt += mi * visc_term * dvdr;
998
}
999

1000
1001
1002
1003
1004
1005
/**
 * @brief Force loop (Vectorized version)
 */
__attribute__((always_inline)) INLINE static void runner_iact_vec_force(
    float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
    struct part **pj) {
1006
1007
1008
1009
1010
1011

#ifdef WITH_VECTORIZATION

  vector r, r2, ri;
  vector xi, xj;
  vector hi, hj, hi_inv, hj_inv;
1012
  vector hid_inv, hjd_inv;
1013
  vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
1014
  vector piPOrho2, pjPOrho2, pirho, pjrho;
1015
1016
  vector mi, mj;
  vector f;
1017
  vector grad_hi, grad_hj;
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
  vector dx[3];
  vector vi[3], vj[3];
  vector pia[3], pja[3];
  vector pih_dt, pjh_dt;
  vector ci, cj, v_sig;
  vector omega_ij, mu_ij, fac_mu, balsara;
  vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt;
  int j, k;

  fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */

Matthieu Schaller's avatar
Matthieu Schaller committed
1029
/* Load stuff. */
1030
1031
1032
1033
1034
#if VEC_SIZE == 8
  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass,
                 pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass);
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
1035
  piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
James Willis's avatar
James Willis committed
1036
1037
1038
                       pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2,
                       pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2,
                       pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2);
1039
  pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
James Willis's avatar
James Willis committed
1040
1041
1042
1043
1044
1045
1046
1047
1048
                       pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2,
                       pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2,
                       pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2);
  grad_hi.v =
      vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f,
              pi[4]->force.f, pi[5]->force.f, pi[6]->force.f, pi[7]->force.f);
  grad_hj.v =
      vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f,
              pj[4]->force.f, pj[5]->force.f, pj[6]->force.f, pj[7]->force.f);
1049
1050
1051
1052
  pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho,
                    pi[5]->rho, pi[6]->rho, pi[7]->rho);
  pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho,
                    pj[5]->rho, pj[6]->rho, pj[7]->rho);
Matthieu Schaller's avatar
Matthieu Schaller committed
1053
1054
1055
1056
1057
1058
1059
1060
  ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed,
                 pi[2]->force.soundspeed, pi[3]->force.soundspeed,
                 pi[4]->force.soundspeed, pi[5]->force.soundspeed,
                 pi[6]->force.soundspeed, pi[7]->force.soundspeed);
  cj.v = vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed,
                 pj[2]->force.soundspeed, pj[3]->force.soundspeed,
                 pj[4]->force.soundspeed, pj[5]->force.soundspeed,
                 pj[6]->force.soundspeed, pj[7]->force.soundspeed);
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
                      pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
                      pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
  }
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
  balsara.v =
      vec_set(pi[0]->force.balsara, pi[1]->force.balsara, pi[2]->force.balsara,
              pi[3]->force.balsara, pi[4]->force.balsara, pi[5]->force.balsara,
              pi[6]->force.balsara, pi[7]->force.balsara) +
      vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara,
              pj[3]->force.balsara, pj[4]->force.balsara, pj[5]->force.balsara,
              pj[6]->force.balsara, pj[7]->force.balsara);
#elif VEC_SIZE == 4
1078
  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass);
1079
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
1080
  piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
James Willis's avatar
James Willis committed
1081
                       pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2);
1082
  pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
James Willis's avatar
James Willis committed
1083
1084
1085
1086
1087
                       pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2);
  grad_hi.v =
      vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f);
  grad_hj.v =
      vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f);
1088
1089
  pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho);
  pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho);
Matthieu Schaller's avatar
Matthieu Schaller committed
1090
1091
1092
1093
  ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed,
                 pi[2]->force.soundspeed, pi[3]->force.soundspeed);
  cj.v = vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed,
                 pj[2]->force.soundspeed, pj[3]->force.soundspeed);
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
  }
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
  balsara.v = vec_set(pi[0]->force.balsara, pi[1]->force.balsara,
                      pi[2]->force.balsara, pi[3]->force.balsara) +
              vec_set(pj[0]->force.balsara, pj[1]->force.balsara,
                      pj[2]->force.balsara, pj[3]->force.balsara);
#else
  error("Unknown vector size.")
#endif

  /* Get the radius and inverse radius. */
  r2.v = vec_load(R2);
1110
  ri = vec_reciprocal_sqrt(r2);
1111
1112
1113
1114
  r.v = r2.v * ri.v;

  /* Get the kernel for hi. */
  hi.v = vec_load(Hi);
1115
  hi_inv = vec_reciprocal(hi);
1116
  hid_inv = pow_dimension_plus_one_vec(hi_inv); /* 1/h^(d+1) */
1117
1118
  xi.v = r.v * hi_inv.v;
  kernel_deval_vec(&xi, &wi, &wi_dx);
1119
  wi_dr.v = hid_inv.v * wi_dx.v;
1120
1121
1122

  /* Get the kernel for hj. */
  hj.v = vec_load(Hj);
1123
  hj_inv = vec_reciprocal(hj);
1124
  hjd_inv = pow_dimension_plus_one_vec(hj_inv); /* 1/h^(d+1) */
1125
1126
  xj.v = r.v * hj_inv.v;
  kernel_deval_vec(&xj, &wj, &wj_dx);
1127
  wj_dr.v = hjd_inv.v * wj_dx.v;
1128
1129
1130
1131

  /* Compute dv dot r. */
  dvdr.v = ((vi[0].v - vj[0].v) * dx[0].v) + ((vi[1].v - vj[1].v) * dx[1].v) +
           ((vi[2].v - vj[2].v) * dx[2].v);
Matthieu Schaller's avatar
Matthieu Schaller committed
1132
  // dvdr.v = dvdr.v * ri.v;
1133
1134
1135
1136
1137

  /* Compute the relative velocity. (This is 0 if the particles move away from
   * each other and negative otherwise) */
  omega_ij.v = vec_fmin(dvdr.v, vec_set1(0.0f));
  mu_ij.v = fac_mu.v * ri.v * omega_ij.v; /* This is 0 or negative */
Matthieu Schaller's avatar
Matthieu Schaller committed
1138

1139
1140
  /* Compute signal velocity */
  v_sig.v = ci.v + cj.v - vec_set1(3.0f) * mu_ij.v;
Matthieu Schaller's avatar
Matthieu Schaller committed
1141

1142
1143
  /* Now construct the full viscosity term */
  rho_ij.v = vec_set1(0.5f) * (pirho.v + pjrho.v);
Matthieu Schaller's avatar
Matthieu Schaller committed
1144
1145
  visc.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig.v *
           mu_ij.v * balsara.v / rho_ij.v;
1146
1147
1148

  /* Now, convolve with the kernel */
  visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v;
James Willis's avatar
James Willis committed