hydro_iact.h 64 KB
Newer Older
1
2
/*******************************************************************************
 * This file is part of SWIFT.
3
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
4
 *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
5
 *
6
7
8
9
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
10
 *
11
12
13
14
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
15
 *
16
17
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18
 *
19
 ******************************************************************************/
20
21
#ifndef SWIFT_GADGET2_HYDRO_IACT_H
#define SWIFT_GADGET2_HYDRO_IACT_H
22
23

/**
24
 * @file Gadget2/hydro_iact.h
25
26
 * @brief SPH interaction functions following the Gadget-2 version of SPH.
 *
27
 * The interactions computed here are the ones presented in the Gadget-2 paper
28
29
 * Springel, V., MNRAS, Volume 364, Issue 4, pp. 1105-1134.
 * We use the same numerical coefficients as the Gadget-2 code. When used with
30
31
32
 * the Spline-3 kernel, the results should be equivalent to the ones obtained
 * with Gadget-2 up to the rounding errors and interactions missed by the
 * Gadget-2 tree-code neighbours search.
33
34
 */

35
#include "cache.h"
James Willis's avatar
James Willis committed
36
#include "minmax.h"
37

38
39
40
/**
 * @brief Density loop
 */
41
42
43
__attribute__((always_inline)) INLINE static void runner_iact_density(
    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {

44
45
  float wi, wi_dx;
  float wj, wj_dx;
46
  float dv[3], curlvr[3];
47

48
  /* Get the masses. */
49
  const float mi = pi->mass;
50
51
52
53
54
55
56
57
58
59
60
61
62
  const float mj = pj->mass;

  /* Get r and r inverse. */
  const float r = sqrtf(r2);
  const float r_inv = 1.0f / r;

  /* Compute the kernel function for pi */
  const float hi_inv = 1.f / hi;
  const float ui = r * hi_inv;
  kernel_deval(ui, &wi, &wi_dx);

  /* Compute contribution to the density */
  pi->rho += mj * wi;
63
  pi->density.rho_dh -= mj * (hydro_dimension * wi + ui * wi_dx);
64

65
66
67
68
69
70
71
72
73
74
75
  /* Compute contribution to the number of neighbours */
  pi->density.wcount += wi;
  pi->density.wcount_dh -= ui * wi_dx;

  /* Compute the kernel function for pj */
  const float hj_inv = 1.f / hj;
  const float uj = r * hj_inv;
  kernel_deval(uj, &wj, &wj_dx);

  /* Compute contribution to the density */
  pj->rho += mi * wj;
76
  pj->density.rho_dh -= mi * (hydro_dimension * wj + uj * wj_dx);
77

78
79
80
  /* Compute contribution to the number of neighbours */
  pj->density.wcount += wj;
  pj->density.wcount_dh -= uj * wj_dx;
81

82
83
  const float faci = mj * wi_dx * r_inv;
  const float facj = mi * wj_dx * r_inv;
84

85
86
87
88
  /* Compute dv dot r */
  dv[0] = pi->v[0] - pj->v[0];
  dv[1] = pi->v[1] - pj->v[1];
  dv[2] = pi->v[2] - pj->v[2];
89
90
  const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];

91
92
  pi->density.div_v -= faci * dvdr;
  pj->density.div_v -= facj * dvdr;
93
94
95
96
97
98

  /* Compute dv cross r */
  curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
  curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
  curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];

99
100
101
  pi->density.rot_v[0] += faci * curlvr[0];
  pi->density.rot_v[1] += faci * curlvr[1];
  pi->density.rot_v[2] += faci * curlvr[2];
102

103
104
105
  pj->density.rot_v[0] += facj * curlvr[0];
  pj->density.rot_v[1] += facj * curlvr[1];
  pj->density.rot_v[2] += facj * curlvr[2];
106
107
}

108
109
110
111
112
113
/**
 * @brief Density loop (Vectorized version)
 */
__attribute__((always_inline)) INLINE static void runner_iact_vec_density(
    float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
    struct part **pj) {
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138

#ifdef WITH_VECTORIZATION

  vector r, ri, r2, xi, xj, hi, hj, hi_inv, hj_inv, wi, wj, wi_dx, wj_dx;
  vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh;
  vector mi, mj;
  vector dx[3], dv[3];
  vector vi[3], vj[3];
  vector dvdr, div_vi, div_vj;
  vector curlvr[3], curl_vi[3], curl_vj[3];
  int k, j;

#if VEC_SIZE == 8
  /* Get the masses. */
  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass,
                 pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass);
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
  /* Get each velocity component. */
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
                      pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
                      pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
  }
Matthieu Schaller's avatar
Matthieu Schaller committed
139
140
  /* Get each component of particle separation.
   * (Dx={dx1,dy1,dz1,dx2,dy2,dz2,...,dxn,dyn,dzn})*/
141
142
143
144
145
146
147
148
149
150
151
152
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
#elif VEC_SIZE == 4
  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass);
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
  }
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
153
#else
154
  error("Unknown vector size.");
155
156
157
158
#endif

  /* Get the radius and inverse radius. */
  r2.v = vec_load(R2);
159
  ri = vec_reciprocal_sqrt(r2);
160
161
162
  r.v = r2.v * ri.v;

  hi.v = vec_load(Hi);
163
  hi_inv = vec_reciprocal(hi);
164
165
166
  xi.v = r.v * hi_inv.v;

  hj.v = vec_load(Hj);
167
  hj_inv = vec_reciprocal(hj);
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
  xj.v = r.v * hj_inv.v;

  /* Compute the kernel function. */
  kernel_deval_vec(&xi, &wi, &wi_dx);
  kernel_deval_vec(&xj, &wj, &wj_dx);

  /* Compute dv. */
  dv[0].v = vi[0].v - vj[0].v;
  dv[1].v = vi[1].v - vj[1].v;
  dv[2].v = vi[2].v - vj[2].v;

  /* Compute dv dot r */
  dvdr.v = (dv[0].v * dx[0].v) + (dv[1].v * dx[1].v) + (dv[2].v * dx[2].v);
  dvdr.v = dvdr.v * ri.v;

  /* Compute dv cross r */
  curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v;
  curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v;
  curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v;
  for (k = 0; k < 3; k++) curlvr[k].v *= ri.v;

  /* Compute density of pi. */
  rhoi.v = mj.v * wi.v;
191
  rhoi_dh.v = mj.v * (vec_set1(hydro_dimension) * wi.v + xi.v * wi_dx.v);
192
193
194
195
196
197
198
  wcounti.v = wi.v;
  wcounti_dh.v = xi.v * wi_dx.v;
  div_vi.v = mj.v * dvdr.v * wi_dx.v;
  for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v;

  /* Compute density of pj. */
  rhoj.v = mi.v * wj.v;
199
  rhoj_dh.v = mi.v * (vec_set1(hydro_dimension) * wj.v + xj.v * wj_dx.v);
200
201
202
203
204
205
206
207
  wcountj.v = wj.v;
  wcountj_dh.v = xj.v * wj_dx.v;
  div_vj.v = mi.v * dvdr.v * wj_dx.v;
  for (k = 0; k < 3; k++) curl_vj[k].v = mi.v * curlvr[k].v * wj_dx.v;

  /* Update particles. */
  for (k = 0; k < VEC_SIZE; k++) {
    pi[k]->rho += rhoi.f[k];
208
    pi[k]->density.rho_dh -= rhoi_dh.f[k];
209
210
    pi[k]->density.wcount += wcounti.f[k];
    pi[k]->density.wcount_dh -= wcounti_dh.f[k];
211
    pi[k]->density.div_v -= div_vi.f[k];
212
213
    for (j = 0; j < 3; j++) pi[k]->density.rot_v[j] += curl_vi[j].f[k];
    pj[k]->rho += rhoj.f[k];
214
    pj[k]->density.rho_dh -= rhoj_dh.f[k];
215
216
    pj[k]->density.wcount += wcountj.f[k];
    pj[k]->density.wcount_dh -= wcountj_dh.f[k];
217
    pj[k]->density.div_v -= div_vj.f[k];
218
219
220
221
222
    for (j = 0; j < 3; j++) pj[k]->density.rot_v[j] += curl_vj[j].f[k];
  }

#else

Matthieu Schaller's avatar
Matthieu Schaller committed
223
224
  error(
      "The Gadget2 serial version of runner_iact_density was called when the "
225
      "vectorised version should have been used.");
226
227

#endif
228
229
}

230
231
232
/**
 * @brief Density loop (non-symmetric version)
 */
233
234
235
236
237
238
239
__attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {

  float wi, wi_dx;
  float dv[3], curlvr[3];

  /* Get the masses. */
240
  const float mj = pj->mass;
241
242

  /* Get r and r inverse. */
243
244
  const float r = sqrtf(r2);
  const float ri = 1.0f / r;
245

246
  /* Compute the kernel function */
247
248
249
  const float hi_inv = 1.0f / hi;
  const float ui = r * hi_inv;
  kernel_deval(ui, &wi, &wi_dx);
250
251
252

  /* Compute contribution to the density */
  pi->rho += mj * wi;
253
  pi->density.rho_dh -= mj * (hydro_dimension * wi + ui * wi_dx);
254
255
256

  /* Compute contribution to the number of neighbours */
  pi->density.wcount += wi;
257
  pi->density.wcount_dh -= ui * wi_dx;
258

259
  const float fac = mj * wi_dx * ri;
260

261
262
263
264
265
  /* Compute dv dot r */
  dv[0] = pi->v[0] - pj->v[0];
  dv[1] = pi->v[1] - pj->v[1];
  dv[2] = pi->v[2] - pj->v[2];
  const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];
266
  pi->density.div_v -= fac * dvdr;
267

268
269
270
271
272
  /* Compute dv cross r */
  curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
  curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
  curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];

273
274
275
  pi->density.rot_v[0] += fac * curlvr[0];
  pi->density.rot_v[1] += fac * curlvr[1];
  pi->density.rot_v[2] += fac * curlvr[2];
276
277
}

278
279
280
281
282
283
/**
 * @brief Density loop (non-symmetric vectorized version)
 */
__attribute__((always_inline)) INLINE static void
runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
                               struct part **pi, struct part **pj) {
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306

#ifdef WITH_VECTORIZATION

  vector r, ri, r2, xi, hi, hi_inv, wi, wi_dx;
  vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi;
  vector mj;
  vector dx[3], dv[3];
  vector vi[3], vj[3];
  vector dvdr;
  vector curlvr[3], curl_vi[3];
  int k, j;

#if VEC_SIZE == 8
  /* Get the masses. */
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
  /* Get each velocity component. */
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
                      pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
                      pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
  }
Matthieu Schaller's avatar
Matthieu Schaller committed
307
308
  /* Get each component of particle separation.
   * (Dx={dx1,dy1,dz1,dx2,dy2,dz2,...,dxn,dyn,dzn})*/
309
310
311
312
313
314
315
316
317
318
319
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
#elif VEC_SIZE == 4
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
  }
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
320
#else
321
  error("Unknown vector size.");
322
323
324
325
#endif

  /* Get the radius and inverse radius. */
  r2.v = vec_load(R2);
326
  ri = vec_reciprocal_sqrt(r2);
327
328
329
  r.v = r2.v * ri.v;

  hi.v = vec_load(Hi);
330
  hi_inv = vec_reciprocal(hi);
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
  xi.v = r.v * hi_inv.v;

  kernel_deval_vec(&xi, &wi, &wi_dx);

  /* Compute dv. */
  dv[0].v = vi[0].v - vj[0].v;
  dv[1].v = vi[1].v - vj[1].v;
  dv[2].v = vi[2].v - vj[2].v;

  /* Compute dv dot r */
  dvdr.v = (dv[0].v * dx[0].v) + (dv[1].v * dx[1].v) + (dv[2].v * dx[2].v);
  dvdr.v = dvdr.v * ri.v;

  /* Compute dv cross r */
  curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v;
  curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v;
  curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v;
  for (k = 0; k < 3; k++) curlvr[k].v *= ri.v;

  /* Compute density of pi. */
  rhoi.v = mj.v * wi.v;
352
  rhoi_dh.v = mj.v * (vec_set1(hydro_dimension) * wi.v + xi.v * wi_dx.v);
353
354
355
356
357
358
359
360
  wcounti.v = wi.v;
  wcounti_dh.v = xi.v * wi_dx.v;
  div_vi.v = mj.v * dvdr.v * wi_dx.v;
  for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v;

  /* Update particles. */
  for (k = 0; k < VEC_SIZE; k++) {
    pi[k]->rho += rhoi.f[k];
361
    pi[k]->density.rho_dh -= rhoi_dh.f[k];
362
363
    pi[k]->density.wcount += wcounti.f[k];
    pi[k]->density.wcount_dh -= wcounti_dh.f[k];
364
    pi[k]->density.div_v -= div_vi.f[k];
365
366
367
368
369
    for (j = 0; j < 3; j++) pi[k]->density.rot_v[j] += curl_vi[j].f[k];
  }

#else

Matthieu Schaller's avatar
Matthieu Schaller committed
370
371
  error(
      "The Gadget2 serial version of runner_iact_nonsym_density was called "
372
      "when the vectorised version should have been used.");
373
374

#endif
375
376
}

377
#ifdef WITH_VECTORIZATION
378
379
380
381
382

/**
 * @brief Density interaction computed using 1 vector
 * (non-symmetric vectorized version).
 */
383
__attribute__((always_inline)) INLINE static void
Matthieu Schaller's avatar
Matthieu Schaller committed
384
385
386
387
388
389
390
runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
                                 vector hi_inv, vector vix, vector viy,
                                 vector viz, float *Vjx, float *Vjy, float *Vjz,
                                 float *Mj, vector *rhoSum, vector *rho_dhSum,
                                 vector *wcountSum, vector *wcount_dhSum,
                                 vector *div_vSum, vector *curlvxSum,
                                 vector *curlvySum, vector *curlvzSum,
391
                                 mask_t mask) {
392
393
394
395
396
397
398

  vector r, ri, xi, wi, wi_dx;
  vector mj;
  vector dvx, dvy, dvz;
  vector vjx, vjy, vjz;
  vector dvdr;
  vector curlvrx, curlvry, curlvrz;
James Willis's avatar
James Willis committed
399

400
  /* Fill the vectors. */
401
402
403
404
  mj.v = vec_load(Mj);
  vjx.v = vec_load(Vjx);
  vjy.v = vec_load(Vjy);
  vjz.v = vec_load(Vjz);
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423

  /* Get the radius and inverse radius. */
  ri = vec_reciprocal_sqrt(*r2);
  r.v = vec_mul(r2->v, ri.v);

  xi.v = vec_mul(r.v, hi_inv.v);

  /* Calculate the kernel for two particles. */
  kernel_deval_1_vec(&xi, &wi, &wi_dx);

  /* Compute dv. */
  dvx.v = vec_sub(vix.v, vjx.v);
  dvy.v = vec_sub(viy.v, vjy.v);
  dvz.v = vec_sub(viz.v, vjz.v);

  /* Compute dv dot r */
  dvdr.v = vec_fma(dvx.v, dx->v, vec_fma(dvy.v, dy->v, vec_mul(dvz.v, dz->v)));
  dvdr.v = vec_mul(dvdr.v, ri.v);

424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
  /* Compute dv cross r */
  curlvrx.v =
      vec_fma(dvy.v, dz->v, vec_mul(vec_set1(-1.0f), vec_mul(dvz.v, dy->v)));
  curlvry.v =
      vec_fma(dvz.v, dx->v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz->v)));
  curlvrz.v =
      vec_fma(dvx.v, dy->v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx->v)));
  curlvrx.v = vec_mul(curlvrx.v, ri.v);
  curlvry.v = vec_mul(curlvry.v, ri.v);
  curlvrz.v = vec_mul(curlvrz.v, ri.v);

/* Mask updates to intermediate vector sums for particle pi. */
#ifdef HAVE_AVX512_F
  rhoSum->v =
      _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);

  rho_dhSum->v =
      _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v,
                         vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
                                               vec_mul(xi.v, wi_dx.v))));

  wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v);

  wcount_dhSum->v = _mm512_mask_sub_ps(wcount_dhSum->v, knlMask,
                                       wcount_dhSum->v, vec_mul(xi.v, wi_dx.v));

  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v,
                                   vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));

  curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
                                    vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
                                    curlvxSum->v);
James Willis's avatar
James Willis committed
456

457
458
459
  curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
                                    vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
                                    curlvySum->v);
James Willis's avatar
James Willis committed
460

461
462
463
  curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
                                    vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
                                    curlvzSum->v);
James Willis's avatar
James Willis committed
464
#else
465
466
467
468
469
470
471
472
473
  rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
  rho_dhSum->v = vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
                                                vec_mul(xi.v, wi_dx.v))), mask);
  wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
  wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask);
  div_vSum->v = vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
  curlvxSum->v = vec_mask_add(curlvxSum->v,vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
  curlvySum->v = vec_mask_add(curlvySum->v,vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
  curlvzSum->v = vec_mask_add(curlvzSum->v,vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
474
475
476
#endif
}

477
/**
James Willis's avatar
James Willis committed
478
479
 * @brief Density interaction computed using 2 interleaved vectors
 * (non-symmetric vectorized version).
480
481
 */
__attribute__((always_inline)) INLINE static void
James Willis's avatar
James Willis committed
482
483
484
485
486
runner_iact_nonsym_2_vec_density(
    float *R2, float *Dx, float *Dy, float *Dz, vector hi_inv, vector vix,
    vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj,
    vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
    vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
487
    mask_t mask, mask_t mask2) {
488
489
490
491
492
493
494
495
496
497
498
499
500
501

  vector r, ri, r2, xi, wi, wi_dx;
  vector mj;
  vector dx, dy, dz, dvx, dvy, dvz;
  vector vjx, vjy, vjz;
  vector dvdr;
  vector curlvrx, curlvry, curlvrz;
  vector r_2, ri2, r2_2, xi2, wi2, wi_dx2;
  vector mj2;
  vector dx2, dy2, dz2, dvx2, dvy2, dvz2;
  vector vjx2, vjy2, vjz2;
  vector dvdr2;
  vector curlvrx2, curlvry2, curlvrz2;

James Willis's avatar
James Willis committed
502
  /* Fill the vectors. */
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
  mj.v = vec_load(Mj);
  mj2.v = vec_load(&Mj[VEC_SIZE]);
  vjx.v = vec_load(Vjx);
  vjx2.v = vec_load(&Vjx[VEC_SIZE]);
  vjy.v = vec_load(Vjy);
  vjy2.v = vec_load(&Vjy[VEC_SIZE]);
  vjz.v = vec_load(Vjz);
  vjz2.v = vec_load(&Vjz[VEC_SIZE]);
  dx.v = vec_load(Dx);
  dx2.v = vec_load(&Dx[VEC_SIZE]);
  dy.v = vec_load(Dy);
  dy2.v = vec_load(&Dy[VEC_SIZE]);
  dz.v = vec_load(Dz);
  dz2.v = vec_load(&Dz[VEC_SIZE]);

  /* Get the radius and inverse radius. */
  r2.v = vec_load(R2);
  r2_2.v = vec_load(&R2[VEC_SIZE]);
521
522
  ri = vec_reciprocal_sqrt(r2);
  ri2 = vec_reciprocal_sqrt(r2_2);
523
524
525
526
527
528
  r.v = vec_mul(r2.v, ri.v);
  r_2.v = vec_mul(r2_2.v, ri2.v);

  xi.v = vec_mul(r.v, hi_inv.v);
  xi2.v = vec_mul(r_2.v, hi_inv.v);

James Willis's avatar
James Willis committed
529
  /* Calculate the kernel for two particles. */
James Willis's avatar
James Willis committed
530
  kernel_deval_2_vec(&xi, &wi, &wi_dx, &xi2, &wi2, &wi_dx2);
531
532
533
534
535
536
537
538
539
540
541

  /* Compute dv. */
  dvx.v = vec_sub(vix.v, vjx.v);
  dvx2.v = vec_sub(vix.v, vjx2.v);
  dvy.v = vec_sub(viy.v, vjy.v);
  dvy2.v = vec_sub(viy.v, vjy2.v);
  dvz.v = vec_sub(viz.v, vjz.v);
  dvz2.v = vec_sub(viz.v, vjz2.v);

  /* Compute dv dot r */
  dvdr.v = vec_fma(dvx.v, dx.v, vec_fma(dvy.v, dy.v, vec_mul(dvz.v, dz.v)));
James Willis's avatar
James Willis committed
542
543
  dvdr2.v =
      vec_fma(dvx2.v, dx2.v, vec_fma(dvy2.v, dy2.v, vec_mul(dvz2.v, dz2.v)));
544
545
546
547
  dvdr.v = vec_mul(dvdr.v, ri.v);
  dvdr2.v = vec_mul(dvdr2.v, ri2.v);

  /* Compute dv cross r */
James Willis's avatar
James Willis committed
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
  curlvrx.v =
      vec_fma(dvy.v, dz.v, vec_mul(vec_set1(-1.0f), vec_mul(dvz.v, dy.v)));
  curlvrx2.v =
      vec_fma(dvy2.v, dz2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvz2.v, dy2.v)));
  curlvry.v =
      vec_fma(dvz.v, dx.v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz.v)));
  curlvry2.v =
      vec_fma(dvz2.v, dx2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvx2.v, dz2.v)));
  curlvrz.v =
      vec_fma(dvx.v, dy.v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx.v)));
  curlvrz2.v =
      vec_fma(dvx2.v, dy2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvy2.v, dx2.v)));
  curlvrx.v = vec_mul(curlvrx.v, ri.v);
  curlvrx2.v = vec_mul(curlvrx2.v, ri2.v);
  curlvry.v = vec_mul(curlvry.v, ri.v);
  curlvry2.v = vec_mul(curlvry2.v, ri2.v);
  curlvrz.v = vec_mul(curlvrz.v, ri.v);
  curlvrz2.v = vec_mul(curlvrz2.v, ri2.v);

/* Mask updates to intermediate vector sums for particle pi. */
568
#ifdef HAVE_AVX512_F
James Willis's avatar
James Willis committed
569
570
571
572
573
574
575
576
577
578
579
580
581
  rhoSum->v =
      _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);
  rhoSum->v =
      _mm512_mask_add_ps(rhoSum->v, knlMask2, vec_mul(mj2.v, wi2.v), rhoSum->v);

  rho_dhSum->v =
      _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v,
                         vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
                                               vec_mul(xi.v, wi_dx.v))));
  rho_dhSum->v = _mm512_mask_sub_ps(
      rho_dhSum->v, knlMask2, rho_dhSum->v,
      vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
                             vec_mul(xi2.v, wi_dx2.v))));
582
583

  wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v);
James Willis's avatar
James Willis committed
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
  wcountSum->v =
      _mm512_mask_add_ps(wcountSum->v, knlMask2, wi2.v, wcountSum->v);

  wcount_dhSum->v = _mm512_mask_sub_ps(wcount_dhSum->v, knlMask,
                                       wcount_dhSum->v, vec_mul(xi.v, wi_dx.v));
  wcount_dhSum->v = _mm512_mask_sub_ps(
      wcount_dhSum->v, knlMask2, wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v));

  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v,
                                   vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask2, div_vSum->v,
                                   vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)));

  curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
                                    vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
                                    curlvxSum->v);
  curlvxSum->v = _mm512_mask_add_ps(
      curlvxSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)),
      curlvxSum->v);

  curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
                                    vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
                                    curlvySum->v);
  curlvySum->v = _mm512_mask_add_ps(
      curlvySum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)),
      curlvySum->v);

  curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
                                    vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
                                    curlvzSum->v);
  curlvzSum->v = _mm512_mask_add_ps(
      curlvzSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)),
      curlvzSum->v);
617
#else
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
  rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
  rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2);
  rho_dhSum->v = vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
                                                vec_mul(xi.v, wi_dx.v))), mask);
  rho_dhSum->v = vec_mask_sub(rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
                                                vec_mul(xi2.v, wi_dx2.v))), mask2);
  wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
  wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2);
  wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask);
  wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v), mask2);
  div_vSum->v = vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
  div_vSum->v = vec_mask_sub(div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2);
  curlvxSum->v = vec_mask_add(curlvxSum->v,vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
  curlvxSum->v = vec_mask_add(curlvxSum->v,vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2);
  curlvySum->v = vec_mask_add(curlvySum->v,vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
  curlvySum->v = vec_mask_add(curlvySum->v,vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2);
  curlvzSum->v = vec_mask_add(curlvzSum->v,vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
  curlvzSum->v = vec_mask_add(curlvzSum->v,vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2);
636
637
#endif
}
James Willis's avatar
James Willis committed
638
#endif
639

640
641
642
/**
 * @brief Force loop
 */
643
644
645
__attribute__((always_inline)) INLINE static void runner_iact_force(
    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {

646
647
648
  float wi, wj, wi_dx, wj_dx;

  const float fac_mu = 1.f; /* Will change with cosmological integration */
649

650
651
652
653
  const float r = sqrtf(r2);
  const float r_inv = 1.0f / r;

  /* Get some values in local variables. */
654
  const float mi = pi->mass;
655
656
657
658
659
660
  const float mj = pj->mass;
  const float rhoi = pi->rho;
  const float rhoj = pj->rho;

  /* Get the kernel for hi. */
  const float hi_inv = 1.0f / hi;
661
  const float hid_inv = pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */
662
663
  const float ui = r * hi_inv;
  kernel_deval(ui, &wi, &wi_dx);
664
  const float wi_dr = hid_inv * wi_dx;
665
666
667

  /* Get the kernel for hj. */
  const float hj_inv = 1.0f / hj;
668
  const float hjd_inv = pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */
669
670
  const float xj = r * hj_inv;
  kernel_deval(xj, &wj, &wj_dx);
671
  const float wj_dr = hjd_inv * wj_dx;
672

673
674
675
676
677
  /* Compute h-gradient terms */
  const float f_i = pi->force.f;
  const float f_j = pj->force.f;

  /* Compute pressure terms */
678
679
  const float P_over_rho2_i = pi->force.P_over_rho2;
  const float P_over_rho2_j = pj->force.P_over_rho2;
680
681

  /* Compute sound speeds */
682
683
  const float ci = pi->force.soundspeed;
  const float cj = pj->force.soundspeed;
684

685
  /* Compute dv dot r. */
686
687
688
  const float dvdr = (pi->v[0] - pj->v[0]) * dx[0] +
                     (pi->v[1] - pj->v[1]) * dx[1] +
                     (pi->v[2] - pj->v[2]) * dx[2];
689

690
  /* Balsara term */
691
692
  const float balsara_i = pi->force.balsara;
  const float balsara_j = pj->force.balsara;
Matthieu Schaller's avatar
Matthieu Schaller committed
693

694
  /* Are the particles moving towards each others ? */
695
  const float omega_ij = (dvdr < 0.f) ? dvdr : 0.f;
696
697
698
699
700
701
702
703
704
  const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */

  /* Signal velocity */
  const float v_sig = ci + cj - 3.f * mu_ij;

  /* Now construct the full viscosity term */
  const float rho_ij = 0.5f * (rhoi + rhoj);
  const float visc = -0.25f * const_viscosity_alpha * v_sig * mu_ij *
                     (balsara_i + balsara_j) / rho_ij;
705
706

  /* Now, convolve with the kernel */
707
  const float visc_term = 0.5f * visc * (wi_dr + wj_dr) * r_inv;
708
  const float sph_term =
709
      (f_i * P_over_rho2_i * wi_dr + f_j * P_over_rho2_j * wj_dr) * r_inv;
710
711
712
713
714

  /* Eventually got the acceleration */
  const float acc = visc_term + sph_term;

  /* Use the force Luke ! */
715
716
717
  pi->a_hydro[0] -= mj * acc * dx[0];
  pi->a_hydro[1] -= mj * acc * dx[1];
  pi->a_hydro[2] -= mj * acc * dx[2];
718

719
720
721
  pj->a_hydro[0] += mi * acc * dx[0];
  pj->a_hydro[1] += mi * acc * dx[1];
  pj->a_hydro[2] += mi * acc * dx[2];
722

723
  /* Get the time derivative for h. */
724
725
  pi->force.h_dt -= mj * dvdr * r_inv / rhoj * wi_dr;
  pj->force.h_dt -= mi * dvdr * r_inv / rhoi * wj_dr;
726

727
  /* Update the signal velocity. */
728
729
  pi->force.v_sig = (pi->force.v_sig > v_sig) ? pi->force.v_sig : v_sig;
  pj->force.v_sig = (pj->force.v_sig > v_sig) ? pj->force.v_sig : v_sig;
730

731
  /* Change in entropy */
732
733
  pi->entropy_dt += mj * visc_term * dvdr;
  pj->entropy_dt += mi * visc_term * dvdr;
734
}
735

736
737
738
739
740
741
/**
 * @brief Force loop (Vectorized version)
 */
__attribute__((always_inline)) INLINE static void runner_iact_vec_force(
    float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
    struct part **pj) {
742
743
744
745
746
747

#ifdef WITH_VECTORIZATION

  vector r, r2, ri;
  vector xi, xj;
  vector hi, hj, hi_inv, hj_inv;
748
  vector hid_inv, hjd_inv;
749
  vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
750
  vector piPOrho2, pjPOrho2, pirho, pjrho;
751
752
  vector mi, mj;
  vector f;
753
  vector grad_hi, grad_hj;
754
755
756
757
758
759
760
761
762
763
764
  vector dx[3];
  vector vi[3], vj[3];
  vector pia[3], pja[3];
  vector pih_dt, pjh_dt;
  vector ci, cj, v_sig;
  vector omega_ij, mu_ij, fac_mu, balsara;
  vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt;
  int j, k;

  fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */

Matthieu Schaller's avatar
Matthieu Schaller committed
765
/* Load stuff. */
766
767
768
769
770
#if VEC_SIZE == 8
  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass,
                 pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass);
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
771
  piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
James Willis's avatar
James Willis committed
772
773
774
                       pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2,
                       pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2,
                       pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2);
775
  pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
James Willis's avatar
James Willis committed
776
777
778
779
780
781
782
783
784
                       pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2,
                       pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2,
                       pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2);
  grad_hi.v =
      vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f,
              pi[4]->force.f, pi[5]->force.f, pi[6]->force.f, pi[7]->force.f);
  grad_hj.v =
      vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f,
              pj[4]->force.f, pj[5]->force.f, pj[6]->force.f, pj[7]->force.f);
785
786
787
788
  pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho,
                    pi[5]->rho, pi[6]->rho, pi[7]->rho);
  pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho,
                    pj[5]->rho, pj[6]->rho, pj[7]->rho);
Matthieu Schaller's avatar
Matthieu Schaller committed
789
790
791
792
793
794
795
796
  ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed,
                 pi[2]->force.soundspeed, pi[3]->force.soundspeed,
                 pi[4]->force.soundspeed, pi[5]->force.soundspeed,
                 pi[6]->force.soundspeed, pi[7]->force.soundspeed);
  cj.v = vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed,
                 pj[2]->force.soundspeed, pj[3]->force.soundspeed,
                 pj[4]->force.soundspeed, pj[5]->force.soundspeed,
                 pj[6]->force.soundspeed, pj[7]->force.soundspeed);
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
                      pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
                      pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
  }
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
  balsara.v =
      vec_set(pi[0]->force.balsara, pi[1]->force.balsara, pi[2]->force.balsara,
              pi[3]->force.balsara, pi[4]->force.balsara, pi[5]->force.balsara,
              pi[6]->force.balsara, pi[7]->force.balsara) +
      vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara,
              pj[3]->force.balsara, pj[4]->force.balsara, pj[5]->force.balsara,
              pj[6]->force.balsara, pj[7]->force.balsara);
#elif VEC_SIZE == 4
814
  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass);
815
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
816
  piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
James Willis's avatar
James Willis committed
817
                       pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2);
818
  pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
James Willis's avatar
James Willis committed
819
820
821
822
823
                       pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2);
  grad_hi.v =
      vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f);
  grad_hj.v =
      vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f);
824
825
  pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho);
  pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho);
Matthieu Schaller's avatar
Matthieu Schaller committed
826
827
828
829
  ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed,
                 pi[2]->force.soundspeed, pi[3]->force.soundspeed);
  cj.v = vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed,
                 pj[2]->force.soundspeed, pj[3]->force.soundspeed);
830
831
832
833
834
835
836
837
838
839
840
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
  }
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
  balsara.v = vec_set(pi[0]->force.balsara, pi[1]->force.balsara,
                      pi[2]->force.balsara, pi[3]->force.balsara) +
              vec_set(pj[0]->force.balsara, pj[1]->force.balsara,
                      pj[2]->force.balsara, pj[3]->force.balsara);
#else
841
  error("Unknown vector size.");
842
843
844
845
#endif

  /* Get the radius and inverse radius. */
  r2.v = vec_load(R2);
846
  ri = vec_reciprocal_sqrt(r2);
847
848
849
850
  r.v = r2.v * ri.v;

  /* Get the kernel for hi. */
  hi.v = vec_load(Hi);
851
  hi_inv = vec_reciprocal(hi);
852
  hid_inv = pow_dimension_plus_one_vec(hi_inv); /* 1/h^(d+1) */
853
854
  xi.v = r.v * hi_inv.v;
  kernel_deval_vec(&xi, &wi, &wi_dx);
855
  wi_dr.v = hid_inv.v * wi_dx.v;
856
857
858

  /* Get the kernel for hj. */
  hj.v = vec_load(Hj);
859
  hj_inv = vec_reciprocal(hj);
860
  hjd_inv = pow_dimension_plus_one_vec(hj_inv); /* 1/h^(d+1) */
861
862
  xj.v = r.v * hj_inv.v;
  kernel_deval_vec(&xj, &wj, &wj_dx);
863
  wj_dr.v = hjd_inv.v * wj_dx.v;
864
865
866
867

  /* Compute dv dot r. */
  dvdr.v = ((vi[0].v - vj[0].v) * dx[0].v) + ((vi[1].v - vj[1].v) * dx[1].v) +
           ((vi[2].v - vj[2].v) * dx[2].v);
Matthieu Schaller's avatar
Matthieu Schaller committed
868
  // dvdr.v = dvdr.v * ri.v;
869
870
871
872
873

  /* Compute the relative velocity. (This is 0 if the particles move away from
   * each other and negative otherwise) */
  omega_ij.v = vec_fmin(dvdr.v, vec_set1(0.0f));
  mu_ij.v = fac_mu.v * ri.v * omega_ij.v; /* This is 0 or negative */
Matthieu Schaller's avatar
Matthieu Schaller committed
874

875
876
  /* Compute signal velocity */
  v_sig.v = ci.v + cj.v - vec_set1(3.0f) * mu_ij.v;
Matthieu Schaller's avatar
Matthieu Schaller committed
877

878
879
  /* Now construct the full viscosity term */
  rho_ij.v = vec_set1(0.5f) * (pirho.v + pjrho.v);
Matthieu Schaller's avatar
Matthieu Schaller committed
880
881
  visc.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig.v *
           mu_ij.v * balsara.v / rho_ij.v;
882
883
884

  /* Now, convolve with the kernel */
  visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v;
James Willis's avatar
James Willis committed
885
886
887
  sph_term.v =
      (grad_hi.v * piPOrho2.v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) *
      ri.v;
888
889
890

  /* Eventually get the acceleration */
  acc.v = visc_term.v + sph_term.v;
Matthieu Schaller's avatar
Matthieu Schaller committed
891

892
893
894
895
896
897
898
899
900
901
902
903
  /* Use the force, Luke! */
  for (k = 0; k < 3; k++) {
    f.v = dx[k].v * acc.v;
    pia[k].v = mj.v * f.v;
    pja[k].v = mi.v * f.v;
  }

  /* Get the time derivative for h. */
  pih_dt.v = mj.v * dvdr.v * ri.v / pjrho.v * wi_dr.v;
  pjh_dt.v = mi.v * dvdr.v * ri.v / pirho.v * wj_dr.v;

  /* Change in entropy */
904
  entropy_dt.v = visc_term.v * dvdr.v;
Matthieu Schaller's avatar
Matthieu Schaller committed
905

906
907
908
909
910
911
  /* Store the forces back on the particles. */
  for (k = 0; k < VEC_SIZE; k++) {
    for (j = 0; j < 3; j++) {
      pi[k]->a_hydro[j] -= pia[j].f[k];
      pj[k]->a_hydro[j] += pja[j].f[k];
    }
912
913
    pi[k]->force.h_dt -= pih_dt.f[k];
    pj[k]->force.h_dt -= pjh_dt.f[k];
914
915
    pi[k]->force.v_sig = max(pi[k]->force.v_sig, v_sig.f[k]);
    pj[k]->force.v_sig = max(pj[k]->force.v_sig, v_sig.f[k]);
916
    pi[k]->entropy_dt += entropy_dt.f[k] * mj.f[k];
917
    pj[k]->entropy_dt += entropy_dt.f[k] * mi.f[k];
918
919
  }

Matthieu Schaller's avatar
Matthieu Schaller committed
920
#else
921

Matthieu Schaller's avatar
Matthieu Schaller committed
922
923
  error(
      "The Gadget2 serial version of runner_iact_nonsym_force was called when "
924
      "the vectorised version should have been used.");
925
926

#endif
927
928
}

929
930
931
/**
 * @brief Force loop (non-symmetric version)
 */
932
933
934
__attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {

935
936
937
  float wi, wj, wi_dx, wj_dx;

  const float fac_mu = 1.f; /* Will change with cosmological integration */
938

939
940
941
942
  const float r = sqrtf(r2);
  const float r_inv = 1.0f / r;

  /* Get some values in local variables. */
943
  // const float mi = pi->mass;
944
945
946
947
948
949
  const float mj = pj->mass;
  const float rhoi = pi->rho;
  const float rhoj = pj->rho;

  /* Get the kernel for hi. */
  const float hi_inv = 1.0f / hi;
950
  const float hid_inv = pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */
951
952
  const float ui = r * hi_inv;
  kernel_deval(ui, &wi, &wi_dx);
953
  const float wi_dr = hid_inv * wi_dx;
954
955
956

  /* Get the kernel for hj. */
  const float hj_inv = 1.0f / hj;
957
  const float hjd_inv = pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */
958
959
  const float xj = r * hj_inv;
  kernel_deval(xj, &wj, &wj_dx);
960
  const float wj_dr = hjd_inv * wj_dx;
961

962
963
964
965
966
  /* Compute h-gradient terms */
  const float f_i = pi->force.f;
  const float f_j = pj->force.f;

  /* Compute pressure terms */
967
968
  const float P_over_rho2_i = pi->force.P_over_rho2;
  const float P_over_rho2_j = pj->force.P_over_rho2;
969
970

  /* Compute sound speeds */
971
972
  const float ci = pi->force.soundspeed;
  const float cj = pj->force.soundspeed;
973

974
  /* Compute dv dot r. */
975
976
977
  const float dvdr = (pi->v[0] - pj->v[0]) * dx[0] +
                     (pi->v[1] - pj->v[1]) * dx[1] +
                     (pi->v[2] - pj->v[2]) * dx[2];
978

979
  /* Balsara term */
980
981
  const float balsara_i = pi->force.balsara;
  const float balsara_j = pj->force.balsara;
982
983

  /* Are the particles moving towards each others ? */
984
  const float omega_ij = (dvdr < 0.f) ? dvdr : 0.f;
985
986
987
988
989
990
991
992
993
  const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */

  /* Signal velocity */
  const float v_sig = ci + cj - 3.f * mu_ij;

  /* Now construct the full viscosity term */
  const float rho_ij = 0.5f * (rhoi + rhoj);
  const float visc = -0.25f * const_viscosity_alpha * v_sig * mu_ij *
                     (balsara_i + balsara_j) / rho_ij;
994
995

  /* Now, convolve with the kernel */
996
  const float visc_term = 0.5f * visc * (wi_dr + wj_dr) * r_inv;
997
  const float sph_term =
998
      (f_i * P_over_rho2_i * wi_dr + f_j * P_over_rho2_j * wj_dr) * r_inv;
999
1000
1001

  /* Eventually got the acceleration */
  const float acc = visc_term + sph_term;
1002

1003
  /* Use the force Luke ! */
1004
1005
1006
  pi->a_hydro[0] -= mj * acc * dx[0];
  pi->a_hydro[1] -= mj * acc * dx[1];
  pi->a_hydro[2] -= mj * acc * dx[2];
1007

1008
  /* Get the time derivative for h. */
1009
  pi->force.h_dt -= mj * dvdr * r_inv / rhoj * wi_dr;
1010

1011
  /* Update the signal velocity. */
1012
  pi->force.v_sig = (pi->force.v_sig > v_sig) ? pi->force.v_sig : v_sig;
1013

1014
  /* Change in entropy */
1015
  pi->entropy_dt += mj * visc_term * dvdr;
1016
}
1017

1018
1019
1020
1021
1022
1023
/**
 * @brief Force loop (Vectorized non-symmetric version)
 */
__attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
    float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
    struct part **pj) {
1024

Matthieu Schaller's avatar
Matthieu Schaller committed
1025
#ifdef WITH_VECTORIZATION
1026