hydro_iact.h 39.9 KB
Newer Older
1
2
/*******************************************************************************
 * This file is part of SWIFT.
3
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
4
 *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
5
 *
6
7
8
9
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
10
 *
11
12
13
14
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
15
 *
16
17
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18
 *
19
 ******************************************************************************/
20
21
#ifndef SWIFT_GADGET2_HYDRO_IACT_H
#define SWIFT_GADGET2_HYDRO_IACT_H
22
23

/**
24
 * @file Gadget2/hydro_iact.h
25
26
 * @brief SPH interaction functions following the Gadget-2 version of SPH.
 *
27
 * The interactions computed here are the ones presented in the Gadget-2 paper
28
29
 * Springel, V., MNRAS, Volume 364, Issue 4, pp. 1105-1134.
 * We use the same numerical coefficients as the Gadget-2 code. When used with
30
31
32
 * the Spline-3 kernel, the results should be equivalent to the ones obtained
 * with Gadget-2 up to the rounding errors and interactions missed by the
 * Gadget-2 tree-code neighbours search.
33
34
 */

35
36
#include "minmax.h"

37
38
39
/**
 * @brief Density loop
 */
40
41
42
__attribute__((always_inline)) INLINE static void runner_iact_density(
    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {

43
44
  float wi, wi_dx;
  float wj, wj_dx;
45
  float dv[3], curlvr[3];
46

47
  /* Get the masses. */
48
  const float mi = pi->mass;
49
50
51
52
53
54
55
56
57
58
59
60
61
  const float mj = pj->mass;

  /* Get r and r inverse. */
  const float r = sqrtf(r2);
  const float r_inv = 1.0f / r;

  /* Compute the kernel function for pi */
  const float hi_inv = 1.f / hi;
  const float ui = r * hi_inv;
  kernel_deval(ui, &wi, &wi_dx);

  /* Compute contribution to the density */
  pi->rho += mj * wi;
62
  pi->density.rho_dh -= mj * (hydro_dimension * wi + ui * wi_dx);
63

64
65
66
67
68
69
70
71
72
73
74
  /* Compute contribution to the number of neighbours */
  pi->density.wcount += wi;
  pi->density.wcount_dh -= ui * wi_dx;

  /* Compute the kernel function for pj */
  const float hj_inv = 1.f / hj;
  const float uj = r * hj_inv;
  kernel_deval(uj, &wj, &wj_dx);

  /* Compute contribution to the density */
  pj->rho += mi * wj;
75
  pj->density.rho_dh -= mi * (hydro_dimension * wj + uj * wj_dx);
76

77
78
79
  /* Compute contribution to the number of neighbours */
  pj->density.wcount += wj;
  pj->density.wcount_dh -= uj * wj_dx;
80

81
82
  const float faci = mj * wi_dx * r_inv;
  const float facj = mi * wj_dx * r_inv;
83

84
85
86
87
  /* Compute dv dot r */
  dv[0] = pi->v[0] - pj->v[0];
  dv[1] = pi->v[1] - pj->v[1];
  dv[2] = pi->v[2] - pj->v[2];
88
89
  const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];

90
91
  pi->density.div_v -= faci * dvdr;
  pj->density.div_v -= facj * dvdr;
92
93
94
95
96
97

  /* Compute dv cross r */
  curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
  curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
  curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];

98
99
100
  pi->density.rot_v[0] += faci * curlvr[0];
  pi->density.rot_v[1] += faci * curlvr[1];
  pi->density.rot_v[2] += faci * curlvr[2];
101

102
103
104
  pj->density.rot_v[0] += facj * curlvr[0];
  pj->density.rot_v[1] += facj * curlvr[1];
  pj->density.rot_v[2] += facj * curlvr[2];
105
106
}

107
108
109
110
111
112
/**
 * @brief Density loop (Vectorized version)
 */
__attribute__((always_inline)) INLINE static void runner_iact_vec_density(
    float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
    struct part **pj) {
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137

#ifdef WITH_VECTORIZATION

  vector r, ri, r2, xi, xj, hi, hj, hi_inv, hj_inv, wi, wj, wi_dx, wj_dx;
  vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh;
  vector mi, mj;
  vector dx[3], dv[3];
  vector vi[3], vj[3];
  vector dvdr, div_vi, div_vj;
  vector curlvr[3], curl_vi[3], curl_vj[3];
  int k, j;

#if VEC_SIZE == 8
  /* Get the masses. */
  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass,
                 pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass);
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
  /* Get each velocity component. */
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
                      pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
                      pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
  }
Matthieu Schaller's avatar
Matthieu Schaller committed
138
139
  /* Get each component of particle separation.
   * (Dx={dx1,dy1,dz1,dx2,dy2,dz2,...,dxn,dyn,dzn})*/
140
141
142
143
144
145
146
147
148
149
150
151
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
#elif VEC_SIZE == 4
  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass);
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
  }
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
152
153
#else
  error("Unknown vector size.")
154
155
156
157
158
#endif

  /* Get the radius and inverse radius. */
  r2.v = vec_load(R2);
  ri.v = vec_rsqrt(r2.v);
Matthieu Schaller's avatar
Matthieu Schaller committed
159
160
  /*vec_rsqrt does not have the level of accuracy we need, so an extra term is
   * added below.*/
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
  ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f));
  r.v = r2.v * ri.v;

  hi.v = vec_load(Hi);
  hi_inv.v = vec_rcp(hi.v);
  hi_inv.v = hi_inv.v - hi_inv.v * (hi_inv.v * hi.v - vec_set1(1.0f));
  xi.v = r.v * hi_inv.v;

  hj.v = vec_load(Hj);
  hj_inv.v = vec_rcp(hj.v);
  hj_inv.v = hj_inv.v - hj_inv.v * (hj_inv.v * hj.v - vec_set1(1.0f));
  xj.v = r.v * hj_inv.v;

  /* Compute the kernel function. */
  kernel_deval_vec(&xi, &wi, &wi_dx);
  kernel_deval_vec(&xj, &wj, &wj_dx);

  /* Compute dv. */
  dv[0].v = vi[0].v - vj[0].v;
  dv[1].v = vi[1].v - vj[1].v;
  dv[2].v = vi[2].v - vj[2].v;

  /* Compute dv dot r */
  dvdr.v = (dv[0].v * dx[0].v) + (dv[1].v * dx[1].v) + (dv[2].v * dx[2].v);
  dvdr.v = dvdr.v * ri.v;

  /* Compute dv cross r */
  curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v;
  curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v;
  curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v;
  for (k = 0; k < 3; k++) curlvr[k].v *= ri.v;

  /* Compute density of pi. */
  rhoi.v = mj.v * wi.v;
195
  rhoi_dh.v = mj.v * (vec_set1(hydro_dimension) * wi.v + xi.v * wi_dx.v);
196
197
198
199
200
201
202
  wcounti.v = wi.v;
  wcounti_dh.v = xi.v * wi_dx.v;
  div_vi.v = mj.v * dvdr.v * wi_dx.v;
  for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v;

  /* Compute density of pj. */
  rhoj.v = mi.v * wj.v;
203
  rhoj_dh.v = mi.v * (vec_set1(hydro_dimension) * wj.v + xj.v * wj_dx.v);
204
205
206
207
208
209
210
211
  wcountj.v = wj.v;
  wcountj_dh.v = xj.v * wj_dx.v;
  div_vj.v = mi.v * dvdr.v * wj_dx.v;
  for (k = 0; k < 3; k++) curl_vj[k].v = mi.v * curlvr[k].v * wj_dx.v;

  /* Update particles. */
  for (k = 0; k < VEC_SIZE; k++) {
    pi[k]->rho += rhoi.f[k];
212
    pi[k]->density.rho_dh -= rhoi_dh.f[k];
213
214
    pi[k]->density.wcount += wcounti.f[k];
    pi[k]->density.wcount_dh -= wcounti_dh.f[k];
215
    pi[k]->density.div_v -= div_vi.f[k];
216
217
    for (j = 0; j < 3; j++) pi[k]->density.rot_v[j] += curl_vi[j].f[k];
    pj[k]->rho += rhoj.f[k];
218
    pj[k]->density.rho_dh -= rhoj_dh.f[k];
219
220
    pj[k]->density.wcount += wcountj.f[k];
    pj[k]->density.wcount_dh -= wcountj_dh.f[k];
221
    pj[k]->density.div_v -= div_vj.f[k];
222
223
224
225
226
    for (j = 0; j < 3; j++) pj[k]->density.rot_v[j] += curl_vj[j].f[k];
  }

#else

Matthieu Schaller's avatar
Matthieu Schaller committed
227
228
  error(
      "The Gadget2 serial version of runner_iact_density was called when the "
229
      "vectorised version should have been used.");
230
231

#endif
232
233
}

234
235
236
/**
 * @brief Density loop (non-symmetric version)
 */
237
238
239
240
241
242
243
__attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {

  float wi, wi_dx;
  float dv[3], curlvr[3];

  /* Get the masses. */
244
  const float mj = pj->mass;
245
246

  /* Get r and r inverse. */
247
248
  const float r = sqrtf(r2);
  const float ri = 1.0f / r;
249

250
  /* Compute the kernel function */
251
252
253
  const float hi_inv = 1.0f / hi;
  const float ui = r * hi_inv;
  kernel_deval(ui, &wi, &wi_dx);
254
255
256

  /* Compute contribution to the density */
  pi->rho += mj * wi;
257
  pi->density.rho_dh -= mj * (hydro_dimension * wi + ui * wi_dx);
258
259
260

  /* Compute contribution to the number of neighbours */
  pi->density.wcount += wi;
261
  pi->density.wcount_dh -= ui * wi_dx;
262

263
  const float fac = mj * wi_dx * ri;
264

265
266
267
268
269
  /* Compute dv dot r */
  dv[0] = pi->v[0] - pj->v[0];
  dv[1] = pi->v[1] - pj->v[1];
  dv[2] = pi->v[2] - pj->v[2];
  const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];
270
  pi->density.div_v -= fac * dvdr;
271

272
273
274
275
276
  /* Compute dv cross r */
  curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
  curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
  curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];

277
278
279
  pi->density.rot_v[0] += fac * curlvr[0];
  pi->density.rot_v[1] += fac * curlvr[1];
  pi->density.rot_v[2] += fac * curlvr[2];
280
281
}

282
283
284
285
286
287
/**
 * @brief Density loop (non-symmetric vectorized version)
 */
__attribute__((always_inline)) INLINE static void
runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
                               struct part **pi, struct part **pj) {
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310

#ifdef WITH_VECTORIZATION

  vector r, ri, r2, xi, hi, hi_inv, wi, wi_dx;
  vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi;
  vector mj;
  vector dx[3], dv[3];
  vector vi[3], vj[3];
  vector dvdr;
  vector curlvr[3], curl_vi[3];
  int k, j;

#if VEC_SIZE == 8
  /* Get the masses. */
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
  /* Get each velocity component. */
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
                      pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
                      pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
  }
Matthieu Schaller's avatar
Matthieu Schaller committed
311
312
  /* Get each component of particle separation.
   * (Dx={dx1,dy1,dz1,dx2,dy2,dz2,...,dxn,dyn,dzn})*/
313
314
315
316
317
318
319
320
321
322
323
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
#elif VEC_SIZE == 4
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
  }
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
324
325
#else
  error("Unknown vector size.")
326
327
328
329
330
#endif

  /* Get the radius and inverse radius. */
  r2.v = vec_load(R2);
  ri.v = vec_rsqrt(r2.v);
Matthieu Schaller's avatar
Matthieu Schaller committed
331
332
  /*vec_rsqrt does not have the level of accuracy we need, so an extra term is
   * added below.*/
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
  ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f));
  r.v = r2.v * ri.v;

  hi.v = vec_load(Hi);
  hi_inv.v = vec_rcp(hi.v);
  hi_inv.v = hi_inv.v - hi_inv.v * (hi_inv.v * hi.v - vec_set1(1.0f));
  xi.v = r.v * hi_inv.v;

  kernel_deval_vec(&xi, &wi, &wi_dx);

  /* Compute dv. */
  dv[0].v = vi[0].v - vj[0].v;
  dv[1].v = vi[1].v - vj[1].v;
  dv[2].v = vi[2].v - vj[2].v;

  /* Compute dv dot r */
  dvdr.v = (dv[0].v * dx[0].v) + (dv[1].v * dx[1].v) + (dv[2].v * dx[2].v);
  dvdr.v = dvdr.v * ri.v;

  /* Compute dv cross r */
  curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v;
  curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v;
  curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v;
  for (k = 0; k < 3; k++) curlvr[k].v *= ri.v;

  /* Compute density of pi. */
  rhoi.v = mj.v * wi.v;
360
  rhoi_dh.v = mj.v * (vec_set1(hydro_dimension) * wi.v + xi.v * wi_dx.v);
361
362
363
364
365
366
367
368
  wcounti.v = wi.v;
  wcounti_dh.v = xi.v * wi_dx.v;
  div_vi.v = mj.v * dvdr.v * wi_dx.v;
  for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v;

  /* Update particles. */
  for (k = 0; k < VEC_SIZE; k++) {
    pi[k]->rho += rhoi.f[k];
369
    pi[k]->density.rho_dh -= rhoi_dh.f[k];
370
371
    pi[k]->density.wcount += wcounti.f[k];
    pi[k]->density.wcount_dh -= wcounti_dh.f[k];
372
    pi[k]->density.div_v -= div_vi.f[k];
373
374
375
376
377
    for (j = 0; j < 3; j++) pi[k]->density.rot_v[j] += curl_vi[j].f[k];
  }

#else

Matthieu Schaller's avatar
Matthieu Schaller committed
378
379
  error(
      "The Gadget2 serial version of runner_iact_nonsym_density was called "
380
      "when the vectorised version should have been used.");
381
382

#endif
383
384
}

James Willis's avatar
James Willis committed
385
#ifdef WITH_VECTORIZATION
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
/**
 * @brief Density interaction computed using 2 interleaved vectors (non-symmetric vectorized version).
 */
__attribute__((always_inline)) INLINE static void
runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz, vector hi_inv, 
                                vector vix, vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj, vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum,vector *curlvySum, vector *curlvzSum, vector mask, vector mask2, int knlMask, int knlMask2) {

  vector r, ri, r2, xi, wi, wi_dx;
  vector mj;
  vector dx, dy, dz, dvx, dvy, dvz;
  vector vjx, vjy, vjz;
  vector dvdr;
  vector curlvrx, curlvry, curlvrz;
  vector r_2, ri2, r2_2, xi2, wi2, wi_dx2;
  vector mj2;
  vector dx2, dy2, dz2, dvx2, dvy2, dvz2;
  vector vjx2, vjy2, vjz2;
  vector dvdr2;
  vector curlvrx2, curlvry2, curlvrz2;

James Willis's avatar
James Willis committed
406
  /* Fill the vectors. */
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
  mj.v = vec_load(Mj);
  mj2.v = vec_load(&Mj[VEC_SIZE]);
  vjx.v = vec_load(Vjx);
  vjx2.v = vec_load(&Vjx[VEC_SIZE]);
  vjy.v = vec_load(Vjy);
  vjy2.v = vec_load(&Vjy[VEC_SIZE]);
  vjz.v = vec_load(Vjz);
  vjz2.v = vec_load(&Vjz[VEC_SIZE]);
  dx.v = vec_load(Dx);
  dx2.v = vec_load(&Dx[VEC_SIZE]);
  dy.v = vec_load(Dy);
  dy2.v = vec_load(&Dy[VEC_SIZE]);
  dz.v = vec_load(Dz);
  dz2.v = vec_load(&Dz[VEC_SIZE]);

  /* Get the radius and inverse radius. */
  r2.v = vec_load(R2);
  r2_2.v = vec_load(&R2[VEC_SIZE]);
  VEC_RECIPROCAL_SQRT(r2.v, ri.v);
  VEC_RECIPROCAL_SQRT(r2_2.v, ri2.v);
  r.v = vec_mul(r2.v, ri.v);
  r_2.v = vec_mul(r2_2.v, ri2.v);

  xi.v = vec_mul(r.v, hi_inv.v);
  xi2.v = vec_mul(r_2.v, hi_inv.v);

James Willis's avatar
James Willis committed
433
  /* Calculate the kernel for two particles. */
434
  kernel_deval_2_vec(&xi, &wi, &wi_dx,&xi2, &wi2, &wi_dx2);
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507

  /* Compute dv. */
  dvx.v = vec_sub(vix.v, vjx.v);
  dvx2.v = vec_sub(vix.v, vjx2.v);
  dvy.v = vec_sub(viy.v, vjy.v);
  dvy2.v = vec_sub(viy.v, vjy2.v);
  dvz.v = vec_sub(viz.v, vjz.v);
  dvz2.v = vec_sub(viz.v, vjz2.v);

  /* Compute dv dot r */
  dvdr.v = vec_fma(dvx.v, dx.v, vec_fma(dvy.v, dy.v, vec_mul(dvz.v, dz.v)));
  dvdr2.v = vec_fma(dvx2.v, dx2.v, vec_fma(dvy2.v, dy2.v, vec_mul(dvz2.v, dz2.v)));
  dvdr.v = vec_mul(dvdr.v, ri.v);
  dvdr2.v = vec_mul(dvdr2.v, ri2.v);

  /* Compute dv cross r */
  curlvrx.v = vec_fma(dvy.v, dz.v, vec_mul(vec_set1(-1.0f),vec_mul(dvz.v, dy.v)));
  curlvrx2.v = vec_fma(dvy2.v, dz2.v, vec_mul(vec_set1(-1.0f),vec_mul(dvz2.v, dy2.v)));
  curlvry.v = vec_fma(dvz.v, dx.v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz.v)));
  curlvry2.v = vec_fma(dvz2.v, dx2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvx2.v, dz2.v)));
  curlvrz.v = vec_fma(dvx.v, dy.v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx.v)));
  curlvrz2.v = vec_fma(dvx2.v, dy2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvy2.v, dx2.v)));
  curlvrx.v = vec_mul(curlvrx.v,ri.v);
  curlvrx2.v = vec_mul(curlvrx2.v,ri2.v);
  curlvry.v = vec_mul(curlvry.v,ri.v);
  curlvry2.v = vec_mul(curlvry2.v,ri2.v);
  curlvrz.v = vec_mul(curlvrz.v,ri.v);
  curlvrz2.v = vec_mul(curlvrz2.v,ri2.v);

  /* Mask updates to intermediate vector sums for particle pi. */
#ifdef HAVE_AVX512_F
  rhoSum->v = _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);
  rhoSum->v = _mm512_mask_add_ps(rhoSum->v, knlMask2, vec_mul(mj2.v, wi2.v), rhoSum->v);

  rho_dhSum->v = _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(xi.v, wi_dx.v))));
  rho_dhSum->v = _mm512_mask_sub_ps(rho_dhSum->v, knlMask2, rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(xi2.v, wi_dx2.v))));

  wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v);
  wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask2, wi2.v, wcountSum->v);

  wcount_dhSum->v = _mm512_mask_sub_ps(wcount_dhSum->v, knlMask, wcount_dhSum->v, vec_mul(xi.v, wi_dx.v));
  wcount_dhSum->v = _mm512_mask_sub_ps(wcount_dhSum->v, knlMask2, wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v));

  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));
  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask2, div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)));

  curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), curlvxSum->v);
  curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), curlvxSum->v);

  curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), curlvySum->v);
  curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), curlvySum->v);

  curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), curlvzSum->v);
  curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask2, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), curlvzSum->v);
#else
  rhoSum->v += vec_and(vec_mul(mj.v, wi.v),mask.v);
  rhoSum->v += vec_and(vec_mul(mj2.v, wi2.v),mask2.v);
  rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(xi.v, wi_dx.v))),mask.v);
  rho_dhSum->v -= vec_and(vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(xi2.v, wi_dx2.v))),mask2.v);
  wcountSum->v += vec_and(wi.v,mask.v);
  wcountSum->v += vec_and(wi2.v,mask2.v);
  wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v),mask.v);
  wcount_dhSum->v -= vec_and(vec_mul(xi2.v, wi_dx2.v),mask2.v);
  div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)),mask.v);
  div_vSum->v -= vec_and(vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)),mask2.v);
  curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),mask.v);
  curlvxSum->v += vec_and(vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)),mask2.v);
  curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),mask.v);
  curlvySum->v += vec_and(vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)),mask2.v);
  curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),mask.v);
  curlvzSum->v += vec_and(vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)),mask2.v);
#endif
}
James Willis's avatar
James Willis committed
508
#endif
509

510
511
512
/**
 * @brief Force loop
 */
513
514
515
__attribute__((always_inline)) INLINE static void runner_iact_force(
    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {

516
517
518
  float wi, wj, wi_dx, wj_dx;

  const float fac_mu = 1.f; /* Will change with cosmological integration */
519

520
521
522
523
  const float r = sqrtf(r2);
  const float r_inv = 1.0f / r;

  /* Get some values in local variables. */
524
  const float mi = pi->mass;
525
526
527
528
529
530
  const float mj = pj->mass;
  const float rhoi = pi->rho;
  const float rhoj = pj->rho;

  /* Get the kernel for hi. */
  const float hi_inv = 1.0f / hi;
531
  const float hid_inv = pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */
532
533
  const float ui = r * hi_inv;
  kernel_deval(ui, &wi, &wi_dx);
534
  const float wi_dr = hid_inv * wi_dx;
535
536
537

  /* Get the kernel for hj. */
  const float hj_inv = 1.0f / hj;
538
  const float hjd_inv = pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */
539
540
  const float xj = r * hj_inv;
  kernel_deval(xj, &wj, &wj_dx);
541
  const float wj_dr = hjd_inv * wj_dx;
542

543
544
545
546
547
  /* Compute h-gradient terms */
  const float f_i = pi->force.f;
  const float f_j = pj->force.f;

  /* Compute pressure terms */
548
549
  const float P_over_rho2_i = pi->force.P_over_rho2;
  const float P_over_rho2_j = pj->force.P_over_rho2;
550
551

  /* Compute sound speeds */
552
553
  const float ci = pi->force.soundspeed;
  const float cj = pj->force.soundspeed;
554

555
  /* Compute dv dot r. */
556
557
558
  const float dvdr = (pi->v[0] - pj->v[0]) * dx[0] +
                     (pi->v[1] - pj->v[1]) * dx[1] +
                     (pi->v[2] - pj->v[2]) * dx[2];
559

560
  /* Balsara term */
561
562
  const float balsara_i = pi->force.balsara;
  const float balsara_j = pj->force.balsara;
Matthieu Schaller's avatar
Matthieu Schaller committed
563

564
  /* Are the particles moving towards each others ? */
565
  const float omega_ij = (dvdr < 0.f) ? dvdr : 0.f;
566
567
568
569
570
571
572
573
574
  const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */

  /* Signal velocity */
  const float v_sig = ci + cj - 3.f * mu_ij;

  /* Now construct the full viscosity term */
  const float rho_ij = 0.5f * (rhoi + rhoj);
  const float visc = -0.25f * const_viscosity_alpha * v_sig * mu_ij *
                     (balsara_i + balsara_j) / rho_ij;
575
576

  /* Now, convolve with the kernel */
577
  const float visc_term = 0.5f * visc * (wi_dr + wj_dr) * r_inv;
578
  const float sph_term =
579
      (f_i * P_over_rho2_i * wi_dr + f_j * P_over_rho2_j * wj_dr) * r_inv;
580
581
582
583
584

  /* Eventually got the acceleration */
  const float acc = visc_term + sph_term;

  /* Use the force Luke ! */
585
586
587
  pi->a_hydro[0] -= mj * acc * dx[0];
  pi->a_hydro[1] -= mj * acc * dx[1];
  pi->a_hydro[2] -= mj * acc * dx[2];
588

589
590
591
  pj->a_hydro[0] += mi * acc * dx[0];
  pj->a_hydro[1] += mi * acc * dx[1];
  pj->a_hydro[2] += mi * acc * dx[2];
592

593
  /* Get the time derivative for h. */
594
595
  pi->force.h_dt -= mj * dvdr * r_inv / rhoj * wi_dr;
  pj->force.h_dt -= mi * dvdr * r_inv / rhoi * wj_dr;
596

597
  /* Update the signal velocity. */
598
599
  pi->force.v_sig = (pi->force.v_sig > v_sig) ? pi->force.v_sig : v_sig;
  pj->force.v_sig = (pj->force.v_sig > v_sig) ? pj->force.v_sig : v_sig;
600

601
  /* Change in entropy */
602
603
  pi->entropy_dt += mj * visc_term * dvdr;
  pj->entropy_dt += mi * visc_term * dvdr;
604
}
605

606
607
608
609
610
611
/**
 * @brief Force loop (Vectorized version)
 */
__attribute__((always_inline)) INLINE static void runner_iact_vec_force(
    float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
    struct part **pj) {
612
613
614
615
616
617

#ifdef WITH_VECTORIZATION

  vector r, r2, ri;
  vector xi, xj;
  vector hi, hj, hi_inv, hj_inv;
618
  vector hid_inv, hjd_inv;
619
  vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
620
  vector piPOrho2, pjPOrho2, pirho, pjrho;
621
622
  vector mi, mj;
  vector f;
623
  vector grad_hi, grad_hj;
624
625
626
627
628
629
630
631
632
633
634
  vector dx[3];
  vector vi[3], vj[3];
  vector pia[3], pja[3];
  vector pih_dt, pjh_dt;
  vector ci, cj, v_sig;
  vector omega_ij, mu_ij, fac_mu, balsara;
  vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt;
  int j, k;

  fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */

Matthieu Schaller's avatar
Matthieu Schaller committed
635
/* Load stuff. */
636
637
638
639
640
#if VEC_SIZE == 8
  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass,
                 pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass);
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
641
  piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
Matthieu Schaller's avatar
Matthieu Schaller committed
642
643
644
                      pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2,
                      pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2,
                      pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2);
645
  pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
Matthieu Schaller's avatar
Matthieu Schaller committed
646
647
648
                      pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2,
                      pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2,
                      pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2);
649
650
651
652
  grad_hi.v = vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f, 
                      pi[4]->force.f, pi[5]->force.f, pi[6]->force.f, pi[7]->force.f);
  grad_hj.v = vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f, 
                      pj[4]->force.f, pj[5]->force.f, pj[6]->force.f, pj[7]->force.f);
653
654
655
656
  pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho,
                    pi[5]->rho, pi[6]->rho, pi[7]->rho);
  pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho,
                    pj[5]->rho, pj[6]->rho, pj[7]->rho);
Matthieu Schaller's avatar
Matthieu Schaller committed
657
658
659
660
661
662
663
664
  ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed,
                 pi[2]->force.soundspeed, pi[3]->force.soundspeed,
                 pi[4]->force.soundspeed, pi[5]->force.soundspeed,
                 pi[6]->force.soundspeed, pi[7]->force.soundspeed);
  cj.v = vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed,
                 pj[2]->force.soundspeed, pj[3]->force.soundspeed,
                 pj[4]->force.soundspeed, pj[5]->force.soundspeed,
                 pj[6]->force.soundspeed, pj[7]->force.soundspeed);
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
                      pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
                      pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
  }
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
  balsara.v =
      vec_set(pi[0]->force.balsara, pi[1]->force.balsara, pi[2]->force.balsara,
              pi[3]->force.balsara, pi[4]->force.balsara, pi[5]->force.balsara,
              pi[6]->force.balsara, pi[7]->force.balsara) +
      vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara,
              pj[3]->force.balsara, pj[4]->force.balsara, pj[5]->force.balsara,
              pj[6]->force.balsara, pj[7]->force.balsara);
#elif VEC_SIZE == 4
682
  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass);
683
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
684
  piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
Matthieu Schaller's avatar
Matthieu Schaller committed
685
                      pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2);
686
  pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
Matthieu Schaller's avatar
Matthieu Schaller committed
687
                      pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2);
688
689
  grad_hi.v = vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f); 
  grad_hj.v = vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f); 
690
691
  pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho);
  pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho);
Matthieu Schaller's avatar
Matthieu Schaller committed
692
693
694
695
  ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed,
                 pi[2]->force.soundspeed, pi[3]->force.soundspeed);
  cj.v = vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed,
                 pj[2]->force.soundspeed, pj[3]->force.soundspeed);
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
  }
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
  balsara.v = vec_set(pi[0]->force.balsara, pi[1]->force.balsara,
                      pi[2]->force.balsara, pi[3]->force.balsara) +
              vec_set(pj[0]->force.balsara, pj[1]->force.balsara,
                      pj[2]->force.balsara, pj[3]->force.balsara);
#else
  error("Unknown vector size.")
#endif

  /* Get the radius and inverse radius. */
  r2.v = vec_load(R2);
712
  VEC_RECIPROCAL_SQRT(r2.v, ri.v);
713
714
715
716
  r.v = r2.v * ri.v;

  /* Get the kernel for hi. */
  hi.v = vec_load(Hi);
717
  VEC_RECIPROCAL(hi.v, hi_inv.v);
718
  hid_inv = pow_dimension_plus_one_vec(hi_inv); /* 1/h^(d+1) */
719
720
  xi.v = r.v * hi_inv.v;
  kernel_deval_vec(&xi, &wi, &wi_dx);
721
  wi_dr.v = hid_inv.v * wi_dx.v;
722
723
724

  /* Get the kernel for hj. */
  hj.v = vec_load(Hj);
725
  VEC_RECIPROCAL(hj.v, hj_inv.v);
726
  hjd_inv = pow_dimension_plus_one_vec(hj_inv); /* 1/h^(d+1) */
727
728
  xj.v = r.v * hj_inv.v;
  kernel_deval_vec(&xj, &wj, &wj_dx);
729
  wj_dr.v = hjd_inv.v * wj_dx.v;
730
731
732
733

  /* Compute dv dot r. */
  dvdr.v = ((vi[0].v - vj[0].v) * dx[0].v) + ((vi[1].v - vj[1].v) * dx[1].v) +
           ((vi[2].v - vj[2].v) * dx[2].v);
Matthieu Schaller's avatar
Matthieu Schaller committed
734
  // dvdr.v = dvdr.v * ri.v;
735
736
737
738
739

  /* Compute the relative velocity. (This is 0 if the particles move away from
   * each other and negative otherwise) */
  omega_ij.v = vec_fmin(dvdr.v, vec_set1(0.0f));
  mu_ij.v = fac_mu.v * ri.v * omega_ij.v; /* This is 0 or negative */
Matthieu Schaller's avatar
Matthieu Schaller committed
740

741
742
  /* Compute signal velocity */
  v_sig.v = ci.v + cj.v - vec_set1(3.0f) * mu_ij.v;
Matthieu Schaller's avatar
Matthieu Schaller committed
743

744
745
  /* Now construct the full viscosity term */
  rho_ij.v = vec_set1(0.5f) * (pirho.v + pjrho.v);
Matthieu Schaller's avatar
Matthieu Schaller committed
746
747
  visc.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig.v *
           mu_ij.v * balsara.v / rho_ij.v;
748
749
750

  /* Now, convolve with the kernel */
  visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v;
751
  sph_term.v = (grad_hi.v * piPOrho2.v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) * ri.v;
752
753
754

  /* Eventually get the acceleration */
  acc.v = visc_term.v + sph_term.v;
Matthieu Schaller's avatar
Matthieu Schaller committed
755

756
757
758
759
760
761
762
763
764
765
766
767
  /* Use the force, Luke! */
  for (k = 0; k < 3; k++) {
    f.v = dx[k].v * acc.v;
    pia[k].v = mj.v * f.v;
    pja[k].v = mi.v * f.v;
  }

  /* Get the time derivative for h. */
  pih_dt.v = mj.v * dvdr.v * ri.v / pjrho.v * wi_dr.v;
  pjh_dt.v = mi.v * dvdr.v * ri.v / pirho.v * wj_dr.v;

  /* Change in entropy */
768
  entropy_dt.v = visc_term.v * dvdr.v;
Matthieu Schaller's avatar
Matthieu Schaller committed
769

770
771
772
773
774
775
  /* Store the forces back on the particles. */
  for (k = 0; k < VEC_SIZE; k++) {
    for (j = 0; j < 3; j++) {
      pi[k]->a_hydro[j] -= pia[j].f[k];
      pj[k]->a_hydro[j] += pja[j].f[k];
    }
776
777
    pi[k]->force.h_dt -= pih_dt.f[k];
    pj[k]->force.h_dt -= pjh_dt.f[k];
778
779
    pi[k]->force.v_sig = max(pi[k]->force.v_sig, v_sig.f[k]);
    pj[k]->force.v_sig = max(pj[k]->force.v_sig, v_sig.f[k]);
780
    pi[k]->entropy_dt += entropy_dt.f[k] * mj.f[k];
781
    pj[k]->entropy_dt += entropy_dt.f[k] * mi.f[k];
782
783
  }

Matthieu Schaller's avatar
Matthieu Schaller committed
784
#else
785

Matthieu Schaller's avatar
Matthieu Schaller committed
786
787
  error(
      "The Gadget2 serial version of runner_iact_nonsym_force was called when "
788
      "the vectorised version should have been used.");
789
790

#endif
791
792
}

793
794
795
/**
 * @brief Force loop (non-symmetric version)
 */
796
797
798
__attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {

799
800
801
  float wi, wj, wi_dx, wj_dx;

  const float fac_mu = 1.f; /* Will change with cosmological integration */
802

803
804
805
806
  const float r = sqrtf(r2);
  const float r_inv = 1.0f / r;

  /* Get some values in local variables. */
807
  // const float mi = pi->mass;
808
809
810
811
812
813
  const float mj = pj->mass;
  const float rhoi = pi->rho;
  const float rhoj = pj->rho;

  /* Get the kernel for hi. */
  const float hi_inv = 1.0f / hi;
814
  const float hid_inv = pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */
815
816
  const float ui = r * hi_inv;
  kernel_deval(ui, &wi, &wi_dx);
817
  const float wi_dr = hid_inv * wi_dx;
818
819
820

  /* Get the kernel for hj. */
  const float hj_inv = 1.0f / hj;
821
  const float hjd_inv = pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */
822
823
  const float xj = r * hj_inv;
  kernel_deval(xj, &wj, &wj_dx);
824
  const float wj_dr = hjd_inv * wj_dx;
825

826
827
828
829
830
  /* Compute h-gradient terms */
  const float f_i = pi->force.f;
  const float f_j = pj->force.f;

  /* Compute pressure terms */
831
832
  const float P_over_rho2_i = pi->force.P_over_rho2;
  const float P_over_rho2_j = pj->force.P_over_rho2;
833
834

  /* Compute sound speeds */
835
836
  const float ci = pi->force.soundspeed;
  const float cj = pj->force.soundspeed;
837

838
  /* Compute dv dot r. */
839
840
841
  const float dvdr = (pi->v[0] - pj->v[0]) * dx[0] +
                     (pi->v[1] - pj->v[1]) * dx[1] +
                     (pi->v[2] - pj->v[2]) * dx[2];
842

843
  /* Balsara term */
844
845
  const float balsara_i = pi->force.balsara;
  const float balsara_j = pj->force.balsara;
846
847

  /* Are the particles moving towards each others ? */
848
  const float omega_ij = (dvdr < 0.f) ? dvdr : 0.f;
849
850
851
852
853
854
855
856
857
  const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */

  /* Signal velocity */
  const float v_sig = ci + cj - 3.f * mu_ij;

  /* Now construct the full viscosity term */
  const float rho_ij = 0.5f * (rhoi + rhoj);
  const float visc = -0.25f * const_viscosity_alpha * v_sig * mu_ij *
                     (balsara_i + balsara_j) / rho_ij;
858
859

  /* Now, convolve with the kernel */
860
  const float visc_term = 0.5f * visc * (wi_dr + wj_dr) * r_inv;
861
  const float sph_term =
862
      (f_i * P_over_rho2_i * wi_dr + f_j * P_over_rho2_j * wj_dr) * r_inv;
863
864
865

  /* Eventually got the acceleration */
  const float acc = visc_term + sph_term;
866

867
  /* Use the force Luke ! */
868
869
870
  pi->a_hydro[0] -= mj * acc * dx[0];
  pi->a_hydro[1] -= mj * acc * dx[1];
  pi->a_hydro[2] -= mj * acc * dx[2];
871

872
  /* Get the time derivative for h. */
873
  pi->force.h_dt -= mj * dvdr * r_inv / rhoj * wi_dr;
874

875
  /* Update the signal velocity. */
876
  pi->force.v_sig = (pi->force.v_sig > v_sig) ? pi->force.v_sig : v_sig;
877

878
  /* Change in entropy */
879
  pi->entropy_dt += mj * visc_term * dvdr;
880
}
881

882
883
884
885
886
887
/**
 * @brief Force loop (Vectorized non-symmetric version)
 */
__attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
    float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
    struct part **pj) {
888

Matthieu Schaller's avatar
Matthieu Schaller committed
889
#ifdef WITH_VECTORIZATION
890
891
892
893

  vector r, r2, ri;
  vector xi, xj;
  vector hi, hj, hi_inv, hj_inv;
894
  vector hid_inv, hjd_inv;
895
  vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
896
  vector piPOrho2, pjPOrho2, pirho, pjrho;
897
898
  vector mj;
  vector f;
899
  vector grad_hi, grad_hj;
900
901
902
903
  vector dx[3];
  vector vi[3], vj[3];
  vector pia[3];
  vector pih_dt;
904
905
  vector ci, cj, v_sig;
  vector omega_ij, mu_ij, fac_mu, balsara;
906
907
908
909
910
  vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt;
  int j, k;

  fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */

Matthieu Schaller's avatar
Matthieu Schaller committed
911
/* Load stuff. */
912
913
914
#if VEC_SIZE == 8
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
915
  piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
Matthieu Schaller's avatar
Matthieu Schaller committed
916
917
918
                      pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2,
                      pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2,
                      pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2);
919
  pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
Matthieu Schaller's avatar
Matthieu Schaller committed
920
921
922
                      pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2,
                      pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2,
                      pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2);
923
924
925
926
  grad_hi.v = vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f, 
                      pi[4]->force.f, pi[5]->force.f, pi[6]->force.f, pi[7]->force.f);
  grad_hj.v = vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f, 
                      pj[4]->force.f, pj[5]->force.f, pj[6]->force.f, pj[7]->force.f);
927
928
929
930
  pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho,
                    pi[5]->rho, pi[6]->rho, pi[7]->rho);
  pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho,
                    pj[5]->rho, pj[6]->rho, pj[7]->rho);
Matthieu Schaller's avatar
Matthieu Schaller committed
931
932
933
934
935
936
937
938
  ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed,
                 pi[2]->force.soundspeed, pi[3]->force.soundspeed,
                 pi[4]->force.soundspeed, pi[5]->force.soundspeed,
                 pi[6]->force.soundspeed, pi[7]->force.soundspeed);
  cj.v = vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed,
                 pj[2]->force.soundspeed, pj[3]->force.soundspeed,
                 pj[4]->force.soundspeed, pj[5]->force.soundspeed,
                 pj[6]->force.soundspeed, pj[7]->force.soundspeed);
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
                      pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
                      pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
  }
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
  balsara.v =
      vec_set(pi[0]->force.balsara, pi[1]->force.balsara, pi[2]->force.balsara,
              pi[3]->force.balsara, pi[4]->force.balsara, pi[5]->force.balsara,
              pi[6]->force.balsara, pi[7]->force.balsara) +
      vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara,
              pj[3]->force.balsara, pj[4]->force.balsara, pj[5]->force.balsara,
              pj[6]->force.balsara, pj[7]->force.balsara);
#elif VEC_SIZE == 4
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
957
  piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
Matthieu Schaller's avatar
Matthieu Schaller committed
958
                      pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2);
959
  pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
Matthieu Schaller's avatar
Matthieu Schaller committed
960
                      pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2);
961
962
  grad_hi.v = vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f); 
  grad_hj.v = vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f); 
963
964
  pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho);
  pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho);
Matthieu Schaller's avatar
Matthieu Schaller committed
965
966
967
968
  ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed,
                 pi[2]->force.soundspeed, pi[3]->force.soundspeed);
  cj.v = vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed,
                 pj[2]->force.soundspeed, pj[3]->force.soundspeed);
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
  }
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
  balsara.v = vec_set(pi[0]->force.balsara, pi[1]->force.balsara,
                      pi[2]->force.balsara, pi[3]->force.balsara) +
              vec_set(pj[0]->force.balsara, pj[1]->force.balsara,
                      pj[2]->force.balsara, pj[3]->force.balsara);
#else
  error("Unknown vector size.")
#endif

  /* Get the radius and inverse radius. */
  r2.v = vec_load(R2);
985
  VEC_RECIPROCAL_SQRT(r2.v, ri.v);
986
987
988
989
<