hydro_iact.h 42 KB
Newer Older
1
2
/*******************************************************************************
 * This file is part of SWIFT.
3
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
4
 *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
5
 *
6
7
8
9
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
10
 *
11
12
13
14
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
15
 *
16
17
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18
 *
19
 ******************************************************************************/
20
21
#ifndef SWIFT_GADGET2_HYDRO_IACT_H
#define SWIFT_GADGET2_HYDRO_IACT_H
22
23

/**
24
 * @file Gadget2/hydro_iact.h
25
26
 * @brief SPH interaction functions following the Gadget-2 version of SPH.
 *
27
 * The interactions computed here are the ones presented in the Gadget-2 paper
28
29
 * Springel, V., MNRAS, Volume 364, Issue 4, pp. 1105-1134.
 * We use the same numerical coefficients as the Gadget-2 code. When used with
30
31
32
 * the Spline-3 kernel, the results should be equivalent to the ones obtained
 * with Gadget-2 up to the rounding errors and interactions missed by the
 * Gadget-2 tree-code neighbours search.
33
34
 */

35
#include "cache.h"
James Willis's avatar
James Willis committed
36
#include "minmax.h"
37

38
39
40
/**
 * @brief Density loop
 */
41
42
43
__attribute__((always_inline)) INLINE static void runner_iact_density(
    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {

44
45
  float wi, wi_dx;
  float wj, wj_dx;
46
  float dv[3], curlvr[3];
47

48
  /* Get the masses. */
49
  const float mi = pi->mass;
50
51
52
53
54
55
56
57
58
59
60
61
62
  const float mj = pj->mass;

  /* Get r and r inverse. */
  const float r = sqrtf(r2);
  const float r_inv = 1.0f / r;

  /* Compute the kernel function for pi */
  const float hi_inv = 1.f / hi;
  const float ui = r * hi_inv;
  kernel_deval(ui, &wi, &wi_dx);

  /* Compute contribution to the density */
  pi->rho += mj * wi;
63
  pi->density.rho_dh -= mj * (hydro_dimension * wi + ui * wi_dx);
64

65
66
  /* Compute contribution to the number of neighbours */
  pi->density.wcount += wi;
67
  pi->density.wcount_dh -= (hydro_dimension * wi + ui * wi_dx);
68
69
70
71
72
73
74
75

  /* Compute the kernel function for pj */
  const float hj_inv = 1.f / hj;
  const float uj = r * hj_inv;
  kernel_deval(uj, &wj, &wj_dx);

  /* Compute contribution to the density */
  pj->rho += mi * wj;
76
  pj->density.rho_dh -= mi * (hydro_dimension * wj + uj * wj_dx);
77

78
79
  /* Compute contribution to the number of neighbours */
  pj->density.wcount += wj;
80
  pj->density.wcount_dh -= (hydro_dimension * wj + uj * wj_dx);
81

82
83
  const float faci = mj * wi_dx * r_inv;
  const float facj = mi * wj_dx * r_inv;
84

85
86
87
88
  /* Compute dv dot r */
  dv[0] = pi->v[0] - pj->v[0];
  dv[1] = pi->v[1] - pj->v[1];
  dv[2] = pi->v[2] - pj->v[2];
89
90
  const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];

91
92
  pi->density.div_v -= faci * dvdr;
  pj->density.div_v -= facj * dvdr;
93
94
95
96
97
98

  /* Compute dv cross r */
  curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
  curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
  curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];

99
100
101
  pi->density.rot_v[0] += faci * curlvr[0];
  pi->density.rot_v[1] += faci * curlvr[1];
  pi->density.rot_v[2] += faci * curlvr[2];
102

103
104
105
  pj->density.rot_v[0] += facj * curlvr[0];
  pj->density.rot_v[1] += facj * curlvr[1];
  pj->density.rot_v[2] += facj * curlvr[2];
106
107
}

108
109
110
111
112
113
/**
 * @brief Density loop (Vectorized version)
 */
__attribute__((always_inline)) INLINE static void runner_iact_vec_density(
    float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
    struct part **pj) {
114

115
#ifdef WITH_OLD_VECTORIZATION
116

117
  vector r, ri, r2, ui, uj, hi, hj, hi_inv, hj_inv, wi, wj, wi_dx, wj_dx;
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
  vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh;
  vector mi, mj;
  vector dx[3], dv[3];
  vector vi[3], vj[3];
  vector dvdr, div_vi, div_vj;
  vector curlvr[3], curl_vi[3], curl_vj[3];
  int k, j;

#if VEC_SIZE == 8
  /* Get the masses. */
  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass,
                 pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass);
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
  /* Get each velocity component. */
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
                      pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
                      pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
  }
Matthieu Schaller's avatar
Matthieu Schaller committed
139
140
  /* Get each component of particle separation.
   * (Dx={dx1,dy1,dz1,dx2,dy2,dz2,...,dxn,dyn,dzn})*/
141
142
143
144
145
146
147
148
149
150
151
152
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
#elif VEC_SIZE == 4
  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass);
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
  }
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
153
#else
154
  error("Unknown vector size.");
155
156
157
158
#endif

  /* Get the radius and inverse radius. */
  r2.v = vec_load(R2);
159
  ri = vec_reciprocal_sqrt(r2);
160
161
162
  r.v = r2.v * ri.v;

  hi.v = vec_load(Hi);
163
  hi_inv = vec_reciprocal(hi);
164
  ui.v = r.v * hi_inv.v;
165
166

  hj.v = vec_load(Hj);
167
  hj_inv = vec_reciprocal(hj);
168
  uj.v = r.v * hj_inv.v;
169
170

  /* Compute the kernel function. */
171
172
  kernel_deval_vec(&ui, &wi, &wi_dx);
  kernel_deval_vec(&uj, &wj, &wj_dx);
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190

  /* Compute dv. */
  dv[0].v = vi[0].v - vj[0].v;
  dv[1].v = vi[1].v - vj[1].v;
  dv[2].v = vi[2].v - vj[2].v;

  /* Compute dv dot r */
  dvdr.v = (dv[0].v * dx[0].v) + (dv[1].v * dx[1].v) + (dv[2].v * dx[2].v);
  dvdr.v = dvdr.v * ri.v;

  /* Compute dv cross r */
  curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v;
  curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v;
  curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v;
  for (k = 0; k < 3; k++) curlvr[k].v *= ri.v;

  /* Compute density of pi. */
  rhoi.v = mj.v * wi.v;
191
  rhoi_dh.v = mj.v * (vec_set1(hydro_dimension) * wi.v + ui.v * wi_dx.v);
192
  wcounti.v = wi.v;
193
  wcounti_dh.v = (vec_set1(hydro_dimension) * wi.v + ui.v * wi_dx.v);
194
195
196
197
198
  div_vi.v = mj.v * dvdr.v * wi_dx.v;
  for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v;

  /* Compute density of pj. */
  rhoj.v = mi.v * wj.v;
199
  rhoj_dh.v = mi.v * (vec_set1(hydro_dimension) * wj.v + uj.v * wj_dx.v);
200
  wcountj.v = wj.v;
201
  wcountj_dh.v = (vec_set1(hydro_dimension) * wj.v + uj.v * wj_dx.v);
202
203
204
205
206
207
  div_vj.v = mi.v * dvdr.v * wj_dx.v;
  for (k = 0; k < 3; k++) curl_vj[k].v = mi.v * curlvr[k].v * wj_dx.v;

  /* Update particles. */
  for (k = 0; k < VEC_SIZE; k++) {
    pi[k]->rho += rhoi.f[k];
208
    pi[k]->density.rho_dh -= rhoi_dh.f[k];
209
210
    pi[k]->density.wcount += wcounti.f[k];
    pi[k]->density.wcount_dh -= wcounti_dh.f[k];
211
    pi[k]->density.div_v -= div_vi.f[k];
212
213
    for (j = 0; j < 3; j++) pi[k]->density.rot_v[j] += curl_vi[j].f[k];
    pj[k]->rho += rhoj.f[k];
214
    pj[k]->density.rho_dh -= rhoj_dh.f[k];
215
216
    pj[k]->density.wcount += wcountj.f[k];
    pj[k]->density.wcount_dh -= wcountj_dh.f[k];
217
    pj[k]->density.div_v -= div_vj.f[k];
218
219
220
221
222
    for (j = 0; j < 3; j++) pj[k]->density.rot_v[j] += curl_vj[j].f[k];
  }

#else

Matthieu Schaller's avatar
Matthieu Schaller committed
223
224
  error(
      "The Gadget2 serial version of runner_iact_density was called when the "
225
      "vectorised version should have been used.");
226
227

#endif
228
229
}

230
231
232
/**
 * @brief Density loop (non-symmetric version)
 */
233
234
235
236
237
238
239
__attribute__((always_inline)) INLINE static void runner_iact_nonsym_density(
    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {

  float wi, wi_dx;
  float dv[3], curlvr[3];

  /* Get the masses. */
240
  const float mj = pj->mass;
241
242

  /* Get r and r inverse. */
243
  const float r = sqrtf(r2);
244
  const float r_inv = 1.0f / r;
245

246
  /* Compute the kernel function */
247
248
249
  const float hi_inv = 1.0f / hi;
  const float ui = r * hi_inv;
  kernel_deval(ui, &wi, &wi_dx);
250
251
252

  /* Compute contribution to the density */
  pi->rho += mj * wi;
253
  pi->density.rho_dh -= mj * (hydro_dimension * wi + ui * wi_dx);
254
255
256

  /* Compute contribution to the number of neighbours */
  pi->density.wcount += wi;
257
  pi->density.wcount_dh -= (hydro_dimension * wi + ui * wi_dx);
258

259
  const float fac = mj * wi_dx * r_inv;
260

261
262
263
264
265
  /* Compute dv dot r */
  dv[0] = pi->v[0] - pj->v[0];
  dv[1] = pi->v[1] - pj->v[1];
  dv[2] = pi->v[2] - pj->v[2];
  const float dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2];
266
  pi->density.div_v -= fac * dvdr;
267

268
269
270
271
272
  /* Compute dv cross r */
  curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1];
  curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2];
  curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0];

273
274
275
  pi->density.rot_v[0] += fac * curlvr[0];
  pi->density.rot_v[1] += fac * curlvr[1];
  pi->density.rot_v[2] += fac * curlvr[2];
276
277
}

278
279
280
281
282
283
/**
 * @brief Density loop (non-symmetric vectorized version)
 */
__attribute__((always_inline)) INLINE static void
runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj,
                               struct part **pi, struct part **pj) {
284

285
#ifdef WITH_OLD_VECTORIZATION
286

287
  vector r, ri, r2, ui, hi, hi_inv, wi, wi_dx;
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
  vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi;
  vector mj;
  vector dx[3], dv[3];
  vector vi[3], vj[3];
  vector dvdr;
  vector curlvr[3], curl_vi[3];
  int k, j;

#if VEC_SIZE == 8
  /* Get the masses. */
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
  /* Get each velocity component. */
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
                      pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
                      pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
  }
Matthieu Schaller's avatar
Matthieu Schaller committed
307
308
  /* Get each component of particle separation.
   * (Dx={dx1,dy1,dz1,dx2,dy2,dz2,...,dxn,dyn,dzn})*/
309
310
311
312
313
314
315
316
317
318
319
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
#elif VEC_SIZE == 4
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
  }
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
320
#else
321
  error("Unknown vector size.");
322
323
324
325
#endif

  /* Get the radius and inverse radius. */
  r2.v = vec_load(R2);
326
  ri = vec_reciprocal_sqrt(r2);
327
328
329
  r.v = r2.v * ri.v;

  hi.v = vec_load(Hi);
330
  hi_inv = vec_reciprocal(hi);
331
  ui.v = r.v * hi_inv.v;
332

333
  kernel_deval_vec(&ui, &wi, &wi_dx);
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351

  /* Compute dv. */
  dv[0].v = vi[0].v - vj[0].v;
  dv[1].v = vi[1].v - vj[1].v;
  dv[2].v = vi[2].v - vj[2].v;

  /* Compute dv dot r */
  dvdr.v = (dv[0].v * dx[0].v) + (dv[1].v * dx[1].v) + (dv[2].v * dx[2].v);
  dvdr.v = dvdr.v * ri.v;

  /* Compute dv cross r */
  curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v;
  curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v;
  curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v;
  for (k = 0; k < 3; k++) curlvr[k].v *= ri.v;

  /* Compute density of pi. */
  rhoi.v = mj.v * wi.v;
352
  rhoi_dh.v = mj.v * (vec_set1(hydro_dimension) * wi.v + ui.v * wi_dx.v);
353
  wcounti.v = wi.v;
354
  wcounti_dh.v = (vec_set1(hydro_dimension) * wi.v + ui.v * wi_dx.v);
355
356
357
358
359
360
  div_vi.v = mj.v * dvdr.v * wi_dx.v;
  for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v;

  /* Update particles. */
  for (k = 0; k < VEC_SIZE; k++) {
    pi[k]->rho += rhoi.f[k];
361
    pi[k]->density.rho_dh -= rhoi_dh.f[k];
362
363
    pi[k]->density.wcount += wcounti.f[k];
    pi[k]->density.wcount_dh -= wcounti_dh.f[k];
364
    pi[k]->density.div_v -= div_vi.f[k];
365
366
367
368
369
    for (j = 0; j < 3; j++) pi[k]->density.rot_v[j] += curl_vi[j].f[k];
  }

#else

Matthieu Schaller's avatar
Matthieu Schaller committed
370
371
  error(
      "The Gadget2 serial version of runner_iact_nonsym_density was called "
372
      "when the vectorised version should have been used.");
373
374

#endif
375
376
}

377
#ifdef WITH_VECTORIZATION
378
379
380
381
382

/**
 * @brief Density interaction computed using 1 vector
 * (non-symmetric vectorized version).
 */
383
__attribute__((always_inline)) INLINE static void
Matthieu Schaller's avatar
Matthieu Schaller committed
384
385
386
387
388
389
390
391
runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
                                 vector hi_inv, vector vix, vector viy,
                                 vector viz, float *Vjx, float *Vjy, float *Vjz,
                                 float *Mj, vector *rhoSum, vector *rho_dhSum,
                                 vector *wcountSum, vector *wcount_dhSum,
                                 vector *div_vSum, vector *curlvxSum,
                                 vector *curlvySum, vector *curlvzSum,
                                 vector mask, int knlMask) {
392

393
  vector r, ri, ui, wi, wi_dx;
394
395
396
397
398
  vector mj;
  vector dvx, dvy, dvz;
  vector vjx, vjy, vjz;
  vector dvdr;
  vector curlvrx, curlvry, curlvrz;
James Willis's avatar
James Willis committed
399

400
  /* Fill the vectors. */
401
402
403
404
  mj.v = vec_load(Mj);
  vjx.v = vec_load(Vjx);
  vjy.v = vec_load(Vjy);
  vjz.v = vec_load(Vjz);
405
406
407
408
409

  /* Get the radius and inverse radius. */
  ri = vec_reciprocal_sqrt(*r2);
  r.v = vec_mul(r2->v, ri.v);

410
  ui.v = vec_mul(r.v, hi_inv.v);
411
412

  /* Calculate the kernel for two particles. */
413
  kernel_deval_1_vec(&ui, &wi, &wi_dx);
414
415
416
417
418
419
420
421
422
423

  /* Compute dv. */
  dvx.v = vec_sub(vix.v, vjx.v);
  dvy.v = vec_sub(viy.v, vjy.v);
  dvz.v = vec_sub(viz.v, vjz.v);

  /* Compute dv dot r */
  dvdr.v = vec_fma(dvx.v, dx->v, vec_fma(dvy.v, dy->v, vec_mul(dvz.v, dz->v)));
  dvdr.v = vec_mul(dvdr.v, ri.v);

424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
  /* Compute dv cross r */
  curlvrx.v =
      vec_fma(dvy.v, dz->v, vec_mul(vec_set1(-1.0f), vec_mul(dvz.v, dy->v)));
  curlvry.v =
      vec_fma(dvz.v, dx->v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz->v)));
  curlvrz.v =
      vec_fma(dvx.v, dy->v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx->v)));
  curlvrx.v = vec_mul(curlvrx.v, ri.v);
  curlvry.v = vec_mul(curlvry.v, ri.v);
  curlvrz.v = vec_mul(curlvrz.v, ri.v);

/* Mask updates to intermediate vector sums for particle pi. */
#ifdef HAVE_AVX512_F
  rhoSum->v =
      _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v);

  rho_dhSum->v =
      _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v,
                         vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
443
                                               vec_mul(ui.v, wi_dx.v))));
444
445
446

  wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v);

447
448
449
  wcount_dhSum->v = _mm512_mask_sub_ps(
      rho_dhSum->v, knlMask, rho_dhSum->v,
      vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)));
450
451
452
453
454
455
456

  div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v,
                                   vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)));

  curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask,
                                    vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)),
                                    curlvxSum->v);
James Willis's avatar
James Willis committed
457

458
459
460
  curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask,
                                    vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)),
                                    curlvySum->v);
James Willis's avatar
James Willis committed
461

462
463
464
  curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask,
                                    vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)),
                                    curlvzSum->v);
James Willis's avatar
James Willis committed
465
#else
466
467
  rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v);
  rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
468
                                                vec_mul(ui.v, wi_dx.v))),
James Willis's avatar
James Willis committed
469
                          mask.v);
470
  wcountSum->v += vec_and(wi.v, mask.v);
471
472
  wcount_dhSum->v -= vec_and(
      vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)), mask.v);
473
474
475
476
477
478
479
  div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v);
  curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v);
  curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v);
  curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v);
#endif
}

480
/**
James Willis's avatar
James Willis committed
481
482
 * @brief Density interaction computed using 2 interleaved vectors
 * (non-symmetric vectorized version).
483
484
 */
__attribute__((always_inline)) INLINE static void
James Willis's avatar
James Willis committed
485
486
487
488
489
490
runner_iact_nonsym_2_vec_density(
    float *R2, float *Dx, float *Dy, float *Dz, vector hi_inv, vector vix,
    vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj,
    vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
    vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
    vector mask, vector mask2, int knlMask, int knlMask2) {
491

492
  vector r, ri, r2, ui, wi, wi_dx;
493
494
495
496
497
  vector mj;
  vector dx, dy, dz, dvx, dvy, dvz;
  vector vjx, vjy, vjz;
  vector dvdr;
  vector curlvrx, curlvry, curlvrz;
498
  vector r_2, ri2, r2_2, ui2, wi2, wi_dx2;
499
500
501
502
503
504
  vector mj2;
  vector dx2, dy2, dz2, dvx2, dvy2, dvz2;
  vector vjx2, vjy2, vjz2;
  vector dvdr2;
  vector curlvrx2, curlvry2, curlvrz2;

James Willis's avatar
James Willis committed
505
  /* Fill the vectors. */
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
  mj.v = vec_load(Mj);
  mj2.v = vec_load(&Mj[VEC_SIZE]);
  vjx.v = vec_load(Vjx);
  vjx2.v = vec_load(&Vjx[VEC_SIZE]);
  vjy.v = vec_load(Vjy);
  vjy2.v = vec_load(&Vjy[VEC_SIZE]);
  vjz.v = vec_load(Vjz);
  vjz2.v = vec_load(&Vjz[VEC_SIZE]);
  dx.v = vec_load(Dx);
  dx2.v = vec_load(&Dx[VEC_SIZE]);
  dy.v = vec_load(Dy);
  dy2.v = vec_load(&Dy[VEC_SIZE]);
  dz.v = vec_load(Dz);
  dz2.v = vec_load(&Dz[VEC_SIZE]);

  /* Get the radius and inverse radius. */
  r2.v = vec_load(R2);
  r2_2.v = vec_load(&R2[VEC_SIZE]);
524
525
  ri = vec_reciprocal_sqrt(r2);
  ri2 = vec_reciprocal_sqrt(r2_2);
526
527
528
  r.v = vec_mul(r2.v, ri.v);
  r_2.v = vec_mul(r2_2.v, ri2.v);

529
530
  ui.v = vec_mul(r.v, hi_inv.v);
  ui2.v = vec_mul(r_2.v, hi_inv.v);
531

James Willis's avatar
James Willis committed
532
  /* Calculate the kernel for two particles. */
533
  kernel_deval_2_vec(&ui, &wi, &wi_dx, &ui2, &wi2, &wi_dx2);
534
535
536
537
538
539
540
541
542
543
544

  /* Compute dv. */
  dvx.v = vec_sub(vix.v, vjx.v);
  dvx2.v = vec_sub(vix.v, vjx2.v);
  dvy.v = vec_sub(viy.v, vjy.v);
  dvy2.v = vec_sub(viy.v, vjy2.v);
  dvz.v = vec_sub(viz.v, vjz.v);
  dvz2.v = vec_sub(viz.v, vjz2.v);

  /* Compute dv dot r */
  dvdr.v = vec_fma(dvx.v, dx.v, vec_fma(dvy.v, dy.v, vec_mul(dvz.v, dz.v)));
James Willis's avatar
James Willis committed
545
546
  dvdr2.v =
      vec_fma(dvx2.v, dx2.v, vec_fma(dvy2.v, dy2.v, vec_mul(dvz2.v, dz2.v)));
547
548
549
550
  dvdr.v = vec_mul(dvdr.v, ri.v);
  dvdr2.v = vec_mul(dvdr2.v, ri2.v);

  /* Compute dv cross r */
James Willis's avatar
James Willis committed
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
  curlvrx.v =
      vec_fma(dvy.v, dz.v, vec_mul(vec_set1(-1.0f), vec_mul(dvz.v, dy.v)));
  curlvrx2.v =
      vec_fma(dvy2.v, dz2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvz2.v, dy2.v)));
  curlvry.v =
      vec_fma(dvz.v, dx.v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz.v)));
  curlvry2.v =
      vec_fma(dvz2.v, dx2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvx2.v, dz2.v)));
  curlvrz.v =
      vec_fma(dvx.v, dy.v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx.v)));
  curlvrz2.v =
      vec_fma(dvx2.v, dy2.v, vec_mul(vec_set1(-1.0f), vec_mul(dvy2.v, dx2.v)));
  curlvrx.v = vec_mul(curlvrx.v, ri.v);
  curlvrx2.v = vec_mul(curlvrx2.v, ri2.v);
  curlvry.v = vec_mul(curlvry.v, ri.v);
  curlvry2.v = vec_mul(curlvry2.v, ri2.v);
  curlvrz.v = vec_mul(curlvrz.v, ri.v);
  curlvrz2.v = vec_mul(curlvrz2.v, ri2.v);

570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
  rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask);
  rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2);
  rho_dhSum->v = vec_mask_sub(
      rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v,
          vec_mul(xi.v, wi_dx.v))),
      mask);
  rho_dhSum->v = vec_mask_sub(
      rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v,
          vec_mul(xi2.v, wi_dx2.v))),
      mask2);
  wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask);
  wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2);
  wcount_dhSum->v =
    vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask);
  wcount_dhSum->v =
    vec_mask_sub(wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v), mask2);
  div_vSum->v = vec_mask_sub(div_vSum->v,
      vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask);
  div_vSum->v = vec_mask_sub(
      div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v)), mask2);
  curlvxSum->v = vec_mask_add(
      curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask);
  curlvxSum->v = vec_mask_add(
      curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v)), mask2);
  curlvySum->v = vec_mask_add(
      curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask);
  curlvySum->v = vec_mask_add(
      curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v)), mask2);
  curlvzSum->v = vec_mask_add(
      curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask);
  curlvzSum->v = vec_mask_add(
      curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v)), mask2);
602
}
James Willis's avatar
James Willis committed
603
#endif
604

605
606
607
/**
 * @brief Force loop
 */
608
609
610
__attribute__((always_inline)) INLINE static void runner_iact_force(
    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {

611
612
613
  float wi, wj, wi_dx, wj_dx;

  const float fac_mu = 1.f; /* Will change with cosmological integration */
614

615
616
617
618
  const float r = sqrtf(r2);
  const float r_inv = 1.0f / r;

  /* Get some values in local variables. */
619
  const float mi = pi->mass;
620
621
622
623
624
625
  const float mj = pj->mass;
  const float rhoi = pi->rho;
  const float rhoj = pj->rho;

  /* Get the kernel for hi. */
  const float hi_inv = 1.0f / hi;
626
  const float hid_inv = pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */
627
628
  const float ui = r * hi_inv;
  kernel_deval(ui, &wi, &wi_dx);
629
  const float wi_dr = hid_inv * wi_dx;
630
631
632

  /* Get the kernel for hj. */
  const float hj_inv = 1.0f / hj;
633
  const float hjd_inv = pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */
634
635
  const float xj = r * hj_inv;
  kernel_deval(xj, &wj, &wj_dx);
636
  const float wj_dr = hjd_inv * wj_dx;
637

638
639
640
641
642
  /* Compute h-gradient terms */
  const float f_i = pi->force.f;
  const float f_j = pj->force.f;

  /* Compute pressure terms */
643
644
  const float P_over_rho2_i = pi->force.P_over_rho2;
  const float P_over_rho2_j = pj->force.P_over_rho2;
645
646

  /* Compute sound speeds */
647
648
  const float ci = pi->force.soundspeed;
  const float cj = pj->force.soundspeed;
649

650
  /* Compute dv dot r. */
651
652
653
  const float dvdr = (pi->v[0] - pj->v[0]) * dx[0] +
                     (pi->v[1] - pj->v[1]) * dx[1] +
                     (pi->v[2] - pj->v[2]) * dx[2];
654

655
  /* Balsara term */
656
657
  const float balsara_i = pi->force.balsara;
  const float balsara_j = pj->force.balsara;
Matthieu Schaller's avatar
Matthieu Schaller committed
658

659
  /* Are the particles moving towards each others ? */
660
  const float omega_ij = (dvdr < 0.f) ? dvdr : 0.f;
661
662
663
664
665
666
667
668
669
  const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */

  /* Signal velocity */
  const float v_sig = ci + cj - 3.f * mu_ij;

  /* Now construct the full viscosity term */
  const float rho_ij = 0.5f * (rhoi + rhoj);
  const float visc = -0.25f * const_viscosity_alpha * v_sig * mu_ij *
                     (balsara_i + balsara_j) / rho_ij;
670
671

  /* Now, convolve with the kernel */
672
  const float visc_term = 0.5f * visc * (wi_dr + wj_dr) * r_inv;
673
  const float sph_term =
674
      (f_i * P_over_rho2_i * wi_dr + f_j * P_over_rho2_j * wj_dr) * r_inv;
675
676
677
678
679

  /* Eventually got the acceleration */
  const float acc = visc_term + sph_term;

  /* Use the force Luke ! */
680
681
682
  pi->a_hydro[0] -= mj * acc * dx[0];
  pi->a_hydro[1] -= mj * acc * dx[1];
  pi->a_hydro[2] -= mj * acc * dx[2];
683

684
685
686
  pj->a_hydro[0] += mi * acc * dx[0];
  pj->a_hydro[1] += mi * acc * dx[1];
  pj->a_hydro[2] += mi * acc * dx[2];
687

688
  /* Get the time derivative for h. */
689
690
  pi->force.h_dt -= mj * dvdr * r_inv / rhoj * wi_dr;
  pj->force.h_dt -= mi * dvdr * r_inv / rhoi * wj_dr;
691

692
  /* Update the signal velocity. */
693
694
  pi->force.v_sig = (pi->force.v_sig > v_sig) ? pi->force.v_sig : v_sig;
  pj->force.v_sig = (pj->force.v_sig > v_sig) ? pj->force.v_sig : v_sig;
695

696
  /* Change in entropy */
697
698
  pi->entropy_dt += mj * visc_term * dvdr;
  pj->entropy_dt += mi * visc_term * dvdr;
699
}
700

701
702
703
704
705
706
/**
 * @brief Force loop (Vectorized version)
 */
__attribute__((always_inline)) INLINE static void runner_iact_vec_force(
    float *R2, float *Dx, float *Hi, float *Hj, struct part **pi,
    struct part **pj) {
707
708
709
710
711
712

#ifdef WITH_VECTORIZATION

  vector r, r2, ri;
  vector xi, xj;
  vector hi, hj, hi_inv, hj_inv;
713
  vector hid_inv, hjd_inv;
714
  vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
715
  vector piPOrho2, pjPOrho2, pirho, pjrho;
716
717
  vector mi, mj;
  vector f;
718
  vector grad_hi, grad_hj;
719
720
721
722
723
724
725
726
727
728
729
  vector dx[3];
  vector vi[3], vj[3];
  vector pia[3], pja[3];
  vector pih_dt, pjh_dt;
  vector ci, cj, v_sig;
  vector omega_ij, mu_ij, fac_mu, balsara;
  vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt;
  int j, k;

  fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */

Matthieu Schaller's avatar
Matthieu Schaller committed
730
/* Load stuff. */
731
732
733
734
735
#if VEC_SIZE == 8
  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass,
                 pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass);
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass,
                 pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass);
736
  piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
James Willis's avatar
James Willis committed
737
738
739
                       pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2,
                       pi[4]->force.P_over_rho2, pi[5]->force.P_over_rho2,
                       pi[6]->force.P_over_rho2, pi[7]->force.P_over_rho2);
740
  pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
James Willis's avatar
James Willis committed
741
742
743
744
745
746
747
748
749
                       pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2,
                       pj[4]->force.P_over_rho2, pj[5]->force.P_over_rho2,
                       pj[6]->force.P_over_rho2, pj[7]->force.P_over_rho2);
  grad_hi.v =
      vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f,
              pi[4]->force.f, pi[5]->force.f, pi[6]->force.f, pi[7]->force.f);
  grad_hj.v =
      vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f,
              pj[4]->force.f, pj[5]->force.f, pj[6]->force.f, pj[7]->force.f);
750
751
752
753
  pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho,
                    pi[5]->rho, pi[6]->rho, pi[7]->rho);
  pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho,
                    pj[5]->rho, pj[6]->rho, pj[7]->rho);
Matthieu Schaller's avatar
Matthieu Schaller committed
754
755
756
757
758
759
760
761
  ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed,
                 pi[2]->force.soundspeed, pi[3]->force.soundspeed,
                 pi[4]->force.soundspeed, pi[5]->force.soundspeed,
                 pi[6]->force.soundspeed, pi[7]->force.soundspeed);
  cj.v = vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed,
                 pj[2]->force.soundspeed, pj[3]->force.soundspeed,
                 pj[4]->force.soundspeed, pj[5]->force.soundspeed,
                 pj[6]->force.soundspeed, pj[7]->force.soundspeed);
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k],
                      pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k],
                      pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]);
  }
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k],
                      Dx[15 + k], Dx[18 + k], Dx[21 + k]);
  balsara.v =
      vec_set(pi[0]->force.balsara, pi[1]->force.balsara, pi[2]->force.balsara,
              pi[3]->force.balsara, pi[4]->force.balsara, pi[5]->force.balsara,
              pi[6]->force.balsara, pi[7]->force.balsara) +
      vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara,
              pj[3]->force.balsara, pj[4]->force.balsara, pj[5]->force.balsara,
              pj[6]->force.balsara, pj[7]->force.balsara);
#elif VEC_SIZE == 4
779
  mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass);
780
  mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass);
781
  piPOrho2.v = vec_set(pi[0]->force.P_over_rho2, pi[1]->force.P_over_rho2,
James Willis's avatar
James Willis committed
782
                       pi[2]->force.P_over_rho2, pi[3]->force.P_over_rho2);
783
  pjPOrho2.v = vec_set(pj[0]->force.P_over_rho2, pj[1]->force.P_over_rho2,
James Willis's avatar
James Willis committed
784
785
786
787
788
                       pj[2]->force.P_over_rho2, pj[3]->force.P_over_rho2);
  grad_hi.v =
      vec_set(pi[0]->force.f, pi[1]->force.f, pi[2]->force.f, pi[3]->force.f);
  grad_hj.v =
      vec_set(pj[0]->force.f, pj[1]->force.f, pj[2]->force.f, pj[3]->force.f);
789
790
  pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho);
  pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho);
Matthieu Schaller's avatar
Matthieu Schaller committed
791
792
793
794
  ci.v = vec_set(pi[0]->force.soundspeed, pi[1]->force.soundspeed,
                 pi[2]->force.soundspeed, pi[3]->force.soundspeed);
  cj.v = vec_set(pj[0]->force.soundspeed, pj[1]->force.soundspeed,
                 pj[2]->force.soundspeed, pj[3]->force.soundspeed);
795
796
797
798
799
800
801
802
803
804
805
  for (k = 0; k < 3; k++) {
    vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]);
    vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]);
  }
  for (k = 0; k < 3; k++)
    dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]);
  balsara.v = vec_set(pi[0]->force.balsara, pi[1]->force.balsara,
                      pi[2]->force.balsara, pi[3]->force.balsara) +
              vec_set(pj[0]->force.balsara, pj[1]->force.balsara,
                      pj[2]->force.balsara, pj[3]->force.balsara);
#else
806
  error("Unknown vector size.");
807
808
809
810
#endif

  /* Get the radius and inverse radius. */
  r2.v = vec_load(R2);
811
  ri = vec_reciprocal_sqrt(r2);
812
813
814
815
  r.v = r2.v * ri.v;

  /* Get the kernel for hi. */
  hi.v = vec_load(Hi);
816
  hi_inv = vec_reciprocal(hi);
817
  hid_inv = pow_dimension_plus_one_vec(hi_inv); /* 1/h^(d+1) */
818
819
  xi.v = r.v * hi_inv.v;
  kernel_deval_vec(&xi, &wi, &wi_dx);
820
  wi_dr.v = hid_inv.v * wi_dx.v;
821
822
823

  /* Get the kernel for hj. */
  hj.v = vec_load(Hj);
824
  hj_inv = vec_reciprocal(hj);
825
  hjd_inv = pow_dimension_plus_one_vec(hj_inv); /* 1/h^(d+1) */
826
827
  xj.v = r.v * hj_inv.v;
  kernel_deval_vec(&xj, &wj, &wj_dx);
828
  wj_dr.v = hjd_inv.v * wj_dx.v;
829
830
831
832

  /* Compute dv dot r. */
  dvdr.v = ((vi[0].v - vj[0].v) * dx[0].v) + ((vi[1].v - vj[1].v) * dx[1].v) +
           ((vi[2].v - vj[2].v) * dx[2].v);
Matthieu Schaller's avatar
Matthieu Schaller committed
833
  // dvdr.v = dvdr.v * ri.v;
834
835
836
837
838

  /* Compute the relative velocity. (This is 0 if the particles move away from
   * each other and negative otherwise) */
  omega_ij.v = vec_fmin(dvdr.v, vec_set1(0.0f));
  mu_ij.v = fac_mu.v * ri.v * omega_ij.v; /* This is 0 or negative */
Matthieu Schaller's avatar
Matthieu Schaller committed
839

840
841
  /* Compute signal velocity */
  v_sig.v = ci.v + cj.v - vec_set1(3.0f) * mu_ij.v;
Matthieu Schaller's avatar
Matthieu Schaller committed
842

843
844
  /* Now construct the full viscosity term */
  rho_ij.v = vec_set1(0.5f) * (pirho.v + pjrho.v);
Matthieu Schaller's avatar
Matthieu Schaller committed
845
846
  visc.v = vec_set1(-0.25f) * vec_set1(const_viscosity_alpha) * v_sig.v *
           mu_ij.v * balsara.v / rho_ij.v;
847
848
849

  /* Now, convolve with the kernel */
  visc_term.v = vec_set1(0.5f) * visc.v * (wi_dr.v + wj_dr.v) * ri.v;
James Willis's avatar
James Willis committed
850
851
852
  sph_term.v =
      (grad_hi.v * piPOrho2.v * wi_dr.v + grad_hj.v * pjPOrho2.v * wj_dr.v) *
      ri.v;
853
854
855

  /* Eventually get the acceleration */
  acc.v = visc_term.v + sph_term.v;
Matthieu Schaller's avatar
Matthieu Schaller committed
856

857
858
859
860
861
862
863
864
865
866
867
868
  /* Use the force, Luke! */
  for (k = 0; k < 3; k++) {
    f.v = dx[k].v * acc.v;
    pia[k].v = mj.v * f.v;
    pja[k].v = mi.v * f.v;
  }

  /* Get the time derivative for h. */
  pih_dt.v = mj.v * dvdr.v * ri.v / pjrho.v * wi_dr.v;
  pjh_dt.v = mi.v * dvdr.v * ri.v / pirho.v * wj_dr.v;

  /* Change in entropy */
869
  entropy_dt.v = visc_term.v * dvdr.v;
Matthieu Schaller's avatar
Matthieu Schaller committed
870

871
872
873
874
875
876
  /* Store the forces back on the particles. */
  for (k = 0; k < VEC_SIZE; k++) {
    for (j = 0; j < 3; j++) {
      pi[k]->a_hydro[j] -= pia[j].f[k];
      pj[k]->a_hydro[j] += pja[j].f[k];
    }
877
878
    pi[k]->force.h_dt -= pih_dt.f[k];
    pj[k]->force.h_dt -= pjh_dt.f[k];
879
880
    pi[k]->force.v_sig = max(pi[k]->force.v_sig, v_sig.f[k]);
    pj[k]->force.v_sig = max(pj[k]->force.v_sig, v_sig.f[k]);
881
    pi[k]->entropy_dt += entropy_dt.f[k] * mj.f[k];
882
    pj[k]->entropy_dt += entropy_dt.f[k] * mi.f[k];
883
884
  }

Matthieu Schaller's avatar
Matthieu Schaller committed
885
#else
886

Matthieu Schaller's avatar
Matthieu Schaller committed
887
888
  error(
      "The Gadget2 serial version of runner_iact_nonsym_force was called when "
889
      "the vectorised version should have been used.");
890
891

#endif
892
893
}

894
895
896
/**
 * @brief Force loop (non-symmetric version)
 */
897
898
899
__attribute__((always_inline)) INLINE static void runner_iact_nonsym_force(
    float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) {

900
901
902
  float wi, wj, wi_dx, wj_dx;

  const float fac_mu = 1.f; /* Will change with cosmological integration */
903

904
905
906
907
  const float r = sqrtf(r2);
  const float r_inv = 1.0f / r;

  /* Get some values in local variables. */
908
  // const float mi = pi->mass;
909
910
911
912
913
914
  const float mj = pj->mass;
  const float rhoi = pi->rho;
  const float rhoj = pj->rho;

  /* Get the kernel for hi. */
  const float hi_inv = 1.0f / hi;
915
  const float hid_inv = pow_dimension_plus_one(hi_inv); /* 1/h^(d+1) */
916
917
  const float ui = r * hi_inv;
  kernel_deval(ui, &wi, &wi_dx);
918
  const float wi_dr = hid_inv * wi_dx;
919
920
921

  /* Get the kernel for hj. */
  const float hj_inv = 1.0f / hj;
922
  const float hjd_inv = pow_dimension_plus_one(hj_inv); /* 1/h^(d+1) */
923
924
  const float xj = r * hj_inv;
  kernel_deval(xj, &wj, &wj_dx);
925
  const float wj_dr = hjd_inv * wj_dx;
926

927
928
929
930
931
  /* Compute h-gradient terms */
  const float f_i = pi->force.f;
  const float f_j = pj->force.f;

  /* Compute pressure terms */
932
933
  const float P_over_rho2_i = pi->force.P_over_rho2;
  const float P_over_rho2_j = pj->force.P_over_rho2;
934
935

  /* Compute sound speeds */
936
937
  const float ci = pi->force.soundspeed;
  const float cj = pj->force.soundspeed;
938

939
  /* Compute dv dot r. */
940
941
942
  const float dvdr = (pi->v[0] - pj->v[0]) * dx[0] +
                     (pi->v[1] - pj->v[1]) * dx[1] +
                     (pi->v[2] - pj->v[2]) * dx[2];
943

944
  /* Balsara term */
945
946
  const float balsara_i = pi->force.balsara;
  const float balsara_j = pj->force.balsara;
947
948

  /* Are the particles moving towards each others ? */
949
  const float omega_ij = (dvdr < 0.f) ? dvdr : 0.f;
950
951
952
953
954
955
956
957
958
  const float mu_ij = fac_mu * r_inv * omega_ij; /* This is 0 or negative */

  /* Signal velocity */
  const float v_sig = ci + cj - 3.f * mu_ij;

  /* Now construct the full viscosity term */
  const float rho_ij = 0.5f * (rhoi + rhoj);
  const float visc = -0.25f * const_viscosity_alpha * v_sig * mu_ij *
                     (balsara_i + balsara_j) / rho_ij;
959
960

  /* Now, convolve with the kernel */
961
  const float visc_term = 0.5f * visc * (wi_dr + wj_dr) * r_inv;
962
  const float sph_term =