kernel_hydro.h 35.3 KB
Newer Older
1
2
/*******************************************************************************
 * This file is part of SWIFT.
3
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
4
 *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
5
 *
6
7
8
9
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
10
 *
11
12
13
14
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
15
 *
16
17
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18
 *
19
 ******************************************************************************/
20
21
#ifndef SWIFT_KERNEL_HYDRO_H
#define SWIFT_KERNEL_HYDRO_H
22

Matthieu Schaller's avatar
Matthieu Schaller committed
23
24
25
26
27
28
29
30
31
32
33
34
/**
 * @file kernel_hydro.h
 * @brief Kernel functions for SPH (scalar and vector version).
 *
 * All constants and kernel coefficients are taken from table 1 of
 * Dehnen & Aly, MNRAS, 425, pp. 1062-1082 (2012).
 */

/* Config parameters. */
#include "../config.h"

/* Some standard headers. */
35
36
#include <math.h>

Matthieu Schaller's avatar
Matthieu Schaller committed
37
/* Local headers. */
38
#include "dimension.h"
Matthieu Schaller's avatar
Matthieu Schaller committed
39
#include "error.h"
40
#include "inline.h"
41
#include "vector.h"
Pedro Gonnet's avatar
Pedro Gonnet committed
42

43
/* ------------------------------------------------------------------------- */
Matthieu Schaller's avatar
Matthieu Schaller committed
44
#if defined(CUBIC_SPLINE_KERNEL)
45

46
/* Coefficients for the kernel. */
Matthieu Schaller's avatar
Matthieu Schaller committed
47
#define kernel_name "Cubic spline (M4)"
48
49
#define kernel_degree 3 /*!< Degree of the polynomial */
#define kernel_ivals 2  /*!< Number of branches */
50
#if defined(HYDRO_DIMENSION_3D)
51
52
#define kernel_gamma ((float)(1.825742))
#define kernel_constant ((float)(16. * M_1_PI))
53
54
55
56
57
58
59
#elif defined(HYDRO_DIMENSION_2D)
#define kernel_gamma ((float)(1.778002))
#define kernel_constant ((float)(80. * M_1_PI / 7.))
#elif defined(HYDRO_DIMENSION_1D)
#define kernel_gamma ((float)(1.732051))
#define kernel_constant ((float)(8. / 3.))
#endif
60
static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
61
62
63
64
65
66
67
68
69
    __attribute__((aligned(16))) = {3.f,  -3.f, 0.f,  0.5f, /* 0 < u < 0.5 */
                                    -1.f, 3.f,  -3.f, 1.f,  /* 0.5 < u < 1 */
                                    0.f,  0.f,  0.f,  0.f}; /* 1 < u */

/* ------------------------------------------------------------------------- */
#elif defined(QUARTIC_SPLINE_KERNEL)

/* Coefficients for the kernel. */
#define kernel_name "Quartic spline (M5)"
70
71
72
#define kernel_degree 4 /* Degree of the polynomial */
#define kernel_ivals 5  /* Number of branches */
#if defined(HYDRO_DIMENSION_3D)
73
74
#define kernel_gamma ((float)(2.018932))
#define kernel_constant ((float)(15625. * M_1_PI / 512.))
75
76
77
78
79
80
81
#elif defined(HYDRO_DIMENSION_2D)
#define kernel_gamma ((float)(1.977173))
#define kernel_constant ((float)(46875. * M_1_PI / 2398.))
#elif defined(HYDRO_DIMENSION_1D)
#define kernel_gamma ((float)(1.936492))
#define kernel_constant ((float)(3125. / 768.))
#endif
82
static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
83
    __attribute__((aligned(16))) = {
84
85
86
87
88
89
90
91
92
        6.f,  0.f,  -2.4f, 0.f,   0.368f, /* 0 < u < 0.2 */
        -4.f, 8.f,  -4.8f, 0.32f, 0.352f, /* 0.2 < u < 0.4 */
        -4.f, 8.f,  -4.8f, 0.32f, 0.352f, /* 0.4 < u < 0.6 */
        1.f,  -4.f, 6.f,   -4.f,  1.f,    /* 0.6 < u < 0.8 */
        1.f,  -4.f, 6.f,   -4.f,  1.f,    /* 0.8 < u < 1 */
        0.f,  0.f,  0.f,   0.f,   0.f};   /* 1 < u */

/* ------------------------------------------------------------------------- */
#elif defined(QUINTIC_SPLINE_KERNEL)
93

94
95
/* Coefficients for the kernel. */
#define kernel_name "Quintic spline (M6)"
96
97
98
#define kernel_degree 5 /* Degree of the polynomial */
#define kernel_ivals 3  /* Number of branches */
#if defined(HYDRO_DIMENSION_3D)
99
100
#define kernel_gamma ((float)(2.195775))
#define kernel_constant ((float)(2187. * M_1_PI / 40.))
101
102
103
104
105
106
107
#elif defined(HYDRO_DIMENSION_2D)
#define kernel_gamma ((float)(2.158131))
#define kernel_constant ((float)(15309. * M_1_PI / 478.))
#elif defined(HYDRO_DIMENSION_1D)
#define kernel_gamma ((float)(2.121321))
#define kernel_constant ((float)(243. / 40.))
#endif
108
static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
109
110
111
112
113
114
115
116
117
118
119
    __attribute__((aligned(16))) = {
        -10.f,        10.f,      0.f,
        -2.2222222f,  0.f,       0.271604938f, /* 0 < u < 1/3 */
        5.f,          -15.f,     16.666667f,
        -7.77777777f, 0.925925f, 0.209876543f, /* 1/3 < u < 2/3 */
        -1.f,         5.f,       -10.f,
        10.f,         -5.f,      1.f, /* 2/3 < u < 1. */
        0.f,          0.f,       0.f,
        0.f,          0.f,       0.f}; /* 1 < u */

/* ------------------------------------------------------------------------- */
Matthieu Schaller's avatar
Matthieu Schaller committed
120
#elif defined(WENDLAND_C2_KERNEL)
121

122
/* Coefficients for the kernel. */
Matthieu Schaller's avatar
Matthieu Schaller committed
123
#define kernel_name "Wendland C2"
124
125
126
#define kernel_degree 5 /* Degree of the polynomial */
#define kernel_ivals 1  /* Number of branches */
#if defined(HYDRO_DIMENSION_3D)
127
128
#define kernel_gamma ((float)(1.936492))
#define kernel_constant ((float)(21. * M_1_PI / 2.))
129
130
131
132
133
134
#elif defined(HYDRO_DIMENSION_2D)
#define kernel_gamma ((float)(1.897367))
#define kernel_constant ((float)(7. * M_1_PI))
#elif defined(HYDRO_DIMENSION_1D)
#error "Wendland C2 kernel not defined in 1D."
#endif
135
static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
136
    __attribute__((aligned(16))) = {
137
138
        4.f, -15.f, 20.f, -10.f, 0.f, 1.f,  /* 0 < u < 1 */
        0.f, 0.f,   0.f,  0.f,   0.f, 0.f}; /* 1 < u */
139
140
141
142
143
144

/* ------------------------------------------------------------------------- */
#elif defined(WENDLAND_C4_KERNEL)

/* Coefficients for the kernel. */
#define kernel_name "Wendland C4"
145
#define kernel_degree 8 /* Degree of the polynomial */
146
#define kernel_ivals 1  /* Number of branches */
147
#if defined(HYDRO_DIMENSION_3D)
148
149
#define kernel_gamma ((float)(2.207940))
#define kernel_constant ((float)(495. * M_1_PI / 32.))
150
151
152
153
154
155
#elif defined(HYDRO_DIMENSION_2D)
#define kernel_gamma ((float)(2.171239))
#define kernel_constant ((float)(9. * M_1_PI))
#elif defined(HYDRO_DIMENSION_1D)
#error "Wendland C4 kernel not defined in 1D."
#endif
156
static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
157
158
159
160
161
162
163
164
165
166
167
    __attribute__((aligned(16))) = {
        11.666667f, -64.f,       140.f, -149.333333f, 70.f,
        0.f,        -9.3333333f, 0.f,   1.f, /* 0 < u < 1 */
        0.f,        0.f,         0.f,   0.f,          0.f,
        0.f,        0.f,         0.f,   0.f}; /* 1 < u */

/* ------------------------------------------------------------------------- */
#elif defined(WENDLAND_C6_KERNEL)

/* Coefficients for the kernel. */
#define kernel_name "Wendland C6"
168
169
170
#define kernel_degree 11 /* Degree of the polynomial */
#define kernel_ivals 1   /* Number of branches */
#if defined(HYDRO_DIMENSION_3D)
171
172
#define kernel_gamma ((float)(2.449490))
#define kernel_constant ((float)(1365. * M_1_PI / 64.))
173
174
175
176
177
178
#elif defined(HYDRO_DIMENSION_2D)
#define kernel_gamma ((float)(2.415230))
#define kernel_constant ((float)(78. * M_1_PI / 7.))
#elif defined(HYDRO_DIMENSION_1D)
#error "Wendland C6 kernel not defined in 1D."
#endif
179
static const float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)]
180
    __attribute__((aligned(16))) = {
181
182
183
184
        32.f, -231.f, 704.f, -1155.f, 1056.f, -462.f,
        0.f,  66.f,   0.f,   -11.f,   0.f,    1.f, /* 0 < u < 1 */
        0.f,  0.f,    0.f,   0.f,     0.f,    0.f,
        0.f,  0.f,    0.f,   0.f,     0.f,    0.f}; /* 1 < u */
185

186
/* ------------------------------------------------------------------------- */
Matthieu Schaller's avatar
Matthieu Schaller committed
187
#else
188

Matthieu Schaller's avatar
Matthieu Schaller committed
189
#error "A kernel function must be chosen in const.h !!"
190

191
/* ------------------------------------------------------------------------- */
Matthieu Schaller's avatar
Matthieu Schaller committed
192
#endif
193

Matthieu Schaller's avatar
Matthieu Schaller committed
194
/* Ok, now comes the real deal. */
195

Matthieu Schaller's avatar
Matthieu Schaller committed
196
/* First some powers of gamma = H/h */
197
#define kernel_gamma_inv ((float)(1. / kernel_gamma))
198
#define kernel_gamma2 ((float)(kernel_gamma * kernel_gamma))
199

200
/* define gamma^d, gamma^(d+1), 1/gamma^d and 1/gamma^(d+1) */
201
#if defined(HYDRO_DIMENSION_3D)
202
203
204
205
206
207
208
#define kernel_gamma_dim ((float)(kernel_gamma * kernel_gamma * kernel_gamma))
#define kernel_gamma_dim_plus_one \
  ((float)(kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma))
#define kernel_gamma_inv_dim \
  ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma)))
#define kernel_gamma_inv_dim_plus_one \
  ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma * kernel_gamma)))
209
#elif defined(HYDRO_DIMENSION_2D)
210
211
212
213
214
215
#define kernel_gamma_dim ((float)(kernel_gamma * kernel_gamma))
#define kernel_gamma_dim_plus_one \
  ((float)(kernel_gamma * kernel_gamma * kernel_gamma))
#define kernel_gamma_inv_dim ((float)(1. / (kernel_gamma * kernel_gamma)))
#define kernel_gamma_inv_dim_plus_one \
  ((float)(1. / (kernel_gamma * kernel_gamma * kernel_gamma)))
216
#elif defined(HYDRO_DIMENSION_1D)
217
218
219
220
221
#define kernel_gamma_dim ((float)(kernel_gamma))
#define kernel_gamma_dim_plus_one ((float)(kernel_gamma * kernel_gamma))
#define kernel_gamma_inv_dim ((float)(1. / (kernel_gamma)))
#define kernel_gamma_inv_dim_plus_one \
  ((float)(1. / (kernel_gamma * kernel_gamma)))
222
#endif
223

224
225
226
227
228
229
230
231
/* The number of branches (floating point conversion) */
#define kernel_ivals_f ((float)(kernel_ivals))

/* Kernel self contribution (i.e. W(0,h)) */
#define kernel_root                                          \
  ((float)(kernel_coeffs[kernel_degree]) * kernel_constant * \
   kernel_gamma_inv_dim)

232
/* Kernel normalisation constant (volume term) */
233
#define kernel_norm ((float)(hydro_dimension_unit_sphere * kernel_gamma_dim))
234
235

/* ------------------------------------------------------------------------- */
236

237
/**
Matthieu Schaller's avatar
Matthieu Schaller committed
238
239
 * @brief Computes the kernel function and its derivative.
 *
Matthieu Schaller's avatar
Matthieu Schaller committed
240
241
 * The kernel function needs to be mutliplied by \f$h^{-d}\f$ and the gradient
 * by \f$h^{-(d+1)}\f$, where \f$d\f$ is the dimensionality of the problem.
Matthieu Schaller's avatar
Matthieu Schaller committed
242
 *
243
244
245
246
247
 * Returns 0 if \f$u > \gamma = H/h\f$.
 *
 * @param u The ratio of the distance to the smoothing length \f$u = x/h\f$.
 * @param W (return) The value of the kernel function \f$W(x,h)\f$.
 * @param dW_dx (return) The norm of the gradient of \f$|\nabla W(x,h)|\f$.
248
 */
Matthieu Schaller's avatar
Matthieu Schaller committed
249
__attribute__((always_inline)) INLINE static void kernel_deval(
250
    float u, float *restrict W, float *restrict dW_dx) {
251

Matthieu Schaller's avatar
Matthieu Schaller committed
252
  /* Go to the range [0,1[ from [0,H[ */
253
  const float x = u * kernel_gamma_inv;
254

Matthieu Schaller's avatar
Matthieu Schaller committed
255
  /* Pick the correct branch of the kernel */
Matthieu Schaller's avatar
Matthieu Schaller committed
256
  const int temp = (int)(x * kernel_ivals_f);
257
  const int ind = temp > kernel_ivals ? kernel_ivals : temp;
Matthieu Schaller's avatar
Matthieu Schaller committed
258
  const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
259

Matthieu Schaller's avatar
Matthieu Schaller committed
260
  /* First two terms of the polynomial ... */
261
262
  float w = coeffs[0] * x + coeffs[1];
  float dw_dx = coeffs[0];
Matthieu Schaller's avatar
Matthieu Schaller committed
263
264

  /* ... and the rest of them */
265
266
267
268
  for (int k = 2; k <= kernel_degree; k++) {
    dw_dx = dw_dx * x + w;
    w = x * w + coeffs[k];
  }
269

Matthieu Schaller's avatar
Matthieu Schaller committed
270
  /* Return everything */
271
272
  *W = w * kernel_constant * kernel_gamma_inv_dim;
  *dW_dx = dw_dx * kernel_constant * kernel_gamma_inv_dim_plus_one;
273
}
274

275
/**
Matthieu Schaller's avatar
Matthieu Schaller committed
276
277
 * @brief Computes the kernel function.
 *
278
279
280
281
 * The kernel function needs to be mutliplied by \f$h^{-d}\f$,
 * where \f$d\f$ is the dimensionality of the problem.
 *
 * Returns 0 if \f$u > \gamma = H/h\f$
282
 *
283
284
 * @param u The ratio of the distance to the smoothing length \f$u = x/h\f$.
 * @param W (return) The value of the kernel function \f$W(x,h)\f$.
285
 */
286
287
__attribute__((always_inline)) INLINE static void kernel_eval(
    float u, float *restrict W) {
288

289
  /* Go to the range [0,1[ from [0,H[ */
290
  const float x = u * kernel_gamma_inv;
291
292

  /* Pick the correct branch of the kernel */
Matthieu Schaller's avatar
Matthieu Schaller committed
293
  const int temp = (int)(x * kernel_ivals_f);
294
  const int ind = temp > kernel_ivals ? kernel_ivals : temp;
Matthieu Schaller's avatar
Matthieu Schaller committed
295
  const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];
296
297

  /* First two terms of the polynomial ... */
298
  float w = coeffs[0] * x + coeffs[1];
299
300

  /* ... and the rest of them */
301
  for (int k = 2; k <= kernel_degree; k++) w = x * w + coeffs[k];
302
303

  /* Return everything */
304
  *W = w * kernel_constant * kernel_gamma_inv_dim;
305
}
306

307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
/**
 * @brief Computes the kernel function derivative.
 *
 * The kernel function needs to be mutliplied by \f$h^{-d}\f$ and the gradient
 * by \f$h^{-(d+1)}\f$, where \f$d\f$ is the dimensionality of the problem.
 *
 * Returns 0 if \f$u > \gamma = H/h\f$.
 *
 * @param u The ratio of the distance to the smoothing length \f$u = x/h\f$.
 * @param dW_dx (return) The norm of the gradient of \f$|\nabla W(x,h)|\f$.
 */
__attribute__((always_inline)) INLINE static void kernel_eval_dWdx(
    float u, float *restrict dW_dx) {

  /* Go to the range [0,1[ from [0,H[ */
  const float x = u * kernel_gamma_inv;

  /* Pick the correct branch of the kernel */
  const int temp = (int)(x * kernel_ivals_f);
  const int ind = temp > kernel_ivals ? kernel_ivals : temp;
  const float *const coeffs = &kernel_coeffs[ind * (kernel_degree + 1)];

  /* First two terms of the polynomial ... */
Matthieu Schaller's avatar
Matthieu Schaller committed
330
331
  float dw_dx = ((float)kernel_degree * coeffs[0] * x) +
                (float)(kernel_degree - 1) * coeffs[1];
332
333
334
335
336
337
338
339
340
341

  /* ... and the rest of them */
  for (int k = 2; k < kernel_degree; k++) {
    dw_dx = dw_dx * x + (float)(kernel_degree - k) * coeffs[k];
  }

  /* Return everything */
  *dW_dx = dw_dx * kernel_constant * kernel_gamma_inv_dim_plus_one;
}

342
343
/* ------------------------------------------------------------------------- */

344
#ifdef WITH_VECTORIZATION
345

346
static const vector kernel_gamma_inv_vec = FILL_VEC((float)kernel_gamma_inv);
347
348
349
350
351

static const vector kernel_ivals_vec = FILL_VEC((float)kernel_ivals);

static const vector kernel_constant_vec = FILL_VEC((float)kernel_constant);

352
353
static const vector kernel_gamma_inv_dim_vec =
    FILL_VEC((float)kernel_gamma_inv_dim);
354

355
356
static const vector kernel_gamma_inv_dim_plus_one_vec =
    FILL_VEC((float)kernel_gamma_inv_dim_plus_one);
357

358
359
360
361
362
363
/**
 * @brief Computes the kernel function and its derivative (Vectorised version).
 *
 * Return 0 if $u > \\gamma = H/h$
 *
 * @param u The ratio of the distance to the smoothing length $u = x/h$.
364
365
 * @param w (return) The value of the kernel function $W(x,h)$.
 * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
366
 */
367
368
__attribute__((always_inline)) INLINE static void kernel_deval_vec(
    vector *u, vector *w, vector *dw_dx) {
369
370

  /* Go to the range [0,1[ from [0,H[ */
Matthieu Schaller's avatar
Matthieu Schaller committed
371
  vector x;
372
  x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
373
374

  /* Load x and get the interval id. */
Matthieu Schaller's avatar
Matthieu Schaller committed
375
  vector ind;
376
  ind.m = vec_ftoi(vec_fmin(vec_mul(x.v, kernel_ivals_vec.v), kernel_ivals_vec.v));
377
378

  /* load the coefficients. */
Matthieu Schaller's avatar
Matthieu Schaller committed
379
380
381
  vector c[kernel_degree + 1];
  for (int k = 0; k < VEC_SIZE; k++)
    for (int j = 0; j < kernel_degree + 1; j++)
382
383
384
      c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j];

  /* Init the iteration for Horner's scheme. */
385
  w->v = vec_fma(c[0].v, x.v, c[1].v);
386
387
388
389
  dw_dx->v = c[0].v;

  /* And we're off! */
  for (int k = 2; k <= kernel_degree; k++) {
390
391
    dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
    w->v = vec_fma(x.v, w->v, c[k].v);
392
  }
393
394

  /* Return everything */
395
396
  w->v = vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v));
397
398
}

Matthieu Schaller's avatar
Matthieu Schaller committed
399
400
/* Define constant vectors for the Wendland C2 and Cubic Spline kernel
 * coefficients. */
401
#ifdef WENDLAND_C2_KERNEL
James Willis's avatar
James Willis committed
402
403
404
405
406
407
static const vector wendland_const_c0 = FILL_VEC(4.f);
static const vector wendland_const_c1 = FILL_VEC(-15.f);
static const vector wendland_const_c2 = FILL_VEC(20.f);
static const vector wendland_const_c3 = FILL_VEC(-10.f);
static const vector wendland_const_c4 = FILL_VEC(0.f);
static const vector wendland_const_c5 = FILL_VEC(1.f);
408
409
410
411
412

static const vector wendland_dwdx_const_c0 = FILL_VEC(20.f);
static const vector wendland_dwdx_const_c1 = FILL_VEC(-60.f);
static const vector wendland_dwdx_const_c2 = FILL_VEC(60.f);
static const vector wendland_dwdx_const_c3 = FILL_VEC(-20.f);
413
414
415
416
417
418
#elif defined(CUBIC_SPLINE_KERNEL)
/* First region 0 < u < 0.5 */
static const vector cubic_1_const_c0 = FILL_VEC(3.f);
static const vector cubic_1_const_c1 = FILL_VEC(-3.f);
static const vector cubic_1_const_c2 = FILL_VEC(0.f);
static const vector cubic_1_const_c3 = FILL_VEC(0.5f);
419
420
421
static const vector cubic_1_dwdx_const_c0 = FILL_VEC(9.f);
static const vector cubic_1_dwdx_const_c1 = FILL_VEC(-6.f);
static const vector cubic_1_dwdx_const_c2 = FILL_VEC(0.f);
422
423
424
425
426
427

/* Second region 0.5 <= u < 1 */
static const vector cubic_2_const_c0 = FILL_VEC(-1.f);
static const vector cubic_2_const_c1 = FILL_VEC(3.f);
static const vector cubic_2_const_c2 = FILL_VEC(-3.f);
static const vector cubic_2_const_c3 = FILL_VEC(1.f);
428
429
430
static const vector cubic_2_dwdx_const_c0 = FILL_VEC(-3.f);
static const vector cubic_2_dwdx_const_c1 = FILL_VEC(6.f);
static const vector cubic_2_dwdx_const_c2 = FILL_VEC(-3.f);
431
static const vector cond = FILL_VEC(0.5f);
432
433
#endif

James Willis's avatar
James Willis committed
434
435
/*TODO: Comment kernels for each region */

436
437
/**
 * @brief Computes the kernel function and its derivative for two particles
James Willis's avatar
James Willis committed
438
 * using vectors. Does not return zero if $u > \\gamma = H/h$, should only
James Willis's avatar
James Willis committed
439
 * be called if particles are known to interact.
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
 *
 * @param u The ratio of the distance to the smoothing length $u = x/h$.
 * @param w (return) The value of the kernel function $W(x,h)$.
 * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
 */
__attribute__((always_inline)) INLINE static void kernel_deval_1_vec(
    vector *u, vector *w, vector *dw_dx) {

  /* Go to the range [0,1[ from [0,H[ */
  vector x;
  x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);

#ifdef WENDLAND_C2_KERNEL
  /* Init the iteration for Horner's scheme. */
  w->v = vec_fma(wendland_const_c0.v, x.v, wendland_const_c1.v);
  dw_dx->v = wendland_const_c0.v;

  /* Calculate the polynomial interleaving vector operations */
  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
  w->v = vec_fma(x.v, w->v, wendland_const_c2.v);

  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
  w->v = vec_fma(x.v, w->v, wendland_const_c3.v);

  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
465
  w->v = vec_mul(x.v, w->v); /* wendland_const_c4 is zero. */
466
467
468

  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
  w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
469
470
#elif defined(CUBIC_SPLINE_KERNEL)
  vector w2, dw_dx2;
471
  mask_t mask_reg1, mask_reg2;
472

473
  /* Form a mask for each part of the kernel. */
474
475
  vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v));  /* 0 < x < 0.5 */
  vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
476

Matthieu Schaller's avatar
Matthieu Schaller committed
477
478
  /* Work out w for both regions of the kernel and combine the results together
   * using masks. */
479
480

  /* Init the iteration for Horner's scheme. */
481
482
483
484
  w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v);
  w2.v = vec_fma(cubic_2_const_c0.v, x.v, cubic_2_const_c1.v);
  dw_dx->v = cubic_1_const_c0.v;
  dw_dx2.v = cubic_2_const_c0.v;
Matthieu Schaller's avatar
Matthieu Schaller committed
485

486
487
488
  /* Calculate the polynomial interleaving vector operations. */
  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
  dw_dx2.v = vec_fma(dw_dx2.v, x.v, w2.v);
489
  w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero. */
490
  w2.v = vec_fma(x.v, w2.v, cubic_2_const_c2.v);
Matthieu Schaller's avatar
Matthieu Schaller committed
491

492
493
494
495
  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
  dw_dx2.v = vec_fma(dw_dx2.v, x.v, w2.v);
  w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v);
  w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
Matthieu Schaller's avatar
Matthieu Schaller committed
496

497
  /* Mask out unneeded values. */
498
499
500
501
  w->v = vec_and_mask(w->v, mask_reg1);
  w2.v = vec_and_mask(w2.v, mask_reg2);
  dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1);
  dw_dx2.v = vec_and_mask(dw_dx2.v, mask_reg2);
502
503

  /* Added both w and w2 together to form complete result. */
Matthieu Schaller's avatar
Matthieu Schaller committed
504
505
  w->v = vec_add(w->v, w2.v);
  dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
506
#else
507
#error "Vectorisation not supported for this kernel!!!"
508
509
#endif

510
  /* Return everything */
511
512
513
514
  w->v =
      vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
                                       kernel_gamma_inv_dim_plus_one_vec.v));
515
516
}

James Willis's avatar
James Willis committed
517
/**
James Willis's avatar
James Willis committed
518
 * @brief Computes the kernel function and its derivative for two particles
James Willis's avatar
James Willis committed
519
520
 * using interleaved vectors. Does not return zero if $u > \\gamma = H/h$,
 * should only
James Willis's avatar
James Willis committed
521
 * be called if particles are known to interact.
James Willis's avatar
James Willis committed
522
523
524
525
 *
 * @param u The ratio of the distance to the smoothing length $u = x/h$.
 * @param w (return) The value of the kernel function $W(x,h)$.
 * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
James Willis's avatar
James Willis committed
526
527
528
529
530
531
 * @param u2 The ratio of the distance to the smoothing length $u = x/h$ for
 * second particle.
 * @param w2 (return) The value of the kernel function $W(x,h)$ for second
 * particle.
 * @param dw_dx2 (return) The norm of the gradient of $|\\nabla W(x,h)|$ for
 * second particle.
James Willis's avatar
James Willis committed
532
 */
533
__attribute__((always_inline)) INLINE static void kernel_deval_2_vec(
James Willis's avatar
James Willis committed
534
535
    vector *u, vector *w, vector *dw_dx, vector *u2, vector *w2,
    vector *dw_dx2) {
536
537
538
539
540
541
542
543

  /* Go to the range [0,1[ from [0,H[ */
  vector x, x2;
  x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
  x2.v = vec_mul(u2->v, kernel_gamma_inv_vec.v);

#ifdef WENDLAND_C2_KERNEL
  /* Init the iteration for Horner's scheme. */
James Willis's avatar
James Willis committed
544
545
546
547
  w->v = vec_fma(wendland_const_c0.v, x.v, wendland_const_c1.v);
  w2->v = vec_fma(wendland_const_c0.v, x2.v, wendland_const_c1.v);
  dw_dx->v = wendland_const_c0.v;
  dw_dx2->v = wendland_const_c0.v;
548

James Willis's avatar
James Willis committed
549
  /* Calculate the polynomial interleaving vector operations */
550
551
  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
  dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
James Willis's avatar
James Willis committed
552
553
  w->v = vec_fma(x.v, w->v, wendland_const_c2.v);
  w2->v = vec_fma(x2.v, w2->v, wendland_const_c2.v);
554
555
556

  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
  dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
James Willis's avatar
James Willis committed
557
558
  w->v = vec_fma(x.v, w->v, wendland_const_c3.v);
  w2->v = vec_fma(x2.v, w2->v, wendland_const_c3.v);
559
560
561

  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
  dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
Matthieu Schaller's avatar
Matthieu Schaller committed
562
  w->v = vec_mul(x.v, w->v);    /* wendland_const_c4 is zero. */
563
  w2->v = vec_mul(x2.v, w2->v); /* wendland_const_c4 is zero. */
James Willis's avatar
James Willis committed
564

565
566
  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
  dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
James Willis's avatar
James Willis committed
567
568
  w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
  w2->v = vec_fma(x2.v, w2->v, wendland_const_c5.v);
569
570

  /* Return everything */
James Willis's avatar
James Willis committed
571
572
573
574
575
576
577
578
  w->v =
      vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
  w2->v = vec_mul(w2->v,
                  vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
                                       kernel_gamma_inv_dim_plus_one_vec.v));
  dw_dx2->v = vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v,
                                         kernel_gamma_inv_dim_plus_one_vec.v));
579
580
581
#elif defined(CUBIC_SPLINE_KERNEL)
  vector w_2, dw_dx_2;
  vector w2_2, dw_dx2_2;
582
  mask_t mask_reg1, mask_reg2, mask_reg1_v2, mask_reg2_v2;
583

584
  /* Form a mask for each part of the kernel. */
585
586
587
588
  vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v));      /* 0 < x < 0.5 */
  vec_create_mask(mask_reg1_v2, vec_cmp_lt(x2.v, cond.v));  /* 0 < x < 0.5 */
  vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v));     /* 0.5 < x < 1 */
  vec_create_mask(mask_reg2_v2, vec_cmp_gte(x2.v, cond.v)); /* 0.5 < x < 1 */
589

Matthieu Schaller's avatar
Matthieu Schaller committed
590
591
  /* Work out w for both regions of the kernel and combine the results together
   * using masks. */
592
593

  /* Init the iteration for Horner's scheme. */
594
595
596
597
598
599
600
601
  w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v);
  w2->v = vec_fma(cubic_1_const_c0.v, x2.v, cubic_1_const_c1.v);
  w_2.v = vec_fma(cubic_2_const_c0.v, x.v, cubic_2_const_c1.v);
  w2_2.v = vec_fma(cubic_2_const_c0.v, x2.v, cubic_2_const_c1.v);
  dw_dx->v = cubic_1_const_c0.v;
  dw_dx2->v = cubic_1_const_c0.v;
  dw_dx_2.v = cubic_2_const_c0.v;
  dw_dx2_2.v = cubic_2_const_c0.v;
Matthieu Schaller's avatar
Matthieu Schaller committed
602

603
604
605
606
607
  /* Calculate the polynomial interleaving vector operations. */
  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
  dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
  dw_dx_2.v = vec_fma(dw_dx_2.v, x.v, w_2.v);
  dw_dx2_2.v = vec_fma(dw_dx2_2.v, x2.v, w2_2.v);
Matthieu Schaller's avatar
Matthieu Schaller committed
608
  w->v = vec_mul(x.v, w->v);    /* cubic_1_const_c2 is zero. */
609
  w2->v = vec_mul(x2.v, w2->v); /* cubic_1_const_c2 is zero. */
610
611
612
613
614
615
616
617
618
619
620
621
622
  w_2.v = vec_fma(x.v, w_2.v, cubic_2_const_c2.v);
  w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c2.v);

  dw_dx->v = vec_fma(dw_dx->v, x.v, w->v);
  dw_dx2->v = vec_fma(dw_dx2->v, x2.v, w2->v);
  dw_dx_2.v = vec_fma(dw_dx_2.v, x.v, w_2.v);
  dw_dx2_2.v = vec_fma(dw_dx2_2.v, x2.v, w2_2.v);
  w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v);
  w2->v = vec_fma(x2.v, w2->v, cubic_1_const_c3.v);
  w_2.v = vec_fma(x.v, w_2.v, cubic_2_const_c3.v);
  w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c3.v);

  /* Mask out unneeded values. */
623
624
625
626
627
628
629
630
  w->v = vec_and_mask(w->v, mask_reg1);
  w2->v = vec_and_mask(w2->v, mask_reg1_v2);
  w_2.v = vec_and_mask(w_2.v, mask_reg2);
  w2_2.v = vec_and_mask(w2_2.v, mask_reg2_v2);
  dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1);
  dw_dx2->v = vec_and_mask(dw_dx2->v, mask_reg1_v2);
  dw_dx_2.v = vec_and_mask(dw_dx_2.v, mask_reg2);
  dw_dx2_2.v = vec_and_mask(dw_dx2_2.v, mask_reg2_v2);
631
632

  /* Added both w and w2 together to form complete result. */
Matthieu Schaller's avatar
Matthieu Schaller committed
633
634
635
636
  w->v = vec_add(w->v, w_2.v);
  w2->v = vec_add(w2->v, w2_2.v);
  dw_dx->v = vec_add(dw_dx->v, dw_dx_2.v);
  dw_dx2->v = vec_add(dw_dx2->v, dw_dx2_2.v);
637
638

  /* Return everything */
639
640
  w->v = vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
  w2->v = vec_mul(w2->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
641
  dw_dx->v =
642
      vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v));
643
  dw_dx2->v =
644
      vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v));
645
646
647
648

#endif
}

649
650
/**
 * @brief Computes the kernel function for two particles
James Willis's avatar
James Willis committed
651
 * using vectors. Does not return zero if $u > \\gamma = H/h$, should only
James Willis's avatar
James Willis committed
652
 * be called if particles are known to interact.
653
654
655
656
 *
 * @param u The ratio of the distance to the smoothing length $u = x/h$.
 * @param w (return) The value of the kernel function $W(x,h)$.
 */
Matthieu Schaller's avatar
Matthieu Schaller committed
657
658
__attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u,
                                                                    vector *w) {
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674

  /* Go to the range [0,1[ from [0,H[ */
  vector x;
  x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);

#ifdef WENDLAND_C2_KERNEL
  /* Init the iteration for Horner's scheme. */
  w->v = vec_fma(wendland_const_c0.v, x.v, wendland_const_c1.v);

  /* Calculate the polynomial interleaving vector operations */
  w->v = vec_fma(x.v, w->v, wendland_const_c2.v);
  w->v = vec_fma(x.v, w->v, wendland_const_c3.v);
  w->v = vec_mul(x.v, w->v); /* wendland_const_c4 is zero.*/
  w->v = vec_fma(x.v, w->v, wendland_const_c5.v);
#elif defined(CUBIC_SPLINE_KERNEL)
  vector w2;
675
  mask_t mask_reg1, mask_reg2;
676
677

  /* Form a mask for each part of the kernel. */
678
679
680
  vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v));  /* 0 < x < 0.5 */
  vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
  
Matthieu Schaller's avatar
Matthieu Schaller committed
681
682
  /* Work out w for both regions of the kernel and combine the results together
   * using masks. */
683
684
685
686

  /* Init the iteration for Horner's scheme. */
  w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v);
  w2.v = vec_fma(cubic_2_const_c0.v, x.v, cubic_2_const_c1.v);
Matthieu Schaller's avatar
Matthieu Schaller committed
687

688
689
690
  /* Calculate the polynomial interleaving vector operations. */
  w->v = vec_mul(x.v, w->v); /* cubic_1_const_c2 is zero */
  w2.v = vec_fma(x.v, w2.v, cubic_2_const_c2.v);
Matthieu Schaller's avatar
Matthieu Schaller committed
691

692
693
  w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v);
  w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v);
Matthieu Schaller's avatar
Matthieu Schaller committed
694

695
  /* Mask out unneeded values. */
696
697
  w->v = vec_and_mask(w->v, mask_reg1);
  w2.v = vec_and_mask(w2.v, mask_reg2);
698
699

  /* Added both w and w2 together to form complete result. */
Matthieu Schaller's avatar
Matthieu Schaller committed
700
  w->v = vec_add(w->v, w2.v);
701
#else
702
#error "Vectorisation not supported for this kernel!!!"
703
704
#endif

705
706
707
708
709
710
711
  /* Return everything */
  w->v =
      vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v));
}

/**
 * @brief Computes the kernel function derivative for two particles
James Willis's avatar
James Willis committed
712
 * using vectors. Does not return zero if $u > \\gamma = H/h$, should only
James Willis's avatar
James Willis committed
713
 * be called if particles are known to interact.
714
715
716
717
718
719
720
721
722
723
724
725
726
 *
 * @param u The ratio of the distance to the smoothing length $u = x/h$.
 * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
 */
__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec(
    vector *u, vector *dw_dx) {

  /* Go to the range [0,1[ from [0,H[ */
  vector x;
  x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);

#ifdef WENDLAND_C2_KERNEL
  /* Init the iteration for Horner's scheme. */
Matthieu Schaller's avatar
Matthieu Schaller committed
727
  dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v);
728
729

  /* Calculate the polynomial interleaving vector operations */
Matthieu Schaller's avatar
Matthieu Schaller committed
730
  dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v);
731

Matthieu Schaller's avatar
Matthieu Schaller committed
732
  dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c3.v);
733

Matthieu Schaller's avatar
Matthieu Schaller committed
734
  dw_dx->v = vec_mul(dw_dx->v, x.v);
735
736
737

#elif defined(CUBIC_SPLINE_KERNEL)
  vector dw_dx2;
738
  mask_t mask_reg1, mask_reg2;
739
740

  /* Form a mask for each part of the kernel. */
741
742
  vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v));  /* 0 < x < 0.5 */
  vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
743

Matthieu Schaller's avatar
Matthieu Schaller committed
744
745
  /* Work out w for both regions of the kernel and combine the results together
   * using masks. */
746
747

  /* Init the iteration for Horner's scheme. */
Matthieu Schaller's avatar
Matthieu Schaller committed
748
749
750
  dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v, x.v, cubic_1_dwdx_const_c1.v);
  dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v, x.v, cubic_2_dwdx_const_c1.v);

751
  /* Calculate the polynomial interleaving vector operations. */
Matthieu Schaller's avatar
Matthieu Schaller committed
752
753
754
  dw_dx->v = vec_mul(dw_dx->v, x.v); /* cubic_1_dwdx_const_c2 is zero. */
  dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);

755
  /* Mask out unneeded values. */
756
757
  dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1);
  dw_dx2.v = vec_and_mask(dw_dx2.v, mask_reg2);
758
759

  /* Added both dwdx and dwdx2 together to form complete result. */
Matthieu Schaller's avatar
Matthieu Schaller committed
760
  dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
761
#else
762
#error "Vectorisation not supported for this kernel!!!"
763
764
765
766
767
768
769
#endif

  /* Return everything */
  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
                                       kernel_gamma_inv_dim_plus_one_vec.v));
}

770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
/**
 * @brief Computes the kernel function derivative for two particles
 * using vectors.
 *
 * Return 0 if $u > \\gamma = H/h$
 *
 * @param u The ratio of the distance to the smoothing length $u = x/h$.
 * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
 */
__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_vec(
    vector *u, vector *dw_dx) {

  /* Go to the range [0,1[ from [0,H[ */
  vector x;
  x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);

#ifdef WENDLAND_C2_KERNEL
  /* Init the iteration for Horner's scheme. */
  dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v);

  /* Calculate the polynomial interleaving vector operations */
  dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v);

  dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c3.v);

  dw_dx->v = vec_mul(dw_dx->v, x.v);

#elif defined(CUBIC_SPLINE_KERNEL)
  vector dw_dx2;
799
  mask_t mask_reg1, mask_reg2;
800
801

  /* Form a mask for each part of the kernel. */
802
803
  vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v));  /* 0 < x < 0.5 */
  vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */
804
805
806
807
808
809
810
811
812
813
814
815
816

  /* Work out w for both regions of the kernel and combine the results together
   * using masks. */

  /* Init the iteration for Horner's scheme. */
  dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v, x.v, cubic_1_dwdx_const_c1.v);
  dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v, x.v, cubic_2_dwdx_const_c1.v);

  /* Calculate the polynomial interleaving vector operations. */
  dw_dx->v = vec_mul(dw_dx->v, x.v); /* cubic_1_dwdx_const_c2 is zero. */
  dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);

  /* Mask out unneeded values. */
817
818
  dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1);
  dw_dx2.v = vec_and_mask(dw_dx2.v, mask_reg2);
819
820
821
822

  /* Added both dwdx and dwdx2 together to form complete result. */
  dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
#else
823
#error "Vectorisation not supported for this kernel!!!"
824
825
#endif

James Willis's avatar
James Willis committed
826
  /* Mask out result for particles that lie outside of the kernel function. */
827
828
  mask_t mask;
  vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f)));  /* x < 1 */
829

830
  dw_dx->v = vec_and_mask(dw_dx->v, mask);
831
832
833
834
835
836
837
838

  /* Return everything */
  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
                                       kernel_gamma_inv_dim_plus_one_vec.v));
}

/**
 * @brief Computes the kernel function derivative for two particles
James Willis's avatar
James Willis committed
839
 * using interleaved vectors.
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
 *
 * Return 0 if $u > \\gamma = H/h$
 *
 * @param u The ratio of the distance to the smoothing length $u = x/h$.
 * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$.
 */
__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec(
    vector *u, vector *dw_dx, vector *u_2, vector *dw_dx_2) {

  /* Go to the range [0,1[ from [0,H[ */
  vector x, x_2;
  x.v = vec_mul(u->v, kernel_gamma_inv_vec.v);
  x_2.v = vec_mul(u_2->v, kernel_gamma_inv_vec.v);

#ifdef WENDLAND_C2_KERNEL
  /* Init the iteration for Horner's scheme. */
  dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v);
James Willis's avatar
James Willis committed
857
858
  dw_dx_2->v =
      vec_fma(wendland_dwdx_const_c0.v, x_2.v, wendland_dwdx_const_c1.v);
859
860
861
862
863
864
865
866
867
868
869
870
871

  /* Calculate the polynomial interleaving vector operations */
  dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v);
  dw_dx_2->v = vec_fma(dw_dx_2->v, x_2.v, wendland_dwdx_const_c2.v);

  dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c3.v);
  dw_dx_2->v = vec_fma(dw_dx_2->v, x_2.v, wendland_dwdx_const_c3.v);

  dw_dx->v = vec_mul(dw_dx->v, x.v);
  dw_dx_2->v = vec_mul(dw_dx_2->v, x_2.v);

#elif defined(CUBIC_SPLINE_KERNEL)
  vector dw_dx2, dw_dx2_2;
872
873
  mask_t mask_reg1, mask_reg2;
  mask_t mask_reg1_2, mask_reg2_2;
874
875

  /* Form a mask for each part of the kernel. */
876
877
878
879
880
  vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v));      /* 0 < x < 0.5 */
  vec_create_mask(mask_reg1_2, vec_cmp_lt(x_2.v, cond.v));  /* 0 < x < 0.5 */
  vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v));     /* 0.5 < x < 1 */
  vec_create_mask(mask_reg2_2, vec_cmp_gte(x_2.v, cond.v)); /* 0.5 < x < 1 */
  
881
882
883
884
885
886
887
888
889
890
  /* Work out w for both regions of the kernel and combine the results together
   * using masks. */

  /* Init the iteration for Horner's scheme. */
  dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v, x.v, cubic_1_dwdx_const_c1.v);
  dw_dx_2->v = vec_fma(cubic_1_dwdx_const_c0.v, x_2.v, cubic_1_dwdx_const_c1.v);
  dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v, x.v, cubic_2_dwdx_const_c1.v);
  dw_dx2_2.v = vec_fma(cubic_2_dwdx_const_c0.v, x_2.v, cubic_2_dwdx_const_c1.v);

  /* Calculate the polynomial interleaving vector operations. */
James Willis's avatar
James Willis committed
891
  dw_dx->v = vec_mul(dw_dx->v, x.v);       /* cubic_1_dwdx_const_c2 is zero. */
892
893
894
895
896
  dw_dx_2->v = vec_mul(dw_dx_2->v, x_2.v); /* cubic_1_dwdx_const_c2 is zero. */
  dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v);
  dw_dx2_2.v = vec_fma(dw_dx2_2.v, x_2.v, cubic_2_dwdx_const_c2.v);

  /* Mask out unneeded values. */
897
898
899
900
  dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1);
  dw_dx_2->v = vec_and_mask(dw_dx_2->v, mask_reg1_2);
  dw_dx2.v = vec_and_mask(dw_dx2.v, mask_reg2);
  dw_dx2_2.v = vec_and_mask(dw_dx2_2.v, mask_reg2_2);
901
902
903
904
905

  /* Added both dwdx and dwdx2 together to form complete result. */
  dw_dx->v = vec_add(dw_dx->v, dw_dx2.v);
  dw_dx_2->v = vec_add(dw_dx_2->v, dw_dx2_2.v);
#else
906
#error "Vectorisation not supported for this kernel!!!"
907
908
#endif

James Willis's avatar
James Willis committed
909
  /* Mask out result for particles that lie outside of the kernel function. */
910
911
912
  mask_t mask, mask_2;
  vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f)));  /* x < 1 */
  vec_create_mask(mask_2, vec_cmp_lt(x_2.v, vec_set1(1.f)));  /* x < 1 */
913

914
915
  dw_dx->v = vec_and_mask(dw_dx->v, mask);
  dw_dx_2->v = vec_and_mask(dw_dx_2->v, mask_2);
916
917
918
919

  /* Return everything */
  dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v,
                                       kernel_gamma_inv_dim_plus_one_vec.v));
James Willis's avatar
James Willis committed
920
921
922
  dw_dx_2->v = vec_mul(
      dw_dx_2->v,
      vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v));
923
}
James Willis's avatar
James Willis committed
924

925
926
#endif /* WITH_VECTORIZATION */

927
/* Some cross-check functions */
928
void hydro_kernel_dump(int N);
929

930
#endif  // SWIFT_KERNEL_HYDRO_H