runner_doiact_vec.c 53 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/*******************************************************************************
 * This file is part of SWIFT.
 * Copyright (c) 2016 James Willis (james.s.willis@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/

/* Config parameters. */
#include "../config.h"

/* This object's header. */
#include "runner_doiact_vec.h"

26
27
28
/* Local headers. */
#include "active.h"

29
30
31
#ifdef WITH_VECTORIZATION
static const vector kernel_gamma2_vec = FILL_VEC(kernel_gamma2);

James Willis's avatar
James Willis committed
32
33
34
/**
 * @brief Compute the vector remainder interactions from the secondary cache.
 *
Matthieu Schaller's avatar
Matthieu Schaller committed
35
 * @param int_cache (return) secondary #cache of interactions between two
James Willis's avatar
James Willis committed
36
 * particles.
James Willis's avatar
James Willis committed
37
 * @param icount Interaction count.
Matthieu Schaller's avatar
Matthieu Schaller committed
38
 * @param rhoSum (return) #vector holding the cumulative sum of the density
James Willis's avatar
James Willis committed
39
 * update on pi.
Matthieu Schaller's avatar
Matthieu Schaller committed
40
 * @param rho_dhSum (return) #vector holding the cumulative sum of the density
James Willis's avatar
James Willis committed
41
 * gradient update on pi.
Matthieu Schaller's avatar
Matthieu Schaller committed
42
 * @param wcountSum (return) #vector holding the cumulative sum of the wcount
James Willis's avatar
James Willis committed
43
 * update on pi.
Matthieu Schaller's avatar
Matthieu Schaller committed
44
 * @param wcount_dhSum (return) #vector holding the cumulative sum of the wcount
James Willis's avatar
James Willis committed
45
 * gradient update on pi.
Matthieu Schaller's avatar
Matthieu Schaller committed
46
 * @param div_vSum (return) #vector holding the cumulative sum of the divergence
James Willis's avatar
James Willis committed
47
 * update on pi.
Matthieu Schaller's avatar
Matthieu Schaller committed
48
 * @param curlvxSum (return) #vector holding the cumulative sum of the curl of
James Willis's avatar
James Willis committed
49
 * vx update on pi.
Matthieu Schaller's avatar
Matthieu Schaller committed
50
 * @param curlvySum (return) #vector holding the cumulative sum of the curl of
James Willis's avatar
James Willis committed
51
 * vy update on pi.
Matthieu Schaller's avatar
Matthieu Schaller committed
52
 * @param curlvzSum (return) #vector holding the cumulative sum of the curl of
James Willis's avatar
James Willis committed
53
 * vz update on pi.
James Willis's avatar
James Willis committed
54
55
56
57
 * @param v_hi_inv #vector of 1/h for pi.
 * @param v_vix #vector of x velocity of pi.
 * @param v_viy #vector of y velocity of pi.
 * @param v_viz #vector of z velocity of pi.
Matthieu Schaller's avatar
Matthieu Schaller committed
58
 * @param icount_align (return) Interaction count after the remainder
James Willis's avatar
James Willis committed
59
 * interactions have been performed, should be a multiple of the vector length.
James Willis's avatar
James Willis committed
60
 */
James Willis's avatar
James Willis committed
61
__attribute__((always_inline)) INLINE static void calcRemInteractions(
Matthieu Schaller's avatar
Matthieu Schaller committed
62
63
64
65
66
    struct c2_cache *const int_cache, const int icount, vector *rhoSum,
    vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
    vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
    vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz,
    int *icount_align) {
67

68
  mask_t int_mask, int_mask2;
James Willis's avatar
James Willis committed
69
70

  /* Work out the number of remainder interactions and pad secondary cache. */
71
72
73
74
75
76
  *icount_align = icount;
  int rem = icount % (NUM_VEC_PROC * VEC_SIZE);
  if (rem != 0) {
    int pad = (NUM_VEC_PROC * VEC_SIZE) - rem;
    *icount_align += pad;

77
    /* Initialise masks to true. */
78
79
    vec_init_mask_true(int_mask);
    vec_init_mask_true(int_mask2);
80

James Willis's avatar
James Willis committed
81
82
83
    /* Pad secondary cache so that there are no contributions in the interaction
     * function. */
    for (int i = icount; i < *icount_align; i++) {
84
85
86
87
88
89
90
91
      int_cache->mq[i] = 0.f;
      int_cache->r2q[i] = 1.f;
      int_cache->dxq[i] = 0.f;
      int_cache->dyq[i] = 0.f;
      int_cache->dzq[i] = 0.f;
      int_cache->vxq[i] = 0.f;
      int_cache->vyq[i] = 0.f;
      int_cache->vzq[i] = 0.f;
92
93
94
95
    }

    /* Zero parts of mask that represent the padded values.*/
    if (pad < VEC_SIZE) {
James Willis's avatar
James Willis committed
96
      vec_pad_mask(int_mask2, pad);
James Willis's avatar
James Willis committed
97
    } else {
James Willis's avatar
James Willis committed
98
      vec_pad_mask(int_mask, VEC_SIZE - rem);
99
      vec_zero_mask(int_mask2);
100
101
    }

James Willis's avatar
James Willis committed
102
103
    /* Perform remainder interaction and remove remainder from aligned
     * interaction count. */
104
    *icount_align = icount - rem;
James Willis's avatar
James Willis committed
105
106
107
108
109
110
    runner_iact_nonsym_2_vec_density(
        &int_cache->r2q[*icount_align], &int_cache->dxq[*icount_align],
        &int_cache->dyq[*icount_align], &int_cache->dzq[*icount_align],
        v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[*icount_align],
        &int_cache->vyq[*icount_align], &int_cache->vzq[*icount_align],
        &int_cache->mq[*icount_align], rhoSum, rho_dhSum, wcountSum,
James Willis's avatar
James Willis committed
111
112
        wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask,
        int_mask2, 1);
113
114
115
  }
}

James Willis's avatar
James Willis committed
116
/**
James Willis's avatar
James Willis committed
117
118
 * @brief Left-packs the values needed by an interaction into the secondary
 * cache (Supports AVX, AVX2 and AVX512 instruction sets).
James Willis's avatar
James Willis committed
119
120
 *
 * @param mask Contains which particles need to interact.
Matthieu Schaller's avatar
Matthieu Schaller committed
121
 * @param pjd Index of the particle to store into.
James Willis's avatar
James Willis committed
122
123
124
125
126
 * @param v_r2 #vector of the separation between two particles squared.
 * @param v_dx #vector of the x separation between two particles.
 * @param v_dy #vector of the y separation between two particles.
 * @param v_dz #vector of the z separation between two particles.
 * @param cell_cache #cache of all particles in the cell.
Matthieu Schaller's avatar
Matthieu Schaller committed
127
 * @param int_cache (return) secondary #cache of interactions between two
James Willis's avatar
James Willis committed
128
 * particles.
James Willis's avatar
James Willis committed
129
130
 * @param icount Interaction count.
 * @param rhoSum #vector holding the cumulative sum of the density update on pi.
James Willis's avatar
James Willis committed
131
132
133
134
135
136
137
138
139
140
141
142
143
144
 * @param rho_dhSum #vector holding the cumulative sum of the density gradient
 * update on pi.
 * @param wcountSum #vector holding the cumulative sum of the wcount update on
 * pi.
 * @param wcount_dhSum #vector holding the cumulative sum of the wcount gradient
 * update on pi.
 * @param div_vSum #vector holding the cumulative sum of the divergence update
 * on pi.
 * @param curlvxSum #vector holding the cumulative sum of the curl of vx update
 * on pi.
 * @param curlvySum #vector holding the cumulative sum of the curl of vy update
 * on pi.
 * @param curlvzSum #vector holding the cumulative sum of the curl of vz update
 * on pi.
James Willis's avatar
James Willis committed
145
146
147
148
149
 * @param v_hi_inv #vector of 1/h for pi.
 * @param v_vix #vector of x velocity of pi.
 * @param v_viy #vector of y velocity of pi.
 * @param v_viz #vector of z velocity of pi.
 */
James Willis's avatar
James Willis committed
150
__attribute__((always_inline)) INLINE static void storeInteractions(
151
    const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy,
James Willis's avatar
James Willis committed
152
153
154
155
156
    vector *v_dz, const struct cache *const cell_cache,
    struct c2_cache *const int_cache, int *icount, vector *rhoSum,
    vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
    vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
    vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz) {
James Willis's avatar
James Willis committed
157
158
159

/* Left-pack values needed into the secondary cache using the interaction mask.
 */
160
#if defined(HAVE_AVX2) || defined(HAVE_AVX512_F)
161
162
163
164
165
166
167
  mask_t packed_mask;
  VEC_FORM_PACKED_MASK(mask, packed_mask);

  VEC_LEFT_PACK(v_r2->v, packed_mask, &int_cache->r2q[*icount]);
  VEC_LEFT_PACK(v_dx->v, packed_mask, &int_cache->dxq[*icount]);
  VEC_LEFT_PACK(v_dy->v, packed_mask, &int_cache->dyq[*icount]);
  VEC_LEFT_PACK(v_dz->v, packed_mask, &int_cache->dzq[*icount]);
James Willis's avatar
James Willis committed
168
169
170
171
172
173
174
175
  VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), packed_mask,
                &int_cache->mq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), packed_mask,
                &int_cache->vxq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), packed_mask,
                &int_cache->vyq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), packed_mask,
                &int_cache->vzq[*icount]);
176
177
178

  /* Increment interaction count by number of bits set in mask. */
  (*icount) += __builtin_popcount(mask);
179
#else
James Willis's avatar
James Willis committed
180
  /* Quicker to do it serially in AVX rather than use intrinsics. */
James Willis's avatar
James Willis committed
181
  for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) {
182
183
    if (mask & (1 << bit_index)) {
      /* Add this interaction to the queue. */
184
185
186
187
188
189
190
191
      int_cache->r2q[*icount] = v_r2->f[bit_index];
      int_cache->dxq[*icount] = v_dx->f[bit_index];
      int_cache->dyq[*icount] = v_dy->f[bit_index];
      int_cache->dzq[*icount] = v_dz->f[bit_index];
      int_cache->mq[*icount] = cell_cache->m[pjd + bit_index];
      int_cache->vxq[*icount] = cell_cache->vx[pjd + bit_index];
      int_cache->vyq[*icount] = cell_cache->vy[pjd + bit_index];
      int_cache->vzq[*icount] = cell_cache->vz[pjd + bit_index];
192
193
194
195

      (*icount)++;
    }
  }
196

James Willis's avatar
James Willis committed
197
198
#endif /* defined(HAVE_AVX2) || defined(HAVE_AVX512_F) */

James Willis's avatar
James Willis committed
199
  /* Flush the c2 cache if it has reached capacity. */
James Willis's avatar
James Willis committed
200
  if (*icount >= (C2_CACHE_SIZE - (NUM_VEC_PROC * VEC_SIZE))) {
201
202

    int icount_align = *icount;
James Willis's avatar
James Willis committed
203

James Willis's avatar
James Willis committed
204
    /* Peform remainder interactions. */
Matthieu Schaller's avatar
Matthieu Schaller committed
205
206
207
    calcRemInteractions(int_cache, *icount, rhoSum, rho_dhSum, wcountSum,
                        wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum,
                        v_hi_inv, v_vix, v_viy, v_viz, &icount_align);
208

209
    mask_t int_mask, int_mask2;
210
211
    vec_init_mask_true(int_mask);
    vec_init_mask_true(int_mask2);
James Willis's avatar
James Willis committed
212
213

    /* Perform interactions. */
James Willis's avatar
James Willis committed
214
215
216
217
218
219
    for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
      runner_iact_nonsym_2_vec_density(
          &int_cache->r2q[pjd], &int_cache->dxq[pjd], &int_cache->dyq[pjd],
          &int_cache->dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz,
          &int_cache->vxq[pjd], &int_cache->vyq[pjd], &int_cache->vzq[pjd],
          &int_cache->mq[pjd], rhoSum, rho_dhSum, wcountSum, wcount_dhSum,
220
          div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, int_mask2, 0);
221
    }
James Willis's avatar
James Willis committed
222
223

    /* Reset interaction count. */
224
225
226
    *icount = 0;
  }
}
227

228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
/**
 * @brief Compute the vector remainder interactions from the secondary cache.
 *
 * @param int_cache (return) secondary #cache of interactions between two
 * particles.
 * @param icount Interaction count.
 * @param rhoSum (return) #vector holding the cumulative sum of the density
 * update on pi.
 * @param rho_dhSum (return) #vector holding the cumulative sum of the density
 * gradient update on pi.
 * @param wcountSum (return) #vector holding the cumulative sum of the wcount
 * update on pi.
 * @param wcount_dhSum (return) #vector holding the cumulative sum of the wcount
 * gradient update on pi.
 * @param div_vSum (return) #vector holding the cumulative sum of the divergence
 * update on pi.
 * @param curlvxSum (return) #vector holding the cumulative sum of the curl of
 * vx update on pi.
 * @param curlvySum (return) #vector holding the cumulative sum of the curl of
 * vy update on pi.
 * @param curlvzSum (return) #vector holding the cumulative sum of the curl of
 * vz update on pi.
 * @param v_hi_inv #vector of 1/h for pi.
 * @param v_vix #vector of x velocity of pi.
 * @param v_viy #vector of y velocity of pi.
 * @param v_viz #vector of z velocity of pi.
 * @param icount_align (return) Interaction count after the remainder
 * interactions have been performed, should be a multiple of the vector length.
 */
__attribute__((always_inline)) INLINE static void calcRemForceInteractions(
    struct c2_cache *const int_cache, const int icount, vector *a_hydro_xSum,
    vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum,
260
261
262
    vector *v_sigSum, vector *entropy_dtSum, vector v_hi_inv, vector v_vix,
    vector v_viy, vector v_viz, vector v_rhoi, vector v_grad_hi,
    vector v_pOrhoi2, vector v_balsara_i, vector v_ci, int *icount_align,
James Willis's avatar
James Willis committed
263
    int num_vec_proc) {
264

265
  mask_t int_mask, int_mask2;
266
267
268

  /* Work out the number of remainder interactions and pad secondary cache. */
  *icount_align = icount;
269
  int rem = icount % (num_vec_proc * VEC_SIZE);
270
  if (rem != 0) {
271
    int pad = (num_vec_proc * VEC_SIZE) - rem;
272
273
    *icount_align += pad;

274
    /* Initialise masks to true. */
275
276
    vec_init_mask_true(int_mask);
    vec_init_mask_true(int_mask2);
277

278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
    /* Pad secondary cache so that there are no contributions in the interaction
     * function. */
    for (int i = icount; i < *icount_align; i++) {
      int_cache->mq[i] = 0.f;
      int_cache->r2q[i] = 1.f;
      int_cache->dxq[i] = 0.f;
      int_cache->dyq[i] = 0.f;
      int_cache->dzq[i] = 0.f;
      int_cache->vxq[i] = 0.f;
      int_cache->vyq[i] = 0.f;
      int_cache->vzq[i] = 0.f;
      int_cache->rhoq[i] = 1.f;
      int_cache->grad_hq[i] = 1.f;
      int_cache->pOrho2q[i] = 1.f;
      int_cache->balsaraq[i] = 1.f;
      int_cache->soundspeedq[i] = 1.f;
      int_cache->h_invq[i] = 1.f;
    }

    /* Zero parts of mask that represent the padded values.*/
    if (pad < VEC_SIZE) {
James Willis's avatar
James Willis committed
299
      vec_pad_mask(int_mask2, pad);
300
    } else {
James Willis's avatar
James Willis committed
301
      vec_pad_mask(int_mask, VEC_SIZE - rem);
302
      vec_zero_mask(int_mask2);
303
304
305
306
307
    }

    /* Perform remainder interaction and remove remainder from aligned
     * interaction count. */
    *icount_align = icount - rem;
308
    runner_iact_nonsym_2_vec_force(
James Willis's avatar
James Willis committed
309
310
311
312
313
314
315
316
317
318
        &int_cache->r2q[*icount_align], &int_cache->dxq[*icount_align],
        &int_cache->dyq[*icount_align], &int_cache->dzq[*icount_align], v_vix,
        v_viy, v_viz, v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci,
        &int_cache->vxq[*icount_align], &int_cache->vyq[*icount_align],
        &int_cache->vzq[*icount_align], &int_cache->rhoq[*icount_align],
        &int_cache->grad_hq[*icount_align], &int_cache->pOrho2q[*icount_align],
        &int_cache->balsaraq[*icount_align],
        &int_cache->soundspeedq[*icount_align], &int_cache->mq[*icount_align],
        v_hi_inv, &int_cache->h_invq[*icount_align], a_hydro_xSum, a_hydro_ySum,
        a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum, int_mask, int_mask2, 1);
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
  }
}

/**
 * @brief Left-packs the values needed by an interaction into the secondary
 * cache (Supports AVX, AVX2 and AVX512 instruction sets).
 *
 * @param mask Contains which particles need to interact.
 * @param pjd Index of the particle to store into.
 * @param v_r2 #vector of the separation between two particles squared.
 * @param v_dx #vector of the x separation between two particles.
 * @param v_dy #vector of the y separation between two particles.
 * @param v_dz #vector of the z separation between two particles.
 * @param v_mj #vector of the mass of particle pj.
 * @param v_vjx #vector of x velocity of pj.
 * @param v_vjy #vector of y velocity of pj.
 * @param v_vjz #vector of z velocity of pj.
 * @param cell_cache #cache of all particles in the cell.
 * @param int_cache (return) secondary #cache of interactions between two
 * particles.
 * @param icount Interaction count.
 * @param rhoSum #vector holding the cumulative sum of the density update on pi.
 * @param rho_dhSum #vector holding the cumulative sum of the density gradient
 * update on pi.
 * @param wcountSum #vector holding the cumulative sum of the wcount update on
 * pi.
 * @param wcount_dhSum #vector holding the cumulative sum of the wcount gradient
 * update on pi.
 * @param div_vSum #vector holding the cumulative sum of the divergence update
 * on pi.
 * @param curlvxSum #vector holding the cumulative sum of the curl of vx update
 * on pi.
 * @param curlvySum #vector holding the cumulative sum of the curl of vy update
 * on pi.
 * @param curlvzSum #vector holding the cumulative sum of the curl of vz update
 * on pi.
 * @param v_hi_inv #vector of 1/h for pi.
 * @param v_vix #vector of x velocity of pi.
 * @param v_viy #vector of y velocity of pi.
 * @param v_viz #vector of z velocity of pi.
 */
__attribute__((always_inline)) INLINE static void storeForceInteractions(
    const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy,
James Willis's avatar
James Willis committed
362
363
364
    vector *v_dz, const struct cache *const cell_cache,
    struct c2_cache *const int_cache, int *icount, vector *a_hydro_xSum,
    vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum,
365
366
    vector *v_sigSum, vector *entropy_dtSum, vector v_hi_inv, vector v_vix,
    vector v_viy, vector v_viz, vector v_rhoi, vector v_grad_hi,
367
    vector v_pOrhoi2, vector v_balsara_i, vector v_ci, int num_vec_proc) {
368
369
370
371

/* Left-pack values needed into the secondary cache using the interaction mask.
 */
#if defined(HAVE_AVX2) || defined(HAVE_AVX512_F)
372
373
  /* Invert hj. */
  vector v_hj, v_hj_inv;
374
  v_hj.v = vec_load(&cell_cache->h[pjd]);
375
376
  v_hj_inv = vec_reciprocal(v_hj);

377
378
  mask_t packed_mask;
  VEC_FORM_PACKED_MASK(mask, packed_mask);
James Willis's avatar
James Willis committed
379

380
381
382
383
  VEC_LEFT_PACK(v_r2->v, packed_mask, &int_cache->r2q[*icount]);
  VEC_LEFT_PACK(v_dx->v, packed_mask, &int_cache->dxq[*icount]);
  VEC_LEFT_PACK(v_dy->v, packed_mask, &int_cache->dyq[*icount]);
  VEC_LEFT_PACK(v_dz->v, packed_mask, &int_cache->dzq[*icount]);
James Willis's avatar
James Willis committed
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
  VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), packed_mask,
                &int_cache->mq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), packed_mask,
                &int_cache->vxq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), packed_mask,
                &int_cache->vyq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), packed_mask,
                &int_cache->vzq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->rho[pjd]), packed_mask,
                &int_cache->rhoq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->grad_h[pjd]), packed_mask,
                &int_cache->grad_hq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->pOrho2[pjd]), packed_mask,
                &int_cache->pOrho2q[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->balsara[pjd]), packed_mask,
                &int_cache->balsaraq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->soundspeed[pjd]), packed_mask,
                &int_cache->soundspeedq[*icount]);
402
  VEC_LEFT_PACK(v_hj_inv.v, packed_mask, &int_cache->h_invq[*icount]);
James Willis's avatar
James Willis committed
403

404
405
  /* Increment interaction count by number of bits set in mask. */
  (*icount) += __builtin_popcount(mask);
406
407
408
409
410
411
412
413
414
415
416
417
418
#else
  /* Quicker to do it serially in AVX rather than use intrinsics. */
  for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) {
    if (mask & (1 << bit_index)) {
      /* Add this interaction to the queue. */
      int_cache->r2q[*icount] = v_r2->f[bit_index];
      int_cache->dxq[*icount] = v_dx->f[bit_index];
      int_cache->dyq[*icount] = v_dy->f[bit_index];
      int_cache->dzq[*icount] = v_dz->f[bit_index];
      int_cache->mq[*icount] = cell_cache->m[pjd + bit_index];
      int_cache->vxq[*icount] = cell_cache->vx[pjd + bit_index];
      int_cache->vyq[*icount] = cell_cache->vy[pjd + bit_index];
      int_cache->vzq[*icount] = cell_cache->vz[pjd + bit_index];
James Willis's avatar
James Willis committed
419

420
421
422
423
424
      int_cache->rhoq[*icount] = cell_cache->rho[pjd + bit_index];
      int_cache->grad_hq[*icount] = cell_cache->grad_h[pjd + bit_index];
      int_cache->pOrho2q[*icount] = cell_cache->pOrho2[pjd + bit_index];
      int_cache->balsaraq[*icount] = cell_cache->balsara[pjd + bit_index];
      int_cache->soundspeedq[*icount] = cell_cache->soundspeed[pjd + bit_index];
425
      int_cache->h_invq[*icount] = 1.f / cell_cache->h[pjd + bit_index];
426
427
428
429
430
431
432
433

      (*icount)++;
    }
  }

#endif /* defined(HAVE_AVX2) || defined(HAVE_AVX512_F) */

  /* Flush the c2 cache if it has reached capacity. */
434
  if (*icount >= (C2_CACHE_SIZE - (num_vec_proc * VEC_SIZE))) {
435
436
437
438

    int icount_align = *icount;

    /* Peform remainder interactions. */
James Willis's avatar
James Willis committed
439
440
441
442
    calcRemForceInteractions(int_cache, *icount, a_hydro_xSum, a_hydro_ySum,
                             a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum,
                             v_hi_inv, v_vix, v_viy, v_viz, v_rhoi, v_grad_hi,
                             v_pOrhoi2, v_balsara_i, v_ci, &icount_align, 2);
443

444
445
446
    /* Initialise masks to true in case remainder interactions have been
     * performed. */
    mask_t int_mask, int_mask2;
447
448
    vec_init_mask_true(int_mask);
    vec_init_mask_true(int_mask2);
449

450
    /* Perform interactions. */
451
    for (int pjd = 0; pjd < icount_align; pjd += (num_vec_proc * VEC_SIZE)) {
452

453
      runner_iact_nonsym_2_vec_force(
James Willis's avatar
James Willis committed
454
455
456
457
458
459
460
461
462
          &int_cache->r2q[pjd], &int_cache->dxq[pjd], &int_cache->dyq[pjd],
          &int_cache->dzq[pjd], v_vix, v_viy, v_viz, v_rhoi, v_grad_hi,
          v_pOrhoi2, v_balsara_i, v_ci, &int_cache->vxq[pjd],
          &int_cache->vyq[pjd], &int_cache->vzq[pjd], &int_cache->rhoq[pjd],
          &int_cache->grad_hq[pjd], &int_cache->pOrho2q[pjd],
          &int_cache->balsaraq[pjd], &int_cache->soundspeedq[pjd],
          &int_cache->mq[pjd], v_hi_inv, &int_cache->h_invq[pjd], a_hydro_xSum,
          a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum,
          int_mask, int_mask2, 0);
463
464
465
466
467
468
469
    }

    /* Reset interaction count. */
    *icount = 0;
  }
}

470
/**
471
472
 * @brief Populates the arrays max_index_i and max_index_j with the maximum
 * indices of
James Willis's avatar
James Willis committed
473
474
475
 * particles into their neighbouring cells. Also finds the first pi that
 * interacts with any particle in cj and the last pj that interacts with any
 * particle in ci.
476
 *
James Willis's avatar
James Willis committed
477
478
479
480
481
482
 * @param ci #cell pointer to ci
 * @param cj #cell pointer to cj
 * @param sort_i #entry array for particle distance in ci
 * @param sort_j #entry array for particle distance in cj
 * @param dx_max maximum particle movement allowed in cell
 * @param rshift cutoff shift
483
484
485
486
 * @param hi_max Maximal smoothing length in cell ci
 * @param hj_max Maximal smoothing length in cell cj
 * @param di_max Maximal position on the axis that can interact in cell ci
 * @param dj_min Minimal position on the axis that can interact in cell ci
487
488
 * @param max_index_i array to hold the maximum distances of pi particles into
 * cell
James Willis's avatar
James Willis committed
489
 * cj
490
491
 * @param max_index_j array to hold the maximum distances of pj particles into
 * cell
James Willis's avatar
James Willis committed
492
 * cj
James Willis's avatar
James Willis committed
493
494
 * @param init_pi first pi to interact with a pj particle
 * @param init_pj last pj to interact with a pi particle
495
 * @param e The #engine.
James Willis's avatar
James Willis committed
496
 */
497
__attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
James Willis's avatar
James Willis committed
498
499
    const struct cell *ci, const struct cell *cj,
    const struct entry *restrict sort_i, const struct entry *restrict sort_j,
500
501
    const float dx_max, const float rshift, const double hi_max,
    const double hj_max, const double di_max, const double dj_min,
502
    int *max_index_i, int *max_index_j, int *init_pi, int *init_pj,
503
    const struct engine *e) {
504

505
506
  const struct part *restrict parts_i = ci->parts;
  const struct part *restrict parts_j = cj->parts;
507
508

  int first_pi = 0, last_pj = cj->count - 1;
509
  int temp;
510

James Willis's avatar
James Willis committed
511
512
  /* Find the leftmost active particle in cell i that interacts with any
   * particle in cell j. */
513
  first_pi = ci->count;
514
  int active_id = first_pi - 1;
515
  while (first_pi > 0 && sort_i[first_pi - 1].d + dx_max + hi_max > dj_min) {
516
    first_pi--;
517
518
519
    /* Store the index of the particle if it is active. */
    if (part_is_active(&parts_i[sort_i[first_pi].i], e)) active_id = first_pi;
  }
James Willis's avatar
James Willis committed
520

521
522
  /* Set the first active pi in range of any particle in cell j. */
  first_pi = active_id;
523

524
  /* Find the maximum index into cell j for each particle in range in cell i. */
James Willis's avatar
James Willis committed
525
  if (first_pi < ci->count) {
526

527
528
    /* Start from the first particle in cell j. */
    temp = 0;
529

530
    const struct part *pi = &parts_i[sort_i[first_pi].i];
531

532
    /* Loop through particles in cell j until they are not in range of pi. */
James Willis's avatar
James Willis committed
533
534
535
    while (temp <= cj->count &&
           (sort_i[first_pi].d + (pi->h * kernel_gamma + dx_max - rshift) >
            sort_j[temp].d))
536
      temp++;
537

538
    max_index_i[first_pi] = temp;
539

540
    /* Populate max_index_i for remaining particles that are within range. */
James Willis's avatar
James Willis committed
541
    for (int i = first_pi + 1; i < ci->count; i++) {
542
      temp = max_index_i[i - 1];
543
      pi = &parts_i[sort_i[i].i];
544

James Willis's avatar
James Willis committed
545
546
547
      while (temp <= cj->count &&
             (sort_i[i].d + (pi->h * kernel_gamma + dx_max - rshift) >
              sort_j[temp].d))
548
        temp++;
549

550
      max_index_i[i] = temp;
551
    }
James Willis's avatar
James Willis committed
552
  } else {
553
554
    /* Make sure that max index is set to first particle in cj.*/
    max_index_i[ci->count - 1] = 0;
555
556
  }

James Willis's avatar
James Willis committed
557
558
  /* Find the rightmost active particle in cell j that interacts with any
   * particle in cell i. */
559
  last_pj = -1;
560
  active_id = last_pj;
James Willis's avatar
James Willis committed
561
562
  while (last_pj < cj->count &&
         sort_j[last_pj + 1].d - hj_max - dx_max < di_max) {
563
    last_pj++;
564
    /* Store the index of the particle if it is active. */
565
    if (part_is_active(&parts_j[sort_j[last_pj].i], e)) active_id = last_pj;
566
  }
James Willis's avatar
James Willis committed
567

568
  /* Set the last active pj in range of any particle in cell i. */
569
  last_pj = active_id;
Matthieu Schaller's avatar
Matthieu Schaller committed
570

571
  /* Find the maximum index into cell i for each particle in range in cell j. */
572
  if (last_pj > 0) {
Matthieu Schaller's avatar
Matthieu Schaller committed
573

574
575
    /* Start from the last particle in cell i. */
    temp = ci->count - 1;
576

577
    const struct part *pj = &parts_j[sort_j[last_pj].i];
578

579
    /* Loop through particles in cell i until they are not in range of pj. */
James Willis's avatar
James Willis committed
580
581
582
    while (temp > 0 &&
           sort_j[last_pj].d - dx_max - (pj->h * kernel_gamma) <
               sort_i[temp].d - rshift)
583
      temp--;
584

585
    max_index_j[last_pj] = temp;
586

587
    /* Populate max_index_j for remaining particles that are within range. */
James Willis's avatar
James Willis committed
588
    for (int i = last_pj - 1; i >= 0; i--) {
589
      temp = max_index_j[i + 1];
590
      pj = &parts_j[sort_j[i].i];
591

James Willis's avatar
James Willis committed
592
593
594
      while (temp > 0 &&
             sort_j[i].d - dx_max - (pj->h * kernel_gamma) <
                 sort_i[temp].d - rshift)
595
        temp--;
596

597
598
      max_index_j[i] = temp;
    }
James Willis's avatar
James Willis committed
599
  } else {
600
    /* Make sure that max index is set to last particle in ci.*/
James Willis's avatar
James Willis committed
601
    max_index_j[0] = ci->count - 1;
602
603
  }

James Willis's avatar
James Willis committed
604
605
  *init_pi = first_pi;
  *init_pj = last_pj;
606
}
James Willis's avatar
James Willis committed
607
#endif /* WITH_VECTORIZATION */
608
609

/**
James Willis's avatar
James Willis committed
610
611
 * @brief Compute the cell self-interaction (non-symmetric) using vector
 * intrinsics with one particle pi at a time.
612
613
614
615
 *
 * @param r The #runner.
 * @param c The #cell.
 */
James Willis's avatar
James Willis committed
616
617
__attribute__((always_inline)) INLINE void runner_doself1_density_vec(
    struct runner *r, struct cell *restrict c) {
618
619

#ifdef WITH_VECTORIZATION
620
  const struct engine *e = r->e;
621
622
623
624
625
626
  struct part *restrict pi;
  int count_align;
  int num_vec_proc = NUM_VEC_PROC;

  struct part *restrict parts = c->parts;
  const int count = c->count;
James Willis's avatar
James Willis committed
627

628
629
  vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2;

James Willis's avatar
James Willis committed
630
  TIMER_TIC
631

632
633
  if (!cell_is_active(c, e)) return;

634
  if (!cell_are_part_drifted(c, e)) error("Interacting undrifted cell.");
635

James Willis's avatar
James Willis committed
636
  /* Get the particle cache from the runner and re-allocate
637
   * the cache if it is not big enough for the cell. */
638
  struct cache *restrict cell_cache = &r->ci_cache;
James Willis's avatar
James Willis committed
639
640
641

  if (cell_cache->count < count) {
    cache_init(cell_cache, count);
642
643
  }

James Willis's avatar
James Willis committed
644
  /* Read the particles from the cell and store them locally in the cache. */
James Willis's avatar
James Willis committed
645
  cache_read_particles(c, cell_cache);
646
647
648
649

  /* Create secondary cache to store particle interactions. */
  struct c2_cache int_cache;
  int icount = 0, icount_align = 0;
650
651
652
653
654
655
656
657

  /* Loop over the particles in the cell. */
  for (int pid = 0; pid < count; pid++) {

    /* Get a pointer to the ith particle. */
    pi = &parts[pid];

    /* Is the ith particle active? */
658
    if (!part_is_active(pi, e)) continue;
659
660
661
662
663

    vector pix, piy, piz;

    const float hi = cell_cache->h[pid];

James Willis's avatar
James Willis committed
664
    /* Fill particle pi vectors. */
665
666
667
668
669
670
671
672
673
674
675
    pix.v = vec_set1(cell_cache->x[pid]);
    piy.v = vec_set1(cell_cache->y[pid]);
    piz.v = vec_set1(cell_cache->z[pid]);
    v_hi.v = vec_set1(hi);
    v_vix.v = vec_set1(cell_cache->vx[pid]);
    v_viy.v = vec_set1(cell_cache->vy[pid]);
    v_viz.v = vec_set1(cell_cache->vz[pid]);

    const float hig2 = hi * hi * kernel_gamma2;
    v_hig2.v = vec_set1(hig2);

James Willis's avatar
James Willis committed
676
    /* Reset cumulative sums of update vectors. */
James Willis's avatar
James Willis committed
677
678
679
    vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
        curlvySum, curlvzSum;

James Willis's avatar
James Willis committed
680
    /* Get the inverse of hi. */
681
    vector v_hi_inv;
James Willis's avatar
James Willis committed
682

683
    v_hi_inv = vec_reciprocal(v_hi);
James Willis's avatar
James Willis committed
684

685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
    rhoSum.v = vec_setzero();
    rho_dhSum.v = vec_setzero();
    wcountSum.v = vec_setzero();
    wcount_dhSum.v = vec_setzero();
    div_vSum.v = vec_setzero();
    curlvxSum.v = vec_setzero();
    curlvySum.v = vec_setzero();
    curlvzSum.v = vec_setzero();

    /* Pad cache if there is a serial remainder. */
    count_align = count;
    int rem = count % (num_vec_proc * VEC_SIZE);
    if (rem != 0) {
      int pad = (num_vec_proc * VEC_SIZE) - rem;

      count_align += pad;
701
702
703
704
705
706
707
708

      /* Set positions to the same as particle pi so when the r2 > 0 mask is
       * applied these extra contributions are masked out.*/
      for (int i = count; i < count_align; i++) {
        cell_cache->x[i] = pix.f[0];
        cell_cache->y[i] = piy.f[0];
        cell_cache->z[i] = piz.f[0];
      }
709
710
711
712
713
    }

    vector pjx, pjy, pjz;
    vector pjx2, pjy2, pjz2;

James Willis's avatar
James Willis committed
714
715
    /* Find all of particle pi's interacions and store needed values in the
     * secondary cache.*/
716
717
718
719
720
721
    for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) {

      /* Load 2 sets of vectors from the particle cache. */
      pjx.v = vec_load(&cell_cache->x[pjd]);
      pjy.v = vec_load(&cell_cache->y[pjd]);
      pjz.v = vec_load(&cell_cache->z[pjd]);
722

723
724
725
      pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]);
      pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]);
      pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]);
James Willis's avatar
James Willis committed
726

727
728
729
730
      /* Compute the pairwise distance. */
      vector v_dx_tmp, v_dy_tmp, v_dz_tmp;
      vector v_dx_tmp2, v_dy_tmp2, v_dz_tmp2, v_r2_2;

James Willis's avatar
James Willis committed
731
732
      v_dx_tmp.v = vec_sub(pix.v, pjx.v);
      v_dx_tmp2.v = vec_sub(pix.v, pjx2.v);
733
      v_dy_tmp.v = vec_sub(piy.v, pjy.v);
James Willis's avatar
James Willis committed
734
      v_dy_tmp2.v = vec_sub(piy.v, pjy2.v);
735
      v_dz_tmp.v = vec_sub(piz.v, pjz.v);
James Willis's avatar
James Willis committed
736
737
738
739
      v_dz_tmp2.v = vec_sub(piz.v, pjz2.v);

      v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v);
      v_r2_2.v = vec_mul(v_dx_tmp2.v, v_dx_tmp2.v);
740
      v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v);
James Willis's avatar
James Willis committed
741
      v_r2_2.v = vec_fma(v_dy_tmp2.v, v_dy_tmp2.v, v_r2_2.v);
742
      v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v);
James Willis's avatar
James Willis committed
743
744
      v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v);

745
      /* Form a mask from r2 < hig2 and r2 > 0.*/
James Willis's avatar
James Willis committed
746
747
      mask_t v_doi_mask, v_doi_mask_self_check, v_doi_mask2,
          v_doi_mask2_self_check;
748
      int doi_mask, doi_mask_self_check, doi_mask2, doi_mask2_self_check;
749

James Willis's avatar
James Willis committed
750
      /* Form r2 > 0 mask and r2 < hig2 mask. */
751
      vec_create_mask(v_doi_mask_self_check, vec_cmp_gt(v_r2.v, vec_setzero()));
752
      vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v));
753

James Willis's avatar
James Willis committed
754
      /* Form r2 > 0 mask and r2 < hig2 mask. */
James Willis's avatar
James Willis committed
755
756
      vec_create_mask(v_doi_mask2_self_check,
                      vec_cmp_gt(v_r2_2.v, vec_setzero()));
757
      vec_create_mask(v_doi_mask2, vec_cmp_lt(v_r2_2.v, v_hig2.v));
758

759
760
761
762
763
764
      /* Form integer masks. */
      doi_mask_self_check = vec_form_int_mask(v_doi_mask_self_check);
      doi_mask = vec_form_int_mask(v_doi_mask);

      doi_mask2_self_check = vec_form_int_mask(v_doi_mask2_self_check);
      doi_mask2 = vec_form_int_mask(v_doi_mask2);
James Willis's avatar
James Willis committed
765

766
767
768
      /* Combine the two masks. */
      doi_mask = doi_mask & doi_mask_self_check;
      doi_mask2 = doi_mask2 & doi_mask2_self_check;
769

James Willis's avatar
James Willis committed
770
771
      /* If there are any interactions left pack interaction values into c2
       * cache. */
772
      if (doi_mask) {
James Willis's avatar
James Willis committed
773
        storeInteractions(doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp,
James Willis's avatar
James Willis committed
774
775
776
777
778
779
780
781
                          cell_cache, &int_cache, &icount, &rhoSum, &rho_dhSum,
                          &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
                          &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy,
                          v_viz);
      }
      if (doi_mask2) {
        storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp2,
                          &v_dy_tmp2, &v_dz_tmp2, cell_cache, &int_cache,
James Willis's avatar
James Willis committed
782
783
784
                          &icount, &rhoSum, &rho_dhSum, &wcountSum,
                          &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
                          &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz);
785
786
787
      }
    }

James Willis's avatar
James Willis committed
788
    /* Perform padded vector remainder interactions if any are present. */
Matthieu Schaller's avatar
Matthieu Schaller committed
789
790
791
    calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum, &wcountSum,
                        &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
                        &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz,
James Willis's avatar
James Willis committed
792
793
794
795
                        &icount_align);

    /* Initialise masks to true in case remainder interactions have been
     * performed. */
796
    mask_t int_mask, int_mask2;
797
798
    vec_init_mask_true(int_mask);
    vec_init_mask_true(int_mask2);
799
800

    /* Perform interaction with 2 vectors. */
James Willis's avatar
James Willis committed
801
802
803
804
805
806
    for (int pjd = 0; pjd < icount_align; pjd += (num_vec_proc * VEC_SIZE)) {
      runner_iact_nonsym_2_vec_density(
          &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd],
          &int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz,
          &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd],
          &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
James Willis's avatar
James Willis committed
807
808
          &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2,
          0);
809
810
    }

James Willis's avatar
James Willis committed
811
812
813
814
815
816
817
818
819
820
    /* Perform horizontal adds on vector sums and store result in particle pi.
     */
    VEC_HADD(rhoSum, pi->rho);
    VEC_HADD(rho_dhSum, pi->density.rho_dh);
    VEC_HADD(wcountSum, pi->density.wcount);
    VEC_HADD(wcount_dhSum, pi->density.wcount_dh);
    VEC_HADD(div_vSum, pi->density.div_v);
    VEC_HADD(curlvxSum, pi->density.rot_v[0]);
    VEC_HADD(curlvySum, pi->density.rot_v[1]);
    VEC_HADD(curlvzSum, pi->density.rot_v[2]);
821
822
823
824
825

    /* Reset interaction count. */
    icount = 0;
  } /* loop over all particles. */

James Willis's avatar
James Willis committed
826
  TIMER_TOC(timer_doself_density);
827
#endif /* WITH_VECTORIZATION */
828
829
}

830
831
832
833
834
835
836
837
838
839
840
841
842
843
/**
 * @brief Compute the cell self-interaction (non-symmetric) using vector
 * intrinsics with one particle pi at a time.
 *
 * @param r The #runner.
 * @param c The #cell.
 */
__attribute__((always_inline)) INLINE void runner_doself2_force_vec(
    struct runner *r, struct cell *restrict c) {

#ifdef WITH_VECTORIZATION
  const struct engine *e = r->e;
  struct part *restrict pi;
  int count_align;
James Willis's avatar
James Willis committed
844
  const int num_vec_proc = 1;
845
846
847
848
849
850
851

  struct part *restrict parts = c->parts;
  const int count = c->count;

  vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2;
  vector v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci;

852
  TIMER_TIC
853
854
855

  if (!cell_is_active(c, e)) return;

856
  if (!cell_are_part_drifted(c, e)) error("Interacting undrifted cell.");
857
858
859
860
861
862
863
864
865
866

  /* Get the particle cache from the runner and re-allocate
   * the cache if it is not big enough for the cell. */
  struct cache *restrict cell_cache = &r->ci_cache;

  if (cell_cache->count < count) {
    cache_init(cell_cache, count);
  }

  /* Read the particles from the cell and store them locally in the cache. */
867
  cache_read_force_particles(c, cell_cache);
868

James Willis's avatar
James Willis committed
869
#ifdef SWIFT_DEBUG_CHECKS
James Willis's avatar
James Willis committed
870
  for (int i = 0; i < count; i++) {
James Willis's avatar
James Willis committed
871
872
873
874
875
    pi = &c->parts[i];
    /* Check that particles have been drifted to the current time */
    if (pi->ti_drift != e->ti_current)
      error("Particle pi not drifted to current time");
  }
James Willis's avatar
James Willis committed
876
}
James Willis's avatar
James Willis committed
877
878
#endif

James Willis's avatar
James Willis committed
879
880
881
/* Create secondary cache to store particle interactions. */
struct c2_cache int_cache;
int icount = 0, icount_align = 0;
882

James Willis's avatar
James Willis committed
883
884
/* Loop over the particles in the cell. */
for (int pid = 0; pid < count; pid++) {
885

James Willis's avatar
James Willis committed
886
887
  /* Get a pointer to the ith particle. */
  pi = &parts[pid];
888

James Willis's avatar
James Willis committed
889
890
  /* Is the ith particle active? */
  if (!part_is_active(pi, e)) continue;
891

James Willis's avatar
James Willis committed
892
  vector pix, piy, piz;
893

James Willis's avatar
James Willis committed
894
  const float hi = cell_cache->h[pid];
895

James Willis's avatar
James Willis committed
896
897
898
899
900
901
902
903
  /* Fill particle pi vectors. */
  pix.v = vec_set1(cell_cache->x[pid]);
  piy.v = vec_set1(cell_cache->y[pid]);
  piz.v = vec_set1(cell_cache->z[pid]);
  v_hi.v = vec_set1(hi);
  v_vix.v = vec_set1(cell_cache->vx[pid]);
  v_viy.v = vec_set1(cell_cache->vy[pid]);
  v_viz.v = vec_set1(cell_cache->vz[pid]);
904

James Willis's avatar
James Willis committed
905
906
907
908
909
  v_rhoi.v = vec_set1(cell_cache->rho[pid]);
  v_grad_hi.v = vec_set1(cell_cache->grad_h[pid]);
  v_pOrhoi2.v = vec_set1(cell_cache->pOrho2[pid]);
  v_balsara_i.v = vec_set1(cell_cache->balsara[pid]);
  v_ci.v = vec_set1(cell_cache->soundspeed[pid]);
910

James Willis's avatar
James Willis committed
911
912
  const float hig2 = hi * hi * kernel_gamma2;
  v_hig2.v = vec_set1(hig2);
913

James Willis's avatar
James Willis committed
914
915
916
  /* Reset cumulative sums of update vectors. */
  vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum,
      entropy_dtSum;
917

James Willis's avatar
James Willis committed
918
919
  /* Get the inverse of hi. */
  vector v_hi_inv;
920

James Willis's avatar
James Willis committed
921
  v_hi_inv = vec_reciprocal(v_hi);
922

James Willis's avatar
James Willis committed
923
924
925
926
927
928
  a_hydro_xSum.v = vec_setzero();
  a_hydro_ySum.v = vec_setzero();
  a_hydro_zSum.v = vec_setzero();
  h_dtSum.v = vec_setzero();
  v_sigSum.v = vec_set1(pi->force.v_sig);
  entropy_dtSum.v = vec_setzero();
929

James Willis's avatar
James Willis committed
930
931
932
933
934
  /* Pad cache if there is a serial remainder. */
  count_align = count;
  int rem = count % (num_vec_proc * VEC_SIZE);
  if (rem != 0) {
    int pad = (num_vec_proc * VEC_SIZE) - rem;
935

James Willis's avatar
James Willis committed
936
937
938
939
940
941
942
943
944
    count_align += pad;

    /* Set positions to the same as particle pi so when the r2 > 0 mask is
     * applied these extra contributions are masked out.*/
    for (int i = count; i < count_align; i++) {
      cell_cache->x[i] = pix.f[0];
      cell_cache->y[i] = piy.f[0];
      cell_cache->z[i] = piz.f[0];
      cell_cache->h[i] = 1.f;
945
    }
James Willis's avatar
James Willis committed
946
  }
947

James Willis's avatar
James Willis committed
948
  vector pjx, pjy, pjz, hj, hjg2;
949

James Willis's avatar
James Willis committed
950
951
952
  /* Find all of particle pi's interacions and store needed values in the
   * secondary cache.*/
  for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) {
953

James Willis's avatar
James Willis committed
954
955
956
957
958
959
    /* Load 2 sets of vectors from the particle cache. */
    pjx.v = vec_load(&cell_cache->x[pjd]);
    pjy.v = vec_load(&cell_cache->y[pjd]);
    pjz.v = vec_load(&cell_cache->z[pjd]);
    hj.v = vec_load(&cell_cache->h[pjd]);
    hjg2.v = vec_mul(vec_mul(hj.v, hj.v), kernel_gamma2_vec.v);
960

James Willis's avatar
James Willis committed
961
962
    /* Compute the pairwise distance. */
    vector v_dx_tmp, v_dy_tmp, v_dz_tmp;
963

James Willis's avatar
James Willis committed
964
965
966
    v_dx_tmp.v = vec_sub(pix.v, pjx.v);
    v_dy_tmp.v = vec_sub(piy.v, pjy.v);
    v_dz_tmp.v = vec_sub(piz.v, pjz.v);
967

James Willis's avatar
James Willis committed
968
969
970
    v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v);
    v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v);
    v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v);
971

James Willis's avatar
James Willis committed
972
973
974
    /* Form r2 > 0 mask, r2 < hig2 mask and r2 < hjg2 mask. */
    mask_t v_doi_mask, v_doi_mask_self_check;
    int doi_mask, doi_mask_self_check;
975

James Willis's avatar
James Willis committed
976
977
    /* Form r2 > 0 mask.*/
    vec_create_mask(v_doi_mask_self_check, vec_cmp_gt(v_r2.v, vec_setzero()));
978

James Willis's avatar
James Willis committed
979
980
981
982
    /* Form a mask from r2 < hig2 mask and r2 < hjg2 mask. */
    vector v_h2;
    v_h2.v = vec_fmax(v_hig2.v, hjg2.v);
    vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_h2.v));
983

James Willis's avatar
James Willis committed
984
985
986
    /* Form integer masks. */
    doi_mask_self_check = vec_form_int_mask(v_doi_mask_self_check);
    doi_mask = vec_form_int_mask(v_doi_mask);
987

James Willis's avatar
James Willis committed
988
989
    /* Combine all 3 masks. */
    doi_mask = doi_mask & doi_mask_self_check;
990

James Willis's avatar
James Willis committed
991
992
993
    /* If there are any interactions left pack interaction values into c2
     * cache. */
    if (doi_mask) {
994

James Willis's avatar
James Willis committed
995
996
997
      storeForceInteractions(
          doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp, cell_cache,
          &int_cache, &icount, &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum,
998
          &h_dtSum, &v_sigSum, &entropy_dtSum, v_hi_inv, v_vix, v_viy,
999
          v_viz, v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci, 2);
James Willis's avatar
James Willis committed
1000
    }
1001

James Willis's avatar
James Willis committed
1002
  } /* Loop over all other particles. */
1003

James Willis's avatar
James Willis committed
1004
1005
1006
  /* Perform padded vector remainder interactions if any are present. */
  calcRemForceInteractions(
      &int_cache, icount, &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, &h_dtSum,
1007
1008
      &v_sigSum, &entropy_dtSum, v_hi_inv, v_vix, v_viy, v_viz, v_rhoi,
      v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci, &icount_align, 2);
1009

James Willis's avatar
James Willis committed
1010
1011
1012
  /* Initialise masks to true in case remainder interactions have been
   * performed. */
  mask_t int_mask, int_mask2;
1013
1014
  vec_init_mask_true(int_mask);
  vec_init_mask_true(int_mask2);
1015

James Willis's avatar
James Willis committed
1016
1017
1018
1019
  /* Perform interaction with 2 vectors. */
  for (int pjd = 0; pjd < icount_align; pjd += (2 * VEC_SIZE)) {
    runner_iact_nonsym_2_vec_force(
        &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd],
1020
1021
        &int_cache.dzq[pjd], v_vix, v_viy, v_viz, v_rhoi, v_grad_hi,
        v_pOrhoi2, v_balsara_i, v_ci, &int_cache.vxq[pjd],
James Willis's avatar
James Willis committed
1022
1023
1024
        &int_cache.vyq[pjd], &int_cache.vzq[pjd], &int_cache