runner_doiact_vec.c 53.4 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/*******************************************************************************
 * This file is part of SWIFT.
 * Copyright (c) 2016 James Willis (james.s.willis@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/

/* Config parameters. */
#include "../config.h"

/* This object's header. */
#include "runner_doiact_vec.h"

26
27
28
/* Local headers. */
#include "active.h"

29
30
31
#ifdef WITH_VECTORIZATION
static const vector kernel_gamma2_vec = FILL_VEC(kernel_gamma2);

James Willis's avatar
James Willis committed
32
33
34
/**
 * @brief Compute the vector remainder interactions from the secondary cache.
 *
Matthieu Schaller's avatar
Matthieu Schaller committed
35
 * @param int_cache (return) secondary #cache of interactions between two
James Willis's avatar
James Willis committed
36
 * particles.
James Willis's avatar
James Willis committed
37
 * @param icount Interaction count.
Matthieu Schaller's avatar
Matthieu Schaller committed
38
 * @param rhoSum (return) #vector holding the cumulative sum of the density
James Willis's avatar
James Willis committed
39
 * update on pi.
Matthieu Schaller's avatar
Matthieu Schaller committed
40
 * @param rho_dhSum (return) #vector holding the cumulative sum of the density
James Willis's avatar
James Willis committed
41
 * gradient update on pi.
Matthieu Schaller's avatar
Matthieu Schaller committed
42
 * @param wcountSum (return) #vector holding the cumulative sum of the wcount
James Willis's avatar
James Willis committed
43
 * update on pi.
Matthieu Schaller's avatar
Matthieu Schaller committed
44
 * @param wcount_dhSum (return) #vector holding the cumulative sum of the wcount
James Willis's avatar
James Willis committed
45
 * gradient update on pi.
Matthieu Schaller's avatar
Matthieu Schaller committed
46
 * @param div_vSum (return) #vector holding the cumulative sum of the divergence
James Willis's avatar
James Willis committed
47
 * update on pi.
Matthieu Schaller's avatar
Matthieu Schaller committed
48
 * @param curlvxSum (return) #vector holding the cumulative sum of the curl of
James Willis's avatar
James Willis committed
49
 * vx update on pi.
Matthieu Schaller's avatar
Matthieu Schaller committed
50
 * @param curlvySum (return) #vector holding the cumulative sum of the curl of
James Willis's avatar
James Willis committed
51
 * vy update on pi.
Matthieu Schaller's avatar
Matthieu Schaller committed
52
 * @param curlvzSum (return) #vector holding the cumulative sum of the curl of
James Willis's avatar
James Willis committed
53
 * vz update on pi.
James Willis's avatar
James Willis committed
54
55
56
57
 * @param v_hi_inv #vector of 1/h for pi.
 * @param v_vix #vector of x velocity of pi.
 * @param v_viy #vector of y velocity of pi.
 * @param v_viz #vector of z velocity of pi.
Matthieu Schaller's avatar
Matthieu Schaller committed
58
 * @param icount_align (return) Interaction count after the remainder
James Willis's avatar
James Willis committed
59
 * interactions have been performed, should be a multiple of the vector length.
James Willis's avatar
James Willis committed
60
 */
James Willis's avatar
James Willis committed
61
__attribute__((always_inline)) INLINE static void calcRemInteractions(
Matthieu Schaller's avatar
Matthieu Schaller committed
62
63
64
65
66
    struct c2_cache *const int_cache, const int icount, vector *rhoSum,
    vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
    vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
    vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz,
    int *icount_align) {
67

68
  mask_t int_mask, int_mask2;
James Willis's avatar
James Willis committed
69
70

  /* Work out the number of remainder interactions and pad secondary cache. */
71
72
73
74
75
76
  *icount_align = icount;
  int rem = icount % (NUM_VEC_PROC * VEC_SIZE);
  if (rem != 0) {
    int pad = (NUM_VEC_PROC * VEC_SIZE) - rem;
    *icount_align += pad;

77
    /* Initialise masks to true. */
78
79
    vec_init_mask_true(int_mask);
    vec_init_mask_true(int_mask2);
80

James Willis's avatar
James Willis committed
81
82
83
    /* Pad secondary cache so that there are no contributions in the interaction
     * function. */
    for (int i = icount; i < *icount_align; i++) {
84
85
86
87
88
89
90
91
      int_cache->mq[i] = 0.f;
      int_cache->r2q[i] = 1.f;
      int_cache->dxq[i] = 0.f;
      int_cache->dyq[i] = 0.f;
      int_cache->dzq[i] = 0.f;
      int_cache->vxq[i] = 0.f;
      int_cache->vyq[i] = 0.f;
      int_cache->vzq[i] = 0.f;
92
93
94
95
    }

    /* Zero parts of mask that represent the padded values.*/
    if (pad < VEC_SIZE) {
James Willis's avatar
James Willis committed
96
      vec_pad_mask(int_mask2, pad);
James Willis's avatar
James Willis committed
97
    } else {
James Willis's avatar
James Willis committed
98
      vec_pad_mask(int_mask, VEC_SIZE - rem);
99
      vec_zero_mask(int_mask2);
100
101
    }

James Willis's avatar
James Willis committed
102
103
    /* Perform remainder interaction and remove remainder from aligned
     * interaction count. */
104
    *icount_align = icount - rem;
James Willis's avatar
James Willis committed
105
106
107
108
109
110
    runner_iact_nonsym_2_vec_density(
        &int_cache->r2q[*icount_align], &int_cache->dxq[*icount_align],
        &int_cache->dyq[*icount_align], &int_cache->dzq[*icount_align],
        v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[*icount_align],
        &int_cache->vyq[*icount_align], &int_cache->vzq[*icount_align],
        &int_cache->mq[*icount_align], rhoSum, rho_dhSum, wcountSum,
James Willis's avatar
James Willis committed
111
112
        wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask,
        int_mask2, 1);
113
114
115
  }
}

James Willis's avatar
James Willis committed
116
/**
James Willis's avatar
James Willis committed
117
118
 * @brief Left-packs the values needed by an interaction into the secondary
 * cache (Supports AVX, AVX2 and AVX512 instruction sets).
James Willis's avatar
James Willis committed
119
120
 *
 * @param mask Contains which particles need to interact.
Matthieu Schaller's avatar
Matthieu Schaller committed
121
 * @param pjd Index of the particle to store into.
James Willis's avatar
James Willis committed
122
123
124
125
126
 * @param v_r2 #vector of the separation between two particles squared.
 * @param v_dx #vector of the x separation between two particles.
 * @param v_dy #vector of the y separation between two particles.
 * @param v_dz #vector of the z separation between two particles.
 * @param cell_cache #cache of all particles in the cell.
Matthieu Schaller's avatar
Matthieu Schaller committed
127
 * @param int_cache (return) secondary #cache of interactions between two
James Willis's avatar
James Willis committed
128
 * particles.
James Willis's avatar
James Willis committed
129
130
 * @param icount Interaction count.
 * @param rhoSum #vector holding the cumulative sum of the density update on pi.
James Willis's avatar
James Willis committed
131
132
133
134
135
136
137
138
139
140
141
142
143
144
 * @param rho_dhSum #vector holding the cumulative sum of the density gradient
 * update on pi.
 * @param wcountSum #vector holding the cumulative sum of the wcount update on
 * pi.
 * @param wcount_dhSum #vector holding the cumulative sum of the wcount gradient
 * update on pi.
 * @param div_vSum #vector holding the cumulative sum of the divergence update
 * on pi.
 * @param curlvxSum #vector holding the cumulative sum of the curl of vx update
 * on pi.
 * @param curlvySum #vector holding the cumulative sum of the curl of vy update
 * on pi.
 * @param curlvzSum #vector holding the cumulative sum of the curl of vz update
 * on pi.
James Willis's avatar
James Willis committed
145
146
147
148
149
 * @param v_hi_inv #vector of 1/h for pi.
 * @param v_vix #vector of x velocity of pi.
 * @param v_viy #vector of y velocity of pi.
 * @param v_viz #vector of z velocity of pi.
 */
James Willis's avatar
James Willis committed
150
__attribute__((always_inline)) INLINE static void storeInteractions(
151
    const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy,
James Willis's avatar
James Willis committed
152
153
154
155
156
    vector *v_dz, const struct cache *const cell_cache,
    struct c2_cache *const int_cache, int *icount, vector *rhoSum,
    vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
    vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
    vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz) {
James Willis's avatar
James Willis committed
157
158
159

/* Left-pack values needed into the secondary cache using the interaction mask.
 */
160
#if defined(HAVE_AVX2) || defined(HAVE_AVX512_F)
161
162
163
164
165
166
167
  mask_t packed_mask;
  VEC_FORM_PACKED_MASK(mask, packed_mask);

  VEC_LEFT_PACK(v_r2->v, packed_mask, &int_cache->r2q[*icount]);
  VEC_LEFT_PACK(v_dx->v, packed_mask, &int_cache->dxq[*icount]);
  VEC_LEFT_PACK(v_dy->v, packed_mask, &int_cache->dyq[*icount]);
  VEC_LEFT_PACK(v_dz->v, packed_mask, &int_cache->dzq[*icount]);
James Willis's avatar
James Willis committed
168
169
170
171
172
173
174
175
  VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), packed_mask,
                &int_cache->mq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), packed_mask,
                &int_cache->vxq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), packed_mask,
                &int_cache->vyq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), packed_mask,
                &int_cache->vzq[*icount]);
176
177
178

  /* Increment interaction count by number of bits set in mask. */
  (*icount) += __builtin_popcount(mask);
179
#else
James Willis's avatar
James Willis committed
180
  /* Quicker to do it serially in AVX rather than use intrinsics. */
James Willis's avatar
James Willis committed
181
  for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) {
182
183
    if (mask & (1 << bit_index)) {
      /* Add this interaction to the queue. */
184
185
186
187
188
189
190
191
      int_cache->r2q[*icount] = v_r2->f[bit_index];
      int_cache->dxq[*icount] = v_dx->f[bit_index];
      int_cache->dyq[*icount] = v_dy->f[bit_index];
      int_cache->dzq[*icount] = v_dz->f[bit_index];
      int_cache->mq[*icount] = cell_cache->m[pjd + bit_index];
      int_cache->vxq[*icount] = cell_cache->vx[pjd + bit_index];
      int_cache->vyq[*icount] = cell_cache->vy[pjd + bit_index];
      int_cache->vzq[*icount] = cell_cache->vz[pjd + bit_index];
192
193
194
195

      (*icount)++;
    }
  }
196

James Willis's avatar
James Willis committed
197
198
#endif /* defined(HAVE_AVX2) || defined(HAVE_AVX512_F) */

James Willis's avatar
James Willis committed
199
  /* Flush the c2 cache if it has reached capacity. */
James Willis's avatar
James Willis committed
200
  if (*icount >= (C2_CACHE_SIZE - (NUM_VEC_PROC * VEC_SIZE))) {
201
202

    int icount_align = *icount;
James Willis's avatar
James Willis committed
203

James Willis's avatar
James Willis committed
204
    /* Peform remainder interactions. */
Matthieu Schaller's avatar
Matthieu Schaller committed
205
206
207
    calcRemInteractions(int_cache, *icount, rhoSum, rho_dhSum, wcountSum,
                        wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum,
                        v_hi_inv, v_vix, v_viy, v_viz, &icount_align);
208

209
    mask_t int_mask, int_mask2;
210
211
    vec_init_mask_true(int_mask);
    vec_init_mask_true(int_mask2);
James Willis's avatar
James Willis committed
212
213

    /* Perform interactions. */
James Willis's avatar
James Willis committed
214
215
216
217
218
219
    for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
      runner_iact_nonsym_2_vec_density(
          &int_cache->r2q[pjd], &int_cache->dxq[pjd], &int_cache->dyq[pjd],
          &int_cache->dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz,
          &int_cache->vxq[pjd], &int_cache->vyq[pjd], &int_cache->vzq[pjd],
          &int_cache->mq[pjd], rhoSum, rho_dhSum, wcountSum, wcount_dhSum,
220
          div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, int_mask2, 0);
221
    }
James Willis's avatar
James Willis committed
222
223

    /* Reset interaction count. */
224
225
226
    *icount = 0;
  }
}
227

228
/**
James Willis's avatar
James Willis committed
229
 * @brief Compute the vector remainder force interactions from the secondary cache.
230
231
232
233
 *
 * @param int_cache (return) secondary #cache of interactions between two
 * particles.
 * @param icount Interaction count.
James Willis's avatar
James Willis committed
234
 * @param a_hydro_xSum (return) #vector holding the cumulative sum of the x acceleration
235
 * update on pi.
James Willis's avatar
James Willis committed
236
 * @param a_hydro_ySum (return) #vector holding the cumulative sum of the y acceleration
237
 * update on pi.
James Willis's avatar
James Willis committed
238
239
240
241
242
 * @param a_hydro_zSum (return) #vector holding the cumulative sum of the z acceleration
 * update on pi.
 * @param h_dtSum (return) #vector holding the cumulative sum of the time derivative of the smoothing length update on pi.
 * @param v_sigSum (return) #vector holding the maximum of the signal velocity update on pi.
 * @param entropy_dtSum (return) #vector holding the cumulative sum of the time derivative of the entropy
243
244
245
246
247
 * update on pi.
 * @param v_hi_inv #vector of 1/h for pi.
 * @param v_vix #vector of x velocity of pi.
 * @param v_viy #vector of y velocity of pi.
 * @param v_viz #vector of z velocity of pi.
James Willis's avatar
James Willis committed
248
249
250
251
252
 * @param v_rhoi #vector of density of pi.
 * @param v_grad_hi #vector of smoothing length gradient of pi.
 * @param v_pOrhoi2 #vector of pressure over density squared of pi.
 * @param v_balsara_i #vector of balsara switch of pi.
 * @param v_ci #vector of sound speed of pi.
253
 * @param icount_align (return) Interaction count after the remainder
James Willis's avatar
James Willis committed
254
 * @param num_vec_proc #int of the number of vectors to use to perform interaction.
255
256
257
258
259
 * interactions have been performed, should be a multiple of the vector length.
 */
__attribute__((always_inline)) INLINE static void calcRemForceInteractions(
    struct c2_cache *const int_cache, const int icount, vector *a_hydro_xSum,
    vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum,
260
261
262
    vector *v_sigSum, vector *entropy_dtSum, vector v_hi_inv, vector v_vix,
    vector v_viy, vector v_viz, vector v_rhoi, vector v_grad_hi,
    vector v_pOrhoi2, vector v_balsara_i, vector v_ci, int *icount_align,
James Willis's avatar
James Willis committed
263
    int num_vec_proc) {
264

265
  mask_t int_mask, int_mask2;
266
267
268

  /* Work out the number of remainder interactions and pad secondary cache. */
  *icount_align = icount;
269
  int rem = icount % (num_vec_proc * VEC_SIZE);
270
  if (rem != 0) {
271
    int pad = (num_vec_proc * VEC_SIZE) - rem;
272
273
    *icount_align += pad;

274
    /* Initialise masks to true. */
275
276
    vec_init_mask_true(int_mask);
    vec_init_mask_true(int_mask2);
277

278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
    /* Pad secondary cache so that there are no contributions in the interaction
     * function. */
    for (int i = icount; i < *icount_align; i++) {
      int_cache->mq[i] = 0.f;
      int_cache->r2q[i] = 1.f;
      int_cache->dxq[i] = 0.f;
      int_cache->dyq[i] = 0.f;
      int_cache->dzq[i] = 0.f;
      int_cache->vxq[i] = 0.f;
      int_cache->vyq[i] = 0.f;
      int_cache->vzq[i] = 0.f;
      int_cache->rhoq[i] = 1.f;
      int_cache->grad_hq[i] = 1.f;
      int_cache->pOrho2q[i] = 1.f;
      int_cache->balsaraq[i] = 1.f;
      int_cache->soundspeedq[i] = 1.f;
      int_cache->h_invq[i] = 1.f;
    }

    /* Zero parts of mask that represent the padded values.*/
    if (pad < VEC_SIZE) {
James Willis's avatar
James Willis committed
299
      vec_pad_mask(int_mask2, pad);
300
    } else {
James Willis's avatar
James Willis committed
301
      vec_pad_mask(int_mask, VEC_SIZE - rem);
302
      vec_zero_mask(int_mask2);
303
304
305
306
307
    }

    /* Perform remainder interaction and remove remainder from aligned
     * interaction count. */
    *icount_align = icount - rem;
308
    runner_iact_nonsym_2_vec_force(
James Willis's avatar
James Willis committed
309
310
311
312
313
314
315
316
317
318
        &int_cache->r2q[*icount_align], &int_cache->dxq[*icount_align],
        &int_cache->dyq[*icount_align], &int_cache->dzq[*icount_align], v_vix,
        v_viy, v_viz, v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci,
        &int_cache->vxq[*icount_align], &int_cache->vyq[*icount_align],
        &int_cache->vzq[*icount_align], &int_cache->rhoq[*icount_align],
        &int_cache->grad_hq[*icount_align], &int_cache->pOrho2q[*icount_align],
        &int_cache->balsaraq[*icount_align],
        &int_cache->soundspeedq[*icount_align], &int_cache->mq[*icount_align],
        v_hi_inv, &int_cache->h_invq[*icount_align], a_hydro_xSum, a_hydro_ySum,
        a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum, int_mask, int_mask2, 1);
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
  }
}

/**
 * @brief Left-packs the values needed by an interaction into the secondary
 * cache (Supports AVX, AVX2 and AVX512 instruction sets).
 *
 * @param mask Contains which particles need to interact.
 * @param pjd Index of the particle to store into.
 * @param v_r2 #vector of the separation between two particles squared.
 * @param v_dx #vector of the x separation between two particles.
 * @param v_dy #vector of the y separation between two particles.
 * @param v_dz #vector of the z separation between two particles.
 * @param cell_cache #cache of all particles in the cell.
 * @param int_cache (return) secondary #cache of interactions between two
 * particles.
 * @param icount Interaction count.
James Willis's avatar
James Willis committed
336
 * @param a_hydro_xSum (return) #vector holding the cumulative sum of the x acceleration
337
 * update on pi.
James Willis's avatar
James Willis committed
338
339
340
341
342
343
344
 * @param a_hydro_ySum (return) #vector holding the cumulative sum of the y acceleration
 * update on pi.
 * @param a_hydro_zSum (return) #vector holding the cumulative sum of the z acceleration
 * update on pi.
 * @param h_dtSum (return) #vector holding the cumulative sum of the time derivative of the smoothing length update on pi.
 * @param v_sigSum (return) #vector holding the maximum of the signal velocity update on pi.
 * @param entropy_dtSum (return) #vector holding the cumulative sum of the time derivative of the entropy
345
346
347
348
349
 * update on pi.
 * @param v_hi_inv #vector of 1/h for pi.
 * @param v_vix #vector of x velocity of pi.
 * @param v_viy #vector of y velocity of pi.
 * @param v_viz #vector of z velocity of pi.
James Willis's avatar
James Willis committed
350
351
352
353
354
355
356
 * @param v_rhoi #vector of density of pi.
 * @param v_grad_hi #vector of smoothing length gradient of pi.
 * @param v_pOrhoi2 #vector of pressure over density squared of pi.
 * @param v_balsara_i #vector of balsara switch of pi.
 * @param v_ci #vector of sound speed of pi.
 * @param num_vec_proc #int of the number of vectors to use to perform interaction.
 * interactions have been performed, should be a multiple of the vector length.
357
358
359
 */
__attribute__((always_inline)) INLINE static void storeForceInteractions(
    const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy,
James Willis's avatar
James Willis committed
360
361
362
    vector *v_dz, const struct cache *const cell_cache,
    struct c2_cache *const int_cache, int *icount, vector *a_hydro_xSum,
    vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum,
363
364
    vector *v_sigSum, vector *entropy_dtSum, vector v_hi_inv, vector v_vix,
    vector v_viy, vector v_viz, vector v_rhoi, vector v_grad_hi,
365
    vector v_pOrhoi2, vector v_balsara_i, vector v_ci, int num_vec_proc) {
366
367
368
369

/* Left-pack values needed into the secondary cache using the interaction mask.
 */
#if defined(HAVE_AVX2) || defined(HAVE_AVX512_F)
370
371
  /* Invert hj. */
  vector v_hj, v_hj_inv;
372
  v_hj.v = vec_load(&cell_cache->h[pjd]);
373
374
  v_hj_inv = vec_reciprocal(v_hj);

375
376
  mask_t packed_mask;
  VEC_FORM_PACKED_MASK(mask, packed_mask);
James Willis's avatar
James Willis committed
377

378
379
380
381
  VEC_LEFT_PACK(v_r2->v, packed_mask, &int_cache->r2q[*icount]);
  VEC_LEFT_PACK(v_dx->v, packed_mask, &int_cache->dxq[*icount]);
  VEC_LEFT_PACK(v_dy->v, packed_mask, &int_cache->dyq[*icount]);
  VEC_LEFT_PACK(v_dz->v, packed_mask, &int_cache->dzq[*icount]);
James Willis's avatar
James Willis committed
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
  VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), packed_mask,
                &int_cache->mq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), packed_mask,
                &int_cache->vxq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), packed_mask,
                &int_cache->vyq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), packed_mask,
                &int_cache->vzq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->rho[pjd]), packed_mask,
                &int_cache->rhoq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->grad_h[pjd]), packed_mask,
                &int_cache->grad_hq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->pOrho2[pjd]), packed_mask,
                &int_cache->pOrho2q[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->balsara[pjd]), packed_mask,
                &int_cache->balsaraq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->soundspeed[pjd]), packed_mask,
                &int_cache->soundspeedq[*icount]);
400
  VEC_LEFT_PACK(v_hj_inv.v, packed_mask, &int_cache->h_invq[*icount]);
James Willis's avatar
James Willis committed
401

402
403
  /* Increment interaction count by number of bits set in mask. */
  (*icount) += __builtin_popcount(mask);
404
405
406
407
408
409
410
411
412
413
414
415
416
#else
  /* Quicker to do it serially in AVX rather than use intrinsics. */
  for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) {
    if (mask & (1 << bit_index)) {
      /* Add this interaction to the queue. */
      int_cache->r2q[*icount] = v_r2->f[bit_index];
      int_cache->dxq[*icount] = v_dx->f[bit_index];
      int_cache->dyq[*icount] = v_dy->f[bit_index];
      int_cache->dzq[*icount] = v_dz->f[bit_index];
      int_cache->mq[*icount] = cell_cache->m[pjd + bit_index];
      int_cache->vxq[*icount] = cell_cache->vx[pjd + bit_index];
      int_cache->vyq[*icount] = cell_cache->vy[pjd + bit_index];
      int_cache->vzq[*icount] = cell_cache->vz[pjd + bit_index];
James Willis's avatar
James Willis committed
417

418
419
420
421
422
      int_cache->rhoq[*icount] = cell_cache->rho[pjd + bit_index];
      int_cache->grad_hq[*icount] = cell_cache->grad_h[pjd + bit_index];
      int_cache->pOrho2q[*icount] = cell_cache->pOrho2[pjd + bit_index];
      int_cache->balsaraq[*icount] = cell_cache->balsara[pjd + bit_index];
      int_cache->soundspeedq[*icount] = cell_cache->soundspeed[pjd + bit_index];
423
      int_cache->h_invq[*icount] = 1.f / cell_cache->h[pjd + bit_index];
424
425
426
427
428
429
430
431

      (*icount)++;
    }
  }

#endif /* defined(HAVE_AVX2) || defined(HAVE_AVX512_F) */

  /* Flush the c2 cache if it has reached capacity. */
432
  if (*icount >= (C2_CACHE_SIZE - (num_vec_proc * VEC_SIZE))) {
433
434
435
436

    int icount_align = *icount;

    /* Peform remainder interactions. */
James Willis's avatar
James Willis committed
437
438
439
440
    calcRemForceInteractions(int_cache, *icount, a_hydro_xSum, a_hydro_ySum,
                             a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum,
                             v_hi_inv, v_vix, v_viy, v_viz, v_rhoi, v_grad_hi,
                             v_pOrhoi2, v_balsara_i, v_ci, &icount_align, 2);
441

442
443
444
    /* Initialise masks to true in case remainder interactions have been
     * performed. */
    mask_t int_mask, int_mask2;
445
446
    vec_init_mask_true(int_mask);
    vec_init_mask_true(int_mask2);
447

448
    /* Perform interactions. */
449
    for (int pjd = 0; pjd < icount_align; pjd += (num_vec_proc * VEC_SIZE)) {
450

451
      runner_iact_nonsym_2_vec_force(
James Willis's avatar
James Willis committed
452
453
454
455
456
457
458
459
460
          &int_cache->r2q[pjd], &int_cache->dxq[pjd], &int_cache->dyq[pjd],
          &int_cache->dzq[pjd], v_vix, v_viy, v_viz, v_rhoi, v_grad_hi,
          v_pOrhoi2, v_balsara_i, v_ci, &int_cache->vxq[pjd],
          &int_cache->vyq[pjd], &int_cache->vzq[pjd], &int_cache->rhoq[pjd],
          &int_cache->grad_hq[pjd], &int_cache->pOrho2q[pjd],
          &int_cache->balsaraq[pjd], &int_cache->soundspeedq[pjd],
          &int_cache->mq[pjd], v_hi_inv, &int_cache->h_invq[pjd], a_hydro_xSum,
          a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum,
          int_mask, int_mask2, 0);
461
462
463
464
465
466
467
    }

    /* Reset interaction count. */
    *icount = 0;
  }
}

468
/**
469
470
 * @brief Populates the arrays max_index_i and max_index_j with the maximum
 * indices of
James Willis's avatar
James Willis committed
471
472
473
 * particles into their neighbouring cells. Also finds the first pi that
 * interacts with any particle in cj and the last pj that interacts with any
 * particle in ci.
474
 *
James Willis's avatar
James Willis committed
475
476
477
478
479
480
 * @param ci #cell pointer to ci
 * @param cj #cell pointer to cj
 * @param sort_i #entry array for particle distance in ci
 * @param sort_j #entry array for particle distance in cj
 * @param dx_max maximum particle movement allowed in cell
 * @param rshift cutoff shift
481
482
483
484
 * @param hi_max Maximal smoothing length in cell ci
 * @param hj_max Maximal smoothing length in cell cj
 * @param di_max Maximal position on the axis that can interact in cell ci
 * @param dj_min Minimal position on the axis that can interact in cell ci
485
486
 * @param max_index_i array to hold the maximum distances of pi particles into
 * cell
James Willis's avatar
James Willis committed
487
 * cj
488
489
 * @param max_index_j array to hold the maximum distances of pj particles into
 * cell
James Willis's avatar
James Willis committed
490
 * cj
James Willis's avatar
James Willis committed
491
492
 * @param init_pi first pi to interact with a pj particle
 * @param init_pj last pj to interact with a pi particle
493
 * @param e The #engine.
James Willis's avatar
James Willis committed
494
 */
495
__attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
James Willis's avatar
James Willis committed
496
497
    const struct cell *ci, const struct cell *cj,
    const struct entry *restrict sort_i, const struct entry *restrict sort_j,
498
499
    const float dx_max, const float rshift, const double hi_max,
    const double hj_max, const double di_max, const double dj_min,
500
    int *max_index_i, int *max_index_j, int *init_pi, int *init_pj,
501
    const struct engine *e) {
502

503
504
  const struct part *restrict parts_i = ci->parts;
  const struct part *restrict parts_j = cj->parts;
505
506

  int first_pi = 0, last_pj = cj->count - 1;
507
  int temp;
508

James Willis's avatar
James Willis committed
509
510
  /* Find the leftmost active particle in cell i that interacts with any
   * particle in cell j. */
511
  first_pi = ci->count;
512
  int active_id = first_pi - 1;
513
  while (first_pi > 0 && sort_i[first_pi - 1].d + dx_max + hi_max > dj_min) {
514
    first_pi--;
515
516
517
    /* Store the index of the particle if it is active. */
    if (part_is_active(&parts_i[sort_i[first_pi].i], e)) active_id = first_pi;
  }
James Willis's avatar
James Willis committed
518

519
520
  /* Set the first active pi in range of any particle in cell j. */
  first_pi = active_id;
521

522
  /* Find the maximum index into cell j for each particle in range in cell i. */
James Willis's avatar
James Willis committed
523
  if (first_pi < ci->count) {
524

525
526
    /* Start from the first particle in cell j. */
    temp = 0;
527

528
    const struct part *pi = &parts_i[sort_i[first_pi].i];
529

530
    /* Loop through particles in cell j until they are not in range of pi. */
James Willis's avatar
James Willis committed
531
532
533
    while (temp <= cj->count &&
           (sort_i[first_pi].d + (pi->h * kernel_gamma + dx_max - rshift) >
            sort_j[temp].d))
534
      temp++;
535

536
    max_index_i[first_pi] = temp;
537

538
    /* Populate max_index_i for remaining particles that are within range. */
James Willis's avatar
James Willis committed
539
    for (int i = first_pi + 1; i < ci->count; i++) {
540
      temp = max_index_i[i - 1];
541
      pi = &parts_i[sort_i[i].i];
542

James Willis's avatar
James Willis committed
543
544
545
      while (temp <= cj->count &&
             (sort_i[i].d + (pi->h * kernel_gamma + dx_max - rshift) >
              sort_j[temp].d))
546
        temp++;
547

548
      max_index_i[i] = temp;
549
    }
James Willis's avatar
James Willis committed
550
  } else {
551
552
    /* Make sure that max index is set to first particle in cj.*/
    max_index_i[ci->count - 1] = 0;
553
554
  }

James Willis's avatar
James Willis committed
555
556
  /* Find the rightmost active particle in cell j that interacts with any
   * particle in cell i. */
557
  last_pj = -1;
558
  active_id = last_pj;
James Willis's avatar
James Willis committed
559
560
  while (last_pj < cj->count &&
         sort_j[last_pj + 1].d - hj_max - dx_max < di_max) {
561
    last_pj++;
562
    /* Store the index of the particle if it is active. */
563
    if (part_is_active(&parts_j[sort_j[last_pj].i], e)) active_id = last_pj;
564
  }
James Willis's avatar
James Willis committed
565

566
  /* Set the last active pj in range of any particle in cell i. */
567
  last_pj = active_id;
Matthieu Schaller's avatar
Matthieu Schaller committed
568

569
  /* Find the maximum index into cell i for each particle in range in cell j. */
570
  if (last_pj > 0) {
Matthieu Schaller's avatar
Matthieu Schaller committed
571

572
573
    /* Start from the last particle in cell i. */
    temp = ci->count - 1;
574

575
    const struct part *pj = &parts_j[sort_j[last_pj].i];
576

577
    /* Loop through particles in cell i until they are not in range of pj. */
James Willis's avatar
James Willis committed
578
579
580
    while (temp > 0 &&
           sort_j[last_pj].d - dx_max - (pj->h * kernel_gamma) <
               sort_i[temp].d - rshift)
581
      temp--;
582

583
    max_index_j[last_pj] = temp;
584

585
    /* Populate max_index_j for remaining particles that are within range. */
James Willis's avatar
James Willis committed
586
    for (int i = last_pj - 1; i >= 0; i--) {
587
      temp = max_index_j[i + 1];
588
      pj = &parts_j[sort_j[i].i];
589

James Willis's avatar
James Willis committed
590
591
592
      while (temp > 0 &&
             sort_j[i].d - dx_max - (pj->h * kernel_gamma) <
                 sort_i[temp].d - rshift)
593
        temp--;
594

595
596
      max_index_j[i] = temp;
    }
James Willis's avatar
James Willis committed
597
  } else {
598
    /* Make sure that max index is set to last particle in ci.*/
James Willis's avatar
James Willis committed
599
    max_index_j[0] = ci->count - 1;
600
601
  }

James Willis's avatar
James Willis committed
602
603
  *init_pi = first_pi;
  *init_pj = last_pj;
604
}
James Willis's avatar
James Willis committed
605
#endif /* WITH_VECTORIZATION */
606
607

/**
James Willis's avatar
James Willis committed
608
609
 * @brief Compute the cell self-interaction (non-symmetric) using vector
 * intrinsics with one particle pi at a time.
610
611
612
613
 *
 * @param r The #runner.
 * @param c The #cell.
 */
James Willis's avatar
James Willis committed
614
615
__attribute__((always_inline)) INLINE void runner_doself1_density_vec(
    struct runner *r, struct cell *restrict c) {
616
617

#ifdef WITH_VECTORIZATION
618
  const struct engine *e = r->e;
619
620
621
622
623
624
  struct part *restrict pi;
  int count_align;
  int num_vec_proc = NUM_VEC_PROC;

  struct part *restrict parts = c->parts;
  const int count = c->count;
James Willis's avatar
James Willis committed
625

626
627
  vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2;

James Willis's avatar
James Willis committed
628
  TIMER_TIC
629

630
631
  if (!cell_is_active(c, e)) return;

632
  if (!cell_are_part_drifted(c, e)) error("Interacting undrifted cell.");
633

James Willis's avatar
James Willis committed
634
  /* Get the particle cache from the runner and re-allocate
635
   * the cache if it is not big enough for the cell. */
636
  struct cache *restrict cell_cache = &r->ci_cache;
James Willis's avatar
James Willis committed
637
638
639

  if (cell_cache->count < count) {
    cache_init(cell_cache, count);
640
641
  }

James Willis's avatar
James Willis committed
642
  /* Read the particles from the cell and store them locally in the cache. */
James Willis's avatar
James Willis committed
643
  cache_read_particles(c, cell_cache);
644
645
646
647

  /* Create secondary cache to store particle interactions. */
  struct c2_cache int_cache;
  int icount = 0, icount_align = 0;
648
649
650
651
652
653
654
655

  /* Loop over the particles in the cell. */
  for (int pid = 0; pid < count; pid++) {

    /* Get a pointer to the ith particle. */
    pi = &parts[pid];

    /* Is the ith particle active? */
656
    if (!part_is_active(pi, e)) continue;
657
658
659
660
661

    vector pix, piy, piz;

    const float hi = cell_cache->h[pid];

James Willis's avatar
James Willis committed
662
    /* Fill particle pi vectors. */
663
664
665
666
667
668
669
670
671
672
673
    pix.v = vec_set1(cell_cache->x[pid]);
    piy.v = vec_set1(cell_cache->y[pid]);
    piz.v = vec_set1(cell_cache->z[pid]);
    v_hi.v = vec_set1(hi);
    v_vix.v = vec_set1(cell_cache->vx[pid]);
    v_viy.v = vec_set1(cell_cache->vy[pid]);
    v_viz.v = vec_set1(cell_cache->vz[pid]);

    const float hig2 = hi * hi * kernel_gamma2;
    v_hig2.v = vec_set1(hig2);

James Willis's avatar
James Willis committed
674
    /* Reset cumulative sums of update vectors. */
James Willis's avatar
James Willis committed
675
676
677
    vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
        curlvySum, curlvzSum;

James Willis's avatar
James Willis committed
678
    /* Get the inverse of hi. */
679
    vector v_hi_inv;
James Willis's avatar
James Willis committed
680

681
    v_hi_inv = vec_reciprocal(v_hi);
James Willis's avatar
James Willis committed
682

683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
    rhoSum.v = vec_setzero();
    rho_dhSum.v = vec_setzero();
    wcountSum.v = vec_setzero();
    wcount_dhSum.v = vec_setzero();
    div_vSum.v = vec_setzero();
    curlvxSum.v = vec_setzero();
    curlvySum.v = vec_setzero();
    curlvzSum.v = vec_setzero();

    /* Pad cache if there is a serial remainder. */
    count_align = count;
    int rem = count % (num_vec_proc * VEC_SIZE);
    if (rem != 0) {
      int pad = (num_vec_proc * VEC_SIZE) - rem;

      count_align += pad;
699
700
701
702
703
704
705
706

      /* Set positions to the same as particle pi so when the r2 > 0 mask is
       * applied these extra contributions are masked out.*/
      for (int i = count; i < count_align; i++) {
        cell_cache->x[i] = pix.f[0];
        cell_cache->y[i] = piy.f[0];
        cell_cache->z[i] = piz.f[0];
      }
707
708
709
710
711
    }

    vector pjx, pjy, pjz;
    vector pjx2, pjy2, pjz2;

James Willis's avatar
James Willis committed
712
713
    /* Find all of particle pi's interacions and store needed values in the
     * secondary cache.*/
714
715
716
717
718
719
    for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) {

      /* Load 2 sets of vectors from the particle cache. */
      pjx.v = vec_load(&cell_cache->x[pjd]);
      pjy.v = vec_load(&cell_cache->y[pjd]);
      pjz.v = vec_load(&cell_cache->z[pjd]);
720

721
722
723
      pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]);
      pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]);
      pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]);
James Willis's avatar
James Willis committed
724

725
726
727
728
      /* Compute the pairwise distance. */
      vector v_dx_tmp, v_dy_tmp, v_dz_tmp;
      vector v_dx_tmp2, v_dy_tmp2, v_dz_tmp2, v_r2_2;

James Willis's avatar
James Willis committed
729
730
      v_dx_tmp.v = vec_sub(pix.v, pjx.v);
      v_dx_tmp2.v = vec_sub(pix.v, pjx2.v);
731
      v_dy_tmp.v = vec_sub(piy.v, pjy.v);
James Willis's avatar
James Willis committed
732
      v_dy_tmp2.v = vec_sub(piy.v, pjy2.v);
733
      v_dz_tmp.v = vec_sub(piz.v, pjz.v);
James Willis's avatar
James Willis committed
734
735
736
737
      v_dz_tmp2.v = vec_sub(piz.v, pjz2.v);

      v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v);
      v_r2_2.v = vec_mul(v_dx_tmp2.v, v_dx_tmp2.v);
738
      v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v);
James Willis's avatar
James Willis committed
739
      v_r2_2.v = vec_fma(v_dy_tmp2.v, v_dy_tmp2.v, v_r2_2.v);
740
      v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v);
James Willis's avatar
James Willis committed
741
742
      v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v);

743
      /* Form a mask from r2 < hig2 and r2 > 0.*/
James Willis's avatar
James Willis committed
744
745
      mask_t v_doi_mask, v_doi_mask_self_check, v_doi_mask2,
          v_doi_mask2_self_check;
746
      int doi_mask, doi_mask_self_check, doi_mask2, doi_mask2_self_check;
747

James Willis's avatar
James Willis committed
748
      /* Form r2 > 0 mask and r2 < hig2 mask. */
749
      vec_create_mask(v_doi_mask_self_check, vec_cmp_gt(v_r2.v, vec_setzero()));
750
      vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v));
751

James Willis's avatar
James Willis committed
752
      /* Form r2 > 0 mask and r2 < hig2 mask. */
James Willis's avatar
James Willis committed
753
754
      vec_create_mask(v_doi_mask2_self_check,
                      vec_cmp_gt(v_r2_2.v, vec_setzero()));
755
      vec_create_mask(v_doi_mask2, vec_cmp_lt(v_r2_2.v, v_hig2.v));
756

757
758
759
760
761
762
      /* Form integer masks. */
      doi_mask_self_check = vec_form_int_mask(v_doi_mask_self_check);
      doi_mask = vec_form_int_mask(v_doi_mask);

      doi_mask2_self_check = vec_form_int_mask(v_doi_mask2_self_check);
      doi_mask2 = vec_form_int_mask(v_doi_mask2);
James Willis's avatar
James Willis committed
763

764
765
766
      /* Combine the two masks. */
      doi_mask = doi_mask & doi_mask_self_check;
      doi_mask2 = doi_mask2 & doi_mask2_self_check;
767

James Willis's avatar
James Willis committed
768
769
      /* If there are any interactions left pack interaction values into c2
       * cache. */
770
      if (doi_mask) {
James Willis's avatar
James Willis committed
771
        storeInteractions(doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp,
James Willis's avatar
James Willis committed
772
773
774
775
776
777
778
779
                          cell_cache, &int_cache, &icount, &rhoSum, &rho_dhSum,
                          &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum,
                          &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy,
                          v_viz);
      }
      if (doi_mask2) {
        storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp2,
                          &v_dy_tmp2, &v_dz_tmp2, cell_cache, &int_cache,
James Willis's avatar
James Willis committed
780
781
782
                          &icount, &rhoSum, &rho_dhSum, &wcountSum,
                          &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
                          &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz);
783
784
785
      }
    }

James Willis's avatar
James Willis committed
786
    /* Perform padded vector remainder interactions if any are present. */
Matthieu Schaller's avatar
Matthieu Schaller committed
787
788
789
    calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum, &wcountSum,
                        &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
                        &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz,
James Willis's avatar
James Willis committed
790
791
792
793
                        &icount_align);

    /* Initialise masks to true in case remainder interactions have been
     * performed. */
794
    mask_t int_mask, int_mask2;
795
796
    vec_init_mask_true(int_mask);
    vec_init_mask_true(int_mask2);
797
798

    /* Perform interaction with 2 vectors. */
James Willis's avatar
James Willis committed
799
800
801
802
803
804
    for (int pjd = 0; pjd < icount_align; pjd += (num_vec_proc * VEC_SIZE)) {
      runner_iact_nonsym_2_vec_density(
          &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd],
          &int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz,
          &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd],
          &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
James Willis's avatar
James Willis committed
805
806
          &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2,
          0);
807
808
    }

James Willis's avatar
James Willis committed
809
810
811
812
813
814
815
816
817
818
    /* Perform horizontal adds on vector sums and store result in particle pi.
     */
    VEC_HADD(rhoSum, pi->rho);
    VEC_HADD(rho_dhSum, pi->density.rho_dh);
    VEC_HADD(wcountSum, pi->density.wcount);
    VEC_HADD(wcount_dhSum, pi->density.wcount_dh);
    VEC_HADD(div_vSum, pi->density.div_v);
    VEC_HADD(curlvxSum, pi->density.rot_v[0]);
    VEC_HADD(curlvySum, pi->density.rot_v[1]);
    VEC_HADD(curlvzSum, pi->density.rot_v[2]);
819
820
821
822
823

    /* Reset interaction count. */
    icount = 0;
  } /* loop over all particles. */

James Willis's avatar
James Willis committed
824
  TIMER_TOC(timer_doself_density);
825
#endif /* WITH_VECTORIZATION */
826
827
}

828
/**
James Willis's avatar
James Willis committed
829
 * @brief Compute the force cell self-interaction (non-symmetric) using vector
830
831
832
833
834
835
836
837
838
839
840
841
 * intrinsics with one particle pi at a time.
 *
 * @param r The #runner.
 * @param c The #cell.
 */
__attribute__((always_inline)) INLINE void runner_doself2_force_vec(
    struct runner *r, struct cell *restrict c) {

#ifdef WITH_VECTORIZATION
  const struct engine *e = r->e;
  struct part *restrict pi;
  int count_align;
James Willis's avatar
James Willis committed
842
  const int num_vec_proc = 1;
843
844
845
846
847
848
849

  struct part *restrict parts = c->parts;
  const int count = c->count;

  vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2;
  vector v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci;

850
  TIMER_TIC
851
852
853

  if (!cell_is_active(c, e)) return;

854
  if (!cell_are_part_drifted(c, e)) error("Interacting undrifted cell.");
855
856
857
858
859
860
861
862
863
864

  /* Get the particle cache from the runner and re-allocate
   * the cache if it is not big enough for the cell. */
  struct cache *restrict cell_cache = &r->ci_cache;

  if (cell_cache->count < count) {
    cache_init(cell_cache, count);
  }

  /* Read the particles from the cell and store them locally in the cache. */
865
  cache_read_force_particles(c, cell_cache);
866

James Willis's avatar
James Willis committed
867
#ifdef SWIFT_DEBUG_CHECKS
James Willis's avatar
James Willis committed
868
  for (int i = 0; i < count; i++) {
James Willis's avatar
James Willis committed
869
870
871
872
873
    pi = &c->parts[i];
    /* Check that particles have been drifted to the current time */
    if (pi->ti_drift != e->ti_current)
      error("Particle pi not drifted to current time");
  }
James Willis's avatar
James Willis committed
874
}
James Willis's avatar
James Willis committed
875
876
#endif

James Willis's avatar
James Willis committed
877
878
879
/* Create secondary cache to store particle interactions. */
struct c2_cache int_cache;
int icount = 0, icount_align = 0;
880

James Willis's avatar
James Willis committed
881
882
/* Loop over the particles in the cell. */
for (int pid = 0; pid < count; pid++) {
883

James Willis's avatar
James Willis committed
884
885
  /* Get a pointer to the ith particle. */
  pi = &parts[pid];
886

James Willis's avatar
James Willis committed
887
888
  /* Is the ith particle active? */
  if (!part_is_active(pi, e)) continue;
889

James Willis's avatar
James Willis committed
890
  vector pix, piy, piz;
891

James Willis's avatar
James Willis committed
892
  const float hi = cell_cache->h[pid];
893

James Willis's avatar
James Willis committed
894
895
896
897
898
899
900
901
  /* Fill particle pi vectors. */
  pix.v = vec_set1(cell_cache->x[pid]);
  piy.v = vec_set1(cell_cache->y[pid]);
  piz.v = vec_set1(cell_cache->z[pid]);
  v_hi.v = vec_set1(hi);
  v_vix.v = vec_set1(cell_cache->vx[pid]);
  v_viy.v = vec_set1(cell_cache->vy[pid]);
  v_viz.v = vec_set1(cell_cache->vz[pid]);
902

James Willis's avatar
James Willis committed
903
904
905
906
907
  v_rhoi.v = vec_set1(cell_cache->rho[pid]);
  v_grad_hi.v = vec_set1(cell_cache->grad_h[pid]);
  v_pOrhoi2.v = vec_set1(cell_cache->pOrho2[pid]);
  v_balsara_i.v = vec_set1(cell_cache->balsara[pid]);
  v_ci.v = vec_set1(cell_cache->soundspeed[pid]);
908

James Willis's avatar
James Willis committed
909
910
  const float hig2 = hi * hi * kernel_gamma2;
  v_hig2.v = vec_set1(hig2);
911

James Willis's avatar
James Willis committed
912
913
914
  /* Reset cumulative sums of update vectors. */
  vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum,
      entropy_dtSum;
915

James Willis's avatar
James Willis committed
916
917
  /* Get the inverse of hi. */
  vector v_hi_inv;
918

James Willis's avatar
James Willis committed
919
  v_hi_inv = vec_reciprocal(v_hi);
920

James Willis's avatar
James Willis committed
921
922
923
924
925
926
  a_hydro_xSum.v = vec_setzero();
  a_hydro_ySum.v = vec_setzero();
  a_hydro_zSum.v = vec_setzero();
  h_dtSum.v = vec_setzero();
  v_sigSum.v = vec_set1(pi->force.v_sig);
  entropy_dtSum.v = vec_setzero();
927

James Willis's avatar
James Willis committed
928
929
930
931
932
  /* Pad cache if there is a serial remainder. */
  count_align = count;
  int rem = count % (num_vec_proc * VEC_SIZE);
  if (rem != 0) {
    int pad = (num_vec_proc * VEC_SIZE) - rem;
933

James Willis's avatar
James Willis committed
934
935
936
937
938
939
940
941
942
    count_align += pad;

    /* Set positions to the same as particle pi so when the r2 > 0 mask is
     * applied these extra contributions are masked out.*/
    for (int i = count; i < count_align; i++) {
      cell_cache->x[i] = pix.f[0];
      cell_cache->y[i] = piy.f[0];
      cell_cache->z[i] = piz.f[0];
      cell_cache->h[i] = 1.f;
943
    }
James Willis's avatar
James Willis committed
944
  }
945

James Willis's avatar
James Willis committed
946
  vector pjx, pjy, pjz, hj, hjg2;
947

James Willis's avatar
James Willis committed
948
949
950
  /* Find all of particle pi's interacions and store needed values in the
   * secondary cache.*/
  for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) {
951

James Willis's avatar
James Willis committed
952
953
954
955
956
957
    /* Load 2 sets of vectors from the particle cache. */
    pjx.v = vec_load(&cell_cache->x[pjd]);
    pjy.v = vec_load(&cell_cache->y[pjd]);
    pjz.v = vec_load(&cell_cache->z[pjd]);
    hj.v = vec_load(&cell_cache->h[pjd]);
    hjg2.v = vec_mul(vec_mul(hj.v, hj.v), kernel_gamma2_vec.v);
958

James Willis's avatar
James Willis committed
959
960
    /* Compute the pairwise distance. */
    vector v_dx_tmp, v_dy_tmp, v_dz_tmp;
961

James Willis's avatar
James Willis committed
962
963
964
    v_dx_tmp.v = vec_sub(pix.v, pjx.v);
    v_dy_tmp.v = vec_sub(piy.v, pjy.v);
    v_dz_tmp.v = vec_sub(piz.v, pjz.v);
965

James Willis's avatar
James Willis committed
966
967
968
    v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v);
    v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v);
    v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v);
969

James Willis's avatar
James Willis committed
970
971
972
    /* Form r2 > 0 mask, r2 < hig2 mask and r2 < hjg2 mask. */
    mask_t v_doi_mask, v_doi_mask_self_check;
    int doi_mask, doi_mask_self_check;
973

James Willis's avatar
James Willis committed
974
975
    /* Form r2 > 0 mask.*/
    vec_create_mask(v_doi_mask_self_check, vec_cmp_gt(v_r2.v, vec_setzero()));
976

James Willis's avatar
James Willis committed
977
978
979
980
    /* Form a mask from r2 < hig2 mask and r2 < hjg2 mask. */
    vector v_h2;
    v_h2.v = vec_fmax(v_hig2.v, hjg2.v);
    vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_h2.v));
981

James Willis's avatar
James Willis committed
982
983
984
    /* Form integer masks. */
    doi_mask_self_check = vec_form_int_mask(v_doi_mask_self_check);
    doi_mask = vec_form_int_mask(v_doi_mask);
985

James Willis's avatar
James Willis committed
986
987
    /* Combine all 3 masks. */
    doi_mask = doi_mask & doi_mask_self_check;
988

James Willis's avatar
James Willis committed
989
990
991
    /* If there are any interactions left pack interaction values into c2
     * cache. */
    if (doi_mask) {
992

James Willis's avatar
James Willis committed
993
994
995
      storeForceInteractions(
          doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp, cell_cache,
          &int_cache, &icount, &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum,
996
          &h_dtSum, &v_sigSum, &entropy_dtSum, v_hi_inv, v_vix, v_viy,
997
          v_viz