runner_doiact_vec.c 75 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/*******************************************************************************
 * This file is part of SWIFT.
 * Copyright (c) 2016 James Willis (james.s.willis@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/

/* Config parameters. */
#include "../config.h"

/* This object's header. */
#include "runner_doiact_vec.h"

26
27
28
/* Local headers. */
#include "active.h"

29
30
#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH)

31
32
static const vector kernel_gamma2_vec = FILL_VEC(kernel_gamma2);

James Willis's avatar
James Willis committed
33
34
35
/**
 * @brief Compute the vector remainder interactions from the secondary cache.
 *
Matthieu Schaller's avatar
Matthieu Schaller committed
36
 * @param int_cache (return) secondary #cache of interactions between two
James Willis's avatar
James Willis committed
37
 * particles.
James Willis's avatar
James Willis committed
38
 * @param icount Interaction count.
James Willis's avatar
James Willis committed
39
 * @param v_rhoSum (return) #vector holding the cumulative sum of the density
James Willis's avatar
James Willis committed
40
 * update on pi.
James Willis's avatar
James Willis committed
41
 * @param v_rho_dhSum (return) #vector holding the cumulative sum of the density
James Willis's avatar
James Willis committed
42
 * gradient update on pi.
James Willis's avatar
James Willis committed
43
 * @param v_wcountSum (return) #vector holding the cumulative sum of the wcount
James Willis's avatar
James Willis committed
44
 * update on pi.
45
46
 * @param v_wcount_dhSum (return) #vector holding the cumulative sum of the
 * wcount
James Willis's avatar
James Willis committed
47
 * gradient update on pi.
48
49
 * @param v_div_vSum (return) #vector holding the cumulative sum of the
 * divergence
James Willis's avatar
James Willis committed
50
 * update on pi.
James Willis's avatar
James Willis committed
51
 * @param v_curlvxSum (return) #vector holding the cumulative sum of the curl of
James Willis's avatar
James Willis committed
52
 * vx update on pi.
James Willis's avatar
James Willis committed
53
 * @param v_curlvySum (return) #vector holding the cumulative sum of the curl of
James Willis's avatar
James Willis committed
54
 * vy update on pi.
James Willis's avatar
James Willis committed
55
 * @param v_curlvzSum (return) #vector holding the cumulative sum of the curl of
James Willis's avatar
James Willis committed
56
 * vz update on pi.
James Willis's avatar
James Willis committed
57
58
59
60
 * @param v_hi_inv #vector of 1/h for pi.
 * @param v_vix #vector of x velocity of pi.
 * @param v_viy #vector of y velocity of pi.
 * @param v_viz #vector of z velocity of pi.
Matthieu Schaller's avatar
Matthieu Schaller committed
61
 * @param icount_align (return) Interaction count after the remainder
James Willis's avatar
James Willis committed
62
 * interactions have been performed, should be a multiple of the vector length.
James Willis's avatar
James Willis committed
63
 */
James Willis's avatar
James Willis committed
64
__attribute__((always_inline)) INLINE static void calcRemInteractions(
James Willis's avatar
James Willis committed
65
66
    struct c2_cache *const int_cache, const int icount, vector *v_rhoSum,
    vector *v_rho_dhSum, vector *v_wcountSum, vector *v_wcount_dhSum,
67
68
69
    vector *v_div_vSum, vector *v_curlvxSum, vector *v_curlvySum,
    vector *v_curlvzSum, vector v_hi_inv, vector v_vix, vector v_viy,
    vector v_viz, int *icount_align) {
70

71
  mask_t int_mask, int_mask2;
James Willis's avatar
James Willis committed
72
73

  /* Work out the number of remainder interactions and pad secondary cache. */
74
75
76
77
78
79
  *icount_align = icount;
  int rem = icount % (NUM_VEC_PROC * VEC_SIZE);
  if (rem != 0) {
    int pad = (NUM_VEC_PROC * VEC_SIZE) - rem;
    *icount_align += pad;

80
    /* Initialise masks to true. */
81
82
    vec_init_mask_true(int_mask);
    vec_init_mask_true(int_mask2);
83

James Willis's avatar
James Willis committed
84
85
86
    /* Pad secondary cache so that there are no contributions in the interaction
     * function. */
    for (int i = icount; i < *icount_align; i++) {
87
88
89
90
91
92
93
94
      int_cache->mq[i] = 0.f;
      int_cache->r2q[i] = 1.f;
      int_cache->dxq[i] = 0.f;
      int_cache->dyq[i] = 0.f;
      int_cache->dzq[i] = 0.f;
      int_cache->vxq[i] = 0.f;
      int_cache->vyq[i] = 0.f;
      int_cache->vzq[i] = 0.f;
95
96
97
98
    }

    /* Zero parts of mask that represent the padded values.*/
    if (pad < VEC_SIZE) {
James Willis's avatar
James Willis committed
99
      vec_pad_mask(int_mask2, pad);
James Willis's avatar
James Willis committed
100
    } else {
James Willis's avatar
James Willis committed
101
      vec_pad_mask(int_mask, VEC_SIZE - rem);
102
      vec_zero_mask(int_mask2);
103
104
    }

James Willis's avatar
James Willis committed
105
106
    /* Perform remainder interaction and remove remainder from aligned
     * interaction count. */
107
    *icount_align = icount - rem;
James Willis's avatar
James Willis committed
108
109
110
111
112
    runner_iact_nonsym_2_vec_density(
        &int_cache->r2q[*icount_align], &int_cache->dxq[*icount_align],
        &int_cache->dyq[*icount_align], &int_cache->dzq[*icount_align],
        v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[*icount_align],
        &int_cache->vyq[*icount_align], &int_cache->vzq[*icount_align],
James Willis's avatar
James Willis committed
113
        &int_cache->mq[*icount_align], v_rhoSum, v_rho_dhSum, v_wcountSum,
114
115
        v_wcount_dhSum, v_div_vSum, v_curlvxSum, v_curlvySum, v_curlvzSum,
        int_mask, int_mask2, 1);
116
117
118
  }
}

James Willis's avatar
James Willis committed
119
/**
James Willis's avatar
James Willis committed
120
121
 * @brief Left-packs the values needed by an interaction into the secondary
 * cache (Supports AVX, AVX2 and AVX512 instruction sets).
James Willis's avatar
James Willis committed
122
123
 *
 * @param mask Contains which particles need to interact.
Matthieu Schaller's avatar
Matthieu Schaller committed
124
 * @param pjd Index of the particle to store into.
James Willis's avatar
James Willis committed
125
126
127
128
129
 * @param v_r2 #vector of the separation between two particles squared.
 * @param v_dx #vector of the x separation between two particles.
 * @param v_dy #vector of the y separation between two particles.
 * @param v_dz #vector of the z separation between two particles.
 * @param cell_cache #cache of all particles in the cell.
Matthieu Schaller's avatar
Matthieu Schaller committed
130
 * @param int_cache (return) secondary #cache of interactions between two
James Willis's avatar
James Willis committed
131
 * particles.
James Willis's avatar
James Willis committed
132
 * @param icount Interaction count.
133
134
 * @param v_rhoSum #vector holding the cumulative sum of the density update on
 * pi.
James Willis's avatar
James Willis committed
135
 * @param v_rho_dhSum #vector holding the cumulative sum of the density gradient
James Willis's avatar
James Willis committed
136
 * update on pi.
James Willis's avatar
James Willis committed
137
 * @param v_wcountSum #vector holding the cumulative sum of the wcount update on
James Willis's avatar
James Willis committed
138
 * pi.
139
140
 * @param v_wcount_dhSum #vector holding the cumulative sum of the wcount
 * gradient
James Willis's avatar
James Willis committed
141
 * update on pi.
James Willis's avatar
James Willis committed
142
 * @param v_div_vSum #vector holding the cumulative sum of the divergence update
James Willis's avatar
James Willis committed
143
 * on pi.
144
145
 * @param v_curlvxSum #vector holding the cumulative sum of the curl of vx
 * update
James Willis's avatar
James Willis committed
146
 * on pi.
147
148
 * @param v_curlvySum #vector holding the cumulative sum of the curl of vy
 * update
James Willis's avatar
James Willis committed
149
 * on pi.
150
151
 * @param v_curlvzSum #vector holding the cumulative sum of the curl of vz
 * update
James Willis's avatar
James Willis committed
152
 * on pi.
James Willis's avatar
James Willis committed
153
154
155
156
157
 * @param v_hi_inv #vector of 1/h for pi.
 * @param v_vix #vector of x velocity of pi.
 * @param v_viy #vector of y velocity of pi.
 * @param v_viz #vector of z velocity of pi.
 */
James Willis's avatar
James Willis committed
158
__attribute__((always_inline)) INLINE static void storeInteractions(
159
    const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy,
James Willis's avatar
James Willis committed
160
    vector *v_dz, const struct cache *const cell_cache,
James Willis's avatar
James Willis committed
161
162
    struct c2_cache *const int_cache, int *icount, vector *v_rhoSum,
    vector *v_rho_dhSum, vector *v_wcountSum, vector *v_wcount_dhSum,
163
164
165
    vector *v_div_vSum, vector *v_curlvxSum, vector *v_curlvySum,
    vector *v_curlvzSum, vector v_hi_inv, vector v_vix, vector v_viy,
    vector v_viz) {
James Willis's avatar
James Willis committed
166
167
168

/* Left-pack values needed into the secondary cache using the interaction mask.
 */
169
#if defined(HAVE_AVX2) || defined(HAVE_AVX512_F)
170
171
172
173
174
175
176
  mask_t packed_mask;
  VEC_FORM_PACKED_MASK(mask, packed_mask);

  VEC_LEFT_PACK(v_r2->v, packed_mask, &int_cache->r2q[*icount]);
  VEC_LEFT_PACK(v_dx->v, packed_mask, &int_cache->dxq[*icount]);
  VEC_LEFT_PACK(v_dy->v, packed_mask, &int_cache->dyq[*icount]);
  VEC_LEFT_PACK(v_dz->v, packed_mask, &int_cache->dzq[*icount]);
James Willis's avatar
James Willis committed
177
178
179
180
181
182
183
184
  VEC_LEFT_PACK(vec_load(&cell_cache->m[pjd]), packed_mask,
                &int_cache->mq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->vx[pjd]), packed_mask,
                &int_cache->vxq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->vy[pjd]), packed_mask,
                &int_cache->vyq[*icount]);
  VEC_LEFT_PACK(vec_load(&cell_cache->vz[pjd]), packed_mask,
                &int_cache->vzq[*icount]);
185
186
187

  /* Increment interaction count by number of bits set in mask. */
  (*icount) += __builtin_popcount(mask);
188
#else
James Willis's avatar
James Willis committed
189
  /* Quicker to do it serially in AVX rather than use intrinsics. */
James Willis's avatar
James Willis committed
190
  for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) {
191
192
    if (mask & (1 << bit_index)) {
      /* Add this interaction to the queue. */
193
194
195
196
197
198
199
200
      int_cache->r2q[*icount] = v_r2->f[bit_index];
      int_cache->dxq[*icount] = v_dx->f[bit_index];
      int_cache->dyq[*icount] = v_dy->f[bit_index];
      int_cache->dzq[*icount] = v_dz->f[bit_index];
      int_cache->mq[*icount] = cell_cache->m[pjd + bit_index];
      int_cache->vxq[*icount] = cell_cache->vx[pjd + bit_index];
      int_cache->vyq[*icount] = cell_cache->vy[pjd + bit_index];
      int_cache->vzq[*icount] = cell_cache->vz[pjd + bit_index];
201
202
203
204

      (*icount)++;
    }
  }
205

James Willis's avatar
James Willis committed
206
207
#endif /* defined(HAVE_AVX2) || defined(HAVE_AVX512_F) */

James Willis's avatar
James Willis committed
208
  /* Flush the c2 cache if it has reached capacity. */
James Willis's avatar
James Willis committed
209
  if (*icount >= (C2_CACHE_SIZE - (NUM_VEC_PROC * VEC_SIZE))) {
210
211

    int icount_align = *icount;
James Willis's avatar
James Willis committed
212

James Willis's avatar
James Willis committed
213
    /* Peform remainder interactions. */
James Willis's avatar
James Willis committed
214
    calcRemInteractions(int_cache, *icount, v_rhoSum, v_rho_dhSum, v_wcountSum,
215
216
217
                        v_wcount_dhSum, v_div_vSum, v_curlvxSum, v_curlvySum,
                        v_curlvzSum, v_hi_inv, v_vix, v_viy, v_viz,
                        &icount_align);
218

219
    mask_t int_mask, int_mask2;
220
221
    vec_init_mask_true(int_mask);
    vec_init_mask_true(int_mask2);
James Willis's avatar
James Willis committed
222
223

    /* Perform interactions. */
224
    for (int j = 0; j < icount_align; j += (NUM_VEC_PROC * VEC_SIZE)) {
James Willis's avatar
James Willis committed
225
      runner_iact_nonsym_2_vec_density(
226
227
          &int_cache->r2q[j], &int_cache->dxq[j], &int_cache->dyq[j],
          &int_cache->dzq[j], v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[j],
James Willis's avatar
James Willis committed
228
          &int_cache->vyq[j], &int_cache->vzq[j], &int_cache->mq[j], v_rhoSum,
229
230
          v_rho_dhSum, v_wcountSum, v_wcount_dhSum, v_div_vSum, v_curlvxSum,
          v_curlvySum, v_curlvzSum, int_mask, int_mask2, 0);
231
    }
James Willis's avatar
James Willis committed
232
233

    /* Reset interaction count. */
234
235
236
    *icount = 0;
  }
}
237

238
/**
239
240
 * @brief Populates the arrays max_index_i and max_index_j with the maximum
 * indices of
James Willis's avatar
James Willis committed
241
242
243
 * particles into their neighbouring cells. Also finds the first pi that
 * interacts with any particle in cj and the last pj that interacts with any
 * particle in ci.
244
 *
James Willis's avatar
James Willis committed
245
246
247
248
249
250
 * @param ci #cell pointer to ci
 * @param cj #cell pointer to cj
 * @param sort_i #entry array for particle distance in ci
 * @param sort_j #entry array for particle distance in cj
 * @param dx_max maximum particle movement allowed in cell
 * @param rshift cutoff shift
251
252
253
254
 * @param hi_max Maximal smoothing length in cell ci
 * @param hj_max Maximal smoothing length in cell cj
 * @param di_max Maximal position on the axis that can interact in cell ci
 * @param dj_min Minimal position on the axis that can interact in cell ci
255
 * @param max_index_i array to hold the maximum distances of pi particles into
256
 * #cell cj
257
 * @param max_index_j array to hold the maximum distances of pj particles into
258
 * #cell cj
James Willis's avatar
James Willis committed
259
260
 * @param init_pi first pi to interact with a pj particle
 * @param init_pj last pj to interact with a pi particle
261
 * @param max_active_bin The largest time-bin active during this step.
262
263
 * @param active_ci Is any particle in cell ci active?
 * @param active_cj Is any particle in cell cj active?
James Willis's avatar
James Willis committed
264
 */
265
__attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
James Willis's avatar
James Willis committed
266
267
    const struct cell *ci, const struct cell *cj,
    const struct entry *restrict sort_i, const struct entry *restrict sort_j,
268
269
    const float dx_max, const float rshift, const double hi_max,
    const double hj_max, const double di_max, const double dj_min,
270
    int *max_index_i, int *max_index_j, int *init_pi, int *init_pj,
271
    const timebin_t max_active_bin, const int active_ci, const int active_cj) {
272

273
274
  const struct part *restrict parts_i = ci->parts;
  const struct part *restrict parts_j = cj->parts;
275
276

  int first_pi = 0, last_pj = cj->count - 1;
277
278
  int temp, active_id;

James Willis's avatar
James Willis committed
279
280
  /* Only populate max_index array for local actve cells. */
  if (active_ci) {
281
282
283
284
285
286
287
288
289
290
291

    /* Find the leftmost active particle in cell i that interacts with any
     * particle in cell j. */
    first_pi = ci->count;
    active_id = first_pi - 1;
    while (first_pi > 0 && sort_i[first_pi - 1].d + dx_max + hi_max > dj_min) {
      first_pi--;
      /* Store the index of the particle if it is active. */
      if (part_is_active_no_debug(&parts_i[sort_i[first_pi].i], max_active_bin))
        active_id = first_pi;
    }
James Willis's avatar
James Willis committed
292

293
294
    /* Set the first active pi in range of any particle in cell j. */
    first_pi = active_id;
295

James Willis's avatar
James Willis committed
296
297
    /* Find the maximum index into cell j for each particle in range in cell i.
     */
298
    if (first_pi < ci->count) {
299

300
301
      /* Start from the first particle in cell j. */
      temp = 0;
302

303
304
      const struct part *pi = &parts_i[sort_i[first_pi].i];
      const float first_di =
James Willis's avatar
James Willis committed
305
          sort_i[first_pi].d + pi->h * kernel_gamma + dx_max - rshift;
306

307
308
309
      /* Loop through particles in cell j until they are not in range of pi.
       * Make sure that temp stays between 0 and cj->count - 1.*/
      while (temp < cj->count - 1 && first_di > sort_j[temp].d) temp++;
310

311
      max_index_i[first_pi] = temp;
312

313
314
315
316
      /* Populate max_index_i for remaining particles that are within range. */
      for (int i = first_pi + 1; i < ci->count; i++) {
        temp = max_index_i[i - 1];
        pi = &parts_i[sort_i[i].i];
317

318
        const float di = sort_i[i].d + pi->h * kernel_gamma + dx_max - rshift;
319

320
321
        /* Make sure that temp stays between 0 and cj->count - 1.*/
        while (temp < cj->count - 1 && di > sort_j[temp].d) temp++;
322

323
324
325
326
327
        max_index_i[i] = temp;
      }
    } else {
      /* Make sure that max index is set to first particle in cj.*/
      max_index_i[ci->count - 1] = 0;
328
    }
James Willis's avatar
James Willis committed
329
330
331
  } else {
    /* Make sure that foreign cells are only read into the cache if the local
     * cell requires it.
332
     * Also ensure that it does not require any particles from cj. */
James Willis's avatar
James Willis committed
333
    first_pi = ci->count - 1;
334
335
    max_index_i[ci->count - 1] = 0;
  }
336

James Willis's avatar
James Willis committed
337
338
  /* Only populate max_index array for local actve cells. */
  if (active_cj) {
339
340
341
342
343
    /* Find the rightmost active particle in cell j that interacts with any
     * particle in cell i. */
    last_pj = -1;
    active_id = last_pj;
    while (last_pj < cj->count &&
James Willis's avatar
James Willis committed
344
           sort_j[last_pj + 1].d - hj_max - dx_max < di_max) {
345
346
347
348
349
      last_pj++;
      /* Store the index of the particle if it is active. */
      if (part_is_active_no_debug(&parts_j[sort_j[last_pj].i], max_active_bin))
        active_id = last_pj;
    }
James Willis's avatar
James Willis committed
350

351
352
    /* Set the last active pj in range of any particle in cell i. */
    last_pj = active_id;
Matthieu Schaller's avatar
Matthieu Schaller committed
353

James Willis's avatar
James Willis committed
354
355
    /* Find the maximum index into cell i for each particle in range in cell j.
     */
356
    if (last_pj >= 0) {
Matthieu Schaller's avatar
Matthieu Schaller committed
357

358
359
      /* Start from the last particle in cell i. */
      temp = ci->count - 1;
360

361
362
      const struct part *pj = &parts_j[sort_j[last_pj].i];
      const float last_dj =
James Willis's avatar
James Willis committed
363
          sort_j[last_pj].d - dx_max - pj->h * kernel_gamma + rshift;
364

365
366
      /* Loop through particles in cell i until they are not in range of pj. */
      while (temp > 0 && last_dj < sort_i[temp].d) temp--;
367

368
      max_index_j[last_pj] = temp;
369

370
371
372
373
374
      /* Populate max_index_j for remaining particles that are within range. */
      for (int i = last_pj - 1; i >= 0; i--) {
        temp = max_index_j[i + 1];
        pj = &parts_j[sort_j[i].i];
        const float dj = sort_j[i].d - dx_max - (pj->h * kernel_gamma) + rshift;
375

376
        while (temp > 0 && dj < sort_i[temp].d) temp--;
377

378
379
380
381
382
        max_index_j[i] = temp;
      }
    } else {
      /* Make sure that max index is set to last particle in ci.*/
      max_index_j[0] = ci->count - 1;
383
    }
James Willis's avatar
James Willis committed
384
385
386
  } else {
    /* Make sure that foreign cells are only read into the cache if the local
     * cell requires it.
387
388
389
390
     * Also ensure that it does not require any particles from ci. */
    last_pj = 0;
    max_index_j[0] = ci->count - 1;
  }
391

James Willis's avatar
James Willis committed
392
393
  *init_pi = first_pi;
  *init_pj = last_pj;
394
}
395

James Willis's avatar
James Willis committed
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
/**
 * @brief Populates the arrays max_index_i and max_index_j with the maximum
 * indices of
 * particles into their neighbouring cells. Also finds the first pi that
 * interacts with any particle in cj and the last pj that interacts with any
 * particle in ci.
 *
 * @param ci #cell pointer to ci
 * @param cj #cell pointer to cj
 * @param sort_i #entry array for particle distance in ci
 * @param sort_j #entry array for particle distance in cj
 * @param dx_max maximum particle movement allowed in cell
 * @param rshift cutoff shift
 * @param hi_max_raw Maximal smoothing length in cell ci
 * @param hj_max_raw Maximal smoothing length in cell cj
James Willis's avatar
James Willis committed
411
 * @param h_max Maximal smoothing length in both cells scaled by kernel_gamma
James Willis's avatar
James Willis committed
412
413
414
415
416
417
418
419
420
 * @param di_max Maximal position on the axis that can interact in cell ci
 * @param dj_min Minimal position on the axis that can interact in cell ci
 * @param max_index_i array to hold the maximum distances of pi particles into
 * #cell cj
 * @param max_index_j array to hold the maximum distances of pj particles into
 * #cell cj
 * @param init_pi first pi to interact with a pj particle
 * @param init_pj last pj to interact with a pi particle
 * @param max_active_bin The largest time-bin active during this step.
421
422
 * @param active_ci Is any particle in cell ci active?
 * @param active_cj Is any particle in cell cj active?
James Willis's avatar
James Willis committed
423
 */
James Willis's avatar
James Willis committed
424
__attribute__((always_inline)) INLINE static void
Matthieu Schaller's avatar
Matthieu Schaller committed
425
426
427
428
429
430
431
432
populate_max_index_no_cache_force(
    const struct cell *ci, const struct cell *cj,
    const struct entry *restrict sort_i, const struct entry *restrict sort_j,
    const float dx_max, const float rshift, const double hi_max_raw,
    const double hj_max_raw, const double h_max, const double di_max,
    const double dj_min, int *max_index_i, int *max_index_j, int *init_pi,
    int *init_pj, const timebin_t max_active_bin, const int active_ci,
    const int active_cj) {
433

434
435
  const struct part *restrict parts_i = ci->parts;
  const struct part *restrict parts_j = cj->parts;
James Willis's avatar
James Willis committed
436

437
  int first_pi = 0, last_pj = cj->count - 1;
438
  int temp, active_id;
439

James Willis's avatar
James Willis committed
440
441
  /* Only populate max_index array for local actve cells. */
  if (active_ci) {
442

443
444
445
446
    /* Find the leftmost active particle in cell i that interacts with any
     * particle in cell j. */
    first_pi = ci->count;
    active_id = first_pi - 1;
Matthieu Schaller's avatar
Matthieu Schaller committed
447
    while (first_pi > 0 && sort_i[first_pi - 1].d + dx_max + h_max > dj_min) {
448
449
450
451
452
      first_pi--;
      /* Store the index of the particle if it is active. */
      if (part_is_active_no_debug(&parts_i[sort_i[first_pi].i], max_active_bin))
        active_id = first_pi;
    }
453

454
455
    /* Set the first active pi in range of any particle in cell j. */
    first_pi = active_id;
456

James Willis's avatar
James Willis committed
457
458
    /* Find the maximum index into cell j for each particle in range in cell i.
     */
459
    if (first_pi < ci->count) {
460

461
462
      /* Start from the first particle in cell j. */
      temp = 0;
463

464
465
      const struct part *pi = &parts_i[sort_i[first_pi].i];
      const float first_di = sort_i[first_pi].d +
James Willis's avatar
James Willis committed
466
467
                             max(pi->h, hj_max_raw) * kernel_gamma + dx_max -
                             rshift;
468

469
470
471
      /* Loop through particles in cell j until they are not in range of pi.
       * Make sure that temp stays between 0 and cj->count - 1.*/
      while (temp < cj->count - 1 && first_di > sort_j[temp].d) temp++;
472

473
474
475
476
477
478
479
      max_index_i[first_pi] = temp;

      /* Populate max_index_i for remaining particles that are within range. */
      for (int i = first_pi + 1; i < ci->count; i++) {
        temp = max_index_i[i - 1];
        pi = &parts_i[sort_i[i].i];

James Willis's avatar
James Willis committed
480
481
        const float di = sort_i[i].d + max(pi->h, hj_max_raw) * kernel_gamma +
                         dx_max - rshift;
James Willis's avatar
James Willis committed
482

483
484
        /* Make sure that temp stays between 0 and cj->count - 1.*/
        while (temp < cj->count - 1 && di > sort_j[temp].d) temp++;
James Willis's avatar
James Willis committed
485

486
487
488
489
490
        max_index_i[i] = temp;
      }
    } else {
      /* Make sure that max index is set to first particle in cj.*/
      max_index_i[ci->count - 1] = 0;
491
    }
James Willis's avatar
James Willis committed
492
493
494
  } else {
    /* Make sure that foreign cells are only read into the cache if the local
     * cell requires it.
495
     * Also ensure that it does not require any particles from cj. */
James Willis's avatar
James Willis committed
496
    first_pi = ci->count - 1;
497
    max_index_i[ci->count - 1] = 0;
498
499
  }

James Willis's avatar
James Willis committed
500
501
  /* Only populate max_index array for local actve cells. */
  if (active_cj) {
502
503
504
505
506
    /* Find the rightmost active particle in cell j that interacts with any
     * particle in cell i. */
    last_pj = -1;
    active_id = last_pj;
    while (last_pj < cj->count &&
507
           sort_j[last_pj + 1].d - h_max - dx_max < di_max) {
508
509
510
511
512
      last_pj++;
      /* Store the index of the particle if it is active. */
      if (part_is_active_no_debug(&parts_j[sort_j[last_pj].i], max_active_bin))
        active_id = last_pj;
    }
James Willis's avatar
James Willis committed
513

514
515
    /* Set the last active pj in range of any particle in cell i. */
    last_pj = active_id;
Matthieu Schaller's avatar
Matthieu Schaller committed
516

James Willis's avatar
James Willis committed
517
518
    /* Find the maximum index into cell i for each particle in range in cell j.
     */
519
    if (last_pj >= 0) {
Matthieu Schaller's avatar
Matthieu Schaller committed
520

521
522
      /* Start from the last particle in cell i. */
      temp = ci->count - 1;
523

524
525
      const struct part *pj = &parts_j[sort_j[last_pj].i];
      const float last_dj = sort_j[last_pj].d - dx_max -
James Willis's avatar
James Willis committed
526
                            max(pj->h, hi_max_raw) * kernel_gamma + rshift;
527

528
529
      /* Loop through particles in cell i until they are not in range of pj. */
      while (temp > 0 && last_dj < sort_i[temp].d) temp--;
James Willis's avatar
James Willis committed
530

531
      max_index_j[last_pj] = temp;
532

533
534
535
536
      /* Populate max_index_j for remaining particles that are within range. */
      for (int i = last_pj - 1; i >= 0; i--) {
        temp = max_index_j[i + 1];
        pj = &parts_j[sort_j[i].i];
537

538
        const float dj = sort_j[i].d - dx_max -
James Willis's avatar
James Willis committed
539
                         (max(pj->h, hi_max_raw) * kernel_gamma) + rshift;
James Willis's avatar
James Willis committed
540

541
        while (temp > 0 && dj < sort_i[temp].d) temp--;
James Willis's avatar
James Willis committed
542

543
544
545
546
547
        max_index_j[i] = temp;
      }
    } else {
      /* Make sure that max index is set to last particle in ci.*/
      max_index_j[0] = ci->count - 1;
548
    }
James Willis's avatar
James Willis committed
549
550
551
  } else {
    /* Make sure that foreign cells are only read into the cache if the local
     * cell requires it.
552
553
     * Also ensure that it does not require any particles from ci. */
    last_pj = 0;
554
    max_index_j[0] = ci->count - 1;
555
556
  }

James Willis's avatar
James Willis committed
557
558
  *init_pi = first_pi;
  *init_pj = last_pj;
559
}
560

561
#endif /* WITH_VECTORIZATION && GADGET2_SPH */
562
563

/**
James Willis's avatar
James Willis committed
564
565
 * @brief Compute the cell self-interaction (non-symmetric) using vector
 * intrinsics with one particle pi at a time.
566
567
568
569
 *
 * @param r The #runner.
 * @param c The #cell.
 */
James Willis's avatar
James Willis committed
570
571
__attribute__((always_inline)) INLINE void runner_doself1_density_vec(
    struct runner *r, struct cell *restrict c) {
572

573
#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH)
574

Matthieu Schaller's avatar
Matthieu Schaller committed
575
576
577
  /* Get some local variables */
  const struct engine *e = r->e;
  const timebin_t max_active_bin = e->max_active_bin;
578
579
  struct part *restrict parts = c->parts;
  const int count = c->count;
James Willis's avatar
James Willis committed
580

Matthieu Schaller's avatar
Matthieu Schaller committed
581
  TIMER_TIC;
582

Matthieu Schaller's avatar
Matthieu Schaller committed
583
  /* Anything to do here? */
584
  if (!cell_is_active_hydro(c, e)) return;
585

586
  if (!cell_are_part_drifted(c, e)) error("Interacting undrifted cell.");
587

588
589
590
591
592
593
594
595
#ifdef SWIFT_DEBUG_CHECKS
  for (int i = 0; i < count; i++) {
    /* Check that particles have been drifted to the current time */
    if (parts[i].ti_drift != e->ti_current)
      error("Particle pi not drifted to current time");
  }
#endif

James Willis's avatar
James Willis committed
596
  /* Get the particle cache from the runner and re-allocate
597
   * the cache if it is not big enough for the cell. */
598
  struct cache *restrict cell_cache = &r->ci_cache;
599
  if (cell_cache->count < count) cache_init(cell_cache, count);
600

James Willis's avatar
James Willis committed
601
  /* Read the particles from the cell and store them locally in the cache. */
James Willis's avatar
James Willis committed
602
  cache_read_particles(c, cell_cache);
603
604
605

  /* Create secondary cache to store particle interactions. */
  struct c2_cache int_cache;
606
607
608
609
610

  /* Loop over the particles in the cell. */
  for (int pid = 0; pid < count; pid++) {

    /* Get a pointer to the ith particle. */
Matthieu Schaller's avatar
Matthieu Schaller committed
611
    struct part *restrict pi = &parts[pid];
612
613

    /* Is the ith particle active? */
614
    if (!part_is_active_no_debug(pi, max_active_bin)) continue;
615
616
617

    const float hi = cell_cache->h[pid];

James Willis's avatar
James Willis committed
618
    /* Fill particle pi vectors. */
619
620
621
622
623
624
625
    const vector v_pix = vector_set1(cell_cache->x[pid]);
    const vector v_piy = vector_set1(cell_cache->y[pid]);
    const vector v_piz = vector_set1(cell_cache->z[pid]);
    const vector v_hi = vector_set1(hi);
    const vector v_vix = vector_set1(cell_cache->vx[pid]);
    const vector v_viy = vector_set1(cell_cache->vy[pid]);
    const vector v_viz = vector_set1(cell_cache->vz[pid]);
626
627

    const float hig2 = hi * hi * kernel_gamma2;
628
    const vector v_hig2 = vector_set1(hig2);
James Willis's avatar
James Willis committed
629

James Willis's avatar
James Willis committed
630
    /* Get the inverse of hi. */
631
    vector v_hi_inv = vec_reciprocal(v_hi);
James Willis's avatar
James Willis committed
632

633
634
635
636
637
638
639
640
641
    /* Reset cumulative sums of update vectors. */
    vector v_rhoSum = vector_setzero();
    vector v_rho_dhSum = vector_setzero();
    vector v_wcountSum = vector_setzero();
    vector v_wcount_dhSum = vector_setzero();
    vector v_div_vSum = vector_setzero();
    vector v_curlvxSum = vector_setzero();
    vector v_curlvySum = vector_setzero();
    vector v_curlvzSum = vector_setzero();
642
643

    /* Pad cache if there is a serial remainder. */
Matthieu Schaller's avatar
Matthieu Schaller committed
644
    int count_align = count;
James Willis's avatar
James Willis committed
645
    const int rem = count % (NUM_VEC_PROC * VEC_SIZE);
646
    if (rem != 0) {
James Willis's avatar
James Willis committed
647
      count_align += (NUM_VEC_PROC * VEC_SIZE) - rem;
648
649
650
651

      /* Set positions to the same as particle pi so when the r2 > 0 mask is
       * applied these extra contributions are masked out.*/
      for (int i = count; i < count_align; i++) {
James Willis's avatar
James Willis committed
652
653
654
        cell_cache->x[i] = v_pix.f[0];
        cell_cache->y[i] = v_piy.f[0];
        cell_cache->z[i] = v_piz.f[0];
655
      }
656
657
    }

James Willis's avatar
James Willis committed
658
    /* The number of interactions for pi and the padded version of it to
659
     * make it a multiple of VEC_SIZE. */
660
661
    int icount = 0, icount_align = 0;

James Willis's avatar
James Willis committed
662
663
    /* Find all of particle pi's interacions and store needed values in the
     * secondary cache.*/
James Willis's avatar
James Willis committed
664
    for (int pjd = 0; pjd < count_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
665
666

      /* Load 2 sets of vectors from the particle cache. */
667
668
669
      const vector v_pjx = vector_load(&cell_cache->x[pjd]);
      const vector v_pjy = vector_load(&cell_cache->y[pjd]);
      const vector v_pjz = vector_load(&cell_cache->z[pjd]);
670

671
672
673
      const vector v_pjx2 = vector_load(&cell_cache->x[pjd + VEC_SIZE]);
      const vector v_pjy2 = vector_load(&cell_cache->y[pjd + VEC_SIZE]);
      const vector v_pjz2 = vector_load(&cell_cache->z[pjd + VEC_SIZE]);
James Willis's avatar
James Willis committed
674

675
      /* Compute the pairwise distance. */
James Willis's avatar
James Willis committed
676
      vector v_dx, v_dy, v_dz, v_r2;
677
678
      vector v_dx_2, v_dy_2, v_dz_2, v_r2_2;

James Willis's avatar
James Willis committed
679
680
681
682
683
684
      v_dx.v = vec_sub(v_pix.v, v_pjx.v);
      v_dx_2.v = vec_sub(v_pix.v, v_pjx2.v);
      v_dy.v = vec_sub(v_piy.v, v_pjy.v);
      v_dy_2.v = vec_sub(v_piy.v, v_pjy2.v);
      v_dz.v = vec_sub(v_piz.v, v_pjz.v);
      v_dz_2.v = vec_sub(v_piz.v, v_pjz2.v);
685
686
687
688
689
690
691

      v_r2.v = vec_mul(v_dx.v, v_dx.v);
      v_r2_2.v = vec_mul(v_dx_2.v, v_dx_2.v);
      v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
      v_r2_2.v = vec_fma(v_dy_2.v, v_dy_2.v, v_r2_2.v);
      v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
      v_r2_2.v = vec_fma(v_dz_2.v, v_dz_2.v, v_r2_2.v);
James Willis's avatar
James Willis committed
692

693
      /* Form a mask from r2 < hig2 and r2 > 0.*/
James Willis's avatar
James Willis committed
694
695
      mask_t v_doi_mask, v_doi_mask_self_check, v_doi_mask2,
          v_doi_mask2_self_check;
696

James Willis's avatar
James Willis committed
697
      /* Form r2 > 0 mask and r2 < hig2 mask. */
698
      vec_create_mask(v_doi_mask_self_check, vec_cmp_gt(v_r2.v, vec_setzero()));
699
      vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v));
700

James Willis's avatar
James Willis committed
701
      /* Form r2 > 0 mask and r2 < hig2 mask. */
James Willis's avatar
James Willis committed
702
703
      vec_create_mask(v_doi_mask2_self_check,
                      vec_cmp_gt(v_r2_2.v, vec_setzero()));
704
      vec_create_mask(v_doi_mask2, vec_cmp_lt(v_r2_2.v, v_hig2.v));
705

James Willis's avatar
James Willis committed
706
      /* Combine two masks and form integer masks. */
707
708
709
710
      const int doi_mask = vec_is_mask_true(v_doi_mask) &
                           vec_is_mask_true(v_doi_mask_self_check);
      const int doi_mask2 = vec_is_mask_true(v_doi_mask2) &
                            vec_is_mask_true(v_doi_mask2_self_check);
711

712
#ifdef DEBUG_INTERACTIONS_SPH
713
      for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) {
714
715
716
717
        if (doi_mask & (1 << bit_index)) {
          if (pi->num_ngb_density < MAX_NUM_OF_NEIGHBOURS)
            pi->ids_ngbs_density[pi->num_ngb_density] =
                parts[pjd + bit_index].id;
James Willis's avatar
James Willis committed
718
          ++pi->num_ngb_density;
719
        }
720

721
722
723
724
        if (doi_mask2 & (1 << bit_index)) {
          if (pi->num_ngb_density < MAX_NUM_OF_NEIGHBOURS)
            pi->ids_ngbs_density[pi->num_ngb_density] =
                parts[pjd + VEC_SIZE + bit_index].id;
James Willis's avatar
James Willis committed
725
          ++pi->num_ngb_density;
726
727
728
729
        }
      }
#endif

James Willis's avatar
James Willis committed
730
731
      /* If there are any interactions left pack interaction values into c2
       * cache. */
732
      if (doi_mask) {
James Willis's avatar
James Willis committed
733
        storeInteractions(doi_mask, pjd, &v_r2, &v_dx, &v_dy, &v_dz, cell_cache,
734
735
736
737
                          &int_cache, &icount, &v_rhoSum, &v_rho_dhSum,
                          &v_wcountSum, &v_wcount_dhSum, &v_div_vSum,
                          &v_curlvxSum, &v_curlvySum, &v_curlvzSum, v_hi_inv,
                          v_vix, v_viy, v_viz);
738
      }
James Willis's avatar
James Willis committed
739
740
      if (doi_mask2) {
        storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_2, &v_dy_2,
James Willis's avatar
James Willis committed
741
                          &v_dz_2, cell_cache, &int_cache, &icount, &v_rhoSum,
742
743
744
                          &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum,
                          &v_div_vSum, &v_curlvxSum, &v_curlvySum, &v_curlvzSum,
                          v_hi_inv, v_vix, v_viy, v_viz);
James Willis's avatar
James Willis committed
745
      }
746
747
    }

James Willis's avatar
James Willis committed
748
    /* Perform padded vector remainder interactions if any are present. */
749
750
751
752
    calcRemInteractions(&int_cache, icount, &v_rhoSum, &v_rho_dhSum,
                        &v_wcountSum, &v_wcount_dhSum, &v_div_vSum,
                        &v_curlvxSum, &v_curlvySum, &v_curlvzSum, v_hi_inv,
                        v_vix, v_viy, v_viz, &icount_align);
James Willis's avatar
James Willis committed
753
754
755

    /* Initialise masks to true in case remainder interactions have been
     * performed. */
756
    mask_t int_mask, int_mask2;
757
758
    vec_init_mask_true(int_mask);
    vec_init_mask_true(int_mask2);
759
760

    /* Perform interaction with 2 vectors. */
James Willis's avatar
James Willis committed
761
    for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
James Willis's avatar
James Willis committed
762
763
764
765
      runner_iact_nonsym_2_vec_density(
          &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd],
          &int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz,
          &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd],
766
767
768
          &int_cache.mq[pjd], &v_rhoSum, &v_rho_dhSum, &v_wcountSum,
          &v_wcount_dhSum, &v_div_vSum, &v_curlvxSum, &v_curlvySum,
          &v_curlvzSum, int_mask, int_mask2, 0);
769
770
    }

James Willis's avatar
James Willis committed
771
772
    /* Perform horizontal adds on vector sums and store result in particle pi.
     */
James Willis's avatar
James Willis committed
773
774
775
776
777
778
779
780
    VEC_HADD(v_rhoSum, pi->rho);
    VEC_HADD(v_rho_dhSum, pi->density.rho_dh);
    VEC_HADD(v_wcountSum, pi->density.wcount);
    VEC_HADD(v_wcount_dhSum, pi->density.wcount_dh);
    VEC_HADD(v_div_vSum, pi->density.div_v);
    VEC_HADD(v_curlvxSum, pi->density.rot_v[0]);
    VEC_HADD(v_curlvySum, pi->density.rot_v[1]);
    VEC_HADD(v_curlvzSum, pi->density.rot_v[2]);
781
782
783
784
785

    /* Reset interaction count. */
    icount = 0;
  } /* loop over all particles. */

James Willis's avatar
James Willis committed
786
  TIMER_TOC(timer_doself_density);
787
788
789
790
791

#else

  error("Incorrectly calling vectorized Gadget-2 functions!");

792
#endif /* WITH_VECTORIZATION */
793
794
}

795
796
797
798
799
800
801
802
803
804
/**
 * @brief Compute the interactions between a cell pair, but only for the
 *      given indices in ci. (Vectorised)
 *
 * @param r The #runner.
 * @param c The first #cell.
 * @param parts The #part to interact.
 * @param ind The list of indices of particles in @c c to interact with.
 * @param pi_count The number of particles in @c ind.
 */
805
__attribute__((always_inline)) INLINE void runner_doself_subset_density_vec(
James Willis's avatar
James Willis committed
806
807
    struct runner *r, struct cell *restrict c, struct part *restrict parts,
    int *restrict ind, int pi_count) {
808

809
#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH)
810
811
812

  const int count = c->count;

813
  TIMER_TIC;
814
815
816
817
818

  /* Get the particle cache from the runner and re-allocate
   * the cache if it is not big enough for the cell. */
  struct cache *restrict cell_cache = &r->ci_cache;

819
  if (cell_cache->count < count) cache_init(cell_cache, count);
820
821
822
823
824
825
826

  /* Read the particles from the cell and store them locally in the cache. */
  cache_read_particles(c, cell_cache);

  /* Create secondary cache to store particle interactions. */
  struct c2_cache int_cache;

827
  /* Loop over the subset of particles in the parts that need updating. */
828
829
830
  for (int pid = 0; pid < pi_count; pid++) {

    /* Get a pointer to the ith particle. */
Matthieu Schaller's avatar
Matthieu Schaller committed
831
    struct part *pi = &parts[ind[pid]];
832

833
834
835
836
#ifdef SWIFT_DEBUG_CHECKS
    const struct engine *e = r->e;
    if (!part_is_active(pi, e)) error("Inactive particle in subset function!");
#endif
837

838
    const float hi = pi->h;
839
840

    /* Fill particle pi vectors. */
James Willis's avatar
James Willis committed
841
842
843
844
845
846
847
    const vector v_pix = vector_set1(pi->x[0] - c->loc[0]);
    const vector v_piy = vector_set1(pi->x[1] - c->loc[1]);
    const vector v_piz = vector_set1(pi->x[2] - c->loc[2]);
    const vector v_hi = vector_set1(hi);
    const vector v_vix = vector_set1(pi->v[0]);
    const vector v_viy = vector_set1(pi->v[1]);
    const vector v_viz = vector_set1(pi->v[2]);
848
849

    const float hig2 = hi * hi * kernel_gamma2;
James Willis's avatar
James Willis committed
850
    const vector v_hig2 = vector_set1(hig2);
851
852

    /* Get the inverse of hi. */
James Willis's avatar
James Willis committed
853
    vector v_hi_inv = vec_reciprocal(v_hi);
854

James Willis's avatar
James Willis committed
855
856
857
858
859
860
861
862
863
    /* Reset cumulative sums of update vectors. */
    vector v_rhoSum = vector_setzero();
    vector v_rho_dhSum = vector_setzero();
    vector v_wcountSum = vector_setzero();
    vector v_wcount_dhSum = vector_setzero();
    vector v_div_vSum = vector_setzero();
    vector v_curlvxSum = vector_setzero();
    vector v_curlvySum = vector_setzero();
    vector v_curlvzSum = vector_setzero();
864
865

    /* Pad cache if there is a serial remainder. */
James Willis's avatar
James Willis committed
866
    int count_align = count;
Matthieu Schaller's avatar
Matthieu Schaller committed
867
    const int rem = count % (NUM_VEC_PROC * VEC_SIZE);
868
    if (rem != 0) {
Matthieu Schaller's avatar
Matthieu Schaller committed
869
      const int pad = (NUM_VEC_PROC * VEC_SIZE) - rem;
870
871
872
873
874
875

      count_align += pad;

      /* Set positions to the same as particle pi so when the r2 > 0 mask is
       * applied these extra contributions are masked out.*/
      for (int i = count; i < count_align; i++) {
James Willis's avatar
James Willis committed
876
877
878
        cell_cache->x[i] = v_pix.f[0];
        cell_cache->y[i] = v_piy.f[0];
        cell_cache->z[i] = v_piz.f[0];
879
880
881
      }
    }

James Willis's avatar
James Willis committed
882
    /* The number of interactions for pi and the padded version of it to
883
     * make it a multiple of VEC_SIZE. */
884
    int icount = 0, icount_align = 0;
885
886
887

    /* Find all of particle pi's interacions and store needed values in the
     * secondary cache.*/
James Willis's avatar
James Willis committed
888
    for (int pjd = 0; pjd < count_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) {
889
890

      /* Load 2 sets of vectors from the particle cache. */
James Willis's avatar
James Willis committed
891
892
893
      const vector v_pjx = vector_load(&cell_cache->x[pjd]);
      const vector v_pjy = vector_load(&cell_cache->y[pjd]);
      const vector v_pjz = vector_load(&cell_cache->z[pjd]);
894

James Willis's avatar
James Willis committed
895
896
897
      const vector v_pjx2 = vector_load(&cell_cache->x[pjd + VEC_SIZE]);
      const vector v_pjy2 = vector_load(&cell_cache->y[pjd + VEC_SIZE]);
      const vector v_pjz2 = vector_load(&cell_cache->z[pjd + VEC_SIZE]);
898
899

      /* Compute the pairwise distance. */
James Willis's avatar
James Willis committed
900
      vector v_dx, v_dy, v_dz, v_r2;
901
902
      vector v_dx_2, v_dy_2, v_dz_2, v_r2_2;

Matthieu Schaller's avatar
Matthieu Schaller committed
903
      /* p_i - p_j */
James Willis's avatar
James Willis committed
904
905
906
907
908
909
      v_dx.v = vec_sub(v_pix.v, v_pjx.v);
      v_dx_2.v = vec_sub(v_pix.v, v_pjx2.v);
      v_dy.v = vec_sub(v_piy.v, v_pjy.v);
      v_dy_2.v = vec_sub(v_piy.v, v_pjy2.v);
      v_dz.v = vec_sub(v_piz.v, v_pjz.v);
      v_dz_2.v = vec_sub(v_piz.v, v_pjz2.v);
910

Matthieu Schaller's avatar
Matthieu Schaller committed
911
      /* r2 = dx^2 + dy^2 + dz^2 */
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
      v_r2.v = vec_mul(v_dx.v, v_dx.v);
      v_r2_2.v = vec_mul(v_dx_2.v, v_dx_2.v);
      v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
      v_r2_2.v = vec_fma(v_dy_2.v, v_dy_2.v, v_r2_2.v);
      v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
      v_r2_2.v = vec_fma(v_dz_2.v, v_dz_2.v, v_r2_2.v);

      /* Form a mask from r2 < hig2 and r2 > 0.*/
      mask_t v_doi_mask, v_doi_mask_self_check, v_doi_mask2,
          v_doi_mask2_self_check;

      /* Form r2 > 0 mask and r2 < hig2 mask. */
      vec_create_mask(v_doi_mask_self_check, vec_cmp_gt(v_r2.v, vec_setzero()));
      vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v));

      /* Form r2 > 0 mask and r2 < hig2 mask. */
      vec_create_mask(v_doi_mask2_self_check,
                      vec_cmp_gt(v_r2_2.v, vec_setzero()));
      vec_create_mask(v_doi_mask2, vec_cmp_lt(v_r2_2.v, v_hig2.v));

James Willis's avatar
James Willis committed
932
933
934
935
936
      /* Combine two masks and form integer masks. */
      const int doi_mask = vec_is_mask_true(v_doi_mask) &
                           vec_is_mask_true(v_doi_mask_self_check);
      const int doi_mask2 = vec_is_mask_true(v_doi_mask2) &
                            vec_is_mask_true(v_doi_mask2_self_check);
James Willis's avatar
James Willis committed
937

938
#ifdef DEBUG_INTERACTIONS_SPH
939
      struct part *restrict parts_i = c->parts;
940
      for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) {
941
        if (doi_mask & (1 << bit_index)) {
942
943
          if (pi->num_ngb_density < MAX_NUM_OF_NEIGHBOURS)
            pi->ids_ngbs_density[pi->num_ngb_density] =
944
                parts_i[pjd + bit_index].id;
James Willis's avatar
James Willis committed
945
          ++pi->num_ngb_density;