/*******************************************************************************
 * This file is part of SWIFT.
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
 *               2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/

/* Before including this file, define FUNCTION, which is the
   name of the interaction function. The macros below append FUNCTION to the
   generic routine names, producing e.g. runner_dopair1_FUNCTION,
   runner_dopair1_naive_FUNCTION, runner_doself1_FUNCTION and
   runner_dosub_pair1_FUNCTION, which call the pairwise interaction function
   runner_iact_FUNCTION (or its non-symmetric variant
   runner_iact_nonsym_FUNCTION). */

27
/* Glue two tokens together with an underscore in between. The arguments are
   NOT macro-expanded by PASTE itself (they are operands of ##), which is why
   every name below goes through a one-argument helper (_NAME(f)) first: the
   helper expands FUNCTION before PASTE sees it. */
#define PASTE(x, y) x##_##y

/* Pair interaction routines. */
#define _DOPAIR1_BRANCH(f) PASTE(runner_dopair1_branch, f)
#define DOPAIR1_BRANCH _DOPAIR1_BRANCH(FUNCTION)

#define _DOPAIR1(f) PASTE(runner_dopair1, f)
#define DOPAIR1 _DOPAIR1(FUNCTION)

#define _DOPAIR2(f) PASTE(runner_dopair2, f)
#define DOPAIR2 _DOPAIR2(FUNCTION)

#define _DOPAIR_SUBSET(f) PASTE(runner_dopair_subset, f)
#define DOPAIR_SUBSET _DOPAIR_SUBSET(FUNCTION)

#define _DOPAIR_SUBSET_NOSORT(f) PASTE(runner_dopair_subset_nosort, f)
#define DOPAIR_SUBSET_NOSORT _DOPAIR_SUBSET_NOSORT(FUNCTION)

#define _DOPAIR_SUBSET_NAIVE(f) PASTE(runner_dopair_subset_naive, f)
#define DOPAIR_SUBSET_NAIVE _DOPAIR_SUBSET_NAIVE(FUNCTION)

#define _DOPAIR1_NAIVE(f) PASTE(runner_dopair1_naive, f)
#define DOPAIR1_NAIVE _DOPAIR1_NAIVE(FUNCTION)

#define _DOPAIR2_NAIVE(f) PASTE(runner_dopair2_naive, f)
#define DOPAIR2_NAIVE _DOPAIR2_NAIVE(FUNCTION)

/* Self interaction routines. */
#define _DOSELF2_NAIVE(f) PASTE(runner_doself2_naive, f)
#define DOSELF2_NAIVE _DOSELF2_NAIVE(FUNCTION)

#define _DOSELF1(f) PASTE(runner_doself1, f)
#define DOSELF1 _DOSELF1(FUNCTION)

#define _DOSELF2(f) PASTE(runner_doself2, f)
#define DOSELF2 _DOSELF2(FUNCTION)

#define _DOSELF_SUBSET(f) PASTE(runner_doself_subset, f)
#define DOSELF_SUBSET _DOSELF_SUBSET(FUNCTION)

/* Sub-cell (dosub) routines. */
#define _DOSUB_SELF1(f) PASTE(runner_dosub_self1, f)
#define DOSUB_SELF1 _DOSUB_SELF1(FUNCTION)

#define _DOSUB_PAIR1(f) PASTE(runner_dosub_pair1, f)
#define DOSUB_PAIR1 _DOSUB_PAIR1(FUNCTION)

#define _DOSUB_SELF2(f) PASTE(runner_dosub_self2, f)
#define DOSUB_SELF2 _DOSUB_SELF2(FUNCTION)

#define _DOSUB_PAIR2(f) PASTE(runner_dosub_pair2, f)
#define DOSUB_PAIR2 _DOSUB_PAIR2(FUNCTION)

#define _DOSUB_SUBSET(f) PASTE(runner_dosub_subset, f)
#define DOSUB_SUBSET _DOSUB_SUBSET(FUNCTION)

/* Particle-particle interaction kernels (scalar and vector forms). */
#define _IACT_NONSYM(f) PASTE(runner_iact_nonsym, f)
#define IACT_NONSYM _IACT_NONSYM(FUNCTION)

#define _IACT(f) PASTE(runner_iact, f)
#define IACT _IACT(FUNCTION)

#define _IACT_NONSYM_VEC(f) PASTE(runner_iact_nonsym_vec, f)
#define IACT_NONSYM_VEC _IACT_NONSYM_VEC(FUNCTION)

#define _IACT_VEC(f) PASTE(runner_iact_vec, f)
#define IACT_VEC _IACT_VEC(FUNCTION)

/* Timer identifiers. */
#define _TIMER_DOSELF(f) PASTE(timer_doself, f)
#define TIMER_DOSELF _TIMER_DOSELF(FUNCTION)

#define _TIMER_DOPAIR(f) PASTE(timer_dopair, f)
#define TIMER_DOPAIR _TIMER_DOPAIR(FUNCTION)

#define _TIMER_DOSUB_SELF(f) PASTE(timer_dosub_self, f)
#define TIMER_DOSUB_SELF _TIMER_DOSUB_SELF(FUNCTION)

#define _TIMER_DOSUB_PAIR(f) PASTE(timer_dosub_pair, f)
#define TIMER_DOSUB_PAIR _TIMER_DOSUB_PAIR(FUNCTION)

#define _TIMER_DOSELF_SUBSET(f) PASTE(timer_doself_subset, f)
#define TIMER_DOSELF_SUBSET _TIMER_DOSELF_SUBSET(FUNCTION)

#define _TIMER_DOPAIR_SUBSET(f) PASTE(timer_dopair_subset, f)
#define TIMER_DOPAIR_SUBSET _TIMER_DOPAIR_SUBSET(FUNCTION)

110
/**
Matthieu Schaller's avatar
Matthieu Schaller committed
111
112
113
 * @brief Compute the interactions between a cell pair (non-symmetric case).
 *
 * Inefficient version using a brute-force algorithm.
114
115
116
117
118
 *
 * @param r The #runner.
 * @param ci The first #cell.
 * @param cj The second #cell.
 */
119
void DOPAIR1_NAIVE(struct runner *r, struct cell *restrict ci,
120
                   struct cell *restrict cj) {
121
122
123
124

  const struct engine *e = r->e;

#ifndef SWIFT_DEBUG_CHECKS
125
  error("Don't use in actual runs ! Slow code !");
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#endif

#ifdef WITH_VECTORIZATION
  int icount = 0;
  float r2q[VEC_SIZE] __attribute__((aligned(16)));
  float hiq[VEC_SIZE] __attribute__((aligned(16)));
  float hjq[VEC_SIZE] __attribute__((aligned(16)));
  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
  TIMER_TIC;

  /* Anything to do here? */
  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;

  const int count_i = ci->count;
  const int count_j = cj->count;
  struct part *restrict parts_i = ci->parts;
  struct part *restrict parts_j = cj->parts;

  /* Get the relative distance between the pairs, wrapping. */
  double shift[3] = {0.0, 0.0, 0.0};
  for (int k = 0; k < 3; k++) {
    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
      shift[k] = e->s->dim[k];
    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
      shift[k] = -e->s->dim[k];
  }

  /* Loop over the parts in ci. */
  for (int pid = 0; pid < count_i; pid++) {

    /* Get a hold of the ith part in ci. */
    struct part *restrict pi = &parts_i[pid];
    const float hi = pi->h;

    double pix[3];
    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
    const float hig2 = hi * hi * kernel_gamma2;

    /* Loop over the parts in cj. */
    for (int pjd = 0; pjd < count_j; pjd++) {

      /* Get a pointer to the jth particle. */
      struct part *restrict pj = &parts_j[pjd];

      /* Compute the pairwise distance. */
      float r2 = 0.0f;
      float dx[3];
      for (int k = 0; k < 3; k++) {
        dx[k] = pix[k] - pj->x[k];
        r2 += dx[k] * dx[k];
      }

      /* Hit or miss? */
      if (r2 < hig2) {

#ifndef WITH_VECTORIZATION

        IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);

#else

        /* Add this interaction to the queue. */
        r2q[icount] = r2;
        dxq[3 * icount + 0] = dx[0];
        dxq[3 * icount + 1] = dx[1];
        dxq[3 * icount + 2] = dx[2];
        hiq[icount] = hi;
        hjq[icount] = pj->h;
        piq[icount] = pi;
        pjq[icount] = pj;
        icount += 1;

        /* Flush? */
        if (icount == VEC_SIZE) {
          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
          icount = 0;
        }

#endif
      }
      if (r2 < pj->h * pj->h * kernel_gamma2) {

#ifndef WITH_VECTORIZATION
211

212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
        for (int k = 0; k < 3; k++) dx[k] = -dx[k];
        IACT_NONSYM(r2, dx, pj->h, hi, pj, pi);

#else

        /* Add this interaction to the queue. */
        r2q[icount] = r2;
        dxq[3 * icount + 0] = -dx[0];
        dxq[3 * icount + 1] = -dx[1];
        dxq[3 * icount + 2] = -dx[2];
        hiq[icount] = pj->h;
        hjq[icount] = hi;
        piq[icount] = pj;
        pjq[icount] = pi;
        icount += 1;

        /* Flush? */
        if (icount == VEC_SIZE) {
          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
          icount = 0;
        }

#endif
      }

    } /* loop over the parts in cj. */

  } /* loop over the parts in ci. */

#ifdef WITH_VECTORIZATION
  /* Pick up any leftovers. */
  if (icount > 0)
    for (int k = 0; k < icount; k++)
      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
#endif

  TIMER_TOC(TIMER_DOPAIR);
}

Matthieu Schaller's avatar
Matthieu Schaller committed
251
252
253
254
255
256
257
258
259
/**
 * @brief Compute the interactions between a cell pair (symmetric case).
 *
 * Inefficient version using a brute-force algorithm.
 *
 * @param r The #runner.
 * @param ci The first #cell.
 * @param cj The second #cell.
 */
260
void DOPAIR2_NAIVE(struct runner *r, struct cell *restrict ci,
261
                   struct cell *restrict cj) {
262

263
264
  const struct engine *e = r->e;

265
#ifndef SWIFT_DEBUG_CHECKS
266
  error("Don't use in actual runs ! Slow code !");
267
#endif
268

269
#ifdef WITH_OLD_VECTORIZATION
270
271
272
273
274
275
276
  int icount = 0;
  float r2q[VEC_SIZE] __attribute__((aligned(16)));
  float hiq[VEC_SIZE] __attribute__((aligned(16)));
  float hjq[VEC_SIZE] __attribute__((aligned(16)));
  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
Matthieu Schaller's avatar
Matthieu Schaller committed
277
  TIMER_TIC;
278
279

  /* Anything to do here? */
280
  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
281

282
283
284
285
286
  const int count_i = ci->count;
  const int count_j = cj->count;
  struct part *restrict parts_i = ci->parts;
  struct part *restrict parts_j = cj->parts;

287
  /* Get the relative distance between the pairs, wrapping. */
288
289
  double shift[3] = {0.0, 0.0, 0.0};
  for (int k = 0; k < 3; k++) {
290
291
292
293
294
295
296
    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
      shift[k] = e->s->dim[k];
    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
      shift[k] = -e->s->dim[k];
  }

  /* Loop over the parts in ci. */
297
  for (int pid = 0; pid < count_i; pid++) {
298
299

    /* Get a hold of the ith part in ci. */
300
301
302
303
304
305
    struct part *restrict pi = &parts_i[pid];
    const float hi = pi->h;

    double pix[3];
    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
    const float hig2 = hi * hi * kernel_gamma2;
306
307

    /* Loop over the parts in cj. */
308
    for (int pjd = 0; pjd < count_j; pjd++) {
309
310

      /* Get a pointer to the jth particle. */
311
      struct part *restrict pj = &parts_j[pjd];
312
313

      /* Compute the pairwise distance. */
314
315
316
      float r2 = 0.0f;
      float dx[3];
      for (int k = 0; k < 3; k++) {
317
318
319
320
321
322
323
        dx[k] = pix[k] - pj->x[k];
        r2 += dx[k] * dx[k];
      }

      /* Hit or miss? */
      if (r2 < hig2 || r2 < pj->h * pj->h * kernel_gamma2) {

324
#ifndef WITH_OLD_VECTORIZATION
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344

        IACT(r2, dx, hi, pj->h, pi, pj);

#else

        /* Add this interaction to the queue. */
        r2q[icount] = r2;
        dxq[3 * icount + 0] = dx[0];
        dxq[3 * icount + 1] = dx[1];
        dxq[3 * icount + 2] = dx[2];
        hiq[icount] = hi;
        hjq[icount] = pj->h;
        piq[icount] = pi;
        pjq[icount] = pj;
        icount += 1;

        /* Flush? */
        if (icount == VEC_SIZE) {
          IACT_VEC(r2q, dxq, hiq, hjq, piq, pjq);
          icount = 0;
345
346
        }

347
348
349
350
351
352
353
#endif
      }

    } /* loop over the parts in cj. */

  } /* loop over the parts in ci. */

354
#ifdef WITH_OLD_VECTORIZATION
355
356
  /* Pick up any leftovers. */
  if (icount > 0)
357
    for (int k = 0; k < icount; k++)
358
359
360
361
362
363
      IACT(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
#endif

  TIMER_TOC(TIMER_DOPAIR);
}

Matthieu Schaller's avatar
Matthieu Schaller committed
364
365
366
367
368
369
370
371
372
/**
 * @brief Compute the interactions within a cell (symmetric case).
 *
 * Inefficient version using a brute-force algorithm.
 *
 * @param r The #runner.
 * @param c The #cell.
 */
void DOSELF2_NAIVE(struct runner *r, struct cell *restrict c) {
373

374
  const struct engine *e = r->e;
375

376
#ifndef SWIFT_DEBUG_CHECKS
377
  error("Don't use in actual runs ! Slow code !");
378
#endif
379

380
#ifdef WITH_OLD_VECTORIZATION
381
382
383
384
385
386
387
  int icount = 0;
  float r2q[VEC_SIZE] __attribute__((aligned(16)));
  float hiq[VEC_SIZE] __attribute__((aligned(16)));
  float hjq[VEC_SIZE] __attribute__((aligned(16)));
  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
388

Matthieu Schaller's avatar
Matthieu Schaller committed
389
  TIMER_TIC;
390
391

  /* Anything to do here? */
392
  if (!cell_is_active(c, e)) return;
393

394
395
  const int count = c->count;
  struct part *restrict parts = c->parts;
396
397

  /* Loop over the parts in ci. */
398
  for (int pid = 0; pid < count; pid++) {
399
400

    /* Get a hold of the ith part in ci. */
401
402
403
404
    struct part *restrict pi = &parts[pid];
    const double pix[3] = {pi->x[0], pi->x[1], pi->x[2]};
    const float hi = pi->h;
    const float hig2 = hi * hi * kernel_gamma2;
405

406
    /* Loop over the parts in cj. */
407
    for (int pjd = pid + 1; pjd < count; pjd++) {
408
409

      /* Get a pointer to the jth particle. */
410
      struct part *restrict pj = &parts[pjd];
411
412

      /* Compute the pairwise distance. */
413
414
415
      float r2 = 0.0f;
      float dx[3];
      for (int k = 0; k < 3; k++) {
416
417
418
419
420
421
422
        dx[k] = pix[k] - pj->x[k];
        r2 += dx[k] * dx[k];
      }

      /* Hit or miss? */
      if (r2 < hig2 || r2 < pj->h * pj->h * kernel_gamma2) {

423
#ifndef WITH_OLD_VECTORIZATION
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444

        IACT(r2, dx, hi, pj->h, pi, pj);

#else

        /* Add this interaction to the queue. */
        r2q[icount] = r2;
        dxq[3 * icount + 0] = dx[0];
        dxq[3 * icount + 1] = dx[1];
        dxq[3 * icount + 2] = dx[2];
        hiq[icount] = hi;
        hjq[icount] = pj->h;
        piq[icount] = pi;
        pjq[icount] = pj;
        icount += 1;

        /* Flush? */
        if (icount == VEC_SIZE) {
          IACT_VEC(r2q, dxq, hiq, hjq, piq, pjq);
          icount = 0;
        }
445

446
447
#endif
      }
448

449
    } /* loop over the parts in cj. */
450

451
452
  } /* loop over the parts in ci. */

453
#ifdef WITH_OLD_VECTORIZATION
454
455
  /* Pick up any leftovers. */
  if (icount > 0)
456
    for (int k = 0; k < icount; k++)
457
458
      IACT(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
#endif
459

460
461
  TIMER_TOC(TIMER_DOSELF);
}
462

463
464
465
466
/**
 * @brief Compute the interactions between a cell pair, but only for the
 *      given indices in ci.
 *
Matthieu Schaller's avatar
Matthieu Schaller committed
467
468
 * Version using a brute-force algorithm.
 *
469
470
 * @param r The #runner.
 * @param ci The first #cell.
471
 * @param parts_i The #part to interact with @c cj.
472
473
474
475
 * @param ind The list of indices of particles in @c ci to interact with.
 * @param count The number of particles in @c ind.
 * @param cj The second #cell.
 */
476
477
478
479
void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci,
                         struct part *restrict parts_i, int *restrict ind,
                         int count, struct cell *restrict cj) {

480
  const struct engine *e = r->e;
481

482
#ifdef WITH_OLD_VECTORIZATION
483
484
485
486
487
488
489
490
  int icount = 0;
  float r2q[VEC_SIZE] __attribute__((aligned(16)));
  float hiq[VEC_SIZE] __attribute__((aligned(16)));
  float hjq[VEC_SIZE] __attribute__((aligned(16)));
  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif

Matthieu Schaller's avatar
Matthieu Schaller committed
491
  TIMER_TIC;
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514

  const int count_j = cj->count;
  struct part *restrict parts_j = cj->parts;

  /* Get the relative distance between the pairs, wrapping. */
  double shift[3] = {0.0, 0.0, 0.0};
  for (int k = 0; k < 3; k++) {
    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
      shift[k] = e->s->dim[k];
    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
      shift[k] = -e->s->dim[k];
  }

  /* Loop over the parts_i. */
  for (int pid = 0; pid < count; pid++) {

    /* Get a hold of the ith part in ci. */
    struct part *restrict pi = &parts_i[ind[pid]];
    double pix[3];
    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
    const float hi = pi->h;
    const float hig2 = hi * hi * kernel_gamma2;

515
516
517
518
519
520
#ifdef SWIFT_DEBUG_CHECKS
    if (!part_is_active(pi, e))
      error("Trying to correct smoothing length of inactive particle !");

#endif

521
522
523
524
525
526
527
528
529
530
531
532
533
    /* Loop over the parts in cj. */
    for (int pjd = 0; pjd < count_j; pjd++) {

      /* Get a pointer to the jth particle. */
      struct part *restrict pj = &parts_j[pjd];

      /* Compute the pairwise distance. */
      float r2 = 0.0f;
      float dx[3];
      for (int k = 0; k < 3; k++) {
        dx[k] = pix[k] - pj->x[k];
        r2 += dx[k] * dx[k];
      }
534

535
536
537
538
539
540
541
542
#ifdef SWIFT_DEBUG_CHECKS
      /* Check that particles have been drifted to the current time */
      if (pi->ti_drift != e->ti_current)
        error("Particle pi not drifted to current time");
      if (pj->ti_drift != e->ti_current)
        error("Particle pj not drifted to current time");
#endif

543
544
545
      /* Hit or miss? */
      if (r2 < hig2) {

546
#ifndef WITH_OLD_VECTORIZATION
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574

        IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);

#else

        /* Add this interaction to the queue. */
        r2q[icount] = r2;
        dxq[3 * icount + 0] = dx[0];
        dxq[3 * icount + 1] = dx[1];
        dxq[3 * icount + 2] = dx[2];
        hiq[icount] = hi;
        hjq[icount] = pj->h;
        piq[icount] = pi;
        pjq[icount] = pj;
        icount += 1;

        /* Flush? */
        if (icount == VEC_SIZE) {
          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
          icount = 0;
        }

#endif
      }

    } /* loop over the parts in cj. */

  } /* loop over the parts in ci. */
575

576
#ifdef WITH_OLD_VECTORIZATION
577
578
579
580
581
582
  /* Pick up any leftovers. */
  if (icount > 0)
    for (int k = 0; k < icount; k++)
      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
#endif

583
  TIMER_TOC(timer_dopair_subset_naive);
584
585
586
587
588
589
590
591
592
593
594
595
596
}

/**
 * @brief Compute the interactions between a cell pair, but only for the
 *      given indices in ci.
 *
 * @param r The #runner.
 * @param ci The first #cell.
 * @param parts_i The #part to interact with @c cj.
 * @param ind The list of indices of particles in @c ci to interact with.
 * @param count The number of particles in @c ind.
 * @param cj The second #cell.
 */
597
598
599
600
601
void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
                   struct part *restrict parts_i, int *restrict ind, int count,
                   struct cell *restrict cj) {

  struct engine *e = r->e;
602

603
#ifdef WITH_OLD_VECTORIZATION
604
605
606
607
608
609
610
  int icount = 0;
  float r2q[VEC_SIZE] __attribute__((aligned(16)));
  float hiq[VEC_SIZE] __attribute__((aligned(16)));
  float hjq[VEC_SIZE] __attribute__((aligned(16)));
  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
611

Matthieu Schaller's avatar
Matthieu Schaller committed
612
  TIMER_TIC;
613

614
615
616
  const int count_j = cj->count;
  struct part *restrict parts_j = cj->parts;

617
  /* Get the relative distance between the pairs, wrapping. */
618
619
  double shift[3] = {0.0, 0.0, 0.0};
  for (int k = 0; k < 3; k++) {
620
621
622
623
624
625
626
    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
      shift[k] = e->s->dim[k];
    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
      shift[k] = -e->s->dim[k];
  }

  /* Get the sorting index. */
627
628
  int sid = 0;
  for (int k = 0; k < 3; k++)
629
630
631
632
633
    sid = 3 * sid + ((cj->loc[k] - ci->loc[k] + shift[k] < 0)
                         ? 0
                         : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1);

  /* Switch the cells around? */
634
  const int flipped = runner_flip[sid];
635
636
  sid = sortlistID[sid];

637
  /* Has the cell cj been sorted? */
Pedro Gonnet's avatar
Pedro Gonnet committed
638
  if (!(cj->sorted & (1 << sid)) ||
639
640
      cj->dx_max_sort_old > space_maxreldx * cj->dmin)
    error("Interacting unsorted cells.");
Pedro Gonnet's avatar
Pedro Gonnet committed
641

642
  /* Pick-out the sorted lists. */
643
  const struct entry *restrict sort_j = cj->sort[sid];
644
  const float dxj = cj->dx_max_sort;
645
646
647
648
649

  /* Parts are on the left? */
  if (!flipped) {

    /* Loop over the parts_i. */
650
    for (int pid = 0; pid < count; pid++) {
651
652

      /* Get a hold of the ith part in ci. */
653
654
655
656
657
658
659
660
661
      struct part *restrict pi = &parts_i[ind[pid]];
      double pix[3];
      for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];

      const float hi = pi->h;
      const float hig2 = hi * hi * kernel_gamma2;
      const float di = hi * kernel_gamma + dxj + pix[0] * runner_shift[sid][0] +
                       pix[1] * runner_shift[sid][1] +
                       pix[2] * runner_shift[sid][2];
662
663

      /* Loop over the parts in cj. */
664
      for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
665
666

        /* Get a pointer to the jth particle. */
667
        struct part *restrict pj = &parts_j[sort_j[pjd].i];
668
669

        /* Compute the pairwise distance. */
670
671
672
        float r2 = 0.0f;
        float dx[3];
        for (int k = 0; k < 3; k++) {
673
674
          dx[k] = pix[k] - pj->x[k];
          r2 += dx[k] * dx[k];
675
        }
676
677
678
679

        /* Hit or miss? */
        if (r2 < hig2) {

680
#ifndef WITH_OLD_VECTORIZATION
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703

          IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);

#else

          /* Add this interaction to the queue. */
          r2q[icount] = r2;
          dxq[3 * icount + 0] = dx[0];
          dxq[3 * icount + 1] = dx[1];
          dxq[3 * icount + 2] = dx[2];
          hiq[icount] = hi;
          hjq[icount] = pj->h;
          piq[icount] = pi;
          pjq[icount] = pj;
          icount += 1;

          /* Flush? */
          if (icount == VEC_SIZE) {
            IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
            icount = 0;
          }

#endif
704
        }
705
706
707
708
709
710
711
712
713
714
715

      } /* loop over the parts in cj. */

    } /* loop over the parts in ci. */

  }

  /* Parts are on the right. */
  else {

    /* Loop over the parts_i. */
716
    for (int pid = 0; pid < count; pid++) {
717
718

      /* Get a hold of the ith part in ci. */
719
720
721
722
723
724
725
726
      struct part *restrict pi = &parts_i[ind[pid]];
      double pix[3];
      for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
      const float hi = pi->h;
      const float hig2 = hi * hi * kernel_gamma2;
      const float di =
          -hi * kernel_gamma - dxj + pix[0] * runner_shift[sid][0] +
          pix[1] * runner_shift[sid][1] + pix[2] * runner_shift[sid][2];
727
728

      /* Loop over the parts in cj. */
729
      for (int pjd = count_j - 1; pjd >= 0 && di < sort_j[pjd].d; pjd--) {
730
731

        /* Get a pointer to the jth particle. */
732
        struct part *restrict pj = &parts_j[sort_j[pjd].i];
733
734

        /* Compute the pairwise distance. */
735
736
737
        float r2 = 0.0f;
        float dx[3];
        for (int k = 0; k < 3; k++) {
738
739
          dx[k] = pix[k] - pj->x[k];
          r2 += dx[k] * dx[k];
740
        }
741

742
743
        /* Hit or miss? */
        if (r2 < hig2) {
744

745
#ifndef WITH_OLD_VECTORIZATION
746
747

          IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);
748

749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
#else

          /* Add this interaction to the queue. */
          r2q[icount] = r2;
          dxq[3 * icount + 0] = dx[0];
          dxq[3 * icount + 1] = dx[1];
          dxq[3 * icount + 2] = dx[2];
          hiq[icount] = hi;
          hjq[icount] = pj->h;
          piq[icount] = pi;
          pjq[icount] = pj;
          icount += 1;

          /* Flush? */
          if (icount == VEC_SIZE) {
            IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
            icount = 0;
          }

#endif
        }

      } /* loop over the parts in cj. */

    } /* loop over the parts in ci. */
  }

776
#ifdef WITH_OLD_VECTORIZATION
777
778
  /* Pick up any leftovers. */
  if (icount > 0)
779
    for (int k = 0; k < icount; k++)
780
781
      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
#endif
Pedro Gonnet's avatar
Pedro Gonnet committed
782

783
784
  TIMER_TOC(timer_dopair_subset);
}
Pedro Gonnet's avatar
Pedro Gonnet committed
785

786
787
788
789
790
791
/**
 * @brief Compute the interactions between a cell pair, but only for the
 *      given indices in ci.
 *
 * @param r The #runner.
 * @param ci The first #cell.
792
 * @param parts The #part to interact.
793
794
795
 * @param ind The list of indices of particles in @c ci to interact with.
 * @param count The number of particles in @c ind.
 */
796
797
798
void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci,
                   struct part *restrict parts, int *restrict ind, int count) {

799
#ifdef WITH_OLD_VECTORIZATION
800
801
802
803
804
805
806
  int icount = 0;
  float r2q[VEC_SIZE] __attribute__((aligned(16)));
  float hiq[VEC_SIZE] __attribute__((aligned(16)));
  float hjq[VEC_SIZE] __attribute__((aligned(16)));
  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
807

Matthieu Schaller's avatar
Matthieu Schaller committed
808
  TIMER_TIC;
809

810
811
  const int count_i = ci->count;
  struct part *restrict parts_j = ci->parts;
812
813

  /* Loop over the parts in ci. */
814
  for (int pid = 0; pid < count; pid++) {
815
816

    /* Get a hold of the ith part in ci. */
817
818
819
820
    struct part *restrict pi = &parts[ind[pid]];
    const double pix[3] = {pi->x[0], pi->x[1], pi->x[2]};
    const float hi = pi->h;
    const float hig2 = hi * hi * kernel_gamma2;
821

822
    /* Loop over the parts in cj. */
823
    for (int pjd = 0; pjd < count_i; pjd++) {
824
825

      /* Get a pointer to the jth particle. */
826
      struct part *restrict pj = &parts_j[pjd];
827
828

      /* Compute the pairwise distance. */
829
830
831
      float r2 = 0.0f;
      float dx[3];
      for (int k = 0; k < 3; k++) {
832
833
834
835
836
837
838
        dx[k] = pix[k] - pj->x[k];
        r2 += dx[k] * dx[k];
      }

      /* Hit or miss? */
      if (r2 > 0.0f && r2 < hig2) {

839
#ifndef WITH_OLD_VECTORIZATION
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867

        IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);

#else

        /* Add this interaction to the queue. */
        r2q[icount] = r2;
        dxq[3 * icount + 0] = dx[0];
        dxq[3 * icount + 1] = dx[1];
        dxq[3 * icount + 2] = dx[2];
        hiq[icount] = hi;
        hjq[icount] = pj->h;
        piq[icount] = pi;
        pjq[icount] = pj;
        icount += 1;

        /* Flush? */
        if (icount == VEC_SIZE) {
          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
          icount = 0;
        }

#endif
      }

    } /* loop over the parts in cj. */

  } /* loop over the parts in ci. */
868

869
#ifdef WITH_OLD_VECTORIZATION
870
871
  /* Pick up any leftovers. */
  if (icount > 0)
872
    for (int k = 0; k < icount; k++)
873
874
875
      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
#endif

876
  TIMER_TOC(timer_doself_subset);
877
}
878

879
/**
880
 * @brief Compute the interactions between a cell pair (non-symmetric).
881
882
883
884
 *
 * @param r The #runner.
 * @param ci The first #cell.
 * @param cj The second #cell.
885
886
 * @param sid The direction of the pair
 * @param shift The shift vector to apply to the particles in ci.
887
 */
888
889
void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid,
             const double *shift) {
890

891
  const struct engine *restrict e = r->e;
892

893
#ifdef WITH_OLD_VECTORIZATION
894
895
896
897
898
899
900
  int icount = 0;
  float r2q[VEC_SIZE] __attribute__((aligned(16)));
  float hiq[VEC_SIZE] __attribute__((aligned(16)));
  float hjq[VEC_SIZE] __attribute__((aligned(16)));
  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
901

Matthieu Schaller's avatar
Matthieu Schaller committed
902
  TIMER_TIC;
903
904

  /* Get the cutoff shift. */
905
906
  double rshift = 0.0;
  for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
907
908

  /* Pick-out the sorted lists. */
909
910
  const struct entry *restrict sort_i = ci->sort[sid];
  const struct entry *restrict sort_j = cj->sort[sid];
911

912
913
914
915
916
917
918
919
#ifdef SWIFT_DEBUG_CHECKS
  /* Check that the dx_max_sort values in the cell are indeed an upper
     bound on particle movement. */
  for (int pid = 0; pid < ci->count; pid++) {
    const struct part *p = &ci->parts[sort_i[pid].i];
    const float d = p->x[0] * runner_shift[sid][0] +
                    p->x[1] * runner_shift[sid][1] +
                    p->x[2] * runner_shift[sid][2];
920
    if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort >
921
        1.0e-4 * max(fabsf(d), ci->dx_max_sort_old))
Pedro Gonnet's avatar
Pedro Gonnet committed
922
923
924
925
926
927
      error(
          "particle shift diff exceeds dx_max_sort in cell ci. ci->nodeID=%d "
          "cj->nodeID=%d d=%e sort_i[pid].d=%e ci->dx_max_sort=%e "
          "ci->dx_max_sort_old=%e",
          ci->nodeID, cj->nodeID, d, sort_i[pid].d, ci->dx_max_sort,
          ci->dx_max_sort_old);
928
929
930
931
932
933
  }
  for (int pjd = 0; pjd < cj->count; pjd++) {
    const struct part *p = &cj->parts[sort_j[pjd].i];
    const float d = p->x[0] * runner_shift[sid][0] +
                    p->x[1] * runner_shift[sid][1] +
                    p->x[2] * runner_shift[sid][2];
934
    if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort >
935
        1.0e-4 * max(fabsf(d), cj->dx_max_sort_old))
Pedro Gonnet's avatar
Pedro Gonnet committed
936
937
938
939
940
941
      error(
          "particle shift diff exceeds dx_max_sort in cell cj. cj->nodeID=%d "
          "ci->nodeID=%d d=%e sort_j[pjd].d=%e cj->dx_max_sort=%e "
          "cj->dx_max_sort_old=%e",
          cj->nodeID, ci->nodeID, d, sort_j[pjd].d, cj->dx_max_sort,
          cj->dx_max_sort_old);
942
943
944
  }
#endif /* SWIFT_DEBUG_CHECKS */

945
  /* Get some other useful values. */
946
947
948
949
950
951
952
953
  const double hi_max = ci->h_max * kernel_gamma - rshift;
  const double hj_max = cj->h_max * kernel_gamma;
  const int count_i = ci->count;
  const int count_j = cj->count;
  struct part *restrict parts_i = ci->parts;
  struct part *restrict parts_j = cj->parts;
  const double di_max = sort_i[count_i - 1].d - rshift;
  const double dj_min = sort_j[0].d;
954
  const float dx_max = (ci->dx_max_sort + cj->dx_max_sort);
955

956
  if (cell_is_active(ci, e)) {
957

958
959
960
    /* Loop over the parts in ci. */
    for (int pid = count_i - 1;
         pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) {
961

962
963
964
965
966
967
      /* Get a hold of the ith part in ci. */
      struct part *restrict pi = &parts_i[sort_i[pid].i];
      if (!part_is_active(pi, e)) continue;
      const float hi = pi->h;
      const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
      if (di < dj_min) continue;
968

969
970
971
      double pix[3];
      for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
      const float hig2 = hi * hi * kernel_gamma2;
972

973
974
      /* Loop over the parts in cj. */
      for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
975

976
977
978
979
980
981
982
983
984
985
        /* Get a pointer to the jth particle. */
        struct part *restrict pj = &parts_j[sort_j[pjd].i];

        /* Compute the pairwise distance. */
        float r2 = 0.0f;
        float dx[3];
        for (int k = 0; k < 3; k++) {
          dx[k] = pix[k] - pj->x[k];
          r2 += dx[k] * dx[k];
        }
986

987
#ifdef SWIFT_DEBUG_CHECKS
988
989
990
991
992
        /* Check that particles have been drifted to the current time */
        if (pi->ti_drift != e->ti_current)
          error("Particle pi not drifted to current time");
        if (pj->ti_drift != e->ti_current)
          error("Particle pj not drifted to current time");
993
994
#endif

995
996
        /* Hit or miss? */
        if (r2 < hig2) {
997

998
#ifndef WITH_OLD_VECTORIZATION
999

1000
          IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);