/*******************************************************************************
 * This file is part of SWIFT.
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
 *               2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/

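/* Before including this file, define FUNCTION, which is the name of the
   interaction function. The macros below then generate the names of the
   interaction routines by appending _FUNCTION, e.g. runner_dopair1_FUNCTION,
   runner_dopair2_FUNCTION, runner_dopair_naive_FUNCTION,
   runner_doself1_FUNCTION, runner_doself2_FUNCTION and
   runner_dosub_pair1_FUNCTION, all of which call the pairwise interaction
   functions runner_iact_FUNCTION / runner_iact_nonsym_FUNCTION. */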

27
/* Name-generation macros.
 *
 * PASTE(x, y) glues two tokens together with an underscore in between.
 * Every public name NAME is built in two steps, NAME -> _NAME(FUNCTION) ->
 * PASTE(prefix, f): because the argument of _NAME() is not itself an operand
 * of ##, FUNCTION is macro-expanded before it reaches PASTE, so e.g. with
 * FUNCTION defined as density, DOPAIR1 becomes runner_dopair1_density. */
#define PASTE(x, y) x##_##y

/* Sorted pair interactions (1 and 2 variants). */
#define _DOPAIR1(f) PASTE(runner_dopair1, f)
#define DOPAIR1 _DOPAIR1(FUNCTION)

#define _DOPAIR2(f) PASTE(runner_dopair2, f)
#define DOPAIR2 _DOPAIR2(FUNCTION)

/* Pair interactions that do not rely on sorted cells. */
#define _DOPAIR1_NOSORT(f) PASTE(runner_dopair1_nosort, f)
#define DOPAIR1_NOSORT _DOPAIR1_NOSORT(FUNCTION)

#define _DOPAIR2_NOSORT(f) PASTE(runner_dopair2_nosort, f)
#define DOPAIR2_NOSORT _DOPAIR2_NOSORT(FUNCTION)

/* Pair interactions restricted to a subset of the particles of one cell. */
#define _DOPAIR_SUBSET(f) PASTE(runner_dopair_subset, f)
#define DOPAIR_SUBSET _DOPAIR_SUBSET(FUNCTION)

#define _DOPAIR_SUBSET_NOSORT(f) PASTE(runner_dopair_subset_nosort, f)
#define DOPAIR_SUBSET_NOSORT _DOPAIR_SUBSET_NOSORT(FUNCTION)

#define _DOPAIR_SUBSET_NAIVE(f) PASTE(runner_dopair_subset_naive, f)
#define DOPAIR_SUBSET_NAIVE _DOPAIR_SUBSET_NAIVE(FUNCTION)

/* Brute-force (all-against-all) pair and self interactions. */
#define _DOPAIR_NAIVE(f) PASTE(runner_dopair_naive, f)
#define DOPAIR_NAIVE _DOPAIR_NAIVE(FUNCTION)

#define _DOSELF_NAIVE(f) PASTE(runner_doself_naive, f)
#define DOSELF_NAIVE _DOSELF_NAIVE(FUNCTION)

/* Self interactions (1 and 2 variants) and their subset version. */
#define _DOSELF1(f) PASTE(runner_doself1, f)
#define DOSELF1 _DOSELF1(FUNCTION)

#define _DOSELF2(f) PASTE(runner_doself2, f)
#define DOSELF2 _DOSELF2(FUNCTION)

#define _DOSELF_SUBSET(f) PASTE(runner_doself_subset, f)
#define DOSELF_SUBSET _DOSELF_SUBSET(FUNCTION)

/* Sub-cell ("dosub") self and pair interactions. */
#define _DOSUB_SELF1(f) PASTE(runner_dosub_self1, f)
#define DOSUB_SELF1 _DOSUB_SELF1(FUNCTION)

#define _DOSUB_PAIR1(f) PASTE(runner_dosub_pair1, f)
#define DOSUB_PAIR1 _DOSUB_PAIR1(FUNCTION)

#define _DOSUB_SELF2(f) PASTE(runner_dosub_self2, f)
#define DOSUB_SELF2 _DOSUB_SELF2(FUNCTION)

#define _DOSUB_PAIR2(f) PASTE(runner_dosub_pair2, f)
#define DOSUB_PAIR2 _DOSUB_PAIR2(FUNCTION)

#define _DOSUB_SUBSET(f) PASTE(runner_dosub_subset, f)
#define DOSUB_SUBSET _DOSUB_SUBSET(FUNCTION)

/* Particle-particle interaction kernels (scalar and vector flavours). */
#define _IACT_NONSYM(f) PASTE(runner_iact_nonsym, f)
#define IACT_NONSYM _IACT_NONSYM(FUNCTION)

#define _IACT(f) PASTE(runner_iact, f)
#define IACT _IACT(FUNCTION)

#define _IACT_NONSYM_VEC(f) PASTE(runner_iact_nonsym_vec, f)
#define IACT_NONSYM_VEC _IACT_NONSYM_VEC(FUNCTION)

#define _IACT_VEC(f) PASTE(runner_iact_vec, f)
#define IACT_VEC _IACT_VEC(FUNCTION)

/* Timer identifiers associated with each family of routines. */
#define _TIMER_DOSELF(f) PASTE(timer_doself, f)
#define TIMER_DOSELF _TIMER_DOSELF(FUNCTION)

#define _TIMER_DOPAIR(f) PASTE(timer_dopair, f)
#define TIMER_DOPAIR _TIMER_DOPAIR(FUNCTION)

#define _TIMER_DOSUB_SELF(f) PASTE(timer_dosub_self, f)
#define TIMER_DOSUB_SELF _TIMER_DOSUB_SELF(FUNCTION)

#define _TIMER_DOSUB_PAIR(f) PASTE(timer_dosub_pair, f)
#define TIMER_DOSUB_PAIR _TIMER_DOSUB_PAIR(FUNCTION)

#define _TIMER_DOSELF_SUBSET(f) PASTE(timer_doself_subset, f)
#define TIMER_DOSELF_SUBSET _TIMER_DOSELF_SUBSET(FUNCTION)

#define _TIMER_DOPAIR_SUBSET(f) PASTE(timer_dopair_subset, f)
#define TIMER_DOPAIR_SUBSET _TIMER_DOPAIR_SUBSET(FUNCTION)

110
111
#include "runner_doiact_nosort.h"

112
113
114
115
116
117
118
/**
 * @brief Compute the interactions between a cell pair.
 *
 * @param r The #runner.
 * @param ci The first #cell.
 * @param cj The second #cell.
 */
119
120
121
void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci,
                  struct cell *restrict cj) {

122
123
124
125
  const struct engine *e = r->e;

  error("Don't use in actual runs ! Slow code !");

126
#ifdef WITH_OLD_VECTORIZATION
127
128
129
130
131
132
133
  int icount = 0;
  float r2q[VEC_SIZE] __attribute__((aligned(16)));
  float hiq[VEC_SIZE] __attribute__((aligned(16)));
  float hjq[VEC_SIZE] __attribute__((aligned(16)));
  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
Matthieu Schaller's avatar
Matthieu Schaller committed
134
  TIMER_TIC;
135
136

  /* Anything to do here? */
137
  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
138

139
140
141
142
143
  const int count_i = ci->count;
  const int count_j = cj->count;
  struct part *restrict parts_i = ci->parts;
  struct part *restrict parts_j = cj->parts;

144
  /* Get the relative distance between the pairs, wrapping. */
145
146
  double shift[3] = {0.0, 0.0, 0.0};
  for (int k = 0; k < 3; k++) {
147
148
149
150
151
152
153
    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
      shift[k] = e->s->dim[k];
    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
      shift[k] = -e->s->dim[k];
  }

  /* Loop over the parts in ci. */
154
  for (int pid = 0; pid < count_i; pid++) {
155
156

    /* Get a hold of the ith part in ci. */
157
158
159
160
161
162
    struct part *restrict pi = &parts_i[pid];
    const float hi = pi->h;

    double pix[3];
    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
    const float hig2 = hi * hi * kernel_gamma2;
163
164

    /* Loop over the parts in cj. */
165
    for (int pjd = 0; pjd < count_j; pjd++) {
166
167

      /* Get a pointer to the jth particle. */
168
      struct part *restrict pj = &parts_j[pjd];
169
170

      /* Compute the pairwise distance. */
171
172
173
      float r2 = 0.0f;
      float dx[3];
      for (int k = 0; k < 3; k++) {
174
175
176
177
178
179
180
        dx[k] = pix[k] - pj->x[k];
        r2 += dx[k] * dx[k];
      }

      /* Hit or miss? */
      if (r2 < hig2 || r2 < pj->h * pj->h * kernel_gamma2) {

181
#ifndef WITH_OLD_VECTORIZATION
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201

        IACT(r2, dx, hi, pj->h, pi, pj);

#else

        /* Add this interaction to the queue. */
        r2q[icount] = r2;
        dxq[3 * icount + 0] = dx[0];
        dxq[3 * icount + 1] = dx[1];
        dxq[3 * icount + 2] = dx[2];
        hiq[icount] = hi;
        hjq[icount] = pj->h;
        piq[icount] = pi;
        pjq[icount] = pj;
        icount += 1;

        /* Flush? */
        if (icount == VEC_SIZE) {
          IACT_VEC(r2q, dxq, hiq, hjq, piq, pjq);
          icount = 0;
202
203
        }

204
205
206
207
208
209
210
#endif
      }

    } /* loop over the parts in cj. */

  } /* loop over the parts in ci. */

211
#ifdef WITH_OLD_VECTORIZATION
212
213
  /* Pick up any leftovers. */
  if (icount > 0)
214
    for (int k = 0; k < icount; k++)
215
216
217
218
219
220
221
222
      IACT(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
#endif

  TIMER_TOC(TIMER_DOPAIR);
}

void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) {

223
  const struct engine *e = r->e;
224
225
226

  error("Don't use in actual runs ! Slow code !");

227
#ifdef WITH_OLD_VECTORIZATION
228
229
230
231
232
233
234
  int icount = 0;
  float r2q[VEC_SIZE] __attribute__((aligned(16)));
  float hiq[VEC_SIZE] __attribute__((aligned(16)));
  float hjq[VEC_SIZE] __attribute__((aligned(16)));
  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
235

Matthieu Schaller's avatar
Matthieu Schaller committed
236
  TIMER_TIC;
237
238

  /* Anything to do here? */
239
  if (!cell_is_active(c, e)) return;
240

241
242
  const int count = c->count;
  struct part *restrict parts = c->parts;
243
244

  /* Loop over the parts in ci. */
245
  for (int pid = 0; pid < count; pid++) {
246
247

    /* Get a hold of the ith part in ci. */
248
249
250
251
    struct part *restrict pi = &parts[pid];
    const double pix[3] = {pi->x[0], pi->x[1], pi->x[2]};
    const float hi = pi->h;
    const float hig2 = hi * hi * kernel_gamma2;
252

253
    /* Loop over the parts in cj. */
254
    for (int pjd = pid + 1; pjd < count; pjd++) {
255
256

      /* Get a pointer to the jth particle. */
257
      struct part *restrict pj = &parts[pjd];
258
259

      /* Compute the pairwise distance. */
260
261
262
      float r2 = 0.0f;
      float dx[3];
      for (int k = 0; k < 3; k++) {
263
264
265
266
267
268
269
        dx[k] = pix[k] - pj->x[k];
        r2 += dx[k] * dx[k];
      }

      /* Hit or miss? */
      if (r2 < hig2 || r2 < pj->h * pj->h * kernel_gamma2) {

270
#ifndef WITH_OLD_VECTORIZATION
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291

        IACT(r2, dx, hi, pj->h, pi, pj);

#else

        /* Add this interaction to the queue. */
        r2q[icount] = r2;
        dxq[3 * icount + 0] = dx[0];
        dxq[3 * icount + 1] = dx[1];
        dxq[3 * icount + 2] = dx[2];
        hiq[icount] = hi;
        hjq[icount] = pj->h;
        piq[icount] = pi;
        pjq[icount] = pj;
        icount += 1;

        /* Flush? */
        if (icount == VEC_SIZE) {
          IACT_VEC(r2q, dxq, hiq, hjq, piq, pjq);
          icount = 0;
        }
292

293
294
#endif
      }
295

296
    } /* loop over the parts in cj. */
297

298
299
  } /* loop over the parts in ci. */

300
#ifdef WITH_OLD_VECTORIZATION
301
302
  /* Pick up any leftovers. */
  if (icount > 0)
303
    for (int k = 0; k < icount; k++)
304
305
      IACT(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
#endif
306

307
308
  TIMER_TOC(TIMER_DOSELF);
}
309

310
311
312
313
314
315
/**
 * @brief Compute the interactions between a cell pair, but only for the
 *      given indices in ci.
 *
 * @param r The #runner.
 * @param ci The first #cell.
316
 * @param parts_i The #part to interact with @c cj.
317
318
319
320
 * @param ind The list of indices of particles in @c ci to interact with.
 * @param count The number of particles in @c ind.
 * @param cj The second #cell.
 */
321
322
323
324
325
326
327
328
void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci,
                         struct part *restrict parts_i, int *restrict ind,
                         int count, struct cell *restrict cj) {

  struct engine *e = r->e;

  error("Don't use in actual runs ! Slow code !");

329
#ifdef WITH_OLD_VECTORIZATION
330
331
332
333
334
335
336
337
  int icount = 0;
  float r2q[VEC_SIZE] __attribute__((aligned(16)));
  float hiq[VEC_SIZE] __attribute__((aligned(16)));
  float hjq[VEC_SIZE] __attribute__((aligned(16)));
  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif

Matthieu Schaller's avatar
Matthieu Schaller committed
338
  TIMER_TIC;
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374

  const int count_j = cj->count;
  struct part *restrict parts_j = cj->parts;

  /* Get the relative distance between the pairs, wrapping. */
  double shift[3] = {0.0, 0.0, 0.0};
  for (int k = 0; k < 3; k++) {
    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
      shift[k] = e->s->dim[k];
    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
      shift[k] = -e->s->dim[k];
  }

  /* Loop over the parts_i. */
  for (int pid = 0; pid < count; pid++) {

    /* Get a hold of the ith part in ci. */
    struct part *restrict pi = &parts_i[ind[pid]];
    double pix[3];
    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
    const float hi = pi->h;
    const float hig2 = hi * hi * kernel_gamma2;

    /* Loop over the parts in cj. */
    for (int pjd = 0; pjd < count_j; pjd++) {

      /* Get a pointer to the jth particle. */
      struct part *restrict pj = &parts_j[pjd];

      /* Compute the pairwise distance. */
      float r2 = 0.0f;
      float dx[3];
      for (int k = 0; k < 3; k++) {
        dx[k] = pix[k] - pj->x[k];
        r2 += dx[k] * dx[k];
      }
375

376
377
378
      /* Hit or miss? */
      if (r2 < hig2) {

379
#ifndef WITH_OLD_VECTORIZATION
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407

        IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);

#else

        /* Add this interaction to the queue. */
        r2q[icount] = r2;
        dxq[3 * icount + 0] = dx[0];
        dxq[3 * icount + 1] = dx[1];
        dxq[3 * icount + 2] = dx[2];
        hiq[icount] = hi;
        hjq[icount] = pj->h;
        piq[icount] = pi;
        pjq[icount] = pj;
        icount += 1;

        /* Flush? */
        if (icount == VEC_SIZE) {
          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
          icount = 0;
        }

#endif
      }

    } /* loop over the parts in cj. */

  } /* loop over the parts in ci. */
408

409
#ifdef WITH_OLD_VECTORIZATION
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
  /* Pick up any leftovers. */
  if (icount > 0)
    for (int k = 0; k < icount; k++)
      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
#endif

  TIMER_TOC(timer_dopair_subset);
}

/**
 * @brief Compute the interactions between a cell pair, but only for the
 *      given indices in ci.
 *
 * @param r The #runner.
 * @param ci The first #cell.
 * @param parts_i The #part to interact with @c cj.
 * @param ind The list of indices of particles in @c ci to interact with.
 * @param count The number of particles in @c ind.
 * @param cj The second #cell.
 */
430
431
432
433
434
void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
                   struct part *restrict parts_i, int *restrict ind, int count,
                   struct cell *restrict cj) {

  struct engine *e = r->e;
435

436
#ifdef WITH_MPI
Matthieu Schaller's avatar
Matthieu Schaller committed
437
  if (ci->nodeID != cj->nodeID) {
438
439
440
441
442
    DOPAIR_SUBSET_NOSORT(r, ci, parts_i, ind, count, cj);
    return;
  }
#endif

443
#ifdef WITH_OLD_VECTORIZATION
444
445
446
447
448
449
450
  int icount = 0;
  float r2q[VEC_SIZE] __attribute__((aligned(16)));
  float hiq[VEC_SIZE] __attribute__((aligned(16)));
  float hjq[VEC_SIZE] __attribute__((aligned(16)));
  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
451

Matthieu Schaller's avatar
Matthieu Schaller committed
452
  TIMER_TIC;
453

454
455
456
  const int count_j = cj->count;
  struct part *restrict parts_j = cj->parts;

457
  /* Get the relative distance between the pairs, wrapping. */
458
459
  double shift[3] = {0.0, 0.0, 0.0};
  for (int k = 0; k < 3; k++) {
460
461
462
463
464
465
466
    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
      shift[k] = e->s->dim[k];
    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
      shift[k] = -e->s->dim[k];
  }

  /* Get the sorting index. */
467
468
  int sid = 0;
  for (int k = 0; k < 3; k++)
469
470
471
472
473
    sid = 3 * sid + ((cj->loc[k] - ci->loc[k] + shift[k] < 0)
                         ? 0
                         : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1);

  /* Switch the cells around? */
474
  const int flipped = runner_flip[sid];
475
476
477
478
479
480
  sid = sortlistID[sid];

  /* Have the cells been sorted? */
  if (!(cj->sorted & (1 << sid))) error("Trying to interact unsorted cells.");

  /* Pick-out the sorted lists. */
481
482
  const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
  const float dxj = cj->dx_max;
483
484
485
486
487

  /* Parts are on the left? */
  if (!flipped) {

    /* Loop over the parts_i. */
488
    for (int pid = 0; pid < count; pid++) {
489
490

      /* Get a hold of the ith part in ci. */
491
492
493
494
495
496
497
498
499
      struct part *restrict pi = &parts_i[ind[pid]];
      double pix[3];
      for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];

      const float hi = pi->h;
      const float hig2 = hi * hi * kernel_gamma2;
      const float di = hi * kernel_gamma + dxj + pix[0] * runner_shift[sid][0] +
                       pix[1] * runner_shift[sid][1] +
                       pix[2] * runner_shift[sid][2];
500
501

      /* Loop over the parts in cj. */
502
      for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
503
504

        /* Get a pointer to the jth particle. */
505
        struct part *restrict pj = &parts_j[sort_j[pjd].i];
506
507

        /* Compute the pairwise distance. */
508
509
510
        float r2 = 0.0f;
        float dx[3];
        for (int k = 0; k < 3; k++) {
511
512
          dx[k] = pix[k] - pj->x[k];
          r2 += dx[k] * dx[k];
513
        }
514
515
516
517

        /* Hit or miss? */
        if (r2 < hig2) {

518
#ifndef WITH_OLD_VECTORIZATION
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541

          IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);

#else

          /* Add this interaction to the queue. */
          r2q[icount] = r2;
          dxq[3 * icount + 0] = dx[0];
          dxq[3 * icount + 1] = dx[1];
          dxq[3 * icount + 2] = dx[2];
          hiq[icount] = hi;
          hjq[icount] = pj->h;
          piq[icount] = pi;
          pjq[icount] = pj;
          icount += 1;

          /* Flush? */
          if (icount == VEC_SIZE) {
            IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
            icount = 0;
          }

#endif
542
        }
543
544
545
546
547
548
549
550
551
552
553

      } /* loop over the parts in cj. */

    } /* loop over the parts in ci. */

  }

  /* Parts are on the right. */
  else {

    /* Loop over the parts_i. */
554
    for (int pid = 0; pid < count; pid++) {
555
556

      /* Get a hold of the ith part in ci. */
557
558
559
560
561
562
563
564
      struct part *restrict pi = &parts_i[ind[pid]];
      double pix[3];
      for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
      const float hi = pi->h;
      const float hig2 = hi * hi * kernel_gamma2;
      const float di =
          -hi * kernel_gamma - dxj + pix[0] * runner_shift[sid][0] +
          pix[1] * runner_shift[sid][1] + pix[2] * runner_shift[sid][2];
565
566

      /* Loop over the parts in cj. */
567
      for (int pjd = count_j - 1; pjd >= 0 && di < sort_j[pjd].d; pjd--) {
568
569

        /* Get a pointer to the jth particle. */
570
        struct part *restrict pj = &parts_j[sort_j[pjd].i];
571
572

        /* Compute the pairwise distance. */
573
574
575
        float r2 = 0.0f;
        float dx[3];
        for (int k = 0; k < 3; k++) {
576
577
          dx[k] = pix[k] - pj->x[k];
          r2 += dx[k] * dx[k];
578
        }
579

580
581
        /* Hit or miss? */
        if (r2 < hig2) {
582

583
#ifndef WITH_OLD_VECTORIZATION
584
585

          IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);
586

587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
#else

          /* Add this interaction to the queue. */
          r2q[icount] = r2;
          dxq[3 * icount + 0] = dx[0];
          dxq[3 * icount + 1] = dx[1];
          dxq[3 * icount + 2] = dx[2];
          hiq[icount] = hi;
          hjq[icount] = pj->h;
          piq[icount] = pi;
          pjq[icount] = pj;
          icount += 1;

          /* Flush? */
          if (icount == VEC_SIZE) {
            IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
            icount = 0;
          }

#endif
        }

      } /* loop over the parts in cj. */

    } /* loop over the parts in ci. */
  }

614
#ifdef WITH_OLD_VECTORIZATION
615
616
  /* Pick up any leftovers. */
  if (icount > 0)
617
    for (int k = 0; k < icount; k++)
618
619
      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
#endif
Pedro Gonnet's avatar
Pedro Gonnet committed
620

621
622
  TIMER_TOC(timer_dopair_subset);
}
Pedro Gonnet's avatar
Pedro Gonnet committed
623

624
625
626
627
628
629
/**
 * @brief Compute the interactions between a cell pair, but only for the
 *      given indices in ci.
 *
 * @param r The #runner.
 * @param ci The first #cell.
630
 * @param parts The #part to interact.
631
632
633
 * @param ind The list of indices of particles in @c ci to interact with.
 * @param count The number of particles in @c ind.
 */
634
635
636
void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci,
                   struct part *restrict parts, int *restrict ind, int count) {

637
#ifdef WITH_OLD_VECTORIZATION
638
639
640
641
642
643
644
  int icount = 0;
  float r2q[VEC_SIZE] __attribute__((aligned(16)));
  float hiq[VEC_SIZE] __attribute__((aligned(16)));
  float hjq[VEC_SIZE] __attribute__((aligned(16)));
  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
645

Matthieu Schaller's avatar
Matthieu Schaller committed
646
  TIMER_TIC;
647

648
649
  const int count_i = ci->count;
  struct part *restrict parts_j = ci->parts;
650
651

  /* Loop over the parts in ci. */
652
  for (int pid = 0; pid < count; pid++) {
653
654

    /* Get a hold of the ith part in ci. */
655
656
657
658
    struct part *restrict pi = &parts[ind[pid]];
    const double pix[3] = {pi->x[0], pi->x[1], pi->x[2]};
    const float hi = pi->h;
    const float hig2 = hi * hi * kernel_gamma2;
659

660
    /* Loop over the parts in cj. */
661
    for (int pjd = 0; pjd < count_i; pjd++) {
662
663

      /* Get a pointer to the jth particle. */
664
      struct part *restrict pj = &parts_j[pjd];
665
666

      /* Compute the pairwise distance. */
667
668
669
      float r2 = 0.0f;
      float dx[3];
      for (int k = 0; k < 3; k++) {
670
671
672
673
674
675
676
        dx[k] = pix[k] - pj->x[k];
        r2 += dx[k] * dx[k];
      }

      /* Hit or miss? */
      if (r2 > 0.0f && r2 < hig2) {

677
#ifndef WITH_OLD_VECTORIZATION
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705

        IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);

#else

        /* Add this interaction to the queue. */
        r2q[icount] = r2;
        dxq[3 * icount + 0] = dx[0];
        dxq[3 * icount + 1] = dx[1];
        dxq[3 * icount + 2] = dx[2];
        hiq[icount] = hi;
        hjq[icount] = pj->h;
        piq[icount] = pi;
        pjq[icount] = pj;
        icount += 1;

        /* Flush? */
        if (icount == VEC_SIZE) {
          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
          icount = 0;
        }

#endif
      }

    } /* loop over the parts in cj. */

  } /* loop over the parts in ci. */
706

707
#ifdef WITH_OLD_VECTORIZATION
708
709
  /* Pick up any leftovers. */
  if (icount > 0)
710
    for (int k = 0; k < icount; k++)
711
712
713
714
715
      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
#endif

  TIMER_TOC(timer_dopair_subset);
}
716

717
/**
718
 * @brief Compute the interactions between a cell pair (non-symmetric).
719
720
721
722
723
 *
 * @param r The #runner.
 * @param ci The first #cell.
 * @param cj The second #cell.
 */
724
725
void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {

726
  const struct engine *restrict e = r->e;
727

728
#ifdef WITH_MPI
Matthieu Schaller's avatar
Matthieu Schaller committed
729
  if (ci->nodeID != cj->nodeID) {
730
731
732
733
734
    DOPAIR1_NOSORT(r, ci, cj);
    return;
  }
#endif

735
#ifdef WITH_OLD_VECTORIZATION
736
737
738
739
740
741
742
  int icount = 0;
  float r2q[VEC_SIZE] __attribute__((aligned(16)));
  float hiq[VEC_SIZE] __attribute__((aligned(16)));
  float hjq[VEC_SIZE] __attribute__((aligned(16)));
  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
743

Matthieu Schaller's avatar
Matthieu Schaller committed
744
  TIMER_TIC;
745
746

  /* Anything to do here? */
747
  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
748

749
750
  if (!cell_is_drifted(ci, e)) cell_drift(ci, e);
  if (!cell_is_drifted(cj, e)) cell_drift(cj, e);
751

752
  /* Get the sort ID. */
753
754
  double shift[3] = {0.0, 0.0, 0.0};
  const int sid = space_getsid(e->s, &ci, &cj, shift);
755
756
757
758
759
760

  /* Have the cells been sorted? */
  if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
    error("Trying to interact unsorted cells.");

  /* Get the cutoff shift. */
761
762
  double rshift = 0.0;
  for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
763
764

  /* Pick-out the sorted lists. */
765
766
  const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
  const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
767
768

  /* Get some other useful values. */
769
770
771
772
773
774
775
776
777
  const double hi_max = ci->h_max * kernel_gamma - rshift;
  const double hj_max = cj->h_max * kernel_gamma;
  const int count_i = ci->count;
  const int count_j = cj->count;
  struct part *restrict parts_i = ci->parts;
  struct part *restrict parts_j = cj->parts;
  const double di_max = sort_i[count_i - 1].d - rshift;
  const double dj_min = sort_j[0].d;
  const float dx_max = (ci->dx_max + cj->dx_max);
778
779

  /* Loop over the parts in ci. */
780
781
  for (int pid = count_i - 1;
       pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) {
782
783

    /* Get a hold of the ith part in ci. */
784
    struct part *restrict pi = &parts_i[sort_i[pid].i];
785
    if (!part_is_active(pi, e)) continue;
786
787
    const float hi = pi->h;
    const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
788
789
    if (di < dj_min) continue;

790
791
792
    double pix[3];
    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
    const float hig2 = hi * hi * kernel_gamma2;
793
794

    /* Loop over the parts in cj. */
795
    for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
796
797

      /* Get a pointer to the jth particle. */
798
      struct part *restrict pj = &parts_j[sort_j[pjd].i];
799
800

      /* Compute the pairwise distance. */
801
802
803
      float r2 = 0.0f;
      float dx[3];
      for (int k = 0; k < 3; k++) {
804
805
806
807
        dx[k] = pix[k] - pj->x[k];
        r2 += dx[k] * dx[k];
      }

808
809
810
811
812
813
814
815
#ifdef SWIFT_DEBUG_CHECKS
      /* Check that particles have been drifted to the current time */
      if (pi->ti_drift != e->ti_current)
        error("Particle pi not drifted to current time");
      if (pj->ti_drift != e->ti_current)
        error("Particle pj not drifted to current time");
#endif

816
817
818
      /* Hit or miss? */
      if (r2 < hig2) {

819
#ifndef WITH_OLD_VECTORIZATION
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849

        IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);

#else

        /* Add this interaction to the queue. */
        r2q[icount] = r2;
        dxq[3 * icount + 0] = dx[0];
        dxq[3 * icount + 1] = dx[1];
        dxq[3 * icount + 2] = dx[2];
        hiq[icount] = hi;
        hjq[icount] = pj->h;
        piq[icount] = pi;
        pjq[icount] = pj;
        icount += 1;

        /* Flush? */
        if (icount == VEC_SIZE) {
          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
          icount = 0;
        }

#endif
      }

    } /* loop over the parts in cj. */

  } /* loop over the parts in ci. */

  /* Loop over the parts in cj. */
850
  for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
851
852
853
       pjd++) {

    /* Get a hold of the jth part in cj. */
854
    struct part *restrict pj = &parts_j[sort_j[pjd].i];
855
    if (!part_is_active(pj, e)) continue;
856
857
    const float hj = pj->h;
    const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
858
859
    if (dj > di_max) continue;

860
861
862
    double pjx[3];
    for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
    const float hjg2 = hj * hj * kernel_gamma2;
863
864

    /* Loop over the parts in ci. */
865
    for (int pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) {
866
867

      /* Get a pointer to the jth particle. */
868
      struct part *restrict pi = &parts_i[sort_i[pid].i];
869
870

      /* Compute the pairwise distance. */
871
872
873
      float r2 = 0.0f;
      float dx[3];
      for (int k = 0; k < 3; k++) {
874
875
876
877
        dx[k] = pjx[k] - pi->x[k];
        r2 += dx[k] * dx[k];
      }

878
879
880
881
882
883
884
885
#ifdef SWIFT_DEBUG_CHECKS
      /* Check that particles have been drifted to the current time */
      if (pi->ti_drift != e->ti_current)
        error("Particle pi not drifted to current time");
      if (pj->ti_drift != e->ti_current)
        error("Particle pj not drifted to current time");
#endif

886
887
888
      /* Hit or miss? */
      if (r2 < hjg2) {

889
#ifndef WITH_OLD_VECTORIZATION
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918

        IACT_NONSYM(r2, dx, hj, pi->h, pj, pi);

#else

        /* Add this interaction to the queue. */
        r2q[icount] = r2;
        dxq[3 * icount + 0] = dx[0];
        dxq[3 * icount + 1] = dx[1];
        dxq[3 * icount + 2] = dx[2];
        hiq[icount] = hj;
        hjq[icount] = pi->h;
        piq[icount] = pj;
        pjq[icount] = pi;
        icount += 1;

        /* Flush? */
        if (icount == VEC_SIZE) {
          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
          icount = 0;
        }

#endif
      }

    } /* loop over the parts in cj. */

  } /* loop over the parts in ci. */

919
#ifdef WITH_OLD_VECTORIZATION
920
921
  /* Pick up any leftovers. */
  if (icount > 0)
922
    for (int k = 0; k < icount; k++)
923
924
925
926
927
928
      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
#endif

  TIMER_TOC(TIMER_DOPAIR);
}

929
930
931
932
933
934
935
/**
 * @brief Compute the interactions between a cell pair (symmetric)
 *
 * @param r The #runner.
 * @param ci The first #cell.
 * @param cj The second #cell.
 */
936
937
938
void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {

  struct engine *restrict e = r->e;
939

940
#ifdef WITH_MPI
Matthieu Schaller's avatar
Matthieu Schaller committed
941
  if (ci->nodeID != cj->nodeID) {
942
943
944
945
946
    DOPAIR2_NOSORT(r, ci, cj);
    return;
  }
#endif

947
#ifdef WITH_OLD_VECTORIZATION
948
949
950
951
952
953
954
955
956
957
958
959
960
  int icount1 = 0;
  float r2q1[VEC_SIZE] __attribute__((aligned(16)));
  float hiq1[VEC_SIZE] __attribute__((aligned(16)));
  float hjq1[VEC_SIZE] __attribute__((aligned(16)));
  float dxq1[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq1[VEC_SIZE], *pjq1[VEC_SIZE];
  int icount2 = 0;
  float r2q2[VEC_SIZE] __attribute__((aligned(16)));
  float hiq2[VEC_SIZE] __attribute__((aligned(16)));
  float hjq2[VEC_SIZE] __attribute__((aligned(16)));
  float dxq2[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE];
#endif
961

Matthieu Schaller's avatar
Matthieu Schaller committed
962
  TIMER_TIC;
963
964

  /* Anything to do here? */
965
  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
966

967
968
969
  if (!cell_is_drifted(ci, e)) error("Cell ci not drifted");
  if (!cell_is_drifted(cj, e)) error("Cell cj not drifted");

970
  /* Get the shift ID. */
971
972
  double shift[3] = {0.0, 0.0, 0.0};
  const int sid = space_getsid(e->s, &ci, &cj, shift);
973
974
975
976
977
978

  /* Have the cells been sorted? */
  if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
    error("Trying to interact unsorted cells.");

  /* Get the cutoff shift. */
979
980
  double rshift = 0.0;
  for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
981
982

  /* Pick-out the sorted lists. */
983
984
  struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
  struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
985
986

  /* Get some other useful values. */
987
988
989
990
991
992
993
994
995
  const double hi_max = ci->h_max * kernel_gamma - rshift;
  const double hj_max = cj->h_max * kernel_gamma;
  const int count_i = ci->count;
  const int count_j = cj->count;
  struct part *restrict parts_i = ci->parts;
  struct part *restrict parts_j = cj->parts;
  const double di_max = sort_i[count_i - 1].d - rshift;
  const double dj_min = sort_j[0].d;
  const double dx_max = (ci->dx_max + cj->dx_max);
996
997

  /* Collect the number of parts left and right below dt. */
998
999
  int countdt_i = 0, countdt_j = 0;
  struct entry *restrict sortdt_i = NULL, *restrict sortdt_j = NULL;
1000
  if (cell_is_all_active(ci, e)) {
For faster browsing, not all history is shown. View entire blame