/*******************************************************************************
 * This file is part of SWIFT.
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
 *               2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/

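/* Before including this file, define FUNCTION, which is the name of the
   interaction function. The macros below paste FUNCTION onto fixed prefixes,
   so this file then creates functions such as runner_dopair1_FUNCTION,
   runner_dopair2_FUNCTION, runner_dopair1_naive_FUNCTION,
   runner_doself1_FUNCTION, runner_dosub_pair1_FUNCTION and
   runner_dosub_subset_FUNCTION, which call the pairwise interaction functions
   runner_iact_FUNCTION and runner_iact_nonsym_FUNCTION (or their _vec
   variants when WITH_VECTORIZATION is defined). */
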
/* Glue two tokens together with an underscore in between. The per-name
   helper macros (_DOPAIR1 etc.) exist so that FUNCTION is macro-expanded
   before it reaches the ## operator in PASTE. */
#define PASTE(x, y) x##_##y

/* Pair interactions. */
#define _DOPAIR1(f) PASTE(runner_dopair1, f)
#define DOPAIR1 _DOPAIR1(FUNCTION)

#define _DOPAIR2(f) PASTE(runner_dopair2, f)
#define DOPAIR2 _DOPAIR2(FUNCTION)

#define _DOPAIR_SUBSET(f) PASTE(runner_dopair_subset, f)
#define DOPAIR_SUBSET _DOPAIR_SUBSET(FUNCTION)

#define _DOPAIR_SUBSET_NAIVE(f) PASTE(runner_dopair_subset_naive, f)
#define DOPAIR_SUBSET_NAIVE _DOPAIR_SUBSET_NAIVE(FUNCTION)

#define _DOPAIR1_NAIVE(f) PASTE(runner_dopair1_naive, f)
#define DOPAIR1_NAIVE _DOPAIR1_NAIVE(FUNCTION)

#define _DOPAIR2_NAIVE(f) PASTE(runner_dopair2_naive, f)
#define DOPAIR2_NAIVE _DOPAIR2_NAIVE(FUNCTION)

/* Self interactions. */
#define _DOSELF_NAIVE(f) PASTE(runner_doself_naive, f)
#define DOSELF_NAIVE _DOSELF_NAIVE(FUNCTION)

#define _DOSELF1(f) PASTE(runner_doself1, f)
#define DOSELF1 _DOSELF1(FUNCTION)

#define _DOSELF2(f) PASTE(runner_doself2, f)
#define DOSELF2 _DOSELF2(FUNCTION)

#define _DOSELF_SUBSET(f) PASTE(runner_doself_subset, f)
#define DOSELF_SUBSET _DOSELF_SUBSET(FUNCTION)

/* Sub-cell interactions. */
#define _DOSUB_SELF1(f) PASTE(runner_dosub_self1, f)
#define DOSUB_SELF1 _DOSUB_SELF1(FUNCTION)

#define _DOSUB_PAIR1(f) PASTE(runner_dosub_pair1, f)
#define DOSUB_PAIR1 _DOSUB_PAIR1(FUNCTION)

#define _DOSUB_SELF2(f) PASTE(runner_dosub_self2, f)
#define DOSUB_SELF2 _DOSUB_SELF2(FUNCTION)

#define _DOSUB_PAIR2(f) PASTE(runner_dosub_pair2, f)
#define DOSUB_PAIR2 _DOSUB_PAIR2(FUNCTION)

#define _DOSUB_SUBSET(f) PASTE(runner_dosub_subset, f)
#define DOSUB_SUBSET _DOSUB_SUBSET(FUNCTION)

/* Pairwise interaction kernels (scalar and vectorized). */
#define _IACT_NONSYM(f) PASTE(runner_iact_nonsym, f)
#define IACT_NONSYM _IACT_NONSYM(FUNCTION)

#define _IACT(f) PASTE(runner_iact, f)
#define IACT _IACT(FUNCTION)

#define _IACT_NONSYM_VEC(f) PASTE(runner_iact_nonsym_vec, f)
#define IACT_NONSYM_VEC _IACT_NONSYM_VEC(FUNCTION)

#define _IACT_VEC(f) PASTE(runner_iact_vec, f)
#define IACT_VEC _IACT_VEC(FUNCTION)

/* Timer identifiers. */
#define _TIMER_DOSELF(f) PASTE(timer_doself, f)
#define TIMER_DOSELF _TIMER_DOSELF(FUNCTION)

#define _TIMER_DOPAIR(f) PASTE(timer_dopair, f)
#define TIMER_DOPAIR _TIMER_DOPAIR(FUNCTION)

#define _TIMER_DOSUB_SELF(f) PASTE(timer_dosub_self, f)
#define TIMER_DOSUB_SELF _TIMER_DOSUB_SELF(FUNCTION)

#define _TIMER_DOSUB_PAIR(f) PASTE(timer_dosub_pair, f)
#define TIMER_DOSUB_PAIR _TIMER_DOSUB_PAIR(FUNCTION)

#define _TIMER_DOSELF_SUBSET(f) PASTE(timer_doself_subset, f)
#define TIMER_DOSELF_SUBSET _TIMER_DOSELF_SUBSET(FUNCTION)

#define _TIMER_DOPAIR_SUBSET(f) PASTE(timer_dopair_subset, f)
#define TIMER_DOPAIR_SUBSET _TIMER_DOPAIR_SUBSET(FUNCTION)

104
/**
105
 * @brief Compute the interactions between a cell pair (non-symmetric).
106
107
108
109
110
 *
 * @param r The #runner.
 * @param ci The first #cell.
 * @param cj The second #cell.
 */
111
void DOPAIR1_NAIVE(struct runner *r, struct cell *restrict ci,
112
                   struct cell *restrict cj) {
113
114
115
116

  const struct engine *e = r->e;

#ifndef SWIFT_DEBUG_CHECKS
117
// error("Don't use in actual runs ! Slow code !");
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
#endif

#ifdef WITH_VECTORIZATION
  int icount = 0;
  float r2q[VEC_SIZE] __attribute__((aligned(16)));
  float hiq[VEC_SIZE] __attribute__((aligned(16)));
  float hjq[VEC_SIZE] __attribute__((aligned(16)));
  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
  TIMER_TIC;

  /* Anything to do here? */
  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;

  const int count_i = ci->count;
  const int count_j = cj->count;
  struct part *restrict parts_i = ci->parts;
  struct part *restrict parts_j = cj->parts;

  /* Get the relative distance between the pairs, wrapping. */
  double shift[3] = {0.0, 0.0, 0.0};
  for (int k = 0; k < 3; k++) {
    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
      shift[k] = e->s->dim[k];
    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
      shift[k] = -e->s->dim[k];
  }

  /* Loop over the parts in ci. */
  for (int pid = 0; pid < count_i; pid++) {

    /* Get a hold of the ith part in ci. */
    struct part *restrict pi = &parts_i[pid];
    const float hi = pi->h;

    double pix[3];
    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
    const float hig2 = hi * hi * kernel_gamma2;

    /* Loop over the parts in cj. */
    for (int pjd = 0; pjd < count_j; pjd++) {

      /* Get a pointer to the jth particle. */
      struct part *restrict pj = &parts_j[pjd];

      /* Compute the pairwise distance. */
      float r2 = 0.0f;
      float dx[3];
      for (int k = 0; k < 3; k++) {
        dx[k] = pix[k] - pj->x[k];
        r2 += dx[k] * dx[k];
      }

      /* Hit or miss? */
      if (r2 < hig2) {

#ifndef WITH_VECTORIZATION

        IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);

#else

        /* Add this interaction to the queue. */
        r2q[icount] = r2;
        dxq[3 * icount + 0] = dx[0];
        dxq[3 * icount + 1] = dx[1];
        dxq[3 * icount + 2] = dx[2];
        hiq[icount] = hi;
        hjq[icount] = pj->h;
        piq[icount] = pi;
        pjq[icount] = pj;
        icount += 1;

        /* Flush? */
        if (icount == VEC_SIZE) {
          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
          icount = 0;
        }

#endif
      }
      if (r2 < pj->h * pj->h * kernel_gamma2) {

#ifndef WITH_VECTORIZATION
203

204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
        for (int k = 0; k < 3; k++) dx[k] = -dx[k];
        IACT_NONSYM(r2, dx, pj->h, hi, pj, pi);

#else

        /* Add this interaction to the queue. */
        r2q[icount] = r2;
        dxq[3 * icount + 0] = -dx[0];
        dxq[3 * icount + 1] = -dx[1];
        dxq[3 * icount + 2] = -dx[2];
        hiq[icount] = pj->h;
        hjq[icount] = hi;
        piq[icount] = pj;
        pjq[icount] = pi;
        icount += 1;

        /* Flush? */
        if (icount == VEC_SIZE) {
          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
          icount = 0;
        }

#endif
      }

    } /* loop over the parts in cj. */

  } /* loop over the parts in ci. */

#ifdef WITH_VECTORIZATION
  /* Pick up any leftovers. */
  if (icount > 0)
    for (int k = 0; k < icount; k++)
      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
#endif

  TIMER_TOC(TIMER_DOPAIR);
}

void DOPAIR2_NAIVE(struct runner *r, struct cell *restrict ci,
244
                   struct cell *restrict cj) {
245

246
247
  const struct engine *e = r->e;

248
#ifndef SWIFT_DEBUG_CHECKS
249
// error("Don't use in actual runs ! Slow code !");
250
#endif
251

252
#ifdef WITH_VECTORIZATION
253
254
255
256
257
258
259
  int icount = 0;
  float r2q[VEC_SIZE] __attribute__((aligned(16)));
  float hiq[VEC_SIZE] __attribute__((aligned(16)));
  float hjq[VEC_SIZE] __attribute__((aligned(16)));
  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
Matthieu Schaller's avatar
Matthieu Schaller committed
260
  TIMER_TIC;
261
262

  /* Anything to do here? */
263
  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
264

265
266
267
268
269
  const int count_i = ci->count;
  const int count_j = cj->count;
  struct part *restrict parts_i = ci->parts;
  struct part *restrict parts_j = cj->parts;

270
  /* Get the relative distance between the pairs, wrapping. */
271
272
  double shift[3] = {0.0, 0.0, 0.0};
  for (int k = 0; k < 3; k++) {
273
274
275
276
277
278
279
    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
      shift[k] = e->s->dim[k];
    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
      shift[k] = -e->s->dim[k];
  }

  /* Loop over the parts in ci. */
280
  for (int pid = 0; pid < count_i; pid++) {
281
282

    /* Get a hold of the ith part in ci. */
283
284
285
286
287
288
    struct part *restrict pi = &parts_i[pid];
    const float hi = pi->h;

    double pix[3];
    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
    const float hig2 = hi * hi * kernel_gamma2;
289
290

    /* Loop over the parts in cj. */
291
    for (int pjd = 0; pjd < count_j; pjd++) {
292
293

      /* Get a pointer to the jth particle. */
294
      struct part *restrict pj = &parts_j[pjd];
295
296

      /* Compute the pairwise distance. */
297
298
299
      float r2 = 0.0f;
      float dx[3];
      for (int k = 0; k < 3; k++) {
300
301
302
303
304
305
306
        dx[k] = pix[k] - pj->x[k];
        r2 += dx[k] * dx[k];
      }

      /* Hit or miss? */
      if (r2 < hig2 || r2 < pj->h * pj->h * kernel_gamma2) {

307
#ifndef WITH_VECTORIZATION
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327

        IACT(r2, dx, hi, pj->h, pi, pj);

#else

        /* Add this interaction to the queue. */
        r2q[icount] = r2;
        dxq[3 * icount + 0] = dx[0];
        dxq[3 * icount + 1] = dx[1];
        dxq[3 * icount + 2] = dx[2];
        hiq[icount] = hi;
        hjq[icount] = pj->h;
        piq[icount] = pi;
        pjq[icount] = pj;
        icount += 1;

        /* Flush? */
        if (icount == VEC_SIZE) {
          IACT_VEC(r2q, dxq, hiq, hjq, piq, pjq);
          icount = 0;
328
329
        }

330
331
332
333
334
335
336
#endif
      }

    } /* loop over the parts in cj. */

  } /* loop over the parts in ci. */

337
#ifdef WITH_VECTORIZATION
338
339
  /* Pick up any leftovers. */
  if (icount > 0)
340
    for (int k = 0; k < icount; k++)
341
342
343
344
345
346
347
348
      IACT(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
#endif

  TIMER_TOC(TIMER_DOPAIR);
}

void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) {

349
  const struct engine *e = r->e;
350

351
#ifndef SWIFT_DEBUG_CHECKS
352
// error("Don't use in actual runs ! Slow code !");
353
#endif
354

355
#ifdef WITH_VECTORIZATION
356
357
358
359
360
361
362
  int icount = 0;
  float r2q[VEC_SIZE] __attribute__((aligned(16)));
  float hiq[VEC_SIZE] __attribute__((aligned(16)));
  float hjq[VEC_SIZE] __attribute__((aligned(16)));
  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
363

Matthieu Schaller's avatar
Matthieu Schaller committed
364
  TIMER_TIC;
365
366

  /* Anything to do here? */
367
  if (!cell_is_active(c, e)) return;
368

369
370
  const int count = c->count;
  struct part *restrict parts = c->parts;
371
372

  /* Loop over the parts in ci. */
373
  for (int pid = 0; pid < count; pid++) {
374
375

    /* Get a hold of the ith part in ci. */
376
377
378
379
    struct part *restrict pi = &parts[pid];
    const double pix[3] = {pi->x[0], pi->x[1], pi->x[2]};
    const float hi = pi->h;
    const float hig2 = hi * hi * kernel_gamma2;
380

381
    /* Loop over the parts in cj. */
382
    for (int pjd = pid + 1; pjd < count; pjd++) {
383
384

      /* Get a pointer to the jth particle. */
385
      struct part *restrict pj = &parts[pjd];
386
387

      /* Compute the pairwise distance. */
388
389
390
      float r2 = 0.0f;
      float dx[3];
      for (int k = 0; k < 3; k++) {
391
392
393
394
395
396
397
        dx[k] = pix[k] - pj->x[k];
        r2 += dx[k] * dx[k];
      }

      /* Hit or miss? */
      if (r2 < hig2 || r2 < pj->h * pj->h * kernel_gamma2) {

398
#ifndef WITH_VECTORIZATION
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419

        IACT(r2, dx, hi, pj->h, pi, pj);

#else

        /* Add this interaction to the queue. */
        r2q[icount] = r2;
        dxq[3 * icount + 0] = dx[0];
        dxq[3 * icount + 1] = dx[1];
        dxq[3 * icount + 2] = dx[2];
        hiq[icount] = hi;
        hjq[icount] = pj->h;
        piq[icount] = pi;
        pjq[icount] = pj;
        icount += 1;

        /* Flush? */
        if (icount == VEC_SIZE) {
          IACT_VEC(r2q, dxq, hiq, hjq, piq, pjq);
          icount = 0;
        }
420

421
422
#endif
      }
423

424
    } /* loop over the parts in cj. */
425

426
427
  } /* loop over the parts in ci. */

428
#ifdef WITH_VECTORIZATION
429
430
  /* Pick up any leftovers. */
  if (icount > 0)
431
    for (int k = 0; k < icount; k++)
432
433
      IACT(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
#endif
434

435
436
  TIMER_TOC(TIMER_DOSELF);
}
437

438
439
440
441
442
443
/**
 * @brief Compute the interactions between a cell pair, but only for the
 *      given indices in ci.
 *
 * @param r The #runner.
 * @param ci The first #cell.
444
 * @param parts_i The #part to interact with @c cj.
445
446
447
448
 * @param ind The list of indices of particles in @c ci to interact with.
 * @param count The number of particles in @c ind.
 * @param cj The second #cell.
 */
449
450
451
452
453
454
455
456
void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci,
                         struct part *restrict parts_i, int *restrict ind,
                         int count, struct cell *restrict cj) {

  struct engine *e = r->e;

  error("Don't use in actual runs ! Slow code !");

457
#ifdef WITH_VECTORIZATION
458
459
460
461
462
463
464
465
  int icount = 0;
  float r2q[VEC_SIZE] __attribute__((aligned(16)));
  float hiq[VEC_SIZE] __attribute__((aligned(16)));
  float hjq[VEC_SIZE] __attribute__((aligned(16)));
  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif

Matthieu Schaller's avatar
Matthieu Schaller committed
466
  TIMER_TIC;
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502

  const int count_j = cj->count;
  struct part *restrict parts_j = cj->parts;

  /* Get the relative distance between the pairs, wrapping. */
  double shift[3] = {0.0, 0.0, 0.0};
  for (int k = 0; k < 3; k++) {
    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
      shift[k] = e->s->dim[k];
    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
      shift[k] = -e->s->dim[k];
  }

  /* Loop over the parts_i. */
  for (int pid = 0; pid < count; pid++) {

    /* Get a hold of the ith part in ci. */
    struct part *restrict pi = &parts_i[ind[pid]];
    double pix[3];
    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
    const float hi = pi->h;
    const float hig2 = hi * hi * kernel_gamma2;

    /* Loop over the parts in cj. */
    for (int pjd = 0; pjd < count_j; pjd++) {

      /* Get a pointer to the jth particle. */
      struct part *restrict pj = &parts_j[pjd];

      /* Compute the pairwise distance. */
      float r2 = 0.0f;
      float dx[3];
      for (int k = 0; k < 3; k++) {
        dx[k] = pix[k] - pj->x[k];
        r2 += dx[k] * dx[k];
      }
503

504
505
506
      /* Hit or miss? */
      if (r2 < hig2) {

507
#ifndef WITH_VECTORIZATION
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535

        IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);

#else

        /* Add this interaction to the queue. */
        r2q[icount] = r2;
        dxq[3 * icount + 0] = dx[0];
        dxq[3 * icount + 1] = dx[1];
        dxq[3 * icount + 2] = dx[2];
        hiq[icount] = hi;
        hjq[icount] = pj->h;
        piq[icount] = pi;
        pjq[icount] = pj;
        icount += 1;

        /* Flush? */
        if (icount == VEC_SIZE) {
          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
          icount = 0;
        }

#endif
      }

    } /* loop over the parts in cj. */

  } /* loop over the parts in ci. */
536

537
#ifdef WITH_VECTORIZATION
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
  /* Pick up any leftovers. */
  if (icount > 0)
    for (int k = 0; k < icount; k++)
      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
#endif

  TIMER_TOC(timer_dopair_subset);
}

/**
 * @brief Compute the interactions between a cell pair, but only for the
 *      given indices in ci.
 *
 * @param r The #runner.
 * @param ci The first #cell.
 * @param parts_i The #part to interact with @c cj.
 * @param ind The list of indices of particles in @c ci to interact with.
 * @param count The number of particles in @c ind.
 * @param cj The second #cell.
 */
558
559
560
561
562
void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci,
                   struct part *restrict parts_i, int *restrict ind, int count,
                   struct cell *restrict cj) {

  struct engine *e = r->e;
563

564
#ifdef WITH_VECTORIZATION
565
566
567
568
569
570
571
  int icount = 0;
  float r2q[VEC_SIZE] __attribute__((aligned(16)));
  float hiq[VEC_SIZE] __attribute__((aligned(16)));
  float hjq[VEC_SIZE] __attribute__((aligned(16)));
  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
572

Matthieu Schaller's avatar
Matthieu Schaller committed
573
  TIMER_TIC;
574

575
576
577
  const int count_j = cj->count;
  struct part *restrict parts_j = cj->parts;

578
  /* Get the relative distance between the pairs, wrapping. */
579
580
  double shift[3] = {0.0, 0.0, 0.0};
  for (int k = 0; k < 3; k++) {
581
582
583
584
585
586
587
    if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2)
      shift[k] = e->s->dim[k];
    else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2)
      shift[k] = -e->s->dim[k];
  }

  /* Get the sorting index. */
588
589
  int sid = 0;
  for (int k = 0; k < 3; k++)
590
591
592
593
594
    sid = 3 * sid + ((cj->loc[k] - ci->loc[k] + shift[k] < 0)
                         ? 0
                         : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1);

  /* Switch the cells around? */
595
  const int flipped = runner_flip[sid];
596
597
598
599
600
601
  sid = sortlistID[sid];

  /* Have the cells been sorted? */
  if (!(cj->sorted & (1 << sid))) error("Trying to interact unsorted cells.");

  /* Pick-out the sorted lists. */
602
  const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
603
  const float dxj = cj->dx_max_sort;
604
605
606
607
608

  /* Parts are on the left? */
  if (!flipped) {

    /* Loop over the parts_i. */
609
    for (int pid = 0; pid < count; pid++) {
610
611

      /* Get a hold of the ith part in ci. */
612
613
614
615
616
617
618
619
620
      struct part *restrict pi = &parts_i[ind[pid]];
      double pix[3];
      for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];

      const float hi = pi->h;
      const float hig2 = hi * hi * kernel_gamma2;
      const float di = hi * kernel_gamma + dxj + pix[0] * runner_shift[sid][0] +
                       pix[1] * runner_shift[sid][1] +
                       pix[2] * runner_shift[sid][2];
621
622

      /* Loop over the parts in cj. */
623
      for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
624
625

        /* Get a pointer to the jth particle. */
626
        struct part *restrict pj = &parts_j[sort_j[pjd].i];
627
628

        /* Compute the pairwise distance. */
629
630
631
        float r2 = 0.0f;
        float dx[3];
        for (int k = 0; k < 3; k++) {
632
633
          dx[k] = pix[k] - pj->x[k];
          r2 += dx[k] * dx[k];
634
        }
635
636
637
638

        /* Hit or miss? */
        if (r2 < hig2) {

639
#ifndef WITH_VECTORIZATION
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662

          IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);

#else

          /* Add this interaction to the queue. */
          r2q[icount] = r2;
          dxq[3 * icount + 0] = dx[0];
          dxq[3 * icount + 1] = dx[1];
          dxq[3 * icount + 2] = dx[2];
          hiq[icount] = hi;
          hjq[icount] = pj->h;
          piq[icount] = pi;
          pjq[icount] = pj;
          icount += 1;

          /* Flush? */
          if (icount == VEC_SIZE) {
            IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
            icount = 0;
          }

#endif
663
        }
664
665
666
667
668
669
670
671
672
673
674

      } /* loop over the parts in cj. */

    } /* loop over the parts in ci. */

  }

  /* Parts are on the right. */
  else {

    /* Loop over the parts_i. */
675
    for (int pid = 0; pid < count; pid++) {
676
677

      /* Get a hold of the ith part in ci. */
678
679
680
681
682
683
684
685
      struct part *restrict pi = &parts_i[ind[pid]];
      double pix[3];
      for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
      const float hi = pi->h;
      const float hig2 = hi * hi * kernel_gamma2;
      const float di =
          -hi * kernel_gamma - dxj + pix[0] * runner_shift[sid][0] +
          pix[1] * runner_shift[sid][1] + pix[2] * runner_shift[sid][2];
686
687

      /* Loop over the parts in cj. */
688
      for (int pjd = count_j - 1; pjd >= 0 && di < sort_j[pjd].d; pjd--) {
689
690

        /* Get a pointer to the jth particle. */
691
        struct part *restrict pj = &parts_j[sort_j[pjd].i];
692
693

        /* Compute the pairwise distance. */
694
695
696
        float r2 = 0.0f;
        float dx[3];
        for (int k = 0; k < 3; k++) {
697
698
          dx[k] = pix[k] - pj->x[k];
          r2 += dx[k] * dx[k];
699
        }
700

701
702
        /* Hit or miss? */
        if (r2 < hig2) {
703

704
#ifndef WITH_VECTORIZATION
705
706

          IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);
707

708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
#else

          /* Add this interaction to the queue. */
          r2q[icount] = r2;
          dxq[3 * icount + 0] = dx[0];
          dxq[3 * icount + 1] = dx[1];
          dxq[3 * icount + 2] = dx[2];
          hiq[icount] = hi;
          hjq[icount] = pj->h;
          piq[icount] = pi;
          pjq[icount] = pj;
          icount += 1;

          /* Flush? */
          if (icount == VEC_SIZE) {
            IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
            icount = 0;
          }

#endif
        }

      } /* loop over the parts in cj. */

    } /* loop over the parts in ci. */
  }

735
#ifdef WITH_VECTORIZATION
736
737
  /* Pick up any leftovers. */
  if (icount > 0)
738
    for (int k = 0; k < icount; k++)
739
740
      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
#endif
Pedro Gonnet's avatar
Pedro Gonnet committed
741

742
743
  TIMER_TOC(timer_dopair_subset);
}
Pedro Gonnet's avatar
Pedro Gonnet committed
744

745
746
747
748
749
750
/**
 * @brief Compute the interactions between a cell pair, but only for the
 *      given indices in ci.
 *
 * @param r The #runner.
 * @param ci The first #cell.
751
 * @param parts The #part to interact.
752
753
754
 * @param ind The list of indices of particles in @c ci to interact with.
 * @param count The number of particles in @c ind.
 */
755
756
757
void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci,
                   struct part *restrict parts, int *restrict ind, int count) {

758
#ifdef WITH_VECTORIZATION
759
760
761
762
763
764
765
  int icount = 0;
  float r2q[VEC_SIZE] __attribute__((aligned(16)));
  float hiq[VEC_SIZE] __attribute__((aligned(16)));
  float hjq[VEC_SIZE] __attribute__((aligned(16)));
  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
766

Matthieu Schaller's avatar
Matthieu Schaller committed
767
  TIMER_TIC;
768

769
770
  const int count_i = ci->count;
  struct part *restrict parts_j = ci->parts;
771
772

  /* Loop over the parts in ci. */
773
  for (int pid = 0; pid < count; pid++) {
774
775

    /* Get a hold of the ith part in ci. */
776
777
778
779
    struct part *restrict pi = &parts[ind[pid]];
    const double pix[3] = {pi->x[0], pi->x[1], pi->x[2]};
    const float hi = pi->h;
    const float hig2 = hi * hi * kernel_gamma2;
780

781
    /* Loop over the parts in cj. */
782
    for (int pjd = 0; pjd < count_i; pjd++) {
783
784

      /* Get a pointer to the jth particle. */
785
      struct part *restrict pj = &parts_j[pjd];
786
787

      /* Compute the pairwise distance. */
788
789
790
      float r2 = 0.0f;
      float dx[3];
      for (int k = 0; k < 3; k++) {
791
792
793
794
795
796
797
        dx[k] = pix[k] - pj->x[k];
        r2 += dx[k] * dx[k];
      }

      /* Hit or miss? */
      if (r2 > 0.0f && r2 < hig2) {

798
#ifndef WITH_VECTORIZATION
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826

        IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);

#else

        /* Add this interaction to the queue. */
        r2q[icount] = r2;
        dxq[3 * icount + 0] = dx[0];
        dxq[3 * icount + 1] = dx[1];
        dxq[3 * icount + 2] = dx[2];
        hiq[icount] = hi;
        hjq[icount] = pj->h;
        piq[icount] = pi;
        pjq[icount] = pj;
        icount += 1;

        /* Flush? */
        if (icount == VEC_SIZE) {
          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
          icount = 0;
        }

#endif
      }

    } /* loop over the parts in cj. */

  } /* loop over the parts in ci. */
827

828
#ifdef WITH_VECTORIZATION
829
830
  /* Pick up any leftovers. */
  if (icount > 0)
831
    for (int k = 0; k < icount; k++)
832
833
834
835
836
      IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]);
#endif

  TIMER_TOC(timer_dopair_subset);
}
837

838
/**
839
 * @brief Compute the interactions between a cell pair (non-symmetric).
840
841
842
843
844
 *
 * @param r The #runner.
 * @param ci The first #cell.
 * @param cj The second #cell.
 */
845
846
void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) {

847
  const struct engine *restrict e = r->e;
848

849
#ifdef WITH_VECTORIZATION
850
851
852
853
854
855
856
  int icount = 0;
  float r2q[VEC_SIZE] __attribute__((aligned(16)));
  float hiq[VEC_SIZE] __attribute__((aligned(16)));
  float hjq[VEC_SIZE] __attribute__((aligned(16)));
  float dxq[3 * VEC_SIZE] __attribute__((aligned(16)));
  struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
857

Matthieu Schaller's avatar
Matthieu Schaller committed
858
  TIMER_TIC;
859
860

  /* Anything to do here? */
861
  if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
862

863
864
  if (!cell_is_drifted(ci, e) || !cell_is_drifted(cj, e))
    error("Interacting undrifted cells.");
865

866
  /* Get the sort ID. */
867
868
  double shift[3] = {0.0, 0.0, 0.0};
  const int sid = space_getsid(e->s, &ci, &cj, shift);
869
870
871
872
873
874

  /* Have the cells been sorted? */
  if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
    error("Trying to interact unsorted cells.");

  /* Get the cutoff shift. */
875
876
  double rshift = 0.0;
  for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
877
878

  /* Pick-out the sorted lists. */
879
880
  const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
  const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
881

882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
#ifdef SWIFT_DEBUG_CHECKS
  /* Check that the dx_max_sort values in the cell are indeed an upper
     bound on particle movement. */
  for (int pid = 0; pid < ci->count; pid++) {
    const struct part *p = &ci->parts[sort_i[pid].i];
    const float d = p->x[0] * runner_shift[sid][0] +
                    p->x[1] * runner_shift[sid][1] +
                    p->x[2] * runner_shift[sid][2];
    if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort > 1.0e-6)
      error("particle shift diff exceeds dx_max_sort.");
  }
  for (int pjd = 0; pjd < cj->count; pjd++) {
    const struct part *p = &cj->parts[sort_j[pjd].i];
    const float d = p->x[0] * runner_shift[sid][0] +
                    p->x[1] * runner_shift[sid][1] +
                    p->x[2] * runner_shift[sid][2];
    if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort > 1.0e-6)
      error("particle shift diff exceeds dx_max_sort.");
  }
#endif /* SWIFT_DEBUG_CHECKS */

903
  /* Get some other useful values. */
904
905
906
907
908
909
910
911
  const double hi_max = ci->h_max * kernel_gamma - rshift;
  const double hj_max = cj->h_max * kernel_gamma;
  const int count_i = ci->count;
  const int count_j = cj->count;
  struct part *restrict parts_i = ci->parts;
  struct part *restrict parts_j = cj->parts;
  const double di_max = sort_i[count_i - 1].d - rshift;
  const double dj_min = sort_j[0].d;
912
  const float dx_max = (ci->dx_max_sort + cj->dx_max_sort);
913
914

  /* Loop over the parts in ci. */
915
916
  for (int pid = count_i - 1;
       pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) {
917
918

    /* Get a hold of the ith part in ci. */
919
    struct part *restrict pi = &parts_i[sort_i[pid].i];
920
    if (!part_is_active(pi, e)) continue;
921
922
    const float hi = pi->h;
    const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
923
924
    if (di < dj_min) continue;

925
926
927
    double pix[3];
    for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k];
    const float hig2 = hi * hi * kernel_gamma2;
928
929

    /* Loop over the parts in cj. */
930
    for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) {
931
932

      /* Get a pointer to the jth particle. */
933
      struct part *restrict pj = &parts_j[sort_j[pjd].i];
934
935

      /* Compute the pairwise distance. */
936
937
938
      float r2 = 0.0f;
      float dx[3];
      for (int k = 0; k < 3; k++) {
939
940
941
942
        dx[k] = pix[k] - pj->x[k];
        r2 += dx[k] * dx[k];
      }

943
944
945
946
947
948
949
950
#ifdef SWIFT_DEBUG_CHECKS
      /* Check that particles have been drifted to the current time */
      if (pi->ti_drift != e->ti_current)
        error("Particle pi not drifted to current time");
      if (pj->ti_drift != e->ti_current)
        error("Particle pj not drifted to current time");
#endif

951
952
953
      /* Hit or miss? */
      if (r2 < hig2) {

954
#ifndef WITH_VECTORIZATION
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984

        IACT_NONSYM(r2, dx, hi, pj->h, pi, pj);

#else

        /* Add this interaction to the queue. */
        r2q[icount] = r2;
        dxq[3 * icount + 0] = dx[0];
        dxq[3 * icount + 1] = dx[1];
        dxq[3 * icount + 2] = dx[2];
        hiq[icount] = hi;
        hjq[icount] = pj->h;
        piq[icount] = pi;
        pjq[icount] = pj;
        icount += 1;

        /* Flush? */
        if (icount == VEC_SIZE) {
          IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq);
          icount = 0;
        }

#endif
      }

    } /* loop over the parts in cj. */

  } /* loop over the parts in ci. */

  /* Loop over the parts in cj. */
985
  for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
986
987
988
       pjd++) {

    /* Get a hold of the jth part in cj. */
989
    struct part *restrict pj = &parts_j[sort_j[pjd].i];
990
    if (!part_is_active(pj, e)) continue;
991
992
    const float hj = pj->h;
    const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
993
994
    if (dj > di_max) continue;

995
996
997
    double pjx[3];
    for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
    const float hjg2 = hj * hj * kernel_gamma2;
998
999

    /* Loop over the parts in ci. */
1000
    for (int pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) {
For faster browsing, not all history is shown. View entire blame