cache.h 38.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/*******************************************************************************
 * This file is part of SWIFT.
 * Copyright (c) 2016 James Willis (jame.s.willis@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/
#ifndef SWIFT_CACHE_H
#define SWIFT_CACHE_H

/* Config parameters. */
#include "../config.h"

/* Local headers */
26
#include "align.h"
27
#include "cell.h"
James Willis's avatar
James Willis committed
28
#include "error.h"
James Willis's avatar
James Willis committed
29
#include "part.h"
30
#include "sort_part.h"
James Willis's avatar
James Willis committed
31
#include "vector.h"
32

33
34
35
#define NUM_VEC_PROC 2
#define C2_CACHE_SIZE (NUM_VEC_PROC * VEC_SIZE * 6) + (NUM_VEC_PROC * VEC_SIZE)

36
#ifdef WITH_VECTORIZATION
James Willis's avatar
James Willis committed
37
/* Cache struct to hold a local copy of a cells' particle
38
 * properties required for density/force calculations.*/
James Willis's avatar
James Willis committed
39
struct cache {
40
41

  /* Particle x position. */
42
  float *restrict x SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
43

44
  /* Particle y position. */
45
  float *restrict y SWIFT_CACHE_ALIGN;
46
47

  /* Particle z position. */
48
  float *restrict z SWIFT_CACHE_ALIGN;
49
50

  /* Particle smoothing length. */
51
  float *restrict h SWIFT_CACHE_ALIGN;
52
53

  /* Particle mass. */
54
  float *restrict m SWIFT_CACHE_ALIGN;
55
56

  /* Particle x velocity. */
57
  float *restrict vx SWIFT_CACHE_ALIGN;
58
59

  /* Particle y velocity. */
60
  float *restrict vy SWIFT_CACHE_ALIGN;
61
62

  /* Particle z velocity. */
63
  float *restrict vz SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
64

65
  /* Maximum index into neighbouring cell for particles that are in range. */
66
  int *restrict max_index SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
67

James Willis's avatar
James Willis committed
68
  /* Particle density. */
69
  float *restrict rho SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
70

James Willis's avatar
James Willis committed
71
  /* Particle smoothing length gradient. */
72
  float *restrict grad_h SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
73

James Willis's avatar
James Willis committed
74
  /* Pressure over density squared. */
75
  float *restrict pOrho2 SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
76

James Willis's avatar
James Willis committed
77
  /* Balsara switch. */
78
  float *restrict balsara SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
79

James Willis's avatar
James Willis committed
80
  /* Particle sound speed. */
81
  float *restrict soundspeed SWIFT_CACHE_ALIGN;
82
83
84
85
86

  /* Cache size. */
  int count;
};

James Willis's avatar
James Willis committed
87
88
/* Secondary cache struct to hold a list of interactions between two
 * particles.*/
89
90
91
struct c2_cache {

  /* Separation between two particles squared. */
92
  float r2q[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
93
94

  /* x separation between two particles. */
95
  float dxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
96
97

  /* y separation between two particles. */
98
  float dyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
99
100

  /* z separation between two particles. */
101
  float dzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
102
103

  /* Mass of particle pj. */
104
  float mq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
105
106

  /* x velocity of particle pj. */
107
  float vxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
108

109
  /* y velocity of particle pj. */
110
  float vyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
111
112

  /* z velocity of particle pj. */
113
  float vzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
114
115
};

116
117
118
119
120
121
/**
 * @brief Allocate memory and initialise cache.
 *
 * @param c The cache.
 * @param count Number of particles to allocate space for.
 */
James Willis's avatar
James Willis committed
122
123
__attribute__((always_inline)) INLINE void cache_init(struct cache *c,
                                                      size_t count) {
124

James Willis's avatar
James Willis committed
125
126
  /* Align cache on correct byte boundary and pad cache size to be a multiple of
   * the vector size
127
   * and include 2 vector lengths for remainder operations. */
128
  size_t pad = 2 * VEC_SIZE, rem = count % VEC_SIZE;
129
  if (rem > 0) pad += VEC_SIZE - rem;
130
131
  size_t sizeBytes = (count + pad) * sizeof(float);
  size_t sizeIntBytes = (count + pad) * sizeof(int);
132
133
  int error = 0;

134
135
136
137
138
139
140
141
142
143
  /* Free memory if cache has already been allocated. */
  if (c->count > 0) {
    free(c->x);
    free(c->y);
    free(c->z);
    free(c->m);
    free(c->vx);
    free(c->vy);
    free(c->vz);
    free(c->h);
144
    free(c->max_index);
145
146
147
148
149
    free(c->rho);
    free(c->grad_h);
    free(c->pOrho2);
    free(c->balsara);
    free(c->soundspeed);
150
151
  }

152
153
154
155
156
157
158
159
  error += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error += posix_memalign((void **)&c->y, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error += posix_memalign((void **)&c->z, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error += posix_memalign((void **)&c->m, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error += posix_memalign((void **)&c->vx, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error += posix_memalign((void **)&c->vy, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error += posix_memalign((void **)&c->vz, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error += posix_memalign((void **)&c->h, SWIFT_CACHE_ALIGNMENT, sizeBytes);
James Willis's avatar
James Willis committed
160
161
  error += posix_memalign((void **)&c->max_index, SWIFT_CACHE_ALIGNMENT,
                          sizeIntBytes);
162
  error += posix_memalign((void **)&c->rho, SWIFT_CACHE_ALIGNMENT, sizeBytes);
James Willis's avatar
James Willis committed
163
164
165
166
167
168
169
170
  error +=
      posix_memalign((void **)&c->grad_h, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error +=
      posix_memalign((void **)&c->pOrho2, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error +=
      posix_memalign((void **)&c->balsara, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error +=
      posix_memalign((void **)&c->soundspeed, SWIFT_CACHE_ALIGNMENT, sizeBytes);
171

James Willis's avatar
James Willis committed
172
173
  if (error != 0)
    error("Couldn't allocate cache, no. of particles: %d", (int)count);
174
175
176
177
178
179
180
181
  c->count = count;
}

/**
 * @brief Populate cache by reading in the particles in unsorted order.
 *
 * @param ci The #cell.
 * @param ci_cache The cache.
182
 * @return uninhibited_count The no. of uninhibited particles.
183
 */
184
__attribute__((always_inline)) INLINE int cache_read_particles(
James Willis's avatar
James Willis committed
185
186
    const struct cell *restrict const ci,
    struct cache *restrict const ci_cache) {
187

188
189
#if defined(GADGET2_SPH)

James Willis's avatar
James Willis committed
190
191
  /* Let the compiler know that the data is aligned and create pointers to the
   * arrays inside the cache. */
James Willis's avatar
James Willis committed
192
193
194
195
196
197
198
199
  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
James Willis's avatar
James Willis committed
200

201
  const int count = ci->hydro.count;
202
  const struct part *restrict parts = ci->hydro.parts;
203
  const double loc[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};
204
205
206
207
  const double max_dx = ci->hydro.dx_max_part;
  const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
                               -(2. * ci->width[1] + max_dx),
                               -(2. * ci->width[2] + max_dx)};
208
  const float h_padded = ci->hydro.h_max / 4.;
209

James Willis's avatar
James Willis committed
210
211
  /* Shift the particles positions to a local frame so single precision can be
   * used instead of double precision. */
212
213
214
215
216
217
218
  for (int i = 0; i < count; i++) {

    /* Pad inhibited particles. */
    if (parts[i].time_bin >= time_bin_inhibited) {
      x[i] = pos_padded[0];
      y[i] = pos_padded[1];
      z[i] = pos_padded[2];
219
      h[i] = h_padded;
Matthieu Schaller's avatar
Matthieu Schaller committed
220

221
222
223
224
225
226
227
228
229
230
231
232
      continue;
    }

    x[i] = (float)(parts[i].x[0] - loc[0]);
    y[i] = (float)(parts[i].x[1] - loc[1]);
    z[i] = (float)(parts[i].x[2] - loc[2]);
    h[i] = parts[i].h;
    m[i] = parts[i].mass;
    vx[i] = parts[i].v[0];
    vy[i] = parts[i].v[1];
    vz[i] = parts[i].v[2];
  }
233

Matthieu Schaller's avatar
Matthieu Schaller committed
234
235
  /* Pad cache if the no. of particles is not a multiple of double the vector
   * length. */
236
237
238
239
240
241
242
243
244
245
246
  int count_align = count;
  const int rem = count % (NUM_VEC_PROC * VEC_SIZE);
  if (rem != 0) {
    count_align += (NUM_VEC_PROC * VEC_SIZE) - rem;

    /* Set positions to something outside of the range of any particle */
    for (int i = count; i < count_align; i++) {
      x[i] = pos_padded[0];
      y[i] = pos_padded[1];
      z[i] = pos_padded[2];
    }
James Willis's avatar
James Willis committed
247
  }
248

249
  return count_align;
250

251
252
253
#else
  error("Can't call the cache reading function with this flavour of SPH!");
  return 0;
254
#endif
255
256
}

257
/**
258
259
 * @brief Populate cache by reading in the particles in unsorted order for
 * doself_subset.
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
 *
 * @param ci The #cell.
 * @param ci_cache The cache.
 * @return uninhibited_count The no. of uninhibited particles.
 */
__attribute__((always_inline)) INLINE int cache_read_particles_subset_self(
    const struct cell *restrict const ci,
    struct cache *restrict const ci_cache) {

#if defined(GADGET2_SPH)

  /* Let the compiler know that the data is aligned and create pointers to the
   * arrays inside the cache. */
  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);

  const int count = ci->hydro.count;
  const struct part *restrict parts = ci->hydro.parts;
  const double loc[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};
  const double max_dx = ci->hydro.dx_max_part;
  const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
                               -(2. * ci->width[1] + max_dx),
                               -(2. * ci->width[2] + max_dx)};

  /* Shift the particles positions to a local frame so single precision can be
   * used instead of double precision. */
  for (int i = 0; i < count; i++) {

    /* Pad inhibited particles. */
    if (parts[i].time_bin >= time_bin_inhibited) {
      x[i] = pos_padded[0];
      y[i] = pos_padded[1];
      z[i] = pos_padded[2];

      continue;
    }

    x[i] = (float)(parts[i].x[0] - loc[0]);
    y[i] = (float)(parts[i].x[1] - loc[1]);
    z[i] = (float)(parts[i].x[2] - loc[2]);
    m[i] = parts[i].mass;
    vx[i] = parts[i].v[0];
    vy[i] = parts[i].v[1];
    vz[i] = parts[i].v[2];
  }

  /* Pad cache if the no. of particles is not a multiple of double the vector
   * length. */
  int count_align = count;
  const int rem = count % (NUM_VEC_PROC * VEC_SIZE);
  if (rem != 0) {
    count_align += (NUM_VEC_PROC * VEC_SIZE) - rem;

    /* Set positions to something outside of the range of any particle */
    for (int i = count; i < count_align; i++) {
      x[i] = pos_padded[0];
      y[i] = pos_padded[1];
      z[i] = pos_padded[2];
    }
  }

  return count_align;

#else
  error("Can't call the cache reading function with this flavour of SPH!");
  return 0;
#endif
}

334
335
336
337
338
339
340
341
342
343
344
345
346
347
/**
 * @brief Populate cache by only reading particles that are within range of
 * each other within the adjoining cell. Also read the particles into the cache
 * in sorted order.
 *
 * @param ci The i #cell.
 * @param ci_cache The #cache for cell ci.
 * @param sort_i The array of sorted particle indices for cell ci.
 * @param first_pi The first particle in cell ci that is in range.
 * @param last_pi The last particle in cell ci that is in range.
 * @param last_pi The last particle in cell ci that is in range.
 * @param loc The cell location to remove from the particle positions.
 * @param flipped Flag to check whether the cells have been flipped or not.
 */
348
__attribute__((always_inline)) INLINE void cache_read_particles_subset_pair(
James Willis's avatar
James Willis committed
349
350
351
    const struct cell *restrict const ci, struct cache *restrict const ci_cache,
    const struct entry *restrict sort_i, int *first_pi, int *last_pi,
    const double *loc, const int flipped) {
352
353
354
355
356
357
358
359
360
361
362
363
364

#if defined(GADGET2_SPH)

  /* Let the compiler know that the data is aligned and create pointers to the
   * arrays inside the cache. */
  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);

365
  const struct part *restrict parts = ci->hydro.parts;
366

James Willis's avatar
James Willis committed
367
  /* The cell is on the right so read the particles
368
   * into the cache from the start of the cell. */
James Willis's avatar
James Willis committed
369
  if (!flipped) {
James Willis's avatar
James Willis committed
370
    const int rem = (*last_pi + 1) % VEC_SIZE;
371
    if (rem != 0) {
James Willis's avatar
James Willis committed
372
      const int pad = VEC_SIZE - rem;
373

374
      /* Increase last_pi if there are particles in the cell left to read. */
375
      if (*last_pi + pad < ci->hydro.count) *last_pi += pad;
376
377
    }

378
379
380
381
382
    const double max_dx = ci->hydro.dx_max_part;
    const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
                                 -(2. * ci->width[1] + max_dx),
                                 -(2. * ci->width[2] + max_dx)};

383
384
    /* Shift the particles positions to a local frame so single precision can be
     * used instead of double precision. */
385
    for (int i = 0; i < *last_pi; i++) {
386
      const int idx = sort_i[i].i;
387

388
389
390
391
392
393
394
395
396
397
398
399
400
      /* Put inhibited particles out of range. */
      if (parts[idx].time_bin >= time_bin_inhibited) {
        x[i] = pos_padded[0];
        y[i] = pos_padded[1];
        z[i] = pos_padded[2];
        m[i] = 1.f;
        vx[i] = 1.f;
        vy[i] = 1.f;
        vz[i] = 1.f;

        continue;
      }

401
402
403
404
405
406
407
408
409
410
411
412
      x[i] = (float)(parts[idx].x[0] - loc[0]);
      y[i] = (float)(parts[idx].x[1] - loc[1]);
      z[i] = (float)(parts[idx].x[2] - loc[2]);
      m[i] = parts[idx].mass;
      vx[i] = parts[idx].v[0];
      vy[i] = parts[idx].v[1];
      vz[i] = parts[idx].v[2];
    }

    /* Pad cache with fake particles that exist outside the cell so will not
     * interact. We use values of the same magnitude (but negative!) as the real
     * particles to avoid overflow problems. */
413
    for (int i = *last_pi; i < *last_pi + VEC_SIZE; i++) {
414
415
416
417
418
419
420
421
422
      x[i] = pos_padded[0];
      y[i] = pos_padded[1];
      z[i] = pos_padded[2];

      m[i] = 1.f;
      vx[i] = 1.f;
      vy[i] = 1.f;
      vz[i] = 1.f;
    }
423
  }
James Willis's avatar
James Willis committed
424
  /* The cell is on the left so read the particles
425
   * into the cache from the end of the cell. */
426
  else {
427
    const int rem = (ci->hydro.count - *first_pi) % VEC_SIZE;
428
    if (rem != 0) {
James Willis's avatar
James Willis committed
429
      const int pad = VEC_SIZE - rem;
430
431
432
433
434

      /* Decrease first_pi if there are particles in the cell left to read. */
      if (*first_pi - pad >= 0) *first_pi -= pad;
    }

435
    const int ci_cache_count = ci->hydro.count - *first_pi;
436
437
438
439
    const double max_dx = ci->hydro.dx_max_part;
    const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
                                 -(2. * ci->width[1] + max_dx),
                                 -(2. * ci->width[2] + max_dx)};
440

441
442
    /* Shift the particles positions to a local frame so single precision can be
     * used instead of double precision. */
443
444
    for (int i = 0; i < ci_cache_count; i++) {
      const int idx = sort_i[i + *first_pi].i;
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459

      /* Put inhibited particles out of range. */
      if (parts[idx].time_bin >= time_bin_inhibited) {
        x[i] = pos_padded[0];
        y[i] = pos_padded[1];
        z[i] = pos_padded[2];

        m[i] = 1.f;
        vx[i] = 1.f;
        vy[i] = 1.f;
        vz[i] = 1.f;

        continue;
      }

460
461
462
463
464
465
466
467
468
469
470
471
      x[i] = (float)(parts[idx].x[0] - loc[0]);
      y[i] = (float)(parts[idx].x[1] - loc[1]);
      z[i] = (float)(parts[idx].x[2] - loc[2]);
      m[i] = parts[idx].mass;
      vx[i] = parts[idx].v[0];
      vy[i] = parts[idx].v[1];
      vz[i] = parts[idx].v[2];
    }

    /* Pad cache with fake particles that exist outside the cell so will not
     * interact. We use values of the same magnitude (but negative!) as the real
     * particles to avoid overflow problems. */
472
473
    for (int i = ci->hydro.count - *first_pi;
         i < ci->hydro.count - *first_pi + VEC_SIZE; i++) {
474
475
476
477
478
479
480
481
482
      x[i] = pos_padded[0];
      y[i] = pos_padded[1];
      z[i] = pos_padded[2];

      m[i] = 1.f;
      vx[i] = 1.f;
      vy[i] = 1.f;
      vz[i] = 1.f;
    }
James Willis's avatar
James Willis committed
483
  }
484

485
486
487
#endif
}

488
/**
James Willis's avatar
James Willis committed
489
490
 * @brief Populate cache for force interactions by reading in the particles in
 * unsorted order.
491
 *
492
493
 * @param ci The #cell.
 * @param ci_cache The cache.
494
 * @return uninhibited_count The no. of uninhibited particles.
495
 */
496
__attribute__((always_inline)) INLINE int cache_read_force_particles(
497
498
    const struct cell *restrict const ci,
    struct cache *restrict const ci_cache) {
James Willis's avatar
James Willis committed
499

500
#if defined(GADGET2_SPH)
501

502
503
504
505
506
507
508
509
510
511
512
  /* Let the compiler know that the data is aligned and create pointers to the
   * arrays inside the cache. */
  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, rho, ci_cache->rho, SWIFT_CACHE_ALIGNMENT);
James Willis's avatar
James Willis committed
513
514
515
516
517
518
519
520
  swift_declare_aligned_ptr(float, grad_h, ci_cache->grad_h,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, pOrho2, ci_cache->pOrho2,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, balsara, ci_cache->balsara,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, soundspeed, ci_cache->soundspeed,
                            SWIFT_CACHE_ALIGNMENT);
521

522
  const int count = ci->hydro.count;
523
  const struct part *restrict parts = ci->hydro.parts;
524
  const double loc[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};
525
526
527
528
  const double max_dx = ci->hydro.dx_max_part;
  const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
                               -(2. * ci->width[1] + max_dx),
                               -(2. * ci->width[2] + max_dx)};
529
  const float h_padded = ci->hydro.h_max / 4.;
Matthieu Schaller's avatar
Matthieu Schaller committed
530

531
532
  /* Shift the particles positions to a local frame so single precision can be
   * used instead of double precision. */
533
  for (int i = 0; i < count; i++) {
534
535

    /* Skip inhibited particles. */
536
537
538
539
    if (parts[i].time_bin >= time_bin_inhibited) {
      x[i] = pos_padded[0];
      y[i] = pos_padded[1];
      z[i] = pos_padded[2];
540
      h[i] = h_padded;
541
542
543
544
545
      rho[i] = 1.f;
      grad_h[i] = 1.f;
      pOrho2[i] = 1.f;
      balsara[i] = 1.f;
      soundspeed[i] = 1.f;
Matthieu Schaller's avatar
Matthieu Schaller committed
546

547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
      continue;
    }

    x[i] = (float)(parts[i].x[0] - loc[0]);
    y[i] = (float)(parts[i].x[1] - loc[1]);
    z[i] = (float)(parts[i].x[2] - loc[2]);
    h[i] = parts[i].h;
    m[i] = parts[i].mass;
    vx[i] = parts[i].v[0];
    vy[i] = parts[i].v[1];
    vz[i] = parts[i].v[2];
    rho[i] = parts[i].rho;
    grad_h[i] = parts[i].force.f;
    pOrho2[i] = parts[i].force.P_over_rho2;
    balsara[i] = parts[i].force.balsara;
    soundspeed[i] = parts[i].force.soundspeed;
563
  }
James Willis's avatar
James Willis committed
564

565
566
567
568
569
570
571
572
573
574
575
576
  /* Pad cache if there is a serial remainder. */
  int count_align = count;
  const int rem = count % VEC_SIZE;
  if (rem != 0) {
    count_align += VEC_SIZE - rem;

    /* Set positions to the same as particle pi so when the r2 > 0 mask is
     * applied these extra contributions are masked out.*/
    for (int i = count; i < count_align; i++) {
      x[i] = pos_padded[0];
      y[i] = pos_padded[1];
      z[i] = pos_padded[2];
577
      h[i] = h_padded;
578
579
580
581
582
583
584
585
586
      rho[i] = 1.f;
      grad_h[i] = 1.f;
      pOrho2[i] = 1.f;
      balsara[i] = 1.f;
      soundspeed[i] = 1.f;
    }
  }

  return count_align;
587

588
589
590
#else
  error("Can't call the cache reading function with this flavour of SPH!");
  return 0;
591
#endif
592
593
}

594
/**
James Willis's avatar
James Willis committed
595
596
597
 * @brief Populate caches by only reading particles that are within range of
 * each other within the adjoining cell.Also read the particles into the cache
 * in sorted order.
598
599
600
601
602
603
604
605
606
607
608
 *
 * @param ci The i #cell.
 * @param cj The j #cell.
 * @param ci_cache The #cache for cell ci.
 * @param cj_cache The #cache for cell cj.
 * @param sort_i The array of sorted particle indices for cell ci.
 * @param sort_j The array of sorted particle indices for cell ci.
 * @param shift The amount to shift the particle positions to account for BCs
 * @param first_pi The first particle in cell ci that is in range.
 * @param last_pj The last particle in cell cj that is in range.
 */
609
__attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
610
    const struct cell *restrict const ci, const struct cell *restrict const cj,
James Willis's avatar
James Willis committed
611
612
613
    struct cache *restrict const ci_cache,
    struct cache *restrict const cj_cache, const struct entry *restrict sort_i,
    const struct entry *restrict sort_j, const double *restrict const shift,
James Willis's avatar
James Willis committed
614
    int *first_pi, int *last_pj) {
615

James Willis's avatar
James Willis committed
616
617
618
619
620
  /* Make the number of particles to be read a multiple of the vector size.
   * This eliminates serial remainder loops where possible when populating the
   * cache. */

  /* Is the number of particles to read a multiple of the vector size? */
621
  int rem = (ci->hydro.count - *first_pi) % VEC_SIZE;
622
  if (rem != 0) {
James Willis's avatar
James Willis committed
623
    int pad = VEC_SIZE - rem;
James Willis's avatar
James Willis committed
624

James Willis's avatar
James Willis committed
625
    /* Decrease first_pi if there are particles in the cell left to read. */
626
    if (*first_pi - pad >= 0) *first_pi -= pad;
627
628
  }

629
  rem = (*last_pj + 1) % VEC_SIZE;
630
  if (rem != 0) {
James Willis's avatar
James Willis committed
631
    int pad = VEC_SIZE - rem;
632

James Willis's avatar
James Willis committed
633
    /* Increase last_pj if there are particles in the cell left to read. */
634
    if (*last_pj + pad < cj->hydro.count) *last_pj += pad;
635
636
  }

637
  /* Get some local pointers */
James Willis's avatar
James Willis committed
638
639
  const int first_pi_align = *first_pi;
  const int last_pj_align = *last_pj;
640
641
  const struct part *restrict parts_i = ci->hydro.parts;
  const struct part *restrict parts_j = cj->hydro.parts;
642

James Willis's avatar
James Willis committed
643
  /* Shift particles to the local frame and account for boundary conditions.*/
644
645
646
647
  const double total_ci_shift[3] = {
      cj->loc[0] + shift[0], cj->loc[1] + shift[1], cj->loc[2] + shift[2]};
  const double total_cj_shift[3] = {cj->loc[0], cj->loc[1], cj->loc[2]};

James Willis's avatar
James Willis committed
648
649
  /* Let the compiler know that the data is aligned and create pointers to the
   * arrays inside the cache. */
James Willis's avatar
James Willis committed
650
651
652
653
654
655
656
657
  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
658

659
  int ci_cache_count = ci->hydro.count - first_pi_align;
660
  const double max_dx = max(ci->hydro.dx_max_part, cj->hydro.dx_max_part);
661
  const float pos_padded_i[3] = {-(2. * ci->width[0] + max_dx),
Matthieu Schaller's avatar
Matthieu Schaller committed
662
663
                                 -(2. * ci->width[1] + max_dx),
                                 -(2. * ci->width[2] + max_dx)};
664
  const float h_padded_i = ci->hydro.h_max / 4.;
665

James Willis's avatar
James Willis committed
666
  /* Shift the particles positions to a local frame (ci frame) so single
667
   * precision can be used instead of double precision.  */
668
  for (int i = 0; i < ci_cache_count; i++) {
669
    const int idx = sort_i[i + first_pi_align].i;
Matthieu Schaller's avatar
Matthieu Schaller committed
670

671
    /* Put inhibited particles out of range. */
672
    if (parts_i[idx].time_bin >= time_bin_inhibited) {
673
674
675
676
      x[i] = pos_padded_i[0];
      y[i] = pos_padded_i[1];
      z[i] = pos_padded_i[2];
      h[i] = h_padded_i;
677
678
679
680
681
682
683
684

      m[i] = 1.f;
      vx[i] = 1.f;
      vy[i] = 1.f;
      vz[i] = 1.f;

      continue;
    }
Matthieu Schaller's avatar
Matthieu Schaller committed
685

686
687
688
    x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]);
    y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]);
    z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]);
689
690
691
692
    h[i] = parts_i[idx].h;
    vx[i] = parts_i[idx].v[0];
    vy[i] = parts_i[idx].v[1];
    vz[i] = parts_i[idx].v[2];
693
694
695
#ifdef GADGET2_SPH
    m[i] = parts_i[idx].mass;
#endif
696
  }
697

698
#ifdef SWIFT_DEBUG_CHECKS
699
  const float shift_threshold_x =
700
701
      2. * ci->width[0] +
      2. * max(ci->hydro.dx_max_part, cj->hydro.dx_max_part);
702
  const float shift_threshold_y =
703
704
      2. * ci->width[1] +
      2. * max(ci->hydro.dx_max_part, cj->hydro.dx_max_part);
705
  const float shift_threshold_z =
706
707
      2. * ci->width[2] +
      2. * max(ci->hydro.dx_max_part, cj->hydro.dx_max_part);
708

709
710
  /* Make sure that particle positions have been shifted correctly. */
  for (int i = 0; i < ci_cache_count; i++) {
711
    if (x[i] > shift_threshold_x || x[i] < -shift_threshold_x)
James Willis's avatar
James Willis committed
712
      error(
713
714
715
716
          "Error: ci->loc[%lf,%lf,%lf],cj->loc[%lf,%lf,%lf] Particle %d x pos "
          "is not within "
          "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
          "2*space_maxreldx)]. x=%f, ci->width[0]=%f",
717
718
          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
          cj->loc[2], i, x[i], ci->width[0]);
719
    if (y[i] > shift_threshold_y || y[i] < -shift_threshold_y)
James Willis's avatar
James Willis committed
720
      error(
721
722
723
724
          "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d y pos "
          "is not within "
          "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
          "2*space_maxreldx)]. y=%f, ci->width[1]=%f",
725
726
          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
          cj->loc[2], i, y[i], ci->width[1]);
727
    if (z[i] > shift_threshold_z || z[i] < -shift_threshold_z)
James Willis's avatar
James Willis committed
728
      error(
729
730
731
732
          "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d z pos "
          "is not within "
          "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
          "2*space_maxreldx)]. z=%f, ci->width[2]=%f",
733
734
          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
          cj->loc[2], i, z[i], ci->width[2]);
735
736
  }
#endif
James Willis's avatar
James Willis committed
737

Matthieu Schaller's avatar
Matthieu Schaller committed
738
  /* Pad cache with fake particles that exist outside the cell so will not
739
740
   * interact. We use values of the same magnitude (but negative!) as the real
   * particles to avoid overflow problems. */
741
742
  for (int i = ci->hydro.count - first_pi_align;
       i < ci->hydro.count - first_pi_align + VEC_SIZE; i++) {
743
744
745
746
    x[i] = pos_padded_i[0];
    y[i] = pos_padded_i[1];
    z[i] = pos_padded_i[2];
    h[i] = h_padded_i;
747
748
749
750
751

    m[i] = 1.f;
    vx[i] = 1.f;
    vy[i] = 1.f;
    vz[i] = 1.f;
752
  }
James Willis's avatar
James Willis committed
753

James Willis's avatar
James Willis committed
754
755
  /* Let the compiler know that the data is aligned and create pointers to the
   * arrays inside the cache. */
James Willis's avatar
James Willis committed
756
757
758
759
760
761
762
763
  swift_declare_aligned_ptr(float, xj, cj_cache->x, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, yj, cj_cache->y, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, zj, cj_cache->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, hj, cj_cache->h, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, mj, cj_cache->m, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vxj, cj_cache->vx, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vyj, cj_cache->vy, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT);
James Willis's avatar
James Willis committed
764

765
766
767
  const float pos_padded_j[3] = {-(2. * cj->width[0] + max_dx),
                                 -(2. * cj->width[1] + max_dx),
                                 -(2. * cj->width[2] + max_dx)};
768
  const float h_padded_j = cj->hydro.h_max / 4.;
769

770
  for (int i = 0; i <= last_pj_align; i++) {
771
    const int idx = sort_j[i].i;
Matthieu Schaller's avatar
Matthieu Schaller committed
772

773
    /* Put inhibited particles out of range. */
774
    if (parts_j[idx].time_bin >= time_bin_inhibited) {
775
776
777
778
779
780
781
782
783
784
785
786
787
      xj[i] = pos_padded_j[0];
      yj[i] = pos_padded_j[1];
      zj[i] = pos_padded_j[2];
      hj[i] = h_padded_j;

      mj[i] = 1.f;
      vxj[i] = 1.f;
      vyj[i] = 1.f;
      vzj[i] = 1.f;

      continue;
    }

James Willis's avatar
James Willis committed
788
789
790
    xj[i] = (float)(parts_j[idx].x[0] - total_cj_shift[0]);
    yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]);
    zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]);
791
792
793
794
    hj[i] = parts_j[idx].h;
    vxj[i] = parts_j[idx].v[0];
    vyj[i] = parts_j[idx].v[1];
    vzj[i] = parts_j[idx].v[2];
795
796
797
#ifdef GADGET2_SPH
    mj[i] = parts_j[idx].mass;
#endif
798
  }
799

800
801
802
#ifdef SWIFT_DEBUG_CHECKS
  /* Make sure that particle positions have been shifted correctly. */
  for (int i = 0; i <= last_pj_align; i++) {
803
    if (xj[i] > shift_threshold_x || xj[i] < -shift_threshold_x)
James Willis's avatar
James Willis committed
804
      error(
805
806
807
808
          "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d xj "
          "pos is not within "
          "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
          "2*space_maxreldx)]. xj=%f, ci->width[0]=%f",
809
810
          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
          cj->loc[2], i, xj[i], ci->width[0]);
811
    if (yj[i] > shift_threshold_y || yj[i] < -shift_threshold_y)
James Willis's avatar
James Willis committed
812
      error(
813
814
815
816
          "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d yj "
          "pos is not within "
          "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
          "2*space_maxreldx)]. yj=%f, ci->width[1]=%f",
817
818
          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
          cj->loc[2], i, yj[i], ci->width[1]);
819
    if (zj[i] > shift_threshold_z || zj[i] < -shift_threshold_z)
James Willis's avatar
James Willis committed
820
      error(
821
822
823
824
          "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d zj "
          "pos is not within "
          "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
          "2*space_maxreldx)]. zj=%f, ci->width[2]=%f",
825
826
          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
          cj->loc[2], i, zj[i], ci->width[2]);
827
828
829
  }
#endif

Matthieu Schaller's avatar
Matthieu Schaller committed
830
  /* Pad cache with fake particles that exist outside the cell so will not
831
832
   * interact. We use values of the same magnitude (but negative!) as the real
   * particles to avoid overflow problems. */
833
  for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
834
835
836
    xj[i] = pos_padded_j[0];
    yj[i] = pos_padded_j[1];
    zj[i] = pos_padded_j[2];
837
    hj[i] = h_padded_j;
838
839
840
841
    mj[i] = 1.f;
    vxj[i] = 1.f;
    vyj[i] = 1.f;
    vzj[i] = 1.f;
842
  }
843
}
James Willis's avatar
James Willis committed
844

845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
/**
 * @brief Populate caches by only reading particles that are within range of
 * each other within the adjoining cell.Also read the particles into the cache
 * in sorted order.
 *
 * @param ci The i #cell.
 * @param cj The j #cell.
 * @param ci_cache The #cache for cell ci.
 * @param cj_cache The #cache for cell cj.
 * @param sort_i The array of sorted particle indices for cell ci.
 * @param sort_j The array of sorted particle indices for cell ci.
 * @param shift The amount to shift the particle positions to account for BCs
 * @param first_pi The first particle in cell ci that is in range.
 * @param last_pj The last particle in cell cj that is in range.
 */
James Willis's avatar
James Willis committed
860
861
__attribute__((always_inline)) INLINE void
cache_read_two_partial_cells_sorted_force(
862
863
864
    const struct cell *const ci, const struct cell *const cj,
    struct cache *const ci_cache, struct cache *const cj_cache,
    const struct entry *restrict sort_i, const struct entry *restrict sort_j,
James Willis's avatar
James Willis committed
865
    const double *const shift, int *first_pi, int *last_pj) {
866

James Willis's avatar
James Willis committed
867
868
869
870
871
  /* Make the number of particles to be read a multiple of the vector size.
   * This eliminates serial remainder loops where possible when populating the
   * cache. */

  /* Is the number of particles to read a multiple of the vector size? */
872
  int rem = (ci->hydro.count - *first_pi) % VEC_SIZE;
873
  if (rem != 0) {
James Willis's avatar
James Willis committed
874
    int pad = VEC_SIZE - rem;
875

James Willis's avatar
James Willis committed
876
    /* Decrease first_pi if there are particles in the cell left to read. */
877
878
879
    if (*first_pi - pad >= 0) *first_pi -= pad;
  }

880
  rem = (*last_pj + 1) % VEC_SIZE;
881
  if (rem != 0) {
James Willis's avatar
James Willis committed
882
    int pad = VEC_SIZE - rem;
883

James Willis's avatar
James Willis committed
884
    /* Increase last_pj if there are particles in the cell left to read. */
885
    if (*last_pj + pad < cj->hydro.count) *last_pj += pad;
886
887
  }

888
  /* Get some local pointers */
James Willis's avatar
James Willis committed
889
890
  const int first_pi_align = *first_pi;
  const int last_pj_align = *last_pj;
891
892
  const struct part *restrict parts_i = ci->hydro.parts;
  const struct part *restrict parts_j = cj->hydro.parts;
893

James Willis's avatar
James Willis committed
894
  /* Shift particles to the local frame and account for boundary conditions.*/
895
896
897
  const double total_ci_shift[3] = {
      cj->loc[0] + shift[0], cj->loc[1] + shift[1], cj->loc[2] + shift[2]};
  const double total_cj_shift[3] = {cj->loc[0], cj->loc[1], cj->loc[2]};
898

899
900
901
902
903
904
905
906
907
908
909
  /* Let the compiler know that the data is aligned and create pointers to the
   * arrays inside the cache. */
  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, rho, ci_cache->rho, SWIFT_CACHE_ALIGNMENT);
James Willis's avatar
James Willis committed
910
911
912
913
914
915
916
917
  swift_declare_aligned_ptr(float, grad_h, ci_cache->grad_h,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, pOrho2, ci_cache->pOrho2,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, balsara, ci_cache->balsara,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, soundspeed, ci_cache->soundspeed,
                            SWIFT_CACHE_ALIGNMENT);
918

919
  int ci_cache_count = ci->hydro.count - first_pi_align;
920
  const double max_dx = max(ci->hydro.dx_max_part, cj->hydro.dx_max_part);
921
  const float pos_padded_i[3] = {-(2. * ci->width[0] + max_dx),
Matthieu Schaller's avatar
Matthieu Schaller committed
922
923
                                 -(2. * ci->width[1] + max_dx),
                                 -(2. * ci->width[2] + max_dx)};
924
  const float h_padded_i = ci->hydro.h_max / 4.;
925

James Willis's avatar
James Willis committed
926
  /* Shift the particles positions to a local frame (ci frame) so single
927
   * precision can be  used instead of double precision.  */
928
  for (int i = 0; i < ci_cache_count; i++) {
929
930

    const int idx = sort_i[i + first_pi_align].i;
Matthieu Schaller's avatar
Matthieu Schaller committed
931

932
    /* Put inhibited particles out of range. */
933
    if (parts_i[idx].time_bin >= time_bin_inhibited) {
934
935
936
937
      x[i] = pos_padded_i[0];
      y[i] = pos_padded_i[1];
      z[i] = pos_padded_i[2];
      h[i] = h_padded_i;
938
939
940
941
942
943
944
945
946
947
948
949
950
      m[i] = 1.f;
      vx[i] = 1.f;
      vy[i] = 1.f;
      vz[i] = 1.f;
      rho[i] = 1.f;
      grad_h[i] = 1.f;
      pOrho2[i] = 1.f;
      balsara[i] = 1.f;
      soundspeed[i] = 1.f;

      continue;
    }

James Willis's avatar
James Willis committed
951
952
953
    x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]);
    y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]);
    z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]);
954
955
956
957
    h[i] = parts_i[idx].h;
    vx[i] = parts_i[idx].v[0];
    vy[i] = parts_i[idx].v[1];
    vz[i] = parts_i[idx].v[2];
958
959
#ifdef GADGET2_SPH
    m[i] = parts_i[idx].mass;
960
961
962
963
964
    rho[i] = parts_i[idx].rho;
    grad_h[i] = parts_i[idx].force.f;
    pOrho2[i] = parts_i[idx].force.P_over_rho2;
    balsara[i] = parts_i[idx].force.balsara;
    soundspeed[i] = parts_i[idx].force.soundspeed;
965
#endif
966
967
968
  }

  /* Pad cache with fake particles that exist outside the cell so will not
969
970
   * interact. We use values of the same magnitude (but negative!) as the real
   * particles to avoid overflow problems. */
971
972
  for (int i = ci->hydro.count - first_pi_align;
       i < ci->hydro.count - first_pi_align + VEC_SIZE; i++) {
973
974
975
976
    x[i] = pos_padded_i[0];
    y[i] = pos_padded_i[1];
    z[i] = pos_padded_i[2];
    h[i] = h_padded_i;
977
978
979
980
981
982
983
984
985
    m[i] = 1.f;
    vx[i] = 1.f;
    vy[i] = 1.f;
    vz[i] = 1.f;
    rho[i] = 1.f;
    grad_h[i] = 1.f;
    pOrho2[i] = 1.f;
    balsara[i] = 1.f;
    soundspeed[i] = 1.f;
986
987
  }

988
989
990
991
992
993
994
995
996
997
998
  /* Let the compiler know that the data is aligned and create pointers to the
   * arrays inside the cache. */
  swift_declare_aligned_ptr(float, xj, cj_cache->x, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, yj, cj_cache->y, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, zj, cj_cache->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, hj, cj_cache->h, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, mj, cj_cache->m, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vxj, cj_cache->vx, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vyj, cj_cache->vy, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, rhoj, cj_cache->rho, SWIFT_CACHE_ALIGNMENT);
James Willis's avatar
James Willis committed
999
1000
1001
1002
1003
1004
1005
1006
  swift_declare_aligned_ptr(float, grad_hj, cj_cache->grad_h,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, pOrho2j, cj_cache->pOrho2,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, balsaraj, cj_cache->balsara,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, soundspeedj, cj_cache->soundspeed,
                            SWIFT_CACHE_ALIGNMENT);
1007

1008
1009
1010
  const float pos_padded_j[3] = {-(2. * cj->width[0] + max_dx),
                                 -(2. * cj->width[1] + max_dx),
                                 -(2. * cj->width[2] + max_dx)};
1011
  const float h_padded_j = cj->hydro.h_max / 4.;
1012

1013
  for (int i = 0; i <= last_pj_align; i++) {
1014
    const int idx = sort_j[i].i;
1015
1016

    /* Put inhibited particles out of range. */
Matthieu Schaller's avatar
Matthieu Schaller committed
1017
    if (parts_j[idx].time_bin == time_bin_inhibited) {
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
      xj[i] = pos_padded_j[0];
      yj[i] = pos_padded_j[1];
      zj[i] = pos_padded_j[2];
      hj[i] = h_padded_j;
      mj[i] = 1.f;
      vxj[i] = 1.f;
      vyj[i] = 1.f;
      vzj[i] = 1.f