cache.h 31.9 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/*******************************************************************************
 * This file is part of SWIFT.
 * Copyright (c) 2016 James Willis (jame.s.willis@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/
#ifndef SWIFT_CACHE_H
#define SWIFT_CACHE_H

/* Config parameters. */
#include "../config.h"

/* Local headers */
26
#include "align.h"
27
#include "cell.h"
James Willis's avatar
James Willis committed
28
#include "error.h"
James Willis's avatar
James Willis committed
29
#include "part.h"
30
#include "sort_part.h"
James Willis's avatar
James Willis committed
31
#include "vector.h"
32

33
34
35
#define NUM_VEC_PROC 2
#define C2_CACHE_SIZE (NUM_VEC_PROC * VEC_SIZE * 6) + (NUM_VEC_PROC * VEC_SIZE)

36
#ifdef WITH_VECTORIZATION
James Willis's avatar
James Willis committed
37
/* Cache struct to hold a local copy of a cells' particle
38
 * properties required for density/force calculations.*/
James Willis's avatar
James Willis committed
39
struct cache {
40
41

  /* Particle x position. */
42
  float *restrict x SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
43

44
  /* Particle y position. */
45
  float *restrict y SWIFT_CACHE_ALIGN;
46
47

  /* Particle z position. */
48
  float *restrict z SWIFT_CACHE_ALIGN;
49
50

  /* Particle smoothing length. */
51
  float *restrict h SWIFT_CACHE_ALIGN;
52
53

  /* Particle mass. */
54
  float *restrict m SWIFT_CACHE_ALIGN;
55
56

  /* Particle x velocity. */
57
  float *restrict vx SWIFT_CACHE_ALIGN;
58
59

  /* Particle y velocity. */
60
  float *restrict vy SWIFT_CACHE_ALIGN;
61
62

  /* Particle z velocity. */
63
  float *restrict vz SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
64

65
  /* Maximum index into neighbouring cell for particles that are in range. */
66
  int *restrict max_index SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
67

James Willis's avatar
James Willis committed
68
  /* Particle density. */
69
  float *restrict rho SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
70

James Willis's avatar
James Willis committed
71
  /* Particle smoothing length gradient. */
72
  float *restrict grad_h SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
73

James Willis's avatar
James Willis committed
74
  /* Pressure over density squared. */
75
  float *restrict pOrho2 SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
76

James Willis's avatar
James Willis committed
77
  /* Balsara switch. */
78
  float *restrict balsara SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
79

James Willis's avatar
James Willis committed
80
  /* Particle sound speed. */
81
  float *restrict soundspeed SWIFT_CACHE_ALIGN;
82
83
84
85
86

  /* Cache size. */
  int count;
};

James Willis's avatar
James Willis committed
87
88
/* Secondary cache struct to hold a list of interactions between two
 * particles.*/
89
90
91
struct c2_cache {

  /* Separation between two particles squared. */
92
  float r2q[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
93
94

  /* x separation between two particles. */
95
  float dxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
96
97

  /* y separation between two particles. */
98
  float dyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
99
100

  /* z separation between two particles. */
101
  float dzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
102
103

  /* Mass of particle pj. */
104
  float mq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
105
106

  /* x velocity of particle pj. */
107
  float vxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
108

109
  /* y velocity of particle pj. */
110
  float vyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
111
112

  /* z velocity of particle pj. */
113
  float vzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
114
115
};

116
117
118
119
120
121
/**
 * @brief Allocate memory and initialise cache.
 *
 * @param c The cache.
 * @param count Number of particles to allocate space for.
 */
James Willis's avatar
James Willis committed
122
123
__attribute__((always_inline)) INLINE void cache_init(struct cache *c,
                                                      size_t count) {
124

James Willis's avatar
James Willis committed
125
126
  /* Align cache on correct byte boundary and pad cache size to be a multiple of
   * the vector size
127
   * and include 2 vector lengths for remainder operations. */
128
  size_t pad = 2 * VEC_SIZE, rem = count % VEC_SIZE;
129
  if (rem > 0) pad += VEC_SIZE - rem;
130
131
  size_t sizeBytes = (count + pad) * sizeof(float);
  size_t sizeIntBytes = (count + pad) * sizeof(int);
132
133
  int error = 0;

134
135
136
137
138
139
140
141
142
143
  /* Free memory if cache has already been allocated. */
  if (c->count > 0) {
    free(c->x);
    free(c->y);
    free(c->z);
    free(c->m);
    free(c->vx);
    free(c->vy);
    free(c->vz);
    free(c->h);
144
    free(c->max_index);
145
146
147
148
149
    free(c->rho);
    free(c->grad_h);
    free(c->pOrho2);
    free(c->balsara);
    free(c->soundspeed);
150
151
  }

152
153
154
155
156
157
158
159
  error += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error += posix_memalign((void **)&c->y, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error += posix_memalign((void **)&c->z, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error += posix_memalign((void **)&c->m, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error += posix_memalign((void **)&c->vx, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error += posix_memalign((void **)&c->vy, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error += posix_memalign((void **)&c->vz, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error += posix_memalign((void **)&c->h, SWIFT_CACHE_ALIGNMENT, sizeBytes);
James Willis's avatar
James Willis committed
160
161
  error += posix_memalign((void **)&c->max_index, SWIFT_CACHE_ALIGNMENT,
                          sizeIntBytes);
162
  error += posix_memalign((void **)&c->rho, SWIFT_CACHE_ALIGNMENT, sizeBytes);
James Willis's avatar
James Willis committed
163
164
165
166
167
168
169
170
  error +=
      posix_memalign((void **)&c->grad_h, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error +=
      posix_memalign((void **)&c->pOrho2, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error +=
      posix_memalign((void **)&c->balsara, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error +=
      posix_memalign((void **)&c->soundspeed, SWIFT_CACHE_ALIGNMENT, sizeBytes);
171

James Willis's avatar
James Willis committed
172
173
  if (error != 0)
    error("Couldn't allocate cache, no. of particles: %d", (int)count);
174
175
176
177
178
179
180
181
182
  c->count = count;
}

/**
 * @brief Populate cache by reading in the particles in unsorted order.
 *
 * @param ci The #cell.
 * @param ci_cache The cache.
 */
James Willis's avatar
James Willis committed
183
__attribute__((always_inline)) INLINE void cache_read_particles(
James Willis's avatar
James Willis committed
184
185
    const struct cell *restrict const ci,
    struct cache *restrict const ci_cache) {
186

187
188
#if defined(GADGET2_SPH)

James Willis's avatar
James Willis committed
189
190
  /* Let the compiler know that the data is aligned and create pointers to the
   * arrays inside the cache. */
James Willis's avatar
James Willis committed
191
192
193
194
195
196
197
198
  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
James Willis's avatar
James Willis committed
199

200
  const struct part *restrict parts = ci->hydro.parts;
201
202
  const double loc[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};

James Willis's avatar
James Willis committed
203
204
  /* Shift the particles positions to a local frame so single precision can be
   * used instead of double precision. */
205
  for (int i = 0; i < ci->hydro.count; i++) {
206
207
208
209
210
211
212
213
    x[i] = (float)(parts[i].x[0] - loc[0]);
    y[i] = (float)(parts[i].x[1] - loc[1]);
    z[i] = (float)(parts[i].x[2] - loc[2]);
    h[i] = parts[i].h;
    m[i] = parts[i].mass;
    vx[i] = parts[i].v[0];
    vy[i] = parts[i].v[1];
    vz[i] = parts[i].v[2];
James Willis's avatar
James Willis committed
214
  }
215
216

#endif
217
218
}

219
220
221
222
223
224
225
226
227
228
229
230
231
232
/**
 * @brief Populate cache by only reading particles that are within range of
 * each other within the adjoining cell. Also read the particles into the cache
 * in sorted order.
 *
 * @param ci The i #cell.
 * @param ci_cache The #cache for cell ci.
 * @param sort_i The array of sorted particle indices for cell ci.
 * @param first_pi The first particle in cell ci that is in range.
 * @param last_pi The last particle in cell ci that is in range.
 * @param last_pi The last particle in cell ci that is in range.
 * @param loc The cell location to remove from the particle positions.
 * @param flipped Flag to check whether the cells have been flipped or not.
 */
James Willis's avatar
James Willis committed
233
__attribute__((always_inline)) INLINE void cache_read_particles_subset(
James Willis's avatar
James Willis committed
234
235
236
    const struct cell *restrict const ci, struct cache *restrict const ci_cache,
    const struct entry *restrict sort_i, int *first_pi, int *last_pi,
    const double *loc, const int flipped) {
237
238
239
240
241
242
243
244
245
246
247
248
249
250

#if defined(GADGET2_SPH)

  /* Let the compiler know that the data is aligned and create pointers to the
   * arrays inside the cache. */
  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);

251
  const struct part *restrict parts = ci->hydro.parts;
252

James Willis's avatar
James Willis committed
253
  /* The cell is on the right so read the particles
254
   * into the cache from the start of the cell. */
James Willis's avatar
James Willis committed
255
  if (!flipped) {
James Willis's avatar
James Willis committed
256
    const int rem = (*last_pi + 1) % VEC_SIZE;
257
    if (rem != 0) {
James Willis's avatar
James Willis committed
258
      const int pad = VEC_SIZE - rem;
259

260
      /* Increase last_pi if there are particles in the cell left to read. */
261
      if (*last_pi + pad < ci->hydro.count) *last_pi += pad;
262
263
    }

264
265
    /* Shift the particles positions to a local frame so single precision can be
     * used instead of double precision. */
266
    for (int i = 0; i < *last_pi; i++) {
267
268
269
270
271
272
273
274
275
276
277
278
279
280
      const int idx = sort_i[i].i;
      x[i] = (float)(parts[idx].x[0] - loc[0]);
      y[i] = (float)(parts[idx].x[1] - loc[1]);
      z[i] = (float)(parts[idx].x[2] - loc[2]);
      h[i] = parts[idx].h;
      m[i] = parts[idx].mass;
      vx[i] = parts[idx].v[0];
      vy[i] = parts[idx].v[1];
      vz[i] = parts[idx].v[2];
    }

    /* Pad cache with fake particles that exist outside the cell so will not
     * interact. We use values of the same magnitude (but negative!) as the real
     * particles to avoid overflow problems. */
281
    const double max_dx = ci->hydro.dx_max_part;
James Willis's avatar
James Willis committed
282
283
284
    const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
                                 -(2. * ci->width[1] + max_dx),
                                 -(2. * ci->width[2] + max_dx)};
285
    const float h_padded = ci->hydro.parts[0].h;
286

287
    for (int i = *last_pi; i < *last_pi + VEC_SIZE; i++) {
288
289
290
291
292
293
294
295
296
297
      x[i] = pos_padded[0];
      y[i] = pos_padded[1];
      z[i] = pos_padded[2];
      h[i] = h_padded;

      m[i] = 1.f;
      vx[i] = 1.f;
      vy[i] = 1.f;
      vz[i] = 1.f;
    }
298
  }
James Willis's avatar
James Willis committed
299
  /* The cell is on the left so read the particles
300
   * into the cache from the end of the cell. */
301
  else {
302
    const int rem = (ci->hydro.count - *first_pi) % VEC_SIZE;
303
    if (rem != 0) {
James Willis's avatar
James Willis committed
304
      const int pad = VEC_SIZE - rem;
305
306
307
308
309

      /* Decrease first_pi if there are particles in the cell left to read. */
      if (*first_pi - pad >= 0) *first_pi -= pad;
    }

310
    const int ci_cache_count = ci->hydro.count - *first_pi;
311

312
313
    /* Shift the particles positions to a local frame so single precision can be
     * used instead of double precision. */
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
    for (int i = 0; i < ci_cache_count; i++) {
      const int idx = sort_i[i + *first_pi].i;
      x[i] = (float)(parts[idx].x[0] - loc[0]);
      y[i] = (float)(parts[idx].x[1] - loc[1]);
      z[i] = (float)(parts[idx].x[2] - loc[2]);
      h[i] = parts[idx].h;
      m[i] = parts[idx].mass;
      vx[i] = parts[idx].v[0];
      vy[i] = parts[idx].v[1];
      vz[i] = parts[idx].v[2];
    }

    /* Pad cache with fake particles that exist outside the cell so will not
     * interact. We use values of the same magnitude (but negative!) as the real
     * particles to avoid overflow problems. */
329
    const double max_dx = ci->hydro.dx_max_part;
330
    const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
James Willis's avatar
James Willis committed
331
332
                                 -(2. * ci->width[1] + max_dx),
                                 -(2. * ci->width[2] + max_dx)};
333
    const float h_padded = ci->hydro.parts[0].h;
334

335
336
    for (int i = ci->hydro.count - *first_pi;
         i < ci->hydro.count - *first_pi + VEC_SIZE; i++) {
337
338
339
340
341
342
343
344
345
346
      x[i] = pos_padded[0];
      y[i] = pos_padded[1];
      z[i] = pos_padded[2];
      h[i] = h_padded;

      m[i] = 1.f;
      vx[i] = 1.f;
      vy[i] = 1.f;
      vz[i] = 1.f;
    }
James Willis's avatar
James Willis committed
347
  }
348

349
350
351
#endif
}

352
/**
James Willis's avatar
James Willis committed
353
354
 * @brief Populate cache for force interactions by reading in the particles in
 * unsorted order.
355
 *
356
357
 * @param ci The #cell.
 * @param ci_cache The cache.
358
 */
359
360
361
__attribute__((always_inline)) INLINE void cache_read_force_particles(
    const struct cell *restrict const ci,
    struct cache *restrict const ci_cache) {
James Willis's avatar
James Willis committed
362

363
#if defined(GADGET2_SPH)
364

365
366
367
368
369
370
371
372
373
374
375
  /* Let the compiler know that the data is aligned and create pointers to the
   * arrays inside the cache. */
  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, rho, ci_cache->rho, SWIFT_CACHE_ALIGNMENT);
James Willis's avatar
James Willis committed
376
377
378
379
380
381
382
383
  swift_declare_aligned_ptr(float, grad_h, ci_cache->grad_h,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, pOrho2, ci_cache->pOrho2,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, balsara, ci_cache->balsara,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, soundspeed, ci_cache->soundspeed,
                            SWIFT_CACHE_ALIGNMENT);
384

385
  const struct part *restrict parts = ci->hydro.parts;
386
  const double loc[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};
387
388
389

  /* Shift the particles positions to a local frame so single precision can be
   * used instead of double precision. */
390
  for (int i = 0; i < ci->hydro.count; i++) {
391
392
393
394
395
396
397
398
399
400
401
402
403
    x[i] = (float)(parts[i].x[0] - loc[0]);
    y[i] = (float)(parts[i].x[1] - loc[1]);
    z[i] = (float)(parts[i].x[2] - loc[2]);
    h[i] = parts[i].h;
    m[i] = parts[i].mass;
    vx[i] = parts[i].v[0];
    vy[i] = parts[i].v[1];
    vz[i] = parts[i].v[2];
    rho[i] = parts[i].rho;
    grad_h[i] = parts[i].force.f;
    pOrho2[i] = parts[i].force.P_over_rho2;
    balsara[i] = parts[i].force.balsara;
    soundspeed[i] = parts[i].force.soundspeed;
404
  }
James Willis's avatar
James Willis committed
405

406
#endif
407
408
}

409
/**
James Willis's avatar
James Willis committed
410
411
412
 * @brief Populate caches by only reading particles that are within range of
 * each other within the adjoining cell.Also read the particles into the cache
 * in sorted order.
413
414
415
416
417
418
419
420
421
422
423
 *
 * @param ci The i #cell.
 * @param cj The j #cell.
 * @param ci_cache The #cache for cell ci.
 * @param cj_cache The #cache for cell cj.
 * @param sort_i The array of sorted particle indices for cell ci.
 * @param sort_j The array of sorted particle indices for cell ci.
 * @param shift The amount to shift the particle positions to account for BCs
 * @param first_pi The first particle in cell ci that is in range.
 * @param last_pj The last particle in cell cj that is in range.
 */
424
__attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
425
    const struct cell *restrict const ci, const struct cell *restrict const cj,
James Willis's avatar
James Willis committed
426
427
428
    struct cache *restrict const ci_cache,
    struct cache *restrict const cj_cache, const struct entry *restrict sort_i,
    const struct entry *restrict sort_j, const double *restrict const shift,
James Willis's avatar
James Willis committed
429
    int *first_pi, int *last_pj) {
430

James Willis's avatar
James Willis committed
431
432
433
434
435
  /* Make the number of particles to be read a multiple of the vector size.
   * This eliminates serial remainder loops where possible when populating the
   * cache. */

  /* Is the number of particles to read a multiple of the vector size? */
436
  int rem = (ci->hydro.count - *first_pi) % VEC_SIZE;
437
  if (rem != 0) {
James Willis's avatar
James Willis committed
438
    int pad = VEC_SIZE - rem;
James Willis's avatar
James Willis committed
439

James Willis's avatar
James Willis committed
440
    /* Decrease first_pi if there are particles in the cell left to read. */
441
    if (*first_pi - pad >= 0) *first_pi -= pad;
442
443
  }

444
  rem = (*last_pj + 1) % VEC_SIZE;
445
  if (rem != 0) {
James Willis's avatar
James Willis committed
446
    int pad = VEC_SIZE - rem;
447

James Willis's avatar
James Willis committed
448
    /* Increase last_pj if there are particles in the cell left to read. */
449
    if (*last_pj + pad < cj->hydro.count) *last_pj += pad;
450
451
  }

452
  /* Get some local pointers */
James Willis's avatar
James Willis committed
453
454
  const int first_pi_align = *first_pi;
  const int last_pj_align = *last_pj;
455
456
  const struct part *restrict parts_i = ci->hydro.parts;
  const struct part *restrict parts_j = cj->hydro.parts;
457

James Willis's avatar
James Willis committed
458
  /* Shift particles to the local frame and account for boundary conditions.*/
459
460
461
462
  const double total_ci_shift[3] = {
      cj->loc[0] + shift[0], cj->loc[1] + shift[1], cj->loc[2] + shift[2]};
  const double total_cj_shift[3] = {cj->loc[0], cj->loc[1], cj->loc[2]};

James Willis's avatar
James Willis committed
463
464
  /* Let the compiler know that the data is aligned and create pointers to the
   * arrays inside the cache. */
James Willis's avatar
James Willis committed
465
466
467
468
469
470
471
472
  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
473

474
  int ci_cache_count = ci->hydro.count - first_pi_align;
475

James Willis's avatar
James Willis committed
476
  /* Shift the particles positions to a local frame (ci frame) so single
477
   * precision can be used instead of double precision.  */
478
  for (int i = 0; i < ci_cache_count; i++) {
479
    const int idx = sort_i[i + first_pi_align].i;
480
481
482
    x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]);
    y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]);
    z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]);
483
484
485
486
    h[i] = parts_i[idx].h;
    vx[i] = parts_i[idx].v[0];
    vy[i] = parts_i[idx].v[1];
    vz[i] = parts_i[idx].v[2];
487
488
489
#ifdef GADGET2_SPH
    m[i] = parts_i[idx].mass;
#endif
490
  }
491

492
#ifdef SWIFT_DEBUG_CHECKS
493
  const float shift_threshold_x =
494
495
      2. * ci->width[0] +
      2. * max(ci->hydro.dx_max_part, cj->hydro.dx_max_part);
496
  const float shift_threshold_y =
497
498
      2. * ci->width[1] +
      2. * max(ci->hydro.dx_max_part, cj->hydro.dx_max_part);
499
  const float shift_threshold_z =
500
501
      2. * ci->width[2] +
      2. * max(ci->hydro.dx_max_part, cj->hydro.dx_max_part);
502

503
504
  /* Make sure that particle positions have been shifted correctly. */
  for (int i = 0; i < ci_cache_count; i++) {
505
    if (x[i] > shift_threshold_x || x[i] < -shift_threshold_x)
James Willis's avatar
James Willis committed
506
      error(
507
508
509
510
          "Error: ci->loc[%lf,%lf,%lf],cj->loc[%lf,%lf,%lf] Particle %d x pos "
          "is not within "
          "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
          "2*space_maxreldx)]. x=%f, ci->width[0]=%f",
511
512
          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
          cj->loc[2], i, x[i], ci->width[0]);
513
    if (y[i] > shift_threshold_y || y[i] < -shift_threshold_y)
James Willis's avatar
James Willis committed
514
      error(
515
516
517
518
          "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d y pos "
          "is not within "
          "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
          "2*space_maxreldx)]. y=%f, ci->width[1]=%f",
519
520
          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
          cj->loc[2], i, y[i], ci->width[1]);
521
    if (z[i] > shift_threshold_z || z[i] < -shift_threshold_z)
James Willis's avatar
James Willis committed
522
      error(
523
524
525
526
          "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d z pos "
          "is not within "
          "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
          "2*space_maxreldx)]. z=%f, ci->width[2]=%f",
527
528
          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
          cj->loc[2], i, z[i], ci->width[2]);
529
530
  }
#endif
James Willis's avatar
James Willis committed
531

Matthieu Schaller's avatar
Matthieu Schaller committed
532
  /* Pad cache with fake particles that exist outside the cell so will not
533
534
   * interact. We use values of the same magnitude (but negative!) as the real
   * particles to avoid overflow problems. */
535
  const double max_dx = max(ci->hydro.dx_max_part, cj->hydro.dx_max_part);
James Willis's avatar
James Willis committed
536
537
  const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
                               -(2. * ci->width[1] + max_dx),
538
                               -(2. * ci->width[2] + max_dx)};
539
  const float h_padded = ci->hydro.parts[0].h;
540

541
542
  for (int i = ci->hydro.count - first_pi_align;
       i < ci->hydro.count - first_pi_align + VEC_SIZE; i++) {
543
544
545
    x[i] = pos_padded[0];
    y[i] = pos_padded[1];
    z[i] = pos_padded[2];
546
    h[i] = h_padded;
547
548
549
550
551

    m[i] = 1.f;
    vx[i] = 1.f;
    vy[i] = 1.f;
    vz[i] = 1.f;
552
  }
James Willis's avatar
James Willis committed
553

James Willis's avatar
James Willis committed
554
555
  /* Let the compiler know that the data is aligned and create pointers to the
   * arrays inside the cache. */
James Willis's avatar
James Willis committed
556
557
558
559
560
561
562
563
  swift_declare_aligned_ptr(float, xj, cj_cache->x, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, yj, cj_cache->y, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, zj, cj_cache->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, hj, cj_cache->h, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, mj, cj_cache->m, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vxj, cj_cache->vx, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vyj, cj_cache->vy, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT);
James Willis's avatar
James Willis committed
564

565
  for (int i = 0; i <= last_pj_align; i++) {
566
    const int idx = sort_j[i].i;
James Willis's avatar
James Willis committed
567
568
569
    xj[i] = (float)(parts_j[idx].x[0] - total_cj_shift[0]);
    yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]);
    zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]);
570
571
572
573
    hj[i] = parts_j[idx].h;
    vxj[i] = parts_j[idx].v[0];
    vyj[i] = parts_j[idx].v[1];
    vzj[i] = parts_j[idx].v[2];
574
575
576
#ifdef GADGET2_SPH
    mj[i] = parts_j[idx].mass;
#endif
577
  }
578

579
580
581
#ifdef SWIFT_DEBUG_CHECKS
  /* Make sure that particle positions have been shifted correctly. */
  for (int i = 0; i <= last_pj_align; i++) {
582
    if (xj[i] > shift_threshold_x || xj[i] < -shift_threshold_x)
James Willis's avatar
James Willis committed
583
      error(
584
585
586
587
          "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d xj "
          "pos is not within "
          "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
          "2*space_maxreldx)]. xj=%f, ci->width[0]=%f",
588
589
          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
          cj->loc[2], i, xj[i], ci->width[0]);
590
    if (yj[i] > shift_threshold_y || yj[i] < -shift_threshold_y)
James Willis's avatar
James Willis committed
591
      error(
592
593
594
595
          "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d yj "
          "pos is not within "
          "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
          "2*space_maxreldx)]. yj=%f, ci->width[1]=%f",
596
597
          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
          cj->loc[2], i, yj[i], ci->width[1]);
598
    if (zj[i] > shift_threshold_z || zj[i] < -shift_threshold_z)
James Willis's avatar
James Willis committed
599
      error(
600
601
602
603
          "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d zj "
          "pos is not within "
          "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
          "2*space_maxreldx)]. zj=%f, ci->width[2]=%f",
604
605
          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
          cj->loc[2], i, zj[i], ci->width[2]);
606
607
608
  }
#endif

Matthieu Schaller's avatar
Matthieu Schaller committed
609
  /* Pad cache with fake particles that exist outside the cell so will not
610
611
   * interact. We use values of the same magnitude (but negative!) as the real
   * particles to avoid overflow problems. */
James Willis's avatar
James Willis committed
612
613
614
  const float pos_padded_j[3] = {-(2. * cj->width[0] + max_dx),
                                 -(2. * cj->width[1] + max_dx),
                                 -(2. * cj->width[2] + max_dx)};
615
  const float h_padded_j = cj->hydro.parts[0].h;
616

617
  for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
618
619
620
    xj[i] = pos_padded_j[0];
    yj[i] = pos_padded_j[1];
    zj[i] = pos_padded_j[2];
621
    hj[i] = h_padded_j;
622
623
624
625
    mj[i] = 1.f;
    vxj[i] = 1.f;
    vyj[i] = 1.f;
    vzj[i] = 1.f;
626
  }
627
}
James Willis's avatar
James Willis committed
628

629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
/**
 * @brief Populate caches by only reading particles that are within range of
 * each other within the adjoining cell.Also read the particles into the cache
 * in sorted order.
 *
 * @param ci The i #cell.
 * @param cj The j #cell.
 * @param ci_cache The #cache for cell ci.
 * @param cj_cache The #cache for cell cj.
 * @param sort_i The array of sorted particle indices for cell ci.
 * @param sort_j The array of sorted particle indices for cell ci.
 * @param shift The amount to shift the particle positions to account for BCs
 * @param first_pi The first particle in cell ci that is in range.
 * @param last_pj The last particle in cell cj that is in range.
 */
James Willis's avatar
James Willis committed
644
645
__attribute__((always_inline)) INLINE void
cache_read_two_partial_cells_sorted_force(
646
647
648
    const struct cell *const ci, const struct cell *const cj,
    struct cache *const ci_cache, struct cache *const cj_cache,
    const struct entry *restrict sort_i, const struct entry *restrict sort_j,
James Willis's avatar
James Willis committed
649
    const double *const shift, int *first_pi, int *last_pj) {
650

James Willis's avatar
James Willis committed
651
652
653
654
655
  /* Make the number of particles to be read a multiple of the vector size.
   * This eliminates serial remainder loops where possible when populating the
   * cache. */

  /* Is the number of particles to read a multiple of the vector size? */
656
  int rem = (ci->hydro.count - *first_pi) % VEC_SIZE;
657
  if (rem != 0) {
James Willis's avatar
James Willis committed
658
    int pad = VEC_SIZE - rem;
659

James Willis's avatar
James Willis committed
660
    /* Decrease first_pi if there are particles in the cell left to read. */
661
662
663
    if (*first_pi - pad >= 0) *first_pi -= pad;
  }

664
  rem = (*last_pj + 1) % VEC_SIZE;
665
  if (rem != 0) {
James Willis's avatar
James Willis committed
666
    int pad = VEC_SIZE - rem;
667

James Willis's avatar
James Willis committed
668
    /* Increase last_pj if there are particles in the cell left to read. */
669
    if (*last_pj + pad < cj->hydro.count) *last_pj += pad;
670
671
  }

672
  /* Get some local pointers */
James Willis's avatar
James Willis committed
673
674
  const int first_pi_align = *first_pi;
  const int last_pj_align = *last_pj;
675
676
  const struct part *restrict parts_i = ci->hydro.parts;
  const struct part *restrict parts_j = cj->hydro.parts;
677

James Willis's avatar
James Willis committed
678
  /* Shift particles to the local frame and account for boundary conditions.*/
679
680
681
  const double total_ci_shift[3] = {
      cj->loc[0] + shift[0], cj->loc[1] + shift[1], cj->loc[2] + shift[2]};
  const double total_cj_shift[3] = {cj->loc[0], cj->loc[1], cj->loc[2]};
682

683
684
685
686
687
688
689
690
691
692
693
  /* Let the compiler know that the data is aligned and create pointers to the
   * arrays inside the cache. */
  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, rho, ci_cache->rho, SWIFT_CACHE_ALIGNMENT);
James Willis's avatar
James Willis committed
694
695
696
697
698
699
700
701
  swift_declare_aligned_ptr(float, grad_h, ci_cache->grad_h,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, pOrho2, ci_cache->pOrho2,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, balsara, ci_cache->balsara,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, soundspeed, ci_cache->soundspeed,
                            SWIFT_CACHE_ALIGNMENT);
702

703
  int ci_cache_count = ci->hydro.count - first_pi_align;
James Willis's avatar
James Willis committed
704
  /* Shift the particles positions to a local frame (ci frame) so single
705
   * precision can be  used instead of double precision.  */
706
  for (int i = 0; i < ci_cache_count; i++) {
707
708

    const int idx = sort_i[i + first_pi_align].i;
James Willis's avatar
James Willis committed
709
710
711
    x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]);
    y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]);
    z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]);
712
713
714
715
    h[i] = parts_i[idx].h;
    vx[i] = parts_i[idx].v[0];
    vy[i] = parts_i[idx].v[1];
    vz[i] = parts_i[idx].v[2];
716
717
#ifdef GADGET2_SPH
    m[i] = parts_i[idx].mass;
718
719
720
721
722
    rho[i] = parts_i[idx].rho;
    grad_h[i] = parts_i[idx].force.f;
    pOrho2[i] = parts_i[idx].force.P_over_rho2;
    balsara[i] = parts_i[idx].force.balsara;
    soundspeed[i] = parts_i[idx].force.soundspeed;
723
#endif
724
725
726
  }

  /* Pad cache with fake particles that exist outside the cell so will not
727
728
   * interact. We use values of the same magnitude (but negative!) as the real
   * particles to avoid overflow problems. */
729
  const double max_dx = max(ci->hydro.dx_max_part, cj->hydro.dx_max_part);
James Willis's avatar
James Willis committed
730
731
  const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
                               -(2. * ci->width[1] + max_dx),
732
                               -(2. * ci->width[2] + max_dx)};
733
  const float h_padded = ci->hydro.parts[0].h;
734

735
736
  for (int i = ci->hydro.count - first_pi_align;
       i < ci->hydro.count - first_pi_align + VEC_SIZE; i++) {
737
738
739
    x[i] = pos_padded[0];
    y[i] = pos_padded[1];
    z[i] = pos_padded[2];
740
    h[i] = h_padded;
741
742
743
744
745
746
747
748
749
    m[i] = 1.f;
    vx[i] = 1.f;
    vy[i] = 1.f;
    vz[i] = 1.f;
    rho[i] = 1.f;
    grad_h[i] = 1.f;
    pOrho2[i] = 1.f;
    balsara[i] = 1.f;
    soundspeed[i] = 1.f;
750
751
  }

752
753
754
755
756
757
758
759
760
761
762
  /* Let the compiler know that the data is aligned and create pointers to the
   * arrays inside the cache. */
  swift_declare_aligned_ptr(float, xj, cj_cache->x, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, yj, cj_cache->y, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, zj, cj_cache->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, hj, cj_cache->h, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, mj, cj_cache->m, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vxj, cj_cache->vx, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vyj, cj_cache->vy, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, rhoj, cj_cache->rho, SWIFT_CACHE_ALIGNMENT);
James Willis's avatar
James Willis committed
763
764
765
766
767
768
769
770
  swift_declare_aligned_ptr(float, grad_hj, cj_cache->grad_h,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, pOrho2j, cj_cache->pOrho2,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, balsaraj, cj_cache->balsara,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, soundspeedj, cj_cache->soundspeed,
                            SWIFT_CACHE_ALIGNMENT);
771

772
  for (int i = 0; i <= last_pj_align; i++) {
773
    const int idx = sort_j[i].i;
James Willis's avatar
James Willis committed
774
775
776
    xj[i] = (float)(parts_j[idx].x[0] - total_cj_shift[0]);
    yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]);
    zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]);
777
778
779
780
    hj[i] = parts_j[idx].h;
    vxj[i] = parts_j[idx].v[0];
    vyj[i] = parts_j[idx].v[1];
    vzj[i] = parts_j[idx].v[2];
781
782
#ifdef GADGET2_SPH
    mj[i] = parts_j[idx].mass;
783
784
785
786
787
    rhoj[i] = parts_j[idx].rho;
    grad_hj[i] = parts_j[idx].force.f;
    pOrho2j[i] = parts_j[idx].force.P_over_rho2;
    balsaraj[i] = parts_j[idx].force.balsara;
    soundspeedj[i] = parts_j[idx].force.soundspeed;
788
#endif
789
790
791
  }

  /* Pad cache with fake particles that exist outside the cell so will not
792
793
   * interact. We use values of the same magnitude (but negative!) as the real
   * particles to avoid overflow problems. */
James Willis's avatar
James Willis committed
794
795
796
  const float pos_padded_j[3] = {-(2. * cj->width[0] + max_dx),
                                 -(2. * cj->width[1] + max_dx),
                                 -(2. * cj->width[2] + max_dx)};
797
  const float h_padded_j = cj->hydro.parts[0].h;
798

799
  for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
800
801
802
    xj[i] = pos_padded_j[0];
    yj[i] = pos_padded_j[1];
    zj[i] = pos_padded_j[2];
803
    hj[i] = h_padded_j;
804
805
806
807
808
809
810
811
812
    mj[i] = 1.f;
    vxj[i] = 1.f;
    vyj[i] = 1.f;
    vzj[i] = 1.f;
    rhoj[i] = 1.f;
    grad_hj[i] = 1.f;
    pOrho2j[i] = 1.f;
    balsaraj[i] = 1.f;
    soundspeedj[i] = 1.f;
813
814
815
  }
}

816
/* @brief Clean the memory allocated by a #cache object.
817
818
819
820
821
822
823
824
825
826
827
828
829
 *
 * @param c The #cache to clean.
 */
static INLINE void cache_clean(struct cache *c) {
  if (c->count > 0) {
    free(c->x);
    free(c->y);
    free(c->z);
    free(c->m);
    free(c->vx);
    free(c->vy);
    free(c->vz);
    free(c->h);
830
    free(c->max_index);
James Willis's avatar
James Willis committed
831
832
833
834
835
    free(c->rho);
    free(c->grad_h);
    free(c->pOrho2);
    free(c->balsara);
    free(c->soundspeed);
836
837
838
  }
}

839
840
#endif /* WITH_VECTORIZATION */

841
#endif /* SWIFT_CACHE_H */