cache.h 26.7 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/*******************************************************************************
 * This file is part of SWIFT.
 * Copyright (c) 2016 James Willis (jame.s.willis@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/
#ifndef SWIFT_CACHE_H
#define SWIFT_CACHE_H

/* Config parameters. */
#include "../config.h"

/* Local headers */
26
#include "align.h"
27
#include "cell.h"
James Willis's avatar
James Willis committed
28
#include "error.h"
James Willis's avatar
James Willis committed
29
#include "part.h"
30
#include "sort_part.h"
James Willis's avatar
James Willis committed
31
#include "vector.h"
32

33
34
35
#define NUM_VEC_PROC 2
#define C2_CACHE_SIZE (NUM_VEC_PROC * VEC_SIZE * 6) + (NUM_VEC_PROC * VEC_SIZE)

36
#ifdef WITH_VECTORIZATION
James Willis's avatar
James Willis committed
37
/* Cache struct to hold a local copy of a cells' particle
38
 * properties required for density/force calculations.*/
James Willis's avatar
James Willis committed
39
struct cache {
40
41

  /* Particle x position. */
42
  float *restrict x SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
43

44
  /* Particle y position. */
45
  float *restrict y SWIFT_CACHE_ALIGN;
46
47

  /* Particle z position. */
48
  float *restrict z SWIFT_CACHE_ALIGN;
49
50

  /* Particle smoothing length. */
51
  float *restrict h SWIFT_CACHE_ALIGN;
52
53

  /* Particle mass. */
54
  float *restrict m SWIFT_CACHE_ALIGN;
55
56

  /* Particle x velocity. */
57
  float *restrict vx SWIFT_CACHE_ALIGN;
58
59

  /* Particle y velocity. */
60
  float *restrict vy SWIFT_CACHE_ALIGN;
61
62

  /* Particle z velocity. */
63
  float *restrict vz SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
64

65
  /* Maximum index into neighbouring cell for particles that are in range. */
66
  int *restrict max_index SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
67

James Willis's avatar
James Willis committed
68
  /* Particle density. */
69
  float *restrict rho SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
70

James Willis's avatar
James Willis committed
71
  /* Particle smoothing length gradient. */
72
  float *restrict grad_h SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
73

James Willis's avatar
James Willis committed
74
  /* Pressure over density squared. */
75
  float *restrict pOrho2 SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
76

James Willis's avatar
James Willis committed
77
  /* Balsara switch. */
78
  float *restrict balsara SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
79

James Willis's avatar
James Willis committed
80
  /* Particle sound speed. */
81
  float *restrict soundspeed SWIFT_CACHE_ALIGN;
82
83
84
85
86

  /* Cache size. */
  int count;
};

James Willis's avatar
James Willis committed
87
88
/* Secondary cache struct to hold a list of interactions between two
 * particles.*/
89
90
91
struct c2_cache {

  /* Separation between two particles squared. */
92
  float r2q[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
93
94

  /* x separation between two particles. */
95
  float dxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
96
97

  /* y separation between two particles. */
98
  float dyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
99
100

  /* z separation between two particles. */
101
  float dzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
102
103

  /* Mass of particle pj. */
104
  float mq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
105
106

  /* x velocity of particle pj. */
107
  float vxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
James Willis's avatar
James Willis committed
108

109
  /* y velocity of particle pj. */
110
  float vyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
111
112

  /* z velocity of particle pj. */
113
  float vzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
114
115
};

116
117
118
119
120
121
/**
 * @brief Allocate memory and initialise cache.
 *
 * @param c The cache.
 * @param count Number of particles to allocate space for.
 */
James Willis's avatar
James Willis committed
122
123
__attribute__((always_inline)) INLINE void cache_init(struct cache *c,
                                                      size_t count) {
124

James Willis's avatar
James Willis committed
125
126
  /* Align cache on correct byte boundary and pad cache size to be a multiple of
   * the vector size
127
   * and include 2 vector lengths for remainder operations. */
128
  size_t pad = 2 * VEC_SIZE, rem = count % VEC_SIZE;
129
  if (rem > 0) pad += VEC_SIZE - rem;
130
131
  size_t sizeBytes = (count + pad) * sizeof(float);
  size_t sizeIntBytes = (count + pad) * sizeof(int);
132
133
  int error = 0;

134
135
136
137
138
139
140
141
142
143
  /* Free memory if cache has already been allocated. */
  if (c->count > 0) {
    free(c->x);
    free(c->y);
    free(c->z);
    free(c->m);
    free(c->vx);
    free(c->vy);
    free(c->vz);
    free(c->h);
144
    free(c->max_index);
145
146
147
148
149
    free(c->rho);
    free(c->grad_h);
    free(c->pOrho2);
    free(c->balsara);
    free(c->soundspeed);
150
151
  }

152
153
154
155
156
157
158
159
  error += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error += posix_memalign((void **)&c->y, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error += posix_memalign((void **)&c->z, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error += posix_memalign((void **)&c->m, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error += posix_memalign((void **)&c->vx, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error += posix_memalign((void **)&c->vy, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error += posix_memalign((void **)&c->vz, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error += posix_memalign((void **)&c->h, SWIFT_CACHE_ALIGNMENT, sizeBytes);
James Willis's avatar
James Willis committed
160
161
  error += posix_memalign((void **)&c->max_index, SWIFT_CACHE_ALIGNMENT,
                          sizeIntBytes);
162
  error += posix_memalign((void **)&c->rho, SWIFT_CACHE_ALIGNMENT, sizeBytes);
James Willis's avatar
James Willis committed
163
164
165
166
167
168
169
170
  error +=
      posix_memalign((void **)&c->grad_h, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error +=
      posix_memalign((void **)&c->pOrho2, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error +=
      posix_memalign((void **)&c->balsara, SWIFT_CACHE_ALIGNMENT, sizeBytes);
  error +=
      posix_memalign((void **)&c->soundspeed, SWIFT_CACHE_ALIGNMENT, sizeBytes);
171

James Willis's avatar
James Willis committed
172
173
  if (error != 0)
    error("Couldn't allocate cache, no. of particles: %d", (int)count);
174
175
176
177
178
179
180
181
182
  c->count = count;
}

/**
 * @brief Populate cache by reading in the particles in unsorted order.
 *
 * @param ci The #cell.
 * @param ci_cache The cache.
 */
James Willis's avatar
James Willis committed
183
__attribute__((always_inline)) INLINE void cache_read_particles(
James Willis's avatar
James Willis committed
184
185
    const struct cell *restrict const ci,
    struct cache *restrict const ci_cache) {
186

187
188
#if defined(GADGET2_SPH)

James Willis's avatar
James Willis committed
189
190
  /* Let the compiler know that the data is aligned and create pointers to the
   * arrays inside the cache. */
James Willis's avatar
James Willis committed
191
192
193
194
195
196
197
198
  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
James Willis's avatar
James Willis committed
199
200

  const struct part *restrict parts = ci->parts;
201
202
  const double loc[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};

James Willis's avatar
James Willis committed
203
204
  /* Shift the particles positions to a local frame so single precision can be
   * used instead of double precision. */
James Willis's avatar
James Willis committed
205
  for (int i = 0; i < ci->count; i++) {
206
207
208
209
210
211
212
213
    x[i] = (float)(parts[i].x[0] - loc[0]);
    y[i] = (float)(parts[i].x[1] - loc[1]);
    z[i] = (float)(parts[i].x[2] - loc[2]);
    h[i] = parts[i].h;
    m[i] = parts[i].mass;
    vx[i] = parts[i].v[0];
    vy[i] = parts[i].v[1];
    vz[i] = parts[i].v[2];
James Willis's avatar
James Willis committed
214
  }
215
216

#endif
217
218
}

219
/**
James Willis's avatar
James Willis committed
220
221
 * @brief Populate cache for force interactions by reading in the particles in
 * unsorted order.
222
 *
223
224
 * @param ci The #cell.
 * @param ci_cache The cache.
225
 */
226
227
228
__attribute__((always_inline)) INLINE void cache_read_force_particles(
    const struct cell *restrict const ci,
    struct cache *restrict const ci_cache) {
James Willis's avatar
James Willis committed
229

230
#if defined(GADGET2_SPH)
231

232
233
234
235
236
237
238
239
240
241
242
  /* Let the compiler know that the data is aligned and create pointers to the
   * arrays inside the cache. */
  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, rho, ci_cache->rho, SWIFT_CACHE_ALIGNMENT);
James Willis's avatar
James Willis committed
243
244
245
246
247
248
249
250
  swift_declare_aligned_ptr(float, grad_h, ci_cache->grad_h,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, pOrho2, ci_cache->pOrho2,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, balsara, ci_cache->balsara,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, soundspeed, ci_cache->soundspeed,
                            SWIFT_CACHE_ALIGNMENT);
251
252

  const struct part *restrict parts = ci->parts;
253
  const double loc[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};
254
255
256

  /* Shift the particles positions to a local frame so single precision can be
   * used instead of double precision. */
257
  for (int i = 0; i < ci->count; i++) {
258
259
260
261
262
263
264
265
266
267
268
269
270
    x[i] = (float)(parts[i].x[0] - loc[0]);
    y[i] = (float)(parts[i].x[1] - loc[1]);
    z[i] = (float)(parts[i].x[2] - loc[2]);
    h[i] = parts[i].h;
    m[i] = parts[i].mass;
    vx[i] = parts[i].v[0];
    vy[i] = parts[i].v[1];
    vz[i] = parts[i].v[2];
    rho[i] = parts[i].rho;
    grad_h[i] = parts[i].force.f;
    pOrho2[i] = parts[i].force.P_over_rho2;
    balsara[i] = parts[i].force.balsara;
    soundspeed[i] = parts[i].force.soundspeed;
271
  }
James Willis's avatar
James Willis committed
272

273
#endif
274
275
}

276
/**
James Willis's avatar
James Willis committed
277
278
279
 * @brief Populate caches by only reading particles that are within range of
 * each other within the adjoining cell.Also read the particles into the cache
 * in sorted order.
280
281
282
283
284
285
286
287
288
289
 *
 * @param ci The i #cell.
 * @param cj The j #cell.
 * @param ci_cache The #cache for cell ci.
 * @param cj_cache The #cache for cell cj.
 * @param sort_i The array of sorted particle indices for cell ci.
 * @param sort_j The array of sorted particle indices for cell ci.
 * @param shift The amount to shift the particle positions to account for BCs
 * @param first_pi The first particle in cell ci that is in range.
 * @param last_pj The last particle in cell cj that is in range.
James Willis's avatar
James Willis committed
290
 * interaction.
291
 */
292
__attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
293
    const struct cell *restrict const ci, const struct cell *restrict const cj,
James Willis's avatar
James Willis committed
294
295
296
    struct cache *restrict const ci_cache,
    struct cache *restrict const cj_cache, const struct entry *restrict sort_i,
    const struct entry *restrict sort_j, const double *restrict const shift,
James Willis's avatar
James Willis committed
297
    int *first_pi, int *last_pj) {
298

James Willis's avatar
James Willis committed
299
300
301
302
303
  /* Make the number of particles to be read a multiple of the vector size.
   * This eliminates serial remainder loops where possible when populating the
   * cache. */

  /* Is the number of particles to read a multiple of the vector size? */
James Willis's avatar
James Willis committed
304
  int rem = (ci->count - *first_pi) % VEC_SIZE;
305
  if (rem != 0) {
James Willis's avatar
James Willis committed
306
    int pad = VEC_SIZE - rem;
James Willis's avatar
James Willis committed
307

James Willis's avatar
James Willis committed
308
    /* Decrease first_pi if there are particles in the cell left to read. */
309
    if (*first_pi - pad >= 0) *first_pi -= pad;
310
311
  }

312
  rem = (*last_pj + 1) % VEC_SIZE;
313
  if (rem != 0) {
James Willis's avatar
James Willis committed
314
    int pad = VEC_SIZE - rem;
315

James Willis's avatar
James Willis committed
316
    /* Increase last_pj if there are particles in the cell left to read. */
317
    if (*last_pj + pad < cj->count) *last_pj += pad;
318
319
  }

320
  /* Get some local pointers */
James Willis's avatar
James Willis committed
321
322
  const int first_pi_align = *first_pi;
  const int last_pj_align = *last_pj;
323
324
325
  const struct part *restrict parts_i = ci->parts;
  const struct part *restrict parts_j = cj->parts;

James Willis's avatar
James Willis committed
326
  /* Shift particles to the local frame and account for boundary conditions.*/
327
328
329
330
  const double total_ci_shift[3] = {
      cj->loc[0] + shift[0], cj->loc[1] + shift[1], cj->loc[2] + shift[2]};
  const double total_cj_shift[3] = {cj->loc[0], cj->loc[1], cj->loc[2]};

James Willis's avatar
James Willis committed
331
332
  /* Let the compiler know that the data is aligned and create pointers to the
   * arrays inside the cache. */
James Willis's avatar
James Willis committed
333
334
335
336
337
338
339
340
  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
341
342

  int ci_cache_count = ci->count - first_pi_align;
343

James Willis's avatar
James Willis committed
344
  /* Shift the particles positions to a local frame (ci frame) so single
345
   * precision can be used instead of double precision.  */
346
  for (int i = 0; i < ci_cache_count; i++) {
347
    const int idx = sort_i[i + first_pi_align].i;
348
349
350
    x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]);
    y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]);
    z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]);
351
352
353
354
355
    h[i] = parts_i[idx].h;
    m[i] = parts_i[idx].mass;
    vx[i] = parts_i[idx].v[0];
    vy[i] = parts_i[idx].v[1];
    vz[i] = parts_i[idx].v[2];
356
  }
357

358
#ifdef SWIFT_DEBUG_CHECKS
359
  const float shift_threshold_x =
360
      2. * ci->width[0] + 2. * max(ci->dx_max_part, cj->dx_max_part);
361
  const float shift_threshold_y =
362
      2. * ci->width[1] + 2. * max(ci->dx_max_part, cj->dx_max_part);
363
  const float shift_threshold_z =
364
      2. * ci->width[2] + 2. * max(ci->dx_max_part, cj->dx_max_part);
365

366
367
  /* Make sure that particle positions have been shifted correctly. */
  for (int i = 0; i < ci_cache_count; i++) {
368
    if (x[i] > shift_threshold_x || x[i] < -shift_threshold_x)
James Willis's avatar
James Willis committed
369
      error(
370
371
372
373
          "Error: ci->loc[%lf,%lf,%lf],cj->loc[%lf,%lf,%lf] Particle %d x pos "
          "is not within "
          "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
          "2*space_maxreldx)]. x=%f, ci->width[0]=%f",
374
375
          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
          cj->loc[2], i, x[i], ci->width[0]);
376
    if (y[i] > shift_threshold_y || y[i] < -shift_threshold_y)
James Willis's avatar
James Willis committed
377
      error(
378
379
380
381
          "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d y pos "
          "is not within "
          "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
          "2*space_maxreldx)]. y=%f, ci->width[1]=%f",
382
383
          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
          cj->loc[2], i, y[i], ci->width[1]);
384
    if (z[i] > shift_threshold_z || z[i] < -shift_threshold_z)
James Willis's avatar
James Willis committed
385
      error(
386
387
388
389
          "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d z pos "
          "is not within "
          "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
          "2*space_maxreldx)]. z=%f, ci->width[2]=%f",
390
391
          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
          cj->loc[2], i, z[i], ci->width[2]);
392
393
  }
#endif
James Willis's avatar
James Willis committed
394

Matthieu Schaller's avatar
Matthieu Schaller committed
395
  /* Pad cache with fake particles that exist outside the cell so will not
396
397
398
   * interact. We use values of the same magnitude (but negative!) as the real
   * particles to avoid overflow problems. */
  const double max_dx = max(ci->dx_max_part, cj->dx_max_part);
James Willis's avatar
James Willis committed
399
400
  const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
                               -(2. * ci->width[1] + max_dx),
401
                               -(2. * ci->width[2] + max_dx)};
402
  const float h_padded = ci->parts[0].h;
403

James Willis's avatar
James Willis committed
404
  for (int i = ci->count - first_pi_align;
405
       i < ci->count - first_pi_align + VEC_SIZE; i++) {
406
407
408
    x[i] = pos_padded[0];
    y[i] = pos_padded[1];
    z[i] = pos_padded[2];
409
    h[i] = h_padded;
410
411
412
413
414

    m[i] = 1.f;
    vx[i] = 1.f;
    vy[i] = 1.f;
    vz[i] = 1.f;
415
  }
James Willis's avatar
James Willis committed
416

James Willis's avatar
James Willis committed
417
418
  /* Let the compiler know that the data is aligned and create pointers to the
   * arrays inside the cache. */
James Willis's avatar
James Willis committed
419
420
421
422
423
424
425
426
  swift_declare_aligned_ptr(float, xj, cj_cache->x, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, yj, cj_cache->y, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, zj, cj_cache->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, hj, cj_cache->h, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, mj, cj_cache->m, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vxj, cj_cache->vx, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vyj, cj_cache->vy, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT);
James Willis's avatar
James Willis committed
427

428
  for (int i = 0; i <= last_pj_align; i++) {
429
    const int idx = sort_j[i].i;
James Willis's avatar
James Willis committed
430
431
432
    xj[i] = (float)(parts_j[idx].x[0] - total_cj_shift[0]);
    yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]);
    zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]);
433
434
435
436
437
    hj[i] = parts_j[idx].h;
    mj[i] = parts_j[idx].mass;
    vxj[i] = parts_j[idx].v[0];
    vyj[i] = parts_j[idx].v[1];
    vzj[i] = parts_j[idx].v[2];
438
  }
439

440
441
442
#ifdef SWIFT_DEBUG_CHECKS
  /* Make sure that particle positions have been shifted correctly. */
  for (int i = 0; i <= last_pj_align; i++) {
443
    if (xj[i] > shift_threshold_x || xj[i] < -shift_threshold_x)
James Willis's avatar
James Willis committed
444
      error(
445
446
447
448
          "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d xj "
          "pos is not within "
          "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
          "2*space_maxreldx)]. xj=%f, ci->width[0]=%f",
449
450
          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
          cj->loc[2], i, xj[i], ci->width[0]);
451
    if (yj[i] > shift_threshold_y || yj[i] < -shift_threshold_y)
James Willis's avatar
James Willis committed
452
      error(
453
454
455
456
          "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d yj "
          "pos is not within "
          "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
          "2*space_maxreldx)]. yj=%f, ci->width[1]=%f",
457
458
          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
          cj->loc[2], i, yj[i], ci->width[1]);
459
    if (zj[i] > shift_threshold_z || zj[i] < -shift_threshold_z)
James Willis's avatar
James Willis committed
460
      error(
461
462
463
464
          "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d zj "
          "pos is not within "
          "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
          "2*space_maxreldx)]. zj=%f, ci->width[2]=%f",
465
466
          ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
          cj->loc[2], i, zj[i], ci->width[2]);
467
468
469
  }
#endif

Matthieu Schaller's avatar
Matthieu Schaller committed
470
  /* Pad cache with fake particles that exist outside the cell so will not
471
472
   * interact. We use values of the same magnitude (but negative!) as the real
   * particles to avoid overflow problems. */
James Willis's avatar
James Willis committed
473
474
475
  const float pos_padded_j[3] = {-(2. * cj->width[0] + max_dx),
                                 -(2. * cj->width[1] + max_dx),
                                 -(2. * cj->width[2] + max_dx)};
476
477
  const float h_padded_j = cj->parts[0].h;

478
  for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
479
480
481
    xj[i] = pos_padded_j[0];
    yj[i] = pos_padded_j[1];
    zj[i] = pos_padded_j[2];
482
    hj[i] = h_padded_j;
483
484
485
486
    mj[i] = 1.f;
    vxj[i] = 1.f;
    vyj[i] = 1.f;
    vzj[i] = 1.f;
487
  }
488
}
James Willis's avatar
James Willis committed
489

490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
/**
 * @brief Populate caches by only reading particles that are within range of
 * each other within the adjoining cell.Also read the particles into the cache
 * in sorted order.
 *
 * @param ci The i #cell.
 * @param cj The j #cell.
 * @param ci_cache The #cache for cell ci.
 * @param cj_cache The #cache for cell cj.
 * @param sort_i The array of sorted particle indices for cell ci.
 * @param sort_j The array of sorted particle indices for cell ci.
 * @param shift The amount to shift the particle positions to account for BCs
 * @param first_pi The first particle in cell ci that is in range.
 * @param last_pj The last particle in cell cj that is in range.
 * interaction.
 */
James Willis's avatar
James Willis committed
506
507
__attribute__((always_inline)) INLINE void
cache_read_two_partial_cells_sorted_force(
508
509
510
    const struct cell *const ci, const struct cell *const cj,
    struct cache *const ci_cache, struct cache *const cj_cache,
    const struct entry *restrict sort_i, const struct entry *restrict sort_j,
James Willis's avatar
James Willis committed
511
    const double *const shift, int *first_pi, int *last_pj) {
512

James Willis's avatar
James Willis committed
513
514
515
516
517
  /* Make the number of particles to be read a multiple of the vector size.
   * This eliminates serial remainder loops where possible when populating the
   * cache. */

  /* Is the number of particles to read a multiple of the vector size? */
James Willis's avatar
James Willis committed
518
  int rem = (ci->count - *first_pi) % VEC_SIZE;
519
  if (rem != 0) {
James Willis's avatar
James Willis committed
520
    int pad = VEC_SIZE - rem;
521

James Willis's avatar
James Willis committed
522
    /* Decrease first_pi if there are particles in the cell left to read. */
523
524
525
    if (*first_pi - pad >= 0) *first_pi -= pad;
  }

526
  rem = (*last_pj + 1) % VEC_SIZE;
527
  if (rem != 0) {
James Willis's avatar
James Willis committed
528
    int pad = VEC_SIZE - rem;
529

James Willis's avatar
James Willis committed
530
    /* Increase last_pj if there are particles in the cell left to read. */
531
532
533
    if (*last_pj + pad < cj->count) *last_pj += pad;
  }

534
  /* Get some local pointers */
James Willis's avatar
James Willis committed
535
536
  const int first_pi_align = *first_pi;
  const int last_pj_align = *last_pj;
537
538
  const struct part *restrict parts_i = ci->parts;
  const struct part *restrict parts_j = cj->parts;
539

James Willis's avatar
James Willis committed
540
  /* Shift particles to the local frame and account for boundary conditions.*/
541
542
543
  const double total_ci_shift[3] = {
      cj->loc[0] + shift[0], cj->loc[1] + shift[1], cj->loc[2] + shift[2]};
  const double total_cj_shift[3] = {cj->loc[0], cj->loc[1], cj->loc[2]};
544

545
546
547
548
549
550
551
552
553
554
555
  /* Let the compiler know that the data is aligned and create pointers to the
   * arrays inside the cache. */
  swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, rho, ci_cache->rho, SWIFT_CACHE_ALIGNMENT);
James Willis's avatar
James Willis committed
556
557
558
559
560
561
562
563
  swift_declare_aligned_ptr(float, grad_h, ci_cache->grad_h,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, pOrho2, ci_cache->pOrho2,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, balsara, ci_cache->balsara,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, soundspeed, ci_cache->soundspeed,
                            SWIFT_CACHE_ALIGNMENT);
564
565

  int ci_cache_count = ci->count - first_pi_align;
James Willis's avatar
James Willis committed
566
  /* Shift the particles positions to a local frame (ci frame) so single
567
   * precision can be  used instead of double precision.  */
568
  for (int i = 0; i < ci_cache_count; i++) {
569
570

    const int idx = sort_i[i + first_pi_align].i;
James Willis's avatar
James Willis committed
571
572
573
    x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]);
    y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]);
    z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]);
574
575
576
577
578
579
580
581
582
583
    h[i] = parts_i[idx].h;
    m[i] = parts_i[idx].mass;
    vx[i] = parts_i[idx].v[0];
    vy[i] = parts_i[idx].v[1];
    vz[i] = parts_i[idx].v[2];
    rho[i] = parts_i[idx].rho;
    grad_h[i] = parts_i[idx].force.f;
    pOrho2[i] = parts_i[idx].force.P_over_rho2;
    balsara[i] = parts_i[idx].force.balsara;
    soundspeed[i] = parts_i[idx].force.soundspeed;
584
585
586
  }

  /* Pad cache with fake particles that exist outside the cell so will not
587
588
589
   * interact. We use values of the same magnitude (but negative!) as the real
   * particles to avoid overflow problems. */
  const double max_dx = max(ci->dx_max_part, cj->dx_max_part);
James Willis's avatar
James Willis committed
590
591
  const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
                               -(2. * ci->width[1] + max_dx),
592
                               -(2. * ci->width[2] + max_dx)};
593
  const float h_padded = ci->parts[0].h;
594

595
596
  for (int i = ci->count - first_pi_align;
       i < ci->count - first_pi_align + VEC_SIZE; i++) {
597
598
599
    x[i] = pos_padded[0];
    y[i] = pos_padded[1];
    z[i] = pos_padded[2];
600
    h[i] = h_padded;
601
602
603
604
605
606
607
608
609
    m[i] = 1.f;
    vx[i] = 1.f;
    vy[i] = 1.f;
    vz[i] = 1.f;
    rho[i] = 1.f;
    grad_h[i] = 1.f;
    pOrho2[i] = 1.f;
    balsara[i] = 1.f;
    soundspeed[i] = 1.f;
610
611
  }

612
613
614
615
616
617
618
619
620
621
622
  /* Let the compiler know that the data is aligned and create pointers to the
   * arrays inside the cache. */
  swift_declare_aligned_ptr(float, xj, cj_cache->x, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, yj, cj_cache->y, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, zj, cj_cache->z, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, hj, cj_cache->h, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, mj, cj_cache->m, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vxj, cj_cache->vx, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vyj, cj_cache->vy, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, rhoj, cj_cache->rho, SWIFT_CACHE_ALIGNMENT);
James Willis's avatar
James Willis committed
623
624
625
626
627
628
629
630
  swift_declare_aligned_ptr(float, grad_hj, cj_cache->grad_h,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, pOrho2j, cj_cache->pOrho2,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, balsaraj, cj_cache->balsara,
                            SWIFT_CACHE_ALIGNMENT);
  swift_declare_aligned_ptr(float, soundspeedj, cj_cache->soundspeed,
                            SWIFT_CACHE_ALIGNMENT);
631

632
  for (int i = 0; i <= last_pj_align; i++) {
633
    const int idx = sort_j[i].i;
James Willis's avatar
James Willis committed
634
635
636
    xj[i] = (float)(parts_j[idx].x[0] - total_cj_shift[0]);
    yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]);
    zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]);
637
638
639
640
641
642
643
644
645
646
    hj[i] = parts_j[idx].h;
    mj[i] = parts_j[idx].mass;
    vxj[i] = parts_j[idx].v[0];
    vyj[i] = parts_j[idx].v[1];
    vzj[i] = parts_j[idx].v[2];
    rhoj[i] = parts_j[idx].rho;
    grad_hj[i] = parts_j[idx].force.f;
    pOrho2j[i] = parts_j[idx].force.P_over_rho2;
    balsaraj[i] = parts_j[idx].force.balsara;
    soundspeedj[i] = parts_j[idx].force.soundspeed;
647
648
649
  }

  /* Pad cache with fake particles that exist outside the cell so will not
650
651
   * interact. We use values of the same magnitude (but negative!) as the real
   * particles to avoid overflow problems. */
James Willis's avatar
James Willis committed
652
653
654
  const float pos_padded_j[3] = {-(2. * cj->width[0] + max_dx),
                                 -(2. * cj->width[1] + max_dx),
                                 -(2. * cj->width[2] + max_dx)};
655
  const float h_padded_j = cj->parts[0].h;
656

657
  for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
658
659
660
    xj[i] = pos_padded_j[0];
    yj[i] = pos_padded_j[1];
    zj[i] = pos_padded_j[2];
661
    hj[i] = h_padded_j;
662
663
664
665
666
667
668
669
670
    mj[i] = 1.f;
    vxj[i] = 1.f;
    vyj[i] = 1.f;
    vzj[i] = 1.f;
    rhoj[i] = 1.f;
    grad_hj[i] = 1.f;
    pOrho2j[i] = 1.f;
    balsaraj[i] = 1.f;
    soundspeedj[i] = 1.f;
671
672
673
  }
}

674
/* @brief Clean the memory allocated by a #cache object.
675
676
677
678
679
680
681
682
683
684
685
686
687
 *
 * @param c The #cache to clean.
 */
static INLINE void cache_clean(struct cache *c) {
  if (c->count > 0) {
    free(c->x);
    free(c->y);
    free(c->z);
    free(c->m);
    free(c->vx);
    free(c->vy);
    free(c->vz);
    free(c->h);
688
    free(c->max_index);
James Willis's avatar
James Willis committed
689
690
691
692
693
    free(c->rho);
    free(c->grad_h);
    free(c->pOrho2);
    free(c->balsara);
    free(c->soundspeed);
694
695
696
  }
}

697
698
#endif /* WITH_VECTORIZATION */

699
#endif /* SWIFT_CACHE_H */