/*******************************************************************************
 * This file is part of SWIFT.
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
 *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
 *               2015 Peter W. Draper (p.w.draper@durham.ac.uk)
 *                    Angus Lepper (angus.lepper@ed.ac.uk)
 *               2016 John A. Regan (john.a.regan@durham.ac.uk)
 *                    Tom Theuns (tom.theuns@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/

/* Config parameters. */
#include "../config.h"

/* Some standard headers. */
#include <float.h>
#include <limits.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* MPI headers. */
#ifdef WITH_MPI
#include <mpi.h>
#endif

#ifdef HAVE_LIBNUMA
#include <numa.h>
#endif

/* This object's header. */
#include "engine.h"

/* Local headers. */
#include "active.h"
#include "atomic.h"
#include "cell.h"
#include "clocks.h"
#include "cycle.h"
#include "debug.h"
#include "error.h"
#include "gravity.h"
#include "hydro.h"
#include "minmax.h"
#include "parallel_io.h"
#include "part.h"
#include "partition.h"
#include "profiler.h"
#include "proxy.h"
#include "runner.h"
#include "serial_io.h"
#include "single_io.h"
#include "sort_part.h"
#include "statistics.h"
#include "timers.h"
#include "tools.h"
#include "units.h"
#include "version.h"

/* Particle cache size. */
#define CACHE_SIZE 512
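
/* Human-readable names for the available engine policies. */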

const char *engine_policy_names[16] = {"none",
                                       "rand",
                                       "steal",
                                       "keep",
                                       "block",
                                       "cpu_tight",
                                       "mpi",
                                       "numa_affinity",
                                       "hydro",
                                       "self_gravity",
                                       "external_gravity",
                                       "cosmology_integration",
                                       "drift_all",
                                       "cooling",
                                       "sourceterms",
                                       "stars"};

/** The rank of the engine as a global variable (for messages). */
int engine_rank;

/**
 * @brief Link a density/force task to a cell.
 *
 * @param e The #engine.
 * @param l A pointer to the #link, will be modified atomically.
 * @param t The #task.
 */
void engine_addlink(struct engine *e, struct link **l, struct task *t) {

  /* Get the next free link. */
  const int ind = atomic_inc(&e->nr_links);
  if (ind >= e->size_links) {
    error("Link table overflow.");
  }
  struct link *res = &e->links[ind];

  /* Set it atomically. */
  res->t = t;
  res->next = atomic_swap(l, res);
}

/**
 * @brief Generate the hydro hierarchical tasks for a hierarchy of cells -
 * i.e. all the O(Npart) tasks.
 *
 * Tasks are only created here. The dependencies will be added later on.
 *
 * Note that there is no need to recurse below the super-cell.
 *
 * @param e The #engine.
 * @param c The #cell.
 */
void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {

  struct scheduler *s = &e->sched;
  const int is_hydro = (e->policy & engine_policy_hydro);
  const int is_self_gravity = (e->policy & engine_policy_self_gravity);
  const int is_with_cooling = (e->policy & engine_policy_cooling);
  const int is_with_sourceterms = (e->policy & engine_policy_sourceterms);

  /* Are we in a super-cell ? */
  if (c->super == c) {

    /* Local tasks only... */
    if (c->nodeID == e->nodeID) {

      /* Add the two half kicks */
      c->kick1 = scheduler_addtask(s, task_type_kick1, task_subtype_none, 0, 0,
                                   c, NULL, 0);

      c->kick2 = scheduler_addtask(s, task_type_kick2, task_subtype_none, 0, 0,
                                   c, NULL, 0);

      /* Add the time-step calculation task and its dependency */
      c->timestep = scheduler_addtask(s, task_type_timestep, task_subtype_none,
                                      0, 0, c, NULL, 0);

      scheduler_addunlock(s, c->kick2, c->timestep);
      scheduler_addunlock(s, c->timestep, c->kick1);
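
      /* Note the ordering imposed by the two dependencies above: the
       * time-step calculation can only run once the second half-kick is
       * done, and the first half-kick in turn waits for the new time-step. */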

      if (is_self_gravity) {

        /* Initialisation of the multipoles */
        c->init_grav = scheduler_addtask(s, task_type_init_grav,
                                         task_subtype_none, 0, 0, c, NULL, 0);

        /* Gravity non-neighbouring pm calculations */
        c->grav_long_range = scheduler_addtask(
            s, task_type_grav_long_range, task_subtype_none, 0, 0, c, NULL, 0);

        /* Gravity top-level periodic calculation */
        c->grav_top_level = scheduler_addtask(
            s, task_type_grav_top_level, task_subtype_none, 0, 0, c, NULL, 0);

        /* Gravity recursive down-pass */
        c->grav_down = scheduler_addtask(s, task_type_grav_down,
                                         task_subtype_none, 0, 0, c, NULL, 0);

        scheduler_addunlock(s, c->init_grav, c->grav_long_range);
        scheduler_addunlock(s, c->init_grav, c->grav_top_level);
        scheduler_addunlock(s, c->grav_long_range, c->grav_down);
        scheduler_addunlock(s, c->grav_top_level, c->grav_down);
        scheduler_addunlock(s, c->grav_down, c->kick2);
      }
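
      /* Task graph set up above:
       *   init_grav -> {grav_long_range, grav_top_level} -> grav_down -> kick2 */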

      /* Generate the ghost task. */
      if (is_hydro)
        c->ghost = scheduler_addtask(s, task_type_ghost, task_subtype_none, 0,
                                     0, c, NULL, 0);

#ifdef EXTRA_HYDRO_LOOP
      /* Generate the extra ghost task. */
      if (is_hydro)
        c->extra_ghost = scheduler_addtask(s, task_type_extra_ghost,
                                           task_subtype_none, 0, 0, c, NULL, 0);
#endif

      /* Cooling task */
      if (is_with_cooling) {
        c->cooling = scheduler_addtask(s, task_type_cooling, task_subtype_none,
                                       0, 0, c, NULL, 0);

        scheduler_addunlock(s, c->cooling, c->kick2);
      }

      /* add source terms */
      if (is_with_sourceterms) {
        c->sourceterms = scheduler_addtask(s, task_type_sourceterms,
                                           task_subtype_none, 0, 0, c, NULL, 0);
      }
    }

  } else { /* We are above the super-cell so need to go deeper */

#ifdef SWIFT_DEBUG_CHECKS
    if (c->super != NULL) error("Incorrectly set super pointer");
#endif

    /* Recurse. */
    if (c->split)
      for (int k = 0; k < 8; k++)
        if (c->progeny[k] != NULL)
          engine_make_hierarchical_tasks(e, c->progeny[k]);
  }
}

/**
 * @brief Redistribute the particles amongst the nodes according
 *      to their cell's node IDs.
 *
 * The strategy here is as follows:
 * 1) Each node counts the number of particles it has to send to each other
 * node.
 * 2) The number of particles of each type is then exchanged.
 * 3) The particles to send are placed in a temporary buffer in which the
 * part-gpart links are preserved.
 * 4) Each node allocates enough space for the new particles.
 * 5) (Asynchronous) communications are issued to transfer the data.
 *
 * @param e The #engine.
 */
void engine_redistribute(struct engine *e) {

#ifdef WITH_MPI

  const int nr_nodes = e->nr_nodes;
  const int nodeID = e->nodeID;
  struct space *s = e->s;
  struct cell *cells = s->cells_top;
  const int nr_cells = s->nr_cells;
  const int *cdim = s->cdim;
  const double iwidth[3] = {s->iwidth[0], s->iwidth[1], s->iwidth[2]};
  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
  struct part *parts = s->parts;
  struct xpart *xparts = s->xparts;
  struct gpart *gparts = s->gparts;
  struct spart *sparts = s->sparts;
  ticks tic = getticks();

  /* Allocate temporary arrays to store the counts of particles to be sent
     and the destination of each particle */
  int *counts, *g_counts, *s_counts;
  if ((counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
    error("Failed to allocate counts temporary buffer.");
  if ((g_counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
    error("Failed to allocate g_counts temporary buffer.");
  if ((s_counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
    error("Failed to allocate s_counts temporary buffer.");
  bzero(counts, sizeof(int) * nr_nodes * nr_nodes);
  bzero(g_counts, sizeof(int) * nr_nodes * nr_nodes);
  bzero(s_counts, sizeof(int) * nr_nodes * nr_nodes);
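
  /* These are flattened nr_nodes x nr_nodes matrices: entry
   * [from * nr_nodes + to] holds the number of particles going from rank
   * "from" to rank "to"; diagonal entries correspond to particles that stay
   * on their current rank. */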

  /* Allocate the destination index arrays. */
  int *dest, *g_dest, *s_dest;
  if ((dest = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL)
    error("Failed to allocate dest temporary buffer.");
  if ((g_dest = (int *)malloc(sizeof(int) * s->nr_gparts)) == NULL)
    error("Failed to allocate g_dest temporary buffer.");
  if ((s_dest = (int *)malloc(sizeof(int) * s->nr_sparts)) == NULL)
    error("Failed to allocate s_dest temporary buffer.");

  /* Get destination of each particle */
  for (size_t k = 0; k < s->nr_parts; k++) {

    /* Periodic boundary conditions */
    for (int j = 0; j < 3; j++) {
      if (parts[k].x[j] < 0.0)
        parts[k].x[j] += dim[j];
      else if (parts[k].x[j] >= dim[j])
        parts[k].x[j] -= dim[j];
    }
    const int cid =
        cell_getid(cdim, parts[k].x[0] * iwidth[0], parts[k].x[1] * iwidth[1],
                   parts[k].x[2] * iwidth[2]);
#ifdef SWIFT_DEBUG_CHECKS
    if (cid < 0 || cid >= s->nr_cells)
      error("Bad cell id %i for part %zu at [%.3e,%.3e,%.3e].", cid, k,
            parts[k].x[0], parts[k].x[1], parts[k].x[2]);
#endif

    dest[k] = cells[cid].nodeID;

    /* The counts array is indexed as count[from * nr_nodes + to]. */
    counts[nodeID * nr_nodes + dest[k]] += 1;
  }

  /* Sort the particles according to their cell index. */
  if (s->nr_parts > 0)
    space_parts_sort(s, dest, s->nr_parts, 0, nr_nodes - 1, e->verbose);
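
  /* Sorting on the destination rank leaves the particles bound for each node
   * in one contiguous block, which is what the single-message sends below
   * rely on. */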

#ifdef SWIFT_DEBUG_CHECKS
  /* Verify that the part have been sorted correctly. */
  for (size_t k = 0; k < s->nr_parts; k++) {
    const struct part *p = &s->parts[k];

    /* New cell index */
    const int new_cid =
        cell_getid(s->cdim, p->x[0] * s->iwidth[0], p->x[1] * s->iwidth[1],
                   p->x[2] * s->iwidth[2]);

    /* New cell of this part */
    const struct cell *c = &s->cells_top[new_cid];
    const int new_node = c->nodeID;

    if (dest[k] != new_node)
      error("part's new node index not matching sorted index.");

    if (p->x[0] < c->loc[0] || p->x[0] > c->loc[0] + c->width[0] ||
        p->x[1] < c->loc[1] || p->x[1] > c->loc[1] + c->width[1] ||
        p->x[2] < c->loc[2] || p->x[2] > c->loc[2] + c->width[2])
      error("part not sorted into the right top-level cell!");
  }
#endif

  /* We need to re-link the gpart partners of parts. */
  if (s->nr_parts > 0) {
    int current_dest = dest[0];
    size_t count_this_dest = 0;
    for (size_t k = 0; k < s->nr_parts; ++k) {
      if (s->parts[k].gpart != NULL) {

        /* As the addresses will be invalidated by the communications, we will
         * instead store the absolute index from the start of the sub-array of
         * particles to be sent to a given node.
         * Recall that gparts without partners have a positive id.
         * We will restore the pointers on the receiving node later on. */
        if (dest[k] != current_dest) {
          current_dest = dest[k];
          count_this_dest = 0;
        }

#ifdef SWIFT_DEBUG_CHECKS
        if (s->parts[k].gpart->id_or_neg_offset > 0)
          error("Trying to link a partnerless gpart !");
#endif

        s->parts[k].gpart->id_or_neg_offset = -count_this_dest;
        count_this_dest++;
      }
    }
  }

  /* Get destination of each s-particle */
  for (size_t k = 0; k < s->nr_sparts; k++) {

    /* Periodic boundary conditions */
    for (int j = 0; j < 3; j++) {
      if (sparts[k].x[j] < 0.0)
        sparts[k].x[j] += dim[j];
      else if (sparts[k].x[j] >= dim[j])
        sparts[k].x[j] -= dim[j];
    }
    const int cid =
        cell_getid(cdim, sparts[k].x[0] * iwidth[0], sparts[k].x[1] * iwidth[1],
                   sparts[k].x[2] * iwidth[2]);
#ifdef SWIFT_DEBUG_CHECKS
    if (cid < 0 || cid >= s->nr_cells)
      error("Bad cell id %i for part %zu at [%.3e,%.3e,%.3e].", cid, k,
            sparts[k].x[0], sparts[k].x[1], sparts[k].x[2]);
#endif

    s_dest[k] = cells[cid].nodeID;

    /* The counts array is indexed as count[from * nr_nodes + to]. */
    s_counts[nodeID * nr_nodes + s_dest[k]] += 1;
  }

  /* Sort the s-particles according to their cell index. */
  if (s->nr_sparts > 0)
    space_sparts_sort(s, s_dest, s->nr_sparts, 0, nr_nodes - 1, e->verbose);

#ifdef SWIFT_DEBUG_CHECKS
  /* Verify that the spart have been sorted correctly. */
  for (size_t k = 0; k < s->nr_sparts; k++) {
    const struct spart *sp = &s->sparts[k];

    /* New cell index */
    const int new_cid =
        cell_getid(s->cdim, sp->x[0] * s->iwidth[0], sp->x[1] * s->iwidth[1],
                   sp->x[2] * s->iwidth[2]);

    /* New cell of this spart */
    const struct cell *c = &s->cells_top[new_cid];
    const int new_node = c->nodeID;

    if (s_dest[k] != new_node)
      error("spart's new node index not matching sorted index.");

    if (sp->x[0] < c->loc[0] || sp->x[0] > c->loc[0] + c->width[0] ||
        sp->x[1] < c->loc[1] || sp->x[1] > c->loc[1] + c->width[1] ||
        sp->x[2] < c->loc[2] || sp->x[2] > c->loc[2] + c->width[2])
      error("spart not sorted into the right top-level cell!");
  }
#endif

  /* We need to re-link the gpart partners of sparts. */
  if (s->nr_sparts > 0) {
    int current_dest = s_dest[0];
    size_t count_this_dest = 0;
    for (size_t k = 0; k < s->nr_sparts; ++k) {
      if (s->sparts[k].gpart != NULL) {

        /* As the addresses will be invalidated by the communications, we will
         * instead store the absolute index from the start of the sub-array of
         * particles to be sent to a given node.
         * Recall that gparts without partners have a positive id.
         * We will restore the pointers on the receiving node later on. */
        if (s_dest[k] != current_dest) {
          current_dest = s_dest[k];
          count_this_dest = 0;
        }

#ifdef SWIFT_DEBUG_CHECKS
        if (s->sparts[k].gpart->id_or_neg_offset > 0)
          error("Trying to link a partnerless gpart !");
#endif

        s->sparts[k].gpart->id_or_neg_offset = -count_this_dest;
        count_this_dest++;
      }
    }
  }

  /* Get destination of each g-particle */
  for (size_t k = 0; k < s->nr_gparts; k++) {

    /* Periodic boundary conditions */
    for (int j = 0; j < 3; j++) {
      if (gparts[k].x[j] < 0.0)
        gparts[k].x[j] += dim[j];
      else if (gparts[k].x[j] >= dim[j])
        gparts[k].x[j] -= dim[j];
    }
    const int cid =
        cell_getid(cdim, gparts[k].x[0] * iwidth[0], gparts[k].x[1] * iwidth[1],
                   gparts[k].x[2] * iwidth[2]);
#ifdef SWIFT_DEBUG_CHECKS
    if (cid < 0 || cid >= s->nr_cells)
      error("Bad cell id %i for part %zu at [%.3e,%.3e,%.3e].", cid, k,
            gparts[k].x[0], gparts[k].x[1], gparts[k].x[2]);
#endif

    g_dest[k] = cells[cid].nodeID;

    /* The counts array is indexed as count[from * nr_nodes + to]. */
    g_counts[nodeID * nr_nodes + g_dest[k]] += 1;
  }

  /* Sort the gparticles according to their cell index. */
  if (s->nr_gparts > 0)
    space_gparts_sort(s, g_dest, s->nr_gparts, 0, nr_nodes - 1, e->verbose);

#ifdef SWIFT_DEBUG_CHECKS
  /* Verify that the gpart have been sorted correctly. */
  for (size_t k = 0; k < s->nr_gparts; k++) {
    const struct gpart *gp = &s->gparts[k];

    /* New cell index */
    const int new_cid =
        cell_getid(s->cdim, gp->x[0] * s->iwidth[0], gp->x[1] * s->iwidth[1],
                   gp->x[2] * s->iwidth[2]);

    /* New cell of this gpart */
    const struct cell *c = &s->cells_top[new_cid];
    const int new_node = c->nodeID;

    if (g_dest[k] != new_node)
      error("gpart's new node index not matching sorted index.");

    if (gp->x[0] < c->loc[0] || gp->x[0] > c->loc[0] + c->width[0] ||
        gp->x[1] < c->loc[1] || gp->x[1] > c->loc[1] + c->width[1] ||
        gp->x[2] < c->loc[2] || gp->x[2] > c->loc[2] + c->width[2])
      error("gpart not sorted into the right top-level cell!");
  }
#endif

  /* Get all the counts from all the nodes. */
  if (MPI_Allreduce(MPI_IN_PLACE, counts, nr_nodes * nr_nodes, MPI_INT, MPI_SUM,
                    MPI_COMM_WORLD) != MPI_SUCCESS)
    error("Failed to allreduce particle transfer counts.");

  /* Get all the g_counts from all the nodes. */
  if (MPI_Allreduce(MPI_IN_PLACE, g_counts, nr_nodes * nr_nodes, MPI_INT,
                    MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS)
    error("Failed to allreduce gparticle transfer counts.");

  /* Get all the s_counts from all the nodes. */
  if (MPI_Allreduce(MPI_IN_PLACE, s_counts, nr_nodes * nr_nodes, MPI_INT,
                    MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS)
    error("Failed to allreduce sparticle transfer counts.");
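
  /* Every rank now holds the complete transfer matrices and can therefore
   * work out how much data it will receive from every other rank. */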

  /* Report how many particles will be moved. */
  if (e->verbose) {
    if (e->nodeID == 0) {
      size_t total = 0, g_total = 0, s_total = 0;
      size_t unmoved = 0, g_unmoved = 0, s_unmoved = 0;
      for (int p = 0, r = 0; p < nr_nodes; p++) {
        for (int s = 0; s < nr_nodes; s++) {
          total += counts[r];
          g_total += g_counts[r];
          s_total += s_counts[r];
          if (p == s) {
            unmoved += counts[r];
            g_unmoved += g_counts[r];
            s_unmoved += s_counts[r];
          }
          r++;
        }
      }
      if (total > 0)
        message("%zu of %zu (%.2f%%) of particles moved", total - unmoved,
                total, 100.0 * (double)(total - unmoved) / (double)total);
      if (g_total > 0)
        message("%zu of %zu (%.2f%%) of g-particles moved", g_total - g_unmoved,
                g_total,
                100.0 * (double)(g_total - g_unmoved) / (double)g_total);
      if (s_total > 0)
        message("%zu of %zu (%.2f%%) of s-particles moved", s_total - s_unmoved,
                s_total,
                100.0 * (double)(s_total - s_unmoved) / (double)s_total);
    }
  }

  /* Each node knows how many parts, sparts and gparts will be transferred
     to every other node. We can start preparing to receive data */

  /* Get the new number of parts and gparts for this node */
  size_t nr_parts = 0, nr_gparts = 0, nr_sparts = 0;
  for (int k = 0; k < nr_nodes; k++) nr_parts += counts[k * nr_nodes + nodeID];
  for (int k = 0; k < nr_nodes; k++)
    nr_gparts += g_counts[k * nr_nodes + nodeID];
  for (int k = 0; k < nr_nodes; k++)
    nr_sparts += s_counts[k * nr_nodes + nodeID];

  /* Allocate the new arrays with some extra margin */
  struct part *parts_new = NULL;
  struct xpart *xparts_new = NULL;
  struct gpart *gparts_new = NULL;
  struct spart *sparts_new = NULL;
  if (posix_memalign((void **)&parts_new, part_align,
                     sizeof(struct part) * nr_parts *
                         engine_redistribute_alloc_margin) != 0)
    error("Failed to allocate new part data.");
  if (posix_memalign((void **)&xparts_new, xpart_align,
                     sizeof(struct xpart) * nr_parts *
                         engine_redistribute_alloc_margin) != 0)
    error("Failed to allocate new xpart data.");
  if (posix_memalign((void **)&gparts_new, gpart_align,
                     sizeof(struct gpart) * nr_gparts *
                         engine_redistribute_alloc_margin) != 0)
    error("Failed to allocate new gpart data.");
  if (posix_memalign((void **)&sparts_new, spart_align,
                     sizeof(struct spart) * nr_sparts *
                         engine_redistribute_alloc_margin) != 0)
    error("Failed to allocate new spart data.");

  /* Prepare MPI requests for the asynchronous communications */
  MPI_Request *reqs;
  if ((reqs = (MPI_Request *)malloc(sizeof(MPI_Request) * 8 * nr_nodes)) ==
      NULL)
    error("Failed to allocate MPI request list.");
  for (int k = 0; k < 8 * nr_nodes; k++) reqs[k] = MPI_REQUEST_NULL;
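
  /* Layout note: reqs holds 8 requests per node, 4 sends (slots 0-3) and 4
   * receives (slots 4-7), one per particle type (part, xpart, gpart, spart).
   * The tags used below are 4 * ind + {0,1,2,3} so that each (node pair,
   * particle type) combination is unique. */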

  /* Emit the sends and recvs for the particle and gparticle data. */
  size_t offset_send = 0, offset_recv = 0;
  size_t g_offset_send = 0, g_offset_recv = 0;
  size_t s_offset_send = 0, s_offset_recv = 0;
  for (int k = 0; k < nr_nodes; k++) {

    /* Indices in the count arrays of the node of interest */
    const int ind_send = nodeID * nr_nodes + k;
    const int ind_recv = k * nr_nodes + nodeID;

    /* Are we sending any part/xpart ? */
    if (counts[ind_send] > 0) {

      /* message("Sending %d part to node %d", counts[ind_send], k); */

      /* If the send is to the same node, just copy */
      if (k == nodeID) {
        memcpy(&parts_new[offset_recv], &s->parts[offset_send],
               sizeof(struct part) * counts[ind_recv]);
        memcpy(&xparts_new[offset_recv], &s->xparts[offset_send],
               sizeof(struct xpart) * counts[ind_recv]);
        offset_send += counts[ind_send];
        offset_recv += counts[ind_recv];

        /* Else, emit some communications */
      } else {
        if (MPI_Isend(&s->parts[offset_send], counts[ind_send], part_mpi_type,
                      k, 4 * ind_send + 0, MPI_COMM_WORLD,
                      &reqs[8 * k + 0]) != MPI_SUCCESS)
          error("Failed to isend parts to node %i.", k);
        if (MPI_Isend(&s->xparts[offset_send], counts[ind_send], xpart_mpi_type,
                      k, 4 * ind_send + 1, MPI_COMM_WORLD,
                      &reqs[8 * k + 1]) != MPI_SUCCESS)
          error("Failed to isend xparts to node %i.", k);
        offset_send += counts[ind_send];
      }
    }

    /* Are we sending any gpart ? */
    if (g_counts[ind_send] > 0) {

      /* message("Sending %d gpart to node %d", g_counts[ind_send], k); */

      /* If the send is to the same node, just copy */
      if (k == nodeID) {
        memcpy(&gparts_new[g_offset_recv], &s->gparts[g_offset_send],
               sizeof(struct gpart) * g_counts[ind_recv]);
        g_offset_send += g_counts[ind_send];
        g_offset_recv += g_counts[ind_recv];

        /* Else, emit some communications */
      } else {
        if (MPI_Isend(&s->gparts[g_offset_send], g_counts[ind_send],
                      gpart_mpi_type, k, 4 * ind_send + 2, MPI_COMM_WORLD,
                      &reqs[8 * k + 2]) != MPI_SUCCESS)
          error("Failed to isend gparts to node %i.", k);
        g_offset_send += g_counts[ind_send];
      }
    }

    /* Are we sending any spart ? */
    if (s_counts[ind_send] > 0) {

      /* message("Sending %d spart to node %d", s_counts[ind_send], k); */

      /* If the send is to the same node, just copy */
      if (k == nodeID) {
        memcpy(&sparts_new[s_offset_recv], &s->sparts[s_offset_send],
               sizeof(struct spart) * s_counts[ind_recv]);
        s_offset_send += s_counts[ind_send];
        s_offset_recv += s_counts[ind_recv];

        /* Else, emit some communications */
      } else {
        if (MPI_Isend(&s->sparts[s_offset_send], s_counts[ind_send],
                      spart_mpi_type, k, 4 * ind_send + 3, MPI_COMM_WORLD,
                      &reqs[8 * k + 3]) != MPI_SUCCESS)
          error("Failed to isend sparts to node %i.", k);
        s_offset_send += s_counts[ind_send];
      }
    }

    /* Now emit the corresponding Irecv() */

    /* Are we receiving any part/xpart from this node ? */
    if (k != nodeID && counts[ind_recv] > 0) {
      if (MPI_Irecv(&parts_new[offset_recv], counts[ind_recv], part_mpi_type, k,
                    4 * ind_recv + 0, MPI_COMM_WORLD,
                    &reqs[8 * k + 4]) != MPI_SUCCESS)
        error("Failed to emit irecv of parts from node %i.", k);
      if (MPI_Irecv(&xparts_new[offset_recv], counts[ind_recv], xpart_mpi_type,
                    k, 4 * ind_recv + 1, MPI_COMM_WORLD,
                    &reqs[8 * k + 5]) != MPI_SUCCESS)
        error("Failed to emit irecv of xparts from node %i.", k);
      offset_recv += counts[ind_recv];
    }

    /* Are we receiving any gpart from this node ? */
    if (k != nodeID && g_counts[ind_recv] > 0) {
      if (MPI_Irecv(&gparts_new[g_offset_recv], g_counts[ind_recv],
                    gpart_mpi_type, k, 4 * ind_recv + 2, MPI_COMM_WORLD,
                    &reqs[8 * k + 6]) != MPI_SUCCESS)
        error("Failed to emit irecv of gparts from node %i.", k);
      g_offset_recv += g_counts[ind_recv];
    }

    /* Are we receiving any spart from this node ? */
    if (k != nodeID && s_counts[ind_recv] > 0) {
      if (MPI_Irecv(&sparts_new[s_offset_recv], s_counts[ind_recv],
                    spart_mpi_type, k, 4 * ind_recv + 3, MPI_COMM_WORLD,
                    &reqs[8 * k + 7]) != MPI_SUCCESS)
        error("Failed to emit irecv of sparts from node %i.", k);
      s_offset_recv += s_counts[ind_recv];
    }
  }
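
  /* Same-node "transfers" were handled by the memcpy()s above; only the
   * inter-node messages are still in flight at this point and are completed
   * by the MPI_Waitall() below. */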

  /* Wait for all the sends and recvs to tumble in. */
  MPI_Status stats[8 * nr_nodes];
  int res;
  if ((res = MPI_Waitall(8 * nr_nodes, reqs, stats)) != MPI_SUCCESS) {
    for (int k = 0; k < 8 * nr_nodes; k++) {
      char buff[MPI_MAX_ERROR_STRING];
      MPI_Error_string(stats[k].MPI_ERROR, buff, &res);
      message("request %i has error '%s'.", k, buff);
    }
    error("Failed during waitall for part data.");
  }

  /* All particles have now arrived. Time for some final operations on the
     stuff we just received */

  /* Restore the part<->gpart and spart<->gpart links */
  size_t offset_parts = 0, offset_sparts = 0, offset_gparts = 0;
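
  /* Recall the convention set up while packing: a gpart with a partner holds
   * minus the partner's index within the block received from one node, so
   * adding the running offset of that block recovers the partner's position
   * in parts_new / sparts_new. */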
  for (int node = 0; node < nr_nodes; ++node) {

    const int ind_recv = node * nr_nodes + nodeID;
    const size_t count_parts = counts[ind_recv];
    const size_t count_gparts = g_counts[ind_recv];
    const size_t count_sparts = s_counts[ind_recv];

    /* Loop over the gparts received from that node */
    for (size_t k = offset_gparts; k < offset_gparts + count_gparts; ++k) {

      /* Does this gpart have a gas partner ? */
      if (gparts_new[k].type == swift_type_gas) {

        const ptrdiff_t partner_index =
            offset_parts - gparts_new[k].id_or_neg_offset;

        /* Re-link */
        gparts_new[k].id_or_neg_offset = -partner_index;
        parts_new[partner_index].gpart = &gparts_new[k];
      }

      /* Does this gpart have a star partner ? */
      if (gparts_new[k].type == swift_type_star) {

        const ptrdiff_t partner_index =
            offset_sparts - gparts_new[k].id_or_neg_offset;

        /* Re-link */
        gparts_new[k].id_or_neg_offset = -partner_index;
        sparts_new[partner_index].gpart = &gparts_new[k];
      }
    }

    offset_parts += count_parts;
    offset_gparts += count_gparts;
    offset_sparts += count_sparts;
  }

#ifdef SWIFT_DEBUG_CHECKS
  /* Verify that all parts are in the right place. */
  for (size_t k = 0; k < nr_parts; k++) {
    const int cid = cell_getid(cdim, parts_new[k].x[0] * iwidth[0],
                               parts_new[k].x[1] * iwidth[1],
                               parts_new[k].x[2] * iwidth[2]);
    if (cells[cid].nodeID != nodeID)
      error("Received particle (%zu) that does not belong here (nodeID=%i).", k,
            cells[cid].nodeID);
  }
  for (size_t k = 0; k < nr_gparts; k++) {
    const int cid = cell_getid(cdim, gparts_new[k].x[0] * iwidth[0],
                               gparts_new[k].x[1] * iwidth[1],
                               gparts_new[k].x[2] * iwidth[2]);
    if (cells[cid].nodeID != nodeID)
      error("Received g-particle (%zu) that does not belong here (nodeID=%i).",
            k, cells[cid].nodeID);
  }
  for (size_t k = 0; k < nr_sparts; k++) {
    const int cid = cell_getid(cdim, sparts_new[k].x[0] * iwidth[0],
                               sparts_new[k].x[1] * iwidth[1],
                               sparts_new[k].x[2] * iwidth[2]);
    if (cells[cid].nodeID != nodeID)
      error("Received s-particle (%zu) that does not belong here (nodeID=%i).",
            k, cells[cid].nodeID);
  }

  /* Verify that the links are correct */
  part_verify_links(parts_new, gparts_new, sparts_new, nr_parts, nr_gparts,
                    nr_sparts, e->verbose);
#endif

  /* Set the new part data, free the old. */
  free(parts);
  free(xparts);
  free(gparts);
  free(sparts);
  s->parts = parts_new;
  s->xparts = xparts_new;
  s->gparts = gparts_new;
  s->sparts = sparts_new;
  s->nr_parts = nr_parts;
  s->nr_gparts = nr_gparts;
  s->nr_sparts = nr_sparts;
  s->size_parts = engine_redistribute_alloc_margin * nr_parts;
  s->size_gparts = engine_redistribute_alloc_margin * nr_gparts;
  s->size_sparts = engine_redistribute_alloc_margin * nr_sparts;

  /* Clean up the temporary stuff. */
  free(reqs);
  free(counts);
  free(g_counts);
  free(s_counts);
  free(dest);
  free(g_dest);
  free(s_dest);

  /* Be verbose about what just happened. */
  if (e->verbose) {
    int my_cells = 0;
    for (int k = 0; k < nr_cells; k++)
      if (cells[k].nodeID == nodeID) my_cells += 1;
    message("node %i now has %zu parts, %zu sparts and %zu gparts in %i cells.",
            nodeID, nr_parts, nr_sparts, nr_gparts, my_cells);
  }

  if (e->verbose)
    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
            clocks_getunit());
#else
  error("SWIFT was not compiled with MPI support.");
#endif
}

/**
 * @brief Repartition the cells amongst the nodes.
 *
 * @param e The #engine.
 */
void engine_repartition(struct engine *e) {

#if defined(WITH_MPI) && defined(HAVE_METIS)

  ticks tic = getticks();

#ifdef SWIFT_DEBUG_CHECKS
  /* Be verbose about this. */
  if (e->nodeID == 0 || e->verbose) message("repartitioning space");
  fflush(stdout);

  /* Check that all cells have been drifted to the current time */
  space_check_drift_point(e->s, e->ti_old,
                          e->policy & engine_policy_self_gravity);
#endif

  /* Clear the repartition flag. */
  e->forcerepart = 0;

  /* Nothing to do if only using a single node. Also avoids METIS
   * bug that doesn't handle this case well. */
  if (e->nr_nodes == 1) return;

  /* Do the repartitioning. */
  partition_repartition(e->reparttype, e->nodeID, e->nr_nodes, e->s,
                        e->sched.tasks, e->sched.nr_tasks);

  /* Now comes the tricky part: Exchange particles between all nodes.
     This is done in two steps, first allreducing a matrix of
     how many particles go from where to where, then re-allocating
     the parts array, and emitting the sends and receives.
     Finally, the space, tasks, and proxies need to be rebuilt. */

  /* Redistribute the particles between the nodes. */
  engine_redistribute(e);

  /* Make the proxies. */
  engine_makeproxies(e);

  /* Tell the engine it should re-build whenever possible */
  e->forcerebuild = 1;

  if (e->verbose)
    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
            clocks_getunit());
#else
  error("SWIFT was not compiled with MPI and METIS support.");
#endif
}

/**
 * @brief Decide whether to trigger a repartition of the cells amongst the nodes.
 *
 * @param e The #engine.
 */
void engine_repartition_trigger(struct engine *e) {

#ifdef WITH_MPI

  /* Do nothing if there have not been enough steps since the last
   * repartition, don't want to repeat this too often or immediately after
   * a repartition step. */
  if (e->step - e->last_repartition > 2) {

    /* Old style if trigger is >1 or this is the second step (want an early
     * repartition following the initial repartition). */
    if (e->reparttype->trigger > 1 || e->step == 2) {
      if (e->reparttype->trigger > 1) {
        if (e->step % (int)e->reparttype->trigger == 2) e->forcerepart = 1;
      } else {
        e->forcerepart = 1;
      }
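
      /* A trigger value greater than one is interpreted as a fixed step
       * interval; for trigger values of one or less the CPU-load based
       * criterion below is used instead. */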

    } else {

      /* Use cputimes from ranks to estimate the imbalance. */
      /* First check if we are going to skip this stage anyway, if so do that
       * now. It is only worth checking the CPU loads when we have processed a
       * significant number of all particles. */
      if ((e->updates > 1 &&
           e->updates >= e->total_nr_parts *