/*******************************************************************************
 * This file is part of SWIFT.
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
 *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
 *               2015 Peter W. Draper (p.w.draper@durham.ac.uk)
 *                    Angus Lepper (angus.lepper@ed.ac.uk)
 *               2016 John A. Regan (john.a.regan@durham.ac.uk)
 *                    Tom Theuns (tom.theuns@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/

/* Config parameters. */
#include "../config.h"

/* Some standard headers. */
#include <float.h>
#include <limits.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* MPI headers. */
#ifdef WITH_MPI
#include <mpi.h>
#endif

#ifdef HAVE_LIBNUMA
#include <numa.h>
#endif

/* This object's header. */
#include "engine.h"

/* Local headers. */
#include "active.h"
#include "atomic.h"
#include "cell.h"
#include "clocks.h"
#include "cycle.h"
#include "debug.h"
#include "error.h"
#include "gravity.h"
#include "hydro.h"
#include "minmax.h"
#include "parallel_io.h"
#include "part.h"
#include "partition.h"
#include "profiler.h"
#include "proxy.h"
#include "runner.h"
#include "serial_io.h"
#include "single_io.h"
#include "sort_part.h"
#include "statistics.h"
#include "timers.h"
#include "tools.h"
#include "units.h"
#include "version.h"

/* Particle cache size. */
#define CACHE_SIZE 512

const char *engine_policy_names[16] = {"none",
                                       "rand",
                                       "steal",
                                       "keep",
                                       "block",
                                       "cpu_tight",
                                       "mpi",
                                       "numa_affinity",
                                       "hydro",
                                       "self_gravity",
                                       "external_gravity",
                                       "cosmology_integration",
                                       "drift_all",
                                       "cooling",
                                       "sourceterms",
                                       "stars"};

/** The rank of the engine as a global variable (for messages). */
int engine_rank;

/**
 * @brief Link a density/force task to a cell.
 *
 * @param e The #engine.
 * @param l A pointer to the #link, will be modified atomically.
 * @param t The #task.
 */
void engine_addlink(struct engine *e, struct link **l, struct task *t) {

  /* Get the next free link. */
  const int ind = atomic_inc(&e->nr_links);
  if (ind >= e->size_links) {
    error("Link table overflow.");
  }
  struct link *res = &e->links[ind];

  /* Set it atomically. */
  res->t = t;
  res->next = atomic_swap(l, res);
}

/**
 * @brief Generate the hydro hierarchical tasks for a hierarchy of cells -
 * i.e. all the O(Npart) tasks.
 *
 * Tasks are only created here. The dependencies will be added later on.
 *
 * Note that there is no need to recurse below the super-cell.
 *
 * @param e The #engine.
 * @param c The #cell.
 */
void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {

  struct scheduler *s = &e->sched;
  const int is_hydro = (e->policy & engine_policy_hydro);
  const int is_self_gravity = (e->policy & engine_policy_self_gravity);
  const int is_with_cooling = (e->policy & engine_policy_cooling);
  const int is_with_sourceterms = (e->policy & engine_policy_sourceterms);

  /* Are we in a super-cell ? */
  if (c->super == c) {

    /* Local tasks only... */
    if (c->nodeID == e->nodeID) {

      /* Add the init task. */
      c->init = scheduler_addtask(s, task_type_init, task_subtype_none, 0, 0, c,
                                  NULL, 0);

      /* Add the two half kicks */
      c->kick1 = scheduler_addtask(s, task_type_kick1, task_subtype_none, 0, 0,
                                   c, NULL, 0);

      c->kick2 = scheduler_addtask(s, task_type_kick2, task_subtype_none, 0, 0,
                                   c, NULL, 0);

      /* Add the time-step calculation task and its dependency */
      c->timestep = scheduler_addtask(s, task_type_timestep, task_subtype_none,
                                      0, 0, c, NULL, 0);

      scheduler_addunlock(s, c->kick2, c->timestep);
      scheduler_addunlock(s, c->timestep, c->kick1);

      /* Add the drift task and its dependencies. */
      c->drift = scheduler_addtask(s, task_type_drift, task_subtype_none, 0, 0,
                                   c, NULL, 0);

      scheduler_addunlock(s, c->drift, c->init);

      if (is_self_gravity) {

        /* Gravity non-neighbouring pm calculations */
        c->grav_long_range = scheduler_addtask(
            s, task_type_grav_long_range, task_subtype_none, 0, 0, c, NULL, 0);

        /* Gravity top-level periodic calculation */
        c->grav_top_level = scheduler_addtask(
            s, task_type_grav_top_level, task_subtype_none, 0, 0, c, NULL, 0);

        /* Gravity recursive down-pass */
        c->grav_down = scheduler_addtask(s, task_type_grav_down,
                                         task_subtype_none, 0, 0, c, NULL, 0);

        scheduler_addunlock(s, c->init, c->grav_long_range);
        scheduler_addunlock(s, c->init, c->grav_top_level);
        scheduler_addunlock(s, c->grav_long_range, c->grav_down);
        scheduler_addunlock(s, c->grav_top_level, c->grav_down);
        scheduler_addunlock(s, c->grav_down, c->kick2);
      }

      /* Generate the ghost task. */
      if (is_hydro)
        c->ghost = scheduler_addtask(s, task_type_ghost, task_subtype_none, 0,
                                     0, c, NULL, 0);

#ifdef EXTRA_HYDRO_LOOP
      /* Generate the extra ghost task. */
      if (is_hydro)
        c->extra_ghost = scheduler_addtask(s, task_type_extra_ghost,
                                           task_subtype_none, 0, 0, c, NULL, 0);
#endif

      /* Cooling task */
      if (is_with_cooling) {
        c->cooling = scheduler_addtask(s, task_type_cooling, task_subtype_none,
                                       0, 0, c, NULL, 0);

        scheduler_addunlock(s, c->cooling, c->kick2);
      }

      /* add source terms */
      if (is_with_sourceterms) {
        c->sourceterms = scheduler_addtask(s, task_type_sourceterms,
                                           task_subtype_none, 0, 0, c, NULL, 0);
      }
    }

  } else { /* We are above the super-cell so need to go deeper */

#ifdef SWIFT_DEBUG_CHECKS
    if (c->super != NULL) error("Incorrectly set super pointer");
#endif

    /* Recurse. */
    if (c->split)
      for (int k = 0; k < 8; k++)
        if (c->progeny[k] != NULL)
          engine_make_hierarchical_tasks(e, c->progeny[k]);
  }
}

/**
 * @brief Redistribute the particles amongst the nodes according
 *      to their cell's node IDs.
 *
 * The strategy here is as follows:
 * 1) Each node counts the number of particles it has to send to each other
 * node.
 * 2) The number of particles of each type is then exchanged.
 * 3) The particles to send are placed in a temporary buffer in which the
 * part-gpart links are preserved.
 * 4) Each node allocates enough space for the new particles.
 * 5) (Asynchronous) communications are issued to transfer the data.
 *
 * @param e The #engine.
 */
void engine_redistribute(struct engine *e) {

#ifdef WITH_MPI

  const int nr_nodes = e->nr_nodes;
  const int nodeID = e->nodeID;
  struct space *s = e->s;
  struct cell *cells = s->cells_top;
  const int nr_cells = s->nr_cells;
  const int *cdim = s->cdim;
  const double iwidth[3] = {s->iwidth[0], s->iwidth[1], s->iwidth[2]};
  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
  struct part *parts = s->parts;
  struct xpart *xparts = s->xparts;
  struct gpart *gparts = s->gparts;
  struct spart *sparts = s->sparts;
  ticks tic = getticks();

  /* Allocate temporary arrays to store the counts of particles to be sent
     and the destination of each particle */
  int *counts, *g_counts, *s_counts;
  if ((counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
    error("Failed to allocate counts temporary buffer.");
  if ((g_counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
    error("Failed to allocate g_counts temporary buffer.");
  if ((s_counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
    error("Failed to allocate s_counts temporary buffer.");
  bzero(counts, sizeof(int) * nr_nodes * nr_nodes);
  bzero(g_counts, sizeof(int) * nr_nodes * nr_nodes);
  bzero(s_counts, sizeof(int) * nr_nodes * nr_nodes);

  /* Allocate the destination index arrays. */
  int *dest, *g_dest, *s_dest;
  if ((dest = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL)
    error("Failed to allocate dest temporary buffer.");
  if ((g_dest = (int *)malloc(sizeof(int) * s->nr_gparts)) == NULL)
    error("Failed to allocate g_dest temporary buffer.");
  if ((s_dest = (int *)malloc(sizeof(int) * s->nr_sparts)) == NULL)
    error("Failed to allocate s_dest temporary buffer.");

  /* Get destination of each particle */
  for (size_t k = 0; k < s->nr_parts; k++) {

    /* Periodic boundary conditions */
    for (int j = 0; j < 3; j++) {
      if (parts[k].x[j] < 0.0)
        parts[k].x[j] += dim[j];
      else if (parts[k].x[j] >= dim[j])
        parts[k].x[j] -= dim[j];
    }
    const int cid =
        cell_getid(cdim, parts[k].x[0] * iwidth[0], parts[k].x[1] * iwidth[1],
                   parts[k].x[2] * iwidth[2]);
#ifdef SWIFT_DEBUG_CHECKS
    if (cid < 0 || cid >= s->nr_cells)
      error("Bad cell id %i for part %zu at [%.3e,%.3e,%.3e].", cid, k,
            parts[k].x[0], parts[k].x[1], parts[k].x[2]);
#endif

    dest[k] = cells[cid].nodeID;

    /* The counts array is indexed as count[from * nr_nodes + to]. */
    counts[nodeID * nr_nodes + dest[k]] += 1;
  }

  /* Sort the particles according to their cell index. */
  if (s->nr_parts > 0)
    space_parts_sort(s, dest, s->nr_parts, 0, nr_nodes - 1, e->verbose);

#ifdef SWIFT_DEBUG_CHECKS
  /* Verify that the parts have been sorted correctly. */
  for (size_t k = 0; k < s->nr_parts; k++) {
    const struct part *p = &s->parts[k];

    /* New cell index */
    const int new_cid =
        cell_getid(s->cdim, p->x[0] * s->iwidth[0], p->x[1] * s->iwidth[1],
                   p->x[2] * s->iwidth[2]);

    /* New cell of this part */
    const struct cell *c = &s->cells_top[new_cid];
    const int new_node = c->nodeID;

    if (dest[k] != new_node)
      error("part's new node index not matching sorted index.");

    if (p->x[0] < c->loc[0] || p->x[0] > c->loc[0] + c->width[0] ||
        p->x[1] < c->loc[1] || p->x[1] > c->loc[1] + c->width[1] ||
        p->x[2] < c->loc[2] || p->x[2] > c->loc[2] + c->width[2])
      error("part not sorted into the right top-level cell!");
  }
#endif

  /* We need to re-link the gpart partners of parts. */
  if (s->nr_parts > 0) {
    int current_dest = dest[0];
    size_t count_this_dest = 0;
    for (size_t k = 0; k < s->nr_parts; ++k) {
      if (s->parts[k].gpart != NULL) {

        /* As the addresses will be invalidated by the communications, we will
         * instead store the absolute index from the start of the sub-array of
         * particles to be sent to a given node.
         * Recall that gparts without partners have a positive id.
         * We will restore the pointers on the receiving node later on. */
        if (dest[k] != current_dest) {
          current_dest = dest[k];
          count_this_dest = 0;
        }

#ifdef SWIFT_DEBUG_CHECKS
        if (s->parts[k].gpart->id_or_neg_offset > 0)
          error("Trying to link a partnerless gpart !");
#endif

        s->parts[k].gpart->id_or_neg_offset = -count_this_dest;
        count_this_dest++;
      }
    }
  }

  /* Get destination of each s-particle */
  for (size_t k = 0; k < s->nr_sparts; k++) {

    /* Periodic boundary conditions */
    for (int j = 0; j < 3; j++) {
      if (sparts[k].x[j] < 0.0)
        sparts[k].x[j] += dim[j];
      else if (sparts[k].x[j] >= dim[j])
        sparts[k].x[j] -= dim[j];
    }
    const int cid =
        cell_getid(cdim, sparts[k].x[0] * iwidth[0], sparts[k].x[1] * iwidth[1],
                   sparts[k].x[2] * iwidth[2]);
#ifdef SWIFT_DEBUG_CHECKS
    if (cid < 0 || cid >= s->nr_cells)
      error("Bad cell id %i for spart %zu at [%.3e,%.3e,%.3e].", cid, k,
            sparts[k].x[0], sparts[k].x[1], sparts[k].x[2]);
#endif

    s_dest[k] = cells[cid].nodeID;

    /* The counts array is indexed as count[from * nr_nodes + to]. */
    s_counts[nodeID * nr_nodes + s_dest[k]] += 1;
  }

  /* Sort the s-particles according to their cell index. */
  if (s->nr_sparts > 0)
    space_sparts_sort(s, s_dest, s->nr_sparts, 0, nr_nodes - 1, e->verbose);

#ifdef SWIFT_DEBUG_CHECKS
  /* Verify that the sparts have been sorted correctly. */
  for (size_t k = 0; k < s->nr_sparts; k++) {
    const struct spart *sp = &s->sparts[k];

    /* New cell index */
    const int new_cid =
        cell_getid(s->cdim, sp->x[0] * s->iwidth[0], sp->x[1] * s->iwidth[1],
                   sp->x[2] * s->iwidth[2]);

    /* New cell of this spart */
    const struct cell *c = &s->cells_top[new_cid];
    const int new_node = c->nodeID;

    if (s_dest[k] != new_node)
      error("spart's new node index not matching sorted index.");

    if (sp->x[0] < c->loc[0] || sp->x[0] > c->loc[0] + c->width[0] ||
        sp->x[1] < c->loc[1] || sp->x[1] > c->loc[1] + c->width[1] ||
        sp->x[2] < c->loc[2] || sp->x[2] > c->loc[2] + c->width[2])
      error("spart not sorted into the right top-level cell!");
  }
#endif

  /* We need to re-link the gpart partners of sparts. */
  if (s->nr_sparts > 0) {
    int current_dest = s_dest[0];
    size_t count_this_dest = 0;
    for (size_t k = 0; k < s->nr_sparts; ++k) {
      if (s->sparts[k].gpart != NULL) {

        /* As the addresses will be invalidated by the communications, we will
         * instead store the absolute index from the start of the sub-array of
         * particles to be sent to a given node.
         * Recall that gparts without partners have a positive id.
         * We will restore the pointers on the receiving node later on. */
        if (s_dest[k] != current_dest) {
          current_dest = s_dest[k];
          count_this_dest = 0;
        }

#ifdef SWIFT_DEBUG_CHECKS
        if (s->sparts[k].gpart->id_or_neg_offset > 0)
          error("Trying to link a partnerless gpart !");
#endif

        s->sparts[k].gpart->id_or_neg_offset = -count_this_dest;
        count_this_dest++;
      }
    }
  }

  /* Get destination of each g-particle */
  for (size_t k = 0; k < s->nr_gparts; k++) {

    /* Periodic boundary conditions */
    for (int j = 0; j < 3; j++) {
      if (gparts[k].x[j] < 0.0)
        gparts[k].x[j] += dim[j];
      else if (gparts[k].x[j] >= dim[j])
        gparts[k].x[j] -= dim[j];
    }
    const int cid =
        cell_getid(cdim, gparts[k].x[0] * iwidth[0], gparts[k].x[1] * iwidth[1],
                   gparts[k].x[2] * iwidth[2]);
#ifdef SWIFT_DEBUG_CHECKS
    if (cid < 0 || cid >= s->nr_cells)
      error("Bad cell id %i for gpart %zu at [%.3e,%.3e,%.3e].", cid, k,
            gparts[k].x[0], gparts[k].x[1], gparts[k].x[2]);
#endif

    g_dest[k] = cells[cid].nodeID;

    /* The counts array is indexed as count[from * nr_nodes + to]. */
    g_counts[nodeID * nr_nodes + g_dest[k]] += 1;
  }

  /* Sort the gparticles according to their cell index. */
  if (s->nr_gparts > 0)
    space_gparts_sort(s, g_dest, s->nr_gparts, 0, nr_nodes - 1, e->verbose);

#ifdef SWIFT_DEBUG_CHECKS
  /* Verify that the gparts have been sorted correctly. */
  for (size_t k = 0; k < s->nr_gparts; k++) {
    const struct gpart *gp = &s->gparts[k];

    /* New cell index */
    const int new_cid =
        cell_getid(s->cdim, gp->x[0] * s->iwidth[0], gp->x[1] * s->iwidth[1],
                   gp->x[2] * s->iwidth[2]);

    /* New cell of this gpart */
    const struct cell *c = &s->cells_top[new_cid];
    const int new_node = c->nodeID;

    if (g_dest[k] != new_node)
      error("gpart's new node index not matching sorted index.");

    if (gp->x[0] < c->loc[0] || gp->x[0] > c->loc[0] + c->width[0] ||
        gp->x[1] < c->loc[1] || gp->x[1] > c->loc[1] + c->width[1] ||
        gp->x[2] < c->loc[2] || gp->x[2] > c->loc[2] + c->width[2])
      error("gpart not sorted into the right top-level cell!");
  }
#endif

  /* Get all the counts from all the nodes. */
  if (MPI_Allreduce(MPI_IN_PLACE, counts, nr_nodes * nr_nodes, MPI_INT, MPI_SUM,
                    MPI_COMM_WORLD) != MPI_SUCCESS)
    error("Failed to allreduce particle transfer counts.");

  /* Get all the g_counts from all the nodes. */
  if (MPI_Allreduce(MPI_IN_PLACE, g_counts, nr_nodes * nr_nodes, MPI_INT,
                    MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS)
    error("Failed to allreduce gparticle transfer counts.");

  /* Get all the s_counts from all the nodes. */
  if (MPI_Allreduce(MPI_IN_PLACE, s_counts, nr_nodes * nr_nodes, MPI_INT,
                    MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS)
    error("Failed to allreduce sparticle transfer counts.");
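
  /* After the allreduces every rank holds the complete (from x to) count
     matrices; column nodeID of each matrix is what this rank will receive. */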

  /* Report how many particles will be moved. */
  if (e->verbose) {
    if (e->nodeID == 0) {
      size_t total = 0, g_total = 0, s_total = 0;
      size_t unmoved = 0, g_unmoved = 0, s_unmoved = 0;
      for (int p = 0, r = 0; p < nr_nodes; p++) {
        for (int s = 0; s < nr_nodes; s++) {
          total += counts[r];
          g_total += g_counts[r];
          s_total += s_counts[r];
          if (p == s) {
            unmoved += counts[r];
            g_unmoved += g_counts[r];
            s_unmoved += s_counts[r];
          }
          r++;
        }
      }
      if (total > 0)
        message("%zu of %zu (%.2f%%) of particles moved", total - unmoved,
                total, 100.0 * (double)(total - unmoved) / (double)total);
      if (g_total > 0)
        message("%zu of %zu (%.2f%%) of g-particles moved", g_total - g_unmoved,
                g_total,
                100.0 * (double)(g_total - g_unmoved) / (double)g_total);
      if (s_total > 0)
        message("%zu of %zu (%.2f%%) of s-particles moved", s_total - s_unmoved,
                s_total,
                100.0 * (double)(s_total - s_unmoved) / (double)s_total);
    }
  }

  /* Each node knows how many parts, sparts and gparts will be transferred
     to every other node. We can start preparing to receive data */

  /* Get the new number of parts, gparts and sparts for this node */
  size_t nr_parts = 0, nr_gparts = 0, nr_sparts = 0;
  for (int k = 0; k < nr_nodes; k++) nr_parts += counts[k * nr_nodes + nodeID];
  for (int k = 0; k < nr_nodes; k++)
    nr_gparts += g_counts[k * nr_nodes + nodeID];
  for (int k = 0; k < nr_nodes; k++)
    nr_sparts += s_counts[k * nr_nodes + nodeID];

  /* Allocate the new arrays with some extra margin */
  struct part *parts_new = NULL;
  struct xpart *xparts_new = NULL;
  struct gpart *gparts_new = NULL;
  struct spart *sparts_new = NULL;
  if (posix_memalign((void **)&parts_new, part_align,
                     sizeof(struct part) * nr_parts *
                         engine_redistribute_alloc_margin) != 0)
    error("Failed to allocate new part data.");
  if (posix_memalign((void **)&xparts_new, xpart_align,
                     sizeof(struct xpart) * nr_parts *
                         engine_redistribute_alloc_margin) != 0)
    error("Failed to allocate new xpart data.");
  if (posix_memalign((void **)&gparts_new, gpart_align,
                     sizeof(struct gpart) * nr_gparts *
                         engine_redistribute_alloc_margin) != 0)
    error("Failed to allocate new gpart data.");
  if (posix_memalign((void **)&sparts_new, spart_align,
                     sizeof(struct spart) * nr_sparts *
                         engine_redistribute_alloc_margin) != 0)
    error("Failed to allocate new spart data.");

  /* Prepare MPI requests for the asynchronous communications */
  MPI_Request *reqs;
  if ((reqs = (MPI_Request *)malloc(sizeof(MPI_Request) * 8 * nr_nodes)) ==
      NULL)
    error("Failed to allocate MPI request list.");
  for (int k = 0; k < 8 * nr_nodes; k++) reqs[k] = MPI_REQUEST_NULL;
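  /* Each node k gets eight request slots: reqs[8 * k + 0..3] for the isends
     of parts, xparts, gparts and sparts, and reqs[8 * k + 4..7] for the
     matching irecvs. The message tags follow the same order, 4 * ind + 0..3. */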

  /* Emit the sends and recvs for the particle, gparticle and sparticle data. */
  size_t offset_send = 0, offset_recv = 0;
  size_t g_offset_send = 0, g_offset_recv = 0;
  size_t s_offset_send = 0, s_offset_recv = 0;
  for (int k = 0; k < nr_nodes; k++) {

    /* Indices in the count arrays of the node of interest */
    const int ind_send = nodeID * nr_nodes + k;
    const int ind_recv = k * nr_nodes + nodeID;

    /* Are we sending any part/xpart ? */
    if (counts[ind_send] > 0) {

      /* message("Sending %d part to node %d", counts[ind_send], k); */

      /* If the send is to the same node, just copy */
      if (k == nodeID) {
        memcpy(&parts_new[offset_recv], &s->parts[offset_send],
               sizeof(struct part) * counts[ind_recv]);
        memcpy(&xparts_new[offset_recv], &s->xparts[offset_send],
               sizeof(struct xpart) * counts[ind_recv]);
        offset_send += counts[ind_send];
        offset_recv += counts[ind_recv];

        /* Else, emit some communications */
      } else {
        if (MPI_Isend(&s->parts[offset_send], counts[ind_send], part_mpi_type,
                      k, 4 * ind_send + 0, MPI_COMM_WORLD,
                      &reqs[8 * k + 0]) != MPI_SUCCESS)
          error("Failed to isend parts to node %i.", k);
        if (MPI_Isend(&s->xparts[offset_send], counts[ind_send], xpart_mpi_type,
                      k, 4 * ind_send + 1, MPI_COMM_WORLD,
                      &reqs[8 * k + 1]) != MPI_SUCCESS)
          error("Failed to isend xparts to node %i.", k);
        offset_send += counts[ind_send];
      }
    }

    /* Are we sending any gpart ? */
    if (g_counts[ind_send] > 0) {

      /* message("Sending %d gpart to node %d", g_counts[ind_send], k); */

      /* If the send is to the same node, just copy */
      if (k == nodeID) {
        memcpy(&gparts_new[g_offset_recv], &s->gparts[g_offset_send],
               sizeof(struct gpart) * g_counts[ind_recv]);
        g_offset_send += g_counts[ind_send];
        g_offset_recv += g_counts[ind_recv];

        /* Else, emit some communications */
      } else {
        if (MPI_Isend(&s->gparts[g_offset_send], g_counts[ind_send],
                      gpart_mpi_type, k, 4 * ind_send + 2, MPI_COMM_WORLD,
                      &reqs[8 * k + 2]) != MPI_SUCCESS)
          error("Failed to isend gparts to node %i.", k);
        g_offset_send += g_counts[ind_send];
      }
    }

    /* Are we sending any spart ? */
    if (s_counts[ind_send] > 0) {

      /* message("Sending %d spart to node %d", s_counts[ind_send], k); */

      /* If the send is to the same node, just copy */
      if (k == nodeID) {
        memcpy(&sparts_new[s_offset_recv], &s->sparts[s_offset_send],
               sizeof(struct spart) * s_counts[ind_recv]);
        s_offset_send += s_counts[ind_send];
        s_offset_recv += s_counts[ind_recv];

        /* Else, emit some communications */
      } else {
        if (MPI_Isend(&s->sparts[s_offset_send], s_counts[ind_send],
                      spart_mpi_type, k, 4 * ind_send + 3, MPI_COMM_WORLD,
                      &reqs[8 * k + 3]) != MPI_SUCCESS)
          error("Failed to isend sparts to node %i.", k);
        s_offset_send += s_counts[ind_send];
      }
    }

    /* Now emit the corresponding Irecv() */

    /* Are we receiving any part/xpart from this node ? */
    if (k != nodeID && counts[ind_recv] > 0) {
      if (MPI_Irecv(&parts_new[offset_recv], counts[ind_recv], part_mpi_type, k,
                    4 * ind_recv + 0, MPI_COMM_WORLD,
                    &reqs[8 * k + 4]) != MPI_SUCCESS)
        error("Failed to emit irecv of parts from node %i.", k);
      if (MPI_Irecv(&xparts_new[offset_recv], counts[ind_recv], xpart_mpi_type,
                    k, 4 * ind_recv + 1, MPI_COMM_WORLD,
                    &reqs[8 * k + 5]) != MPI_SUCCESS)
        error("Failed to emit irecv of xparts from node %i.", k);
      offset_recv += counts[ind_recv];
    }

    /* Are we receiving any gpart from this node ? */
    if (k != nodeID && g_counts[ind_recv] > 0) {
      if (MPI_Irecv(&gparts_new[g_offset_recv], g_counts[ind_recv],
                    gpart_mpi_type, k, 4 * ind_recv + 2, MPI_COMM_WORLD,
                    &reqs[8 * k + 6]) != MPI_SUCCESS)
        error("Failed to emit irecv of gparts from node %i.", k);
      g_offset_recv += g_counts[ind_recv];
    }

    /* Are we receiving any spart from this node ? */
    if (k != nodeID && s_counts[ind_recv] > 0) {
      if (MPI_Irecv(&sparts_new[s_offset_recv], s_counts[ind_recv],
                    spart_mpi_type, k, 4 * ind_recv + 3, MPI_COMM_WORLD,
                    &reqs[8 * k + 7]) != MPI_SUCCESS)
        error("Failed to emit irecv of sparts from node %i.", k);
      s_offset_recv += s_counts[ind_recv];
    }
  }

  /* Wait for all the sends and recvs to tumble in. */
  MPI_Status stats[8 * nr_nodes];
  int res;
  if ((res = MPI_Waitall(8 * nr_nodes, reqs, stats)) != MPI_SUCCESS) {
    for (int k = 0; k < 8 * nr_nodes; k++) {
      char buff[MPI_MAX_ERROR_STRING];
      MPI_Error_string(stats[k].MPI_ERROR, buff, &res);
      message("request %i has error '%s'.", k, buff);
    }
    error("Failed during waitall for part data.");
  }

  /* All particles have now arrived. Time for some final operations on the
     stuff we just received */

  /* Restore the part<->gpart and spart<->gpart links */
  size_t offset_parts = 0, offset_sparts = 0, offset_gparts = 0;
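
  /* Earlier we overwrote each linked gpart's id_or_neg_offset with minus its
     position in the sub-array sent to its destination node, so adding the
     running offset of that node's received block recovers the partner's index
     in the new arrays. */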
  for (int node = 0; node < nr_nodes; ++node) {

    const int ind_recv = node * nr_nodes + nodeID;
    const size_t count_parts = counts[ind_recv];
    const size_t count_gparts = g_counts[ind_recv];
    const size_t count_sparts = s_counts[ind_recv];

    /* Loop over the gparts received from that node */
    for (size_t k = offset_gparts; k < offset_gparts + count_gparts; ++k) {

      /* Does this gpart have a gas partner ? */
      if (gparts_new[k].type == swift_type_gas) {

        const ptrdiff_t partner_index =
            offset_parts - gparts_new[k].id_or_neg_offset;

        /* Re-link */
        gparts_new[k].id_or_neg_offset = -partner_index;
        parts_new[partner_index].gpart = &gparts_new[k];
      }

      /* Does this gpart have a star partner ? */
      if (gparts_new[k].type == swift_type_star) {

        const ptrdiff_t partner_index =
            offset_sparts - gparts_new[k].id_or_neg_offset;

        /* Re-link */
        gparts_new[k].id_or_neg_offset = -partner_index;
        sparts_new[partner_index].gpart = &gparts_new[k];
      }
    }

    offset_parts += count_parts;
    offset_gparts += count_gparts;
    offset_sparts += count_sparts;
  }

#ifdef SWIFT_DEBUG_CHECKS
  /* Verify that all parts are in the right place. */
  for (size_t k = 0; k < nr_parts; k++) {
    const int cid = cell_getid(cdim, parts_new[k].x[0] * iwidth[0],
                               parts_new[k].x[1] * iwidth[1],
                               parts_new[k].x[2] * iwidth[2]);
    if (cells[cid].nodeID != nodeID)
      error("Received particle (%zu) that does not belong here (nodeID=%i).", k,
            cells[cid].nodeID);
  }
  for (size_t k = 0; k < nr_gparts; k++) {
    const int cid = cell_getid(cdim, gparts_new[k].x[0] * iwidth[0],
                               gparts_new[k].x[1] * iwidth[1],
                               gparts_new[k].x[2] * iwidth[2]);
    if (cells[cid].nodeID != nodeID)
      error("Received g-particle (%zu) that does not belong here (nodeID=%i).",
            k, cells[cid].nodeID);
  }
  for (size_t k = 0; k < nr_sparts; k++) {
    const int cid = cell_getid(cdim, sparts_new[k].x[0] * iwidth[0],
                               sparts_new[k].x[1] * iwidth[1],
                               sparts_new[k].x[2] * iwidth[2]);
    if (cells[cid].nodeID != nodeID)
      error("Received s-particle (%zu) that does not belong here (nodeID=%i).",
            k, cells[cid].nodeID);
  }

  /* Verify that the links are correct */
  part_verify_links(parts_new, gparts_new, sparts_new, nr_parts, nr_gparts,
                    nr_sparts, e->verbose);
#endif

  /* Set the new part data, free the old. */
  free(parts);
  free(xparts);
  free(gparts);
  free(sparts);
  s->parts = parts_new;
  s->xparts = xparts_new;
  s->gparts = gparts_new;
  s->sparts = sparts_new;
  s->nr_parts = nr_parts;
  s->nr_gparts = nr_gparts;
  s->nr_sparts = nr_sparts;
  s->size_parts = engine_redistribute_alloc_margin * nr_parts;
  s->size_gparts = engine_redistribute_alloc_margin * nr_gparts;
  s->size_sparts = engine_redistribute_alloc_margin * nr_sparts;

  /* Clean up the temporary stuff. */
  free(reqs);
  free(counts);
  free(g_counts);
  free(s_counts);
  free(dest);
  free(g_dest);
  free(s_dest);

  /* Be verbose about what just happened. */
  if (e->verbose) {
    int my_cells = 0;
    for (int k = 0; k < nr_cells; k++)
      if (cells[k].nodeID == nodeID) my_cells += 1;
    message("node %i now has %zu parts, %zu sparts and %zu gparts in %i cells.",
            nodeID, nr_parts, nr_sparts, nr_gparts, my_cells);
  }

  if (e->verbose)
    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
            clocks_getunit());
#else
  error("SWIFT was not compiled with MPI support.");
#endif
}

/**
 * @brief Repartition the cells amongst the nodes.
 *
 * @param e The #engine.
 */
void engine_repartition(struct engine *e) {

#if defined(WITH_MPI) && defined(HAVE_METIS)

  ticks tic = getticks();

#ifdef SWIFT_DEBUG_CHECKS
  /* Be verbose about this. */
  if (e->nodeID == 0 || e->verbose) message("repartitioning space");
  fflush(stdout);

  /* Check that all cells have been drifted to the current time */
  space_check_drift_point(e->s, e->ti_old,
                          e->policy & engine_policy_self_gravity);
#endif

  /* Clear the repartition flag. */
  e->forcerepart = 0;

  /* Nothing to do if only using a single node. Also avoids METIS
   * bug that doesn't handle this case well. */
  if (e->nr_nodes == 1) return;

  /* Do the repartitioning. */
  partition_repartition(e->reparttype, e->nodeID, e->nr_nodes, e->s,
                        e->sched.tasks, e->sched.nr_tasks);

  /* Now comes the tricky part: Exchange particles between all nodes.
     This is done in two steps, first allreducing a matrix of
     how many particles go from where to where, then re-allocating
     the parts array, and emitting the sends and receives.
     Finally, the space, tasks, and proxies need to be rebuilt. */

  /* Redistribute the particles between the nodes. */
  engine_redistribute(e);

  /* Make the proxies. */
  engine_makeproxies(e);

  /* Tell the engine it should re-build whenever possible */
  e->forcerebuild = 1;

  if (e->verbose)
    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
            clocks_getunit());
#else
  error("SWIFT was not compiled with MPI and METIS support.");
#endif
}

/**
 * @brief Decide whether to trigger a repartition of the cells amongst the nodes.
 *
 * @param e The #engine.
 */
void engine_repartition_trigger(struct engine *e) {

#ifdef WITH_MPI

  /* Do nothing if there have not been enough steps since the last
   * repartition, don't want to repeat this too often or immediately after
   * a repartition step. */
  if (e->step - e->last_repartition > 2) {

    /* Old style if trigger is >1 or this is the second step (want an early
     * repartition following the initial repartition). */
    if (e->reparttype->trigger > 1 || e->step == 2) {
      if (e->reparttype->trigger > 1) {
        if (e->step % (int)e->reparttype->trigger == 2) e->forcerepart = 1;
      } else {
        e->forcerepart = 1;
      }
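      /* With trigger > 1 this forces a repartition every trigger-th step
         (whenever step % trigger == 2), keeping the same phase as the early
         repartition at step 2. */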

    } else {

      /* Use cputimes from ranks to estimate the imbalance. */
      /* First check if we are going to skip this stage anyway, if so do that
       * now. It is only worth checking the CPU loads when we have processed a
       * significant number of all particles. */