/*******************************************************************************
 * This file is part of SWIFT.
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
 *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
 *               2015 Peter W. Draper (p.w.draper@durham.ac.uk)
 *                    Angus Lepper (angus.lepper@ed.ac.uk)
 *               2016 John A. Regan (john.a.regan@durham.ac.uk)
 *                    Tom Theuns (tom.theuns@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/

/* Config parameters. */
#include "../config.h"

/* Some standard headers. */
#include <float.h>
#include <limits.h>
#include <sched.h>
32
#include <stdbool.h>
33
34
35
36
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
Pedro Gonnet's avatar
Pedro Gonnet committed
37

38
39
/* MPI headers. */
#ifdef WITH_MPI
40
#include <mpi.h>
41
42
#endif

Angus Lepper's avatar
Angus Lepper committed
43
44
#ifdef HAVE_LIBNUMA
#include <numa.h>
45
46
#endif

47
/* This object's header. */
Pedro Gonnet's avatar
Pedro Gonnet committed
48
#include "engine.h"
49
50

/* Local headers. */
51
#include "atomic.h"
52
#include "cell.h"
53
#include "clocks.h"
54
55
#include "cycle.h"
#include "debug.h"
56
#include "error.h"
57
#include "hydro.h"
58
#include "minmax.h"
59
#include "parallel_io.h"
60
#include "part.h"
61
#include "partition.h"
62
63
#include "proxy.h"
#include "runner.h"
64
65
#include "serial_io.h"
#include "single_io.h"
66
#include "timers.h"
67
#include "units.h"
Pedro Gonnet's avatar
Pedro Gonnet committed
68

69
70
71
72
73
74
75
76
77
78
79
80
81
/* Printable names of the engine policies, used in messages and diagnostics.
 * NOTE(review): the order and the count (13) must stay in sync with the
 * engine_policy flag enumeration declared in engine.h — confirm when adding
 * a new policy. */
const char *engine_policy_names[13] = {"none",
                                       "rand",
                                       "steal",
                                       "keep",
                                       "block",
                                       "fix_dt",
                                       "cpu_tight",
                                       "mpi",
                                       "numa_affinity",
                                       "hydro",
                                       "self_gravity",
                                       "external_gravity",
                                       "cosmology_integration"};
Pedro Gonnet's avatar
Pedro Gonnet committed
82

83
84
85
/** The rank of the engine as a global variable (for messages). */
int engine_rank;

#ifdef HAVE_SETAFFINITY
/** The initial affinity of the main thread (saved by engine_pin() so that
 * worker threads can be pinned relative to it). */
static cpu_set_t entry_affinity;
#endif

91
92
93
94
95
96
97
98
99
100
/**
 * @brief Link a density/force task to a cell.
 *
 * @param e The #engine.
 * @param l The #link.
 * @param t The #task.
 *
 * @return The new #link pointer.
 */

101
struct link *engine_addlink(struct engine *e, struct link *l, struct task *t) {
102

103
104
105
106
107
  const int ind = atomic_inc(&e->nr_links);
  if (ind >= e->size_links) {
    error("Link table overflow.");
  }
  struct link *res = &e->links[ind];
108
109
110
111
  res->next = l;
  res->t = t;
  return res;
}
112
113

/**
Matthieu Schaller's avatar
Matthieu Schaller committed
114
115
 * @brief Generate the hierarchical tasks for a hierarchy of cells - i.e. all
 * the O(Npart) tasks.
Matthieu Schaller's avatar
Matthieu Schaller committed
116
117
 *
 * Tasks are only created here. The dependencies will be added later on.
118
119
120
121
122
 *
 * @param e The #engine.
 * @param c The #cell.
 * @param super The super #cell.
 */
123
void engine_make_hierarchical_tasks(struct engine *e, struct cell *c,
Matthieu Schaller's avatar
Matthieu Schaller committed
124
                                    struct cell *super) {
125
126

  struct scheduler *s = &e->sched;
127
128
129
  const int is_with_external_gravity =
      (e->policy & engine_policy_external_gravity) ==
      engine_policy_external_gravity;
130
  const int is_fixdt = (e->policy & engine_policy_fixdt) == engine_policy_fixdt;
131
132

  /* Am I the super-cell? */
133
  if (super == NULL && (c->count > 0 || c->gcount > 0)) {
134
135
136
137
138
139
140

    /* Remember me. */
    super = c;

    /* Local tasks only... */
    if (c->nodeID == e->nodeID) {

Matthieu Schaller's avatar
Matthieu Schaller committed
141
      /* Add the init task. */
Matthieu Schaller's avatar
Matthieu Schaller committed
142
143
      c->init = scheduler_addtask(s, task_type_init, task_subtype_none, 0, 0, c,
                                  NULL, 0);
144

Matthieu Schaller's avatar
Matthieu Schaller committed
145
146
147
      /* Add the drift task. */
      c->drift = scheduler_addtask(s, task_type_drift, task_subtype_none, 0, 0,
                                   c, NULL, 0);
148

149
150
151
152
153
154
155
156
      /* Add the kick task that matches the policy. */
      if (is_fixdt) {
        c->kick = scheduler_addtask(s, task_type_kick_fixdt, task_subtype_none,
                                    0, 0, c, NULL, 0);
      } else {
        c->kick = scheduler_addtask(s, task_type_kick, task_subtype_none, 0, 0,
                                    c, NULL, 0);
      }
Tom Theuns's avatar
Tom Theuns committed
157

158
      if (c->count > 0) {
159

160
161
162
163
        /* Generate the ghost task. */
        c->ghost = scheduler_addtask(s, task_type_ghost, task_subtype_none, 0,
                                     0, c, NULL, 0);
      }
164

165
166
167
168
169
170
      if (c->gcount > 0) {

        /* Add the external gravity tasks */
        if (is_with_external_gravity)
          c->grav_external = scheduler_addtask(
              s, task_type_grav_external, task_subtype_none, 0, 0, c, NULL, 0);
171
      }
172
    }
173
  }
174

175
176
177
178
179
  /* Set the super-cell. */
  c->super = super;

  /* Recurse. */
  if (c->split)
180
    for (int k = 0; k < 8; k++)
181
      if (c->progeny[k] != NULL)
Matthieu Schaller's avatar
Matthieu Schaller committed
182
        engine_make_hierarchical_tasks(e, c->progeny[k], super);
183
}
184

185
/**
 * @brief Redistribute the particles amongst the nodes according
 *      to their cell's node IDs.
 *
 * The strategy here is as follows:
 * 1) Each node counts the number of particles it has to send to each other
 * node.
 * 2) The number of particles of each type is then exchanged.
 * 3) The particles to send are placed in a temporary buffer in which the
 * part-gpart links are preserved.
 * 4) Each node allocates enough space for the new particles.
 * 5) (Asynchronous) communications are issued to transfer the data.
 *
 * @param e The #engine.
 */
void engine_redistribute(struct engine *e) {

#ifdef WITH_MPI

  const int nr_nodes = e->nr_nodes;
  const int nodeID = e->nodeID;
  struct space *s = e->s;
  struct cell *cells = s->cells;
  const int nr_cells = s->nr_cells;
  const int *cdim = s->cdim;
  const double ih[3] = {s->ih[0], s->ih[1], s->ih[2]};
  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
  struct part *parts = s->parts;
  struct xpart *xparts = s->xparts;
  struct gpart *gparts = s->gparts;
  ticks tic = getticks();

  /* Allocate temporary arrays to store the counts of particles to be sent
     and the destination of each particle */
  int *counts, *g_counts;
  if ((counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
    error("Failed to allocate count temporary buffer.");
  if ((g_counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
    error("Failed to allocate gcount temporary buffer.");
  bzero(counts, sizeof(int) * nr_nodes * nr_nodes);
  bzero(g_counts, sizeof(int) * nr_nodes * nr_nodes);

  /* Allocate the destination index arrays. */
  int *dest, *g_dest;
  if ((dest = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL)
    error("Failed to allocate dest temporary buffer.");
  if ((g_dest = (int *)malloc(sizeof(int) * s->nr_gparts)) == NULL)
    error("Failed to allocate g_dest temporary buffer.");

  /* Get destination of each particle */
  for (size_t k = 0; k < s->nr_parts; k++) {

    /* Periodic boundary conditions */
    for (int j = 0; j < 3; j++) {
      if (parts[k].x[j] < 0.0)
        parts[k].x[j] += dim[j];
      else if (parts[k].x[j] >= dim[j])
        parts[k].x[j] -= dim[j];
    }
    const int cid = cell_getid(cdim, parts[k].x[0] * ih[0],
                               parts[k].x[1] * ih[1], parts[k].x[2] * ih[2]);
#ifdef SWIFT_DEBUG_CHECKS
    if (cid < 0 || cid >= s->nr_cells)
      error("Bad cell id %i for part %zi at [%.3e,%.3e,%.3e].", cid, k,
            parts[k].x[0], parts[k].x[1], parts[k].x[2]);
#endif

    dest[k] = cells[cid].nodeID;

    /* The counts array is indexed as count[from * nr_nodes + to]. */
    counts[nodeID * nr_nodes + dest[k]] += 1;
  }

  /* Sort the particles according to their cell index. */
  space_parts_sort(s, dest, s->nr_parts, 0, nr_nodes - 1, e->verbose);

  /* We need to re-link the gpart partners of parts. */
  if (s->nr_parts > 0) {
    int current_dest = dest[0];
    size_t count_this_dest = 0;
    for (size_t k = 0; k < s->nr_parts; ++k) {
      if (s->parts[k].gpart != NULL) {

        /* As the addresses will be invalidated by the communications, we will
         * instead store the absolute index from the start of the sub-array of
         * particles to be sent to a given node.
         * Recall that gparts without partners have a negative id.
         * We will restore the pointers on the receiving node later on. */
        if (dest[k] != current_dest) {
          current_dest = dest[k];
          count_this_dest = 0;
        }

#ifdef SWIFT_DEBUG_CHECKS
        if (s->parts[k].gpart->id_or_neg_offset >= 0)
          error("Trying to link a partnerless gpart !");
#endif

        s->parts[k].gpart->id_or_neg_offset = -count_this_dest;
        count_this_dest++;
      }
    }
  }

  /* Get destination of each g-particle */
  for (size_t k = 0; k < s->nr_gparts; k++) {

    /* Periodic boundary conditions */
    for (int j = 0; j < 3; j++) {
      if (gparts[k].x[j] < 0.0)
        gparts[k].x[j] += dim[j];
      else if (gparts[k].x[j] >= dim[j])
        gparts[k].x[j] -= dim[j];
    }
    const int cid = cell_getid(cdim, gparts[k].x[0] * ih[0],
                               gparts[k].x[1] * ih[1], gparts[k].x[2] * ih[2]);
#ifdef SWIFT_DEBUG_CHECKS
    if (cid < 0 || cid >= s->nr_cells)
      error("Bad cell id %i for part %zi at [%.3e,%.3e,%.3e].", cid, k,
            gparts[k].x[0], gparts[k].x[1], gparts[k].x[2]);
#endif

    g_dest[k] = cells[cid].nodeID;

    /* The counts array is indexed as count[from * nr_nodes + to]. */
    g_counts[nodeID * nr_nodes + g_dest[k]] += 1;
  }

  /* Sort the gparticles according to their cell index. */
  space_gparts_sort(s, g_dest, s->nr_gparts, 0, nr_nodes - 1, e->verbose);

  /* Get all the counts from all the nodes. */
  if (MPI_Allreduce(MPI_IN_PLACE, counts, nr_nodes * nr_nodes, MPI_INT, MPI_SUM,
                    MPI_COMM_WORLD) != MPI_SUCCESS)
    error("Failed to allreduce particle transfer counts.");

  /* Get all the g_counts from all the nodes. */
  if (MPI_Allreduce(MPI_IN_PLACE, g_counts, nr_nodes * nr_nodes, MPI_INT,
                    MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS)
    error("Failed to allreduce gparticle transfer counts.");

  /* Each node knows how many parts and gparts will be transferred to every
     other node. We can start preparing to receive data */

  /* Get the new number of parts and gparts for this node */
  size_t nr_parts = 0, nr_gparts = 0;
  for (int k = 0; k < nr_nodes; k++) nr_parts += counts[k * nr_nodes + nodeID];
  for (int k = 0; k < nr_nodes; k++)
    nr_gparts += g_counts[k * nr_nodes + nodeID];

  /* Allocate the new arrays with some extra margin */
  struct part *parts_new = NULL;
  struct xpart *xparts_new = NULL;
  struct gpart *gparts_new = NULL;
  if (posix_memalign((void **)&parts_new, part_align,
                     sizeof(struct part) * nr_parts *
                         engine_redistribute_alloc_margin) != 0)
    error("Failed to allocate new part data.");
  if (posix_memalign((void **)&xparts_new, xpart_align,
                     sizeof(struct xpart) * nr_parts *
                         engine_redistribute_alloc_margin) != 0)
    error("Failed to allocate new xpart data.");
  if (posix_memalign((void **)&gparts_new, gpart_align,
                     sizeof(struct gpart) * nr_gparts *
                         engine_redistribute_alloc_margin) != 0)
    error("Failed to allocate new gpart data.");

  /* Prepare MPI requests for the asynchronous communications.
     Slots [6k..6k+2] are sends to node k, [6k+3..6k+5] receives from k. */
  MPI_Request *reqs;
  if ((reqs = (MPI_Request *)malloc(sizeof(MPI_Request) * 6 * nr_nodes)) ==
      NULL)
    error("Failed to allocate MPI request list.");
  for (int k = 0; k < 6 * nr_nodes; k++) reqs[k] = MPI_REQUEST_NULL;

  /* Emit the sends and recvs for the particle and gparticle data. */
  size_t offset_send = 0, offset_recv = 0;
  size_t g_offset_send = 0, g_offset_recv = 0;
  for (int k = 0; k < nr_nodes; k++) {

    /* Indices in the count arrays of the node of interest */
    const int ind_send = nodeID * nr_nodes + k;
    const int ind_recv = k * nr_nodes + nodeID;

    /* Are we sending any part/xpart ? */
    if (counts[ind_send] > 0) {

      /* message("Sending %d part to node %d", counts[ind_send], k); */

      /* If the send is to the same node, just copy */
      if (k == nodeID) {
        memcpy(&parts_new[offset_recv], &s->parts[offset_send],
               sizeof(struct part) * counts[ind_recv]);
        memcpy(&xparts_new[offset_recv], &s->xparts[offset_send],
               sizeof(struct xpart) * counts[ind_recv]);
        offset_send += counts[ind_send];
        offset_recv += counts[ind_recv];

        /* Else, emit some communications */
      } else {
        if (MPI_Isend(&s->parts[offset_send], counts[ind_send], part_mpi_type,
                      k, 3 * ind_send + 0, MPI_COMM_WORLD,
                      &reqs[6 * k]) != MPI_SUCCESS)
          error("Failed to isend parts to node %i.", k);
        if (MPI_Isend(&s->xparts[offset_send], counts[ind_send], xpart_mpi_type,
                      k, 3 * ind_send + 1, MPI_COMM_WORLD,
                      &reqs[6 * k + 1]) != MPI_SUCCESS)
          error("Failed to isend xparts to node %i.", k);
        offset_send += counts[ind_send];
      }
    }

    /* Are we sending any gpart ? */
    if (g_counts[ind_send] > 0) {

      /* message("Sending %d gpart to node %d", g_counts[ind_send], k); */

      /* If the send is to the same node, just copy */
      if (k == nodeID) {
        memcpy(&gparts_new[g_offset_recv], &s->gparts[g_offset_send],
               sizeof(struct gpart) * g_counts[ind_recv]);
        g_offset_send += g_counts[ind_send];
        g_offset_recv += g_counts[ind_recv];

        /* Else, emit some communications */
      } else {
        if (MPI_Isend(&s->gparts[g_offset_send], g_counts[ind_send],
                      gpart_mpi_type, k, 3 * ind_send + 2, MPI_COMM_WORLD,
                      &reqs[6 * k + 2]) != MPI_SUCCESS)
          error("Failed to isend gparts to node %i.", k);
        g_offset_send += g_counts[ind_send];
      }
    }

    /* Now emit the corresponding Irecv() */

    /* Are we receiving any part/xpart from this node ? */
    if (k != nodeID && counts[ind_recv] > 0) {
      if (MPI_Irecv(&parts_new[offset_recv], counts[ind_recv], part_mpi_type, k,
                    3 * ind_recv + 0, MPI_COMM_WORLD,
                    &reqs[6 * k + 3]) != MPI_SUCCESS)
        error("Failed to emit irecv of parts from node %i.", k);
      if (MPI_Irecv(&xparts_new[offset_recv], counts[ind_recv], xpart_mpi_type,
                    k, 3 * ind_recv + 1, MPI_COMM_WORLD,
                    &reqs[6 * k + 4]) != MPI_SUCCESS)
        error("Failed to emit irecv of xparts from node %i.", k);
      offset_recv += counts[ind_recv];
    }

    /* Are we receiving any gpart from this node ? */
    if (k != nodeID && g_counts[ind_recv] > 0) {
      if (MPI_Irecv(&gparts_new[g_offset_recv], g_counts[ind_recv],
                    gpart_mpi_type, k, 3 * ind_recv + 2, MPI_COMM_WORLD,
                    &reqs[6 * k + 5]) != MPI_SUCCESS)
        error("Failed to emit irecv of gparts from node %i.", k);
      g_offset_recv += g_counts[ind_recv];
    }
  }

  /* Wait for all the sends and recvs to tumble in. */
  MPI_Status stats[6 * nr_nodes];
  int res;
  if ((res = MPI_Waitall(6 * nr_nodes, reqs, stats)) != MPI_SUCCESS) {
    for (int k = 0; k < 6 * nr_nodes; k++) {
      char buff[MPI_MAX_ERROR_STRING];
      MPI_Error_string(stats[k].MPI_ERROR, buff, &res);
      message("request %i has error '%s'.", k, buff);
    }
    error("Failed during waitall for part data.");
  }

  /* We now need to restore the part<->gpart links */
  size_t offset_parts = 0, offset_gparts = 0;
  for (int node = 0; node < nr_nodes; ++node) {

    const int ind_recv = node * nr_nodes + nodeID;
    const size_t count_parts = counts[ind_recv];
    const size_t count_gparts = g_counts[ind_recv];

    /* Loop over the gparts received from that node */
    for (size_t k = offset_gparts; k < offset_gparts + count_gparts; ++k) {

      /* Does this gpart have a partner ? */
      if (gparts_new[k].id_or_neg_offset <= 0) {

        const ptrdiff_t partner_index =
            offset_parts - gparts_new[k].id_or_neg_offset;

        /* Re-link */
        gparts_new[k].id_or_neg_offset = -partner_index;
        parts_new[partner_index].gpart = &gparts_new[k];
      }
    }

    offset_parts += count_parts;
    offset_gparts += count_gparts;
  }

#ifdef SWIFT_DEBUG_CHECKS
  /* Verify that all parts are in the right place.
     (fixed: k is size_t to match nr_parts; printed with %zi.) */
  for (size_t k = 0; k < nr_parts; k++) {
    const int cid = cell_getid(cdim, parts_new[k].x[0] * ih[0],
                               parts_new[k].x[1] * ih[1],
                               parts_new[k].x[2] * ih[2]);
    if (cells[cid].nodeID != nodeID)
      error("Received particle (%zi) that does not belong here (nodeID=%i).",
            k, cells[cid].nodeID);
  }

  /* Verify that the links are correct */
  for (size_t k = 0; k < nr_gparts; ++k) {

    if (gparts_new[k].id_or_neg_offset <= 0) {

      struct part *part = &parts_new[-gparts_new[k].id_or_neg_offset];

      if (part->gpart != &gparts_new[k]) error("Linking problem !");

      if (gparts_new[k].x[0] != part->x[0] ||
          gparts_new[k].x[1] != part->x[1] || gparts_new[k].x[2] != part->x[2])
        error("Linked particles are not at the same position !");
    }
  }
  for (size_t k = 0; k < nr_parts; ++k) {

    /* fixed: negate in a signed type instead of the unsigned wrap of -k. */
    if (parts_new[k].gpart != NULL &&
        parts_new[k].gpart->id_or_neg_offset != -(ptrdiff_t)k) {
      error("Linking problem !");
    }
  }
#endif

  /* Set the new part data, free the old. */
  free(parts);
  free(xparts);
  free(gparts);
  s->parts = parts_new;
  s->xparts = xparts_new;
  s->gparts = gparts_new;
  s->nr_parts = nr_parts;
  s->nr_gparts = nr_gparts;
  s->size_parts = engine_redistribute_alloc_margin * nr_parts;
  s->size_gparts = engine_redistribute_alloc_margin * nr_gparts;

  /* Clean up the temporary stuff.
     (fixed: g_counts and g_dest were previously leaked.) */
  free(reqs);
  free(counts);
  free(g_counts);
  free(dest);
  free(g_dest);

  /* Be verbose about what just happened. */
  if (e->verbose) {
    int my_cells = 0;
    for (int k = 0; k < nr_cells; k++)
      if (cells[k].nodeID == nodeID) my_cells += 1;
    message("node %i now has %zi parts and %zi gparts in %i cells.", nodeID,
            nr_parts, nr_gparts, my_cells);
  }

  if (e->verbose)
    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
            clocks_getunit());
#else
  error("SWIFT was not compiled with MPI support.");
#endif
}
548

549
/**
 * @brief Repartition the cells amongst the nodes.
 *
 * @param e The #engine.
 */
void engine_repartition(struct engine *e) {

#if defined(WITH_MPI) && defined(HAVE_METIS)

  const ticks tic = getticks();

  /* Grab the requested repartition mode and reset the flag. */
  const enum repartition_type reparttype = e->forcerepart;
  e->forcerepart = REPART_NONE;

  /* A single node has nothing to balance. This also sidesteps a METIS
   * bug triggered by the one-partition case. */
  if (e->nr_nodes == 1) return;

  /* Let METIS compute a new cell-to-node mapping. */
  partition_repartition(reparttype, e->nodeID, e->nr_nodes, e->s,
                        e->sched.tasks, e->sched.nr_tasks);

  /* Now comes the tricky part: Exchange particles between all nodes.
     This is done in two steps, first allreducing a matrix of
     how many particles go from where to where, then re-allocating
     the parts array, and emitting the sends and receives.
     Finally, the space, tasks, and proxies need to be rebuilt. */

  /* Move the particles to their new homes. */
  engine_redistribute(e);

  /* Rebuild the proxies for the new decomposition. */
  engine_makeproxies(e);

  /* And request a full rebuild at the next opportunity. */
  e->forcerebuild = 1;

  if (e->verbose)
    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
            clocks_getunit());
#else
  error("SWIFT was not compiled with MPI and METIS support.");
#endif
}
595

596
597
598
599
600
601
602
603
604
/**
 * @brief Add up/down gravity tasks to a cell hierarchy.
 *
 * @param e The #engine.
 * @param c The #cell
 * @param up The upward gravity #task.
 * @param down The downward gravity #task.
 */

605
606
void engine_addtasks_grav(struct engine *e, struct cell *c, struct task *up,
                          struct task *down) {
607

608
609
610
611
612
613
614
615
616
617
  /* Link the tasks to this cell. */
  c->grav_up = up;
  c->grav_down = down;

  /* Recurse? */
  if (c->split)
    for (int k = 0; k < 8; k++)
      if (c->progeny[k] != NULL)
        engine_addtasks_grav(e, c->progeny[k], up, down);
}
618

619
620
621
622
/**
 * @brief Add send tasks to a hierarchy of cells.
 *
 * @param e The #engine.
 * @param ci The sending #cell.
 * @param cj The receiving #cell
 */
void engine_addtasks_send(struct engine *e, struct cell *ci, struct cell *cj) {

#ifdef WITH_MPI
  struct scheduler *sched = &e->sched;

  /* Look for a density task of ci that involves the target node. */
  struct link *match = ci->density;
  while (match != NULL) {
    const struct task *t = match->t;
    if (t->ci->nodeID == cj->nodeID ||
        (t->cj != NULL && t->cj->nodeID == cj->nodeID))
      break;
    match = match->next;
  }

  if (match != NULL) {

    /* Found one: create the xv and rho send tasks for this cell. */
    struct task *t_xv = scheduler_addtask(
        sched, task_type_send, task_subtype_none, 2 * ci->tag, 0, ci, cj, 0);
    struct task *t_rho =
        scheduler_addtask(sched, task_type_send, task_subtype_none,
                          2 * ci->tag + 1, 0, ci, cj, 0);

    /* send_rho runs after the super-cell's ghost... */
    scheduler_addunlock(sched, ci->super->ghost, t_rho);

    /* ...and must complete before its kick. */
    scheduler_addunlock(sched, t_rho, ci->super->kick);

    /* send_xv must complete before the super-cell's ghost. */
    scheduler_addunlock(sched, t_xv, ci->super->ghost);

  } else if (ci->split) {

    /* Nothing at this level: try the children instead. */
    for (int i = 0; i < 8; i++)
      if (ci->progeny[i] != NULL) engine_addtasks_send(e, ci->progeny[i], cj);
  }

#else
  error("SWIFT was not compiled with MPI support.");
#endif
}
668
669
670
671
672
673
674
675
676
677

/**
 * @brief Add recv tasks to a hierarchy of cells.
 *
 * @param e The #engine.
 * @param c The #cell.
 * @param t_xv The recv_xv #task, if it has already been created.
 * @param t_rho The recv_rho #task, if it has already been created.
 */
void engine_addtasks_recv(struct engine *e, struct cell *c, struct task *t_xv,
                          struct task *t_rho) {

#ifdef WITH_MPI
  struct scheduler *sched = &e->sched;

  /* No recv tasks yet? Create the pair at the first level that has
     density interactions. */
  if (t_xv == NULL && c->nr_density > 0) {
    t_xv = c->recv_xv = scheduler_addtask(
        sched, task_type_recv, task_subtype_none, 2 * c->tag, 0, c, NULL, 0);
    t_rho = c->recv_rho =
        scheduler_addtask(sched, task_type_recv, task_subtype_none,
                          2 * c->tag + 1, 0, c, NULL, 0);
  }

  /* Density tasks run between recv_xv and recv_rho... */
  for (struct link *link = c->density; link != NULL; link = link->next) {
    scheduler_addunlock(sched, t_xv, link->t);
    scheduler_addunlock(sched, link->t, t_rho);
  }

  /* ...force tasks wait for recv_rho... */
  for (struct link *link = c->force; link != NULL; link = link->next)
    scheduler_addunlock(sched, t_rho, link->t);

  /* ...and sorting needs the received positions. */
  if (c->sorts != NULL) scheduler_addunlock(sched, t_xv, c->sorts);

  /* Hand the same recv tasks down the tree. */
  if (c->split)
    for (int i = 0; i < 8; i++)
      if (c->progeny[i] != NULL)
        engine_addtasks_recv(e, c->progeny[i], t_xv, t_rho);

#else
  error("SWIFT was not compiled with MPI support.");
#endif
}
713
714
715
716
717
718

/**
 * @brief Exchange cell structures with other nodes.
 *
 * Packs every cell flagged `sendto` into a flat #pcell buffer, exchanges
 * the buffers through the proxies (first the counts, then the payloads),
 * unpacks the incoming cells and (re-)allocates the foreign particle
 * buffers to match, linking each received cell into them.
 *
 * @param e The #engine.
 */
void engine_exchange_cells(struct engine *e) {

#ifdef WITH_MPI

  struct space *s = e->s;
  struct cell *cells = s->cells;
  const int nr_cells = s->nr_cells;
  const int nr_proxies = e->nr_proxies;
  /* NOTE(review): offset[] is a VLA — assumes nr_cells fits on the stack;
   * confirm for very large cell grids. */
  int offset[nr_cells];
  MPI_Request reqs_in[engine_maxproxies];
  MPI_Request reqs_out[engine_maxproxies];
  MPI_Status status;
  ticks tic = getticks();

  /* Run through the cells and get the size of the ones that will be sent off.
   */
  /* offset[k] is the index of cell k's packed data in the pcell buffer. */
  int count_out = 0;
  for (int k = 0; k < nr_cells; k++) {
    offset[k] = count_out;
    if (cells[k].sendto)
      count_out += (cells[k].pcell_size = cell_getsize(&cells[k]));
  }

  /* Allocate the pcells. */
  struct pcell *pcells;
  if ((pcells = (struct pcell *)malloc(sizeof(struct pcell) * count_out)) ==
      NULL)
    error("Failed to allocate pcell buffer.");

  /* Pack the cells. */
  /* cell_next_tag is reset so packed cells get fresh communication tags. */
  cell_next_tag = 0;
  for (int k = 0; k < nr_cells; k++)
    if (cells[k].sendto) {
      cell_pack(&cells[k], &pcells[offset[k]]);
      cells[k].pcell = &pcells[offset[k]];
    }

  /* Launch the proxies. */
  /* Phase 1: exchange the number of cells each proxy will transfer. */
  for (int k = 0; k < nr_proxies; k++) {
    proxy_cells_exch1(&e->proxies[k]);
    reqs_in[k] = e->proxies[k].req_cells_count_in;
    reqs_out[k] = e->proxies[k].req_cells_count_out;
  }

  /* Wait for each count to come in and start the recv. */
  /* Phase 2: as each count arrives, post the matching payload recv. */
  for (int k = 0; k < nr_proxies; k++) {
    int pid = MPI_UNDEFINED;
    if (MPI_Waitany(nr_proxies, reqs_in, &pid, &status) != MPI_SUCCESS ||
        pid == MPI_UNDEFINED)
      error("MPI_Waitany failed.");
    // message( "request from proxy %i has arrived." , pid );
    proxy_cells_exch2(&e->proxies[pid]);
  }

  /* Wait for all the sends to have finished too. */
  if (MPI_Waitall(nr_proxies, reqs_out, MPI_STATUSES_IGNORE) != MPI_SUCCESS)
    error("MPI_Waitall on sends failed.");

  /* Set the requests for the cells. */
  /* Phase 3: now wait on the payload (pcell array) requests. */
  for (int k = 0; k < nr_proxies; k++) {
    reqs_in[k] = e->proxies[k].req_cells_in;
    reqs_out[k] = e->proxies[k].req_cells_out;
  }

  /* Wait for each pcell array to come in from the proxies. */
  /* Unpack each proxy's cells as soon as its buffer has arrived; `count`
     tracks the read position inside the flat pcells_in buffer. */
  for (int k = 0; k < nr_proxies; k++) {
    int pid = MPI_UNDEFINED;
    if (MPI_Waitany(nr_proxies, reqs_in, &pid, &status) != MPI_SUCCESS ||
        pid == MPI_UNDEFINED)
      error("MPI_Waitany failed.");
    // message( "cell data from proxy %i has arrived." , pid );
    for (int count = 0, j = 0; j < e->proxies[pid].nr_cells_in; j++)
      count += cell_unpack(&e->proxies[pid].pcells_in[count],
                           e->proxies[pid].cells_in[j], e->s);
  }

  /* Wait for all the sends to have finished too. */
  if (MPI_Waitall(nr_proxies, reqs_out, MPI_STATUSES_IGNORE) != MPI_SUCCESS)
    error("MPI_Waitall on sends failed.");

  /* Count the number of particles we need to import and re-allocate
     the buffer if needed. */
  int count_parts_in = 0, count_gparts_in = 0;
  for (int k = 0; k < nr_proxies; k++)
    for (int j = 0; j < e->proxies[k].nr_cells_in; j++) {
      count_parts_in += e->proxies[k].cells_in[j]->count;
      count_gparts_in += e->proxies[k].cells_in[j]->gcount;
    }
  if (count_parts_in > s->size_parts_foreign) {
    if (s->parts_foreign != NULL) free(s->parts_foreign);
    /* 10% head-room to avoid re-allocating on every small growth. */
    s->size_parts_foreign = 1.1 * count_parts_in;
    if (posix_memalign((void **)&s->parts_foreign, part_align,
                       sizeof(struct part) * s->size_parts_foreign) != 0)
      error("Failed to allocate foreign part data.");
  }
  if (count_gparts_in > s->size_gparts_foreign) {
    if (s->gparts_foreign != NULL) free(s->gparts_foreign);
    s->size_gparts_foreign = 1.1 * count_gparts_in;
    if (posix_memalign((void **)&s->gparts_foreign, gpart_align,
                       sizeof(struct gpart) * s->size_gparts_foreign) != 0)
      error("Failed to allocate foreign gpart data.");
  }

  /* Unpack the cells and link to the particle data. */
  /* Each received cell gets a contiguous slice of the foreign buffers. */
  struct part *parts = s->parts_foreign;
  struct gpart *gparts = s->gparts_foreign;
  for (int k = 0; k < nr_proxies; k++) {
    for (int j = 0; j < e->proxies[k].nr_cells_in; j++) {
      cell_link_parts(e->proxies[k].cells_in[j], parts);
      cell_link_gparts(e->proxies[k].cells_in[j], gparts);
      parts = &parts[e->proxies[k].cells_in[j]->count];
      gparts = &gparts[e->proxies[k].cells_in[j]->gcount];
    }
  }
  s->nr_parts_foreign = parts - s->parts_foreign;
  s->nr_gparts_foreign = gparts - s->gparts_foreign;

  /* Free the pcell buffer. */
  free(pcells);

  if (e->verbose)
    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
            clocks_getunit());

#else
  error("SWIFT was not compiled with MPI support.");
#endif
}
848
849
850
851
852

/**
 * @brief Exchange straying parts with other nodes.
 *
 * @param e The #engine.
 * @param offset_parts The index in the parts array as of which the foreign
 *        parts reside.
 * @param ind_part The foreign #cell ID of each part.
 * @param Npart The number of stray parts, contains the number of parts received
 *        on return.
 * @param offset_gparts The index in the gparts array as of which the foreign
 *        parts reside.
 * @param ind_gpart The foreign #cell ID of each gpart.
 * @param Ngpart The number of stray gparts, contains the number of gparts
 *        received on return.
 *
 * Note that this function does not mess up the linkage between parts and
 * gparts, i.e. the received particles have correct linkage.
 */
867

Pedro Gonnet's avatar
Pedro Gonnet committed
868
869
870
void engine_exchange_strays(struct engine *e, size_t offset_parts,
                            int *ind_part, size_t *Npart, size_t offset_gparts,
                            int *ind_gpart, size_t *Ngpart) {
871
872
873

#ifdef WITH_MPI

874
  struct space *s = e->s;
875
  ticks tic = getticks();
876
877

  /* Re-set the proxies. */
Pedro Gonnet's avatar
Pedro Gonnet committed
878
879
880
881
  for (int k = 0; k < e->nr_proxies; k++) {
    e->proxies[k].nr_parts_out = 0;
    e->proxies[k].nr_gparts_out = 0;
  }
882

883
  /* Put the parts and gparts into the corresponding proxies. */
884
  for (size_t k = 0; k < *Npart; k++) {
885
    /* Get the target node and proxy ID. */
886
    const int node_id = e->s->cells[ind_part[k]].nodeID;
887
888
    if (node_id < 0 || node_id >= e->nr_nodes)
      error("Bad node ID %i.", node_id);
889
    const int pid = e->proxy_ind[node_id];
890
    if (pid < 0) {
891
892
893
      error(
          "Do not have a proxy for the requested nodeID %i for part with "
          "id=%llu, x=[%e,%e,%e].",
Pedro Gonnet's avatar
Pedro Gonnet committed
894
895
896
          node_id, s->parts[offset_parts + k].id,
          s->parts[offset_parts + k].x[0], s->parts[offset_parts + k].x[1],
          s->parts[offset_parts + k].x[2]);
897
    }
Pedro Gonnet's avatar
Pedro Gonnet committed
898

899
900
    /* Re-link the associated gpart with the buffer offset of the part. */
    if (s->parts[offset_parts + k].gpart != NULL) {
901
902
      s->parts[offset_parts + k].gpart->id_or_neg_offset =
          -e->proxies[pid].nr_parts_out;
903
    }
Pedro Gonnet's avatar
Pedro Gonnet committed
904

905
    /* Load the part and xpart into the proxy. */
906
907
908
    proxy_parts_load(&e->proxies[pid], &s->parts[offset_parts + k],
                     &s->xparts[offset_parts + k], 1);
  }
909
  for (size_t k = 0; k < *Ngpart; k++) {
910
    const int node_id = e->s->cells[ind_gpart[k]].nodeID;
911
912
    if (node_id < 0 || node_id >= e->nr_nodes)
      error("Bad node ID %i.", node_id);
913
    const int pid = e->proxy_ind[node_id];
914