/*******************************************************************************
 * This file is part of SWIFT.
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
 *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
 *               2015 Peter W. Draper (p.w.draper@durham.ac.uk)
 *                    Angus Lepper (angus.lepper@ed.ac.uk)
 *               2016 John A. Regan (john.a.regan@durham.ac.uk)
 *                    Tom Theuns (tom.theuns@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/

/* Config parameters. */
#include "../config.h"

/* Some standard headers. */
#include <float.h>
#include <limits.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
/* MPI headers. */
#ifdef WITH_MPI
#include <mpi.h>
#endif

#ifdef HAVE_LIBNUMA
#include <numa.h>
#endif

/* This object's header. */
#include "engine.h"

/* Local headers. */
#include "atomic.h"
#include "cell.h"
#include "clocks.h"
#include "cycle.h"
#include "debug.h"
#include "error.h"
#include "hydro.h"
#include "minmax.h"
#include "parallel_io.h"
#include "part.h"
#include "partition.h"
#include "proxy.h"
#include "runner.h"
#include "serial_io.h"
#include "single_io.h"
#include "timers.h"
#include "tools.h"
#include "units.h"
#include "version.h"

const char *engine_policy_names[15] = {"none",
                                       "rand",
                                       "steal",
                                       "keep",
                                       "block",
                                       "fix_dt",
                                       "cpu_tight",
                                       "mpi",
                                       "numa_affinity",
                                       "hydro",
                                       "self_gravity",
                                       "external_gravity",
                                       "cosmology_integration",
                                       "drift_all",
                                       "cooling"};

/** The rank of the engine as a global variable (for messages). */
int engine_rank;

/**
 * @brief Link a density/force task to a cell.
 *
 * @param e The #engine.
 * @param l A pointer to the #link, will be modified atomically.
 * @param t The #task.
 */
void engine_addlink(struct engine *e, struct link **l, struct task *t) {

  /* Get the next free link. */
  const int ind = atomic_inc(&e->nr_links);
  if (ind >= e->size_links) {
    error("Link table overflow.");
  }
  struct link *res = &e->links[ind];

  /* Set it atomically. */
  res->t = t;
  res->next = atomic_swap(l, res);
}
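
/* Typical call site (see engine_addtasks_send() further down):
 *   engine_addlink(e, &ci->send_xv, t_xv);
 * This prepends a link for t_xv to the cell's send_xv list; the atomic
 * increment and swap make concurrent insertions from several runner threads
 * safe, at the cost of an unspecified list order. */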

/**
 * @brief Generate the hydro hierarchical tasks for a hierarchy of cells -
 * i.e. all the O(Npart) tasks.
 *
 * Tasks are only created here. The dependencies will be added later on.
 *
 * Note that there is no need to recurse below the super-cell.
 *
 * @param e The #engine.
 * @param c The #cell.
 */
void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {

  struct scheduler *s = &e->sched;
  const int is_fixdt = (e->policy & engine_policy_fixdt);
  const int is_hydro = (e->policy & engine_policy_hydro);
  const int is_with_cooling = (e->policy & engine_policy_cooling);

  /* Are we in a super-cell ? */
  if (c->super == c) {

    /* Local tasks only... */
    if (c->nodeID == e->nodeID) {

      /* Add the init task. */
      c->init = scheduler_addtask(s, task_type_init, task_subtype_none, 0, 0, c,
                                  NULL, 0);

      /* Add the kick task that matches the policy. */
      if (is_fixdt) {
        c->kick = scheduler_addtask(s, task_type_kick_fixdt, task_subtype_none,
                                    0, 0, c, NULL, 0);
      } else {
        c->kick = scheduler_addtask(s, task_type_kick, task_subtype_none, 0, 0,
                                    c, NULL, 0);
      }

      /* Generate the ghost task. */
      if (is_hydro)
        c->ghost = scheduler_addtask(s, task_type_ghost, task_subtype_none, 0,
                                     0, c, NULL, 0);

#ifdef EXTRA_HYDRO_LOOP
      /* Generate the extra ghost task. */
      if (is_hydro)
        c->extra_ghost = scheduler_addtask(s, task_type_extra_ghost,
                                           task_subtype_none, 0, 0, c, NULL, 0);
#endif

      /* Cooling task */
      if (is_with_cooling)
        c->cooling = scheduler_addtask(s, task_type_cooling, task_subtype_none,
                                       0, 0, c, NULL, 0);
    }

  } else { /* We are above the super-cell so need to go deeper */

#ifdef SWIFT_DEBUG_CHECKS
    if (c->super != NULL) error("Incorrectly set super pointer");
#endif

    /* Recurse. */
    if (c->split)
      for (int k = 0; k < 8; k++)
        if (c->progeny[k] != NULL)
          engine_make_hierarchical_tasks(e, c->progeny[k]);
  }
}
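
/* For instance, in a tree whose super level sits two levels below the root,
 * the recursion above visits the root and its children without attaching any
 * tasks, attaches init/kick (plus ghost and cooling where enabled) at the
 * super level, and never descends further. */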

/**
 * @brief Redistribute the particles amongst the nodes according
 *        to their cell's node IDs.
 *
 * The strategy here is as follows:
 * 1) Each node counts the number of particles it has to send to each other
 * node.
 * 2) The number of particles of each type is then exchanged.
 * 3) The particles to send are placed in a temporary buffer in which the
 * part-gpart links are preserved.
 * 4) Each node allocates enough space for the new particles.
 * 5) (Asynchronous) communications are issued to transfer the data.
 *
 * @param e The #engine.
 */
void engine_redistribute(struct engine *e) {

#ifdef WITH_MPI

  const int nr_nodes = e->nr_nodes;
  const int nodeID = e->nodeID;
  struct space *s = e->s;
  struct cell *cells = s->cells_top;
  const int nr_cells = s->nr_cells;
  const int *cdim = s->cdim;
  const double iwidth[3] = {s->iwidth[0], s->iwidth[1], s->iwidth[2]};
  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
  struct part *parts = s->parts;
  struct xpart *xparts = s->xparts;
  struct gpart *gparts = s->gparts;
  ticks tic = getticks();

  /* Allocate temporary arrays to store the counts of particles to be sent
     and the destination of each particle */
  int *counts, *g_counts;
  if ((counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
    error("Failed to allocate count temporary buffer.");
  if ((g_counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
    error("Failed to allocate gcount temporary buffer.");
  bzero(counts, sizeof(int) * nr_nodes * nr_nodes);
  bzero(g_counts, sizeof(int) * nr_nodes * nr_nodes);

  /* Allocate the destination index arrays. */
  int *dest, *g_dest;
  if ((dest = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL)
    error("Failed to allocate dest temporary buffer.");
  if ((g_dest = (int *)malloc(sizeof(int) * s->nr_gparts)) == NULL)
    error("Failed to allocate g_dest temporary buffer.");

  /* Get destination of each particle */
  for (size_t k = 0; k < s->nr_parts; k++) {

    /* Periodic boundary conditions */
    for (int j = 0; j < 3; j++) {
      if (parts[k].x[j] < 0.0)
        parts[k].x[j] += dim[j];
      else if (parts[k].x[j] >= dim[j])
        parts[k].x[j] -= dim[j];
    }
    const int cid =
        cell_getid(cdim, parts[k].x[0] * iwidth[0], parts[k].x[1] * iwidth[1],
                   parts[k].x[2] * iwidth[2]);
#ifdef SWIFT_DEBUG_CHECKS
    if (cid < 0 || cid >= s->nr_cells)
      error("Bad cell id %i for part %zu at [%.3e,%.3e,%.3e].", cid, k,
            parts[k].x[0], parts[k].x[1], parts[k].x[2]);
#endif

    dest[k] = cells[cid].nodeID;

    /* The counts array is indexed as count[from * nr_nodes + to]. */
    counts[nodeID * nr_nodes + dest[k]] += 1;
  }
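
  /* To illustrate the layout: with nr_nodes == 3, counts is a flattened 3x3
   * matrix and counts[1 * 3 + 2] holds the number of parts rank 1 must ship
   * to rank 2. Each rank only fills its own row here; the allreduce below
   * combines the rows so that every rank sees the full matrix. */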

  /* Sort the particles according to their cell index. */
  space_parts_sort(s, dest, s->nr_parts, 0, nr_nodes - 1, e->verbose);

  /* We need to re-link the gpart partners of parts. */
  if (s->nr_parts > 0) {
    int current_dest = dest[0];
    size_t count_this_dest = 0;
    for (size_t k = 0; k < s->nr_parts; ++k) {
      if (s->parts[k].gpart != NULL) {

        /* As the addresses will be invalidated by the communications, we will
         * instead store the absolute index from the start of the sub-array of
         * particles to be sent to a given node.
         * Recall that gparts without partners have a negative id.
         * We will restore the pointers on the receiving node later on. */
        if (dest[k] != current_dest) {
          current_dest = dest[k];
          count_this_dest = 0;
        }

#ifdef SWIFT_DEBUG_CHECKS
        if (s->parts[k].gpart->id_or_neg_offset >= 0)
          error("Trying to link a partnerless gpart !");
#endif

        s->parts[k].gpart->id_or_neg_offset = -count_this_dest;
        count_this_dest++;
      }
    }
  }
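
  /* Example: if this rank sends parts [40, 59] to some node and s->parts[45]
   * has a gpart partner, that gpart's id_or_neg_offset becomes -5, i.e. minus
   * the part's position within the chunk shipped to that node. */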

  /* Get destination of each g-particle */
  for (size_t k = 0; k < s->nr_gparts; k++) {

    /* Periodic boundary conditions */
    for (int j = 0; j < 3; j++) {
      if (gparts[k].x[j] < 0.0)
        gparts[k].x[j] += dim[j];
      else if (gparts[k].x[j] >= dim[j])
        gparts[k].x[j] -= dim[j];
    }
    const int cid =
        cell_getid(cdim, gparts[k].x[0] * iwidth[0], gparts[k].x[1] * iwidth[1],
                   gparts[k].x[2] * iwidth[2]);
#ifdef SWIFT_DEBUG_CHECKS
    if (cid < 0 || cid >= s->nr_cells)
      error("Bad cell id %i for gpart %zu at [%.3e,%.3e,%.3e].", cid, k,
            gparts[k].x[0], gparts[k].x[1], gparts[k].x[2]);
#endif

    g_dest[k] = cells[cid].nodeID;

    /* The counts array is indexed as count[from * nr_nodes + to]. */
    g_counts[nodeID * nr_nodes + g_dest[k]] += 1;
  }

  /* Sort the gparticles according to their cell index. */
  space_gparts_sort(s, g_dest, s->nr_gparts, 0, nr_nodes - 1, e->verbose);

  /* Get all the counts from all the nodes. */
  if (MPI_Allreduce(MPI_IN_PLACE, counts, nr_nodes * nr_nodes, MPI_INT, MPI_SUM,
                    MPI_COMM_WORLD) != MPI_SUCCESS)
    error("Failed to allreduce particle transfer counts.");

  /* Get all the g_counts from all the nodes. */
  if (MPI_Allreduce(MPI_IN_PLACE, g_counts, nr_nodes * nr_nodes, MPI_INT,
                    MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS)
    error("Failed to allreduce gparticle transfer counts.");

  /* Each node knows how many parts and gparts will be transferred to every
     other node. We can start preparing to receive data */

  /* Get the new number of parts and gparts for this node */
  size_t nr_parts = 0, nr_gparts = 0;
  for (int k = 0; k < nr_nodes; k++) nr_parts += counts[k * nr_nodes + nodeID];
  for (int k = 0; k < nr_nodes; k++)
    nr_gparts += g_counts[k * nr_nodes + nodeID];
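
  /* These sums run over a full column of the counts matrix, including the
   * diagonal term counts[nodeID * nr_nodes + nodeID], so nr_parts also counts
   * the particles this rank keeps for itself. */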

  /* Allocate the new arrays with some extra margin */
  struct part *parts_new = NULL;
  struct xpart *xparts_new = NULL;
  struct gpart *gparts_new = NULL;
  if (posix_memalign((void **)&parts_new, part_align,
                     sizeof(struct part) * nr_parts *
                         engine_redistribute_alloc_margin) != 0)
    error("Failed to allocate new part data.");
  if (posix_memalign((void **)&xparts_new, xpart_align,
                     sizeof(struct xpart) * nr_parts *
                         engine_redistribute_alloc_margin) != 0)
    error("Failed to allocate new xpart data.");
  if (posix_memalign((void **)&gparts_new, gpart_align,
                     sizeof(struct gpart) * nr_gparts *
                         engine_redistribute_alloc_margin) != 0)
    error("Failed to allocate new gpart data.");

  /* Prepare MPI requests for the asynchronous communications */
  MPI_Request *reqs;
  if ((reqs = (MPI_Request *)malloc(sizeof(MPI_Request) * 6 * nr_nodes)) ==
      NULL)
    error("Failed to allocate MPI request list.");
  for (int k = 0; k < 6 * nr_nodes; k++) reqs[k] = MPI_REQUEST_NULL;
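
  /* The request slots are grouped per node: for node k, reqs[6 * k + 0..2]
   * hold this rank's part/xpart/gpart sends to k and reqs[6 * k + 3..5] the
   * matching receives. Unused slots stay MPI_REQUEST_NULL, which the
   * MPI_Waitall below ignores. */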

  /* Emit the sends and recvs for the particle and gparticle data. */
  size_t offset_send = 0, offset_recv = 0;
  size_t g_offset_send = 0, g_offset_recv = 0;
  for (int k = 0; k < nr_nodes; k++) {

    /* Indices in the count arrays of the node of interest */
    const int ind_send = nodeID * nr_nodes + k;
    const int ind_recv = k * nr_nodes + nodeID;

    /* Are we sending any part/xpart ? */
    if (counts[ind_send] > 0) {

      /* message("Sending %d part to node %d", counts[ind_send], k); */

      /* If the send is to the same node, just copy */
      if (k == nodeID) {
        memcpy(&parts_new[offset_recv], &s->parts[offset_send],
               sizeof(struct part) * counts[ind_recv]);
        memcpy(&xparts_new[offset_recv], &s->xparts[offset_send],
               sizeof(struct xpart) * counts[ind_recv]);
        offset_send += counts[ind_send];
        offset_recv += counts[ind_recv];

        /* Else, emit some communications */
      } else {
        if (MPI_Isend(&s->parts[offset_send], counts[ind_send], part_mpi_type,
                      k, 3 * ind_send + 0, MPI_COMM_WORLD,
                      &reqs[6 * k]) != MPI_SUCCESS)
          error("Failed to isend parts to node %i.", k);
        if (MPI_Isend(&s->xparts[offset_send], counts[ind_send], xpart_mpi_type,
                      k, 3 * ind_send + 1, MPI_COMM_WORLD,
                      &reqs[6 * k + 1]) != MPI_SUCCESS)
          error("Failed to isend xparts to node %i.", k);
        offset_send += counts[ind_send];
      }
    }

    /* Are we sending any gpart ? */
    if (g_counts[ind_send] > 0) {

      /* message("Sending %d gpart to node %d", g_counts[ind_send], k); */

      /* If the send is to the same node, just copy */
      if (k == nodeID) {
        memcpy(&gparts_new[g_offset_recv], &s->gparts[g_offset_send],
               sizeof(struct gpart) * g_counts[ind_recv]);
        g_offset_send += g_counts[ind_send];
        g_offset_recv += g_counts[ind_recv];

        /* Else, emit some communications */
      } else {
        if (MPI_Isend(&s->gparts[g_offset_send], g_counts[ind_send],
                      gpart_mpi_type, k, 3 * ind_send + 2, MPI_COMM_WORLD,
                      &reqs[6 * k + 2]) != MPI_SUCCESS)
          error("Failed to isend gparts to node %i.", k);
        g_offset_send += g_counts[ind_send];
      }
    }

    /* Now emit the corresponding Irecv() */

    /* Are we receiving any part/xpart from this node ? */
    if (k != nodeID && counts[ind_recv] > 0) {
      if (MPI_Irecv(&parts_new[offset_recv], counts[ind_recv], part_mpi_type, k,
                    3 * ind_recv + 0, MPI_COMM_WORLD,
                    &reqs[6 * k + 3]) != MPI_SUCCESS)
        error("Failed to emit irecv of parts from node %i.", k);
      if (MPI_Irecv(&xparts_new[offset_recv], counts[ind_recv], xpart_mpi_type,
                    k, 3 * ind_recv + 1, MPI_COMM_WORLD,
                    &reqs[6 * k + 4]) != MPI_SUCCESS)
        error("Failed to emit irecv of xparts from node %i.", k);
      offset_recv += counts[ind_recv];
    }

    /* Are we receiving any gpart from this node ? */
    if (k != nodeID && g_counts[ind_recv] > 0) {
      if (MPI_Irecv(&gparts_new[g_offset_recv], g_counts[ind_recv],
                    gpart_mpi_type, k, 3 * ind_recv + 2, MPI_COMM_WORLD,
                    &reqs[6 * k + 5]) != MPI_SUCCESS)
        error("Failed to emit irecv of gparts from node %i.", k);
      g_offset_recv += g_counts[ind_recv];
    }
  }
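
  /* A note on the tags: ind_send on the sender equals ind_recv on the
   * receiver (both are from * nr_nodes + to), so the tag 3 * ind + {0,1,2}
   * uniquely pairs each Isend with its Irecv for the part, xpart and gpart
   * payloads respectively. */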

  /* Wait for all the sends and recvs to tumble in. */
  MPI_Status stats[6 * nr_nodes];
  int res;
  if ((res = MPI_Waitall(6 * nr_nodes, reqs, stats)) != MPI_SUCCESS) {
    for (int k = 0; k < 6 * nr_nodes; k++) {
      char buff[MPI_MAX_ERROR_STRING];
      MPI_Error_string(stats[k].MPI_ERROR, buff, &res);
      message("request %i has error '%s'.", k, buff);
    }
    error("Failed during waitall for part data.");
  }

  /* We now need to restore the part<->gpart links */
  size_t offset_parts = 0, offset_gparts = 0;
  for (int node = 0; node < nr_nodes; ++node) {

    const int ind_recv = node * nr_nodes + nodeID;
    const size_t count_parts = counts[ind_recv];
    const size_t count_gparts = g_counts[ind_recv];

    /* Loop over the gparts received from that node */
    for (size_t k = offset_gparts; k < offset_gparts + count_gparts; ++k) {

      /* Does this gpart have a partner ? */
      if (gparts_new[k].id_or_neg_offset <= 0) {

        const ptrdiff_t partner_index =
            offset_parts - gparts_new[k].id_or_neg_offset;

        /* Re-link */
        gparts_new[k].id_or_neg_offset = -partner_index;
        parts_new[partner_index].gpart = &gparts_new[k];
      }
    }

    offset_parts += count_parts;
    offset_gparts += count_gparts;
  }
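
  /* Continuing the example from the send side: a gpart that left with
   * id_or_neg_offset == -5 lands in a chunk starting at offset_parts, so
   * offset_parts + 5 recovers the index of its part partner in parts_new
   * and the two can be re-linked. */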

#ifdef SWIFT_DEBUG_CHECKS
  /* Verify that all parts are in the right place. */
  for (size_t k = 0; k < nr_parts; k++) {
    const int cid = cell_getid(cdim, parts_new[k].x[0] * iwidth[0],
                               parts_new[k].x[1] * iwidth[1],
                               parts_new[k].x[2] * iwidth[2]);
    if (cells[cid].nodeID != nodeID)
      error("Received particle (%zu) that does not belong here (nodeID=%i).", k,
            cells[cid].nodeID);
  }

  /* Verify that the links are correct */
  for (size_t k = 0; k < nr_gparts; ++k) {

    if (gparts_new[k].id_or_neg_offset <= 0) {

      struct part *part = &parts_new[-gparts_new[k].id_or_neg_offset];

      if (part->gpart != &gparts_new[k]) error("Linking problem !");

      if (gparts_new[k].x[0] != part->x[0] ||
          gparts_new[k].x[1] != part->x[1] || gparts_new[k].x[2] != part->x[2])
        error("Linked particles are not at the same position !");
    }
  }
  for (size_t k = 0; k < nr_parts; ++k) {

    if (parts_new[k].gpart != NULL &&
        parts_new[k].gpart->id_or_neg_offset != -(ptrdiff_t)k) {
      error("Linking problem !");
    }
  }
#endif

  /* Set the new part data, free the old. */
  free(parts);
  free(xparts);
  free(gparts);
  s->parts = parts_new;
  s->xparts = xparts_new;
  s->gparts = gparts_new;
  s->nr_parts = nr_parts;
  s->nr_gparts = nr_gparts;
  s->size_parts = engine_redistribute_alloc_margin * nr_parts;
  s->size_gparts = engine_redistribute_alloc_margin * nr_gparts;

  /* Clean up the temporary stuff. */
  free(reqs);
  free(counts);
  free(g_counts);
  free(dest);
  free(g_dest);

  /* Be verbose about what just happened. */
  if (e->verbose) {
    int my_cells = 0;
    for (int k = 0; k < nr_cells; k++)
      if (cells[k].nodeID == nodeID) my_cells += 1;
    message("node %i now has %zu parts and %zu gparts in %i cells.", nodeID,
            nr_parts, nr_gparts, my_cells);
  }

  if (e->verbose)
    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
            clocks_getunit());

#else
  error("SWIFT was not compiled with MPI support.");
#endif
}

/**
 * @brief Repartition the cells amongst the nodes.
 *
 * @param e The #engine.
 */
void engine_repartition(struct engine *e) {

#if defined(WITH_MPI) && defined(HAVE_METIS)

  ticks tic = getticks();

  /* Clear the repartition flag. */
  enum repartition_type reparttype = e->forcerepart;
  e->forcerepart = REPART_NONE;

  /* Nothing to do if only using a single node. Also avoids METIS
   * bug that doesn't handle this case well. */
  if (e->nr_nodes == 1) return;

  /* Do the repartitioning. */
  partition_repartition(reparttype, e->nodeID, e->nr_nodes, e->s,
                        e->sched.tasks, e->sched.nr_tasks);

  /* Now comes the tricky part: Exchange particles between all nodes.
     This is done in two steps, first allreducing a matrix of
     how many particles go from where to where, then re-allocating
     the parts array, and emitting the sends and receives.
     Finally, the space, tasks, and proxies need to be rebuilt. */

  /* Redistribute the particles between the nodes. */
  engine_redistribute(e);

  /* Make the proxies. */
  engine_makeproxies(e);

  /* Tell the engine it should re-build whenever possible */
  e->forcerebuild = 1;

  if (e->verbose)
    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
            clocks_getunit());

#else
  error("SWIFT was not compiled with MPI and METIS support.");
#endif
}

/**
 * @brief Add up/down gravity tasks to a cell hierarchy.
 *
 * @param e The #engine.
 * @param c The #cell.
 * @param up The upward gravity #task.
 * @param down The downward gravity #task.
 */
void engine_addtasks_grav(struct engine *e, struct cell *c, struct task *up,
                          struct task *down) {

  /* Link the tasks to this cell. */
  c->grav_up = up;
  c->grav_down = down;

  /* Recurse? */
  if (c->split)
    for (int k = 0; k < 8; k++)
      if (c->progeny[k] != NULL)
        engine_addtasks_grav(e, c->progeny[k], up, down);
}

/**
 * @brief Add send tasks to a hierarchy of cells.
 *
 * @param e The #engine.
 * @param ci The sending #cell.
 * @param cj Dummy cell containing the nodeID of the receiving node.
 * @param t_xv The send_xv #task, if it has already been created.
 * @param t_rho The send_rho #task, if it has already been created.
 * @param t_gradient The send_gradient #task, if already created.
 * @param t_ti The send_ti #task, if required and has already been created.
 */
void engine_addtasks_send(struct engine *e, struct cell *ci, struct cell *cj,
                          struct task *t_xv, struct task *t_rho,
                          struct task *t_gradient, struct task *t_ti) {

#ifdef WITH_MPI
  struct link *l = NULL;
  struct scheduler *s = &e->sched;
  const int nodeID = cj->nodeID;

  /* Check if any of the density tasks are for the target node. */
  for (l = ci->density; l != NULL; l = l->next)
    if (l->t->ci->nodeID == nodeID ||
        (l->t->cj != NULL && l->t->cj->nodeID == nodeID))
      break;

  /* If so, attach send tasks. */
  if (l != NULL) {

    /* Create the tasks and their dependencies? */
    if (t_xv == NULL) {
      t_xv = scheduler_addtask(s, task_type_send, task_subtype_none,
                               4 * ci->tag, 0, ci, cj, 0);
      t_rho = scheduler_addtask(s, task_type_send, task_subtype_none,
                                4 * ci->tag + 1, 0, ci, cj, 0);
      if (!(e->policy & engine_policy_fixdt))
        t_ti = scheduler_addtask(s, task_type_send, task_subtype_tend,
                                 4 * ci->tag + 2, 0, ci, cj, 0);
#ifdef EXTRA_HYDRO_LOOP
      t_gradient = scheduler_addtask(s, task_type_send, task_subtype_none,
                                     4 * ci->tag + 3, 0, ci, cj, 0);
#endif

#ifdef EXTRA_HYDRO_LOOP

      /* The send_gradient task should unlock the super-cell's kick task. */
      scheduler_addunlock(s, t_gradient, ci->super->kick);

      /* The send_gradient task depends on the super-cell's extra_ghost task. */
      scheduler_addunlock(s, ci->super->extra_ghost, t_gradient);

      /* The send_rho task should unlock the super-cell's extra_ghost task. */
      scheduler_addunlock(s, t_rho, ci->super->extra_ghost);

      /* The send_rho task depends on the cell's ghost task. */
      scheduler_addunlock(s, ci->super->ghost, t_rho);

      /* The send_xv task should unlock the super-cell's ghost task. */
      scheduler_addunlock(s, t_xv, ci->super->ghost);

#else
      /* The send_rho task should unlock the super-cell's kick task. */
      scheduler_addunlock(s, t_rho, ci->super->kick);

      /* The send_rho task depends on the cell's ghost task. */
      scheduler_addunlock(s, ci->super->ghost, t_rho);

      /* The send_xv task should unlock the super-cell's ghost task. */
      scheduler_addunlock(s, t_xv, ci->super->ghost);
#endif

      /* The super-cell's kick task should unlock the send_ti task. */
      if (t_ti != NULL) scheduler_addunlock(s, ci->super->kick, t_ti);
    }

    /* Add them to the local cell. */
    engine_addlink(e, &ci->send_xv, t_xv);
    engine_addlink(e, &ci->send_rho, t_rho);
#ifdef EXTRA_HYDRO_LOOP
    engine_addlink(e, &ci->send_gradient, t_gradient);
#endif
    if (t_ti != NULL) engine_addlink(e, &ci->send_ti, t_ti);
  }

  /* Recurse? */
  if (ci->split)
    for (int k = 0; k < 8; k++)
      if (ci->progeny[k] != NULL)
        engine_addtasks_send(e, ci->progeny[k], cj, t_xv, t_rho, t_gradient,
                             t_ti);

#else
  error("SWIFT was not compiled with MPI support.");
#endif
}
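
/* Send-side ordering encoded by the unlocks above (without EXTRA_HYDRO_LOOP):
 * send_xv -> ghost -> send_rho -> kick -> send_ti, i.e. particle positions go
 * out before the ghost runs and densities go out before the kick. */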

/**
 * @brief Add recv tasks to a hierarchy of cells.
 *
 * @param e The #engine.
 * @param c The foreign #cell.
 * @param t_xv The recv_xv #task, if it has already been created.
 * @param t_rho The recv_rho #task, if it has already been created.
 * @param t_gradient The recv_gradient #task, if it has already been created.
 * @param t_ti The recv_ti #task, if required and has already been created.
 */
void engine_addtasks_recv(struct engine *e, struct cell *c, struct task *t_xv,
                          struct task *t_rho, struct task *t_gradient,
                          struct task *t_ti) {

#ifdef WITH_MPI
  struct scheduler *s = &e->sched;

  /* Do we need to construct a recv task?
     Note that since c is a foreign cell, all its density tasks will involve
     only the current rank, and thus we don't have to check them. */
  if (t_xv == NULL && c->density != NULL) {

    /* Create the tasks. */
    t_xv = scheduler_addtask(s, task_type_recv, task_subtype_none, 4 * c->tag,
                             0, c, NULL, 0);
    t_rho = scheduler_addtask(s, task_type_recv, task_subtype_none,
                              4 * c->tag + 1, 0, c, NULL, 0);
    if (!(e->policy & engine_policy_fixdt))
      t_ti = scheduler_addtask(s, task_type_recv, task_subtype_tend,
                               4 * c->tag + 2, 0, c, NULL, 0);
#ifdef EXTRA_HYDRO_LOOP
    t_gradient = scheduler_addtask(s, task_type_recv, task_subtype_none,
                                   4 * c->tag + 3, 0, c, NULL, 0);
#endif
  }
  c->recv_xv = t_xv;
  c->recv_rho = t_rho;
  c->recv_gradient = t_gradient;
  c->recv_ti = t_ti;

/* Add dependencies. */
#ifdef EXTRA_HYDRO_LOOP
  for (struct link *l = c->density; l != NULL; l = l->next) {
    scheduler_addunlock(s, t_xv, l->t);
    scheduler_addunlock(s, l->t, t_rho);
  }
  for (struct link *l = c->gradient; l != NULL; l = l->next) {
    scheduler_addunlock(s, t_rho, l->t);
    scheduler_addunlock(s, l->t, t_gradient);
  }
  for (struct link *l = c->force; l != NULL; l = l->next) {
    scheduler_addunlock(s, t_gradient, l->t);
    if (t_ti != NULL) scheduler_addunlock(s, l->t, t_ti);
  }
  if (c->sorts != NULL) scheduler_addunlock(s, t_xv, c->sorts);
#else
  for (struct link *l = c->density; l != NULL; l = l->next) {
    scheduler_addunlock(s, t_xv, l->t);
    scheduler_addunlock(s, l->t, t_rho);
  }
  for (struct link *l = c->force; l != NULL; l = l->next) {
    scheduler_addunlock(s, t_rho, l->t);
    if (t_ti != NULL) scheduler_addunlock(s, l->t, t_ti);
  }
  if (c->sorts != NULL) scheduler_addunlock(s, t_xv, c->sorts);
#endif

  /* Recurse? */
  if (c->split)
    for (int k = 0; k < 8; k++)
      if (c->progeny[k] != NULL)
        engine_addtasks_recv(e, c->progeny[k], t_xv, t_rho, t_gradient, t_ti);

#else
  error("SWIFT was not compiled with MPI support.");
#endif
}
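
/* Receive-side mirror of the chain above (without EXTRA_HYDRO_LOOP):
 * recv_xv -> density tasks -> recv_rho -> force tasks -> recv_ti, so each
 * receive slots in exactly where the matching send was emitted. */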

/**
 * @brief Exchange cell structures with other nodes.
 *
 * @param e The #engine.
 */
void engine_exchange_cells(struct engine *e) {

#ifdef WITH_MPI

  struct space *s = e->s;
  struct cell *cells = s->cells_top;
  const int nr_cells = s->nr_cells;
  const int nr_proxies = e->nr_proxies;
  int offset[nr_cells];
  MPI_Request reqs_in[engine_maxproxies];
  MPI_Request reqs_out[engine_maxproxies];
  MPI_Status status;
  ticks tic = getticks();

  /* Run through the cells and get the size of the ones that will be sent off.
   */
  int count_out = 0;
  for (int k = 0; k < nr_cells; k++) {
    offset[k] = count_out;
    if (cells[k].sendto)
      count_out += (cells[k].pcell_size = cell_getsize(&cells[k]));
  }

  /* Allocate the pcells. */
  struct pcell *pcells;
  if ((pcells = (struct pcell *)malloc(sizeof(struct pcell) * count_out)) ==
      NULL)
    error("Failed to allocate pcell buffer.");

  /* Pack the cells. */
  cell_next_tag = 0;
  for (int k = 0; k < nr_cells; k++)
    if (cells[k].sendto) {
      cell_pack(&cells[k], &pcells[offset[k]]);
      cells[k].pcell = &pcells[offset[k]];
    }

  /* Launch the proxies. */
  for (int k = 0; k < nr_proxies; k++) {
    proxy_cells_exch1(&e->proxies[k]);
    reqs_in[k] = e->proxies[k].req_cells_count_in;
    reqs_out[k] = e->proxies[k].req_cells_count_out;
  }

  /* Wait for each count to come in and start the recv. */
  for (int k = 0; k < nr_proxies; k++) {
    int pid = MPI_UNDEFINED;
    if (MPI_Waitany(nr_proxies, reqs_in, &pid, &status) != MPI_SUCCESS ||
        pid == MPI_UNDEFINED)
      error("MPI_Waitany failed.");
    // message( "request from proxy %i has arrived." , pid );
    proxy_cells_exch2(&e->proxies[pid]);
  }

  /* Wait for all the sends to have finished too. */
  if (MPI_Waitall(nr_proxies, reqs_out, MPI_STATUSES_IGNORE) != MPI_SUCCESS)
    error("MPI_Waitall on sends failed.");

  /* Set the requests for the cells. */
  for (int k = 0; k < nr_proxies; k++) {
    reqs_in[k] = e->proxies[k].req_cells_in;
    reqs_out[k] = e->proxies[k].req_cells_out;
  }

  /* Wait for each pcell array to come in from the proxies. */
  for (int k = 0; k < nr_proxies; k++) {
    int pid = MPI_UNDEFINED