/*******************************************************************************
 * This file is part of SWIFT.
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
 *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
 *               2015 Peter W. Draper (p.w.draper@durham.ac.uk)
 *                    Angus Lepper (angus.lepper@ed.ac.uk)
 *               2016 John A. Regan (john.a.regan@durham.ac.uk)
 *                    Tom Theuns (tom.theuns@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/

/* Config parameters. */
#include "../config.h"

/* Some standard headers. */
#include <float.h>
#include <limits.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* MPI headers. */
#ifdef WITH_MPI
#include <mpi.h>
#endif

#ifdef HAVE_LIBNUMA
#include <numa.h>
#endif

/* This object's header. */
#include "engine.h"

/* Local headers. */
#include "atomic.h"
#include "cell.h"
#include "clocks.h"
#include "cycle.h"
#include "debug.h"
#include "error.h"
#include "hydro.h"
#include "minmax.h"
#include "parallel_io.h"
#include "part.h"
#include "partition.h"
#include "proxy.h"
#include "runner.h"
#include "serial_io.h"
#include "single_io.h"
#include "statistics.h"
#include "timers.h"
#include "tools.h"
#include "units.h"
#include "version.h"

const char *engine_policy_names[16] = {"none",
                                       "rand",
                                       "steal",
                                       "keep",
                                       "block",
                                       "cpu_tight",
                                       "mpi",
                                       "numa_affinity",
                                       "hydro",
                                       "self_gravity",
                                       "external_gravity",
                                       "cosmology_integration",
                                       "drift_all",
                                       "cooling",
                                       "sourceterms"};

/** The rank of the engine as a global variable (for messages). */
int engine_rank;

/**
 * @brief Link a density/force task to a cell.
 *
 * @param e The #engine.
 * @param l A pointer to the #link, will be modified atomically.
 * @param t The #task.
 */
void engine_addlink(struct engine *e, struct link **l, struct task *t) {

  /* Get the next free link. */
  const int ind = atomic_inc(&e->nr_links);
  if (ind >= e->size_links) {
    error("Link table overflow.");
  }
  struct link *res = &e->links[ind];

  /* Set it atomically. */
  res->t = t;
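  /* atomic_swap() publishes res as the new head of the list and returns the
     previous head, which becomes our next pointer (a lock-free prepend). */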
  res->next = atomic_swap(l, res);
}
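
/* Usage sketch (illustrative, not a definitive call site): once a density
 * task t spanning cells ci and cj has been created, it is linked into both
 * cells' lists with
 *   engine_addlink(e, &ci->density, t);
 *   engine_addlink(e, &cj->density, t);
 * which stays safe when several threads build tasks concurrently. */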

/**
 * @brief Generate the hydro hierarchical tasks for a hierarchy of cells -
 * i.e. all the O(Npart) tasks.
 *
 * Tasks are only created here. The dependencies will be added later on.
 *
 * Note that there is no need to recurse below the super-cell.
 *
 * @param e The #engine.
 * @param c The #cell.
 */
void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {

  struct scheduler *s = &e->sched;
  const int is_hydro = (e->policy & engine_policy_hydro);
  const int is_with_cooling = (e->policy & engine_policy_cooling);
  const int is_with_sourceterms = (e->policy & engine_policy_sourceterms);

  /* Are we in a super-cell ? */
  if (c->super == c) {

    /* Local tasks only... */
    if (c->nodeID == e->nodeID) {

      /* Add the init task. */
      c->init = scheduler_addtask(s, task_type_init, task_subtype_none, 0, 0, c,
                                  NULL, 0);

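      /* Add the kick task. */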
      c->kick = scheduler_addtask(s, task_type_kick, task_subtype_none, 0, 0, c,
                                  NULL, 0);

      /* Generate the ghost task. */
      if (is_hydro)
        c->ghost = scheduler_addtask(s, task_type_ghost, task_subtype_none, 0,
                                     0, c, NULL, 0);

#ifdef EXTRA_HYDRO_LOOP
      /* Generate the extra ghost task. */
      if (is_hydro)
        c->extra_ghost = scheduler_addtask(s, task_type_extra_ghost,
                                           task_subtype_none, 0, 0, c, NULL, 0);
#endif

      /* Cooling task */
      if (is_with_cooling)
        c->cooling = scheduler_addtask(s, task_type_cooling, task_subtype_none,
                                       0, 0, c, NULL, 0);

      /* Add the source terms task. */
      if (is_with_sourceterms)
        c->sourceterms = scheduler_addtask(s, task_type_sourceterms,
                                           task_subtype_none, 0, 0, c, NULL, 0);
    }

  } else { /* We are above the super-cell so need to go deeper */

#ifdef SWIFT_DEBUG_CHECKS
    if (c->super != NULL) error("Incorrectly set super pointer");
#endif

    /* Recurse. */
    if (c->split)
      for (int k = 0; k < 8; k++)
        if (c->progeny[k] != NULL)
          engine_make_hierarchical_tasks(e, c->progeny[k]);
  }
}

/**
 * @brief Redistribute the particles amongst the nodes according
 *      to their cell's node IDs.
 *
 * The strategy here is as follows:
 * 1) Each node counts the number of particles it has to send to each other
 * node.
 * 2) The number of particles of each type is then exchanged.
 * 3) The particles to send are placed in a temporary buffer in which the
 * part-gpart links are preserved.
 * 4) Each node allocates enough space for the new particles.
 * 5) (Asynchronous) communications are issued to transfer the data.
 *
 * @param e The #engine.
 */
void engine_redistribute(struct engine *e) {

#ifdef WITH_MPI

  const int nr_nodes = e->nr_nodes;
  const int nodeID = e->nodeID;
  struct space *s = e->s;
  struct cell *cells = s->cells_top;
  const int nr_cells = s->nr_cells;
  const int *cdim = s->cdim;
  const double iwidth[3] = {s->iwidth[0], s->iwidth[1], s->iwidth[2]};
  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
  struct part *parts = s->parts;
  struct xpart *xparts = s->xparts;
  struct gpart *gparts = s->gparts;
  ticks tic = getticks();

  /* Allocate temporary arrays to store the counts of particles to be sent
     and the destination of each particle */
  int *counts, *g_counts;
  if ((counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
    error("Failed to allocate count temporary buffer.");
  if ((g_counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
    error("Failed to allocate gcount temporary buffer.");
  bzero(counts, sizeof(int) * nr_nodes * nr_nodes);
  bzero(g_counts, sizeof(int) * nr_nodes * nr_nodes);

  /* Allocate the destination index arrays. */
  int *dest, *g_dest;
  if ((dest = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL)
    error("Failed to allocate dest temporary buffer.");
  if ((g_dest = (int *)malloc(sizeof(int) * s->nr_gparts)) == NULL)
    error("Failed to allocate g_dest temporary buffer.");

  /* Get destination of each particle */
  for (size_t k = 0; k < s->nr_parts; k++) {

    /* Periodic boundary conditions */
    for (int j = 0; j < 3; j++) {
      if (parts[k].x[j] < 0.0)
        parts[k].x[j] += dim[j];
      else if (parts[k].x[j] >= dim[j])
        parts[k].x[j] -= dim[j];
    }
    const int cid =
        cell_getid(cdim, parts[k].x[0] * iwidth[0], parts[k].x[1] * iwidth[1],
                   parts[k].x[2] * iwidth[2]);
#ifdef SWIFT_DEBUG_CHECKS
    if (cid < 0 || cid >= s->nr_cells)
      error("Bad cell id %i for part %zu at [%.3e,%.3e,%.3e].", cid, k,
            parts[k].x[0], parts[k].x[1], parts[k].x[2]);
#endif

    dest[k] = cells[cid].nodeID;

    /* The counts array is indexed as count[from * nr_nodes + to]. */
    counts[nodeID * nr_nodes + dest[k]] += 1;
  }

  /* Sort the particles according to the node they will be sent to. */
  space_parts_sort(s, dest, s->nr_parts, 0, nr_nodes - 1, e->verbose);

  /* We need to re-link the gpart partners of parts. */
  if (s->nr_parts > 0) {
    int current_dest = dest[0];
    size_t count_this_dest = 0;
    for (size_t k = 0; k < s->nr_parts; ++k) {
      if (s->parts[k].gpart != NULL) {

        /* As the addresses will be invalidated by the communications, we will
         * instead store the absolute index from the start of the sub-array of
         * particles to be sent to a given node.
         * Recall that gparts without partners have a negative id.
         * We will restore the pointers on the receiving node later on. */
        if (dest[k] != current_dest) {
          current_dest = dest[k];
          count_this_dest = 0;
        }

#ifdef SWIFT_DEBUG_CHECKS
        if (s->parts[k].gpart->id_or_neg_offset >= 0)
          error("Trying to link a partnerless gpart !");
#endif

        s->parts[k].gpart->id_or_neg_offset = -count_this_dest;
        count_this_dest++;
      }
    }
  }

  /* Get destination of each g-particle */
  for (size_t k = 0; k < s->nr_gparts; k++) {

    /* Periodic boundary conditions */
    for (int j = 0; j < 3; j++) {
      if (gparts[k].x[j] < 0.0)
        gparts[k].x[j] += dim[j];
      else if (gparts[k].x[j] >= dim[j])
        gparts[k].x[j] -= dim[j];
    }
    const int cid =
        cell_getid(cdim, gparts[k].x[0] * iwidth[0], gparts[k].x[1] * iwidth[1],
                   gparts[k].x[2] * iwidth[2]);
#ifdef SWIFT_DEBUG_CHECKS
    if (cid < 0 || cid >= s->nr_cells)
      error("Bad cell id %i for gpart %zu at [%.3e,%.3e,%.3e].", cid, k,
            gparts[k].x[0], gparts[k].x[1], gparts[k].x[2]);
#endif

    g_dest[k] = cells[cid].nodeID;

    /* The counts array is indexed as count[from * nr_nodes + to]. */
    g_counts[nodeID * nr_nodes + g_dest[k]] += 1;
  }

  /* Sort the gparticles according to the node they will be sent to. */
  space_gparts_sort(s, g_dest, s->nr_gparts, 0, nr_nodes - 1, e->verbose);

  /* Get all the counts from all the nodes. */
  if (MPI_Allreduce(MPI_IN_PLACE, counts, nr_nodes * nr_nodes, MPI_INT, MPI_SUM,
                    MPI_COMM_WORLD) != MPI_SUCCESS)
    error("Failed to allreduce particle transfer counts.");

  /* Get all the g_counts from all the nodes. */
  if (MPI_Allreduce(MPI_IN_PLACE, g_counts, nr_nodes * nr_nodes, MPI_INT,
                    MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS)
    error("Failed to allreduce gparticle transfer counts.");

  /* Each node knows how many parts and gparts will be transferred to every
     other node. We can start preparing to receive data */

  /* Get the new number of parts and gparts for this node */
  size_t nr_parts = 0, nr_gparts = 0;
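  /* Column nodeID of the all-reduced matrix is what this node keeps: e.g.
     with two ranks, counts = {n00, n01, n10, n11} and rank 0 sums n00 + n10. */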
  for (int k = 0; k < nr_nodes; k++) nr_parts += counts[k * nr_nodes + nodeID];
  for (int k = 0; k < nr_nodes; k++)
    nr_gparts += g_counts[k * nr_nodes + nodeID];

  /* Allocate the new arrays with some extra margin */
  struct part *parts_new = NULL;
  struct xpart *xparts_new = NULL;
  struct gpart *gparts_new = NULL;
  if (posix_memalign((void **)&parts_new, part_align,
                     sizeof(struct part) * nr_parts *
                         engine_redistribute_alloc_margin) != 0)
    error("Failed to allocate new part data.");
  if (posix_memalign((void **)&xparts_new, xpart_align,
                     sizeof(struct xpart) * nr_parts *
                         engine_redistribute_alloc_margin) != 0)
    error("Failed to allocate new xpart data.");
  if (posix_memalign((void **)&gparts_new, gpart_align,
                     sizeof(struct gpart) * nr_gparts *
                         engine_redistribute_alloc_margin) != 0)
    error("Failed to allocate new gpart data.");

  /* Prepare MPI requests for the asynchronous communications */
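  /* (Layout, as set up in the loop below: reqs[6 * k + 0..2] hold the sends
     of parts/xparts/gparts to node k and reqs[6 * k + 3..5] the matching
     receives; unused slots stay MPI_REQUEST_NULL, which MPI_Waitall ignores.) */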
  MPI_Request *reqs;
  if ((reqs = (MPI_Request *)malloc(sizeof(MPI_Request) * 6 * nr_nodes)) ==
      NULL)
    error("Failed to allocate MPI request list.");
  for (int k = 0; k < 6 * nr_nodes; k++) reqs[k] = MPI_REQUEST_NULL;

  /* Emit the sends and recvs for the particle and gparticle data. */
  size_t offset_send = 0, offset_recv = 0;
  size_t g_offset_send = 0, g_offset_recv = 0;
  for (int k = 0; k < nr_nodes; k++) {

    /* Indices in the count arrays of the node of interest */
    const int ind_send = nodeID * nr_nodes + k;
    const int ind_recv = k * nr_nodes + nodeID;

    /* Are we sending any part/xpart ? */
    if (counts[ind_send] > 0) {

      /* message("Sending %d part to node %d", counts[ind_send], k); */

      /* If the send is to the same node, just copy */
      if (k == nodeID) {
        memcpy(&parts_new[offset_recv], &s->parts[offset_send],
               sizeof(struct part) * counts[ind_recv]);
        memcpy(&xparts_new[offset_recv], &s->xparts[offset_send],
               sizeof(struct xpart) * counts[ind_recv]);
        offset_send += counts[ind_send];
        offset_recv += counts[ind_recv];

        /* Else, emit some communications */
      } else {
        if (MPI_Isend(&s->parts[offset_send], counts[ind_send], part_mpi_type,
                      k, 3 * ind_send + 0, MPI_COMM_WORLD,
                      &reqs[6 * k]) != MPI_SUCCESS)
          error("Failed to isend parts to node %i.", k);
        if (MPI_Isend(&s->xparts[offset_send], counts[ind_send], xpart_mpi_type,
                      k, 3 * ind_send + 1, MPI_COMM_WORLD,
                      &reqs[6 * k + 1]) != MPI_SUCCESS)
          error("Failed to isend xparts to node %i.", k);
        offset_send += counts[ind_send];
      }
    }

    /* Are we sending any gpart ? */
    if (g_counts[ind_send] > 0) {

      /* message("Sending %d gpart to node %d", g_counts[ind_send], k); */

      /* If the send is to the same node, just copy */
      if (k == nodeID) {
        memcpy(&gparts_new[g_offset_recv], &s->gparts[g_offset_send],
               sizeof(struct gpart) * g_counts[ind_recv]);
        g_offset_send += g_counts[ind_send];
        g_offset_recv += g_counts[ind_recv];

        /* Else, emit some communications */
      } else {
        if (MPI_Isend(&s->gparts[g_offset_send], g_counts[ind_send],
                      gpart_mpi_type, k, 3 * ind_send + 2, MPI_COMM_WORLD,
                      &reqs[6 * k + 2]) != MPI_SUCCESS)
          error("Failed to isend gparts to node %i.", k);
        g_offset_send += g_counts[ind_send];
      }
    }

    /* Now emit the corresponding Irecv() */

    /* Are we receiving any part/xpart from this node ? */
    if (k != nodeID && counts[ind_recv] > 0) {
      if (MPI_Irecv(&parts_new[offset_recv], counts[ind_recv], part_mpi_type, k,
                    3 * ind_recv + 0, MPI_COMM_WORLD,
                    &reqs[6 * k + 3]) != MPI_SUCCESS)
        error("Failed to emit irecv of parts from node %i.", k);
      if (MPI_Irecv(&xparts_new[offset_recv], counts[ind_recv], xpart_mpi_type,
                    k, 3 * ind_recv + 1, MPI_COMM_WORLD,
                    &reqs[6 * k + 4]) != MPI_SUCCESS)
        error("Failed to emit irecv of xparts from node %i.", k);
      offset_recv += counts[ind_recv];
    }

    /* Are we receiving any gpart from this node ? */
    if (k != nodeID && g_counts[ind_recv] > 0) {
      if (MPI_Irecv(&gparts_new[g_offset_recv], g_counts[ind_recv],
                    gpart_mpi_type, k, 3 * ind_recv + 2, MPI_COMM_WORLD,
                    &reqs[6 * k + 5]) != MPI_SUCCESS)
        error("Failed to emit irecv of gparts from node %i.", k);
      g_offset_recv += g_counts[ind_recv];
    }
  }

  /* Wait for all the sends and recvs to tumble in. */
  MPI_Status stats[6 * nr_nodes];
  int res;
  if ((res = MPI_Waitall(6 * nr_nodes, reqs, stats)) != MPI_SUCCESS) {
    for (int k = 0; k < 6 * nr_nodes; k++) {
      char buff[MPI_MAX_ERROR_STRING];
      MPI_Error_string(stats[k].MPI_ERROR, buff, &res);
      message("request %i has error '%s'.", k, buff);
    }
    error("Failed during waitall for part data.");
  }

  /* We now need to restore the part<->gpart links */
  size_t offset_parts = 0, offset_gparts = 0;
  for (int node = 0; node < nr_nodes; ++node) {

    const int ind_recv = node * nr_nodes + nodeID;
    const size_t count_parts = counts[ind_recv];
    const size_t count_gparts = g_counts[ind_recv];

    /* Loop over the gparts received from that node */
    for (size_t k = offset_gparts; k < offset_gparts + count_gparts; ++k) {

      /* Does this gpart have a partner ? */
      if (gparts_new[k].id_or_neg_offset <= 0) {

        const ptrdiff_t partner_index =
            offset_parts - gparts_new[k].id_or_neg_offset;

        /* Re-link */
        gparts_new[k].id_or_neg_offset = -partner_index;
        parts_new[partner_index].gpart = &gparts_new[k];
      }
    }

    offset_parts += count_parts;
    offset_gparts += count_gparts;
  }

#ifdef SWIFT_DEBUG_CHECKS
  /* Verify that all parts are in the right place. */
  for (size_t k = 0; k < nr_parts; k++) {
    const int cid = cell_getid(cdim, parts_new[k].x[0] * iwidth[0],
                               parts_new[k].x[1] * iwidth[1],
                               parts_new[k].x[2] * iwidth[2]);
    if (cells[cid].nodeID != nodeID)
      error("Received particle (%zu) that does not belong here (nodeID=%i).", k,
            cells[cid].nodeID);
  }

  /* Verify that the links are correct */
  for (size_t k = 0; k < nr_gparts; ++k) {

    if (gparts_new[k].id_or_neg_offset <= 0) {

      struct part *part = &parts_new[-gparts_new[k].id_or_neg_offset];

      if (part->gpart != &gparts_new[k]) error("Linking problem !");

      if (gparts_new[k].x[0] != part->x[0] ||
          gparts_new[k].x[1] != part->x[1] || gparts_new[k].x[2] != part->x[2])
        error("Linked particles are not at the same position !");
    }
  }
  for (size_t k = 0; k < nr_parts; ++k) {

    if (parts_new[k].gpart != NULL &&
        parts_new[k].gpart->id_or_neg_offset != -(ptrdiff_t)k) {
      error("Linking problem !");
    }
  }
#endif

  /* Set the new part data, free the old. */
  free(parts);
  free(xparts);
  free(gparts);
  s->parts = parts_new;
  s->xparts = xparts_new;
  s->gparts = gparts_new;
  s->nr_parts = nr_parts;
  s->nr_gparts = nr_gparts;
  s->size_parts = engine_redistribute_alloc_margin * nr_parts;
  s->size_gparts = engine_redistribute_alloc_margin * nr_gparts;

  /* Clean up the temporary stuff. Note that g_counts and g_dest were
     allocated above as well and must be freed to avoid a leak. */
  free(reqs);
  free(counts);
  free(g_counts);
  free(dest);
  free(g_dest);

  /* Be verbose about what just happened. */
  if (e->verbose) {
    int my_cells = 0;
    for (int k = 0; k < nr_cells; k++)
      if (cells[k].nodeID == nodeID) my_cells += 1;
    message("node %i now has %zu parts and %zu gparts in %i cells.", nodeID,
            nr_parts, nr_gparts, my_cells);
  }

  if (e->verbose)
    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
            clocks_getunit());
#else
  error("SWIFT was not compiled with MPI support.");
#endif
}

/**
 * @brief Repartition the cells amongst the nodes.
 *
 * @param e The #engine.
 */
void engine_repartition(struct engine *e) {

#if defined(WITH_MPI) && defined(HAVE_METIS)

  ticks tic = getticks();

#ifdef SWIFT_DEBUG_CHECKS
  /* Check that all cells have been drifted to the current time */
  space_check_drift_point(e->s, e->ti_current);
#endif

  /* Clear the repartition flag. */
  enum repartition_type reparttype = e->forcerepart;
  e->forcerepart = REPART_NONE;

  /* Nothing to do if only using a single node. Also avoids METIS
   * bug that doesn't handle this case well. */
  if (e->nr_nodes == 1) return;

  /* Do the repartitioning. */
  partition_repartition(reparttype, e->nodeID, e->nr_nodes, e->s,
                        e->sched.tasks, e->sched.nr_tasks);

  /* Now comes the tricky part: Exchange particles between all nodes.
     This is done in two steps, first allreducing a matrix of
     how many particles go from where to where, then re-allocating
     the parts array, and emitting the sends and receives.
     Finally, the space, tasks, and proxies need to be rebuilt. */

  /* Redistribute the particles between the nodes. */
  engine_redistribute(e);

  /* Make the proxies. */
  engine_makeproxies(e);

  /* Tell the engine it should re-build whenever possible */
  e->forcerebuild = 1;

  if (e->verbose)
    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
            clocks_getunit());
#else
  error("SWIFT was not compiled with MPI and METIS support.");
#endif
}

/**
 * @brief Add up/down gravity tasks to a cell hierarchy.
 *
 * @param e The #engine.
 * @param c The #cell
 * @param up The upward gravity #task.
 * @param down The downward gravity #task.
 */
void engine_addtasks_grav(struct engine *e, struct cell *c, struct task *up,
                          struct task *down) {

  /* Link the tasks to this cell. */
  c->grav_up = up;
  c->grav_down = down;

  /* Recurse? */
  if (c->split)
    for (int k = 0; k < 8; k++)
      if (c->progeny[k] != NULL)
        engine_addtasks_grav(e, c->progeny[k], up, down);
}

/**
 * @brief Add send tasks to a hierarchy of cells.
 *
 * @param e The #engine.
 * @param ci The sending #cell.
 * @param cj Dummy cell containing the nodeID of the receiving node.
 * @param t_xv The send_xv #task, if it has already been created.
 * @param t_rho The send_rho #task, if it has already been created.
 * @param t_gradient The send_gradient #task, if already created.
 * @param t_ti The send_ti #task, if required and has already been created.
 */
void engine_addtasks_send(struct engine *e, struct cell *ci, struct cell *cj,
                          struct task *t_xv, struct task *t_rho,
                          struct task *t_gradient, struct task *t_ti) {

#ifdef WITH_MPI
  struct link *l = NULL;
  struct scheduler *s = &e->sched;
  const int nodeID = cj->nodeID;

  /* Check if any of the density tasks are for the target node. */
  for (l = ci->density; l != NULL; l = l->next)
    if (l->t->ci->nodeID == nodeID ||
        (l->t->cj != NULL && l->t->cj->nodeID == nodeID))
      break;

  /* If so, attach send tasks. */
  if (l != NULL) {

    /* Create the tasks and their dependencies? */
    if (t_xv == NULL) {
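      /* MPI tags: each cell owns a base tag ci->tag, and the four exchanges
         (xv, rho, tend, gradient) use 4 * tag + 0..3, so the messages for a
         given cell remain distinguishable. */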
      t_xv = scheduler_addtask(s, task_type_send, task_subtype_none,
                               4 * ci->tag, 0, ci, cj, 0);
      t_rho = scheduler_addtask(s, task_type_send, task_subtype_none,
                                4 * ci->tag + 1, 0, ci, cj, 0);
      t_ti = scheduler_addtask(s, task_type_send, task_subtype_tend,
                               4 * ci->tag + 2, 0, ci, cj, 0);
#ifdef EXTRA_HYDRO_LOOP
      t_gradient = scheduler_addtask(s, task_type_send, task_subtype_none,
                                     4 * ci->tag + 3, 0, ci, cj, 0);
#endif

#ifdef EXTRA_HYDRO_LOOP

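      /* The send_gradient task should unlock the super-cell's kick task. */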
      scheduler_addunlock(s, t_gradient, ci->super->kick);

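      /* The send_gradient task depends on the super-cell's extra_ghost task. */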
      scheduler_addunlock(s, ci->super->extra_ghost, t_gradient);

      /* The send_rho task should unlock the super-cell's extra_ghost task. */
      scheduler_addunlock(s, t_rho, ci->super->extra_ghost);

      /* The send_rho task depends on the cell's ghost task. */
      scheduler_addunlock(s, ci->super->ghost, t_rho);

      /* The send_xv task should unlock the super-cell's ghost task. */
      scheduler_addunlock(s, t_xv, ci->super->ghost);

#else
      /* The send_rho task should unlock the super-cell's kick task. */
      scheduler_addunlock(s, t_rho, ci->super->kick);

      /* The send_rho task depends on the cell's ghost task. */
      scheduler_addunlock(s, ci->super->ghost, t_rho);

      /* The send_xv task should unlock the super-cell's ghost task. */
      scheduler_addunlock(s, t_xv, ci->super->ghost);
#endif

      /* The super-cell's kick task should unlock the send_ti task. */
      if (t_ti != NULL) scheduler_addunlock(s, ci->super->kick, t_ti);
    }

    /* Add them to the local cell. */
    engine_addlink(e, &ci->send_xv, t_xv);
    engine_addlink(e, &ci->send_rho, t_rho);
#ifdef EXTRA_HYDRO_LOOP
    engine_addlink(e, &ci->send_gradient, t_gradient);
#endif
    if (t_ti != NULL) engine_addlink(e, &ci->send_ti, t_ti);
  }

  /* Recurse? */
  if (ci->split)
    for (int k = 0; k < 8; k++)
      if (ci->progeny[k] != NULL)
        engine_addtasks_send(e, ci->progeny[k], cj, t_xv, t_rho, t_gradient,
                             t_ti);

#else
  error("SWIFT was not compiled with MPI support.");
#endif
}

/**
 * @brief Add recv tasks to a hierarchy of cells.
 *
 * @param e The #engine.
 * @param c The foreign #cell.
 * @param t_xv The recv_xv #task, if it has already been created.
 * @param t_rho The recv_rho #task, if it has already been created.
 * @param t_gradient The recv_gradient #task, if it has already been created.
 * @param t_ti The recv_ti #task, if required and has already been created.
 */
void engine_addtasks_recv(struct engine *e, struct cell *c, struct task *t_xv,
                          struct task *t_rho, struct task *t_gradient,
                          struct task *t_ti) {

#ifdef WITH_MPI
  struct scheduler *s = &e->sched;

  /* Do we need to construct a recv task?
     Note that since c is a foreign cell, all its density tasks will involve
     only the current rank, and thus we don't have to check them.*/
  if (t_xv == NULL && c->density != NULL) {

    /* Create the tasks. */
    t_xv = scheduler_addtask(s, task_type_recv, task_subtype_none, 4 * c->tag,
                             0, c, NULL, 0);
    t_rho = scheduler_addtask(s, task_type_recv, task_subtype_none,
                              4 * c->tag + 1, 0, c, NULL, 0);
    t_ti = scheduler_addtask(s, task_type_recv, task_subtype_tend,
                             4 * c->tag + 2, 0, c, NULL, 0);
#ifdef EXTRA_HYDRO_LOOP
    t_gradient = scheduler_addtask(s, task_type_recv, task_subtype_none,
                                   4 * c->tag + 3, 0, c, NULL, 0);
#endif
  }
  c->recv_xv = t_xv;
  c->recv_rho = t_rho;
  c->recv_gradient = t_gradient;
  c->recv_ti = t_ti;

/* Add dependencies. */
#ifdef EXTRA_HYDRO_LOOP
  for (struct link *l = c->density; l != NULL; l = l->next) {
    scheduler_addunlock(s, t_xv, l->t);
    scheduler_addunlock(s, l->t, t_rho);
  }
  for (struct link *l = c->gradient; l != NULL; l = l->next) {
    scheduler_addunlock(s, t_rho, l->t);
    scheduler_addunlock(s, l->t, t_gradient);
  }
  for (struct link *l = c->force; l != NULL; l = l->next) {
    scheduler_addunlock(s, t_gradient, l->t);
    if (t_ti != NULL) scheduler_addunlock(s, l->t, t_ti);
  }
  if (c->sorts != NULL) scheduler_addunlock(s, t_xv, c->sorts);
#else
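  /* Without the extra hydro loop the chain per interaction task is:
     recv_xv -> density -> recv_rho -> force -> recv_ti, so the data always
     arrive on the foreign cell before the tasks that consume them run. */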
  for (struct link *l = c->density; l != NULL; l = l->next) {
    scheduler_addunlock(s, t_xv, l->t);
    scheduler_addunlock(s, l->t, t_rho);
  }
  for (struct link *l = c->force; l != NULL; l = l->next) {
    scheduler_addunlock(s, t_rho, l->t);
    if (t_ti != NULL) scheduler_addunlock(s, l->t, t_ti);
  }
  if (c->sorts != NULL) scheduler_addunlock(s, t_xv, c->sorts);
#endif

  /* Recurse? */
  if (c->split)
    for (int k = 0; k < 8; k++)
      if (c->progeny[k] != NULL)
        engine_addtasks_recv(e, c->progeny[k], t_xv, t_rho, t_gradient, t_ti);

#else
  error("SWIFT was not compiled with MPI support.");
#endif
}

/**
 * @brief Exchange cell structures with other nodes.
 *
 * @param e The #engine.
 */
void engine_exchange_cells(struct engine *e) {

#ifdef WITH_MPI

  struct space *s = e->s;
  struct cell *cells = s->cells_top;
  const int nr_cells = s->nr_cells;
  const int nr_proxies = e->nr_proxies;
  int offset[nr_cells];
  MPI_Request reqs_in[engine_maxproxies];
  MPI_Request reqs_out[engine_maxproxies];
  MPI_Status status;
  ticks tic = getticks();

  /* Run through the cells and get the size of the ones that will be sent off.
   */
  int count_out = 0;
  for (int k = 0; k < nr_cells; k++) {
    offset[k] = count_out;
    if (cells[k].sendto)
      count_out += (cells[k].pcell_size = cell_getsize(&cells[k]));
  }

  /* Allocate the pcells. */
  struct pcell *pcells;
  if ((pcells = (struct pcell *)malloc(sizeof(struct pcell) * count_out)) ==
      NULL)
    error("Failed to allocate pcell buffer.");

  /* Pack the cells. */
  cell_next_tag = 0;
  for (int k = 0; k < nr_cells; k++)
    if (cells[k].sendto) {
      cell_pack(&cells[k], &pcells[offset[k]]);
      cells[k].pcell = &pcells[offset[k]];
    }

  /* Launch the proxies. */
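  /* (Two-phase exchange, as suggested by the request names: exch1 posts the
     cell-count messages, and exch2 below pulls in the packed cells once each
     count has arrived.) */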
  for (int k = 0; k < nr_proxies; k++) {
    proxy_cells_exch1(&e->proxies[k]);
    reqs_in[k] = e->proxies[k].req_cells_count_in;
    reqs_out[k] = e->proxies[k].req_cells_count_out;
  }

  /* Wait for each count to come in and start the recv. */
  for (int k = 0; k < nr_proxies; k++) {
    int pid = MPI_UNDEFINED;
    if (MPI_Waitany(nr_proxies, reqs_in, &pid, &status) != MPI_SUCCESS ||
        pid == MPI_UNDEFINED)
      error("MPI_Waitany failed.");
    // message( "request from proxy %i has arrived." , pid );
    proxy_cells_exch2(&e->proxies[pid]);
  }

  /* Wait for all the sends to have finished too. */
  if (MPI_Waitall(nr_proxies, reqs_out, MPI_STATUSES_IGNORE) != MPI_SUCCESS)
    error("MPI_Waitall on sends failed.");

  /* Set the requests for the cells. */
  for (int k = 0; k < nr_proxies; k++) {
    reqs_in[k] = e->proxies[k].req_cells_in;
    reqs_out[k] = e->proxies[k].req_cells_out;
  }

  /* Wait for each pcell array to come in from the proxies. */
  for (int k = 0; k < nr_proxies; k++)