/*******************************************************************************
 * This file is part of SWIFT.
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
 *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
 *               2015 Peter W. Draper (p.w.draper@durham.ac.uk)
 *                    Angus Lepper (angus.lepper@ed.ac.uk)
 *               2016 John A. Regan (john.a.regan@durham.ac.uk)
 *                    Tom Theuns (tom.theuns@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/

/* Config parameters. */
#include "../config.h"

/* Some standard headers. */
#include <float.h>
#include <limits.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* MPI headers. */
#ifdef WITH_MPI
#include <mpi.h>
#endif

#ifdef HAVE_LIBNUMA
#include <numa.h>
#endif

/* This object's header. */
#include "engine.h"

/* Local headers. */
#include "active.h"
#include "atomic.h"
#include "cell.h"
#include "clocks.h"
#include "cycle.h"
#include "debug.h"
#include "error.h"
#include "gravity.h"
#include "hydro.h"
#include "minmax.h"
#include "parallel_io.h"
#include "part.h"
#include "partition.h"
#include "profiler.h"
#include "proxy.h"
#include "runner.h"
#include "serial_io.h"
#include "single_io.h"
#include "sort_part.h"
#include "statistics.h"
#include "timers.h"
#include "tools.h"
#include "units.h"
#include "version.h"

/* Particle cache size. */
#define CACHE_SIZE 512

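/* Human-readable names for the engine policies; the order is assumed to
 * mirror the engine_policy flags defined in engine.h. Used when reporting
 * which policies were selected. */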
const char *engine_policy_names[] = {"none",
                                     "rand",
                                     "steal",
                                     "keep",
                                     "block",
                                     "cpu_tight",
                                     "mpi",
                                     "numa_affinity",
                                     "hydro",
                                     "self_gravity",
                                     "external_gravity",
                                     "cosmology_integration",
                                     "drift_all",
                                     "reconstruct_mpoles",
                                     "cooling",
                                     "sourceterms",
                                     "stars"};

/** The rank of the engine as a global variable (for messages). */
int engine_rank;

/**
 * @brief Link a density/force task to a cell.
 *
 * @param e The #engine.
 * @param l A pointer to the #link, will be modified atomically.
 * @param t The #task.
 */
void engine_addlink(struct engine *e, struct link **l, struct task *t) {

  /* Get the next free link. */
  const int ind = atomic_inc(&e->nr_links);
  if (ind >= e->size_links) {
    error("Link table overflow.");
  }
  struct link *res = &e->links[ind];

  /* Set it atomically. */
  res->t = t;
  res->next = atomic_swap(l, res);
}

/**
 * @brief Recursively add non-implicit ghost tasks to a cell hierarchy.
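 *
 * The new ghost tasks are unlocked by ghost_in and in turn unlock ghost_out,
 * so dependencies can be attached to the pair bracketing the whole hierarchy.
 *
 * @param e The #engine.
 * @param c The #cell to recurse into.
 * @param ghost_in The (implicit) task marking the entry of the ghost sub-graph.
 * @param ghost_out The (implicit) task marking the exit of the ghost sub-graph.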
 */
void engine_add_ghosts(struct engine *e, struct cell *c, struct task *ghost_in,
                       struct task *ghost_out) {
  if (!c->split || c->count < engine_max_parts_per_ghost) {
    struct scheduler *s = &e->sched;
    c->ghost =
        scheduler_addtask(s, task_type_ghost, task_subtype_none, 0, 0, c, NULL);
    scheduler_addunlock(s, ghost_in, c->ghost);
    scheduler_addunlock(s, c->ghost, ghost_out);
  } else {
    for (int k = 0; k < 8; k++)
      if (c->progeny[k] != NULL)
        engine_add_ghosts(e, c->progeny[k], ghost_in, ghost_out);
  }
}

/**
 * @brief Generate the hydro hierarchical tasks for a hierarchy of cells -
 * i.e. all the O(Npart) tasks.
 *
 * Tasks are only created here. The dependencies will be added later on.
 *
 * Note that there is no need to recurse below the super-cell.
 *
 * @param e The #engine.
 * @param c The #cell.
 */
void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {

  struct scheduler *s = &e->sched;
  const int periodic = e->s->periodic;
  const int is_hydro = (e->policy & engine_policy_hydro);
  const int is_self_gravity = (e->policy & engine_policy_self_gravity);
  const int is_with_cooling = (e->policy & engine_policy_cooling);
  const int is_with_sourceterms = (e->policy & engine_policy_sourceterms);

  /* Are we in a super-cell ? */
  if (c->super == c) {

    /* Add the sort task. */
    c->sorts =
        scheduler_addtask(s, task_type_sort, task_subtype_none, 0, 0, c, NULL);

    /* Local tasks only... */
    if (c->nodeID == e->nodeID) {

      /* Add the drift task. */
      c->drift_part = scheduler_addtask(s, task_type_drift_part,
                                        task_subtype_none, 0, 0, c, NULL);

      /* Add the two half kicks */
      c->kick1 = scheduler_addtask(s, task_type_kick1, task_subtype_none, 0, 0,
                                   c, NULL);

      c->kick2 = scheduler_addtask(s, task_type_kick2, task_subtype_none, 0, 0,
                                   c, NULL);

      /* Add the time-step calculation task and its dependency */
      c->timestep = scheduler_addtask(s, task_type_timestep, task_subtype_none,
                                      0, 0, c, NULL);

      scheduler_addunlock(s, c->kick2, c->timestep);
      scheduler_addunlock(s, c->timestep, c->kick1);

      if (is_self_gravity) {

        /* Initialisation of the multipoles */
        c->init_grav = scheduler_addtask(s, task_type_init_grav,
                                         task_subtype_none, 0, 0, c, NULL);

        /* Gravity non-neighbouring pm calculations */
        c->grav_long_range = scheduler_addtask(
            s, task_type_grav_long_range, task_subtype_none, 0, 0, c, NULL);

        /* Gravity recursive down-pass */
        c->grav_down = scheduler_addtask(s, task_type_grav_down,
                                         task_subtype_none, 0, 0, c, NULL);

        if (periodic) scheduler_addunlock(s, c->init_grav, c->grav_ghost[0]);
        scheduler_addunlock(s, c->init_grav, c->grav_long_range);
        scheduler_addunlock(s, c->grav_long_range, c->grav_down);
        scheduler_addunlock(s, c->grav_down, c->kick2);
      }

      /* Generate the ghost tasks. */
      if (is_hydro) {
        c->ghost_in =
            scheduler_addtask(s, task_type_ghost, task_subtype_none, 0,
                              /* implicit = */ 1, c, NULL);
        c->ghost_out =
            scheduler_addtask(s, task_type_ghost, task_subtype_none, 0,
                              /* implicit = */ 1, c, NULL);
        engine_add_ghosts(e, c, c->ghost_in, c->ghost_out);
      }

#ifdef EXTRA_HYDRO_LOOP
      /* Generate the extra ghost task. */
      if (is_hydro)
        c->extra_ghost = scheduler_addtask(s, task_type_extra_ghost,
                                           task_subtype_none, 0, 0, c, NULL);
#endif

      /* Cooling task */
      if (is_with_cooling) {
        c->cooling = scheduler_addtask(s, task_type_cooling, task_subtype_none,
                                       0, 0, c, NULL);

        scheduler_addunlock(s, c->cooling, c->kick2);
      }

      /* add source terms */
      if (is_with_sourceterms) {
        c->sourceterms = scheduler_addtask(s, task_type_sourceterms,
                                           task_subtype_none, 0, 0, c, NULL);
      }
    }

  } else { /* We are above the super-cell so need to go deeper */

#ifdef SWIFT_DEBUG_CHECKS
    if (c->super != NULL) error("Incorrectly set super pointer");
#endif

    /* Recurse. */
    if (c->split)
      for (int k = 0; k < 8; k++)
        if (c->progeny[k] != NULL)
          engine_make_hierarchical_tasks(e, c->progeny[k]);
  }
}

#ifdef WITH_MPI
/**
 * Do the exchange of one type of particles with all the other nodes.
 *
 * @param counts 2D array with the counts of particles to exchange with
 *               each other node.
 * @param parts the particle data to exchange
 * @param new_nr_parts the number of particles this node will have after all
 *                     exchanges have completed.
 * @param sizeofparts sizeof the particle struct.
 * @param alignsize the memory alignment required for this particle type.
 * @param mpi_type the MPI_Datatype for these particles.
 * @param nr_nodes the number of nodes to exchange with.
 * @param nodeID the id of this node.
 *
 * @result new particle data constructed from all the exchanges with the
 *         given alignment.
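 *
 * Note: engine_redistribute() below calls this helper once per particle type
 * (parts, xparts, gparts and sparts), each with its own counts array.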
 */
static void *engine_do_redistribute(int *counts, char *parts,
                                    size_t new_nr_parts, size_t sizeofparts,
                                    size_t alignsize, MPI_Datatype mpi_type,
                                    int nr_nodes, int nodeID) {

  /* Allocate a new particle array with some extra margin */
  char *parts_new = NULL;
  if (posix_memalign(
          (void **)&parts_new, alignsize,
          sizeofparts * new_nr_parts * engine_redistribute_alloc_margin) != 0)
    error("Failed to allocate new particle data.");

  /* Prepare MPI requests for the asynchronous communications */
  MPI_Request *reqs;
  if ((reqs = (MPI_Request *)malloc(sizeof(MPI_Request) * 2 * nr_nodes)) ==
      NULL)
    error("Failed to allocate MPI request list.");

  /* Send and receive at most "chunk" particles per request, so we may need to
   * loop several times here. chunk = INT_MAX / sizeofparts keeps each message
   * below ~2GB, the limit implied by MPI's int-typed counts. */
  const int chunk = INT_MAX / sizeofparts;
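  /* Number of particles already sent to / received from each node in earlier
   * passes of the chunk loop below. */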
  int sent = 0;
  int recvd = 0;

  int activenodes = 1;
  while (activenodes) {

    for (int k = 0; k < 2 * nr_nodes; k++) reqs[k] = MPI_REQUEST_NULL;

    /* Emit the sends and recvs for the data. */
    size_t offset_send = sent;
    size_t offset_recv = recvd;
    activenodes = 0;

    for (int k = 0; k < nr_nodes; k++) {

      /* Indices in the count arrays of the node of interest */
      const int ind_send = nodeID * nr_nodes + k;
      const int ind_recv = k * nr_nodes + nodeID;

      /* Are we sending any data this loop? */
      int sending = counts[ind_send] - sent;
      if (sending > 0) {
        activenodes++;
        if (sending > chunk) sending = chunk;

        /* If the send and receive is local then just copy. */
        if (k == nodeID) {
          int receiving = counts[ind_recv] - recvd;
          if (receiving > chunk) receiving = chunk;
          memcpy(&parts_new[offset_recv * sizeofparts],
                 &parts[offset_send * sizeofparts], sizeofparts * receiving);
        } else {
          /* Otherwise send it. */
          int res =
              MPI_Isend(&parts[offset_send * sizeofparts], sending, mpi_type, k,
                        ind_send, MPI_COMM_WORLD, &reqs[2 * k + 0]);
          if (res != MPI_SUCCESS)
            mpi_error(res, "Failed to isend parts to node %i.", k);
        }
      }

      /* If we're sending to this node, then move past it to next. */
      if (counts[ind_send] > 0) offset_send += counts[ind_send];

      /* Are we receiving any data from this node? Note already done if coming
       * from this node. */
      if (k != nodeID) {
        int receiving = counts[ind_recv] - recvd;
        if (receiving > 0) {
          activenodes++;
          if (receiving > chunk) receiving = chunk;
          int res = MPI_Irecv(&parts_new[offset_recv * sizeofparts], receiving,
                              mpi_type, k, ind_recv, MPI_COMM_WORLD,
                              &reqs[2 * k + 1]);
          if (res != MPI_SUCCESS)
            mpi_error(res, "Failed to emit irecv of parts from node %i.", k);
        }
      }

      /* If we're receiving from this node, then move past it to next. */
      if (counts[ind_recv] > 0) offset_recv += counts[ind_recv];
    }

    /* Wait for all the sends and recvs to tumble in. */
    MPI_Status stats[2 * nr_nodes];
    int res;
    if ((res = MPI_Waitall(2 * nr_nodes, reqs, stats)) != MPI_SUCCESS) {
      for (int k = 0; k < 2 * nr_nodes; k++) {
        char buff[MPI_MAX_ERROR_STRING];
        MPI_Error_string(stats[k].MPI_ERROR, buff, &res);
        message("request from source %i, tag %i has error '%s'.",
                stats[k].MPI_SOURCE, stats[k].MPI_TAG, buff);
      }
      error("Failed during waitall for part data.");
    }

    /* Move to next chunks. */
    sent += chunk;
    recvd += chunk;
  }

  /* Free temps. */
  free(reqs);

  /* And return new memory. */
  return parts_new;
}
#endif

/**
 * @brief Redistribute the particles amongst the nodes according
 *      to their cell's node IDs.
 *
 * The strategy here is as follows:
 * 1) Each node counts the number of particles it has to send to each other
 * node.
 * 2) The number of particles of each type is then exchanged.
 * 3) The particles to send are placed in a temporary buffer in which the
 * part-gpart links are preserved.
 * 4) Each node allocates enough space for the new particles.
 * 5) (Asynchronous) communications are issued to transfer the data.
 *
 * @param e The #engine.
 */
void engine_redistribute(struct engine *e) {

#ifdef WITH_MPI

  const int nr_nodes = e->nr_nodes;
  const int nodeID = e->nodeID;
  struct space *s = e->s;
  struct cell *cells = s->cells_top;
  const int nr_cells = s->nr_cells;
  const int *cdim = s->cdim;
  const double iwidth[3] = {s->iwidth[0], s->iwidth[1], s->iwidth[2]};
  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
  struct part *parts = s->parts;
  struct gpart *gparts = s->gparts;
  struct spart *sparts = s->sparts;
  ticks tic = getticks();

  /* Allocate temporary arrays to store the counts of particles to be sent
   * and the destination of each particle */
  int *counts;
  if ((counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
    error("Failed to allocate counts temporary buffer.");
  bzero(counts, sizeof(int) * nr_nodes * nr_nodes);

  int *dest;
  if ((dest = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL)
    error("Failed to allocate dest temporary buffer.");

  /* Get destination of each particle */
  for (size_t k = 0; k < s->nr_parts; k++) {

    /* Periodic boundary conditions */
    for (int j = 0; j < 3; j++) {
      if (parts[k].x[j] < 0.0)
        parts[k].x[j] += dim[j];
      else if (parts[k].x[j] >= dim[j])
        parts[k].x[j] -= dim[j];
    }
    const int cid =
        cell_getid(cdim, parts[k].x[0] * iwidth[0], parts[k].x[1] * iwidth[1],
                   parts[k].x[2] * iwidth[2]);
#ifdef SWIFT_DEBUG_CHECKS
    if (cid < 0 || cid >= s->nr_cells)
      error("Bad cell id %i for part %zu at [%.3e,%.3e,%.3e].", cid, k,
            parts[k].x[0], parts[k].x[1], parts[k].x[2]);
#endif

    dest[k] = cells[cid].nodeID;

    /* The counts array is indexed as count[from * nr_nodes + to]. */
    counts[nodeID * nr_nodes + dest[k]] += 1;
  }

  /* Sort the particles according to their cell index. */
  if (s->nr_parts > 0)
    space_parts_sort(s, dest, s->nr_parts, 0, nr_nodes - 1, e->verbose);

#ifdef SWIFT_DEBUG_CHECKS
  /* Verify that the parts have been sorted correctly. */
  for (size_t k = 0; k < s->nr_parts; k++) {
    const struct part *p = &s->parts[k];

    /* New cell index */
    const int new_cid =
        cell_getid(s->cdim, p->x[0] * s->iwidth[0], p->x[1] * s->iwidth[1],
                   p->x[2] * s->iwidth[2]);

    /* New cell of this part */
    const struct cell *c = &s->cells_top[new_cid];
    const int new_node = c->nodeID;

    if (dest[k] != new_node)
      error("part's new node index not matching sorted index.");

    if (p->x[0] < c->loc[0] || p->x[0] > c->loc[0] + c->width[0] ||
        p->x[1] < c->loc[1] || p->x[1] > c->loc[1] + c->width[1] ||
        p->x[2] < c->loc[2] || p->x[2] > c->loc[2] + c->width[2])
      error("part not sorted into the right top-level cell!");
  }
#endif

  /* We need to re-link the gpart partners of parts. */
  if (s->nr_parts > 0) {
    int current_dest = dest[0];
    size_t count_this_dest = 0;
    for (size_t k = 0; k < s->nr_parts; ++k) {
      if (s->parts[k].gpart != NULL) {

        /* As the addresses will be invalidated by the communications, we will
         * instead store the absolute index from the start of the sub-array of
         * particles to be sent to a given node.
         * Recall that gparts without partners have a positive id.
         * We will restore the pointers on the receiving node later on. */
        if (dest[k] != current_dest) {
          current_dest = dest[k];
          count_this_dest = 0;
        }

#ifdef SWIFT_DEBUG_CHECKS
        if (s->parts[k].gpart->id_or_neg_offset > 0)
          error("Trying to link a partnerless gpart !");
#endif

        s->parts[k].gpart->id_or_neg_offset = -count_this_dest;
        count_this_dest++;
      }
    }
  }
  free(dest);

  /* Get destination of each s-particle */
  int *s_counts;
  if ((s_counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
    error("Failed to allocate s_counts temporary buffer.");
  bzero(s_counts, sizeof(int) * nr_nodes * nr_nodes);

  int *s_dest;
  if ((s_dest = (int *)malloc(sizeof(int) * s->nr_sparts)) == NULL)
    error("Failed to allocate s_dest temporary buffer.");

  for (size_t k = 0; k < s->nr_sparts; k++) {

    /* Periodic boundary conditions */
    for (int j = 0; j < 3; j++) {
      if (sparts[k].x[j] < 0.0)
        sparts[k].x[j] += dim[j];
      else if (sparts[k].x[j] >= dim[j])
        sparts[k].x[j] -= dim[j];
    }
    const int cid =
        cell_getid(cdim, sparts[k].x[0] * iwidth[0], sparts[k].x[1] * iwidth[1],
                   sparts[k].x[2] * iwidth[2]);
#ifdef SWIFT_DEBUG_CHECKS
    if (cid < 0 || cid >= s->nr_cells)
      error("Bad cell id %i for spart %zu at [%.3e,%.3e,%.3e].", cid, k,
            sparts[k].x[0], sparts[k].x[1], sparts[k].x[2]);
#endif

    s_dest[k] = cells[cid].nodeID;

    /* The counts array is indexed as count[from * nr_nodes + to]. */
    s_counts[nodeID * nr_nodes + s_dest[k]] += 1;
  }

  /* Sort the s-particles according to their cell index. */
  if (s->nr_sparts > 0)
    space_sparts_sort(s, s_dest, s->nr_sparts, 0, nr_nodes - 1, e->verbose);

#ifdef SWIFT_DEBUG_CHECKS
  /* Verify that the sparts have been sorted correctly. */
  for (size_t k = 0; k < s->nr_sparts; k++) {
    const struct spart *sp = &s->sparts[k];

    /* New cell index */
    const int new_cid =
        cell_getid(s->cdim, sp->x[0] * s->iwidth[0], sp->x[1] * s->iwidth[1],
                   sp->x[2] * s->iwidth[2]);

    /* New cell of this spart */
    const struct cell *c = &s->cells_top[new_cid];
    const int new_node = c->nodeID;

    if (s_dest[k] != new_node)
      error("spart's new node index not matching sorted index.");

    if (sp->x[0] < c->loc[0] || sp->x[0] > c->loc[0] + c->width[0] ||
        sp->x[1] < c->loc[1] || sp->x[1] > c->loc[1] + c->width[1] ||
        sp->x[2] < c->loc[2] || sp->x[2] > c->loc[2] + c->width[2])
      error("spart not sorted into the right top-level cell!");
  }
#endif

  /* We need to re-link the gpart partners of sparts. */
  if (s->nr_sparts > 0) {
    int current_dest = s_dest[0];
    size_t count_this_dest = 0;
    for (size_t k = 0; k < s->nr_sparts; ++k) {
      if (s->sparts[k].gpart != NULL) {

        /* As the addresses will be invalidated by the communications, we will
         * instead store the absolute index from the start of the sub-array of
         * particles to be sent to a given node.
         * Recall that gparts without partners have a positive id.
         * We will restore the pointers on the receiving node later on. */
        if (s_dest[k] != current_dest) {
          current_dest = s_dest[k];
          count_this_dest = 0;
        }

#ifdef SWIFT_DEBUG_CHECKS
        if (s->sparts[k].gpart->id_or_neg_offset > 0)
          error("Trying to link a partnerless gpart !");
#endif

        s->sparts[k].gpart->id_or_neg_offset = -count_this_dest;
        count_this_dest++;
      }
    }
  }

  free(s_dest);

  /* Get destination of each g-particle */
  int *g_counts;
  if ((g_counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
    error("Failed to allocate g_gcount temporary buffer.");
  bzero(g_counts, sizeof(int) * nr_nodes * nr_nodes);

  int *g_dest;
  if ((g_dest = (int *)malloc(sizeof(int) * s->nr_gparts)) == NULL)
    error("Failed to allocate g_dest temporary buffer.");

  for (size_t k = 0; k < s->nr_gparts; k++) {

    /* Periodic boundary conditions */
    for (int j = 0; j < 3; j++) {
      if (gparts[k].x[j] < 0.0)
        gparts[k].x[j] += dim[j];
      else if (gparts[k].x[j] >= dim[j])
        gparts[k].x[j] -= dim[j];
    }
    const int cid =
        cell_getid(cdim, gparts[k].x[0] * iwidth[0], gparts[k].x[1] * iwidth[1],
                   gparts[k].x[2] * iwidth[2]);
#ifdef SWIFT_DEBUG_CHECKS
    if (cid < 0 || cid >= s->nr_cells)
      error("Bad cell id %i for gpart %zu at [%.3e,%.3e,%.3e].", cid, k,
            gparts[k].x[0], gparts[k].x[1], gparts[k].x[2]);
#endif

    g_dest[k] = cells[cid].nodeID;

    /* The counts array is indexed as count[from * nr_nodes + to]. */
    g_counts[nodeID * nr_nodes + g_dest[k]] += 1;
  }

  /* Sort the g-particles according to their cell index. */
  if (s->nr_gparts > 0)
    space_gparts_sort(s, g_dest, s->nr_gparts, 0, nr_nodes - 1, e->verbose);

#ifdef SWIFT_DEBUG_CHECKS
  /* Verify that the gparts have been sorted correctly. */
  for (size_t k = 0; k < s->nr_gparts; k++) {
    const struct gpart *gp = &s->gparts[k];

    /* New cell index */
    const int new_cid =
        cell_getid(s->cdim, gp->x[0] * s->iwidth[0], gp->x[1] * s->iwidth[1],
                   gp->x[2] * s->iwidth[2]);

    /* New cell of this gpart */
    const struct cell *c = &s->cells_top[new_cid];
    const int new_node = c->nodeID;

    if (g_dest[k] != new_node)
      error("gpart's new node index not matching sorted index (%d != %d).",
            g_dest[k], new_node);

    if (gp->x[0] < c->loc[0] || gp->x[0] > c->loc[0] + c->width[0] ||
        gp->x[1] < c->loc[1] || gp->x[1] > c->loc[1] + c->width[1] ||
        gp->x[2] < c->loc[2] || gp->x[2] > c->loc[2] + c->width[2])
      error("gpart not sorted into the right top-level cell!");
  }
#endif

  free(g_dest);

  /* Get all the counts from all the nodes. */
  if (MPI_Allreduce(MPI_IN_PLACE, counts, nr_nodes * nr_nodes, MPI_INT, MPI_SUM,
                    MPI_COMM_WORLD) != MPI_SUCCESS)
    error("Failed to allreduce particle transfer counts.");

  /* Get all the g_counts from all the nodes. */
  if (MPI_Allreduce(MPI_IN_PLACE, g_counts, nr_nodes * nr_nodes, MPI_INT,
                    MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS)
    error("Failed to allreduce gparticle transfer counts.");

  /* Get all the s_counts from all the nodes. */
  if (MPI_Allreduce(MPI_IN_PLACE, s_counts, nr_nodes * nr_nodes, MPI_INT,
                    MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS)
    error("Failed to allreduce sparticle transfer counts.");

  /* Report how many particles will be moved. */
  if (e->verbose) {
    if (e->nodeID == 0) {
      size_t total = 0, g_total = 0, s_total = 0;
      size_t unmoved = 0, g_unmoved = 0, s_unmoved = 0;
      for (int p = 0, r = 0; p < nr_nodes; p++) {
        for (int s = 0; s < nr_nodes; s++) {
          total += counts[r];
          g_total += g_counts[r];
          s_total += s_counts[r];
          if (p == s) {
            unmoved += counts[r];
            g_unmoved += g_counts[r];
            s_unmoved += s_counts[r];
          }
          r++;
        }
      }
      if (total > 0)
        message("%ld of %ld (%.2f%%) of particles moved", total - unmoved,
                total, 100.0 * (double)(total - unmoved) / (double)total);
      if (g_total > 0)
        message("%ld of %ld (%.2f%%) of g-particles moved", g_total - g_unmoved,
                g_total,
                100.0 * (double)(g_total - g_unmoved) / (double)g_total);
      if (s_total > 0)
        message("%ld of %ld (%.2f%%) of s-particles moved", s_total - s_unmoved,
                s_total,
                100.0 * (double)(s_total - s_unmoved) / (double)s_total);
    }
  }

  /* Now each node knows how many parts, sparts and gparts will be transferred
   * to every other node.
   * Get the new numbers of particles for this node. */
  size_t nr_parts = 0, nr_gparts = 0, nr_sparts = 0;
  for (int k = 0; k < nr_nodes; k++) nr_parts += counts[k * nr_nodes + nodeID];
  for (int k = 0; k < nr_nodes; k++)
    nr_gparts += g_counts[k * nr_nodes + nodeID];
  for (int k = 0; k < nr_nodes; k++)
    nr_sparts += s_counts[k * nr_nodes + nodeID];

  /* Now exchange the particles, type by type to keep the memory required
   * under control. */

  /* SPH particles. */
  void *new_parts = engine_do_redistribute(counts, (char *)s->parts, nr_parts,
                                           sizeof(struct part), part_align,
                                           part_mpi_type, nr_nodes, nodeID);
  free(s->parts);
  s->parts = (struct part *)new_parts;
  s->nr_parts = nr_parts;
  s->size_parts = engine_redistribute_alloc_margin * nr_parts;

  /* Extra SPH particle properties. */
  new_parts = engine_do_redistribute(counts, (char *)s->xparts, nr_parts,
                                     sizeof(struct xpart), xpart_align,
                                     xpart_mpi_type, nr_nodes, nodeID);
  free(s->xparts);
  s->xparts = (struct xpart *)new_parts;

  /* Gravity particles. */
  new_parts = engine_do_redistribute(g_counts, (char *)s->gparts, nr_gparts,
                                     sizeof(struct gpart), gpart_align,
                                     gpart_mpi_type, nr_nodes, nodeID);
  free(s->gparts);
  s->gparts = (struct gpart *)new_parts;
  s->nr_gparts = nr_gparts;
  s->size_gparts = engine_redistribute_alloc_margin * nr_gparts;

  /* Star particles. */
  new_parts = engine_do_redistribute(s_counts, (char *)s->sparts, nr_sparts,
                                     sizeof(struct spart), spart_align,
                                     spart_mpi_type, nr_nodes, nodeID);
  free(s->sparts);
  s->sparts = (struct spart *)new_parts;
  s->nr_sparts = nr_sparts;
  s->size_sparts = engine_redistribute_alloc_margin * nr_sparts;

  /* All particles have now arrived. Time for some final operations on the
     stuff we just received */

  /* Restore the part<->gpart and spart<->gpart links */
  size_t offset_parts = 0, offset_sparts = 0, offset_gparts = 0;
  for (int node = 0; node < nr_nodes; ++node) {

    const int ind_recv = node * nr_nodes + nodeID;
    const size_t count_parts = counts[ind_recv];
    const size_t count_gparts = g_counts[ind_recv];
    const size_t count_sparts = s_counts[ind_recv];

    /* Loop over the gparts received from that node */
    for (size_t k = offset_gparts; k < offset_gparts + count_gparts; ++k) {

      /* Does this gpart have a gas partner ? */
      if (s->gparts[k].type == swift_type_gas) {

        const ptrdiff_t partner_index =
            offset_parts - s->gparts[k].id_or_neg_offset;

        /* Re-link */
        s->gparts[k].id_or_neg_offset = -partner_index;
        s->parts[partner_index].gpart = &s->gparts[k];
      }

      /* Does this gpart have a star partner ? */
      if (s->gparts[k].type == swift_type_star) {

        const ptrdiff_t partner_index =
            offset_sparts - s->gparts[k].id_or_neg_offset;

        /* Re-link */
        s->gparts[k].id_or_neg_offset = -partner_index;
        s->sparts[partner_index].gpart = &s->gparts[k];
      }
    }

    offset_parts += count_parts;
    offset_gparts += count_gparts;
    offset_sparts += count_sparts;
  }

  /* Clean up the counts now we are done. */
  free(counts);
  free(g_counts);
  free(s_counts);

#ifdef SWIFT_DEBUG_CHECKS
  /* Verify that all parts are in the right place. */
  for (size_t k = 0; k < nr_parts; k++) {
    const int cid =
        cell_getid(cdim, s->parts[k].x[0] * iwidth[0],
                   s->parts[k].x[1] * iwidth[1], s->parts[k].x[2] * iwidth[2]);
    if (cells[cid].nodeID != nodeID)
      error("Received particle (%zu) that does not belong here (nodeID=%i).", k,
            cells[cid].nodeID);
  }
  for (size_t k = 0; k < nr_gparts; k++) {
    const int cid = cell_getid(cdim, s->gparts[k].x[0] * iwidth[0],
                               s->gparts[k].x[1] * iwidth[1],
                               s->gparts[k].x[2] * iwidth[2]);
    if (cells[cid].nodeID != nodeID)
      error("Received g-particle (%zu) that does not belong here (nodeID=%i).",
            k, cells[cid].nodeID);
  }
  for (size_t k = 0; k < nr_sparts; k++) {
    const int cid = cell_getid(cdim, s->sparts[k].x[0] * iwidth[0],
                               s->sparts[k].x[1] * iwidth[1],
                               s->sparts[k].x[2] * iwidth[2]);
    if (cells[cid].nodeID != nodeID)
      error("Received s-particle (%zu) that does not belong here (nodeID=%i).",
            k, cells[cid].nodeID);
  }

  /* Verify that the links are correct */
  part_verify_links(s->parts, s->gparts, s->sparts, nr_parts, nr_gparts,
                    nr_sparts, e->verbose);
#endif

  /* Be verbose about what just happened. */
  if (e->verbose) {
    int my_cells = 0;
    for (int k = 0; k < nr_cells; k++)
      if (cells[k].nodeID == nodeID) my_cells += 1;
    message("node %i now has %zu parts, %zu sparts and %zu gparts in %i cells.",
            nodeID, nr_parts, nr_sparts, nr_gparts, my_cells);
  }

  if (e->verbose)
    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
            clocks_getunit());
#else
  error("SWIFT was not compiled with MPI support.");
#endif
}

/**
 * @brief Repartition the cells amongst the nodes.
 *
 * @param e The #engine.
 */
void engine_repartition(struct engine *e) {

#if defined(WITH_MPI) && defined(HAVE_METIS)

  ticks tic = getticks();

#ifdef SWIFT_DEBUG_CHECKS
  /* Be verbose about this. */
  if (e->nodeID == 0 || e->verbose) message("repartitioning space");
  fflush(stdout);

  /* Check that all cells have been drifted to the current time */
  space_check_drift_point(e->s, e->ti_old,
                          e->policy & engine_policy_self_gravity);
#endif

  /* Clear the repartition flag. */
  e->forcerepart = 0;

  /* Nothing to do if only using a single node. Also avoids METIS
   * bug that doesn't handle this case well. */
  if (e->nr_nodes == 1) return;

  /* Do the repartitioning. */
  partition_repartition(e->reparttype, e->nodeID, e->nr_nodes, e->s,
                        e->sched.tasks, e->sched.nr_tasks);

  /* Now comes the tricky part: Exchange particles between all nodes.
     This is done in two steps, first allreducing a matrix of
     how many particles go from where to where, then re-allocating
     the parts array, and emitting the sends and receives.
     Finally, the space, tasks, and proxies need to be rebuilt. */

  /* Redistribute the particles between the nodes. */
  engine_redistribute(e);

  /* Make the proxies. */
  engine_makeproxies(e);

  /* Tell the engine it should re-build whenever possible */
  e->forcerebuild = 1;

  if (e->verbose)
    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
            clocks_getunit());
#else
  if (e->reparttype->type != REPART_NONE)
    error("SWIFT was not compiled with MPI and METIS support.");
#endif
}

/**
 * @brief Decide whether to trigger a repartition of the cells amongst the nodes.
 *
 * @param e The #engine.
 */
void engine_repartition_trigger(struct engine *e) {

#ifdef WITH_MPI

  /* Do nothing if there have not been enough steps since the last
   * repartition, don't want to repeat this too often or immediately after
   * a repartition step. */
  if (e->step - e->last_repartition >= 2) {

    /* Old style if trigger is >1 or this is the second step (want an early
     * repartition following the initial repartition). */
    if (e->reparttype->trigger > 1 || e->step == 2) {
      if (e->reparttype->trigger > 1) {