/*******************************************************************************
 * This file is part of SWIFT.
 * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
 *                    Matthieu Schaller (matthieu.schaller@durham.ac.uk)
 *               2015 Peter W. Draper (p.w.draper@durham.ac.uk)
 *                    Angus Lepper (angus.lepper@ed.ac.uk)
 *               2016 John A. Regan (john.a.regan@durham.ac.uk)
 *                    Tom Theuns (tom.theuns@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/

/* Config parameters. */
#include "../config.h"

/* Some standard headers. */
#include <float.h>
#include <limits.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* MPI headers. */
#ifdef WITH_MPI
#include <mpi.h>
#endif

#ifdef HAVE_LIBNUMA
#include <numa.h>
#endif

/* This object's header. */
#include "engine.h"

/* Local headers. */
#include "active.h"
#include "atomic.h"
#include "cell.h"
#include "clocks.h"
#include "cycle.h"
#include "debug.h"
#include "error.h"
#include "gravity.h"
#include "hydro.h"
#include "map.h"
#include "minmax.h"
#include "parallel_io.h"
#include "part.h"
#include "partition.h"
#include "profiler.h"
#include "proxy.h"
#include "runner.h"
#include "serial_io.h"
#include "single_io.h"
#include "sort_part.h"
#include "statistics.h"
#include "timers.h"
#include "tools.h"
#include "units.h"
#include "version.h"

/* Particle cache size. */
#define CACHE_SIZE 512

const char *engine_policy_names[] = {"none",
                                     "rand",
                                     "steal",
                                     "keep",
                                     "block",
                                     "cpu_tight",
                                     "mpi",
                                     "numa_affinity",
                                     "hydro",
                                     "self_gravity",
                                     "external_gravity",
                                     "cosmology_integration",
                                     "drift_all",
                                     "reconstruct_mpoles",
                                     "cooling",
                                     "sourceterms",
                                     "stars"};

/** The rank of the engine as a global variable (for messages). */
int engine_rank;

/**
 * @brief Data collected from the cells at the end of a time-step
 */
struct end_of_step_data {

  int updates, g_updates, s_updates;
  integertime_t ti_hydro_end_min, ti_hydro_end_max, ti_hydro_beg_max;
  integertime_t ti_gravity_end_min, ti_gravity_end_max, ti_gravity_beg_max;
  struct engine *e;
};

/**
 * @brief Link a density/force task to a cell.
 *
 * @param e The #engine.
 * @param l A pointer to the #link, will be modified atomically.
 * @param t The #task.
 */
void engine_addlink(struct engine *e, struct link **l, struct task *t) {

  /* Get the next free link. */
  const int ind = atomic_inc(&e->nr_links);
  if (ind >= e->size_links) {
    error("Link table overflow.");
  }
  struct link *res = &e->links[ind];

  /* Set it atomically: atomic_swap() stores res in *l and returns the
   * previous head, making this a lock-free prepend to the list. */
  res->t = t;
  res->next = atomic_swap(l, res);
}

/**
 * @brief Recursively add non-implicit ghost tasks to a cell hierarchy.
 *
 * @param e The #engine.
 * @param c The #cell.
 * @param ghost_in The ghost_in #task of the super-cell.
 * @param ghost_out The ghost_out #task of the super-cell.
 */
void engine_add_ghosts(struct engine *e, struct cell *c, struct task *ghost_in,
                       struct task *ghost_out) {

  /* If we have reached the leaf OR have too few particles to play with */
  if (!c->split || c->count < engine_max_parts_per_ghost) {

    /* Add the ghost task and its dependencies */
    struct scheduler *s = &e->sched;
    c->ghost =
        scheduler_addtask(s, task_type_ghost, task_subtype_none, 0, 0, c, NULL);
    scheduler_addunlock(s, ghost_in, c->ghost);
    scheduler_addunlock(s, c->ghost, ghost_out);
  } else {
    /* Keep recursing */
    for (int k = 0; k < 8; k++)
      if (c->progeny[k] != NULL)
        engine_add_ghosts(e, c->progeny[k], ghost_in, ghost_out);
  }
}

/**
 * @brief Generate the common hierarchical tasks for a hierarchy of cells -
 * i.e. all the O(Npart) tasks -- timestep version
 *
 * Tasks are only created here. The dependencies will be added later on.
 *
 * Note that there is no need to recurse below the super-cell. Note also
 * that we only add tasks if the relevant particles are present in the cell.
 *
 * @param e The #engine.
 * @param c The #cell.
 */
void engine_make_hierarchical_tasks_common(struct engine *e, struct cell *c) {

  struct scheduler *s = &e->sched;

  /* Are we in a super-cell ? */
  if (c->super == c) {

    /* Local tasks only... */
    if (c->nodeID == e->nodeID) {

      /* Add the two half kicks */
      c->kick1 = scheduler_addtask(s, task_type_kick1, task_subtype_none, 0, 0,
                                   c, NULL);

      c->kick2 = scheduler_addtask(s, task_type_kick2, task_subtype_none, 0, 0,
                                   c, NULL);

      /* Add the time-step calculation task and its dependency. The chain
       * kick2 -> timestep -> kick1 closes the leapfrog loop: the time-step
       * sizes computed here gate the first kick of the next step. */
      c->timestep = scheduler_addtask(s, task_type_timestep, task_subtype_none,
                                      0, 0, c, NULL);

      scheduler_addunlock(s, c->kick2, c->timestep);
      scheduler_addunlock(s, c->timestep, c->kick1);
    }

  } else { /* We are above the super-cell so need to go deeper */

    /* Recurse. */
    if (c->split)
      for (int k = 0; k < 8; k++)
        if (c->progeny[k] != NULL)
          engine_make_hierarchical_tasks_common(e, c->progeny[k]);
  }
}

/**
 * @brief Generate the hydro hierarchical tasks for a hierarchy of cells -
 * i.e. all the O(Npart) tasks -- hydro version
 *
 * Tasks are only created here. The dependencies will be added later on.
 *
 * Note that there is no need to recurse below the super-cell. Note also
 * that we only add tasks if the relevant particles are present in the cell.
 *
 * @param e The #engine.
 * @param c The #cell.
 */
void engine_make_hierarchical_tasks_hydro(struct engine *e, struct cell *c) {

  struct scheduler *s = &e->sched;
  const int is_with_cooling = (e->policy & engine_policy_cooling);
  const int is_with_sourceterms = (e->policy & engine_policy_sourceterms);

  /* Are we in a super-cell ? */
  if (c->super_hydro == c) {

    /* Add the sort task. */
    c->sorts =
        scheduler_addtask(s, task_type_sort, task_subtype_none, 0, 0, c, NULL);

    /* Local tasks only... */
    if (c->nodeID == e->nodeID) {

      /* Add the drift task. */
      c->drift_part = scheduler_addtask(s, task_type_drift_part,
                                        task_subtype_none, 0, 0, c, NULL);

      /* Generate the ghost tasks. */
      c->ghost_in =
          scheduler_addtask(s, task_type_ghost_in, task_subtype_none, 0,
                            /* implicit = */ 1, c, NULL);
      c->ghost_out =
          scheduler_addtask(s, task_type_ghost_out, task_subtype_none, 0,
                            /* implicit = */ 1, c, NULL);
      engine_add_ghosts(e, c, c->ghost_in, c->ghost_out);
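      /* (ghost_in and ghost_out are implicit tasks: they do no work
       *  themselves and only funnel dependencies into and out of the
       *  per-leaf ghost tasks created by engine_add_ghosts().) */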

#ifdef EXTRA_HYDRO_LOOP
      /* Generate the extra ghost task. */
      c->extra_ghost = scheduler_addtask(s, task_type_extra_ghost,
                                         task_subtype_none, 0, 0, c, NULL);
#endif

      /* Cooling task */
      if (is_with_cooling) {
        c->cooling = scheduler_addtask(s, task_type_cooling, task_subtype_none,
                                       0, 0, c, NULL);

        scheduler_addunlock(s, c->cooling, c->super->kick2);
      }

      /* Add source terms */
      if (is_with_sourceterms) {
        c->sourceterms = scheduler_addtask(s, task_type_sourceterms,
                                           task_subtype_none, 0, 0, c, NULL);
      }
    }

  } else { /* We are above the super-cell so need to go deeper */

    /* Recurse. */
    if (c->split)
      for (int k = 0; k < 8; k++)
        if (c->progeny[k] != NULL)
          engine_make_hierarchical_tasks_hydro(e, c->progeny[k]);
  }
}

/**
 * @brief Generate the gravity hierarchical tasks for a hierarchy of cells -
 * i.e. all the O(Npart) tasks -- gravity version
 *
 * Tasks are only created here. The dependencies will be added later on.
 *
 * Note that there is no need to recurse below the super-cell. Note also
 * that we only add tasks if the relevant particles are present in the cell.
 *
 * @param e The #engine.
 * @param c The #cell.
 */
void engine_make_hierarchical_tasks_gravity(struct engine *e, struct cell *c) {

  struct scheduler *s = &e->sched;
  const int periodic = e->s->periodic;
  const int is_self_gravity = (e->policy & engine_policy_self_gravity);

  /* Are we in a super-cell ? */
  if (c->super_gravity == c) {

    /* Local tasks only... */
    if (c->nodeID == e->nodeID) {

      c->drift_gpart = scheduler_addtask(s, task_type_drift_gpart,
                                         task_subtype_none, 0, 0, c, NULL);

      if (is_self_gravity) {

        /* Initialisation of the multipoles */
        c->init_grav = scheduler_addtask(s, task_type_init_grav,
                                         task_subtype_none, 0, 0, c, NULL);

        /* Gravity non-neighbouring pm calculations */
        c->grav_long_range = scheduler_addtask(
            s, task_type_grav_long_range, task_subtype_none, 0, 0, c, NULL);

        /* Gravity recursive down-pass */
        c->grav_down = scheduler_addtask(s, task_type_grav_down,
                                         task_subtype_none, 0, 0, c, NULL);

        if (periodic) scheduler_addunlock(s, c->init_grav, c->grav_ghost_in);
        if (periodic) scheduler_addunlock(s, c->grav_ghost_out, c->grav_down);
        scheduler_addunlock(s, c->init_grav, c->grav_long_range);
        scheduler_addunlock(s, c->grav_long_range, c->grav_down);
        scheduler_addunlock(s, c->grav_down, c->super->kick2);
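
        /* Note: the grav_ghost_in/out tasks used above are created
         * elsewhere, alongside the periodic gravity tasks, and are only
         * involved when the space is periodic. */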
      }
    }

  } else { /* We are above the super-cell so need to go deeper */

    /* Recurse. */
    if (c->split)
      for (int k = 0; k < 8; k++)
        if (c->progeny[k] != NULL)
          engine_make_hierarchical_tasks_gravity(e, c->progeny[k]);
  }
}

/**
 * @brief Mapper function to construct the hierarchical tasks for a list of
 * top-level cells.
 *
 * @param map_data The list of #cell.
 * @param num_elements The number of cells in the list.
 * @param extra_data Pointer to the #engine.
 */
void engine_make_hierarchical_tasks_mapper(void *map_data, int num_elements,
                                           void *extra_data) {
  struct engine *e = (struct engine *)extra_data;
  const int is_with_hydro = (e->policy & engine_policy_hydro);
  const int is_with_self_gravity = (e->policy & engine_policy_self_gravity);
  const int is_with_external_gravity =
      (e->policy & engine_policy_external_gravity);

  for (int ind = 0; ind < num_elements; ind++) {
    struct cell *c = &((struct cell *)map_data)[ind];
    /* Make the common tasks (time integration) */
    engine_make_hierarchical_tasks_common(e, c);

    /* Add the hydro stuff */
    if (is_with_hydro) engine_make_hierarchical_tasks_hydro(e, c);

    /* And the gravity stuff */
    if (is_with_self_gravity || is_with_external_gravity)
      engine_make_hierarchical_tasks_gravity(e, c);
  }
}

#ifdef WITH_MPI
/**
 * @brief Do the exchange of one type of particles with all the other nodes.
 *
 * @param counts 2D array with the counts of particles to exchange with
 *               each other node.
 * @param parts the particle data to exchange
 * @param new_nr_parts the number of particles this node will have after all
 *                     exchanges have completed.
 * @param sizeofparts sizeof the particle struct.
 * @param alignsize the memory alignment required for this particle type.
 * @param mpi_type the MPI_Datatype for these particles.
 * @param nr_nodes the number of nodes to exchange with.
 * @param nodeID the id of this node.
 *
 * @return new particle data constructed from all the exchanges with the
 *         given alignment.
 */
static void *engine_do_redistribute(int *counts, char *parts,
                                    size_t new_nr_parts, size_t sizeofparts,
                                    size_t alignsize, MPI_Datatype mpi_type,
                                    int nr_nodes, int nodeID) {

  /* Allocate a new particle array with some extra margin */
  char *parts_new = NULL;
  if (posix_memalign(
          (void **)&parts_new, alignsize,
          sizeofparts * new_nr_parts * engine_redistribute_alloc_margin) != 0)
    error("Failed to allocate new particle data.");

  /* Prepare MPI requests for the asynchronous communications */
  MPI_Request *reqs;
  if ((reqs = (MPI_Request *)malloc(sizeof(MPI_Request) * 2 * nr_nodes)) ==
      NULL)
    error("Failed to allocate MPI request list.");

  /* Only send and receive "chunk" particles per request. So we need to
   * loop as many times as necessary here. Using INT_MAX/sizeofparts means
   * each message stays below 2GB. */
  const int chunk = INT_MAX / sizeofparts;
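  /* e.g. for a particle struct of roughly 700 bytes this gives a chunk of
   * about 3 million particles, keeping each message below INT_MAX bytes. */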
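  /* Exchange the data in rounds: each pass posts at most one chunk-sized
   * send and receive per node pair, and we keep looping until no node has
   * data left to move. */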
  int sent = 0;
  int recvd = 0;

  int activenodes = 1;
  while (activenodes) {

    for (int k = 0; k < 2 * nr_nodes; k++) reqs[k] = MPI_REQUEST_NULL;

    /* Emit the sends and recvs for the data. */
    size_t offset_send = sent;
    size_t offset_recv = recvd;
    activenodes = 0;

    for (int k = 0; k < nr_nodes; k++) {

      /* Indices in the count arrays of the node of interest */
      const int ind_send = nodeID * nr_nodes + k;
      const int ind_recv = k * nr_nodes + nodeID;

      /* Are we sending any data this loop? */
      int sending = counts[ind_send] - sent;
      if (sending > 0) {
        activenodes++;
        if (sending > chunk) sending = chunk;

        /* If the send and receive is local then just copy. */
        if (k == nodeID) {
          int receiving = counts[ind_recv] - recvd;
          if (receiving > chunk) receiving = chunk;
          memcpy(&parts_new[offset_recv * sizeofparts],
                 &parts[offset_send * sizeofparts], sizeofparts * receiving);
        } else {
          /* Otherwise send it. */
          int res =
              MPI_Isend(&parts[offset_send * sizeofparts], sending, mpi_type, k,
                        ind_send, MPI_COMM_WORLD, &reqs[2 * k + 0]);
          if (res != MPI_SUCCESS)
            mpi_error(res, "Failed to isend parts to node %i.", k);
        }
      }

      /* If we're sending to this node, then move past it to next. */
      if (counts[ind_send] > 0) offset_send += counts[ind_send];

      /* Are we receiving any data from this node? Note already done if coming
       * from this node. */
      if (k != nodeID) {
        int receiving = counts[ind_recv] - recvd;
        if (receiving > 0) {
          activenodes++;
          if (receiving > chunk) receiving = chunk;
          int res = MPI_Irecv(&parts_new[offset_recv * sizeofparts], receiving,
                              mpi_type, k, ind_recv, MPI_COMM_WORLD,
                              &reqs[2 * k + 1]);
          if (res != MPI_SUCCESS)
            mpi_error(res, "Failed to emit irecv of parts from node %i.", k);
        }
      }

      /* If we're receiving from this node, then move past it to next. */
      if (counts[ind_recv] > 0) offset_recv += counts[ind_recv];
    }

    /* Wait for all the sends and recvs to tumble in. */
    MPI_Status stats[2 * nr_nodes];
    int res;
    if ((res = MPI_Waitall(2 * nr_nodes, reqs, stats)) != MPI_SUCCESS) {
      for (int k = 0; k < 2 * nr_nodes; k++) {
        char buff[MPI_MAX_ERROR_STRING];
        MPI_Error_string(stats[k].MPI_ERROR, buff, &res);
        message("request from source %i, tag %i has error '%s'.",
                stats[k].MPI_SOURCE, stats[k].MPI_TAG, buff);
      }
      error("Failed during waitall for part data.");
    }

    /* Move to next chunks. */
    sent += chunk;
    recvd += chunk;
  }

  /* Free temps. */
  free(reqs);

  /* And return new memory. */
  return parts_new;
}
#endif

/**
 * @brief Redistribute the particles amongst the nodes according
 *        to their cell's node IDs.
 *
 * The strategy here is as follows:
 * 1) Each node counts the number of particles it has to send to each other
 *    node.
 * 2) The number of particles of each type is then exchanged.
 * 3) The particles to send are placed in a temporary buffer in which the
 *    part-gpart links are preserved.
 * 4) Each node allocates enough space for the new particles.
 * 5) (Asynchronous) communications are issued to transfer the data.
 * 6) Once the particles have arrived, the part-gpart and spart-gpart links
 *    are restored on the receiving node.
 *
 * @param e The #engine.
 */
void engine_redistribute(struct engine *e) {

#ifdef WITH_MPI

  const int nr_nodes = e->nr_nodes;
  const int nodeID = e->nodeID;
  struct space *s = e->s;
  struct cell *cells = s->cells_top;
  const int nr_cells = s->nr_cells;
  const int *cdim = s->cdim;
  const double iwidth[3] = {s->iwidth[0], s->iwidth[1], s->iwidth[2]};
  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
  struct part *parts = s->parts;
  struct gpart *gparts = s->gparts;
  struct spart *sparts = s->sparts;
  ticks tic = getticks();

  /* Allocate temporary arrays to store the counts of particles to be sent
   * and the destination of each particle */
  int *counts;
  if ((counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
    error("Failed to allocate counts temporary buffer.");
  bzero(counts, sizeof(int) * nr_nodes * nr_nodes);

  int *dest;
  if ((dest = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL)
    error("Failed to allocate dest temporary buffer.");

  /* Get destination of each particle */
  for (size_t k = 0; k < s->nr_parts; k++) {

    /* Periodic boundary conditions */
    for (int j = 0; j < 3; j++) {
      if (parts[k].x[j] < 0.0)
        parts[k].x[j] += dim[j];
      else if (parts[k].x[j] >= dim[j])
        parts[k].x[j] -= dim[j];
    }
    const int cid =
        cell_getid(cdim, parts[k].x[0] * iwidth[0], parts[k].x[1] * iwidth[1],
                   parts[k].x[2] * iwidth[2]);
#ifdef SWIFT_DEBUG_CHECKS
    if (cid < 0 || cid >= s->nr_cells)
      error("Bad cell id %i for part %zu at [%.3e,%.3e,%.3e].", cid, k,
            parts[k].x[0], parts[k].x[1], parts[k].x[2]);
#endif

    dest[k] = cells[cid].nodeID;

    /* The counts array is indexed as count[from * nr_nodes + to]. */
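    /* (e.g. with 4 ranks, counts[1 * 4 + 3] is the number of parts that
     *  rank 1 will send to rank 3) */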
    counts[nodeID * nr_nodes + dest[k]] += 1;
  }

  /* Sort the particles according to their cell index. */
  if (s->nr_parts > 0)
    space_parts_sort(s, dest, s->nr_parts, 0, nr_nodes - 1, e->verbose);

#ifdef SWIFT_DEBUG_CHECKS
  /* Verify that the parts have been sorted correctly. */
  for (size_t k = 0; k < s->nr_parts; k++) {
    const struct part *p = &s->parts[k];

    /* New cell index */
    const int new_cid =
        cell_getid(s->cdim, p->x[0] * s->iwidth[0], p->x[1] * s->iwidth[1],
                   p->x[2] * s->iwidth[2]);

    /* New cell of this part */
    const struct cell *c = &s->cells_top[new_cid];
    const int new_node = c->nodeID;

    if (dest[k] != new_node)
      error("part's new node index not matching sorted index.");

    if (p->x[0] < c->loc[0] || p->x[0] > c->loc[0] + c->width[0] ||
        p->x[1] < c->loc[1] || p->x[1] > c->loc[1] + c->width[1] ||
        p->x[2] < c->loc[2] || p->x[2] > c->loc[2] + c->width[2])
      error("part not sorted into the right top-level cell!");
  }
#endif

  /* We need to re-link the gpart partners of parts. */
  if (s->nr_parts > 0) {
    int current_dest = dest[0];
    size_t count_this_dest = 0;
    for (size_t k = 0; k < s->nr_parts; ++k) {
      if (s->parts[k].gpart != NULL) {

        /* As the addresses will be invalidated by the communications, we will
         * instead store the absolute index from the start of the sub-array of
         * particles to be sent to a given node.
         * Recall that gparts without partners have a positive id.
         * We will restore the pointers on the receiving node later on. */
        if (dest[k] != current_dest) {
          current_dest = dest[k];
          count_this_dest = 0;
        }

#ifdef SWIFT_DEBUG_CHECKS
        if (s->parts[k].gpart->id_or_neg_offset > 0)
          error("Trying to link a partnerless gpart!");
#endif

        s->parts[k].gpart->id_or_neg_offset = -count_this_dest;
        count_this_dest++;
      }
    }
  }
  free(dest);

  /* Get destination of each s-particle */
  int *s_counts;
  if ((s_counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
    error("Failed to allocate s_counts temporary buffer.");
  bzero(s_counts, sizeof(int) * nr_nodes * nr_nodes);

  int *s_dest;
  if ((s_dest = (int *)malloc(sizeof(int) * s->nr_sparts)) == NULL)
    error("Failed to allocate s_dest temporary buffer.");

  for (size_t k = 0; k < s->nr_sparts; k++) {

    /* Periodic boundary conditions */
    for (int j = 0; j < 3; j++) {
      if (sparts[k].x[j] < 0.0)
        sparts[k].x[j] += dim[j];
      else if (sparts[k].x[j] >= dim[j])
        sparts[k].x[j] -= dim[j];
    }
    const int cid =
        cell_getid(cdim, sparts[k].x[0] * iwidth[0], sparts[k].x[1] * iwidth[1],
                   sparts[k].x[2] * iwidth[2]);
#ifdef SWIFT_DEBUG_CHECKS
    if (cid < 0 || cid >= s->nr_cells)
      error("Bad cell id %i for spart %zu at [%.3e,%.3e,%.3e].", cid, k,
            sparts[k].x[0], sparts[k].x[1], sparts[k].x[2]);
#endif

    s_dest[k] = cells[cid].nodeID;

    /* The counts array is indexed as count[from * nr_nodes + to]. */
    s_counts[nodeID * nr_nodes + s_dest[k]] += 1;
  }

  /* Sort the s-particles according to their cell index. */
  if (s->nr_sparts > 0)
    space_sparts_sort(s, s_dest, s->nr_sparts, 0, nr_nodes - 1, e->verbose);

#ifdef SWIFT_DEBUG_CHECKS
  /* Verify that the sparts have been sorted correctly. */
  for (size_t k = 0; k < s->nr_sparts; k++) {
    const struct spart *sp = &s->sparts[k];

    /* New cell index */
    const int new_cid =
        cell_getid(s->cdim, sp->x[0] * s->iwidth[0], sp->x[1] * s->iwidth[1],
                   sp->x[2] * s->iwidth[2]);

    /* New cell of this spart */
    const struct cell *c = &s->cells_top[new_cid];
    const int new_node = c->nodeID;

    if (s_dest[k] != new_node)
      error("spart's new node index not matching sorted index.");

    if (sp->x[0] < c->loc[0] || sp->x[0] > c->loc[0] + c->width[0] ||
        sp->x[1] < c->loc[1] || sp->x[1] > c->loc[1] + c->width[1] ||
        sp->x[2] < c->loc[2] || sp->x[2] > c->loc[2] + c->width[2])
      error("spart not sorted into the right top-level cell!");
  }
#endif

  /* We need to re-link the gpart partners of sparts. */
  if (s->nr_sparts > 0) {
    int current_dest = s_dest[0];
    size_t count_this_dest = 0;
    for (size_t k = 0; k < s->nr_sparts; ++k) {
      if (s->sparts[k].gpart != NULL) {

        /* As the addresses will be invalidated by the communications, we will
         * instead store the absolute index from the start of the sub-array of
         * particles to be sent to a given node.
         * Recall that gparts without partners have a positive id.
         * We will restore the pointers on the receiving node later on. */
        if (s_dest[k] != current_dest) {
          current_dest = s_dest[k];
          count_this_dest = 0;
        }

#ifdef SWIFT_DEBUG_CHECKS
        if (s->sparts[k].gpart->id_or_neg_offset > 0)
          error("Trying to link a partnerless gpart!");
#endif

        s->sparts[k].gpart->id_or_neg_offset = -count_this_dest;
        count_this_dest++;
      }
    }
  }

  free(s_dest);

  /* Get destination of each g-particle */
  int *g_counts;
  if ((g_counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL)
    error("Failed to allocate g_counts temporary buffer.");
  bzero(g_counts, sizeof(int) * nr_nodes * nr_nodes);

  int *g_dest;
  if ((g_dest = (int *)malloc(sizeof(int) * s->nr_gparts)) == NULL)
    error("Failed to allocate g_dest temporary buffer.");

  for (size_t k = 0; k < s->nr_gparts; k++) {

    /* Periodic boundary conditions */
    for (int j = 0; j < 3; j++) {
      if (gparts[k].x[j] < 0.0)
        gparts[k].x[j] += dim[j];
      else if (gparts[k].x[j] >= dim[j])
        gparts[k].x[j] -= dim[j];
    }
    const int cid =
        cell_getid(cdim, gparts[k].x[0] * iwidth[0], gparts[k].x[1] * iwidth[1],
                   gparts[k].x[2] * iwidth[2]);
#ifdef SWIFT_DEBUG_CHECKS
    if (cid < 0 || cid >= s->nr_cells)
      error("Bad cell id %i for gpart %zu at [%.3e,%.3e,%.3e].", cid, k,
            gparts[k].x[0], gparts[k].x[1], gparts[k].x[2]);
#endif

    g_dest[k] = cells[cid].nodeID;

    /* The counts array is indexed as count[from * nr_nodes + to]. */
    g_counts[nodeID * nr_nodes + g_dest[k]] += 1;
  }

  /* Sort the gparticles according to their cell index. */
  if (s->nr_gparts > 0)
    space_gparts_sort(s, g_dest, s->nr_gparts, 0, nr_nodes - 1, e->verbose);

#ifdef SWIFT_DEBUG_CHECKS
  /* Verify that the gparts have been sorted correctly. */
  for (size_t k = 0; k < s->nr_gparts; k++) {
    const struct gpart *gp = &s->gparts[k];

    /* New cell index */
    const int new_cid =
        cell_getid(s->cdim, gp->x[0] * s->iwidth[0], gp->x[1] * s->iwidth[1],
                   gp->x[2] * s->iwidth[2]);

    /* New cell of this gpart */
    const struct cell *c = &s->cells_top[new_cid];
    const int new_node = c->nodeID;

    if (g_dest[k] != new_node)
      error("gpart's new node index not matching sorted index (%d != %d).",
            g_dest[k], new_node);

    if (gp->x[0] < c->loc[0] || gp->x[0] > c->loc[0] + c->width[0] ||
        gp->x[1] < c->loc[1] || gp->x[1] > c->loc[1] + c->width[1] ||
        gp->x[2] < c->loc[2] || gp->x[2] > c->loc[2] + c->width[2])
      error("gpart not sorted into the right top-level cell!");
  }
#endif

  free(g_dest);

  /* Get all the counts from all the nodes. */
  if (MPI_Allreduce(MPI_IN_PLACE, counts, nr_nodes * nr_nodes, MPI_INT, MPI_SUM,
                    MPI_COMM_WORLD) != MPI_SUCCESS)
    error("Failed to allreduce particle transfer counts.");

  /* Get all the g_counts from all the nodes. */
  if (MPI_Allreduce(MPI_IN_PLACE, g_counts, nr_nodes * nr_nodes, MPI_INT,
                    MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS)
    error("Failed to allreduce gparticle transfer counts.");

  /* Get all the s_counts from all the nodes. */
  if (MPI_Allreduce(MPI_IN_PLACE, s_counts, nr_nodes * nr_nodes, MPI_INT,
                    MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS)
    error("Failed to allreduce sparticle transfer counts.");

  /* Report how many particles will be moved. */
  if (e->verbose) {
    if (e->nodeID == 0) {
      size_t total = 0, g_total = 0, s_total = 0;
      size_t unmoved = 0, g_unmoved = 0, s_unmoved = 0;
      for (int p = 0, r = 0; p < nr_nodes; p++) {
        for (int n = 0; n < nr_nodes; n++) {
          total += counts[r];
          g_total += g_counts[r];
          s_total += s_counts[r];
          if (p == n) {
            unmoved += counts[r];
            g_unmoved += g_counts[r];
            s_unmoved += s_counts[r];
          }
          r++;
        }
      }
      if (total > 0)
        message("%zu of %zu (%.2f%%) of particles moved", total - unmoved,
                total, 100.0 * (double)(total - unmoved) / (double)total);
      if (g_total > 0)
        message("%zu of %zu (%.2f%%) of g-particles moved", g_total - g_unmoved,
                g_total,
                100.0 * (double)(g_total - g_unmoved) / (double)g_total);
      if (s_total > 0)
        message("%zu of %zu (%.2f%%) of s-particles moved", s_total - s_unmoved,
                s_total,
                100.0 * (double)(s_total - s_unmoved) / (double)s_total);
    }
  }

  /* Now each node knows how many parts, sparts and gparts will be transferred
   * to every other node.
   * Get the new numbers of particles for this node. */
  size_t nr_parts = 0, nr_gparts = 0, nr_sparts = 0;
  for (int k = 0; k < nr_nodes; k++) nr_parts += counts[k * nr_nodes + nodeID];
  for (int k = 0; k < nr_nodes; k++)
    nr_gparts += g_counts[k * nr_nodes + nodeID];
  for (int k = 0; k < nr_nodes; k++)
    nr_sparts += s_counts[k * nr_nodes + nodeID];

  /* Now exchange the particles, type by type to keep the memory required
   * under control. */

  /* SPH particles. */
  void *new_parts = engine_do_redistribute(counts, (char *)s->parts, nr_parts,
                                           sizeof(struct part), part_align,
                                           part_mpi_type, nr_nodes, nodeID);
  free(s->parts);
  s->parts = (struct part *)new_parts;
  s->nr_parts = nr_parts;
  s->size_parts = engine_redistribute_alloc_margin * nr_parts;

  /* Extra SPH particle properties. */
  new_parts = engine_do_redistribute(counts, (char *)s->xparts, nr_parts,
                                     sizeof(struct xpart), xpart_align,
                                     xpart_mpi_type, nr_nodes, nodeID);
  free(s->xparts);
  s->xparts = (struct xpart *)new_parts;

  /* Gravity particles. */
  new_parts = engine_do_redistribute(g_counts, (char *)s->gparts, nr_gparts,
                                     sizeof(struct gpart), gpart_align,
                                     gpart_mpi_type, nr_nodes, nodeID);
  free(s->gparts);
  s->gparts = (struct gpart *)new_parts;
  s->nr_gparts = nr_gparts;
  s->size_gparts = engine_redistribute_alloc_margin * nr_gparts;

  /* Star particles. */
  new_parts = engine_do_redistribute(s_counts, (char *)s->sparts, nr_sparts,
                                     sizeof(struct spart), spart_align,
                                     spart_mpi_type, nr_nodes, nodeID);
  free(s->sparts);
  s->sparts = (struct spart *)new_parts;
  s->nr_sparts = nr_sparts;
  s->size_sparts = engine_redistribute_alloc_margin * nr_sparts;

  /* All particles have now arrived. Time for some final operations on the
     stuff we just received */

  /* Restore the part<->gpart and spart<->gpart links */
  size_t offset_parts = 0, offset_sparts = 0, offset_gparts = 0;
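  /* The offset_* counters track where each node's block of received
   * particles starts in the new arrays as we walk them node by node. */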
  for (int node = 0; node < nr_nodes; ++node) {

    const int ind_recv = node * nr_nodes + nodeID;
    const size_t count_parts = counts[ind_recv];
    const size_t count_gparts = g_counts[ind_recv];
    const size_t count_sparts = s_counts[ind_recv];
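
    /* On the sending side each linked gpart stored, in id_or_neg_offset,
     * minus its index within the block of partners shipped to this node;
     * adding the running offset_* recovers the partner's index in the new
     * arrays. */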
    /* Loop over the gparts received from that node */
    for (size_t k = offset_gparts; k < offset_gparts + count_gparts; ++k) {

      /* Does this gpart have a gas partner ? */
      if (s->gparts[k].type == swift_type_gas) {

        const ptrdiff_t partner_index =
            offset_parts - s->gparts[k].id_or_neg_offset;

        /* Re-link */
        s->gparts[k].id_or_neg_offset = -partner_index;
        s->parts[partner_index].gpart = &s->gparts[k];
      }

      /* Does this gpart have a star partner ? */
      if (s->gparts[k].type == swift_type_star) {

        const ptrdiff_t partner_index =
            offset_sparts - s->gparts[k].id_or_neg_offset;

        /* Re-link */
        s->gparts[k].id_or_neg_offset = -partner_index;
        s->sparts[partner_index].gpart = &s->gparts[k];
      }
    }

    offset_parts += count_parts;
    offset_gparts += count_gparts;
    offset_sparts += count_sparts;
  }

  /* Clean up the counts now we're done. */
  free(counts);
  free(g_counts);
  free(s_counts);

#ifdef SWIFT_DEBUG_CHECKS
  /* Verify that all parts are in the right place. */
  for (size_t k = 0; k < nr_parts; k++) {
    const int cid =
        cell_getid(cdim, s->parts[k].x[0] * iwidth[0],
                   s->parts[k].x[1] * iwidth[1], s->parts[k].x[2] * iwidth[2]);
    if (cells[cid].nodeID != nodeID)
      error("Received particle (%zu) that does not belong here (nodeID=%i).", k,
            cells[cid].nodeID);
  }
  for (size_t k = 0; k < nr_gparts; k++) {
    const int cid = cell_getid(cdim, s->gparts[k].x[0] * iwidth[0],
                               s->gparts[k].x[1] * iwidth[1],
                               s->gparts[k].x[2] * iwidth[2]);
    if (cells[cid].nodeID != nodeID)
      error("Received g-particle (%zu) that does not belong here (nodeID=%i).",
            k, cells[cid].nodeID);
  }
  for (size_t k = 0; k < nr_sparts; k++) {
    const int cid = cell_getid(cdim, s->sparts[k].x[0] * iwidth[0],
                               s->sparts[k].x[1] * iwidth[1],
                               s->sparts[k].x[2] * iwidth[2]);
    if (cells[cid].nodeID != nodeID)
      error("Received s-particle (%zu) that does not belong here (nodeID=%i).",
            k, cells[cid].nodeID);
  }

  /* Verify that the links are correct */
  part_verify_links(s->parts, s->gparts, s->sparts, nr_parts, nr_gparts,
                    nr_sparts, e->verbose);
#endif

  /* Be verbose about what just happened. */
  if (e->verbose) {
    int my_cells = 0;
    for (int k = 0; k < nr_cells; k++)
      if (cells[k].nodeID == nodeID) my_cells += 1;
    message("node %i now has %zu parts, %zu sparts and %zu gparts in %i cells.",
            nodeID, nr_parts, nr_sparts, nr_gparts, my_cells);
  }

  /* Flag that a redistribute has taken place */
  e->step_props |= engine_step_prop_redistribute;

  if (e->verbose)
    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
            clocks_getunit());
#else
  error("SWIFT was not compiled with MPI support.");
#endif
}

/**
 * @brief Repartition the cells amongst the nodes.
 *
 * @param e The #engine.
 */
void engine_repartition(struct engine *e) {

#if defined(WITH_MPI) && defined(HAVE_METIS)

  ticks tic = getticks();

#ifdef SWIFT_DEBUG_CHECKS