partition.c 41.2 KB
Newer Older
1
2
/*******************************************************************************
 * This file is part of SWIFT.
Peter W. Draper's avatar
Peter W. Draper committed
3
4
 * Copyright (c) 2016 Peter W. Draper (p.w.draper@durham.ac.uk)
 *                    Pedro Gonnet (pedro.gonnet@durham.ac.uk)
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/

/**
 *  @file partition.c
23
 *  @brief file of various techniques for partitioning and repartitioning
Peter W. Draper's avatar
Peter W. Draper committed
24
25
 *  a grid of cells into geometrically connected regions and distributing
 *  these around a number of MPI nodes.
26
 *
Peter W. Draper's avatar
Peter W. Draper committed
27
 *  Currently supported partitioning types: grid, vectorise and METIS.
28
29
30
31
32
33
 */

/* Config parameters. */
#include "../config.h"

/* Standard headers. */
34
#include <float.h>
35
#include <math.h>
36
37
#include <stdio.h>
#include <stdlib.h>
38
#include <strings.h>
39

40
41
42
43
44
45
46
47
48
/* MPI headers. */
#ifdef WITH_MPI
#include <mpi.h>
/* METIS headers only used when MPI is also available. */
#ifdef HAVE_METIS
#include <metis.h>
#endif
#endif

49
/* Local headers. */
50
#include "debug.h"
51
52
53
#include "error.h"
#include "partition.h"
#include "space.h"
54
#include "tools.h"
55

56
57
58
/* Maximum weight used for METIS. */
#define metis_maxweight 10000.0f

59
/* Simple descriptions of initial partition types for reports. */
60
const char *initial_partition_name[] = {
61
    "gridded cells", "vectorized point associated cells",
Matthieu Schaller's avatar
Matthieu Schaller committed
62
    "METIS particle weighted cells", "METIS unweighted cells"};
63
64

/* Simple descriptions of repartition types for reports. */
65
const char *repartition_name[] = {
66
    "no",
67
    "METIS edge and vertex task cost weights",
68
    "METIS particle count vertex weights",
69
70
71
72
73
    "METIS task cost edge weights",
    "METIS particle count vertex and task cost edge weights",
    "METIS vertex task costs and edge delta timebin weights",
    "METIS particle count vertex and edge delta timebin weights",
    "METIS edge delta timebin weights",
74
};
75

Peter W. Draper's avatar
Peter W. Draper committed
76
77
78
/* Local functions, if needed. */
static int check_complete(struct space *s, int verbose, int nregions);

79
80
81
/*  Vectorisation support */
/*  ===================== */

82
#if defined(WITH_MPI)
83
/**
84
 *  @brief Pick a number of cell positions from a vectorised list.
85
 *
86
87
88
 *  Vectorise the cell space and pick positions in it for the number of
 *  expected regions using a single step. Vectorisation is guaranteed
 *  to work, providing there are more cells than regions.
89
90
 *
 *  @param s the space.
91
92
 *  @param nregions the number of regions
 *  @param samplecells the list of sample cell positions, size of 3*nregions
93
 */
94
static void pick_vector(struct space *s, int nregions, int *samplecells) {
95
96
97

  /* Get length of space and divide up. */
  int length = s->cdim[0] * s->cdim[1] * s->cdim[2];
98
99
100
  if (nregions > length) {
    error("Too few cells (%d) for this number of regions (%d)", length,
          nregions);
101
102
  }

103
  int step = length / nregions;
104
105
106
107
108
109
110
  int n = 0;
  int m = 0;
  int l = 0;

  for (int i = 0; i < s->cdim[0]; i++) {
    for (int j = 0; j < s->cdim[1]; j++) {
      for (int k = 0; k < s->cdim[2]; k++) {
111
        if (n == 0 && l < nregions) {
112
113
114
115
116
117
118
119
120
121
122
          samplecells[m++] = i;
          samplecells[m++] = j;
          samplecells[m++] = k;
          l++;
        }
        n++;
        if (n == step) n = 0;
      }
    }
  }
}
123
#endif
124

125
#if defined(WITH_MPI)
126
127
128
/**
 * @brief Partition the space.
 *
Peter W. Draper's avatar
Peter W. Draper committed
129
130
 * Using the sample positions as seeds pick cells that are geometrically
 * closest and apply the partition to the space.
131
 */
132
static void split_vector(struct space *s, int nregions, int *samplecells) {
133
134
135
136
137
138
139
  int n = 0;
  for (int i = 0; i < s->cdim[0]; i++) {
    for (int j = 0; j < s->cdim[1]; j++) {
      for (int k = 0; k < s->cdim[2]; k++) {
        int select = -1;
        float rsqmax = FLT_MAX;
        int m = 0;
140
        for (int l = 0; l < nregions; l++) {
141
142
143
144
145
146
147
148
149
          float dx = samplecells[m++] - i;
          float dy = samplecells[m++] - j;
          float dz = samplecells[m++] - k;
          float rsq = (dx * dx + dy * dy + dz * dz);
          if (rsq < rsqmax) {
            rsqmax = rsq;
            select = l;
          }
        }
150
        s->cells_top[n++].nodeID = select;
151
152
153
154
      }
    }
  }
}
155
#endif
156
157
158
159
160
161
162

/* METIS support
 * =============
 *
 * METIS partitions using a multi-level k-way scheme. We support using this in
 * a unweighted scheme, which works well and seems to be guaranteed, and a
 * weighted by the number of particles scheme. Note METIS is optional.
Peter W. Draper's avatar
Peter W. Draper committed
163
164
165
166
 *
 * Repartitioning is based on METIS and uses weights determined from the times
 * that cell tasks have taken. These weight the graph edges and vertices, or
 * just the edges, with vertex weights from the particle counts or none.
167
168
 */

Peter W. Draper's avatar
Peter W. Draper committed
169
#if defined(WITH_MPI) && defined(HAVE_METIS)
170
/**
171
172
173
174
175
176
177
178
179
180
181
182
183
 * @brief Fill the METIS xadj and adjncy arrays defining the graph of cells
 *        in a space.
 *
 * See the METIS manual if you want to understand this format. The cell graph
 * consists of all nodes as vertices with edges as the connections to all
 * neighbours, so we have 26 per vertex.
 *
 * @param s the space of cells.
 * @param adjncy the METIS adjncy array to fill, must be of size
 *               26 * the number of cells in the space.
 * @param xadj the METIS xadj array to fill, must be of size
 *             number of cells in space + 1. NULL for not used.
 */
184
static void graph_init_metis(struct space *s, idx_t *adjncy, idx_t *xadj) {
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235

  /* Loop over all cells in the space. */
  int cid = 0;
  for (int l = 0; l < s->cdim[0]; l++) {
    for (int m = 0; m < s->cdim[1]; m++) {
      for (int n = 0; n < s->cdim[2]; n++) {

        /* Visit all neighbours of this cell, wrapping space at edges. */
        int p = 0;
        for (int i = -1; i <= 1; i++) {
          int ii = l + i;
          if (ii < 0)
            ii += s->cdim[0];
          else if (ii >= s->cdim[0])
            ii -= s->cdim[0];
          for (int j = -1; j <= 1; j++) {
            int jj = m + j;
            if (jj < 0)
              jj += s->cdim[1];
            else if (jj >= s->cdim[1])
              jj -= s->cdim[1];
            for (int k = -1; k <= 1; k++) {
              int kk = n + k;
              if (kk < 0)
                kk += s->cdim[2];
              else if (kk >= s->cdim[2])
                kk -= s->cdim[2];

              /* If not self, record id of neighbour. */
              if (i || j || k) {
                adjncy[cid * 26 + p] = cell_getid(s->cdim, ii, jj, kk);
                p++;
              }
            }
          }
        }

        /* Next cell. */
        cid++;
      }
    }
  }

  /* If given set xadj. */
  if (xadj != NULL) {
    xadj[0] = 0;
    for (int k = 0; k < s->nr_cells; k++) xadj[k + 1] = xadj[k] + 26;
  }
}
#endif

Peter W. Draper's avatar
Peter W. Draper committed
236
#if defined(WITH_MPI) && defined(HAVE_METIS)
237
238
239
240
241
242
243
/**
 * @brief Accumulate the counts of particles per cell.
 *
 * @param s the space containing the cells.
 * @param counts the number of particles per cell. Should be
 *               allocated as size s->nr_parts.
 */
244
static void accumulate_counts(struct space *s, double *counts) {
245
246
247

  struct part *parts = s->parts;
  int *cdim = s->cdim;
248
  double iwidth[3] = {s->iwidth[0], s->iwidth[1], s->iwidth[2]};
249
  double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};
250

251
  bzero(counts, sizeof(double) * s->nr_cells);
252

253
  for (size_t k = 0; k < s->nr_parts; k++) {
254
255
256
257
258
259
    for (int j = 0; j < 3; j++) {
      if (parts[k].x[j] < 0.0)
        parts[k].x[j] += dim[j];
      else if (parts[k].x[j] >= dim[j])
        parts[k].x[j] -= dim[j];
    }
James Willis's avatar
James Willis committed
260
261
262
    const int cid =
        cell_getid(cdim, parts[k].x[0] * iwidth[0], parts[k].x[1] * iwidth[1],
                   parts[k].x[2] * iwidth[2]);
263
264
265
266
267
    counts[cid]++;
  }
}
#endif

268
269
270
271
272
273
274
275
276
277
278
279
280
#if defined(WITH_MPI) && defined(HAVE_METIS)
/**
 * @brief Apply METIS cell list partitioning to a cell structure.
 *
 * Uses the results of part_metis_pick to assign each cell's nodeID to the
 * picked region index, thus partitioning the space into regions.
 *
 * @param s the space containing the cells to split into regions.
 * @param nregions number of regions.
 * @param celllist list of regions for each cell.
 */
static void split_metis(struct space *s, int nregions, int *celllist) {

281
  for (int i = 0; i < s->nr_cells; i++) s->cells_top[i].nodeID = celllist[i];
282

283
284
  /* To check or visualise the partition dump all the cells. */
  /* dumpCellRanks("metis_partition", s->cells_top, s->nr_cells); */
285
286
287
288
}
#endif

#if defined(WITH_MPI) && defined(HAVE_METIS)
289
290
291
292
293
294
295
296
297
298
299
300

/* qsort support. */
struct indexval {
  int index;
  int count;
};
static int indexvalcmp(const void *p1, const void *p2) {
  const struct indexval *iv1 = (const struct indexval *)p1;
  const struct indexval *iv2 = (const struct indexval *)p2;
  return iv2->count - iv1->count;
}

301
302
303
304
/**
 * @brief Partition the given space into a number of connected regions.
 *
 * Split the space using METIS to derive a partitions using the
305
 * given edge and vertex weights. If no weights are given then an
Peter W. Draper's avatar
Peter W. Draper committed
306
 * unweighted partition is performed.
307
308
309
310
 *
 * @param s the space of cells to partition.
 * @param nregions the number of regions required in the partition.
 * @param vertexw weights for the cells, sizeof number of cells if used,
311
 *        NULL for unit weights. Need to be in the range of idx_t.
312
313
 * @param edgew weights for the graph edges between all cells, sizeof number
 *        of cells * 26 if used, NULL for unit weights. Need to be packed
314
315
 *        in CSR format, so same as adjncy array. Need to be in the range of
 *        idx_t.
316
317
318
 * @param celllist on exit this contains the ids of the selected regions,
 *        sizeof number of cells.
 */
319
320
static void pick_metis(struct space *s, int nregions, double *vertexw,
                       double *edgew, int *celllist) {
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356

  /* Total number of cells. */
  int ncells = s->cdim[0] * s->cdim[1] * s->cdim[2];

  /* Nothing much to do if only using a single partition. Also avoids METIS
   * bug that doesn't handle this case well. */
  if (nregions == 1) {
    for (int i = 0; i < ncells; i++) celllist[i] = 0;
    return;
  }

  /* Allocate weights and adjacency arrays . */
  idx_t *xadj;
  if ((xadj = (idx_t *)malloc(sizeof(idx_t) * (ncells + 1))) == NULL)
    error("Failed to allocate xadj buffer.");
  idx_t *adjncy;
  if ((adjncy = (idx_t *)malloc(sizeof(idx_t) * 26 * ncells)) == NULL)
    error("Failed to allocate adjncy array.");
  idx_t *weights_v = NULL;
  if (vertexw != NULL)
    if ((weights_v = (idx_t *)malloc(sizeof(idx_t) * ncells)) == NULL)
      error("Failed to allocate vertex weights array");
  idx_t *weights_e = NULL;
  if (edgew != NULL)
    if ((weights_e = (idx_t *)malloc(26 * sizeof(idx_t) * ncells)) == NULL)
      error("Failed to allocate edge weights array");
  idx_t *regionid;
  if ((regionid = (idx_t *)malloc(sizeof(idx_t) * ncells)) == NULL)
    error("Failed to allocate regionid array");

  /* Define the cell graph. */
  graph_init_metis(s, adjncy, xadj);

  /* Init the vertex weights array. */
  if (vertexw != NULL) {
    for (int k = 0; k < ncells; k++) {
357
      if (vertexw[k] > 1) {
358
359
360
361
362
        weights_v[k] = vertexw[k];
      } else {
        weights_v[k] = 1;
      }
    }
363
364
365
366
367
368
369
370
371
372

#ifdef SWIFT_DEBUG_CHECKS
    /* Check weights are all in range. */
    int failed = 0;
    for (int k = 0; k < ncells; k++) {
      if ((idx_t)vertexw[k] < 0) {
        message("Input vertex weight out of range: %ld", (long)vertexw[k]);
        failed++;
      }
      if (weights_v[k] < 1) {
373
        message("Used vertex weight  out of range: %" PRIDX, weights_v[k]);
374
375
376
        failed++;
      }
    }
Matthieu Schaller's avatar
Matthieu Schaller committed
377
    if (failed > 0) error("%d vertex weights are out of range", failed);
378
#endif
379
380
381
382
383
  }

  /* Init the edges weights array. */
  if (edgew != NULL) {
    for (int k = 0; k < 26 * ncells; k++) {
384
      if (edgew[k] > 1) {
385
386
387
388
389
        weights_e[k] = edgew[k];
      } else {
        weights_e[k] = 1;
      }
    }
390
391
392
393
394
395
396
397
398
399
400

#ifdef SWIFT_DEBUG_CHECKS
    /* Check weights are all in range. */
    int failed = 0;
    for (int k = 0; k < 26 * ncells; k++) {

      if ((idx_t)edgew[k] < 0) {
        message("Input edge weight out of range: %ld", (long)edgew[k]);
        failed++;
      }
      if (weights_e[k] < 1) {
401
        message("Used edge weight out of range: %" PRIDX, weights_e[k]);
402
403
404
        failed++;
      }
    }
Matthieu Schaller's avatar
Matthieu Schaller committed
405
    if (failed > 0) error("%d edge weights are out of range", failed);
406
#endif
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
  }

  /* Set the METIS options. */
  idx_t options[METIS_NOPTIONS];
  METIS_SetDefaultOptions(options);
  options[METIS_OPTION_OBJTYPE] = METIS_OBJTYPE_CUT;
  options[METIS_OPTION_NUMBERING] = 0;
  options[METIS_OPTION_CONTIG] = 1;
  options[METIS_OPTION_NCUTS] = 10;
  options[METIS_OPTION_NITER] = 20;

  /* Call METIS. */
  idx_t one = 1;
  idx_t idx_ncells = ncells;
  idx_t idx_nregions = nregions;
  idx_t objval;

  /* Dump graph in METIS format */
425
426
  /*dumpMETISGraph("metis_graph", idx_ncells, one, xadj, adjncy,
   *               weights_v, NULL, weights_e);
427
   */
428
  if (METIS_PartGraphKway(&idx_ncells, &one, xadj, adjncy, weights_v, NULL,
Matthieu Schaller's avatar
Matthieu Schaller committed
429
430
                          weights_e, &idx_nregions, NULL, NULL, options,
                          &objval, regionid) != METIS_OK)
431
432
433
434
435
436
437
    error("Call to METIS_PartGraphKway failed.");

  /* Check that the regionids are ok. */
  for (int k = 0; k < ncells; k++)
    if (regionid[k] < 0 || regionid[k] >= nregions)
      error("Got bad nodeID %" PRIDX " for cell %i.", regionid[k], k);

438
439
  /* We want a solution in which the current regions of the space are
   * preserved when possible, to avoid unneccesary particle movement.
Peter W. Draper's avatar
Peter W. Draper committed
440
441
442
443
444
445
   * So create a 2d-array of cells counts that are common to all pairs
   * of old and new ranks. Each element of the array has a cell count and
   * an unique index so we can sort into decreasing counts. */
  int indmax = nregions * nregions;
  struct indexval *ivs = malloc(sizeof(struct indexval) * indmax);
  bzero(ivs, sizeof(struct indexval) * indmax);
446
  for (int k = 0; k < ncells; k++) {
Peter W. Draper's avatar
Peter W. Draper committed
447
448
449
    int index = regionid[k] + nregions * s->cells_top[k].nodeID;
    ivs[index].count++;
    ivs[index].index = index;
450
  }
Peter W. Draper's avatar
Peter W. Draper committed
451
  qsort(ivs, indmax, sizeof(struct indexval), indexvalcmp);
452
453
454
455
456
457

  /* Go through the ivs using the largest counts first, these are the
   * regions with the most cells in common, old partition to new. */
  int *oldmap = malloc(sizeof(int) * nregions);
  int *newmap = malloc(sizeof(int) * nregions);
  for (int k = 0; k < nregions; k++) {
Peter W. Draper's avatar
Peter W. Draper committed
458
459
    oldmap[k] = -1;
    newmap[k] = -1;
460
  }
Peter W. Draper's avatar
Peter W. Draper committed
461
  for (int k = 0; k < indmax; k++) {
462
463
464
465
466
467
468
469
470
471
472
473
474

    /* Stop when all regions with common cells have been considered. */
    if (ivs[k].count == 0) break;

    /* Store old and new IDs, if not already used. */
    int oldregion = ivs[k].index / nregions;
    int newregion = ivs[k].index - oldregion * nregions;
    if (newmap[newregion] == -1 && oldmap[oldregion] == -1) {
      newmap[newregion] = oldregion;
      oldmap[oldregion] = newregion;
    }
  }

475
476
  /* Handle any regions that did not get selected by picking an unused rank
   * from oldmap and assigning to newmap. */
477
478
479
480
481
482
483
484
485
486
487
488
489
490
  int spare = 0;
  for (int k = 0; k < nregions; k++) {
    if (newmap[k] == -1) {
      for (int j = spare; j < nregions; j++) {
        if (oldmap[j] == -1) {
          newmap[k] = j;
          oldmap[j] = j;
          spare = j;
          break;
        }
      }
    }
  }

491
492
  /* Set the cell list to the region index. */
  for (int k = 0; k < ncells; k++) {
493
    celllist[k] = newmap[regionid[k]];
494
495
496
497
498
  }

  /* Clean up. */
  if (weights_v != NULL) free(weights_v);
  if (weights_e != NULL) free(weights_e);
499
500
501
  free(ivs);
  free(oldmap);
  free(newmap);
502
503
504
505
506
507
  free(xadj);
  free(adjncy);
  free(regionid);
}
#endif

Peter W. Draper's avatar
Peter W. Draper committed
508
#if defined(WITH_MPI) && defined(HAVE_METIS)
509
/**
510
511
 * @brief Repartition the cells amongst the nodes using task costs
 *        as edge weights and vertex weights also from task costs
512
513
514
515
516
 *        or particle cells counts.
 *
 * @param partweights whether particle counts will be used as vertex weights.
 * @param bothweights whether vertex and edge weights will be used, otherwise
 *                    only edge weights will be used.
517
 * @param timebins use timebins as edge weights.
Peter W. Draper's avatar
Peter W. Draper committed
518
519
520
521
522
 * @param nodeID our nodeID.
 * @param nr_nodes the number of nodes.
 * @param s the space of cells holding our local particles.
 * @param tasks the completed tasks from the last engine step for our node.
 * @param nr_tasks the number of tasks.
523
 */
524
525
526
static void repart_edge_metis(int partweights, int bothweights, int timebins,
                              int nodeID, int nr_nodes, struct space *s,
                              struct task *tasks, int nr_tasks) {
527
528

  /* Create weight arrays using task ticks for vertices and edges (edges
529
   * assume the same graph structure as used in the part_ calls). */
530
  int nr_cells = s->nr_cells;
531
  struct cell *cells = s->cells_top;
532
533
534
535
536
537
538
539
540

  /* Allocate and fill the adjncy indexing array defining the graph of
   * cells. */
  idx_t *inds;
  if ((inds = (idx_t *)malloc(sizeof(idx_t) * 26 * nr_cells)) == NULL)
    error("Failed to allocate the inds array");
  graph_init_metis(s, inds, NULL);

  /* Allocate and init weights. */
541
542
  double *weights_v = NULL;
  double *weights_e = NULL;
543
  if (bothweights) {
544
    if ((weights_v = (double *)malloc(sizeof(double) * nr_cells)) == NULL)
545
      error("Failed to allocate vertex weights arrays.");
546
    bzero(weights_v, sizeof(double) * nr_cells);
547
  }
548
  if ((weights_e = (double *)malloc(sizeof(double) * 26 * nr_cells)) == NULL)
Matthieu Schaller's avatar
Matthieu Schaller committed
549
    error("Failed to allocate edge weights arrays.");
550
  bzero(weights_e, sizeof(double) * 26 * nr_cells);
551
552
553
554
555
556
557
558
559
560

  /* Generate task weights for vertices. */
  int taskvweights = (bothweights && !partweights);

  /* Loop over the tasks... */
  for (int j = 0; j < nr_tasks; j++) {
    /* Get a pointer to the kth task. */
    struct task *t = &tasks[j];

    /* Skip un-interesting tasks. */
Matthieu Schaller's avatar
Matthieu Schaller committed
561
    if (t->cost == 0) continue;
562

563
    /* Get the task weight based on costs. */
Peter W. Draper's avatar
Peter W. Draper committed
564
    double w = (double)t->cost;
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579

    /* Get the top-level cells involved. */
    struct cell *ci, *cj;
    for (ci = t->ci; ci->parent != NULL; ci = ci->parent)
      ;
    if (t->cj != NULL)
      for (cj = t->cj; cj->parent != NULL; cj = cj->parent)
        ;
    else
      cj = NULL;

    /* Get the cell IDs. */
    int cid = ci - cells;

    /* Different weights for different tasks. */
580
    if (t->type == task_type_drift_part || t->type == task_type_drift_gpart ||
Matthieu Schaller's avatar
Matthieu Schaller committed
581
582
583
584
585
        t->type == task_type_ghost || t->type == task_type_extra_ghost ||
        t->type == task_type_kick1 || t->type == task_type_kick2 ||
        t->type == task_type_timestep || t->type == task_type_init_grav ||
        t->type == task_type_grav_down ||
        t->type == task_type_grav_long_range) {
586

587
      /* Particle updates add only to vertex weight. */
Matthieu Schaller's avatar
Matthieu Schaller committed
588
      if (taskvweights) weights_v[cid] += w;
589
590
591
592
    }

    /* Self interaction? */
    else if ((t->type == task_type_self && ci->nodeID == nodeID) ||
593
594
             (t->type == task_type_sub_self && cj == NULL &&
              ci->nodeID == nodeID)) {
595
      /* Self interactions add only to vertex weight. */
Matthieu Schaller's avatar
Matthieu Schaller committed
596
      if (taskvweights) weights_v[cid] += w;
597
598
599
600

    }

    /* Pair? */
Matthieu Schaller's avatar
Matthieu Schaller committed
601
    else if (t->type == task_type_pair || (t->type == task_type_sub_pair)) {
602
603
604
      /* In-cell pair? */
      if (ci == cj) {
        /* Add weight to vertex for ci. */
Matthieu Schaller's avatar
Matthieu Schaller committed
605
        if (taskvweights) weights_v[cid] += w;
606
607
608

      }

609
610
      /* Distinct cells. */
      else {
611
612
613
        /* Index of the jth cell. */
        int cjd = cj - cells;

614
615
616
        /* Local cells add weight to vertices. */
        if (taskvweights && ci->nodeID == nodeID) {
          weights_v[cid] += 0.5 * w;
617
618
619
          if (cj->nodeID == nodeID) weights_v[cjd] += 0.5 * w;
        }

620
621
622
623
624
        if (timebins) {
          /* Add weights to edge for all cells based on the expected
           * interaction time (calculated as the time to the last expected
           * time) as we want to avoid having active cells on the edges, so we
           * cut for that. Note that weight is added to the local and remote
625
626
           * cells, as we want to keep both away from any cuts, this can
           * overflow int, so take care. */
627
628
          int dti = num_time_bins - get_time_bin(ci->ti_end_min);
          int dtj = num_time_bins - get_time_bin(cj->ti_end_min);
Peter W. Draper's avatar
Peter W. Draper committed
629
          double dt = (double)(1 << dti) + (double)(1 << dtj);
630
631
632

          /* ci */
          int kk;
Peter W. Draper's avatar
Peter W. Draper committed
633
634
          for (kk = 26 * cid; inds[kk] != cjd; kk++)
            ;
635
636
637
          weights_e[kk] += dt;

          /* cj */
Peter W. Draper's avatar
Peter W. Draper committed
638
639
          for (kk = 26 * cjd; inds[kk] != cid; kk++)
            ;
640
641
642
643
644
645
646
          weights_e[kk] += dt;
        } else {

          /* Add weights from task costs to the edge. */

          /* ci */
          int kk;
Peter W. Draper's avatar
Peter W. Draper committed
647
648
          for (kk = 26 * cid; inds[kk] != cjd; kk++)
            ;
649
650
651
          weights_e[kk] += w;

          /* cj */
Peter W. Draper's avatar
Peter W. Draper committed
652
653
          for (kk = 26 * cjd; inds[kk] != cid; kk++)
            ;
654
655
          weights_e[kk] += w;
        }
656
657
658
659
660
      }
    }
  }

  /* Re-calculate the vertices if using particle counts. */
661
  if (partweights && bothweights) accumulate_counts(s, weights_v);
662
663

  /* Merge the weights arrays across all nodes. */
664
  int res;
665
666
  if (bothweights) {
    if ((res = MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_v, weights_v,
Peter W. Draper's avatar
Peter W. Draper committed
667
668
                          nr_cells, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD)) !=
        MPI_SUCCESS)
669
670
671
672
      mpi_error(res, "Failed to allreduce vertex weights.");
  }

  if ((res = MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_e, weights_e,
Peter W. Draper's avatar
Peter W. Draper committed
673
674
                        26 * nr_cells, MPI_DOUBLE, MPI_SUM, 0,
                        MPI_COMM_WORLD)) != MPI_SUCCESS)
675
676
677
678
679
680
681
682
    mpi_error(res, "Failed to allreduce edge weights.");

  /* Allocate cell list for the partition. */
  int *celllist = (int *)malloc(sizeof(int) * s->nr_cells);
  if (celllist == NULL) error("Failed to allocate celllist");

  /* As of here, only one node needs to compute the partition. */
  if (nodeID == 0) {
683

684
    /* We need to rescale the weights into the range of an integer for METIS
Peter W. Draper's avatar
Peter W. Draper committed
685
686
     * (really range of idx_t). Also we would like the range of vertex and
     * edges weights to be similar so they balance. */
687
688
    double wminv = 0.0;
    double wmaxv = 0.0;
689
    if (bothweights) {
690
691
      wminv = weights_v[0];
      wmaxv = weights_v[0];
692
      for (int k = 0; k < nr_cells; k++) {
693
694
        wmaxv = weights_v[k] > wmaxv ? weights_v[k] : wmaxv;
        wminv = weights_v[k] < wminv ? weights_v[k] : wminv;
695
696
697
      }
    }

698
699
    double wmine = weights_e[0];
    double wmaxe = weights_e[0];
700
    for (int k = 0; k < 26 * nr_cells; k++) {
701
702
      wmaxe = weights_e[k] > wmaxe ? weights_e[k] : wmaxe;
      wmine = weights_e[k] < wmine ? weights_e[k] : wmine;
703
    }
704
705
706

    if (bothweights) {

707
708
709
710
711
712
713
      /* Make range the same in both weights systems. */
      if ((wmaxv - wminv) > (wmaxe - wmine)) {
        double wscale = (wmaxv - wminv) / (wmaxe - wmine);
        for (int k = 0; k < 26 * nr_cells; k++) {
          weights_e[k] = (weights_e[k] - wmine) * wscale + wminv;
        }
        wmine = wminv;
714
        wmaxe = wmaxv;
715

716
      } else {
717
718
719
720
721
        double wscale = (wmaxe - wmine) / (wmaxv - wminv);
        for (int k = 0; k < nr_cells; k++) {
          weights_v[k] = (weights_v[k] - wminv) * wscale + wmine;
        }
        wminv = wmine;
722
723
724
725
        wmaxv = wmaxe;
      }

      /* Scale to the METIS range. */
726
      double wscale = (metis_maxweight - 1.0) / (wmaxv - wminv);
727
      for (int k = 0; k < nr_cells; k++) {
728
        weights_v[k] = (weights_v[k] - wminv) * wscale + 1.0;
729
730
731
      }
    }

732
    /* Scale to the METIS range. */
733
    double wscale = (metis_maxweight - 1.0) / (wmaxe - wmine);
734
    for (int k = 0; k < 26 * nr_cells; k++) {
735
      weights_e[k] = (weights_e[k] - wmine) * wscale + 1.0;
736
    }
737
738
739

    /* And partition, use both weights or not as requested. */
    if (bothweights)
740
      pick_metis(s, nr_nodes, weights_v, weights_e, celllist);
741
    else
742
      pick_metis(s, nr_nodes, NULL, weights_e, celllist);
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763

    /* Check that all cells have good values. */
    for (int k = 0; k < nr_cells; k++)
      if (celllist[k] < 0 || celllist[k] >= nr_nodes)
        error("Got bad nodeID %d for cell %i.", celllist[k], k);

    /* Check that the partition is complete and all nodes have some work. */
    int present[nr_nodes];
    int failed = 0;
    for (int i = 0; i < nr_nodes; i++) present[i] = 0;
    for (int i = 0; i < nr_cells; i++) present[celllist[i]]++;
    for (int i = 0; i < nr_nodes; i++) {
      if (!present[i]) {
        failed = 1;
        message("Node %d is not present after repartition", i);
      }
    }

    /* If partition failed continue with the current one, but make this
     * clear. */
    if (failed) {
Matthieu Schaller's avatar
Matthieu Schaller committed
764
765
766
      message(
          "WARNING: METIS repartition has failed, continuing with "
          "the current partition, load balance will not be optimal");
767
768
769
770
771
772
773
774
775
776
      for (int k = 0; k < nr_cells; k++) celllist[k] = cells[k].nodeID;
    }
  }

  /* Distribute the celllist partition and apply. */
  if ((res = MPI_Bcast(celllist, s->nr_cells, MPI_INT, 0, MPI_COMM_WORLD)) !=
      MPI_SUCCESS)
    mpi_error(res, "Failed to bcast the cell list");

  /* And apply to our cells */
777
  split_metis(s, nr_nodes, celllist);
778
779

  /* Clean up. */
780
  free(inds);
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
  if (bothweights) free(weights_v);
  free(weights_e);
  free(celllist);
}
#endif

/**
 * @brief Repartition the cells amongst the nodes using vertex weights
 *
 * @param s The space containing the local cells.
 * @param nodeID our MPI node id.
 * @param nr_nodes number of MPI nodes.
 */
#if defined(WITH_MPI) && defined(HAVE_METIS)
static void repart_vertex_metis(struct space *s, int nodeID, int nr_nodes) {

  /* Use particle counts as vertex weights. */
  /* Space for particles per cell counts, which will be used as weights. */
799
800
  double *weights = NULL;
  if ((weights = (double *)malloc(sizeof(double) * s->nr_cells)) == NULL)
801
802
803
804
805
806
807
    error("Failed to allocate weights buffer.");

  /* Check each particle and accumulate the counts per cell. */
  accumulate_counts(s, weights);

  /* Get all the counts from all the nodes. */
  int res;
808
809
  if ((res = MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_DOUBLE,
                           MPI_SUM, MPI_COMM_WORLD)) != MPI_SUCCESS)
810
811
812
813
814
815
    mpi_error(res, "Failed to allreduce particle cell weights.");

  /* Main node does the partition calculation. */
  int *celllist = (int *)malloc(sizeof(int) * s->nr_cells);
  if (celllist == NULL) error("Failed to allocate celllist");

Matthieu Schaller's avatar
Matthieu Schaller committed
816
  if (nodeID == 0) pick_metis(s, nr_nodes, weights, NULL, celllist);
817
818
819
820
821
822
823

  /* Distribute the celllist partition and apply. */
  if ((res = MPI_Bcast(celllist, s->nr_cells, MPI_INT, 0, MPI_COMM_WORLD)) !=
      MPI_SUCCESS)
    mpi_error(res, "Failed to bcast the cell list");

  /* And apply to our cells */
824
  split_metis(s, nr_nodes, celllist);
825
826
827
828
829
830
831
832
833
834
835
836

  free(weights);
  free(celllist);
}
#endif

/**
 * @brief Repartition the space using the given repartition type.
 *
 * Note that at the end of this process all the cells will be re-distributed
 * across the nodes, but the particles themselves will not be.
 *
837
 * @param reparttype #repartition struct
838
839
840
841
842
843
 * @param nodeID our nodeID.
 * @param nr_nodes the number of nodes.
 * @param s the space of cells holding our local particles.
 * @param tasks the completed tasks from the last engine step for our node.
 * @param nr_tasks the number of tasks.
 */
844
void partition_repartition(struct repartition *reparttype, int nodeID,
845
846
                           int nr_nodes, struct space *s, struct task *tasks,
                           int nr_tasks) {
847
848
849

#if defined(WITH_MPI) && defined(HAVE_METIS)

850
  if (reparttype->type == REPART_METIS_VERTEX_COSTS_EDGE_COSTS) {
851
    repart_edge_metis(0, 1, 0, nodeID, nr_nodes, s, tasks, nr_tasks);
852

853
854
  } else if (reparttype->type == REPART_METIS_EDGE_COSTS) {
    repart_edge_metis(0, 0, 0, nodeID, nr_nodes, s, tasks, nr_tasks);
855

856
  } else if (reparttype->type == REPART_METIS_VERTEX_COUNTS_EDGE_COSTS) {
857
    repart_edge_metis(1, 1, 0, nodeID, nr_nodes, s, tasks, nr_tasks);
858

859
  } else if (reparttype->type == REPART_METIS_VERTEX_COSTS_EDGE_TIMEBINS) {
860
861
862
    repart_edge_metis(0, 1, 1, nodeID, nr_nodes, s, tasks, nr_tasks);

  } else if (reparttype->type == REPART_METIS_VERTEX_COUNTS_EDGE_TIMEBINS) {
863
    repart_edge_metis(1, 1, 1, nodeID, nr_nodes, s, tasks, nr_tasks);
864

865
866
867
  } else if (reparttype->type == REPART_METIS_EDGE_TIMEBINS) {
    repart_edge_metis(0, 0, 1, nodeID, nr_nodes, s, tasks, nr_tasks);

868
  } else if (reparttype->type == REPART_METIS_VERTEX_COUNTS) {
869
870
    repart_vertex_metis(s, nodeID, nr_nodes);

871
872
873
  } else if (reparttype->type == REPART_NONE) {
    /* Doing nothing. */

874
  } else {
875
    error("Impossible repartition type");
876
877
878
879
880
881
  }
#else
  error("SWIFT was not compiled with METIS support.");
#endif
}

882
/**
Peter W. Draper's avatar
Peter W. Draper committed
883
 * @brief Initial partition of space cells.
884
 *
885
886
887
 * Cells are assigned to a node on the basis of various schemes, all of which
 * should attempt to distribute them in geometrically close regions to
 * minimise the movement of particles.
888
 *
889
890
891
892
 * Note that the partition type is a suggestion and will be ignored if that
 * scheme fails. In that case we fallback to a vectorised scheme, that is
 * guaranteed to work provided we have more cells than nodes.
 *
893
 * @param initial_partition the type of partitioning to try.
894
895
896
 * @param nodeID our nodeID.
 * @param nr_nodes the number of nodes.
 * @param s the space of cells.
897
 */
898
899
void partition_initial_partition(struct partition *initial_partition,
                                 int nodeID, int nr_nodes, struct space *s) {
900
901

  /* Geometric grid partitioning. */
902
  if (initial_partition->type == INITPART_GRID) {
903
904
905
906
907
    int j, k;
    int ind[3];
    struct cell *c;

    /* If we've got the wrong number of nodes, fail. */
908
909
910
    if (nr_nodes !=
        initial_partition->grid[0] * initial_partition->grid[1] *
            initial_partition->grid[2])
911
912
913
914
915
      error("Grid size does not match number of nodes.");

    /* Run through the cells and set their nodeID. */
    // message("s->dim = [%e,%e,%e]", s->dim[0], s->dim[1], s->dim[2]);
    for (k = 0; k < s->nr_cells; k++) {
916
      c = &s->cells_top[k];
917
      for (j = 0; j < 3; j++)
Matthieu Schaller's avatar
Matthieu Schaller committed
918
        ind[j] = c->loc[j] / s->dim[j] * initial_partition->grid[j];
919
920
921
      c->nodeID = ind[0] +
                  initial_partition->grid[0] *
                      (ind[1] + initial_partition->grid[1] * ind[2]);
922
923
924
      // message("cell at [%e,%e,%e]: ind = [%i,%i,%i], nodeID = %i", c->loc[0],
      // c->loc[1], c->loc[2], ind[0], ind[1], ind[2], c->nodeID);
    }
925

926
    /* The grid technique can fail, so check for this before proceeding. */
Peter W. Draper's avatar
Peter W. Draper committed
927
    if (!check_complete(s, (nodeID == 0), nr_nodes)) {
928
929
      if (nodeID == 0)
        message("Grid initial partition failed, using a vectorised partition");
930
931
      initial_partition->type = INITPART_VECTORIZE;
      partition_initial_partition(initial_partition, nodeID, nr_nodes, s);
932
933
      return;
    }
934

935
936
  } else if (initial_partition->type == INITPART_METIS_WEIGHT ||
             initial_partition->type == INITPART_METIS_NOWEIGHT) {
937
938
939
940
941
942
943
#if defined(WITH_MPI) && defined(HAVE_METIS)
    /* Simple k-way partition selected by METIS using cell particle counts as
     * weights or not. Should be best when starting with a inhomogeneous dist.
     */

    /* Space for particles per cell counts, which will be used as weights or
     * not. */
944
    double *weights = NULL;
945
    if (initial_partition->type == INITPART_METIS_WEIGHT) {
946
      if ((weights = (double *)malloc(sizeof(double) * s->nr_cells)) == NULL)
947
        error("Failed to allocate weights buffer.");
948
      bzero(weights, sizeof(double) * s->nr_cells);
949
950

      /* Check each particle and accumilate the counts per cell. */
Peter W. Draper's avatar
Peter W. Draper committed
951
      accumulate_counts(s, weights);
952

953
      /* Get all the counts from all the nodes. */
Peter W. Draper's avatar
Peter W. Draper committed
954
955
      if (MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_DOUBLE, MPI_SUM,
                        MPI_COMM_WORLD) != MPI_SUCCESS)
956
        error("Failed to allreduce particle cell weights.");
957
    }
958

959
960
961
    /* Main node does the partition calculation. */
    int *celllist = (int *)malloc(sizeof(int) * s->nr_cells);
    if (celllist == NULL) error("Failed to allocate celllist");
Matthieu Schaller's avatar
Matthieu Schaller committed
962
    if (nodeID == 0) pick_metis(s, nr_nodes, weights, NULL, celllist);
963
964
965
966
967
968
969
970

    /* Distribute the celllist partition and apply. */
    int res = MPI_Bcast(celllist, s->nr_cells, MPI_INT, 0, MPI_COMM_WORLD);
    if (res != MPI_SUCCESS) mpi_error(res, "Failed to bcast the cell list");

    /* And apply to our cells */
    split_metis(s, nr_nodes, celllist);

971
972
    /* It's not known if this can fail, but check for this before
     * proceeding. */
Peter W. Draper's avatar
Peter W. Draper committed
973
    if (!check_complete(s, (nodeID == 0), nr_nodes)) {
974
975
      if (nodeID == 0)
        message("METIS initial partition failed, using a vectorised partition");
976
977
      initial_partition->type = INITPART_VECTORIZE;
      partition_initial_partition(initial_partition, nodeID, nr_nodes, s);
978
    }
979

980
981
982
983
984
    if (weights != NULL) free(weights);
    free(celllist);
#else
    error("SWIFT was not compiled with METIS support");
#endif
985

986
  } else if (initial_partition->type == INITPART_VECTORIZE) {
987

988
989
990
991
992
#if defined(WITH_MPI)
    /* Vectorised selection, guaranteed to work for samples less than the
     * number of cells, but not very clumpy in the selection of regions. */
    int *samplecells = (int *)malloc(sizeof(int) * nr_nodes * 3);
    if (samplecells == NULL) error("Failed to allocate samplecells");
993

994
995
996
    if (nodeID == 0) {
      pick_vector(s, nr_nodes, samplecells);
    }
997

998
    /* Share the samplecells around all the nodes. */
Matthieu Schaller's avatar
Matthieu Schaller committed
999
    int res = MPI_Bcast(samplecells, nr_nodes * 3, MPI_INT, 0, MPI_COMM_WORLD);
1000
    if (res != MPI_SUCCESS)