diff --git a/src/engine.c b/src/engine.c index c9bc720e0c39df26c3f990888b93186934dea7f0..3f8329eb5301f3f053d6415cbbc946c2a010fec1 100644 --- a/src/engine.c +++ b/src/engine.c @@ -34,10 +34,6 @@ /* MPI headers. */ #ifdef WITH_MPI #include <mpi.h> -/* METIS headers only used when MPI is also available. */ -#ifdef HAVE_METIS -#include <metis.h> -#endif #endif #ifdef HAVE_LIBNUMA @@ -287,354 +283,6 @@ void engine_redistribute(struct engine *e) { #endif } -#if defined(WITH_MPI) && defined(HAVE_METIS) -/** - * @brief Accumulate the counts of particles per cell. - * - * @param s the space containing the cells. - * @param counts the number of particles per cell. Should be - * allocated as size s->nr_parts. - */ -static void metis_repart_counts(struct space *s, int *counts) { - - struct part *parts = s->parts; - int *cdim = s->cdim; - double ih[3], dim[3]; - ih[0] = s->ih[0]; - ih[1] = s->ih[1]; - ih[2] = s->ih[2]; - dim[0] = s->dim[0]; - dim[1] = s->dim[1]; - dim[2] = s->dim[2]; - - bzero(counts, sizeof(int) * s->nr_cells); - - for (int k = 0; k < s->nr_parts; k++) { - for (int j = 0; j < 3; j++) { - if (parts[k].x[j] < 0.0) - parts[k].x[j] += dim[j]; - else if (parts[k].x[j] >= dim[j]) - parts[k].x[j] -= dim[j]; - } - const int cid = cell_getid(cdim, parts[k].x[0] * ih[0], - parts[k].x[1] * ih[1], parts[k].x[2] * ih[2]); - counts[cid]++; - } -} -#endif - -#if defined(WITH_MPI) && defined(HAVE_METIS) -/** - * @brief Repartition the cells amongst the nodes using task timings - * as edge weights and vertex weights also from task timings - * or particle cells counts. - * - * @param partweights whether particle counts will be used as vertex weights. - * @param bothweights whether vertex and edge weights will be used, otherwise - * only edge weights will be used. - * @param e The #engine. - */ -static void metis_repart_edge(int partweights, int bothweights, struct engine *e) { - - /* Create weight arrays using task ticks for vertices and edges (edges - * assume the same graph structure as used in the part_ calls). */ - - int nr_nodes = e->nr_nodes; - int nodeID = e->nodeID; - struct space *s = e->s; - int nr_cells = s->nr_cells; - struct cell *cells = s->cells; - struct task *tasks = e->sched.tasks; - float wscale = 1e-3, vscale = 1e-3, wscale_buff; - int wtot = 0; - int wmax = 1e9 / nr_nodes; - int wmin; - - /* Allocate and fill the adjncy indexing array defining the graph of - * cells. */ - idx_t *inds; - if ((inds = (idx_t *)malloc(sizeof(idx_t) * 26 * nr_cells)) == NULL) - error("Failed to allocate the inds array"); - part_graph_init_metis(s, inds, NULL); - - /* Allocate and init weights. */ - int *weights_v = NULL; - int *weights_e = NULL; - if (bothweights) { - if ((weights_v = (int *)malloc(sizeof(int) * nr_cells)) == NULL) - error("Failed to allocate vertex weights arrays."); - bzero(weights_v, sizeof(int) * nr_cells); - } - if ((weights_e = (int *)malloc(sizeof(int) * 26 * nr_cells)) == NULL) - error("Failed to allocate edge weights arrays."); - bzero(weights_e, sizeof(int) * 26 * nr_cells); - - /* Generate task weights for vertices. */ - int taskvweights = (bothweights && !partweights); - - /* Loop over the tasks... */ - for (int j = 0; j < e->sched.nr_tasks; j++) { - /* Get a pointer to the kth task. */ - struct task *t = &tasks[j]; - - /* Skip un-interesting tasks. 
*/ - if (t->type != task_type_self && t->type != task_type_pair && - t->type != task_type_sub && t->type != task_type_ghost && - t->type != task_type_drift && t->type != task_type_kick && - t->type != task_type_init) - continue; - - /* Get the task weight. */ - int w = (t->toc - t->tic) * wscale; - if (w < 0) error("Bad task weight (%d).", w); - - /* Do we need to re-scale? */ - wtot += w; - while (wtot > wmax) { - wscale /= 2; - wtot /= 2; - w /= 2; - for (int k = 0; k < 26 * nr_cells; k++) weights_e[k] *= 0.5; - if (taskvweights) - for (int k = 0; k < nr_cells; k++) weights_v[k] *= 0.5; - } - - /* Get the top-level cells involved. */ - struct cell *ci, *cj; - for (ci = t->ci; ci->parent != NULL; ci = ci->parent) - ; - if (t->cj != NULL) - for (cj = t->cj; cj->parent != NULL; cj = cj->parent) - ; - else - cj = NULL; - - /* Get the cell IDs. */ - int cid = ci - cells; - - /* Different weights for different tasks. */ - if (t->type == task_type_ghost || t->type == task_type_drift || - t->type == task_type_kick) { - /* Particle updates add only to vertex weight. */ - if (taskvweights) - weights_v[cid] += w; - - } - - /* Self interaction? */ - else if ((t->type == task_type_self && ci->nodeID == nodeID) || - (t->type == task_type_sub && cj == NULL && - ci->nodeID == nodeID)) { - /* Self interactions add only to vertex weight. */ - if (taskvweights) - weights_v[cid] += w; - - } - - /* Pair? */ - else if (t->type == task_type_pair || - (t->type == task_type_sub && cj != NULL)) { - /* In-cell pair? */ - if (ci == cj) { - /* Add weight to vertex for ci. */ - if (taskvweights) - weights_v[cid] += w; - - } - - /* Distinct cells with local ci? */ - else if (ci->nodeID == nodeID) { - /* Index of the jth cell. */ - int cjd = cj - cells; - - /* Add half of weight to each cell. */ - if (taskvweights) { - if (ci->nodeID == nodeID) weights_v[cid] += 0.5 * w; - if (cj->nodeID == nodeID) weights_v[cjd] += 0.5 * w; - } - - /* Add weights to edge. */ - int kk; - for (kk = 26 * cid; inds[kk] != cjd; kk++) - ; - weights_e[kk] += w; - for (kk = 26 * cjd; inds[kk] != cid; kk++) - ; - weights_e[kk] += w; - } - } - } - - /* Re-calculate the vertices if using particle counts. */ - if (partweights && bothweights) { - metis_repart_counts(s, weights_v); - - /* Rescale to balance times. */ - float vwscale = (float)wtot / (float)e->sched.nr_tasks; - for (int k = 0; k < nr_cells; k++) { - weights_v[k] *= vwscale; - } - } - - /* Get the minimum scaling and re-scale if necessary. */ - int res; - if ((res = MPI_Allreduce(&wscale, &wscale_buff, 1, MPI_FLOAT, MPI_MIN, - MPI_COMM_WORLD)) != MPI_SUCCESS) - mpi_error(res, "Failed to allreduce the weight scales."); - - if (wscale_buff != wscale) { - float scale = wscale_buff / wscale; - for (int k = 0; k < 26 * nr_cells; k++) weights_e[k] *= scale; - if (bothweights) - for (int k = 0; k < nr_cells; k++) weights_v[k] *= scale; - } - - /* Merge the weights arrays across all nodes. */ - if (bothweights) { - if ((res = MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_v, weights_v, - nr_cells, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD)) != - MPI_SUCCESS) - mpi_error(res, "Failed to allreduce vertex weights."); - } - - if ((res = MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_e, weights_e, - 26 * nr_cells, MPI_INT, MPI_SUM, 0, - MPI_COMM_WORLD)) != MPI_SUCCESS) - mpi_error(res, "Failed to allreduce edge weights."); - - /* Allocate cell list for the partition. 
*/ - int *celllist = (int *)malloc(sizeof(int) * s->nr_cells); - if (celllist == NULL) error("Failed to allocate celllist"); - - /* As of here, only one node needs to compute the partition. */ - if (nodeID == 0) { - /* Final rescale of all weights to avoid a large range. Large ranges - * have been seen to cause an incomplete graph. */ - wmin = wmax; - wmax = 0; - for (int k = 0; k < 26 * nr_cells; k++) { - wmax = weights_e[k] > wmax ? weights_e[k] : wmax; - wmin = weights_e[k] < wmin ? weights_e[k] : wmin; - } - if (bothweights) { - for (int k = 0; k < nr_cells; k++) { - wmax = weights_v[k] > wmax ? weights_v[k] : wmax; - wmin = weights_v[k] < wmin ? weights_v[k] : wmin; - } - } - - if ((wmax - wmin) > engine_maxmetisweight) { - wscale = engine_maxmetisweight / (wmax - wmin); - for (int k = 0; k < 26 * nr_cells; k++) { - weights_e[k] = (weights_e[k] - wmin) * wscale + 1; - } - if (bothweights) { - for (int k = 0; k < nr_cells; k++) { - weights_v[k] = (weights_v[k] - wmin) * wscale + 1; - } - } - } - - /* Make sure there are no zero weights. */ - for (int k = 0; k < 26 * nr_cells; k++) - if (weights_e[k] == 0) weights_e[k] = 1; - if (bothweights) - for (int k = 0; k < nr_cells; k++) - if ((weights_v[k] *= vscale) == 0) weights_v[k] = 1; - - /* And partition, use both weights or not as requested. */ - if (bothweights) - part_pick_metis(s, nr_nodes, weights_v, weights_e, celllist); - else - part_pick_metis(s, nr_nodes, NULL, weights_e, celllist); - - /* Check that all cells have good values. */ - for (int k = 0; k < nr_cells; k++) - if (celllist[k] < 0 || celllist[k] >= nr_nodes) - error("Got bad nodeID %d for cell %i.", celllist[k], k); - - /* Check that the partition is complete and all nodes have some work. */ - int present[nr_nodes]; - int failed = 0; - for (int i = 0; i < nr_nodes; i++) present[i] = 0; - for (int i = 0; i < nr_cells; i++) present[celllist[i]]++; - for (int i = 0; i < nr_nodes; i++) { - if (!present[i]) { - failed = 1; - message("Node %d is not present after repartition", i); - } - } - - /* If partition failed continue with the current one, but make this - * clear. */ - if (failed) { - message( - "WARNING: METIS repartition has failed, continuing with " - "the current partition, load balance will not be optimal"); - for (int k = 0; k < nr_cells; k++) celllist[k] = cells[k].nodeID; - } - } - - /* Distribute the celllist partition and apply. */ - if ((res = MPI_Bcast(celllist, s->nr_cells, MPI_INT, 0, MPI_COMM_WORLD)) != - MPI_SUCCESS) - mpi_error(res, "Failed to bcast the cell list"); - - /* And apply to our cells */ - part_split_metis(s, nr_nodes, celllist); - - /* Clean up. */ - if (bothweights) free(weights_v); - free(weights_e); - free(celllist); -} -#endif - -#if defined(WITH_MPI) && defined(HAVE_METIS) -/** - * @brief Repartition the cells amongst the nodes using vertex weights - * - * @param e The #engine. - */ -static void metis_repart_vertex(struct engine *e) { - - struct space *s = e->s; - - /* Use particle counts as vertex weights. */ - /* Space for particles per cell counts, which will be used as weights. */ - int *weights = NULL; - if ((weights = (int *)malloc(sizeof(int) * s->nr_cells)) == NULL) - error("Failed to allocate weights buffer."); - - /* Check each particle and accumulate the counts per cell. */ - metis_repart_counts(s, weights); - - /* Get all the counts from all the nodes. 
*/ - int res; - if ((res = MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_INT, - MPI_SUM, MPI_COMM_WORLD)) != MPI_SUCCESS) - mpi_error(res, "Failed to allreduce particle cell weights."); - - /* Main node does the partition calculation. */ - int *celllist = (int *)malloc(sizeof(int) * s->nr_cells); - if (celllist == NULL) error("Failed to allocate celllist"); - - if (e->nodeID == 0) - part_pick_metis(s, e->nr_nodes, weights, NULL, celllist); - - /* Distribute the celllist partition and apply. */ - if ((res = MPI_Bcast(celllist, s->nr_cells, MPI_INT, 0, MPI_COMM_WORLD)) != - MPI_SUCCESS) - mpi_error(res, "Failed to bcast the cell list"); - - /* And apply to our cells */ - part_split_metis(s, e->nr_nodes, celllist); - - free(weights); - free(celllist); -} -#endif - /** * @brief Repartition the cells amongst the nodes. @@ -655,15 +303,8 @@ void engine_repartition(struct engine *e) { if (e->nr_nodes == 1) return; /* Do the repartitioning. */ - if (reparttype == REPART_METIS_BOTH || reparttype == REPART_METIS_EDGE || - reparttype == REPART_METIS_VERTEX_EDGE) { - - metis_repart_edge(reparttype == REPART_METIS_VERTEX_EDGE, - reparttype == REPART_METIS_BOTH, e); - } else if (reparttype == REPART_METIS_VERTEX) { - - metis_repart_vertex(e); - } + part_repart(reparttype, e->nodeID, e->nr_nodes, e->s, e->sched.tasks, + e->sched.nr_tasks); /* Now comes the tricky part: Exchange particles between all nodes. This is done in two steps, first allreducing a matrix of diff --git a/src/engine.h b/src/engine.h index 6bbb10a3112495fe16f591dbca16621768234f76..42cfbac432a2f54ebff6c1f138a17391071c1aa1 100644 --- a/src/engine.h +++ b/src/engine.h @@ -63,7 +63,6 @@ extern const char *engine_policy_names[]; #define engine_maxproxies 64 #define engine_tasksreweight 10 -#define engine_maxmetisweight 10000.0f /* The rank of the engine as a global variable (for messages). */ extern int engine_rank; diff --git a/src/partition.c b/src/partition.c index cee8fe115e6f61ed99014fa6e321623846541170..f40d4feb428410cf483b00dcdd5866ccabf9a942 100644 --- a/src/partition.c +++ b/src/partition.c @@ -33,6 +33,7 @@ #include <stdlib.h> #include <math.h> #include <values.h> +#include <strings.h> /* MPI headers. */ #ifdef WITH_MPI @@ -55,17 +56,20 @@ #define MIN(a, b) ((a) > (b) ? (b) : (a)) #define CHUNK 512 +/* Maximum weight used for METIS. */ +#define metis_maxweight 10000.0f + /* Simple descriptions of initial partition types for reports. */ const char *initpart_name[] = { "gridded cells", "vectorized point associated cells", - "METIS particle weighted cells", + "METIS particle weighted cells", "METIS unweighted cells" }; /* Simple descriptions of repartition types for reports. */ const char *repart_name[] = { - "no", + "no", "METIS edge and vertex time weighted cells", "METIS particle count vertex weighted cells", "METIS time edge weighted cells", @@ -154,7 +158,7 @@ void part_split_vector(struct space *s, int nregions, int *samplecells) { * weighted by the number of particles scheme. Note METIS is optional. */ -/** +/** * @brief Fill the METIS xadj and adjncy arrays defining the graph of cells * in a space. * @@ -168,8 +172,8 @@ void part_split_vector(struct space *s, int nregions, int *samplecells) { * @param xadj the METIS xadj array to fill, must be of size * number of cells in space + 1. NULL for not used. 
*/ -#ifdef HAVE_METIS -void part_graph_init_metis(struct space *s, idx_t *adjncy, idx_t *xadj) { +#if defined(WITH_MPI) && defined(HAVE_METIS) +static void graph_init_metis(struct space *s, idx_t *adjncy, idx_t *xadj) { /* Loop over all cells in the space. */ int cid = 0; @@ -222,6 +226,401 @@ void part_graph_init_metis(struct space *s, idx_t *adjncy, idx_t *xadj) { } #endif +/** + * @brief Accumulate the counts of particles per cell. + * + * @param s the space containing the cells. + * @param counts the number of particles per cell. Should be + * allocated as size s->nr_parts. + */ +#if defined(WITH_MPI) && defined(HAVE_METIS) +static void accumulate_counts(struct space *s, int *counts) { + + struct part *parts = s->parts; + int *cdim = s->cdim; + double ih[3], dim[3]; + ih[0] = s->ih[0]; + ih[1] = s->ih[1]; + ih[2] = s->ih[2]; + dim[0] = s->dim[0]; + dim[1] = s->dim[1]; + dim[2] = s->dim[2]; + + bzero(counts, sizeof(int) * s->nr_cells); + + for (int k = 0; k < s->nr_parts; k++) { + for (int j = 0; j < 3; j++) { + if (parts[k].x[j] < 0.0) + parts[k].x[j] += dim[j]; + else if (parts[k].x[j] >= dim[j]) + parts[k].x[j] -= dim[j]; + } + const int cid = cell_getid(cdim, parts[k].x[0] * ih[0], + parts[k].x[1] * ih[1], parts[k].x[2] * ih[2]); + counts[cid]++; + } +} +#endif + +/** + * @brief Repartition the cells amongst the nodes using task timings + * as edge weights and vertex weights also from task timings + * or particle cells counts. + * + * @param partweights whether particle counts will be used as vertex weights. + * @param bothweights whether vertex and edge weights will be used, otherwise + * only edge weights will be used. + * @param e The #engine. + */ +#if defined(WITH_MPI) && defined(HAVE_METIS) +static void repart_edge_metis(int partweights, int bothweights, + int nodeID, int nr_nodes, struct space *s, + struct task *tasks, int nr_tasks) { + + + /* Create weight arrays using task ticks for vertices and edges (edges + * assume the same graph structure as used in the part_ calls). */ + int nr_cells = s->nr_cells; + struct cell *cells = s->cells; + float wscale = 1e-3, vscale = 1e-3, wscale_buff; + int wtot = 0; + int wmax = 1e9 / nr_nodes; + int wmin; + + /* Allocate and fill the adjncy indexing array defining the graph of + * cells. */ + idx_t *inds; + if ((inds = (idx_t *)malloc(sizeof(idx_t) * 26 * nr_cells)) == NULL) + error("Failed to allocate the inds array"); + graph_init_metis(s, inds, NULL); + + /* Allocate and init weights. */ + int *weights_v = NULL; + int *weights_e = NULL; + if (bothweights) { + if ((weights_v = (int *)malloc(sizeof(int) * nr_cells)) == NULL) + error("Failed to allocate vertex weights arrays."); + bzero(weights_v, sizeof(int) * nr_cells); + } + if ((weights_e = (int *)malloc(sizeof(int) * 26 * nr_cells)) == NULL) + error("Failed to allocate edge weights arrays."); + bzero(weights_e, sizeof(int) * 26 * nr_cells); + + /* Generate task weights for vertices. */ + int taskvweights = (bothweights && !partweights); + + /* Loop over the tasks... */ + for (int j = 0; j < nr_tasks; j++) { + /* Get a pointer to the kth task. */ + struct task *t = &tasks[j]; + + /* Skip un-interesting tasks. */ + if (t->type != task_type_self && t->type != task_type_pair && + t->type != task_type_sub && t->type != task_type_ghost && + t->type != task_type_drift && t->type != task_type_kick && + t->type != task_type_init) + continue; + + /* Get the task weight. */ + int w = (t->toc - t->tic) * wscale; + if (w < 0) error("Bad task weight (%d).", w); + + /* Do we need to re-scale? 
*/ + wtot += w; + while (wtot > wmax) { + wscale /= 2; + wtot /= 2; + w /= 2; + for (int k = 0; k < 26 * nr_cells; k++) weights_e[k] *= 0.5; + if (taskvweights) + for (int k = 0; k < nr_cells; k++) weights_v[k] *= 0.5; + } + + /* Get the top-level cells involved. */ + struct cell *ci, *cj; + for (ci = t->ci; ci->parent != NULL; ci = ci->parent) + ; + if (t->cj != NULL) + for (cj = t->cj; cj->parent != NULL; cj = cj->parent) + ; + else + cj = NULL; + + /* Get the cell IDs. */ + int cid = ci - cells; + + /* Different weights for different tasks. */ + if (t->type == task_type_ghost || t->type == task_type_drift || + t->type == task_type_kick) { + /* Particle updates add only to vertex weight. */ + if (taskvweights) + weights_v[cid] += w; + + } + + /* Self interaction? */ + else if ((t->type == task_type_self && ci->nodeID == nodeID) || + (t->type == task_type_sub && cj == NULL && + ci->nodeID == nodeID)) { + /* Self interactions add only to vertex weight. */ + if (taskvweights) + weights_v[cid] += w; + + } + + /* Pair? */ + else if (t->type == task_type_pair || + (t->type == task_type_sub && cj != NULL)) { + /* In-cell pair? */ + if (ci == cj) { + /* Add weight to vertex for ci. */ + if (taskvweights) + weights_v[cid] += w; + + } + + /* Distinct cells with local ci? */ + else if (ci->nodeID == nodeID) { + /* Index of the jth cell. */ + int cjd = cj - cells; + + /* Add half of weight to each cell. */ + if (taskvweights) { + if (ci->nodeID == nodeID) weights_v[cid] += 0.5 * w; + if (cj->nodeID == nodeID) weights_v[cjd] += 0.5 * w; + } + + /* Add weights to edge. */ + int kk; + for (kk = 26 * cid; inds[kk] != cjd; kk++) + ; + weights_e[kk] += w; + for (kk = 26 * cjd; inds[kk] != cid; kk++) + ; + weights_e[kk] += w; + } + } + } + + /* Re-calculate the vertices if using particle counts. */ + if (partweights && bothweights) { + accumulate_counts(s, weights_v); + + /* Rescale to balance times. */ + float vwscale = (float)wtot / (float)nr_tasks; + for (int k = 0; k < nr_cells; k++) { + weights_v[k] *= vwscale; + } + } + + /* Get the minimum scaling and re-scale if necessary. */ + int res; + if ((res = MPI_Allreduce(&wscale, &wscale_buff, 1, MPI_FLOAT, MPI_MIN, + MPI_COMM_WORLD)) != MPI_SUCCESS) + mpi_error(res, "Failed to allreduce the weight scales."); + + if (wscale_buff != wscale) { + float scale = wscale_buff / wscale; + for (int k = 0; k < 26 * nr_cells; k++) weights_e[k] *= scale; + if (bothweights) + for (int k = 0; k < nr_cells; k++) weights_v[k] *= scale; + } + + /* Merge the weights arrays across all nodes. */ + if (bothweights) { + if ((res = MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_v, weights_v, + nr_cells, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD)) != + MPI_SUCCESS) + mpi_error(res, "Failed to allreduce vertex weights."); + } + + if ((res = MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_e, weights_e, + 26 * nr_cells, MPI_INT, MPI_SUM, 0, + MPI_COMM_WORLD)) != MPI_SUCCESS) + mpi_error(res, "Failed to allreduce edge weights."); + + /* Allocate cell list for the partition. */ + int *celllist = (int *)malloc(sizeof(int) * s->nr_cells); + if (celllist == NULL) error("Failed to allocate celllist"); + + /* As of here, only one node needs to compute the partition. */ + if (nodeID == 0) { + /* Final rescale of all weights to avoid a large range. Large ranges + * have been seen to cause an incomplete graph. */ + wmin = wmax; + wmax = 0; + for (int k = 0; k < 26 * nr_cells; k++) { + wmax = weights_e[k] > wmax ? weights_e[k] : wmax; + wmin = weights_e[k] < wmin ? 
weights_e[k] : wmin; + } + if (bothweights) { + for (int k = 0; k < nr_cells; k++) { + wmax = weights_v[k] > wmax ? weights_v[k] : wmax; + wmin = weights_v[k] < wmin ? weights_v[k] : wmin; + } + } + + if ((wmax - wmin) > metis_maxweight) { + wscale = metis_maxweight / (wmax - wmin); + for (int k = 0; k < 26 * nr_cells; k++) { + weights_e[k] = (weights_e[k] - wmin) * wscale + 1; + } + if (bothweights) { + for (int k = 0; k < nr_cells; k++) { + weights_v[k] = (weights_v[k] - wmin) * wscale + 1; + } + } + } + + /* Make sure there are no zero weights. */ + for (int k = 0; k < 26 * nr_cells; k++) + if (weights_e[k] == 0) weights_e[k] = 1; + if (bothweights) + for (int k = 0; k < nr_cells; k++) + if ((weights_v[k] *= vscale) == 0) weights_v[k] = 1; + + /* And partition, use both weights or not as requested. */ + if (bothweights) + part_pick_metis(s, nr_nodes, weights_v, weights_e, celllist); + else + part_pick_metis(s, nr_nodes, NULL, weights_e, celllist); + + /* Check that all cells have good values. */ + for (int k = 0; k < nr_cells; k++) + if (celllist[k] < 0 || celllist[k] >= nr_nodes) + error("Got bad nodeID %d for cell %i.", celllist[k], k); + + /* Check that the partition is complete and all nodes have some work. */ + int present[nr_nodes]; + int failed = 0; + for (int i = 0; i < nr_nodes; i++) present[i] = 0; + for (int i = 0; i < nr_cells; i++) present[celllist[i]]++; + for (int i = 0; i < nr_nodes; i++) { + if (!present[i]) { + failed = 1; + message("Node %d is not present after repartition", i); + } + } + + /* If partition failed continue with the current one, but make this + * clear. */ + if (failed) { + message( + "WARNING: METIS repartition has failed, continuing with " + "the current partition, load balance will not be optimal"); + for (int k = 0; k < nr_cells; k++) celllist[k] = cells[k].nodeID; + } + } + + /* Distribute the celllist partition and apply. */ + if ((res = MPI_Bcast(celllist, s->nr_cells, MPI_INT, 0, MPI_COMM_WORLD)) != + MPI_SUCCESS) + mpi_error(res, "Failed to bcast the cell list"); + + /* And apply to our cells */ + part_split_metis(s, nr_nodes, celllist); + + /* Clean up. */ + if (bothweights) free(weights_v); + free(weights_e); + free(celllist); +} +#endif + +/** + * @brief Repartition the cells amongst the nodes using vertex weights + * + * @param s The space containing the local cells. + * @param nodeID our MPI node id. + * @param nr_nodes number of MPI nodes. + */ +#if defined(WITH_MPI) && defined(HAVE_METIS) +static void repart_vertex_metis(struct space *s, int nodeID, int nr_nodes) { + + /* Use particle counts as vertex weights. */ + /* Space for particles per cell counts, which will be used as weights. */ + int *weights = NULL; + if ((weights = (int *)malloc(sizeof(int) * s->nr_cells)) == NULL) + error("Failed to allocate weights buffer."); + + /* Check each particle and accumulate the counts per cell. */ + accumulate_counts(s, weights); + + /* Get all the counts from all the nodes. */ + int res; + if ((res = MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_INT, + MPI_SUM, MPI_COMM_WORLD)) != MPI_SUCCESS) + mpi_error(res, "Failed to allreduce particle cell weights."); + + /* Main node does the partition calculation. */ + int *celllist = (int *)malloc(sizeof(int) * s->nr_cells); + if (celllist == NULL) error("Failed to allocate celllist"); + + if (nodeID == 0) + part_pick_metis(s, nr_nodes, weights, NULL, celllist); + + /* Distribute the celllist partition and apply. 
*/ + if ((res = MPI_Bcast(celllist, s->nr_cells, MPI_INT, 0, MPI_COMM_WORLD)) != + MPI_SUCCESS) + mpi_error(res, "Failed to bcast the cell list"); + + /* And apply to our cells */ + part_split_metis(s, nr_nodes, celllist); + + free(weights); + free(celllist); +} +#endif + +/** + * @brief Repartition the space using the given repartition type. + * + * Note that at the end of this process all the cells will be re-distributed + * across the nodes, but the particles themselves will not be. + * + * @param reparttype the type of repartition to attempt, see the repart_type enum. + * @param nodeID our nodeID. + * @param nr_nodes the number of nodes. + * @param s the space of cells holding our local particles. + * @param tasks the completed tasks from the last engine step for our node. + * @param nr_tasks the number of tasks. + */ +void part_repart(enum repart_type reparttype, int nodeID, int nr_nodes, + struct space *s, struct task *tasks, int nr_tasks) { + +#if defined(WITH_MPI) && defined(HAVE_METIS) + + if (reparttype == REPART_METIS_BOTH || reparttype == REPART_METIS_EDGE || + reparttype == REPART_METIS_VERTEX_EDGE) { + + int partweights; + int bothweights; + if (reparttype == REPART_METIS_VERTEX_EDGE) + partweights = 1; + else + partweights = 0; + + if (reparttype == REPART_METIS_BOTH) + bothweights = 1; + else + bothweights = 0; + + repart_edge_metis(partweights, bothweights, nodeID, nr_nodes, s, tasks, + nr_tasks); + + } else if (reparttype == REPART_METIS_VERTEX) { + + repart_vertex_metis(s, nodeID, nr_nodes); + + } else { + error("Unknown repartition type"); + } +#else + error("SWIFT was not compiled with METIS support."); +#endif +} + + /** * @brief Partition the given space into a number of connected regions. * @@ -240,7 +639,7 @@ void part_graph_init_metis(struct space *s, idx_t *adjncy, idx_t *xadj) { */ void part_pick_metis(struct space *s, int nregions, int *vertexw, int *edgew, int *celllist) { -#ifdef HAVE_METIS +#if defined(WITH_MPI) && defined(HAVE_METIS) /* Total number of cells. */ int ncells = s->cdim[0] * s->cdim[1] * s->cdim[2]; @@ -272,7 +671,7 @@ void part_pick_metis(struct space *s, int nregions, int *vertexw, int *edgew, error("Failed to allocate regionid array"); /* Define the cell graph. */ - part_graph_init_metis(s, adjncy, xadj); + graph_init_metis(s, adjncy, xadj); /* Init the vertex weights array. */ if (vertexw != NULL) { @@ -355,7 +754,7 @@ void part_pick_metis(struct space *s, int nregions, int *vertexw, int *edgew, */ void part_split_metis(struct space *s, int nregions, int *celllist) { - for (int i = 0; i < s->nr_cells; i++) + for (int i = 0; i < s->nr_cells; i++) s->cells[i].nodeID = celllist[i]; } diff --git a/src/partition.h b/src/partition.h index 679bcd76e07645da3690cd5af9aeff9a7be62545..c9f9601fa872f5b07230b84a977bc6a87b5986d2 100644 --- a/src/partition.h +++ b/src/partition.h @@ -21,6 +21,7 @@ #include "space.h" #include "cell.h" +#include "task.h" #ifdef HAVE_METIS #include <metis.h> #endif @@ -63,6 +64,8 @@ void part_split_metis(struct space *s, int nregions, int *celllist); #ifdef HAVE_METIS void part_graph_init_metis(struct space *s, idx_t *adjncy, idx_t *xadj); +void part_repart(enum repart_type reparttype, int nodeID, int nr_nodes, + struct space *s, struct task *tasks, int nr_tasks); #endif int part_check_complete(struct space *s, int verbose, int nregions);
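
Reviewer note, not part of the patch: a minimal sketch of how the consolidated part_repart() entry point is expected to be driven after this change. The wrapper name repartition_and_redistribute() is hypothetical, and showing the follow-up particle exchange via the existing engine_redistribute() is an assumption; in engine_repartition() itself that exchange is performed inline right after the part_repart() call.

/* Illustrative sketch only -- not part of the patch. Assumes engine.h
 * exposes engine_redistribute() and the engine fields used in the hunks
 * above (nodeID, nr_nodes, s, sched.tasks, sched.nr_tasks). */
#include "engine.h"
#include "partition.h"

void repartition_and_redistribute(struct engine *e,
                                  enum repart_type reparttype) {

  /* Nothing to balance on a single node. */
  if (e->nr_nodes == 1) return;

  /* Reassign the top-level cells to nodes. part_repart() calls error()
   * at run time when SWIFT was built without MPI and METIS, so no
   * compile-time guard is needed at the call site. */
  part_repart(reparttype, e->nodeID, e->nr_nodes, e->s, e->sched.tasks,
              e->sched.nr_tasks);

  /* The cells now carry new nodeIDs; the particles still have to be
   * exchanged between nodes to match them. */
  engine_redistribute(e);
}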