diff --git a/examples/main.c b/examples/main.c index 32264f185f2d7cd7efb010f667ce74e486114c9d..a28faa628b856b3f1917331aecc6b95835c628b3 100644 --- a/examples/main.c +++ b/examples/main.c @@ -79,10 +79,25 @@ int main(int argc, char *argv[]) { char dumpfile[30]; float dt_max = 0.0f, dt_min = 0.0f; ticks tic; - int nr_nodes = 1, myrank = 0, grid[3] = {1, 1, 1}; + int nr_nodes = 1, myrank = 0; FILE *file_thread; int with_outputs = 1; +#ifdef WITH_MPI + struct partition initial_partition; + enum repartition_type reparttype = REPART_NONE; + + initial_partition.type = INITPART_GRID; + initial_partition.grid[0] = 1; + initial_partition.grid[1] = 1; + initial_partition.grid[2] = 1; +#ifdef HAVE_METIS + /* Defaults make use of METIS. */ + reparttype = REPART_METIS_BOTH; + initial_partition.type = INITPART_METIS_NOWEIGHT; +#endif +#endif + /* Choke on FP-exceptions. */ // feenableexcept( FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW ); @@ -107,9 +122,11 @@ int main(int argc, char *argv[]) { fflush(stdout); /* Set a default grid so that grid[0]*grid[1]*grid[2] == nr_nodes. */ - factor(nr_nodes, &grid[0], &grid[1]); - factor(nr_nodes / grid[1], &grid[0], &grid[2]); - factor(grid[0] * grid[1], &grid[1], &grid[0]); + factor(nr_nodes, &initial_partition.grid[0], &initial_partition.grid[1]); + factor(nr_nodes / initial_partition.grid[1], &initial_partition.grid[0], + &initial_partition.grid[2]); + factor(initial_partition.grid[0] * initial_partition.grid[1], + &initial_partition.grid[1], &initial_partition.grid[0]); #endif /* Greeting message */ @@ -137,7 +154,7 @@ int main(int argc, char *argv[]) { bzero(&s, sizeof(struct space)); /* Parse the options */ - while ((c = getopt(argc, argv, "a:c:d:e:f:g:m:oq:s:t:w:y:z:")) != -1) + while ((c = getopt(argc, argv, "a:c:d:e:f:m:oP:q:R:s:t:w:y:z:")) != -1) switch (c) { case 'a': if (sscanf(optarg, "%lf", &scaling) != 1) @@ -166,10 +183,6 @@ int main(int argc, char *argv[]) { case 'f': if (!strcpy(ICfileName, optarg)) error("Error parsing IC file name."); break; - case 'g': - if (sscanf(optarg, "%i %i %i", &grid[0], &grid[1], &grid[2]) != 3) - error("Error parsing grid."); - break; case 'm': if (sscanf(optarg, "%lf", &h_max) != 1) error("Error parsing h_max."); if (myrank == 0) message("maximum h set to %e.", h_max); @@ -178,10 +191,63 @@ int main(int argc, char *argv[]) { case 'o': with_outputs = 0; break; + case 'P': + /* Partition type is one of "g", "m", "w", or "v"; "g" can be + * followed by three numbers defining the grid. */ +#ifdef WITH_MPI + switch (optarg[0]) { + case 'g': + initial_partition.type = INITPART_GRID; + if (strlen(optarg) > 2) { + if (sscanf(optarg, "g %i %i %i", &initial_partition.grid[0], + &initial_partition.grid[1], + &initial_partition.grid[2]) != 3) + error("Error parsing grid."); + } + break; +#ifdef HAVE_METIS + case 'm': + initial_partition.type = INITPART_METIS_NOWEIGHT; + break; + case 'w': + initial_partition.type = INITPART_METIS_WEIGHT; + break; +#endif + case 'v': + initial_partition.type = INITPART_VECTORIZE; + break; + } +#endif + break; case 'q': if (sscanf(optarg, "%d", &nr_queues) != 1) error("Error parsing number of queues."); break; + case 'R': + /* Repartition type "n", "b", "v", "e" or "x". + * Note only none is available without METIS. 
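+         * Here "n" selects no repartitioning, "b" METIS with task-time vertex and
+         * edge weights, "v" METIS with particle-count vertex weights, "e" METIS
+         * with task-time edge weights, and "x" METIS with particle-count vertices
+         * plus task-time edges.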
*/ +#ifdef WITH_MPI + switch (optarg[0]) { + case 'n': + reparttype = REPART_NONE; + break; +#ifdef HAVE_METIS + case 'b': + reparttype = REPART_METIS_BOTH; + break; + case 'v': + reparttype = REPART_METIS_VERTEX; + break; + case 'e': + reparttype = REPART_METIS_EDGE; + break; + case 'x': + reparttype = REPART_METIS_VERTEX_EDGE; + break; +#endif + } +#endif + break; case 's': if (sscanf(optarg, "%lf %lf %lf", &shift[0], &shift[1], &shift[2]) != 3) error("Error parsing shift."); @@ -212,10 +278,15 @@ int main(int argc, char *argv[]) { break; } -#if defined(WITH_MPI) +#ifdef WITH_MPI if (myrank == 0) { message("Running with %i thread(s) per node.", nr_threads); - message("grid set to [ %i %i %i ].", grid[0], grid[1], grid[2]); + message("Using initial partition %s", + initial_partition_name[initial_partition.type]); + if (initial_partition.type == INITPART_GRID) + message("grid set to [ %i %i %i ].", initial_partition.grid[0], + initial_partition.grid[1], initial_partition.grid[2]); + message("Using %s repartitioning", repartition_name[reparttype]); if (nr_nodes == 1) { message("WARNING: you are running with one MPI rank."); @@ -349,7 +420,7 @@ int main(int argc, char *argv[]) { #ifdef WITH_MPI /* Split the space. */ - engine_split(&e, grid); + engine_split(&e, &initial_partition); engine_redistribute(&e); #endif @@ -400,8 +471,8 @@ int main(int argc, char *argv[]) { for (j = 0; e.time < time_end; j++) { /* Repartition the space amongst the nodes? */ -#if defined(WITH_MPI) && defined(HAVE_METIS) - if (j % 100 == 2) e.forcerepart = 1; +#ifdef WITH_MPI + if (j % 100 == 2) e.forcerepart = reparttype; #endif timers_reset(timers_mask_all); diff --git a/src/Makefile.am b/src/Makefile.am index 8cb27ef60ffaf884b1c0d7626b5d17d6a7a2a7fc..f4b293925bfcfdf989889db768d60e3de778c985 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -35,13 +35,13 @@ endif # List required headers include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \ engine.h swift.h serial_io.h timers.h debug.h scheduler.h proxy.h parallel_io.h \ - common_io.h single_io.h multipole.h map.h tools.h + common_io.h single_io.h multipole.h map.h tools.h partition.h # Common source files AM_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \ serial_io.c timers.c debug.c scheduler.c proxy.c parallel_io.c \ units.c common_io.c single_io.c multipole.c version.c map.c \ - kernel.c tools.c part.c + kernel.c tools.c part.c partition.c # Include files for distribution, not installation. nobase_noinst_HEADERS = approx_math.h atomic.h cycle.h error.h inline.h kernel.h vector.h \ diff --git a/src/engine.c b/src/engine.c index 9c89080ef8fa5bf8d0b9c3c6c18f29f2a0d7c084..272d5c0c9ff07b98befed3c68810d4c909e78c57 100644 --- a/src/engine.c +++ b/src/engine.c @@ -1,6 +1,7 @@ /******************************************************************************* * This file is part of SWIFT. * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) + * 2015 Peter W. Draper (p.w.draper@durham.ac.uk) * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published @@ -33,10 +34,6 @@ /* MPI headers. */ #ifdef WITH_MPI #include <mpi.h> -/* METIS headers only used when MPI is also available. 
*/ -#ifdef HAVE_METIS -#include <metis.h> -#endif #endif #ifdef HAVE_LIBNUMA @@ -55,6 +52,7 @@ #include "hydro.h" #include "minmax.h" #include "part.h" +#include "partition.h" #include "timers.h" const char *engine_policy_names[12] = { @@ -287,6 +285,7 @@ void engine_redistribute(struct engine *e) { #endif } + /** * @brief Repartition the cells amongst the nodes. * @@ -297,343 +296,17 @@ void engine_repartition(struct engine *e) { #if defined(WITH_MPI) && defined(HAVE_METIS) - int i, j, k, l, cid, cjd, ii, jj, kk, res; - idx_t *inds, *nodeIDs; - idx_t *weights_v = NULL, *weights_e = NULL; - struct space *s = e->s; - int nr_cells = s->nr_cells, my_cells = 0; - struct cell *cells = s->cells; - int ind[3], *cdim = s->cdim; - struct task *t, *tasks = e->sched.tasks; - struct cell *ci, *cj; - int nr_nodes = e->nr_nodes, nodeID = e->nodeID; - float wscale = 1e-3, vscale = 1e-3, wscale_buff; - idx_t wtot = 0; - idx_t wmax = 1e9 / e->nr_nodes; - idx_t wmin; - /* Clear the repartition flag. */ - e->forcerepart = 0; + enum repartition_type reparttype = e->forcerepart; + e->forcerepart = REPART_NONE; /* Nothing to do if only using a single node. Also avoids METIS * bug that doesn't handle this case well. */ - if (nr_nodes == 1) return; - - /* Allocate the inds and weights. */ - if ((inds = (idx_t *)malloc(sizeof(idx_t) * 26 *nr_cells)) == NULL || - (weights_v = (idx_t *)malloc(sizeof(idx_t) *nr_cells)) == NULL || - (weights_e = (idx_t *)malloc(sizeof(idx_t) * 26 *nr_cells)) == NULL || - (nodeIDs = (idx_t *)malloc(sizeof(idx_t) * nr_cells)) == NULL) - error("Failed to allocate inds and weights arrays."); - - /* Fill the inds array. */ - for (cid = 0; cid < nr_cells; cid++) { - ind[0] = cells[cid].loc[0] / s->cells[cid].h[0] + 0.5; - ind[1] = cells[cid].loc[1] / s->cells[cid].h[1] + 0.5; - ind[2] = cells[cid].loc[2] / s->cells[cid].h[2] + 0.5; - l = 0; - for (i = -1; i <= 1; i++) { - ii = ind[0] + i; - if (ii < 0) - ii += cdim[0]; - else if (ii >= cdim[0]) - ii -= cdim[0]; - for (j = -1; j <= 1; j++) { - jj = ind[1] + j; - if (jj < 0) - jj += cdim[1]; - else if (jj >= cdim[1]) - jj -= cdim[1]; - for (k = -1; k <= 1; k++) { - kk = ind[2] + k; - if (kk < 0) - kk += cdim[2]; - else if (kk >= cdim[2]) - kk -= cdim[2]; - if (i || j || k) { - inds[cid * 26 + l] = cell_getid(cdim, ii, jj, kk); - l += 1; - } - } - } - } - } - - /* Init the weights arrays. */ - bzero(weights_e, sizeof(idx_t) * 26 * nr_cells); - bzero(weights_v, sizeof(idx_t) * nr_cells); - - /* Loop over the tasks... */ - for (j = 0; j < e->sched.nr_tasks; j++) { - - /* Get a pointer to the kth task. */ - t = &tasks[j]; - - /* Skip un-interesting tasks. */ - if (t->type != task_type_self && t->type != task_type_pair && - t->type != task_type_sub && t->type != task_type_ghost && - t->type != task_type_drift && t->type != task_type_kick && - t->type != task_type_init) - continue; - - /* Get the task weight. */ - idx_t w = (t->toc - t->tic) * wscale; - if (w < 0) error("Bad task weight (%" SCIDX ").", w); - - /* Do we need to re-scale? */ - wtot += w; - while (wtot > wmax) { - wscale /= 2; - wtot /= 2; - w /= 2; - for (k = 0; k < 26 * nr_cells; k++) weights_e[k] *= 0.5; - for (k = 0; k < nr_cells; k++) weights_v[k] *= 0.5; - } - - /* Get the top-level cells involved. */ - for (ci = t->ci; ci->parent != NULL; ci = ci->parent) - ; - if (t->cj != NULL) - for (cj = t->cj; cj->parent != NULL; cj = cj->parent) - ; - else - cj = NULL; - - /* Get the cell IDs. */ - cid = ci - cells; - - /* Different weights for different tasks. 
*/ - if (t->type == task_type_ghost || t->type == task_type_drift || - t->type == task_type_kick) { + if (e->nr_nodes == 1) return; - /* Particle updates add only to vertex weight. */ - weights_v[cid] += w; - - } - - /* Self interaction? */ - else if ((t->type == task_type_self && ci->nodeID == nodeID) || - (t->type == task_type_sub && cj == NULL && ci->nodeID == nodeID)) { - - /* Self interactions add only to vertex weight. */ - weights_v[cid] += w; - - } - - /* Pair? */ - else if (t->type == task_type_pair || - (t->type == task_type_sub && cj != NULL)) { - - /* In-cell pair? */ - if (ci == cj) { - - /* Add weight to vertex for ci. */ - weights_v[cid] += w; - - } - - /* Distinct cells with local ci? */ - else if (ci->nodeID == nodeID) { - - /* Index of the jth cell. */ - cjd = cj - cells; - - /* Add half of weight to each cell. */ - if (ci->nodeID == nodeID) weights_v[cid] += 0.5 * w; - if (cj->nodeID == nodeID) weights_v[cjd] += 0.5 * w; - - /* Add Weight to edge. */ - for (k = 26 * cid; inds[k] != cjd; k++) - ; - weights_e[k] += w; - for (k = 26 * cjd; inds[k] != cid; k++) - ; - weights_e[k] += w; - } - } - } - - /* Get the minimum scaling and re-scale if necessary. */ - if ((res = MPI_Allreduce(&wscale, &wscale_buff, 1, MPI_FLOAT, MPI_MIN, - MPI_COMM_WORLD)) != MPI_SUCCESS) { - char buff[MPI_MAX_ERROR_STRING]; - MPI_Error_string(res, buff, &i); - error("Failed to allreduce the weight scales (%s).", buff); - } - if (wscale_buff != wscale) { - float scale = wscale_buff / wscale; - for (k = 0; k < 26 * nr_cells; k++) weights_e[k] *= scale; - for (k = 0; k < nr_cells; k++) weights_v[k] *= scale; - } - -/* Merge the weights arrays across all nodes. */ -#if IDXTYPEWIDTH == 32 - if ((res = MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_v, weights_v, - nr_cells, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD)) != - MPI_SUCCESS) { -#else - if ((res = MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_v, weights_v, - nr_cells, MPI_LONG_LONG_INT, MPI_SUM, 0, - MPI_COMM_WORLD)) != MPI_SUCCESS) { -#endif - char buff[MPI_MAX_ERROR_STRING]; - MPI_Error_string(res, buff, &i); - error("Failed to allreduce vertex weights (%s).", buff); - } -#if IDXTYPEWIDTH == 32 - if (MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_e, weights_e, - 26 * nr_cells, MPI_INT, MPI_SUM, 0, - MPI_COMM_WORLD) != MPI_SUCCESS) -#else - if (MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_e, weights_e, - 26 * nr_cells, MPI_LONG_LONG_INT, MPI_SUM, 0, - MPI_COMM_WORLD) != MPI_SUCCESS) -#endif - error("Failed to allreduce edge weights."); - - /* As of here, only one node needs to compute the partition. */ - if (nodeID == 0) { - - /* Final rescale of all weights to avoid a large range. Large ranges have - * been seen to cause an incomplete graph. */ - wmin = wmax; - wmax = 0.0; - for (k = 0; k < 26 * nr_cells; k++) { - wmax = weights_e[k] > wmax ? weights_e[k] : wmax; - wmin = weights_e[k] < wmin ? weights_e[k] : wmin; - } - if ((wmax - wmin) > engine_maxmetisweight) { - wscale = engine_maxmetisweight / (wmax - wmin); - for (k = 0; k < 26 * nr_cells; k++) { - weights_e[k] = (weights_e[k] - wmin) * wscale + 1; - } - for (k = 0; k < nr_cells; k++) { - weights_v[k] = (weights_v[k] - wmin) * wscale + 1; - } - } - - /* Check that the edge weights are fully symmetric. */ - /* for ( cid = 0 ; cid < nr_cells ; cid++ ) - for ( k = 0 ; k < 26 ; k++ ) { - cjd = inds[ cid*26 + k ]; - for ( j = 26*cjd ; inds[j] != cid ; j++ ); - if ( weights_e[ cid*26+k ] != weights_e[ j ] ) - error( "Unsymmetric edge weights detected (%i vs %i)." 
, - weights_e[ cid*26+k ] , weights_e[ j ] ); - } */ - /* int w_min = weights_e[0], w_max = weights_e[0], w_tot = weights_e[0]; - for ( k = 1 ; k < 26*nr_cells ; k++ ) { - w_tot += weights_e[k]; - if ( weights_e[k] < w_min ) - w_min = weights_e[k]; - else if ( weights_e[k] > w_max ) - w_max = weights_e[k]; - } - message( "edge weights in [ %i , %i ], tot=%i." , w_min , w_max , w_tot ); - w_min = weights_e[0], w_max = weights_e[0]; w_tot = weights_v[0]; - for ( k = 1 ; k < nr_cells ; k++ ) { - w_tot += weights_v[k]; - if ( weights_v[k] < w_min ) - w_min = weights_v[k]; - else if ( weights_v[k] > w_max ) - w_max = weights_v[k]; - } - message( "vertex weights in [ %i , %i ], tot=%i." , w_min , w_max , w_tot ); - */ - - /* Make sure there are no zero weights. */ - for (k = 0; k < 26 * nr_cells; k++) - if (weights_e[k] == 0) weights_e[k] = 1; - for (k = 0; k < nr_cells; k++) - if ((weights_v[k] *= vscale) == 0) weights_v[k] = 1; - - /* Allocate and fill the connection array. */ - idx_t *offsets; - if ((offsets = (idx_t *)malloc(sizeof(idx_t) * (nr_cells + 1))) == NULL) - error("Failed to allocate offsets buffer."); - offsets[0] = 0; - for (k = 0; k < nr_cells; k++) offsets[k + 1] = offsets[k] + 26; - - /* Set the METIS options. +1 to keep the GCC sanitizer happy. */ - idx_t options[METIS_NOPTIONS + 1]; - METIS_SetDefaultOptions(options); - options[METIS_OPTION_OBJTYPE] = METIS_OBJTYPE_CUT; - options[METIS_OPTION_NUMBERING] = 0; - options[METIS_OPTION_CONTIG] = 1; - options[METIS_OPTION_NCUTS] = 10; - options[METIS_OPTION_NITER] = 20; - // options[ METIS_OPTION_UFACTOR ] = 1; - - /* Set the initial partition, although this is probably ignored. */ - for (k = 0; k < nr_cells; k++) nodeIDs[k] = cells[k].nodeID; - - /* Call METIS. */ - idx_t one = 1, idx_nr_cells = nr_cells, idx_nr_nodes = nr_nodes; - idx_t objval; - - /* Dump graph in METIS format */ - /*dumpMETISGraph("metis_graph", idx_nr_cells, one, offsets, inds, - weights_v, NULL, weights_e);*/ - - if (METIS_PartGraphRecursive(&idx_nr_cells, &one, offsets, inds, weights_v, - NULL, weights_e, &idx_nr_nodes, NULL, NULL, - options, &objval, nodeIDs) != METIS_OK) - error("Call to METIS_PartGraphRecursive failed."); - - /* Dump the 3d array of cell IDs. */ - /* printf( "engine_repartition: nodeIDs = reshape( [" ); - for ( i = 0 ; i < cdim[0]*cdim[1]*cdim[2] ; i++ ) - printf( "%i " , (int)nodeIDs[ i ] ); - printf("] ,%i,%i,%i);\n",cdim[0],cdim[1],cdim[2]); */ - - /* Check that the nodeIDs are ok. */ - for (k = 0; k < nr_cells; k++) - if (nodeIDs[k] < 0 || nodeIDs[k] >= nr_nodes) - error("Got bad nodeID %" PRIDX " for cell %i.", nodeIDs[k], k); - - /* Check that the partition is complete and all nodes have some work. */ - int present[nr_nodes]; - int failed = 0; - for (i = 0; i < nr_nodes; i++) present[i] = 0; - for (i = 0; i < nr_cells; i++) present[nodeIDs[i]]++; - for (i = 0; i < nr_nodes; i++) { - if (!present[i]) { - failed = 1; - message("Node %d is not present after repartition", i); - } - } - - /* If partition failed continue with the current one, but make this - * clear. */ - if (failed) { - message( - "WARNING: METIS repartition has failed, continuing with " - "the current partition, load balance will not be optimal"); - for (k = 0; k < nr_cells; k++) nodeIDs[k] = cells[k].nodeID; - } - } - -/* Broadcast the result of the partition. 
*/ -#if IDXTYPEWIDTH == 32 - if (MPI_Bcast(nodeIDs, nr_cells, MPI_INT, 0, MPI_COMM_WORLD) != MPI_SUCCESS) - error("Failed to bcast the node IDs."); -#else - if (MPI_Bcast(nodeIDs, nr_cells, MPI_LONG_LONG_INT, 0, MPI_COMM_WORLD) != - MPI_SUCCESS) - error("Failed to bcast the node IDs."); -#endif - - /* Set the cell nodeIDs and clear any non-local parts. */ - for (k = 0; k < nr_cells; k++) { - cells[k].nodeID = nodeIDs[k]; - if (nodeIDs[k] == nodeID) my_cells += 1; - } - - /* Clean up. */ - free(inds); - free(weights_v); - free(weights_e); - free(nodeIDs); + /* Do the repartitioning. */ + partition_repartition(reparttype, e->nodeID, e->nr_nodes, e->s, + e->sched.tasks, e->sched.nr_tasks); /* Now comes the tricky part: Exchange particles between all nodes. This is done in two steps, first allreducing a matrix of @@ -1894,7 +1567,7 @@ void engine_step(struct engine *e) { // message("\nACCELERATION AND KICK\n"); /* Re-distribute the particles amongst the nodes? */ - if (e->forcerepart) engine_repartition(e); + if (e->forcerepart != REPART_NONE) engine_repartition(e); /* Prepare the space. */ engine_prepare(e); @@ -2054,34 +1727,19 @@ void engine_makeproxies(struct engine *e) { } /** - * @brief Split the underlying space according to the given grid. + * @brief Split the underlying space into regions and assign to separate nodes. * * @param e The #engine. - * @param grid The grid. + * @param initial_partition structure defining the cell partition technique */ -void engine_split(struct engine *e, int *grid) { +void engine_split(struct engine *e, struct partition *initial_partition) { #ifdef WITH_MPI - - int j, k; - int ind[3]; struct space *s = e->s; - struct cell *c; - /* If we've got the wrong number of nodes, fail. */ - if (e->nr_nodes != grid[0] * grid[1] * grid[2]) - error("Grid size does not match number of nodes."); - - /* Run through the cells and set their nodeID. */ - // message("s->dim = [%e,%e,%e]", s->dim[0], s->dim[1], s->dim[2]); - for (k = 0; k < s->nr_cells; k++) { - c = &s->cells[k]; - for (j = 0; j < 3; j++) ind[j] = c->loc[j] / s->dim[j] * grid[j]; - c->nodeID = ind[0] + grid[0] * (ind[1] + grid[1] * ind[2]); - // message("cell at [%e,%e,%e]: ind = [%i,%i,%i], nodeID = %i", c->loc[0], - // c->loc[1], c->loc[2], ind[0], ind[1], ind[2], c->nodeID); - } + /* Do the initial partition of the cells. */ + partition_initial_partition(initial_partition, e->nodeID, e->nr_nodes, s); /* Make the proxies. 
*/ engine_makeproxies(e); @@ -2104,9 +1762,6 @@ void engine_split(struct engine *e, int *grid) { free(s->xparts); s->parts = parts_new; s->xparts = xparts_new; - -#else - error("SWIFT was not compiled with MPI support."); #endif } @@ -2175,7 +1830,7 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads, int home = numa_node_of_cpu(sched_getcpu()), half = nr_cores / 2; bool done = false, swap_hyperthreads = hyperthreads_present(); - if (swap_hyperthreads && nodeID == 0) + if (swap_hyperthreads && nodeID == 0) message("prefer physical cores to hyperthreads"); while (!done) { @@ -2228,7 +1883,7 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads, e->proxy_ind = NULL; e->nr_proxies = 0; e->forcerebuild = 1; - e->forcerepart = 0; + e->forcerepart = REPART_NONE; e->links = NULL; e->nr_links = 0; e->timeBegin = timeBegin; diff --git a/src/engine.h b/src/engine.h index 38bdb262288f456d70c597f0d18dd4715dfdd2b8..13c7ec40612713d3771543150763da6924144c1a 100644 --- a/src/engine.h +++ b/src/engine.h @@ -38,6 +38,7 @@ #include "scheduler.h" #include "space.h" #include "task.h" +#include "partition.h" /* Some constants. */ enum engine_policy { @@ -62,7 +63,6 @@ extern const char *engine_policy_names[]; #define engine_maxproxies 64 #define engine_tasksreweight 10 -#define engine_maxmetisweight 10000.0f /* The rank of the engine as a global variable (for messages). */ extern int engine_rank; @@ -149,7 +149,8 @@ struct engine { float wallclock_time; /* Force the engine to rebuild? */ - int forcerebuild, forcerepart; + int forcerebuild; + enum repartition_type forcerepart; /* How many steps have we done with the same set of tasks? */ int tasks_age; @@ -177,7 +178,7 @@ void engine_print(struct engine *e); void engine_init_particles(struct engine *e); void engine_step(struct engine *e); void engine_maketasks(struct engine *e); -void engine_split(struct engine *e, int *grid); +void engine_split(struct engine *e, struct partition *initial_partition); int engine_exchange_strays(struct engine *e, int offset, int *ind, int N); void engine_rebuild(struct engine *e); void engine_repartition(struct engine *e); diff --git a/src/error.h b/src/error.h index 7e5c290203786bce33d244f0aa2dace621ec374d..14c6518efa99b4c5b2a5862619426f30d7f69b7b 100644 --- a/src/error.h +++ b/src/error.h @@ -2,6 +2,7 @@ * This file is part of SWIFT. * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk), * Matthieu Schaller (matthieu.schaller@durham.ac.uk). + * 2015 Peter W. Draper (p.w.draper@durham.ac.uk) * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published @@ -49,6 +50,34 @@ extern int engine_rank; } #endif +#ifdef WITH_MPI +/** + * @brief MPI error macro. Prints the message given in argument, + * followed by the MPI error string and aborts. + * + */ +#define mpi_error(res,s, ...) \ + { \ + fprintf(stderr, "[%03i] %s:%s():%i: " s "\n", engine_rank, __FILE__, \ + __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + int len = 1024; \ + char buf[len]; \ + MPI_Error_string( res, buf, &len ); \ + fprintf(stderr, "%s\n\n", buf ); \ + MPI_Abort(MPI_COMM_WORLD, -1); \ + } + +#define mpi_error_string(res,s, ...) 
\ + { \ + fprintf(stderr, "[%03i] %s:%s():%i: " s "\n", engine_rank, __FILE__, \ + __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + int len = 1024; \ + char buf[len]; \ + MPI_Error_string( res, buf, &len ); \ + fprintf(stderr, "%s\n\n", buf ); \ + } +#endif + /** * @brief Macro to print a localized message with variable arguments. * diff --git a/src/partition.c b/src/partition.c new file mode 100644 index 0000000000000000000000000000000000000000..f1fe2b2a8436994de13501dc4ce6714ec1d2e92d --- /dev/null +++ b/src/partition.c @@ -0,0 +1,955 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (c) 2016 Peter W. Draper (p.w.draper@durham.ac.uk) + * Pedro Gonnet (pedro.gonnet@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ + +/** + * @file partition.c + * @brief file of various techniques for partitioning and repartitioning + * a grid of cells into geometrically connected regions and distributing + * these around a number of MPI nodes. + * + * Currently supported partitioning types: grid, vectorise and METIS. + */ + +/* Config parameters. */ +#include "../config.h" + +/* Standard headers. */ +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <values.h> + +/* MPI headers. */ +#ifdef WITH_MPI +#include <mpi.h> +/* METIS headers only used when MPI is also available. */ +#ifdef HAVE_METIS +#include <metis.h> +#endif +#endif + +/* Local headers. */ +#include "const.h" +#include "debug.h" +#include "error.h" +#include "partition.h" +#include "space.h" + +/* Maximum weight used for METIS. */ +#define metis_maxweight 10000.0f + +/* Simple descriptions of initial partition types for reports. */ +const char *initial_partition_name[] = { + "gridded cells", + "vectorized point associated cells", + "METIS particle weighted cells", + "METIS unweighted cells" +}; + +/* Simple descriptions of repartition types for reports. */ +const char *repartition_name[] = { + "no", + "METIS edge and vertex time weighted cells", + "METIS particle count vertex weighted cells", + "METIS time edge weighted cells", + "METIS particle count vertex and time edge cells" +}; + +/* Local functions, if needed. */ +static int check_complete(struct space *s, int verbose, int nregions); + +/* Vectorisation support */ +/* ===================== */ + +#if defined(WITH_MPI) +/** + * @brief Pick a number of cell positions from a vectorised list. + * + * Vectorise the cell space and pick positions in it for the number of + * expected regions using a single step. Vectorisation is guaranteed + * to work, providing there are more cells than regions. + * + * @param s the space. 
+ * @param nregions the number of regions + * @param samplecells the list of sample cell positions, size of 3*nregions + */ +static void pick_vector(struct space *s, int nregions, int *samplecells) { + + /* Get length of space and divide up. */ + int length = s->cdim[0] * s->cdim[1] * s->cdim[2]; + if (nregions > length) { + error("Too few cells (%d) for this number of regions (%d)", length, + nregions); + } + + int step = length / nregions; + int n = 0; + int m = 0; + int l = 0; + + for (int i = 0; i < s->cdim[0]; i++) { + for (int j = 0; j < s->cdim[1]; j++) { + for (int k = 0; k < s->cdim[2]; k++) { + if (n == 0 && l < nregions) { + samplecells[m++] = i; + samplecells[m++] = j; + samplecells[m++] = k; + l++; + } + n++; + if (n == step) n = 0; + } + } + } +} +#endif + +#if defined(WITH_MPI) +/** + * @brief Partition the space. + * + * Using the sample positions as seeds pick cells that are geometrically + * closest and apply the partition to the space. + */ +static void split_vector(struct space *s, int nregions, int *samplecells) { + int n = 0; + for (int i = 0; i < s->cdim[0]; i++) { + for (int j = 0; j < s->cdim[1]; j++) { + for (int k = 0; k < s->cdim[2]; k++) { + int select = -1; + float rsqmax = FLT_MAX; + int m = 0; + for (int l = 0; l < nregions; l++) { + float dx = samplecells[m++] - i; + float dy = samplecells[m++] - j; + float dz = samplecells[m++] - k; + float rsq = (dx * dx + dy * dy + dz * dz); + if (rsq < rsqmax) { + rsqmax = rsq; + select = l; + } + } + s->cells[n++].nodeID = select; + } + } + } +} +#endif + +/* METIS support + * ============= + * + * METIS partitions using a multi-level k-way scheme. We support using this in + * a unweighted scheme, which works well and seems to be guaranteed, and a + * weighted by the number of particles scheme. Note METIS is optional. + * + * Repartitioning is based on METIS and uses weights determined from the times + * that cell tasks have taken. These weight the graph edges and vertices, or + * just the edges, with vertex weights from the particle counts or none. + */ + +#if defined(WITH_MPI) && defined(HAVE_METIS) +/** + * @brief Fill the METIS xadj and adjncy arrays defining the graph of cells + * in a space. + * + * See the METIS manual if you want to understand this format. The cell graph + * consists of all nodes as vertices with edges as the connections to all + * neighbours, so we have 26 per vertex. + * + * @param s the space of cells. + * @param adjncy the METIS adjncy array to fill, must be of size + * 26 * the number of cells in the space. + * @param xadj the METIS xadj array to fill, must be of size + * number of cells in space + 1. NULL for not used. + */ +static void graph_init_metis(struct space *s, idx_t *adjncy, idx_t *xadj) { + + /* Loop over all cells in the space. */ + int cid = 0; + for (int l = 0; l < s->cdim[0]; l++) { + for (int m = 0; m < s->cdim[1]; m++) { + for (int n = 0; n < s->cdim[2]; n++) { + + /* Visit all neighbours of this cell, wrapping space at edges. */ + int p = 0; + for (int i = -1; i <= 1; i++) { + int ii = l + i; + if (ii < 0) + ii += s->cdim[0]; + else if (ii >= s->cdim[0]) + ii -= s->cdim[0]; + for (int j = -1; j <= 1; j++) { + int jj = m + j; + if (jj < 0) + jj += s->cdim[1]; + else if (jj >= s->cdim[1]) + jj -= s->cdim[1]; + for (int k = -1; k <= 1; k++) { + int kk = n + k; + if (kk < 0) + kk += s->cdim[2]; + else if (kk >= s->cdim[2]) + kk -= s->cdim[2]; + + /* If not self, record id of neighbour. 
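+                   * Each cell records exactly 26 neighbour ids, so the xadj
+                   * offsets set below can simply advance by 26 per cell.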
*/ + if (i || j || k) { + adjncy[cid * 26 + p] = cell_getid(s->cdim, ii, jj, kk); + p++; + } + } + } + } + + /* Next cell. */ + cid++; + } + } + } + + /* If given set xadj. */ + if (xadj != NULL) { + xadj[0] = 0; + for (int k = 0; k < s->nr_cells; k++) xadj[k + 1] = xadj[k] + 26; + + } +} +#endif + +#if defined(WITH_MPI) && defined(HAVE_METIS) +/** + * @brief Accumulate the counts of particles per cell. + * + * @param s the space containing the cells. + * @param counts the number of particles per cell. Should be + * allocated as size s->nr_parts. + */ +static void accumulate_counts(struct space *s, int *counts) { + + struct part *parts = s->parts; + int *cdim = s->cdim; + double ih[3], dim[3]; + ih[0] = s->ih[0]; + ih[1] = s->ih[1]; + ih[2] = s->ih[2]; + dim[0] = s->dim[0]; + dim[1] = s->dim[1]; + dim[2] = s->dim[2]; + + bzero(counts, sizeof(int) * s->nr_cells); + + for (int k = 0; k < s->nr_parts; k++) { + for (int j = 0; j < 3; j++) { + if (parts[k].x[j] < 0.0) + parts[k].x[j] += dim[j]; + else if (parts[k].x[j] >= dim[j]) + parts[k].x[j] -= dim[j]; + } + const int cid = cell_getid(cdim, parts[k].x[0] * ih[0], + parts[k].x[1] * ih[1], parts[k].x[2] * ih[2]); + counts[cid]++; + } +} +#endif + +#if defined(WITH_MPI) && defined(HAVE_METIS) +/** + * @brief Apply METIS cell list partitioning to a cell structure. + * + * Uses the results of part_metis_pick to assign each cell's nodeID to the + * picked region index, thus partitioning the space into regions. + * + * @param s the space containing the cells to split into regions. + * @param nregions number of regions. + * @param celllist list of regions for each cell. + */ +static void split_metis(struct space *s, int nregions, int *celllist) { + + for (int i = 0; i < s->nr_cells; i++) + s->cells[i].nodeID = celllist[i]; +} +#endif + +#if defined(WITH_MPI) && defined(HAVE_METIS) +/** + * @brief Partition the given space into a number of connected regions. + * + * Split the space using METIS to derive a partitions using the + * given edge and vertex weights. If no weights are given then an + * unweighted partition is performed. + * + * @param s the space of cells to partition. + * @param nregions the number of regions required in the partition. + * @param vertexw weights for the cells, sizeof number of cells if used, + * NULL for unit weights. + * @param edgew weights for the graph edges between all cells, sizeof number + * of cells * 26 if used, NULL for unit weights. Need to be packed + * in CSR format, so same as adjncy array. + * @param celllist on exit this contains the ids of the selected regions, + * sizeof number of cells. + */ +static void pick_metis(struct space *s, int nregions, int *vertexw, int *edgew, + int *celllist) { + + /* Total number of cells. */ + int ncells = s->cdim[0] * s->cdim[1] * s->cdim[2]; + + /* Nothing much to do if only using a single partition. Also avoids METIS + * bug that doesn't handle this case well. */ + if (nregions == 1) { + for (int i = 0; i < ncells; i++) celllist[i] = 0; + return; + } + + /* Allocate weights and adjacency arrays . 
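+   * xadj needs one entry per cell plus one, adjncy and the edge weights need
+   * 26 entries per cell, and the vertex weights and regionid one per cell.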
*/ + idx_t *xadj; + if ((xadj = (idx_t *)malloc(sizeof(idx_t) * (ncells + 1))) == NULL) + error("Failed to allocate xadj buffer."); + idx_t *adjncy; + if ((adjncy = (idx_t *)malloc(sizeof(idx_t) * 26 * ncells)) == NULL) + error("Failed to allocate adjncy array."); + idx_t *weights_v = NULL; + if (vertexw != NULL) + if ((weights_v = (idx_t *)malloc(sizeof(idx_t) * ncells)) == NULL) + error("Failed to allocate vertex weights array"); + idx_t *weights_e = NULL; + if (edgew != NULL) + if ((weights_e = (idx_t *)malloc(26 * sizeof(idx_t) * ncells)) == NULL) + error("Failed to allocate edge weights array"); + idx_t *regionid; + if ((regionid = (idx_t *)malloc(sizeof(idx_t) * ncells)) == NULL) + error("Failed to allocate regionid array"); + + /* Define the cell graph. */ + graph_init_metis(s, adjncy, xadj); + + /* Init the vertex weights array. */ + if (vertexw != NULL) { + for (int k = 0; k < ncells; k++) { + if (vertexw[k] > 0) { + weights_v[k] = vertexw[k]; + } else { + weights_v[k] = 1; + } + } + } + + /* Init the edges weights array. */ + if (edgew != NULL) { + for (int k = 0; k < 26 * ncells; k++) { + if (edgew[k] > 0) { + weights_e[k] = edgew[k]; + } else { + weights_e[k] = 1; + } + } + } + + /* Set the METIS options. */ + idx_t options[METIS_NOPTIONS]; + METIS_SetDefaultOptions(options); + options[METIS_OPTION_OBJTYPE] = METIS_OBJTYPE_CUT; + options[METIS_OPTION_NUMBERING] = 0; + options[METIS_OPTION_CONTIG] = 1; + options[METIS_OPTION_NCUTS] = 10; + options[METIS_OPTION_NITER] = 20; + + /* Call METIS. */ + idx_t one = 1; + idx_t idx_ncells = ncells; + idx_t idx_nregions = nregions; + idx_t objval; + + /* Dump graph in METIS format */ + /* dumpMETISGraph("metis_graph", idx_ncells, one, xadj, adjncy, + * weights_v, weights_e, NULL); + */ + + if (METIS_PartGraphKway(&idx_ncells, &one, xadj, adjncy, weights_v, weights_e, + NULL, &idx_nregions, NULL, NULL, options, &objval, + regionid) != METIS_OK) + error("Call to METIS_PartGraphKway failed."); + + /* Check that the regionids are ok. */ + for (int k = 0; k < ncells; k++) + if (regionid[k] < 0 || regionid[k] >= nregions) + error("Got bad nodeID %" PRIDX " for cell %i.", regionid[k], k); + + /* Set the cell list to the region index. */ + for (int k = 0; k < ncells; k++) { + celllist[k] = regionid[k]; + } + + /* Clean up. */ + if (weights_v != NULL) free(weights_v); + if (weights_e != NULL) free(weights_e); + free(xadj); + free(adjncy); + free(regionid); + +} +#endif + +#if defined(WITH_MPI) && defined(HAVE_METIS) +/** + * @brief Repartition the cells amongst the nodes using task timings + * as edge weights and vertex weights also from task timings + * or particle cells counts. + * + * @param partweights whether particle counts will be used as vertex weights. + * @param bothweights whether vertex and edge weights will be used, otherwise + * only edge weights will be used. + * @param nodeID our nodeID. + * @param nr_nodes the number of nodes. + * @param s the space of cells holding our local particles. + * @param tasks the completed tasks from the last engine step for our node. + * @param nr_tasks the number of tasks. + */ +static void repart_edge_metis(int partweights, int bothweights, + int nodeID, int nr_nodes, struct space *s, + struct task *tasks, int nr_tasks) { + + + /* Create weight arrays using task ticks for vertices and edges (edges + * assume the same graph structure as used in the part_ calls). 
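+   * Vertex weights are taken from task times and/or particle counts, as
+   * selected by the bothweights and partweights flags; edge weights always
+   * come from the times of the pair and sub tasks that span two cells.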
*/ + int nr_cells = s->nr_cells; + struct cell *cells = s->cells; + float wscale = 1e-3, vscale = 1e-3, wscale_buff; + int wtot = 0; + int wmax = 1e9 / nr_nodes; + int wmin; + + /* Allocate and fill the adjncy indexing array defining the graph of + * cells. */ + idx_t *inds; + if ((inds = (idx_t *)malloc(sizeof(idx_t) * 26 * nr_cells)) == NULL) + error("Failed to allocate the inds array"); + graph_init_metis(s, inds, NULL); + + /* Allocate and init weights. */ + int *weights_v = NULL; + int *weights_e = NULL; + if (bothweights) { + if ((weights_v = (int *)malloc(sizeof(int) * nr_cells)) == NULL) + error("Failed to allocate vertex weights arrays."); + bzero(weights_v, sizeof(int) * nr_cells); + } + if ((weights_e = (int *)malloc(sizeof(int) * 26 * nr_cells)) == NULL) + error("Failed to allocate edge weights arrays."); + bzero(weights_e, sizeof(int) * 26 * nr_cells); + + /* Generate task weights for vertices. */ + int taskvweights = (bothweights && !partweights); + + /* Loop over the tasks... */ + for (int j = 0; j < nr_tasks; j++) { + /* Get a pointer to the kth task. */ + struct task *t = &tasks[j]; + + /* Skip un-interesting tasks. */ + if (t->type != task_type_self && t->type != task_type_pair && + t->type != task_type_sub && t->type != task_type_ghost && + t->type != task_type_drift && t->type != task_type_kick && + t->type != task_type_init) + continue; + + /* Get the task weight. */ + int w = (t->toc - t->tic) * wscale; + if (w < 0) error("Bad task weight (%d).", w); + + /* Do we need to re-scale? */ + wtot += w; + while (wtot > wmax) { + wscale /= 2; + wtot /= 2; + w /= 2; + for (int k = 0; k < 26 * nr_cells; k++) weights_e[k] *= 0.5; + if (taskvweights) + for (int k = 0; k < nr_cells; k++) weights_v[k] *= 0.5; + } + + /* Get the top-level cells involved. */ + struct cell *ci, *cj; + for (ci = t->ci; ci->parent != NULL; ci = ci->parent) + ; + if (t->cj != NULL) + for (cj = t->cj; cj->parent != NULL; cj = cj->parent) + ; + else + cj = NULL; + + /* Get the cell IDs. */ + int cid = ci - cells; + + /* Different weights for different tasks. */ + if (t->type == task_type_ghost || t->type == task_type_drift || + t->type == task_type_kick) { + /* Particle updates add only to vertex weight. */ + if (taskvweights) + weights_v[cid] += w; + + } + + /* Self interaction? */ + else if ((t->type == task_type_self && ci->nodeID == nodeID) || + (t->type == task_type_sub && cj == NULL && + ci->nodeID == nodeID)) { + /* Self interactions add only to vertex weight. */ + if (taskvweights) + weights_v[cid] += w; + + } + + /* Pair? */ + else if (t->type == task_type_pair || + (t->type == task_type_sub && cj != NULL)) { + /* In-cell pair? */ + if (ci == cj) { + /* Add weight to vertex for ci. */ + if (taskvweights) + weights_v[cid] += w; + + } + + /* Distinct cells with local ci? */ + else if (ci->nodeID == nodeID) { + /* Index of the jth cell. */ + int cjd = cj - cells; + + /* Add half of weight to each cell. */ + if (taskvweights) { + if (ci->nodeID == nodeID) weights_v[cid] += 0.5 * w; + if (cj->nodeID == nodeID) weights_v[cjd] += 0.5 * w; + } + + /* Add weights to edge. */ + int kk; + for (kk = 26 * cid; inds[kk] != cjd; kk++) + ; + weights_e[kk] += w; + for (kk = 26 * cjd; inds[kk] != cid; kk++) + ; + weights_e[kk] += w; + } + } + } + + /* Re-calculate the vertices if using particle counts. */ + if (partweights && bothweights) { + accumulate_counts(s, weights_v); + + /* Rescale to balance times. 
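+     * The particle counts are multiplied by the mean task weight
+     * (wtot / nr_tasks) so the vertex weights are comparable in magnitude to
+     * the time-based edge weights.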
*/ + float vwscale = (float)wtot / (float)nr_tasks; + for (int k = 0; k < nr_cells; k++) { + weights_v[k] *= vwscale; + } + } + + /* Get the minimum scaling and re-scale if necessary. */ + int res; + if ((res = MPI_Allreduce(&wscale, &wscale_buff, 1, MPI_FLOAT, MPI_MIN, + MPI_COMM_WORLD)) != MPI_SUCCESS) + mpi_error(res, "Failed to allreduce the weight scales."); + + if (wscale_buff != wscale) { + float scale = wscale_buff / wscale; + for (int k = 0; k < 26 * nr_cells; k++) weights_e[k] *= scale; + if (bothweights) + for (int k = 0; k < nr_cells; k++) weights_v[k] *= scale; + } + + /* Merge the weights arrays across all nodes. */ + if (bothweights) { + if ((res = MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_v, weights_v, + nr_cells, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD)) != + MPI_SUCCESS) + mpi_error(res, "Failed to allreduce vertex weights."); + } + + if ((res = MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_e, weights_e, + 26 * nr_cells, MPI_INT, MPI_SUM, 0, + MPI_COMM_WORLD)) != MPI_SUCCESS) + mpi_error(res, "Failed to allreduce edge weights."); + + /* Allocate cell list for the partition. */ + int *celllist = (int *)malloc(sizeof(int) * s->nr_cells); + if (celllist == NULL) error("Failed to allocate celllist"); + + /* As of here, only one node needs to compute the partition. */ + if (nodeID == 0) { + /* Final rescale of all weights to avoid a large range. Large ranges + * have been seen to cause an incomplete graph. */ + wmin = wmax; + wmax = 0; + for (int k = 0; k < 26 * nr_cells; k++) { + wmax = weights_e[k] > wmax ? weights_e[k] : wmax; + wmin = weights_e[k] < wmin ? weights_e[k] : wmin; + } + if (bothweights) { + for (int k = 0; k < nr_cells; k++) { + wmax = weights_v[k] > wmax ? weights_v[k] : wmax; + wmin = weights_v[k] < wmin ? weights_v[k] : wmin; + } + } + + if ((wmax - wmin) > metis_maxweight) { + wscale = metis_maxweight / (wmax - wmin); + for (int k = 0; k < 26 * nr_cells; k++) { + weights_e[k] = (weights_e[k] - wmin) * wscale + 1; + } + if (bothweights) { + for (int k = 0; k < nr_cells; k++) { + weights_v[k] = (weights_v[k] - wmin) * wscale + 1; + } + } + } + + /* Make sure there are no zero weights. */ + for (int k = 0; k < 26 * nr_cells; k++) + if (weights_e[k] == 0) weights_e[k] = 1; + if (bothweights) + for (int k = 0; k < nr_cells; k++) + if ((weights_v[k] *= vscale) == 0) weights_v[k] = 1; + + /* And partition, use both weights or not as requested. */ + if (bothweights) + pick_metis(s, nr_nodes, weights_v, weights_e, celllist); + else + pick_metis(s, nr_nodes, NULL, weights_e, celllist); + + /* Check that all cells have good values. */ + for (int k = 0; k < nr_cells; k++) + if (celllist[k] < 0 || celllist[k] >= nr_nodes) + error("Got bad nodeID %d for cell %i.", celllist[k], k); + + /* Check that the partition is complete and all nodes have some work. */ + int present[nr_nodes]; + int failed = 0; + for (int i = 0; i < nr_nodes; i++) present[i] = 0; + for (int i = 0; i < nr_cells; i++) present[celllist[i]]++; + for (int i = 0; i < nr_nodes; i++) { + if (!present[i]) { + failed = 1; + message("Node %d is not present after repartition", i); + } + } + + /* If partition failed continue with the current one, but make this + * clear. */ + if (failed) { + message("WARNING: METIS repartition has failed, continuing with " + "the current partition, load balance will not be optimal"); + for (int k = 0; k < nr_cells; k++) celllist[k] = cells[k].nodeID; + } + } + + /* Distribute the celllist partition and apply. 
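+   * Only rank 0 computed the partition, so broadcast the cell list and then
+   * have every rank apply the same cell-to-node assignment.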
*/ + if ((res = MPI_Bcast(celllist, s->nr_cells, MPI_INT, 0, MPI_COMM_WORLD)) != + MPI_SUCCESS) + mpi_error(res, "Failed to bcast the cell list"); + + /* And apply to our cells */ + split_metis(s, nr_nodes, celllist); + + /* Clean up. */ + if (bothweights) free(weights_v); + free(weights_e); + free(celllist); +} +#endif + +/** + * @brief Repartition the cells amongst the nodes using vertex weights + * + * @param s The space containing the local cells. + * @param nodeID our MPI node id. + * @param nr_nodes number of MPI nodes. + */ +#if defined(WITH_MPI) && defined(HAVE_METIS) +static void repart_vertex_metis(struct space *s, int nodeID, int nr_nodes) { + + /* Use particle counts as vertex weights. */ + /* Space for particles per cell counts, which will be used as weights. */ + int *weights = NULL; + if ((weights = (int *)malloc(sizeof(int) * s->nr_cells)) == NULL) + error("Failed to allocate weights buffer."); + + /* Check each particle and accumulate the counts per cell. */ + accumulate_counts(s, weights); + + /* Get all the counts from all the nodes. */ + int res; + if ((res = MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_INT, + MPI_SUM, MPI_COMM_WORLD)) != MPI_SUCCESS) + mpi_error(res, "Failed to allreduce particle cell weights."); + + /* Main node does the partition calculation. */ + int *celllist = (int *)malloc(sizeof(int) * s->nr_cells); + if (celllist == NULL) error("Failed to allocate celllist"); + + if (nodeID == 0) + pick_metis(s, nr_nodes, weights, NULL, celllist); + + /* Distribute the celllist partition and apply. */ + if ((res = MPI_Bcast(celllist, s->nr_cells, MPI_INT, 0, MPI_COMM_WORLD)) != + MPI_SUCCESS) + mpi_error(res, "Failed to bcast the cell list"); + + /* And apply to our cells */ + split_metis(s, nr_nodes, celllist); + + free(weights); + free(celllist); +} +#endif + + +/** + * @brief Repartition the space using the given repartition type. + * + * Note that at the end of this process all the cells will be re-distributed + * across the nodes, but the particles themselves will not be. + * + * @param reparttype the type of repartition to attempt, see the repart_type enum. + * @param nodeID our nodeID. + * @param nr_nodes the number of nodes. + * @param s the space of cells holding our local particles. + * @param tasks the completed tasks from the last engine step for our node. + * @param nr_tasks the number of tasks. + */ +void partition_repartition(enum repartition_type reparttype, int nodeID, + int nr_nodes, struct space *s, struct task *tasks, + int nr_tasks) { + +#if defined(WITH_MPI) && defined(HAVE_METIS) + + if (reparttype == REPART_METIS_BOTH || reparttype == REPART_METIS_EDGE || + reparttype == REPART_METIS_VERTEX_EDGE) { + + int partweights; + int bothweights; + if (reparttype == REPART_METIS_VERTEX_EDGE) + partweights = 1; + else + partweights = 0; + + if (reparttype == REPART_METIS_BOTH) + bothweights = 1; + else + bothweights = 0; + + repart_edge_metis(partweights, bothweights, nodeID, nr_nodes, s, tasks, + nr_tasks); + + } else if (reparttype == REPART_METIS_VERTEX) { + + repart_vertex_metis(s, nodeID, nr_nodes); + + } else { + error("Unknown repartition type"); + } +#else + error("SWIFT was not compiled with METIS support."); +#endif +} + +/** + * @brief Initial partition of space cells. + * + * Cells are assigned to a node on the basis of various schemes, all of which + * should attempt to distribute them in geometrically close regions to + * minimise the movement of particles. 
+ * + * Note that the partition type is a suggestion and will be ignored if that + * scheme fails. In that case we fallback to a vectorised scheme, that is + * guaranteed to work provided we have more cells than nodes. + * + * @param initial_partition the type of partitioning to try. + * @param nodeID our nodeID. + * @param nr_nodes the number of nodes. + * @param s the space of cells. + */ +void partition_initial_partition(struct partition *initial_partition, + int nodeID, int nr_nodes, struct space *s) { + + /* Geometric grid partitioning. */ + if (initial_partition->type == INITPART_GRID) { + int j, k; + int ind[3]; + struct cell *c; + + /* If we've got the wrong number of nodes, fail. */ + if (nr_nodes != initial_partition->grid[0] * initial_partition->grid[1] + * initial_partition->grid[2]) + error("Grid size does not match number of nodes."); + + /* Run through the cells and set their nodeID. */ + // message("s->dim = [%e,%e,%e]", s->dim[0], s->dim[1], s->dim[2]); + for (k = 0; k < s->nr_cells; k++) { + c = &s->cells[k]; + for (j = 0; j < 3; j++) + ind[j] = c->loc[j] / s->dim[j] * initial_partition->grid[j]; + c->nodeID = ind[0] + initial_partition->grid[0] * + (ind[1] + initial_partition->grid[1] * ind[2]); + // message("cell at [%e,%e,%e]: ind = [%i,%i,%i], nodeID = %i", c->loc[0], + // c->loc[1], c->loc[2], ind[0], ind[1], ind[2], c->nodeID); + } + + /* The grid technique can fail, so check for this before proceeding. */ + if (!check_complete(s, (nodeID == 0), nr_nodes)) { + if (nodeID == 0) + message("Grid initial partition failed, using a vectorised partition"); + initial_partition->type = INITPART_VECTORIZE; + partition_initial_partition(initial_partition, nodeID, nr_nodes, s); + return; + } + + } else if (initial_partition->type == INITPART_METIS_WEIGHT || + initial_partition->type == INITPART_METIS_NOWEIGHT) { +#if defined(WITH_MPI) && defined(HAVE_METIS) + /* Simple k-way partition selected by METIS using cell particle counts as + * weights or not. Should be best when starting with a inhomogeneous dist. + */ + + /* Space for particles per cell counts, which will be used as weights or + * not. */ + int *weights = NULL; + if (initial_partition->type == INITPART_METIS_WEIGHT) { + if ((weights = (int *)malloc(sizeof(int) * s->nr_cells)) == NULL) + error("Failed to allocate weights buffer."); + bzero(weights, sizeof(int) * s->nr_cells); + + /* Check each particle and accumilate the counts per cell. */ + struct part *parts = s->parts; + int *cdim = s->cdim; + double ih[3], dim[3]; + ih[0] = s->ih[0]; + ih[1] = s->ih[1]; + ih[2] = s->ih[2]; + dim[0] = s->dim[0]; + dim[1] = s->dim[1]; + dim[2] = s->dim[2]; + for (int k = 0; k < s->nr_parts; k++) { + for (int j = 0; j < 3; j++) { + if (parts[k].x[j] < 0.0) + parts[k].x[j] += dim[j]; + else if (parts[k].x[j] >= dim[j]) + parts[k].x[j] -= dim[j]; + } + const int cid = + cell_getid(cdim, parts[k].x[0] * ih[0], parts[k].x[1] * ih[1], + parts[k].x[2] * ih[2]); + weights[cid]++; + } + + /* Get all the counts from all the nodes. */ + if (MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_INT, MPI_SUM, + MPI_COMM_WORLD) != MPI_SUCCESS) + error("Failed to allreduce particle cell weights."); + + } + + /* Main node does the partition calculation. */ + int *celllist = (int *)malloc(sizeof(int) * s->nr_cells); + if (celllist == NULL) error("Failed to allocate celllist"); + if (nodeID == 0) + pick_metis(s, nr_nodes, weights, NULL, celllist); + + /* Distribute the celllist partition and apply. 
*/ + int res = MPI_Bcast(celllist, s->nr_cells, MPI_INT, 0, MPI_COMM_WORLD); + if (res != MPI_SUCCESS) mpi_error(res, "Failed to bcast the cell list"); + + /* And apply to our cells */ + split_metis(s, nr_nodes, celllist); + + /* It's not known if this can fail, but check for this before + * proceeding. */ + if (!check_complete(s, (nodeID == 0), nr_nodes)) { + if (nodeID == 0) + message("METIS initial partition failed, using a vectorised partition"); + initial_partition->type = INITPART_VECTORIZE; + partition_initial_partition(initial_partition, nodeID, nr_nodes, s); + } + + if (weights != NULL) free(weights); + free(celllist); +#else + error("SWIFT was not compiled with METIS support"); +#endif + + } else if (initial_partition->type == INITPART_VECTORIZE) { + +#if defined(WITH_MPI) + /* Vectorised selection, guaranteed to work for samples less than the + * number of cells, but not very clumpy in the selection of regions. */ + int *samplecells = (int *)malloc(sizeof(int) * nr_nodes * 3); + if (samplecells == NULL) error("Failed to allocate samplecells"); + + if (nodeID == 0) { + pick_vector(s, nr_nodes, samplecells); + } + + /* Share the samplecells around all the nodes. */ + int res = + MPI_Bcast(samplecells, nr_nodes * 3, MPI_INT, 0, MPI_COMM_WORLD); + if (res != MPI_SUCCESS) + mpi_error(res, "Failed to bcast the partition sample cells."); + + /* And apply to our cells */ + split_vector(s, nr_nodes, samplecells); + free(samplecells); +#else + error("SWIFT was not compiled with MPI support"); +#endif + } +} + +/* General support */ +/* =============== */ + +/** + * @brief Check if all regions have been assigned a node in the + * cells of a space. + * + * @param s the space containing the cells to check. + * @param nregions number of regions expected. + * @param verbose if true report the missing regions. + * @return true if all regions have been found, false otherwise. + */ +static int check_complete(struct space *s, int verbose, int nregions) { + + int *present = (int *)malloc(sizeof(int) * nregions); + if (present == NULL) error("Failed to allocate present array"); + + int failed = 0; + for (int i = 0; i < nregions; i++) present[i] = 0; + for (int i = 0; i < s->nr_cells; i++) { + present[s->cells[i].nodeID]++; + } + for (int i = 0; i < nregions; i++) { + if (!present[i]) { + failed = 1; + if (verbose) message("Region %d is not present in partition", i); + } + } + free(present); + return (!failed); +} diff --git a/src/partition.h b/src/partition.h new file mode 100644 index 0000000000000000000000000000000000000000..492af04ac9ef017ceddabb410ee53496e64bb018 --- /dev/null +++ b/src/partition.h @@ -0,0 +1,59 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (c) 2015 Peter W. Draper (p.w.draper@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + ******************************************************************************/ +#ifndef SWIFT_PARTITION_H +#define SWIFT_PARTITION_H + +#include "space.h" +#include "task.h" + +/* Initial partitioning types. */ +enum partition_type { + INITPART_GRID = 0, + INITPART_VECTORIZE, + INITPART_METIS_WEIGHT, + INITPART_METIS_NOWEIGHT +}; + +/* Simple descriptions of types for reports. */ +extern const char *initial_partition_name[]; + +/* The selected initial partition type and any related metadata. */ +struct partition { + enum partition_type type; + int grid[3]; +}; +/* Repartition type to use. */ +enum repartition_type { + REPART_NONE = 0, + REPART_METIS_BOTH, + REPART_METIS_VERTEX, + REPART_METIS_EDGE, + REPART_METIS_VERTEX_EDGE +}; + +/* Simple descriptions of types for reports. */ +extern const char *repartition_name[]; + +void partition_repartition(enum repartition_type reparttype, int nodeID, + int nr_nodes, struct space *s, + struct task *tasks, int nr_tasks); +void partition_initial_partition(struct partition *initial_partition, + int nodeID, int nr_nodes, struct space *s); + +#endif /* SWIFT_PARTITION_H */ diff --git a/src/swift.h b/src/swift.h index 9cd7ac65da305a995fb48ba9c931cb8a9be782e6..03960e83bf75488b3456327157288160e995951b 100644 --- a/src/swift.h +++ b/src/swift.h @@ -48,6 +48,7 @@ #include "timers.h" #include "units.h" #include "tools.h" +#include "partition.h" #include "version.h" #endif /* SWIFT_SWIFT_H */