diff --git a/examples/main.c b/examples/main.c
index 8d4289c872ce34ba626cacffca4097c0093fe0c3..034b800887928c049a610c27ef7c916573c71be6 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -348,7 +348,7 @@ int main(int argc, char *argv[]) {
 #endif
 
   /* Prepare the domain decomposition scheme */
-  enum repartition_type reparttype = REPART_NONE;
+  struct repartition reparttype;
 #ifdef WITH_MPI
   struct partition initial_partition;
   partition_init(&initial_partition, &reparttype, params, nr_nodes);
@@ -360,7 +360,7 @@ int main(int argc, char *argv[]) {
     if (initial_partition.type == INITPART_GRID)
       message("grid set to [ %i %i %i ].", initial_partition.grid[0],
               initial_partition.grid[1], initial_partition.grid[2]);
-    message("Using %s repartitioning", repartition_name[reparttype]);
+    message("Using %s repartitioning", repartition_name[reparttype.type]);
   }
 #endif
 
@@ -444,8 +444,8 @@ int main(int argc, char *argv[]) {
   long long N_total[3] = {0, 0, 0};
 #if defined(WITH_MPI)
   long long N_long[3] = {Ngas, Ngpart, Nspart};
-  MPI_Reduce(&N_long, &N_total, 3, MPI_LONG_LONG_INT, MPI_SUM, 0,
-             MPI_COMM_WORLD);
+  MPI_Allreduce(&N_long, &N_total, 3, MPI_LONG_LONG_INT, MPI_SUM,
+                MPI_COMM_WORLD);
 #else
   N_total[0] = Ngas;
   N_total[1] = Ngpart;
@@ -527,10 +527,10 @@ int main(int argc, char *argv[]) {
   /* Initialize the engine with the space and policies. */
   if (myrank == 0) clocks_gettime(&tic);
   struct engine e;
-  engine_init(&e, &s, params, nr_nodes, myrank, nr_threads, with_aff,
-              engine_policies, talking, reparttype, &us, &prog_const,
-              &hydro_properties, &gravity_properties, &potential, &cooling_func,
-              &sourceterms);
+  engine_init(&e, &s, params, nr_nodes, myrank, nr_threads, N_total[0],
+              N_total[1], with_aff, engine_policies, talking, &reparttype, &us,
+              &prog_const, &hydro_properties, &gravity_properties, &potential,
+              &cooling_func, &sourceterms);
   if (myrank == 0) {
     clocks_gettime(&toc);
     message("engine_init took %.3f %s.", clocks_diff(&tic, &toc),
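The MPI_Reduce to MPI_Allreduce change above is needed because every rank, not just rank 0, now stores the global particle totals: engine_init() keeps them for the minfrac test in engine_repartition_trigger(). A minimal standalone sketch of the semantic difference; this is illustrative code, not part of SWIFT:

#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[]) {
  MPI_Init(&argc, &argv);
  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  /* Per-rank particle counts (gas, gravity, star), as in main.c above. */
  long long N_local[3] = {100 * (rank + 1), 200, 0};
  long long N_total[3] = {0, 0, 0};

  /* MPI_Allreduce leaves the summed totals on every rank, so each rank can
   * later evaluate conditions like (updates >= total * minfrac) locally.
   * MPI_Reduce would have left the sums on rank 0 only. */
  MPI_Allreduce(N_local, N_total, 3, MPI_LONG_LONG_INT, MPI_SUM,
                MPI_COMM_WORLD);

  printf("rank %d sees totals %lld %lld %lld\n", rank, N_total[0], N_total[1],
         N_total[2]);

  MPI_Finalize();
  return 0;
}

With MPI_Reduce, an extra MPI_Bcast would have been needed to make the totals available on the other ranks.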
diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml
index 7c69d19bc215472c90bb1b91f23f46702afc64f2..501863a7047df33592fe77b523fdb211dfb7d16b 100644
--- a/examples/parameter_example.yml
+++ b/examples/parameter_example.yml
@@ -1,4 +1,4 @@
-# Define the system of units to use internally. 
+# Define the system of units to use internally.
 InternalUnitSystem:
   UnitMass_in_cgs: 1     # Grams
   UnitLength_in_cgs: 1   # Centimeters
@@ -38,7 +38,7 @@ Snapshots:
 Statistics:
   delta_time:          1e-2     # Time between statistics output
   energy_file_name:    energy   # (Optional) File name for energy output
-  timestep_file_name:  timesteps # (Optional) File name for timing information output. Note: No underscores "_" allowed in file name 
+  timestep_file_name:  timesteps # (Optional) File name for timing information output. Note: No underscores "_" allowed in file name
 
 # Parameters for the hydrodynamics scheme
 SPH:
@@ -51,11 +51,11 @@ SPH:
 
 # Parameters for the self-gravity scheme
 Gravity:
-  eta:       0.025     # Constant dimensionless multiplier for time integration. 
+  eta:       0.025     # Constant dimensionless multiplier for time integration.
   epsilon:   0.1       # Softening length (in internal units).
   a_smooth:  1.25      # (Optional) Smoothing scale in top-level cell sizes to smooth the long-range forces over (this is the default value).
   r_cut:     4.5       # (Optional) Cut-off in number of top-level cells beyond which no FMM forces are computed (this is the default value).
-  
+
 # Parameters related to the initial conditions
 InitialConditions:
   file_name:  SedovBlast/sedov.hdf5 # The file to read
@@ -67,14 +67,20 @@ InitialConditions:
 
 # Parameters governing domain decomposition
 DomainDecomposition:
-  initial_type: m     # (Optional) The initial strategy ("g", "m", "w", or "v").
-  initial_grid_x: 10  # (Optional) Grid size if the "g" strategy is chosen.
-  initial_grid_y: 10
-  initial_grid_z: 10
-  repartition_type: b # (Optional) The re-decomposition strategy ("n", "b", "v", "e" or "x").
-
+  initial_type: simple_metis     # (Optional) The initial decomposition strategy: "grid",
+                                 #            "simple_metis", "weighted_metis", or "vectorized".
+  initial_grid_x: 10             # (Optional) Grid size if the "grid" strategy is chosen.
+  initial_grid_y: 10             # ""
+  initial_grid_z: 10             # ""
+  repartition_type: task_weights # (Optional) The re-decomposition strategy: "none", "task_weights",
+                                 #            "particle_weights", "edge_task_weights" or "hybrid_weights".
+  trigger: 0.05                  # (Optional) Fractional (<1) CPU time difference between MPI ranks
+                                 #            required to trigger a new decomposition, or number of
+                                 #            steps (>1) between decompositions.
+  minfrac: 0.9                   # (Optional) Fraction of all particles that should have been updated
+                                 #            in the previous step when using the CPU time trigger.
 
 # Parameters related to external potentials --------------------------------------------
-  
+
 # Point mass external potentials
 PointMassPotential:
   position_x:  50.     # location of external point mass (internal units)
@@ -91,7 +97,7 @@ IsothermalPotential:
   vrot:            200.     # Rotation speed of isothermal potential (internal units)
   timestep_mult:   0.03     # Dimensionless pre-factor for the time-step condition
   epsilon:         0.1      # Softening size (internal units)
-  
+
 # Disk-patch potential parameters
 DiscPatchPotential:
   surface_density: 10.      # Surface density of the disc (internal units)
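The trigger parameter is overloaded: a value below 1 is read as the fractional CPU-time imbalance between ranks that should force a repartition, while a value above 1 is read as a fixed number of steps between repartitions; minfrac guards against deciding on steps where only a few particles were updated. A small sketch of how the two modes separate; repart_due() is a hypothetical helper, not the SWIFT API:

#include <stdio.h>

/* Returns 1 if a repartition should be considered at this step.
 * Simplified: the patch itself tests (step % (int)trigger) == 2. */
static int repart_due(float trigger, int step, double imbalance) {
  if (trigger > 1.f) return (step % (int)trigger) == 0; /* fixed cadence */
  return imbalance > trigger;                           /* imbalance mode */
}

int main(void) {
  printf("%d\n", repart_due(0.05f, 10, 0.08)); /* 1: 8% imbalance > 5% */
  printf("%d\n", repart_due(100.f, 200, 0.0)); /* 1: step 200 on a 100-step cadence */
  return 0;
}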
diff --git a/src/clocks.c b/src/clocks.c
index 5d662a2c1cc67d56c6f08b334554935cdecd4611..d5f0e571fe0a4694c4088bb05014fafa99d60488 100644
--- a/src/clocks.c
+++ b/src/clocks.c
@@ -244,3 +244,22 @@ const char *clocks_get_timesincestart() {
 
   return buffer;
 }
+
+/**
+ * @brief Return the CPU time used.
+ *
+ * Uses the times(2) function to access the user CPU times and returns the
+ * sum of these for the process tree, i.e. the current process plus its
+ * "waited-for" children. Exactly what that means may depend on the pthread
+ * implementation. Note we do not include the system time, as that includes
+ * spin times and we don't want to give credit for those.
+ *
+ * @result CPU time used in sysconf(_SC_CLK_TCK) ticks, usually 100 per
+ * second, not our usual ticks.
+ */
+double clocks_get_cputime_used() {
+
+  struct tms tmstic;
+  times(&tmstic);
+  return (double)(tmstic.tms_utime + tmstic.tms_cutime);
+}
diff --git a/src/clocks.h b/src/clocks.h
index 1b4640d605c5ff5d0d9a3d07af158e346695974e..4c69e130d9bfc5e61fb03fc9820ffc76d2440dc4 100644
--- a/src/clocks.h
+++ b/src/clocks.h
@@ -19,6 +19,7 @@
 #ifndef SWIFT_CLOCKS_H
 #define SWIFT_CLOCKS_H
 
+#include <sys/times.h>
 #include <time.h>
 #include "cycle.h"
@@ -41,4 +42,6 @@ double clocks_from_ticks(ticks tics);
 double clocks_diff_ticks(ticks tic, ticks toc);
 const char *clocks_get_timesincestart();
 
+double clocks_get_cputime_used();
+
 #endif /* SWIFT_CLOCKS_H */
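clocks_get_cputime_used() returns times(2) clock ticks rather than seconds; the repartition trigger below only ever compares ratios of these values, so no conversion is needed there, but any absolute reading requires sysconf(_SC_CLK_TCK). A minimal standalone sketch of the measurement and the conversion:

#include <stdio.h>
#include <sys/times.h>
#include <unistd.h>

int main(void) {
  /* Burn a little user CPU time so there is something to report. */
  volatile double x = 0.;
  for (long i = 0; i < 10000000L; i++) x += (double)i;

  struct tms t;
  times(&t);

  /* User time of this process plus waited-for children, as in clocks.c
   * above; system time is deliberately excluded. */
  double ticks_used = (double)(t.tms_utime + t.tms_cutime);

  /* Ticks per second, usually 100. */
  const long ticks_per_sec = sysconf(_SC_CLK_TCK);

  printf("CPU time used: %.2f s\n", ticks_used / (double)ticks_per_sec);
  return 0;
}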
diff --git a/src/engine.c b/src/engine.c
index 52d4be9b3470e398efdd9669191c92790d472444..98f5cba933158c4f202377e270a6ac51b6541de3 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -882,6 +882,84 @@ void engine_repartition(struct engine *e) {
 #endif
 }
 
+/**
+ * @brief Decide whether to trigger a repartition of the cells amongst the
+ * nodes.
+ *
+ * @param e The #engine.
+ */
+void engine_repartition_trigger(struct engine *e) {
+
+#ifdef WITH_MPI
+
+  /* Do nothing if there have not been enough steps since the last
+   * repartition; we don't want to repeat this too often or immediately
+   * after a repartition step. */
+  if (e->step - e->last_repartition > 2) {
+
+    /* Old style if trigger is >1 or this is the second step (we want an
+     * early repartition following the initial partition). */
+    if (e->reparttype->trigger > 1 || e->step == 2) {
+      if (e->reparttype->trigger > 1) {
+        if (e->step % (int)e->reparttype->trigger == 2) e->forcerepart = 1;
+      } else {
+        e->forcerepart = 1;
+      }
+
+    } else {
+
+      /* Use cputimes from the ranks to estimate the imbalance. */
+      /* First check if we are going to skip this stage anyway, and if so do
+       * that now. It is only worth checking the CPU loads when we have
+       * processed a significant number of all particles. */
+      if ((e->updates > 1 &&
+           e->updates >= e->total_nr_parts * e->reparttype->minfrac) ||
+          (e->g_updates > 1 &&
+           e->g_updates >= e->total_nr_gparts * e->reparttype->minfrac)) {
+
+        /* Get CPU time used since the last call to this function. */
+        double elapsed_cputime =
+            clocks_get_cputime_used() - e->cputime_last_step;
+
+        /* Gather the elapsed CPU times from all ranks for the last step. */
+        double elapsed_cputimes[e->nr_nodes];
+        MPI_Gather(&elapsed_cputime, 1, MPI_DOUBLE, elapsed_cputimes, 1,
+                   MPI_DOUBLE, 0, MPI_COMM_WORLD);
+        if (e->nodeID == 0) {
+
+          /* Get the range and mean of the cputimes. */
+          double mintime = elapsed_cputimes[0];
+          double maxtime = elapsed_cputimes[0];
+          double sum = elapsed_cputimes[0];
+          for (int k = 1; k < e->nr_nodes; k++) {
+            if (elapsed_cputimes[k] > maxtime) maxtime = elapsed_cputimes[k];
+            if (elapsed_cputimes[k] < mintime) mintime = elapsed_cputimes[k];
+            sum += elapsed_cputimes[k];
+          }
+          double mean = sum / (double)e->nr_nodes;
+
+          /* Are we out of balance? */
+          if (((maxtime - mintime) / mean) > e->reparttype->trigger) {
+            if (e->verbose)
+              message("trigger fraction %.3f exceeds %.3f, will repartition",
+                      (maxtime - mintime) / mean, e->reparttype->trigger);
+            e->forcerepart = 1;
+          }
+        }
+
+        /* All nodes do this together. */
+        MPI_Bcast(&e->forcerepart, 1, MPI_INT, 0, MPI_COMM_WORLD);
+      }
+    }
+
+    /* Remember we did this. */
+    if (e->forcerepart) e->last_repartition = e->step;
+  }
+
+  /* We always reset the CPU time for the next check. */
+  e->cputime_last_step = clocks_get_cputime_used();
+#endif
+}
+
 /**
  * @brief Add up/down gravity tasks to a cell hierarchy.
  *
@@ -3041,7 +3119,9 @@ void engine_step(struct engine *e) {
   struct clocks_time time1, time2;
   clocks_gettime(&time1);
 
+#ifdef SWIFT_DEBUG_TASKS
   e->tic_step = getticks();
+#endif
 
   if (e->nodeID == 0) {
 
@@ -3069,9 +3149,9 @@ void engine_step(struct engine *e) {
   /* Prepare the tasks to be launched, rebuild or repartition if needed. */
   engine_prepare(e);
 
-/* Repartition the space amongst the nodes? */
-#if defined(WITH_MPI) && defined(HAVE_METIS)
-  if (e->step % 100 == 2) e->forcerepart = 1;
+#ifdef WITH_MPI
+  /* Repartition the space amongst the nodes? */
+  engine_repartition_trigger(e);
 #endif
 
   /* Are we drifting everything (a la Gadget/GIZMO) ? */
@@ -3144,9 +3224,12 @@ void engine_step(struct engine *e) {
 
   TIMER_TOC2(timer_step);
 
   clocks_gettime(&time2);
   e->wallclock_time = (float)clocks_diff(&time1, &time2);
+
+#ifdef SWIFT_DEBUG_TASKS
+  /* Time in ticks at the end of this step. */
   e->toc_step = getticks();
+#endif
 }
 
@@ -3530,6 +3613,8 @@ void engine_unpin() {
  * @param nr_nodes The number of MPI ranks.
  * @param nodeID The MPI rank of this node.
  * @param nr_threads The number of threads per MPI rank.
+ * @param Ngas The total number of gas particles in the simulation.
+ * @param Ndm The total number of gravity particles in the simulation.
  * @param with_aff use processor affinity, if supported.
  * @param policy The queuing policy to use.
  * @param verbose Is this #engine talkative ?
@@ -3544,8 +3629,8 @@ void engine_unpin() {
  */
 void engine_init(struct engine *e, struct space *s,
                  const struct swift_params *params, int nr_nodes, int nodeID,
-                 int nr_threads, int with_aff, int policy, int verbose,
-                 enum repartition_type reparttype,
+                 int nr_threads, int Ngas, int Ndm, int with_aff, int policy,
+                 int verbose, struct repartition *reparttype,
                  const struct unit_system *internal_units,
                  const struct phys_const *physical_constants,
                  const struct hydro_props *hydro,
@@ -3564,6 +3649,8 @@ void engine_init(struct engine *e, struct space *s,
   e->step = 0;
   e->nr_nodes = nr_nodes;
   e->nodeID = nodeID;
+  e->total_nr_parts = Ngas;
+  e->total_nr_gparts = Ndm;
   e->proxy_ind = NULL;
   e->nr_proxies = 0;
   e->forcerebuild = 1;
@@ -3611,6 +3698,10 @@ void engine_init(struct engine *e, struct space *s,
   e->cooling_func = cooling_func;
   e->sourceterms = sourceterms;
   e->parameter_file = params;
+#ifdef WITH_MPI
+  e->cputime_last_step = 0;
+  e->last_repartition = -1;
+#endif
   engine_rank = nodeID;
 
   /* Make the space link back to the engine. */
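The imbalance metric in engine_repartition_trigger() is the spread of the per-rank CPU times relative to their mean, (max - min) / mean, compared against the trigger fraction. A standalone check of that metric on sample data; imbalance() is a hypothetical helper mirroring the loop above:

#include <stdio.h>

static double imbalance(const double *times, int n) {
  double min = times[0], max = times[0], sum = times[0];
  for (int k = 1; k < n; k++) {
    if (times[k] < min) min = times[k];
    if (times[k] > max) max = times[k];
    sum += times[k];
  }
  return (max - min) / (sum / (double)n);
}

int main(void) {
  /* Per-rank CPU times for one step, in arbitrary tick units. */
  const double cputimes[4] = {9.5, 10.0, 10.2, 11.3};

  /* (11.3 - 9.5) / 10.25 = 0.176, above a 0.05 trigger. */
  printf("imbalance fraction = %.3f\n", imbalance(cputimes, 4));
  return 0;
}

Because only a ratio enters the comparison, the tick units from times(2) cancel and no conversion to seconds is needed.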
diff --git a/src/engine.h b/src/engine.h
index 80903ee78156c7e4efb2650e59e4fa832fecbfa3..a0e32ad15b79c364d13d19589f8462ff8705ee29 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -144,6 +144,9 @@ struct engine {
   /* Number of particles updated */
   size_t updates, g_updates, s_updates;
 
+  /* Total numbers of particles in the system. */
+  size_t total_nr_parts, total_nr_gparts;
+
   /* The internal system of units */
   const struct unit_system *internal_units;
@@ -181,8 +184,18 @@ struct engine {
   struct proxy *proxies;
   int nr_proxies, *proxy_ind;
 
+#ifdef SWIFT_DEBUG_TASKS
   /* Tic/toc at the start/end of a step. */
   ticks tic_step, toc_step;
+#endif
+
+#ifdef WITH_MPI
+  /* CPU time of the last step. */
+  double cputime_last_step;
+
+  /* Step of the last repartition. */
+  int last_repartition;
+#endif
 
   /* Wallclock time of the last time-step */
   float wallclock_time;
@@ -192,7 +205,7 @@ struct engine {
 
   /* Force the engine to repartition ? */
   int forcerepart;
-  enum repartition_type reparttype;
+  struct repartition *reparttype;
 
   /* Need to dump some statistics ? */
   int save_stats;
@@ -240,14 +253,14 @@ void engine_drift_all(struct engine *e);
 void engine_dump_snapshot(struct engine *e);
 void engine_init(struct engine *e, struct space *s,
                  const struct swift_params *params, int nr_nodes, int nodeID,
-                 int nr_threads, int with_aff, int policy, int verbose,
-                 enum repartition_type reparttype,
+                 int nr_threads, int Ngas, int Ndm, int with_aff, int policy,
+                 int verbose, struct repartition *reparttype,
                  const struct unit_system *internal_units,
                  const struct phys_const *physical_constants,
                  const struct hydro_props *hydro,
                  const struct gravity_props *gravity,
                  const struct external_potential *potential,
-                 const struct cooling_function_data *cooling,
+                 const struct cooling_function_data *cooling_func,
                  struct sourceterms *sourceterms);
 void engine_launch(struct engine *e, int nr_runners);
 void engine_prepare(struct engine *e);
@@ -262,6 +275,7 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts,
                             size_t *Nspart);
 void engine_rebuild(struct engine *e);
 void engine_repartition(struct engine *e);
+void engine_repartition_trigger(struct engine *e);
 void engine_makeproxies(struct engine *e);
 void engine_redistribute(struct engine *e);
 void engine_print_policy(struct engine *e);
diff --git a/src/partition.c b/src/partition.c
index 5a4d2f8d2e0bb4a3f1c8d1b3e7dc32293965c7dc..6dd35a99d6102074aaf71f803990d5d425f1a093 100644
--- a/src/partition.c
+++ b/src/partition.c
@@ -780,31 +780,31 @@ static void repart_vertex_metis(struct space *s, int nodeID, int nr_nodes) {
  * Note that at the end of this process all the cells will be re-distributed
  * across the nodes, but the particles themselves will not be.
  *
- * @param reparttype the type of repartition to attempt, see the repart_type
- *enum.
+ * @param reparttype the #repartition struct describing what to do.
  * @param nodeID our nodeID.
  * @param nr_nodes the number of nodes.
  * @param s the space of cells holding our local particles.
  * @param tasks the completed tasks from the last engine step for our node.
  * @param nr_tasks the number of tasks.
  */
-void partition_repartition(enum repartition_type reparttype, int nodeID,
+void partition_repartition(struct repartition *reparttype, int nodeID,
                            int nr_nodes, struct space *s, struct task *tasks,
                            int nr_tasks) {
 
 #if defined(WITH_MPI) && defined(HAVE_METIS)
 
-  if (reparttype == REPART_METIS_BOTH || reparttype == REPART_METIS_EDGE ||
-      reparttype == REPART_METIS_VERTEX_EDGE) {
+  if (reparttype->type == REPART_METIS_BOTH ||
+      reparttype->type == REPART_METIS_EDGE ||
+      reparttype->type == REPART_METIS_VERTEX_EDGE) {
 
     int partweights;
     int bothweights;
-    if (reparttype == REPART_METIS_VERTEX_EDGE)
+    if (reparttype->type == REPART_METIS_VERTEX_EDGE)
       partweights = 1;
     else
       partweights = 0;
 
-    if (reparttype == REPART_METIS_BOTH)
+    if (reparttype->type == REPART_METIS_BOTH)
       bothweights = 1;
     else
       bothweights = 0;
@@ -812,10 +812,14 @@ void partition_repartition(enum repartition_type reparttype, int nodeID,
     repart_edge_metis(partweights, bothweights, nodeID, nr_nodes, s, tasks,
                       nr_tasks);
 
-  } else if (reparttype == REPART_METIS_VERTEX) {
+  } else if (reparttype->type == REPART_METIS_VERTEX) {
 
     repart_vertex_metis(s, nodeID, nr_nodes);
 
+  } else if (reparttype->type == REPART_NONE) {
+
+    /* Doing nothing. */
+
   } else {
     error("Unknown repartition type");
   }
@@ -976,27 +980,26 @@ void partition_initial_partition(struct partition *initial_partition,
 
 /**
  * @brief Initialises the partition and re-partition scheme from the parameter
- *file
+ * file.
  *
  * @param partition The #partition scheme to initialise.
- * @param reparttype The repartition scheme to initialise.
+ * @param repartition The #repartition scheme to initialise.
  * @param params The parsed parameter file.
  * @param nr_nodes The number of MPI nodes we are running on.
 */
 void partition_init(struct partition *partition,
-                    enum repartition_type *reparttype,
+                    struct repartition *repartition,
                     const struct swift_params *params, int nr_nodes) {
 
 #ifdef WITH_MPI
 
   /* Defaults make use of METIS if available */
 #ifdef HAVE_METIS
-  char default_repart = 'b';
-  ;
-  char default_part = 'm';
+  const char *default_repart = "task_weights";
+  const char *default_part = "simple_metis";
 #else
-  char default_repart = 'n';
-  char default_part = 'g';
+  const char *default_repart = "none";
+  const char *default_part = "grid";
 #endif
 
   /* Set a default grid so that grid[0]*grid[1]*grid[2] == nr_nodes. */
@@ -1007,10 +1010,11 @@ void partition_init(struct partition *partition,
                     &partition->grid[0]);
 
   /* Now let's check what the user wants as an initial domain. */
-  const char part_type = parser_get_opt_param_char(
-      params, "DomainDecomposition:initial_type", default_part);
-
-  switch (part_type) {
+  char part_type[20];
+  parser_get_opt_param_string(params, "DomainDecomposition:initial_type",
+                              part_type, default_part);
+  switch (part_type[0]) {
     case 'g':
       partition->type = INITPART_GRID;
       break;
@@ -1018,24 +1022,26 @@ void partition_init(struct partition *partition,
       partition->type = INITPART_VECTORIZE;
       break;
 #ifdef HAVE_METIS
-    case 'm':
+    case 's':
       partition->type = INITPART_METIS_NOWEIGHT;
       break;
     case 'w':
       partition->type = INITPART_METIS_WEIGHT;
       break;
     default:
-      message("Invalid choice of initial partition type '%c'.", part_type);
-      error("Permitted values are: 'g','m','v' or 'w'.");
+      message("Invalid choice of initial partition type '%s'.", part_type);
+      error("Permitted values are: 'grid', 'simple_metis', 'weighted_metis'"
+            " or 'vectorized'.");
 #else
     default:
-      message("Invalid choice of initial partition type '%c'.", part_type);
-      error("Permitted values are: 'g' or 'v' when compiled without metis.");
+      message("Invalid choice of initial partition type '%s'.", part_type);
+      error("Permitted values are: 'grid' or 'vectorized' when compiled "
+            "without METIS.");
 #endif
   }
 
   /* In case of grid, read more parameters */
-  if (part_type == 'g') {
+  if (part_type[0] == 'g') {
     partition->grid[0] = parser_get_opt_param_int(
         params, "DomainDecomposition:initial_grid_x", partition->grid[0]);
     partition->grid[1] = parser_get_opt_param_int(
         params, "DomainDecomposition:initial_grid_y", partition->grid[1]);
@@ -1045,36 +1051,54 @@ void partition_init(struct partition *partition,
   }
 
   /* Now let's check what the user wants as a repartition strategy */
-  const char repart_type = parser_get_opt_param_char(
-      params, "DomainDecomposition:repartition_type", default_repart);
+  parser_get_opt_param_string(params, "DomainDecomposition:repartition_type",
+                              part_type, default_repart);
 
-  switch (repart_type) {
+  switch (part_type[0]) {
    case 'n':
-      *reparttype = REPART_NONE;
+      repartition->type = REPART_NONE;
      break;
 #ifdef HAVE_METIS
-    case 'b':
-      *reparttype = REPART_METIS_BOTH;
+    case 't':
+      repartition->type = REPART_METIS_BOTH;
       break;
     case 'e':
-      *reparttype = REPART_METIS_EDGE;
+      repartition->type = REPART_METIS_EDGE;
       break;
-    case 'v':
-      *reparttype = REPART_METIS_VERTEX;
+    case 'p':
+      repartition->type = REPART_METIS_VERTEX;
       break;
-    case 'x':
-      *reparttype = REPART_METIS_VERTEX_EDGE;
+    case 'h':
+      repartition->type = REPART_METIS_VERTEX_EDGE;
       break;
     default:
-      message("Invalid choice of re-partition type '%c'.", repart_type);
-      error("Permitted values are: 'b','e','n', 'v' or 'x'.");
+      message("Invalid choice of re-partition type '%s'.", part_type);
+      error("Permitted values are: 'none', 'task_weights', "
+            "'particle_weights', 'edge_task_weights' or 'hybrid_weights'.");
 #else
    default:
-      message("Invalid choice of re-partition type '%c'.", repart_type);
-      error("Permitted values are: 'n' when compiled without metis.");
+      message("Invalid choice of re-partition type '%s'.", part_type);
+      error("Permitted values are: 'none' when compiled without METIS.");
 #endif
   }
 
+  /* Get the fractional CPU time difference between nodes (<1) or the number
+   * of steps between repartitions (>1). */
+  repartition->trigger =
+      parser_get_opt_param_float(params, "DomainDecomposition:trigger", 0.05f);
+  if (repartition->trigger <= 0)
+    error("Invalid DomainDecomposition:trigger, must be greater than zero");
+
+  /* Fraction of particles that should have been updated in the previous
+   * step before a repartition based on CPU time is considered. */
+  repartition->minfrac =
+      parser_get_opt_param_float(params, "DomainDecomposition:minfrac", 0.9f);
+  if (repartition->minfrac <= 0 || repartition->minfrac > 1)
+    error(
+        "Invalid DomainDecomposition:minfrac, must be greater than 0 and "
+        "less than or equal to 1");
+
 #else
   error("SWIFT was not compiled with MPI support");
 #endif
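partition_init() dispatches on the first character of the option string, which is unambiguous here because the permitted names all begin with distinct letters; the flip side is that any string starting with a matching letter is silently accepted. A standalone sketch of the pattern using the same names:

#include <stdio.h>

enum repart { NONE, BOTH, EDGE, VERTEX, VERTEX_EDGE, INVALID };

static enum repart parse_repart(const char *name) {
  switch (name[0]) {
    case 'n': return NONE;        /* "none" */
    case 't': return BOTH;        /* "task_weights" */
    case 'e': return EDGE;        /* "edge_task_weights" */
    case 'p': return VERTEX;      /* "particle_weights" */
    case 'h': return VERTEX_EDGE; /* "hybrid_weights" */
    default: return INVALID;
  }
}

int main(void) {
  printf("%d\n", parse_repart("task_weights"));   /* prints 1 (BOTH) */
  printf("%d\n", parse_repart("hybrid_weights")); /* prints 4 (VERTEX_EDGE) */
  return 0;
}

Note that the default repartition strategy must itself start with one of these letters; a default such as "both" would fall through to the error branch, hence "task_weights" above.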
error("Permitted values are: 'b','e','n', 'v' or 'x'."); + message("Invalid choice of re-partition type '%s'.", part_type); + error("Permitted values are: 'none', 'task_weights', 'particle_weights'," + "'edge_task_weights' or 'hybrid_weights'"); #else default: - message("Invalid choice of re-partition type '%c'.", repart_type); - error("Permitted values are: 'n' when compiled without metis."); + message("Invalid choice of re-partition type '%s'.", part_type); + error("Permitted values are: 'none' when compiled without METIS."); #endif } + /* Get the fraction CPU time difference between nodes (<1) or the number + * of steps between repartitions (>1). */ + repartition->trigger = + parser_get_opt_param_float(params, "DomainDecomposition:trigger", 0.05f); + if (repartition->trigger <= 0) + error("Invalid DomainDecomposition:trigger, must be greater than zero"); + + /* Fraction of particles that should be updated before a repartition + * based on CPU time is considered. */ + repartition->minfrac = + parser_get_opt_param_float(params, "DomainDecomposition:minfrac", 0.9f); + if (repartition->minfrac <= 0 || repartition->minfrac > 1) + error( + "Invalid DomainDecomposition:minfrac, must be greater than 0 and less " + "than equal to 1"); + #else error("SWIFT was not compiled with MPI support"); #endif diff --git a/src/partition.h b/src/partition.h index b2a132ed48e48573949d16291f72218990589158..03523b165a930b085224e458ac0dd8c8232a578d 100644 --- a/src/partition.h +++ b/src/partition.h @@ -39,6 +39,7 @@ struct partition { enum partition_type type; int grid[3]; }; + /* Repartition type to use. */ enum repartition_type { REPART_NONE = 0, @@ -48,10 +49,17 @@ enum repartition_type { REPART_METIS_VERTEX_EDGE }; +/* Repartition preferences. */ +struct repartition { + enum repartition_type type; + float trigger; + float minfrac; +}; + /* Simple descriptions of types for reports. */ extern const char *repartition_name[]; -void partition_repartition(enum repartition_type reparttype, int nodeID, +void partition_repartition(struct repartition *reparttype, int nodeID, int nr_nodes, struct space *s, struct task *tasks, int nr_tasks); void partition_initial_partition(struct partition *initial_partition, @@ -60,7 +68,7 @@ void partition_initial_partition(struct partition *initial_partition, int partition_space_to_space(double *oldh, double *oldcdim, int *oldnodeID, struct space *s); void partition_init(struct partition *partition, - enum repartition_type *reparttypestruct, + struct repartition *repartition, const struct swift_params *params, int nr_nodes); #endif /* SWIFT_PARTITION_H */