diff --git a/examples/main.c b/examples/main.c
index 8b367cba14c96414d6eb5eb758a2e46f01a24774..034b800887928c049a610c27ef7c916573c71be6 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -444,8 +444,8 @@ int main(int argc, char *argv[]) {
   long long N_total[3] = {0, 0, 0};
 #if defined(WITH_MPI)
   long long N_long[3] = {Ngas, Ngpart, Nspart};
-  MPI_Reduce(&N_long, &N_total, 3, MPI_LONG_LONG_INT, MPI_SUM, 0,
-             MPI_COMM_WORLD);
+  MPI_Allreduce(&N_long, &N_total, 3, MPI_LONG_LONG_INT, MPI_SUM,
+                MPI_COMM_WORLD);
 #else
   N_total[0] = Ngas;
   N_total[1] = Ngpart;
diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml
index bc893c6d27b67a2687e7766977c61f22df83d290..35550de3d752d91daf60bea92c16e9cd4a55fca5 100644
--- a/examples/parameter_example.yml
+++ b/examples/parameter_example.yml
@@ -76,7 +76,6 @@ DomainDecomposition:
                            # new decomposition, or number of steps (>1) between decompositions
   minfrac:     0.9         # (Optional) Fractional of all particles that should be updated in previous step when
                            # using CPU time trigger
-  cputime_file_name: cputime # (Optional) File name for per node CPU time estimates for each step, requires --enable-task-debugging
 
 
 # Parameters related to external potentials --------------------------------------------
diff --git a/src/engine.c b/src/engine.c
index 7f987db400b49a893a8e8e794356f02d25127251..e4b4085e4a1ffcf1ba491bc789f3587fced95215 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -882,6 +882,81 @@ void engine_repartition(struct engine *e) {
 #endif
 }
 
+/**
+ * @brief Decide whether to trigger a repartition of cells amongst the nodes.
+ *
+ * @param e The #engine.
+ */
+void engine_repartition_trigger(struct engine *e) {
+
+#ifdef WITH_MPI
+
+  /* Do nothing if there have not been enough steps since the last
+   * repartition; we don't want to repeat this too often or immediately
+   * after a repartition step. */
+  if (e->step - e->last_repartition > 2) {
+
+    /* Old style if trigger is >1 or this is the second step (want an early
+     * repartition following the initial partition). */
+    if (e->reparttype->trigger > 1 || e->step == 2) {
+      if (e->reparttype->trigger > 1) {
+        if (e->step % (int)e->reparttype->trigger == 2) e->forcerepart = 1;
+      } else {
+        e->forcerepart = 1;
+      }
+
+    } else {
+
+      /* Use cputimes from ranks to estimate the imbalance. */
+      /* First check if we are going to skip this stage anyway; if so, do
+       * that now. It is only worth checking the CPU loads once we have
+       * processed a significant fraction of all particles. */
+      if ((e->updates > 1 &&
+           e->updates >= e->total_nr_parts * e->reparttype->minfrac) ||
+          (e->g_updates > 1 &&
+           e->g_updates >= e->total_nr_gparts * e->reparttype->minfrac)) {
+
+        /* Get CPU time used since the last call to this function. */
+        double elapsed_cputime = clocks_get_cputime_used() - e->cputime_last_step;
+
+        /* Gather the elapsed CPU times from all ranks for the last step. */
+        double elapsed_cputimes[e->nr_nodes];
+        MPI_Gather(&elapsed_cputime, 1, MPI_DOUBLE, elapsed_cputimes, 1, MPI_DOUBLE,
+                   0, MPI_COMM_WORLD);
+        if (e->nodeID == 0) {
+
+          /* Get the range of cputimes. */
+          double mintime = elapsed_cputimes[0];
+          double maxtime = elapsed_cputimes[0];
+          for (int k = 1; k < e->nr_nodes; k++) {
+            if (elapsed_cputimes[k] > maxtime) maxtime = elapsed_cputimes[k];
+            if (elapsed_cputimes[k] < mintime) mintime = elapsed_cputimes[k];
+          }
+
+          /* Are we out of balance? */
+          if (((maxtime - mintime) / mintime) > e->reparttype->trigger) {
+            if (e->verbose)
+              message("trigger fraction %.3f exceeds %.3f, will repartition",
+                      (maxtime - mintime) / mintime, e->reparttype->trigger);
+            e->forcerepart = 1;
+          }
+        }
+
+        /* All nodes do this together. */
+        MPI_Bcast(&e->forcerepart, 1, MPI_INT, 0, MPI_COMM_WORLD);
+      }
+    }
+
+    /* Remember we did this. */
+    if (e->forcerepart)
+      e->last_repartition = e->step;
+  }
+
+  /* We always reset CPU time for next check. */
+  e->cputime_last_step = clocks_get_cputime_used();
+#endif
+}
+
 /**
  * @brief Add up/down gravity tasks to a cell hierarchy.
  *
@@ -3070,102 +3145,11 @@ void engine_step(struct engine *e) {
   e->timeStep = (e->ti_current - e->ti_old) * e->timeBase;
 
   /* Prepare the tasks to be launched, rebuild or repartition if needed. */
-#ifdef WITH_MPI
-  int justrepart = (e->forcerepart != REPART_NONE);
-#endif
   engine_prepare(e);
 
-/* Repartition the space amongst the nodes? */
 #ifdef WITH_MPI
-
-  /* Old style if trigger is >1 or this is the second step, but we never
-   * repartition immediately after a repartition, those timings will not be
-   * representative. */
-  if (e->reparttype->trigger > 1 || e->step == 2 || justrepart) {
-    if (! justrepart) {
-      if (e->reparttype->trigger > 1) {
-        if (e->step % (int)e->reparttype->trigger == 2) e->forcerepart = 1;
-      } else {
-        e->forcerepart = 1;
-      }
-    }
-
-#ifdef SWIFT_DEBUG_TASKS
-    /* Capture CPU times for comparisons with other methods. */
-    double elapsed_cputime = e->cputoc_step - e->cputic_step;
-    e->cputic_step = clocks_get_cputime_used();
-    double elapsed_cputimes[e->nr_nodes];
-    MPI_Gather(&elapsed_cputime, 1, MPI_DOUBLE, elapsed_cputimes, 1, MPI_DOUBLE,
-               0, MPI_COMM_WORLD);
-    if (e->nodeID == 0) {
-      double mintime = elapsed_cputimes[0];
-      double maxtime = elapsed_cputimes[0];
-      for (int k = 1; k < e->nr_nodes; k++) {
-        if (elapsed_cputimes[k] > maxtime) maxtime = elapsed_cputimes[k];
-        if (elapsed_cputimes[k] < mintime) mintime = elapsed_cputimes[k];
-      }
-      fprintf(e->file_cputimes, "%6d ", e->step);
-      for (int k = 0; k < e->nr_nodes; k++) {
-        fprintf(e->file_cputimes, " %14.7g", elapsed_cputimes[k]);
-      }
-      fprintf(e->file_cputimes, "\n");
-      fflush(e->file_cputimes);
-    }
-#endif
-
-  } else {
-
-    /* Use cputimes from ranks to estimate the imbalance. */
-    double elapsed_cputime = e->cputoc_step - e->cputic_step;
-    e->cputic_step = clocks_get_cputime_used();
-
-    /* Gather the elapsed CPU times from all ranks for the last step. */
-    double elapsed_cputimes[e->nr_nodes];
-    MPI_Gather(&elapsed_cputime, 1, MPI_DOUBLE, elapsed_cputimes, 1, MPI_DOUBLE,
-               0, MPI_COMM_WORLD);
-
-    /* If all available particles of any type have been updated then consider
-     * if a repartition might be needed. Only worth checking when there is
-     * load on all ranks, so require that some fraction of all particles have
-     * been processed. */
-    if (e->nodeID == 0) {
-
-      /* Get the range of cputimes. */
-      double mintime = elapsed_cputimes[0];
-      double maxtime = elapsed_cputimes[0];
-      for (int k = 1; k < e->nr_nodes; k++) {
-        if (elapsed_cputimes[k] > maxtime) maxtime = elapsed_cputimes[k];
-        if (elapsed_cputimes[k] < mintime) mintime = elapsed_cputimes[k];
-      }
-
-      if ((e->updates > 1 &&
-           e->updates >= e->total_nr_parts * e->reparttype->minfrac) ||
-          (e->g_updates > 1 &&
-           e->g_updates >= e->total_nr_gparts * e->reparttype->minfrac)) {
-
-        /* Are we out of balance? */
-        if (((maxtime - mintime) / mintime) > e->reparttype->trigger) {
-          if (e->verbose)
-            message("fractionaltime %.2f > %.2f will repartition",
-                    (maxtime - mintime) / mintime, e->reparttype->trigger);
-          e->forcerepart = 1;
-        }
-      }
-
-#ifdef SWIFT_DEBUG_TASKS
-      /* Save the cputimes for analysis. */
-      fprintf(e->file_cputimes, "%6d ", e->step);
-      for (int k = 0; k < e->nr_nodes; k++) {
-        fprintf(e->file_cputimes, " %14.7g", elapsed_cputimes[k]);
-      }
-      fprintf(e->file_cputimes, "\n");
-      fflush(e->file_cputimes);
-#endif
-    }
-  }
-
-  /* All nodes do this together. */
-  MPI_Bcast(&e->forcerepart, 1, MPI_INT, 0, MPI_COMM_WORLD);
+  /* Repartition the space amongst the nodes? */
+  engine_repartition_trigger(e);
 #endif
 
   /* Are we drifting everything (a la Gadget/GIZMO) ? */
@@ -3244,11 +3228,6 @@ void engine_step(struct engine *e) {
   /* Time in ticks at the end of this step. */
   e->toc_step = getticks();
 #endif
-
-#ifdef WITH_MPI
-  /* CPU time used at the end of this step. */
-  e->cputoc_step = clocks_get_cputime_used();
-#endif
 }
 
 /**
@@ -3704,9 +3683,6 @@ void engine_init(struct engine *e, struct space *s,
   e->dt_max = parser_get_param_double(params, "TimeIntegration:dt_max");
   e->file_stats = NULL;
   e->file_timesteps = NULL;
-#if WITH_MPI
-  e->file_cputimes = NULL;
-#endif
   e->deltaTimeStatistics =
       parser_get_param_double(params, "Statistics:delta_time");
   e->timeLastStatistics = e->timeBegin - e->deltaTimeStatistics;
@@ -3720,6 +3696,10 @@ void engine_init(struct engine *e, struct space *s,
   e->cooling_func = cooling_func;
   e->sourceterms = sourceterms;
   e->parameter_file = params;
+#ifdef WITH_MPI
+  e->cputime_last_step = 0;
+  e->last_repartition = -1;
+#endif
   engine_rank = nodeID;
 
   /* Make the space link back to the engine. */
@@ -3903,15 +3883,6 @@ void engine_init(struct engine *e, struct space *s,
             "Step", "Time", "Time-step", "Updates", "g-Updates", "s-Updates",
             "Wall-clock time", clocks_getunit());
     fflush(e->file_timesteps);
-
-#if defined(SWIFT_DEBUG_TASKS) && defined(WITH_MPI)
-    char cputimefileName[200] = "";
-    parser_get_opt_param_string(params, "DomainDecomposition:cputime_file_name",
-                                cputimefileName,
-                                engine_default_cputime_file_name);
-    sprintf(cputimefileName + strlen(cputimefileName), ".txt");
-    e->file_cputimes = fopen(cputimefileName, "w");
-#endif
   }
 
   /* Print policy */
diff --git a/src/engine.h b/src/engine.h
index dba72cc97cbba93c742fea8391a5811104e4b0b6..a0e32ad15b79c364d13d19589f8462ff8705ee29 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -80,7 +80,6 @@ extern const char *engine_policy_names[];
 #define engine_redistribute_alloc_margin 1.2
 #define engine_default_energy_file_name "energy"
 #define engine_default_timesteps_file_name "timesteps"
-#define engine_default_cputime_file_name "cputime"
 
 /* The rank of the engine as a global variable (for messages). */
 extern int engine_rank;
@@ -191,11 +190,11 @@ struct engine {
 #endif
 
 #ifdef WITH_MPI
-  /* CPU times at the start/end of a step. */
-  double cputic_step, cputoc_step;
+  /* CPU time used at the end of the last step. */
+  double cputime_last_step;
 
-  /* Record of these. */
-  FILE *file_cputimes;
+  /* Step of last repartition. */
+  int last_repartition;
 #endif
 
   /* Wallclock time of the last time-step */
@@ -276,6 +275,7 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts,
                             size_t *Nspart);
 void engine_rebuild(struct engine *e);
 void engine_repartition(struct engine *e);
+void engine_repartition_trigger(struct engine *e);
 void engine_makeproxies(struct engine *e);
 void engine_redistribute(struct engine *e);
 void engine_print_policy(struct engine *e);
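
Note on the new trigger: the logic in engine_repartition_trigger() reduces to
three steps, namely gathering the per-rank CPU times on rank 0, computing the
fractional spread (maxtime - mintime) / mintime, and broadcasting the verdict
so that every rank takes the same decision. The sketch below illustrates that
pattern on its own, outside of SWIFT; it is not part of the patch above,
clock() merely stands in for clocks_get_cputime_used(), and the 5% threshold
is an arbitrary example value, not a SWIFT default.

/* trigger_sketch.c: build with mpicc trigger_sketch.c -o trigger_sketch */
#include <mpi.h>
#include <stdio.h>
#include <time.h>

int main(int argc, char *argv[]) {

  MPI_Init(&argc, &argv);

  int rank = 0, nr_nodes = 1;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &nr_nodes);

  /* Stand-in for clocks_get_cputime_used(): CPU time used by this rank. */
  double elapsed_cputime = (double)clock() / CLOCKS_PER_SEC;

  /* Gather the per-rank CPU times on rank 0, as the patch does. */
  double elapsed_cputimes[nr_nodes];
  MPI_Gather(&elapsed_cputime, 1, MPI_DOUBLE, elapsed_cputimes, 1, MPI_DOUBLE,
             0, MPI_COMM_WORLD);

  /* Example threshold: repartition once the spread exceeds 5%. */
  const double trigger = 0.05;

  int forcerepart = 0;
  if (rank == 0) {

    /* Get the range of cputimes. */
    double mintime = elapsed_cputimes[0];
    double maxtime = elapsed_cputimes[0];
    for (int k = 1; k < nr_nodes; k++) {
      if (elapsed_cputimes[k] > maxtime) maxtime = elapsed_cputimes[k];
      if (elapsed_cputimes[k] < mintime) mintime = elapsed_cputimes[k];
    }

    /* Are we out of balance? Guard against a zero mintime here; the patch
     * divides directly, as each rank will have used CPU time over a step. */
    if (mintime > 0. && (maxtime - mintime) / mintime > trigger)
      forcerepart = 1;
  }

  /* All ranks must reach the same decision. */
  MPI_Bcast(&forcerepart, 1, MPI_INT, 0, MPI_COMM_WORLD);

  if (rank == 0) printf("repartition: %s\n", forcerepart ? "yes" : "no");

  MPI_Finalize();
  return 0;
}

The MPI_Gather plus MPI_Bcast pair keeps the min/max scan on one rank; an
MPI_Allreduce of the decision (or of the min and max) would be an equivalent
design, in the same spirit as the MPI_Reduce to MPI_Allreduce change in
examples/main.c.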