From 9f5b3fea65b09922e5d58c9e66f0e2b92a532c8b Mon Sep 17 00:00:00 2001 From: Matthieu Schaller <schaller@strw.leidenuniv.nl> Date: Fri, 29 Jul 2022 14:58:46 +0000 Subject: [PATCH] Re-order the operations around i/o to cope with stop files --- src/engine.c | 32 ++++++++++++++------- src/engine.h | 30 +++++++++++++++---- src/engine_config.c | 21 ++++++++++++-- src/engine_drift.c | 8 ++---- src/engine_io.c | 25 ++++++++++++++-- swift.c | 70 ++++++++++++++------------------------------- swift_fof.c | 3 +- 7 files changed, 113 insertions(+), 76 deletions(-) diff --git a/src/engine.c b/src/engine.c index c75e95cd18..8b47234e55 100644 --- a/src/engine.c +++ b/src/engine.c @@ -2113,8 +2113,9 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs, * @brief Let the #engine loose to compute the forces. * * @param e The #engine. + * @return Should the run stop after this step? */ -void engine_step(struct engine *e) { +int engine_step(struct engine *e) { TIMER_TIC2; @@ -2182,8 +2183,11 @@ void engine_step(struct engine *e) { clocks_from_ticks(getticks() - tic_files), clocks_getunit()); } - /* We need some cells to exist but not the whole task stuff. */ + /* When restarting, we may have had some i/o to do on the step + * where we decided to stop. We have to do this now. + * We need some cells to exist but not the whole task stuff. */ if (e->restarting) space_rebuild(e->s, 0, e->verbose); + if (e->restarting) engine_io(e); /* Move forward in time */ e->ti_old = e->ti_current; @@ -2254,6 +2258,7 @@ void engine_step(struct engine *e) { } } + /* Trigger a rebuild if we reached a gravity mesh step? */ if ((e->policy & engine_policy_self_gravity) && e->s->periodic && e->mesh->ti_end_mesh_next == e->ti_current) e->forcerebuild = 1; @@ -2303,15 +2308,13 @@ void engine_step(struct engine *e) { /* Prepare the tasks to be launched, rebuild or repartition if needed. */ const int drifted_all = engine_prepare(e); + /* Dump local cells and active particle counts. */ + // dumpCells("cells", 1, 0, 0, 0, e->s, e->nodeID, e->step); + #ifdef SWIFT_DEBUG_CHECKS /* Print the number of active tasks */ if (e->verbose) engine_print_task_counts(e); -#endif - /* Dump local cells and active particle counts. */ - // dumpCells("cells", 1, 0, 0, 0, e->s, e->nodeID, e->step); - -#ifdef SWIFT_DEBUG_CHECKS /* Check that we have the correct total mass in the top-level multipoles */ long long num_gpart_mpole = 0; if (e->policy & engine_policy_self_gravity) { @@ -2360,7 +2363,7 @@ void engine_step(struct engine *e) { } #endif - /* Re-compute the mesh forces */ + /* Re-compute the mesh forces? */ if ((e->policy & engine_policy_self_gravity) && e->s->periodic && e->mesh->ti_end_mesh_next == e->ti_current) { @@ -2509,11 +2512,16 @@ void engine_step(struct engine *e) { #endif /* Create a restart file if needed. */ - engine_dump_restarts(e, 0, e->restart_onexit && engine_is_done(e)); + const int force_stop = + engine_dump_restarts(e, 0, e->restart_onexit && engine_is_done(e)); - engine_check_for_dumps(e); + /* Is there any form of i/o this step? + * + * Note that if the run was forced to stop, we do not dump, + * we will do so when the run is restarted*/ + if (!force_stop) engine_io(e); -#if defined(SWIFT_RT_DEBUG_CHECKS) +#ifdef SWIFT_RT_DEBUG_CHECKS /* if we're running the debug RT scheme, do some checks after every step. * Do this after the output so we can safely reset debugging checks now. */ if (e->policy & engine_policy_rt) @@ -2527,6 +2535,8 @@ void engine_step(struct engine *e) { /* Time in ticks at the end of this step. */ e->toc_step = getticks(); + + return force_stop; } /** diff --git a/src/engine.h b/src/engine.h index ba1498df6a..c35e88e1c6 100644 --- a/src/engine.h +++ b/src/engine.h @@ -560,9 +560,29 @@ struct engine { /* Do we free the foreign data before rebuilding the tree? */ int free_foreign_when_rebuilding; + /* Name of the restart file directory. */ + const char *restart_dir; + /* Name of the restart file. */ const char *restart_file; + /* Flag whether we should resubmit on this step? */ + int resubmit; + + /* Do we want to run the resubmission command once a run reaches the time + * limit? */ + int resubmit_after_max_hours; + + /* What command should we run to resubmit at the end? */ + char resubmit_command[PARSER_MAX_LINE_SIZE]; + + /* How often to check for the stop file and dump restarts and exit the + * application. */ + int restart_stop_steps; + + /* Get the maximal wall-clock time of this run */ + float restart_max_hours_runtime; + /* Ticks between restart dumps. */ ticks restart_dt; @@ -639,7 +659,7 @@ void engine_drift_top_multipoles(struct engine *e); void engine_reconstruct_multipoles(struct engine *e); void engine_allocate_foreign_particles(struct engine *e, const int fof); void engine_print_stats(struct engine *e); -void engine_check_for_dumps(struct engine *e); +void engine_io(struct engine *e); void engine_collect_end_of_step(struct engine *e, int apply); void engine_dump_snapshot(struct engine *e); void engine_run_on_dump(struct engine *e); @@ -672,13 +692,13 @@ void engine_init( void engine_config(int restart, int fof, struct engine *e, struct swift_params *params, int nr_nodes, int nodeID, int nr_task_threads, int nr_pool_threads, int with_aff, - int verbose, const char *restart_file, - struct repartition *reparttype); + int verbose, const char *restart_dir, + const char *restart_file, struct repartition *reparttype); void engine_launch(struct engine *e, const char *call); int engine_prepare(struct engine *e); void engine_init_particles(struct engine *e, int flag_entropy_ICs, int clean_h_values); -void engine_step(struct engine *e); +int engine_step(struct engine *e); void engine_split(struct engine *e, struct partition *initial_partition); void engine_exchange_strays(struct engine *e, const size_t offset_parts, const int *ind_part, size_t *Npart, @@ -724,6 +744,6 @@ void engine_numa_policies(int rank, int verbose); /* Struct dump/restore support. */ void engine_struct_dump(struct engine *e, FILE *stream); void engine_struct_restore(struct engine *e, FILE *stream); -void engine_dump_restarts(struct engine *e, int drifted_all, int final_step); +int engine_dump_restarts(struct engine *e, int drifted_all, int force); #endif /* SWIFT_ENGINE_H */ diff --git a/src/engine_config.c b/src/engine_config.c index e7e0fc3e5b..74dd9b6717 100644 --- a/src/engine_config.c +++ b/src/engine_config.c @@ -162,8 +162,8 @@ static void engine_dumper_init(struct engine *e) { void engine_config(int restart, int fof, struct engine *e, struct swift_params *params, int nr_nodes, int nodeID, int nr_task_threads, int nr_pool_threads, int with_aff, - int verbose, const char *restart_file, - struct repartition *reparttype) { + int verbose, const char *restart_dir, + const char *restart_file, struct repartition *reparttype) { struct clocks_time tic, toc; if (nodeID == 0) clocks_gettime(&tic); @@ -187,7 +187,9 @@ void engine_config(int restart, int fof, struct engine *e, e->verbose = verbose; e->wallclock_time = 0.f; e->restart_dump = 0; + e->restart_dir = restart_dir; e->restart_file = restart_file; + e->resubmit = 0; e->restart_next = 0; e->restart_dt = 0; e->run_fof = 0; @@ -215,6 +217,21 @@ void engine_config(int restart, int fof, struct engine *e, /* Welcome message */ if (e->nodeID == 0) message("Running simulation '%s'.", e->run_name); + /* Check-pointing properties */ + + e->restart_stop_steps = + parser_get_opt_param_int(params, "Restarts:stop_steps", 100); + + e->restart_max_hours_runtime = + parser_get_opt_param_float(params, "Restarts:max_run_time", FLT_MAX); + + e->resubmit_after_max_hours = + parser_get_opt_param_int(params, "Restarts:resubmit_on_exit", 0); + + if (e->resubmit_after_max_hours) + parser_get_param_string(params, "Restarts:resubmit_command", + e->resubmit_command); + /* Get the number of queues */ int nr_queues = parser_get_opt_param_int(params, "Scheduler:nr_queues", e->nr_threads); diff --git a/src/engine_drift.c b/src/engine_drift.c index 7deb558489..6df3230325 100644 --- a/src/engine_drift.c +++ b/src/engine_drift.c @@ -324,16 +324,14 @@ void engine_drift_all(struct engine *e, const int drift_mpoles) { const ticks tic = getticks(); -#ifdef SWIFT_DEBUG_CHECKS - if (e->nodeID == 0) { + if (e->nodeID == 0 && e->verbose) { if (e->policy & engine_policy_cosmology) - message("Drifting all to a=%e", + message("Drifting all to a=%15.12e", exp(e->ti_current * e->time_base) * e->cosmology->a_begin); else - message("Drifting all to t=%e", + message("Drifting all to t=%15.12e", e->ti_current * e->time_base + e->time_begin); } -#endif #ifdef WITH_LIGHTCONE /* Determine which periodic replications could contribute to the lightcone diff --git a/src/engine_io.c b/src/engine_io.c index 90b74e97e8..94350c0e3e 100644 --- a/src/engine_io.c +++ b/src/engine_io.c @@ -50,20 +50,33 @@ * @param e the engine. * @param drifted_all true if a drift_all has just been performed. * @param force force a dump, if dumping is enabled. + * @return Do we want to stop the run altogether? */ -void engine_dump_restarts(struct engine *e, int drifted_all, int force) { +int engine_dump_restarts(struct engine *e, const int drifted_all, + const int force) { + + /* Are any of the conditions to fully stop a run met? */ + const int end_run_time = e->runtime > e->restart_max_hours_runtime; + const int stop_file = (e->step % e->restart_stop_steps == 0 && + restart_stop_now(e->restart_dir, 0)); + + /* Exit run when told to */ + const int exit_run = (end_run_time || stop_file); if (e->restart_dump) { ticks tic = getticks(); + const int check_point_time = tic > e->restart_next; + /* Dump when the time has arrived, or we are told to. */ - int dump = ((tic > e->restart_next) || force); + int dump = (check_point_time || end_run_time || force || stop_file); #ifdef WITH_MPI /* Synchronize this action from rank 0 (ticks may differ between * machines). */ MPI_Bcast(&dump, 1, MPI_INT, 0, MPI_COMM_WORLD); #endif + if (dump) { if (e->nodeID == 0) { @@ -123,6 +136,12 @@ void engine_dump_restarts(struct engine *e, int drifted_all, int force) { e->step_props |= engine_step_prop_restarts; } } + + /* If we stopped by reaching the time limit, flag that we need to + * run the resubmission command */ + if (end_run_time && e->resubmit_after_max_hours) e->resubmit = 1; + + return exit_run; } /** @@ -249,7 +268,7 @@ void engine_run_on_dump(struct engine *e) { * * @param e The #engine. */ -void engine_check_for_dumps(struct engine *e) { +void engine_io(struct engine *e) { const int with_cosmology = (e->policy & engine_policy_cosmology); const int with_stf = (e->policy & engine_policy_structure_finding); const int with_los = (e->policy & engine_policy_line_of_sight); diff --git a/swift.c b/swift.c index 26eccd8981..491877b0f4 100644 --- a/swift.c +++ b/swift.c @@ -879,25 +879,6 @@ int main(int argc, char *argv[]) { parser_get_opt_param_string(params, "Restarts:basename", restart_name, "swift"); - /* How often to check for the stop file and dump restarts and exit the - * application. */ - const int restart_stop_steps = - parser_get_opt_param_int(params, "Restarts:stop_steps", 100); - - /* Get the maximal wall-clock time of this run */ - const float restart_max_hours_runtime = - parser_get_opt_param_float(params, "Restarts:max_run_time", FLT_MAX); - - /* Do we want to resubmit when we hit the limit? */ - const int resubmit_after_max_hours = - parser_get_opt_param_int(params, "Restarts:resubmit_on_exit", 0); - - /* What command should we run to resubmit at the end? */ - char resubmit_command[PARSER_MAX_LINE_SIZE]; - if (resubmit_after_max_hours) - parser_get_param_string(params, "Restarts:resubmit_command", - resubmit_command); - /* If restarting, look for the restart files. */ if (restart) { @@ -987,13 +968,19 @@ int main(int argc, char *argv[]) { /* And initialize the engine with the space and policies. */ engine_config(/*restart=*/1, /*fof=*/0, &e, params, nr_nodes, myrank, - nr_threads, nr_pool_threads, with_aff, talking, restart_file, - &reparttype); + nr_threads, nr_pool_threads, with_aff, talking, restart_dir, + restart_file, &reparttype); /* Check if we are already done when given steps on the command-line. */ if (e.step >= nsteps && nsteps > 0) error("Not restarting, already completed %d steps", e.step); + /* If we are restarting at the very end of a run, just build the tree and + * prepare to dump. + * The main simulation loop below (where rebuild normally happens) won't be + * executed. */ + if (engine_is_done(&e)) space_rebuild(e.s, /*repartitioned=*/0, e.verbose); + } else { /* Prepare and verify the selection of outputs */ @@ -1509,8 +1496,8 @@ int main(int argc, char *argv[]) { &chemistry, &extra_io_props, &fof_properties, &los_properties, &lightcone_array_properties, &ics_metadata); engine_config(/*restart=*/0, /*fof=*/0, &e, params, nr_nodes, myrank, - nr_threads, nr_pool_threads, with_aff, talking, restart_file, - &reparttype); + nr_threads, nr_pool_threads, with_aff, talking, restart_dir, + restart_file, &reparttype); /* Compute some stats for the star formation */ if (with_star_formation) { @@ -1603,7 +1590,7 @@ int main(int argc, char *argv[]) { if (!e.output_list_stats) engine_print_stats(&e); /* Is there a dump before the end of the first time-step? */ - engine_check_for_dumps(&e); + engine_io(&e); } /* Legend */ @@ -1675,7 +1662,7 @@ int main(int argc, char *argv[]) { /* Main simulation loop */ /* ==================== */ - int force_stop = 0, resubmit = 0; + int force_stop = 0; for (int j = 0; !engine_is_done(&e) && e.step - 1 != nsteps && !force_stop; j++) { @@ -1683,30 +1670,15 @@ int main(int argc, char *argv[]) { timers_reset_all(); /* Take a step. */ - engine_step(&e); + force_stop = engine_step(&e); /* Print the timers. */ if (with_verbose_timers) timers_print(e.step); - /* Every so often allow the user to stop the application and dump the - * restart files. */ - if (j % restart_stop_steps == 0) { - force_stop = restart_stop_now(restart_dir, 0); - if (myrank == 0 && force_stop) - message("Forcing application exit, dumping restart files..."); - } - - /* Did we exceed the maximal runtime? */ - if (e.runtime > restart_max_hours_runtime) { - force_stop = 1; - message("Runtime limit reached, dumping restart files..."); - if (resubmit_after_max_hours) resubmit = 1; - } - - /* Also if using nsteps to exit, will not have saved any restarts on exit, - * make sure we do that (useful in testing only). */ - if (force_stop || (e.restart_onexit && e.step - 1 == nsteps)) - engine_dump_restarts(&e, 0, 1); + /* Shall we write some check-point files? + * Note that this was already done by engine_step() if force_stop is set */ + if (e.restart_onexit && e.step - 1 == nsteps && !force_stop) + engine_dump_restarts(&e, /*drifted=*/0, /*force=*/1); /* Dump the task data using the given frequency. */ if (dump_tasks && (dump_tasks == 1 || j % dump_tasks == 1)) { @@ -1750,7 +1722,7 @@ int main(int argc, char *argv[]) { j + 1); mpiuse_log_dump(dumpfile, e.tic_step); } -#endif // WITH_MPI +#endif #ifdef SWIFT_DEBUG_THREADPOOL /* Dump the task data using the given frequency. */ @@ -1766,7 +1738,7 @@ int main(int argc, char *argv[]) { } else { threadpool_reset_log(&e.threadpool); } -#endif // SWIFT_DEBUG_THREADPOOL +#endif } /* Write final time information */ @@ -1884,9 +1856,9 @@ int main(int argc, char *argv[]) { if (myrank == 0) force_stop = restart_stop_now(restart_dir, 1); /* Did we want to run a re-submission command just before dying? */ - if (myrank == 0 && resubmit) { + if (myrank == 0 && e.resubmit) { message("Running the resubmission command:"); - restart_resubmit(resubmit_command); + restart_resubmit(e.resubmit_command); fflush(stdout); fflush(stderr); message("resubmission command completed."); diff --git a/swift_fof.c b/swift_fof.c index 3791b78d88..6bbed0fa96 100644 --- a/swift_fof.c +++ b/swift_fof.c @@ -664,7 +664,8 @@ int main(int argc, char *argv[]) { /*extra_io_props=*/NULL, &fof_properties, /*los_properties=*/NULL, /*lightcone_properties=*/NULL, &ics_metadata); engine_config(/*restart=*/0, /*fof=*/1, &e, params, nr_nodes, myrank, - nr_threads, nr_threads, with_aff, talking, NULL, &reparttype); + nr_threads, nr_threads, with_aff, talking, NULL, NULL, + &reparttype); /* Get some info to the user. */ if (myrank == 0) { -- GitLab