From c6ff1d86825f4b9159a7cf80f409381b044d2cf9 Mon Sep 17 00:00:00 2001 From: Matthieu Schaller <schaller@strw.leidenuniv.nl> Date: Sun, 21 Oct 2018 00:42:52 +0200 Subject: [PATCH] Add ability to stop the code after a specified amount of time. Also add option to run a re-submission command. --- examples/main.c | 44 ++++++++++++++++++++++++++-------- examples/parameter_example.yml | 16 +++++++------ src/clocks.c | 11 +++++++++ src/clocks.h | 1 + src/restart.c | 14 +++++++++++ src/restart.h | 2 ++ 6 files changed, 71 insertions(+), 17 deletions(-) diff --git a/examples/main.c b/examples/main.c index 116d422c9a..cb424a5fa0 100644 --- a/examples/main.c +++ b/examples/main.c @@ -593,9 +593,22 @@ int main(int argc, char *argv[]) { /* How often to check for the stop file and dump restarts and exit the * application. */ - int restart_stop_steps = + const int restart_stop_steps = parser_get_opt_param_int(params, "Restarts:stop_steps", 100); + /* Get the maximal wall-clock time of this run */ + const float restart_max_hours_runtime = + parser_get_opt_param_float(params, "Restarts:max_run_time", FLT_MAX); + + /* Do we want to resubmit when we hit the limit? */ + const int resubmit_after_max_hours = + parser_get_opt_param_int(params, "Restarts:resubmit_on_exit", 0); + + /* What command should we run to resubmit at the end? */ + char resubmit_command[PARSER_MAX_LINE_SIZE]; + if (resubmit_after_max_hours) + parser_get_param_string(params, "Restarts:resubmit_command", resubmit_command); + /* If restarting, look for the restart files. */ if (restart) { @@ -1025,7 +1038,7 @@ int main(int argc, char *argv[]) { /* Main simulation loop */ /* ==================== */ - int force_stop = 0; + int force_stop = 0, resubmit = 0; for (int j = 0; !engine_is_done(&e) && e.step - 1 != nsteps && !force_stop; j++) { @@ -1046,6 +1059,12 @@ int main(int argc, char *argv[]) { message("Forcing application exit, dumping restart files..."); } + /* Did we exceed the maximal runtime? 
*/ + if (clocks_get_hours_since_start() > restart_max_hours_runtime) { + force_stop = 1; + if (resubmit_after_max_hours) resubmit = 1; + } + /* Also if using nsteps to exit, will not have saved any restarts on exit, * make sure we do that (useful in testing only). */ if (force_stop || (e.restart_onexit && e.step - 1 == nsteps)) @@ -1195,17 +1214,19 @@ int main(int argc, char *argv[]) { } /* Write final output. */ - engine_drift_all(&e); - engine_print_stats(&e); - engine_dump_snapshot(&e); + if (!force_stop) { + engine_drift_all(&e); + engine_print_stats(&e); + engine_dump_snapshot(&e); #ifdef HAVE_VELOCIRAPTOR - /* Call VELOCIraptor at the end of the run to find groups. */ - if (e.policy & engine_policy_structure_finding) { - velociraptor_init(&e); - velociraptor_invoke(&e); - } + /* Call VELOCIraptor at the end of the run to find groups. */ + if (e.policy & engine_policy_structure_finding) { + velociraptor_init(&e); + velociraptor_invoke(&e); + } #endif + } #ifdef WITH_MPI if ((res = MPI_Finalize()) != MPI_SUCCESS) @@ -1216,6 +1237,9 @@ int main(int argc, char *argv[]) { * stop file if normal exit happened first. */ if (myrank == 0) force_stop = restart_stop_now(restart_dir, 1); + /* Did we want to run a re-submission command just before dying? */ + if (myrank == 0 && resubmit) restart_resubmit(resubmit_command); + /* Clean everything */ if (with_verbose_timers) timers_close_file(); if (with_cosmology) cosmology_clean(&cosmo); diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml index f86abe054e..218a3cb2b8 100644 --- a/examples/parameter_example.yml +++ b/examples/parameter_example.yml @@ -114,13 +114,15 @@ InitialConditions: # Parameters controlling restarts Restarts: - enable: 1 # (Optional) whether to enable dumping restarts at fixed intervals. 
- save: 1 # (Optional) whether to save copies of the previous set of restart files (named .prev) - onexit: 0 # (Optional) whether to dump restarts on exit (*needs enable*) - subdir: restart # (Optional) name of subdirectory for restart files. - basename: swift # (Optional) prefix used in naming restart files. - delta_hours: 6.0 # (Optional) decimal hours between dumps of restart files. - stop_steps: 100 # (Optional) how many steps to process before checking if the <subdir>/stop file exists. When present the application will attempt to exit early, dumping restart files first. + enable: 1 # (Optional) whether to enable dumping restarts at fixed intervals. + save: 1 # (Optional) whether to save copies of the previous set of restart files (named .prev) + onexit: 0 # (Optional) whether to dump restarts on exit (*needs enable*) + subdir: restart # (Optional) name of subdirectory for restart files. + basename: swift # (Optional) prefix used in naming restart files. + delta_hours: 6.0 # (Optional) decimal hours between dumps of restart files. + stop_steps: 100 # (Optional) how many steps to process before checking if the <subdir>/stop file exists. When present the application will attempt to exit early, dumping restart files first. + max_run_time: 24.0 # (Optional) Maximal wall-clock time in hours. The application will exit when this limit is reached. + resubmit_on_exit: 0 # (Optional) whether to run a command when exiting after the time limit has been reached. # Parameters governing domain decomposition DomainDecomposition: diff --git a/src/clocks.c b/src/clocks.c index c64276bf83..49297f5db1 100644 --- a/src/clocks.c +++ b/src/clocks.c @@ -263,6 +263,17 @@ const char *clocks_get_timesincestart(void) { return buffer; } +/** + * @brief Returns the wall-clock time since the start of execution in hours. + * + * Need to call clocks_set_cpufreq() to mark the start of execution. 
+ * + * @result the time since the start of the execution + */ +double clocks_get_hours_since_start(void) { + return clocks_diff_ticks(getticks(), clocks_start) / (3600. * 1000.0); +} + /** * @brief return the cpu time used. * diff --git a/src/clocks.h b/src/clocks.h index d33e5a342a..3800938e3e 100644 --- a/src/clocks.h +++ b/src/clocks.h @@ -42,6 +42,7 @@ double clocks_from_ticks(ticks tics); ticks clocks_to_ticks(double interval); double clocks_diff_ticks(ticks tic, ticks toc); const char *clocks_get_timesincestart(void); +double clocks_get_hours_since_start(void); double clocks_get_cputime_used(void); int clocks_random_seed(void); diff --git a/src/restart.c b/src/restart.c index c412c8477d..54a098413d 100644 --- a/src/restart.c +++ b/src/restart.c @@ -334,3 +334,17 @@ void restart_remove_previous(const char *filename) { } } } + +/** + * @brief Run a given command, usually to resubmit a job. + * + * No check is done on the command being run; a non-zero status is only reported. + * + * @param command The command to run in the system's shell. + */ +void restart_resubmit(const char *command) { + + /* Let's trust the user's command... */ + const int result = system(command); + if (result != 0) message("Command returned error code %d", result); +} diff --git a/src/restart.h b/src/restart.h index 49d1274922..b938020165 100644 --- a/src/restart.h +++ b/src/restart.h @@ -41,4 +41,6 @@ int restart_stop_now(const char *dir, int cleanup); void restart_save_previous(const char *filename); void restart_remove_previous(const char *filename); +void restart_resubmit(const char *command); + #endif /* SWIFT_RESTART_H */ -- GitLab