diff --git a/examples/main.c b/examples/main.c index ce96d4516611373df637e657188e3c5604dcb946..e5ab33b7e555bd5a2cd5f14fd3a45cc8361bbd45 100644 --- a/examples/main.c +++ b/examples/main.c @@ -593,9 +593,23 @@ int main(int argc, char *argv[]) { /* How often to check for the stop file and dump restarts and exit the * application. */ - int restart_stop_steps = + const int restart_stop_steps = parser_get_opt_param_int(params, "Restarts:stop_steps", 100); + /* Get the maximal wall-clock time of this run */ + const float restart_max_hours_runtime = + parser_get_opt_param_float(params, "Restarts:max_run_time", FLT_MAX); + + /* Do we want to resubmit when we hit the limit? */ + const int resubmit_after_max_hours = + parser_get_opt_param_int(params, "Restarts:resubmit_on_exit", 0); + + /* What command should we run to resubmit at the end? */ + char resubmit_command[PARSER_MAX_LINE_SIZE]; + if (resubmit_after_max_hours) + parser_get_param_string(params, "Restarts:resubmit_command", + resubmit_command); + /* If restarting, look for the restart files. */ if (restart) { @@ -1034,7 +1048,7 @@ int main(int argc, char *argv[]) { /* Main simulation loop */ /* ==================== */ - int force_stop = 0; + int force_stop = 0, resubmit = 0; for (int j = 0; !engine_is_done(&e) && e.step - 1 != nsteps && !force_stop; j++) { @@ -1055,6 +1069,13 @@ int main(int argc, char *argv[]) { message("Forcing application exit, dumping restart files..."); } + /* Did we exceed the maximal runtime? */ + if (clocks_get_hours_since_start() > restart_max_hours_runtime) { + force_stop = 1; + message("Runtime limit reached, dumping restart files..."); + if (resubmit_after_max_hours) resubmit = 1; + } + /* Also if using nsteps to exit, will not have saved any restarts on exit, * make sure we do that (useful in testing only). */ if (force_stop || (e.restart_onexit && e.step - 1 == nsteps)) @@ -1204,17 +1225,19 @@ int main(int argc, char *argv[]) { } /* Write final output. 
*/ - engine_drift_all(&e); - engine_print_stats(&e); - engine_dump_snapshot(&e); + if (!force_stop) { + engine_drift_all(&e); + engine_print_stats(&e); + engine_dump_snapshot(&e); #ifdef HAVE_VELOCIRAPTOR - /* Call VELOCIraptor at the end of the run to find groups. */ - if (e.policy & engine_policy_structure_finding) { - velociraptor_init(&e); - velociraptor_invoke(&e); - } + /* Call VELOCIraptor at the end of the run to find groups. */ + if (e.policy & engine_policy_structure_finding) { + velociraptor_init(&e); + velociraptor_invoke(&e); + } #endif + } #ifdef WITH_MPI if ((res = MPI_Finalize()) != MPI_SUCCESS) @@ -1225,6 +1248,15 @@ int main(int argc, char *argv[]) { * stop file if normal exit happened first. */ if (myrank == 0) force_stop = restart_stop_now(restart_dir, 1); + /* Did we want to run a re-submission command just before dying? */ + if (myrank == 0 && resubmit) { + message("Running the resubmission command:"); + restart_resubmit(resubmit_command); + fflush(stdout); + fflush(stderr); + message("resubmission command completed."); + } + /* Clean everything */ if (with_verbose_timers) timers_close_file(); if (with_cosmology) cosmology_clean(e.cosmology); diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml index 50c3fc3e34c0c9ef3bbe82a06c1d816e1c955130..19579522ceb8a15f9d180f0e89caf0ef9c9cceb6 100644 --- a/examples/parameter_example.yml +++ b/examples/parameter_example.yml @@ -115,13 +115,16 @@ InitialConditions: # Parameters controlling restarts Restarts: - enable: 1 # (Optional) whether to enable dumping restarts at fixed intervals. - save: 1 # (Optional) whether to save copies of the previous set of restart files (named .prev) - onexit: 0 # (Optional) whether to dump restarts on exit (*needs enable*) - subdir: restart # (Optional) name of subdirectory for restart files. - basename: swift # (Optional) prefix used in naming restart files. - delta_hours: 6.0 # (Optional) decimal hours between dumps of restart files. 
- stop_steps: 100 # (Optional) how many steps to process before checking if the <subdir>/stop file exists. When present the application will attempt to exit early, dumping restart files first. + enable: 1 # (Optional) whether to enable dumping restarts at fixed intervals. + save: 1 # (Optional) whether to save copies of the previous set of restart files (named .prev) + onexit: 0 # (Optional) whether to dump restarts on exit (*needs enable*) + subdir: restart # (Optional) name of subdirectory for restart files. + basename: swift # (Optional) prefix used in naming restart files. + delta_hours: 6.0 # (Optional) decimal hours between dumps of restart files. + stop_steps: 100 # (Optional) how many steps to process before checking if the <subdir>/stop file exists. When present the application will attempt to exit early, dumping restart files first. + max_run_time: 24.0 # (Optional) Maximal wall-clock time in hours. The application will exit when this limit is reached. + resubmit_on_exit: 0 # (Optional) whether to run a command when exiting after the time limit has been reached. + resubmit_command: ./resub.sh # (Optional) Command to run when time limit is reached. Compulsory if resubmit_on_exit is switched on. Note: potentially unsafe, the command is run as given without any check. # Parameters governing domain decomposition DomainDecomposition: diff --git a/src/clocks.c b/src/clocks.c index c64276bf83f8b52d6d09aa4950737af2a12aa4f6..49297f5db1cc10a3d9f4537c5900610dded7ffba 100644 --- a/src/clocks.c +++ b/src/clocks.c @@ -263,6 +263,17 @@ const char *clocks_get_timesincestart(void) { return buffer; } +/** + * @brief Returns the wall-clock time since the start of execution in hours. + * + * clocks_set_cpufreq() needs to have been called to mark the start of execution. + * + * @return the time since the start of the execution, in hours. + */ +double clocks_get_hours_since_start(void) { + return clocks_diff_ticks(getticks(), clocks_start) / (3600. * 1000.0); +} + /** * @brief return the cpu time used. 
* diff --git a/src/clocks.h b/src/clocks.h index d33e5a342a9b7024ee918a035547e8351b3dc726..3800938e3effa837fdec3c094414c259b133562c 100644 --- a/src/clocks.h +++ b/src/clocks.h @@ -42,6 +42,7 @@ double clocks_from_ticks(ticks tics); ticks clocks_to_ticks(double interval); double clocks_diff_ticks(ticks tic, ticks toc); const char *clocks_get_timesincestart(void); +double clocks_get_hours_since_start(void); double clocks_get_cputime_used(void); int clocks_random_seed(void); diff --git a/src/parser.c b/src/parser.c index f3e5ef00f96f0f7b55daff0ff32077e9373c4a2f..57592d57abb78100d113b91710af68f7b1c3e32d 100644 --- a/src/parser.c +++ b/src/parser.c @@ -35,6 +35,7 @@ #include "error.h" #include "restart.h" #include "tools.h" +#include "version.h" #define PARSER_COMMENT_STRING "#" #define PARSER_COMMENT_CHAR '#' @@ -1158,7 +1159,13 @@ void parser_write_params_to_file(const struct swift_params *params, char *token; /* Start of file identifier in YAML. */ - fprintf(file, "%s\n", PARSER_START_OF_FILE); + fprintf(file, "%s\n\n", PARSER_START_OF_FILE); + + fprintf(file, "# SWIFT used parameter file\n"); + fprintf(file, "# Code version: %s\n", package_version()); + fprintf(file, "# git revision: %s\n", git_revision()); + fprintf(file, "# git branch: %s\n", git_branch()); + fprintf(file, "# git date: %s\n", git_date()); /* Flags to track which parameters are written. */ int *written = (int *)calloc(params->paramCount, sizeof(int)); diff --git a/src/restart.c b/src/restart.c index c412c8477d9f93e7c085e13c9e3fe72cd0cab9df..54a098413d7a393ac88a7ef5d7300d912c99f845 100644 --- a/src/restart.c +++ b/src/restart.c @@ -334,3 +334,17 @@ void restart_remove_previous(const char *filename) { } } } + +/** + * @brief Run a given command, usually to resubmit a job. + * + * No check is done on the command being run. + * + * @param command The command to run in the system's shell. + */ +void restart_resubmit(const char *command) { + + /* Let's trust the user's command... 
*/ + const int result = system(command); + if (result != 0) message("Command returned error code %d", result); +} diff --git a/src/restart.h b/src/restart.h index 49d127492255364cbf0f48653c560494e83a2920..b9380201659dacf05fcedad8c9fcb29e7bd89be2 100644 --- a/src/restart.h +++ b/src/restart.h @@ -41,4 +41,6 @@ int restart_stop_now(const char *dir, int cleanup); void restart_save_previous(const char *filename); void restart_remove_previous(const char *filename); +void restart_resubmit(const char *command); + #endif /* SWIFT_RESTART_H */