Skip to content
Snippets Groups Projects
Commit c6ff1d86 authored by Matthieu Schaller's avatar Matthieu Schaller
Browse files

Add ability to stop the code after a specified amount of time. Also add option...

Add ability to stop the code after a specified amount of time. Also add option to run a re-submission command.
parent ab08bd9d
No related branches found
No related tags found
1 merge request: !643 "Runtime limit and resubmission command"
...@@ -593,9 +593,22 @@ int main(int argc, char *argv[]) { ...@@ -593,9 +593,22 @@ int main(int argc, char *argv[]) {
/* How often to check for the stop file and dump restarts and exit the /* How often to check for the stop file and dump restarts and exit the
* application. */ * application. */
int restart_stop_steps = const int restart_stop_steps =
parser_get_opt_param_int(params, "Restarts:stop_steps", 100); parser_get_opt_param_int(params, "Restarts:stop_steps", 100);
/* Get the maximal wall-clock time of this run */
const float restart_max_hours_runtime =
parser_get_opt_param_float(params, "Restarts:max_run_time", FLT_MAX);
/* Do we want to resubmit when we hit the limit? */
const int resubmit_after_max_hours =
parser_get_opt_param_int(params, "Restarts:resubmit_on_exit", 0);
/* What command should we run to resubmit at the end? */
char resubmit_command[PARSER_MAX_LINE_SIZE];
if (resubmit_after_max_hours)
parser_get_param_string(params, "Restarts:basename", resubmit_command);
/* If restarting, look for the restart files. */ /* If restarting, look for the restart files. */
if (restart) { if (restart) {
...@@ -1025,7 +1038,7 @@ int main(int argc, char *argv[]) { ...@@ -1025,7 +1038,7 @@ int main(int argc, char *argv[]) {
/* Main simulation loop */ /* Main simulation loop */
/* ==================== */ /* ==================== */
int force_stop = 0; int force_stop = 0, resubmit = 0;
for (int j = 0; !engine_is_done(&e) && e.step - 1 != nsteps && !force_stop; for (int j = 0; !engine_is_done(&e) && e.step - 1 != nsteps && !force_stop;
j++) { j++) {
...@@ -1046,6 +1059,12 @@ int main(int argc, char *argv[]) { ...@@ -1046,6 +1059,12 @@ int main(int argc, char *argv[]) {
message("Forcing application exit, dumping restart files..."); message("Forcing application exit, dumping restart files...");
} }
/* Did we exceed the maximal runtime? */
if (clocks_get_hours_since_start() > restart_max_hours_runtime) {
force_stop = 1;
if (resubmit_after_max_hours) resubmit = 1;
}
/* Also if using nsteps to exit, will not have saved any restarts on exit, /* Also if using nsteps to exit, will not have saved any restarts on exit,
* make sure we do that (useful in testing only). */ * make sure we do that (useful in testing only). */
if (force_stop || (e.restart_onexit && e.step - 1 == nsteps)) if (force_stop || (e.restart_onexit && e.step - 1 == nsteps))
...@@ -1195,17 +1214,19 @@ int main(int argc, char *argv[]) { ...@@ -1195,17 +1214,19 @@ int main(int argc, char *argv[]) {
} }
/* Write final output. */ /* Write final output. */
engine_drift_all(&e); if (!force_stop) {
engine_print_stats(&e); engine_drift_all(&e);
engine_dump_snapshot(&e); engine_print_stats(&e);
engine_dump_snapshot(&e);
#ifdef HAVE_VELOCIRAPTOR #ifdef HAVE_VELOCIRAPTOR
/* Call VELOCIraptor at the end of the run to find groups. */ /* Call VELOCIraptor at the end of the run to find groups. */
if (e.policy & engine_policy_structure_finding) { if (e.policy & engine_policy_structure_finding) {
velociraptor_init(&e); velociraptor_init(&e);
velociraptor_invoke(&e); velociraptor_invoke(&e);
} }
#endif #endif
}
#ifdef WITH_MPI #ifdef WITH_MPI
if ((res = MPI_Finalize()) != MPI_SUCCESS) if ((res = MPI_Finalize()) != MPI_SUCCESS)
...@@ -1216,6 +1237,9 @@ int main(int argc, char *argv[]) { ...@@ -1216,6 +1237,9 @@ int main(int argc, char *argv[]) {
* stop file if normal exit happened first. */ * stop file if normal exit happened first. */
if (myrank == 0) force_stop = restart_stop_now(restart_dir, 1); if (myrank == 0) force_stop = restart_stop_now(restart_dir, 1);
/* Did we want to run a re-submission command just before dying? */
if (myrank == 0 && resubmit) restart_resubmit(resubmit_command);
/* Clean everything */ /* Clean everything */
if (with_verbose_timers) timers_close_file(); if (with_verbose_timers) timers_close_file();
if (with_cosmology) cosmology_clean(&cosmo); if (with_cosmology) cosmology_clean(&cosmo);
......
...@@ -114,13 +114,15 @@ InitialConditions: ...@@ -114,13 +114,15 @@ InitialConditions:
# Parameters controlling restarts # Parameters controlling restarts
Restarts: Restarts:
enable: 1 # (Optional) whether to enable dumping restarts at fixed intervals. enable: 1 # (Optional) whether to enable dumping restarts at fixed intervals.
save: 1 # (Optional) whether to save copies of the previous set of restart files (named .prev) save: 1 # (Optional) whether to save copies of the previous set of restart files (named .prev)
onexit: 0 # (Optional) whether to dump restarts on exit (*needs enable*) onexit: 0 # (Optional) whether to dump restarts on exit (*needs enable*)
subdir: restart # (Optional) name of subdirectory for restart files. subdir: restart # (Optional) name of subdirectory for restart files.
basename: swift # (Optional) prefix used in naming restart files. basename: swift # (Optional) prefix used in naming restart files.
delta_hours: 6.0 # (Optional) decimal hours between dumps of restart files. delta_hours: 6.0 # (Optional) decimal hours between dumps of restart files.
stop_steps: 100 # (Optional) how many steps to process before checking if the <subdir>/stop file exists. When present the application will attempt to exit early, dumping restart files first. stop_steps: 100 # (Optional) how many steps to process before checking if the <subdir>/stop file exists. When present the application will attempt to exit early, dumping restart files first.
max_run_time: 24.0 # (Optional) Maximal wall-clock time in hours. The application will exit when this limit is reached.
resubmit_on_exit: 0 # (Optional) whether to run a command when exiting after the time limit has been reached.
# Parameters governing domain decomposition # Parameters governing domain decomposition
DomainDecomposition: DomainDecomposition:
......
...@@ -263,6 +263,17 @@ const char *clocks_get_timesincestart(void) { ...@@ -263,6 +263,17 @@ const char *clocks_get_timesincestart(void) {
return buffer; return buffer;
} }
/**
 * @brief Returns the wall-clock time since the start of execution in hours.
 *
 * The start of execution is the moment recorded in clocks_start; as for
 * the other "since start" helpers, clocks_set_cpufreq() must have been
 * called beforehand to mark that moment.
 *
 * @return The time elapsed since the start of the execution, in hours.
 */
double clocks_get_hours_since_start(void) {

  /* Interval between now and the recorded start of the run.
   * NOTE(review): the divisor below implies clocks_diff_ticks() returns
   * milliseconds -- confirm against its definition. */
  const double elapsed = clocks_diff_ticks(getticks(), clocks_start);

  /* 3600 seconds per hour times 1000 units per second. */
  const double units_per_hour = 3600. * 1000.0;

  return elapsed / units_per_hour;
}
/** /**
* @brief return the cpu time used. * @brief return the cpu time used.
* *
......
...@@ -42,6 +42,7 @@ double clocks_from_ticks(ticks tics); ...@@ -42,6 +42,7 @@ double clocks_from_ticks(ticks tics);
ticks clocks_to_ticks(double interval); ticks clocks_to_ticks(double interval);
double clocks_diff_ticks(ticks tic, ticks toc); double clocks_diff_ticks(ticks tic, ticks toc);
const char *clocks_get_timesincestart(void); const char *clocks_get_timesincestart(void);
double clocks_get_hours_since_start(void);
double clocks_get_cputime_used(void); double clocks_get_cputime_used(void);
int clocks_random_seed(void); int clocks_random_seed(void);
......
...@@ -334,3 +334,17 @@ void restart_remove_previous(const char *filename) { ...@@ -334,3 +334,17 @@ void restart_remove_previous(const char *filename) {
} }
} }
} }
/**
 * @brief Run a given command, usually to resubmit a job.
 *
 * The command string is handed unchanged to system(): no validation or
 * escaping is done on it, so it must come from a trusted source.
 *
 * A NULL command is ignored (a message is printed). Failures are only
 * reported through message(); they never abort the caller.
 *
 * @param command The command to run in the system's shell.
 */
void restart_resubmit(const char *command) {

  /* system(NULL) does not run anything: it only probes for a command
   * processor and returns non-zero when one exists, which would be
   * misreported below as a failing command. */
  if (command == NULL) {
    message("No re-submission command given, nothing to run.");
    return;
  }

  /* Let's trust the user's command... */
  const int result = system(command);

  if (result == -1) {
    /* On POSIX systems -1 means the child process could not be created (or
     * its status could not be retrieved), i.e. the command never ran. */
    message("Could not launch the command '%s'", command);
  } else if (result != 0) {
    /* NOTE: this is the raw value returned by system(), not a decoded
     * exit code. */
    message("Command returned error code %d", result);
  }
}
...@@ -41,4 +41,6 @@ int restart_stop_now(const char *dir, int cleanup); ...@@ -41,4 +41,6 @@ int restart_stop_now(const char *dir, int cleanup);
void restart_save_previous(const char *filename); void restart_save_previous(const char *filename);
void restart_remove_previous(const char *filename); void restart_remove_previous(const char *filename);
void restart_resubmit(const char *command);
#endif /* SWIFT_RESTART_H */ #endif /* SWIFT_RESTART_H */
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment