Commit 1be34da0 authored by Peter W. Draper's avatar Peter W. Draper
Browse files

Merge branch 'resubmission_command' into 'master'

Runtime limit and resubmission command

Closes #461

See merge request !643
parents 5463c94f 109cf456
......@@ -593,9 +593,23 @@ int main(int argc, char *argv[]) {
/* How often to check for the stop file and dump restarts and exit the
* application. */
int restart_stop_steps =
const int restart_stop_steps =
parser_get_opt_param_int(params, "Restarts:stop_steps", 100);
/* Get the maximal wall-clock time of this run */
const float restart_max_hours_runtime =
parser_get_opt_param_float(params, "Restarts:max_run_time", FLT_MAX);
/* Do we want to resubmit when we hit the limit? */
const int resubmit_after_max_hours =
parser_get_opt_param_int(params, "Restarts:resubmit_on_exit", 0);
/* What command should we run to resubmit at the end? */
char resubmit_command[PARSER_MAX_LINE_SIZE];
if (resubmit_after_max_hours)
parser_get_param_string(params, "Restarts:resubmit_command",
resubmit_command);
/* If restarting, look for the restart files. */
if (restart) {
......@@ -1034,7 +1048,7 @@ int main(int argc, char *argv[]) {
/* Main simulation loop */
/* ==================== */
int force_stop = 0;
int force_stop = 0, resubmit = 0;
for (int j = 0; !engine_is_done(&e) && e.step - 1 != nsteps && !force_stop;
j++) {
......@@ -1055,6 +1069,13 @@ int main(int argc, char *argv[]) {
message("Forcing application exit, dumping restart files...");
}
/* Did we exceed the maximal runtime? */
if (clocks_get_hours_since_start() > restart_max_hours_runtime) {
force_stop = 1;
message("Runtime limit reached, dumping restart files...");
if (resubmit_after_max_hours) resubmit = 1;
}
/* Also if using nsteps to exit, will not have saved any restarts on exit,
* make sure we do that (useful in testing only). */
if (force_stop || (e.restart_onexit && e.step - 1 == nsteps))
......@@ -1204,17 +1225,19 @@ int main(int argc, char *argv[]) {
}
/* Write final output. */
engine_drift_all(&e);
engine_print_stats(&e);
engine_dump_snapshot(&e);
if (!force_stop) {
engine_drift_all(&e);
engine_print_stats(&e);
engine_dump_snapshot(&e);
#ifdef HAVE_VELOCIRAPTOR
/* Call VELOCIraptor at the end of the run to find groups. */
if (e.policy & engine_policy_structure_finding) {
velociraptor_init(&e);
velociraptor_invoke(&e);
}
/* Call VELOCIraptor at the end of the run to find groups. */
if (e.policy & engine_policy_structure_finding) {
velociraptor_init(&e);
velociraptor_invoke(&e);
}
#endif
}
#ifdef WITH_MPI
if ((res = MPI_Finalize()) != MPI_SUCCESS)
......@@ -1225,6 +1248,15 @@ int main(int argc, char *argv[]) {
* stop file if normal exit happened first. */
if (myrank == 0) force_stop = restart_stop_now(restart_dir, 1);
/* Did we want to run a re-submission command just before dying? */
if (myrank == 0 && resubmit) {
message("Running the resubmission command:");
restart_resubmit(resubmit_command);
fflush(stdout);
fflush(stderr);
message("resubmission command completed.");
}
/* Clean everything */
if (with_verbose_timers) timers_close_file();
if (with_cosmology) cosmology_clean(e.cosmology);
......
......@@ -115,13 +115,16 @@ InitialConditions:
# Parameters controlling restarts
Restarts:
enable: 1 # (Optional) whether to enable dumping restarts at fixed intervals.
save: 1 # (Optional) whether to save copies of the previous set of restart files (named .prev)
onexit: 0 # (Optional) whether to dump restarts on exit (*needs enable*)
subdir: restart # (Optional) name of subdirectory for restart files.
basename: swift # (Optional) prefix used in naming restart files.
delta_hours: 6.0 # (Optional) decimal hours between dumps of restart files.
stop_steps: 100 # (Optional) how many steps to process before checking if the <subdir>/stop file exists. When present the application will attempt to exit early, dumping restart files first.
enable: 1 # (Optional) whether to enable dumping restarts at fixed intervals.
save: 1 # (Optional) whether to save copies of the previous set of restart files (named .prev)
onexit: 0 # (Optional) whether to dump restarts on exit (*needs enable*)
subdir: restart # (Optional) name of subdirectory for restart files.
basename: swift # (Optional) prefix used in naming restart files.
delta_hours: 6.0 # (Optional) decimal hours between dumps of restart files.
stop_steps: 100 # (Optional) how many steps to process before checking if the <subdir>/stop file exists. When present the application will attempt to exit early, dumping restart files first.
max_run_time: 24.0 # (Optional) Maximal wall-clock time in hours. The application will exit when this limit is reached.
resubmit_on_exit: 0 # (Optional) whether to run a command when exiting after the time limit has been reached.
resubmit_command: ./resub.sh # (Optional) Command to run when time limit is reached. Compulsory if resubmit_on_exit is switched on. Note: potentially unsafe, as no check is done on the command being run.
# Parameters governing domain decomposition
DomainDecomposition:
......
......@@ -263,6 +263,17 @@ const char *clocks_get_timesincestart(void) {
return buffer;
}
/**
 * @brief Get the wall-clock time elapsed since the start of execution.
 *
 * The elapsed time is measured from clocks_start up to the current tick
 * count. Need to call clocks_set_cpufreq() beforehand to mark the start of
 * execution.
 *
 * @result the time since the start of the execution, in hours.
 */
double clocks_get_hours_since_start(void) {
  /* Presumably clocks_diff_ticks() reports milliseconds, which is why the
   * divisor is 3600 s/hour times 1000 -- TODO confirm. */
  const double per_hour = 3600. * 1000.0;
  const double elapsed = clocks_diff_ticks(getticks(), clocks_start);
  return elapsed / per_hour;
}
/**
* @brief return the cpu time used.
*
......
......@@ -42,6 +42,7 @@ double clocks_from_ticks(ticks tics);
ticks clocks_to_ticks(double interval);
double clocks_diff_ticks(ticks tic, ticks toc);
const char *clocks_get_timesincestart(void);
double clocks_get_hours_since_start(void);
double clocks_get_cputime_used(void);
int clocks_random_seed(void);
......
......@@ -35,6 +35,7 @@
#include "error.h"
#include "restart.h"
#include "tools.h"
#include "version.h"
#define PARSER_COMMENT_STRING "#"
#define PARSER_COMMENT_CHAR '#'
......@@ -1158,7 +1159,13 @@ void parser_write_params_to_file(const struct swift_params *params,
char *token;
/* Start of file identifier in YAML. */
fprintf(file, "%s\n", PARSER_START_OF_FILE);
fprintf(file, "%s\n\n", PARSER_START_OF_FILE);
fprintf(file, "# SWIFT used parameter file\n");
fprintf(file, "# Code version: %s\n", package_version());
fprintf(file, "# git revision: %s\n", git_revision());
fprintf(file, "# git branch: %s\n", git_branch());
fprintf(file, "# git date: %s\n", git_date());
/* Flags to track which parameters are written. */
int *written = (int *)calloc(params->paramCount, sizeof(int));
......
......@@ -334,3 +334,17 @@ void restart_remove_previous(const char *filename) {
}
}
}
/**
 * @brief Run a given command, usually to resubmit a job.
 *
 * No check is done on the command being run: it is handed as-is to system().
 * A NULL command is not executed, since system(NULL) does not run anything
 * but only queries whether a command processor is available.
 *
 * When system() returns a non-zero value, that raw value is reported in a
 * message. It is not decoded, so it may be a wait status rather than the
 * exit code of the command, or -1 if the command could not be launched.
 *
 * @param command The command to run in the system's shell.
 */
void restart_resubmit(const char *command) {

  /* system(NULL) would return non-zero whenever a shell exists, which would
   * then be misreported below as a failure of the command. */
  if (!command) {
    message("No resubmission command given, nothing to run.");
    return;
  }

  /* Let's trust the user's command... */
  const int result = system(command);
  if (result != 0) message("Command returned error code %d", result);
}
......@@ -41,4 +41,6 @@ int restart_stop_now(const char *dir, int cleanup);
void restart_save_previous(const char *filename);
void restart_remove_previous(const char *filename);
void restart_resubmit(const char *command);
#endif /* SWIFT_RESTART_H */
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment