From c6ff1d86825f4b9159a7cf80f409381b044d2cf9 Mon Sep 17 00:00:00 2001
From: Matthieu Schaller <schaller@strw.leidenuniv.nl>
Date: Sun, 21 Oct 2018 00:42:52 +0200
Subject: [PATCH] Add ability to stop the code after a specified amount of
 time. Also add option to run a re-submission command.

---
 examples/main.c                | 44 ++++++++++++++++++++++++++--------
 examples/parameter_example.yml | 16 +++++++------
 src/clocks.c                   | 11 +++++++++
 src/clocks.h                   |  1 +
 src/restart.c                  | 14 +++++++++++
 src/restart.h                  |  2 ++
 6 files changed, 71 insertions(+), 17 deletions(-)

diff --git a/examples/main.c b/examples/main.c
index 116d422c9a..cb424a5fa0 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -593,9 +593,22 @@ int main(int argc, char *argv[]) {
 
   /* How often to check for the stop file and dump restarts and exit the
    * application. */
-  int restart_stop_steps =
+  const int restart_stop_steps =
       parser_get_opt_param_int(params, "Restarts:stop_steps", 100);
 
+  /* Get the maximal wall-clock time of this run */
+  const float restart_max_hours_runtime =
+      parser_get_opt_param_float(params, "Restarts:max_run_time", FLT_MAX);
+
+  /* Do we want to resubmit when we hit the limit? If so, the command to run
+   * is read from its own parameter (not from the restart file basename). */
+  const int resubmit_after_max_hours =
+      parser_get_opt_param_int(params, "Restarts:resubmit_on_exit", 0);
+  char resubmit_command[PARSER_MAX_LINE_SIZE] = "";
+  if (resubmit_after_max_hours)
+    parser_get_param_string(params, "Restarts:resubmit_command",
+                            resubmit_command);
+
   /* If restarting, look for the restart files. */
   if (restart) {
 
@@ -1025,7 +1038,7 @@ int main(int argc, char *argv[]) {
 
   /* Main simulation loop */
   /* ==================== */
-  int force_stop = 0;
+  int force_stop = 0, resubmit = 0;
   for (int j = 0; !engine_is_done(&e) && e.step - 1 != nsteps && !force_stop;
        j++) {
 
@@ -1046,6 +1059,12 @@ int main(int argc, char *argv[]) {
         message("Forcing application exit, dumping restart files...");
     }
 
+    /* Did we exceed the maximal runtime? */
+    if (clocks_get_hours_since_start() > restart_max_hours_runtime) {
+      force_stop = 1;
+      if (resubmit_after_max_hours) resubmit = 1;
+    }
+
     /* Also if using nsteps to exit, will not have saved any restarts on exit,
      * make sure we do that (useful in testing only). */
     if (force_stop || (e.restart_onexit && e.step - 1 == nsteps))
@@ -1195,17 +1214,19 @@ int main(int argc, char *argv[]) {
   }
 
   /* Write final output. */
-  engine_drift_all(&e);
-  engine_print_stats(&e);
-  engine_dump_snapshot(&e);
+  if (!force_stop) {
+    engine_drift_all(&e);
+    engine_print_stats(&e);
+    engine_dump_snapshot(&e);
 
 #ifdef HAVE_VELOCIRAPTOR
-  /* Call VELOCIraptor at the end of the run to find groups. */
-  if (e.policy & engine_policy_structure_finding) {
-    velociraptor_init(&e);
-    velociraptor_invoke(&e);
-  }
+    /* Call VELOCIraptor at the end of the run to find groups. */
+    if (e.policy & engine_policy_structure_finding) {
+      velociraptor_init(&e);
+      velociraptor_invoke(&e);
+    }
 #endif
+  }
 
 #ifdef WITH_MPI
   if ((res = MPI_Finalize()) != MPI_SUCCESS)
@@ -1216,6 +1237,9 @@ int main(int argc, char *argv[]) {
    * stop file if normal exit happened first. */
   if (myrank == 0) force_stop = restart_stop_now(restart_dir, 1);
 
+  /* Did we want to run a re-submission command just before dying? */
+  if (myrank == 0 && resubmit) restart_resubmit(resubmit_command);
+
   /* Clean everything */
   if (with_verbose_timers) timers_close_file();
   if (with_cosmology) cosmology_clean(&cosmo);
diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml
index f86abe054e..218a3cb2b8 100644
--- a/examples/parameter_example.yml
+++ b/examples/parameter_example.yml
@@ -114,13 +114,15 @@ InitialConditions:
 
 # Parameters controlling restarts
 Restarts:
-  enable:      1        # (Optional) whether to enable dumping restarts at fixed intervals.
-  save:        1        # (Optional) whether to save copies of the previous set of restart files (named .prev)
-  onexit:      0        # (Optional) whether to dump restarts on exit (*needs enable*)
-  subdir:      restart  # (Optional) name of subdirectory for restart files.
-  basename:    swift    # (Optional) prefix used in naming restart files.
-  delta_hours: 6.0      # (Optional) decimal hours between dumps of restart files.
-  stop_steps:  100      # (Optional) how many steps to process before checking if the <subdir>/stop file exists. When present the application will attempt to exit early, dumping restart files first.
+  enable:             1        # (Optional) whether to enable dumping restarts at fixed intervals.
+  save:               1        # (Optional) whether to save copies of the previous set of restart files (named .prev)
+  onexit:             0        # (Optional) whether to dump restarts on exit (*needs enable*)
+  subdir:             restart  # (Optional) name of subdirectory for restart files.
+  basename:           swift    # (Optional) prefix used in naming restart files.
+  delta_hours:        6.0      # (Optional) decimal hours between dumps of restart files.
+  stop_steps:         100      # (Optional) how many steps to process before checking if the <subdir>/stop file exists. When present the application will attempt to exit early, dumping restart files first.
+  max_run_time:       24.0     # (Optional) Maximal wall-clock time in hours. The application will exit when this limit is reached.
+  resubmit_on_exit:   0        # (Optional) whether to run a command when exiting after the time limit has been reached.
 
 # Parameters governing domain decomposition
 DomainDecomposition:
diff --git a/src/clocks.c b/src/clocks.c
index c64276bf83..49297f5db1 100644
--- a/src/clocks.c
+++ b/src/clocks.c
@@ -263,6 +263,17 @@ const char *clocks_get_timesincestart(void) {
   return buffer;
 }
 
+/**
+ * @brief Get the wall-clock time elapsed since the start of execution.
+ *
+ * clocks_set_cpufreq() must have been called to mark the start of execution.
+ * @return The time since the start of the execution, in hours.
+ */
+double clocks_get_hours_since_start(void) {
+  const double elapsed = clocks_diff_ticks(getticks(), clocks_start);
+  return elapsed / (3600. * 1000.0);
+}
+
 /**
  * @brief return the cpu time used.
  *
diff --git a/src/clocks.h b/src/clocks.h
index d33e5a342a..3800938e3e 100644
--- a/src/clocks.h
+++ b/src/clocks.h
@@ -42,6 +42,7 @@ double clocks_from_ticks(ticks tics);
 ticks clocks_to_ticks(double interval);
 double clocks_diff_ticks(ticks tic, ticks toc);
 const char *clocks_get_timesincestart(void);
+double clocks_get_hours_since_start(void);
 
 double clocks_get_cputime_used(void);
 int clocks_random_seed(void);
diff --git a/src/restart.c b/src/restart.c
index c412c8477d..54a098413d 100644
--- a/src/restart.c
+++ b/src/restart.c
@@ -334,3 +334,17 @@ void restart_remove_previous(const char *filename) {
     }
   }
 }
+
+/**
+ * @brief Run a given command, usually to resubmit a job.
+ *
+ * The command is handed unchecked to the system's shell. A NULL or empty
+ * command is skipped, as system(NULL) only probes for a shell.
+ *
+ * @param command The command to run in the system's shell.
+ */
+void restart_resubmit(const char *command) {
+  if (command == NULL || command[0] == '\0') return;
+  const int result = system(command);
+  if (result != 0) message("Command returned error code %d", result);
+}
diff --git a/src/restart.h b/src/restart.h
index 49d1274922..b938020165 100644
--- a/src/restart.h
+++ b/src/restart.h
@@ -41,4 +41,6 @@ int restart_stop_now(const char *dir, int cleanup);
 void restart_save_previous(const char *filename);
 void restart_remove_previous(const char *filename);
 
+void restart_resubmit(const char *command);
+
 #endif /* SWIFT_RESTART_H */
-- 
GitLab