From 9f5b3fea65b09922e5d58c9e66f0e2b92a532c8b Mon Sep 17 00:00:00 2001
From: Matthieu Schaller <schaller@strw.leidenuniv.nl>
Date: Fri, 29 Jul 2022 14:58:46 +0000
Subject: [PATCH] Re-order the operations around i/o to cope with stop files

---
 src/engine.c        | 32 ++++++++++++++-------
 src/engine.h        | 30 +++++++++++++++----
 src/engine_config.c | 21 ++++++++++++--
 src/engine_drift.c  |  8 ++----
 src/engine_io.c     | 25 ++++++++++++++--
 swift.c             | 70 ++++++++++++++-------------------------------
 swift_fof.c         |  3 +-
 7 files changed, 113 insertions(+), 76 deletions(-)

diff --git a/src/engine.c b/src/engine.c
index c75e95cd18..8b47234e55 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -2113,8 +2113,9 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs,
  * @brief Let the #engine loose to compute the forces.
  *
  * @param e The #engine.
+ * @return Should the run stop after this step?
  */
-void engine_step(struct engine *e) {
+int engine_step(struct engine *e) {
 
   TIMER_TIC2;
 
@@ -2182,8 +2183,11 @@ void engine_step(struct engine *e) {
               clocks_from_ticks(getticks() - tic_files), clocks_getunit());
   }
 
-  /* We need some cells to exist but not the whole task stuff. */
+  /* When restarting, we may have had some i/o to do on the step
+   * where we decided to stop. We have to do this now.
+   * We need some cells to exist but not the whole task stuff. */
   if (e->restarting) space_rebuild(e->s, 0, e->verbose);
+  if (e->restarting) engine_io(e);
 
   /* Move forward in time */
   e->ti_old = e->ti_current;
@@ -2254,6 +2258,7 @@ void engine_step(struct engine *e) {
     }
   }
 
+  /* Trigger a rebuild if we reached a gravity mesh step? */
   if ((e->policy & engine_policy_self_gravity) && e->s->periodic &&
       e->mesh->ti_end_mesh_next == e->ti_current)
     e->forcerebuild = 1;
@@ -2303,15 +2308,13 @@ void engine_step(struct engine *e) {
   /* Prepare the tasks to be launched, rebuild or repartition if needed. */
   const int drifted_all = engine_prepare(e);
 
+  /* Dump local cells and active particle counts. */
+  // dumpCells("cells", 1, 0, 0, 0, e->s, e->nodeID, e->step);
+
 #ifdef SWIFT_DEBUG_CHECKS
   /* Print the number of active tasks */
   if (e->verbose) engine_print_task_counts(e);
-#endif
 
-    /* Dump local cells and active particle counts. */
-    // dumpCells("cells", 1, 0, 0, 0, e->s, e->nodeID, e->step);
-
-#ifdef SWIFT_DEBUG_CHECKS
   /* Check that we have the correct total mass in the top-level multipoles */
   long long num_gpart_mpole = 0;
   if (e->policy & engine_policy_self_gravity) {
@@ -2360,7 +2363,7 @@ void engine_step(struct engine *e) {
   }
 #endif
 
-  /* Re-compute the mesh forces */
+  /* Re-compute the mesh forces? */
   if ((e->policy & engine_policy_self_gravity) && e->s->periodic &&
       e->mesh->ti_end_mesh_next == e->ti_current) {
 
@@ -2509,11 +2512,16 @@ void engine_step(struct engine *e) {
 #endif
 
   /* Create a restart file if needed. */
-  engine_dump_restarts(e, 0, e->restart_onexit && engine_is_done(e));
+  const int force_stop =
+      engine_dump_restarts(e, 0, e->restart_onexit && engine_is_done(e));
 
-  engine_check_for_dumps(e);
+  /* Is there any form of i/o this step?
+   *
+   * Note that if the run was forced to stop, we do not dump,
+   * we will do so when the run is restarted*/
+  if (!force_stop) engine_io(e);
 
-#if defined(SWIFT_RT_DEBUG_CHECKS)
+#ifdef SWIFT_RT_DEBUG_CHECKS
   /* if we're running the debug RT scheme, do some checks after every step.
    * Do this after the output so we can safely reset debugging checks now. */
   if (e->policy & engine_policy_rt)
@@ -2527,6 +2535,8 @@ void engine_step(struct engine *e) {
 
   /* Time in ticks at the end of this step. */
   e->toc_step = getticks();
+
+  return force_stop;
 }
 
 /**
diff --git a/src/engine.h b/src/engine.h
index ba1498df6a..c35e88e1c6 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -560,9 +560,29 @@ struct engine {
   /* Do we free the foreign data before rebuilding the tree? */
   int free_foreign_when_rebuilding;
 
+  /* Name of the restart file directory. */
+  const char *restart_dir;
+
   /* Name of the restart file. */
   const char *restart_file;
 
+  /* Flag whether we should resubmit on this step? */
+  int resubmit;
+
+  /* Do we want to run the resubmission command once a run reaches the time
+   * limit? */
+  int resubmit_after_max_hours;
+
+  /* What command should we run to resubmit at the end? */
+  char resubmit_command[PARSER_MAX_LINE_SIZE];
+
+  /* How often to check for the stop file and dump restarts and exit the
+   * application. */
+  int restart_stop_steps;
+
+  /* Get the maximal wall-clock time of this run */
+  float restart_max_hours_runtime;
+
   /* Ticks between restart dumps. */
   ticks restart_dt;
 
@@ -639,7 +659,7 @@ void engine_drift_top_multipoles(struct engine *e);
 void engine_reconstruct_multipoles(struct engine *e);
 void engine_allocate_foreign_particles(struct engine *e, const int fof);
 void engine_print_stats(struct engine *e);
-void engine_check_for_dumps(struct engine *e);
+void engine_io(struct engine *e);
 void engine_collect_end_of_step(struct engine *e, int apply);
 void engine_dump_snapshot(struct engine *e);
 void engine_run_on_dump(struct engine *e);
@@ -672,13 +692,13 @@ void engine_init(
 void engine_config(int restart, int fof, struct engine *e,
                    struct swift_params *params, int nr_nodes, int nodeID,
                    int nr_task_threads, int nr_pool_threads, int with_aff,
-                   int verbose, const char *restart_file,
-                   struct repartition *reparttype);
+                   int verbose, const char *restart_dir,
+                   const char *restart_file, struct repartition *reparttype);
 void engine_launch(struct engine *e, const char *call);
 int engine_prepare(struct engine *e);
 void engine_init_particles(struct engine *e, int flag_entropy_ICs,
                            int clean_h_values);
-void engine_step(struct engine *e);
+int engine_step(struct engine *e);
 void engine_split(struct engine *e, struct partition *initial_partition);
 void engine_exchange_strays(struct engine *e, const size_t offset_parts,
                             const int *ind_part, size_t *Npart,
@@ -724,6 +744,6 @@ void engine_numa_policies(int rank, int verbose);
 /* Struct dump/restore support. */
 void engine_struct_dump(struct engine *e, FILE *stream);
 void engine_struct_restore(struct engine *e, FILE *stream);
-void engine_dump_restarts(struct engine *e, int drifted_all, int final_step);
+int engine_dump_restarts(struct engine *e, int drifted_all, int force);
 
 #endif /* SWIFT_ENGINE_H */
diff --git a/src/engine_config.c b/src/engine_config.c
index e7e0fc3e5b..74dd9b6717 100644
--- a/src/engine_config.c
+++ b/src/engine_config.c
@@ -162,8 +162,8 @@ static void engine_dumper_init(struct engine *e) {
 void engine_config(int restart, int fof, struct engine *e,
                    struct swift_params *params, int nr_nodes, int nodeID,
                    int nr_task_threads, int nr_pool_threads, int with_aff,
-                   int verbose, const char *restart_file,
-                   struct repartition *reparttype) {
+                   int verbose, const char *restart_dir,
+                   const char *restart_file, struct repartition *reparttype) {
 
   struct clocks_time tic, toc;
   if (nodeID == 0) clocks_gettime(&tic);
@@ -187,7 +187,9 @@ void engine_config(int restart, int fof, struct engine *e,
   e->verbose = verbose;
   e->wallclock_time = 0.f;
   e->restart_dump = 0;
+  e->restart_dir = restart_dir;
   e->restart_file = restart_file;
+  e->resubmit = 0;
   e->restart_next = 0;
   e->restart_dt = 0;
   e->run_fof = 0;
@@ -215,6 +217,21 @@ void engine_config(int restart, int fof, struct engine *e,
   /* Welcome message */
   if (e->nodeID == 0) message("Running simulation '%s'.", e->run_name);
 
+  /* Check-pointing properties */
+
+  e->restart_stop_steps =
+      parser_get_opt_param_int(params, "Restarts:stop_steps", 100);
+
+  e->restart_max_hours_runtime =
+      parser_get_opt_param_float(params, "Restarts:max_run_time", FLT_MAX);
+
+  e->resubmit_after_max_hours =
+      parser_get_opt_param_int(params, "Restarts:resubmit_on_exit", 0);
+
+  if (e->resubmit_after_max_hours)
+    parser_get_param_string(params, "Restarts:resubmit_command",
+                            e->resubmit_command);
+
   /* Get the number of queues */
   int nr_queues =
       parser_get_opt_param_int(params, "Scheduler:nr_queues", e->nr_threads);
diff --git a/src/engine_drift.c b/src/engine_drift.c
index 7deb558489..6df3230325 100644
--- a/src/engine_drift.c
+++ b/src/engine_drift.c
@@ -324,16 +324,14 @@ void engine_drift_all(struct engine *e, const int drift_mpoles) {
 
   const ticks tic = getticks();
 
-#ifdef SWIFT_DEBUG_CHECKS
-  if (e->nodeID == 0) {
+  if (e->nodeID == 0 && e->verbose) {
     if (e->policy & engine_policy_cosmology)
-      message("Drifting all to a=%e",
+      message("Drifting all to a=%15.12e",
               exp(e->ti_current * e->time_base) * e->cosmology->a_begin);
     else
-      message("Drifting all to t=%e",
+      message("Drifting all to t=%15.12e",
               e->ti_current * e->time_base + e->time_begin);
   }
-#endif
 
 #ifdef WITH_LIGHTCONE
   /* Determine which periodic replications could contribute to the lightcone
diff --git a/src/engine_io.c b/src/engine_io.c
index 90b74e97e8..94350c0e3e 100644
--- a/src/engine_io.c
+++ b/src/engine_io.c
@@ -50,20 +50,33 @@
  * @param e the engine.
  * @param drifted_all true if a drift_all has just been performed.
  * @param force force a dump, if dumping is enabled.
+ * @return Do we want to stop the run altogether?
  */
-void engine_dump_restarts(struct engine *e, int drifted_all, int force) {
+int engine_dump_restarts(struct engine *e, const int drifted_all,
+                         const int force) {
+
+  /* Are any of the conditions to fully stop a run met? */
+  const int end_run_time = e->runtime > e->restart_max_hours_runtime;
+  const int stop_file = (e->step % e->restart_stop_steps == 0 &&
+                         restart_stop_now(e->restart_dir, 0));
+
+  /* Exit run when told to */
+  const int exit_run = (end_run_time || stop_file);
 
   if (e->restart_dump) {
     ticks tic = getticks();
 
+    const int check_point_time = tic > e->restart_next;
+
     /* Dump when the time has arrived, or we are told to. */
-    int dump = ((tic > e->restart_next) || force);
+    int dump = (check_point_time || end_run_time || force || stop_file);
 
 #ifdef WITH_MPI
     /* Synchronize this action from rank 0 (ticks may differ between
      * machines). */
     MPI_Bcast(&dump, 1, MPI_INT, 0, MPI_COMM_WORLD);
 #endif
+
     if (dump) {
 
       if (e->nodeID == 0) {
@@ -123,6 +136,12 @@ void engine_dump_restarts(struct engine *e, int drifted_all, int force) {
       e->step_props |= engine_step_prop_restarts;
     }
   }
+
+  /* If we stopped by reaching the time limit, flag that we need to
+   * run the resubmission command */
+  if (end_run_time && e->resubmit_after_max_hours) e->resubmit = 1;
+
+  return exit_run;
 }
 
 /**
@@ -249,7 +268,7 @@ void engine_run_on_dump(struct engine *e) {
  *
  * @param e The #engine.
  */
-void engine_check_for_dumps(struct engine *e) {
+void engine_io(struct engine *e) {
   const int with_cosmology = (e->policy & engine_policy_cosmology);
   const int with_stf = (e->policy & engine_policy_structure_finding);
   const int with_los = (e->policy & engine_policy_line_of_sight);
diff --git a/swift.c b/swift.c
index 26eccd8981..491877b0f4 100644
--- a/swift.c
+++ b/swift.c
@@ -879,25 +879,6 @@ int main(int argc, char *argv[]) {
   parser_get_opt_param_string(params, "Restarts:basename", restart_name,
                               "swift");
 
-  /* How often to check for the stop file and dump restarts and exit the
-   * application. */
-  const int restart_stop_steps =
-      parser_get_opt_param_int(params, "Restarts:stop_steps", 100);
-
-  /* Get the maximal wall-clock time of this run */
-  const float restart_max_hours_runtime =
-      parser_get_opt_param_float(params, "Restarts:max_run_time", FLT_MAX);
-
-  /* Do we want to resubmit when we hit the limit? */
-  const int resubmit_after_max_hours =
-      parser_get_opt_param_int(params, "Restarts:resubmit_on_exit", 0);
-
-  /* What command should we run to resubmit at the end? */
-  char resubmit_command[PARSER_MAX_LINE_SIZE];
-  if (resubmit_after_max_hours)
-    parser_get_param_string(params, "Restarts:resubmit_command",
-                            resubmit_command);
-
   /* If restarting, look for the restart files. */
   if (restart) {
 
@@ -987,13 +968,19 @@ int main(int argc, char *argv[]) {
 
     /* And initialize the engine with the space and policies. */
     engine_config(/*restart=*/1, /*fof=*/0, &e, params, nr_nodes, myrank,
-                  nr_threads, nr_pool_threads, with_aff, talking, restart_file,
-                  &reparttype);
+                  nr_threads, nr_pool_threads, with_aff, talking, restart_dir,
+                  restart_file, &reparttype);
 
     /* Check if we are already done when given steps on the command-line. */
     if (e.step >= nsteps && nsteps > 0)
       error("Not restarting, already completed %d steps", e.step);
 
+    /* If we are restarting at the very end of a run, just build the tree and
+     * prepare to dump.
+     * The main simulation loop below (where rebuild normally happens) won't be
+     * executed. */
+    if (engine_is_done(&e)) space_rebuild(e.s, /*repartitioned=*/0, e.verbose);
+
   } else {
 
     /* Prepare and verify the selection of outputs */
@@ -1509,8 +1496,8 @@ int main(int argc, char *argv[]) {
                 &chemistry, &extra_io_props, &fof_properties, &los_properties,
                 &lightcone_array_properties, &ics_metadata);
     engine_config(/*restart=*/0, /*fof=*/0, &e, params, nr_nodes, myrank,
-                  nr_threads, nr_pool_threads, with_aff, talking, restart_file,
-                  &reparttype);
+                  nr_threads, nr_pool_threads, with_aff, talking, restart_dir,
+                  restart_file, &reparttype);
 
     /* Compute some stats for the star formation */
     if (with_star_formation) {
@@ -1603,7 +1590,7 @@ int main(int argc, char *argv[]) {
     if (!e.output_list_stats) engine_print_stats(&e);
 
     /* Is there a dump before the end of the first time-step? */
-    engine_check_for_dumps(&e);
+    engine_io(&e);
   }
 
   /* Legend */
@@ -1675,7 +1662,7 @@ int main(int argc, char *argv[]) {
 
   /* Main simulation loop */
   /* ==================== */
-  int force_stop = 0, resubmit = 0;
+  int force_stop = 0;
   for (int j = 0; !engine_is_done(&e) && e.step - 1 != nsteps && !force_stop;
        j++) {
 
@@ -1683,30 +1670,15 @@ int main(int argc, char *argv[]) {
     timers_reset_all();
 
     /* Take a step. */
-    engine_step(&e);
+    force_stop = engine_step(&e);
 
     /* Print the timers. */
     if (with_verbose_timers) timers_print(e.step);
 
-    /* Every so often allow the user to stop the application and dump the
-     * restart files. */
-    if (j % restart_stop_steps == 0) {
-      force_stop = restart_stop_now(restart_dir, 0);
-      if (myrank == 0 && force_stop)
-        message("Forcing application exit, dumping restart files...");
-    }
-
-    /* Did we exceed the maximal runtime? */
-    if (e.runtime > restart_max_hours_runtime) {
-      force_stop = 1;
-      message("Runtime limit reached, dumping restart files...");
-      if (resubmit_after_max_hours) resubmit = 1;
-    }
-
-    /* Also if using nsteps to exit, will not have saved any restarts on exit,
-     * make sure we do that (useful in testing only). */
-    if (force_stop || (e.restart_onexit && e.step - 1 == nsteps))
-      engine_dump_restarts(&e, 0, 1);
+    /* Shall we write some check-point files?
+     * Note that this was already done by engine_step() if force_stop is set */
+    if (e.restart_onexit && e.step - 1 == nsteps && !force_stop)
+      engine_dump_restarts(&e, /*drifted=*/0, /*force=*/1);
 
     /* Dump the task data using the given frequency. */
     if (dump_tasks && (dump_tasks == 1 || j % dump_tasks == 1)) {
@@ -1750,7 +1722,7 @@ int main(int argc, char *argv[]) {
                j + 1);
       mpiuse_log_dump(dumpfile, e.tic_step);
     }
-#endif  // WITH_MPI
+#endif
 
 #ifdef SWIFT_DEBUG_THREADPOOL
     /* Dump the task data using the given frequency. */
@@ -1766,7 +1738,7 @@ int main(int argc, char *argv[]) {
     } else {
       threadpool_reset_log(&e.threadpool);
     }
-#endif  // SWIFT_DEBUG_THREADPOOL
+#endif
   }
 
   /* Write final time information */
@@ -1884,9 +1856,9 @@ int main(int argc, char *argv[]) {
   if (myrank == 0) force_stop = restart_stop_now(restart_dir, 1);
 
   /* Did we want to run a re-submission command just before dying? */
-  if (myrank == 0 && resubmit) {
+  if (myrank == 0 && e.resubmit) {
     message("Running the resubmission command:");
-    restart_resubmit(resubmit_command);
+    restart_resubmit(e.resubmit_command);
     fflush(stdout);
     fflush(stderr);
     message("resubmission command completed.");
diff --git a/swift_fof.c b/swift_fof.c
index 3791b78d88..6bbed0fa96 100644
--- a/swift_fof.c
+++ b/swift_fof.c
@@ -664,7 +664,8 @@ int main(int argc, char *argv[]) {
       /*extra_io_props=*/NULL, &fof_properties, /*los_properties=*/NULL,
       /*lightcone_properties=*/NULL, &ics_metadata);
   engine_config(/*restart=*/0, /*fof=*/1, &e, params, nr_nodes, myrank,
-                nr_threads, nr_threads, with_aff, talking, NULL, &reparttype);
+                nr_threads, nr_threads, with_aff, talking, NULL, NULL,
+                &reparttype);
 
   /* Get some info to the user. */
   if (myrank == 0) {
-- 
GitLab