diff --git a/examples/main.c b/examples/main.c index b48b056e39eb43bb60bc164cd36d281bdf775725..c419bd423eca9062ddf1543c904f52fb4684a94b 100644 --- a/examples/main.c +++ b/examples/main.c @@ -181,7 +181,7 @@ int main(int argc, char *argv[]) { int nparams = 0; char *cmdparams[PARSER_MAX_NO_OF_PARAMS]; char paramFileName[200] = ""; - char restartfile[200] = ""; + char restart_file[200] = ""; unsigned long long cpufreq = 0; /* Parse the parameters */ @@ -471,50 +471,50 @@ int main(int argc, char *argv[]) { int flag_entropy_ICs = 0; /* Work out where we will read and write restart files. */ - char restartdir[PARSER_MAX_LINE_SIZE]; - parser_get_opt_param_string(params, "Restarts:subdir", restartdir, "restart"); + char restart_dir[PARSER_MAX_LINE_SIZE]; + parser_get_opt_param_string(params, "Restarts:subdir", restart_dir, "restart"); /* The directory must exist. */ if (myrank == 0) { - if (access(restartdir, W_OK | X_OK) != 0) { + if (access(restart_dir, W_OK | X_OK) != 0) { if (restart) { - error("Cannot restart as no restart subdirectory: %s (%s)", restartdir, + error("Cannot restart as no restart subdirectory: %s (%s)", restart_dir, strerror(errno)); } else { - if (mkdir(restartdir, 0777) != 0) - error("Failed to create restart directory: %s (%s)", restartdir, + if (mkdir(restart_dir, 0777) != 0) + error("Failed to create restart directory: %s (%s)", restart_dir, strerror(errno)); } } } /* Basename for any restart files. */ - char restartname[PARSER_MAX_LINE_SIZE]; - parser_get_opt_param_string(params, "Restarts:basename", restartname, + char restart_name[PARSER_MAX_LINE_SIZE]; + parser_get_opt_param_string(params, "Restarts:basename", restart_name, "swift"); if (restart) { /* Attempting a restart. */ - char **restartfiles = NULL; - int nrestartfiles = 0; + char **restart_files = NULL; + int restart_nfiles = 0; if (myrank == 0) { message("Restarting SWIFT"); /* Locate the restart files. 
*/ - restartfiles = restart_locate(restartdir, restartname, &nrestartfiles); - if (nrestartfiles == 0) - error("Failed to locate any restart files in %s", restartdir); + restart_files = restart_locate(restart_dir, restart_name, &restart_nfiles); + if (restart_nfiles == 0) + error("Failed to locate any restart files in %s", restart_dir); /* We need one file per rank. */ - if (nrestartfiles != nr_nodes) + if (restart_nfiles != nr_nodes) error("Incorrect number of restart files, expected %d found %d", - nr_nodes, nrestartfiles); + nr_nodes, restart_nfiles); if (verbose > 0) - for (int i = 0; i < nrestartfiles; i++) - message("found restart file: %s", restartfiles[i]); + for (int i = 0; i < restart_nfiles; i++) + message("found restart file: %s", restart_files[i]); } #ifdef WITH_MPI @@ -522,34 +522,34 @@ int main(int argc, char *argv[]) { if (myrank == 0) { for (int i = 1; i < nr_nodes; i++) { - strcpy(restartfile, restartfiles[i]); - MPI_Send(restartfile, 200, MPI_BYTE, i, 0, MPI_COMM_WORLD); + strcpy(restart_file, restart_files[i]); + MPI_Send(restart_file, 200, MPI_BYTE, i, 0, MPI_COMM_WORLD); } /* Keep local file. */ - strcpy(restartfile, restartfiles[0]); + strcpy(restart_file, restart_files[0]); /* Finished with the list. */ - restart_locate_free(nrestartfiles, restartfiles); + restart_locate_free(restart_nfiles, restart_files); } else { - MPI_Recv(restartfile, 200, MPI_BYTE, 0, 0, MPI_COMM_WORLD, + MPI_Recv(restart_file, 200, MPI_BYTE, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); } - if (verbose > 1) message("local restart file = %s", restartfile); + if (verbose > 1) message("local restart file = %s", restart_file); #else /* Just one restart file. */ - strcpy(restartfile, restartfiles[0]); + strcpy(restart_file, restart_files[0]); #endif /* Now read it. */ - restart_read(&e, restartfile); + restart_read(&e, restart_file); /* And initialize the engine with the space and policies. 
*/ if (myrank == 0) clocks_gettime(&tic); engine_config(1, &e, nr_nodes, myrank, nr_threads, with_aff, talking, - restartfile); + restart_file); if (myrank == 0) { clocks_gettime(&toc); message("engine_config took %.3f %s.", clocks_diff(&tic, &toc), @@ -559,8 +559,7 @@ int main(int argc, char *argv[]) { /* Check if we are already done when given steps on the command-line. */ if (e.step >= nsteps && nsteps > 0) - error("Not restarting, already completed %d steps (of out %d)", e.step, - nsteps); + error("Not restarting, already completed %d steps", e.step); } else { @@ -759,7 +758,7 @@ int main(int argc, char *argv[]) { &gravity_properties, &potential, &cooling_func, &chemistry, &sourceterms); engine_config(0, &e, nr_nodes, myrank, nr_threads, with_aff, talking, - restartfile); + restart_file); if (myrank == 0) { clocks_gettime(&toc); message("engine_init took %.3f %s.", clocks_diff(&tic, &toc), @@ -833,7 +832,7 @@ int main(int argc, char *argv[]) { if (with_verbose_timers) timers_open_file(myrank); /* Create a name for restart file of this rank. */ - if (restart_genname(restartdir, restartname, e.nodeID, restartfile, 200) != 0) + if (restart_genname(restart_dir, restart_name, e.nodeID, restart_file, 200) != 0) error("Failed to generate restart filename"); /* Main simulation loop */ @@ -848,6 +847,11 @@ int main(int argc, char *argv[]) { /* Print the timers. */ if (with_verbose_timers) timers_print(e.step); + /* If using nsteps to exit, will not have saved any restarts on exit, make + * sure we do that (useful in testing only). */ + if (!engine_is_done(&e) && e.step - 1 == nsteps) + engine_dump_restarts(&e, 0, 1); + #ifdef SWIFT_DEBUG_TASKS /* Dump the task data using the given frequency. 
*/ if (dump_tasks && (dump_tasks == 1 || j % dump_tasks == 1)) { diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml index db6ebd6c63cee47d3858591a83adfb9a617b3f16..1c44ad6308464816f887b737ee6a29f4c20fc42b 100644 --- a/examples/parameter_example.yml +++ b/examples/parameter_example.yml @@ -77,8 +77,10 @@ InitialConditions: # Parameters controlling restarts Restarts: - subdir: restart # (Optional) name of subdirectory for restart files. - basename: swift # (Optional) prefix used in naming restart files. + enable: 1 # (Optional) whether to enable dumping restarts at fixed intervals. + onexit: 0 # (Optional) whether to dump restarts on exit (*needs enable*) + subdir: restart # (Optional) name of subdirectory for restart files. + basename: swift # (Optional) prefix used in naming restart files. delta_hours: 6.0 # (Optional) decimal hours between dumps of restart files. # Parameters governing domain decomposition diff --git a/src/engine.c b/src/engine.c index 561784ae9d452111973bdda2db06c5915ace27ec..0a441deac1cd3f1beeae042bf9f141115e80e4bf 100644 --- a/src/engine.c +++ b/src/engine.c @@ -4504,33 +4504,56 @@ void engine_step(struct engine *e) { clocks_gettime(&time2); e->wallclock_time = (float)clocks_diff(&time1, &time2); - ticks tic = getticks(); #ifdef SWIFT_DEBUG_TASKS /* Time in ticks at the end of this step. */ - e->toc_step = tic; + e->toc_step = getticks(); #endif - /* Final job is to create a restart file if needed. Synchronize all to rank - * 0 step as clocks may differ between machines. */ - int dump = (tic > e->restart_next); + /* Final job is to create a restart file if needed. */ + engine_dump_restarts(e, drifted_all, engine_is_done(e)); +} + +/** + * @brief dump restart files if it is time to do so and dumps are enabled. + * + * @param e the engine. + * @param drifted_all true if a drift_all has just been performed. + * @param final_step set to true if this is the final step. 
+ */ +void engine_dump_restarts(struct engine *e, int drifted_all, int final_step) { + + if (e->restart_dump) { + ticks tic = getticks(); + int dump = (tic > e->restart_next); + + /* If this is the last step, do we want a final update? */ + if (e->restart_onexit && final_step) dump = 1; + #ifdef WITH_MPI - MPI_Bcast(&dump, 1, MPI_INT, 0, MPI_COMM_WORLD); + /* Synchronize this action from rank 0 (ticks may differ between + * machines). */ + MPI_Bcast(&dump, 1, MPI_INT, 0, MPI_COMM_WORLD); #endif - if (dump) { + if (dump) { - /* Drift all particles first (may have just been done). */ - if (!drifted_all) engine_drift_all(e); - restart_write(e, e->restartfile); + /* Drift all particles first (may have just been done). */ + if (!drifted_all) engine_drift_all(e); + restart_write(e, e->restart_file); - if (e->verbose) - message("Dumping restart files took %.3f %s", - clocks_from_ticks(getticks() - tic), clocks_getunit()); + if (e->verbose) + message("Dumping restart files took %.3f %s", + clocks_from_ticks(getticks() - tic), clocks_getunit()); - /* Time after which next dump will occur. */ - e->restart_next += e->restart_dt; + /* Time after which next dump will occur. */ + e->restart_next += e->restart_dt; + + /* Flag that we dumped the restarts */ + e->step_props |= engine_step_prop_restarts; + } } } + /** * @brief Returns 1 if the simulation has reached its end point, 0 otherwise */ @@ -5236,11 +5259,11 @@ void engine_init( * @param nr_threads The number of threads per MPI rank. * @param with_aff use processor affinity, if supported. * @param verbose Is this #engine talkative ? - * @param restartfile The name of our restart file. + * @param restart_file The name of our restart file. */ void engine_config(int restart, struct engine *e, int nr_nodes, int nodeID, int nr_threads, int with_aff, int verbose, - const char *restartfile) { + const char *restart_file) { /* Store the values and initialise global fields. 
*/ e->nodeID = nodeID; @@ -5259,7 +5282,8 @@ void engine_config(int restart, struct engine *e, int nr_nodes, int nodeID, e->file_timesteps = NULL; e->verbose = verbose; e->wallclock_time = 0.f; - e->restartfile = restartfile; + e->restart_dump = 0; + e->restart_file = restart_file; e->restart_next = 0; e->restart_dt = 0; engine_rank = nodeID; @@ -5455,10 +5479,10 @@ void engine_config(int restart, struct engine *e, int nr_nodes, int nodeID, fprintf(e->file_timesteps, "# Step Properties: Rebuild=%d, Redistribute=%d, Repartition=%d, " - "Statistics=%d, Snapshot=%d\n", + "Statistics=%d, Snapshot=%d, Restarts=%d\n", engine_step_prop_rebuild, engine_step_prop_redistribute, engine_step_prop_repartition, engine_step_prop_statistics, - engine_step_prop_snapshot); + engine_step_prop_snapshot, engine_step_prop_restarts); fprintf(e->file_timesteps, "# %6s %14s %14s %12s %12s %12s %16s [%s] %6s\n", "Step", "Time", @@ -5538,12 +5562,27 @@ void engine_config(int restart, struct engine *e, int nr_nodes, int nodeID, /* Find the time of the first output */ engine_compute_next_snapshot_time(e); + /* Whether restarts are enabled. Yes by default. */ + e->restart_dump = parser_get_opt_param_int(e->parameter_file, + "Restarts:enable", 1); + + /* Whether restarts should be dumped on exit. Not by default. */ + e->restart_onexit = parser_get_opt_param_int(e->parameter_file, + "Restarts:onexit", 0); + /* Hours between restart dumps. */ float dhours = parser_get_opt_param_float(e->parameter_file, "Restarts:delta_hours", 6.0); - if (e->verbose) - message("restarts every %f hours", dhours); + if (e->nodeID == 0) { + if(e->restart_dump) + message("Restarts will be dumped every %f hours", dhours); + else + message("WARNING: restarts will not be dumped"); + + if (e->verbose && e->restart_onexit) + message("Restarts will be dumped after the final step"); + } /* Internally we use ticks, so convert into a delta ticks. Assumes we can * convert from ticks into milliseconds. 
*/ @@ -5584,7 +5623,7 @@ void engine_config(int restart, struct engine *e, int nr_nodes, int nodeID, maxtasks = engine_estimate_nr_tasks(e); /* Init the scheduler. */ - scheduler_init(&e->sched, e->s, maxtasks, nr_queues, + scheduler_init(&e->sched, e->s, maxtasks, nr_queues, (e->policy & scheduler_flag_steal), e->nodeID, &e->threadpool); diff --git a/src/engine.h b/src/engine.h index 70768abfd11b38ba227b63e6787d3ea2b8d79b64..4531d2eb10aa78efdb1c6283b7560ba554524d03 100644 --- a/src/engine.h +++ b/src/engine.h @@ -83,7 +83,8 @@ enum engine_step_properties { engine_step_prop_redistribute = (1 << 1), engine_step_prop_repartition = (1 << 2), engine_step_prop_statistics = (1 << 3), - engine_step_prop_snapshot = (1 << 4) + engine_step_prop_snapshot = (1 << 4), + engine_step_prop_restarts = (1 << 5) }; /* Some constants */ @@ -291,8 +292,14 @@ struct engine { * these are reduced together, but may not be required just yet). */ struct collectgroup1 collect_group1; + /* Whether to dump restart files. */ + int restart_dump; + + /* Whether to dump restart files after the last step. */ + int restart_onexit; + /* Name of the restart file. */ - const char *restartfile; + const char *restart_file; /* Ticks between restart dumps. */ ticks restart_dt; @@ -324,7 +331,7 @@ void engine_init( const struct chemistry_data *chemistry, struct sourceterms *sourceterms); void engine_config(int restart, struct engine *e, int nr_nodes, int nodeID, int nr_threads, int with_aff, int verbose, - const char *restartfile); + const char *restart_file); void engine_launch(struct engine *e); void engine_prepare(struct engine *e); void engine_init_particles(struct engine *e, int flag_entropy_ICs, @@ -352,5 +359,6 @@ int engine_estimate_nr_tasks(struct engine *e); /* Struct dump/restore support. 
*/ void engine_struct_dump(struct engine *e, FILE *stream); void engine_struct_restore(struct engine *e, FILE *stream); +void engine_dump_restarts(struct engine *e, int drifted_all, int final_step); #endif /* SWIFT_ENGINE_H */ diff --git a/src/restart.c b/src/restart.c index e284eaea5c64e34284e9eec4814806a5134b6d08..97122a66698c5344c12f8d40e548db7de7c82167 100644 --- a/src/restart.c +++ b/src/restart.c @@ -105,7 +105,10 @@ void restart_locate_free(int nfiles, char **files) { } /** - * @brief Write a restart file for the given engine struct. + * @brief Write a restart file for the state of the given engine struct. + * + * @param e the engine with our state information. + * @param filename name of the file to write the restart data to. */ void restart_write(struct engine *e, const char *filename) { @@ -124,7 +127,10 @@ void restart_write(struct engine *e, const char *filename) { } /** - * @brief Read a restart file to construct a saved engine. + * @brief Read a restart file to construct a saved engine struct state. + * + * @param e the engine to recover from the saved state. + * @param filename name of the file containing the saved state. */ void restart_read(struct engine *e, const char *filename) {