Commit 04c82352 authored by Peter W. Draper
Browse files

Merge branch 'restarts-at-runtime-update' into 'master'

Restarts at runtime update

See merge request !1007
parents b821896f 4fc3782d
...@@ -1296,7 +1296,7 @@ int main(int argc, char *argv[]) { ...@@ -1296,7 +1296,7 @@ int main(int argc, char *argv[]) {
} }
/* Did we exceed the maximal runtime? */ /* Did we exceed the maximal runtime? */
if (clocks_get_hours_since_start() > restart_max_hours_runtime) { if (e.runtime > restart_max_hours_runtime) {
force_stop = 1; force_stop = 1;
message("Runtime limit reached, dumping restart files..."); message("Runtime limit reached, dumping restart files...");
if (resubmit_after_max_hours) resubmit = 1; if (resubmit_after_max_hours) resubmit = 1;
......
...@@ -56,6 +56,7 @@ struct mpicollectgroup1 { ...@@ -56,6 +56,7 @@ struct mpicollectgroup1 {
long long total_nr_tasks; long long total_nr_tasks;
float tasks_per_cell_max; float tasks_per_cell_max;
struct star_formation_history sfh; struct star_formation_history sfh;
float runtime;
}; };
/* Forward declarations. */ /* Forward declarations. */
...@@ -125,6 +126,8 @@ void collectgroup1_apply(const struct collectgroup1 *grp1, struct engine *e) { ...@@ -125,6 +126,8 @@ void collectgroup1_apply(const struct collectgroup1 *grp1, struct engine *e) {
e->tasks_per_cell_max = grp1->tasks_per_cell_max; e->tasks_per_cell_max = grp1->tasks_per_cell_max;
star_formation_logger_add_to_accumulator(&e->sfh, &grp1->sfh); star_formation_logger_add_to_accumulator(&e->sfh, &grp1->sfh);
e->runtime = grp1->runtime;
} }
/** /**
...@@ -174,6 +177,7 @@ void collectgroup1_apply(const struct collectgroup1 *grp1, struct engine *e) { ...@@ -174,6 +177,7 @@ void collectgroup1_apply(const struct collectgroup1 *grp1, struct engine *e) {
* @param total_nr_tasks total number of tasks on rank. * @param total_nr_tasks total number of tasks on rank.
* @param tasks_per_cell the used number of tasks per cell. * @param tasks_per_cell the used number of tasks per cell.
* @param sfh The star formation history logger * @param sfh The star formation history logger
* @param runtime The runtime of rank in hours.
*/ */
void collectgroup1_init( void collectgroup1_init(
struct collectgroup1 *grp1, size_t updated, size_t g_updated, struct collectgroup1 *grp1, size_t updated, size_t g_updated,
...@@ -186,7 +190,7 @@ void collectgroup1_init( ...@@ -186,7 +190,7 @@ void collectgroup1_init(
integertime_t ti_black_holes_end_min, integertime_t ti_black_holes_end_max, integertime_t ti_black_holes_end_min, integertime_t ti_black_holes_end_max,
integertime_t ti_black_holes_beg_max, int forcerebuild, integertime_t ti_black_holes_beg_max, int forcerebuild,
long long total_nr_cells, long long total_nr_tasks, float tasks_per_cell, long long total_nr_cells, long long total_nr_tasks, float tasks_per_cell,
const struct star_formation_history sfh) { const struct star_formation_history sfh, float runtime) {
grp1->updated = updated; grp1->updated = updated;
grp1->g_updated = g_updated; grp1->g_updated = g_updated;
...@@ -213,6 +217,7 @@ void collectgroup1_init( ...@@ -213,6 +217,7 @@ void collectgroup1_init(
grp1->total_nr_tasks = total_nr_tasks; grp1->total_nr_tasks = total_nr_tasks;
grp1->tasks_per_cell_max = tasks_per_cell; grp1->tasks_per_cell_max = tasks_per_cell;
grp1->sfh = sfh; grp1->sfh = sfh;
grp1->runtime = runtime;
} }
/** /**
...@@ -254,6 +259,7 @@ void collectgroup1_reduce(struct collectgroup1 *grp1) { ...@@ -254,6 +259,7 @@ void collectgroup1_reduce(struct collectgroup1 *grp1) {
mpigrp11.total_nr_tasks = grp1->total_nr_tasks; mpigrp11.total_nr_tasks = grp1->total_nr_tasks;
mpigrp11.tasks_per_cell_max = grp1->tasks_per_cell_max; mpigrp11.tasks_per_cell_max = grp1->tasks_per_cell_max;
mpigrp11.sfh = grp1->sfh; mpigrp11.sfh = grp1->sfh;
mpigrp11.runtime = grp1->runtime;
struct mpicollectgroup1 mpigrp12; struct mpicollectgroup1 mpigrp12;
if (MPI_Allreduce(&mpigrp11, &mpigrp12, 1, mpicollectgroup1_type, if (MPI_Allreduce(&mpigrp11, &mpigrp12, 1, mpicollectgroup1_type,
...@@ -286,6 +292,7 @@ void collectgroup1_reduce(struct collectgroup1 *grp1) { ...@@ -286,6 +292,7 @@ void collectgroup1_reduce(struct collectgroup1 *grp1) {
grp1->total_nr_tasks = mpigrp12.total_nr_tasks; grp1->total_nr_tasks = mpigrp12.total_nr_tasks;
grp1->tasks_per_cell_max = mpigrp12.tasks_per_cell_max; grp1->tasks_per_cell_max = mpigrp12.tasks_per_cell_max;
grp1->sfh = mpigrp12.sfh; grp1->sfh = mpigrp12.sfh;
grp1->runtime = mpigrp12.runtime;
#endif #endif
} }
...@@ -357,6 +364,9 @@ static void doreduce1(struct mpicollectgroup1 *mpigrp11, ...@@ -357,6 +364,9 @@ static void doreduce1(struct mpicollectgroup1 *mpigrp11,
/* Star formation history */ /* Star formation history */
star_formation_logger_add(&mpigrp11->sfh, &mpigrp12->sfh); star_formation_logger_add(&mpigrp11->sfh, &mpigrp12->sfh);
/* Use the maximum runtime as the global runtime. */
mpigrp11->runtime = max(mpigrp11->runtime, mpigrp12->runtime);
} }
/** /**
......
...@@ -60,6 +60,9 @@ struct collectgroup1 { ...@@ -60,6 +60,9 @@ struct collectgroup1 {
/* Maximum value of actual tasks per cell across all ranks. */ /* Maximum value of actual tasks per cell across all ranks. */
float tasks_per_cell_max; float tasks_per_cell_max;
/* Global runtime of application in hours. */
float runtime;
}; };
void collectgroup_init(void); void collectgroup_init(void);
...@@ -75,7 +78,7 @@ void collectgroup1_init( ...@@ -75,7 +78,7 @@ void collectgroup1_init(
integertime_t ti_black_holes_end_min, integertime_t ti_black_holes_end_max, integertime_t ti_black_holes_end_min, integertime_t ti_black_holes_end_max,
integertime_t ti_black_holes_beg_max, int forcerebuild, integertime_t ti_black_holes_beg_max, int forcerebuild,
long long total_nr_cells, long long total_nr_tasks, float tasks_per_cell, long long total_nr_cells, long long total_nr_tasks, float tasks_per_cell,
const struct star_formation_history sfh); const struct star_formation_history sfh, float runtime);
void collectgroup1_reduce(struct collectgroup1 *grp1); void collectgroup1_reduce(struct collectgroup1 *grp1);
#ifdef WITH_MPI #ifdef WITH_MPI
void mpicollect_free_MPI_type(void); void mpicollect_free_MPI_type(void);
......
...@@ -476,6 +476,9 @@ struct engine { ...@@ -476,6 +476,9 @@ struct engine {
/* Maximum number of tasks needed for restarting. */ /* Maximum number of tasks needed for restarting. */
int restart_max_tasks; int restart_max_tasks;
/* The globally agreed runtime, in hours. */
float runtime;
/* Label of the run */ /* Label of the run */
char run_name[PARSER_MAX_LINE_SIZE]; char run_name[PARSER_MAX_LINE_SIZE];
......
...@@ -44,6 +44,7 @@ struct end_of_step_data { ...@@ -44,6 +44,7 @@ struct end_of_step_data {
ti_black_holes_beg_max; ti_black_holes_beg_max;
struct engine *e; struct engine *e;
struct star_formation_history sfh; struct star_formation_history sfh;
float runtime;
}; };
/** /**
...@@ -455,6 +456,9 @@ void engine_collect_end_of_step(struct engine *e, int apply) { ...@@ -455,6 +456,9 @@ void engine_collect_end_of_step(struct engine *e, int apply) {
data.ti_black_holes_end_max = 0, data.ti_black_holes_beg_max = 0; data.ti_black_holes_end_max = 0, data.ti_black_holes_beg_max = 0;
data.e = e; data.e = e;
/* Need to use a consistent check of the hours since we started. */
data.runtime = clocks_get_hours_since_start();
/* Initialize the total SFH of the simulation to zero */ /* Initialize the total SFH of the simulation to zero */
star_formation_logger_init(&data.sfh); star_formation_logger_init(&data.sfh);
...@@ -480,7 +484,8 @@ void engine_collect_end_of_step(struct engine *e, int apply) { ...@@ -480,7 +484,8 @@ void engine_collect_end_of_step(struct engine *e, int apply) {
data.ti_stars_beg_max, data.ti_black_holes_end_min, data.ti_stars_beg_max, data.ti_black_holes_end_min,
data.ti_black_holes_end_max, data.ti_black_holes_beg_max, e->forcerebuild, data.ti_black_holes_end_max, data.ti_black_holes_beg_max, e->forcerebuild,
e->s->tot_cells, e->sched.nr_tasks, e->s->tot_cells, e->sched.nr_tasks,
(float)e->sched.nr_tasks / (float)e->s->tot_cells, data.sfh); (float)e->sched.nr_tasks / (float)e->s->tot_cells, data.sfh,
data.runtime);
/* Aggregate collective data from the different nodes for this step. */ /* Aggregate collective data from the different nodes for this step. */
#ifdef WITH_MPI #ifdef WITH_MPI
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment