diff --git a/README b/README index 8587a13b3583b9e1ccb1612b0a1abf905b7a2218..6a850c5cd4a88a3588497c9637335b9967f73efd 100644 --- a/README +++ b/README @@ -90,5 +90,8 @@ Parameters: are dumped. -Y, --threadpool-dumps=<int> Time-step frequency at which threadpool tasks are dumped. + --dump-tasks-threshold=<flt> Fraction of the total step's time spent + in a task to trigger a dump of the task plot + on this step See the file examples/parameter_example.yml for an example of parameter file. diff --git a/README.md b/README.md index 7223f4719ab99fc01c103c6813ca319f89424a49..bae021843e63967c3134d419ec28892a9e177b34 100644 --- a/README.md +++ b/README.md @@ -178,5 +178,8 @@ Parameters: are dumped. -Y, --threadpool-dumps=<int> Time-step frequency at which threadpool tasks are dumped. + --dump-tasks-threshold=<flt> Fraction of the total step's time spent + in a task to trigger a dump of the task plot + on this step See the file examples/parameter_example.yml for an example of parameter file. diff --git a/doc/RTD/source/CommandLineOptions/index.rst b/doc/RTD/source/CommandLineOptions/index.rst index 2de6c6bb254aadc940db35de6806fc7785152f3e..bef995382041397f4eafa9924b0cf72bc222dc74 100644 --- a/doc/RTD/source/CommandLineOptions/index.rst +++ b/doc/RTD/source/CommandLineOptions/index.rst @@ -87,3 +87,6 @@ can be found by typing ``./swift -h``: are dumped. -Y, --threadpool-dumps=<int> Time-step frequency at which threadpool tasks are dumped. 
+ --dump-tasks-threshold=<flt> Fraction of the total step's time spent + in a task to trigger a dump of the task plot + on this step diff --git a/examples/main.c b/examples/main.c index b9a59c97e14fafc839dd3fb4be4e719394175823..a6d3ab1a3823a0754561cd6cc4b71e8c2c9e7fba 100644 --- a/examples/main.c +++ b/examples/main.c @@ -183,6 +183,7 @@ int main(int argc, char *argv[]) { char *param_filename = NULL; char restart_file[200] = ""; unsigned long long cpufreq = 0; + float dump_tasks_threshold = 0.f; struct cmdparams cmdps; cmdps.nparam = 0; cmdps.param[0] = NULL; @@ -304,6 +305,10 @@ int main(int argc, char *argv[]) { OPT_INTEGER('Y', "threadpool-dumps", &dump_threadpool, "Time-step frequency at which threadpool tasks are dumped.", NULL, 0, 0), + OPT_FLOAT(0, "dump-tasks-threshold", &dump_tasks_threshold, + "Fraction of the total step's time spent in a task to trigger " + "a dump of the task plot on this step", + NULL, 0, 0), OPT_END(), }; struct argparse argparse; @@ -401,6 +406,20 @@ int main(int argc, char *argv[]) { } #endif + if (dump_tasks_threshold > 0.f) { +#ifndef SWIFT_DEBUG_TASKS + if (myrank == 0) { + error( + "Error: Dumping task plot data above a fixed time threshold is only " + "valid when the code is configured with --enable-task-debugging."); + } +#endif +#ifdef WITH_MPI + if (nr_nodes > 1) + error("Cannot dump tasks above a time threshold over MPI (yet)."); +#endif + } + #ifndef SWIFT_CELL_GRAPH if (dump_cells) { if (myrank == 0) { @@ -1409,13 +1428,14 @@ int main(int argc, char *argv[]) { /* Dump the task data using the given frequency. */ if (dump_tasks && (dump_tasks == 1 || j % dump_tasks == 1)) { #ifdef SWIFT_DEBUG_TASKS - task_dump_all(&e, j + 1); + if (dump_tasks_threshold == 0.) task_dump_all(&e, j + 1); #endif /* Generate the task statistics. 
*/ char dumpfile[40]; snprintf(dumpfile, 40, "thread_stats-step%d.dat", e.step + 1); - task_dump_stats(dumpfile, &e, /* header = */ 0, /* allranks = */ 1); + task_dump_stats(dumpfile, &e, dump_tasks_threshold, + /* header = */ 0, /* allranks = */ 1); } #ifdef SWIFT_CELL_GRAPH diff --git a/src/engine.c b/src/engine.c index 47ad8dfec23bea0578e4a6d20c06666a3f982d4b..98c3cb433c56d3b15ee3ece1dfb8a4ea9852a842 100644 --- a/src/engine.c +++ b/src/engine.c @@ -199,8 +199,9 @@ void engine_repartition(struct engine *e) { /* Generate the fixed costs include file. */ if (e->step > 3 && e->reparttype->trigger <= 1.f) { - task_dump_stats("partition_fixed_costs.h", e, /* header = */ 1, - /* allranks = */ 1); + task_dump_stats("partition_fixed_costs.h", e, + /* dump_tasks_threshold = */ 0.f, + /* header = */ 1, /* allranks = */ 1); } /* Do the repartitioning. */ diff --git a/src/task.c b/src/task.c index 70741af01bbf869ec22b1f5fd8ac5c076784db2a..3efc2fe4f8457e6d9c69a60b3e19ec8accbed05b 100644 --- a/src/task.c +++ b/src/task.c @@ -1095,12 +1095,14 @@ void task_dump_all(struct engine *e, int step) { * * @param dumpfile name of the file for the output. * @param e the #engine + * @param dump_tasks_threshold Fraction of the step time above which any task + * triggers a call to task_dump_all(). * @param header whether to write a header include file. * @param allranks do the statistics over all ranks, if not just the current * one, only used if header is false. 
*/ -void task_dump_stats(const char *dumpfile, struct engine *e, int header, - int allranks) { +void task_dump_stats(const char *dumpfile, struct engine *e, + float dump_tasks_threshold, int header, int allranks) { const ticks function_tic = getticks(); @@ -1125,7 +1127,9 @@ void task_dump_stats(const char *dumpfile, struct engine *e, int header, } } + double stepdt = (double)e->toc_step - (double)e->tic_step; double total[1] = {0.0}; + int dumped_plot_data = 0; for (int l = 0; l < e->sched.nr_tasks; l++) { int type = e->sched.tasks[l].type; @@ -1152,6 +1156,23 @@ void task_dump_stats(const char *dumpfile, struct engine *e, int header, tmax[type][subtype] = tic; } total[0] += dt; + + /* Check if this is a problematic task and make a report. */ + if (dump_tasks_threshold > 0. && dt / stepdt > dump_tasks_threshold) { + + if (e->verbose) + message( + "Long running task detected: %s/%s using %.1f%% of step runtime", + taskID_names[type], subtaskID_names[subtype], + dt / stepdt * 100.0); + + if (!dumped_plot_data) { +#ifdef SWIFT_DEBUG_TASKS + task_dump_all(e, e->step + 1); +#endif + dumped_plot_data = 1; + } + } } } diff --git a/src/task.h b/src/task.h index a093031c9ffeb8ec8480bc22f0b7bd9b429c0386..97ff8b446bcfbc7068cccd05b053875842d3146a 100644 --- a/src/task.h +++ b/src/task.h @@ -265,8 +265,8 @@ int task_lock(struct task *t); void task_do_rewait(struct task *t); void task_print(const struct task *t); void task_dump_all(struct engine *e, int step); -void task_dump_stats(const char *dumpfile, struct engine *e, int header, - int allranks); +void task_dump_stats(const char *dumpfile, struct engine *e, + float dump_tasks_threshold, int header, int allranks); void task_dump_active(struct engine *e); void task_get_full_name(int type, int subtype, char *name); void task_get_group_name(int type, int subtype, char *cluster);