Commit b4e619f7 authored by Peter W. Draper's avatar Peter W. Draper
Browse files

Merge branch 'longtask-report' into 'master'

Output a report about tasks that use more than 5% of the runtime in a step

See merge request !1071
parents cc8926a0 9a0b874b
......@@ -90,5 +90,8 @@ Parameters:
are dumped.
-Y, --threadpool-dumps=<int> Time-step frequency at which threadpool
tasks are dumped.
--dump-tasks-threshold=<flt> Fraction of the total step's time spent
in a task to trigger a dump of the task plot
on this step
See the file examples/parameter_example.yml for an example of parameter file.
......@@ -178,5 +178,8 @@ Parameters:
are dumped.
-Y, --threadpool-dumps=<int> Time-step frequency at which threadpool
tasks are dumped.
--dump-tasks-threshold=<flt> Fraction of the total step's time spent
in a task to trigger a dump of the task plot
on this step
See the file examples/parameter_example.yml for an example of parameter file.
......@@ -87,3 +87,6 @@ can be found by typing ``./swift -h``:
are dumped.
-Y, --threadpool-dumps=<int> Time-step frequency at which threadpool
tasks are dumped.
--dump-tasks-threshold=<flt> Fraction of the total step's time spent
in a task to trigger a dump of the task plot
on this step
......@@ -183,6 +183,7 @@ int main(int argc, char *argv[]) {
char *param_filename = NULL;
char restart_file[200] = "";
unsigned long long cpufreq = 0;
float dump_tasks_threshold = 0.f;
struct cmdparams cmdps;
cmdps.nparam = 0;
cmdps.param[0] = NULL;
......@@ -304,6 +305,10 @@ int main(int argc, char *argv[]) {
OPT_INTEGER('Y', "threadpool-dumps", &dump_threadpool,
"Time-step frequency at which threadpool tasks are dumped.",
NULL, 0, 0),
OPT_FLOAT(0, "dump-tasks-threshold", &dump_tasks_threshold,
"Fraction of the total step's time spent in a task to trigger "
"a dump of the task plot on this step",
NULL, 0, 0),
OPT_END(),
};
struct argparse argparse;
......@@ -401,6 +406,20 @@ int main(int argc, char *argv[]) {
}
#endif
if (dump_tasks_threshold > 0.f) {
#ifndef SWIFT_DEBUG_TASKS
if (myrank == 0) {
error(
"Error: Dumping task plot data above a fixed time threshold is only "
"valid when the code is configured with --enable-task-debugging.");
}
#endif
#ifdef WITH_MPI
if (nr_nodes > 1)
error("Cannot dump tasks above a time threshold over MPI (yet).");
#endif
}
#ifndef SWIFT_CELL_GRAPH
if (dump_cells) {
if (myrank == 0) {
......@@ -1409,13 +1428,14 @@ int main(int argc, char *argv[]) {
/* Dump the task data using the given frequency. */
if (dump_tasks && (dump_tasks == 1 || j % dump_tasks == 1)) {
#ifdef SWIFT_DEBUG_TASKS
task_dump_all(&e, j + 1);
if (dump_tasks_threshold == 0.) task_dump_all(&e, j + 1);
#endif
/* Generate the task statistics. */
char dumpfile[40];
snprintf(dumpfile, 40, "thread_stats-step%d.dat", e.step + 1);
task_dump_stats(dumpfile, &e, /* header = */ 0, /* allranks = */ 1);
task_dump_stats(dumpfile, &e, dump_tasks_threshold,
/* header = */ 0, /* allranks = */ 1);
}
#ifdef SWIFT_CELL_GRAPH
......
......@@ -199,8 +199,9 @@ void engine_repartition(struct engine *e) {
/* Generate the fixed costs include file. */
if (e->step > 3 && e->reparttype->trigger <= 1.f) {
task_dump_stats("partition_fixed_costs.h", e, /* header = */ 1,
/* allranks = */ 1);
task_dump_stats("partition_fixed_costs.h", e,
/* task_dump_threshold = */ 0.f,
/* header = */ 1, /* allranks = */ 1);
}
/* Do the repartitioning. */
......
......@@ -1095,12 +1095,14 @@ void task_dump_all(struct engine *e, int step) {
*
* @param dumpfile name of the file for the output.
* @param e the #engine
* @param dump_tasks_threshold Fraction of the step time above which any task
* triggers a call to task_dump_all().
* @param header whether to write a header include file.
* @param allranks do the statistics over all ranks, if not just the current
* one, only used if header is false.
*/
void task_dump_stats(const char *dumpfile, struct engine *e, int header,
int allranks) {
void task_dump_stats(const char *dumpfile, struct engine *e,
float dump_tasks_threshold, int header, int allranks) {
const ticks function_tic = getticks();
......@@ -1125,7 +1127,9 @@ void task_dump_stats(const char *dumpfile, struct engine *e, int header,
}
}
double stepdt = (double)e->toc_step - (double)e->tic_step;
double total[1] = {0.0};
int dumped_plot_data = 0;
for (int l = 0; l < e->sched.nr_tasks; l++) {
int type = e->sched.tasks[l].type;
......@@ -1152,6 +1156,23 @@ void task_dump_stats(const char *dumpfile, struct engine *e, int header,
tmax[type][subtype] = tic;
}
total[0] += dt;
/* Check if this is a problematic task and make a report. */
if (dump_tasks_threshold > 0. && dt / stepdt > dump_tasks_threshold) {
if (e->verbose)
message(
"Long running task detected: %s/%s using %.1f%% of step runtime",
taskID_names[type], subtaskID_names[subtype],
dt / stepdt * 100.0);
if (!dumped_plot_data) {
#ifdef SWIFT_DEBUG_TASKS
task_dump_all(e, e->step + 1);
#endif
dumped_plot_data = 1;
}
}
}
}
......
......@@ -265,8 +265,8 @@ int task_lock(struct task *t);
void task_do_rewait(struct task *t);
void task_print(const struct task *t);
void task_dump_all(struct engine *e, int step);
void task_dump_stats(const char *dumpfile, struct engine *e, int header,
int allranks);
void task_dump_stats(const char *dumpfile, struct engine *e,
float dump_tasks_threshold, int header, int allranks);
void task_dump_active(struct engine *e);
void task_get_full_name(int type, int subtype, char *name);
void task_get_group_name(int type, int subtype, char *cluster);
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment