Commit 6c4dd72c authored by Peter W. Draper's avatar Peter W. Draper
Browse files

Add an optional task dumper thread

Polls for a file named .dump and, when it is found, dumps the active tasks and the memory use (if configured).
parent 8ba8cc33
......@@ -375,6 +375,7 @@ fi
# Check whether we have any of the ARM v8.1 tick timers
AX_ASM_ARM_PMCCNTR
AX_ASM_ARM_CNTVCT
# See if we want memuse reporting.
AC_ARG_ENABLE([memuse-reports],
[AS_HELP_STRING([--enable-memuse-reports],
......@@ -387,6 +388,18 @@ if test "$enable_memuse_reports" = "yes"; then
AC_DEFINE([SWIFT_MEMUSE_REPORTS],1,[Enable memory usage reports])
fi
# Check if we want to make the dumper thread active. The option is off by
# default; when given as --enable-dumper, SWIFT_DUMPER_THREAD is defined to 1,
# which compiles in the code guarded by "#ifdef SWIFT_DUMPER_THREAD" (a thread
# that polls for a ".dump" file and then dumps the active tasks and, when
# SWIFT_MEMUSE_REPORTS is also defined, the memory use).
AC_ARG_ENABLE([dumper],
[AS_HELP_STRING([--enable-dumper],
[Dump active tasks and memory use (if configured)@<:@yes/no@:>@]
)],
[enable_dumper="$enableval"],
[enable_dumper="no"]
)
if test "$enable_dumper" = "yes"; then
AC_DEFINE([SWIFT_DUMPER_THREAD],1,[Enable dumper thread])
fi
# Define HAVE_POSIX_MEMALIGN if it works.
AX_FUNC_POSIX_MEMALIGN
......
......@@ -4863,6 +4863,62 @@ void engine_unpin(void) {
#endif
}
#ifdef SWIFT_DUMPER_THREAD
/**
 * @brief dumper thread action, checks for the existence of the .dump file
 *        every 5 seconds and does the dump if found.
 *
 * When the file is found the active tasks are written using
 * task_dump_active() and, if SWIFT_MEMUSE_REPORTS is defined, the currently
 * logged memory use is dumped as well. The .dump file is then removed so
 * that one request produces one dump. The loop never terminates.
 *
 * NOTE(review): under MPI task_dump_active() makes collective calls, so every
 * rank has to see the .dump file. If the ranks share a working directory the
 * unlink() done by one rank may hide the file from slower ranks -- confirm
 * how this is expected to be used with MPI.
 *
 * @param p the #engine
 *
 * @return NULL, only present to match the pthread start routine signature.
 */
static void *engine_dumper_poll(void *p) {
  struct engine *e = (struct engine *)p;

  while (1) {
    if (access(".dump", F_OK) == 0) {

      /* OK, do our work. */
      message("Dumping engine tasks in step: %d", e->step);
      task_dump_active(e);

#ifdef SWIFT_MEMUSE_REPORTS
      /* Dump the currently logged memory. */
      message("Dumping memory use report");
      memuse_log_dump_error(e->nodeID);
#endif

      /* Add more interesting diagnostics. */

      /* Delete the file. A failure is only a problem if the file is still
       * there (someone else may have removed it first), in which case we
       * would dump again on every poll, so make that visible. */
      if (unlink(".dump") != 0 && access(".dump", F_OK) == 0) {
        message("Failed to remove the .dump file, the dump will be repeated");
      }
      message("Dumping completed");
      fflush(stdout);
    }

    /* Take a breath. */
    sleep(5);
  }
  return NULL;
}
#endif /* SWIFT_DUMPER_THREAD */
#ifdef SWIFT_DUMPER_THREAD
/**
 * @brief creates the dumper thread.
 *
 * This watches for the creation of a ".dump" file in the current directory
 * and if found dumps the current state of the tasks and memory use (if also
 * configured).
 *
 * The thread runs engine_dumper_poll(), which never exits, and nothing ever
 * joins it, so it is detached straight away. A failure to start the thread
 * is reported on stderr and is not fatal: the run continues without the
 * dumper.
 *
 * @param e the #engine, handed to the thread as its argument.
 *
 */
static void engine_dumper_init(struct engine *e) {
  pthread_t dumper;

  /* pthread_create() returns the error number directly, not via errno. */
  const int res = pthread_create(&dumper, NULL, &engine_dumper_poll, e);
  if (res != 0) {
    fprintf(stderr,
            "Failed to create the dumper thread (error %d), .dump requests "
            "will be ignored\n",
            res);
    return;
  }

  /* Thread does not exit and is never joined, so release our handle. */
  pthread_detach(dumper);
}
#endif /* SWIFT_DUMPER_THREAD */
/**
* @brief init an engine struct with the necessary properties for the
* simulation.
......@@ -5701,6 +5757,12 @@ void engine_config(int restart, int fof, struct engine *e,
free(buf);
#endif
#ifdef SWIFT_DUMPER_THREAD
/* Start the dumper thread.*/
engine_dumper_init(e);
#endif
/* Wait for the runner threads to be in place. */
swift_barrier_wait(&e->wait_barrier);
}
......
......@@ -856,7 +856,7 @@ void task_create_mpi_comms(void) {
*
* Dumps the information to a file "thread_info-stepn.dat" where n is the
* given step value, or "thread_info_MPI-stepn.dat", if we are running
* under MPI. Note if running under MPIU all the ranks are dumped into this
* under MPI. Note if running under MPI all the ranks are dumped into this
* one file, which has an additional field to identify the rank.
*
* @param e the #engine
......@@ -1089,3 +1089,123 @@ void task_dump_stats(const char *dumpfile, struct engine *e, int header,
}
#endif
}
/**
 * @brief dump all the active tasks of all the known engines into a file.
 *
 * Dumps the information to a file "task_dump-stepn.dat" where n is the given
 * step value, or "task_dump_MPI-stepn.dat", if we are running under MPI. Note
 * if running under MPI all the ranks are dumped into this one file, which has
 * an additional field to identify the rank. Very similar to task_dump_all()
 * except for the additional fields used in task debugging and we record tasks
 * that have not run (i.e !skip, but toc == 0) and how many waits are still
 * active.
 *
 * The first data line is a summary ("none none -1 0 ...") carrying the step
 * tic and toc, the update counters and the CPU frequency, which is needed to
 * convert ticks to seconds. Only tasks that are neither implicit nor skipped
 * are listed after it.
 *
 * If the output file cannot be opened the problem is reported on stderr and
 * nothing is written by this rank. Under MPI the rank still takes part in all
 * the barriers and broadcasts so that the other ranks are not left waiting.
 *
 * @param e the #engine
 */
void task_dump_active(struct engine *e) {

  /* Need this to convert ticks to seconds. */
  unsigned long long cpufreq = clocks_get_cpufreq();

#ifdef WITH_MPI

  /* Make sure output file is empty, only on one rank. */
  char dumpfile[35];
  snprintf(dumpfile, sizeof(dumpfile), "task_dump_MPI-step%d.dat", e->step);
  FILE *file_thread;
  if (engine_rank == 0) {
    file_thread = fopen(dumpfile, "w");
    if (file_thread != NULL) {
      fprintf(file_thread,
              "# rank type subtype waits pair tic toc"
              " ci.hydro.count cj.hydro.count ci.grav.count cj.grav.count"
              " flags\n");
      fclose(file_thread);
    } else {
      fprintf(stderr, "task_dump_active: failed to open '%s' for writing\n",
              dumpfile);
    }
  }
  MPI_Barrier(MPI_COMM_WORLD);

  for (int i = 0; i < e->nr_nodes; i++) {

    /* Rank 0 decides the index of the writing node, this happens
     * one-by-one. */
    int kk = i;
    MPI_Bcast(&kk, 1, MPI_INT, 0, MPI_COMM_WORLD);

    if (i == engine_rank) {

      /* Open file and position at end. */
      file_thread = fopen(dumpfile, "a");
      if (file_thread == NULL) {

        /* Nothing to write to, but do not return, the barrier below must
         * still be reached by this rank. */
        fprintf(stderr,
                "task_dump_active: failed to open '%s' for appending\n",
                dumpfile);
      } else {

        /* Add some information to help with the plots and conversion of
         * ticks to seconds. */
        fprintf(
            file_thread, "%i none none -1 0 %lld %lld %lld %lld %lld 0 %llu\n",
            engine_rank, (long long int)e->tic_step, (long long int)e->toc_step,
            e->updates, e->g_updates, e->s_updates, cpufreq);

        for (int l = 0; l < e->sched.nr_tasks; l++) {

          /* Not implicit and not skipped. Note tasks that have not run will
           * have a toc of zero. */
          if (!e->sched.tasks[l].implicit && !e->sched.tasks[l].skip) {
            fprintf(
                file_thread, "%i %s %s %i %i %lli %lli %i %i %i %i %lli\n",
                engine_rank, taskID_names[e->sched.tasks[l].type],
                subtaskID_names[e->sched.tasks[l].subtype],
                e->sched.tasks[l].wait, (e->sched.tasks[l].cj == NULL),
                (long long int)e->sched.tasks[l].tic,
                (long long int)e->sched.tasks[l].toc,
                (e->sched.tasks[l].ci != NULL)
                    ? e->sched.tasks[l].ci->hydro.count
                    : 0,
                (e->sched.tasks[l].cj != NULL)
                    ? e->sched.tasks[l].cj->hydro.count
                    : 0,
                (e->sched.tasks[l].ci != NULL)
                    ? e->sched.tasks[l].ci->grav.count
                    : 0,
                (e->sched.tasks[l].cj != NULL)
                    ? e->sched.tasks[l].cj->grav.count
                    : 0,
                e->sched.tasks[l].flags);
          }
        }
        fclose(file_thread);
      }
    }

    /* And we wait for all to synchronize. */
    MPI_Barrier(MPI_COMM_WORLD);
  }

#else

  /* Non-MPI, so just a single engine's worth of tasks to dump. */
  char dumpfile[32];
  snprintf(dumpfile, sizeof(dumpfile), "task_dump-step%d.dat", e->step);
  FILE *file_thread;
  file_thread = fopen(dumpfile, "w");
  if (file_thread == NULL) {
    fprintf(stderr, "task_dump_active: failed to open '%s' for writing\n",
            dumpfile);
    return;
  }
  fprintf(file_thread,
          "#type subtype waits pair tic toc ci.hydro.count cj.hydro.count "
          "ci.grav.count cj.grav.count\n");

  /* Add some information to help with the plots and conversion of ticks to
   * seconds. All fields are whitespace separated, like the task lines. */
  fprintf(file_thread, "none none -1 0 %lld %lld %lld %lld %lld %llu\n",
          (long long int)e->tic_step, (long long int)e->toc_step, e->updates,
          e->g_updates, e->s_updates, cpufreq);

  for (int l = 0; l < e->sched.nr_tasks; l++) {

    /* Not implicit and not skipped. Note tasks that have not run will have
     * a toc of zero. */
    if (!e->sched.tasks[l].implicit && !e->sched.tasks[l].skip) {
      fprintf(
          file_thread, "%s %s %i %i %lli %lli %i %i %i %i\n",
          taskID_names[e->sched.tasks[l].type],
          subtaskID_names[e->sched.tasks[l].subtype], e->sched.tasks[l].wait,
          (e->sched.tasks[l].cj == NULL),
          (long long int)e->sched.tasks[l].tic,
          (long long int)e->sched.tasks[l].toc,
          (e->sched.tasks[l].ci == NULL) ? 0
                                         : e->sched.tasks[l].ci->hydro.count,
          (e->sched.tasks[l].cj == NULL) ? 0
                                         : e->sched.tasks[l].cj->hydro.count,
          (e->sched.tasks[l].ci == NULL) ? 0 : e->sched.tasks[l].ci->grav.count,
          (e->sched.tasks[l].cj == NULL) ? 0
                                         : e->sched.tasks[l].cj->grav.count);
    }
  }
  fclose(file_thread);
#endif  // WITH_MPI
}
......@@ -235,6 +235,7 @@ void task_print(const struct task *t);
void task_dump_all(struct engine *e, int step);
void task_dump_stats(const char *dumpfile, struct engine *e, int header,
int allranks);
/* Dump the tasks that are neither implicit nor skipped to a per-step file. */
void task_dump_active(struct engine *e);
void task_get_full_name(int type, int subtype, char *name);
void task_get_group_name(int type, int subtype, char *cluster);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment