Commit ca53f581 authored by Peter W. Draper


Turns out that having blocking MPI calls in the dumper code is bad, as they can be blocked by other MPI calls in other threads, so we need to avoid any MPI calls here and create a file per rank instead.
parent a259d7b6
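
For illustration, the pattern this change adopts is to derive each rank's dump file name from state the code already holds (the step number and the cached node ID), so the dumper never has to make an MPI call at all. A minimal, self-contained sketch of that pattern follows; the function, file names and error handling are illustrative, not SWIFT's own:

#include <stdio.h>

/* Sketch of the per-rank dump pattern: each rank writes its own file, named
 * from cached state, so no MPI call (and hence no chance of blocking against
 * MPI traffic in other threads) is needed while dumping. */
static void dump_per_rank(int step, int rank) {
  char dumpfile[64];
#ifdef WITH_MPI
  snprintf(dumpfile, sizeof(dumpfile), "dump_MPI-step%d.dat_%d", step, rank);
#else
  snprintf(dumpfile, sizeof(dumpfile), "dump-step%d.dat", step);
  (void)rank; /* rank is only used in the MPI file name above */
#endif
  FILE *fd = fopen(dumpfile, "w");
  if (fd == NULL) return; /* hypothetical error handling, not in the patch */
  fprintf(fd, "# header describing the columns\n");
  /* ... write this rank's data; no synchronization with other ranks ... */
  fclose(fd);
}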
@@ -2275,61 +2275,28 @@ void scheduler_write_task_level(const struct scheduler *s) {
free(count);
}
/**
* @brief dump all the active queues of all the known schedulers into a file.
* @brief dump all the active queues of all the known schedulers into files.
*
* @param e the #engine
*/
void scheduler_dump_queues(struct engine *e) {
struct scheduler *s = &e->sched;
/* Open the file and write the header. */
char dumpfile[35];
#ifdef WITH_MPI
snprintf(dumpfile, sizeof(dumpfile), "queue_dump_MPI-step%d.dat", e->step);
/* Open a file per rank and write the header. Use per rank to avoid MPI
* calls that can interact with other blocking ones. */
snprintf(dumpfile, sizeof(dumpfile), "queue_dump_MPI-step%d.dat_%d", e->step,
e->nodeID);
#else
snprintf(dumpfile, sizeof(dumpfile), "queue_dump-step%d.dat", e->step);
#endif
FILE *file_thread = NULL;
if (engine_rank == 0) {
file_thread = fopen(dumpfile, "w");
fprintf(file_thread, "# rank queue index type subtype weight\n");
}
#ifdef WITH_MPI
/* Make sure output file is closed and empty, then we reopen on each rank. */
if (engine_rank == 0) fclose(file_thread);
MPI_Barrier(MPI_COMM_WORLD);
for (int i = 0; i < e->nr_nodes; i++) {
/* Rank 0 decides the index of the writing node, this happens
* one-by-one. */
int kk = i;
MPI_Bcast(&kk, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (i == engine_rank) {
/* Open file and position at end. */
file_thread = fopen(dumpfile, "a");
for (int l = 0; l < s->nr_queues; l++) {
queue_dump(engine_rank, l, file_thread, &s->queues[l]);
}
fclose(file_thread);
}
/* And we wait for all to synchronize. */
MPI_Barrier(MPI_COMM_WORLD);
}
#else
/* Non-MPI, so just a single schedulers worth of queues to dump. */
FILE *file_thread = fopen(dumpfile, "w");
fprintf(file_thread, "# rank queue index type subtype weight\n");
for (int l = 0; l < s->nr_queues; l++) {
queue_dump(engine_rank, l, file_thread, &s->queues[l]);
}
fclose(file_thread);
#endif // WITH_MPI
}
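
Since the added/removed markers of the diff are not visible in this view, the hunk above is easier to follow as the resulting function. Assembled from the added and unchanged lines, scheduler_dump_queues now reads roughly as follows (the exact placement of the unchanged context lines is an assumption):

void scheduler_dump_queues(struct engine *e) {

  struct scheduler *s = &e->sched;
  char dumpfile[35];
#ifdef WITH_MPI
  /* Open a file per rank and write the header. Use per rank to avoid MPI
   * calls that can interact with other blocking ones. */
  snprintf(dumpfile, sizeof(dumpfile), "queue_dump_MPI-step%d.dat_%d", e->step,
           e->nodeID);
#else
  snprintf(dumpfile, sizeof(dumpfile), "queue_dump-step%d.dat", e->step);
#endif
  FILE *file_thread = fopen(dumpfile, "w");
  fprintf(file_thread, "# rank queue index type subtype weight\n");
  for (int l = 0; l < s->nr_queues; l++) {
    queue_dump(engine_rank, l, file_thread, &s->queues[l]);
  }
  fclose(file_thread);
}

The rank-0-only open, the MPI_Bcast and the two MPI_Barrier calls of the old version are gone entirely; every rank opens, writes and closes its own file without synchronizing with the others.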
@@ -1174,15 +1174,15 @@ void task_dump_stats(const char *dumpfile, struct engine *e, int header,
}
/**
* @brief dump all the active tasks of all the known engines into a file.
* @brief dump all the active tasks of all the known engines into files.
*
* Dumps the information to a file "task_dump-stepn.dat" where n is the given
* step value, or "task_dump_MPI-stepn.dat", if we are running under MPI. Note
* if running under MPI all the ranks are dumped into this one file, which has
* an additional field to identify the rank. Very similar to task_dump_all()
* except for the additional fields used in task debugging and we record tasks
* that have not ran (i.e !skip, but toc == 0) and how many waits are still
* active.
* Dumps the information into the file "task_dump-stepn.dat", where n is the
* given step value, or into the files "task_dump_MPI-stepn.dat_rank" if we are
* running under MPI. Note that under MPI all the ranks are dumped into
* separate files to avoid interaction with other MPI calls that may be
* blocking at the time. Very similar to task_dump_all(), except for the
* additional fields used in task debugging, and we record tasks that have not
* run (i.e. !skip, but toc == 0) and how many waits are still active.
*
* @param e the #engine
*/
@@ -1190,98 +1190,48 @@ void task_dump_active(struct engine *e) {
/* Need this to convert ticks to seconds. */
unsigned long long cpufreq = clocks_get_cpufreq();
#ifdef WITH_MPI
/* Make sure output file is empty, only on one rank. */
char dumpfile[35];
snprintf(dumpfile, sizeof(dumpfile), "task_dump_MPI-step%d.dat", e->step);
FILE *file_thread;
if (engine_rank == 0) {
file_thread = fopen(dumpfile, "w");
fprintf(file_thread,
"# rank otherrank type subtype waits pair tic toc"
" ci.hydro.count cj.hydro.count ci.grav.count cj.grav.count"
" flags\n");
fclose(file_thread);
}
MPI_Barrier(MPI_COMM_WORLD);
for (int i = 0; i < e->nr_nodes; i++) {
/* Rank 0 decides the index of the writing node, this happens
* one-by-one. */
int kk = i;
MPI_Bcast(&kk, 1, MPI_INT, 0, MPI_COMM_WORLD);
if (i == engine_rank) {
/* Open file and position at end. */
file_thread = fopen(dumpfile, "a");
/* Add some information to help with the plots and conversion of ticks to
* seconds. */
fprintf(
file_thread, "%i 0 none none -1 0 %lld %lld %lld %lld %lld 0 %lld\n",
engine_rank, (long long int)e->tic_step, (long long int)e->toc_step,
e->updates, e->g_updates, e->s_updates, cpufreq);
int count = 0;
for (int l = 0; l < e->sched.nr_tasks; l++) {
struct task *t = &e->sched.tasks[l];
/* Not implicit and not skipped. */
if (!t->implicit && !t->skip) {
/* Get destination rank of MPI requests. */
int paired = (t->cj != NULL);
int otherrank = t->ci->nodeID;
if (paired) otherrank = t->cj->nodeID;
fprintf(file_thread, "%i %i %s %s %i %i %lli %lli %i %i %i %i %lli\n",
engine_rank, otherrank, taskID_names[t->type],
subtaskID_names[t->subtype], t->wait, paired,
(long long int)t->tic, (long long int)t->toc,
(t->ci != NULL) ? t->ci->hydro.count : 0,
(t->cj != NULL) ? t->cj->hydro.count : 0,
(t->ci != NULL) ? t->ci->grav.count : 0,
(t->cj != NULL) ? t->cj->grav.count : 0, t->flags);
}
count++;
}
fclose(file_thread);
}
/* And we wait for all to synchronize. */
MPI_Barrier(MPI_COMM_WORLD);
}
#ifdef WITH_MPI
snprintf(dumpfile, sizeof(dumpfile), "task_dump_MPI-step%d.dat_%d", e->step,
e->nodeID);
#else
/* Non-MPI, so just a single engine's worth of tasks to dump. */
char dumpfile[32];
snprintf(dumpfile, sizeof(dumpfile), "task_dump-step%d.dat", e->step);
FILE *file_thread;
file_thread = fopen(dumpfile, "w");
#endif
FILE *file_thread = fopen(dumpfile, "w");
fprintf(file_thread,
"#type subtype waits pair tic toc ci.hydro.count cj.hydro.count "
"ci.grav.count cj.grav.count\n");
"# rank otherrank type subtype waits pair tic toc"
" ci.hydro.count cj.hydro.count ci.grav.count cj.grav.count"
" flags\n");
/* Add some information to help with the plots and conversion of ticks to
* seconds. */
fprintf(file_thread, "none none -1 0, %lld %lld %lld %lld %lld %lld\n",
(unsigned long long)e->tic_step, (unsigned long long)e->toc_step,
fprintf(file_thread, "%i 0 none none -1 0 %lld %lld %lld %lld %lld 0 %lld\n",
engine_rank, (long long int)e->tic_step, (long long int)e->toc_step,
e->updates, e->g_updates, e->s_updates, cpufreq);
int count = 0;
for (int l = 0; l < e->sched.nr_tasks; l++) {
struct task *t = &e->sched.tasks[l];
/* Not implicit and not skipped. */
if (!t->implicit && !t->skip) {
fprintf(file_thread, "%s %s %i %i %lli %lli %i %i %i %i\n",
taskID_names[t->type], subtaskID_names[t->subtype], t->wait,
(t->cj == NULL), (unsigned long long)t->tic,
(unsigned long long)t->toc,
(t->ci == NULL) ? 0 : t->ci->hydro.count,
(t->cj == NULL) ? 0 : t->cj->hydro.count,
(t->ci == NULL) ? 0 : t->ci->grav.count,
(t->cj == NULL) ? 0 : t->cj->grav.count);
/* Get destination rank of MPI requests. */
int paired = (t->cj != NULL);
int otherrank = t->ci->nodeID;
if (paired) otherrank = t->cj->nodeID;
fprintf(file_thread, "%i %i %s %s %i %i %lli %lli %i %i %i %i %lli\n",
engine_rank, otherrank, taskID_names[t->type],
subtaskID_names[t->subtype], t->wait, paired,
(long long int)t->tic, (long long int)t->toc,
(t->ci != NULL) ? t->ci->hydro.count : 0,
(t->cj != NULL) ? t->cj->hydro.count : 0,
(t->ci != NULL) ? t->ci->grav.count : 0,
(t->cj != NULL) ? t->cj->grav.count : 0, t->flags);
}
count++;
}
fclose(file_thread);
#endif // WITH_MPI
}
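
For the same reason, here is a sketch of the new task_dump_active body, assembled from the added and unchanged lines of the hunk above (again, the placement of the context lines and the wording of the file-opening comment are assumptions):

void task_dump_active(struct engine *e) {

  /* Need this to convert ticks to seconds. */
  unsigned long long cpufreq = clocks_get_cpufreq();

  /* Open a file per rank and write the header. */
  char dumpfile[35];
#ifdef WITH_MPI
  snprintf(dumpfile, sizeof(dumpfile), "task_dump_MPI-step%d.dat_%d", e->step,
           e->nodeID);
#else
  snprintf(dumpfile, sizeof(dumpfile), "task_dump-step%d.dat", e->step);
#endif
  FILE *file_thread = fopen(dumpfile, "w");
  fprintf(file_thread,
          "# rank otherrank type subtype waits pair tic toc"
          " ci.hydro.count cj.hydro.count ci.grav.count cj.grav.count"
          " flags\n");

  /* Add some information to help with the plots and conversion of ticks to
   * seconds. */
  fprintf(file_thread, "%i 0 none none -1 0 %lld %lld %lld %lld %lld 0 %lld\n",
          engine_rank, (long long int)e->tic_step, (long long int)e->toc_step,
          e->updates, e->g_updates, e->s_updates, cpufreq);

  int count = 0;
  for (int l = 0; l < e->sched.nr_tasks; l++) {
    struct task *t = &e->sched.tasks[l];

    /* Not implicit and not skipped. */
    if (!t->implicit && !t->skip) {

      /* Get destination rank of MPI requests. */
      int paired = (t->cj != NULL);
      int otherrank = t->ci->nodeID;
      if (paired) otherrank = t->cj->nodeID;

      fprintf(file_thread, "%i %i %s %s %i %i %lli %lli %i %i %i %i %lli\n",
              engine_rank, otherrank, taskID_names[t->type],
              subtaskID_names[t->subtype], t->wait, paired,
              (long long int)t->tic, (long long int)t->toc,
              (t->ci != NULL) ? t->ci->hydro.count : 0,
              (t->cj != NULL) ? t->cj->hydro.count : 0,
              (t->ci != NULL) ? t->ci->grav.count : 0,
              (t->cj != NULL) ? t->cj->grav.count : 0, t->flags);
    }
    count++;
  }
  fclose(file_thread);
}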