Commit b20c2a01 authored by Matthieu Schaller

Added a command-line flag to increase verbosity

parent 1555aefd
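
For readers unfamiliar with the pattern this commit introduces, the sketch below shows, in isolation, how a boolean "-v" option is parsed with getopt() and then used to gate optional timing output, which is what the diff does throughout main() and the engine. This is an editorial illustration only: the report() helper, its arguments, and the "dummy step" label are made up for the example and are not part of SWIFT's API.

/* Minimal standalone sketch (illustration, not part of this commit):
 * parse a boolean "-v" flag with getopt() and use it to decide whether
 * timing messages are printed. */
#include <stdio.h>
#include <unistd.h>

/* Print a timing message only when verbose output was requested. */
static void report(int verbose, const char *what, double elapsed_ms) {
  if (verbose) printf("%s took %.3f ms.\n", what, elapsed_ms);
}

int main(int argc, char *argv[]) {
  int verbose = 0; /* quiet by default, like the new flag */
  int c;
  while ((c = getopt(argc, argv, "v")) != -1) {
    switch (c) {
      case 'v':
        verbose = 1;
        break;
      default:
        fprintf(stderr, "Usage: %s [-v]\n", argv[0]);
        return 1;
    }
  }
  report(verbose, "dummy step", 1.234);
  return 0;
}
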
......@@ -77,6 +77,7 @@ int main(int argc, char *argv[]) {
int nr_nodes = 1, myrank = 0;
FILE *file_thread;
int with_outputs = 1;
int verbose = 0;
unsigned long long cpufreq = 0;
#ifdef WITH_MPI
......@@ -150,7 +151,7 @@ int main(int argc, char *argv[]) {
bzero(&s, sizeof(struct space));
/* Parse the options */
while ((c = getopt(argc, argv, "a:c:d:e:f:h:m:oP:q:R:s:t:w:y:z:")) != -1)
while ((c = getopt(argc, argv, "a:c:d:e:f:h:m:oP:q:R:s:t:vw:y:z:")) != -1)
switch (c) {
case 'a':
if (sscanf(optarg, "%lf", &scaling) != 1)
......@@ -226,8 +227,8 @@ int main(int argc, char *argv[]) {
error("Error parsing number of queues.");
break;
case 'R':
/* Repartition type "n", "b", "v", "e" or "x".
* Note only none is available without METIS. */
/* Repartition type "n", "b", "v", "e" or "x".
* Note only none is available without METIS. */
#ifdef WITH_MPI
switch (optarg[0]) {
case 'n':
......@@ -261,6 +262,9 @@ int main(int argc, char *argv[]) {
if (sscanf(optarg, "%d", &nr_threads) != 1)
error("Error parsing number of threads.");
break;
case 'v':
verbose = 1;
break;
case 'w':
if (sscanf(optarg, "%d", &space_subsize) != 1)
error("Error parsing sub size.");
......@@ -329,7 +333,7 @@ int main(int argc, char *argv[]) {
clocks_set_cpufreq(cpufreq);
cpufreq = clocks_get_cpufreq();
if (myrank == 0)
message("CPU frequency used for tick conversion: %llu Hz", cpufreq);
message("CPU frequency used for tick conversion: %llu Hz", cpufreq);
/* Check we have sensible time step bounds */
if (dt_min > dt_max)
......@@ -341,8 +345,7 @@ int main(int argc, char *argv[]) {
/* Read particles and space information from (GADGET) IC */
if (myrank == 0)
clocks_gettime(&tic);
if (myrank == 0) clocks_gettime(&tic);
#if defined(WITH_MPI)
#if defined(HAVE_PARALLEL_HDF5)
read_ic_parallel(ICfileName, dim, &parts, &N, &periodic, myrank, nr_nodes,
......@@ -386,10 +389,9 @@ int main(int argc, char *argv[]) {
if (nr_queues < 0) nr_queues = nr_threads;
/* Initialize the space with this data. */
if (myrank == 0)
clocks_gettime(&tic);
if (myrank == 0) clocks_gettime(&tic);
space_init(&s, dim, parts, N, periodic, h_max, myrank == 0);
if (myrank == 0) {
if (myrank == 0 && verbose) {
clocks_gettime(&toc);
message("space_init took %.3f %s.", clocks_diff(&tic, &toc),
clocks_getunit());
......@@ -423,13 +425,12 @@ int main(int argc, char *argv[]) {
}
/* Initialize the engine with this space. */
if (myrank == 0)
clocks_gettime(&tic);
if (myrank == 0) clocks_gettime(&tic);
if (myrank == 0) message("nr_nodes is %i.", nr_nodes);
engine_init(&e, &s, dt_max, nr_threads, nr_queues, nr_nodes, myrank,
ENGINE_POLICY | engine_policy_steal | engine_policy_hydro, 0,
time_end, dt_min, dt_max);
if (myrank == 0) {
time_end, dt_min, dt_max, (myrank == 0 && verbose));
if (myrank == 0 && verbose) {
clocks_gettime(&toc);
message("engine_init took %.3f %s.", clocks_diff(&tic, &toc),
clocks_getunit());
......@@ -445,8 +446,7 @@ int main(int argc, char *argv[]) {
if (with_outputs) {
/* Write the state of the system as it is before starting time integration.
*/
if (myrank == 0)
clocks_gettime(&tic);
if (myrank == 0) clocks_gettime(&tic);
#if defined(WITH_MPI)
#if defined(HAVE_PARALLEL_HDF5)
write_output_parallel(&e, &us, myrank, nr_nodes, MPI_COMM_WORLD,
......@@ -458,7 +458,7 @@ int main(int argc, char *argv[]) {
#else
write_output_single(&e, &us);
#endif
if (myrank == 0) {
if (myrank == 0 && verbose) {
clocks_gettime(&toc);
message("writing particle properties took %.3f %s.",
clocks_diff(&tic, &toc), clocks_getunit());
......@@ -486,7 +486,8 @@ int main(int argc, char *argv[]) {
if (myrank == 0)
printf(
"# Step Time time-step Number of updates CPU Wall-clock time "
"[%s]\n", clocks_getunit());
"[%s]\n",
clocks_getunit());
/* Let loose a runner on the space. */
for (j = 0; !engine_is_done(&e); j++) {
......@@ -506,6 +507,7 @@ int main(int argc, char *argv[]) {
if (with_outputs && j % 100 == 0) {
if (myrank == 0) clocks_gettime(&tic);
#if defined(WITH_MPI)
#if defined(HAVE_PARALLEL_HDF5)
write_output_parallel(&e, &us, myrank, nr_nodes, MPI_COMM_WORLD,
......@@ -517,6 +519,12 @@ int main(int argc, char *argv[]) {
#else
write_output_single(&e, &us);
#endif
if (myrank == 0 && verbose) {
clocks_gettime(&toc);
message("writing particle properties took %.3f %s.",
clocks_diff(&tic, &toc), clocks_getunit());
fflush(stdout);
}
}
/* Dump the task data using the given frequency. */
......@@ -617,6 +625,8 @@ int main(int argc, char *argv[]) {
#endif
if (with_outputs) {
if (myrank == 0) clocks_gettime(&tic);
/* Write final output. */
#if defined(WITH_MPI)
#if defined(HAVE_PARALLEL_HDF5)
......@@ -629,6 +639,12 @@ int main(int argc, char *argv[]) {
#else
write_output_single(&e, &us);
#endif
if (myrank == 0 && verbose) {
clocks_gettime(&toc);
message("writing particle properties took %.3f %s.",
clocks_diff(&tic, &toc), clocks_getunit());
fflush(stdout);
}
}
#ifdef WITH_MPI
......
......@@ -149,6 +149,7 @@ void engine_redistribute(struct engine *e) {
int *cdim = s->cdim;
struct cell *cells = s->cells;
int nr_cells = s->nr_cells;
ticks tic = getticks();
/* Start by sorting the particles according to their nodes and
getting the counts. The counts array is indexed as
......@@ -274,13 +275,18 @@ void engine_redistribute(struct engine *e) {
/* Be verbose about what just happened. */
for (int k = 0; k < nr_cells; k++)
if (cells[k].nodeID == nodeID) my_cells += 1;
message("node %i now has %i parts in %i cells.", nodeID, nr_parts, my_cells);
if (e->verbose)
message("node %i now has %i parts in %i cells.", nodeID, nr_parts,
my_cells);
/* Clean up other stuff. */
free(reqs);
free(counts);
free(dest);
if (e->verbose)
message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
clocks_getunit());
#else
error("SWIFT was not compiled with MPI support.");
#endif
......@@ -296,6 +302,8 @@ void engine_repartition(struct engine *e) {
#if defined(WITH_MPI) && defined(HAVE_METIS)
ticks tic = getticks();
/* Clear the repartition flag. */
enum repartition_type reparttype = e->forcerepart;
e->forcerepart = REPART_NONE;
......@@ -323,6 +331,9 @@ void engine_repartition(struct engine *e) {
/* Tell the engine it should re-build whenever possible */
e->forcerebuild = 1;
if (e->verbose)
message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
clocks_getunit());
#else
error("SWIFT was not compiled with MPI and METIS support.");
#endif
......@@ -472,6 +483,7 @@ void engine_exchange_cells(struct engine *e) {
MPI_Request reqs_in[engine_maxproxies];
MPI_Request reqs_out[engine_maxproxies];
MPI_Status status;
ticks tic = getticks();
/* Run through the cells and get the size of the ones that will be sent off.
*/
......@@ -564,6 +576,10 @@ void engine_exchange_cells(struct engine *e) {
/* Free the pcell buffer. */
free(pcells);
if (e->verbose)
message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
clocks_getunit());
#else
error("SWIFT was not compiled with MPI support.");
#endif
......@@ -591,6 +607,7 @@ int engine_exchange_strays(struct engine *e, int offset, int *ind, int N) {
MPI_Status status;
struct proxy *p;
struct space *s = e->s;
ticks tic = getticks();
/* Re-set the proxies. */
for (k = 0; k < e->nr_proxies; k++) e->proxies[k].nr_parts_out = 0;
......@@ -635,7 +652,7 @@ int engine_exchange_strays(struct engine *e, int offset, int *ind, int N) {
enough space to accommodate them. */
int count_in = 0;
for (k = 0; k < e->nr_proxies; k++) count_in += e->proxies[k].nr_parts_in;
message("sent out %i particles, got %i back.", N, count_in);
if (e->verbose) message("sent out %i particles, got %i back.", N, count_in);
if (offset + count_in > s->size_parts) {
s->size_parts = (offset + count_in) * 1.05;
struct part *parts_new = NULL;
......@@ -704,6 +721,10 @@ int engine_exchange_strays(struct engine *e, int offset, int *ind, int N) {
MPI_SUCCESS)
error("MPI_Waitall on sends failed.");
if (e->verbose)
message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
clocks_getunit());
/* Return the number of harvested parts. */
return count;
......@@ -730,6 +751,7 @@ void engine_maketasks(struct engine *e) {
int *cdim = s->cdim;
struct task *t, *t2;
struct cell *ci, *cj;
ticks tic = getticks();
/* Re-set the scheduler. */
scheduler_reset(sched, s->tot_cells * engine_maxtaskspercell);
......@@ -984,6 +1006,10 @@ void engine_maketasks(struct engine *e) {
/* Set the tasks age. */
e->tasks_age = 0;
if (e->verbose)
message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
clocks_getunit());
}
/**
......@@ -999,7 +1025,7 @@ int engine_marktasks(struct engine *e) {
struct task *t, *tasks = s->tasks;
float ti_end = e->ti_current;
struct cell *ci, *cj;
// ticks tic = getticks();
ticks tic = getticks();
/* Much less to do here if we're on a fixed time-step. */
if ((e->policy & engine_policy_fixdt) == engine_policy_fixdt) {
......@@ -1116,8 +1142,9 @@ int engine_marktasks(struct engine *e) {
}
}
// message( "took %.3f %s." , clocks_from_ticks(getticks() - tic),
// clocks_getunit());
if (e->verbose)
message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
clocks_getunit());
/* All is well... */
return 0;
......@@ -1163,38 +1190,33 @@ void engine_print(struct engine *e) {
*/
void engine_rebuild(struct engine *e) {
ticks tic = getticks();
/* Clear the forcerebuild flag, whatever it was. */
e->forcerebuild = 0;
/* Re-build the space. */
// tic = getticks();
space_rebuild(e->s, 0.0, e->nodeID == 0);
// message( "space_rebuild took %.3f %s." ,
// clocks_from_ticks(getticks() - tic), clocks_getunit());
space_rebuild(e->s, 0.0, e->verbose);
/* If in parallel, exchange the cell structure. */
#ifdef WITH_MPI
// tic = getticks();
engine_exchange_cells(e);
// message( "engine_exchange_cells took %.3f %s." ,
// clocks_from_ticks(getticks() - tic), clocks_getunit());
#endif
/* Re-build the tasks. */
// tic = getticks();
engine_maketasks(e);
// message( "engine_maketasks took %.3f %s." ,
// clocks_from_ticks(getticks() - tic), clocks_getunit());
/* Run through the tasks and mark as skip or not. */
// tic = getticks();
if (engine_marktasks(e))
error("engine_marktasks failed after space_rebuild.");
// message( "engine_marktasks took %.3f %s." ,
// clocks_from_ticks(getticks() - tic), clocks_getunit());
/* Print the status of the system */
engine_print(e);
if (e->verbose)
message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
clocks_getunit());
}
/**
......@@ -1207,45 +1229,37 @@ void engine_prepare(struct engine *e) {
int rebuild;
TIMER_TIC
TIMER_TIC;
/* Run through the tasks and mark as skip or not. */
// tic = getticks();
rebuild = (e->forcerebuild || engine_marktasks(e));
// message( "space_marktasks took %.3f %s." ,
// clocks_from_ticks(getticks() - tic), clocks_getunit());
/* Collect the values of rebuild from all nodes. */
#ifdef WITH_MPI
// tic = getticks();
int buff;
if (MPI_Allreduce(&rebuild, &buff, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD) !=
MPI_SUCCESS)
error("Failed to aggregate the rebuild flag across nodes.");
rebuild = buff;
// message( "rebuild allreduce took %.3f %s." ,
// clocks_from_ticks(getticks() - tic), clocks_getunit());
#endif
e->tic_step = getticks();
/* Did this not go through? */
if (rebuild) {
// tic = getticks();
engine_rebuild(e);
// message( "engine_rebuild took %.3f %s." ,
// clocks_from_ticks(getticks() - tic), clocks_getunit());
}
/* Re-rank the tasks every now and then. */
if (e->tasks_age % engine_tasksreweight == 1) {
// tic = getticks();
scheduler_reweight(&e->sched);
// message( "scheduler_reweight took %.3f %s." ,
// clocks_from_ticks(getticks() -tic), clocks_getunit());
}
e->tasks_age += 1;
TIMER_TOC(timer_prepare);
if (e->verbose)
message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
clocks_getunit());
}
/**
......@@ -1653,6 +1667,7 @@ void engine_makeproxies(struct engine *e) {
struct space *s = e->s;
struct cell *cells = s->cells;
struct proxy *proxies = e->proxies;
ticks tic = getticks();
/* Prepare the proxies and the proxy index. */
if (e->proxy_ind == NULL)
......@@ -1735,6 +1750,9 @@ void engine_makeproxies(struct engine *e) {
}
}
if (e->verbose)
message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
clocks_getunit());
#else
error("SWIFT was not compiled with MPI support.");
#endif
......@@ -1817,24 +1835,55 @@ static bool hyperthreads_present(void) {
void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
int nr_queues, int nr_nodes, int nodeID, int policy,
float timeBegin, float timeEnd, float dt_min, float dt_max) {
float timeBegin, float timeEnd, float dt_min, float dt_max,
int verbose) {
/* Store the values. */
e->s = s;
e->nr_threads = nr_threads;
e->policy = policy;
e->step = 0;
e->nullstep = 0;
e->nr_nodes = nr_nodes;
e->nodeID = nodeID;
e->proxy_ind = NULL;
e->nr_proxies = 0;
e->forcerebuild = 1;
e->forcerepart = REPART_NONE;
e->links = NULL;
e->nr_links = 0;
e->timeBegin = timeBegin;
e->timeEnd = timeEnd;
e->timeOld = timeBegin;
e->time = timeBegin;
e->ti_old = 0;
e->ti_current = 0;
e->timeStep = 0.;
e->dt_min = dt_min;
e->dt_max = dt_max;
e->file_stats = NULL;
e->verbose = verbose;
e->wallclock_time = 0.f;
engine_rank = nodeID;
/* Make the space link back to the engine. */
s->e = e;
int i, k;
#if defined(HAVE_SETAFFINITY)
int nr_cores = sysconf(_SC_NPROCESSORS_ONLN);
int j, cpuid[nr_cores];
const int nr_cores = sysconf(_SC_NPROCESSORS_ONLN);
int cpuid[nr_cores];
cpu_set_t cpuset;
if ((policy & engine_policy_cputight) == engine_policy_cputight) {
for (k = 0; k < nr_cores; k++) cpuid[k] = k;
for (int k = 0; k < nr_cores; k++) cpuid[k] = k;
} else {
/* Get next highest power of 2. */
int maxint = 1;
while (maxint < nr_cores) maxint *= 2;
cpuid[0] = 0;
k = 1;
for (i = 1; i < maxint; i *= 2)
for (j = maxint / i / 2; j < maxint; j += maxint / i)
int k = 1;
for (int i = 1; i < maxint; i *= 2)
for (int j = maxint / i / 2; j < maxint; j += maxint / i)
if (j < nr_cores && j != 0) cpuid[k++] = j;
#if defined(HAVE_LIBNUMA) && defined(_GNU_SOURCE)
......@@ -1842,22 +1891,24 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
if (numa_available() >= 0) {
if (nodeID == 0) message("prefer NUMA-local CPUs");
int home = numa_node_of_cpu(sched_getcpu()), half = nr_cores / 2;
bool done = false, swap_hyperthreads = hyperthreads_present();
const int home = numa_node_of_cpu(sched_getcpu());
const int half = nr_cores / 2;
const int swap_hyperthreads = hyperthreads_present();
bool done = false;
if (swap_hyperthreads && nodeID == 0)
message("prefer physical cores to hyperthreads");
while (!done) {
done = true;
for (i = 1; i < nr_cores; i++) {
int node_a = numa_node_of_cpu(cpuid[i - 1]);
int node_b = numa_node_of_cpu(cpuid[i]);
for (int i = 1; i < nr_cores; i++) {
const int node_a = numa_node_of_cpu(cpuid[i - 1]);
const int node_b = numa_node_of_cpu(cpuid[i]);
/* Avoid using local hyperthreads over unused remote physical cores.
* Assume two hyperthreads, and that cpuid >= half partitions them.
*/
int thread_a = swap_hyperthreads && cpuid[i - 1] >= half;
int thread_b = swap_hyperthreads && cpuid[i] >= half;
const int thread_a = swap_hyperthreads && cpuid[i - 1] >= half;
const int thread_b = swap_hyperthreads && cpuid[i] >= half;
bool swap = thread_a > thread_b;
if (thread_a == thread_b)
......@@ -1881,42 +1932,12 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
#else
printf("%s engine_init: cpu map is [ ", clocks_get_timeofday());
#endif
for (i = 0; i < nr_cores; i++) printf("%i ", cpuid[i]);
for (int i = 0; i < nr_cores; i++) printf("%i ", cpuid[i]);
printf("].\n");
}
}
#endif
/* Store the values. */
e->s = s;
e->nr_threads = nr_threads;
e->policy = policy;
e->step = 0;
e->nullstep = 0;
e->nr_nodes = nr_nodes;
e->nodeID = nodeID;
e->proxy_ind = NULL;
e->nr_proxies = 0;
e->forcerebuild = 1;
e->forcerepart = REPART_NONE;
e->links = NULL;
e->nr_links = 0;
e->timeBegin = timeBegin;
e->timeEnd = timeEnd;
e->timeOld = timeBegin;
e->time = timeBegin;
e->ti_old = 0;
e->ti_current = 0;
e->timeStep = 0.;
e->dt_min = dt_min;
e->dt_max = dt_max;
e->file_stats = NULL;
e->wallclock_time = 0.f;
engine_rank = nodeID;
/* Make the space link back to the engine. */
s->e = e;
/* Are we doing stuff in parallel? */
if (nr_nodes > 1) {
#ifndef WITH_MPI
......@@ -2023,7 +2044,7 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
s->nr_queues = nr_queues;
/* Create the sorting tasks. */
for (i = 0; i < e->nr_threads; i++)
for (int i = 0; i < e->nr_threads; i++)
scheduler_addtask(&e->sched, task_type_psort, task_subtype_none, i, 0, NULL,
NULL, 0);
......@@ -2033,7 +2054,7 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
if ((e->runners =
(struct runner *)malloc(sizeof(struct runner) * nr_threads)) == NULL)
error("Failed to allocate threads array.");
for (k = 0; k < nr_threads; k++) {
for (int k = 0; k < nr_threads; k++) {
e->runners[k].id = k;
e->runners[k].e = e;
e->barrier_running += 1;
......
......@@ -158,6 +158,9 @@ struct engine {
struct link *links;
int nr_links, size_links;
/* Are we talkative? */
int verbose;
#ifdef WITH_MPI
/* MPI data type for the particle transfers */
MPI_Datatype part_mpi_type;
......@@ -169,7 +172,8 @@ struct engine {
void engine_barrier(struct engine *e, int tid);
void engine_init(struct engine *e, struct space *s, float dt, int nr_threads,
int nr_queues, int nr_nodes, int nodeID, int policy,
float timeBegin, float timeEnd, float dt_min, float dt_max);
float timeBegin, float timeEnd, float dt_min, float dt_max,
int verbose);
void engine_launch(struct engine *e, int nr_runners, unsigned int mask,
unsigned int submask);
void engine_prepare(struct engine *e);
......
......@@ -1061,7 +1061,7 @@ void *runner_main(void *data) {
space_do_parts_sort();
break;
case task_type_split_cell:
space_split(e->s, t->ci);
space_do_split(e->s, t->ci);
break;
case task_type_rewait:
scheduler_do_rewait((struct task *)t->ci, (struct task *)t->cj,
......
......@@ -161,7 +161,7 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
float h_max = s->cell_min / kernel_gamma / space_stretch, dmin;
int i, j, k, cdim[3], nr_parts = s->nr_parts;
struct cell *restrict c;
// ticks tic;
ticks tic = getticks();
/* Run through the parts and get the current h_max. */
// tic = getticks();
......@@ -291,6 +291,10 @@ void space_regrid(struct space *s, double cell_max, int verbose) {
}
s->maxdepth = 0;
}
if (verbose)
message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
clocks_getunit());
}
/**
......@@ -309,7 +313,7 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
struct part *restrict p;
int *ind;
double ih[3], dim[3];
// ticks tic;
ticks tic = getticks();
/* Be verbose about this. */
// message( "re)building space..." ); fflush(stdout);
......@@ -394,10 +398,7 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
#endif
/* Sort the parts according to their cells. */
// tic = getticks();
space_parts_sort(s, ind, nr_parts, 0, s->nr_cells - 1);
// message( "parts_sort took %.3f %s." ,
// clocks_from_ticks(getticks() - tic), clocks_getunit());
space_parts_sort(s, ind, nr_parts, 0, s->nr_cells - 1, verbose);
/* Re-link the gparts. */
for (k = 0; k < nr_parts; k++)
......@@ -437,10 +438,7 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {
/* TODO: Here we should exchange the gparts as well! */
/* Sort the parts according to their cells. */
// tic = getticks();
gparts_sort(s->gparts, ind, nr_gparts, 0, s->nr_cells - 1);
// message( "gparts_sort took %.3f %s." ,
// clocks_from_ticks(getticks() - tic), clocks_getunit());
space_gparts_sort(s->gparts, ind, nr_gparts, 0, s->nr_cells - 1);
/* Re-link the parts. */
for (k = 0; k < nr_gparts; k++)
......@@ -468,15 +466,32 @@ void space_rebuild(struct space *s, double cell_max, int verbose) {