Commit 45442447 authored by Peter W. Draper's avatar Peter W. Draper Committed by Matthieu Schaller
Browse files

Log rank CPU times and memory use to assess the balance.

parent 9ca58f42
......@@ -407,6 +407,12 @@ if test "$enable_memuse_reports" = "yes"; then
AC_DEFINE([SWIFT_MEMUSE_REPORTS],1,[Enable memory usage reports])
fi
# The system memory report depends on the presence of /proc/self/statm, i.e.
# a linux OS, check for that.
if test -f "/proc/self/statm"; then
AC_DEFINE([SWIFT_MEMUSE_STATM],1,[Have /proc/self/statm capability])
fi
# Check if we want to make the dumper thread active.
AC_ARG_ENABLE([dumper],
[AS_HELP_STRING([--enable-dumper],
......
......@@ -276,22 +276,24 @@ double clocks_get_hours_since_start(void) {
}
/**
* @brief return the cpu time used.
* @brief return the cpu times used.
*
* Uses the times(2) function to access the user cpu times and returns the sum
* of these for the process tree, i.e. current process plus "waited-for"
* children. This may be pthread implementation specific as to what that
* exactly means. Note we do not include the system time as that includes
* spin times and we don't want to give credit for that.
* Uses the times(2) function to access the user and system cpu times and
* returns the sum of these for the process tree, i.e. current process plus
* "waited-for" children. This may be pthread implementation specific as to
* what that exactly means.
*
* @result cpu time used in sysconf(_SC_CLK_TCK) ticks, usually 100/s not our
* usual ticks.
* Note cpu times are reported in sysconf(_SC_CLK_TCK) ticks, usually 100/s
* not our usual ticks.
*
* @param usertime the user time.
* @param systime the system time.
*/
double clocks_get_cputime_used(void) {
void clocks_get_cputimes_used(double *usertime, double *systime) {
struct tms tmstic;
times(&tmstic);
return (double)(tmstic.tms_utime + tmstic.tms_cutime);
*usertime = (tmstic.tms_utime + tmstic.tms_cutime);
*systime = (tmstic.tms_stime + tmstic.tms_cstime);
}
/**
......
......@@ -52,7 +52,7 @@ double clocks_diff_ticks(ticks tic, ticks toc);
const char *clocks_get_timesincestart(void);
double clocks_get_hours_since_start(void);
double clocks_get_cputime_used(void);
void clocks_get_cputimes_used(double *usertime, double *systime);
int clocks_random_seed(void);
#endif /* SWIFT_CLOCKS_H */
......@@ -256,79 +256,169 @@ void engine_repartition_trigger(struct engine *e) {
#ifdef WITH_MPI
const ticks tic = getticks();
static int opened = 0;
/* Do nothing if there have not been enough steps since the last repartition
* as we don't want to repeat this too often or immediately after a
* repartition step. Also nothing to do when requested. */
if (e->step - e->last_repartition >= 2 &&
e->reparttype->type != REPART_NONE) {
* repartition step. We attempt all this even when we are not repartitioning
* as the balance logs can still be interesting. */
if (e->step - e->last_repartition >= 2) {
/* If we have fixed costs available and this is step 2 or we are forcing
* repartitioning then we do a fixed costs one now. */
if (e->reparttype->trigger > 1 ||
(e->step == 2 && e->reparttype->use_fixed_costs)) {
* repartitioning then we do a forced fixed costs repartition regardless. */
int forced = 0;
if (e->reparttype->type != REPART_NONE) {
if (e->reparttype->trigger > 1 ||
(e->step == 2 && e->reparttype->use_fixed_costs)) {
if (e->reparttype->trigger > 1) {
if ((e->step % (int)e->reparttype->trigger) == 0) e->forcerepart = 1;
} else {
e->forcerepart = 1;
}
e->reparttype->use_ticks = 0;
forced = 1;
}
}
if (e->reparttype->trigger > 1) {
if ((e->step % (int)e->reparttype->trigger) == 0) e->forcerepart = 1;
/* We only check the CPU loads when we have processed a significant number
* of all particles as we require all tasks to have timings or are
* interested in the various balances logs. */
if ((e->updates > 1 &&
e->updates >= e->total_nr_parts * e->reparttype->minfrac) ||
(e->g_updates > 1 &&
e->g_updates >= e->total_nr_gparts * e->reparttype->minfrac)) {
/* Are we using the task tick timings or fixed costs? */
if (e->reparttype->use_fixed_costs > 1) {
e->reparttype->use_ticks = 0;
} else {
e->forcerepart = 1;
e->reparttype->use_ticks = 1;
}
e->reparttype->use_ticks = 0;
} else {
/* Get the resident size of the process for the memory logs. */
long size, resident, shared, text, library, data, dirty;
memuse_use(&size, &resident, &shared, &text, &data, &library, &dirty);
/* Gather it together with the CPU times used by the tasks in the last
* step. */
double timemem[3] = {e->usertime_last_step, e->systime_last_step,
(double)resident};
double timemems[e->nr_nodes * 3];
MPI_Gather(&timemem, 3, MPI_DOUBLE, timemems, 3, MPI_DOUBLE, 0,
MPI_COMM_WORLD);
if (e->nodeID == 0) {
/* Get the range and mean of the two CPU times and memory. */
double umintime = timemems[0];
double umaxtime = timemems[0];
double smintime = timemems[1];
double smaxtime = timemems[1];
double minmem = timemems[2];
double maxmem = timemems[2];
double tmintime = umintime + smintime;
double tmaxtime = umaxtime + smaxtime;
double usum = timemems[0];
double ssum = timemems[1];
double tsum = usum + ssum;
double msum = timemems[2];
for (int k = 3; k < e->nr_nodes * 3; k += 3) {
if (timemems[k] > umaxtime) umaxtime = timemems[k];
if (timemems[k] < umintime) umintime = timemems[k];
if (timemems[k + 1] > smaxtime) smaxtime = timemems[k + 1];
if (timemems[k + 1] < smintime) smintime = timemems[k + 1];
double total = timemems[k] + timemems[k + 1];
if (total > tmaxtime) tmaxtime = total;
if (total < tmintime) tmintime = total;
usum += timemems[k];
ssum += timemems[k + 1];
tsum += total;
if (timemems[k + 2] > maxmem) maxmem = timemems[k + 2];
if (timemems[k + 2] < minmem) minmem = timemems[k + 2];
msum += timemems[k + 2];
}
double umean = usum / (double)e->nr_nodes;
double smean = ssum / (double)e->nr_nodes;
double tmean = tsum / (double)e->nr_nodes;
double mmean = msum / (double)e->nr_nodes;
/* Are we out of balance and need to repartition? */
/* ---------------------------------------------- */
double abs_trigger = fabs(e->reparttype->trigger);
double balance = (umaxtime - umintime) / umean;
if (e->reparttype->type != REPART_NONE) {
/* When forced we don't care about the balance. */
if (!forced) {
if (balance > abs_trigger) {
if (e->verbose)
message("trigger fraction %.3f > %.3f will repartition",
balance, abs_trigger);
e->forcerepart = 1;
} else {
if (e->verbose && e->reparttype->type != REPART_NONE)
message("trigger fraction %.3f =< %.3f will not repartition",
balance, abs_trigger);
}
}
/* It is only worth checking the CPU loads when we have processed a
* significant number of all particles as we require all tasks to have
* timings. */
if ((e->updates > 1 &&
e->updates >= e->total_nr_parts * e->reparttype->minfrac) ||
(e->g_updates > 1 &&
e->g_updates >= e->total_nr_gparts * e->reparttype->minfrac)) {
/* Should we are use the task timings or fixed costs. */
if (e->reparttype->use_fixed_costs > 1) {
e->reparttype->use_ticks = 0;
} else {
e->reparttype->use_ticks = 1;
/* Not repartitioning, would that have been done otherwise? */
if (e->verbose)
message("trigger fraction %.3f > %.3f would have repartitioned",
balance, abs_trigger);
}
/* Get CPU time used since the last call to this function. */
double elapsed_cputime =
clocks_get_cputime_used() - e->cputime_last_step;
/* Gather the elapsed CPU times from all ranks for the last step. */
double elapsed_cputimes[e->nr_nodes];
MPI_Gather(&elapsed_cputime, 1, MPI_DOUBLE, elapsed_cputimes, 1,
MPI_DOUBLE, 0, MPI_COMM_WORLD);
if (e->nodeID == 0) {
/* Get the range and mean of cputimes. */
double mintime = elapsed_cputimes[0];
double maxtime = elapsed_cputimes[0];
double sum = elapsed_cputimes[0];
for (int k = 1; k < e->nr_nodes; k++) {
if (elapsed_cputimes[k] > maxtime) maxtime = elapsed_cputimes[k];
if (elapsed_cputimes[k] < mintime) mintime = elapsed_cputimes[k];
sum += elapsed_cputimes[k];
}
double mean = sum / (double)e->nr_nodes;
/* Are we out of balance? */
double abs_trigger = fabs(e->reparttype->trigger);
if (((maxtime - mintime) / mean) > abs_trigger) {
if (e->verbose)
message("trigger fraction %.3f > %.3f will repartition",
(maxtime - mintime) / mean, abs_trigger);
e->forcerepart = 1;
} else {
if (e->verbose)
message("trigger fraction %.3f =< %.3f will not repartition",
(maxtime - mintime) / mean, abs_trigger);
}
/* Keep logs of all CPU times and resident memory size for debugging
* load issues. */
FILE *timelog = NULL;
FILE *memlog = NULL;
if (!opened) {
timelog = fopen("rank_cpu_balance.log", "w");
fprintf(timelog, "# step rank user sys sum\n");
memlog = fopen("rank_memory_balance.log", "w");
fprintf(memlog, "# step rank resident\n");
opened = 1;
} else {
timelog = fopen("rank_cpu_balance.log", "a");
memlog = fopen("rank_memory_balance.log", "a");
}
for (int k = 0; k < e->nr_nodes * 3; k += 3) {
fprintf(timelog, "%d %d %f %f %f\n", e->step, k / 3, timemems[k],
timemems[k + 1], timemems[k] + timemems[k + 1]);
fprintf(memlog, "%d %d %ld\n", e->step, k / 3, (long)timemems[k + 2]);
}
fprintf(timelog, "# %d mean times: %f %f %f\n", e->step, umean, smean,
tmean);
if (abs_trigger > 1.f) abs_trigger = 0.f; /* Not relevant. */
fprintf(timelog,
"# %d balance: %f, expected: %f (sys: %f, total: %f)\n",
e->step, balance, abs_trigger, (smaxtime - smintime) / smean,
(tmaxtime - tmintime) / tmean);
fclose(timelog);
fprintf(memlog, "# %d mean resident memory: %f, balance: %f\n", e->step,
mmean, (maxmem - minmem) / mmean);
fclose(memlog);
}
/* All nodes do this together. */
/* All nodes do this together, so send to other ranks. */
MPI_Bcast(&e->forcerepart, 1, MPI_INT, 0, MPI_COMM_WORLD);
}
......@@ -336,10 +426,6 @@ void engine_repartition_trigger(struct engine *e) {
if (e->forcerepart) e->last_repartition = e->step;
}
/* We always reset CPU time for next check, unless it will not be used. */
if (e->reparttype->type != REPART_NONE)
e->cputime_last_step = clocks_get_cputime_used();
if (e->verbose)
message("took %.3f %s", clocks_from_ticks(getticks() - tic),
clocks_getunit());
......@@ -2343,11 +2429,29 @@ void engine_step(struct engine *e) {
gravity_exact_force_compute(e->s, e);
#endif
/* Get current CPU times.*/
#ifdef WITH_MPI
double start_usertime = 0.0;
double start_systime = 0.0;
if (e->reparttype->type != REPART_NONE) {
clocks_get_cputimes_used(&start_usertime, &start_systime);
}
#endif
/* Start all the tasks. */
TIMER_TIC;
engine_launch(e, "tasks");
TIMER_TOC(timer_runners);
/* Now record the CPU times used by the tasks. */
#ifdef WITH_MPI
double end_usertime = 0.0;
double end_systime = 0.0;
clocks_get_cputimes_used(&end_usertime, &end_systime);
e->usertime_last_step = end_usertime - start_usertime;
e->systime_last_step = end_systime - start_systime;
#endif
/* Since the time-steps may have changed because of the limiter's
* action, we need to communicate the new time-step sizes */
if ((e->policy & engine_policy_timestep_sync) ||
......@@ -3575,7 +3679,8 @@ void engine_init(struct engine *e, struct space *s, struct swift_params *params,
e->parameter_file = params;
e->stf_this_timestep = 0;
#ifdef WITH_MPI
e->cputime_last_step = 0;
e->usertime_last_step = 0.0;
e->systime_last_step = 0.0;
e->last_repartition = 0;
#endif
e->total_nr_cells = 0;
......
......@@ -360,8 +360,9 @@ struct engine {
ticks tic_step, toc_step;
#ifdef WITH_MPI
/* CPU time of the last step. */
double cputime_last_step;
/* CPU times that the tasks used in the last step. */
double usertime_last_step;
double systime_last_step;
/* Step of last repartition. */
int last_repartition;
......
......@@ -34,6 +34,11 @@
#include <sys/types.h>
#include <unistd.h>
#ifndef SWIFT_MEMUSE_STATM
#include <sys/resource.h>
#include <sys/time.h>
#endif
/* Local defines. */
#include "memuse.h"
......@@ -408,7 +413,8 @@ void memuse_log_dump_error(int rank) {
/**
* @brief parse the process /proc/self/statm file to get the process
* memory use (in KB). Top field in ().
* memory use (in KB). Top field in (). If this file is not
* available only the resident field will be returned.
*
* @param size total virtual memory (VIRT/VmSize)
* @param resident resident non-swapped memory (RES/VmRSS)
......@@ -423,6 +429,8 @@ void memuse_log_dump_error(int rank) {
void memuse_use(long *size, long *resident, long *shared, long *text,
long *data, long *library, long *dirty) {
#ifdef SWIFT_MEMUSE_STATM
/* Open the file. */
FILE *file = fopen("/proc/self/statm", "r");
if (file != NULL) {
......@@ -432,29 +440,38 @@ void memuse_use(long *size, long *resident, long *shared, long *text,
if (nscan == 7) {
/* Convert pages into bytes. Usually 4096, but could be 512 on some
* systems so take care in conversion to KB. */
long sz = sysconf(_SC_PAGESIZE);
*size *= sz;
*resident *= sz;
*shared *= sz;
*text *= sz;
*library *= sz;
*data *= sz;
*dirty *= sz;
*size /= 1024;
*resident /= 1024;
*shared /= 1024;
*text /= 1024;
*library /= 1024;
*data /= 1024;
*dirty /= 1024;
uint64_t sz = sysconf(_SC_PAGESIZE);
*size = (*size) * sz / 1024;
*resident = (*resident) * sz / 1024;
*shared = (*shared) * sz / 1024;
*text = (*text) * sz / 1024;
*library = (*library) * sz / 1024;
*data = (*data) * sz / 1024;
*dirty = (*dirty) * sz / 1024;
} else {
error("Failed to read sufficient fields from /proc/self/statm");
}
fclose(file);
} else {
error("Failed to open /proc/self/statm");
}
#else
/* Not a Linux compatible OS, try to use very limited POSIX call instead.
* Linux only claims to support ru_maxrss, and POSIX only ru_utime and
* ru_stime, so this may still fail. */
struct rusage usage;
getrusage(RUSAGE_SELF, &usage);
*size = 0;
*resident = usage.ru_maxrss;
*shared = usage.ru_ixrss;
*text = 0;
*library = 0;
*data = usage.ru_isrss;
*dirty = 0;
#endif
}
/**
......@@ -469,13 +486,8 @@ void memuse_use(long *size, long *resident, long *shared, long *text,
*/
const char *memuse_process(int inmb) {
static char buffer[256];
long size;
long resident;
long shared;
long text;
long library;
long data;
long dirty;
long size, resident, shared, text, library, data, dirty;
memuse_use(&size, &resident, &shared, &text, &data, &library, &dirty);
if (inmb) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment