Commit 3288efec authored by Matthieu Schaller's avatar Matthieu Schaller
Browse files

Merge branch 'repartition-cputime-update' into 'master'

Log rank CPU times and memory use to assess the balance.

See merge request !1017
parents 9ca58f42 45442447
......@@ -407,6 +407,12 @@ if test "$enable_memuse_reports" = "yes"; then
AC_DEFINE([SWIFT_MEMUSE_REPORTS],1,[Enable memory usage reports])
fi
# The system memory report depends on the presence of /proc/self/statm, i.e.
# a linux OS, check for that.
if test -f "/proc/self/statm"; then
AC_DEFINE([SWIFT_MEMUSE_STATM],1,[Have /proc/self/statm capability])
fi
# Check if we want to make the dumper thread active.
AC_ARG_ENABLE([dumper],
[AS_HELP_STRING([--enable-dumper],
......
......@@ -276,22 +276,24 @@ double clocks_get_hours_since_start(void) {
}
/**
 * @brief return the cpu times used.
 *
 * Uses the times(2) function to access the user and system cpu times and
 * returns the sum of these for the process tree, i.e. current process plus
 * "waited-for" children. This may be pthread implementation specific as to
 * what that exactly means.
 *
 * Note cpu times are reported in sysconf(_SC_CLK_TCK) ticks, usually 100/s
 * not our usual ticks.
 *
 * @param usertime the user time.
 * @param systime the system time.
 */
void clocks_get_cputimes_used(double *usertime, double *systime) {
  struct tms tmstic;
  times(&tmstic);
  /* Sum self plus "waited-for" children for each of user and system time. */
  *usertime = (tmstic.tms_utime + tmstic.tms_cutime);
  *systime = (tmstic.tms_stime + tmstic.tms_cstime);
}
/**
......
......@@ -52,7 +52,7 @@ double clocks_diff_ticks(ticks tic, ticks toc);
const char *clocks_get_timesincestart(void);
double clocks_get_hours_since_start(void);
double clocks_get_cputime_used(void);
void clocks_get_cputimes_used(double *usertime, double *systime);
int clocks_random_seed(void);
#endif /* SWIFT_CLOCKS_H */
......@@ -256,79 +256,169 @@ void engine_repartition_trigger(struct engine *e) {
#ifdef WITH_MPI
const ticks tic = getticks();
static int opened = 0;
/* Do nothing if there have not been enough steps since the last repartition
* as we don't want to repeat this too often or immediately after a
* repartition step. Also nothing to do when requested. */
if (e->step - e->last_repartition >= 2 &&
e->reparttype->type != REPART_NONE) {
* repartition step. We attempt all this even when we are not repartitioning
* as the balance logs can still be interesting. */
if (e->step - e->last_repartition >= 2) {
/* If we have fixed costs available and this is step 2 or we are forcing
* repartitioning then we do a fixed costs one now. */
if (e->reparttype->trigger > 1 ||
(e->step == 2 && e->reparttype->use_fixed_costs)) {
* repartitioning then we do a forced fixed costs repartition regardless. */
int forced = 0;
if (e->reparttype->type != REPART_NONE) {
if (e->reparttype->trigger > 1 ||
(e->step == 2 && e->reparttype->use_fixed_costs)) {
if (e->reparttype->trigger > 1) {
if ((e->step % (int)e->reparttype->trigger) == 0) e->forcerepart = 1;
} else {
e->forcerepart = 1;
}
e->reparttype->use_ticks = 0;
forced = 1;
}
}
if (e->reparttype->trigger > 1) {
if ((e->step % (int)e->reparttype->trigger) == 0) e->forcerepart = 1;
/* We only check the CPU loads when we have processed a significant number
* of all particles, as we require all tasks to have timings, or when we are
* interested in the various balance logs. */
if ((e->updates > 1 &&
e->updates >= e->total_nr_parts * e->reparttype->minfrac) ||
(e->g_updates > 1 &&
e->g_updates >= e->total_nr_gparts * e->reparttype->minfrac)) {
/* Are we using the task tick timings or fixed costs? */
if (e->reparttype->use_fixed_costs > 1) {
e->reparttype->use_ticks = 0;
} else {
e->forcerepart = 1;
e->reparttype->use_ticks = 1;
}
e->reparttype->use_ticks = 0;
} else {
/* Get the resident size of the process for the memory logs. */
long size, resident, shared, text, library, data, dirty;
memuse_use(&size, &resident, &shared, &text, &data, &library, &dirty);
/* Gather it together with the CPU times used by the tasks in the last
* step. */
double timemem[3] = {e->usertime_last_step, e->systime_last_step,
(double)resident};
double timemems[e->nr_nodes * 3];
MPI_Gather(&timemem, 3, MPI_DOUBLE, timemems, 3, MPI_DOUBLE, 0,
MPI_COMM_WORLD);
if (e->nodeID == 0) {
/* Get the range and mean of the two CPU times and memory. */
double umintime = timemems[0];
double umaxtime = timemems[0];
double smintime = timemems[1];
double smaxtime = timemems[1];
double minmem = timemems[2];
double maxmem = timemems[2];
double tmintime = umintime + smintime;
double tmaxtime = umaxtime + smaxtime;
double usum = timemems[0];
double ssum = timemems[1];
double tsum = usum + ssum;
double msum = timemems[2];
for (int k = 3; k < e->nr_nodes * 3; k += 3) {
if (timemems[k] > umaxtime) umaxtime = timemems[k];
if (timemems[k] < umintime) umintime = timemems[k];
if (timemems[k + 1] > smaxtime) smaxtime = timemems[k + 1];
if (timemems[k + 1] < smintime) smintime = timemems[k + 1];
double total = timemems[k] + timemems[k + 1];
if (total > tmaxtime) tmaxtime = total;
if (total < tmintime) tmintime = total;
usum += timemems[k];
ssum += timemems[k + 1];
tsum += total;
if (timemems[k + 2] > maxmem) maxmem = timemems[k + 2];
if (timemems[k + 2] < minmem) minmem = timemems[k + 2];
msum += timemems[k + 2];
}
double umean = usum / (double)e->nr_nodes;
double smean = ssum / (double)e->nr_nodes;
double tmean = tsum / (double)e->nr_nodes;
double mmean = msum / (double)e->nr_nodes;
/* Are we out of balance and need to repartition? */
/* ---------------------------------------------- */
double abs_trigger = fabs(e->reparttype->trigger);
double balance = (umaxtime - umintime) / umean;
if (e->reparttype->type != REPART_NONE) {
/* When forced we don't care about the balance. */
if (!forced) {
if (balance > abs_trigger) {
if (e->verbose)
message("trigger fraction %.3f > %.3f will repartition",
balance, abs_trigger);
e->forcerepart = 1;
} else {
if (e->verbose && e->reparttype->type != REPART_NONE)
message("trigger fraction %.3f =< %.3f will not repartition",
balance, abs_trigger);
}
}
/* It is only worth checking the CPU loads when we have processed a
* significant number of all particles as we require all tasks to have
* timings. */
if ((e->updates > 1 &&
e->updates >= e->total_nr_parts * e->reparttype->minfrac) ||
(e->g_updates > 1 &&
e->g_updates >= e->total_nr_gparts * e->reparttype->minfrac)) {
/* Should we use the task timings or fixed costs? */
if (e->reparttype->use_fixed_costs > 1) {
e->reparttype->use_ticks = 0;
} else {
e->reparttype->use_ticks = 1;
/* Not repartitioning, would that have been done otherwise? */
if (e->verbose)
message("trigger fraction %.3f > %.3f would have repartitioned",
balance, abs_trigger);
}
/* Get CPU time used since the last call to this function. */
double elapsed_cputime =
clocks_get_cputime_used() - e->cputime_last_step;
/* Gather the elapsed CPU times from all ranks for the last step. */
double elapsed_cputimes[e->nr_nodes];
MPI_Gather(&elapsed_cputime, 1, MPI_DOUBLE, elapsed_cputimes, 1,
MPI_DOUBLE, 0, MPI_COMM_WORLD);
if (e->nodeID == 0) {
/* Get the range and mean of cputimes. */
double mintime = elapsed_cputimes[0];
double maxtime = elapsed_cputimes[0];
double sum = elapsed_cputimes[0];
for (int k = 1; k < e->nr_nodes; k++) {
if (elapsed_cputimes[k] > maxtime) maxtime = elapsed_cputimes[k];
if (elapsed_cputimes[k] < mintime) mintime = elapsed_cputimes[k];
sum += elapsed_cputimes[k];
}
double mean = sum / (double)e->nr_nodes;
/* Are we out of balance? */
double abs_trigger = fabs(e->reparttype->trigger);
if (((maxtime - mintime) / mean) > abs_trigger) {
if (e->verbose)
message("trigger fraction %.3f > %.3f will repartition",
(maxtime - mintime) / mean, abs_trigger);
e->forcerepart = 1;
} else {
if (e->verbose)
message("trigger fraction %.3f =< %.3f will not repartition",
(maxtime - mintime) / mean, abs_trigger);
}
/* Keep logs of all CPU times and resident memory size for debugging
* load issues. */
FILE *timelog = NULL;
FILE *memlog = NULL;
if (!opened) {
timelog = fopen("rank_cpu_balance.log", "w");
fprintf(timelog, "# step rank user sys sum\n");
memlog = fopen("rank_memory_balance.log", "w");
fprintf(memlog, "# step rank resident\n");
opened = 1;
} else {
timelog = fopen("rank_cpu_balance.log", "a");
memlog = fopen("rank_memory_balance.log", "a");
}
for (int k = 0; k < e->nr_nodes * 3; k += 3) {
fprintf(timelog, "%d %d %f %f %f\n", e->step, k / 3, timemems[k],
timemems[k + 1], timemems[k] + timemems[k + 1]);
fprintf(memlog, "%d %d %ld\n", e->step, k / 3, (long)timemems[k + 2]);
}
fprintf(timelog, "# %d mean times: %f %f %f\n", e->step, umean, smean,
tmean);
if (abs_trigger > 1.f) abs_trigger = 0.f; /* Not relevant. */
fprintf(timelog,
"# %d balance: %f, expected: %f (sys: %f, total: %f)\n",
e->step, balance, abs_trigger, (smaxtime - smintime) / smean,
(tmaxtime - tmintime) / tmean);
fclose(timelog);
fprintf(memlog, "# %d mean resident memory: %f, balance: %f\n", e->step,
mmean, (maxmem - minmem) / mmean);
fclose(memlog);
}
/* All nodes do this together. */
/* All nodes do this together, so send to other ranks. */
MPI_Bcast(&e->forcerepart, 1, MPI_INT, 0, MPI_COMM_WORLD);
}
......@@ -336,10 +426,6 @@ void engine_repartition_trigger(struct engine *e) {
if (e->forcerepart) e->last_repartition = e->step;
}
/* We always reset CPU time for next check, unless it will not be used. */
if (e->reparttype->type != REPART_NONE)
e->cputime_last_step = clocks_get_cputime_used();
if (e->verbose)
message("took %.3f %s", clocks_from_ticks(getticks() - tic),
clocks_getunit());
......@@ -2343,11 +2429,29 @@ void engine_step(struct engine *e) {
gravity_exact_force_compute(e->s, e);
#endif
/* Get current CPU times.*/
#ifdef WITH_MPI
double start_usertime = 0.0;
double start_systime = 0.0;
if (e->reparttype->type != REPART_NONE) {
clocks_get_cputimes_used(&start_usertime, &start_systime);
}
#endif
/* Start all the tasks. */
TIMER_TIC;
engine_launch(e, "tasks");
TIMER_TOC(timer_runners);
/* Now record the CPU times used by the tasks. */
#ifdef WITH_MPI
double end_usertime = 0.0;
double end_systime = 0.0;
clocks_get_cputimes_used(&end_usertime, &end_systime);
e->usertime_last_step = end_usertime - start_usertime;
e->systime_last_step = end_systime - start_systime;
#endif
/* Since the time-steps may have changed because of the limiter's
* action, we need to communicate the new time-step sizes */
if ((e->policy & engine_policy_timestep_sync) ||
......@@ -3575,7 +3679,8 @@ void engine_init(struct engine *e, struct space *s, struct swift_params *params,
e->parameter_file = params;
e->stf_this_timestep = 0;
#ifdef WITH_MPI
e->cputime_last_step = 0;
e->usertime_last_step = 0.0;
e->systime_last_step = 0.0;
e->last_repartition = 0;
#endif
e->total_nr_cells = 0;
......
......@@ -360,8 +360,9 @@ struct engine {
ticks tic_step, toc_step;
#ifdef WITH_MPI
/* CPU time of the last step. */
double cputime_last_step;
/* CPU times that the tasks used in the last step. */
double usertime_last_step;
double systime_last_step;
/* Step of last repartition. */
int last_repartition;
......
......@@ -34,6 +34,11 @@
#include <sys/types.h>
#include <unistd.h>
#ifndef SWIFT_MEMUSE_STATM
#include <sys/resource.h>
#include <sys/time.h>
#endif
/* Local defines. */
#include "memuse.h"
......@@ -408,7 +413,8 @@ void memuse_log_dump_error(int rank) {
/**
* @brief parse the process /proc/self/statm file to get the process
* memory use (in KB). Top field in ().
* memory use (in KB). Top field in (). If this file is not
available, only the resident field will be returned.
*
* @param size total virtual memory (VIRT/VmSize)
* @param resident resident non-swapped memory (RES/VmRSS)
......@@ -423,6 +429,8 @@ void memuse_log_dump_error(int rank) {
void memuse_use(long *size, long *resident, long *shared, long *text,
long *data, long *library, long *dirty) {
#ifdef SWIFT_MEMUSE_STATM
/* Open the file. */
FILE *file = fopen("/proc/self/statm", "r");
if (file != NULL) {
......@@ -432,29 +440,38 @@ void memuse_use(long *size, long *resident, long *shared, long *text,
if (nscan == 7) {
/* Convert pages into bytes. Usually 4096, but could be 512 on some
* systems so take care in conversion to KB. */
long sz = sysconf(_SC_PAGESIZE);
*size *= sz;
*resident *= sz;
*shared *= sz;
*text *= sz;
*library *= sz;
*data *= sz;
*dirty *= sz;
*size /= 1024;
*resident /= 1024;
*shared /= 1024;
*text /= 1024;
*library /= 1024;
*data /= 1024;
*dirty /= 1024;
uint64_t sz = sysconf(_SC_PAGESIZE);
*size = (*size) * sz / 1024;
*resident = (*resident) * sz / 1024;
*shared = (*shared) * sz / 1024;
*text = (*text) * sz / 1024;
*library = (*library) * sz / 1024;
*data = (*data) * sz / 1024;
*dirty = (*dirty) * sz / 1024;
} else {
error("Failed to read sufficient fields from /proc/self/statm");
}
fclose(file);
} else {
error("Failed to open /proc/self/statm");
}
#else
/* Not a Linux-compatible OS, so try to use a very limited POSIX call
* instead. Linux only claims to support ru_maxrss, and POSIX only ru_utime
* and ru_stime, so this may still fail. */
struct rusage usage;
getrusage(RUSAGE_SELF, &usage);
*size = 0;
*resident = usage.ru_maxrss;
*shared = usage.ru_ixrss;
*text = 0;
*library = 0;
*data = usage.ru_isrss;
*dirty = 0;
#endif
}
/**
......@@ -469,13 +486,8 @@ void memuse_use(long *size, long *resident, long *shared, long *text,
*/
const char *memuse_process(int inmb) {
static char buffer[256];
long size;
long resident;
long shared;
long text;
long library;
long data;
long dirty;
long size, resident, shared, text, library, data, dirty;
memuse_use(&size, &resident, &shared, &text, &data, &library, &dirty);
if (inmb) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment