Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
SWIFT
SWIFTsim
Commits
45442447
Commit
45442447
authored
Feb 24, 2020
by
Peter W. Draper
Committed by
Matthieu Schaller
Feb 24, 2020
Browse files
Log rank CPU times and memory use to assess the balance.
parent
9ca58f42
Changes
6
Hide whitespace changes
Inline
Side-by-side
configure.ac
View file @
45442447
...
...
@@ -407,6 +407,12 @@ if test "$enable_memuse_reports" = "yes"; then
AC_DEFINE([SWIFT_MEMUSE_REPORTS],1,[Enable memory usage reports])
fi
# The system memory report depends on the presence of /proc/self/statm, i.e.
# a linux OS, check for that.
if test -f "/proc/self/statm"; then
AC_DEFINE([SWIFT_MEMUSE_STATM],1,[Have /proc/self/statm capability])
fi
# Check if we want to make the dumper thread active.
AC_ARG_ENABLE([dumper],
[AS_HELP_STRING([--enable-dumper],
...
...
src/clocks.c
View file @
45442447
...
...
@@ -276,22 +276,24 @@ double clocks_get_hours_since_start(void) {
}
/**
* @brief return the cpu time used.
* @brief return the cpu time
s
used.
*
* Uses the times(2) function to access the user cpu times and returns the sum
* of these for the process tree, i.e. current process plus "waited-for"
* children. This may be pthread implementation specific as to what that
* exactly means. Note we do not include the system time as that includes
* spin times and we don't want to give credit for that.
* Uses the times(2) function to access the user and system cpu times and
* returns the sum of these for the process tree, i.e. current process plus
* "waited-for" children. This may be pthread implementation specific as to
* what that exactly means.
*
* @result cpu time used in sysconf(_SC_CLK_TCK) ticks, usually 100/s not our
* usual ticks.
* Note cpu times are reported in sysconf(_SC_CLK_TCK) ticks, usually 100/s
* not our usual ticks.
*
* @param usertime the user time.
* @param systime the system time.
*/
double
clocks_get_cputime_used
(
void
)
{
void
clocks_get_cputimes_used
(
double
*
usertime
,
double
*
systime
)
{
struct
tms
tmstic
;
times
(
&
tmstic
);
return
(
double
)(
tmstic
.
tms_utime
+
tmstic
.
tms_cutime
);
*
usertime
=
(
tmstic
.
tms_utime
+
tmstic
.
tms_cutime
);
*
systime
=
(
tmstic
.
tms_stime
+
tmstic
.
tms_cstime
);
}
/**
...
...
src/clocks.h
View file @
45442447
...
...
@@ -52,7 +52,7 @@ double clocks_diff_ticks(ticks tic, ticks toc);
const
char
*
clocks_get_timesincestart
(
void
);
double
clocks_get_hours_since_start
(
void
);
double
clocks_get_cputime_used
(
void
);
void
clocks_get_cputime
s
_used
(
double
*
usertime
,
double
*
systime
);
int
clocks_random_seed
(
void
);
#endif
/* SWIFT_CLOCKS_H */
src/engine.c
View file @
45442447
...
...
@@ -256,79 +256,169 @@ void engine_repartition_trigger(struct engine *e) {
#ifdef WITH_MPI
const
ticks
tic
=
getticks
();
static
int
opened
=
0
;
/* Do nothing if there have not been enough steps since the last repartition
* as we don't want to repeat this too often or immediately after a
* repartition step.
Also nothing to do when requested. */
if
(
e
->
step
-
e
->
last_repartition
>=
2
&&
e
->
reparttype
->
type
!=
REPART_NONE
)
{
* repartition step.
We attempt all this even when we are not repartitioning
* as the balance logs can still be interesting. */
if
(
e
->
step
-
e
->
last_repartition
>=
2
)
{
/* If we have fixed costs available and this is step 2 or we are forcing
* repartitioning then we do a fixed costs one now. */
if
(
e
->
reparttype
->
trigger
>
1
||
(
e
->
step
==
2
&&
e
->
reparttype
->
use_fixed_costs
))
{
* repartitioning then we do a forced fixed costs repartition regardless. */
int
forced
=
0
;
if
(
e
->
reparttype
->
type
!=
REPART_NONE
)
{
if
(
e
->
reparttype
->
trigger
>
1
||
(
e
->
step
==
2
&&
e
->
reparttype
->
use_fixed_costs
))
{
if
(
e
->
reparttype
->
trigger
>
1
)
{
if
((
e
->
step
%
(
int
)
e
->
reparttype
->
trigger
)
==
0
)
e
->
forcerepart
=
1
;
}
else
{
e
->
forcerepart
=
1
;
}
e
->
reparttype
->
use_ticks
=
0
;
forced
=
1
;
}
}
if
(
e
->
reparttype
->
trigger
>
1
)
{
if
((
e
->
step
%
(
int
)
e
->
reparttype
->
trigger
)
==
0
)
e
->
forcerepart
=
1
;
/* We only check the CPU loads when we have processed a significant number
* of all particles as we require all tasks to have timings or are
* interested in the various balances logs. */
if
((
e
->
updates
>
1
&&
e
->
updates
>=
e
->
total_nr_parts
*
e
->
reparttype
->
minfrac
)
||
(
e
->
g_updates
>
1
&&
e
->
g_updates
>=
e
->
total_nr_gparts
*
e
->
reparttype
->
minfrac
))
{
/* Are we using the task tick timings or fixed costs? */
if
(
e
->
reparttype
->
use_fixed_costs
>
1
)
{
e
->
reparttype
->
use_ticks
=
0
;
}
else
{
e
->
force
repart
=
1
;
e
->
repart
type
->
use_ticks
=
1
;
}
e
->
reparttype
->
use_ticks
=
0
;
}
else
{
/* Get the resident size of the process for the memory logs. */
long
size
,
resident
,
shared
,
text
,
library
,
data
,
dirty
;
memuse_use
(
&
size
,
&
resident
,
&
shared
,
&
text
,
&
data
,
&
library
,
&
dirty
);
/* Gather it together with the CPU times used by the tasks in the last
* step. */
double
timemem
[
3
]
=
{
e
->
usertime_last_step
,
e
->
systime_last_step
,
(
double
)
resident
};
double
timemems
[
e
->
nr_nodes
*
3
];
MPI_Gather
(
&
timemem
,
3
,
MPI_DOUBLE
,
timemems
,
3
,
MPI_DOUBLE
,
0
,
MPI_COMM_WORLD
);
if
(
e
->
nodeID
==
0
)
{
/* Get the range and mean of the two CPU times and memory. */
double
umintime
=
timemems
[
0
];
double
umaxtime
=
timemems
[
0
];
double
smintime
=
timemems
[
1
];
double
smaxtime
=
timemems
[
1
];
double
minmem
=
timemems
[
2
];
double
maxmem
=
timemems
[
2
];
double
tmintime
=
umintime
+
smintime
;
double
tmaxtime
=
umaxtime
+
smaxtime
;
double
usum
=
timemems
[
0
];
double
ssum
=
timemems
[
1
];
double
tsum
=
usum
+
ssum
;
double
msum
=
timemems
[
2
];
for
(
int
k
=
3
;
k
<
e
->
nr_nodes
*
3
;
k
+=
3
)
{
if
(
timemems
[
k
]
>
umaxtime
)
umaxtime
=
timemems
[
k
];
if
(
timemems
[
k
]
<
umintime
)
umintime
=
timemems
[
k
];
if
(
timemems
[
k
+
1
]
>
smaxtime
)
smaxtime
=
timemems
[
k
+
1
];
if
(
timemems
[
k
+
1
]
<
smintime
)
smintime
=
timemems
[
k
+
1
];
double
total
=
timemems
[
k
]
+
timemems
[
k
+
1
];
if
(
total
>
tmaxtime
)
tmaxtime
=
total
;
if
(
total
<
tmintime
)
tmintime
=
total
;
usum
+=
timemems
[
k
];
ssum
+=
timemems
[
k
+
1
];
tsum
+=
total
;
if
(
timemems
[
k
+
2
]
>
maxmem
)
maxmem
=
timemems
[
k
+
2
];
if
(
timemems
[
k
+
2
]
<
minmem
)
minmem
=
timemems
[
k
+
2
];
msum
+=
timemems
[
k
+
2
];
}
double
umean
=
usum
/
(
double
)
e
->
nr_nodes
;
double
smean
=
ssum
/
(
double
)
e
->
nr_nodes
;
double
tmean
=
tsum
/
(
double
)
e
->
nr_nodes
;
double
mmean
=
msum
/
(
double
)
e
->
nr_nodes
;
/* Are we out of balance and need to repartition? */
/* ---------------------------------------------- */
double
abs_trigger
=
fabs
(
e
->
reparttype
->
trigger
);
double
balance
=
(
umaxtime
-
umintime
)
/
umean
;
if
(
e
->
reparttype
->
type
!=
REPART_NONE
)
{
/* When forced we don't care about the balance. */
if
(
!
forced
)
{
if
(
balance
>
abs_trigger
)
{
if
(
e
->
verbose
)
message
(
"trigger fraction %.3f > %.3f will repartition"
,
balance
,
abs_trigger
);
e
->
forcerepart
=
1
;
}
else
{
if
(
e
->
verbose
&&
e
->
reparttype
->
type
!=
REPART_NONE
)
message
(
"trigger fraction %.3f =< %.3f will not repartition"
,
balance
,
abs_trigger
);
}
}
/* It is only worth checking the CPU loads when we have processed a
* significant number of all particles as we require all tasks to have
* timings. */
if
((
e
->
updates
>
1
&&
e
->
updates
>=
e
->
total_nr_parts
*
e
->
reparttype
->
minfrac
)
||
(
e
->
g_updates
>
1
&&
e
->
g_updates
>=
e
->
total_nr_gparts
*
e
->
reparttype
->
minfrac
))
{
/* Should we are use the task timings or fixed costs. */
if
(
e
->
reparttype
->
use_fixed_costs
>
1
)
{
e
->
reparttype
->
use_ticks
=
0
;
}
else
{
e
->
reparttype
->
use_ticks
=
1
;
/* Not repartitioning, would that have been done otherwise? */
if
(
e
->
verbose
)
message
(
"trigger fraction %.3f > %.3f would have repartitioned"
,
balance
,
abs_trigger
);
}
/* Get CPU time used since the last call to this function. */
double
elapsed_cputime
=
clocks_get_cputime_used
()
-
e
->
cputime_last_step
;
/* Gather the elapsed CPU times from all ranks for the last step. */
double
elapsed_cputimes
[
e
->
nr_nodes
];
MPI_Gather
(
&
elapsed_cputime
,
1
,
MPI_DOUBLE
,
elapsed_cputimes
,
1
,
MPI_DOUBLE
,
0
,
MPI_COMM_WORLD
);
if
(
e
->
nodeID
==
0
)
{
/* Get the range and mean of cputimes. */
double
mintime
=
elapsed_cputimes
[
0
];
double
maxtime
=
elapsed_cputimes
[
0
];
double
sum
=
elapsed_cputimes
[
0
];
for
(
int
k
=
1
;
k
<
e
->
nr_nodes
;
k
++
)
{
if
(
elapsed_cputimes
[
k
]
>
maxtime
)
maxtime
=
elapsed_cputimes
[
k
];
if
(
elapsed_cputimes
[
k
]
<
mintime
)
mintime
=
elapsed_cputimes
[
k
];
sum
+=
elapsed_cputimes
[
k
];
}
double
mean
=
sum
/
(
double
)
e
->
nr_nodes
;
/* Are we out of balance? */
double
abs_trigger
=
fabs
(
e
->
reparttype
->
trigger
);
if
(((
maxtime
-
mintime
)
/
mean
)
>
abs_trigger
)
{
if
(
e
->
verbose
)
message
(
"trigger fraction %.3f > %.3f will repartition"
,
(
maxtime
-
mintime
)
/
mean
,
abs_trigger
);
e
->
forcerepart
=
1
;
}
else
{
if
(
e
->
verbose
)
message
(
"trigger fraction %.3f =< %.3f will not repartition"
,
(
maxtime
-
mintime
)
/
mean
,
abs_trigger
);
}
/* Keep logs of all CPU times and resident memory size for debugging
* load issues. */
FILE
*
timelog
=
NULL
;
FILE
*
memlog
=
NULL
;
if
(
!
opened
)
{
timelog
=
fopen
(
"rank_cpu_balance.log"
,
"w"
);
fprintf
(
timelog
,
"# step rank user sys sum
\n
"
);
memlog
=
fopen
(
"rank_memory_balance.log"
,
"w"
);
fprintf
(
memlog
,
"# step rank resident
\n
"
);
opened
=
1
;
}
else
{
timelog
=
fopen
(
"rank_cpu_balance.log"
,
"a"
);
memlog
=
fopen
(
"rank_memory_balance.log"
,
"a"
);
}
for
(
int
k
=
0
;
k
<
e
->
nr_nodes
*
3
;
k
+=
3
)
{
fprintf
(
timelog
,
"%d %d %f %f %f
\n
"
,
e
->
step
,
k
/
3
,
timemems
[
k
],
timemems
[
k
+
1
],
timemems
[
k
]
+
timemems
[
k
+
1
]);
fprintf
(
memlog
,
"%d %d %ld
\n
"
,
e
->
step
,
k
/
3
,
(
long
)
timemems
[
k
+
2
]);
}
fprintf
(
timelog
,
"# %d mean times: %f %f %f
\n
"
,
e
->
step
,
umean
,
smean
,
tmean
);
if
(
abs_trigger
>
1
.
f
)
abs_trigger
=
0
.
f
;
/* Not relevant. */
fprintf
(
timelog
,
"# %d balance: %f, expected: %f (sys: %f, total: %f)
\n
"
,
e
->
step
,
balance
,
abs_trigger
,
(
smaxtime
-
smintime
)
/
smean
,
(
tmaxtime
-
tmintime
)
/
tmean
);
fclose
(
timelog
);
fprintf
(
memlog
,
"# %d mean resident memory: %f, balance: %f
\n
"
,
e
->
step
,
mmean
,
(
maxmem
-
minmem
)
/
mmean
);
fclose
(
memlog
);
}
/* All nodes do this together. */
/* All nodes do this together
, so send to other ranks
. */
MPI_Bcast
(
&
e
->
forcerepart
,
1
,
MPI_INT
,
0
,
MPI_COMM_WORLD
);
}
...
...
@@ -336,10 +426,6 @@ void engine_repartition_trigger(struct engine *e) {
if
(
e
->
forcerepart
)
e
->
last_repartition
=
e
->
step
;
}
/* We always reset CPU time for next check, unless it will not be used. */
if
(
e
->
reparttype
->
type
!=
REPART_NONE
)
e
->
cputime_last_step
=
clocks_get_cputime_used
();
if
(
e
->
verbose
)
message
(
"took %.3f %s"
,
clocks_from_ticks
(
getticks
()
-
tic
),
clocks_getunit
());
...
...
@@ -2343,11 +2429,29 @@ void engine_step(struct engine *e) {
gravity_exact_force_compute
(
e
->
s
,
e
);
#endif
/* Get current CPU times.*/
#ifdef WITH_MPI
double
start_usertime
=
0
.
0
;
double
start_systime
=
0
.
0
;
if
(
e
->
reparttype
->
type
!=
REPART_NONE
)
{
clocks_get_cputimes_used
(
&
start_usertime
,
&
start_systime
);
}
#endif
/* Start all the tasks. */
TIMER_TIC
;
engine_launch
(
e
,
"tasks"
);
TIMER_TOC
(
timer_runners
);
/* Now record the CPU times used by the tasks. */
#ifdef WITH_MPI
double
end_usertime
=
0
.
0
;
double
end_systime
=
0
.
0
;
clocks_get_cputimes_used
(
&
end_usertime
,
&
end_systime
);
e
->
usertime_last_step
=
end_usertime
-
start_usertime
;
e
->
systime_last_step
=
end_systime
-
start_systime
;
#endif
/* Since the time-steps may have changed because of the limiter's
* action, we need to communicate the new time-step sizes */
if
((
e
->
policy
&
engine_policy_timestep_sync
)
||
...
...
@@ -3575,7 +3679,8 @@ void engine_init(struct engine *e, struct space *s, struct swift_params *params,
e
->
parameter_file
=
params
;
e
->
stf_this_timestep
=
0
;
#ifdef WITH_MPI
e
->
cputime_last_step
=
0
;
e
->
usertime_last_step
=
0
.
0
;
e
->
systime_last_step
=
0
.
0
;
e
->
last_repartition
=
0
;
#endif
e
->
total_nr_cells
=
0
;
...
...
src/engine.h
View file @
45442447
...
...
@@ -360,8 +360,9 @@ struct engine {
ticks
tic_step
,
toc_step
;
#ifdef WITH_MPI
/* CPU time of the last step. */
double
cputime_last_step
;
/* CPU times that the tasks used in the last step. */
double
usertime_last_step
;
double
systime_last_step
;
/* Step of last repartition. */
int
last_repartition
;
...
...
src/memuse.c
View file @
45442447
...
...
@@ -34,6 +34,11 @@
#include <sys/types.h>
#include <unistd.h>
#ifndef SWIFT_MEMUSE_STATM
#include <sys/resource.h>
#include <sys/time.h>
#endif
/* Local defines. */
#include "memuse.h"
...
...
@@ -408,7 +413,8 @@ void memuse_log_dump_error(int rank) {
/**
* @brief parse the process /proc/self/statm file to get the process
* memory use (in KB). Top field in ().
* memory use (in KB). Top field in (). If this file is not
* available only the resident field will be returned.
*
* @param size total virtual memory (VIRT/VmSize)
* @param resident resident non-swapped memory (RES/VmRSS)
...
...
@@ -423,6 +429,8 @@ void memuse_log_dump_error(int rank) {
void
memuse_use
(
long
*
size
,
long
*
resident
,
long
*
shared
,
long
*
text
,
long
*
data
,
long
*
library
,
long
*
dirty
)
{
#ifdef SWIFT_MEMUSE_STATM
/* Open the file. */
FILE
*
file
=
fopen
(
"/proc/self/statm"
,
"r"
);
if
(
file
!=
NULL
)
{
...
...
@@ -432,29 +440,38 @@ void memuse_use(long *size, long *resident, long *shared, long *text,
if
(
nscan
==
7
)
{
/* Convert pages into bytes. Usually 4096, but could be 512 on some
* systems so take care in conversion to KB. */
long
sz
=
sysconf
(
_SC_PAGESIZE
);
*
size
*=
sz
;
*
resident
*=
sz
;
*
shared
*=
sz
;
*
text
*=
sz
;
*
library
*=
sz
;
*
data
*=
sz
;
*
dirty
*=
sz
;
*
size
/=
1024
;
*
resident
/=
1024
;
*
shared
/=
1024
;
*
text
/=
1024
;
*
library
/=
1024
;
*
data
/=
1024
;
*
dirty
/=
1024
;
uint64_t
sz
=
sysconf
(
_SC_PAGESIZE
);
*
size
=
(
*
size
)
*
sz
/
1024
;
*
resident
=
(
*
resident
)
*
sz
/
1024
;
*
shared
=
(
*
shared
)
*
sz
/
1024
;
*
text
=
(
*
text
)
*
sz
/
1024
;
*
library
=
(
*
library
)
*
sz
/
1024
;
*
data
=
(
*
data
)
*
sz
/
1024
;
*
dirty
=
(
*
dirty
)
*
sz
/
1024
;
}
else
{
error
(
"Failed to read sufficient fields from /proc/self/statm"
);
}
fclose
(
file
);
}
else
{
error
(
"Failed to open /proc/self/statm"
);
}
#else
/* Not a Linux compatible OS, try to use very limited POSIX call instead.
* Linux only claims to support ru_maxrss, and POSIX only ru_utime and
* ru_stime, so this may still fail. */
struct
rusage
usage
;
getrusage
(
RUSAGE_SELF
,
&
usage
);
*
size
=
0
;
*
resident
=
usage
.
ru_maxrss
;
*
shared
=
usage
.
ru_ixrss
;
*
text
=
0
;
*
library
=
0
;
*
data
=
usage
.
ru_isrss
;
*
dirty
=
0
;
#endif
}
/**
...
...
@@ -469,13 +486,8 @@ void memuse_use(long *size, long *resident, long *shared, long *text,
*/
const
char
*
memuse_process
(
int
inmb
)
{
static
char
buffer
[
256
];
long
size
;
long
resident
;
long
shared
;
long
text
;
long
library
;
long
data
;
long
dirty
;
long
size
,
resident
,
shared
,
text
,
library
,
data
,
dirty
;
memuse_use
(
&
size
,
&
resident
,
&
shared
,
&
text
,
&
data
,
&
library
,
&
dirty
);
if
(
inmb
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment