Commit ba842dd6 authored by Matthieu Schaller's avatar Matthieu Schaller
Browse files

Merge branch 'memuse_rnodes' into 'master'

Speed up memory use reports

Closes #572

See merge request !838
parents 602ad119 726a43c8
......@@ -58,24 +58,13 @@ can be made by calling the ``memuse_log_allocation()`` function directly.
The output files are called ``memuse_report-step<n>.dat`` or
``memuse_report-rank<m>-step<n>.dat`` if running using MPI. These have a line
for each allocation or free that records the time, memory address, step,
whether an allocation or free, the label and when an allocation, the amount of
memory. The comments in this file also record the actual memory use of the
process (including threads) as reported by the operating system at the end of
the step.
To post process these files into a memory used timeline and get a report of
the peak memory use, as well as the memory still in use at the end of the step
there is an basic analysis script ``analyse_memuse_logs.py`` and two wrappers
that process a directory of logs, these are ``./process_memuse_logs.sh`` and
``./process_memuse_logs_MPI.sh`` for non-MPI and MPI runs respectively.
Note that the process scripts process each step individually and also process
all the logs as a single sequence. When interpreting these some care should be
taken as they are not all the memory allocated, just important allocations in
SWIFT and when looking at a single step the context of any previous steps is
not used, so you only see allocations made in that step and the effect of any
matching frees (so allocations made in previous steps that are freed in this
step will not be understood and will be ignored, you need the global analysis
to understand that).
for each allocation or free that records the time, step, whether an allocation
or free, the label, the amount of memory allocated or freed and the total of
all (labelled) memory in use at that time.
Comments at the end of this file also record the actual memory use of the
process (including threads), as reported by the operating system at the end of
the step, and the total memory still in use per label. Note this includes
memory still active from previous steps and the total memory is also continued
from the previous dump.
......@@ -346,7 +346,7 @@ __attribute__((always_inline)) INLINE int cache_read_particles_subset_self(
*/
__attribute__((always_inline)) INLINE void cache_read_particles_subset_pair(
const struct cell *restrict const ci, struct cache *restrict const ci_cache,
const struct entry *restrict sort_i, int *first_pi, int *last_pi,
const struct sort_entry *restrict sort_i, int *first_pi, int *last_pi,
const double *loc, const int flipped) {
#if defined(GADGET2_SPH)
......@@ -608,9 +608,10 @@ __attribute__((always_inline)) INLINE int cache_read_force_particles(
__attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
const struct cell *restrict const ci, const struct cell *restrict const cj,
struct cache *restrict const ci_cache,
struct cache *restrict const cj_cache, const struct entry *restrict sort_i,
const struct entry *restrict sort_j, const double *restrict const shift,
int *first_pi, int *last_pj) {
struct cache *restrict const cj_cache,
const struct sort_entry *restrict sort_i,
const struct sort_entry *restrict sort_j,
const double *restrict const shift, int *first_pi, int *last_pj) {
/* Make the number of particles to be read a multiple of the vector size.
* This eliminates serial remainder loops where possible when populating the
......@@ -860,8 +861,9 @@ __attribute__((always_inline)) INLINE void
cache_read_two_partial_cells_sorted_force(
const struct cell *const ci, const struct cell *const cj,
struct cache *const ci_cache, struct cache *const cj_cache,
const struct entry *restrict sort_i, const struct entry *restrict sort_j,
const double *const shift, int *first_pi, int *last_pj) {
const struct sort_entry *restrict sort_i,
const struct sort_entry *restrict sort_j, const double *const shift,
int *first_pi, int *last_pj) {
/* Make the number of particles to be read a multiple of the vector size.
* This eliminates serial remainder loops where possible when populating the
......
......@@ -332,7 +332,7 @@ struct cell {
struct xpart *xparts;
/*! Pointer for the sorted indices. */
struct entry *sort[13];
struct sort_entry *sort[13];
/*! Super cell, i.e. the highest-level parent cell that has a hydro
* pair/self tasks */
......@@ -600,7 +600,7 @@ struct cell {
float dx_max_sort_old;
/*! Pointer for the sorted indices. */
struct entry *sort[13];
struct sort_entry *sort[13];
/*! Bit mask of sort directions that will be needed in the next timestep. */
uint16_t requires_sorts;
......@@ -1261,8 +1261,8 @@ __attribute__((always_inline)) INLINE static void cell_malloc_hydro_sorts(
* on the same dimensions), so we need separate allocations per dimension. */
for (int j = 0; j < 13; j++) {
if ((flags & (1 << j)) && c->hydro.sort[j] == NULL) {
if ((c->hydro.sort[j] = (struct entry *)swift_malloc(
"hydro.sort", sizeof(struct entry) * (count + 1))) == NULL)
if ((c->hydro.sort[j] = (struct sort_entry *)swift_malloc(
"hydro.sort", sizeof(struct sort_entry) * (count + 1))) == NULL)
error("Failed to allocate sort memory.");
}
}
......@@ -1299,8 +1299,8 @@ __attribute__((always_inline)) INLINE static void cell_malloc_stars_sorts(
* on the same dimensions), so we need separate allocations per dimension. */
for (int j = 0; j < 13; j++) {
if ((flags & (1 << j)) && c->stars.sort[j] == NULL) {
if ((c->stars.sort[j] = (struct entry *)swift_malloc(
"stars.sort", sizeof(struct entry) * (count + 1))) == NULL)
if ((c->stars.sort[j] = (struct sort_entry *)swift_malloc(
"stars.sort", sizeof(struct sort_entry) * (count + 1))) == NULL)
error("Failed to allocate sort memory.");
}
}
......
......@@ -42,17 +42,22 @@
#ifdef SWIFT_MEMUSE_REPORTS
/* The initial size and increment of the log entries buffer. */
#define MEMUSE_INITLOG 1000000
/* A megabyte for conversions. */
#define MEGABYTE 1048576.0
/* Maximum length of label in log entry. */
#define MEMUSE_MAXLABLEN 32
/* Also recorded in logger. */
extern int engine_rank;
extern int engine_current_step;
/* Entry for logger of memory allocations and deallocations in a step. */
#define MEMUSE_MAXLAB 64
struct memuse_log_entry {
/* Rank in action. */
int rank;
/* Step of action. */
int step;
......@@ -62,23 +67,291 @@ struct memuse_log_entry {
/* Memory allocated in bytes. */
size_t size;
/* Address of memory. */
void *ptr;
/* Address of memory. Use union as easy way to convert into an array of
* bytes. */
union {
void *ptr;
uint8_t vptr[sizeof(uintptr_t)];
};
/* Relative time of this action. */
ticks dtic;
/* Label associated with the memory. */
char label[MEMUSE_MAXLAB + 1];
char label[MEMUSE_MAXLABLEN + 1];
/* Whether log is still active, i.e. not matched with a free or allocation. */
int active;
};
/* The log of allocations and frees. */
/* The log of allocations and frees. All volatile as accessed from threads
* that use the value to synchronise. */
static struct memuse_log_entry *volatile memuse_log = NULL;
static volatile size_t memuse_log_size = 0;
static volatile size_t memuse_log_count = 0;
static volatile size_t memuse_old_count = 0;
static volatile size_t memuse_log_done = 0;
#define MEMUSE_INITLOG 1000000
/* Current sum of memory in use. Only used in dumping. */
static size_t memuse_current = 0;
/* Label usage gathering struct. Only used in dumping. */
struct memuse_labelled_item {
size_t sum;
size_t count;
};
/* A radix node, this has a single byte key and a pointer to some related
* resource. It also holds a sorted list of children, if any. */
struct memuse_rnode {
/* Byte key of this node. */
uint8_t keypart;
/* Value of this node, if set. */
void *ptr;
/* Sorted pointers to children of this node. */
struct memuse_rnode **children;
unsigned int count;
};
/* Persistent radix trie root node. Holds active logs between dumps. */
static struct memuse_rnode *memuse_rnode_root;
static int memuse_rnode_root_init = 1;
#ifdef MEMUSE_RNODE_DUMP
/**
* @brief Dump a representation of the radix tree rooted at a node to stdout.
*
* @param depth the depth of the node in the tree, root is 0.
* @param node the node at which to start dumping.
* @param full if not zero then nodes that are not storing a value
* are also reported.
*/
static void memuse_rnode_dump(int depth, struct memuse_rnode *node, int full) {
/* Value of the full key, to this depth. Assumes full key is a pointer,
* so uncomment when using strings. */
static union {
//uint8_t key[MEMUSE_MAXLABLEN];
//char ptr[MEMUSE_MAXLABLEN];
uint8_t key[sizeof(uintptr_t)];
void *ptr;
} keyparts = {0};
/* Record keypart at this depth. Root has no keypart. */
if (depth != 0) keyparts.key[depth - 1] = node->keypart;
//if (node->ptr != NULL || full) {
// keyparts.key[depth] = '\0';
//
// /* Gather children's keys if full. */
// char fullkey[MEMUSE_MAXLABLEN];
// if (full) {
// for (size_t k = 0; k < node->count; k++) {
// fullkey[k] = node->children[k]->keypart;
// }
// fullkey[node->count] = '\0';
// printf("dump @ depth: %d keypart: %d key: %s value: %p fullkey: %s\n",
// depth, node->keypart, keyparts.ptr, node->ptr, fullkey);
// } else {
// printf("dump @ depth: %d keypart: %d key: %s value: %p\n", depth,
// node->keypart, keyparts.ptr, node->ptr);
// }
//}
if (node->ptr != NULL || full) {
printf("dump @ depth: %d keypart: %d key: %p value: %p\n", depth,
node->keypart, keyparts.ptr, node->ptr);
}
/* Recurse to all children. */
for (size_t k = 0; k < node->count; k++) {
memuse_rnode_dump(depth + 1, node->children[k], full);
}
}
#endif
/**
* @brief Return the position of a keypart for a list of children.
* If not found returns where it would be inserted.
*
* @param keypart the keypart to locate.
* @param children the list of sorted children.
* @param count the number of children
*
* @return the index of key or where it should be inserted.
*/
static unsigned int memuse_rnode_bsearch(uint8_t keypart,
struct memuse_rnode **children,
unsigned int count) {
/* Search for lower bound. */
unsigned int lower = 0;
unsigned int upper = count;
while (lower < upper) {
unsigned int middle = (upper + lower) / 2;
if (keypart > children[middle]->keypart)
lower = middle + 1;
else
upper = middle;
}
return lower;
}
/**
* @brief Insert a child, if needed, into a list of children. Assumes
* we have sufficient room.
*
* @param child the child to insert, if needed.
* @param children the list of sorted children.
* @param count the number of children
*/
static void memuse_rnode_binsert_child(struct memuse_rnode *child,
struct memuse_rnode **children,
unsigned int *count) {
unsigned int pos = 0;
if (*count > 0) {
/* Find the child or insertion point. */
pos = memuse_rnode_bsearch(child->keypart, children, *count);
/* If not found move all children to make a space, unless we're inserting
* after the end. */
if (pos < *count && children[pos]->keypart != child->keypart) {
memmove(&children[pos + 1], &children[pos],
(*count - pos) * sizeof(struct memuse_rnode *));
}
}
/* Insert new child */
children[pos] = child;
*count += 1;
}
/**
* @brief Add a child rnode to an rnode. Making sure we have room and keeping
* the sort order.
*
* @param node the parent node.
* @param child the node to add to the parent,
*/
static void memuse_rnode_add_child(struct memuse_rnode *node,
struct memuse_rnode *child) {
/* Extend the children list to include a new entry .*/
void *mem = realloc(node->children,
(node->count + 1) * sizeof(struct memuse_rnode *));
if (mem == NULL) error("Failed to reallocate rnodes\n");
node->children = mem;
/* Insert the new child. */
memuse_rnode_binsert_child(child, node->children, &node->count);
}
/**
* @brief Find a child of a node with the given key part.
*
* @param node the node to search.
* @param keypart the key part of the child.
* @return NULL if not found.
*/
static struct memuse_rnode *memuse_rnode_lookup(const struct memuse_rnode *node,
uint8_t keypart) {
/* Locate the key, or where it would be inserted. */
if (node->count > 0) {
unsigned int index = memuse_rnode_bsearch(keypart, node->children,
node->count);
if (index < node->count && keypart == node->children[index]->keypart) {
return node->children[index];
}
}
return NULL;
}
/**
* @brief insert a child into a node's children list and add a pointer, iff
* this is the destination node for the given key.
*
* @param node the parent node.
* @param depth the depth of the parent node.
* @param key the full key of the eventual leaf node.
* @param keylen the numbers of bytes in the full key.
* @param value pointer that will be stored as the value of the leaf node.
*/
static void memuse_rnode_insert_child(struct memuse_rnode *node, uint8_t depth,
uint8_t *key, uint8_t keylen,
void *value) {
/* Check if keypart this already exists at this level and add new child if
* not. */
uint8_t keypart = key[depth];
struct memuse_rnode *child = memuse_rnode_lookup(node, keypart);
if (child == NULL) {
child = calloc(1, sizeof(struct memuse_rnode));
child->keypart = keypart;
memuse_rnode_add_child(node, child);
}
/* Are we at the lowest level yet? */
depth++;
if (depth == keylen) {
/* Our destination node. */
#if SWIFT_DEBUG_CHECKS
if (child->ptr != NULL)
message("Overwriting rnode value: %p with %p", child->ptr, value);
#endif
child->ptr = value;
return;
}
/* Down we go to the next level. */
memuse_rnode_insert_child(child, depth, key, keylen, value);
return;
}
/**
* @brief Find a child node for the given full key.
*
* @param node the current parent node.
* @param depth the depth of the parent node, 0 for first call.
* @param key the full key of the expected child node.
* @param keylen the number of bytes in the key.
*/
static struct memuse_rnode *memuse_rnode_find_child(struct memuse_rnode *node,
uint8_t depth, uint8_t *key,
uint8_t keylen) {
uint8_t keypart = key[depth];
struct memuse_rnode *child = NULL;
if (node->count > 0) child = memuse_rnode_lookup(node, keypart);
if (child != NULL && (depth + 1) < keylen) {
return memuse_rnode_find_child(child, depth + 1, key, keylen);
}
return child;
}
/**
* @brief Free all resources associated with a node.
*
* @param node the rnode.
*/
static void memuse_rnode_cleanup(struct memuse_rnode *node) {
if (!node) return;
for (size_t k = 0; k < node->count; k++) {
memuse_rnode_cleanup(node->children[k]);
free(node->children[k]);
}
if (node->count > 0) free(node->children);
}
/**
* @brief reallocate the entries log if space is needed.
*/
static void memuse_log_reallocate(size_t ind) {
if (ind == 0) {
......@@ -94,7 +367,8 @@ static void memuse_log_reallocate(size_t ind) {
} else {
struct memuse_log_entry *new_log;
if ((new_log = (struct memuse_log_entry *)malloc(
sizeof(struct memuse_log_entry) * memuse_log_size * 2)) == NULL)
sizeof(struct memuse_log_entry) *
(memuse_log_size + MEMUSE_INITLOG))) == NULL)
error("Failed to re-allocate memuse log.");
/* Wait for all writes to the old buffer to complete. */
......@@ -107,8 +381,8 @@ static void memuse_log_reallocate(size_t ind) {
free(memuse_log);
memuse_log = new_log;
/* Last action. */
memuse_log_size *= 2;
/* Last action, releases waiting threads. */
atomic_add(&memuse_log_size, MEMUSE_INITLOG);
}
}
......@@ -123,6 +397,7 @@ static void memuse_log_reallocate(size_t ind) {
*/
void memuse_log_allocation(const char *label, void *ptr, int allocated,
size_t size) {
size_t ind = atomic_inc(&memuse_log_count);
/* If we are at the current size we need more space. */
......@@ -133,14 +408,14 @@ void memuse_log_allocation(const char *label, void *ptr, int allocated,
;
/* Record the log. */
memuse_log[ind].rank = engine_rank;
memuse_log[ind].step = engine_current_step;
memuse_log[ind].allocated = allocated;
memuse_log[ind].size = size;
memuse_log[ind].ptr = ptr;
strncpy(memuse_log[ind].label, label, MEMUSE_MAXLAB);
memuse_log[ind].label[MEMUSE_MAXLAB] = '\0';
strncpy(memuse_log[ind].label, label, MEMUSE_MAXLABLEN);
memuse_log[ind].label[MEMUSE_MAXLABLEN] = '\0';
memuse_log[ind].dtic = getticks() - clocks_start_ticks;
memuse_log[ind].active = 1;
atomic_inc(&memuse_log_done);
}
......@@ -152,30 +427,210 @@ void memuse_log_allocation(const char *label, void *ptr, int allocated,
void memuse_log_dump(const char *filename) {
/* Skip if nothing allocated this step. */
if (memuse_log_count == 0) return;
if (memuse_log_count == memuse_old_count) return;
//ticks tic = getticks();
/* Create the radix tree. If not already done. */
if (memuse_rnode_root_init) {
memuse_rnode_root =
(struct memuse_rnode *)calloc(1, sizeof(struct memuse_rnode));
memuse_rnode_root_init = 0;
}
/* Stop any new logs from being processed while we are dumping.
* Remember to not abort with error() in this section, that is recursive
* with the exit handler. */
size_t log_count = memuse_log_count;
size_t old_count = memuse_old_count;
/* Open the output file. */
FILE *fd;
if ((fd = fopen(filename, "w")) == NULL)
error("Failed to create memuse log file '%s'.", filename);
if ((fd = fopen(filename, "w")) == NULL) {
message("Failed to create memuse log file '%s', logs not dumped.",
filename);
return;
}
/* Write a header. */
fprintf(fd, "# Current use: %s\n", memuse_process(1));
fprintf(fd, "# cpufreq: %lld\n", clocks_get_cpufreq());
fprintf(fd, "# dtic adr rank step allocated label size\n");
fprintf(fd, "# dtic step label size sum\n");
size_t memuse_maxmem = memuse_current;
for (size_t k = old_count; k < log_count; k++) {
/* Check if this address has already been recorded. */
struct memuse_rnode *child = memuse_rnode_find_child(
memuse_rnode_root, 0, memuse_log[k].vptr, sizeof(uintptr_t));
if (child != NULL && child->ptr != NULL) {
/* Found the allocation, this should be the free. */
if (memuse_log[k].allocated) {
/* Allocated twice, this is an error, but we cannot abort as that will
* attempt another memory dump, so just complain. */
#if SWIFT_DEBUG_CHECKS
message("Allocated the same address twice (%s: %zd)",
memuse_log[k].label, memuse_log[k].size);
#endif
continue;
}
/* Free, update the size to remove the allocation. */
struct memuse_log_entry *oldlog = (struct memuse_log_entry *)child->ptr;
memuse_log[k].size = -oldlog->size;
/* And deactivate this key. */
child->ptr = NULL;
/* And mark this as matched. */
memuse_log[k].active = 0;
oldlog->active = 0;
} else if (child == NULL && memuse_log[k].allocated) {
/* Not found, so new allocation which we store the log against the
* address. */
memuse_rnode_insert_child(memuse_rnode_root, 0, memuse_log[k].vptr,
sizeof(uintptr_t), &memuse_log[k]);
for (size_t k = 0; k < memuse_log_count; k++) {
fprintf(fd, "%lld %p %d %d %d %s %zd\n", memuse_log[k].dtic,
memuse_log[k].ptr, memuse_log[k].rank, memuse_log[k].step,
memuse_log[k].allocated, memuse_log[k].label, memuse_log[k].size);
} else if (child == NULL && !memuse_log[k].allocated) {
/* Unmatched free, OK if NULL. */
#if SWIFT_DEBUG_CHECKS
if (memuse_log[k].ptr != NULL) {
message("Unmatched non-NULL free: %s", memuse_log[k].label);
}
#endif
continue;
} else if (memuse_log[k].allocated) {
/* Must be previously released allocation with same address, so we
* store. */
memuse_rnode_insert_child(memuse_rnode_root, 0, memuse_log[k].vptr,
sizeof(uintptr_t), &memuse_log[k]);
} else {
/* Should not happen ... */
message("weird memory log record for label '%s' skipped",
memuse_log[k].label);
continue;
}
/* Keep maximum and rolling sum. */
memuse_current += memuse_log[k].size;
if (memuse_current > memuse_maxmem) memuse_maxmem = memuse_current;
/* And output. */
fprintf(fd, "%lld %d %s %zd %zd\n", memuse_log[k].dtic,
memuse_log[k].step, memuse_log[k].label,
memuse_log[k].size, memuse_current);
}
/* Clear the log. */
memuse_log_count = 0;
#ifdef MEMUSE_RNODE_DUMP
/* Debug dump of tree. */
//memuse_rnode_dump(0, memuse_rnode_root, 0);
#endif
/* Now we find all the still active nodes and gather their sizes against the
* labels. */
struct memuse_rnode *activernodes =
(struct memuse_rnode *)calloc(1, sizeof(struct memuse_rnode));
size_t newcount = 0;
struct memuse_rnode *labellednodes =
(struct memuse_rnode *)calloc(1, sizeof(struct memuse_rnode));
size_t *lindices = (size_t *)calloc(log_count, sizeof(size_t));
size_t lcount = 0;
for (size_t k = 0; k < log_count; k++) {
/* Only allocations are stored also is it active? */
if (memuse_log[k].allocated && memuse_log[k].active) {
/* Look for this label in our tree. */
struct memuse_rnode *labelchild = memuse_rnode_find_child(
labellednodes, 0, (uint8_t *)memuse_log[k].label,
strlen(memuse_log[k].label));
struct memuse_labelled_item *item = NULL;
if (labelchild == NULL || labelchild->ptr == NULL) {
/* New, so create an instance to keep the count. */
item = (struct memuse_labelled_item *)calloc(
1, sizeof(struct memuse_labelled_item));
item->sum = 0;
item->count = 0;
memuse_rnode_insert_child(labellednodes, 0,
(uint8_t *)memuse_log[k].label,
strlen(memuse_log[k].label), item);
/* Keep for indexing next time. */