diff --git a/configure.ac b/configure.ac
index 8d189c1210abf48304ca39b0fc6450323091eb7e..2fa871e22f08065527904eaa5838ac3afe725220 100644
--- a/configure.ac
+++ b/configure.ac
@@ -381,6 +381,7 @@ fi
 # Check whether we have any of the ARM v8.1 tick timers
 AX_ASM_ARM_PMCCNTR
 AX_ASM_ARM_CNTVCT
+
 # See if we want memuse reporting.
 AC_ARG_ENABLE([memuse-reports],
   [AS_HELP_STRING([--enable-memuse-reports],
@@ -393,6 +394,18 @@ if test "$enable_memuse_reports" = "yes"; then
   AC_DEFINE([SWIFT_MEMUSE_REPORTS],1,[Enable memory usage reports])
 fi
 
+# See if we want mpi reporting.
+AC_ARG_ENABLE([mpiuse-reports],
+  [AS_HELP_STRING([--enable-mpiuse-reports],
+    [Output reports about MPI task requests@<:@yes/no@:>@]
+  )],
+  [enable_mpiuse_reports="$enableval"],
+  [enable_mpiuse_reports="no"]
+)
+if test "$enable_mpiuse_reports" = "yes"; then
+  AC_DEFINE([SWIFT_MPIUSE_REPORTS],1,[Enable MPI task reports])
+fi
+
 # Define HAVE_POSIX_MEMALIGN if it works.
 AX_FUNC_POSIX_MEMALIGN
 
diff --git a/doc/RTD/source/AnalysisTools/index.rst b/doc/RTD/source/AnalysisTools/index.rst
index 8b4467f5f36a5e07f0b5446f4f590b2643990731..52783719b593ce08ea20a8a2c32ec140750da2ba 100644
--- a/doc/RTD/source/AnalysisTools/index.rst
+++ b/doc/RTD/source/AnalysisTools/index.rst
@@ -31,7 +31,7 @@ To solve this problem, you will need to either access them through an existing s
 or install ``npm`` and then run the following commands
 
 .. code-block:: bash
-   
+
    npm install http-server -g
    http-server .
 
@@ -78,3 +78,38 @@ the step, and the total memory still in use per label. Note this includes
 memory still active from previous steps and the total memory is also continued
 from the previous dump.
 
+MPI task communication reports
+------------------------------
+
+When SWIFT is configured using the ``--enable-mpiuse-reports`` flag it will
+log all the asynchronous MPI communications made to send particle updates
+between nodes in support of the tasks.
+
+The output files are called ``mpiuse_report-rank<m>-step<n>.dat``, i.e. one
+per rank per step. These have a line for each communication request, either
+an MPI_Irecv or an MPI_Isend, and a line for the subsequent completion
+(successful MPI_Test).
+
+Each line of the logs contains the following information:
+
+.. code-block:: none
+
+   stic: ticks since the start of this step
+   etic: ticks since the start of the simulation
+   dtic: ticks that the request was active
+   step: current step
+   rank: current rank
+   otherrank: rank that the request was sent to or expected from
+   type itype: task type as string and enum
+   subtype isubtype: task subtype as string and enum
+   activation: 1 if record for the start of a request, 0 if request completion
+   tag: MPI tag of the request
+   size: size, in bytes, of the request
+   sum: sum, in bytes, of all requests that are currently not logged as complete
+
+The stic values should be synchronized between ranks, as all ranks have a
+barrier in place to make sure they start the step together, so they are
+suitable for matching records between ranks. The unique keys to associate
+records between ranks (so that the MPI_Isend and MPI_Irecv pairs can be
+identified) are "otherrank/rank/subtype/tag/size" for sends and
+"rank/otherrank/subtype/tag/size" for recvs. When matching, ignore step 0.
diff --git a/examples/main.c b/examples/main.c
index 9f9c0a471370a208251fe1c3628d3d980b476af4..cb453a74605f7ac05f9aaee1c1c63dc14f8bf522 100644
--- a/examples/main.c
+++ b/examples/main.c
@@ -1231,6 +1231,15 @@ int main(int argc, char *argv[]) {
   }
 #endif
 
+  /* Dump MPI requests if collected. */
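+  /* This dump, made before the main loop starts, holds the requests from
+   * initialisation, i.e. step 0, which the analysis documentation suggests
+   * ignoring when matching records between ranks. */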
+#if defined(SWIFT_MPIUSE_REPORTS) && defined(WITH_MPI)
+  {
+    char dumpfile[40];
+    snprintf(dumpfile, 40, "mpiuse_report-rank%d-step%d.dat", engine_rank, 0);
+    mpiuse_log_dump(dumpfile, clocks_start_ticks);
+  }
+#endif
+
   /* Main simulation loop */
   /* ==================== */
   int force_stop = 0, resubmit = 0;
@@ -1299,6 +1308,16 @@ int main(int argc, char *argv[]) {
     }
 #endif
 
+    /* Dump MPI requests if collected. */
+#if defined(SWIFT_MPIUSE_REPORTS) && defined(WITH_MPI)
+    {
+      char dumpfile[40];
+      snprintf(dumpfile, 40, "mpiuse_report-rank%d-step%d.dat", engine_rank,
+               j + 1);
+      mpiuse_log_dump(dumpfile, e.tic_step);
+    }
+#endif  // SWIFT_MPIUSE_REPORTS && WITH_MPI
+
 #ifdef SWIFT_DEBUG_THREADPOOL
     /* Dump the task data using the given frequency. */
     if (dump_threadpool && (dump_threadpool == 1 || j % dump_threadpool == 1)) {
diff --git a/src/Makefile.am b/src/Makefile.am
index 665aa4b24c94162fb8f772edd346f3c95a1d7ddb..e4d01722534faf817e4e68252699fc141857b9d8 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -53,8 +53,9 @@ include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \
                  star_formation_struct.h star_formation.h \
                  star_formation_logger.h star_formation_logger_struct.h \
                  pressure_floor.h pressure_floor_struct.h pressure_floor_iact.h \
-                 velociraptor_struct.h velociraptor_io.h random.h memuse.h black_holes.h black_holes_io.h \
-                 black_holes_properties.h black_holes_struct.h feedback.h feedback_struct.h feedback_properties.h
+                 velociraptor_struct.h velociraptor_io.h random.h memuse.h mpiuse.h memuse_rnodes.h \
+                 black_holes.h black_holes_io.h black_holes_properties.h black_holes_struct.h \
+                 feedback.h feedback_struct.h feedback_properties.h
 
 # source files for EAGLE cooling
 EAGLE_COOLING_SOURCES =
@@ -84,7 +85,7 @@ AM_SOURCES = space.c runner_main.c runner_doiact_hydro.c runner_doiact_grav.c \
     part_type.c xmf.c gravity_properties.c gravity.c \
     collectgroup.c hydro_space.c equation_of_state.c \
     chemistry.c cosmology.c restart.c mesh_gravity.c velociraptor_interface.c \
-    outputlist.c velociraptor_dummy.c logger_io.c memuse.c fof.c \
+    outputlist.c velociraptor_dummy.c logger_io.c memuse.c mpiuse.c memuse_rnodes.c fof.c \
     hashmap.c pressure_floor.c \
     $(EAGLE_COOLING_SOURCES) $(EAGLE_FEEDBACK_SOURCES)
 
diff --git a/src/engine.c b/src/engine.c
index 14712b3baf31f6d46989bbb82e576276de7c8c33..1aed4b2a5e0ce44e92a8ee4de86428c2afd0c5c1 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -2156,6 +2156,11 @@ void engine_step(struct engine *e) {
 
   struct clocks_time time1, time2;
   clocks_gettime(&time1);
 
+#if defined(SWIFT_MPIUSE_REPORTS) && defined(WITH_MPI)
+  /* We may want to compare times across ranks, so make sure all ranks start
+   * the step at the same moment, even though their tick values differ. */
+  MPI_Barrier(MPI_COMM_WORLD);
+#endif
   e->tic_step = getticks();
 
   if (e->nodeID == 0) {
diff --git a/src/memuse.c b/src/memuse.c
index 77344cc6cec56544a6e19cc6a71ca2375aa1916c..00a2a5f879a994d96e0747eba55c56a3161a6b86 100644
--- a/src/memuse.c
+++ b/src/memuse.c
@@ -27,8 +27,10 @@
 #include "../config.h"
 
 /* Standard includes. */
+#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 #include <sys/types.h>
 #include <unistd.h>
 
@@ -39,6 +41,8 @@
 #include "atomic.h"
 #include "clocks.h"
 #include "engine.h"
+#include "error.h"
+#include "memuse_rnodes.h"
 
 #ifdef SWIFT_MEMUSE_REPORTS
 
@@ -101,254 +105,10 @@ struct memuse_labelled_item {
   size_t count;
 };
 
-/* A radix node, this has a single byte key and a pointer to some related
- * resource. It also holds a sorted list of children, if any. 
*/ -struct memuse_rnode { - - /* Byte key of this node. */ - uint8_t keypart; - - /* Value of this node, if set. */ - void *ptr; - - /* Sorted pointers to children of this node. */ - struct memuse_rnode **children; - unsigned int count; -}; - /* Persistent radix trie root node. Holds active logs between dumps. */ static struct memuse_rnode *memuse_rnode_root; static int memuse_rnode_root_init = 1; -#ifdef MEMUSE_RNODE_DUMP -/** - * @brief Dump a representation of the radix tree rooted at a node to stdout. - * - * @param depth the depth of the node in the tree, root is 0. - * @param node the node at which to start dumping. - * @param full if not zero then nodes that are not storing a value - * are also reported. - */ -static void memuse_rnode_dump(int depth, struct memuse_rnode *node, int full) { - - /* Value of the full key, to this depth. Assumes full key is a pointer, - * so uncomment when using strings. */ - static union { - // uint8_t key[MEMUSE_MAXLABLEN]; - // char ptr[MEMUSE_MAXLABLEN]; - uint8_t key[sizeof(uintptr_t)]; - void *ptr; - } keyparts = {0}; - - /* Record keypart at this depth. Root has no keypart. */ - if (depth != 0) keyparts.key[depth - 1] = node->keypart; - - // if (node->ptr != NULL || full) { - // keyparts.key[depth] = '\0'; - // - // /* Gather children's keys if full. */ - // char fullkey[MEMUSE_MAXLABLEN]; - // if (full) { - // for (size_t k = 0; k < node->count; k++) { - // fullkey[k] = node->children[k]->keypart; - // } - // fullkey[node->count] = '\0'; - // printf("dump @ depth: %d keypart: %d key: %s value: %p fullkey: %s\n", - // depth, node->keypart, keyparts.ptr, node->ptr, fullkey); - // } else { - // printf("dump @ depth: %d keypart: %d key: %s value: %p\n", depth, - // node->keypart, keyparts.ptr, node->ptr); - // } - //} - - if (node->ptr != NULL || full) { - printf("dump @ depth: %d keypart: %d key: %p value: %p\n", depth, - node->keypart, keyparts.ptr, node->ptr); - } - - /* Recurse to all children. */ - for (size_t k = 0; k < node->count; k++) { - memuse_rnode_dump(depth + 1, node->children[k], full); - } -} -#endif - -/** - * @brief Return the position of a keypart for a list of children. - * If not found returns where it would be inserted. - * - * @param keypart the keypart to locate. - * @param children the list of sorted children. - * @param count the number of children - * - * @return the index of key or where it should be inserted. - */ -static unsigned int memuse_rnode_bsearch(uint8_t keypart, - struct memuse_rnode **children, - unsigned int count) { - - /* Search for lower bound. */ - unsigned int lower = 0; - unsigned int upper = count; - while (lower < upper) { - unsigned int middle = (upper + lower) / 2; - if (keypart > children[middle]->keypart) - lower = middle + 1; - else - upper = middle; - } - return lower; -} - -/** - * @brief Insert a child, if needed, into a list of children. Assumes - * we have sufficient room. - * - * @param child the child to insert, if needed. - * @param children the list of sorted children. - * @param count the number of children - */ -static void memuse_rnode_binsert_child(struct memuse_rnode *child, - struct memuse_rnode **children, - unsigned int *count) { - unsigned int pos = 0; - if (*count > 0) { - - /* Find the child or insertion point. */ - pos = memuse_rnode_bsearch(child->keypart, children, *count); - - /* If not found move all children to make a space, unless we're inserting - * after the end. 
*/ - if (pos < *count && children[pos]->keypart != child->keypart) { - memmove(&children[pos + 1], &children[pos], - (*count - pos) * sizeof(struct memuse_rnode *)); - } - } - - /* Insert new child */ - children[pos] = child; - *count += 1; -} - -/** - * @brief Add a child rnode to an rnode. Making sure we have room and keeping - * the sort order. - * - * @param node the parent node. - * @param child the node to add to the parent, - */ -static void memuse_rnode_add_child(struct memuse_rnode *node, - struct memuse_rnode *child) { - - /* Extend the children list to include a new entry .*/ - void *mem = realloc(node->children, - (node->count + 1) * sizeof(struct memuse_rnode *)); - if (mem == NULL) error("Failed to reallocate rnodes\n"); - node->children = mem; - - /* Insert the new child. */ - memuse_rnode_binsert_child(child, node->children, &node->count); -} - -/** - * @brief Find a child of a node with the given key part. - * - * @param node the node to search. - * @param keypart the key part of the child. - * @return NULL if not found. - */ -static struct memuse_rnode *memuse_rnode_lookup(const struct memuse_rnode *node, - uint8_t keypart) { - - /* Locate the key, or where it would be inserted. */ - if (node->count > 0) { - unsigned int index = - memuse_rnode_bsearch(keypart, node->children, node->count); - if (index < node->count && keypart == node->children[index]->keypart) { - return node->children[index]; - } - } - return NULL; -} - -/** - * @brief insert a child into a node's children list and add a pointer, iff - * this is the destination node for the given key. - * - * @param node the parent node. - * @param depth the depth of the parent node. - * @param key the full key of the eventual leaf node. - * @param keylen the numbers of bytes in the full key. - * @param value pointer that will be stored as the value of the leaf node. - */ -static void memuse_rnode_insert_child(struct memuse_rnode *node, uint8_t depth, - uint8_t *key, uint8_t keylen, - void *value) { - - /* Check if keypart this already exists at this level and add new child if - * not. */ - uint8_t keypart = key[depth]; - struct memuse_rnode *child = memuse_rnode_lookup(node, keypart); - if (child == NULL) { - child = calloc(1, sizeof(struct memuse_rnode)); - child->keypart = keypart; - memuse_rnode_add_child(node, child); - } - - /* Are we at the lowest level yet? */ - depth++; - if (depth == keylen) { - /* Our destination node. */ - -#if SWIFT_DEBUG_CHECKS - if (child->ptr != NULL) - message("Overwriting rnode value: %p with %p", child->ptr, value); -#endif - child->ptr = value; - return; - } - - /* Down we go to the next level. */ - memuse_rnode_insert_child(child, depth, key, keylen, value); - return; -} - -/** - * @brief Find a child node for the given full key. - * - * @param node the current parent node. - * @param depth the depth of the parent node, 0 for first call. - * @param key the full key of the expected child node. - * @param keylen the number of bytes in the key. - */ -static struct memuse_rnode *memuse_rnode_find_child(struct memuse_rnode *node, - uint8_t depth, uint8_t *key, - uint8_t keylen) { - uint8_t keypart = key[depth]; - struct memuse_rnode *child = NULL; - if (node->count > 0) child = memuse_rnode_lookup(node, keypart); - if (child != NULL && (depth + 1) < keylen) { - return memuse_rnode_find_child(child, depth + 1, key, keylen); - } - return child; -} - -/** - * @brief Free all resources associated with a node. - * - * @param node the rnode. 
- */ -static void memuse_rnode_cleanup(struct memuse_rnode *node) { - - if (!node) return; - - for (size_t k = 0; k < node->count; k++) { - memuse_rnode_cleanup(node->children[k]); - free(node->children[k]); - } - if (node->count > 0) free(node->children); -} - /** * @brief reallocate the entries log if space is needed. */ diff --git a/src/memuse_rnodes.c b/src/memuse_rnodes.c new file mode 100644 index 0000000000000000000000000000000000000000..39fe5011220bc232a803ba5f6da62725d3e627b8 --- /dev/null +++ b/src/memuse_rnodes.c @@ -0,0 +1,270 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (c) 2019 Peter W. Draper (p.w.draper@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ + +/** + * @file memuse_rnodes.c + * @brief file of routines used for radix nodes in memory loggers. + */ + +/* Config parameters. */ +#include "../config.h" + +/* Standard includes. */ +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> +#include <unistd.h> + +/* Local defines. */ +#include "memuse_rnodes.h" + +/* Local includes. */ +#include "atomic.h" +#include "clocks.h" +#include "error.h" + +/** + * @brief Return the position of a keypart for a list of children. + * If not found returns where it would be inserted. + * + * @param keypart the keypart to locate. + * @param children the list of sorted children. + * @param count the number of children + * + * @return the index of key or where it should be inserted. + */ +static unsigned int memuse_rnode_bsearch(uint8_t keypart, + struct memuse_rnode **children, + unsigned int count) { + + /* Search for lower bound. */ + unsigned int lower = 0; + unsigned int upper = count; + while (lower < upper) { + unsigned int middle = (upper + lower) / 2; + if (keypart > children[middle]->keypart) + lower = middle + 1; + else + upper = middle; + } + return lower; +} + +/** + * @brief Insert a child, if needed, into a list of children. Assumes + * we have sufficient room. + * + * @param child the child to insert, if needed. + * @param children the list of sorted children. + * @param count the number of children + */ +static void memuse_rnode_binsert_child(struct memuse_rnode *child, + struct memuse_rnode **children, + unsigned int *count) { + unsigned int pos = 0; + if (*count > 0) { + + /* Find the child or insertion point. */ + pos = memuse_rnode_bsearch(child->keypart, children, *count); + + /* If not found move all children to make a space, unless we're inserting + * after the end. 
+
+/**
+ * @brief Insert a child, if needed, into a list of children. Assumes
+ * we have sufficient room.
+ *
+ * @param child the child to insert, if needed.
+ * @param children the list of sorted children.
+ * @param count the number of children.
+ */
+static void memuse_rnode_binsert_child(struct memuse_rnode *child,
+                                       struct memuse_rnode **children,
+                                       unsigned int *count) {
+  unsigned int pos = 0;
+  if (*count > 0) {
+
+    /* Find the child or insertion point. */
+    pos = memuse_rnode_bsearch(child->keypart, children, *count);
+
+    /* If not found, move all the children along to make a space, unless we
+     * are inserting after the end. */
+    if (pos < *count && children[pos]->keypart != child->keypart) {
+      memmove(&children[pos + 1], &children[pos],
+              (*count - pos) * sizeof(struct memuse_rnode *));
+    }
+  }
+
+  /* Insert the new child. */
+  children[pos] = child;
+  *count += 1;
+}
+
+/**
+ * @brief Add a child rnode to an rnode. Making sure we have room and keeping
+ * the sort order.
+ *
+ * @param node the parent node.
+ * @param child the node to add to the parent.
+ */
+static void memuse_rnode_add_child(struct memuse_rnode *node,
+                                   struct memuse_rnode *child) {
+
+  /* Extend the children list to include a new entry. */
+  void *mem = realloc(node->children,
+                      (node->count + 1) * sizeof(struct memuse_rnode *));
+  if (mem == NULL) error("Failed to reallocate rnodes\n");
+  node->children = mem;
+
+  /* Insert the new child. */
+  memuse_rnode_binsert_child(child, node->children, &node->count);
+}
+
+/**
+ * @brief Find a child of a node with the given key part.
+ *
+ * @param node the node to search.
+ * @param keypart the key part of the child.
+ * @return NULL if not found.
+ */
+static struct memuse_rnode *memuse_rnode_lookup(const struct memuse_rnode *node,
+                                                uint8_t keypart) {
+
+  /* Locate the key, or where it would be inserted. */
+  if (node->count > 0) {
+    unsigned int index =
+        memuse_rnode_bsearch(keypart, node->children, node->count);
+    if (index < node->count && keypart == node->children[index]->keypart) {
+      return node->children[index];
+    }
+  }
+  return NULL;
+}
+
+/**
+ * @brief Insert a child into a node's children list and add a pointer, iff
+ * this is the destination node for the given key.
+ *
+ * @param node the parent node.
+ * @param depth the depth of the parent node.
+ * @param key the full key of the eventual leaf node.
+ * @param keylen the number of bytes in the full key.
+ * @param value pointer that will be stored as the value of the leaf node.
+ */
+void memuse_rnode_insert_child(struct memuse_rnode *node, uint8_t depth,
+                               uint8_t *key, uint8_t keylen, void *value) {
+
+  /* Check if this keypart already exists at this level and add a new child
+   * if not. */
+  uint8_t keypart = key[depth];
+  struct memuse_rnode *child = memuse_rnode_lookup(node, keypart);
+  if (child == NULL) {
+    child = calloc(1, sizeof(struct memuse_rnode));
+    child->keypart = keypart;
+    memuse_rnode_add_child(node, child);
+  }
+
+  /* Are we at the lowest level yet? */
+  depth++;
+  if (depth == keylen) {
+    /* Our destination node. */
+
+#if SWIFT_DEBUG_CHECKS
+    if (child->ptr != NULL)
+      message("Overwriting rnode value: %p with %p", child->ptr, value);
+#endif
+    child->ptr = value;
+    return;
+  }
+
+  /* Down we go to the next level. */
+  memuse_rnode_insert_child(child, depth, key, keylen, value);
+  return;
+}
+
+/**
+ * @brief Find a child node for the given full key.
+ *
+ * @param node the current parent node.
+ * @param depth the depth of the parent node, 0 for first call.
+ * @param key the full key of the expected child node.
+ * @param keylen the number of bytes in the key.
+ */
+struct memuse_rnode *memuse_rnode_find_child(struct memuse_rnode *node,
+                                             uint8_t depth, uint8_t *key,
+                                             uint8_t keylen) {
+  uint8_t keypart = key[depth];
+  struct memuse_rnode *child = NULL;
+  if (node->count > 0) child = memuse_rnode_lookup(node, keypart);
+  if (child != NULL && (depth + 1) < keylen) {
+    return memuse_rnode_find_child(child, depth + 1, key, keylen);
+  }
+  return child;
+}
+
+/**
+ * @brief Free all resources associated with a node.
+ *
+ * @param node the rnode.
+ */ +void memuse_rnode_cleanup(struct memuse_rnode *node) { + + if (!node) return; + + for (size_t k = 0; k < node->count; k++) { + memuse_rnode_cleanup(node->children[k]); + free(node->children[k]); + } + if (node->count > 0) free(node->children); +} + +/** + * @brief Dump a representation of the radix tree rooted at a node to stdout. + * + * Debugging code. + * + * @param depth the depth of the node in the tree, root is 0. + * @param node the node at which to start dumping. + * @param full if not zero then nodes that are not storing a value + * are also reported. + */ +void memuse_rnode_dump(int depth, struct memuse_rnode *node, int full) { + + /* Value of the full key, to this depth. Assumes full key is a pointer, + * so uncomment when using strings. */ + static union { + // uint8_t key[MEMUSE_MAXLABLEN]; + // char ptr[MEMUSE_MAXLABLEN]; + uint8_t key[sizeof(uintptr_t)]; + void *ptr; + } keyparts = {0}; + + /* Record keypart at this depth. Root has no keypart. */ + if (depth != 0) keyparts.key[depth - 1] = node->keypart; + + // if (node->ptr != NULL || full) { + // keyparts.key[depth] = '\0'; + // + // /* Gather children's keys if full. */ + // char fullkey[MEMUSE_MAXLABLEN]; + // if (full) { + // for (size_t k = 0; k < node->count; k++) { + // fullkey[k] = node->children[k]->keypart; + // } + // fullkey[node->count] = '\0'; + // printf("dump @ depth: %d keypart: %d key: %s value: %p fullkey: %s\n", + // depth, node->keypart, keyparts.ptr, node->ptr, fullkey); + // } else { + // printf("dump @ depth: %d keypart: %d key: %s value: %p\n", depth, + // node->keypart, keyparts.ptr, node->ptr); + // } + //} + + if (node->ptr != NULL || full) { + printf("dump @ depth: %d keypart: %d key: %p value: %p\n", depth, + node->keypart, keyparts.ptr, node->ptr); + } + + /* Recurse to all children. */ + for (size_t k = 0; k < node->count; k++) { + memuse_rnode_dump(depth + 1, node->children[k], full); + } +} diff --git a/src/memuse_rnodes.h b/src/memuse_rnodes.h new file mode 100644 index 0000000000000000000000000000000000000000..41f24a98ad60396aec06d3170d478834428007ce --- /dev/null +++ b/src/memuse_rnodes.h @@ -0,0 +1,51 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (c) 2018 Peter W. Draper (p.w.draper@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ +#ifndef SWIFT_MEMUSE_RNODE_H +#define SWIFT_MEMUSE_RNODE_H + +/* Config parameters. */ +#include "../config.h" + +/* Includes. */ +#include <stdlib.h> + +/* A radix node, this has a single byte key and a pointer to some related + * resource. It also holds a sorted list of children, if any. */ +struct memuse_rnode { + + /* Byte key of this node. */ + uint8_t keypart; + + /* Value of this node, if set. 
*/
+  void *ptr;
+
+  /* Sorted pointers to children of this node. */
+  struct memuse_rnode **children;
+  unsigned int count;
+};
+
+void memuse_rnode_dump(int depth, struct memuse_rnode *node, int full);
+void memuse_rnode_insert_child(struct memuse_rnode *node, uint8_t depth,
+                               uint8_t *key, uint8_t keylen, void *value);
+struct memuse_rnode *memuse_rnode_find_child(struct memuse_rnode *node,
+                                             uint8_t depth, uint8_t *key,
+                                             uint8_t keylen);
+void memuse_rnode_cleanup(struct memuse_rnode *node);
+
+#endif /* SWIFT_MEMUSE_RNODE_H */
diff --git a/src/mpiuse.c b/src/mpiuse.c
new file mode 100644
index 0000000000000000000000000000000000000000..d290e55291ee91122e8a9c4753a321befe1c5e52
--- /dev/null
+++ b/src/mpiuse.c
@@ -0,0 +1,354 @@
+/* This file is part of SWIFT.
+ * Copyright (c) 2019 Peter W. Draper (p.w.draper@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+/**
+ * @file mpiuse.c
+ * @brief Routines to report on the MPI tasks used in SWIFT.
+ */
+/* Config parameters. */
+#include "../config.h"
+
+#if defined(SWIFT_MPIUSE_REPORTS) && defined(WITH_MPI)
+
+/* Standard includes. */
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+/* Local defines. */
+#include "mpiuse.h"
+
+/* Local includes. */
+#include "atomic.h"
+#include "clocks.h"
+#include "engine.h"
+#include "error.h"
+#include "memuse_rnodes.h"
+
+/* The initial size and increment of the log entries buffer. */
+#define MPIUSE_INITLOG 1000000
+
+/* A megabyte for conversions. */
+#define MEGABYTE 1048576.0
+
+/* Also recorded in logger. */
+extern int engine_rank;
+extern int engine_current_step;
+
+/* Entry for the logger of MPI send and recv requests in a step. */
+struct mpiuse_log_entry {
+
+  /* Type and subtype of the MPI task. */
+  int type;
+  int subtype;
+
+  /* Step of the action. */
+  int step;
+
+  /* Whether this is an activation, i.e. a send or recv, or a completed
+   * handoff. Not the same as delivered; we need to match the records
+   * across ranks to see that. */
+  int activation;
+
+  /* Size in bytes of the request. */
+  size_t size;
+
+  /* Pointer to the request associated with the call. Needs to be unique
+   * and to match the subsequent successful MPI_Test. */
+  union {
+    void *ptr;
+    uint8_t vptr[sizeof(uintptr_t)]; /* For rnode keys. */
+  };
+
+  /* Ticks at the time of this action. */
+  ticks tic;
+
+  /* Time taken for the handoff of this action. */
+  ticks acttic;
+
+  /* Whether the request is still active, i.e. a successful test has not
+   * been seen. */
+  int active;
+
+  /* Rank of the other side of the communication. */
+  int otherrank;
+
+  /* The MPI tag. */
+  int tag;
+};
+
+/* The log of activations and handoffs. All volatile as accessed from threads
+ * that use the value to synchronise. */
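+/* Threads reserve a slot in the log by atomically incrementing
+ * mpiuse_log_count and spin while any reallocation is in progress;
+ * mpiuse_log_done counts the completed writes so that a reallocating
+ * thread can wait for the stragglers. */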
+static struct mpiuse_log_entry *volatile mpiuse_log = NULL;
+static volatile size_t mpiuse_log_size = 0;
+static volatile size_t mpiuse_log_count = 0;
+static volatile size_t mpiuse_log_done = 0;
+
+/**
+ * @brief Reallocate the entries log if more space is needed.
+ */
+static void mpiuse_log_reallocate(size_t ind) {
+
+  if (ind == 0) {
+
+    /* Need to perform initialization. Be generous. */
+    if ((mpiuse_log = (struct mpiuse_log_entry *)malloc(
+             sizeof(struct mpiuse_log_entry) * MPIUSE_INITLOG)) == NULL)
+      error("Failed to allocate MPI use log.");
+
+    /* Last action. */
+    mpiuse_log_size = MPIUSE_INITLOG;
+
+  } else {
+    struct mpiuse_log_entry *new_log;
+    if ((new_log = (struct mpiuse_log_entry *)malloc(
+             sizeof(struct mpiuse_log_entry) *
+             (mpiuse_log_size + MPIUSE_INITLOG))) == NULL)
+      error("Failed to re-allocate MPI use log.");
+
+    /* Wait for all writes to the old buffer to complete. */
+    while (mpiuse_log_done < mpiuse_log_size)
+      ;
+
+    /* Copy to the new buffer. */
+    memcpy(new_log, mpiuse_log,
+           sizeof(struct mpiuse_log_entry) * mpiuse_log_size);
+    free(mpiuse_log);
+    mpiuse_log = new_log;
+
+    /* Last action, releases the waiting threads. */
+    atomic_add(&mpiuse_log_size, MPIUSE_INITLOG);
+  }
+}
+
+/**
+ * @brief Log an MPI request or handoff.
+ *
+ * @param type the task type (send or recv).
+ * @param subtype the task subtype.
+ * @param ptr pointer to the MPI request.
+ * @param activation non-zero for the start of a request, i.e. an MPI_Isend
+ *                   or MPI_Irecv, zero for a successful MPI_Test.
+ * @param size the size in bytes of the memory to be transferred or received.
+ *             0 for a deactivation.
+ * @param otherrank the other rank associated with the transfer.
+ * @param tag the MPI tag.
+ */
+void mpiuse_log_allocation(int type, int subtype, void *ptr, int activation,
+                           size_t size, int otherrank, int tag) {
+
+  size_t ind = atomic_inc(&mpiuse_log_count);
+
+  /* If we are at the current size we need more space. */
+  if (ind == mpiuse_log_size) mpiuse_log_reallocate(ind);
+
+  /* Other threads wait for space. */
+  while (ind > mpiuse_log_size)
+    ;
+
+  /* Record the log. */
+  mpiuse_log[ind].step = engine_current_step;
+  mpiuse_log[ind].type = type;
+  mpiuse_log[ind].subtype = subtype;
+  mpiuse_log[ind].activation = activation;
+  mpiuse_log[ind].size = size;
+  mpiuse_log[ind].ptr = ptr;
+  mpiuse_log[ind].otherrank = otherrank;
+  mpiuse_log[ind].tag = tag;
+  mpiuse_log[ind].tic = getticks();
+  mpiuse_log[ind].acttic = 0;
+  mpiuse_log[ind].active = 1;
+  atomic_inc(&mpiuse_log_done);
+}
+
+/**
+ * @brief Dump the log to a file and reset it, if there is anything to dump.
+ *
+ * @param filename name of the file for the log dump.
+ * @param stepticks the clock ticks at the start of the step, if dumping a
+ *                  step, otherwise some locally relative time that might
+ *                  help to synchronize across the ranks.
+ */
+void mpiuse_log_dump(const char *filename, ticks stepticks) {
+
+  /* Skip if nothing logged this step. */
+  if (mpiuse_log_count == 0) return;
+
+  // ticks tic = getticks();
+
+  /* Create the radix tree root node. */
+  struct memuse_rnode *memuse_rnode_root =
+      (struct memuse_rnode *)calloc(1, sizeof(struct memuse_rnode));
+
+  /* Snapshot the count, so any new logs are not processed while we are
+   * dumping. */
+  size_t log_count = mpiuse_log_count;
+
+  /* Open the output file. */
+  FILE *fd;
+  if ((fd = fopen(filename, "w")) == NULL) {
+    message("Failed to create MPI use log file '%s', logs not dumped.",
+            filename);
+    free(memuse_rnode_root);
+    return;
+  }
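+
+  /* Each record is keyed on the bytes of its request address in the radix
+   * tree: an activation stores its log entry against the address and the
+   * matching completion looks that up again, giving the active time and
+   * the size to remove from the sum in flight. */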
+  /* Write a header. */
+  fprintf(fd,
+          "# stic etic dtic step rank otherrank type itype subtype isubtype "
+          "activation tag size sum\n");
+
+  size_t mpiuse_current = 0;
+  size_t mpiuse_max = 0;
+  double mpiuse_sum = 0;
+  size_t mpiuse_actcount = 0;
+  for (size_t k = 0; k < log_count; k++) {
+
+    /* Check if this address has already been recorded. */
+    struct memuse_rnode *child = memuse_rnode_find_child(
+        memuse_rnode_root, 0, mpiuse_log[k].vptr, sizeof(uintptr_t));
+
+    if (child != NULL && child->ptr != NULL) {
+
+      /* Should be the handoff. Check that. */
+      if (mpiuse_log[k].activation) {
+
+        /* Used twice, this is an error, but just complain as it is not
+         * fatal. */
+#if SWIFT_DEBUG_CHECKS
+        message(
+            "Used the same MPI request address twice "
+            "(%s/%s: %d->%d: %zd/%d)",
+            taskID_names[mpiuse_log[k].type],
+            subtaskID_names[mpiuse_log[k].subtype], engine_rank,
+            mpiuse_log[k].otherrank, mpiuse_log[k].size, mpiuse_log[k].tag);
+#endif
+        continue;
+      }
+
+      /* A completion, so update the missing fields. The size of the request
+       * is negated so that it is removed from the sum in flight. */
+      struct mpiuse_log_entry *oldlog = (struct mpiuse_log_entry *)child->ptr;
+      mpiuse_log[k].size = -oldlog->size;
+      mpiuse_log[k].otherrank = oldlog->otherrank;
+      mpiuse_log[k].tag = oldlog->tag;
+
+      /* Time taken to handoff. */
+      mpiuse_log[k].acttic = mpiuse_log[k].tic - oldlog->tic;
+
+      /* And deactivate this key. */
+      child->ptr = NULL;
+
+      /* And mark this as handed off. */
+      mpiuse_log[k].active = 0;
+      oldlog->active = 0;
+
+    } else if (child == NULL && mpiuse_log[k].activation) {
+
+      /* Not found, so a new send/recv, which we store in the log against
+       * the address. */
+      memuse_rnode_insert_child(memuse_rnode_root, 0, mpiuse_log[k].vptr,
+                                sizeof(uintptr_t), &mpiuse_log[k]);
+
+    } else if (child == NULL && !mpiuse_log[k].activation) {
+
+      /* Unmatched handoff, not OK, but not fatal. */
+#if SWIFT_DEBUG_CHECKS
+      if (mpiuse_log[k].ptr != NULL) {
+        message("Unmatched MPI_Test found: (%s/%s: %d->%d: %zd/%d)",
+                taskID_names[mpiuse_log[k].type],
+                subtaskID_names[mpiuse_log[k].subtype], engine_rank,
+                mpiuse_log[k].otherrank, mpiuse_log[k].size,
+                mpiuse_log[k].tag);
+      }
+#endif
+      continue;
+    } else if (mpiuse_log[k].activation) {
+
+      /* Must be a new use of a previously released request address, so we
+       * store it. */
+      memuse_rnode_insert_child(memuse_rnode_root, 0, mpiuse_log[k].vptr,
+                                sizeof(uintptr_t), &mpiuse_log[k]);
+
+    } else {
+      message("Weird MPI log record found: (%s/%s: %d->%d: %zd/%d/%d/%p)",
+              taskID_names[mpiuse_log[k].type],
+              subtaskID_names[mpiuse_log[k].subtype], engine_rank,
+              mpiuse_log[k].otherrank, mpiuse_log[k].size, mpiuse_log[k].tag,
+              mpiuse_log[k].activation, mpiuse_log[k].ptr);
+      continue;
+    }
+
+    /* Sum of the memory in flight. */
+    mpiuse_current += mpiuse_log[k].size;
+
+    /* Gather for the stats report. */
+    if (mpiuse_log[k].activation) {
+      if (mpiuse_log[k].size > mpiuse_max) mpiuse_max = mpiuse_log[k].size;
+      mpiuse_sum += (double)mpiuse_log[k].size;
+      mpiuse_actcount++;
+    }
+
+    /* And output. */
+    fprintf(fd, "%lld %lld %lld %d %d %d %s %d %s %d %d %d %zd %zd\n",
+            mpiuse_log[k].tic - stepticks,
+            mpiuse_log[k].tic - clocks_start_ticks, mpiuse_log[k].acttic,
+            mpiuse_log[k].step, engine_rank, mpiuse_log[k].otherrank,
+            taskID_names[mpiuse_log[k].type], mpiuse_log[k].type,
+            subtaskID_names[mpiuse_log[k].subtype], mpiuse_log[k].subtype,
+            mpiuse_log[k].activation, mpiuse_log[k].tag, mpiuse_log[k].size,
+            mpiuse_current);
+  }
+
+#ifdef MEMUSE_RNODE_DUMP
+  /* Debug dump of the tree. */
+  // memuse_rnode_dump(0, memuse_rnode_root, 0);
+#endif
+
+  /* Write our statistics. */
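+  /* The "##" prefix keeps the statistics as comment lines, so per-record
+   * parsers, like tools/match_mpireports.py, will skip them. */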
+  fprintf(fd, "##\n");
+  fprintf(fd, "## Number of requests: %zd\n", mpiuse_actcount);
+  fprintf(fd, "## Maximum request size: %.4f (MB)\n", mpiuse_max / MEGABYTE);
+  fprintf(fd, "## Sum of all requests: %.4f (MB)\n", mpiuse_sum / MEGABYTE);
+  fprintf(fd, "## Mean of all requests: %.4f (MB)\n",
+          mpiuse_sum / (double)mpiuse_actcount / MEGABYTE);
+  fprintf(fd, "##\n");
+
+  /* Now check for any still active logs; these are errors, as all the
+   * requests should have been matched. */
+  if (mpiuse_current != 0) {
+    message("Some MPI requests have not been completed");
+    for (size_t k = 0; k < log_count; k++) {
+      if (mpiuse_log[k].active)
+        message("%s/%s: %d->%d: %zd/%d", taskID_names[mpiuse_log[k].type],
+                subtaskID_names[mpiuse_log[k].subtype], engine_rank,
+                mpiuse_log[k].otherrank, mpiuse_log[k].size,
+                mpiuse_log[k].tag);
+    }
+  }
+
+  /* Finished with the rnodes. */
+  memuse_rnode_cleanup(memuse_rnode_root);
+
+  /* Clear the log. Unlike the memory log, we expect this to clear from step
+   * to step. */
+  mpiuse_log_count = 0;
+  mpiuse_log_done = 0;
+
+  /* Close the file. */
+  fflush(fd);
+  fclose(fd);
+
+  // message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
+  //         clocks_getunit());
+}
+
+#endif /* defined(SWIFT_MPIUSE_REPORTS) && defined(WITH_MPI) */
diff --git a/src/mpiuse.h b/src/mpiuse.h
new file mode 100644
index 0000000000000000000000000000000000000000..20c6d05e1cc2d6b3995fbfc1dc2f69809ed093c9
--- /dev/null
+++ b/src/mpiuse.h
@@ -0,0 +1,44 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2019 Peter W. Draper (p.w.draper@durham.ac.uk)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+#ifndef SWIFT_MPIUSE_H
+#define SWIFT_MPIUSE_H
+
+/* Config parameters. */
+#include "../config.h"
+
+/* Local includes. */
+#include "cycle.h"
+
+/* Includes. */
+#include <stdlib.h>
+
+/* API. */
+#if defined(SWIFT_MPIUSE_REPORTS) && defined(WITH_MPI)
+void mpiuse_log_dump(const char *filename, ticks stepticks);
+void mpiuse_log_allocation(int type, int subtype, void *ptr, int activation,
+                           size_t size, int otherrank, int tag);
+#else
+
+/* No-op when not reporting.
*/ +#define mpiuse_log_allocation(type, subtype, ptr, activation, size, otherrank, \ + tag) \ + ; +#endif /* defined(SWIFT_MPIUSE_REPORTS) && defined(WITH_MPI) */ + +#endif /* SWIFT_MPIUSE_H */ diff --git a/src/scheduler.c b/src/scheduler.c index 1fad63fd7141db2aad486aaaa7e4dc877a8aa3b8..e08b0225d15b434649c0ad8f56bbdf3494b76285 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -47,6 +47,7 @@ #include "intrinsics.h" #include "kernel_hydro.h" #include "memuse.h" +#include "mpiuse.h" #include "queue.h" #include "sort_part.h" #include "space.h" @@ -1694,281 +1695,234 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { break; case task_type_recv: #ifdef WITH_MPI + { + size_t size = 0; /* Size in bytes. */ + size_t count = 0; /* Number of elements to receive */ + MPI_Datatype type = MPI_BYTE; /* Type of the elements */ + void *buff = NULL; /* Buffer to accept elements */ + if (t->subtype == task_subtype_tend_part) { - t->buff = (struct pcell_step_hydro *)malloc( - sizeof(struct pcell_step_hydro) * t->ci->mpi.pcell_size); - err = MPI_Irecv( - t->buff, t->ci->mpi.pcell_size * sizeof(struct pcell_step_hydro), - MPI_BYTE, t->ci->nodeID, t->flags, subtaskMPI_comms[t->subtype], - &t->req); + + count = size = + t->ci->mpi.pcell_size * sizeof(struct pcell_step_hydro); + buff = t->buff = malloc(count); + } else if (t->subtype == task_subtype_tend_gpart) { - t->buff = (struct pcell_step_grav *)malloc( - sizeof(struct pcell_step_grav) * t->ci->mpi.pcell_size); - err = MPI_Irecv( - t->buff, t->ci->mpi.pcell_size * sizeof(struct pcell_step_grav), - MPI_BYTE, t->ci->nodeID, t->flags, subtaskMPI_comms[t->subtype], - &t->req); + + count = size = t->ci->mpi.pcell_size * sizeof(struct pcell_step_grav); + buff = t->buff = malloc(count); + } else if (t->subtype == task_subtype_tend_spart) { - t->buff = (struct pcell_step_stars *)malloc( - sizeof(struct pcell_step_stars) * t->ci->mpi.pcell_size); - err = MPI_Irecv( - t->buff, t->ci->mpi.pcell_size * sizeof(struct pcell_step_stars), - MPI_BYTE, t->ci->nodeID, t->flags, subtaskMPI_comms[t->subtype], - &t->req); + + count = size = + t->ci->mpi.pcell_size * sizeof(struct pcell_step_stars); + buff = t->buff = malloc(count); + } else if (t->subtype == task_subtype_tend_bpart) { - t->buff = (struct pcell_step_black_holes *)malloc( - sizeof(struct pcell_step_black_holes) * t->ci->mpi.pcell_size); - err = MPI_Irecv( - t->buff, - t->ci->mpi.pcell_size * sizeof(struct pcell_step_black_holes), - MPI_BYTE, t->ci->nodeID, t->flags, subtaskMPI_comms[t->subtype], - &t->req); + + count = size = + t->ci->mpi.pcell_size * sizeof(struct pcell_step_black_holes); + buff = t->buff = malloc(count); + } else if (t->subtype == task_subtype_part_swallow) { - t->buff = (struct black_holes_part_data *)malloc( - sizeof(struct black_holes_part_data) * t->ci->hydro.count); - err = MPI_Irecv( - t->buff, - t->ci->hydro.count * sizeof(struct black_holes_part_data), - MPI_BYTE, t->ci->nodeID, t->flags, subtaskMPI_comms[t->subtype], - &t->req); + + count = size = + t->ci->hydro.count * sizeof(struct black_holes_part_data); + buff = t->buff = malloc(count); + } else if (t->subtype == task_subtype_bpart_merger) { - t->buff = (struct black_holes_bpart_data *)malloc( - sizeof(struct black_holes_bpart_data) * t->ci->black_holes.count); - err = MPI_Irecv( - t->buff, - t->ci->black_holes.count * sizeof(struct black_holes_bpart_data), - MPI_BYTE, t->ci->nodeID, t->flags, subtaskMPI_comms[t->subtype], - &t->req); + count = size = + sizeof(struct black_holes_bpart_data) * 
t->ci->black_holes.count; + buff = t->buff = malloc(count); + } else if (t->subtype == task_subtype_xv || t->subtype == task_subtype_rho || t->subtype == task_subtype_gradient) { - err = MPI_Irecv(t->ci->hydro.parts, t->ci->hydro.count, part_mpi_type, - t->ci->nodeID, t->flags, subtaskMPI_comms[t->subtype], - &t->req); + + count = t->ci->hydro.count; + size = count * sizeof(struct part); + type = part_mpi_type; + buff = t->ci->hydro.parts; + } else if (t->subtype == task_subtype_gpart) { - err = MPI_Irecv(t->ci->grav.parts, t->ci->grav.count, gpart_mpi_type, - t->ci->nodeID, t->flags, subtaskMPI_comms[t->subtype], - &t->req); + + count = t->ci->grav.count; + size = count * sizeof(struct gpart); + type = gpart_mpi_type; + buff = t->ci->grav.parts; + } else if (t->subtype == task_subtype_spart) { - err = MPI_Irecv(t->ci->stars.parts, t->ci->stars.count, - spart_mpi_type, t->ci->nodeID, t->flags, - subtaskMPI_comms[t->subtype], &t->req); + + count = t->ci->stars.count; + size = count * sizeof(struct spart); + type = spart_mpi_type; + buff = t->ci->stars.parts; + } else if (t->subtype == task_subtype_bpart_rho || t->subtype == task_subtype_bpart_swallow || t->subtype == task_subtype_bpart_feedback) { - err = MPI_Irecv(t->ci->black_holes.parts, t->ci->black_holes.count, - bpart_mpi_type, t->ci->nodeID, t->flags, - subtaskMPI_comms[t->subtype], &t->req); + + count = t->ci->black_holes.count; + size = count * sizeof(struct bpart); + type = bpart_mpi_type; + buff = t->ci->black_holes.parts; + } else if (t->subtype == task_subtype_multipole) { - t->buff = (struct gravity_tensors *)malloc( - sizeof(struct gravity_tensors) * t->ci->mpi.pcell_size); - err = MPI_Irecv(t->buff, t->ci->mpi.pcell_size, multipole_mpi_type, - t->ci->nodeID, t->flags, subtaskMPI_comms[t->subtype], - &t->req); + + count = t->ci->mpi.pcell_size; + size = count * sizeof(struct gravity_tensors); + type = multipole_mpi_type; + buff = t->buff = malloc(size); + } else if (t->subtype == task_subtype_sf_counts) { - t->buff = (struct pcell_sf *)malloc(sizeof(struct pcell_sf) * - t->ci->mpi.pcell_size); - err = MPI_Irecv(t->buff, - t->ci->mpi.pcell_size * sizeof(struct pcell_sf), - MPI_BYTE, t->ci->nodeID, t->flags, - subtaskMPI_comms[t->subtype], &t->req); + + count = size = t->ci->mpi.pcell_size * sizeof(struct pcell_sf); + buff = t->buff = malloc(count); + } else { error("Unknown communication sub-type"); } + + err = MPI_Irecv(buff, count, type, t->ci->nodeID, t->flags, + subtaskMPI_comms[t->subtype], &t->req); + if (err != MPI_SUCCESS) { mpi_error(err, "Failed to emit irecv for particle data."); } + + /* And log, if logging enabled. */ + mpiuse_log_allocation(t->type, t->subtype, &t->req, 1, size, + t->ci->nodeID, t->flags); + qid = 1 % s->nr_queues; + } #else error("SWIFT was not compiled with MPI support."); #endif - break; + break; case task_type_send: #ifdef WITH_MPI + { + size_t size = 0; /* Size in bytes. 
*/ + size_t count = 0; /* Number of elements to send */ + MPI_Datatype type = MPI_BYTE; /* Type of the elements */ + void *buff = NULL; /* Buffer to send */ + if (t->subtype == task_subtype_tend_part) { - t->buff = (struct pcell_step_hydro *)malloc( - sizeof(struct pcell_step_hydro) * t->ci->mpi.pcell_size); - cell_pack_end_step_hydro(t->ci, (struct pcell_step_hydro *)t->buff); - - if ((t->ci->mpi.pcell_size * sizeof(struct pcell_step_hydro)) > - s->mpi_message_limit) { - err = MPI_Isend( - t->buff, - t->ci->mpi.pcell_size * sizeof(struct pcell_step_hydro), - MPI_BYTE, t->cj->nodeID, t->flags, subtaskMPI_comms[t->subtype], - &t->req); - } else { - err = MPI_Issend( - t->buff, - t->ci->mpi.pcell_size * sizeof(struct pcell_step_hydro), - MPI_BYTE, t->cj->nodeID, t->flags, subtaskMPI_comms[t->subtype], - &t->req); - } + + size = count = + t->ci->mpi.pcell_size * sizeof(struct pcell_step_hydro); + buff = t->buff = malloc(size); + cell_pack_end_step_hydro(t->ci, (struct pcell_step_hydro *)buff); + } else if (t->subtype == task_subtype_tend_gpart) { - t->buff = (struct pcell_step_grav *)malloc( - sizeof(struct pcell_step_grav) * t->ci->mpi.pcell_size); - cell_pack_end_step_grav(t->ci, (struct pcell_step_grav *)t->buff); - - if ((t->ci->mpi.pcell_size * sizeof(struct pcell_step_grav)) > - s->mpi_message_limit) { - err = MPI_Isend( - t->buff, t->ci->mpi.pcell_size * sizeof(struct pcell_step_grav), - MPI_BYTE, t->cj->nodeID, t->flags, subtaskMPI_comms[t->subtype], - &t->req); - } else { - err = MPI_Issend( - t->buff, t->ci->mpi.pcell_size * sizeof(struct pcell_step_grav), - MPI_BYTE, t->cj->nodeID, t->flags, subtaskMPI_comms[t->subtype], - &t->req); - } + + size = count = t->ci->mpi.pcell_size * sizeof(struct pcell_step_grav); + buff = t->buff = malloc(size); + cell_pack_end_step_grav(t->ci, (struct pcell_step_grav *)buff); + } else if (t->subtype == task_subtype_tend_spart) { - t->buff = (struct pcell_step_stars *)malloc( - sizeof(struct pcell_step_stars) * t->ci->mpi.pcell_size); - cell_pack_end_step_stars(t->ci, (struct pcell_step_stars *)t->buff); - - if ((t->ci->mpi.pcell_size * sizeof(struct pcell_step_stars)) > - s->mpi_message_limit) { - err = MPI_Isend( - t->buff, - t->ci->mpi.pcell_size * sizeof(struct pcell_step_stars), - MPI_BYTE, t->cj->nodeID, t->flags, subtaskMPI_comms[t->subtype], - &t->req); - } else { - err = MPI_Issend( - t->buff, - t->ci->mpi.pcell_size * sizeof(struct pcell_step_stars), - MPI_BYTE, t->cj->nodeID, t->flags, subtaskMPI_comms[t->subtype], - &t->req); - } + + size = count = + t->ci->mpi.pcell_size * sizeof(struct pcell_step_stars); + buff = t->buff = malloc(size); + cell_pack_end_step_stars(t->ci, (struct pcell_step_stars *)buff); + } else if (t->subtype == task_subtype_tend_bpart) { - t->buff = (struct pcell_step_black_holes *)malloc( - sizeof(struct pcell_step_black_holes) * t->ci->mpi.pcell_size); - cell_pack_end_step_black_holes( - t->ci, (struct pcell_step_black_holes *)t->buff); - - if ((t->ci->mpi.pcell_size * sizeof(struct pcell_step_black_holes)) > - s->mpi_message_limit) { - err = MPI_Isend( - t->buff, - t->ci->mpi.pcell_size * sizeof(struct pcell_step_black_holes), - MPI_BYTE, t->cj->nodeID, t->flags, subtaskMPI_comms[t->subtype], - &t->req); - } else { - err = MPI_Issend( - t->buff, - t->ci->mpi.pcell_size * sizeof(struct pcell_step_black_holes), - MPI_BYTE, t->cj->nodeID, t->flags, subtaskMPI_comms[t->subtype], - &t->req); - } + + size = count = + t->ci->mpi.pcell_size * sizeof(struct pcell_step_black_holes); + buff = t->buff = malloc(size); + 
cell_pack_end_step_black_holes(t->ci, + (struct pcell_step_black_holes *)buff); + } else if (t->subtype == task_subtype_part_swallow) { - t->buff = (struct black_holes_part_data *)malloc( - sizeof(struct black_holes_part_data) * t->ci->hydro.count); - cell_pack_part_swallow(t->ci, - (struct black_holes_part_data *)t->buff); - - if (t->ci->hydro.count * sizeof(struct black_holes_part_data) > - s->mpi_message_limit) { - err = MPI_Isend( - t->buff, - t->ci->hydro.count * sizeof(struct black_holes_part_data), - MPI_BYTE, t->cj->nodeID, t->flags, subtaskMPI_comms[t->subtype], - &t->req); - } else { - err = MPI_Issend( - t->buff, - t->ci->hydro.count * sizeof(struct black_holes_part_data), - MPI_BYTE, t->cj->nodeID, t->flags, subtaskMPI_comms[t->subtype], - &t->req); - } + + size = count = + t->ci->hydro.count * sizeof(struct black_holes_part_data); + buff = t->buff = malloc(size); + cell_pack_part_swallow(t->ci, (struct black_holes_part_data *)buff); + } else if (t->subtype == task_subtype_bpart_merger) { - t->buff = (struct black_holes_bpart_data *)malloc( - sizeof(struct black_holes_bpart_data) * t->ci->black_holes.count); + + size = count = + sizeof(struct black_holes_bpart_data) * t->ci->black_holes.count; + buff = t->buff = malloc(size); cell_pack_bpart_swallow(t->ci, (struct black_holes_bpart_data *)t->buff); - if (t->ci->black_holes.count * sizeof(struct black_holes_bpart_data) > - s->mpi_message_limit) { - err = MPI_Isend(t->buff, - t->ci->black_holes.count * - sizeof(struct black_holes_bpart_data), - MPI_BYTE, t->cj->nodeID, t->flags, - subtaskMPI_comms[t->subtype], &t->req); - } else { - err = MPI_Issend(t->buff, - t->ci->black_holes.count * - sizeof(struct black_holes_bpart_data), - MPI_BYTE, t->cj->nodeID, t->flags, - subtaskMPI_comms[t->subtype], &t->req); - } - } else if (t->subtype == task_subtype_xv || t->subtype == task_subtype_rho || t->subtype == task_subtype_gradient) { - if ((t->ci->hydro.count * sizeof(struct part)) > s->mpi_message_limit) - err = MPI_Isend(t->ci->hydro.parts, t->ci->hydro.count, - part_mpi_type, t->cj->nodeID, t->flags, - subtaskMPI_comms[t->subtype], &t->req); - else - err = MPI_Issend(t->ci->hydro.parts, t->ci->hydro.count, - part_mpi_type, t->cj->nodeID, t->flags, - subtaskMPI_comms[t->subtype], &t->req); + + count = t->ci->hydro.count; + size = count * sizeof(struct part); + type = part_mpi_type; + buff = t->ci->hydro.parts; + } else if (t->subtype == task_subtype_gpart) { - if ((t->ci->grav.count * sizeof(struct gpart)) > s->mpi_message_limit) - err = MPI_Isend(t->ci->grav.parts, t->ci->grav.count, - gpart_mpi_type, t->cj->nodeID, t->flags, - subtaskMPI_comms[t->subtype], &t->req); - else - err = MPI_Issend(t->ci->grav.parts, t->ci->grav.count, - gpart_mpi_type, t->cj->nodeID, t->flags, - subtaskMPI_comms[t->subtype], &t->req); + + count = t->ci->grav.count; + size = count * sizeof(struct gpart); + type = gpart_mpi_type; + buff = t->ci->grav.parts; + } else if (t->subtype == task_subtype_spart) { - if ((t->ci->stars.count * sizeof(struct spart)) > - s->mpi_message_limit) - err = MPI_Isend(t->ci->stars.parts, t->ci->stars.count, - spart_mpi_type, t->cj->nodeID, t->flags, - subtaskMPI_comms[t->subtype], &t->req); - else - err = MPI_Issend(t->ci->stars.parts, t->ci->stars.count, - spart_mpi_type, t->cj->nodeID, t->flags, - subtaskMPI_comms[t->subtype], &t->req); + + count = t->ci->stars.count; + size = count * sizeof(struct spart); + type = spart_mpi_type; + buff = t->ci->stars.parts; + } else if (t->subtype == task_subtype_bpart_rho || t->subtype == 
task_subtype_bpart_swallow || t->subtype == task_subtype_bpart_feedback) { - if ((t->ci->black_holes.count * sizeof(struct bpart)) > - s->mpi_message_limit) - err = MPI_Isend(t->ci->black_holes.parts, t->ci->black_holes.count, - bpart_mpi_type, t->cj->nodeID, t->flags, - subtaskMPI_comms[t->subtype], &t->req); - else - err = MPI_Issend(t->ci->black_holes.parts, t->ci->black_holes.count, - bpart_mpi_type, t->cj->nodeID, t->flags, - subtaskMPI_comms[t->subtype], &t->req); + + count = t->ci->black_holes.count; + size = count * sizeof(struct bpart); + type = bpart_mpi_type; + buff = t->ci->black_holes.parts; + } else if (t->subtype == task_subtype_multipole) { - t->buff = (struct gravity_tensors *)malloc( - sizeof(struct gravity_tensors) * t->ci->mpi.pcell_size); - cell_pack_multipoles(t->ci, (struct gravity_tensors *)t->buff); - err = MPI_Isend(t->buff, t->ci->mpi.pcell_size, multipole_mpi_type, - t->cj->nodeID, t->flags, subtaskMPI_comms[t->subtype], - &t->req); + + count = t->ci->mpi.pcell_size; + size = count * sizeof(struct gravity_tensors); + type = multipole_mpi_type; + buff = t->buff = malloc(size); + cell_pack_multipoles(t->ci, (struct gravity_tensors *)buff); + } else if (t->subtype == task_subtype_sf_counts) { - t->buff = (struct pcell_sf *)malloc(sizeof(struct pcell_sf) * - t->ci->mpi.pcell_size); + + size = count = t->ci->mpi.pcell_size * sizeof(struct pcell_sf); + buff = t->buff = malloc(size); cell_pack_sf_counts(t->ci, (struct pcell_sf *)t->buff); - err = MPI_Isend(t->buff, - t->ci->mpi.pcell_size * sizeof(struct pcell_sf), - MPI_BYTE, t->cj->nodeID, t->flags, - subtaskMPI_comms[t->subtype], &t->req); + } else { error("Unknown communication sub-type"); } + + if (size > s->mpi_message_limit) { + err = MPI_Isend(buff, count, type, t->cj->nodeID, t->flags, + subtaskMPI_comms[t->subtype], &t->req); + } else { + err = MPI_Issend(buff, count, type, t->cj->nodeID, t->flags, + subtaskMPI_comms[t->subtype], &t->req); + } + if (err != MPI_SUCCESS) { mpi_error(err, "Failed to emit isend for particle data."); } + + /* And log, if logging enabled. */ + mpiuse_log_allocation(t->type, t->subtype, &t->req, 1, size, + t->cj->nodeID, t->flags); + qid = 0; + } #else error("SWIFT was not compiled with MPI support."); #endif - break; + break; default: qid = -1; } diff --git a/src/swift.h b/src/swift.h index fe9196a8fcf6d1845c9446c480c7961504a4756f..258c714f23f53ae6fa0439f55f0bbd832a055ecc 100644 --- a/src/swift.h +++ b/src/swift.h @@ -56,6 +56,7 @@ #include "memuse.h" #include "mesh_gravity.h" #include "minmax.h" +#include "mpiuse.h" #include "multipole.h" #include "outputlist.h" #include "parallel_io.h" diff --git a/src/task.c b/src/task.c index 4d6cfa2482491b1a08f6b28f7188fb94448afb2e..94652aaa363053c58aa38054f9a1b7c9d0c19b3a 100644 --- a/src/task.c +++ b/src/task.c @@ -46,6 +46,7 @@ #include "error.h" #include "inline.h" #include "lock.h" +#include "mpiuse.h" /* Task type names. */ const char *taskID_names[task_type_count] = {"none", @@ -552,6 +553,12 @@ int task_lock(struct task *t) { "%s).", taskID_names[t->type], subtaskID_names[t->subtype], t->flags, buff); } + + /* And log deactivation, if logging enabled. 
*/
+  if (res) {
+    mpiuse_log_allocation(t->type, t->subtype, &t->req, 0, 0, 0, 0);
+  }
+
   return res;
 #else
   error("SWIFT was not compiled with MPI support.");
diff --git a/tools/match_mpireports.py b/tools/match_mpireports.py
new file mode 100755
index 0000000000000000000000000000000000000000..3541506c41cbc8ca7f7ce67b30f42bb013adf35c
--- /dev/null
+++ b/tools/match_mpireports.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+"""
+Usage:
+    match_mpireports.py [options] mpi-reports...
+
+Match the rows where sends start and recvs complete from a set of mpi-reports
+of a single step, and output the matched rows to standard output. If captured,
+the output can be analysed to see how long the sends took to complete at the
+recvs.
+
+This file is part of SWIFT.
+
+Copyright (C) 2019 Peter W. Draper (p.w.draper@durham.ac.uk)
+All Rights Reserved.
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published
+by the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this program. If not, see <http://www.gnu.org/licenses/>.
+"""
+
+import sys
+import argparse
+
+# Handle the command line.
+parser = argparse.ArgumentParser(description="Match MPI reports")
+
+parser.add_argument("input",
+                    nargs="+",
+                    metavar="mpi-reports",
+                    help="MPI reports")
+parser.add_argument(
+    "-v",
+    "--verbose",
+    dest="verbose",
+    help="Verbose output",
+    default=False,
+    action="store_true",
+)
+args = parser.parse_args()
+infiles = args.input
+
+# Indices for the words in a line.
+sticcol = 0
+eticcol = 1
+dticcol = 2
+stepcol = 3
+rankcol = 4
+otherrankcol = 5
+typecol = 6
+itypecol = 7
+subtypecol = 8
+isubtypecol = 9
+activationcol = 10
+tagcol = 11
+sizecol = 12
+sumcol = 13
+
+# Keyed lines.
+sends = {}
+recvs = {}
+
+# Gather keys from the input files. We create dicts with matchable keys
+# for where sends start and recvs complete. Other pairings are possible...
+# Note the size of a completion recv is negative.
+for f in infiles:
+    if args.verbose:
+        print("Processing: " + f)
+    with open(f, "r") as fp:
+        for line in fp:
+            if line[0] == '#':
+                continue
+            words = line.split()
+            if words[activationcol] == "1" and words[typecol] == "send":
+                key = words[otherrankcol] + "/" + \
+                      words[rankcol] + "/" + \
+                      words[subtypecol] + "/" + \
+                      words[tagcol] + "/" + \
+                      words[sizecol]
+                if key not in sends:
+                    sends[key] = [line[:-1]]
+                else:
+                    sends[key].append(line[:-1])
+
+            elif words[activationcol] == "0" and words[typecol] == "recv":
+                key = words[rankcol] + "/" + \
+                      words[otherrankcol] + "/" + \
+                      words[subtypecol] + "/" + \
+                      words[tagcol] + "/" + \
+                      words[sizecol][1:]
+
+                if key not in recvs:
+                    recvs[key] = [line[:-1]]
+                else:
+                    recvs[key].append(line[:-1])
+
+# Now output. Note we could have unmatched recv keys, but we don't check
+# for that.
+for key in sends:
+    if key in recvs:
+        if len(sends[key]) == 1 and len(recvs[key]) == 1:
+            print(sends[key][0], recvs[key][0])
+        else:
+            print("# ERROR: found", len(sends[key]), "/", len(recvs[key]),
+                  "matches for key:", key, "should be 1/1")
+    else:
+        print("# ERROR: missing recv key:", key)
+
+
+sys.exit(0)
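+
+# Example: to pair up the sends and recvs of, say, step 6 across a run,
+# match all the per-rank reports of that step and capture the output:
+#
+#     ./match_mpireports.py mpiuse_report-rank*-step6.dat > matched-step6.dat
+#
+# Each output row is then a send record followed by its matching recv
+# record, so the difference of the two stic columns (which the step barrier
+# keeps roughly synchronised across ranks) gives the ticks from the
+# MPI_Isend activation to the completion of the matching MPI_Irecv.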