diff --git a/.gitignore b/.gitignore index 812aa06dd54ee8bb81e87796deb8ee05160fec72..893f786506ed168565fb995812136bcb7f1305f1 100644 --- a/.gitignore +++ b/.gitignore @@ -26,11 +26,13 @@ examples/*/*.xmf examples/*/*.hdf5 examples/*/*.png examples/*/*.txt +examples/*/*.dot examples/*/used_parameters.yml examples/*/*/*.xmf examples/*/*/*.hdf5 examples/*/*/*.png examples/*/*/*.txt +examples/*/*/*.dot examples/*/*/used_parameters.yml examples/*/gravity_checks_*.dat diff --git a/examples/analyse_dump_cells.py b/examples/analyse_dump_cells.py new file mode 100755 index 0000000000000000000000000000000000000000..3140e799566c75fe494a75895db0f4a8dcff4e57 --- /dev/null +++ b/examples/analyse_dump_cells.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python +""" +Usage: + analysedumpcells.py nx ny nx cell<1>.dat cell<2>.dat ... + +Analyses a number of output files created by calls to the dumpCells() debug +function (presumably called in engine_step()) to output a list of active +top-level cells, identifying those that are on the edges of the volumes being +processed on by various nodes. The point is that these should be minimised to +reduce the MPI communications. + +The nx, ny and nz arguments are the number of cells in the complete space, +we need these so that we can wrap around the edge of space correctly. + +This file is part of SWIFT. +Copyright (c) 2017 Peter W. Draper (p.w.draper@durham.ac.uk) + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published +by the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. +""" +import pylab as pl +import numpy as np +import sys +import pandas + +xcol = 0 +ycol = 1 +zcol = 2 +xwcol = 3 +ywcol = 4 +zwcol = 5 +localcol = 18 +supercol = 15 +activecol = 16 + +# Command-line arguments. +if len(sys.argv) < 5: + print "usage: ", sys.argv[0], " nx ny nz cell1.dat cell2.dat ..." + sys.exit(1) +nx = int(sys.argv[1]) +ny = int(sys.argv[2]) +nz = int(sys.argv[3]) + +print "# x y z onedge" +allactives = [] +onedge = 0 +tcount = 0 +for i in range(4, len(sys.argv)): + + # Read the file. + data = pl.loadtxt(sys.argv[i]) + #print data + + # Select cells that are on the current rank and are super cells. + rdata = data[data[:,localcol] == 1] + tdata = rdata[rdata[:,supercol] == 1] + + # Separation of the cells is in data. + xwidth = tdata[0,xwcol] + ywidth = tdata[0,ywcol] + zwidth = tdata[0,zwcol] + + # Fill space nx, ny,n nz with all toplevel cells and flag their active + # state. + space = np.zeros((nx,ny,nz)) + actives = [] + for line in tdata: + ix = int(np.rint(line[xcol] / xwidth)) + iy = int(np.rint(line[ycol] / ywidth)) + iz = int(np.rint(line[zcol] / zwidth)) + active = int(line[activecol]) + space[ix,iy,iz] = 1 + active + tcount = tcount + 1 + if active == 1: + actives.append([ix, iy, iz, line]) + + # Report all active cells and flag any without 26 neighbours. These are + # on the edge of the partition volume and will have foreign neighbour + # cells. 
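    # Aside (illustrative only, not called below): the explicit if/elif
    # wrapping in the loop that follows is equivalent to a periodic modulo,
    # assuming a box of n cells per dimension:
    #
    #   def wrap(index, n):
    #       return index % n   # wrap(-1, nx) == nx - 1, wrap(nx, nx) == 0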
+ for active in actives: + count = 0 + for ii in [-1, 0, 1]: + i = active[0] + ii + if i < 0: + i = i + nx + elif i >= nx: + i = i - nx + + for jj in [-1, 0, 1]: + j = active[1] + jj + if j < 0: + j = j + ny + elif j >= ny: + j = j - ny + + for kk in [-1, 0, 1]: + k = active[2] + kk + if k < 0: + k = k + nz + elif k >= nz: + k = k - nz + if space[i, j, k] > 0: + count = count + 1 + if count < 27: + onedge = onedge + 1 + print active[3][0], active[3][1], active[3][2], 1 + else: + print active[3][0], active[3][1], active[3][2], 0 + + allactives.extend(actives) + +print "# top cells: ", tcount, " active: ", len(allactives), " on edge: ", onedge + +sys.exit(0) + diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml index a5791306752c92bbe63985d029594178faae7213..948731d7879ec3e34f6e53f58dc72f6d7a6145ec 100644 --- a/examples/parameter_example.yml +++ b/examples/parameter_example.yml @@ -15,6 +15,7 @@ Scheduler: cell_split_size: 400 # (Optional) Maximal number of particles per cell (this is the default value). max_top_level_cells: 12 # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value). tasks_per_cell: 0 # (Optional) The average number of tasks per cell. If not large enough the simulation will fail (means guess...). + mpi_message_limit: 4096 # (Optional) Maximum MPI task message size to send non-buffered, KB. # Parameters governing the time integration (Set dt_min and dt_max to the same value for a fixed time-step run.) TimeIntegration: @@ -79,8 +80,13 @@ DomainDecomposition: initial_grid_x: 10 # (Optional) Grid size if the "grid" strategy is chosen. initial_grid_y: 10 # "" initial_grid_z: 10 # "" - repartition_type: task_weights # (Optional) The re-decomposition strategy: "none", "task_weights", "particle_weights", - # "edge_task_weights" or "hybrid_weights". + + repartition_type: costs/costs # (Optional) The re-decomposition strategy, one of: + # "none/none", "costs/costs", "counts/none", "none/costs", "counts/costs", + # "costs/time" or "none/time". + # These are vertex/edge weights with "costs" as task timing, "counts" as + # sum of particles and "time" as the expected time of the next updates + trigger: 0.05 # (Optional) Fractional (<1) CPU time difference between MPI ranks required to trigger a # new decomposition, or number of steps (>1) between decompositions minfrac: 0.9 # (Optional) Fractional of all particles that should be updated in previous step when @@ -120,7 +126,7 @@ SineWavePotential: amplitude: 10. # Amplitude of the sine wave (internal units) timestep_limit: 1. # Time-step dimensionless pre-factor. growth_time: 0. # (Optional) Time for the potential to grow to its final size. - + # Parameters related to cooling function ---------------------------------------------- # Constant du/dt cooling function diff --git a/examples/plot_task_dependencies.sh b/examples/plot_task_dependencies.sh new file mode 100755 index 0000000000000000000000000000000000000000..77784d8a9cdd3720621c9ad35c4cfbdaf0167ff1 --- /dev/null +++ b/examples/plot_task_dependencies.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# Creates a graphic from the task graph file dependency_graph.dot. +# Requires the graphviz command "dot". + +if [ ! -e dependency_graph.dot ]; then + echo "Missing task-graph output 'dependency_graph.dot'! Cannot generate figure." 
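    # For reference, the .dot input written by scheduler_write_dependencies()
    # is a plain graphviz digraph; a minimal hand-written example (task names
    # made up for illustration) looks like:
    #
    #   digraph task_dep {
    #     label="Task dependencies for SWIFT <git revision>"
    #     compound=true;
    #     ratio=0.66;
    #     node[nodesep=0.15];
    #     sort->"self density";
    #     "self density"->ghost;
    #   }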
+else + dot -Tpng dependency_graph.dot -o task_graph.png + echo "Output written to task_graph.png" +fi + +exit diff --git a/examples/process_cells b/examples/process_cells new file mode 100755 index 0000000000000000000000000000000000000000..eead4572387f2732cf0b86265f14d20039f73ae1 --- /dev/null +++ b/examples/process_cells @@ -0,0 +1,64 @@ +#!/bin/bash +# +# Usage: +# process_cells nx ny nz nprocess +# +# Description: +# Process all the cell dumps in the current directory. + +# Outputs file per rank with the active cells identified and marked as to +# whether they are near an edge or not. Note requires the numbers of cells +# per dimension of the space. +# +# Also outputs a graphic showing the fraction of active cells on edges +# for each step. + +# Handle command-line +if test "$4" = ""; then + echo "Usage: $0 nx ny nz nprocess" + exit 1 +fi +NX=$1 +NY=$2 +NZ=$3 +NPROCS=$4 + +# Locate script. +SCRIPTHOME=$(dirname "$0") + +# Find all files. Use version sort to get into correct order. +files=$(ls -v cells_*.dat) +if test $? != 0; then + echo "Failed to find any cell dump files" + exit 1 +fi + +# Construct list of names need the number of ranks. +ranks=$(ls -v cells_*.dat | sed 's,cells_\(.*\)_.*.dat,\1,' | sort | uniq | wc -l) +echo "Number of ranks = $ranks" + +# Now construct a list of files ordered by rank, not step. +files=$(ls cells_*.dat | sort -t "_" -k 3,3 -n | xargs -n 4) + +# Need number of steps. +nfiles=$(echo $files| wc -w) +echo "Number of files = $nfiles" +steps=$(( $nfiles / $ranks + 1 )) +echo "Number of steps = $steps" + +# And process them, +echo "Processing cell dumps files..." +#echo $files | xargs -P $NPROCS -n 4 /bin/bash -c "${SCRIPTHOME}/process_cells_helper $NX $NY $NZ \$0 \$1 \$2 \$3" + +# Create summary. +grep "top cells" step*-active-cells.dat | sort -h > active_cells.log + +# And plot of active cells to edge cells. +stilts plot2plane ifmt=ascii in=active_cells.log xmin=-0.1 xmax=1.1 ymin=0 ymax=$steps grid=1 \ + legend=false xpix=600 ypix=500 xlabel="Edge cells/Active cells" ylabel="Step" \ + layer1=mark x1="col9/1.0/col6" y1="index" size1=3 shading1=aux auxmap=rainbow \ + aux=col6 auxfunc=log auxlabel="Active cells" layer2=histogram x2="col9/1.0/col6" \ + color2=grey binsize2=0.01 phase2=0.5 barform2=semi_steps thick2=1 \ + out=active_cells.png + +exit diff --git a/examples/process_cells_helper b/examples/process_cells_helper new file mode 100755 index 0000000000000000000000000000000000000000..a96bdba0de56e04c28ed6d8675c705241ce241d9 --- /dev/null +++ b/examples/process_cells_helper @@ -0,0 +1,12 @@ +#!/bin/bash + +# Helper for process_cells. + +# Locate script. +SCRIPTHOME=$(dirname "$0") + +step=$(echo $4|sed 's,cells_\(.*\)_\(.*\).dat,\2,') +echo "${SCRIPTHOME}/analyse_dump_cells.py $* > step${step}-active-cells.dat" +${SCRIPTHOME}/analyse_dump_cells.py $* > step${step}-active-cells.dat + +exit diff --git a/src/debug.c b/src/debug.c index ac886fc8d75a03ac67b213e0639cd384f0980529..4c521397b12c555923f8cea8a4f0cdb4c4551b97 100644 --- a/src/debug.c +++ b/src/debug.c @@ -276,10 +276,13 @@ int checkCellhdxmax(const struct cell *c, int *depth) { * only. */ static void dumpCells_map(struct cell *c, void *data) { - uintptr_t *ldata = (uintptr_t *)data; + size_t *ldata = (size_t *)data; FILE *file = (FILE *)ldata[0]; struct engine *e = (struct engine *)ldata[1]; float ntasks = c->nr_tasks; + int active = (int)ldata[2]; + int mpiactive = (int)ldata[3]; + int pactive = (int)ldata[4]; #if SWIFT_DEBUG_CHECKS /* The c->nr_tasks field does not include all the tasks. 
So let's check this @@ -300,46 +303,94 @@ static void dumpCells_map(struct cell *c, void *data) { } #endif - /* Only locally active cells are dumped. */ - if (c->count > 0 || c->gcount > 0 || c->scount > 0) - fprintf(file, - " %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %6d %6d %6d %6d " - "%6.1f %20lld %6d %6d %6d %6d\n", - c->loc[0], c->loc[1], c->loc[2], c->width[0], c->width[1], - c->width[2], c->count, c->gcount, c->scount, c->depth, ntasks, - c->ti_end_min, get_time_bin(c->ti_end_min), (c->super == c), - cell_is_active(c, e), c->nodeID); + /* Only cells with particles are dumped. */ + if (c->count > 0 || c->gcount > 0 || c->scount > 0) { + +/* In MPI mode we may only output cells with foreign partners. + * These define the edges of the partitions. */ +#if WITH_MPI + if (mpiactive) + mpiactive = (c->send_xv != NULL); + else + mpiactive = 1; +#else + mpiactive = 1; +#endif + + /* Active cells, otherwise all. */ + if (active) + active = cell_is_active(c, e); + else + active = 1; + + /* So output local super cells that are active and have MPI tasks as + * requested. */ + if (c->nodeID == e->nodeID && (c->super == c) && active && mpiactive) { + + /* If requested we work out how many particles are active in this cell. */ + int pactcount = 0; + if (pactive) { + const struct part *parts = c->parts; + for (int k = 0; k < c->count; k++) + if (part_is_active(&parts[k], e)) pactcount++; + struct gpart *gparts = c->gparts; + for (int k = 0; k < c->gcount; k++) + if (gpart_is_active(&gparts[k], e)) pactcount++; + struct spart *sparts = c->sparts; + for (int k = 0; k < c->scount; k++) + if (spart_is_active(&sparts[k], e)) pactcount++; + } + + fprintf(file, + " %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %6d %6d %6d %6d %6d %6d " + "%6.1f %20lld %6d %6d %6d %6d %6d\n", + c->loc[0], c->loc[1], c->loc[2], c->width[0], c->width[1], + c->width[2], e->step, c->count, c->gcount, c->scount, pactcount, + c->depth, ntasks, c->ti_end_min, get_time_bin(c->ti_end_min), + (c->super == c), cell_is_active(c, e), c->nodeID, + c->nodeID == e->nodeID); + } + } } /** * @brief Dump the location, depth, task counts and timebins and active state, - * for all cells to a simple text file. + * for all cells to a simple text file. A more costly count of the active + * particles in a cell can also be output. * - * @param prefix base output filename + * @param prefix base output filename, result is written to + * %prefix%_%rank%_%step%.dat + * @param active just output active cells. + * @param mpiactive just output MPI active cells, i.e. those with foreign cells. + * @param pactive also output a count of active particles. * @param s the space holding the cells to dump. + * @param rank node ID of MPI rank, or 0 if not relevant. + * @param step the current engine step, or some unique integer. */ -void dumpCells(const char *prefix, struct space *s) { +void dumpCells(const char *prefix, int active, int mpiactive, int pactive, + struct space *s, int rank, int step) { FILE *file = NULL; /* Name of output file. */ - static int nseq = 0; char fname[200]; - int uniq = atomic_inc(&nseq); - sprintf(fname, "%s_%03d.dat", prefix, uniq); - + sprintf(fname, "%s_%03d_%03d.dat", prefix, rank, step); file = fopen(fname, "w"); /* Header. 
*/ fprintf(file, - "# %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s " - "%20s %6s %6s %6s\n", - "x", "y", "z", "xw", "yw", "zw", "count", "gcount", "scount", "depth", - "tasks", "ti_end_min", "timebin", "issuper", "active", "rank"); + "# %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s " + "%20s %6s %6s %6s %6s %6s\n", + "x", "y", "z", "xw", "yw", "zw", "step", "count", "gcount", "scount", + "actcount", "depth", "tasks", "ti_end_min", "timebin", "issuper", + "active", "rank", "local"); - uintptr_t data[2]; + size_t data[5]; data[0] = (size_t)file; data[1] = (size_t)s->e; + data[2] = (size_t)active; + data[3] = (size_t)mpiactive; + data[4] = (size_t)pactive; space_map_cells_pre(s, 1, dumpCells_map, &data); fclose(file); } diff --git a/src/debug.h b/src/debug.h index 20ae8fc2c20fa458b7be82f75b2fbb3be9acd32f..2cff37fa6f6f03185dc769bb770fe3a98a424ce1 100644 --- a/src/debug.h +++ b/src/debug.h @@ -36,7 +36,8 @@ void printgParticle_single(struct gpart *gp); int checkSpacehmax(struct space *s); int checkCellhdxmax(const struct cell *c, int *depth); -void dumpCells(const char *prefix, struct space *s); +void dumpCells(const char *prefix, int active, int mpiactive, int pactive, + struct space *s, int rank, int step); #ifdef HAVE_METIS #include "metis.h" diff --git a/src/engine.c b/src/engine.c index 128cb15dab09ffd55a098a3f3ce01cf845913498..84b4237df26771b5f17399405d43a5ffa18863dd 100644 --- a/src/engine.c +++ b/src/engine.c @@ -4008,6 +4008,8 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs, gravity_exact_force_compute(e->s, e); #endif + if (e->nodeID == 0) scheduler_write_dependencies(&e->sched, e->verbose); + /* Run the 0th time-step */ engine_launch(e); @@ -4162,6 +4164,9 @@ void engine_step(struct engine *e) { /* Print the number of active tasks ? */ if (e->verbose) engine_print_task_counts(e); +/* Dump local cells and active particle counts. */ +/* dumpCells("cells", 0, 0, 1, e->s, e->nodeID, e->step); */ + #ifdef SWIFT_DEBUG_CHECKS /* Check that we have the correct total mass in the top-level multipoles */ long long num_gpart_mpole = 0; @@ -5113,6 +5118,12 @@ void engine_init(struct engine *e, struct space *s, scheduler_init(&e->sched, e->s, engine_estimate_nr_tasks(e), nr_queues, (policy & scheduler_flag_steal), e->nodeID, &e->threadpool); + /* Maximum size of MPI task messages, in KB, that should not be buffered, + * that is sent using MPI_Issend, not MPI_Isend. 4Mb by default. + */ + e->sched.mpi_message_limit = + parser_get_opt_param_int(params, "Scheduler:mpi_message_limit", 4) * 1024; + /* Allocate and init the threads. */ if ((e->runners = (struct runner *)malloc(sizeof(struct runner) * e->nr_threads)) == NULL) diff --git a/src/equation_of_state.h b/src/equation_of_state.h index eadc73c919e3593b2b8433d4215ba562ceb7a659..76b8bb922133c9c4f29453a14068fe3f9044d66f 100644 --- a/src/equation_of_state.h +++ b/src/equation_of_state.h @@ -185,7 +185,8 @@ struct eos_parameters { /** * @brief Returns the internal energy given density and entropy * - * Since we are using an isothermal EoS, the entropy value is ignored + * Since we are using an isothermal EoS, the entropy and density values are + * ignored. * Computes \f$u = u_{cst}\f$. * * @param density The density \f$\rho\f$. 
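As a cross-check of the isothermal relations documented in the surrounding hunks, here is a minimal standalone sketch (illustrative only, not part of the patch; u_cst and gamma stand in for eos.isothermal_internal_energy and the adiabatic index):

#include <math.h>

/* u = u_cst; P = (gamma - 1) u_cst rho; A = (gamma - 1) u_cst rho^(1 - gamma);
 * c = sqrt(u_cst gamma (gamma - 1)). */
static double iso_pressure(double rho, double u_cst, double gamma) {
  return (gamma - 1.) * u_cst * rho;
}
static double iso_entropy(double rho, double u_cst, double gamma) {
  return (gamma - 1.) * u_cst * pow(rho, -(gamma - 1.));
}
static double iso_soundspeed(double u_cst, double gamma) {
  return sqrt(u_cst * gamma * (gamma - 1.));
}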
@@ -196,10 +197,11 @@ gas_internal_energy_from_entropy(float density, float entropy) { return eos.isothermal_internal_energy; } + /** * @brief Returns the pressure given density and entropy * - * Since we are using an isothermal EoS, the entropy value is ignored + * Since we are using an isothermal EoS, the entropy value is ignored. * Computes \f$P = (\gamma - 1)u_{cst}\rho\f$. * * @param density The density \f$\rho\f$. @@ -214,7 +216,8 @@ __attribute__((always_inline)) INLINE static float gas_pressure_from_entropy( /** * @brief Returns the entropy given density and pressure. * - * Computes \f$A = \frac{P}{\rho^\gamma}\f$. + * Since we are using an isothermal EoS, the pressure value is ignored. + * Computes \f$A = (\gamma - 1)u_{cst}\rho^{-(\gamma-1)}\f$. * * @param density The density \f$\rho\f$. * @param pressure The pressure \f$P\f$ (ignored). @@ -230,8 +233,9 @@ __attribute__((always_inline)) INLINE static float gas_entropy_from_pressure( /** * @brief Returns the sound speed given density and entropy * - * Since we are using an isothermal EoS, the entropy value is ignored - * Computes \f$c = \sqrt{u_{cst} \gamma \rho^{\gamma-1}}\f$. + * Since we are using an isothermal EoS, the entropy and density values are + * ignored. + * Computes \f$c = \sqrt{u_{cst} \gamma (\gamma-1)}\f$. * * @param density The density \f$\rho\f$. * @param entropy The entropy \f$S\f$. @@ -246,7 +250,7 @@ __attribute__((always_inline)) INLINE static float gas_soundspeed_from_entropy( /** * @brief Returns the entropy given density and internal energy * - * Since we are using an isothermal EoS, the energy value is ignored + * Since we are using an isothermal EoS, the energy value is ignored. * Computes \f$S = \frac{(\gamma - 1)u_{cst}}{\rho^{\gamma-1}}\f$. * * @param density The density \f$\rho\f$ @@ -262,7 +266,7 @@ gas_entropy_from_internal_energy(float density, float u) { /** * @brief Returns the pressure given density and internal energy * - * Since we are using an isothermal EoS, the energy value is ignored + * Since we are using an isothermal EoS, the energy value is ignored. * Computes \f$P = (\gamma - 1)u_{cst}\rho\f$. * * @param density The density \f$\rho\f$ @@ -291,8 +295,9 @@ gas_internal_energy_from_pressure(float density, float pressure) { /** * @brief Returns the sound speed given density and internal energy * - * Since we are using an isothermal EoS, the energy value is ignored - * Computes \f$c = \sqrt{u_{cst} \gamma \rho^{\gamma-1}}\f$. + * Since we are using an isothermal EoS, the energy and density values are + * ignored. + * Computes \f$c = \sqrt{u_{cst} \gamma (\gamma-1)}\f$. * * @param density The density \f$\rho\f$ * @param u The internal energy \f$u\f$ @@ -307,8 +312,9 @@ gas_soundspeed_from_internal_energy(float density, float u) { /** * @brief Returns the sound speed given density and pressure * - * Since we are using an isothermal EoS, the pressure value is ignored - * Computes \f$c = \sqrt{u_{cst} \gamma \rho^{\gamma-1}}\f$. + * Since we are using an isothermal EoS, the pressure and density values are + * ignored. + * Computes \f$c = \sqrt{u_{cst} \gamma (\gamma-1)}\f$. * * @param density The density \f$\rho\f$ * @param P The pressure \f$P\f$ diff --git a/src/parallel_io.c b/src/parallel_io.c index a542a5ff6f7f18d4e616abf355120844652075f2..b31d3b6351b00af4a042e4b28bf492258b17b073 100644 --- a/src/parallel_io.c +++ b/src/parallel_io.c @@ -146,7 +146,7 @@ void readArray_chunk(hid_t h_data, hid_t h_plist_id, * @brief Reads a data array from a given HDF5 group. 
* * @param grp The group from which to read. - * @param prop The #io_props of the field to read. + * @param props The #io_props of the field to read. * @param N The number of particles on that rank. * @param N_total The total number of particles. * @param mpi_rank The MPI rank of this node. diff --git a/src/partition.c b/src/partition.c index d9aade781e824defedb05f5a201fec8401da7ee2..567fcb38c030a0dec1f052329111b0cc6d82d8f7 100644 --- a/src/partition.c +++ b/src/partition.c @@ -63,10 +63,15 @@ const char *initial_partition_name[] = { /* Simple descriptions of repartition types for reports. */ const char *repartition_name[] = { - "no", "METIS edge and vertex time weighted cells", - "METIS particle count vertex weighted cells", - "METIS time edge weighted cells", - "METIS particle count vertex and time edge cells"}; + "no", + "METIS edge and vertex task cost weights", + "METIS particle count vertex weights", + "METIS task cost edge weights", + "METIS particle count vertex and task cost edge weights", + "METIS vertex task costs and edge delta timebin weights", + "METIS particle count vertex and edge delta timebin weights", + "METIS edge delta timebin weights", +}; /* Local functions, if needed. */ static int check_complete(struct space *s, int verbose, int nregions); @@ -236,14 +241,14 @@ static void graph_init_metis(struct space *s, idx_t *adjncy, idx_t *xadj) { * @param counts the number of particles per cell. Should be * allocated as size s->nr_parts. */ -static void accumulate_counts(struct space *s, int *counts) { +static void accumulate_counts(struct space *s, double *counts) { struct part *parts = s->parts; int *cdim = s->cdim; double iwidth[3] = {s->iwidth[0], s->iwidth[1], s->iwidth[2]}; double dim[3] = {s->dim[0], s->dim[1], s->dim[2]}; - bzero(counts, sizeof(int) * s->nr_cells); + bzero(counts, sizeof(double) * s->nr_cells); for (size_t k = 0; k < s->nr_parts; k++) { for (int j = 0; j < 3; j++) { @@ -274,6 +279,9 @@ static void accumulate_counts(struct space *s, int *counts) { static void split_metis(struct space *s, int nregions, int *celllist) { for (int i = 0; i < s->nr_cells; i++) s->cells_top[i].nodeID = celllist[i]; + + /* To check or visualise the partition dump all the cells. */ + /* dumpCellRanks("metis_partition", s->cells_top, s->nr_cells); */ } #endif @@ -300,15 +308,16 @@ static int indexvalcmp(const void *p1, const void *p2) { * @param s the space of cells to partition. * @param nregions the number of regions required in the partition. * @param vertexw weights for the cells, sizeof number of cells if used, - * NULL for unit weights. + * NULL for unit weights. Need to be in the range of idx_t. * @param edgew weights for the graph edges between all cells, sizeof number * of cells * 26 if used, NULL for unit weights. Need to be packed - * in CSR format, so same as adjncy array. + * in CSR format, so same as adjncy array. Need to be in the range of + * idx_t. * @param celllist on exit this contains the ids of the selected regions, * sizeof number of cells. */ -static void pick_metis(struct space *s, int nregions, int *vertexw, int *edgew, - int *celllist) { +static void pick_metis(struct space *s, int nregions, double *vertexw, + double *edgew, int *celllist) { /* Total number of cells. 
*/ int ncells = s->cdim[0] * s->cdim[1] * s->cdim[2]; @@ -380,8 +389,8 @@ static void pick_metis(struct space *s, int nregions, int *vertexw, int *edgew, idx_t objval; /* Dump graph in METIS format */ - /* dumpMETISGraph("metis_graph", idx_ncells, one, xadj, adjncy, - * weights_v, NULL, weights_e); + /*dumpMETISGraph("metis_graph", idx_ncells, one, xadj, adjncy, + * weights_v, NULL, weights_e); */ if (METIS_PartGraphKway(&idx_ncells, &one, xadj, adjncy, weights_v, NULL, weights_e, &idx_nregions, NULL, NULL, options, @@ -465,31 +474,28 @@ static void pick_metis(struct space *s, int nregions, int *vertexw, int *edgew, #if defined(WITH_MPI) && defined(HAVE_METIS) /** - * @brief Repartition the cells amongst the nodes using task timings - * as edge weights and vertex weights also from task timings + * @brief Repartition the cells amongst the nodes using task costs + * as edge weights and vertex weights also from task costs * or particle cells counts. * * @param partweights whether particle counts will be used as vertex weights. * @param bothweights whether vertex and edge weights will be used, otherwise * only edge weights will be used. + * @param timebins use timebins as edge weights. * @param nodeID our nodeID. * @param nr_nodes the number of nodes. * @param s the space of cells holding our local particles. * @param tasks the completed tasks from the last engine step for our node. * @param nr_tasks the number of tasks. */ -static void repart_edge_metis(int partweights, int bothweights, int nodeID, - int nr_nodes, struct space *s, struct task *tasks, - int nr_tasks) { +static void repart_edge_metis(int partweights, int bothweights, int timebins, + int nodeID, int nr_nodes, struct space *s, + struct task *tasks, int nr_tasks) { /* Create weight arrays using task ticks for vertices and edges (edges * assume the same graph structure as used in the part_ calls). */ int nr_cells = s->nr_cells; struct cell *cells = s->cells_top; - float wscale = 1.f, wscale_buff = 0.0; - int wtot = 0; - int wmax = 1e9 / nr_nodes; - int wmin; /* Allocate and fill the adjncy indexing array defining the graph of * cells. */ @@ -499,16 +505,16 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID, graph_init_metis(s, inds, NULL); /* Allocate and init weights. */ - int *weights_v = NULL; - int *weights_e = NULL; + double *weights_v = NULL; + double *weights_e = NULL; if (bothweights) { - if ((weights_v = (int *)malloc(sizeof(int) * nr_cells)) == NULL) + if ((weights_v = (double *)malloc(sizeof(double) * nr_cells)) == NULL) error("Failed to allocate vertex weights arrays."); - bzero(weights_v, sizeof(int) * nr_cells); + bzero(weights_v, sizeof(double) * nr_cells); } - if ((weights_e = (int *)malloc(sizeof(int) * 26 * nr_cells)) == NULL) + if ((weights_e = (double *)malloc(sizeof(double) * 26 * nr_cells)) == NULL) error("Failed to allocate edge weights arrays."); - bzero(weights_e, sizeof(int) * 26 * nr_cells); + bzero(weights_e, sizeof(double) * 26 * nr_cells); /* Generate task weights for vertices. */ int taskvweights = (bothweights && !partweights); @@ -521,19 +527,8 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID, /* Skip un-interesting tasks. */ if (t->cost == 0) continue; - /* Get the task weight. */ - int w = t->cost * wscale; - - /* Do we need to re-scale? 
*/ - wtot += w; - while (wtot > wmax) { - wscale /= 2; - wtot /= 2; - w /= 2; - for (int k = 0; k < 26 * nr_cells; k++) weights_e[k] *= 0.5; - if (taskvweights) - for (int k = 0; k < nr_cells; k++) weights_v[k] *= 0.5; - } + /* Get the task weight based on costs. */ + double w = (double)t->cost; /* Get the top-level cells involved. */ struct cell *ci, *cj; @@ -552,9 +547,9 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID, if (t->type == task_type_ghost || t->type == task_type_kick1 || t->type == task_type_kick2 || t->type == task_type_timestep || t->type == task_type_drift_part || t->type == task_type_drift_gpart) { + /* Particle updates add only to vertex weight. */ if (taskvweights) weights_v[cid] += w; - } /* Self interaction? */ @@ -575,64 +570,72 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID, } - /* Distinct cells with local ci? */ - else if (ci->nodeID == nodeID) { + /* Distinct cells. */ + else { /* Index of the jth cell. */ int cjd = cj - cells; - /* Add half of weight to each cell. */ - if (taskvweights) { - if (ci->nodeID == nodeID) weights_v[cid] += 0.5 * w; + /* Local cells add weight to vertices. */ + if (taskvweights && ci->nodeID == nodeID) { + weights_v[cid] += 0.5 * w; if (cj->nodeID == nodeID) weights_v[cjd] += 0.5 * w; } - /* Add weights to edge. */ - int kk; - for (kk = 26 * cid; inds[kk] != cjd; kk++) - ; - weights_e[kk] += w; - for (kk = 26 * cjd; inds[kk] != cid; kk++) - ; - weights_e[kk] += w; + if (timebins) { + /* Add weights to edge for all cells based on the expected + * interaction time (calculated as the time to the last expected + * time) as we want to avoid having active cells on the edges, so we + * cut for that. Note that weight is added to the local and remote + * cells, as we want to keep both away from any cuts, this can + * overflow int, so take care. */ + int dti = num_time_bins - get_time_bin(ci->ti_end_min); + int dtj = num_time_bins - get_time_bin(cj->ti_end_min); + double dt = (double)(1 << dti) + (double)(1 << dtj); + + /* ci */ + int kk; + for (kk = 26 * cid; inds[kk] != cjd; kk++) + ; + weights_e[kk] += dt; + + /* cj */ + for (kk = 26 * cjd; inds[kk] != cid; kk++) + ; + weights_e[kk] += dt; + } else { + + /* Add weights from task costs to the edge. */ + + /* ci */ + int kk; + for (kk = 26 * cid; inds[kk] != cjd; kk++) + ; + weights_e[kk] += w; + + /* cj */ + for (kk = 26 * cjd; inds[kk] != cid; kk++) + ; + weights_e[kk] += w; + } } } } /* Re-calculate the vertices if using particle counts. */ - if (partweights && bothweights) { - accumulate_counts(s, weights_v); - - /* Rescale to balance times. */ - float vwscale = (float)wtot / (float)nr_tasks; - for (int k = 0; k < nr_cells; k++) { - weights_v[k] *= vwscale; - } - } - - /* Get the minimum scaling and re-scale if necessary. */ - int res; - if ((res = MPI_Allreduce(&wscale, &wscale_buff, 1, MPI_FLOAT, MPI_MIN, - MPI_COMM_WORLD)) != MPI_SUCCESS) - mpi_error(res, "Failed to allreduce the weight scales."); - - if (wscale_buff != wscale) { - float scale = wscale_buff / wscale; - for (int k = 0; k < 26 * nr_cells; k++) weights_e[k] *= scale; - if (bothweights) - for (int k = 0; k < nr_cells; k++) weights_v[k] *= scale; - } + if (partweights && bothweights) accumulate_counts(s, weights_v); /* Merge the weights arrays across all nodes. */ + int res; if (bothweights) { if ((res = MPI_Reduce((nodeID == 0) ? 
MPI_IN_PLACE : weights_v, weights_v, - nr_cells, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD)) != + nr_cells, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD)) != MPI_SUCCESS) mpi_error(res, "Failed to allreduce vertex weights."); } if ((res = MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_e, weights_e, - 26 * nr_cells, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD)) != - MPI_SUCCESS) + 26 * nr_cells, MPI_DOUBLE, MPI_SUM, 0, + MPI_COMM_WORLD)) != MPI_SUCCESS) mpi_error(res, "Failed to allreduce edge weights."); /* Allocate cell list for the partition. */ @@ -641,39 +644,53 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID, /* As of here, only one node needs to compute the partition. */ if (nodeID == 0) { - /* Final rescale of all weights to avoid a large range. Large ranges - * have been seen to cause an incomplete graph. */ - wmin = wmax; - wmax = 0; - for (int k = 0; k < 26 * nr_cells; k++) { - wmax = weights_e[k] > wmax ? weights_e[k] : wmax; - wmin = weights_e[k] < wmin ? weights_e[k] : wmin; - } + + /* We need to rescale the weights into the range of an integer for METIS + * (really range of idx_t). Also we would like the range of vertex and + * edges weights to be similar so they balance. */ + double wminv = 0.0; + double wmaxv = 0.0; if (bothweights) { + wminv = weights_v[0]; + wmaxv = weights_v[0]; for (int k = 0; k < nr_cells; k++) { - wmax = weights_v[k] > wmax ? weights_v[k] : wmax; - wmin = weights_v[k] < wmin ? weights_v[k] : wmin; + wmaxv = weights_v[k] > wmaxv ? weights_v[k] : wmaxv; + wminv = weights_v[k] < wminv ? weights_v[k] : wminv; } } - if ((wmax - wmin) > metis_maxweight) { - wscale = metis_maxweight / (wmax - wmin); - for (int k = 0; k < 26 * nr_cells; k++) { - weights_e[k] = (weights_e[k] - wmin) * wscale + 1; + double wmine = weights_e[0]; + double wmaxe = weights_e[0]; + for (int k = 0; 26 * k < nr_cells; k++) { + wmaxe = weights_e[k] > wmaxe ? weights_e[k] : wmaxe; + wmine = weights_e[k] < wmine ? weights_e[k] : wmine; + } + + double wscalev = 1.0; + double wscalee = 1.0; + if (bothweights) { + + /* Make maximum value same in both weights systems. */ + if (wmaxv > wmaxe) { + wscalee = wmaxv / wmaxe; + wmaxe = wmaxv; + } else { + wscalev = wmaxe / wmaxv; + wmaxv = wmaxe; } - if (bothweights) { - for (int k = 0; k < nr_cells; k++) { - weights_v[k] = (weights_v[k] - wmin) * wscale + 1; - } + + /* Scale to the METIS range. */ + wscalev *= metis_maxweight / (wmaxv - wminv); + for (int k = 0; k < nr_cells; k++) { + weights_v[k] = (weights_v[k] - wminv) * wscalev + 1.0; } } - /* Make sure there are no zero weights. */ - for (int k = 0; k < 26 * nr_cells; k++) - if (weights_e[k] == 0) weights_e[k] = 1; - if (bothweights) - for (int k = 0; k < nr_cells; k++) - if (weights_v[k] == 0) weights_v[k] = 1; + /* Scale to the METIS range. */ + wscalee *= metis_maxweight / (wmaxe - wmine); + for (int k = 0; k < 26 * nr_cells; k++) { + weights_e[k] = (weights_e[k] - wmine) * wscalee + 1.0; + } /* And partition, use both weights or not as requested. */ if (bothweights) @@ -736,8 +753,8 @@ static void repart_vertex_metis(struct space *s, int nodeID, int nr_nodes) { /* Use particle counts as vertex weights. */ /* Space for particles per cell counts, which will be used as weights. */ - int *weights = NULL; - if ((weights = (int *)malloc(sizeof(int) * s->nr_cells)) == NULL) + double *weights = NULL; + if ((weights = (double *)malloc(sizeof(double) * s->nr_cells)) == NULL) error("Failed to allocate weights buffer."); /* Check each particle and accumulate the counts per cell. 
*/ @@ -745,8 +762,8 @@ static void repart_vertex_metis(struct space *s, int nodeID, int nr_nodes) { /* Get all the counts from all the nodes. */ int res; - if ((res = MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_INT, MPI_SUM, - MPI_COMM_WORLD)) != MPI_SUCCESS) + if ((res = MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_DOUBLE, + MPI_SUM, MPI_COMM_WORLD)) != MPI_SUCCESS) mpi_error(res, "Failed to allreduce particle cell weights."); /* Main node does the partition calculation. */ @@ -787,35 +804,32 @@ void partition_repartition(struct repartition *reparttype, int nodeID, #if defined(WITH_MPI) && defined(HAVE_METIS) - if (reparttype->type == REPART_METIS_BOTH || - reparttype->type == REPART_METIS_EDGE || - reparttype->type == REPART_METIS_VERTEX_EDGE) { + if (reparttype->type == REPART_METIS_VERTEX_COSTS_EDGE_COSTS) { + repart_edge_metis(0, 1, 0, nodeID, nr_nodes, s, tasks, nr_tasks); - int partweights; - int bothweights; - if (reparttype->type == REPART_METIS_VERTEX_EDGE) - partweights = 1; - else - partweights = 0; + } else if (reparttype->type == REPART_METIS_EDGE_COSTS) { + repart_edge_metis(0, 0, 0, nodeID, nr_nodes, s, tasks, nr_tasks); - if (reparttype->type == REPART_METIS_BOTH) - bothweights = 1; - else - bothweights = 0; + } else if (reparttype->type == REPART_METIS_VERTEX_COUNTS_EDGE_COSTS) { + repart_edge_metis(1, 1, 0, nodeID, nr_nodes, s, tasks, nr_tasks); + + } else if (reparttype->type == REPART_METIS_VERTEX_COSTS_EDGE_TIMEBINS) { + repart_edge_metis(0, 1, 1, nodeID, nr_nodes, s, tasks, nr_tasks); - repart_edge_metis(partweights, bothweights, nodeID, nr_nodes, s, tasks, - nr_tasks); + } else if (reparttype->type == REPART_METIS_VERTEX_COUNTS_EDGE_TIMEBINS) { + repart_edge_metis(1, 1, 1, nodeID, nr_nodes, s, tasks, nr_tasks); - } else if (reparttype->type == REPART_METIS_VERTEX) { + } else if (reparttype->type == REPART_METIS_EDGE_TIMEBINS) { + repart_edge_metis(0, 0, 1, nodeID, nr_nodes, s, tasks, nr_tasks); + } else if (reparttype->type == REPART_METIS_VERTEX_COUNTS) { repart_vertex_metis(s, nodeID, nr_nodes); } else if (reparttype->type == REPART_NONE) { - /* Doing nothing. */ } else { - error("Unknown repartition type"); + error("Impossible repartition type"); } #else error("SWIFT was not compiled with METIS support."); @@ -884,17 +898,17 @@ void partition_initial_partition(struct partition *initial_partition, /* Space for particles per cell counts, which will be used as weights or * not. */ - int *weights = NULL; + double *weights = NULL; if (initial_partition->type == INITPART_METIS_WEIGHT) { - if ((weights = (int *)malloc(sizeof(int) * s->nr_cells)) == NULL) + if ((weights = (double *)malloc(sizeof(double) * s->nr_cells)) == NULL) error("Failed to allocate weights buffer."); - bzero(weights, sizeof(int) * s->nr_cells); + bzero(weights, sizeof(double) * s->nr_cells); /* Check each particle and accumilate the counts per cell. */ accumulate_counts(s, weights); /* Get all the counts from all the nodes. 
*/ - if (MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_INT, MPI_SUM, + if (MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS) error("Failed to allreduce particle cell weights."); } @@ -969,10 +983,10 @@ void partition_init(struct partition *partition, /* Defaults make use of METIS if available */ #ifdef HAVE_METIS - const char *default_repart = "task_weights"; + const char *default_repart = "costs/costs"; const char *default_part = "simple_metis"; #else - const char *default_repart = "none"; + const char *default_repart = "none/none"; const char *default_part = "grid"; #endif @@ -1029,32 +1043,40 @@ void partition_init(struct partition *partition, parser_get_opt_param_string(params, "DomainDecomposition:repartition_type", part_type, default_repart); - switch (part_type[0]) { - case 'n': - repartition->type = REPART_NONE; - break; + if (strcmp("none/none", part_type) == 0) { + repartition->type = REPART_NONE; + #ifdef HAVE_METIS - case 't': - repartition->type = REPART_METIS_BOTH; - break; - case 'e': - repartition->type = REPART_METIS_EDGE; - break; - case 'p': - repartition->type = REPART_METIS_VERTEX; - break; - case 'h': - repartition->type = REPART_METIS_VERTEX_EDGE; - break; - default: - message("Invalid choice of re-partition type '%s'.", part_type); - error( - "Permitted values are: 'none', 'task_weights', 'particle_weights'," - "'edge_task_weights' or 'hybrid_weights'"); + } else if (strcmp("costs/costs", part_type) == 0) { + repartition->type = REPART_METIS_VERTEX_COSTS_EDGE_COSTS; + + } else if (strcmp("counts/none", part_type) == 0) { + repartition->type = REPART_METIS_VERTEX_COUNTS; + + } else if (strcmp("none/costs", part_type) == 0) { + repartition->type = REPART_METIS_EDGE_COSTS; + + } else if (strcmp("counts/costs", part_type) == 0) { + repartition->type = REPART_METIS_VERTEX_COUNTS_EDGE_COSTS; + + } else if (strcmp("costs/time", part_type) == 0) { + repartition->type = REPART_METIS_VERTEX_COSTS_EDGE_TIMEBINS; + + } else if (strcmp("counts/time", part_type) == 0) { + repartition->type = REPART_METIS_VERTEX_COUNTS_EDGE_TIMEBINS; + + } else if (strcmp("none/time", part_type) == 0) { + repartition->type = REPART_METIS_EDGE_TIMEBINS; + } else { + message("Invalid choice of re-partition type '%s'.", part_type); + error( + "Permitted values are: 'none/none', 'costs/costs'," + "'counts/none', 'none/costs', 'counts/costs', " + "'costs/time', 'counts/time' or 'none/time'"); #else - default: - message("Invalid choice of re-partition type '%s'.", part_type); - error("Permitted values are: 'none' when compiled without METIS."); + } else { + message("Invalid choice of re-partition type '%s'.", part_type); + error("Permitted values are: 'none/none' when compiled without METIS."); #endif } diff --git a/src/partition.h b/src/partition.h index 03523b165a930b085224e458ac0dd8c8232a578d..c3eade190c9514efb4c44011e3990745e20846fd 100644 --- a/src/partition.h +++ b/src/partition.h @@ -43,10 +43,13 @@ struct partition { /* Repartition type to use. */ enum repartition_type { REPART_NONE = 0, - REPART_METIS_BOTH, - REPART_METIS_VERTEX, - REPART_METIS_EDGE, - REPART_METIS_VERTEX_EDGE + REPART_METIS_VERTEX_COSTS_EDGE_COSTS, + REPART_METIS_VERTEX_COUNTS, + REPART_METIS_EDGE_COSTS, + REPART_METIS_VERTEX_COUNTS_EDGE_COSTS, + REPART_METIS_VERTEX_COSTS_EDGE_TIMEBINS, + REPART_METIS_VERTEX_COUNTS_EDGE_TIMEBINS, + REPART_METIS_EDGE_TIMEBINS }; /* Repartition preferences. 
*/ diff --git a/src/scheduler.c b/src/scheduler.c index dd6cfed6480b4f42b025c62c8574e72cefeb196b..263466c8841ebb3d04053bca55fb9200236750f5 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -50,6 +50,7 @@ #include "space.h" #include "task.h" #include "timers.h" +#include "version.h" /** * @brief Re-set the list of active tasks. @@ -111,6 +112,125 @@ void scheduler_addunlock(struct scheduler *s, struct task *ta, atomic_inc(&s->completed_unlock_writes); } +/** + * @brief Write a dot file with the task dependencies. + * + * Run plot_task_dependencies.sh for an example of how to use it + * to generate the figure. + * + * @param s The #scheduler we are working in. + * @param verbose Are we verbose about this? + */ +void scheduler_write_dependencies(struct scheduler *s, int verbose) { + + const ticks tic = getticks(); + + /* Conservative number of dependencies per task type */ + const int max_nber_dep = 128; + + /* Number of possible relations between tasks */ + const int nber_relation = + 2 * task_type_count * task_subtype_count * max_nber_dep; + + /* To get the table of max_nber_dep for a task: + * ind = (ta * task_subtype_count + sa) * max_nber_dep * 2 + * where ta is the value of task_type and sa is the value of + * task_subtype */ + int *table = malloc(nber_relation * sizeof(int)); + if (table == NULL) + error("Error allocating memory for task-dependency graph."); + + /* Reset everything */ + for (int i = 0; i < nber_relation; i++) table[i] = -1; + + /* Create file */ + char filename[200] = "dependency_graph.dot"; + FILE *f = fopen(filename, "w"); + if (f == NULL) error("Error opening dependency graph file."); + + /* Write header */ + fprintf(f, "digraph task_dep {\n"); + fprintf(f, "label=\"Task dependencies for SWIFT %s\"", git_revision()); + fprintf(f, "\t compound=true;\n"); + fprintf(f, "\t ratio=0.66;\n"); + fprintf(f, "\t node[nodesep=0.15];\n"); + + /* loop over all tasks */ + for (int i = 0; i < s->nr_tasks; i++) { + const struct task *ta = &s->tasks[i]; + + /* and their dependencies */ + for (int j = 0; j < ta->nr_unlock_tasks; j++) { + const struct task *tb = ta->unlock_tasks[j]; + + /* check if dependency already written */ + int written = 0; + + /* Current index */ + int ind = ta->type * task_subtype_count + ta->subtype; + ind *= 2 * max_nber_dep; + + int k = 0; + int *cur = &table[ind]; + while (k < max_nber_dep) { + + /* not written yet */ + if (cur[0] == -1) { + cur[0] = tb->type; + cur[1] = tb->subtype; + break; + } + + /* already written */ + if (cur[0] == tb->type && cur[1] == tb->subtype) { + written = 1; + break; + } + + k += 1; + cur = &cur[3]; + } + + /* max_nber_dep is too small */ + if (k == max_nber_dep) + error("Not enough memory, please increase max_nber_dep"); + + /* Not written yet => write it */ + if (!written) { + + /* text to write */ + char ta_name[200]; + char tb_name[200]; + + /* construct line */ + if (ta->subtype == task_subtype_none) + sprintf(ta_name, "%s", taskID_names[ta->type]); + else + sprintf(ta_name, "\"%s %s\"", taskID_names[ta->type], + subtaskID_names[ta->subtype]); + + if (tb->subtype == task_subtype_none) + sprintf(tb_name, "%s", taskID_names[tb->type]); + else + sprintf(tb_name, "\"%s %s\"", taskID_names[tb->type], + subtaskID_names[tb->subtype]); + + /* Write to the ffile */ + fprintf(f, "\t %s->%s;\n", ta_name, tb_name); + } + } + } + + /* Be clean */ + fprintf(f, "}"); + fclose(f); + free(table); + + if (verbose) + message("Printing task graph took %.3f %s.", + clocks_from_ticks(getticks() - tic), clocks_getunit()); +} + /** * 
@brief Split a hydrodynamic task if too large. * @@ -1319,23 +1439,41 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { if (t->subtype == task_subtype_tend) { t->buff = malloc(sizeof(struct pcell_step) * t->ci->pcell_size); cell_pack_end_step(t->ci, t->buff); - err = MPI_Isend( - t->buff, t->ci->pcell_size * sizeof(struct pcell_step), MPI_BYTE, - t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + if ((t->ci->pcell_size * sizeof(struct pcell_step)) > + s->mpi_message_limit) + err = MPI_Isend( + t->buff, t->ci->pcell_size * sizeof(struct pcell_step), + MPI_BYTE, t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + else + err = MPI_Issend( + t->buff, t->ci->pcell_size * sizeof(struct pcell_step), + MPI_BYTE, t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); } else if (t->subtype == task_subtype_xv || t->subtype == task_subtype_rho || t->subtype == task_subtype_gradient) { - err = MPI_Isend(t->ci->parts, t->ci->count, part_mpi_type, - t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + if ((t->ci->count * sizeof(struct part)) > s->mpi_message_limit) + err = MPI_Isend(t->ci->parts, t->ci->count, part_mpi_type, + t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + else + err = MPI_Issend(t->ci->parts, t->ci->count, part_mpi_type, + t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); // message( "sending %i parts with tag=%i from %i to %i." , // t->ci->count , t->flags , s->nodeID , t->cj->nodeID ); // fflush(stdout); } else if (t->subtype == task_subtype_gpart) { - err = MPI_Isend(t->ci->gparts, t->ci->gcount, gpart_mpi_type, - t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + if ((t->ci->gcount * sizeof(struct gpart)) > s->mpi_message_limit) + err = MPI_Isend(t->ci->gparts, t->ci->gcount, gpart_mpi_type, + t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + else + err = MPI_Issend(t->ci->gparts, t->ci->gcount, gpart_mpi_type, + t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); } else if (t->subtype == task_subtype_spart) { - err = MPI_Isend(t->ci->sparts, t->ci->scount, spart_mpi_type, - t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + if ((t->ci->scount * sizeof(struct spart)) > s->mpi_message_limit) + err = MPI_Isend(t->ci->sparts, t->ci->scount, spart_mpi_type, + t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + else + err = MPI_Issend(t->ci->sparts, t->ci->scount, spart_mpi_type, + t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); } else if (t->subtype == task_subtype_multipole) { t->buff = malloc(sizeof(struct gravity_tensors) * t->ci->pcell_size); cell_pack_multipoles(t->ci, t->buff); diff --git a/src/scheduler.h b/src/scheduler.h index c5ccbf43f3048dfb47d2d3eb7a7db6634b646700..1a75544de12b8402e553e3ae2b84e2d8a65c56e8 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -103,6 +103,10 @@ struct scheduler { /* The node we are working on. */ int nodeID; + /* Maximum size of task messages, in bytes, to sent using non-buffered + * MPI. 
*/ + size_t mpi_message_limit; + /* 'Pointer' to the seed for the random number generator */ pthread_key_t local_seed_pointer; }; @@ -168,5 +172,6 @@ void scheduler_dump_queue(struct scheduler *s); void scheduler_print_tasks(const struct scheduler *s, const char *fileName); void scheduler_clean(struct scheduler *s); void scheduler_free_tasks(struct scheduler *s); +void scheduler_write_dependencies(struct scheduler *s, int verbose); #endif /* SWIFT_SCHEDULER_H */ diff --git a/src/space.c b/src/space.c index eccf7e910bf558425463f4174c02e6264263244a..6051320e26c9c463cb85598ac8c106abd769d32e 100644 --- a/src/space.c +++ b/src/space.c @@ -2708,7 +2708,7 @@ void space_init_gparts(struct space *s, int verbose) { if (s->nr_gparts > 0) threadpool_map(&s->e->threadpool, space_init_gparts_mapper, s->gparts, - s->nr_gparts, sizeof(struct part), 0, NULL); + s->nr_gparts, sizeof(struct gpart), 0, NULL); if (verbose) message("took %.3f %s.", clocks_from_ticks(getticks() - tic), clocks_getunit());
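For reference, a minimal sketch of the size-based switch between MPI_Isend and MPI_Issend added to scheduler_enqueue() above (the helper name and the use of MPI_BYTE are illustrative assumptions, not part of the patch):

#include <mpi.h>
#include <stddef.h>

/* Messages larger than the limit take the (potentially buffered) MPI_Isend
 * path; smaller ones use the synchronous MPI_Issend so they are not
 * buffered, mirroring the branches in scheduler_enqueue(). */
static int send_with_limit(void *buff, size_t size, int dest, int tag,
                           size_t mpi_message_limit, MPI_Request *req) {
  if (size > mpi_message_limit)
    return MPI_Isend(buff, (int)size, MPI_BYTE, dest, tag, MPI_COMM_WORLD, req);
  else
    return MPI_Issend(buff, (int)size, MPI_BYTE, dest, tag, MPI_COMM_WORLD, req);
}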