diff --git a/INSTALL.swift b/INSTALL.swift index bf8dbc92f5ccb06f6988c5672e5fdac54d2d2598..4fa82c60838cf417961682318095f090f1bb709f 100644 --- a/INSTALL.swift +++ b/INSTALL.swift @@ -138,14 +138,15 @@ before you can build it. ===================== - - METIS: - a build of the METIS library can be optionally used to - optimize the load between MPI nodes (requires an MPI - library). This should be found in the standard installation - directories, or pointed at using the "--with-metis" - configuration option. In this case the top-level installation - directory of the METIS build should be given. Note to use - METIS you should supply at least "--with-metis". + - METIS/ParMETIS: + a build of the METIS or ParMETIS library should be used to + optimize the load between MPI nodes. This should be found in the + standard installation directories, or pointed at using the + "--with-metis" or "--with-parmetis" configuration options. + In this case the top-level installation directory of the build + should be given. Note to use METIS or ParMETIS you should supply at + least "--with-metis". ParMETIS is preferred over METIS when there + is a choice. - libNUMA: a build of the NUMA library can be used to pin the threads to diff --git a/README b/README index b51abc121f7cc7c1b4baa851c02045b5f4614bbb..7060589401d80e205733fb5770f258708263d966 100644 --- a/README +++ b/README @@ -65,8 +65,8 @@ Parameters: -T, --timers=<int> Print timers every time-step. -v, --verbose=<int> Run in verbose mode, in MPI mode 2 outputs from all ranks. - -y, --task-dumps=<int> Time-step frequency at which task graphs - are dumped. + -y, --task-dumps=<int> Time-step frequency at which task analysis + files and/or tasks are dumped. -Y, --threadpool-dumps=<int> Time-step frequency at which threadpool tasks are dumped. diff --git a/README.md b/README.md index 29415f27ee62f154b01dcd6a65414d7288a0a63f..c160a21adb921da79ae660196d5fa33e20af74fc 100644 --- a/README.md +++ b/README.md @@ -113,8 +113,8 @@ Parameters: -T, --timers=<int> Print timers every time-step. -v, --verbose=<int> Run in verbose mode, in MPI mode 2 outputs from all ranks. - -y, --task-dumps=<int> Time-step frequency at which task graphs - are dumped. + -y, --task-dumps=<int> Time-step frequency at which task analysis + files and/or tasks are dumped. -Y, --threadpool-dumps=<int> Time-step frequency at which threadpool tasks are dumped. diff --git a/configure.ac b/configure.ac index 95e390add27d7f45a4c6fd9a45654d73f66ebdc1..4d4c19e07f71fe997a131c0bad57524ad577c39a 100644 --- a/configure.ac +++ b/configure.ac @@ -215,7 +215,7 @@ fi # Check if task debugging is on. AC_ARG_ENABLE([task-debugging], [AS_HELP_STRING([--enable-task-debugging], - [Store task timing information and generate task dump files @<:@yes/no@:>@] + [Store extra information for generating task dump files @<:@yes/no@:>@] )], [enable_task_debugging="$enableval"], [enable_task_debugging="no"] diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in index 1aa6deb2c8b199c0eab7f047fbc081e313f4de07..b310ce3a6edbc79eef4524dbadbbde20515621ae 100644 --- a/doc/Doxyfile.in +++ b/doc/Doxyfile.in @@ -1976,7 +1976,7 @@ SEARCH_INCLUDES = YES # preprocessor. # This tag requires that the tag SEARCH_INCLUDES is set to YES. 
-INCLUDE_PATH = +INCLUDE_PATH = @top_srcdir@/src # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the diff --git a/doc/RTD/source/CommandLineOptions/index.rst b/doc/RTD/source/CommandLineOptions/index.rst index e2603532b4ed4e64c86887f2a4f7c35f80cb08bf..88493ddb10ff2a4978e5e4b31a55efc87ba45d3b 100644 --- a/doc/RTD/source/CommandLineOptions/index.rst +++ b/doc/RTD/source/CommandLineOptions/index.rst @@ -60,7 +60,7 @@ can be found by typing ``./swift -h``:: -T, --timers=<int> Print timers every time-step. -v, --verbose=<int> Run in verbose mode, in MPI mode 2 outputs from all ranks. - -y, --task-dumps=<int> Time-step frequency at which task graphs - are dumped. + -y, --task-dumps=<int> Time-step frequency at which task analysis + files and/or tasks are dumped. -Y, --threadpool-dumps=<int> Time-step frequency at which threadpool tasks are dumped. diff --git a/doc/RTD/source/GettingStarted/compiling_code.rst b/doc/RTD/source/GettingStarted/compiling_code.rst index 696d5a232b53205f9dbd6e03647d9da86e2b1ceb..0cfde4d18db62c2e0b41e652c73a6b6ad268440e 100644 --- a/doc/RTD/source/GettingStarted/compiling_code.rst +++ b/doc/RTD/source/GettingStarted/compiling_code.rst @@ -41,9 +41,9 @@ FFTW ~~~~ Version 3.3.x or higher is required for periodic gravity. -METIS -~~~~~ -METIS is used for domain decomposition and load balancing. +ParMETIS or METIS +~~~~~~~~~~~~~~~~~ +One is required for domain decomposition and load balancing. libNUMA ~~~~~~~ diff --git a/doc/RTD/source/GettingStarted/running_on_large_systems.rst b/doc/RTD/source/GettingStarted/running_on_large_systems.rst index 8bd3a76985e9181bae1a715564e58d2052dd15fc..42beedf790dc3d87895cd9bd6db13f25942b0a16 100644 --- a/doc/RTD/source/GettingStarted/running_on_large_systems.rst +++ b/doc/RTD/source/GettingStarted/running_on_large_systems.rst @@ -26,9 +26,8 @@ system (i.e. over MPI on several nodes). Here are some recommendations: + Run with threads pinned. You can do this by passing the ``-a`` flag to the SWIFT binary. This ensures that processes stay on the same core that spawned them, ensuring that cache is accessed more efficiently. -+ Ensure that you compile with METIS. More information is available in an - upcoming paper, but using METIS allows for work to be distributed in a - more efficient way between your nodes. ++ Ensure that you compile with ParMETIS or METIS. These are required if + want to load balance between MPI ranks. Your batch script should look something like the following (to run on 16 nodes each with 2x16 core processors for a total of 512 cores): diff --git a/doc/RTD/source/ParameterFiles/index.rst b/doc/RTD/source/ParameterFiles/index.rst index 488e8d37d7fa530f6dcd536f6bb39debeaab9f25..4cd0ab7bff1396c90c4dfe978b3e109db64bcab5 100644 --- a/doc/RTD/source/ParameterFiles/index.rst +++ b/doc/RTD/source/ParameterFiles/index.rst @@ -15,4 +15,3 @@ parameter files. parameter_description output_selection - diff --git a/doc/RTD/source/ParameterFiles/parameter_description.rst b/doc/RTD/source/ParameterFiles/parameter_description.rst index 6304b60c5eb6df77d79e2ff50b9ba895d31a7889..7f979c8da8e25b1c491a2dc04e2bbf6c348abce2 100644 --- a/doc/RTD/source/ParameterFiles/parameter_description.rst +++ b/doc/RTD/source/ParameterFiles/parameter_description.rst @@ -458,7 +458,7 @@ using the parameter: The default level of ``0`` implies no compression and values have to be in the range :math:`[0-9]`. 
This integer is passed to the i/o library and used for the -lossless GZIP compression algorithm. Higher values imply higher compression but +loss-less GZIP compression algorithm. Higher values imply higher compression but also more time spent deflating and inflating the data. Note that up until HDF5 1.10.x this option is not available when using the MPI-parallel version of the i/o routines. @@ -500,7 +500,7 @@ Showing all the parameters for a basic hydro test-case, one would have: int_time_label_on: 0 compression: 3 UnitLength_in_cgs: 1. # Use cm in outputs - UnitMass_in_cgs: 1. # Use grams in outpus + UnitMass_in_cgs: 1. # Use grams in outputs UnitVelocity_in_cgs: 1. # Use cm/s in outputs UnitCurrent_in_cgs: 1. # Use Ampere in outputs UnitTemp_in_cgs: 1. # Use Kelvin in outputs @@ -611,8 +611,212 @@ Scheduler .. _Parameters_domain_decomposition: -Domain Decomposition --------------------- +Domain Decomposition: +--------------------- + +This section determines how the top-level cells are distributed between the +ranks of an MPI run. An ideal decomposition should result in each rank having +a similar amount of work to do, so that all the ranks complete at the same +time. Achieving a good balance requires that SWIFT is compiled with either the +ParMETIS or METIS libraries. ParMETIS is an MPI version of METIS, so is +preferred for performance reasons. + +When we use ParMETIS/METIS the top-level cells of the volume are considered as +a graph, with a cell at each vertex and edges that connect the vertices to all +the neighbouring cells (so we have 26 edges connected to each vertex). +Decomposing such a graph into domains is known as partitioning, so in SWIFT we +refer to domain decomposition as partitioning. + +This graph of cells can have weights associated with the vertices and the +edges. These weights are then used to guide the partitioning, seeking to +balance the total weight of the vertices and minimize the weights of the edges +that are cut by the domain boundaries (known as the edgecut). We can consider +the edge weights as a proxy for the exchange of data between cells, so +minimizing this reduces communication. + +The Initial Partition: +^^^^^^^^^^^^^^^^^^^^^^ + +When SWIFT first starts it reads the initial conditions and then does an +initial distribution of the top-level cells. At this time the only information +available is the cell structure and, by geometry, the particles each cell +should contain. The type of partitioning attempted is controlled by the:: + + DomainDecomposition: + initial_type: + +parameter. Which can have the values *memory*, *region*, *grid* or +*vectorized*: + + + * *memory* + + This is the default if METIS or ParMETIS is available. It performs a + partition based on the memory use of all the particles in each cell, + attempting to equalize the memory used by all the ranks. + How successful this attempt is depends on the granularity of cells and particles + and the number of ranks, clearly if most of the particles are in one cell, + or a small region of the volume, balance is impossible or + difficult. Having more top-level cells makes it easier to calculate a + good distribution (but this comes at the cost of greater overheads). + + * *region* + + The one other METIS/ParMETIS option is "region". This attempts to assign equal + numbers of cells to each rank, with the surface area of the regions minimised + (so we get blobs, rather than rectangular volumes of cells). 
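+
+As an illustration, selecting one of these METIS/ParMETIS based strategies in
+the parameter file only requires setting ``initial_type``; for instance
+(``memory`` is the default and ``region`` works the same way)::
+
+  DomainDecomposition:
+    initial_type:      memory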
+
+If ParMETIS and METIS are not available two other options are possible, but
+these will give a poorer partition:
+
+  * *grid*
+
+    Split the cells into a number of axis-aligned regions. The number of
+    splits per axis is controlled by the::
+
+      initial_grid
+
+    parameter. It takes an array of three values whose product must equal the
+    number of MPI ranks. If not set, a suitable default will be used.
+
+  * *vectorized*
+
+    Allocate the cells on the basis of proximity to a set of seed
+    positions. The seed positions are picked every nranks cells along a
+    vectorized cell list (a 1D representation). This is guaranteed to give an
+    initial partition in all cases where the number of cells is greater than
+    or equal to the number of MPI ranks, so it can be used if the others
+    fail. Don't use this.
+
+If ParMETIS and METIS are not available then only an initial partition will be
+performed, so the balance will be limited by the quality of that initial
+partition.
+
+Repartitioning:
+^^^^^^^^^^^^^^^
+
+When ParMETIS or METIS is available we can adjust the balance during the run,
+improving on the initial partition and tracking changes in the simulation
+that require a different balance. The initial partition is usually not
+optimal: although it may have balanced the distribution of particles, it does
+not take into account that different particle types require differing amounts
+of processing, nor does it consider the work that requires communication
+between cells. This latter point is important when running an MPI job, as
+inter-cell communication can be very expensive.
+
+There are a number of possible repartition strategies, which are selected
+using the::
+
+  DomainDecomposition:
+    repartition_type:
+
+parameter. The possible values for this are *none*, *fullcosts*, *edgecosts*,
+*memory* and *timecosts*.
+
+  * *none*
+
+    Rather obviously, don't repartition. You are happy to run with the
+    initial partition.
+
+  * *fullcosts*
+
+    Use computation weights derived from the running tasks for both the
+    vertex and edge weights. This is the default.
+
+  * *edgecosts*
+
+    Only use computation weights derived from the running tasks for the edge
+    weights.
+
+  * *memory*
+
+    Repeat the initial partition with the current particle positions,
+    re-balancing the memory use.
+
+  * *timecosts*
+
+    Use computation weights derived from the running tasks for the vertex
+    weights and the expected time the particles will interact in the cells as
+    the edge weights. Using time as the edge weight has the effect of keeping
+    very active cells on single MPI ranks, so it can reduce MPI communication.
+
+The computation weights are actually the measured times, in CPU ticks, that
+the tasks associated with a cell take. These automatically reflect the
+relative cost of the different task types (SPH, self-gravity, etc.) and other
+factors, such as how well they run on the current hardware and how well they
+are optimized by the compiler used. However, this places a constraint on how
+often we can consider repartitioning: all (or nearly all) the tasks of the
+system must have been invoked in a step. To control this we have the::
+
+  minfrac:          0.9
+
+parameter, which defines the minimum fraction of all the particles in the
+simulation that must have been actively updated in the last step before
+repartitioning is considered.
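+
+Putting the above together, the corresponding section of the parameter file
+would look something like the following sketch (the values shown are simply
+those used in the examples above)::
+
+  DomainDecomposition:
+    repartition_type:  fullcosts
+    minfrac:           0.9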
+
+That then leaves the question of when a run is considered to be out of
+balance and would benefit from a repartition. That is controlled by the::
+
+  trigger:          0.05
+
+parameter. This value is the CPU time difference between MPI ranks, as a
+fraction, and if the measured difference is less than this value a
+repartition will not be done. Repartitioning can be expensive, not just in
+CPU time but also because large numbers of particles can be exchanged between
+MPI ranks, so it is best avoided.
+
+If you are using ParMETIS there are additional ways that you can tune the
+repartition process.
+
+METIS only offers the ability to create a partition from a graph, which means
+that each solution is independent of those that have already been made. That
+can make the exchange of particles very large (although SWIFT attempts to
+minimize this). Using ParMETIS, however, we can use the existing partition to
+inform the new partition. This offers two algorithms that are controlled
+using::
+
+  adaptive:         1
+
+which means use adaptive repartition, otherwise simple refinement. The
+adaptive algorithm is further controlled by the::
+
+  itr:              100
+
+parameter, which defines the ratio of inter-node communication time to data
+redistribution time, in the range 0.00001 to 10000000.0. Lower values give
+less data movement during redistributions. The best choice for these can only
+be determined by experimentation (the gains are usually small, so this is not
+really recommended).
+
+Finally we have the parameter::
+
+  usemetis:         0
+
+which forces the use of the METIS API and is probably only useful for
+developers.
+
+**Fixed cost repartitioning:**
+
+So far we have assumed that repartitioning will only happen after a step that
+meets the `minfrac:` and `trigger:` criteria, but we may want to repartition
+at some arbitrary steps, and indeed do better than the initial partition
+earlier in the run. This can be done using *fixed cost* repartitioning.
+
+Fixed costs are output during each repartition step into the file
+`partition_fixed_costs.h`. This should be created by a test run of your full
+simulation (possibly with a smaller volume, but with all the physics
+enabled). This file can then be used to replace the same file found in the
+`src/` directory, after which SWIFT should be recompiled. Once you have that,
+you can use the parameter::
+
+  use_fixed_costs: 1
+
+to control whether the fixed costs are used or not. If enabled they will be
+used to repartition after the second step, which will generally give as good
+a repartition immediately as you would get at the first unforced repartition.
+
+Also, once these have been enabled you can change the `trigger:` value to
+numbers greater than 2, and repartitioning will be forced every `trigger`
+steps. This latter option is probably only useful for developers, but tuning
+the second step to use fixed costs can give some improvements.
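+
+As a sketch, once fixed costs have been compiled in, forcing a fixed-cost
+repartition every 100 steps (an illustrative value) would then combine::
+
+  DomainDecomposition:
+    trigger:          100
+    use_fixed_costs:  1
 
 ..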
_Parameters_structure_finding: diff --git a/examples/main.c b/examples/main.c index f57950fc9a66cdaa22c9b6279a65ed791d29439c..7e16e7096243736e49be7d773452ee8c0132ebef 100644 --- a/examples/main.c +++ b/examples/main.c @@ -290,10 +290,12 @@ int main(int argc, char *argv[]) { #ifndef SWIFT_DEBUG_TASKS if (dump_tasks) { - printf( - "Error: task dumping is only possible if SWIFT was configured" - " with the --enable-task-debugging option.\n"); - return 1; + if (myrank == 0) { + message( + "WARNING: complete task dumps are only created when " + "configured with --enable-task-debugging."); + message(" Basic task statistics will be output."); + } } #endif @@ -503,8 +505,10 @@ int main(int argc, char *argv[]) { message("Using METIS serial partitioning:"); else message("Using ParMETIS partitioning:"); -#else +#elif defined(HAVE_METIS) message("Using METIS serial partitioning:"); +#else + message("Non-METIS partitioning:"); #endif message(" initial partitioning: %s", initial_partition_name[initial_partition.type]); @@ -1040,99 +1044,17 @@ int main(int argc, char *argv[]) { if (force_stop || (e.restart_onexit && e.step - 1 == nsteps)) engine_dump_restarts(&e, 0, 1); -#ifdef SWIFT_DEBUG_TASKS /* Dump the task data using the given frequency. */ if (dump_tasks && (dump_tasks == 1 || j % dump_tasks == 1)) { -#ifdef WITH_MPI - - /* Make sure output file is empty, only on one rank. */ - char dumpfile[35]; - snprintf(dumpfile, sizeof(dumpfile), "thread_info_MPI-step%d.dat", j + 1); - FILE *file_thread; - if (myrank == 0) { - file_thread = fopen(dumpfile, "w"); - fclose(file_thread); - } - MPI_Barrier(MPI_COMM_WORLD); - - for (int i = 0; i < nr_nodes; i++) { - - /* Rank 0 decides the index of writing node, this happens one-by-one. */ - int kk = i; - MPI_Bcast(&kk, 1, MPI_INT, 0, MPI_COMM_WORLD); - - if (i == myrank) { - - /* Open file and position at end. */ - file_thread = fopen(dumpfile, "a"); - - fprintf(file_thread, - " %03d 0 0 0 0 %lld %lld %lld %lld %lld 0 0 %lld\n", myrank, - e.tic_step, e.toc_step, e.updates, e.g_updates, e.s_updates, - cpufreq); - int count = 0; - for (int l = 0; l < e.sched.nr_tasks; l++) { - if (!e.sched.tasks[l].implicit && e.sched.tasks[l].toc != 0) { - fprintf(file_thread, - " %03i %i %i %i %i %lli %lli %i %i %i %i %lli %i\n", - myrank, e.sched.tasks[l].rid, e.sched.tasks[l].type, - e.sched.tasks[l].subtype, (e.sched.tasks[l].cj == NULL), - e.sched.tasks[l].tic, e.sched.tasks[l].toc, - (e.sched.tasks[l].ci != NULL) - ? e.sched.tasks[l].ci->hydro.count - : 0, - (e.sched.tasks[l].cj != NULL) - ? e.sched.tasks[l].cj->hydro.count - : 0, - (e.sched.tasks[l].ci != NULL) - ? e.sched.tasks[l].ci->grav.count - : 0, - (e.sched.tasks[l].cj != NULL) - ? e.sched.tasks[l].cj->grav.count - : 0, - e.sched.tasks[l].flags, e.sched.tasks[l].sid); - } - fflush(stdout); - count++; - } - fclose(file_thread); - } - - /* And we wait for all to synchronize. 
*/ - MPI_Barrier(MPI_COMM_WORLD); - } +#ifdef SWIFT_DEBUG_TASKS + task_dump_all(&e, j + 1); +#endif -#else - char dumpfile[32]; - snprintf(dumpfile, sizeof(dumpfile), "thread_info-step%d.dat", j + 1); - FILE *file_thread; - file_thread = fopen(dumpfile, "w"); - /* Add some information to help with the plots */ - fprintf(file_thread, " %d %d %d %d %lld %lld %lld %lld %lld %d %lld\n", - -2, -1, -1, 1, e.tic_step, e.toc_step, e.updates, e.g_updates, - e.s_updates, 0, cpufreq); - for (int l = 0; l < e.sched.nr_tasks; l++) { - if (!e.sched.tasks[l].implicit && e.sched.tasks[l].toc != 0) { - fprintf( - file_thread, " %i %i %i %i %lli %lli %i %i %i %i %i\n", - e.sched.tasks[l].rid, e.sched.tasks[l].type, - e.sched.tasks[l].subtype, (e.sched.tasks[l].cj == NULL), - e.sched.tasks[l].tic, e.sched.tasks[l].toc, - (e.sched.tasks[l].ci == NULL) ? 0 - : e.sched.tasks[l].ci->hydro.count, - (e.sched.tasks[l].cj == NULL) ? 0 - : e.sched.tasks[l].cj->hydro.count, - (e.sched.tasks[l].ci == NULL) ? 0 - : e.sched.tasks[l].ci->grav.count, - (e.sched.tasks[l].cj == NULL) ? 0 - : e.sched.tasks[l].cj->grav.count, - e.sched.tasks[l].sid); - } - } - fclose(file_thread); -#endif // WITH_MPI + /* Generate the task statistics. */ + char dumpfile[40]; + snprintf(dumpfile, 40, "thread_stats-step%d.dat", j + 1); + task_dump_stats(dumpfile, &e, /* header = */ 0, /* allranks = */ 1); } -#endif // SWIFT_DEBUG_TASKS #ifdef SWIFT_DEBUG_THREADPOOL /* Dump the task data using the given frequency. */ diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml index 759fbe4aa60e42c434bb6b75ebf0e7b0fd531ff0..eb94bc3ed4a4bbf6ab1ce5fdf8fa581ffe77d8e3 100644 --- a/examples/parameter_example.yml +++ b/examples/parameter_example.yml @@ -145,23 +145,24 @@ Restarts: # Parameters governing domain decomposition DomainDecomposition: - initial_type: memory # (Optional) The initial decomposition strategy: "grid", - # "region", "memory", or "vectorized". - initial_grid: [10,10,10] # (Optional) Grid sizes if the "grid" strategy is chosen. + initial_type: memory # (Optional) The initial decomposition strategy: "grid", + # "region", "memory", or "vectorized". + initial_grid: [10,10,10] # (Optional) Grid sizes if the "grid" strategy is chosen. - repartition_type: costs/costs # (Optional) The re-decomposition strategy, one of: - # "none/none", "costs/costs", "none/costs", "costs/none" or "costs/time". - # These are vertex/edge weights with "costs" as task timing - # and "time" as the expected time of the next updates - - trigger: 0.05 # (Optional) Fractional (<1) CPU time difference between MPI ranks required to trigger a - # new decomposition, or number of steps (>1) between decompositions - minfrac: 0.9 # (Optional) Fractional of all particles that should be updated in previous step when - # using CPU time trigger - usemetis: 0 # Use serial METIS when ParMETIS is also available. - adaptive: 1 # Use adaptive repartition when ParMETIS is available, otherwise simple refinement. - itr: 100 # When adaptive defines the ratio of inter node communication time to data redistribution time, in the range 0.00001 to 10000000.0. - # Lower values give less data movement during redistributions, at the cost of global balance which may require more communication. + repartition_type: fullcosts # (Optional) The re-decomposition strategy, one of: + # "none", "fullcosts", "edgecosts", "memory" or + # "timecosts". 
+ trigger: 0.05 # (Optional) Fractional (<1) CPU time difference between MPI ranks required to trigger a + # new decomposition, or number of steps (>1) between decompositions + minfrac: 0.9 # (Optional) Fractional of all particles that should be updated in previous step when + # using CPU time trigger + usemetis: 0 # Use serial METIS when ParMETIS is also available. + adaptive: 1 # Use adaptive repartition when ParMETIS is available, otherwise simple refinement. + itr: 100 # When adaptive defines the ratio of inter node communication time to data redistribution time, in the range 0.00001 to 10000000.0. + # Lower values give less data movement during redistributions, at the cost of global balance which may require more communication. + use_fixed_costs: 1 # If 1 then use any compiled in fixed costs for + # task weights in first repartition, if 0 only use task timings, if > 1 only use + # fixed costs, unless none are available. # Structure finding options (requires velociraptor) StructureFinding: diff --git a/src/Makefile.am b/src/Makefile.am index 6afe1c8509b77bc6eb5a2a22111673e1a17aaf15..f1c20865af9ade70257abd6e683895f363c55813 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -41,8 +41,8 @@ endif # List required headers include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \ engine.h swift.h serial_io.h timers.h debug.h scheduler.h proxy.h parallel_io.h \ - common_io.h single_io.h multipole.h map.h tools.h partition.h clocks.h parser.h \ - physical_constants.h physical_constants_cgs.h potential.h version.h \ + common_io.h single_io.h multipole.h map.h tools.h partition.h partition_fixed_costs.h \ + clocks.h parser.h physical_constants.h physical_constants_cgs.h potential.h version.h \ hydro_properties.h riemann.h threadpool.h cooling_io.h cooling.h cooling_struct.h \ statistics.h memswap.h cache.h runner_doiact_vec.h profiler.h \ dump.h logger.h active.h timeline.h xmf.h gravity_properties.h gravity_derivatives.h \ diff --git a/src/engine.c b/src/engine.c index fd76580722e4481b716d6fd6ec010cdfcaa07a5a..1e0e89043091a8267d1de4ab345c282778768364 100644 --- a/src/engine.c +++ b/src/engine.c @@ -993,6 +993,12 @@ void engine_repartition(struct engine *e) { * bug that doesn't handle this case well. */ if (e->nr_nodes == 1) return; + /* Generate the fixed costs include file. */ + if (e->step > 3 && e->reparttype->trigger <= 1.f) { + task_dump_stats("partition_fixed_costs.h", e, /* header = */ 1, + /* allranks = */ 1); + } + /* Do the repartitioning. */ partition_repartition(e->reparttype, e->nodeID, e->nr_nodes, e->s, e->sched.tasks, e->sched.nr_tasks); @@ -1048,32 +1054,41 @@ void engine_repartition_trigger(struct engine *e) { const ticks tic = getticks(); - /* Do nothing if there have not been enough steps since the last - * repartition, don't want to repeat this too often or immediately after - * a repartition step. Also nothing to do when requested. */ + /* Do nothing if there have not been enough steps since the last repartition + * as we don't want to repeat this too often or immediately after a + * repartition step. Also nothing to do when requested. */ if (e->step - e->last_repartition >= 2 && e->reparttype->type != REPART_NONE) { - /* Old style if trigger is >1 or this is the second step (want an early - * repartition following the initial repartition). */ - if (e->reparttype->trigger > 1 || e->step == 2) { + /* If we have fixed costs available and this is step 2 or we are forcing + * repartitioning then we do a fixed costs one now. 
*/ + if (e->reparttype->trigger > 1 || + (e->step == 2 && e->reparttype->use_fixed_costs)) { + if (e->reparttype->trigger > 1) { if ((e->step % (int)e->reparttype->trigger) == 0) e->forcerepart = 1; } else { e->forcerepart = 1; } + e->reparttype->use_ticks = 0; } else { - /* Use cputimes from ranks to estimate the imbalance. */ - /* First check if we are going to skip this stage anyway, if so do that - * now. If is only worth checking the CPU loads when we have processed a - * significant number of all particles. */ + /* It is only worth checking the CPU loads when we have processed a + * significant number of all particles as we require all tasks to have + * timings. */ if ((e->updates > 1 && e->updates >= e->total_nr_parts * e->reparttype->minfrac) || (e->g_updates > 1 && e->g_updates >= e->total_nr_gparts * e->reparttype->minfrac)) { + /* Should we are use the task timings or fixed costs. */ + if (e->reparttype->use_fixed_costs > 1) { + e->reparttype->use_ticks = 0; + } else { + e->reparttype->use_ticks = 1; + } + /* Get CPU time used since the last call to this function. */ double elapsed_cputime = clocks_get_cputime_used() - e->cputime_last_step; @@ -1096,17 +1111,22 @@ void engine_repartition_trigger(struct engine *e) { double mean = sum / (double)e->nr_nodes; /* Are we out of balance? */ - if (((maxtime - mintime) / mean) > e->reparttype->trigger) { + double abs_trigger = fabs(e->reparttype->trigger); + if (((maxtime - mintime) / mean) > abs_trigger) { if (e->verbose) - message("trigger fraction %.3f exceeds %.3f will repartition", - (maxtime - mintime) / mintime, e->reparttype->trigger); + message("trigger fraction %.3f > %.3f will repartition", + (maxtime - mintime) / mean, abs_trigger); e->forcerepart = 1; + } else { + if (e->verbose) + message("trigger fraction %.3f =< %.3f will not repartition", + (maxtime - mintime) / mean, abs_trigger); } } - - /* All nodes do this together. */ - MPI_Bcast(&e->forcerepart, 1, MPI_INT, 0, MPI_COMM_WORLD); } + + /* All nodes do this together. */ + MPI_Bcast(&e->forcerepart, 1, MPI_INT, 0, MPI_COMM_WORLD); } /* Remember we did this. */ @@ -2984,9 +3004,7 @@ void engine_step(struct engine *e) { struct clocks_time time1, time2; clocks_gettime(&time1); -#ifdef SWIFT_DEBUG_TASKS e->tic_step = getticks(); -#endif if (e->nodeID == 0) { @@ -3153,10 +3171,8 @@ void engine_step(struct engine *e) { clocks_gettime(&time2); e->wallclock_time = (float)clocks_diff(&time1, &time2); -#ifdef SWIFT_DEBUG_TASKS /* Time in ticks at the end of this step. */ e->toc_step = getticks(); -#endif } /** diff --git a/src/engine.h b/src/engine.h index 4a51d6a9fe864c016f1de23ae16d0e0567a66187..13754097e9e3460eaa7d20a4dfdb6f7dd18bb9bc 100644 --- a/src/engine.h +++ b/src/engine.h @@ -290,10 +290,8 @@ struct engine { struct proxy *proxies; int nr_proxies, *proxy_ind; -#ifdef SWIFT_DEBUG_TASKS /* Tic/toc at the start/end of a step. */ ticks tic_step, toc_step; -#endif #ifdef WITH_MPI /* CPU time of the last step. */ diff --git a/src/partition.c b/src/partition.c index 60ee7716efb25188b3a09f44f93a65a3ccbd5893..0584fab8dff94c32128610a7e0f06a533daac1d9 100644 --- a/src/partition.c +++ b/src/partition.c @@ -71,12 +71,23 @@ const char *initial_partition_name[] = { /* Simple descriptions of repartition types for reports. 
*/ const char *repartition_name[] = { "none", "edge and vertex task cost weights", "task cost edge weights", - "task cost vertex weights", + "memory balanced, using particle vertex weights", "vertex task costs and edge delta timebin weights"}; /* Local functions, if needed. */ static int check_complete(struct space *s, int verbose, int nregions); +/* + * Repartition fixed costs per type/subtype. These are determined from the + * statistics output produced when running with task debugging enabled. + */ +#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) +static double repartition_costs[task_type_count][task_subtype_count]; +#endif +#if defined(WITH_MPI) +static int repart_init_fixed_costs(void); +#endif + /* Vectorisation support */ /* ===================== */ @@ -1162,6 +1173,7 @@ struct weights_mapper_data { int timebins; int vweights; int nr_cells; + int use_ticks; struct cell *cells; }; @@ -1179,8 +1191,8 @@ static void check_weights(struct task *tasks, int nr_tasks, * @param num_elements the number of data elements to process. * @param extra_data additional data for the mapper context. */ -void partition_gather_weights(void *map_data, int num_elements, - void *extra_data) { +static void partition_gather_weights(void *map_data, int num_elements, + void *extra_data) { struct task *tasks = (struct task *)map_data; struct weights_mapper_data *mydata = (struct weights_mapper_data *)extra_data; @@ -1193,6 +1205,7 @@ void partition_gather_weights(void *map_data, int num_elements, int nr_cells = mydata->nr_cells; int timebins = mydata->timebins; int vweights = mydata->vweights; + int use_ticks = mydata->use_ticks; struct cell *cells = mydata->cells; @@ -1201,10 +1214,18 @@ void partition_gather_weights(void *map_data, int num_elements, struct task *t = &tasks[i]; /* Skip un-interesting tasks. */ - if (t->cost == 0.f) continue; - - /* Get the task weight based on costs. */ - double w = (double)t->cost; + if (t->type == task_type_send || t->type == task_type_recv || + t->type == task_type_logger || t->implicit || t->ci == NULL) + continue; + + /* Get weight for this task. Either based on fixed costs or task timings. */ + double w = 0.0; + if (use_ticks) { + w = (double)t->toc - (double)t->tic; + } else { + w = repartition_costs[t->type][t->subtype]; + } + if (w <= 0.0) continue; /* Get the top-level cells involved. */ struct cell *ci, *cj; @@ -1367,6 +1388,7 @@ static void repart_edge_metis(int vweights, int eweights, int timebins, weights_data.vweights = vweights; weights_data.weights_e = weights_e; weights_data.weights_v = weights_v; + weights_data.use_ticks = repartition->use_ticks; ticks tic = getticks(); @@ -1411,10 +1433,7 @@ static void repart_edge_metis(int vweights, int eweights, int timebins, } /* We need to rescale the sum of the weights so that the sums of the two - * types of weights are less than IDX_MAX, that is the range of idx_t. Also - * we would like to balance edges and vertices when the edge weights are - * timebins, as these have no reason to have equivalent scales, so we use an - * equipartition. */ + * types of weights are less than IDX_MAX, that is the range of idx_t. */ double vsum = 0.0; if (vweights) for (int k = 0; k < nr_cells; k++) vsum += weights_v[k]; @@ -1422,41 +1441,60 @@ static void repart_edge_metis(int vweights, int eweights, int timebins, if (eweights) for (int k = 0; k < 26 * nr_cells; k++) esum += weights_e[k]; + /* Do the scaling, if needed, keeping both weights in proportion. 
*/ double vscale = 1.0; double escale = 1.0; - if (timebins && eweights) { - /* Make sums the same. */ + if (vweights && eweights) { if (vsum > esum) { - escale = vsum / esum; - esum = vsum; + if (vsum > (double)IDX_MAX) { + vscale = (double)(IDX_MAX - 1000) / vsum; + escale = vscale; + } } else { - vscale = esum / vsum; - vsum = esum; + if (esum > (double)IDX_MAX) { + escale = (double)(IDX_MAX - 1000) / esum; + vscale = escale; + } } - } - - /* Now make sure sum of weights are in the range of idx_t. */ - if (vweights) { + } else if (vweights) { if (vsum > (double)IDX_MAX) { vscale = (double)(IDX_MAX - 1000) / vsum; + } + } else if (eweights) { + if (esum > (double)IDX_MAX) { + escale = (double)(IDX_MAX - 1000) / esum; + } + } - if (!timebins && eweights) { - /* Keep edge weights in proportion. */ - esum = 0.0; - for (int k = 0; k < 26 * nr_cells; k++) { - weights_e[k] *= vscale; - esum += weights_e[k]; - } - } + if (vweights && vscale != 1.0) { + vsum = 0.0; + for (int k = 0; k < nr_cells; k++) { + weights_v[k] *= vscale; + vsum += weights_v[k]; } - if (vscale != 1.0) - for (int k = 0; k < nr_cells; k++) weights_v[k] *= vscale; + vscale = 1.0; + } + if (eweights && escale != 1.0) { + esum = 0.0; + for (int k = 0; k < 26 * nr_cells; k++) { + weights_e[k] *= escale; + esum += weights_e[k]; + } + escale = 1.0; } - if (eweights) { - if (esum > (double)IDX_MAX) escale = (double)(IDX_MAX - 1000) / esum; - if (escale != 1.0) + /* Balance edges and vertices when the edge weights are timebins, as these + * have no reason to have equivalent scales, we use an equipartition. */ + if (timebins && eweights) { + + /* Make sums the same. */ + if (vsum > esum) { + escale = vsum / esum; for (int k = 0; k < 26 * nr_cells; k++) weights_e[k] *= escale; + } else { + vscale = esum / vsum; + for (int k = 0; k < nr_cells; k++) weights_v[k] *= vscale; + } } /* And repartition/ partition, using both weights or not as requested. */ @@ -1511,8 +1549,106 @@ static void repart_edge_metis(int vweights, int eweights, int timebins, if (vweights) free(weights_v); if (eweights) free(weights_e); } + +/** + * @brief Repartition the cells amongst the nodes using weights based on + * the memory use of particles in the cells. + * + * @param repartition the partition struct of the local engine. + * @param nodeID our nodeID. + * @param nr_nodes the number of nodes. + * @param s the space of cells holding our local particles. + */ +static void repart_memory_metis(struct repartition *repartition, int nodeID, + int nr_nodes, struct space *s) { + + /* Space for counts of particle memory use per cell. */ + double *weights = NULL; + if ((weights = (double *)malloc(sizeof(double) * s->nr_cells)) == NULL) + error("Failed to allocate cell weights buffer."); + bzero(weights, sizeof(double) * s->nr_cells); + + /* Check each particle and accumulate the sizes per cell. */ + accumulate_sizes(s, weights); + + /* Get all the counts from all the nodes. */ + if (MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_DOUBLE, MPI_SUM, + MPI_COMM_WORLD) != MPI_SUCCESS) + error("Failed to allreduce particle cell weights."); + + /* Allocate cell list for the partition. If not already done. 
*/ +#ifdef HAVE_PARMETIS + int refine = 1; +#endif + if (repartition->ncelllist != s->nr_cells) { +#ifdef HAVE_PARMETIS + refine = 0; +#endif + free(repartition->celllist); + repartition->ncelllist = 0; + if ((repartition->celllist = (int *)malloc(sizeof(int) * s->nr_cells)) == + NULL) + error("Failed to allocate celllist"); + repartition->ncelllist = s->nr_cells; + } + + /* We need to rescale the sum of the weights so that the sum is + * less than IDX_MAX, that is the range of idx_t. */ + double sum = 0.0; + for (int k = 0; k < s->nr_cells; k++) sum += weights[k]; + if (sum > (double)IDX_MAX) { + double scale = (double)(IDX_MAX - 1000) / sum; + for (int k = 0; k < s->nr_cells; k++) weights[k] *= scale; + } + + /* And repartition. */ +#ifdef HAVE_PARMETIS + if (repartition->usemetis) { + pick_metis(nodeID, s, nr_nodes, weights, NULL, repartition->celllist); + } else { + pick_parmetis(nodeID, s, nr_nodes, weights, NULL, refine, + repartition->adaptive, repartition->itr, + repartition->celllist); + } +#else + pick_metis(nodeID, s, nr_nodes, weights, NULL, repartition->celllist); #endif + /* Check that all cells have good values. All nodes have same copy, so just + * check on one. */ + if (nodeID == 0) { + for (int k = 0; k < s->nr_cells; k++) + if (repartition->celllist[k] < 0 || repartition->celllist[k] >= nr_nodes) + error("Got bad nodeID %d for cell %i.", repartition->celllist[k], k); + } + + /* Check that the partition is complete and all nodes have some cells. */ + int present[nr_nodes]; + int failed = 0; + for (int i = 0; i < nr_nodes; i++) present[i] = 0; + for (int i = 0; i < s->nr_cells; i++) present[repartition->celllist[i]]++; + for (int i = 0; i < nr_nodes; i++) { + if (!present[i]) { + failed = 1; + if (nodeID == 0) message("Node %d is not present after repartition", i); + } + } + + /* If partition failed continue with the current one, but make this clear. */ + if (failed) { + if (nodeID == 0) + message( + "WARNING: repartition has failed, continuing with the current" + " partition, load balance will not be optimal"); + for (int k = 0; k < s->nr_cells; k++) + repartition->celllist[k] = s->cells_top[k].nodeID; + } + + /* And apply to our cells */ + split_metis(s, nr_nodes, repartition->celllist); +} +#endif /* WITH_MPI && (HAVE_METIS || HAVE_PARMETIS) */ + /** * @brief Repartition the space using the given repartition type. * @@ -1542,14 +1678,13 @@ void partition_repartition(struct repartition *reparttype, int nodeID, repart_edge_metis(0, 1, 0, reparttype, nodeID, nr_nodes, s, tasks, nr_tasks); - } else if (reparttype->type == REPART_METIS_VERTEX_COSTS) { - repart_edge_metis(1, 0, 0, reparttype, nodeID, nr_nodes, s, tasks, - nr_tasks); - } else if (reparttype->type == REPART_METIS_VERTEX_COSTS_TIMEBINS) { repart_edge_metis(1, 1, 1, reparttype, nodeID, nr_nodes, s, tasks, nr_tasks); + } else if (reparttype->type == REPART_METIS_VERTEX_COUNTS) { + repart_memory_metis(reparttype, nodeID, nr_nodes, s); + } else if (reparttype->type == REPART_NONE) { /* Doing nothing. */ @@ -1706,7 +1841,7 @@ void partition_initial_partition(struct partition *initial_partition, /** * @brief Initialises the partition and re-partition scheme from the parameter - * file + * file. * * @param partition The #partition scheme to initialise. * @param repartition The #repartition scheme to initialise. 
@@ -1721,10 +1856,10 @@ void partition_init(struct partition *partition, /* Defaults make use of METIS if available */ #if defined(HAVE_METIS) || defined(HAVE_PARMETIS) - const char *default_repart = "costs/costs"; + const char *default_repart = "fullcosts"; const char *default_part = "memory"; #else - const char *default_repart = "none/none"; + const char *default_repart = "none"; const char *default_part = "grid"; #endif @@ -1777,32 +1912,32 @@ void partition_init(struct partition *partition, parser_get_opt_param_string(params, "DomainDecomposition:repartition_type", part_type, default_repart); - if (strcmp("none/none", part_type) == 0) { + if (strcmp("none", part_type) == 0) { repartition->type = REPART_NONE; #if defined(HAVE_METIS) || defined(HAVE_PARMETIS) - } else if (strcmp("costs/costs", part_type) == 0) { + } else if (strcmp("fullcosts", part_type) == 0) { repartition->type = REPART_METIS_VERTEX_EDGE_COSTS; - } else if (strcmp("none/costs", part_type) == 0) { + } else if (strcmp("edgecosts", part_type) == 0) { repartition->type = REPART_METIS_EDGE_COSTS; - } else if (strcmp("costs/none", part_type) == 0) { - repartition->type = REPART_METIS_VERTEX_COSTS; + } else if (strcmp("memory", part_type) == 0) { + repartition->type = REPART_METIS_VERTEX_COUNTS; - } else if (strcmp("costs/time", part_type) == 0) { + } else if (strcmp("timecosts", part_type) == 0) { repartition->type = REPART_METIS_VERTEX_COSTS_TIMEBINS; } else { message("Invalid choice of re-partition type '%s'.", part_type); error( - "Permitted values are: 'none/none', 'costs/costs', 'none/costs' " - "'costs/none' or 'costs/time'"); + "Permitted values are: 'none', 'fullcosts', 'edgecosts' " + "'memory' or 'timecosts'"); #else } else { message("Invalid choice of re-partition type '%s'.", part_type); error( - "Permitted values are: 'none/none' when compiled without " + "Permitted values are: 'none' when compiled without " "METIS or ParMETIS."); #endif } @@ -1819,13 +1954,13 @@ void partition_init(struct partition *partition, " than 1"); /* Fraction of particles that should be updated before a repartition - * based on CPU time is considered. */ + * based on CPU time is considered, needs to be high. */ repartition->minfrac = - parser_get_opt_param_float(params, "DomainDecomposition:minfrac", 0.9f); - if (repartition->minfrac <= 0 || repartition->minfrac > 1) + parser_get_opt_param_float(params, "DomainDecomposition:minfrac", 0.95f); + if (repartition->minfrac <= 0.5 || repartition->minfrac > 1) error( - "Invalid DomainDecomposition:minfrac, must be greater than 0 and less " - "than equal to 1"); + "Invalid DomainDecomposition:minfrac, must be greater than 0.5 " + "and less than equal to 1"); /* Use METIS or ParMETIS when ParMETIS is also available. */ repartition->usemetis = @@ -1844,11 +1979,62 @@ void partition_init(struct partition *partition, repartition->ncelllist = 0; repartition->celllist = NULL; + /* Do we have fixed costs available? These can be used to force + * repartitioning at any time. Not required if not repartitioning.*/ + repartition->use_fixed_costs = parser_get_opt_param_int( + params, "DomainDecomposition:use_fixed_costs", 0); + if (repartition->type == REPART_NONE) repartition->use_fixed_costs = 0; + + /* Check if this is true or required and initialise them. 
*/ + if (repartition->use_fixed_costs || repartition->trigger > 1) { + if (!repart_init_fixed_costs()) { + if (repartition->trigger <= 1) { + if (engine_rank == 0) + message( + "WARNING: fixed cost repartitioning was requested but is" + " not available."); + repartition->use_fixed_costs = 0; + } else { + error( + "Forced fixed cost repartitioning was requested but is" + " not available."); + } + } + } + #else error("SWIFT was not compiled with MPI support"); #endif } +#ifdef WITH_MPI +/** + * @brief Set the fixed costs for repartition using METIS. + * + * These are determined using a run with the -y flag on which produces + * a statistical analysis that is condensed into a .h file for inclusion. + * + * If the default include file is used then no fixed costs are set and this + * function will return 0. + */ +static int repart_init_fixed_costs(void) { + +#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) + /* Set the default fixed cost. */ + for (int j = 0; j < task_type_count; j++) { + for (int k = 0; k < task_subtype_count; k++) { + repartition_costs[j][k] = 1.0; + } + } + +#include <partition_fixed_costs.h> + return HAVE_FIXED_COSTS; +#endif + + return 0; +} +#endif /* WITH_MPI */ + /* General support */ /* =============== */ diff --git a/src/partition.h b/src/partition.h index 1202a1d19ff18f83ed26464bade088990ed51db6..de0d95a5e343f1aa85a03c2cda49019f2fd08037 100644 --- a/src/partition.h +++ b/src/partition.h @@ -46,7 +46,7 @@ enum repartition_type { REPART_NONE = 0, REPART_METIS_VERTEX_EDGE_COSTS, REPART_METIS_EDGE_COSTS, - REPART_METIS_VERTEX_COSTS, + REPART_METIS_VERTEX_COUNTS, REPART_METIS_VERTEX_COSTS_TIMEBINS }; @@ -59,6 +59,9 @@ struct repartition { int usemetis; int adaptive; + int use_fixed_costs; + int use_ticks; + /* The partition as a cell-list. */ int ncelllist; int *celllist; diff --git a/src/partition_fixed_costs.h b/src/partition_fixed_costs.h new file mode 100644 index 0000000000000000000000000000000000000000..e713684b28ce81e60b9fa98a6078d1c8c370f935 --- /dev/null +++ b/src/partition_fixed_costs.h @@ -0,0 +1,25 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (c) 2018 Peter W. Draper (p.w.draper@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ +#ifndef SWIFT_PARTITION_FIXED_COSTS_H +#define SWIFT_PARTITION_FIXED_COSTS_H + +/* Default is no fixed costs. 
*/ +#define HAVE_FIXED_COSTS 0 + +#endif /* SWIFT_PARTITION_FIXED_COSTS_H */ diff --git a/src/scheduler.c b/src/scheduler.c index ad6af73aec209a19106794636c3b6599baca21e1..1083be1996f9ba3986a601ecbe47b1e6a224239a 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -1735,9 +1735,9 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type, t->nr_unlock_tasks = 0; #ifdef SWIFT_DEBUG_TASKS t->rid = -1; +#endif t->tic = 0; t->toc = 0; -#endif /* Add an index for it. */ // lock_lock( &s->lock ); @@ -1956,17 +1956,12 @@ void scheduler_reweight(struct scheduler *s, int verbose) { /* Run through the tasks backwards and set their weights. */ for (int k = nr_tasks - 1; k >= 0; k--) { struct task *t = &tasks[tid[k]]; + float cost = 0.f; t->weight = 0.f; -#if defined(WITH_MPI) && (defined(HAVE_PARMETIS) || defined(HAVE_METIS)) - t->cost = 0.f; -#endif + for (int j = 0; j < t->nr_unlock_tasks; j++) if (t->unlock_tasks[j]->weight > t->weight) t->weight = t->unlock_tasks[j]->weight; - float cost = 0.f; -#if defined(WITH_MPI) && (defined(HAVE_PARMETIS) || defined(HAVE_METIS)) - int partcost = 1; -#endif const float count_i = (t->ci != NULL) ? t->ci->hydro.count : 0.f; const float count_j = (t->cj != NULL) ? t->cj->hydro.count : 0.f; @@ -1987,9 +1982,9 @@ void scheduler_reweight(struct scheduler *s, int verbose) { break; case task_type_self: - if (t->subtype == task_subtype_grav) + if (t->subtype == task_subtype_grav) { cost = 1.f * (wscale * gcount_i) * gcount_i; - else if (t->subtype == task_subtype_external_grav) + } else if (t->subtype == task_subtype_external_grav) cost = 1.f * wscale * gcount_i; else if (t->subtype == task_subtype_stars_density) cost = 1.f * wscale * scount_i * count_i; @@ -2089,18 +2084,12 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = wscale * count_i + wscale * gcount_i; break; case task_type_send: -#if defined(WITH_MPI) && (defined(HAVE_PARMETIS) || defined(HAVE_METIS)) - partcost = 0; -#endif if (count_i < 1e5) cost = 10.f * (wscale * count_i) * count_i; else cost = 2e9; break; case task_type_recv: -#if defined(WITH_MPI) && (defined(HAVE_PARMETIS) || defined(HAVE_METIS)) - partcost = 0; -#endif if (count_i < 1e5) cost = 5.f * (wscale * count_i) * count_i; else @@ -2110,10 +2099,6 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = 0; break; } - -#if defined(WITH_MPI) && (defined(HAVE_PARMETIS) || defined(HAVE_METIS)) - if (partcost) t->cost = cost; -#endif t->weight += cost; } @@ -2186,14 +2171,14 @@ void scheduler_enqueue_mapper(void *map_data, int num_elements, */ void scheduler_start(struct scheduler *s) { -/* Reset all task debugging timers */ -#ifdef SWIFT_DEBUG_TASKS + /* Reset all task timers. */ for (int i = 0; i < s->nr_tasks; ++i) { s->tasks[i].tic = 0; s->tasks[i].toc = 0; +#ifdef SWIFT_DEBUG_TASKS s->tasks[i].rid = -1; - } #endif + } /* Re-wait the tasks. */ if (s->active_count > 1000) { @@ -2445,9 +2430,7 @@ struct task *scheduler_done(struct scheduler *s, struct task *t) { /* Task definitely done, signal any sleeping runners. */ if (!t->implicit) { -#ifdef SWIFT_DEBUG_TASKS t->toc = getticks(); -#endif pthread_mutex_lock(&s->sleep_mutex); atomic_dec(&s->waiting); pthread_cond_broadcast(&s->sleep_cond); @@ -2488,9 +2471,7 @@ struct task *scheduler_unlock(struct scheduler *s, struct task *t) { /* Task definitely done. 
*/ if (!t->implicit) { -#ifdef SWIFT_DEBUG_TASKS t->toc = getticks(); -#endif pthread_mutex_lock(&s->sleep_mutex); atomic_dec(&s->waiting); pthread_cond_broadcast(&s->sleep_cond); @@ -2574,13 +2555,13 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, } } -#ifdef SWIFT_DEBUG_TASKS /* Start the timer on this task, if we got one. */ if (res != NULL) { res->tic = getticks(); +#ifdef SWIFT_DEBUG_TASKS res->rid = qid; - } #endif + } /* No milk today. */ return res; diff --git a/src/task.c b/src/task.c index 4d5695f64c81e710c39fcc460a642a0887856814..d5e0a7d8dc65679cc7e281faa113faf5868a1ff5 100644 --- a/src/task.c +++ b/src/task.c @@ -42,6 +42,7 @@ /* Local headers. */ #include "atomic.h" +#include "engine.h" #include "error.h" #include "inline.h" #include "lock.h" @@ -708,3 +709,239 @@ void task_create_mpi_comms(void) { } } #endif + +/** + * @brief dump all the tasks of all the known engines into a file for + * postprocessing. + * + * Dumps the information to a file "thread_info-stepn.dat" where n is the + * given step value, or "thread_info_MPI-stepn.dat", if we are running + * under MPI. Note if running under MPIU all the ranks are dumped into this + * one file, which has an additional field to identify the rank. + * + * @param e the #engine + * @param step the current step. + */ +void task_dump_all(struct engine *e, int step) { + +#ifdef SWIFT_DEBUG_TASKS + + /* Need this to convert ticks to seconds. */ + unsigned long long cpufreq = clocks_get_cpufreq(); + +#ifdef WITH_MPI + /* Make sure output file is empty, only on one rank. */ + char dumpfile[35]; + snprintf(dumpfile, sizeof(dumpfile), "thread_info_MPI-step%d.dat", step); + FILE *file_thread; + if (engine_rank == 0) { + file_thread = fopen(dumpfile, "w"); + fclose(file_thread); + } + MPI_Barrier(MPI_COMM_WORLD); + + for (int i = 0; i < e->nr_nodes; i++) { + + /* Rank 0 decides the index of the writing node, this happens + * one-by-one. */ + int kk = i; + MPI_Bcast(&kk, 1, MPI_INT, 0, MPI_COMM_WORLD); + + if (i == engine_rank) { + + /* Open file and position at end. */ + file_thread = fopen(dumpfile, "a"); + + /* Add some information to help with the plots and conversion of ticks to + * seconds. */ + fprintf(file_thread, " %03d 0 0 0 0 %lld %lld %lld %lld %lld 0 0 %lld\n", + engine_rank, e->tic_step, e->toc_step, e->updates, e->g_updates, + e->s_updates, cpufreq); + int count = 0; + for (int l = 0; l < e->sched.nr_tasks; l++) { + if (!e->sched.tasks[l].implicit && e->sched.tasks[l].toc != 0) { + fprintf( + file_thread, " %03i %i %i %i %i %lli %lli %i %i %i %i %lli %i\n", + engine_rank, e->sched.tasks[l].rid, e->sched.tasks[l].type, + e->sched.tasks[l].subtype, (e->sched.tasks[l].cj == NULL), + e->sched.tasks[l].tic, e->sched.tasks[l].toc, + (e->sched.tasks[l].ci != NULL) ? e->sched.tasks[l].ci->hydro.count + : 0, + (e->sched.tasks[l].cj != NULL) ? e->sched.tasks[l].cj->hydro.count + : 0, + (e->sched.tasks[l].ci != NULL) ? e->sched.tasks[l].ci->grav.count + : 0, + (e->sched.tasks[l].cj != NULL) ? e->sched.tasks[l].cj->grav.count + : 0, + e->sched.tasks[l].flags, e->sched.tasks[l].sid); + } + count++; + } + fclose(file_thread); + } + + /* And we wait for all to synchronize. */ + MPI_Barrier(MPI_COMM_WORLD); + } + +#else + /* Non-MPI, so just a single engine's worth of tasks to dump. */ + char dumpfile[32]; + snprintf(dumpfile, sizeof(dumpfile), "thread_info-step%d.dat", step); + FILE *file_thread; + file_thread = fopen(dumpfile, "w"); + + /* Add some information to help with the plots and conversion of ticks to + * seconds. 
*/ + fprintf(file_thread, " %d %d %d %d %lld %lld %lld %lld %lld %d %lld\n", -2, + -1, -1, 1, e->tic_step, e->toc_step, e->updates, e->g_updates, + e->s_updates, 0, cpufreq); + for (int l = 0; l < e->sched.nr_tasks; l++) { + if (!e->sched.tasks[l].implicit && e->sched.tasks[l].toc != 0) { + fprintf( + file_thread, " %i %i %i %i %lli %lli %i %i %i %i %i\n", + e->sched.tasks[l].rid, e->sched.tasks[l].type, + e->sched.tasks[l].subtype, (e->sched.tasks[l].cj == NULL), + e->sched.tasks[l].tic, e->sched.tasks[l].toc, + (e->sched.tasks[l].ci == NULL) ? 0 + : e->sched.tasks[l].ci->hydro.count, + (e->sched.tasks[l].cj == NULL) ? 0 + : e->sched.tasks[l].cj->hydro.count, + (e->sched.tasks[l].ci == NULL) ? 0 : e->sched.tasks[l].ci->grav.count, + (e->sched.tasks[l].cj == NULL) ? 0 : e->sched.tasks[l].cj->grav.count, + e->sched.tasks[l].sid); + } + } + fclose(file_thread); +#endif // WITH_MPI +#endif // SWIFT_DEBUG_TASKS +} + +/** + * @brief Generate simple statistics about the times used by the tasks of + * all the engines and write these into two format, a human readable + * version for debugging and one intented for inclusion as the fixed + * costs for repartitioning. + * + * Note that when running under MPI all the tasks can be summed into this single + * file. In the fuller, human readable file, the statistics included are the + * number of task of each type/subtype followed by the minimum, maximum, mean + * and total time, in millisec and then the fixed costs value. + * + * If header is set, only the fixed costs value is written into the output + * file in a format that is suitable for inclusion in SWIFT (as + * partition_fixed_costs.h). + * + * @param dumpfile name of the file for the output. + * @param e the #engine + * @param header whether to write a header include file. + * @param allranks do the statistics over all ranks, if not just the current + * one, only used if header is false. + */ +void task_dump_stats(const char *dumpfile, struct engine *e, int header, + int allranks) { + + /* Need arrays for sum, min and max across all types and subtypes. */ + double sum[task_type_count][task_subtype_count]; + double min[task_type_count][task_subtype_count]; + double max[task_type_count][task_subtype_count]; + int count[task_type_count][task_subtype_count]; + + for (int j = 0; j < task_type_count; j++) { + for (int k = 0; k < task_subtype_count; k++) { + sum[j][k] = 0.0; + count[j][k] = 0; + min[j][k] = DBL_MAX; + max[j][k] = 0.0; + } + } + + double total[1] = {0.0}; + for (int l = 0; l < e->sched.nr_tasks; l++) { + int type = e->sched.tasks[l].type; + + /* Skip implicit tasks, tasks that didn't run and MPI send/recv as these + * are not interesting (or meaningfully measured). */ + if (!e->sched.tasks[l].implicit && e->sched.tasks[l].toc != 0 && + type != task_type_send && type != task_type_recv) { + int subtype = e->sched.tasks[l].subtype; + + double dt = e->sched.tasks[l].toc - e->sched.tasks[l].tic; + sum[type][subtype] += dt; + count[type][subtype] += 1; + if (dt < min[type][subtype]) { + min[type][subtype] = dt; + } + if (dt > max[type][subtype]) { + max[type][subtype] = dt; + } + total[0] += dt; + } + } + +#ifdef WITH_MPI + if (allranks || header) { + /* Get these from all ranks for output from rank 0. Could wrap these into a + * single operation. */ + size_t size = task_type_count * task_subtype_count; + int res = MPI_Reduce((engine_rank == 0 ? 
MPI_IN_PLACE : sum), sum, size, + MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + if (res != MPI_SUCCESS) mpi_error(res, "Failed to reduce task sums"); + + res = MPI_Reduce((engine_rank == 0 ? MPI_IN_PLACE : count), count, size, + MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); + if (res != MPI_SUCCESS) mpi_error(res, "Failed to reduce task counts"); + + res = MPI_Reduce((engine_rank == 0 ? MPI_IN_PLACE : min), min, size, + MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); + if (res != MPI_SUCCESS) mpi_error(res, "Failed to reduce task minima"); + + res = MPI_Reduce((engine_rank == 0 ? MPI_IN_PLACE : max), max, size, + MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + if (res != MPI_SUCCESS) mpi_error(res, "Failed to reduce task maxima"); + + res = MPI_Reduce((engine_rank == 0 ? MPI_IN_PLACE : total), total, 1, + MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + if (res != MPI_SUCCESS) mpi_error(res, "Failed to reduce task total time"); + } + + if (!allranks || (engine_rank == 0 && (allranks || header))) { +#endif + + FILE *dfile = fopen(dumpfile, "w"); + if (header) { + fprintf(dfile, "/* use as src/partition_fixed_costs.h */\n"); + fprintf(dfile, "#define HAVE_FIXED_COSTS 1\n"); + } else { + fprintf(dfile, "# task ntasks min max sum mean percent fixed_cost\n"); + } + + for (int j = 0; j < task_type_count; j++) { + const char *taskID = taskID_names[j]; + for (int k = 0; k < task_subtype_count; k++) { + if (sum[j][k] > 0.0) { + double mean = sum[j][k] / (double)count[j][k]; + double perc = 100.0 * sum[j][k] / total[0]; + + /* Fixed cost is in .1ns as we want to compare between runs in + * some absolute units. */ + int fixed_cost = (int)(clocks_from_ticks(mean) * 10000.f); + if (header) { + fprintf(dfile, "repartition_costs[%d][%d] = %10d; /* %s/%s */\n", j, + k, fixed_cost, taskID, subtaskID_names[k]); + } else { + fprintf(dfile, + "%15s/%-10s %10d %14.4f %14.4f %14.4f %14.4f %14.4f %10d\n", + taskID, subtaskID_names[k], count[j][k], + clocks_from_ticks(min[j][k]), clocks_from_ticks(max[j][k]), + clocks_from_ticks(sum[j][k]), clocks_from_ticks(mean), perc, + fixed_cost); + } + } + } + } + fclose(dfile); +#ifdef WITH_MPI + } +#endif +} diff --git a/src/task.h b/src/task.h index 100ac225bd5956e8d59d6a197c1257cb3e796ebb..35b46bd383221d767e313fa38d838777e61f0a99 100644 --- a/src/task.h +++ b/src/task.h @@ -158,11 +158,6 @@ struct task { /*! Weight of the task */ float weight; -#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) - /*! Individual cost estimate for this task. */ - float cost; -#endif - /*! Number of tasks unlocked by this one */ short int nr_unlock_tasks; @@ -187,10 +182,10 @@ struct task { /*! Information about the direction of the pair task */ short int sid; +#endif /*! Start and end time of this task */ ticks tic, toc; -#endif #ifdef SWIFT_DEBUG_CHECKS /* When was this task last run? */ @@ -205,6 +200,9 @@ float task_overlap(const struct task *ta, const struct task *tb); int task_lock(struct task *t); void task_do_rewait(struct task *t); void task_print(const struct task *t); +void task_dump_all(struct engine *e, int step); +void task_dump_stats(const char *dumpfile, struct engine *e, int header, + int allranks); void task_get_full_name(int type, int subtype, char *name); void task_get_group_name(int type, int subtype, char *cluster);