diff --git a/.gitignore b/.gitignore index 812aa06dd54ee8bb81e87796deb8ee05160fec72..893f786506ed168565fb995812136bcb7f1305f1 100644 --- a/.gitignore +++ b/.gitignore @@ -26,11 +26,13 @@ examples/*/*.xmf examples/*/*.hdf5 examples/*/*.png examples/*/*.txt +examples/*/*.dot examples/*/used_parameters.yml examples/*/*/*.xmf examples/*/*/*.hdf5 examples/*/*/*.png examples/*/*/*.txt +examples/*/*/*.dot examples/*/*/used_parameters.yml examples/*/gravity_checks_*.dat diff --git a/INSTALL.swift b/INSTALL.swift index 90096bc1b88d34ab86c04f623f1d3f04ca5a2997..fd960200117972d72ec5144250c51fa5126f09f4 100644 --- a/INSTALL.swift +++ b/INSTALL.swift @@ -75,6 +75,15 @@ also be switched off for benchmarking purposes. To do so, you can use: ./configure --disable-vec +Please note that to build SWIFT on macOS, you will need to configure +using + + ./configure --disable-compiler-warnings + +because the LLVM compiler on this platform incorrectly raises warnings +when the pthread flags are passed to the linker. + + Dependencies ============ diff --git a/configure.ac b/configure.ac index c0bb9e7a2b6fae10e998a76a5cf833575a3283ec..d95290309e285d9bb1fa212954c9a770071e886f 100644 --- a/configure.ac +++ b/configure.ac @@ -379,6 +379,19 @@ AX_PTHREAD([LIBS="$PTHREAD_LIBS $LIBS" CFLAGS="$CFLAGS $PTHREAD_CFLAGS" or use CPPFLAGS and LDFLAGS if the library is installed in a non-standard location.])) +# Check whether POSIX thread barriers are implemented (e.g. OSX does not have them) +have_pthread_barrier="no" +AC_CHECK_LIB(pthread, pthread_barrier_init, + have_pthread_barrier="yes", + AC_MSG_WARN(POSIX implementation does not have barriers. SWIFT will use home-made ones.)) +if test "x$have_pthread_barrier" = "xyes"; then + AC_DEFINE([HAVE_PTHREAD_BARRIERS], [1], [The posix library implements barriers]) +fi + +# Check whether POSIX file allocation functions exist (e.g. OSX does not have them) +AC_CHECK_LIB(pthread, posix_fallocate, + AC_DEFINE([HAVE_POSIX_FALLOCATE], [1], [The posix library implements file allocation functions.]), + AC_MSG_WARN(POSIX implementation does not have file allocation functions.)) # Check for METIS. Note AX_LIB_METIS exists, but cannot be configured # to be default off (i.e. given no option it tries to locate METIS), so we @@ -548,6 +561,10 @@ if test "$with_hdf5" = "yes"; then fi AM_CONDITIONAL([HAVEPARALLELHDF5],[test "$have_parallel_hdf5" = "yes"]) +# Check for floating-point exceptions +AC_CHECK_FUNC(feenableexcept, AC_DEFINE([HAVE_FE_ENABLE_EXCEPT],[1], + [Defined if floating-point exceptions can be enabled using non-standard GNU functions.])) + # Check for setaffinity.
AC_CHECK_FUNC(pthread_setaffinity_np, AC_DEFINE([HAVE_SETAFFINITY],[1], [Defined if pthread_setaffinity_np exists.]) ) @@ -914,19 +931,20 @@ AC_MSG_RESULT([ $PACKAGE_NAME v.$PACKAGE_VERSION - Compiler : $CC - - vendor : $ax_cv_c_compiler_vendor - - version : $ax_cv_c_compiler_version - - flags : $CFLAGS - MPI enabled : $enable_mpi - HDF5 enabled : $with_hdf5 - - parallel : $have_parallel_hdf5 - Metis enabled : $have_metis - FFTW3 enabled : $have_fftw3 - libNUMA enabled : $have_numa - Using tcmalloc : $have_tcmalloc - Using jemalloc : $have_jemalloc - CPU profiler : $have_profiler + Compiler : $CC + - vendor : $ax_cv_c_compiler_vendor + - version : $ax_cv_c_compiler_version + - flags : $CFLAGS + MPI enabled : $enable_mpi + HDF5 enabled : $with_hdf5 + - parallel : $have_parallel_hdf5 + Metis enabled : $have_metis + FFTW3 enabled : $have_fftw3 + libNUMA enabled : $have_numa + Using tcmalloc : $have_tcmalloc + Using jemalloc : $have_jemalloc + CPU profiler : $have_profiler + Pthread barriers : $have_pthread_barrier Hydro scheme : $with_hydro Dimensionality : $with_dimension diff --git a/examples/DiscPatch/HydroStatic/disc-patch-icc.yml b/examples/DiscPatch/HydroStatic/disc-patch-icc.yml index 6f17cfbb1e0125faf8e47fe4e9e55bfdf4df7b71..2859599c03c44ff6a15522164a8e2a407bdde41e 100644 --- a/examples/DiscPatch/HydroStatic/disc-patch-icc.yml +++ b/examples/DiscPatch/HydroStatic/disc-patch-icc.yml @@ -31,6 +31,9 @@ SPH: max_ghost_iterations: 30 # Maximal number of iterations allowed to converge towards the smoothing length. h_max: 60. # Maximal smoothing length allowed (in internal units). +EoS: + isothermal_internal_energy: 20.267845 + # Parameters related to the initial conditions InitialConditions: file_name: Disc-Patch.hdf5 # The file to read diff --git a/examples/DiscPatch/HydroStatic/makeIC.py b/examples/DiscPatch/HydroStatic/makeIC.py index 11b482059b494fc9a6b9447fdfe2e7ec543d52ff..83b07e50d59869c51fd3854a7f15cfd7d476d04d 100644 --- a/examples/DiscPatch/HydroStatic/makeIC.py +++ b/examples/DiscPatch/HydroStatic/makeIC.py @@ -202,7 +202,5 @@ print "--- Runtime parameters (YAML file): ---" print "DiscPatchPotential:surface_density: ", surface_density print "DiscPatchPotential:scale_height: ", scale_height print "DiscPatchPotential:x_disc: ", 0.5 * boxSize_x +print "EoS:isothermal_internal_energy: %e"%u_therm print "" - -print "--- Constant parameters: ---" -print "const_isothermal_internal_energy: %ef"%u_therm diff --git a/examples/analyse_dump_cells.py b/examples/analyse_dump_cells.py new file mode 100755 index 0000000000000000000000000000000000000000..3140e799566c75fe494a75895db0f4a8dcff4e57 --- /dev/null +++ b/examples/analyse_dump_cells.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python +""" +Usage: + analyse_dump_cells.py nx ny nz cell<1>.dat cell<2>.dat ... + +Analyses a number of output files created by calls to the dumpCells() debug +function (presumably called in engine_step()) to output a list of active +top-level cells, identifying those that are on the edges of the volumes being +processed by the various nodes. The point is that these should be minimised to +reduce the MPI communications. + +The nx, ny and nz arguments are the number of cells in the complete space; +we need these so that we can wrap around the edge of space correctly. + +This file is part of SWIFT. +Copyright (c) 2017 Peter W.
Draper (p.w.draper@durham.ac.uk) + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published +by the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. +""" +import pylab as pl +import numpy as np +import sys +import pandas + +xcol = 0 +ycol = 1 +zcol = 2 +xwcol = 3 +ywcol = 4 +zwcol = 5 +localcol = 18 +supercol = 15 +activecol = 16 + +# Command-line arguments. +if len(sys.argv) < 5: + print "usage: ", sys.argv[0], " nx ny nz cell1.dat cell2.dat ..." + sys.exit(1) +nx = int(sys.argv[1]) +ny = int(sys.argv[2]) +nz = int(sys.argv[3]) + +print "# x y z onedge" +allactives = [] +onedge = 0 +tcount = 0 +for i in range(4, len(sys.argv)): + + # Read the file. + data = pl.loadtxt(sys.argv[i]) + #print data + + # Select cells that are on the current rank and are super cells. + rdata = data[data[:,localcol] == 1] + tdata = rdata[rdata[:,supercol] == 1] + + # Separation of the cells is in data. + xwidth = tdata[0,xwcol] + ywidth = tdata[0,ywcol] + zwidth = tdata[0,zwcol] + + # Fill space nx, ny, nz with all top-level cells and flag their active + # state. + space = np.zeros((nx,ny,nz)) + actives = [] + for line in tdata: + ix = int(np.rint(line[xcol] / xwidth)) + iy = int(np.rint(line[ycol] / ywidth)) + iz = int(np.rint(line[zcol] / zwidth)) + active = int(line[activecol]) + space[ix,iy,iz] = 1 + active + tcount = tcount + 1 + if active == 1: + actives.append([ix, iy, iz, line]) + + # Report all active cells and flag any without 26 neighbours. These are + # on the edge of the partition volume and will have foreign neighbour + # cells. + for active in actives: + count = 0 + for ii in [-1, 0, 1]: + i = active[0] + ii + if i < 0: + i = i + nx + elif i >= nx: + i = i - nx + + for jj in [-1, 0, 1]: + j = active[1] + jj + if j < 0: + j = j + ny + elif j >= ny: + j = j - ny + + for kk in [-1, 0, 1]: + k = active[2] + kk + if k < 0: + k = k + nz + elif k >= nz: + k = k - nz + if space[i, j, k] > 0: + count = count + 1 + if count < 27: + onedge = onedge + 1 + print active[3][0], active[3][1], active[3][2], 1 + else: + print active[3][0], active[3][1], active[3][2], 0 + + allactives.extend(actives) + +print "# top cells: ", tcount, " active: ", len(allactives), " on edge: ", onedge + +sys.exit(0) + diff --git a/examples/main.c b/examples/main.c index 0d87ea350793e0d089b41d3bd05a68cd97753ef9..31e6a5db089a919107e0f768f2e23b60a3520dd0 100644 --- a/examples/main.c +++ b/examples/main.c @@ -203,7 +203,11 @@ int main(int argc, char *argv[]) { with_drift_all = 1; break; case 'e': +#ifdef HAVE_FE_ENABLE_EXCEPT with_fp_exceptions = 1; +#else + error("Need support for floating-point exceptions on this platform"); +#endif break; case 'f': if (sscanf(optarg, "%llu", &cpufreq) != 1) { @@ -379,11 +383,19 @@ int main(int argc, char *argv[]) { /* Do we choke on FP-exceptions ? */ if (with_fp_exceptions) { +#ifdef HAVE_FE_ENABLE_EXCEPT feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); +#endif if (myrank == 0) message("WARNING: Floating point exceptions will be reported."); } +/* Do we have slow barriers?
*/ +#ifndef HAVE_PTHREAD_BARRIERS + if (myrank == 0) + message("WARNING: Non-optimal thread barriers are being used."); +#endif + /* How large are the parts? */ if (myrank == 0) { message("sizeof(part) is %4zi bytes.", sizeof(struct part)); @@ -463,6 +475,7 @@ int main(int argc, char *argv[]) { /* Initialise the hydro properties */ struct hydro_props hydro_properties; if (with_hydro) hydro_props_init(&hydro_properties, params); + if (with_hydro) eos_init(&eos, params); /* Initialise the gravity properties */ struct gravity_props gravity_properties; @@ -689,9 +702,9 @@ int main(int argc, char *argv[]) { /* Legend */ if (myrank == 0) - printf("# %6s %14s %14s %10s %10s %10s %16s [%s]\n", "Step", "Time", + printf("# %6s %14s %14s %12s %12s %12s %16s [%s] %6s\n", "Step", "Time", "Time-step", "Updates", "g-Updates", "s-Updates", "Wall-clock time", - clocks_getunit()); + clocks_getunit(), "Props"); /* File for the timers */ if (with_verbose_timers) timers_open_file(myrank); diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml index 6327540c27753ea61a79d7fb4c16d60c5f00635d..948731d7879ec3e34f6e53f58dc72f6d7a6145ec 100644 --- a/examples/parameter_example.yml +++ b/examples/parameter_example.yml @@ -15,6 +15,7 @@ Scheduler: cell_split_size: 400 # (Optional) Maximal number of particles per cell (this is the default value). max_top_level_cells: 12 # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value). tasks_per_cell: 0 # (Optional) The average number of tasks per cell. If not large enough the simulation will fail (0 means guess). + mpi_message_limit: 4096 # (Optional) Maximum size, in KB, of an MPI task message that is sent without buffering. # Parameters governing the time integration (Set dt_min and dt_max to the same value for a fixed time-step run.) TimeIntegration: @@ -50,6 +51,9 @@ SPH: max_volume_change: 1.4 # (Optional) Maximal allowed change of kernel volume over one time-step. max_ghost_iterations: 30 # (Optional) Maximal number of iterations allowed to converge towards the smoothing length. +EoS: + isothermal_internal_energy: 20.26784 # Thermal energy per unit mass for the isothermal equation of state (in internal units). + # Parameters for the self-gravity scheme Gravity: eta: 0.025 # Constant dimensionless multiplier for time integration. @@ -76,8 +80,13 @@ DomainDecomposition: initial_grid_x: 10 # (Optional) Grid size if the "grid" strategy is chosen. initial_grid_y: 10 # "" initial_grid_z: 10 # "" - repartition_type: task_weights # (Optional) The re-decomposition strategy: "none", "task_weights", "particle_weights", - # "edge_task_weights" or "hybrid_weights". + + repartition_type: costs/costs # (Optional) The re-decomposition strategy, one of: + # "none/none", "costs/costs", "counts/none", "none/costs", "counts/costs", + # "costs/time" or "none/time". + # These are vertex/edge weights with "costs" as task timing, "counts" as + # sum of particles and "time" as the expected time of the next updates. + trigger: 0.05 # (Optional) Fractional (<1) CPU time difference between MPI ranks required to trigger a # new decomposition, or number of steps (>1) between decompositions minfrac: 0.9 # (Optional) Fraction of all particles that should be updated in the previous step when @@ -117,7 +126,7 @@ SineWavePotential: amplitude: 10. # Amplitude of the sine wave (internal units) timestep_limit: 1. # Time-step dimensionless pre-factor. growth_time: 0.
# (Optional) Time for the potential to grow to its final size. - + # Parameters related to cooling function ---------------------------------------------- # Constant du/dt cooling function diff --git a/examples/plot_task_dependencies.sh b/examples/plot_task_dependencies.sh new file mode 100755 index 0000000000000000000000000000000000000000..77784d8a9cdd3720621c9ad35c4cfbdaf0167ff1 --- /dev/null +++ b/examples/plot_task_dependencies.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# Creates a graphic from the task graph file dependency_graph.dot. +# Requires the graphviz command "dot". + +if [ ! -e dependency_graph.dot ]; then + echo "Missing task-graph output 'dependency_graph.dot'! Cannot generate figure." +else + dot -Tpng dependency_graph.dot -o task_graph.png + echo "Output written to task_graph.png" +fi + +exit diff --git a/examples/process_cells b/examples/process_cells new file mode 100755 index 0000000000000000000000000000000000000000..eead4572387f2732cf0b86265f14d20039f73ae1 --- /dev/null +++ b/examples/process_cells @@ -0,0 +1,64 @@ +#!/bin/bash +# +# Usage: +# process_cells nx ny nz nprocess +# +# Description: +# Process all the cell dumps in the current directory. + +# Outputs a file per rank with the active cells identified and marked as to +# whether they are near an edge or not. Note this requires the number of cells +# per dimension of the space. +# +# Also outputs a graphic showing the fraction of active cells on edges +# for each step. + +# Handle command-line +if test "$4" = ""; then + echo "Usage: $0 nx ny nz nprocess" + exit 1 +fi +NX=$1 +NY=$2 +NZ=$3 +NPROCS=$4 + +# Locate script. +SCRIPTHOME=$(dirname "$0") + +# Find all files. Use version sort to get into correct order. +files=$(ls -v cells_*.dat) +if test $? != 0; then + echo "Failed to find any cell dump files" + exit 1 +fi + +# To construct the list of names we need the number of ranks. +ranks=$(ls -v cells_*.dat | sed 's,cells_\(.*\)_.*.dat,\1,' | sort | uniq | wc -l) +echo "Number of ranks = $ranks" + +# Now construct a list of files ordered by rank, not step. +files=$(ls cells_*.dat | sort -t "_" -k 3,3 -n | xargs -n 4) + +# Need number of steps. +nfiles=$(echo $files| wc -w) +echo "Number of files = $nfiles" +steps=$(( $nfiles / $ranks + 1 )) +echo "Number of steps = $steps" + +# And process them. +echo "Processing cell dump files..." +echo $files | xargs -P $NPROCS -n 4 /bin/bash -c "${SCRIPTHOME}/process_cells_helper $NX $NY $NZ \$0 \$1 \$2 \$3" + +# Create summary. +grep "top cells" step*-active-cells.dat | sort -h > active_cells.log + +# And plot of active cells to edge cells. +stilts plot2plane ifmt=ascii in=active_cells.log xmin=-0.1 xmax=1.1 ymin=0 ymax=$steps grid=1 \ legend=false xpix=600 ypix=500 xlabel="Edge cells/Active cells" ylabel="Step" \ layer1=mark x1="col9/1.0/col6" y1="index" size1=3 shading1=aux auxmap=rainbow \ aux=col6 auxfunc=log auxlabel="Active cells" layer2=histogram x2="col9/1.0/col6" \ color2=grey binsize2=0.01 phase2=0.5 barform2=semi_steps thick2=1 \ out=active_cells.png + +exit diff --git a/examples/process_cells_helper b/examples/process_cells_helper new file mode 100755 index 0000000000000000000000000000000000000000..a96bdba0de56e04c28ed6d8675c705241ce241d9 --- /dev/null +++ b/examples/process_cells_helper @@ -0,0 +1,12 @@ +#!/bin/bash + +# Helper for process_cells. + +# Locate script.
+SCRIPTHOME=$(dirname "$0") + +step=$(echo $4|sed 's,cells_\(.*\)_\(.*\).dat,\2,') +echo "${SCRIPTHOME}/analyse_dump_cells.py $* > step${step}-active-cells.dat" +${SCRIPTHOME}/analyse_dump_cells.py $* > step${step}-active-cells.dat + +exit diff --git a/m4/ax_cc_maxopt.m4 b/m4/ax_cc_maxopt.m4 index 93d5d6dcd78ff77c934f77ad0e1e02ef37873a37..0fcef66c7542fbb840e68d259550343a1a5601a1 100644 --- a/m4/ax_cc_maxopt.m4 +++ b/m4/ax_cc_maxopt.m4 @@ -146,6 +146,22 @@ if test "$ac_test_CFLAGS" != "set"; then fi ;; + clang) + # default optimization flags for clang on all systems + CFLAGS="-O3 -fomit-frame-pointer" + + # Always good optimisation to have + AX_CHECK_COMPILE_FLAG(-fstrict-aliasing, CFLAGS="$CFLAGS -fstrict-aliasing") + + # note that we enable "unsafe" fp optimization with other compilers, too + AX_CHECK_COMPILE_FLAG(-ffast-math, CFLAGS="$CFLAGS -ffast-math") + + # not all codes will benefit from this. + AX_CHECK_COMPILE_FLAG(-funroll-loops, CFLAGS="$CFLAGS -funroll-loops") + + AX_GCC_ARCHFLAG($acx_maxopt_portable) + ;; + gnu) # default optimization flags for gcc on all systems CFLAGS="-O3 -fomit-frame-pointer" @@ -155,7 +171,7 @@ if test "$ac_test_CFLAGS" != "set"; then # -fstrict-aliasing for gcc-2.95+ AX_CHECK_COMPILE_FLAG(-fstrict-aliasing, - CFLAGS="$CFLAGS -fstrict-aliasing") + CFLAGS="$CFLAGS -fstrict-aliasing") # note that we enable "unsafe" fp optimization with other compilers, too AX_CHECK_COMPILE_FLAG(-ffast-math, CFLAGS="$CFLAGS -ffast-math") diff --git a/m4/ax_gcc_archflag.m4 b/m4/ax_gcc_archflag.m4 index 298db809e82130b77758f5e66c29c276ceb8266a..6d4feb1e0aab816d0830dcdfdd6d84d61575c8b2 100644 --- a/m4/ax_gcc_archflag.m4 +++ b/m4/ax_gcc_archflag.m4 @@ -108,7 +108,8 @@ case $host_cpu in *3?6[[ae]]?:*:*:*) ax_gcc_arch="ivybridge core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;; *3?6[[cf]]?:*:*:*|*4?6[[56]]?:*:*:*) ax_gcc_arch="haswell core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;; *3?6d?:*:*:*|*4?6f?:*:*:*) ax_gcc_arch="broadwell core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;; - *9?6[[de]]?:*:*:*) ax_gcc_arch="kabylake core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;; + *4?6[[de]]?:*:*:*) ax_gcc_arch="skylake core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;; + *9?6[[de]]?:*:*:*) ax_gcc_arch="kabylake core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;; *1?6c?:*:*:*|*2?6[[67]]?:*:*:*|*3?6[[56]]?:*:*:*) ax_gcc_arch="bonnell atom core2 pentium-m pentium3 pentiumpro" ;; *3?67?:*:*:*|*[[45]]?6[[ad]]?:*:*:*) ax_gcc_arch="silvermont atom core2 pentium-m pentium3 pentiumpro" ;; *000?f[[012]]?:*:*:*|?f[[012]]?:*:*:*|f[[012]]?:*:*:*) ax_gcc_arch="pentium4 pentiumpro" ;; diff --git a/m4/ax_pthread.m4 b/m4/ax_pthread.m4 index d383ad5c6d6a5061370800bb1dc89b7a334c0638..5fbf9fe0d68616042f87a8365190211cb8ccfbf1 100644 --- a/m4/ax_pthread.m4 +++ b/m4/ax_pthread.m4 @@ -1,5 +1,5 @@ # =========================================================================== -# http://www.gnu.org/software/autoconf-archive/ax_pthread.html +# https://www.gnu.org/software/autoconf-archive/ax_pthread.html # =========================================================================== # # SYNOPSIS @@ -19,10 +19,10 @@ # is necessary on AIX to use the special cc_r compiler alias.) # # NOTE: You are assumed to not only compile your program with these flags, -# but also link it with them as well. e.g. you should link with +# but also to link with them as well. 
For example, you might link with # $PTHREAD_CC $CFLAGS $PTHREAD_CFLAGS $LDFLAGS ... $PTHREAD_LIBS $LIBS # -# If you are only building threads programs, you may wish to use these +# If you are only building threaded programs, you may wish to use these # variables in your default LIBS, CFLAGS, and CC: # # LIBS="$PTHREAD_LIBS $LIBS" @@ -30,8 +30,8 @@ # CC="$PTHREAD_CC" # # In addition, if the PTHREAD_CREATE_JOINABLE thread-attribute constant -# has a nonstandard name, defines PTHREAD_CREATE_JOINABLE to that name -# (e.g. PTHREAD_CREATE_UNDETACHED on AIX). +# has a nonstandard name, this macro defines PTHREAD_CREATE_JOINABLE to +# that name (e.g. PTHREAD_CREATE_UNDETACHED on AIX). # # Also HAVE_PTHREAD_PRIO_INHERIT is defined if pthread is found and the # PTHREAD_PRIO_INHERIT symbol is defined when compiling with @@ -67,7 +67,7 @@ # Public License for more details. # # You should have received a copy of the GNU General Public License along -# with this program. If not, see <http://www.gnu.org/licenses/>. +# with this program. If not, see <https://www.gnu.org/licenses/>. # # As a special exception, the respective Autoconf Macro's copyright owner # gives unlimited permission to copy, distribute and modify the configure @@ -82,35 +82,40 @@ # modified version of the Autoconf Macro, you may extend this special # exception to the GPL to apply to your modified version as well. -#serial 21 +#serial 24 AU_ALIAS([ACX_PTHREAD], [AX_PTHREAD]) AC_DEFUN([AX_PTHREAD], [ AC_REQUIRE([AC_CANONICAL_HOST]) +AC_REQUIRE([AC_PROG_CC]) +AC_REQUIRE([AC_PROG_SED]) AC_LANG_PUSH([C]) ax_pthread_ok=no # We used to check for pthread.h first, but this fails if pthread.h -# requires special compiler flags (e.g. on True64 or Sequent). +# requires special compiler flags (e.g. on Tru64 or Sequent). # It gets checked for in the link test anyway. # First of all, check if the user has set any of the PTHREAD_LIBS, # etcetera environment variables, and if threads linking works using # them: -if test x"$PTHREAD_LIBS$PTHREAD_CFLAGS" != x; then - save_CFLAGS="$CFLAGS" +if test "x$PTHREAD_CFLAGS$PTHREAD_LIBS" != "x"; then + ax_pthread_save_CC="$CC" + ax_pthread_save_CFLAGS="$CFLAGS" + ax_pthread_save_LIBS="$LIBS" + AS_IF([test "x$PTHREAD_CC" != "x"], [CC="$PTHREAD_CC"]) CFLAGS="$CFLAGS $PTHREAD_CFLAGS" - save_LIBS="$LIBS" LIBS="$PTHREAD_LIBS $LIBS" - AC_MSG_CHECKING([for pthread_join in LIBS=$PTHREAD_LIBS with CFLAGS=$PTHREAD_CFLAGS]) - AC_TRY_LINK_FUNC([pthread_join], [ax_pthread_ok=yes]) + AC_MSG_CHECKING([for pthread_join using $CC $PTHREAD_CFLAGS $PTHREAD_LIBS]) + AC_LINK_IFELSE([AC_LANG_CALL([], [pthread_join])], [ax_pthread_ok=yes]) AC_MSG_RESULT([$ax_pthread_ok]) - if test x"$ax_pthread_ok" = xno; then + if test "x$ax_pthread_ok" = "xno"; then PTHREAD_LIBS="" PTHREAD_CFLAGS="" fi - LIBS="$save_LIBS" - CFLAGS="$save_CFLAGS" + CC="$ax_pthread_save_CC" + CFLAGS="$ax_pthread_save_CFLAGS" + LIBS="$ax_pthread_save_LIBS" fi # We must check for the threads library under a number of different @@ -123,7 +128,7 @@ fi # which indicates that we try without any flags at all, and "pthread-config" # which is a program returning the flags for the Pth emulation library. -ax_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config" +ax_pthread_flags="pthreads none -Kthread -pthread -pthreads -mthreads pthread --thread-safe -mt pthread-config" # The ordering *is* (sometimes) important. 
Some notes on the # individual items follow: @@ -132,82 +137,225 @@ ax_pthread_flags="pthreads none -Kthread -kthread lthread -pthread -pthreads -mt # none: in case threads are in libc; should be tried before -Kthread and # other compiler flags to prevent continual compiler warnings # -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h) -# -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able) -# lthread: LinuxThreads port on FreeBSD (also preferred to -pthread) -# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads) -# -pthreads: Solaris/gcc -# -mthreads: Mingw32/gcc, Lynx/gcc +# -pthread: Linux/gcc (kernel threads), BSD/gcc (userland threads), Tru64 +# (Note: HP C rejects this with "bad form for `-t' option") +# -pthreads: Solaris/gcc (Note: HP C also rejects) # -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it -# doesn't hurt to check since this sometimes defines pthreads too; -# also defines -D_REENTRANT) -# ... -mt is also the pthreads flag for HP/aCC +# doesn't hurt to check since this sometimes defines pthreads and +# -D_REENTRANT too), HP C (must be checked before -lpthread, which +# is present but should not be used directly; and before -mthreads, +# because the compiler interprets this as "-mt" + "-hreads") +# -mthreads: Mingw32/gcc, Lynx/gcc # pthread: Linux, etcetera # --thread-safe: KAI C++ # pthread-config: use pthread-config program (for GNU Pth library) -case ${host_os} in +case $host_os in + + freebsd*) + + # -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able) + # lthread: LinuxThreads port on FreeBSD (also preferred to -pthread) + + ax_pthread_flags="-kthread lthread $ax_pthread_flags" + ;; + + hpux*) + + # From the cc(1) man page: "[-mt] Sets various -D flags to enable + # multi-threading and also sets -lpthread." + + ax_pthread_flags="-mt -pthread pthread $ax_pthread_flags" + ;; + + openedition*) + + # IBM z/OS requires a feature-test macro to be defined in order to + # enable POSIX threads at all, so give the user a hint if this is + # not set. (We don't define these ourselves, as they can affect + # other portions of the system API in unpredictable ways.) + + AC_EGREP_CPP([AX_PTHREAD_ZOS_MISSING], + [ +# if !defined(_OPEN_THREADS) && !defined(_UNIX03_THREADS) + AX_PTHREAD_ZOS_MISSING +# endif + ], + [AC_MSG_WARN([IBM z/OS requires -D_OPEN_THREADS or -D_UNIX03_THREADS to enable pthreads support.])]) + ;; + solaris*) # On Solaris (at least, for some versions), libc contains stubbed # (non-functional) versions of the pthreads routines, so link-based - # tests will erroneously succeed. (We need to link with -pthreads/-mt/ - # -lpthread.) (The stubs are missing pthread_cleanup_push, or rather - # a function called by this macro, so we could check for that, but - # who knows whether they'll stub that too in a future libc.) So, - # we'll just look for -pthreads and -lpthread first: + # tests will erroneously succeed. (N.B.: The stubs are missing + # pthread_cleanup_push, or rather a function called by this macro, + # so we could check for that, but who knows whether they'll stub + # that too in a future libc.) So we'll check first for the + # standard Solaris way of linking pthreads (-mt -lpthread). + + ax_pthread_flags="-mt,pthread pthread $ax_pthread_flags" + ;; +esac + +# GCC generally uses -pthread, or -pthreads on some platforms (e.g. 
SPARC) - ax_pthread_flags="-pthreads pthread -mt -pthread $ax_pthread_flags" +AS_IF([test "x$GCC" = "xyes"], + [ax_pthread_flags="-pthread -pthreads $ax_pthread_flags"]) + +# The presence of a feature test macro requesting re-entrant function +# definitions is, on some systems, a strong hint that pthreads support is +# correctly enabled + +case $host_os in + darwin* | hpux* | linux* | osf* | solaris*) + ax_pthread_check_macro="_REENTRANT" ;; - darwin*) - ax_pthread_flags="-pthread $ax_pthread_flags" + aix*) + ax_pthread_check_macro="_THREAD_SAFE" + ;; + + *) + ax_pthread_check_macro="--" ;; esac +AS_IF([test "x$ax_pthread_check_macro" = "x--"], + [ax_pthread_check_cond=0], + [ax_pthread_check_cond="!defined($ax_pthread_check_macro)"]) + +# Are we compiling with Clang? + +AC_CACHE_CHECK([whether $CC is Clang], + [ax_cv_PTHREAD_CLANG], + [ax_cv_PTHREAD_CLANG=no + # Note that Autoconf sets GCC=yes for Clang as well as GCC + if test "x$GCC" = "xyes"; then + AC_EGREP_CPP([AX_PTHREAD_CC_IS_CLANG], + [/* Note: Clang 2.7 lacks __clang_[a-z]+__ */ +# if defined(__clang__) && defined(__llvm__) + AX_PTHREAD_CC_IS_CLANG +# endif + ], + [ax_cv_PTHREAD_CLANG=yes]) + fi + ]) +ax_pthread_clang="$ax_cv_PTHREAD_CLANG" + +ax_pthread_clang_warning=no + +# Clang needs special handling, because older versions handle the -pthread +# option in a rather... idiosyncratic way + +if test "x$ax_pthread_clang" = "xyes"; then + + # Clang takes -pthread; it has never supported any other flag + + # (Note 1: This will need to be revisited if a system that Clang + # supports has POSIX threads in a separate library. This tends not + # to be the way of modern systems, but it's conceivable.) + + # (Note 2: On some systems, notably Darwin, -pthread is not needed + # to get POSIX threads support; the API is always present and + # active. We could reasonably leave PTHREAD_CFLAGS empty. But + # -pthread does define _REENTRANT, and while the Darwin headers + # ignore this macro, third-party headers might not.) + + PTHREAD_CFLAGS="-pthread" + PTHREAD_LIBS= + + ax_pthread_ok=yes + + # However, older versions of Clang make a point of warning the user + # that, in an invocation where only linking and no compilation is + # taking place, the -pthread option has no effect ("argument unused + # during compilation"). They expect -pthread to be passed in only + # when source code is being compiled. + # + # Problem is, this is at odds with the way Automake and most other + # C build frameworks function, which is that the same flags used in + # compilation (CFLAGS) are also used in linking. Many systems + # supported by AX_PTHREAD require exactly this for POSIX threads + # support, and in fact it is often not straightforward to specify a + # flag that is used only in the compilation phase and not in + # linking. Such a scenario is extremely rare in practice. + # + # Even though use of the -pthread flag in linking would only print + # a warning, this can be a nuisance for well-run software projects + # that build with -Werror. So if the active version of Clang has + # this misfeature, we search for an option to squash it. 
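+# +# As an illustration only (the file name and compiler invocations here are +# hypothetical, not part of the macro): the probe built by the check below +# is an empty C program, compiled and linked in two separate steps so that +# the link step alone can trigger the warning under -Werror: +# +# clang -Werror -pthread -c conftest.c -o conftest.o (compile: quiet) +# clang -Werror -pthread conftest.o -o conftest (link: "argument unused") +# +# If the two-step link warns, it is retried with -Qunused-arguments and then +# -Wno-unused-command-line-argument prepended until one of them silences it.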
+ + AC_CACHE_CHECK([whether Clang needs flag to prevent "argument unused" warning when linking with -pthread], + [ax_cv_PTHREAD_CLANG_NO_WARN_FLAG], + [ax_cv_PTHREAD_CLANG_NO_WARN_FLAG=unknown + # Create an alternate version of $ac_link that compiles and + # links in two steps (.c -> .o, .o -> exe) instead of one + # (.c -> exe), because the warning occurs only in the second + # step + ax_pthread_save_ac_link="$ac_link" + ax_pthread_sed='s/conftest\.\$ac_ext/conftest.$ac_objext/g' + ax_pthread_link_step=`$as_echo "$ac_link" | sed "$ax_pthread_sed"` + ax_pthread_2step_ac_link="($ac_compile) && (echo ==== >&5) && ($ax_pthread_link_step)" + ax_pthread_save_CFLAGS="$CFLAGS" + for ax_pthread_try in '' -Qunused-arguments -Wno-unused-command-line-argument unknown; do + AS_IF([test "x$ax_pthread_try" = "xunknown"], [break]) + CFLAGS="-Werror -Wunknown-warning-option $ax_pthread_try -pthread $ax_pthread_save_CFLAGS" + ac_link="$ax_pthread_save_ac_link" + AC_LINK_IFELSE([AC_LANG_SOURCE([[int main(void){return 0;}]])], + [ac_link="$ax_pthread_2step_ac_link" + AC_LINK_IFELSE([AC_LANG_SOURCE([[int main(void){return 0;}]])], + [break]) + ]) + done + ac_link="$ax_pthread_save_ac_link" + CFLAGS="$ax_pthread_save_CFLAGS" + AS_IF([test "x$ax_pthread_try" = "x"], [ax_pthread_try=no]) + ax_cv_PTHREAD_CLANG_NO_WARN_FLAG="$ax_pthread_try" + ]) -# Clang doesn't consider unrecognized options an error unless we specify -# -Werror. We throw in some extra Clang-specific options to ensure that -# this doesn't happen for GCC, which also accepts -Werror. + case "$ax_cv_PTHREAD_CLANG_NO_WARN_FLAG" in + no | unknown) ;; + *) PTHREAD_CFLAGS="$ax_cv_PTHREAD_CLANG_NO_WARN_FLAG $PTHREAD_CFLAGS" ;; + esac -AC_MSG_CHECKING([if compiler needs -Werror to reject unknown flags]) -save_CFLAGS="$CFLAGS" -ax_pthread_extra_flags="-Werror" -CFLAGS="$CFLAGS $ax_pthread_extra_flags -Wunknown-warning-option -Wsizeof-array-argument" -AC_COMPILE_IFELSE([AC_LANG_PROGRAM([int foo(void);],[foo()])], - [AC_MSG_RESULT([yes])], - [ax_pthread_extra_flags= - AC_MSG_RESULT([no])]) -CFLAGS="$save_CFLAGS" +fi # $ax_pthread_clang = yes -if test x"$ax_pthread_ok" = xno; then -for flag in $ax_pthread_flags; do +if test "x$ax_pthread_ok" = "xno"; then +for ax_pthread_try_flag in $ax_pthread_flags; do - case $flag in + case $ax_pthread_try_flag in none) AC_MSG_CHECKING([whether pthreads work without any flags]) ;; + -mt,pthread) + AC_MSG_CHECKING([whether pthreads work with -mt -lpthread]) + PTHREAD_CFLAGS="-mt" + PTHREAD_LIBS="-lpthread" + ;; + -*) - AC_MSG_CHECKING([whether pthreads work with $flag]) - PTHREAD_CFLAGS="$flag" + AC_MSG_CHECKING([whether pthreads work with $ax_pthread_try_flag]) + PTHREAD_CFLAGS="$ax_pthread_try_flag" ;; pthread-config) AC_CHECK_PROG([ax_pthread_config], [pthread-config], [yes], [no]) - if test x"$ax_pthread_config" = xno; then continue; fi + AS_IF([test "x$ax_pthread_config" = "xno"], [continue]) PTHREAD_CFLAGS="`pthread-config --cflags`" PTHREAD_LIBS="`pthread-config --ldflags` `pthread-config --libs`" ;; *) - AC_MSG_CHECKING([for the pthreads library -l$flag]) - PTHREAD_LIBS="-l$flag" + AC_MSG_CHECKING([for the pthreads library -l$ax_pthread_try_flag]) + PTHREAD_LIBS="-l$ax_pthread_try_flag" ;; esac - save_LIBS="$LIBS" - save_CFLAGS="$CFLAGS" + ax_pthread_save_CFLAGS="$CFLAGS" + ax_pthread_save_LIBS="$LIBS" + CFLAGS="$CFLAGS $PTHREAD_CFLAGS" LIBS="$PTHREAD_LIBS $LIBS" - CFLAGS="$CFLAGS $PTHREAD_CFLAGS $ax_pthread_extra_flags" # Check for various functions. We must include pthread.h, # since some functions may be macros. 
(On the Sequent, we @@ -218,7 +366,11 @@ for flag in $ax_pthread_flags; do # pthread_cleanup_push because it is one of the few pthread # functions on Solaris that doesn't have a non-functional libc stub. # We try pthread_create on general principles. + AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h> +# if $ax_pthread_check_cond +# error "$ax_pthread_check_macro must be defined" +# endif static void routine(void *a) { a = 0; } static void *start_routine(void *a) { return a; }], [pthread_t th; pthread_attr_t attr; @@ -227,16 +379,14 @@ for flag in $ax_pthread_flags; do pthread_attr_init(&attr); pthread_cleanup_push(routine, 0); pthread_cleanup_pop(0) /* ; */])], - [ax_pthread_ok=yes], - []) + [ax_pthread_ok=yes], + []) - LIBS="$save_LIBS" - CFLAGS="$save_CFLAGS" + CFLAGS="$ax_pthread_save_CFLAGS" + LIBS="$ax_pthread_save_LIBS" AC_MSG_RESULT([$ax_pthread_ok]) - if test "x$ax_pthread_ok" = xyes; then - break; - fi + AS_IF([test "x$ax_pthread_ok" = "xyes"], [break]) PTHREAD_LIBS="" PTHREAD_CFLAGS="" @@ -244,71 +394,74 @@ done fi # Various other checks: -if test "x$ax_pthread_ok" = xyes; then - save_LIBS="$LIBS" - LIBS="$PTHREAD_LIBS $LIBS" - save_CFLAGS="$CFLAGS" +if test "x$ax_pthread_ok" = "xyes"; then + ax_pthread_save_CFLAGS="$CFLAGS" + ax_pthread_save_LIBS="$LIBS" CFLAGS="$CFLAGS $PTHREAD_CFLAGS" + LIBS="$PTHREAD_LIBS $LIBS" # Detect AIX lossage: JOINABLE attribute is called UNDETACHED. - AC_MSG_CHECKING([for joinable pthread attribute]) - attr_name=unknown - for attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do - AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h>], - [int attr = $attr; return attr /* ; */])], - [attr_name=$attr; break], - []) - done - AC_MSG_RESULT([$attr_name]) - if test "$attr_name" != PTHREAD_CREATE_JOINABLE; then - AC_DEFINE_UNQUOTED([PTHREAD_CREATE_JOINABLE], [$attr_name], - [Define to necessary symbol if this constant - uses a non-standard name on your system.]) - fi - - AC_MSG_CHECKING([if more special flags are required for pthreads]) - flag=no - case ${host_os} in - aix* | freebsd* | darwin*) flag="-D_THREAD_SAFE";; - osf* | hpux*) flag="-D_REENTRANT";; - solaris*) - if test "$GCC" = "yes"; then - flag="-D_REENTRANT" - else - # TODO: What about Clang on Solaris? 
- flag="-mt -D_REENTRANT" - fi - ;; - esac - AC_MSG_RESULT([$flag]) - if test "x$flag" != xno; then - PTHREAD_CFLAGS="$flag $PTHREAD_CFLAGS" - fi + AC_CACHE_CHECK([for joinable pthread attribute], + [ax_cv_PTHREAD_JOINABLE_ATTR], + [ax_cv_PTHREAD_JOINABLE_ATTR=unknown + for ax_pthread_attr in PTHREAD_CREATE_JOINABLE PTHREAD_CREATE_UNDETACHED; do + AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <pthread.h>], + [int attr = $ax_pthread_attr; return attr /* ; */])], + [ax_cv_PTHREAD_JOINABLE_ATTR=$ax_pthread_attr; break], + []) + done + ]) + AS_IF([test "x$ax_cv_PTHREAD_JOINABLE_ATTR" != "xunknown" && \ + test "x$ax_cv_PTHREAD_JOINABLE_ATTR" != "xPTHREAD_CREATE_JOINABLE" && \ + test "x$ax_pthread_joinable_attr_defined" != "xyes"], + [AC_DEFINE_UNQUOTED([PTHREAD_CREATE_JOINABLE], + [$ax_cv_PTHREAD_JOINABLE_ATTR], + [Define to necessary symbol if this constant + uses a non-standard name on your system.]) + ax_pthread_joinable_attr_defined=yes + ]) + + AC_CACHE_CHECK([whether more special flags are required for pthreads], + [ax_cv_PTHREAD_SPECIAL_FLAGS], + [ax_cv_PTHREAD_SPECIAL_FLAGS=no + case $host_os in + solaris*) + ax_cv_PTHREAD_SPECIAL_FLAGS="-D_POSIX_PTHREAD_SEMANTICS" + ;; + esac + ]) + AS_IF([test "x$ax_cv_PTHREAD_SPECIAL_FLAGS" != "xno" && \ + test "x$ax_pthread_special_flags_added" != "xyes"], + [PTHREAD_CFLAGS="$ax_cv_PTHREAD_SPECIAL_FLAGS $PTHREAD_CFLAGS" + ax_pthread_special_flags_added=yes]) AC_CACHE_CHECK([for PTHREAD_PRIO_INHERIT], - [ax_cv_PTHREAD_PRIO_INHERIT], [ - AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <pthread.h>]], - [[int i = PTHREAD_PRIO_INHERIT;]])], - [ax_cv_PTHREAD_PRIO_INHERIT=yes], - [ax_cv_PTHREAD_PRIO_INHERIT=no]) + [ax_cv_PTHREAD_PRIO_INHERIT], + [AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <pthread.h>]], + [[int i = PTHREAD_PRIO_INHERIT;]])], + [ax_cv_PTHREAD_PRIO_INHERIT=yes], + [ax_cv_PTHREAD_PRIO_INHERIT=no]) ]) - AS_IF([test "x$ax_cv_PTHREAD_PRIO_INHERIT" = "xyes"], - [AC_DEFINE([HAVE_PTHREAD_PRIO_INHERIT], [1], [Have PTHREAD_PRIO_INHERIT.])]) + AS_IF([test "x$ax_cv_PTHREAD_PRIO_INHERIT" = "xyes" && \ + test "x$ax_pthread_prio_inherit_defined" != "xyes"], + [AC_DEFINE([HAVE_PTHREAD_PRIO_INHERIT], [1], [Have PTHREAD_PRIO_INHERIT.]) + ax_pthread_prio_inherit_defined=yes + ]) - LIBS="$save_LIBS" - CFLAGS="$save_CFLAGS" + CFLAGS="$ax_pthread_save_CFLAGS" + LIBS="$ax_pthread_save_LIBS" # More AIX lossage: compile with *_r variant - if test "x$GCC" != xyes; then + if test "x$GCC" != "xyes"; then case $host_os in aix*) AS_CASE(["x/$CC"], - [x*/c89|x*/c89_128|x*/c99|x*/c99_128|x*/cc|x*/cc128|x*/xlc|x*/xlc_v6|x*/xlc128|x*/xlc128_v6], - [#handle absolute path differently from PATH based program lookup - AS_CASE(["x$CC"], - [x/*], - [AS_IF([AS_EXECUTABLE_P([${CC}_r])],[PTHREAD_CC="${CC}_r"])], - [AC_CHECK_PROGS([PTHREAD_CC],[${CC}_r],[$CC])])]) + [x*/c89|x*/c89_128|x*/c99|x*/c99_128|x*/cc|x*/cc128|x*/xlc|x*/xlc_v6|x*/xlc128|x*/xlc128_v6], + [#handle absolute path differently from PATH based program lookup + AS_CASE(["x$CC"], + [x/*], + [AS_IF([AS_EXECUTABLE_P([${CC}_r])],[PTHREAD_CC="${CC}_r"])], + [AC_CHECK_PROGS([PTHREAD_CC],[${CC}_r],[$CC])])]) ;; esac fi @@ -321,7 +474,7 @@ AC_SUBST([PTHREAD_CFLAGS]) AC_SUBST([PTHREAD_CC]) # Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND: -if test x"$ax_pthread_ok" = xyes; then +if test "x$ax_pthread_ok" = "xyes"; then ifelse([$1],,[AC_DEFINE([HAVE_PTHREAD],[1],[Define if you have POSIX threads libraries and header files.])],[$1]) : else diff --git a/src/Makefile.am b/src/Makefile.am index 
ec01184928faf3d58b2d0890965a745d05718354..e229d7f11e76aae4b6898a2f86145ec2bd025592 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -57,10 +57,10 @@ AM_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \ runner_doiact_fft.c threadpool.c cooling.c sourceterms.c \ statistics.c runner_doiact_vec.c profiler.c dump.c logger.c \ part_type.c xmf.c gravity_properties.c gravity.c \ - collectgroup.c hydro_space.c + collectgroup.c hydro_space.c equation_of_state.c # Include files for distribution, not installation. -nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h kernel_hydro.h kernel_gravity.h \ +nobase_noinst_HEADERS = align.h approx_math.h atomic.h barrier.h cycle.h error.h inline.h kernel_hydro.h kernel_gravity.h \ kernel_long_gravity.h vector.h cache.h runner_doiact.h runner_doiact_vec.h runner_doiact_grav.h runner_doiact_fft.h \ runner_doiact_nosort.h units.h intrinsics.h minmax.h kick.h timestep.h drift.h adiabatic_index.h io_properties.h \ dimension.h equation_of_state.h part_type.h periodic.h \ diff --git a/src/barrier.h b/src/barrier.h new file mode 100644 index 0000000000000000000000000000000000000000..dbe856b402580f859042023cd423bd955887f790 --- /dev/null +++ b/src/barrier.h @@ -0,0 +1,170 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (c) 2017 Matthieu Schaller (matthieu.schaller@durham.ac.uk). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ +#ifndef SWIFT_BARRIER_H +#define SWIFT_BARRIER_H + +/** + * @file barrier.h + * @brief Define the thread barriers if the POSIX implementation on this system + * does not provide them. + * + * Thread barriers are an optional part of the POSIX standard and are not + * necessarily implemented. One example is OSX, where the rest of POSIX + * exists but the barriers do not. + * We implement them here in a simple way to allow SWIFT to run on such + * systems, though this may lead to poorer performance. + * + * Note that we only define the three functions we need. This is a textbook + * implementation of a barrier that uses the common POSIX features (mutex, + * conditions and broadcasts). + * + * If the pthread barriers exist (Linux systems), we default to them. + */ + +/* Config parameters. */ +#include "../config.h" + +/* Standard headers */ +#include <pthread.h> + +/* Does this POSIX implementation provide barriers? */ +#ifdef HAVE_PTHREAD_BARRIERS + +#define swift_barrier_t pthread_barrier_t +#define swift_barrier_wait pthread_barrier_wait +#define swift_barrier_init pthread_barrier_init +#define swift_barrier_destroy pthread_barrier_destroy + +#else + +/* Local headers */ +#include "error.h" +#include "inline.h" + +/** + * @brief An ersatz of POSIX barriers to be used on systems that don't provide + * the real ones.
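+ * + * A sketch of the mechanism implemented below: each arriving thread locks + * the mutex and increments a counter; all but the last then block on the + * condition variable, while the last thread broadcasts the condition, + * resets the counter so that the barrier can be re-used, and returns 1 to + * identify itself as the serial thread (the role played by + * PTHREAD_BARRIER_SERIAL_THREAD in the POSIX version).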
+ */ +typedef struct { + + /*! Barrier mutex */ + pthread_mutex_t mutex; + + /*! Condition to open the barrier */ + pthread_cond_t condition; + + /*! Total number of threads */ + int limit; + + /*! Number of threads that reached the barrier */ + int count; + +} swift_barrier_t; + +/** + * @brief Initialise a barrier object. + * + * @param barrier The #swift_barrier_t to initialise + * @param unused Unused parameter (NULL) as we don't support barrier attributes. + * @param count The number of threads that will wait at the barrier. + */ +static INLINE int swift_barrier_init(swift_barrier_t *barrier, void *unused, + unsigned int count) { + /* Initialise the mutex */ + if (pthread_mutex_init(&barrier->mutex, 0) != 0) + error("Error initializing the barrier mutex"); + + /* Initialise the condition */ + if (pthread_cond_init(&barrier->condition, 0) != 0) + error("Error initializing the barrier condition"); + + barrier->limit = count; + barrier->count = 0; + + /* All is good */ + return 0; +} + +/** + * @brief Make a set of threads wait at the barrier + * + * Note that once all threads have reached the barrier, we also + * reset the barrier to a state where it is ready to be re-used + * without calling swift_barrier_init. + * + * @param barrier The (initialised) #swift_barrier_t to wait at. + */ +static INLINE int swift_barrier_wait(swift_barrier_t *barrier) { + + /* Start by locking the barrier */ + pthread_mutex_lock(&barrier->mutex); + + /* One more thread has gone home */ + barrier->count++; + + /* Are threads still running? */ + if (barrier->count < barrier->limit) { + + /* We need to make the thread wait until everyone is back */ + pthread_cond_wait(&barrier->condition, &(barrier->mutex)); + + /* Release the mutex */ + pthread_mutex_unlock(&barrier->mutex); + + /* Say that this was not the last thread */ + return 0; + + } else { /* Everybody is home */ + + /* Open the barrier (i.e. release the threads blocked on the condition) */ + pthread_cond_broadcast(&barrier->condition); + + /* Re-initialize the barrier */ + barrier->count = 0; + + /* Release the mutex */ + pthread_mutex_unlock(&barrier->mutex); + + /* Say that we are all done */ + return 1; + } +} + +/** + * @brief Destroy a barrier object + * + * Note that destroy must not be called while threads are still waiting + * at the barrier; we do not attempt to wait for the barrier to open + * before destroying it. + * + * @param barrier The #swift_barrier_t object to destroy.
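+ * + * @return Always 0, as for a successful call to pthread_barrier_destroy().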
+ */ +static INLINE int swift_barrier_destroy(swift_barrier_t *barrier) { + + /* Destroy the pthread things */ + pthread_cond_destroy(&barrier->condition); + pthread_mutex_destroy(&barrier->mutex); + + /* All is good */ + return 0; +} + +#endif /* HAVE_PTHREAD_BARRIERS */ + +#endif /* SWIFT_BARRIER_H */ diff --git a/src/cache.h b/src/cache.h index 3eb1e194dd4232319ac1d4a4323ca8099f044063..c939da28589c1421e0e4241dca124a8320d5c87b 100644 --- a/src/cache.h +++ b/src/cache.h @@ -349,10 +349,12 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]); z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]); h[i] = parts_i[idx].h; - m[i] = parts_i[idx].mass; vx[i] = parts_i[idx].v[0]; vy[i] = parts_i[idx].v[1]; vz[i] = parts_i[idx].v[2]; +#ifdef GADGET2_SPH + m[i] = parts_i[idx].mass; +#endif } #ifdef SWIFT_DEBUG_CHECKS @@ -431,10 +433,12 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]); zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]); hj[i] = parts_j[idx].h; - mj[i] = parts_j[idx].mass; vxj[i] = parts_j[idx].v[0]; vyj[i] = parts_j[idx].v[1]; vzj[i] = parts_j[idx].v[2]; +#ifdef GADGET2_SPH + mj[i] = parts_j[idx].mass; +#endif } #ifdef SWIFT_DEBUG_CHECKS @@ -572,15 +576,17 @@ cache_read_two_partial_cells_sorted_force( y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]); z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]); h[i] = parts_i[idx].h; - m[i] = parts_i[idx].mass; vx[i] = parts_i[idx].v[0]; vy[i] = parts_i[idx].v[1]; vz[i] = parts_i[idx].v[2]; +#ifdef GADGET2_SPH + m[i] = parts_i[idx].mass; rho[i] = parts_i[idx].rho; grad_h[i] = parts_i[idx].force.f; pOrho2[i] = parts_i[idx].force.P_over_rho2; balsara[i] = parts_i[idx].force.balsara; soundspeed[i] = parts_i[idx].force.soundspeed; +#endif } /* Pad cache with fake particles that exist outside the cell so will not @@ -635,15 +641,17 @@ cache_read_two_partial_cells_sorted_force( yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]); zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]); hj[i] = parts_j[idx].h; - mj[i] = parts_j[idx].mass; vxj[i] = parts_j[idx].v[0]; vyj[i] = parts_j[idx].v[1]; vzj[i] = parts_j[idx].v[2]; +#ifdef GADGET2_SPH + mj[i] = parts_j[idx].mass; rhoj[i] = parts_j[idx].rho; grad_hj[i] = parts_j[idx].force.f; pOrho2j[i] = parts_j[idx].force.P_over_rho2; balsaraj[i] = parts_j[idx].force.balsara; soundspeedj[i] = parts_j[idx].force.soundspeed; +#endif } /* Pad cache with fake particles that exist outside the cell so will not diff --git a/src/cell.c b/src/cell.c index 8d6c6b35c68b4af27ed7c836b1618e9d7857d9dd..9c5d1bd3aae8a0982db79c8fd370354c5c840dac 100644 --- a/src/cell.c +++ b/src/cell.c @@ -50,6 +50,7 @@ #include "active.h" #include "atomic.h" #include "drift.h" +#include "engine.h" #include "error.h" #include "gravity.h" #include "hydro.h" @@ -1862,6 +1863,7 @@ void cell_activate_subcell_external_grav_tasks(struct cell *ci, int cell_unskip_tasks(struct cell *c, struct scheduler *s) { struct engine *e = s->space->e; + const int nodeID = e->nodeID; int rebuild = 0; /* Un-skip the density tasks involved with this cell. */ @@ -1873,13 +1875,13 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) { const int cj_active = (cj != NULL) ? cell_is_active(cj, e) : 0; /* Only activate tasks that involve a local active cell. 
*/ - if ((ci_active && ci->nodeID == engine_rank) || - (cj_active && cj->nodeID == engine_rank)) { + if ((ci_active && ci->nodeID == nodeID) || + (cj_active && cj->nodeID == nodeID)) { scheduler_activate(s, t); /* Activate hydro drift */ if (t->type == task_type_self) { - if (ci->nodeID == engine_rank) cell_activate_drift_part(ci, s); + if (ci->nodeID == nodeID) cell_activate_drift_part(ci, s); } /* Set the correct sorting flags and activate hydro drifts */ @@ -1891,8 +1893,8 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) { cj->dx_max_sort_old = cj->dx_max_sort; /* Activate the drift tasks. */ - if (ci->nodeID == engine_rank) cell_activate_drift_part(ci, s); - if (cj->nodeID == engine_rank) cell_activate_drift_part(cj, s); + if (ci->nodeID == nodeID) cell_activate_drift_part(ci, s); + if (cj->nodeID == nodeID) cell_activate_drift_part(cj, s); /* Check the sorts and activate them if needed. */ cell_activate_sorts(ci, t->flags, s); @@ -1913,7 +1915,7 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) { #ifdef WITH_MPI /* Activate the send/recv tasks. */ - if (ci->nodeID != engine_rank) { + if (ci->nodeID != nodeID) { /* If the local cell is active, receive data from the foreign cell. */ if (cj_active) { @@ -1951,7 +1953,7 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) { /* If the local cell is active, send its ti_end values. */ if (cj_active) scheduler_activate_send(s, cj->send_ti, ci->nodeID); - } else if (cj->nodeID != engine_rank) { + } else if (cj->nodeID != nodeID) { /* If the local cell is active, receive data from the foreign cell. */ if (ci_active) { @@ -2001,8 +2003,8 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) { struct cell *cj = t->cj; /* Only activate tasks that involve a local active cell. */ - if ((cell_is_active(ci, e) && ci->nodeID == engine_rank) || - (cj != NULL && cell_is_active(cj, e) && cj->nodeID == engine_rank)) { + if ((cell_is_active(ci, e) && ci->nodeID == nodeID) || + (cj != NULL && cell_is_active(cj, e) && cj->nodeID == nodeID)) { scheduler_activate(s, t); /* Set the drifting flags */ @@ -2018,7 +2020,7 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) { } /* Unskip all the other task types. */ - if (c->nodeID == engine_rank && cell_is_active(c, e)) { + if (c->nodeID == nodeID && cell_is_active(c, e)) { for (struct link *l = c->gradient; l != NULL; l = l->next) scheduler_activate(s, l->t); diff --git a/src/common_io.c b/src/common_io.c index 1b0770b1132c8c2a62bbe0bcf3426f8f6eef4a17..47e04b868b9d610983b977bd6455c1c78d70bb41 100644 --- a/src/common_io.c +++ b/src/common_io.c @@ -40,10 +40,13 @@ #include "common_io.h" /* Local includes. */ +#include "engine.h" #include "error.h" #include "hydro.h" +#include "io_properties.h" #include "kernel_hydro.h" #include "part.h" +#include "threadpool.h" #include "units.h" #include "version.h" @@ -266,7 +269,6 @@ void io_write_attribute_f(hid_t grp, const char* name, float data) { * @param name The name of the attribute * @param data The value to write */ - void io_write_attribute_i(hid_t grp, const char* name, int data) { io_write_attribute(grp, name, INT, &data, 1); } @@ -295,16 +297,19 @@ void io_write_attribute_s(hid_t grp, const char* name, const char* str) { * @brief Reads the Unit System from an IC file. * @param h_file The (opened) HDF5 file from which to read. * @param us The unit_system to fill. + * @param mpi_rank The MPI rank we are on. 
* * If the 'Units' group does not exist in the ICs, cgs units will be assumed */ -void io_read_unit_system(hid_t h_file, struct unit_system* us) { +void io_read_unit_system(hid_t h_file, struct unit_system* us, int mpi_rank) { /* First check if it exists as this is *not* required. */ const htri_t exists = H5Lexists(h_file, "/Units", H5P_DEFAULT); if (exists == 0) { - message("'Units' group not found in ICs. Assuming CGS unit system."); + + if (mpi_rank == 0) + message("'Units' group not found in ICs. Assuming CGS unit system."); /* Default to CGS */ us->UnitMass_in_cgs = 1.; @@ -319,7 +324,7 @@ void io_read_unit_system(hid_t h_file, struct unit_system* us) { exists); } - message("Reading IC units from ICs."); + if (mpi_rank == 0) message("Reading IC units from ICs."); hid_t h_grp = H5Gopen(h_file, "/Units", H5P_DEFAULT); /* Ok, Read the damn thing */ @@ -386,6 +391,7 @@ void io_write_code_description(hid_t h_file) { configuration_options()); io_write_attribute_s(h_grpcode, "CFLAGS", compilation_cflags()); io_write_attribute_s(h_grpcode, "HDF5 library version", hdf5_version()); + io_write_attribute_s(h_grpcode, "Thread barriers", thread_barrier_version()); #ifdef HAVE_FFTW io_write_attribute_s(h_grpcode, "FFTW library version", fftw3_version()); #endif @@ -400,6 +406,208 @@ void io_write_code_description(hid_t h_file) { H5Gclose(h_grpcode); } +/** + * @brief Mapper function to copy #part or #gpart fields into a buffer. + */ +void io_copy_mapper(void* restrict temp, int N, void* restrict extra_data) { + + const struct io_props props = *((const struct io_props*)extra_data); + const size_t typeSize = io_sizeof_type(props.type); + const size_t copySize = typeSize * props.dimension; + + /* How far are we with this chunk? */ + char* restrict temp_c = temp; + const ptrdiff_t delta = (temp_c - props.start_temp_c) / copySize; + + for (int k = 0; k < N; k++) { + memcpy(&temp_c[k * copySize], props.field + (delta + k) * props.partSize, + copySize); + } +} + +/** + * @brief Mapper function to copy #part into a buffer of floats using a + * conversion function. + */ +void io_convert_part_f_mapper(void* restrict temp, int N, + void* restrict extra_data) { + + const struct io_props props = *((const struct io_props*)extra_data); + const struct part* restrict parts = props.parts; + const struct engine* e = props.e; + const size_t dim = props.dimension; + + /* How far are we with this chunk? */ + float* restrict temp_f = temp; + const ptrdiff_t delta = (temp_f - props.start_temp_f) / dim; + + for (int i = 0; i < N; i++) + props.convert_part_f(e, parts + delta + i, &temp_f[i * dim]); +} + +/** + * @brief Mapper function to copy #part into a buffer of doubles using a + * conversion function. + */ +void io_convert_part_d_mapper(void* restrict temp, int N, + void* restrict extra_data) { + + const struct io_props props = *((const struct io_props*)extra_data); + const struct part* restrict parts = props.parts; + const struct engine* e = props.e; + const size_t dim = props.dimension; + + /* How far are we with this chunk? */ + double* restrict temp_d = temp; + const ptrdiff_t delta = (temp_d - props.start_temp_d) / dim; + + for (int i = 0; i < N; i++) + props.convert_part_d(e, parts + delta + i, &temp_d[i * dim]); +} + +/** + * @brief Mapper function to copy #gpart into a buffer of floats using a + * conversion function. 
+ */ +void io_convert_gpart_f_mapper(void* restrict temp, int N, + void* restrict extra_data) { + + const struct io_props props = *((const struct io_props*)extra_data); + const struct gpart* restrict gparts = props.gparts; + const struct engine* e = props.e; + const size_t dim = props.dimension; + + /* How far are we with this chunk? */ + float* restrict temp_f = temp; + const ptrdiff_t delta = (temp_f - props.start_temp_f) / dim; + + for (int i = 0; i < N; i++) + props.convert_gpart_f(e, gparts + delta + i, &temp_f[i * dim]); +} + +/** + * @brief Mapper function to copy #gpart into a buffer of doubles using a + * conversion function. + */ +void io_convert_gpart_d_mapper(void* restrict temp, int N, + void* restrict extra_data) { + + const struct io_props props = *((const struct io_props*)extra_data); + const struct gpart* restrict gparts = props.gparts; + const struct engine* e = props.e; + const size_t dim = props.dimension; + + /* How far are we with this chunk? */ + double* restrict temp_d = temp; + const ptrdiff_t delta = (temp_d - props.start_temp_d) / dim; + + for (int i = 0; i < N; i++) + props.convert_gpart_d(e, gparts + delta + i, &temp_d[i * dim]); +} + +/** + * @brief Copy the particle data into a temporary buffer ready for i/o. + * + * @param temp The buffer to be filled. Must be allocated and aligned properly. + * @param e The #engine. + * @param props The #io_props corresponding to the particle field we are + * copying. + * @param N The number of particles to copy + * @param internal_units The system of units used internally. + * @param snapshot_units The system of units used for the snapshots. + */ +void io_copy_temp_buffer(void* temp, const struct engine* e, + struct io_props props, size_t N, + const struct unit_system* internal_units, + const struct unit_system* snapshot_units) { + + const size_t typeSize = io_sizeof_type(props.type); + const size_t copySize = typeSize * props.dimension; + const size_t num_elements = N * props.dimension; + + /* Copy particle data to temporary buffer */ + if (props.conversion == 0) { /* No conversion */ + + /* Prepare some parameters */ + char* temp_c = temp; + props.start_temp_c = temp_c; + + /* Copy the whole thing into a buffer */ + threadpool_map((struct threadpool*)&e->threadpool, io_copy_mapper, temp_c, + N, copySize, 0, (void*)&props); + + } else { /* Converting particle to data */ + + if (props.convert_part_f != NULL) { + + /* Prepare some parameters */ + float* temp_f = temp; + props.start_temp_f = temp; + props.e = e; + + /* Copy the whole thing into a buffer */ + threadpool_map((struct threadpool*)&e->threadpool, + io_convert_part_f_mapper, temp_f, N, copySize, 0, + (void*)&props); + + } else if (props.convert_part_d != NULL) { + + /* Prepare some parameters */ + double* temp_d = temp; + props.start_temp_d = temp; + props.e = e; + + /* Copy the whole thing into a buffer */ + threadpool_map((struct threadpool*)&e->threadpool, + io_convert_part_d_mapper, temp_d, N, copySize, 0, + (void*)&props); + + } else if (props.convert_gpart_f != NULL) { + + /* Prepare some parameters */ + float* temp_f = temp; + props.start_temp_f = temp; + props.e = e; + + /* Copy the whole thing into a buffer */ + threadpool_map((struct threadpool*)&e->threadpool, + io_convert_gpart_f_mapper, temp_f, N, copySize, 0, + (void*)&props); + + } else if (props.convert_gpart_d != NULL) { + + /* Prepare some parameters */ + double* temp_d = temp; + props.start_temp_d = temp; + props.e = e; + + /* Copy the whole thing into a buffer */ + threadpool_map((struct 
threadpool*)&e->threadpool, + io_convert_gpart_d_mapper, temp_d, N, copySize, 0, + (void*)&props); + + } else { + error("Missing conversion function"); + } + } + + /* Unit conversion if necessary */ + const double factor = + units_conversion_factor(internal_units, snapshot_units, props.units); + if (factor != 1.) { + + /* message("Converting ! factor=%e", factor); */ + + if (io_is_double_precision(props.type)) { + swift_declare_aligned_ptr(double, temp_d, temp, IO_BUFFER_ALIGNMENT); + for (size_t i = 0; i < num_elements; ++i) temp_d[i] *= factor; + } else { + swift_declare_aligned_ptr(float, temp_f, temp, IO_BUFFER_ALIGNMENT); + for (size_t i = 0; i < num_elements; ++i) temp_f[i] *= factor; + } + } +} + #endif /* HAVE_HDF5 */ /* ------------------------------------------------------------------------------------------------ diff --git a/src/common_io.h b/src/common_io.h index 577d1664db20196c8d0c48ef5b5e4e960b0e195e..2e6d4247a5e2d0f558f6bfa601fd6c57d80e05d9 100644 --- a/src/common_io.h +++ b/src/common_io.h @@ -30,6 +30,11 @@ #define FIELD_BUFFER_SIZE 200 #define PARTICLE_GROUP_BUFFER_SIZE 50 #define FILENAME_BUFFER_SIZE 150 +#define IO_BUFFER_ALIGNMENT 1024 + +/* Avoid cyclic inclusion problems */ +struct io_props; +struct engine; #if defined(HAVE_HDF5) @@ -68,10 +73,15 @@ void io_write_attribute_s(hid_t grp, const char* name, const char* str); void io_write_code_description(hid_t h_file); -void io_read_unit_system(hid_t h_file, struct unit_system* us); +void io_read_unit_system(hid_t h_file, struct unit_system* us, int mpi_rank); void io_write_unit_system(hid_t h_grp, const struct unit_system* us, const char* groupName); +void io_copy_temp_buffer(void* temp, const struct engine* e, + const struct io_props props, size_t N, + const struct unit_system* internal_units, + const struct unit_system* snapshot_units); + #endif /* defined HDF5 */ void io_collect_dm_gparts(const struct gpart* const gparts, size_t Ntot, diff --git a/src/const.h b/src/const.h index c8060a2be51468a791e192a65a74f1a4d9bc8e30..dd7e1e267246c0f73e760191a071c6e24b96cfe8 100644 --- a/src/const.h +++ b/src/const.h @@ -36,9 +36,6 @@ /* Time integration constants. */ #define const_max_u_change 0.1f -/* Thermal energy per unit mass used as a constant for the isothermal EoS */ -#define const_isothermal_internal_energy 20.2678457288f - /* Type of gradients to use (GIZMO_SPH only) */ /* If no option is chosen, no gradients are used (first order scheme) */ //#define GRADIENTS_SPH diff --git a/src/debug.c b/src/debug.c index 17048a16d0f6415eb66e6427a11611a5bef6c649..402234a27ef50bbc38c06f61dea96e48ff02c59a 100644 --- a/src/debug.c +++ b/src/debug.c @@ -274,10 +274,13 @@ int checkCellhdxmax(const struct cell *c, int *depth) { * only. */ static void dumpCells_map(struct cell *c, void *data) { - uintptr_t *ldata = (uintptr_t *)data; + size_t *ldata = (size_t *)data; FILE *file = (FILE *)ldata[0]; struct engine *e = (struct engine *)ldata[1]; float ntasks = c->nr_tasks; + int active = (int)ldata[2]; + int mpiactive = (int)ldata[3]; + int pactive = (int)ldata[4]; #if SWIFT_DEBUG_CHECKS /* The c->nr_tasks field does not include all the tasks. So let's check this @@ -298,46 +301,94 @@ static void dumpCells_map(struct cell *c, void *data) { } #endif - /* Only locally active cells are dumped. 
*/ - if (c->count > 0 || c->gcount > 0 || c->scount > 0) - fprintf(file, - " %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %6d %6d %6d %6d " - "%6.1f %20lld %6d %6d %6d %6d\n", - c->loc[0], c->loc[1], c->loc[2], c->width[0], c->width[1], - c->width[2], c->count, c->gcount, c->scount, c->depth, ntasks, - c->ti_end_min, get_time_bin(c->ti_end_min), (c->super == c), - cell_is_active(c, e), c->nodeID); + /* Only cells with particles are dumped. */ + if (c->count > 0 || c->gcount > 0 || c->scount > 0) { + +/* In MPI mode we may only output cells with foreign partners. + * These define the edges of the partitions. */ +#if WITH_MPI + if (mpiactive) + mpiactive = (c->send_xv != NULL); + else + mpiactive = 1; +#else + mpiactive = 1; +#endif + + /* Active cells, otherwise all. */ + if (active) + active = cell_is_active(c, e); + else + active = 1; + + /* So output local super cells that are active and have MPI tasks as + * requested. */ + if (c->nodeID == e->nodeID && (c->super == c) && active && mpiactive) { + + /* If requested we work out how many particles are active in this cell. */ + int pactcount = 0; + if (pactive) { + const struct part *parts = c->parts; + for (int k = 0; k < c->count; k++) + if (part_is_active(&parts[k], e)) pactcount++; + struct gpart *gparts = c->gparts; + for (int k = 0; k < c->gcount; k++) + if (gpart_is_active(&gparts[k], e)) pactcount++; + struct spart *sparts = c->sparts; + for (int k = 0; k < c->scount; k++) + if (spart_is_active(&sparts[k], e)) pactcount++; + } + + fprintf(file, + " %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %6d %6d %6d %6d %6d %6d " + "%6.1f %20lld %6d %6d %6d %6d %6d\n", + c->loc[0], c->loc[1], c->loc[2], c->width[0], c->width[1], + c->width[2], e->step, c->count, c->gcount, c->scount, pactcount, + c->depth, ntasks, c->ti_end_min, get_time_bin(c->ti_end_min), + (c->super == c), cell_is_active(c, e), c->nodeID, + c->nodeID == e->nodeID); + } + } } /** * @brief Dump the location, depth, task counts and timebins and active state, - * for all cells to a simple text file. + * for all cells to a simple text file. A more costly count of the active + * particles in a cell can also be output. * - * @param prefix base output filename + * @param prefix base output filename, result is written to + * %prefix%_%rank%_%step%.dat + * @param active just output active cells. + * @param mpiactive just output MPI active cells, i.e. those with foreign cells. + * @param pactive also output a count of active particles. * @param s the space holding the cells to dump. + * @param rank node ID of MPI rank, or 0 if not relevant. + * @param step the current engine step, or some unique integer. */ -void dumpCells(const char *prefix, struct space *s) { +void dumpCells(const char *prefix, int active, int mpiactive, int pactive, + struct space *s, int rank, int step) { FILE *file = NULL; /* Name of output file. */ - static int nseq = 0; char fname[200]; - int uniq = atomic_inc(&nseq); - sprintf(fname, "%s_%03d.dat", prefix, uniq); - + sprintf(fname, "%s_%03d_%03d.dat", prefix, rank, step); file = fopen(fname, "w"); /* Header. 
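For reference, a typical invocation (the commented-out call in engine_step() later in this patch follows the same pattern) and the file it produces:

    /* Dump all local super-cells holding active particles, with per-cell
     * active-particle counts; on MPI rank 3 at step 12 this writes
     * "cells_003_012.dat". The flag values here are illustrative. */
    dumpCells("cells", /* active= */ 1, /* mpiactive= */ 0,
              /* pactive= */ 1, e->s, e->nodeID, e->step);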
*/ fprintf(file, - "# %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s " - "%20s %6s %6s %6s\n", - "x", "y", "z", "xw", "yw", "zw", "count", "gcount", "scount", "depth", - "tasks", "ti_end_min", "timebin", "issuper", "active", "rank"); + "# %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s " + "%20s %6s %6s %6s %6s %6s\n", + "x", "y", "z", "xw", "yw", "zw", "step", "count", "gcount", "scount", + "actcount", "depth", "tasks", "ti_end_min", "timebin", "issuper", + "active", "rank", "local"); - uintptr_t data[2]; + size_t data[5]; data[0] = (size_t)file; data[1] = (size_t)s->e; + data[2] = (size_t)active; + data[3] = (size_t)mpiactive; + data[4] = (size_t)pactive; space_map_cells_pre(s, 1, dumpCells_map, &data); fclose(file); } diff --git a/src/debug.h b/src/debug.h index 20ae8fc2c20fa458b7be82f75b2fbb3be9acd32f..2cff37fa6f6f03185dc769bb770fe3a98a424ce1 100644 --- a/src/debug.h +++ b/src/debug.h @@ -36,7 +36,8 @@ void printgParticle_single(struct gpart *gp); int checkSpacehmax(struct space *s); int checkCellhdxmax(const struct cell *c, int *depth); -void dumpCells(const char *prefix, struct space *s); +void dumpCells(const char *prefix, int active, int mpiactive, int pactive, + struct space *s, int rank, int step); #ifdef HAVE_METIS #include "metis.h" diff --git a/src/dump.c b/src/dump.c index 2c0cf221ebd897bab0d047c196ce8a2aeddc6eae..de5c5afb18bd2c3cb80fe983b0f88ee60350fd5e 100644 --- a/src/dump.c +++ b/src/dump.c @@ -20,6 +20,15 @@ /* Config parameters. */ #include "../config.h" +#ifdef HAVE_POSIX_FALLOCATE + +/* This object's header. */ +#include "dump.h" + +/* Local headers. */ +#include "atomic.h" +#include "error.h" + /* Some standard headers. */ #include <errno.h> #include <fcntl.h> @@ -29,13 +38,6 @@ #include <sys/types.h> #include <unistd.h> -/* This object's header. */ -#include "dump.h" - -/* Local headers. */ -#include "atomic.h" -#include "error.h" - /** * @brief Obtain a chunk of memory from a dump. * @@ -44,7 +46,6 @@ * @param offset The offset of the returned memory address within the dump file. * @return A pointer to the memory-mapped chunk of data. */ - void *dump_get(struct dump *d, size_t count, size_t *offset) { size_t local_offset = atomic_add(&d->count, count); *offset = local_offset + d->file_offset; @@ -54,7 +55,6 @@ void *dump_get(struct dump *d, size_t count, size_t *offset) { /** * @brief Ensure that at least size bytes are available in the #dump. */ - void dump_ensure(struct dump *d, size_t size) { /* If we have enough space already, just bail. */ @@ -88,7 +88,6 @@ void dump_ensure(struct dump *d, size_t size) { /** * @brief Flush the #dump to disk. */ - void dump_sync(struct dump *d) { if (msync(d->data, d->count, MS_SYNC) != 0) error("Failed to sync memory-mapped data."); @@ -97,7 +96,6 @@ void dump_sync(struct dump *d) { /** * @brief Finalize the #dump. */ - void dump_close(struct dump *d) { /* Unmap the data in memory. */ if (munmap(d->data, d->count) != 0) { @@ -121,7 +119,6 @@ void dump_close(struct dump *d) { * note that it will be overwritten. * @param size The initial buffer size for this #dump. */ - void dump_init(struct dump *d, const char *filename, size_t size) { /* Create the output file. 
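Taken together, the functions above form the whole life cycle of a dump: dump_init() creates and memory-maps the file, dump_ensure() grows the mapping when needed, dump_get() hands out chunks thread-safely through an atomic counter, and dump_sync()/dump_close() flush and unmap. A minimal sketch of a caller, with made-up sizes and payload, valid only on platforms where HAVE_POSIX_FALLOCATE is set after this patch:

    #include <string.h>
    #include "dump.h"

    /* Hypothetical caller of the dump API above; error handling happens
     * inside the dump_* calls themselves. */
    void dump_demo(void) {
      struct dump d;
      dump_init(&d, "demo.dump", 1024 * 1024); /* create + mmap the file */
      dump_ensure(&d, 1024);                   /* make sure 1 KiB is free */

      size_t offset;
      char *chunk = (char *)dump_get(&d, 6, &offset); /* atomic reservation */
      memcpy(chunk, "hello", 6); /* write straight into the mapped region */

      dump_sync(&d);  /* msync() the data to disk */
      dump_close(&d); /* munmap() and finish */
    }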
*/ @@ -151,3 +148,5 @@ void dump_init(struct dump *d, const char *filename, size_t size) { d->file_offset = 0; d->page_mask = page_mask; } + +#endif diff --git a/src/dump.h b/src/dump.h index a7e934218c271d2f82b99d39f278e5af3047be6e..6857aa3a008a27e0e8ed23854d84f848ee0ca2be 100644 --- a/src/dump.h +++ b/src/dump.h @@ -19,8 +19,13 @@ #ifndef SWIFT_DUMP_H #define SWIFT_DUMP_H -/* Includes. */ -#include "lock.h" +/* Config parameters. */ +#include "../config.h" + +#ifdef HAVE_POSIX_FALLOCATE /* Are we on a sensible platform? */ + +/* Standard headers */ +#include <stdlib.h> /* Some constants. */ #define dump_grow_ensure_factor 10 @@ -54,4 +59,6 @@ void dump_sync(struct dump *d); void dump_close(struct dump *d); void *dump_get(struct dump *d, size_t count, size_t *offset); +#endif /* HAVE_POSIX_FALLOCATE */ + #endif /* SWIFT_DUMP_H */ diff --git a/src/engine.c b/src/engine.c index 3f02b85f2b881522c8ff887a58b9813a7dd4d27b..88a86a74e9812f9e8eecbee286d7031b98707270 100644 --- a/src/engine.c +++ b/src/engine.c @@ -880,6 +880,9 @@ void engine_redistribute(struct engine *e) { nodeID, nr_parts, nr_sparts, nr_gparts, my_cells); } + /* Flag that a redistribute has taken place */ + e->step_props |= engine_step_prop_redistribute; + if (e->verbose) message("took %.3f %s.", clocks_from_ticks(getticks() - tic), clocks_getunit()); @@ -945,6 +948,9 @@ void engine_repartition(struct engine *e) { /* Tell the engine it should re-build whenever possible */ e->forcerebuild = 1; + /* Flag that a repartition has taken place */ + e->step_props |= engine_step_prop_repartition; + if (e->verbose) message("took %.3f %s.", clocks_from_ticks(getticks() - tic), clocks_getunit()); @@ -2273,9 +2279,8 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Self-interaction? */ else if (t->type == task_type_self && t->subtype == task_subtype_density) { - /* Make all density tasks depend on the drift and the sorts. */ + /* Make the self-density tasks depend on the drift only. */ scheduler_addunlock(sched, t->ci->super->drift_part, t); - scheduler_addunlock(sched, t->ci->super->sorts, t); #ifdef EXTRA_HYDRO_LOOP /* Start by constructing the task for the second and third hydro loop. */ @@ -3098,6 +3103,9 @@ void engine_rebuild(struct engine *e, int clean_h_values) { /* Print the status of the system */ // if (e->verbose) engine_print_task_counts(e); + /* Flag that a rebuild has taken place */ + e->step_props |= engine_step_prop_rebuild; + if (e->verbose) message("took %.3f %s.", clocks_from_ticks(getticks() - tic), clocks_getunit()); @@ -3153,10 +3161,10 @@ void engine_prepare(struct engine *e) { void engine_barrier(struct engine *e) { /* Wait at the wait barrier. */ - pthread_barrier_wait(&e->wait_barrier); + swift_barrier_wait(&e->wait_barrier); /* Wait at the run barrier. */ - pthread_barrier_wait(&e->run_barrier); + swift_barrier_wait(&e->run_barrier); } /** @@ -3472,7 +3480,7 @@ void engine_launch(struct engine *e) { atomic_inc(&e->sched.waiting); /* Cry havoc and let loose the dogs of war. */ - pthread_barrier_wait(&e->run_barrier); + swift_barrier_wait(&e->run_barrier); /* Load the tasks. */ scheduler_start(&e->sched); @@ -3484,7 +3492,7 @@ void engine_launch(struct engine *e) { pthread_mutex_unlock(&e->sched.sleep_mutex); /* Sit back and wait for the runners to come home. 
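The swift_barrier_* calls replacing pthread_barrier_* throughout this file exist because some POSIX implementations (notably OSX) lack pthread barriers, so barrier.h has to provide a home-made fallback. The standard way to build one from a mutex and a condition variable looks roughly like this (a sketch only; the actual src/barrier.h implementation may differ):

    #include <pthread.h>

    /* Classic condition-variable barrier; the cycle counter guards
     * against spurious wake-ups and reuse across steps. */
    typedef struct {
      pthread_mutex_t mutex;
      pthread_cond_t cond;
      int limit; /* threads taking part in the barrier */
      int count; /* threads arrived in the current cycle */
      int cycle; /* generation counter */
    } barrier_sketch_t;

    int barrier_sketch_wait(barrier_sketch_t *b) {
      pthread_mutex_lock(&b->mutex);
      const int cycle = b->cycle;
      if (++b->count == b->limit) { /* last thread in opens the gate */
        b->count = 0;
        b->cycle++;
        pthread_cond_broadcast(&b->cond);
      } else {
        while (cycle == b->cycle) pthread_cond_wait(&b->cond, &b->mutex);
      }
      pthread_mutex_unlock(&b->mutex);
      return 0;
    }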
*/ - pthread_barrier_wait(&e->wait_barrier); + swift_barrier_wait(&e->wait_barrier); if (e->verbose) message("took %.3f %s.", clocks_from_ticks(getticks() - tic), @@ -3527,10 +3535,9 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs, /* Print the number of active tasks ? */ if (e->verbose) engine_print_task_counts(e); - /* Init the particle hydro data (by hand). */ - for (size_t k = 0; k < s->nr_parts; k++) - hydro_init_part(&s->parts[k], &e->s->hs); - for (size_t k = 0; k < s->nr_gparts; k++) gravity_init_gpart(&s->gparts[k]); + /* Init the particle data (by hand). */ + space_init_parts(s, e->verbose); + space_init_gparts(s, e->verbose); /* Now, launch the calculation */ TIMER_TIC; @@ -3542,9 +3549,7 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs, if (e->nodeID == 0) message("Converting internal energy variable."); - /* Apply the conversion */ - for (size_t i = 0; i < s->nr_parts; ++i) - hydro_convert_quantities(&s->parts[i], &s->xparts[i]); + space_convert_quantities(e->s, e->verbose); /* Correct what we did (e.g. in PE-SPH, need to recompute rho_bar) */ if (hydro_need_extra_init_loop) { @@ -3577,10 +3582,9 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs, /* No drift this time */ engine_skip_drift(e); - /* Init the particle hydro data (by hand). */ - for (size_t k = 0; k < s->nr_parts; k++) - hydro_init_part(&s->parts[k], &e->s->hs); - for (size_t k = 0; k < s->nr_gparts; k++) gravity_init_gpart(&s->gparts[k]); + /* Init the particle data (by hand). */ + space_init_parts(e->s, e->verbose); + space_init_gparts(e->s, e->verbose); /* Print the number of active tasks ? */ if (e->verbose) engine_print_task_counts(e); @@ -3591,6 +3595,8 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs, gravity_exact_force_compute(e->s, e); #endif + if (e->nodeID == 0) scheduler_write_dependencies(&e->sched, e->verbose); + /* Run the 0th time-step */ engine_launch(e); @@ -3701,14 +3707,14 @@ void engine_step(struct engine *e) { if (e->nodeID == 0) { /* Print some information to the screen */ - printf(" %6d %14e %14e %10zu %10zu %10zu %21.3f\n", e->step, e->time, + printf(" %6d %14e %14e %12zu %12zu %12zu %21.3f %6d\n", e->step, e->time, e->timeStep, e->updates, e->g_updates, e->s_updates, - e->wallclock_time); + e->wallclock_time, e->step_props); fflush(stdout); - fprintf(e->file_timesteps, " %6d %14e %14e %10zu %10zu %10zu %21.3f\n", + fprintf(e->file_timesteps, " %6d %14e %14e %12zu %12zu %12zu %21.3f %6d\n", e->step, e->time, e->timeStep, e->updates, e->g_updates, - e->s_updates, e->wallclock_time); + e->s_updates, e->wallclock_time, e->step_props); fflush(e->file_timesteps); } @@ -3720,6 +3726,7 @@ void engine_step(struct engine *e) { e->time = e->ti_current * e->timeBase + e->timeBegin; e->timeOld = e->ti_old * e->timeBase + e->timeBegin; e->timeStep = (e->ti_current - e->ti_old) * e->timeBase; + e->step_props = engine_step_prop_none; /* Prepare the tasks to be launched, rebuild or repartition if needed. */ engine_prepare(e); @@ -3744,6 +3751,9 @@ void engine_step(struct engine *e) { /* Print the number of active tasks ? */ if (e->verbose) engine_print_task_counts(e); +/* Dump local cells and active particle counts. */ +/* dumpCells("cells", 0, 0, 1, e->s, e->nodeID, e->step); */ + #ifdef SWIFT_DEBUG_CHECKS /* Check that we have the correct total mass in the top-level multipoles */ size_t num_gpart_mpole = 0; @@ -3809,6 +3819,9 @@ void engine_step(struct engine *e) { /* ... 
and find the next output time */ engine_compute_next_snapshot_time(e); + + /* Flag that we dumped a snapshot */ + e->step_props |= engine_step_prop_snapshot; } /* Save some statistics */ @@ -3819,6 +3832,9 @@ /* and move on */ e->timeLastStatistics += e->deltaTimeStatistics; + + /* Flag that we dumped some statistics */ + e->step_props |= engine_step_prop_statistics; } /* Now apply all the collected time step updates and particle counts. */ @@ -4357,6 +4373,7 @@ void engine_init(struct engine *e, struct space *s, e->reparttype = reparttype; e->dump_snapshot = 0; e->save_stats = 0; + e->step_props = engine_step_prop_none; e->links = NULL; e->nr_links = 0; e->timeBegin = parser_get_param_double(params, "TimeIntegration:time_begin"); @@ -4517,7 +4534,7 @@ if (with_aff) engine_unpin(); #endif - if (with_aff) { + if (with_aff && nodeID == 0) { #ifdef HAVE_SETAFFINITY #ifdef WITH_MPI printf("[%04i] %s engine_init: cpu map is [ ", nodeID, @@ -4580,9 +4597,16 @@ e->hydro_properties->delta_neighbours, e->hydro_properties->eta_neighbours); - fprintf(e->file_timesteps, "# %6s %14s %14s %10s %10s %10s %16s [%s]\n", + fprintf(e->file_timesteps, + "# Step Properties: Rebuild=%d, Redistribute=%d, Repartition=%d, " + "Statistics=%d, Snapshot=%d\n", + engine_step_prop_rebuild, engine_step_prop_redistribute, + engine_step_prop_repartition, engine_step_prop_statistics, + engine_step_prop_snapshot); + + fprintf(e->file_timesteps, "# %6s %14s %14s %12s %12s %12s %16s [%s] %6s\n", "Step", "Time", "Time-step", "Updates", "g-Updates", "s-Updates", - "Wall-clock time", clocks_getunit()); + "Wall-clock time", clocks_getunit(), "Props"); fflush(e->file_timesteps); } @@ -4667,8 +4691,8 @@ void engine_init(struct engine *e, struct space *s, threadpool_init(&e->threadpool, e->nr_threads); /* First of all, init the barrier and lock it. */ - if (pthread_barrier_init(&e->wait_barrier, NULL, e->nr_threads + 1) != 0 || - pthread_barrier_init(&e->run_barrier, NULL, e->nr_threads + 1) != 0) + if (swift_barrier_init(&e->wait_barrier, NULL, e->nr_threads + 1) != 0 || + swift_barrier_init(&e->run_barrier, NULL, e->nr_threads + 1) != 0) error("Failed to initialize barrier."); /* Expected average for tasks per cell. If set to zero we use a heuristic @@ -4681,6 +4705,12 @@ scheduler_init(&e->sched, e->s, engine_estimate_nr_tasks(e), nr_queues, (policy & scheduler_flag_steal), e->nodeID, &e->threadpool); + /* Maximum size, in KiB, of MPI task messages that should not be buffered, + * i.e. messages that are sent using MPI_Issend rather than MPI_Isend. + * 4 KiB by default. + */ + e->sched.mpi_message_limit = + parser_get_opt_param_int(params, "Scheduler:mpi_message_limit", 4) * 1024; + /* Allocate and init the threads. */ if ((e->runners = (struct runner *)malloc(sizeof(struct runner) * e->nr_threads)) == NULL) @@ -4754,7 +4784,7 @@ void engine_init(struct engine *e, struct space *s, #endif /* Wait for the runner threads to be in place. */ - pthread_barrier_wait(&e->wait_barrier); + swift_barrier_wait(&e->wait_barrier); } /** diff --git a/src/engine.h b/src/engine.h index 2b5a4eee7cde974fc555460c6483d02b21bd6c04..f80589ad3fad508f48a6fc3eda676a929b824c6e 100644 --- a/src/engine.h +++ b/src/engine.h @@ -32,11 +32,8 @@ #include <mpi.h> #endif -/* Some standard headers. */ -#include <pthread.h> -#include <stdio.h> - /* Includes. 
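Because the new Props column is a bitmask, several events can be flagged on the same step: a value of 17, for instance, records a rebuild (1) together with a snapshot (16). A small decoding helper a post-processing tool might use (hypothetical, not part of the patch; bit values as printed in the header above):

    #include <stdio.h>

    /* Decode the Props bitmask written to the timesteps file. */
    void print_step_props(int props) {
      if (props & (1 << 0)) printf(" rebuild");
      if (props & (1 << 1)) printf(" redistribute");
      if (props & (1 << 2)) printf(" repartition");
      if (props & (1 << 3)) printf(" statistics");
      if (props & (1 << 4)) printf(" snapshot");
      printf("\n");
    }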
*/ +#include "barrier.h" #include "clocks.h" #include "collectgroup.h" #include "cooling_struct.h" @@ -51,7 +48,9 @@ #include "task.h" #include "units.h" -/* Some constants. */ +/** + * @brief The different policies the #engine can follow. + */ enum engine_policy { engine_policy_none = 0, engine_policy_rand = (1 << 0), @@ -74,7 +73,19 @@ enum engine_policy { #define engine_maxpolicy 15 extern const char *engine_policy_names[]; -#define engine_queue_scale 1.2 +/** + * @brief The different unusual events that can take place in a time-step. + */ +enum engine_step_properties { + engine_step_prop_none = 0, + engine_step_prop_rebuild = (1 << 0), + engine_step_prop_redistribute = (1 << 1), + engine_step_prop_repartition = (1 << 2), + engine_step_prop_statistics = (1 << 3), + engine_step_prop_snapshot = (1 << 4) +}; + +/* Some constants */ #define engine_maxproxies 64 #define engine_tasksreweight 1 #define engine_parts_size_grow 1.05 @@ -83,7 +94,9 @@ extern const char *engine_policy_names[]; #define engine_default_timesteps_file_name "timesteps" #define engine_max_parts_per_ghost 1000 -/* The rank of the engine as a global variable (for messages). */ +/** + * @brief The rank of the engine as a global variable (for messages). + */ extern int engine_rank; /* Data structure for the engine. */ @@ -143,9 +156,12 @@ struct engine { /* Maximal ti_beg for the next time-step */ integertime_t ti_beg_max; - /* Number of particles updated */ + /* Number of particles updated in the previous step */ size_t updates, g_updates, s_updates; + /* Properties of the previous step */ + int step_props; + /* Total numbers of particles in the system. */ size_t total_nr_parts, total_nr_gparts; @@ -175,8 +191,8 @@ struct engine { int count_step; /* Data for the threads' barrier. */ - pthread_barrier_t wait_barrier; - pthread_barrier_t run_barrier; + swift_barrier_t wait_barrier; + swift_barrier_t run_barrier; /* ID of the node this engine lives on. */ int nr_nodes, nodeID; diff --git a/src/equation_of_state.c b/src/equation_of_state.c new file mode 100644 index 0000000000000000000000000000000000000000..a792bc58cf13fa5f710dc1cd80713ff90e700b18 --- /dev/null +++ b/src/equation_of_state.c @@ -0,0 +1,67 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (c) 2017 Matthieu Schaller (matthieu.schaller@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ + +/* This object's header. 
*/ +#include "equation_of_state.h" + +/* local headers */ +#include "common_io.h" + +/* Equation of state for the physics model + * (temporary ugly solution as a global variable) */ +struct eos_parameters eos; + +void eos_init(struct eos_parameters *e, const struct swift_params *params) { + +#if defined(EOS_IDEAL_GAS) +/* nothing to do here */ +#elif defined(EOS_ISOTHERMAL_GAS) + e->isothermal_internal_energy = + parser_get_param_float(params, "EoS:isothermal_internal_energy"); +#endif +} + +void eos_print(const struct eos_parameters *e) { + +#if defined(EOS_IDEAL_GAS) + message("Equation of state: Ideal gas."); +#elif defined(EOS_ISOTHERMAL_GAS) + message( + "Equation of state: Isothermal with internal energy " + "per unit mass set to %f.", + e->isothermal_internal_energy); +#endif + + message("Adiabatic index gamma: %f.", hydro_gamma); +} + +#if defined(HAVE_HDF5) +void eos_print_snapshot(hid_t h_grpsph, const struct eos_parameters *e) { + + io_write_attribute_f(h_grpsph, "Adiabatic index", hydro_gamma); + +#if defined(EOS_IDEAL_GAS) + io_write_attribute_s(h_grpsph, "Equation of state", "Ideal gas"); +#elif defined(EOS_ISOTHERMAL_GAS) + io_write_attribute_s(h_grpsph, "Equation of state", "Isothermal gas"); + io_write_attribute_f(h_grpsph, "Thermal energy per unit mass", + e->isothermal_internal_energy); +#endif +} +#endif diff --git a/src/equation_of_state.h b/src/equation_of_state.h index e51ed99519dc9c418e34789fcce95b5f28d69a99..76b8bb922133c9c4f29453a14068fe3f9044d66f 100644 --- a/src/equation_of_state.h +++ b/src/equation_of_state.h @@ -37,9 +37,13 @@ #include "debug.h" #include "inline.h" +extern struct eos_parameters eos; + /* ------------------------------------------------------------------------- */ #if defined(EOS_IDEAL_GAS) +struct eos_parameters {}; + /** * @brief Returns the internal energy given density and entropy * @@ -172,10 +176,17 @@ __attribute__((always_inline)) INLINE static float gas_soundspeed_from_pressure( /* ------------------------------------------------------------------------- */ #elif defined(EOS_ISOTHERMAL_GAS) +struct eos_parameters { + + /*! Thermal energy per unit mass */ + float isothermal_internal_energy; +}; + /** * @brief Returns the internal energy given density and entropy * - * Since we are using an isothermal EoS, the entropy value is ignored + * Since we are using an isothermal EoS, the entropy and density values are + * ignored. * Computes \f$u = u_{cst}\f$. * * @param density The density \f$\rho\f$. @@ -184,12 +195,13 @@ __attribute__((always_inline)) INLINE static float gas_soundspeed_from_pressure( __attribute__((always_inline)) INLINE static float gas_internal_energy_from_entropy(float density, float entropy) { - return const_isothermal_internal_energy; + return eos.isothermal_internal_energy; } + /** * @brief Returns the pressure given density and entropy * - * Since we are using an isothermal EoS, the entropy value is ignored + * Since we are using an isothermal EoS, the entropy value is ignored. * Computes \f$P = (\gamma - 1)u_{cst}\rho\f$. * * @param density The density \f$\rho\f$. @@ -198,13 +210,14 @@ gas_internal_energy_from_entropy(float density, float entropy) { __attribute__((always_inline)) INLINE static float gas_pressure_from_entropy( float density, float entropy) { - return hydro_gamma_minus_one * const_isothermal_internal_energy * density; + return hydro_gamma_minus_one * eos.isothermal_internal_energy * density; } /** * @brief Returns the entropy given density and pressure. * - * Computes \f$A = \frac{P}{\rho^\gamma}\f$. 
+ * Since we are using an isothermal EoS, the pressure value is ignored. + * Computes \f$A = (\gamma - 1)u_{cst}\rho^{-(\gamma-1)}\f$. * * @param density The density \f$\rho\f$. * @param pressure The pressure \f$P\f$ (ignored). @@ -213,15 +226,16 @@ __attribute__((always_inline)) INLINE static float gas_pressure_from_entropy( __attribute__((always_inline)) INLINE static float gas_entropy_from_pressure( float density, float pressure) { - return hydro_gamma_minus_one * const_isothermal_internal_energy * + return hydro_gamma_minus_one * eos.isothermal_internal_energy * pow_minus_gamma_minus_one(density); } /** * @brief Returns the sound speed given density and entropy * - * Since we are using an isothermal EoS, the entropy value is ignored - * Computes \f$c = \sqrt{u_{cst} \gamma \rho^{\gamma-1}}\f$. + * Since we are using an isothermal EoS, the entropy and density values are + * ignored. + * Computes \f$c = \sqrt{u_{cst} \gamma (\gamma-1)}\f$. * * @param density The density \f$\rho\f$. * @param entropy The entropy \f$S\f$. @@ -229,14 +243,14 @@ __attribute__((always_inline)) INLINE static float gas_entropy_from_pressure( __attribute__((always_inline)) INLINE static float gas_soundspeed_from_entropy( float density, float entropy) { - return sqrtf(const_isothermal_internal_energy * hydro_gamma * + return sqrtf(eos.isothermal_internal_energy * hydro_gamma * hydro_gamma_minus_one); } /** * @brief Returns the entropy given density and internal energy * - * Since we are using an isothermal EoS, the energy value is ignored + * Since we are using an isothermal EoS, the energy value is ignored. * Computes \f$S = \frac{(\gamma - 1)u_{cst}}{\rho^{\gamma-1}}\f$. * * @param density The density \f$\rho\f$ @@ -245,14 +259,14 @@ __attribute__((always_inline)) INLINE static float gas_soundspeed_from_entropy( __attribute__((always_inline)) INLINE static float gas_entropy_from_internal_energy(float density, float u) { - return hydro_gamma_minus_one * const_isothermal_internal_energy * + return hydro_gamma_minus_one * eos.isothermal_internal_energy * pow_minus_gamma_minus_one(density); } /** * @brief Returns the pressure given density and internal energy * - * Since we are using an isothermal EoS, the energy value is ignored + * Since we are using an isothermal EoS, the energy value is ignored. * Computes \f$P = (\gamma - 1)u_{cst}\rho\f$. * * @param density The density \f$\rho\f$ @@ -261,7 +275,7 @@ gas_entropy_from_internal_energy(float density, float u) { __attribute__((always_inline)) INLINE static float gas_pressure_from_internal_energy(float density, float u) { - return hydro_gamma_minus_one * const_isothermal_internal_energy * density; + return hydro_gamma_minus_one * eos.isothermal_internal_energy * density; } /** @@ -275,14 +289,15 @@ gas_pressure_from_internal_energy(float density, float u) { */ __attribute__((always_inline)) INLINE static float gas_internal_energy_from_pressure(float density, float pressure) { - return const_isothermal_internal_energy; + return eos.isothermal_internal_energy; } /** * @brief Returns the sound speed given density and internal energy * - * Since we are using an isothermal EoS, the energy value is ignored - * Computes \f$c = \sqrt{u_{cst} \gamma \rho^{\gamma-1}}\f$. + * Since we are using an isothermal EoS, the energy and density values are + * ignored. + * Computes \f$c = \sqrt{u_{cst} \gamma (\gamma-1)}\f$. 
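The corrected formula in these comments follows in one line from the isothermal pressure: with \f$P = (\gamma - 1) u_{cst} \rho\f$, the adiabatic sound speed is \f$c^2 = \gamma P / \rho = \gamma (\gamma - 1) u_{cst}\f$, so the density dependence cancels exactly and the old documented expression \f$c = \sqrt{u_{cst} \gamma \rho^{\gamma-1}}\f$ was wrong. The same argument-independence is what lets the Gizmo changes later in this patch call gas_internal_energy_from_entropy(0.f, 0.f) purely to fetch the constant \f$u_{cst}\f$.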
* * @param density The density \f$\rho\f$ * @param u The internal energy \f$u\f$ @@ -290,15 +305,16 @@ gas_internal_energy_from_pressure(float density, float pressure) { __attribute__((always_inline)) INLINE static float gas_soundspeed_from_internal_energy(float density, float u) { - return sqrtf(const_isothermal_internal_energy * hydro_gamma * + return sqrtf(eos.isothermal_internal_energy * hydro_gamma * hydro_gamma_minus_one); } /** * @brief Returns the sound speed given density and pressure * - * Since we are using an isothermal EoS, the pressure value is ignored - * Computes \f$c = \sqrt{u_{cst} \gamma \rho^{\gamma-1}}\f$. + * Since we are using an isothermal EoS, the pressure and density values are + * ignored. + * Computes \f$c = \sqrt{u_{cst} \gamma (\gamma-1)}\f$. * * @param density The density \f$\rho\f$ * @param P The pressure \f$P\f$ @@ -306,7 +322,7 @@ gas_soundspeed_from_internal_energy(float density, float u) { __attribute__((always_inline)) INLINE static float gas_soundspeed_from_pressure( float density, float P) { - return sqrtf(const_isothermal_internal_energy * hydro_gamma * + return sqrtf(eos.isothermal_internal_energy * hydro_gamma * hydro_gamma_minus_one); } @@ -317,4 +333,11 @@ __attribute__((always_inline)) INLINE static float gas_soundspeed_from_pressure( #endif +void eos_init(struct eos_parameters *e, const struct swift_params *params); +void eos_print(const struct eos_parameters *e); + +#if defined(HAVE_HDF5) +void eos_print_snapshot(hid_t h_grpsph, const struct eos_parameters *e); +#endif + #endif /* SWIFT_EQUATION_OF_STATE_H */ diff --git a/src/gravity/Default/gravity_io.h b/src/gravity/Default/gravity_io.h index 26eed8a0a680e13130dd257d8b31e6cd00392f6d..671ccc24e95f322d71a734c0a39eface2a93bcec 100644 --- a/src/gravity/Default/gravity_io.h +++ b/src/gravity/Default/gravity_io.h @@ -21,6 +21,20 @@ #include "io_properties.h" +void convert_gpart_pos(const struct engine* e, const struct gpart* gp, + double* ret) { + + if (e->s->periodic) { + ret[0] = box_wrap(gp->x[0], 0.0, e->s->dim[0]); + ret[1] = box_wrap(gp->x[1], 0.0, e->s->dim[1]); + ret[2] = box_wrap(gp->x[2], 0.0, e->s->dim[2]); + } else { + ret[0] = gp->x[0]; + ret[1] = gp->x[1]; + ret[2] = gp->x[2]; + } +} + /** * @brief Specifies which g-particle fields to read from a dataset * @@ -52,15 +66,15 @@ void darkmatter_read_particles(struct gpart* gparts, struct io_props* list, * @param list The list of i/o properties to write. * @param num_fields The number of i/o fields to write. 
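The convert_*_pos helpers introduced here (and repeated for each hydro scheme below) exist because particles may drift slightly outside the box between rebuilds; wrapping at output time keeps snapshot coordinates inside \f$[0, L)\f$ without touching the particle's internal position. box_wrap() itself is defined elsewhere in the tree; its expected behaviour is the usual periodic wrap, roughly:

    /* Sketch of the expected box_wrap() semantics; the real version is
     * defined elsewhere in SWIFT and may be implemented differently. */
    double box_wrap_sketch(double x, double lo, double hi) {
      const double L = hi - lo;
      while (x < lo) x += L;
      while (x >= hi) x -= L;
      return x;
    }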
*/ -void darkmatter_write_particles(struct gpart* gparts, struct io_props* list, - int* num_fields) { +void darkmatter_write_particles(const struct gpart* gparts, + struct io_props* list, int* num_fields) { - /* Say how much we want to read */ + /* Say how much we want to write */ *num_fields = 5; - /* List what we want to read */ + /* List what we want to write */ - list[0] = io_make_output_field("Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, - gparts, x); + list[0] = io_make_output_field_convert_gpart( + "Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, gparts, convert_gpart_pos); list[1] = io_make_output_field("Velocities", FLOAT, 3, UNIT_CONV_SPEED, gparts, v_full); list[2] = diff --git a/src/hydro/Default/hydro_io.h b/src/hydro/Default/hydro_io.h index 62e94b05ffea259aac99d4b3714e0eea7e7c955f..8c12f015bd69dc917a80c564422676a85ffcfb3b 100644 --- a/src/hydro/Default/hydro_io.h +++ b/src/hydro/Default/hydro_io.h @@ -53,6 +53,20 @@ void hydro_read_particles(struct part* parts, struct io_props* list, UNIT_CONV_DENSITY, parts, rho); } +void convert_part_pos(const struct engine* e, const struct part* p, + double* ret) { + + if (e->s->periodic) { + ret[0] = box_wrap(p->x[0], 0.0, e->s->dim[0]); + ret[1] = box_wrap(p->x[1], 0.0, e->s->dim[1]); + ret[2] = box_wrap(p->x[2], 0.0, e->s->dim[2]); + } else { + ret[0] = p->x[0]; + ret[1] = p->x[1]; + ret[2] = p->x[2]; + } +} + /** * @brief Specifies which particle fields to write to a dataset * @@ -66,8 +80,8 @@ void hydro_write_particles(struct part* parts, struct io_props* list, *num_fields = 8; /* List what we want to write */ - list[0] = io_make_output_field("Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, - parts, x); + list[0] = io_make_output_field_convert_part( + "Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, parts, convert_part_pos); list[1] = io_make_output_field("Velocities", FLOAT, 3, UNIT_CONV_SPEED, parts, v); list[2] = diff --git a/src/hydro/Default/hydro_part.h b/src/hydro/Default/hydro_part.h index 8b14c45614e4c09c48056c9398ed4bfe3ed90e38..5d2f76ab298cdd6199d95abe1e4435321302689e 100644 --- a/src/hydro/Default/hydro_part.h +++ b/src/hydro/Default/hydro_part.h @@ -125,6 +125,16 @@ struct part { /* Particle time-bin */ timebin_t time_bin; +#ifdef SWIFT_DEBUG_CHECKS + + /* Time of the last drift */ + integertime_t ti_drift; + + /* Time of the last kick */ + integertime_t ti_kick; + +#endif + } SWIFT_STRUCT_ALIGN; #endif /* SWIFT_DEFAULT_HYDRO_PART_H */ diff --git a/src/hydro/Gadget2/hydro_io.h b/src/hydro/Gadget2/hydro_io.h index ac4ce0db57b3297fc961548ffea6b00f32f4dd8c..e6d125105d4a9aa6443e35644990655fd4c7335c 100644 --- a/src/hydro/Gadget2/hydro_io.h +++ b/src/hydro/Gadget2/hydro_io.h @@ -55,14 +55,28 @@ void hydro_read_particles(struct part* parts, struct io_props* list, UNIT_CONV_DENSITY, parts, rho); } -float convert_u(struct engine* e, struct part* p) { +void convert_u(const struct engine* e, const struct part* p, float* ret) { - return hydro_get_internal_energy(p); + ret[0] = hydro_get_internal_energy(p); } -float convert_P(struct engine* e, struct part* p) { +void convert_P(const struct engine* e, const struct part* p, float* ret) { - return hydro_get_pressure(p); + ret[0] = hydro_get_pressure(p); +} + +void convert_part_pos(const struct engine* e, const struct part* p, + double* ret) { + + if (e->s->periodic) { + ret[0] = box_wrap(p->x[0], 0.0, e->s->dim[0]); + ret[1] = box_wrap(p->x[1], 0.0, e->s->dim[1]); + ret[2] = box_wrap(p->x[2], 0.0, e->s->dim[2]); + } else { + ret[0] = p->x[0]; + ret[1] = p->x[1]; + ret[2] = p->x[2]; + } } /** @@ -72,7 +86,7 @@ float convert_P(struct engine* e, struct part* p) { * @param list 
The list of i/o properties to write. * @param num_fields The number of i/o fields to write. */ -void hydro_write_particles(struct part* parts, struct io_props* list, +void hydro_write_particles(const struct part* parts, struct io_props* list, int* num_fields) { *num_fields = 10; @@ -82,8 +96,8 @@ void hydro_write_particles(struct part* parts, struct io_props* list, #endif /* List what we want to write */ - list[0] = io_make_output_field("Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, - parts, x); + list[0] = io_make_output_field_convert_part( + "Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, parts, convert_part_pos); list[1] = io_make_output_field("Velocities", FLOAT, 3, UNIT_CONV_SPEED, parts, v); list[2] = @@ -100,9 +114,9 @@ void hydro_write_particles(struct part* parts, struct io_props* list, io_make_output_field("Density", FLOAT, 1, UNIT_CONV_DENSITY, parts, rho); list[8] = io_make_output_field_convert_part("InternalEnergy", FLOAT, 1, UNIT_CONV_ENERGY_PER_UNIT_MASS, - parts, rho, convert_u); + parts, convert_u); list[9] = io_make_output_field_convert_part( - "Pressure", FLOAT, 1, UNIT_CONV_PRESSURE, parts, rho, convert_P); + "Pressure", FLOAT, 1, UNIT_CONV_PRESSURE, parts, convert_P); #ifdef DEBUG_INTERACTIONS_SPH list[10] = io_make_output_field("Num_ngb_density", INT, 1, UNIT_CONV_NO_UNITS, parts, num_ngb_density); diff --git a/src/hydro/Gizmo/hydro.h b/src/hydro/Gizmo/hydro.h index 2c2f54699bb380a491edf61a83ad8a031572c86c..6273f5009a051e17d2a2954ac994c3f189110464 100644 --- a/src/hydro/Gizmo/hydro.h +++ b/src/hydro/Gizmo/hydro.h @@ -24,6 +24,7 @@ #include "approx_math.h" #include "equation_of_state.h" #include "hydro_gradients.h" +#include "hydro_properties.h" #include "hydro_space.h" #include "hydro_unphysical.h" #include "hydro_velocities.h" @@ -130,8 +131,9 @@ __attribute__((always_inline)) INLINE static void hydro_first_init_part( /* remember that we store the total thermal energy, not the specific thermal energy (as in Gadget) */ #if defined(EOS_ISOTHERMAL_GAS) - /* this overwrites the internal energy from the initial condition file */ - p->conserved.energy = mass * const_isothermal_internal_energy; + /* this overwrites the internal energy from the initial condition file + * Note that we call the EoS function just to get the constant u here. */ + p->conserved.energy = mass * gas_internal_energy_from_entropy(0.f, 0.f); #else p->conserved.energy *= mass; #endif @@ -608,7 +610,9 @@ __attribute__((always_inline)) INLINE static void hydro_kick_extra( p->conserved.momentum[1] += p->conserved.flux.momentum[1]; p->conserved.momentum[2] += p->conserved.flux.momentum[2]; #if defined(EOS_ISOTHERMAL_GAS) - p->conserved.energy = p->conserved.mass * const_isothermal_internal_energy; + /* We use the EoS equation in a sneaky way here just to get the constant u */ + p->conserved.energy = + p->conserved.mass * gas_internal_energy_from_entropy(0.f, 0.f); #else p->conserved.energy += p->conserved.flux.energy; #endif @@ -707,15 +711,11 @@ __attribute__((always_inline)) INLINE static void hydro_kick_extra( __attribute__((always_inline)) INLINE static float hydro_get_internal_energy( const struct part* restrict p) { - if (p->primitives.rho > 0.) { -#ifdef EOS_ISOTHERMAL_GAS - return p->primitives.P / hydro_gamma_minus_one / p->primitives.rho; -#else - return const_isothermal_internal_energy; -#endif - } else { + if (p->primitives.rho > 0.) 
+ return gas_internal_energy_from_pressure(p->primitives.rho, + p->primitives.P); + else return 0.; - } } /** @@ -727,7 +727,7 @@ __attribute__((always_inline)) INLINE static float hydro_get_entropy( const struct part* restrict p) { if (p->primitives.rho > 0.) { - return p->primitives.P / pow_gamma(p->primitives.rho); + return gas_entropy_from_pressure(p->primitives.rho, p->primitives.P); } else { return 0.; } @@ -741,16 +741,10 @@ __attribute__((always_inline)) INLINE static float hydro_get_entropy( __attribute__((always_inline)) INLINE static float hydro_get_soundspeed( const struct part* restrict p) { - if (p->primitives.rho > 0.) { -#ifdef EOS_ISOTHERMAL_GAS - return sqrtf(const_isothermal_internal_energy * hydro_gamma * - hydro_gamma_minus_one); -#else - return sqrtf(hydro_gamma * p->primitives.P / p->primitives.rho); -#endif - } else { + if (p->primitives.rho > 0.) + return gas_soundspeed_from_pressure(p->primitives.rho, p->primitives.P); + else return 0.; - } } /** diff --git a/src/hydro/Gizmo/hydro_io.h b/src/hydro/Gizmo/hydro_io.h index d20f7e2eb1cf50be7690e15a9569d8e9c4605af5..458ea6151e4b00b9ccf85ad6eff4ccf58277ff7d 100644 --- a/src/hydro/Gizmo/hydro_io.h +++ b/src/hydro/Gizmo/hydro_io.h @@ -18,6 +18,7 @@ ******************************************************************************/ #include "adiabatic_index.h" +#include "hydro.h" #include "hydro_flux_limiters.h" #include "hydro_gradients.h" #include "hydro_slope_limiters.h" @@ -68,18 +69,11 @@ void hydro_read_particles(struct part* parts, struct io_props* list, * * @param e #engine. * @param p Particle. - * @return Internal energy of the particle + * @param ret (return) Internal energy of the particle */ -float convert_u(struct engine* e, struct part* p) { - if (p->primitives.rho > 0.) { -#ifdef EOS_ISOTHERMAL_GAS - return const_isothermal_internal_energy; -#else - return p->primitives.P / hydro_gamma_minus_one / p->primitives.rho; -#endif - } else { - return 0.; - } +void convert_u(const struct engine* e, const struct part* p, float* ret) { + + ret[0] = hydro_get_internal_energy(p); } /** @@ -87,14 +81,10 @@ float convert_u(struct engine* e, struct part* p) { * * @param e #engine. * @param p Particle. - * @return Entropic function of the particle + * @param ret (return) Entropic function of the particle */ -float convert_A(struct engine* e, struct part* p) { - if (p->primitives.rho > 0.) { - return p->primitives.P / pow_gamma(p->primitives.rho); - } else { - return 0.; - } +void convert_A(const struct engine* e, const struct part* p, float* ret) { + ret[0] = hydro_get_entropy(p); } /** @@ -104,9 +94,9 @@ float convert_A(struct engine* e, struct part* p) { * @param p Particle. 
* @return Total energy of the particle */ -float convert_Etot(struct engine* e, struct part* p) { +void convert_Etot(const struct engine* e, const struct part* p, float* ret) { #ifdef GIZMO_TOTAL_ENERGY - return p->conserved.energy; + ret[0] = p->conserved.energy; #else float momentum2; @@ -114,10 +104,24 @@ float convert_Etot(struct engine* e, struct part* p) { p->conserved.momentum[1] * p->conserved.momentum[1] + p->conserved.momentum[2] * p->conserved.momentum[2]; - return p->conserved.energy + 0.5f * momentum2 / p->conserved.mass; + ret[0] = p->conserved.energy + 0.5f * momentum2 / p->conserved.mass; #endif } +void convert_part_pos(const struct engine* e, const struct part* p, + double* ret) { + + if (e->s->periodic) { + ret[0] = box_wrap(p->x[0], 0.0, e->s->dim[0]); + ret[1] = box_wrap(p->x[1], 0.0, e->s->dim[1]); + ret[2] = box_wrap(p->x[2], 0.0, e->s->dim[2]); + } else { + ret[0] = p->x[0]; + ret[1] = p->x[1]; + ret[2] = p->x[2]; + } +} + /** * @brief Specifies which particle fields to write to a dataset * @@ -131,8 +135,8 @@ void hydro_write_particles(struct part* parts, struct io_props* list, *num_fields = 10; /* List what we want to write */ - list[0] = io_make_output_field("Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, - parts, x); + list[0] = io_make_output_field_convert_part( + "Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, parts, convert_part_pos); list[1] = io_make_output_field("Velocities", FLOAT, 3, UNIT_CONV_SPEED, parts, primitives.v); list[2] = io_make_output_field("Masses", FLOAT, 1, UNIT_CONV_MASS, parts, @@ -141,18 +145,17 @@ void hydro_write_particles(struct part* parts, struct io_props* list, parts, h); list[4] = io_make_output_field_convert_part("InternalEnergy", FLOAT, 1, UNIT_CONV_ENERGY_PER_UNIT_MASS, - parts, primitives.P, convert_u); + parts, convert_u); list[5] = io_make_output_field("ParticleIDs", ULONGLONG, 1, UNIT_CONV_NO_UNITS, parts, id); list[6] = io_make_output_field("Density", FLOAT, 1, UNIT_CONV_DENSITY, parts, primitives.rho); list[7] = io_make_output_field_convert_part( - "Entropy", FLOAT, 1, UNIT_CONV_ENTROPY, parts, primitives.P, convert_A); + "Entropy", FLOAT, 1, UNIT_CONV_ENTROPY, parts, convert_A); list[8] = io_make_output_field("Pressure", FLOAT, 1, UNIT_CONV_PRESSURE, parts, primitives.P); - list[9] = - io_make_output_field_convert_part("TotEnergy", FLOAT, 1, UNIT_CONV_ENERGY, - parts, conserved.energy, convert_Etot); + list[9] = io_make_output_field_convert_part( + "TotEnergy", FLOAT, 1, UNIT_CONV_ENERGY, parts, convert_Etot); } /** diff --git a/src/hydro/Minimal/hydro.h b/src/hydro/Minimal/hydro.h index 4d8ca5b05547467c973e17983774b64736060471..905412040cbb8ac9666e555d88cad9bc56de13ab 100644 --- a/src/hydro/Minimal/hydro.h +++ b/src/hydro/Minimal/hydro.h @@ -224,7 +224,7 @@ __attribute__((always_inline)) INLINE static void hydro_end_density( /* Finish the calculation by inserting the missing h-factors */ p->rho *= h_inv_dim; p->density.rho_dh *= h_inv_dim_plus_one; - p->density.wcount *= kernel_norm; + p->density.wcount *= h_inv_dim; p->density.wcount_dh *= h_inv_dim_plus_one; } diff --git a/src/hydro/Minimal/hydro_io.h b/src/hydro/Minimal/hydro_io.h index 2ec0cb11d1e3ccaaa09d9822c75b396364912df8..771e4ac2d066ea098edb8b2f11d7afd550783347 100644 --- a/src/hydro/Minimal/hydro_io.h +++ b/src/hydro/Minimal/hydro_io.h @@ -69,14 +69,28 @@ void hydro_read_particles(struct part* parts, struct io_props* list, UNIT_CONV_DENSITY, parts, rho); } -float convert_S(struct engine* e, struct part* p) { +void convert_S(const struct engine* e, const struct part* 
p, float* ret) { - return hydro_get_entropy(p); + ret[0] = hydro_get_entropy(p); } -float convert_P(struct engine* e, struct part* p) { +void convert_P(const struct engine* e, const struct part* p, float* ret) { - return hydro_get_pressure(p); + ret[0] = hydro_get_pressure(p); +} + +void convert_part_pos(const struct engine* e, const struct part* p, + double* ret) { + + if (e->s->periodic) { + ret[0] = box_wrap(p->x[0], 0.0, e->s->dim[0]); + ret[1] = box_wrap(p->x[1], 0.0, e->s->dim[1]); + ret[2] = box_wrap(p->x[2], 0.0, e->s->dim[2]); + } else { + ret[0] = p->x[0]; + ret[1] = p->x[1]; + ret[2] = p->x[2]; + } } /** @@ -92,8 +106,8 @@ void hydro_write_particles(struct part* parts, struct io_props* list, *num_fields = 10; /* List what we want to write */ - list[0] = io_make_output_field("Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, - parts, x); + list[0] = io_make_output_field_convert_part( + "Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, parts, convert_part_pos); list[1] = io_make_output_field("Velocities", FLOAT, 3, UNIT_CONV_SPEED, parts, v); list[2] = @@ -108,11 +122,10 @@ void hydro_write_particles(struct part* parts, struct io_props* list, UNIT_CONV_ACCELERATION, parts, a_hydro); list[7] = io_make_output_field("Density", FLOAT, 1, UNIT_CONV_DENSITY, parts, rho); - list[8] = io_make_output_field_convert_part("Entropy", FLOAT, 1, - UNIT_CONV_ENTROPY_PER_UNIT_MASS, - parts, rho, convert_S); + list[8] = io_make_output_field_convert_part( + "Entropy", FLOAT, 1, UNIT_CONV_ENTROPY_PER_UNIT_MASS, parts, convert_S); list[9] = io_make_output_field_convert_part( - "Pressure", FLOAT, 1, UNIT_CONV_PRESSURE, parts, rho, convert_P); + "Pressure", FLOAT, 1, UNIT_CONV_PRESSURE, parts, convert_P); } /** diff --git a/src/hydro/PressureEntropy/hydro_io.h b/src/hydro/PressureEntropy/hydro_io.h index 10243750c01d6bc64664f9834bc4cc245c786f49..d3780261ba587cfac92ec44a65b13f0a4344b3c3 100644 --- a/src/hydro/PressureEntropy/hydro_io.h +++ b/src/hydro/PressureEntropy/hydro_io.h @@ -67,14 +67,28 @@ void hydro_read_particles(struct part* parts, struct io_props* list, UNIT_CONV_DENSITY, parts, rho); } -float convert_u(struct engine* e, struct part* p) { +void convert_u(const struct engine* e, const struct part* p, float* ret) { - return hydro_get_internal_energy(p); + ret[0] = hydro_get_internal_energy(p); } -float convert_P(struct engine* e, struct part* p) { +void convert_P(const struct engine* e, const struct part* p, float* ret) { - return hydro_get_pressure(p); + ret[0] = hydro_get_pressure(p); +} + +void convert_part_pos(const struct engine* e, const struct part* p, + double* ret) { + + if (e->s->periodic) { + ret[0] = box_wrap(p->x[0], 0.0, e->s->dim[0]); + ret[1] = box_wrap(p->x[1], 0.0, e->s->dim[1]); + ret[2] = box_wrap(p->x[2], 0.0, e->s->dim[2]); + } else { + ret[0] = p->x[0]; + ret[1] = p->x[1]; + ret[2] = p->x[2]; + } } /** @@ -90,27 +104,27 @@ void hydro_write_particles(struct part* parts, struct io_props* list, *num_fields = 11; /* List what we want to write */ - list[0] = io_make_output_field("Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, - parts, x); + list[0] = io_make_output_field_convert_part( + "Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, parts, convert_part_pos); list[1] = io_make_output_field("Velocities", FLOAT, 3, UNIT_CONV_SPEED, parts, v); list[2] = io_make_output_field("Masses", FLOAT, 1, UNIT_CONV_MASS, parts, mass); list[3] = io_make_output_field("SmoothingLength", FLOAT, 1, UNIT_CONV_LENGTH, parts, h); - list[4] = io_make_output_field_convert_part("InternalEnergy", FLOAT, 1, - 
UNIT_CONV_ENERGY_PER_UNIT_MASS, - parts, entropy, convert_u); + list[4] = io_make_output_field( + "Entropy", FLOAT, 1, UNIT_CONV_ENTROPY_PER_UNIT_MASS, parts, entropy); list[5] = io_make_output_field("ParticleIDs", ULONGLONG, 1, UNIT_CONV_NO_UNITS, parts, id); list[6] = io_make_output_field("Acceleration", FLOAT, 3, UNIT_CONV_ACCELERATION, parts, a_hydro); list[7] = io_make_output_field("Density", FLOAT, 1, UNIT_CONV_DENSITY, parts, rho); - list[8] = io_make_output_field( - "Entropy", FLOAT, 1, UNIT_CONV_ENTROPY_PER_UNIT_MASS, parts, entropy); + list[8] = io_make_output_field_convert_part("InternalEnergy", FLOAT, 1, + UNIT_CONV_ENERGY_PER_UNIT_MASS, + parts, convert_u); list[9] = io_make_output_field_convert_part( - "Pressure", FLOAT, 1, UNIT_CONV_PRESSURE, parts, rho, convert_P); + "Pressure", FLOAT, 1, UNIT_CONV_PRESSURE, parts, convert_P); list[10] = io_make_output_field("WeightedDensity", FLOAT, 1, UNIT_CONV_DENSITY, parts, rho_bar); } diff --git a/src/hydro/Shadowswift/hydro.h b/src/hydro/Shadowswift/hydro.h index abbcdcd2f7879d8063a906e44ab2fe6a3e675828..eb9938a341f1e6273f8ea03264bdda6dc69832e6 100644 --- a/src/hydro/Shadowswift/hydro.h +++ b/src/hydro/Shadowswift/hydro.h @@ -22,6 +22,7 @@ #include "approx_math.h" #include "equation_of_state.h" #include "hydro_gradients.h" +#include "hydro_properties.h" #include "hydro_space.h" #include "voronoi_algorithm.h" diff --git a/src/hydro/Shadowswift/hydro_io.h b/src/hydro/Shadowswift/hydro_io.h index de45b5d68c78f96cee3030eadef4b4410e550c22..f6f6fcc6c070b0add8e2b97678e5ef1ada636bfd 100644 --- a/src/hydro/Shadowswift/hydro_io.h +++ b/src/hydro/Shadowswift/hydro_io.h @@ -19,6 +19,7 @@ #include "adiabatic_index.h" #include "equation_of_state.h" +#include "hydro.h" #include "hydro_gradients.h" #include "hydro_slope_limiters.h" #include "io_properties.h" @@ -63,13 +64,8 @@ void hydro_read_particles(struct part* parts, struct io_props* list, * @param p Particle. * @return Internal energy of the particle */ -float convert_u(struct engine* e, struct part* p) { - if (p->primitives.rho > 0.) { - return gas_internal_energy_from_pressure(p->primitives.rho, - p->primitives.P); - } else { - return 0.; - } +void convert_u(const struct engine* e, const struct part* p, float* ret) { + ret[0] = hydro_get_internal_energy(p); } /** @@ -79,12 +75,8 @@ float convert_u(struct engine* e, struct part* p) { * @param p Particle. * @return Entropic function of the particle */ -float convert_A(struct engine* e, struct part* p) { - if (p->primitives.rho > 0.) { - return gas_entropy_from_pressure(p->primitives.rho, p->primitives.P); - } else { - return 0.; - } +void convert_A(const struct engine* e, const struct part* p, float* ret) { + ret[0] = hydro_get_entropy(p); } /** @@ -94,7 +86,7 @@ float convert_A(struct engine* e, struct part* p) { * @param p Particle. 
- * @return Total energy of the particle + * @param ret (return) Total energy of the particle */ -float convert_Etot(struct engine* e, struct part* p) { +void convert_Etot(const struct engine* e, const struct part* p, float* ret) { #ifdef SHADOWFAX_TOTAL_ENERGY - return p->conserved.energy; + ret[0] = p->conserved.energy; #else @@ -105,13 +97,27 @@ float convert_Etot(struct engine* e, struct part* p) { p->conserved.momentum[1] * p->conserved.momentum[1] + p->conserved.momentum[2] * p->conserved.momentum[2]; - return p->conserved.energy + 0.5f * momentum2 / p->conserved.mass; + ret[0] = p->conserved.energy + 0.5f * momentum2 / p->conserved.mass; } else { - return 0.; + ret[0] = 0.; } #endif } +void convert_part_pos(const struct engine* e, const struct part* p, + double* ret) { + + if (e->s->periodic) { + ret[0] = box_wrap(p->x[0], 0.0, e->s->dim[0]); + ret[1] = box_wrap(p->x[1], 0.0, e->s->dim[1]); + ret[2] = box_wrap(p->x[2], 0.0, e->s->dim[2]); + } else { + ret[0] = p->x[0]; + ret[1] = p->x[1]; + ret[2] = p->x[2]; + } +} + /** * @brief Specifies which particle fields to write to a dataset * @@ -125,8 +131,8 @@ void hydro_write_particles(struct part* parts, struct io_props* list, *num_fields = 13; /* List what we want to write */ - list[0] = io_make_output_field("Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, - parts, x); + list[0] = io_make_output_field_convert_part( + "Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, parts, convert_part_pos); list[1] = io_make_output_field("Velocities", FLOAT, 3, UNIT_CONV_SPEED, parts, primitives.v); list[2] = io_make_output_field("Masses", FLOAT, 1, UNIT_CONV_MASS, parts, @@ -135,7 +141,7 @@ parts, h); list[4] = io_make_output_field_convert_part("InternalEnergy", FLOAT, 1, UNIT_CONV_ENERGY_PER_UNIT_MASS, - parts, primitives.P, convert_u); + parts, convert_u); list[5] = io_make_output_field("ParticleIDs", ULONGLONG, 1, UNIT_CONV_NO_UNITS, parts, id); list[6] = io_make_output_field("Acceleration", FLOAT, 3, @@ -147,12 +153,11 @@ list[9] = io_make_output_field("GradDensity", FLOAT, 3, UNIT_CONV_DENSITY, parts, primitives.gradients.rho); list[10] = io_make_output_field_convert_part( - "Entropy", FLOAT, 1, UNIT_CONV_ENTROPY, parts, primitives.P, convert_A); + "Entropy", FLOAT, 1, UNIT_CONV_ENTROPY, parts, convert_A); list[11] = io_make_output_field("Pressure", FLOAT, 1, UNIT_CONV_PRESSURE, parts, primitives.P); - list[12] = - io_make_output_field_convert_part("TotEnergy", FLOAT, 1, UNIT_CONV_ENERGY, - parts, conserved.energy, convert_Etot); + list[12] = io_make_output_field_convert_part( + "TotEnergy", FLOAT, 1, UNIT_CONV_ENERGY, parts, convert_Etot); } /** diff --git a/src/hydro_properties.c b/src/hydro_properties.c index 1e7554f7d84220b8c962d60cc4538c685b5bad52..995610acb2d46cb62a4fa5fc7ef76b0d9474f504 100644 --- a/src/hydro_properties.c +++ b/src/hydro_properties.c @@ -77,17 +77,10 @@ void hydro_props_init(struct hydro_props *p, void hydro_props_print(const struct hydro_props *p) { -#if defined(EOS_IDEAL_GAS) - message("Equation of state: Ideal gas."); -#elif defined(EOS_ISOTHERMAL_GAS) - message( - "Equation of state: Isothermal with internal energy " - "per unit mass set to %f.", - const_isothermal_internal_energy); -#endif - - message("Adiabatic index gamma: %f.", hydro_gamma); + /* Print equation of state first */ + eos_print(&eos); + /* Now SPH */ message("Hydrodynamic scheme: %s in %dD.", SPH_IMPLEMENTATION, (int)hydro_dimension); @@ -115,7 +108,8 @@ void hydro_props_print(const struct hydro_props *p) { #if 
defined(HAVE_HDF5) void hydro_props_print_snapshot(hid_t h_grpsph, const struct hydro_props *p) { - io_write_attribute_f(h_grpsph, "Adiabatic index", hydro_gamma); + eos_print_snapshot(h_grpsph, &eos); + io_write_attribute_i(h_grpsph, "Dimension", (int)hydro_dimension); io_write_attribute_s(h_grpsph, "Scheme", SPH_IMPLEMENTATION); io_write_attribute_s(h_grpsph, "Kernel function", kernel_name); diff --git a/src/io_properties.h b/src/io_properties.h index 9fcf1a1ac67cae6afab6870369e51d06c752fc11..2142d8d555fb52258d3443f20d14c72cf7568045 100644 --- a/src/io_properties.h +++ b/src/io_properties.h @@ -16,19 +16,31 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. * ******************************************************************************/ - #ifndef SWIFT_IO_PROPERTIES_H #define SWIFT_IO_PROPERTIES_H /* Config parameters. */ #include "../config.h" +/* Local includes. */ +#include "inline.h" + /** * @brief The two sorts of data present in the GADGET IC files: compulsory to * start a run or optional. */ enum DATA_IMPORTANCE { COMPULSORY = 1, OPTIONAL = 0, UNUSED = -1 }; +/* Helper typedefs */ +typedef void (*conversion_func_part_float)(const struct engine*, + const struct part*, float*); +typedef void (*conversion_func_part_double)(const struct engine*, + const struct part*, double*); +typedef void (*conversion_func_gpart_float)(const struct engine*, + const struct gpart*, float*); +typedef void (*conversion_func_gpart_double)(const struct engine*, + const struct gpart*, double*); + /** * @brief The properties of a given dataset for i/o */ @@ -52,18 +64,31 @@ struct io_props { /* Pointer to the field of the first particle in the array */ char* field; + /* Pointer to the start of the temporary buffer used in i/o */ + char* start_temp_c; + float* start_temp_f; + double* start_temp_d; + + /* Pointer to the engine */ + const struct engine* e; + /* The size of the particles */ size_t partSize; /* The particle arrays */ - struct part* parts; - struct gpart* gparts; + const struct part* parts; + const struct gpart* gparts; + + /* Are we converting? */ + int conversion; /* Conversion function for part */ - float (*convert_part)(struct engine*, struct part*); + conversion_func_part_float convert_part_f; + conversion_func_part_double convert_part_d; /* Conversion function for gpart */ - float (*convert_gpart)(struct engine*, struct gpart*); + conversion_func_gpart_float convert_gpart_f; + conversion_func_gpart_double convert_gpart_d; }; /** @@ -86,11 +111,10 @@ struct io_props { * * Do not call this function directly. Use the macro defined above. 
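The four conversion_func_* typedefs above make the new converter contract explicit: a converter no longer returns a single float, it fills a caller-provided buffer, which is what allows one callback to produce multi-component output such as the wrapped Coordinates. A hypothetical three-component float converter matching conversion_func_part_float would look like:

    /* Illustrative converter only; 'convert_part_vel' is not part of
     * this patch. 'ret' has room for the field's 'dimension' values. */
    void convert_part_vel(const struct engine* e, const struct part* p,
                          float* ret) {
      ret[0] = p->v[0];
      ret[1] = p->v[1];
      ret[2] = p->v[2];
    }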
*/ -struct io_props io_make_input_field_(char name[FIELD_BUFFER_SIZE], - enum IO_DATA_TYPE type, int dimension, - enum DATA_IMPORTANCE importance, - enum unit_conversion_factor units, - char* field, size_t partSize) { +INLINE static struct io_props io_make_input_field_( + char name[FIELD_BUFFER_SIZE], enum IO_DATA_TYPE type, int dimension, + enum DATA_IMPORTANCE importance, enum unit_conversion_factor units, + char* field, size_t partSize) { struct io_props r; strcpy(r.name, name); r.type = type; @@ -101,8 +125,11 @@ struct io_props io_make_input_field_(char name[FIELD_BUFFER_SIZE], r.partSize = partSize; r.parts = NULL; r.gparts = NULL; - r.convert_part = NULL; - r.convert_gpart = NULL; + r.conversion = 0; + r.convert_part_f = NULL; + r.convert_part_d = NULL; + r.convert_gpart_f = NULL; + r.convert_gpart_d = NULL; return r; } @@ -126,10 +153,9 @@ struct io_props io_make_input_field_(char name[FIELD_BUFFER_SIZE], * * Do not call this function directly. Use the macro defined above. */ -struct io_props io_make_output_field_(char name[FIELD_BUFFER_SIZE], - enum IO_DATA_TYPE type, int dimension, - enum unit_conversion_factor units, - char* field, size_t partSize) { +INLINE static struct io_props io_make_output_field_( + char name[FIELD_BUFFER_SIZE], enum IO_DATA_TYPE type, int dimension, + enum unit_conversion_factor units, char* field, size_t partSize) { struct io_props r; strcpy(r.name, name); r.type = type; @@ -140,8 +166,11 @@ struct io_props io_make_output_field_(char name[FIELD_BUFFER_SIZE], r.partSize = partSize; r.parts = NULL; r.gparts = NULL; - r.convert_part = NULL; - r.convert_gpart = NULL; + r.conversion = 0; + r.convert_part_f = NULL; + r.convert_part_d = NULL; + r.convert_gpart_f = NULL; + r.convert_gpart_d = NULL; return r; } @@ -149,11 +178,10 @@ struct io_props io_make_output_field_(char name[FIELD_BUFFER_SIZE], /** * @brief Constructs an #io_props (with conversion) from its parameters */ -#define io_make_output_field_convert_part(name, type, dim, units, part, field, \ - convert) \ - io_make_output_field_convert_part_(name, type, dim, units, \ - (char*)(&(part[0]).field), \ - sizeof(part[0]), part, convert) +#define io_make_output_field_convert_part(name, type, dim, units, part, \ + convert) \ + io_make_output_field_convert_part_##type(name, type, dim, units, \ + sizeof(part[0]), part, convert) /** * @brief Construct an #io_props from its parameters * * @param name Name of the field to write * @param type The type of the data * @param dimension Dataset dimension (1D, 3D, ...) * @param units The units of the dataset - * @param field Pointer to the field of the first particle * @param partSize The size in bytes of the particle * @param parts The particle array * @param functionPtr The function used to convert a particle to a float * * Do not call this function directly. Use the macro defined above.
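The ##type token-pasting in this macro resolves the constructor at preprocessing time, so registering a conversion function costs no run-time dispatch and the compiler type-checks the pointer against the right typedef. For example, the Coordinates registration shown earlier,

io_make_output_field_convert_part("Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH,
                                  parts, convert_part_pos);

expands to

io_make_output_field_convert_part_DOUBLE("Coordinates", DOUBLE, 3,
                                         UNIT_CONV_LENGTH, sizeof(parts[0]),
                                         parts, convert_part_pos);

so a mismatched conversion function (say, a float-valued one for a DOUBLE field) fails to compile instead of silently corrupting the output.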
*/ -struct io_props io_make_output_field_convert_part_( +INLINE static struct io_props io_make_output_field_convert_part_FLOAT( char name[FIELD_BUFFER_SIZE], enum IO_DATA_TYPE type, int dimension, - enum unit_conversion_factor units, char* field, size_t partSize, - struct part* parts, float (*functionPtr)(struct engine*, struct part*)) { + enum unit_conversion_factor units, size_t partSize, + const struct part* parts, conversion_func_part_float functionPtr) { struct io_props r; strcpy(r.name, name); @@ -180,12 +207,52 @@ struct io_props io_make_output_field_convert_part_( r.dimension = dimension; r.importance = UNUSED; r.units = units; - r.field = field; + r.field = NULL; + r.partSize = partSize; + r.parts = parts; + r.gparts = NULL; + r.conversion = 1; + r.convert_part_f = functionPtr; + r.convert_part_d = NULL; + r.convert_gpart_f = NULL; + r.convert_gpart_d = NULL; + + return r; +} + +/** + * @brief Construct an #io_props from its parameters + * + * @param name Name of the field to write + * @param type The type of the data + * @param dimension Dataset dimension (1D, 3D, ...) + * @param units The units of the dataset + * @param partSize The size in bytes of the particle + * @param parts The particle array + * @param functionPtr The function used to convert a particle to a double + * + * Do not call this function directly. Use the macro defined above. + */ +INLINE static struct io_props io_make_output_field_convert_part_DOUBLE( + char name[FIELD_BUFFER_SIZE], enum IO_DATA_TYPE type, int dimension, + enum unit_conversion_factor units, size_t partSize, + const struct part* parts, conversion_func_part_double functionPtr) { + + struct io_props r; + strcpy(r.name, name); + r.type = type; + r.dimension = dimension; + r.importance = UNUSED; + r.units = units; + r.field = NULL; r.partSize = partSize; r.parts = parts; r.gparts = NULL; - r.convert_part = functionPtr; - r.convert_gpart = NULL; + r.conversion = 1; + r.convert_part_f = NULL; + r.convert_part_d = functionPtr; + r.convert_gpart_f = NULL; + r.convert_gpart_d = NULL; return r; } @@ -193,11 +260,10 @@ struct io_props io_make_output_field_convert_part_( /** * @brief Constructs an #io_props (with conversion) from its parameters */ -#define io_make_output_field_convert_gpart(name, type, dim, units, part, \ - field, convert) \ - io_make_output_field_convert_gpart_(name, type, dim, units, \ - (char*)(&(part[0]).field), \ - sizeof(part[0]), gpart, convert) +#define io_make_output_field_convert_gpart(name, type, dim, units, gpart, \ + convert) \ + io_make_output_field_convert_gpart_##type(name, type, dim, units, \ + sizeof(gpart[0]), gpart, convert) /** * @brief Construct an #io_props from its parameters * * @param name Name of the field to write * @param type The type of the data * @param dimension Dataset dimension (1D, 3D, ...) * @param units The units of the dataset - * @param field Pointer to the field of the first particle - * @param partSize The size in byte of the particle + * @param gpartSize The size in bytes of the particle * @param gparts The particle array * @param functionPtr The function used to convert a g-particle to a float * * Do not call this function directly. Use the macro defined above.
*/ -struct io_props io_make_output_field_convert_gpart_( +INLINE static struct io_props io_make_output_field_convert_gpart_FLOAT( char name[FIELD_BUFFER_SIZE], enum IO_DATA_TYPE type, int dimension, - enum unit_conversion_factor units, char* field, size_t partSize, - struct gpart* gparts, float (*functionPtr)(struct engine*, struct gpart*)) { + enum unit_conversion_factor units, size_t gpartSize, + const struct gpart* gparts, conversion_func_gpart_float functionPtr) { struct io_props r; strcpy(r.name, name); @@ -224,12 +289,52 @@ struct io_props io_make_output_field_convert_gpart_( r.dimension = dimension; r.importance = UNUSED; r.units = units; - r.field = field; - r.partSize = partSize; + r.field = NULL; + r.partSize = gpartSize; + r.parts = NULL; + r.gparts = gparts; + r.conversion = 1; + r.convert_part_f = NULL; + r.convert_part_d = NULL; + r.convert_gpart_f = functionPtr; + r.convert_gpart_d = NULL; + + return r; +} + +/** + * @brief Construct an #io_props from its parameters + * + * @param name Name of the field to write + * @param type The type of the data + * @param dimension Dataset dimension (1D, 3D, ...) + * @param units The units of the dataset + * @param gpartSize The size in bytes of the particle + * @param gparts The particle array + * @param functionPtr The function used to convert a g-particle to a double + * + * Do not call this function directly. Use the macro defined above. + */ +INLINE static struct io_props io_make_output_field_convert_gpart_DOUBLE( + char name[FIELD_BUFFER_SIZE], enum IO_DATA_TYPE type, int dimension, + enum unit_conversion_factor units, size_t gpartSize, + const struct gpart* gparts, conversion_func_gpart_double functionPtr) { + + struct io_props r; + strcpy(r.name, name); + r.type = type; + r.dimension = dimension; + r.importance = UNUSED; + r.units = units; + r.field = NULL; + r.partSize = gpartSize; r.parts = NULL; r.gparts = gparts; - r.convert_part = NULL; - r.convert_gpart = functionPtr; + r.conversion = 1; + r.convert_part_f = NULL; + r.convert_part_d = NULL; + r.convert_gpart_f = NULL; + r.convert_gpart_d = functionPtr; return r; } diff --git a/src/logger.c b/src/logger.c index b2acf47aa70cef55f53d296033f6f5c6162fd5bd..0e0bc930c0841f985da2c353357a69b69aba5d91 100644 --- a/src/logger.c +++ b/src/logger.c @@ -20,6 +20,8 @@ /* Config parameters. */ #include "../config.h" +#ifdef HAVE_POSIX_FALLOCATE /* Are we on a sensible platform? */ + /* Some standard headers. */ #include <stdint.h> #include <stdlib.h> @@ -41,7 +43,6 @@ * * @return The size of the logger message in bytes. */ - int logger_size(unsigned int mask) { /* Start with 8 bytes for the header. */ @@ -95,7 +96,6 @@ int logger_size(unsigned int mask) { * @param offset Pointer to the offset of the previous log of this particle. * @param dump The #dump in which to log the particle data. */ - void logger_log_part(struct part *p, unsigned int mask, size_t *offset, struct dump *dump) { @@ -176,7 +176,6 @@ void logger_log_part(struct part *p, unsigned int mask, size_t *offset, * @param offset Pointer to the offset of the previous log of this particle. * @param dump The #dump in which to log the particle data. */ - void logger_log_gpart(struct gpart *p, unsigned int mask, size_t *offset, struct dump *dump) { @@ -270,7 +269,6 @@ void logger_log_timestamp(unsigned long long int timestamp, size_t *offset, * * @return The mask containing the values read. */ - int logger_read_part(struct part *p, size_t *offset, const char *buff) { /* Jump to the offset.
*/ @@ -349,7 +347,6 @@ int logger_read_part(struct part *p, size_t *offset, const char *buff) { * * @return The mask containing the values read. */ - int logger_read_gpart(struct gpart *p, size_t *offset, const char *buff) { /* Jump to the offset. */ @@ -416,7 +413,6 @@ int logger_read_gpart(struct gpart *p, size_t *offset, const char *buff) { * * @return The mask containing the values read. */ - int logger_read_timestamp(unsigned long long int *t, size_t *offset, const char *buff) { @@ -444,3 +440,5 @@ int logger_read_timestamp(unsigned long long int *t, size_t *offset, /* Finally, return the mask of the values we just read. */ return mask; } + +#endif /* HAVE_POSIX_FALLOCATE */ diff --git a/src/logger.h b/src/logger.h index 32fae752c2ae13a143809d9df3030dbc06b0942d..596c0903750404d0934e0d3843a5461523700e9e 100644 --- a/src/logger.h +++ b/src/logger.h @@ -20,9 +20,11 @@ #define SWIFT_LOGGER_H /* Includes. */ -#include "dump.h" #include "part.h" +/* Forward declaration */ +struct dump; + /** * Logger entries contain messages representing the particle data at a given * point in time during the simulation. diff --git a/src/parallel_io.c b/src/parallel_io.c index 26686baadb7c040264c20385d0c89cb30e5dc398..e11c43c79badcc40bb76b7eee09348571e1841cc 100644 --- a/src/parallel_io.c +++ b/src/parallel_io.c @@ -51,56 +51,35 @@ #include "units.h" #include "xmf.h" +/* Are we timing the i/o? */ +//#define IO_SPEED_MEASUREMENT + /* The current limit of ROMIO (the underlying MPI-IO layer) is 2GB */ #define HDF5_PARALLEL_IO_MAX_BYTES 2000000000LL /** - * @brief Reads a data array from a given HDF5 group. + * @brief Reads a chunk of data from an open HDF5 dataset * - * @param grp The group from which to read. - * @param prop The #io_props of the field to read. - * @param N The number of particles on that rank. - * @param N_total The total number of particles. - * @param offset The offset in the array on disk for this rank. + * @param h_data The HDF5 dataset to read from. + * @param h_plist_id The parallel HDF5 properties. + * @param props The #io_props of the field to read. + * @param N The number of particles to read. + * @param offset Offset in the array where this mpi task starts reading. * @param internal_units The #unit_system used internally. - * @param ic_units The #unit_system used in the ICs. + * @param ic_units The #unit_system used in the ICs. */ -void readArray(hid_t grp, const struct io_props prop, size_t N, - long long N_total, long long offset, - const struct unit_system* internal_units, - const struct unit_system* ic_units) { - - const size_t typeSize = io_sizeof_type(prop.type); - const size_t copySize = typeSize * prop.dimension; - const size_t num_elements = N * prop.dimension; - - /* Check whether the dataspace exists or not */ - const htri_t exist = H5Lexists(grp, prop.name, 0); - if (exist < 0) { - error("Error while checking the existence of data set '%s'.", prop.name); - } else if (exist == 0) { - if (prop.importance == COMPULSORY) { - error("Compulsory data set '%s' not present in the file.", prop.name); - } else { - for (size_t i = 0; i < N; ++i) - memset(prop.field + i * prop.partSize, 0, copySize); - return; - } - } - - /* message("Reading %s '%s' array...", */ /* prop.importance == COMPULSORY ?
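Swapping the dump.h include for a forward declaration in logger.h works because that header only ever names struct dump through pointers; C needs the complete type only where the struct is dereferenced or sized. In general:

struct dump;                              /* incomplete type is enough... */
void logger_log_part(struct part *p, unsigned int mask, size_t *offset,
                     struct dump *dump);  /* ...for a pointer parameter */

Only the .c file that actually calls into the dump API still pulls in the full header, which reduces header coupling and shortens rebuilds.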
"compulsory" : "optional ", */ - /* prop.name); */ +void readArray_chunk(hid_t h_data, hid_t h_plist_id, + const struct io_props props, size_t N, long long offset, + const struct unit_system* internal_units, + const struct unit_system* ic_units) { - /* Open data space in file */ - const hid_t h_data = H5Dopen2(grp, prop.name, H5P_DEFAULT); - if (h_data < 0) error("Error while opening data space '%s'.", prop.name); + const size_t typeSize = io_sizeof_type(props.type); + const size_t copySize = typeSize * props.dimension; + const size_t num_elements = N * props.dimension; - /* Check data type */ - const hid_t h_type = H5Dget_type(h_data); - if (h_type < 0) error("Unable to retrieve data type from the file"); - /* if (!H5Tequal(h_type, hdf5_type(type))) */ - /* error("Non-matching types between the code and the file"); */ + /* Can't handle writes of more than 2GB */ + if (N * props.dimension * typeSize > HDF5_PARALLEL_IO_MAX_BYTES) + error("Dataset too large to be written in one pass!"); /* Allocate temporary buffer */ void* temp = malloc(num_elements * typeSize); @@ -109,10 +88,10 @@ void readArray(hid_t grp, const struct io_props prop, size_t N, /* Prepare information for hyper-slab */ hsize_t shape[2], offsets[2]; int rank; - if (prop.dimension > 1) { + if (props.dimension > 1) { rank = 2; shape[0] = N; - shape[1] = prop.dimension; + shape[1] = props.dimension; offsets[0] = offset; offsets[1] = 0; } else { @@ -130,27 +109,23 @@ void readArray(hid_t grp, const struct io_props prop, size_t N, const hid_t h_filespace = H5Dget_space(h_data); H5Sselect_hyperslab(h_filespace, H5S_SELECT_SET, offsets, NULL, shape, NULL); - /* Set collective reading properties */ - const hid_t h_plist_id = H5Pcreate(H5P_DATASET_XFER); - H5Pset_dxpl_mpio(h_plist_id, H5FD_MPIO_COLLECTIVE); - /* Read HDF5 dataspace in temporary buffer */ /* Dirty version that happens to work for vectors but should be improved */ /* Using HDF5 dataspaces would be better */ - const hid_t h_err = H5Dread(h_data, io_hdf5_type(prop.type), h_memspace, + const hid_t h_err = H5Dread(h_data, io_hdf5_type(props.type), h_memspace, h_filespace, h_plist_id, temp); if (h_err < 0) { - error("Error while reading data array '%s'.", prop.name); + error("Error while reading data array '%s'.", props.name); } /* Unit conversion if necessary */ const double factor = - units_conversion_factor(ic_units, internal_units, prop.units); - if (factor != 1. && exist != 0) { + units_conversion_factor(ic_units, internal_units, props.units); + if (factor != 1.) { /* message("Converting ! factor=%e", factor); */ - if (io_is_double_precision(prop.type)) { + if (io_is_double_precision(props.type)) { double* temp_d = temp; for (size_t i = 0; i < num_elements; ++i) temp_d[i] *= factor; } else { @@ -162,13 +137,98 @@ void readArray(hid_t grp, const struct io_props prop, size_t N, /* Copy temporary buffer to particle data */ char* temp_c = temp; for (size_t i = 0; i < N; ++i) - memcpy(prop.field + i * prop.partSize, &temp_c[i * copySize], copySize); + memcpy(props.field + i * props.partSize, &temp_c[i * copySize], copySize); /* Free and close everything */ free(temp); - H5Pclose(h_plist_id); H5Sclose(h_filespace); H5Sclose(h_memspace); +} + +/** + * @brief Reads a data array from a given HDF5 group. + * + * @param grp The group from which to read. + * @param props The #io_props of the field to read. + * @param N The number of particles on that rank. + * @param N_total The total number of particles. + * @param mpi_rank The MPI rank of this node. 
+ * @param offset The offset in the array on disk for this rank. + * @param internal_units The #unit_system used internally. + * @param ic_units The #unit_system used in the ICs. + */ +void readArray(hid_t grp, struct io_props props, size_t N, long long N_total, + int mpi_rank, long long offset, + const struct unit_system* internal_units, + const struct unit_system* ic_units) { + + const size_t typeSize = io_sizeof_type(props.type); + const size_t copySize = typeSize * props.dimension; + + /* Check whether the dataspace exists or not */ + const htri_t exist = H5Lexists(grp, props.name, 0); + if (exist < 0) { + error("Error while checking the existence of data set '%s'.", props.name); + } else if (exist == 0) { + if (props.importance == COMPULSORY) { + error("Compulsory data set '%s' not present in the file.", props.name); + } else { + for (size_t i = 0; i < N; ++i) + memset(props.field + i * props.partSize, 0, copySize); + return; + } + } + + /* Open data space in file */ + const hid_t h_data = H5Dopen2(grp, props.name, H5P_DEFAULT); + if (h_data < 0) error("Error while opening data space '%s'.", props.name); + + /* Check data type */ + const hid_t h_type = H5Dget_type(h_data); + if (h_type < 0) error("Unable to retrieve data type from the file"); + /* if (!H5Tequal(h_type, hdf5_type(type))) */ + /* error("Non-matching types between the code and the file"); */ + + /* Create property list for collective dataset read. */ + const hid_t h_plist_id = H5Pcreate(H5P_DATASET_XFER); + H5Pset_dxpl_mpio(h_plist_id, H5FD_MPIO_COLLECTIVE); + + /* Given the limitations of ROMIO we will need to read the data in chunks of HDF5_PARALLEL_IO_MAX_BYTES bytes per node until all the nodes are done. */ + char redo = 1; + while (redo) { + + /* Maximal number of particles readable in one chunk */ + const size_t max_chunk_size = + HDF5_PARALLEL_IO_MAX_BYTES / (props.dimension * typeSize); + + /* Read the next chunk */ + const size_t this_chunk = (N > max_chunk_size) ? max_chunk_size : N; + readArray_chunk(h_data, h_plist_id, props, this_chunk, offset, + internal_units, ic_units); + + /* Compute how many items are left */ + if (N > max_chunk_size) { + N -= max_chunk_size; + props.field += max_chunk_size * props.partSize; + offset += max_chunk_size; + redo = 1; + } else { + N = 0; + offset += 0; + redo = 0; + } + + /* Do we need to run again ?
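The shape of this loop matters: collective MPI-IO calls must be made by every rank, so a rank that has drained its own data cannot simply leave the loop while others are still reading. The MAX-reduction over the redo flag keeps all ranks iterating (issuing zero-sized reads if need be) until the rank with the most data is done. Stripped to its skeleton, with read_one_chunk standing in for the readArray_chunk call above:

char redo = 1;
while (redo) {
  /* Read at most max_chunk_size particles in this collective call. */
  const size_t this_chunk = (N > max_chunk_size) ? max_chunk_size : N;
  read_one_chunk(offset, this_chunk);

  /* Advance past what was just read. */
  redo = (N > max_chunk_size);
  if (redo) {
    N -= max_chunk_size;
    offset += max_chunk_size;
  } else {
    N = 0;
  }

  /* Keep every rank in the loop until the slowest one is done. */
  MPI_Allreduce(MPI_IN_PLACE, &redo, 1, MPI_SIGNED_CHAR, MPI_MAX,
                MPI_COMM_WORLD);
}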
*/ + MPI_Allreduce(MPI_IN_PLACE, &redo, 1, MPI_SIGNED_CHAR, MPI_MAX, + MPI_COMM_WORLD); + + if (redo && mpi_rank == 0) + message("Need to redo one iteration for array '%s'", props.name); + } + + /* Close everything */ + H5Pclose(h_plist_id); H5Tclose(h_type); H5Dclose(h_data); } @@ -195,7 +255,6 @@ void writeArray_chunk(struct engine* e, hid_t h_data, hid_t h_plist_id, const struct unit_system* snapshot_units) { const size_t typeSize = io_sizeof_type(props.type); - const size_t copySize = typeSize * props.dimension; const size_t num_elements = N * props.dimension; /* Can't handle writes of more than 2GB */ @@ -206,44 +265,24 @@ void writeArray_chunk(struct engine* e, hid_t h_data, hid_t h_plist_id, /* Allocate temporary buffer */ void* temp = malloc(num_elements * typeSize); - if (temp == NULL) error("Unable to allocate memory for temporary buffer"); - - /* Copy particle data to temporary buffer */ - if (props.convert_part == NULL && - props.convert_gpart == NULL) { /* No conversion */ + if (posix_memalign((void**)&temp, IO_BUFFER_ALIGNMENT, + num_elements * typeSize) != 0) + error("Unable to allocate temporary i/o buffer"); - char* temp_c = temp; - for (size_t i = 0; i < N; ++i) - memcpy(&temp_c[i * copySize], props.field + i * props.partSize, copySize); +#ifdef IO_SPEED_MEASUREMENT + MPI_Barrier(MPI_COMM_WORLD); + ticks tic = getticks(); +#endif - } else if (props.convert_part != NULL) { /* conversion (for parts)*/ + /* Copy the particle data to the temporary buffer */ + io_copy_temp_buffer(temp, e, props, N, internal_units, snapshot_units); - float* temp_f = temp; - for (size_t i = 0; i < N; ++i) - temp_f[i] = props.convert_part(e, &props.parts[i]); - - } else if (props.convert_gpart != NULL) { /* conversion (for gparts)*/ - - float* temp_f = temp; - for (size_t i = 0; i < N; ++i) - temp_f[i] = props.convert_gpart(e, &props.gparts[i]); - } - - /* Unit conversion if necessary */ - const double factor = - units_conversion_factor(internal_units, snapshot_units, props.units); - if (factor != 1.) { - - /* message("Converting ! factor=%e", factor); */ - - if (io_is_double_precision(props.type)) { - double* temp_d = temp; - for (size_t i = 0; i < num_elements; ++i) temp_d[i] *= factor; - } else { - float* temp_f = temp; - for (size_t i = 0; i < num_elements; ++i) temp_f[i] *= factor; - } - } +#ifdef IO_SPEED_MEASUREMENT + MPI_Barrier(MPI_COMM_WORLD); + if(engine_rank == 0) + message("Copying for '%s' took %.3f %s." 
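io_copy_temp_buffer itself is not shown in this patch; given the io_props fields introduced earlier, a plausible sketch of its dispatch (illustrative only, the real helper lives elsewhere in the tree and may differ) is:

/* Sketch: fill the temporary buffer, converting on the fly if needed. */
if (!props.conversion) {
  for (size_t i = 0; i < N; ++i)  /* plain byte copy out of the array */
    memcpy(&temp_c[i * copySize], props.field + i * props.partSize, copySize);
} else if (props.convert_part_f != NULL) {
  for (size_t i = 0; i < N; ++i)  /* float-valued conversion function */
    props.convert_part_f(e, &props.parts[i], &temp_f[i * props.dimension]);
} else if (props.convert_part_d != NULL) {
  for (size_t i = 0; i < N; ++i)  /* double-valued conversion function */
    props.convert_part_d(e, &props.parts[i], &temp_d[i * props.dimension]);
} /* ... likewise for the gpart pointers, then apply the unit factor. */

The switch from malloc to posix_memalign above is related: an IO_BUFFER_ALIGNMENT-aligned buffer can let the MPI-IO layer move the data without an extra re-alignment copy.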
, props.name, + clocks_from_ticks(getticks() - tic), clocks_getunit()); +#endif /* Create data space */ const hid_t h_memspace = H5Screate(H5S_SIMPLE); @@ -286,10 +325,14 @@ void writeArray_chunk(struct engine* e, hid_t h_data, hid_t h_plist_id, } /* message("Writing %lld '%s', %zd elements = %zd bytes (int=%d) at offset - * %zd", */ - /* N, props.name, N * props.dimension, N * props.dimension * typeSize, */ + * %zd", N, props.name, N * props.dimension, N * props.dimension * typeSize, */ /* (int)(N * props.dimension * typeSize), offset); */ +#ifdef IO_SPEED_MEASUREMENT + MPI_Barrier(MPI_COMM_WORLD); + tic = getticks(); +#endif + /* Write temporary buffer to HDF5 dataspace */ h_err = H5Dwrite(h_data, io_hdf5_type(props.type), h_memspace, h_filespace, h_plist_id, temp); @@ -297,6 +340,18 @@ void writeArray_chunk(struct engine* e, hid_t h_data, hid_t h_plist_id, error("Error while writing data array '%s'.", props.name); } +#ifdef IO_SPEED_MEASUREMENT + MPI_Barrier(MPI_COMM_WORLD); + ticks toc = getticks(); + float ms = clocks_from_ticks(toc - tic); + int megaBytes = N * props.dimension * typeSize / (1024 * 1024); + int total = 0; + MPI_Reduce(&megaBytes, &total, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); + if (engine_rank == 0) + message("H5Dwrite for '%s' (%d MB) took %.3f %s (speed = %f MB/s).", + props.name, total, ms, clocks_getunit(), total / (ms / 1000.)); +#endif + /* Free and close everything */ free(temp); H5Sclose(h_memspace); @@ -328,6 +383,10 @@ void writeArray(struct engine* e, hid_t grp, char* fileName, FILE* xmfFile, const size_t typeSize = io_sizeof_type(props.type); +#ifdef IO_SPEED_MEASUREMENT + const ticks tic = getticks(); +#endif + /* Work out properties of the array in the file */ int rank; hsize_t shape_total[2]; @@ -440,6 +499,13 @@ void writeArray(struct engine* e, hid_t grp, char* fileName, FILE* xmfFile, H5Pclose(h_prop); H5Dclose(h_data); H5Pclose(h_plist_id); + +#ifdef IO_SPEED_MEASUREMENT + MPI_Barrier(MPI_COMM_WORLD); + if(engine_rank == 0) + message("'%s' took %.3f %s." , props.name, + clocks_from_ticks(getticks() - tic), clocks_getunit()); +#endif } /** @@ -560,7 +626,7 @@ void read_ic_parallel(char* fileName, const struct unit_system* internal_units, /* Read the unit system used in the ICs */ struct unit_system* ic_units = malloc(sizeof(struct unit_system)); if (ic_units == NULL) error("Unable to allocate memory for IC unit system"); - io_read_unit_system(h_file, ic_units); + io_read_unit_system(h_file, ic_units, mpi_rank); /* Tell the user if a conversion will be needed */ if (mpi_rank == 0) { @@ -678,14 +744,16 @@ void read_ic_parallel(char* fileName, const struct unit_system* internal_units, break; default: - message("Particle Type %d not yet supported. Particles ignored", ptype); + if (mpi_rank == 0) + message("Particle Type %d not yet supported. 
Particles ignored", + ptype); } /* Read everything */ if (!dry_run) for (int i = 0; i < num_fields; ++i) - readArray(h_grp, list[i], Nparticles, N_total[ptype], offset[ptype], - internal_units, ic_units); + readArray(h_grp, list[i], Nparticles, N_total[ptype], mpi_rank, + offset[ptype], internal_units, ic_units); /* Close particle group */ H5Gclose(h_grp); @@ -768,16 +836,42 @@ void write_output_parallel(struct engine* e, const char* baseName, /* Prepare the XMF file for the new entry */ if (mpi_rank == 0) xmfFile = xmf_prepare_file(baseName); - /* Open HDF5 file */ + /* Prepare some file-access properties */ hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); - H5Pset_fapl_mpio(plist_id, comm, info); + + /* Set some MPI-IO parameters */ + // MPI_Info_set(info, "IBM_largeblock_io", "true"); + MPI_Info_set(info, "romio_cb_write", "enable"); + MPI_Info_set(info, "romio_ds_write", "disable"); + + /* Activate parallel i/o */ + hid_t h_err = H5Pset_fapl_mpio(plist_id, comm, info); + if (h_err < 0) error("Error setting parallel i/o"); + + /* Align on 4k pages. */ + h_err = H5Pset_alignment(plist_id, 1024, 4096); + if (h_err < 0) error("Error setting Hdf5 alignment"); + + /* Disable meta-data cache eviction */ + H5AC_cache_config_t mdc_config; + mdc_config.version = H5AC__CURR_CACHE_CONFIG_VERSION; + h_err = H5Pget_mdc_config(plist_id, &mdc_config); + if (h_err < 0) error("Error getting the MDC config"); + + mdc_config.evictions_enabled = 0; /* false */ + mdc_config.incr_mode = H5C_incr__off; + mdc_config.decr_mode = H5C_decr__off; + mdc_config.flash_incr_mode = H5C_flash_incr__off; + h_err = H5Pset_mdc_config(plist_id, &mdc_config); + if (h_err < 0) error("Error setting the MDC config"); + + /* Open HDF5 file with the chosen parameters */ h_file = H5Fcreate(fileName, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); if (h_file < 0) { error("Error while opening file '%s'.", fileName); } - /* Compute offset in the file and total number of - * particles */ + /* Compute offset in the file and total number of particles */ size_t N[swift_type_count] = {Ngas, Ndm, 0, 0, Nstars, 0}; long long N_total[swift_type_count] = {0}; long long offset[swift_type_count] = {0}; @@ -790,8 +884,7 @@ void write_output_parallel(struct engine* e, const char* baseName, MPI_Bcast(&N_total, 6, MPI_LONG_LONG_INT, mpi_size - 1, comm); /* Now everybody konws its offset and the total number of - * particles of each - * type */ + * particles of each type */ /* Write the part of the XMF file corresponding to this * specific output */ diff --git a/src/partition.c b/src/partition.c index d9aade781e824defedb05f5a201fec8401da7ee2..51231b5c01e7fd0d3d8d689d5f41fa1017a85537 100644 --- a/src/partition.c +++ b/src/partition.c @@ -63,10 +63,15 @@ const char *initial_partition_name[] = { /* Simple descriptions of repartition types for reports. */ const char *repartition_name[] = { - "no", "METIS edge and vertex time weighted cells", - "METIS particle count vertex weighted cells", - "METIS time edge weighted cells", - "METIS particle count vertex and time edge cells"}; + "no", + "METIS edge and vertex task cost weights", + "METIS particle count vertex weights", + "METIS task cost edge weights", + "METIS particle count vertex and task cost edge weights", + "METIS vertex task costs and edge delta timebin weights", + "METIS particle count vertex and edge delta timebin weights", + "METIS edge delta timebin weights", +}; /* Local functions, if needed. 
*/ static int check_complete(struct space *s, int verbose, int nregions); @@ -236,14 +241,14 @@ static void graph_init_metis(struct space *s, idx_t *adjncy, idx_t *xadj) { * @param counts the number of particles per cell. Should be * allocated as size s->nr_cells. */ -static void accumulate_counts(struct space *s, int *counts) { +static void accumulate_counts(struct space *s, double *counts) { struct part *parts = s->parts; int *cdim = s->cdim; double iwidth[3] = {s->iwidth[0], s->iwidth[1], s->iwidth[2]}; double dim[3] = {s->dim[0], s->dim[1], s->dim[2]}; - bzero(counts, sizeof(int) * s->nr_cells); + bzero(counts, sizeof(double) * s->nr_cells); for (size_t k = 0; k < s->nr_parts; k++) { for (int j = 0; j < 3; j++) { @@ -274,6 +279,9 @@ static void accumulate_counts(struct space *s, int *counts) { static void split_metis(struct space *s, int nregions, int *celllist) { for (int i = 0; i < s->nr_cells; i++) s->cells_top[i].nodeID = celllist[i]; + + /* To check or visualise the partition dump all the cells. */ + /* dumpCellRanks("metis_partition", s->cells_top, s->nr_cells); */ } #endif @@ -300,15 +308,16 @@ static int indexvalcmp(const void *p1, const void *p2) { * @param s the space of cells to partition. * @param nregions the number of regions required in the partition. * @param vertexw weights for the cells, sizeof number of cells if used, - * NULL for unit weights. + * NULL for unit weights. Need to be in the range of idx_t. * @param edgew weights for the graph edges between all cells, sizeof number * of cells * 26 if used, NULL for unit weights. Need to be packed - * in CSR format, so same as adjncy array. + * in CSR format, so same as adjncy array. Need to be in the range of + * idx_t. * @param celllist on exit this contains the ids of the selected regions, * sizeof number of cells. */ -static void pick_metis(struct space *s, int nregions, int *vertexw, int *edgew, - int *celllist) { +static void pick_metis(struct space *s, int nregions, double *vertexw, + double *edgew, int *celllist) { /* Total number of cells. */ int ncells = s->cdim[0] * s->cdim[1] * s->cdim[2]; @@ -345,23 +354,56 @@ static void pick_metis(struct space *s, int nregions, int *vertexw, int *edgew, /* Init the vertex weights array. */ if (vertexw != NULL) { for (int k = 0; k < ncells; k++) { - if (vertexw[k] > 0) { + if (vertexw[k] > 1) { weights_v[k] = vertexw[k]; } else { weights_v[k] = 1; } } + +#ifdef SWIFT_DEBUG_CHECKS + /* Check weights are all in range. */ + int failed = 0; + for (int k = 0; k < ncells; k++) { + if ((idx_t)vertexw[k] < 0) { + message("Input vertex weight out of range: %ld", (long)vertexw[k]); + failed++; + } + if (weights_v[k] < 1) { + message("Used vertex weight out of range: %" PRIDX, weights_v[k]); + failed++; + } + } + if (failed > 0) error("%d vertex weights are out of range", failed); +#endif } /* Init the edges weights array. */ if (edgew != NULL) { for (int k = 0; k < 26 * ncells; k++) { - if (edgew[k] > 0) { + if (edgew[k] > 1) { weights_e[k] = edgew[k]; } else { weights_e[k] = 1; } } + +#ifdef SWIFT_DEBUG_CHECKS + /* Check weights are all in range. */ + int failed = 0; + for (int k = 0; k < 26 * ncells; k++) { + + if ((idx_t)edgew[k] < 0) { + message("Input edge weight out of range: %ld", (long)edgew[k]); + failed++; + } + if (weights_e[k] < 1) { + message("Used edge weight out of range: %" PRIDX, weights_e[k]); + failed++; + } + } + if (failed > 0) error("%d edge weights are out of range", failed); +#endif } /* Set the METIS options.
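Accumulating the weights in double and narrowing only here defers any range problem to one well-checked spot: METIS's idx_t is a 32- or 64-bit signed integer depending on how the library was configured, so a large run can genuinely overflow it. The clamps above keep small weights legal (METIS requires weights of at least 1) and the debug blocks make overflow loud rather than silent; a saturating version of the narrowing cast might look like this (sketch only, assuming metis.h's IDX_MAX):

/* Saturate instead of overflowing when narrowing double -> idx_t. */
weights_v[k] = (vertexw[k] > (double)IDX_MAX) ? IDX_MAX : (idx_t)vertexw[k];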
*/ @@ -380,8 +422,8 @@ static void pick_metis(struct space *s, int nregions, int *vertexw, int *edgew, idx_t objval; /* Dump graph in METIS format */ - /* dumpMETISGraph("metis_graph", idx_ncells, one, xadj, adjncy, - * weights_v, NULL, weights_e); + /*dumpMETISGraph("metis_graph", idx_ncells, one, xadj, adjncy, + * weights_v, NULL, weights_e); */ if (METIS_PartGraphKway(&idx_ncells, &one, xadj, adjncy, weights_v, NULL, weights_e, &idx_nregions, NULL, NULL, options, @@ -465,31 +507,28 @@ static void pick_metis(struct space *s, int nregions, int *vertexw, int *edgew, #if defined(WITH_MPI) && defined(HAVE_METIS) /** - * @brief Repartition the cells amongst the nodes using task timings - * as edge weights and vertex weights also from task timings + * @brief Repartition the cells amongst the nodes using task costs + * as edge weights and vertex weights also from task costs * or particle cells counts. * * @param partweights whether particle counts will be used as vertex weights. * @param bothweights whether vertex and edge weights will be used, otherwise * only edge weights will be used. + * @param timebins use timebins as edge weights. * @param nodeID our nodeID. * @param nr_nodes the number of nodes. * @param s the space of cells holding our local particles. * @param tasks the completed tasks from the last engine step for our node. * @param nr_tasks the number of tasks. */ -static void repart_edge_metis(int partweights, int bothweights, int nodeID, - int nr_nodes, struct space *s, struct task *tasks, - int nr_tasks) { +static void repart_edge_metis(int partweights, int bothweights, int timebins, + int nodeID, int nr_nodes, struct space *s, + struct task *tasks, int nr_tasks) { /* Create weight arrays using task ticks for vertices and edges (edges * assume the same graph structure as used in the part_ calls). */ int nr_cells = s->nr_cells; struct cell *cells = s->cells_top; - float wscale = 1.f, wscale_buff = 0.0; - int wtot = 0; - int wmax = 1e9 / nr_nodes; - int wmin; /* Allocate and fill the adjncy indexing array defining the graph of * cells. */ @@ -499,16 +538,16 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID, graph_init_metis(s, inds, NULL); /* Allocate and init weights. */ - int *weights_v = NULL; - int *weights_e = NULL; + double *weights_v = NULL; + double *weights_e = NULL; if (bothweights) { - if ((weights_v = (int *)malloc(sizeof(int) * nr_cells)) == NULL) + if ((weights_v = (double *)malloc(sizeof(double) * nr_cells)) == NULL) error("Failed to allocate vertex weights arrays."); - bzero(weights_v, sizeof(int) * nr_cells); + bzero(weights_v, sizeof(double) * nr_cells); } - if ((weights_e = (int *)malloc(sizeof(int) * 26 * nr_cells)) == NULL) + if ((weights_e = (double *)malloc(sizeof(double) * 26 * nr_cells)) == NULL) error("Failed to allocate edge weights arrays."); - bzero(weights_e, sizeof(int) * 26 * nr_cells); + bzero(weights_e, sizeof(double) * 26 * nr_cells); /* Generate task weights for vertices. */ int taskvweights = (bothweights && !partweights); @@ -521,19 +560,8 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID, /* Skip un-interesting tasks. */ if (t->cost == 0) continue; - /* Get the task weight. */ - int w = t->cost * wscale; - - /* Do we need to re-scale? 
*/ - wtot += w; - while (wtot > wmax) { - wscale /= 2; - wtot /= 2; - w /= 2; - for (int k = 0; k < 26 * nr_cells; k++) weights_e[k] *= 0.5; - if (taskvweights) - for (int k = 0; k < nr_cells; k++) weights_v[k] *= 0.5; - } + /* Get the task weight based on costs. */ + double w = (double)t->cost; /* Get the top-level cells involved. */ struct cell *ci, *cj; @@ -552,9 +580,9 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID, if (t->type == task_type_ghost || t->type == task_type_kick1 || t->type == task_type_kick2 || t->type == task_type_timestep || t->type == task_type_drift_part || t->type == task_type_drift_gpart) { + /* Particle updates add only to vertex weight. */ if (taskvweights) weights_v[cid] += w; - } /* Self interaction? */ @@ -575,64 +603,72 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID, } - /* Distinct cells with local ci? */ - else if (ci->nodeID == nodeID) { + /* Distinct cells. */ + else { /* Index of the jth cell. */ int cjd = cj - cells; - /* Add half of weight to each cell. */ - if (taskvweights) { - if (ci->nodeID == nodeID) weights_v[cid] += 0.5 * w; + /* Local cells add weight to vertices. */ + if (taskvweights && ci->nodeID == nodeID) { + weights_v[cid] += 0.5 * w; if (cj->nodeID == nodeID) weights_v[cjd] += 0.5 * w; } - /* Add weights to edge. */ - int kk; - for (kk = 26 * cid; inds[kk] != cjd; kk++) - ; - weights_e[kk] += w; - for (kk = 26 * cjd; inds[kk] != cid; kk++) - ; - weights_e[kk] += w; + if (timebins) { + /* Weight the edge by when the two cells will next be updated: a + * cell in a low time bin is active again soon, so its 2^dt term + * below is exponentially large and METIS will avoid cutting the + * edges that touch it. Weight is added for both the local and the + * remote cell to keep both away from any cut. Since dt can exceed + * 31 these values overflow int, hence the 64-bit shifts and the + * accumulation in double. */ + int dti = num_time_bins - get_time_bin(ci->ti_end_min); + int dtj = num_time_bins - get_time_bin(cj->ti_end_min); + double dt = (double)(1ULL << dti) + (double)(1ULL << dtj); + + /* ci */ + int kk; + for (kk = 26 * cid; inds[kk] != cjd; kk++) + ; + weights_e[kk] += dt; + + /* cj */ + for (kk = 26 * cjd; inds[kk] != cid; kk++) + ; + weights_e[kk] += dt; + } else { + + /* Add weights from task costs to the edge. */ + + /* ci */ + int kk; + for (kk = 26 * cid; inds[kk] != cjd; kk++) + ; + weights_e[kk] += w; + + /* cj */ + for (kk = 26 * cjd; inds[kk] != cid; kk++) + ; + weights_e[kk] += w; + } } } } /* Re-calculate the vertices if using particle counts. */ - if (partweights && bothweights) { - accumulate_counts(s, weights_v); - - /* Rescale to balance times. */ - float vwscale = (float)wtot / (float)nr_tasks; - for (int k = 0; k < nr_cells; k++) { - weights_v[k] *= vwscale; - } - } - - /* Get the minimum scaling and re-scale if necessary. */ - int res; - if ((res = MPI_Allreduce(&wscale, &wscale_buff, 1, MPI_FLOAT, MPI_MIN, - MPI_COMM_WORLD)) != MPI_SUCCESS) - mpi_error(res, "Failed to allreduce the weight scales."); - - if (wscale_buff != wscale) { - float scale = wscale_buff / wscale; - for (int k = 0; k < 26 * nr_cells; k++) weights_e[k] *= scale; - if (bothweights) - for (int k = 0; k < nr_cells; k++) weights_v[k] *= scale; - } + if (partweights && bothweights) accumulate_counts(s, weights_v); /* Merge the weights arrays across all nodes. */ + int res; if (bothweights) { if ((res = MPI_Reduce((nodeID == 0) ?
MPI_IN_PLACE : weights_v, weights_v, - nr_cells, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD)) != + nr_cells, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD)) != MPI_SUCCESS) mpi_error(res, "Failed to allreduce vertex weights."); } if ((res = MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_e, weights_e, - 26 * nr_cells, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD)) != - MPI_SUCCESS) + 26 * nr_cells, MPI_DOUBLE, MPI_SUM, 0, + MPI_COMM_WORLD)) != MPI_SUCCESS) mpi_error(res, "Failed to allreduce edge weights."); /* Allocate cell list for the partition. */ @@ -641,39 +677,72 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID, /* As of here, only one node needs to compute the partition. */ if (nodeID == 0) { - /* Final rescale of all weights to avoid a large range. Large ranges - * have been seen to cause an incomplete graph. */ - wmin = wmax; - wmax = 0; - for (int k = 0; k < 26 * nr_cells; k++) { - wmax = weights_e[k] > wmax ? weights_e[k] : wmax; - wmin = weights_e[k] < wmin ? weights_e[k] : wmin; - } + + /* We need to rescale the weights into the range of an integer for METIS + * (really range of idx_t). Also we would like the range of vertex and + * edges weights to be similar so they balance. */ + double wminv = 0.0; + double wmaxv = 0.0; if (bothweights) { + wminv = weights_v[0]; + wmaxv = weights_v[0]; for (int k = 0; k < nr_cells; k++) { - wmax = weights_v[k] > wmax ? weights_v[k] : wmax; - wmin = weights_v[k] < wmin ? weights_v[k] : wmin; + wmaxv = weights_v[k] > wmaxv ? weights_v[k] : wmaxv; + wminv = weights_v[k] < wminv ? weights_v[k] : wminv; } } - if ((wmax - wmin) > metis_maxweight) { - wscale = metis_maxweight / (wmax - wmin); - for (int k = 0; k < 26 * nr_cells; k++) { - weights_e[k] = (weights_e[k] - wmin) * wscale + 1; - } - if (bothweights) { + double wmine = weights_e[0]; + double wmaxe = weights_e[0]; + for (int k = 0; k < 26 * nr_cells; k++) { + wmaxe = weights_e[k] > wmaxe ? weights_e[k] : wmaxe; + wmine = weights_e[k] < wmine ? weights_e[k] : wmine; + } + + if (bothweights) { + + /* Make range the same in both weights systems. */ + if ((wmaxv - wminv) > (wmaxe - wmine)) { + double wscale = 1.0; + if ((wmaxe - wmine) > 0.0) { + wscale = (wmaxv - wminv) / (wmaxe - wmine); + } + for (int k = 0; k < 26 * nr_cells; k++) { + weights_e[k] = (weights_e[k] - wmine) * wscale + wminv; + } + wmine = wminv; + wmaxe = wmaxv; + + } else { + double wscale = 1.0; + if ((wmaxv - wminv) > 0.0) { + wscale = (wmaxe - wmine) / (wmaxv - wminv); + } for (int k = 0; k < nr_cells; k++) { - weights_v[k] = (weights_v[k] - wmin) * wscale + 1; + weights_v[k] = (weights_v[k] - wminv) * wscale + wmine; } + wminv = wmine; + wmaxv = wmaxe; + } + + /* Scale to the METIS range. */ + double wscale = 1.0; + if ((wmaxv - wminv) > 0.0) { + wscale = (metis_maxweight - 1.0) / (wmaxv - wminv); + } + for (int k = 0; k < nr_cells; k++) { + weights_v[k] = (weights_v[k] - wminv) * wscale + 1.0; } } - /* Make sure there are no zero weights. */ - for (int k = 0; k < 26 * nr_cells; k++) - if (weights_e[k] == 0) weights_e[k] = 1; - if (bothweights) - for (int k = 0; k < nr_cells; k++) - if (weights_v[k] == 0) weights_v[k] = 1; + /* Scale to the METIS range. */ + double wscale = 1.0; + if ((wmaxe - wmine) > 0.0) { + wscale = (metis_maxweight - 1.0) / (wmaxe - wmine); + } + for (int k = 0; k < 26 * nr_cells; k++) { + weights_e[k] = (weights_e[k] - wmine) * wscale + 1.0; + } /* And partition, use both weights or not as requested. 
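Spelled out, the mapping applied to each weight array at the end of this block is the affine transform

  w' = (w - wmin) * (metis_maxweight - 1) / (wmax - wmin) + 1

which lands every weight in [1, metis_maxweight]: the +1 keeps zero weights (which METIS rejects) out of the graph, and equalising the vertex and edge ranges beforehand stops one kind of weight from drowning out the other when METIS balances both objectives at once.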
*/ if (bothweights) @@ -736,8 +805,8 @@ static void repart_vertex_metis(struct space *s, int nodeID, int nr_nodes) { /* Use particle counts as vertex weights. */ /* Space for particles per cell counts, which will be used as weights. */ - int *weights = NULL; - if ((weights = (int *)malloc(sizeof(int) * s->nr_cells)) == NULL) + double *weights = NULL; + if ((weights = (double *)malloc(sizeof(double) * s->nr_cells)) == NULL) error("Failed to allocate weights buffer."); /* Check each particle and accumulate the counts per cell. */ @@ -745,8 +814,8 @@ static void repart_vertex_metis(struct space *s, int nodeID, int nr_nodes) { /* Get all the counts from all the nodes. */ int res; - if ((res = MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_INT, MPI_SUM, - MPI_COMM_WORLD)) != MPI_SUCCESS) + if ((res = MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_DOUBLE, + MPI_SUM, MPI_COMM_WORLD)) != MPI_SUCCESS) mpi_error(res, "Failed to allreduce particle cell weights."); /* Main node does the partition calculation. */ @@ -787,35 +856,32 @@ void partition_repartition(struct repartition *reparttype, int nodeID, #if defined(WITH_MPI) && defined(HAVE_METIS) - if (reparttype->type == REPART_METIS_BOTH || - reparttype->type == REPART_METIS_EDGE || - reparttype->type == REPART_METIS_VERTEX_EDGE) { + if (reparttype->type == REPART_METIS_VERTEX_COSTS_EDGE_COSTS) { + repart_edge_metis(0, 1, 0, nodeID, nr_nodes, s, tasks, nr_tasks); - int partweights; - int bothweights; - if (reparttype->type == REPART_METIS_VERTEX_EDGE) - partweights = 1; - else - partweights = 0; + } else if (reparttype->type == REPART_METIS_EDGE_COSTS) { + repart_edge_metis(0, 0, 0, nodeID, nr_nodes, s, tasks, nr_tasks); - if (reparttype->type == REPART_METIS_BOTH) - bothweights = 1; - else - bothweights = 0; + } else if (reparttype->type == REPART_METIS_VERTEX_COUNTS_EDGE_COSTS) { + repart_edge_metis(1, 1, 0, nodeID, nr_nodes, s, tasks, nr_tasks); - repart_edge_metis(partweights, bothweights, nodeID, nr_nodes, s, tasks, - nr_tasks); + } else if (reparttype->type == REPART_METIS_VERTEX_COSTS_EDGE_TIMEBINS) { + repart_edge_metis(0, 1, 1, nodeID, nr_nodes, s, tasks, nr_tasks); - } else if (reparttype->type == REPART_METIS_VERTEX) { + } else if (reparttype->type == REPART_METIS_VERTEX_COUNTS_EDGE_TIMEBINS) { + repart_edge_metis(1, 1, 1, nodeID, nr_nodes, s, tasks, nr_tasks); + } else if (reparttype->type == REPART_METIS_EDGE_TIMEBINS) { + repart_edge_metis(0, 0, 1, nodeID, nr_nodes, s, tasks, nr_tasks); + + } else if (reparttype->type == REPART_METIS_VERTEX_COUNTS) { repart_vertex_metis(s, nodeID, nr_nodes); } else if (reparttype->type == REPART_NONE) { - /* Doing nothing. */ } else { - error("Unknown repartition type"); + error("Impossible repartition type"); } #else error("SWIFT was not compiled with METIS support."); @@ -884,17 +950,17 @@ void partition_initial_partition(struct partition *initial_partition, /* Space for particles per cell counts, which will be used as weights or * not. */ - int *weights = NULL; + double *weights = NULL; if (initial_partition->type == INITPART_METIS_WEIGHT) { - if ((weights = (int *)malloc(sizeof(int) * s->nr_cells)) == NULL) + if ((weights = (double *)malloc(sizeof(double) * s->nr_cells)) == NULL) error("Failed to allocate weights buffer."); - bzero(weights, sizeof(int) * s->nr_cells); + bzero(weights, sizeof(double) * s->nr_cells); /* Check each particle and accumulate the counts per cell. */ accumulate_counts(s, weights); /* Get all the counts from all the nodes.
*/ - if (MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_INT, MPI_SUM, + if (MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS) error("Failed to allreduce particle cell weights."); } @@ -969,10 +1035,10 @@ void partition_init(struct partition *partition, /* Defaults make use of METIS if available */ #ifdef HAVE_METIS - const char *default_repart = "task_weights"; + const char *default_repart = "costs/costs"; const char *default_part = "simple_metis"; #else - const char *default_repart = "none"; + const char *default_repart = "none/none"; const char *default_part = "grid"; #endif @@ -1029,32 +1095,40 @@ void partition_init(struct partition *partition, parser_get_opt_param_string(params, "DomainDecomposition:repartition_type", part_type, default_repart); - switch (part_type[0]) { - case 'n': - repartition->type = REPART_NONE; - break; + if (strcmp("none/none", part_type) == 0) { + repartition->type = REPART_NONE; + #ifdef HAVE_METIS - case 't': - repartition->type = REPART_METIS_BOTH; - break; - case 'e': - repartition->type = REPART_METIS_EDGE; - break; - case 'p': - repartition->type = REPART_METIS_VERTEX; - break; - case 'h': - repartition->type = REPART_METIS_VERTEX_EDGE; - break; - default: - message("Invalid choice of re-partition type '%s'.", part_type); - error( - "Permitted values are: 'none', 'task_weights', 'particle_weights'," - "'edge_task_weights' or 'hybrid_weights'"); + } else if (strcmp("costs/costs", part_type) == 0) { + repartition->type = REPART_METIS_VERTEX_COSTS_EDGE_COSTS; + + } else if (strcmp("counts/none", part_type) == 0) { + repartition->type = REPART_METIS_VERTEX_COUNTS; + + } else if (strcmp("none/costs", part_type) == 0) { + repartition->type = REPART_METIS_EDGE_COSTS; + + } else if (strcmp("counts/costs", part_type) == 0) { + repartition->type = REPART_METIS_VERTEX_COUNTS_EDGE_COSTS; + + } else if (strcmp("costs/time", part_type) == 0) { + repartition->type = REPART_METIS_VERTEX_COSTS_EDGE_TIMEBINS; + + } else if (strcmp("counts/time", part_type) == 0) { + repartition->type = REPART_METIS_VERTEX_COUNTS_EDGE_TIMEBINS; + + } else if (strcmp("none/time", part_type) == 0) { + repartition->type = REPART_METIS_EDGE_TIMEBINS; + } else { + message("Invalid choice of re-partition type '%s'.", part_type); + error( + "Permitted values are: 'none/none', 'costs/costs'," + "'counts/none', 'none/costs', 'counts/costs', " + "'costs/time', 'counts/time' or 'none/time'"); #else - default: - message("Invalid choice of re-partition type '%s'.", part_type); - error("Permitted values are: 'none' when compiled without METIS."); + } else { + message("Invalid choice of re-partition type '%s'.", part_type); + error("Permitted values are: 'none/none' when compiled without METIS."); #endif } diff --git a/src/partition.h b/src/partition.h index 03523b165a930b085224e458ac0dd8c8232a578d..c3eade190c9514efb4c44011e3990745e20846fd 100644 --- a/src/partition.h +++ b/src/partition.h @@ -43,10 +43,13 @@ struct partition { /* Repartition type to use. */ enum repartition_type { REPART_NONE = 0, - REPART_METIS_BOTH, - REPART_METIS_VERTEX, - REPART_METIS_EDGE, - REPART_METIS_VERTEX_EDGE + REPART_METIS_VERTEX_COSTS_EDGE_COSTS, + REPART_METIS_VERTEX_COUNTS, + REPART_METIS_EDGE_COSTS, + REPART_METIS_VERTEX_COUNTS_EDGE_COSTS, + REPART_METIS_VERTEX_COSTS_EDGE_TIMEBINS, + REPART_METIS_VERTEX_COUNTS_EDGE_TIMEBINS, + REPART_METIS_EDGE_TIMEBINS }; /* Repartition preferences. 
*/ diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index 8e5a9e609c02b1697c6e8a052127d0a60e88961c..39647a2bb1f4c76914a946e49138519983639134 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -26,7 +26,8 @@ /* Local headers. */ #include "active.h" -#ifdef WITH_VECTORIZATION +#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) + static const vector kernel_gamma2_vec = FILL_VEC(kernel_gamma2); /** @@ -384,8 +385,7 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache( * @param rshift cutoff shift * @param hi_max_raw Maximal smoothing length in cell ci * @param hj_max_raw Maximal smoothing length in cell cj - * @param hi_max Maximal smoothing length in cell ci scaled by kernel_gamma - * @param hj_max Maximal smoothing length in cell cj scaled by kernel_gamma + * @param h_max Maximal smoothing length in both cells scaled by kernel_gamma * @param di_max Maximal position on the axis that can interact in cell ci * @param dj_min Minimal position on the axis that can interact in cell ci * @param max_index_i array to hold the maximum distances of pi particles into @@ -402,9 +402,8 @@ populate_max_index_no_cache_force(const struct cell *ci, const struct cell *cj, const struct entry *restrict sort_j, const float dx_max, const float rshift, const double hi_max_raw, - const double hj_max_raw, const double hi_max, - const double hj_max, const double di_max, - const double dj_min, int *max_index_i, + const double hj_max_raw, const double h_max, + const double di_max, const double dj_min, int *max_index_i, int *max_index_j, int *init_pi, int *init_pj, const timebin_t max_active_bin) { @@ -419,7 +418,7 @@ populate_max_index_no_cache_force(const struct cell *ci, const struct cell *cj, first_pi = ci->count; int active_id = first_pi - 1; while (first_pi > 0 && - sort_i[first_pi - 1].d + dx_max + max(hi_max, hj_max) > dj_min) { + sort_i[first_pi - 1].d + dx_max + h_max - rshift > dj_min) { first_pi--; /* Store the index of the particle if it is active. */ if (part_is_active_no_debug(&parts_i[sort_i[first_pi].i], max_active_bin)) @@ -469,7 +468,7 @@ populate_max_index_no_cache_force(const struct cell *ci, const struct cell *cj, last_pj = -1; active_id = last_pj; while (last_pj < cj->count && - sort_j[last_pj + 1].d - max(hj_max, hi_max) - dx_max < di_max) { + sort_j[last_pj + 1].d - h_max - dx_max < di_max) { last_pj++; /* Store the index of the particle if it is active. 
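The loop bounds in these helpers come from a one-dimensional overlap test on the pair's sort axis: a particle of ci can possibly interact with cj when

  sort_i[i].d + dx_max + h_max - rshift > dj_min

i.e. its sorted position, extended by the maximal drift since the sort (dx_max) and by the largest compact-support radius in either cell (h_max = kernel_gamma * max(hi_max_raw, hj_max_raw)), reaches past the first particle of cj once the cell shift is accounted for. Using one symmetric h_max is what lets the old hi_max/hj_max argument pair collapse into a single parameter, and moving -rshift out of hi_max into the comparison makes the shift correction explicit instead of baked into one of the two bounds.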
*/ if (part_is_active_no_debug(&parts_j[sort_j[last_pj].i], max_active_bin)) @@ -515,7 +514,7 @@ populate_max_index_no_cache_force(const struct cell *ci, const struct cell *cj, *init_pj = last_pj; } -#endif /* WITH_VECTORIZATION */ +#endif /* WITH_VECTORIZATION && GADGET2_SPH */ /** * @brief Compute the cell self-interaction (non-symmetric) using vector @@ -527,7 +526,8 @@ populate_max_index_no_cache_force(const struct cell *ci, const struct cell *cj, __attribute__((always_inline)) INLINE void runner_doself1_density_vec( struct runner *r, struct cell *restrict c) { -#ifdef WITH_VECTORIZATION +#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) + /* Get some local variables */ const struct engine *e = r->e; const timebin_t max_active_bin = e->max_active_bin; @@ -541,20 +541,25 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( if (!cell_are_part_drifted(c, e)) error("Interacting undrifted cell."); +#ifdef SWIFT_DEBUG_CHECKS + for (int i = 0; i < count; i++) { + /* Check that particles have been drifted to the current time */ + if (parts[i].ti_drift != e->ti_current) + error("Particle pi not drifted to current time"); + } +#endif + /* Get the particle cache from the runner and re-allocate * the cache if it is not big enough for the cell. */ struct cache *restrict cell_cache = &r->ci_cache; - if (cell_cache->count < count) { - cache_init(cell_cache, count); - } + if (cell_cache->count < count) cache_init(cell_cache, count); /* Read the particles from the cell and store them locally in the cache. */ cache_read_particles(c, cell_cache); /* Create secondary cache to store particle interactions. */ struct c2_cache int_cache; - int icount = 0, icount_align = 0; /* Loop over the particles in the cell. */ for (int pid = 0; pid < count; pid++) { @@ -607,6 +612,10 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( } } + /* The number of interactions for pi and the padded version of it to + * make it a multiple of VEC_SIZE. */ + int icount = 0, icount_align = 0; + /* Find all of particle pi's interacions and store needed values in the * secondary cache.*/ for (int pjd = 0; pjd < count_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) { @@ -727,6 +736,11 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( } /* loop over all particles. */ TIMER_TOC(timer_doself_density); + +#else + + error("Incorrectly calling vectorized Gadget-2 functions!"); + #endif /* WITH_VECTORIZATION */ } @@ -744,25 +758,23 @@ __attribute__((always_inline)) INLINE void runner_doself_subset_density_vec( struct runner *r, struct cell *restrict c, struct part *restrict parts, int *restrict ind, int pi_count) { -#ifdef WITH_VECTORIZATION +#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) + const int count = c->count; - TIMER_TIC + TIMER_TIC; /* Get the particle cache from the runner and re-allocate * the cache if it is not big enough for the cell. */ struct cache *restrict cell_cache = &r->ci_cache; - if (cell_cache->count < count) { - cache_init(cell_cache, count); - } + if (cell_cache->count < count) cache_init(cell_cache, count); /* Read the particles from the cell and store them locally in the cache. */ cache_read_particles(c, cell_cache); /* Create secondary cache to store particle interactions. */ struct c2_cache int_cache; - int icount = 0, icount_align = 0; /* Loop over the subset of particles in the parts that need updating. 
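Declaring icount inside the particle loop (rather than at function scope) makes it clear that each pi starts a fresh interaction list. Its aligned twin exists because the secondary cache is consumed whole vectors at a time, so the tail of the list is padded up to a VEC_SIZE multiple, along these lines:

/* Round the interaction count up to a full vector; the pad entries are
 * filled with values that contribute nothing to the sums. */
icount_align = icount;
const int rem = icount % VEC_SIZE;
if (rem != 0) icount_align += VEC_SIZE - rem;

The same remainder trick appears as count_align for the particle loops themselves.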
*/ for (int pid = 0; pid < pi_count; pid++) { @@ -819,6 +831,10 @@ __attribute__((always_inline)) INLINE void runner_doself_subset_density_vec( } } + /* The number of interactions for pi and the padded version of it to + * make it a multiple of VEC_SIZE. */ + int icount = 0, icount_align = 0; + /* Find all of particle pi's interacions and store needed values in the * secondary cache.*/ for (int pjd = 0; pjd < count_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) { @@ -941,6 +957,11 @@ __attribute__((always_inline)) INLINE void runner_doself_subset_density_vec( } /* loop over all particles. */ TIMER_TOC(timer_doself_subset); + +#else + + error("Incorrectly calling vectorized Gadget-2 functions!"); + #endif /* WITH_VECTORIZATION */ } @@ -954,13 +975,10 @@ __attribute__((always_inline)) INLINE void runner_doself_subset_density_vec( __attribute__((always_inline)) INLINE void runner_doself2_force_vec( struct runner *r, struct cell *restrict c) { -#ifdef WITH_VECTORIZATION - const struct engine *e = r->e; - struct part *restrict pi; - int count_align; +#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) + const struct engine *e = r->e; const timebin_t max_active_bin = e->max_active_bin; - struct part *restrict parts = c->parts; const int count = c->count; @@ -970,31 +988,28 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( if (!cell_are_part_drifted(c, e)) error("Interacting undrifted cell."); +#ifdef SWIFT_DEBUG_CHECKS + for (int i = 0; i < count; i++) { + /* Check that particles have been drifted to the current time */ + if (parts[i].ti_drift != e->ti_current) + error("Particle pi not drifted to current time"); + } +#endif + /* Get the particle cache from the runner and re-allocate * the cache if it is not big enough for the cell. */ struct cache *restrict cell_cache = &r->ci_cache; - if (cell_cache->count < count) { - cache_init(cell_cache, count); - } + if (cell_cache->count < count) cache_init(cell_cache, count); /* Read the particles from the cell and store them locally in the cache. */ cache_read_force_particles(c, cell_cache); -#ifdef SWIFT_DEBUG_CHECKS - for (int i = 0; i < count; i++) { - pi = &c->parts[i]; - /* Check that particles have been drifted to the current time */ - if (pi->ti_drift != e->ti_current) - error("Particle pi not drifted to current time"); - } -#endif - /* Loop over the particles in the cell. */ for (int pid = 0; pid < count; pid++) { /* Get a pointer to the ith particle. */ - pi = &parts[pid]; + struct part *restrict pi = &parts[pid]; /* Is the ith particle active? */ if (!part_is_active_no_debug(pi, max_active_bin)) continue; @@ -1031,7 +1046,7 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( vector v_entropy_dtSum = vector_setzero(); /* Pad cache if there is a serial remainder. */ - count_align = count; + int count_align = count; int rem = count % VEC_SIZE; if (rem != 0) { int pad = VEC_SIZE - rem; @@ -1129,6 +1144,11 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( } /* loop over all particles. 
*/ TIMER_TOC(timer_doself_force); + +#else + + error("Incorrectly calling vectorized Gadget-2 functions!"); + #endif /* WITH_VECTORIZATION */ } @@ -1146,7 +1166,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *cj, const int sid, const double *shift) { -#ifdef WITH_VECTORIZATION +#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) + const struct engine *restrict e = r->e; const timebin_t max_active_bin = e->max_active_bin; @@ -1173,6 +1194,16 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, const int active_ci = cell_is_active(ci, e); const int active_cj = cell_is_active(cj, e); +#ifdef SWIFT_DEBUG_CHECKS + /* Check that particles have been drifted to the current time */ + for (int pid = 0; pid < count_i; pid++) + if (parts_i[pid].ti_drift != e->ti_current) + error("Particle pi not drifted to current time"); + for (int pjd = 0; pjd < count_j; pjd++) + if (parts_j[pjd].ti_drift != e->ti_current) + error("Particle pj not drifted to current time"); +#endif + /* Count number of particles that are in range and active*/ int numActive = 0; @@ -1206,16 +1237,12 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cache *restrict ci_cache = &r->ci_cache; struct cache *restrict cj_cache = &r->cj_cache; - if (ci_cache->count < count_i) { - cache_init(ci_cache, count_i); - } - if (cj_cache->count < count_j) { - cache_init(cj_cache, count_j); - } + if (ci_cache->count < count_i) cache_init(ci_cache, count_i); + if (cj_cache->count < count_j) cache_init(cj_cache, count_j); int first_pi, last_pj; - int *max_index_i __attribute__((aligned(sizeof(int) * VEC_SIZE))); - int *max_index_j __attribute__((aligned(sizeof(int) * VEC_SIZE))); + int *restrict max_index_i SWIFT_CACHE_ALIGN; + int *restrict max_index_j SWIFT_CACHE_ALIGN; max_index_i = r->ci_cache.max_index; max_index_j = r->cj_cache.max_index; @@ -1228,8 +1255,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, &first_pi, &last_pj, max_active_bin); /* Limits of the outer loops. */ - int first_pi_loop = first_pi; - int last_pj_loop = last_pj; + const int first_pi_loop = first_pi; + const int last_pj_loop_end = last_pj + 1; /* Take the max/min of both values calculated to work out how many particles * to read into the cache. */ @@ -1262,7 +1289,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, if (di_test < dj_min) continue; /* Determine the exit iteration of the interaction loop. */ - const int exit_iteration = max_index_i[pid]; + const int exit_iteration_end = max_index_i[pid] + 1; /* Fill particle pi vectors. */ const vector v_pix = vector_set1(ci_cache->x[ci_cache_idx]); @@ -1292,7 +1319,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, /* Loop over the parts in cj. Making sure to perform an iteration of the * loop even if exit_iteration_align is zero and there is only one * particle to interact with.*/ - for (int pjd = 0; pjd <= exit_iteration; pjd += VEC_SIZE) { + for (int pjd = 0; pjd < exit_iteration_end; pjd += VEC_SIZE) { /* Get the cache index to the jth particle. */ const int cj_cache_idx = pjd; @@ -1363,7 +1390,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, if (active_cj) { /* Loop over the parts in cj until nothing is within range in ci. */ - for (int pjd = 0; pjd <= last_pj_loop; pjd++) { + for (int pjd = 0; pjd < last_pj_loop_end; pjd++) { /* Get a hold of the jth part in cj. 
*/ struct part *restrict pj = &parts_j[sort_j[pjd].i]; @@ -1486,6 +1513,10 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, TIMER_TOC(timer_dopair_density); +#else + + error("Incorrectly calling vectorized Gadget-2 functions!"); + #endif /* WITH_VECTORIZATION */ } @@ -1503,7 +1534,8 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, struct cell *cj, const int sid, const double *shift) { -#ifdef WITH_VECTORIZATION +#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) + const struct engine *restrict e = r->e; const timebin_t max_active_bin = e->max_active_bin; @@ -1520,7 +1552,7 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, /* Get some other useful values. */ const int count_i = ci->count; const int count_j = cj->count; - const double hi_max = ci->h_max * kernel_gamma - rshift; + const double hi_max = ci->h_max * kernel_gamma; const double hj_max = cj->h_max * kernel_gamma; const double hi_max_raw = ci->h_max; const double hj_max_raw = cj->h_max; @@ -1532,6 +1564,16 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, const int active_ci = cell_is_active(ci, e); const int active_cj = cell_is_active(cj, e); +#ifdef SWIFT_DEBUG_CHECKS + /* Check that particles have been drifted to the current time */ + for (int pid = 0; pid < count_i; pid++) + if (parts_i[pid].ti_drift != e->ti_current) + error("Particle pi not drifted to current time"); + for (int pjd = 0; pjd < count_j; pjd++) + if (parts_j[pjd].ti_drift != e->ti_current) + error("Particle pj not drifted to current time"); +#endif + /* Check if any particles are active and in range */ int numActive = 0; @@ -1541,7 +1583,7 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, if (active_ci) { for (int pid = count_i - 1; - pid >= 0 && sort_i[pid].d + h_max + dx_max > dj_min; pid--) { + pid >= 0 && sort_i[pid].d + h_max + dx_max - rshift > dj_min; pid--) { struct part *restrict pi = &parts_i[sort_i[pid].i]; if (part_is_active(pi, e)) { numActive++; @@ -1569,16 +1611,12 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, struct cache *restrict ci_cache = &r->ci_cache; struct cache *restrict cj_cache = &r->cj_cache; - if (ci_cache->count < count_i) { - cache_init(ci_cache, count_i); - } - if (cj_cache->count < count_j) { - cache_init(cj_cache, count_j); - } + if (ci_cache->count < count_i) cache_init(ci_cache, count_i); + if (cj_cache->count < count_j) cache_init(cj_cache, count_j); int first_pi, last_pj; - int *max_index_i SWIFT_CACHE_ALIGN; - int *max_index_j SWIFT_CACHE_ALIGN; + int *restrict max_index_i SWIFT_CACHE_ALIGN; + int *restrict max_index_j SWIFT_CACHE_ALIGN; max_index_i = r->ci_cache.max_index; max_index_j = r->cj_cache.max_index; @@ -1587,7 +1625,7 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, /* Also find the first pi that interacts with any particle in cj and the last * pj that interacts with any particle in ci. */ populate_max_index_no_cache_force(ci, cj, sort_i, sort_j, dx_max, rshift, - hi_max_raw, hj_max_raw, hi_max, hj_max, + hi_max_raw, hj_max_raw, h_max, di_max, dj_min, max_index_i, max_index_j, &first_pi, &last_pj, max_active_bin); @@ -1599,7 +1637,7 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, /* Limits of the outer loops. */ const int first_pi_loop = first_pi; - const int last_pj_loop = last_pj; + const int last_pj_loop_end = last_pj + 1; /* Take the max/min of both values calculated to work out how many particles * to read into the cache. 
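
The max_index arrays above now carry a restrict qualifier as well as SWIFT_CACHE_ALIGN. The real definition of that macro lives elsewhere in the SWIFT headers; as a rough illustration only, such a qualifier typically expands to a compiler alignment attribute:

    /* Illustrative definition only; the actual macro and alignment value
     * are defined in SWIFT's own headers. */
    #define SWIFT_CACHE_ALIGNMENT 64
    #define SWIFT_CACHE_ALIGN __attribute__((aligned(SWIFT_CACHE_ALIGNMENT)))
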
*/ @@ -1632,7 +1670,7 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, if (di_test < dj_min) continue; /* Determine the exit iteration of the interaction loop. */ - const int exit_iteration = max_index_i[pid]; + const int exit_iteration_end = max_index_i[pid] + 1; /* Fill particle pi vectors. */ const vector v_pix = vector_set1(ci_cache->x[ci_cache_idx]); @@ -1665,7 +1703,7 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, /* Loop over the parts in cj. Making sure to perform an iteration of the * loop even if exit_iteration_align is zero and there is only one * particle to interact with.*/ - for (int pjd = 0; pjd <= exit_iteration; pjd += VEC_SIZE) { + for (int pjd = 0; pjd < exit_iteration_end; pjd += VEC_SIZE) { /* Get the cache index to the jth particle. */ const int cj_cache_idx = pjd; @@ -1746,7 +1784,7 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, if (active_cj) { /* Loop over the parts in cj until nothing is within range in ci. */ - for (int pjd = 0; pjd <= last_pj_loop; pjd++) { + for (int pjd = 0; pjd < last_pj_loop_end; pjd++) { /* Get a hold of the jth part in cj. */ struct part *restrict pj = &parts_j[sort_j[pjd].i]; @@ -1878,5 +1916,9 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, TIMER_TOC(timer_dopair_density); } +#else + + error("Incorrectly calling vectorized Gadget-2 functions!"); + #endif /* WITH_VECTORIZATION */ } diff --git a/src/scheduler.c b/src/scheduler.c index ea2c6bde010dd387b355e961903411c0b18a41bf..5af9e9c9e19248838bb98c3ee2305d7dc54324f0 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -50,6 +50,7 @@ #include "space.h" #include "task.h" #include "timers.h" +#include "version.h" /** * @brief Re-set the list of active tasks. @@ -111,6 +112,125 @@ void scheduler_addunlock(struct scheduler *s, struct task *ta, atomic_inc(&s->completed_unlock_writes); } +/** + * @brief Write a dot file with the task dependencies. + * + * Run plot_task_dependencies.sh for an example of how to use it + * to generate the figure. + * + * @param s The #scheduler we are working in. + * @param verbose Are we verbose about this? 
+ */ +void scheduler_write_dependencies(struct scheduler *s, int verbose) { + + const ticks tic = getticks(); + + /* Conservative number of dependencies per task type */ + const int max_nber_dep = 128; + + /* Number of possible relations between tasks */ + const int nber_relation = + 2 * task_type_count * task_subtype_count * max_nber_dep; + + /* To get the table of max_nber_dep for a task: + * ind = (ta * task_subtype_count + sa) * max_nber_dep * 2 + * where ta is the value of task_type and sa is the value of + * task_subtype */ + int *table = malloc(nber_relation * sizeof(int)); + if (table == NULL) + error("Error allocating memory for task-dependency graph."); + + /* Reset everything */ + for (int i = 0; i < nber_relation; i++) table[i] = -1; + + /* Create file */ + char filename[200] = "dependency_graph.dot"; + FILE *f = fopen(filename, "w"); + if (f == NULL) error("Error opening dependency graph file."); + + /* Write header */ + fprintf(f, "digraph task_dep {\n"); + fprintf(f, "label=\"Task dependencies for SWIFT %s\";\n", git_revision()); + fprintf(f, "\t compound=true;\n"); + fprintf(f, "\t ratio=0.66;\n"); + fprintf(f, "\t node[nodesep=0.15];\n"); + + /* loop over all tasks */ + for (int i = 0; i < s->nr_tasks; i++) { + const struct task *ta = &s->tasks[i]; + + /* and their dependencies */ + for (int j = 0; j < ta->nr_unlock_tasks; j++) { + const struct task *tb = ta->unlock_tasks[j]; + + /* check if dependency already written */ + int written = 0; + + /* Current index */ + int ind = ta->type * task_subtype_count + ta->subtype; + ind *= 2 * max_nber_dep; + + int k = 0; + int *cur = &table[ind]; + while (k < max_nber_dep) { + + /* not written yet */ + if (cur[0] == -1) { + cur[0] = tb->type; + cur[1] = tb->subtype; + break; + } + + /* already written */ + if (cur[0] == tb->type && cur[1] == tb->subtype) { + written = 1; + break; + } + + k += 1; + cur = &cur[2]; + } + + /* max_nber_dep is too small */ + if (k == max_nber_dep) + error("Too many task dependencies, please increase max_nber_dep"); + + /* Not written yet => write it */ + if (!written) { + + /* text to write */ + char ta_name[200]; + char tb_name[200]; + + /* construct line */ + if (ta->subtype == task_subtype_none) + sprintf(ta_name, "%s", taskID_names[ta->type]); + else + sprintf(ta_name, "\"%s %s\"", taskID_names[ta->type], + subtaskID_names[ta->subtype]); + + if (tb->subtype == task_subtype_none) + sprintf(tb_name, "%s", taskID_names[tb->type]); + else + sprintf(tb_name, "\"%s %s\"", taskID_names[tb->type], + subtaskID_names[tb->subtype]); + + /* Write to the file */ + fprintf(f, "\t %s->%s;\n", ta_name, tb_name); + } + } + } + + /* Be clean */ + fprintf(f, "}\n"); + fclose(f); + free(table); + + if (verbose) + message("Printing task graph took %.3f %s.", + clocks_from_ticks(getticks() - tic), clocks_getunit()); +} + /** * @brief Split a hydrodynamic task if too large.
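
For reference, the flat table scanned above stores two ints per recorded dependency (the unlocked task's type and subtype), and each (type, subtype) pair owns a block of 2 * max_nber_dep slots, which is why the scan advances by two. A hypothetical helper, not part of the patch, makes the addressing explicit:

    /* Hypothetical helper: return the k-th (type, subtype) slot pair of the
     * dependency block owned by the given task type and subtype. */
    static int *dep_slot(int *table, int type, int subtype, int k,
                         int task_subtype_count, int max_nber_dep) {
      const int block = (type * task_subtype_count + subtype) * 2 * max_nber_dep;
      return &table[block + 2 * k];
    }
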
* @@ -1317,26 +1437,49 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { if (t->subtype == task_subtype_tend) { t->buff = malloc(sizeof(struct pcell_step) * t->ci->pcell_size); cell_pack_end_step(t->ci, t->buff); - err = MPI_Isend( - t->buff, t->ci->pcell_size * sizeof(struct pcell_step), MPI_BYTE, - t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + if ((t->ci->pcell_size * sizeof(struct pcell_step)) > + s->mpi_message_limit) + err = MPI_Isend( + t->buff, t->ci->pcell_size * sizeof(struct pcell_step), + MPI_BYTE, t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + else + err = MPI_Issend( + t->buff, t->ci->pcell_size * sizeof(struct pcell_step), + MPI_BYTE, t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); } else if (t->subtype == task_subtype_xv || t->subtype == task_subtype_rho || t->subtype == task_subtype_gradient) { - err = MPI_Isend(t->ci->parts, t->ci->count, part_mpi_type, - t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + if ((t->ci->count * sizeof(struct part)) > s->mpi_message_limit) + err = MPI_Isend(t->ci->parts, t->ci->count, part_mpi_type, + t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + else + err = MPI_Issend(t->ci->parts, t->ci->count, part_mpi_type, + t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); // message( "sending %i parts with tag=%i from %i to %i." , // t->ci->count , t->flags , s->nodeID , t->cj->nodeID ); // fflush(stdout); } else if (t->subtype == task_subtype_gpart) { - err = MPI_Isend(t->ci->gparts, t->ci->gcount, gpart_mpi_type, - t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + if ((t->ci->gcount * sizeof(struct gpart)) > s->mpi_message_limit) + err = MPI_Isend(t->ci->gparts, t->ci->gcount, gpart_mpi_type, + t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + else + err = MPI_Issend(t->ci->gparts, t->ci->gcount, gpart_mpi_type, + t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); } else if (t->subtype == task_subtype_spart) { - err = MPI_Isend(t->ci->sparts, t->ci->scount, spart_mpi_type, - t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + if ((t->ci->scount * sizeof(struct spart)) > s->mpi_message_limit) + err = MPI_Isend(t->ci->sparts, t->ci->scount, spart_mpi_type, + t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + else + err = MPI_Issend(t->ci->sparts, t->ci->scount, spart_mpi_type, + t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); } else if (t->subtype == task_subtype_multipole) { - err = MPI_Isend(t->ci->multipole, 1, multipole_mpi_type, - t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + if (sizeof(struct gravity_tensors) > + s->mpi_message_limit) + err = MPI_Isend(t->ci->multipole, 1, multipole_mpi_type, + t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + else + err = MPI_Issend(t->ci->multipole, 1, multipole_mpi_type, + t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); } else { error("Unknown communication sub-type"); } diff --git a/src/scheduler.h b/src/scheduler.h index c5ccbf43f3048dfb47d2d3eb7a7db6634b646700..1a75544de12b8402e553e3ae2b84e2d8a65c56e8 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -103,6 +103,10 @@ struct scheduler { /* The node we are working on. */ int nodeID; + /* Maximum size of task messages, in bytes, to be sent using non-buffered + * MPI.
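
All the scheduler_enqueue() branches above apply the same rule: payloads larger than mpi_message_limit are posted with the standard non-blocking MPI_Isend, smaller ones with the synchronous MPI_Issend. A condensed sketch of the switch (the names are illustrative, and the real code sends typed buffers rather than raw bytes):

    #include <mpi.h>

    /* Sketch of the size-based choice between standard and synchronous
     * non-blocking sends used in scheduler_enqueue(). */
    static int send_payload(void *buff, size_t size, int dest, int tag,
                            size_t mpi_message_limit, MPI_Request *req) {
      if (size > mpi_message_limit)
        return MPI_Isend(buff, (int)size, MPI_BYTE, dest, tag,
                         MPI_COMM_WORLD, req);
      else
        return MPI_Issend(buff, (int)size, MPI_BYTE, dest, tag,
                          MPI_COMM_WORLD, req);
    }
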
*/ + size_t mpi_message_limit; + /* 'Pointer' to the seed for the random number generator */ pthread_key_t local_seed_pointer; }; @@ -168,5 +172,6 @@ void scheduler_dump_queue(struct scheduler *s); void scheduler_print_tasks(const struct scheduler *s, const char *fileName); void scheduler_clean(struct scheduler *s); void scheduler_free_tasks(struct scheduler *s); +void scheduler_write_dependencies(struct scheduler *s, int verbose); #endif /* SWIFT_SCHEDULER_H */ diff --git a/src/serial_io.c b/src/serial_io.c index eb1e0e23fb34fd8d6a21230d9e38cfe82c47df1d..f8e3927c4a09a8a94afff5287a42f2b10afac9b4 100644 --- a/src/serial_io.c +++ b/src/serial_io.c @@ -176,9 +176,9 @@ void readArray(hid_t grp, const struct io_props props, size_t N, * Routines writing an output file *-----------------------------------------------------------------------------*/ -void prepareArray(struct engine* e, hid_t grp, char* fileName, FILE* xmfFile, - char* partTypeGroupName, const struct io_props props, - unsigned long long N_total, +void prepareArray(const struct engine* e, hid_t grp, char* fileName, + FILE* xmfFile, char* partTypeGroupName, + const struct io_props props, unsigned long long N_total, const struct unit_system* internal_units, const struct unit_system* snapshot_units) { @@ -282,14 +282,14 @@ void prepareArray(struct engine* e, hid_t grp, char* fileName, FILE* xmfFile, * @todo A better version using HDF5 hyper-slabs to write the file directly from * the part array will be written once the structures have been stabilized. */ -void writeArray(struct engine* e, hid_t grp, char* fileName, FILE* xmfFile, - char* partTypeGroupName, const struct io_props props, size_t N, - long long N_total, int mpi_rank, long long offset, +void writeArray(const struct engine* e, hid_t grp, char* fileName, + FILE* xmfFile, char* partTypeGroupName, + const struct io_props props, size_t N, long long N_total, + int mpi_rank, long long offset, const struct unit_system* internal_units, const struct unit_system* snapshot_units) { const size_t typeSize = io_sizeof_type(props.type); - const size_t copySize = typeSize * props.dimension; const size_t num_elements = N * props.dimension; /* message("Writing '%s' array...", props.name); */ @@ -300,45 +300,13 @@ void writeArray(struct engine* e, hid_t grp, char* fileName, FILE* xmfFile, internal_units, snapshot_units); /* Allocate temporary buffer */ - void* temp = malloc(num_elements * io_sizeof_type(props.type)); - if (temp == NULL) error("Unable to allocate memory for temporary buffer"); - - /* Copy particle data to temporary buffer */ - if (props.convert_part == NULL && - props.convert_gpart == NULL) { /* No conversion */ - - char* temp_c = temp; - for (size_t i = 0; i < N; ++i) - memcpy(&temp_c[i * copySize], props.field + i * props.partSize, copySize); - - } else if (props.convert_part != NULL) { /* conversion (for parts)*/ - - float* temp_f = temp; - for (size_t i = 0; i < N; ++i) - temp_f[i] = props.convert_part(e, &props.parts[i]); - - } else if (props.convert_gpart != NULL) { /* conversion (for gparts)*/ - - float* temp_f = temp; - for (size_t i = 0; i < N; ++i) - temp_f[i] = props.convert_gpart(e, &props.gparts[i]); - } - - /* Unit conversion if necessary */ - const double factor = - units_conversion_factor(internal_units, snapshot_units, props.units); - if (factor != 1.) { - - /* message("Converting ! 
factor=%e", factor); */ + void* temp = NULL; + if (posix_memalign((void**)&temp, IO_BUFFER_ALIGNMENT, + num_elements * typeSize) != 0) + error("Unable to allocate temporary i/o buffer"); - if (io_is_double_precision(props.type)) { - double* temp_d = temp; - for (size_t i = 0; i < num_elements; ++i) temp_d[i] *= factor; - } else { - float* temp_f = temp; - for (size_t i = 0; i < num_elements; ++i) temp_f[i] *= factor; - } - } + /* Copy the particle data to the temporary buffer */ + io_copy_temp_buffer(temp, e, props, N, internal_units, snapshot_units); /* Construct information for the hyper-slab */ int rank; @@ -512,7 +480,7 @@ void read_ic_serial(char* fileName, const struct unit_system* internal_units, /* Read the unit system used in the ICs */ if (ic_units == NULL) error("Unable to allocate memory for IC unit system"); - io_read_unit_system(h_file, ic_units); + io_read_unit_system(h_file, ic_units, mpi_rank); if (units_are_equal(ic_units, internal_units)) { @@ -656,8 +624,9 @@ void read_ic_serial(char* fileName, const struct unit_system* internal_units, break; default: - message("Particle Type %d not yet supported. Particles ignored", - ptype); + if (mpi_rank == 0) + message("Particle Type %d not yet supported. Particles ignored", + ptype); } /* Read everything */ diff --git a/src/single_io.c b/src/single_io.c index 194563352dff5570b8703f828fac95bccbf7409f..a8199af6a84d3f1bdffd746e5a1a49e6e2518dca 100644 --- a/src/single_io.c +++ b/src/single_io.c @@ -169,57 +169,25 @@ void readArray(hid_t h_grp, const struct io_props prop, size_t N, * @todo A better version using HDF5 hyper-slabs to write the file directly from * the part array will be written once the structures have been stabilized. */ -void writeArray(struct engine* e, hid_t grp, char* fileName, FILE* xmfFile, - char* partTypeGroupName, const struct io_props props, size_t N, +void writeArray(const struct engine* e, hid_t grp, char* fileName, + FILE* xmfFile, char* partTypeGroupName, + const struct io_props props, size_t N, const struct unit_system* internal_units, const struct unit_system* snapshot_units) { const size_t typeSize = io_sizeof_type(props.type); - const size_t copySize = typeSize * props.dimension; const size_t num_elements = N * props.dimension; /* message("Writing '%s' array...", props.name); */ /* Allocate temporary buffer */ - void* temp = malloc(num_elements * io_sizeof_type(props.type)); - if (temp == NULL) error("Unable to allocate memory for temporary buffer"); - - /* Copy particle data to temporary buffer */ - if (props.convert_part == NULL && - props.convert_gpart == NULL) { /* No conversion */ - - char* temp_c = temp; - for (size_t i = 0; i < N; ++i) - memcpy(&temp_c[i * copySize], props.field + i * props.partSize, copySize); - - } else if (props.convert_part != NULL) { /* conversion (for parts)*/ - - float* temp_f = temp; - for (size_t i = 0; i < N; ++i) - temp_f[i] = props.convert_part(e, &props.parts[i]); - - } else if (props.convert_gpart != NULL) { /* conversion (for gparts)*/ - - float* temp_f = temp; - for (size_t i = 0; i < N; ++i) - temp_f[i] = props.convert_gpart(e, &props.gparts[i]); - } - - /* Unit conversion if necessary */ - const double factor = - units_conversion_factor(internal_units, snapshot_units, props.units); - if (factor != 1.) { - - /* message("Converting ! 
factor=%e", factor); */ + void* temp = NULL; + if (posix_memalign((void**)&temp, IO_BUFFER_ALIGNMENT, + num_elements * typeSize) != 0) + error("Unable to allocate temporary i/o buffer"); - if (io_is_double_precision(props.type)) { - double* temp_d = temp; - for (size_t i = 0; i < num_elements; ++i) temp_d[i] *= factor; - } else { - float* temp_f = temp; - for (size_t i = 0; i < num_elements; ++i) temp_f[i] *= factor; - } - } + /* Copy the particle data to the temporary buffer */ + io_copy_temp_buffer(temp, e, props, N, internal_units, snapshot_units); /* Create data space */ const hid_t h_space = H5Screate(H5S_SIMPLE); @@ -419,7 +387,7 @@ void read_ic_single(char* fileName, const struct unit_system* internal_units, /* Read the unit system used in the ICs */ struct unit_system* ic_units = malloc(sizeof(struct unit_system)); if (ic_units == NULL) error("Unable to allocate memory for IC unit system"); - io_read_unit_system(h_file, ic_units); + io_read_unit_system(h_file, ic_units, 0); /* Tell the user if a conversion will be needed */ if (units_are_equal(ic_units, internal_units)) { diff --git a/src/space.c b/src/space.c index c9db8c84ac3a04c2fc82fcce620e14dc2070b107..b9fe639b195226da3d3168581ebd26f3273d28eb 100644 --- a/src/space.c +++ b/src/space.c @@ -2551,7 +2551,7 @@ void space_synchronize_particle_positions(struct space *s) { * * Calls hydro_first_init_part() on all the particles */ -void space_init_parts(struct space *s) { +void space_first_init_parts(struct space *s) { const size_t nr_parts = s->nr_parts; struct part *restrict p = s->parts; @@ -2583,7 +2583,7 @@ void space_init_parts(struct space *s) { * * Calls cooling_init_xpart() on all the particles */ -void space_init_xparts(struct space *s) { +void space_first_init_xparts(struct space *s) { const size_t nr_parts = s->nr_parts; struct part *restrict p = s->parts; @@ -2600,7 +2600,7 @@ void space_init_xparts(struct space *s) { * * Calls gravity_first_init_gpart() on all the particles */ -void space_init_gparts(struct space *s) { +void space_first_init_gparts(struct space *s) { const size_t nr_gparts = s->nr_gparts; struct gpart *restrict gp = s->gparts; @@ -2631,7 +2631,7 @@ void space_init_gparts(struct space *s) { * * Calls star_first_init_spart() on all the particles */ -void space_init_sparts(struct space *s) { +void space_first_init_sparts(struct space *s) { const size_t nr_sparts = s->nr_sparts; struct spart *restrict sp = s->sparts; @@ -2657,6 +2657,88 @@ void space_init_sparts(struct space *s) { } } +void space_init_parts_mapper(void *restrict map_data, int count, + void *restrict extra_data) { + + struct part *restrict parts = map_data; + const struct hydro_space *restrict hs = extra_data; + for (int k = 0; k < count; k++) hydro_init_part(&parts[k], hs); +} + +/** + * @brief Calls the #part initialisation function on all particles in the space. + * + * @param s The #space. + * @param verbose Are we talkative? 
+ */ +void space_init_parts(struct space *s, int verbose) { + + const ticks tic = getticks(); + + if (s->nr_parts > 0) + threadpool_map(&s->e->threadpool, space_init_parts_mapper, s->parts, + s->nr_parts, sizeof(struct part), 0, &s->hs); + if (verbose) + message("took %.3f %s.", clocks_from_ticks(getticks() - tic), + clocks_getunit()); +} + +void space_init_gparts_mapper(void *restrict map_data, int count, + void *restrict extra_data) { + + struct gpart *gparts = map_data; + for (int k = 0; k < count; k++) gravity_init_gpart(&gparts[k]); +} + +/** + * @brief Calls the #gpart initialisation function on all particles in the + * space. + * + * @param s The #space. + * @param verbose Are we talkative? + */ +void space_init_gparts(struct space *s, int verbose) { + + const ticks tic = getticks(); + + if (s->nr_gparts > 0) + threadpool_map(&s->e->threadpool, space_init_gparts_mapper, s->gparts, + s->nr_gparts, sizeof(struct gpart), 0, NULL); + if (verbose) + message("took %.3f %s.", clocks_from_ticks(getticks() - tic), + clocks_getunit()); +} + +void space_convert_quantities_mapper(void *restrict map_data, int count, + void *restrict extra_data) { + struct space *s = extra_data; + struct part *restrict parts = map_data; + const ptrdiff_t index = parts - s->parts; + struct xpart *restrict xparts = s->xparts + index; + for (int k = 0; k < count; k++) + hydro_convert_quantities(&parts[k], &xparts[k]); +} + +/** + * @brief Calls the #part quantities conversion function on all particles in the + * space. + * + * @param s The #space. + * @param verbose Are we talkative? + */ +void space_convert_quantities(struct space *s, int verbose) { + + const ticks tic = getticks(); + + if (s->nr_parts > 0) + threadpool_map(&s->e->threadpool, space_convert_quantities_mapper, s->parts, + s->nr_parts, sizeof(struct part), 0, s); + + if (verbose) + message("took %.3f %s.", clocks_from_ticks(getticks() - tic), + clocks_getunit()); +} + /** * @brief Split the space into cells given the array of particles. * @@ -2845,11 +2927,16 @@ void space_init(struct space *s, const struct swift_params *params, hydro_space_init(&s->hs, s); + ticks tic = getticks(); + if (verbose) message("first init..."); /* Set the particles in a state where they are ready for a run */ - space_init_parts(s); - space_init_xparts(s); - space_init_gparts(s); - space_init_sparts(s); + space_first_init_parts(s); + space_first_init_xparts(s); + space_first_init_gparts(s); + space_first_init_sparts(s); + if (verbose) + message("took %.3f %s.", clocks_from_ticks(getticks() - tic), + clocks_getunit()); /* Init the space lock. 
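
The new space_*_mapper functions above all follow the threadpool_map() convention: a mapper receives a chunk of the mapped array plus one shared extra_data pointer, and the threadpool handles the chunking. A minimal made-up example of the pattern:

    /* Hypothetical mapper for illustration; the mass scaling is invented. */
    void scale_masses_mapper(void *restrict map_data, int count,
                             void *restrict extra_data) {
      struct part *restrict parts = (struct part *)map_data;
      const double factor = *(const double *)extra_data;
      for (int k = 0; k < count; k++) parts[k].mass *= factor;
    }

    /* Invoked like the calls above:
     *   double f = 2.0;
     *   threadpool_map(&s->e->threadpool, scale_masses_mapper, s->parts,
     *                  s->nr_parts, sizeof(struct part), 0, &f); */
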
*/ if (lock_init(&s->lock) != 0) error("Failed to create space spin-lock."); @@ -2871,7 +2958,8 @@ void space_replicate(struct space *s, int replicate, int verbose) { if (replicate < 1) error("Invalid replicate value: %d", replicate); - message("Replicating space %d times along each axis.", replicate); + if (verbose) + message("Replicating space %d times along each axis.", replicate); const int factor = replicate * replicate * replicate; diff --git a/src/space.h b/src/space.h index 1ea841affece7b4863edf20468d1d71caec93ab8..c49dc37dc0df27cd3647044f219e36c299dcd73b 100644 --- a/src/space.h +++ b/src/space.h @@ -221,9 +221,9 @@ void space_synchronize_particle_positions(struct space *s); void space_do_parts_sort(); void space_do_gparts_sort(); void space_do_sparts_sort(); -void space_init_parts(struct space *s); -void space_init_gparts(struct space *s); -void space_init_sparts(struct space *s); +void space_init_parts(struct space *s, int verbose); +void space_init_gparts(struct space *s, int verbose); +void space_convert_quantities(struct space *s, int verbose); void space_link_cleanup(struct space *s); void space_check_drift_point(struct space *s, integertime_t ti_drift, int multipole); diff --git a/src/swift.h b/src/swift.h index a5556730ae965109385257c0c38bfc34277223d4..33a0425154d45e030443bc7f2c405377ef6a39e2 100644 --- a/src/swift.h +++ b/src/swift.h @@ -32,6 +32,7 @@ #include "cooling.h" #include "cycle.h" #include "debug.h" +#include "dump.h" #include "engine.h" #include "error.h" #include "gravity.h" @@ -40,6 +41,7 @@ #include "hydro.h" #include "hydro_properties.h" #include "lock.h" +#include "logger.h" #include "map.h" #include "multipole.h" #include "parallel_io.h" diff --git a/src/threadpool.c b/src/threadpool.c index 465756f71d88df81921a880edf8cdb1ee17f6026..0ef69f9895d3d127a330b105ec385d97975fc82f 100644 --- a/src/threadpool.c +++ b/src/threadpool.c @@ -169,10 +169,10 @@ void *threadpool_runner(void *data) { while (1) { /* Let the controller know that this thread is waiting. */ - pthread_barrier_wait(&tp->wait_barrier); + swift_barrier_wait(&tp->wait_barrier); /* Wait for the controller. */ - pthread_barrier_wait(&tp->run_barrier); + swift_barrier_wait(&tp->run_barrier); /* If no map function is specified, just die. We use this as a mechanism to shut down threads without leaving the barriers in an invalid state. */ @@ -212,8 +212,8 @@ void threadpool_init(struct threadpool *tp, int num_threads) { if (num_threads == 1) return; /* Init the barriers. */ - if (pthread_barrier_init(&tp->wait_barrier, NULL, num_threads) != 0 || - pthread_barrier_init(&tp->run_barrier, NULL, num_threads) != 0) + if (swift_barrier_init(&tp->wait_barrier, NULL, num_threads) != 0 || + swift_barrier_init(&tp->run_barrier, NULL, num_threads) != 0) error("Failed to initialize barriers."); /* Set the task counter to zero. */ @@ -237,7 +237,7 @@ void threadpool_init(struct threadpool *tp, int num_threads) { } /* Wait for all the threads to be up and running. */ - pthread_barrier_wait(&tp->wait_barrier); + swift_barrier_wait(&tp->wait_barrier); } /** @@ -288,13 +288,13 @@ void threadpool_map(struct threadpool *tp, threadpool_map_function map_function, tp->num_threads_running = 0; /* Wait for all the threads to be up and running. */ - pthread_barrier_wait(&tp->run_barrier); + swift_barrier_wait(&tp->run_barrier); /* Do some work while I'm at it. */ threadpool_chomp(tp, tp->num_threads - 1); /* Wait for all threads to be done. 
*/ - pthread_barrier_wait(&tp->wait_barrier); + swift_barrier_wait(&tp->wait_barrier); #ifdef SWIFT_DEBUG_THREADPOOL /* Log the total call time to thread id -1. */ @@ -321,15 +321,15 @@ void threadpool_clean(struct threadpool *tp) { * and waiting for all the threads to terminate. This ensures that no * thread is still waiting at a barrier. */ tp->map_function = NULL; - pthread_barrier_wait(&tp->run_barrier); + swift_barrier_wait(&tp->run_barrier); for (int k = 0; k < tp->num_threads - 1; k++) { void *retval; pthread_join(tp->threads[k], &retval); } /* Release the barriers. */ - if (pthread_barrier_destroy(&tp->wait_barrier) != 0 || - pthread_barrier_destroy(&tp->run_barrier) != 0) + if (swift_barrier_destroy(&tp->wait_barrier) != 0 || + swift_barrier_destroy(&tp->run_barrier) != 0) error("Failed to destroy threadpool barriers."); /* Clean up memory. */ diff --git a/src/threadpool.h b/src/threadpool.h index 019403f658a22d36c4a6e1ec1ae1fdc47c62658d..9d19b56836330f59093933bbed5900727142f4a2 100644 --- a/src/threadpool.h +++ b/src/threadpool.h @@ -22,10 +22,11 @@ /* Config parameters. */ #include "../config.h" -/* Some standard headers. */ +/* Standard headers */ #include <pthread.h> /* Local includes. */ +#include "barrier.h" #include "cycle.h" /* Local defines. */ @@ -70,8 +71,8 @@ struct threadpool { pthread_t *threads; /* This is where threads go to rest. */ - pthread_barrier_t wait_barrier; - pthread_barrier_t run_barrier; + swift_barrier_t wait_barrier; + swift_barrier_t run_barrier; /* Current map data and count. */ void *map_data, *map_extra_data; diff --git a/src/units.c b/src/units.c index 13d3e5dd35b14c325527ff35703712258e273ef3..fef137971e58c51027da357c11007ccc0946e4b7 100644 --- a/src/units.c +++ b/src/units.c @@ -587,3 +587,17 @@ double units_conversion_factor(const struct unit_system* from, return units_general_conversion_factor(from, to, baseUnitsExp); } + +/** + * @brief print a #unit_system + * + * @param us The #unit_system + */ +void units_print(const struct unit_system* us) { + message("Units:"); + message("\tUnit Mass: %g", us->UnitMass_in_cgs); + message("\tUnit Length: %g", us->UnitLength_in_cgs); + message("\tUnit Time: %g", us->UnitTime_in_cgs); + message("\tUnit Current: %g", us->UnitCurrent_in_cgs); + message("\tUnit Temperature: %g", us->UnitTemperature_in_cgs); +} diff --git a/src/units.h b/src/units.h index a5765495f9f52159ab70a1072c1f8571ddcdf14b..657b29c070f3816293c18edfc46ad6b960ec9b33 100644 --- a/src/units.h +++ b/src/units.h @@ -137,4 +137,6 @@ double units_conversion_factor(const struct unit_system* from, const struct unit_system* to, enum unit_conversion_factor unit); +void units_print(const struct unit_system* us); + #endif /* SWIFT_UNITS_H */ diff --git a/src/version.c b/src/version.c index 46c31103c953ce2ff70b9e346f88470008dd8266..f4177e5c83c776ea063ad32fd00895199c94b182 100644 --- a/src/version.c +++ b/src/version.c @@ -332,6 +332,22 @@ const char *fftw3_version(void) { return version; } +/** + * @brief return the thread barrier used in SWIFT. 
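
The swift_barrier_* calls substituted above map directly onto pthread_barrier_* when HAVE_PTHREAD_BARRIERS is defined; on platforms without POSIX barriers, barrier.h must provide a replacement. One plausible shape for that fallback, built from a mutex, a condition variable and a generation counter (a sketch, not the actual contents of barrier.h):

    #include <pthread.h>

    typedef struct {
      pthread_mutex_t mutex;
      pthread_cond_t cond;
      int limit; /* Threads to wait for. */
      int count; /* Threads currently waiting. */
      int cycle; /* Flips each time the barrier opens. */
    } swift_barrier_t;

    static int swift_barrier_wait(swift_barrier_t *b) {
      pthread_mutex_lock(&b->mutex);
      const int cycle = b->cycle;
      if (++b->count == b->limit) {
        /* Last thread in: reset and release everybody. */
        b->count = 0;
        b->cycle = !b->cycle;
        pthread_cond_broadcast(&b->cond);
      } else {
        /* The generation counter guards against spurious wake-ups. */
        while (cycle == b->cycle) pthread_cond_wait(&b->cond, &b->mutex);
      }
      pthread_mutex_unlock(&b->mutex);
      return 0;
    }
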
+ * + * @result description of the thread barriers + */ +const char *thread_barrier_version(void) { + + static char version[256] = {0}; +#if defined(HAVE_PTHREAD_BARRIERS) + sprintf(version, "%s", "pthread"); +#else + sprintf(version, "homemade"); +#endif + return version; +} + /** * @brief Prints a greeting message to the standard output containing code * version and revision number diff --git a/src/version.h b/src/version.h index 60998958098c1a37198cdbb3729982835f6e4f62..1af76b647b2d401bb4a7998b281864fb3e63a0c8 100644 --- a/src/version.h +++ b/src/version.h @@ -34,6 +34,7 @@ const char* mpi_version(void); const char* metis_version(void); const char* hdf5_version(void); const char* fftw3_version(void); +const char* thread_barrier_version(void); void greetings(void); #endif /* SWIFT_VERSION_H */ diff --git a/tests/test125cells.c b/tests/test125cells.c index 62ded5c1df47c9d40759dbc7aa239bb4a8a47dc4..bf1c219fab9927b81bb087684f6ad0d747e958bc 100644 --- a/tests/test125cells.c +++ b/tests/test125cells.c @@ -474,8 +474,10 @@ int main(int argc, char *argv[]) { unsigned long long cpufreq = 0; clocks_set_cpufreq(cpufreq); - /* Choke on FP-exceptions */ +/* Choke on FP-exceptions */ +#ifdef HAVE_FE_ENABLE_EXCEPT feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); +#endif /* Get some randomness going */ srand(0); diff --git a/tests/test27cells.c b/tests/test27cells.c index c153fc6f57dc976e1f00767bae275a6a3de24c00..f9e3ba1dd78e67a11414c88efe3c604b02b1aa16 100644 --- a/tests/test27cells.c +++ b/tests/test27cells.c @@ -333,8 +333,10 @@ int main(int argc, char *argv[]) { unsigned long long cpufreq = 0; clocks_set_cpufreq(cpufreq); - /* Choke on FP-exceptions */ +/* Choke on FP-exceptions */ +#ifdef HAVE_FE_ENABLE_EXCEPT feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); +#endif /* Get some randomness going */ srand(0); diff --git a/tests/testActivePair.c b/tests/testActivePair.c index c8e0233c2c317be321ed8b923a46a98e0905b36c..fd2244db971f6a5f7cfeb62b9661c33b7ee88145 100644 --- a/tests/testActivePair.c +++ b/tests/testActivePair.c @@ -433,8 +433,10 @@ int main(int argc, char *argv[]) { unsigned long long cpufreq = 0; clocks_set_cpufreq(cpufreq); - /* Choke on FP-exceptions */ +/* Choke on FP-exceptions */ +#ifdef HAVE_FE_ENABLE_EXCEPT feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); +#endif /* Generate a RNG seed from time. */ unsigned int seed = time(NULL); diff --git a/tests/testDump.c b/tests/testDump.c index 7343af49f654582de444681ec291311d41251dca..b1406ba5f5e6652c6e2727d6b161215f4e905d62 100644 --- a/tests/testDump.c +++ b/tests/testDump.c @@ -20,6 +20,8 @@ /* Config parameters. */ #include "../config.h" +#ifdef HAVE_POSIX_FALLOCATE /* Are we on a sensible platform? */ + /* Some standard headers. */ #include <errno.h> #include <fcntl.h> @@ -30,11 +32,8 @@ #include <sys/types.h> #include <unistd.h> -/* This object's header. */ -#include "../src/dump.h" - /* Local headers. */ -#include "../src/threadpool.h" +#include "swift.h" void dump_mapper(void *map_data, int num_elements, void *extra_data) { struct dump *d = (struct dump *)extra_data; @@ -49,10 +48,14 @@ int main(int argc, char *argv[]) { /* Some constants. */ const int num_threads = 4; - const char *filename = "/tmp/dump_test.out"; const int num_runs = 20; const int chunk_size = 1000; + /* Some constants. */ + char filename[256]; + const int now = time(NULL); + sprintf(filename, "/tmp/SWIFT_dump_test_%d.out", now); + /* Prepare a threadpool to write to the dump. 
*/ struct threadpool t; threadpool_init(&t, num_threads); @@ -82,6 +85,15 @@ int main(int argc, char *argv[]) { /* Clean the threads */ threadpool_clean(&t); + /* Be clean */ + remove(filename); + /* Return a happy number. */ return 0; } + +#else + +int main(int argc, char *argv[]) { return 0; } + +#endif /* HAVE_POSIX_FALLOCATE */ diff --git a/tests/testFFT.c b/tests/testFFT.c index 4ddd030ece95bf26cbfe41f2408be7c3e0c50535..ba737935afc4568a1509d2c43a3534b60a6ef867 100644 --- a/tests/testFFT.c +++ b/tests/testFFT.c @@ -76,6 +76,8 @@ int main() { engine.max_active_bin = num_time_bins; engine.gravity_properties = &gravity_properties; engine.nr_threads = 1; + engine.nodeID = 0; + engine_rank = 0; struct runner runner; runner.e = &engine; diff --git a/tests/testLogger.c b/tests/testLogger.c index ec3b33b6a9e38741e41b4678681e7afe9b9a7950..9ec08607383fdb192b7ba994e4af506fde12fea9 100644 --- a/tests/testLogger.c +++ b/tests/testLogger.c @@ -20,18 +20,16 @@ /* Config parameters. */ #include "../config.h" +#ifdef HAVE_POSIX_FALLOCATE /* Are we on a sensible platform? */ + /* Some standard headers. */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> -/* This object's header. */ -#include "../src/logger.h" - /* Local headers. */ -#include "../src/dump.h" -#include "../src/part.h" +#include "swift.h" void test_log_parts(struct dump *d) { @@ -223,7 +221,9 @@ void test_log_timestamps(struct dump *d) { int main(int argc, char *argv[]) { /* Some constants. */ - const char *filename = "/tmp/dump_test.out"; + char filename[256]; + const int now = time(NULL); + sprintf(filename, "/tmp/SWIFT_logger_test_%d.out", now); /* Prepare a dump. */ struct dump d; @@ -241,7 +241,15 @@ int main(int argc, char *argv[]) { /* Finalize the dump. */ dump_close(&d); + /* Be clean */ + remove(filename); + /* Return a happy number. */ - printf("PASS\n"); return 0; } + +#else + +int main(int argc, char *argv[]) { return 0; } + +#endif /* HAVE_POSIX_FALLOCATE */ diff --git a/tests/testPeriodicBC.c b/tests/testPeriodicBC.c index 1afe829ce70c5d02f565e8fd002ba414abcb7388..ded767c5d657c0a05dc227d1ee8facbfa5f4de63 100644 --- a/tests/testPeriodicBC.c +++ b/tests/testPeriodicBC.c @@ -391,8 +391,10 @@ int main(int argc, char *argv[]) { unsigned long long cpufreq = 0; clocks_set_cpufreq(cpufreq); - /* Choke on FP-exceptions */ +/* Choke on FP-exceptions */ +#ifdef HAVE_FE_ENABLE_EXCEPT feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); +#endif /* Get some randomness going */ srand(0); diff --git a/tests/testSymmetry.c b/tests/testSymmetry.c index 73c5708a6add174b88f26cc716a716fa2ad81709..68a878b05c3fc298ef7d0b159df9997d2ef1f82d 100644 --- a/tests/testSymmetry.c +++ b/tests/testSymmetry.c @@ -28,8 +28,10 @@ int main(int argc, char *argv[]) { - /* Choke if need be */ +/* Choke on FPEs */ +#ifdef HAVE_FE_ENABLE_EXCEPT feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); +#endif #if defined(SHADOWFAX_SPH) /* Initialize the Voronoi simulation box */
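
Each test above now wraps feenableexcept() in the HAVE_FE_ENABLE_EXCEPT guard, since the function is a GNU extension that is absent on, e.g., OSX. The pattern in isolation:

    /* feenableexcept() is non-standard, so only call it when configure
     * detected it. */
    #include <fenv.h>

    static void choke_on_fp_exceptions(void) {
    #ifdef HAVE_FE_ENABLE_EXCEPT
      feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
    #endif
    }
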