diff --git a/.gitignore b/.gitignore index 28a830818af36faad3f4278c6adcba5562b59ee7..e3e17bb82a01d9af0ace6ed72d196cf2dba242f6 100644 --- a/.gitignore +++ b/.gitignore @@ -34,12 +34,17 @@ examples/*/*/*.txt examples/*/*/used_parameters.yml examples/*/gravity_checks_*.dat -tests/testPair +tests/testActivePair +tests/brute_force_periodic_BC_standard.dat +tests/swift_periodic_BC_standard.dat +tests/brute_force_periodic_BC_pertrubed.dat +tests/swift_periodic_BC_perturbed.dat tests/brute_force_standard.dat tests/swift_dopair_standard.dat tests/brute_force_perturbed.dat tests/swift_dopair_perturbed.dat tests/test27cells +tests/testPeriodicBC tests/test125cells tests/brute_force_27_standard.dat tests/swift_dopair_27_standard.dat @@ -49,6 +54,11 @@ tests/brute_force_125_standard.dat tests/swift_dopair_125_standard.dat tests/brute_force_125_perturbed.dat tests/swift_dopair_125_perturbed.dat +tests/brute_force_active.dat +tests/brute_force_periodic_BC_perturbed.dat +tests/swift_dopair_active.dat +tests/test_nonsym_density_serial.dat +tests/test_nonsym_density_vec.dat tests/testGreetings tests/testReading tests/input.hdf5 @@ -64,12 +74,12 @@ tests/testMaths tests/testThreadpool tests/testParser tests/parser_output.yml +tests/testPeriodicBC.sh +tests/testPeriodicBCPerturbed.sh tests/test27cells.sh tests/test27cellsPerturbed.sh tests/test125cells.sh tests/test125cellsPerturbed.sh -tests/testPair.sh -tests/testPairPerturbed.sh tests/testParser.sh tests/testReading.sh tests/testAdiabaticIndex @@ -95,6 +105,9 @@ theory/paper_pasc/pasc_paper.pdf theory/Multipoles/fmm.pdf theory/Multipoles/fmm_standalone.pdf theory/Multipoles/potential.pdf +theory/Multipoles/potential_long.pdf +theory/Multipoles/potential_short.pdf +theory/Multipoles/force_short.pdf m4/libtool.m4 m4/ltoptions.m4 diff --git a/README b/README index 2dedb32a04a7cf143c3e65560c45a68c0e5d1c2a..c088a94488133ddf53cd8a6eba45d8dcdebfeb72 100644 --- a/README +++ b/README @@ -15,28 +15,31 @@ Usage: swift [OPTION]... 
PARAMFILE swift_mpi [OPTION]... PARAMFILE Valid options are: - -a Pin runners using processor affinity. - -c Run with cosmological time integration. - -C Run with cooling. - -d Dry run. Read the parameter file, allocate memory but does not read - the particles from ICs and exit before the start of time integration. - Allows user to check validy of parameter and IC files as well as memory limits. - -D Always drift all particles even the ones far from active particles. This emulates - Gadget-[23] and GIZMO's default behaviours. - -e Enable floating-point exceptions (debugging mode). - -f {int} Overwrite the CPU frequency (Hz) to be used for time measurements. - -g Run with an external gravitational potential. - -G Run with self-gravity. - -M Reconstruct the multipoles every time-step. - -n {int} Execute a fixed number of time steps. When unset use the time_end parameter to stop. - -s Run with hydrodynamics. - -S Run with stars. - -t {int} The number of threads to use on each MPI rank. Defaults to 1 if not specified. - -T Print timers every time-step. - -v [12] Increase the level of verbosity - 1: MPI-rank 0 writes - 2: All MPI-ranks write - -y {int} Time-step frequency at which task graphs are dumped. - -h Print this help message and exit. + -a Pin runners using processor affinity. + -c Run with cosmological time integration. + -C Run with cooling. + -d Dry run. Read the parameter file, allocate memory but does not read + the particles from ICs and exit before the start of time integration. + Allows user to check validy of parameter and IC files as well as memory limits. + -D Always drift all particles even the ones far from active particles. This emulates + Gadget-[23] and GIZMO's default behaviours. + -e Enable floating-point exceptions (debugging mode). + -f {int} Overwrite the CPU frequency (Hz) to be used for time measurements. + -g Run with an external gravitational potential. + -G Run with self-gravity. + -M Reconstruct the multipoles every time-step. 
+ -n {int} Execute a fixed number of time steps. When unset use the time_end parameter to stop. + -P {sec:par:val} Set parameter value and overwrites values read from the parameters file. Can be used more than once. + -s Run with hydrodynamics. + -S Run with stars. + -t {int} The number of threads to use on each MPI rank. Defaults to 1 if not specified. + -T Print timers every time-step. + -v [12] Increase the level of verbosity: + 1: MPI-rank 0 writes, + 2: All MPI-ranks write. + -y {int} Time-step frequency at which task graphs are dumped. + -Y {int} Time-step frequency at which threadpool tasks are dumped. + -h Print this help message and exit. + +See the file parameter_example.yml for an example of parameter file. -See the file examples/parameter_example.yml for an example of parameter file. diff --git a/configure.ac b/configure.ac index 788bb57eed801c1a1dff2204b57b34c4fadf3b58..74fede99f4fbf578af4e703cedaa42f2c278b037 100644 --- a/configure.ac +++ b/configure.ac @@ -16,7 +16,7 @@ # along with this program. If not, see <http://www.gnu.org/licenses/>. # Init the project. -AC_INIT([SWIFT],[0.5.0],[https://gitlab.cosma.dur.ac.uk/swift/swiftsim]) +AC_INIT([SWIFT],[0.6.0],[https://gitlab.cosma.dur.ac.uk/swift/swiftsim]) swift_config_flags="$*" # Need to define this, instead of using fifth argument of AC_INIT, until 2.64. @@ -189,6 +189,19 @@ if test "$enable_task_debugging" = "yes"; then AC_DEFINE([SWIFT_DEBUG_TASKS],1,[Enable task debugging]) fi +# Check if threadpool debugging is on. +AC_ARG_ENABLE([threadpool-debugging], + [AS_HELP_STRING([--enable-threadpool-debugging], + [Store threadpool mapper timing information and generate threadpool dump files @<:@yes/no@:>@] + )], + [enable_threadpool_debugging="$enableval"], + [enable_threadpool_debugging="no"] +) +if test "$enable_threadpool_debugging" = "yes"; then + AC_DEFINE([SWIFT_DEBUG_THREADPOOL],1,[Enable threadpool debugging]) + LDFLAGS="$LDFLAGS -rdynamic" +fi + # Check if the general timers are switched on. 
AC_ARG_ENABLE([timers], [AS_HELP_STRING([--enable-timers], @@ -829,10 +842,10 @@ esac # Gravity multipole order AC_ARG_WITH([multipole-order], [AS_HELP_STRING([--with-multipole-order=<order>], - [order of the multipole and gravitational field expansion @<:@ default: 3@:>@] + [order of the multipole and gravitational field expansion @<:@ default: 4@:>@] )], [with_multipole_order="$withval"], - [with_multipole_order="3"] + [with_multipole_order="4"] ) AC_DEFINE_UNQUOTED([SELF_GRAVITY_MULTIPOLE_ORDER], [$with_multipole_order], [Multipole order]) @@ -848,19 +861,31 @@ AM_CONDITIONAL([HAVE_DOXYGEN], [test "$ac_cv_path_ac_pt_DX_DOXYGEN" != ""]) # Handle .in files. AC_CONFIG_FILES([Makefile src/Makefile examples/Makefile doc/Makefile doc/Doxyfile tests/Makefile]) AC_CONFIG_FILES([tests/testReading.sh], [chmod +x tests/testReading.sh]) -AC_CONFIG_FILES([tests/testPair.sh], [chmod +x tests/testPair.sh]) -AC_CONFIG_FILES([tests/testPairPerturbed.sh], [chmod +x tests/testPairPerturbed.sh]) +AC_CONFIG_FILES([tests/testActivePair.sh], [chmod +x tests/testActivePair.sh]) AC_CONFIG_FILES([tests/test27cells.sh], [chmod +x tests/test27cells.sh]) AC_CONFIG_FILES([tests/test27cellsPerturbed.sh], [chmod +x tests/test27cellsPerturbed.sh]) AC_CONFIG_FILES([tests/test125cells.sh], [chmod +x tests/test125cells.sh]) AC_CONFIG_FILES([tests/test125cellsPerturbed.sh], [chmod +x tests/test125cellsPerturbed.sh]) +AC_CONFIG_FILES([tests/testPeriodicBC.sh], [chmod +x tests/testPeriodicBC.sh]) +AC_CONFIG_FILES([tests/testPeriodicBCPerturbed.sh], [chmod +x tests/testPeriodicBCPerturbed.sh]) +AC_CONFIG_FILES([tests/testInteractions.sh], [chmod +x tests/testInteractions.sh]) AC_CONFIG_FILES([tests/testParser.sh], [chmod +x tests/testParser.sh]) # Save the compilation options AC_DEFINE_UNQUOTED([SWIFT_CONFIG_FLAGS],["$swift_config_flags"],[Flags passed to configure]) +# Make sure the latest git revision string gets included +touch src/version.c + +# Generate output. 
+AC_OUTPUT + # Report general configuration. -AC_MSG_RESULT([ +AC_MSG_RESULT([ + ------- Summary -------- + + $PACKAGE_NAME v.$PACKAGE_VERSION + Compiler : $CC - vendor : $ax_cv_c_compiler_vendor - version : $ax_cv_c_compiler_version @@ -887,14 +912,10 @@ AC_MSG_RESULT([ Multipole order : $with_multipole_order No gravity below ID : $no_gravity_below_id - Individual timers : $enable_timers - Task debugging : $enable_task_debugging - Debugging checks : $enable_debugging_checks - Gravity checks : $gravity_force_checks -]) - -# Make sure the latest git revision string gets included -touch src/version.c + Individual timers : $enable_timers + Task debugging : $enable_task_debugging + Threadpool debugging : $enable_threadpool_debugging + Debugging checks : $enable_debugging_checks + Gravity checks : $gravity_force_checks -# Generate output. -AC_OUTPUT + ------------------------]) diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in index 0df1f91194b6d1e7e98cb1b75be7d3eaaca7fc32..0193760d3114aecab91f0c2ad27a9c1dd77dec9a 100644 --- a/doc/Doxyfile.in +++ b/doc/Doxyfile.in @@ -1988,6 +1988,9 @@ INCLUDE_FILE_PATTERNS = # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. PREDEFINED = "__attribute__(x)= " +PREDEFINED += HAVE_HDF5 +PREDEFINED += WITH_MPI +PREDEFINED += WITH_VECTORIZATION # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. 
The diff --git a/examples/CoolingBox/energy_plot.py b/examples/CoolingBox/energy_plot.py index c8948e7e209c2786ffdecbb2b8b606e73d703238..45f0b4f6b11c3855a919f6a98fd0ca006a887f82 100644 --- a/examples/CoolingBox/energy_plot.py +++ b/examples/CoolingBox/energy_plot.py @@ -34,7 +34,7 @@ import sys stats_filename = "./energy.txt" # First snapshot -snap_filename = "coolingBox_000.hdf5" +snap_filename = "coolingBox_0000.hdf5" # Some constants in cgs units k_b = 1.38E-16 #boltzmann @@ -104,7 +104,7 @@ print "Cooling time:", cooling_time_cgs, "[s]" u_snapshots_cgs = zeros(25) t_snapshots_cgs = zeros(25) for i in range(25): - snap = h5.File("coolingBox_%0.3d.hdf5"%i,'r') + snap = h5.File("coolingBox_%0.4d.hdf5"%i,'r') u_snapshots_cgs[i] = sum(snap["/PartType0/InternalEnergy"][:] * snap["/PartType0/Masses"][:]) / total_mass[0] * unit_length**2 / (unit_time)**2 t_snapshots_cgs[i] = snap["/Header"].attrs["Time"] * unit_time diff --git a/examples/CoolingHalo/density_profile.py b/examples/CoolingHalo/density_profile.py index 335f7089b6835b65cf37e1bcd312a17966c295a7..c53be03b369e04d2cb8e68e419e08347ee6721eb 100644 --- a/examples/CoolingHalo/density_profile.py +++ b/examples/CoolingHalo/density_profile.py @@ -20,7 +20,7 @@ H_0_cgs = 100. 
* h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS) #read some header/parameter information from the first snapshot -filename = "Hydrostatic_000.hdf5" +filename = "Hydrostatic_0000.hdf5" f = h5.File(filename,'r') params = f["Parameters"] unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"]) @@ -39,7 +39,7 @@ M_vir_cgs = r_vir_cgs * v_c_cgs**2 / CONST_G_CGS for i in range(n_snaps): - filename = "Hydrostatic_%03d.hdf5" %i + filename = "Hydrostatic_%04d.hdf5" %i f = h5.File(filename,'r') coords_dset = f["PartType0/Coordinates"] coords = np.array(coords_dset) diff --git a/examples/CoolingHalo/internal_energy_profile.py b/examples/CoolingHalo/internal_energy_profile.py index 854bdf223cfae75203a1924b4af6136b4b7aa6cd..d5f77c32ad17b02026abc7f8806c323c130c735a 100644 --- a/examples/CoolingHalo/internal_energy_profile.py +++ b/examples/CoolingHalo/internal_energy_profile.py @@ -38,7 +38,7 @@ H_0_cgs = 100. * h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS) #read some header/parameter information from the first snapshot -filename = "Hydrostatic_000.hdf5" +filename = "Hydrostatic_0000.hdf5" f = h5.File(filename,'r') params = f["Parameters"] unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"]) @@ -57,7 +57,7 @@ M_vir_cgs = r_vir_cgs * v_c_cgs**2 / CONST_G_CGS for i in range(n_snaps): - filename = "Hydrostatic_%03d.hdf5" %i + filename = "Hydrostatic_%04d.hdf5" %i f = h5.File(filename,'r') coords_dset = f["PartType0/Coordinates"] coords = np.array(coords_dset) diff --git a/examples/CoolingHalo/test_energy_conservation.py b/examples/CoolingHalo/test_energy_conservation.py index 00374e905e8eeb66bfe8c7360ab37522bc93af32..2e2ad3607f888f892f021a760dfa89753d52c133 100644 --- a/examples/CoolingHalo/test_energy_conservation.py +++ b/examples/CoolingHalo/test_energy_conservation.py @@ -17,7 +17,7 @@ H_0_cgs = 100. 
* h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS) #read some header/parameter information from the first snapshot -filename = "CoolingHalo_000.hdf5" +filename = "CoolingHalo_0000.hdf5" f = h5.File(filename,'r') params = f["Parameters"] unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"]) @@ -41,7 +41,7 @@ time_array_cgs = [] for i in range(n_snaps): - filename = "CoolingHalo_%03d.hdf5" %i + filename = "CoolingHalo_%04d.hdf5" %i f = h5.File(filename,'r') coords_dset = f["PartType0/Coordinates"] coords = np.array(coords_dset) diff --git a/examples/CoolingHalo/velocity_profile.py b/examples/CoolingHalo/velocity_profile.py index d64d255b18482bc26578f21f46199aa3540ae7b5..7d31e66ff52c51d0852fa9165753032d130db9c2 100644 --- a/examples/CoolingHalo/velocity_profile.py +++ b/examples/CoolingHalo/velocity_profile.py @@ -39,7 +39,7 @@ H_0_cgs = 100. * h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS) #read some header/parameter information from the first snapshot -filename = "CoolingHalo_000.hdf5" +filename = "CoolingHalo_0000.hdf5" f = h5.File(filename,'r') params = f["Parameters"] unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"]) @@ -58,7 +58,7 @@ M_vir_cgs = r_vir_cgs * v_c_cgs**2 / CONST_G_CGS for i in range(n_snaps): - filename = "CoolingHalo_%03d.hdf5" %i + filename = "CoolingHalo_%04d.hdf5" %i f = h5.File(filename,'r') coords_dset = f["PartType0/Coordinates"] coords = np.array(coords_dset) diff --git a/examples/CoolingHaloWithSpin/density_profile.py b/examples/CoolingHaloWithSpin/density_profile.py index fb88ddd6aea71603a6f6fcb36b13771106737e6a..cc4f8a195d9b88dbbaef3891b57ab9e2dfa9e3ed 100644 --- a/examples/CoolingHaloWithSpin/density_profile.py +++ b/examples/CoolingHaloWithSpin/density_profile.py @@ -21,7 +21,7 @@ H_0_cgs = 100. 
* h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS) #read some header/parameter information from the first snapshot -filename = "CoolingHalo_000.hdf5" +filename = "CoolingHalo_0000.hdf5" f = h5.File(filename,'r') params = f["Parameters"] unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"]) @@ -42,7 +42,7 @@ M_vir_cgs = r_vir_cgs * v_c_cgs**2 / CONST_G_CGS for i in range(n_snaps): - filename = "CoolingHalo_%03d.hdf5" %i + filename = "CoolingHalo_%04d.hdf5" %i f = h5.File(filename,'r') coords_dset = f["PartType0/Coordinates"] coords = np.array(coords_dset) diff --git a/examples/CoolingHaloWithSpin/internal_energy_profile.py b/examples/CoolingHaloWithSpin/internal_energy_profile.py index 5f71d69ca7a978de242559f84ec390faa86a27f0..8e039bd3c2d1287946350b2af0efb595cc848ac0 100644 --- a/examples/CoolingHaloWithSpin/internal_energy_profile.py +++ b/examples/CoolingHaloWithSpin/internal_energy_profile.py @@ -39,7 +39,7 @@ H_0_cgs = 100. * h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS) #read some header/parameter information from the first snapshot -filename = "CoolingHalo_000.hdf5" +filename = "CoolingHalo_0000.hdf5" f = h5.File(filename,'r') params = f["Parameters"] unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"]) @@ -60,7 +60,7 @@ M_vir_cgs = r_vir_cgs * v_c_cgs**2 / CONST_G_CGS for i in range(n_snaps): - filename = "CoolingHalo_%03d.hdf5" %i + filename = "CoolingHalo_%04d.hdf5" %i f = h5.File(filename,'r') coords_dset = f["PartType0/Coordinates"] coords = np.array(coords_dset) diff --git a/examples/CoolingHaloWithSpin/test_energy_conservation.py b/examples/CoolingHaloWithSpin/test_energy_conservation.py index cc7518d2e4d64441b2c4d6b0663caae873f34d95..c9d020b69b7bcccc4778ee12071dd448df0bdee0 100644 --- a/examples/CoolingHaloWithSpin/test_energy_conservation.py +++ b/examples/CoolingHaloWithSpin/test_energy_conservation.py @@ -20,7 +20,7 @@ H_0_cgs = 100. 
* h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS) #read some header/parameter information from the first snapshot -filename = "CoolingHalo_000.hdf5" +filename = "CoolingHalo_0000.hdf5" f = h5.File(filename,'r') params = f["Parameters"] unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"]) @@ -44,7 +44,7 @@ time_array_cgs = [] for i in range(n_snaps): - filename = "CoolingHalo_%03d.hdf5" %i + filename = "CoolingHalo_%04d.hdf5" %i f = h5.File(filename,'r') coords_dset = f["PartType0/Coordinates"] coords = np.array(coords_dset) diff --git a/examples/CoolingHaloWithSpin/velocity_profile.py b/examples/CoolingHaloWithSpin/velocity_profile.py index 07df8e1b0751307513c30a5b128773b193c3a9cd..7247e23a34a3965207b0d4749b46fecfafc4eda9 100644 --- a/examples/CoolingHaloWithSpin/velocity_profile.py +++ b/examples/CoolingHaloWithSpin/velocity_profile.py @@ -39,7 +39,7 @@ H_0_cgs = 100. * h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS) #read some header/parameter information from the first snapshot -filename = "CoolingHalo_000.hdf5" +filename = "CoolingHalo_0000.hdf5" f = h5.File(filename,'r') params = f["Parameters"] unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"]) @@ -58,7 +58,7 @@ M_vir_cgs = r_vir_cgs * v_c_cgs**2 / CONST_G_CGS for i in range(n_snaps): - filename = "CoolingHalo_%03d.hdf5" %i + filename = "CoolingHalo_%04d.hdf5" %i f = h5.File(filename,'r') coords_dset = f["PartType0/Coordinates"] coords = np.array(coords_dset) diff --git a/examples/DiscPatch/HydroStatic/README b/examples/DiscPatch/HydroStatic/README index 42853e6b51983f2868528202adec3fc829c2ddbc..49ed96dc3bac607a4d454547d880b10bb6b28857 100644 --- a/examples/DiscPatch/HydroStatic/README +++ b/examples/DiscPatch/HydroStatic/README @@ -18,3 +18,5 @@ output to 'Disc-Patch-dynamic.hdf5'. These are now the ICs for the actual test. When running SWIFT with the parameters from 'disc-patch.yml' and an ideal gas EoS on these ICs the disc should stay in equilibrium. 
+ +The solution can be checked using the 'plotSolution.py' script. diff --git a/examples/DiscPatch/HydroStatic/disc-patch-icc.yml b/examples/DiscPatch/HydroStatic/disc-patch-icc.yml index 6a27016b8a3f484b7c1c9b74594073d5f28efe90..6f17cfbb1e0125faf8e47fe4e9e55bfdf4df7b71 100644 --- a/examples/DiscPatch/HydroStatic/disc-patch-icc.yml +++ b/examples/DiscPatch/HydroStatic/disc-patch-icc.yml @@ -1,8 +1,8 @@ # Define the system of units to use internally. InternalUnitSystem: - UnitMass_in_cgs: 1.9885e33 # Grams - UnitLength_in_cgs: 3.0856776e18 # Centimeters - UnitVelocity_in_cgs: 1e5 # Centimeters per second + UnitMass_in_cgs: 1.9885e33 # Grams + UnitLength_in_cgs: 3.08567758149e18 # Centimeters + UnitVelocity_in_cgs: 1e5 # Centimeters per second UnitCurrent_in_cgs: 1 # Amperes UnitTemp_in_cgs: 1 # Kelvin @@ -11,17 +11,17 @@ TimeIntegration: time_begin: 0 # The starting time of the simulation (in internal units). time_end: 968. # The end time of the simulation (in internal units). dt_min: 1e-4 # The minimal time-step size of the simulation (in internal units). - dt_max: 1. # The maximal time-step size of the simulation (in internal units). + dt_max: 10. # The maximal time-step size of the simulation (in internal units). # Parameters governing the conserved quantities statistics Statistics: - delta_time: 1 # Time between statistics output + delta_time: 12. # Time between statistics output # Parameters governing the snapshots Snapshots: - basename: Disc-Patch # Common part of the name of output files - time_first: 0. # Time of the first output (in internal units) - delta_time: 12. # Time difference between consecutive outputs (in internal units) + basename: Disc-Patch # Common part of the name of output files + time_first: 0. # Time of the first output (in internal units) + delta_time: 48. 
# Time difference between outputs (in internal units) # Parameters for the hydrodynamics scheme SPH: @@ -29,7 +29,7 @@ SPH: delta_neighbours: 0.1 # The tolerance for the targetted number of neighbours. CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. max_ghost_iterations: 30 # Maximal number of iterations allowed to converge towards the smoothing length. - max_smoothing_length: 70. # Maximal smoothing length allowed (in internal units). + h_max: 60. # Maximal smoothing length allowed (in internal units). # Parameters related to the initial conditions InitialConditions: @@ -39,6 +39,8 @@ InitialConditions: DiscPatchPotential: surface_density: 10. scale_height: 100. - z_disc: 200. + x_disc: 400. + x_trunc: 300. + x_max: 350. timestep_mult: 0.03 growth_time: 5. diff --git a/examples/DiscPatch/HydroStatic/disc-patch.yml b/examples/DiscPatch/HydroStatic/disc-patch.yml index 8bd67c5b08de82bb6a3d47ccf3419f85e3e5c6b1..8816bc17ca526d01b7abcf55bb43287bbb36224a 100644 --- a/examples/DiscPatch/HydroStatic/disc-patch.yml +++ b/examples/DiscPatch/HydroStatic/disc-patch.yml @@ -1,8 +1,8 @@ # Define the system of units to use internally. InternalUnitSystem: - UnitMass_in_cgs: 1.9885e33 # Grams - UnitLength_in_cgs: 3.0856776e18 # Centimeters - UnitVelocity_in_cgs: 1e5 # Centimeters per second + UnitMass_in_cgs: 1.9885e33 # Grams + UnitLength_in_cgs: 3.08567758149e18 # Centimeters + UnitVelocity_in_cgs: 1e5 # Centimeters per second UnitCurrent_in_cgs: 1 # Amperes UnitTemp_in_cgs: 1 # Kelvin @@ -11,17 +11,17 @@ TimeIntegration: time_begin: 968 # The starting time of the simulation (in internal units). time_end: 12000. # The end time of the simulation (in internal units). dt_min: 1e-4 # The minimal time-step size of the simulation (in internal units). - dt_max: 1. # The maximal time-step size of the simulation (in internal units). + dt_max: 10. # The maximal time-step size of the simulation (in internal units). 
# Parameters governing the conserved quantities statistics Statistics: - delta_time: 1 # Time between statistics output + delta_time: 24 # Time between statistics output # Parameters governing the snapshots Snapshots: - basename: Disc-Patch-dynamic # Common part of the name of output files - time_first: 968. # Time of the first output (in internal units) - delta_time: 24. # Time difference between consecutive outputs (in internal units) + basename: Disc-Patch-dynamic # Common part of the name of output files + time_first: 968. # Time of the first output (in internal units) + delta_time: 96. # Time difference between outputs (in internal units) # Parameters for the hydrodynamics scheme SPH: @@ -29,7 +29,7 @@ SPH: delta_neighbours: 0.1 # The tolerance for the targetted number of neighbours. CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. max_ghost_iterations: 30 # Maximal number of iterations allowed to converge towards the smoothing length. - max_smoothing_length: 70. # Maximal smoothing length allowed (in internal units). + h_max: 60. # Maximal smoothing length allowed (in internal units). # Parameters related to the initial conditions InitialConditions: @@ -39,5 +39,7 @@ InitialConditions: DiscPatchPotential: surface_density: 10. scale_height: 100. - z_disc: 200. + x_disc: 400. + x_trunc: 300. + x_max: 380. 
timestep_mult: 0.03 diff --git a/examples/DiscPatch/HydroStatic/dynamic.pro b/examples/DiscPatch/HydroStatic/dynamic.pro deleted file mode 100644 index 00ee3f7a8d2dc435be2093af959efd2c49903637..0000000000000000000000000000000000000000 --- a/examples/DiscPatch/HydroStatic/dynamic.pro +++ /dev/null @@ -1,139 +0,0 @@ -; -; test energy / angular momentum conservation of test problem -; - -iplot = 1 ; if iplot = 1, make plot of E/Lz conservation, else, simply compare final and initial energy - -; set physical constants -@physunits - -indir = './' -;basefile = 'Disc-Patch-dynamic_' -basefile = 'Disc-Patch_' - -; set properties of potential -uL = phys.pc ; unit of length -uM = phys.msun ; unit of mass -uV = 1d5 ; unit of velocity - -; properties of patch -surface_density = 100. ; surface density of all mass, which generates the gravitational potential -scale_height = 100. -z_disk = 200. ; -fgas = 0.1 ; gas fraction -gamma = 5./3. - -; derived units -constG = 10.^(alog10(phys.g)+alog10(uM)-2d0*alog10(uV)-alog10(uL)) ; -pcentre = [0.,0.,z_disk] * pc / uL -utherm = !pi * constG * surface_density * scale_height / (gamma-1.) -temp = (utherm*uV^2)*phys.m_h/phys.kb -soundspeed = sqrt(gamma * (gamma-1.) 
* utherm) -t_dyn = sqrt(scale_height / (constG * surface_density)) -rho0 = fgas*(surface_density)/(2.*scale_height) -print,' dynamical time = ',t_dyn,' = ',t_dyn*UL/uV/(1d6*phys.yr),' Myr' -print,' thermal energy per unit mass = ',utherm -print,' central density = ',rho0,' = ',rho0*uM/uL^3/m_h,' particles/cm^3' -print,' central temperature = ',temp -lambda = 2 * !pi * phys.G^1.5 * (scale_height*uL)^1.5 * (surface_density * uM/uL^2)^0.5 * phys.m_h^2 / (gamma-1) / fgas -print,' lambda = ',lambda -stop -; -infile = indir + basefile + '*' -spawn,'ls -1 '+infile,res -nfiles = n_elements(res) - - -; choose: calculate change of energy and Lz, comparing first and last -; snapshots for all particles, or do so for a subset - -; compare all -ifile = 0 -inf = indir + basefile + strtrim(string(ifile,'(i3.3)'),1) + '.hdf5' -id = h5rd(inf,'PartType0/ParticleIDs') -nfollow = n_elements(id) - - -; compute anlytic profile -nbins = 100 -zbins = findgen(nbins)/float(nbins-1) * 2 * scale_height -rbins = (surface_density/(2.*scale_height)) / cosh(abs(zbins)/scale_height)^2 - - -; plot analytic profile -wset,0 -plot,[0],[0],xr=[0,2*scale_height],yr=[0,max(rbins)],/nodata,xtitle='|z|',ytitle=textoidl('\rho') -oplot,zbins,rbins,color=blue - -ifile = 0 -nskip = nfiles - 1 -isave = 0 -nplot = 8192 ; randomly plot particles -color = floor(findgen(nfiles)/float(nfiles-1)*255) -;for ifile=0,nfiles-1,nskip do begin -tsave = [0.] 
-toplot = [1,nfiles-1] -for iplot=0,1 do begin - ifile = toplot[iplot] - inf = indir + basefile + strtrim(string(ifile,'(i3.3)'),1) + '.hdf5' - time = h5ra(inf, 'Header','Time') - tsave = [tsave, time] - print,' time= ',time - p = h5rd(inf,'PartType0/Coordinates') - v = h5rd(inf,'PartType0/Velocities') - id = h5rd(inf,'PartType0/ParticleIDs') - rho = h5rd(inf,'PartType0/Density') - h = h5rd(inf,'PartType0/SmoothingLength') - utherm = h5rd(inf,'PartType0/InternalEnergy') - indx = sort(id) - -; substract disk centre - for ic=0,2 do p[ic,*]=p[ic,*] - pcentre[ic] - - -;; ; if you want to sort particles by ID -;; id = id[indx] -;; rho = rho[indx] -;; utherm = utherm[indx] -;; h = h[indx] -;; for ic=0,2 do begin -;; tmp = reform(p[ic,*]) & p[ic,*] = tmp[indx] -;; tmp = reform(v[ic,*]) & v[ic,*] = tmp[indx] -;; endfor - - ip = floor(randomu(ifile+1,nplot)*n_elements(rho)) - color = red - if(ifile eq 1) then begin - color=black - endif else begin - color=red - endelse - oplot,abs(p[2,ip]), rho[ip], psym=3, color=color - - isave = isave + 1 - -endfor - -; time in units of dynamical time -tsave = tsave[1:*] / t_dyn - -label = [''] -for i=0,n_elements(tsave)-1 do label=[label,'time/t_dynamic='+string(tsave[i],format='(f8.0)')] -label = label[1:*] -legend,['analytic',label[0],label[1]],linestyle=[0,0,0],color=[blue,black,red],box=0,/top,/right - -; make histograms of particle velocities -xr = 1d-3 * [-1,1] -bsize = 1.d-5 -ohist,v[0,*]/soundspeed,x,vx,xr[0],xr[1],bsize -ohist,v[1,*]/soundspeed,y,vy,xr[0],xr[1],bsize -ohist,v[2,*]/soundspeed,z,vz,xr[0],xr[1],bsize -wset,2 -plot,x,vx,psym=10,xtitle='velocity/soundspeed',ytitle='pdf',/nodata,xr=xr,/xs -oplot,x,vx,psym=10,color=black -oplot,y,vy,psym=10,color=blue -oplot,z,vz,psym=10,color=red -legend,['vx/c','vy/c','vz/c'],linestyle=[0,0,0],color=[black,blue,red],box=0,/top,/right -end - - diff --git a/examples/DiscPatch/HydroStatic/makeIC.py b/examples/DiscPatch/HydroStatic/makeIC.py index 
6ba1ccd06fed84ca728aadaa5922dbba536b6881..11b482059b494fc9a6b9447fdfe2e7ec543d52ff 100644 --- a/examples/DiscPatch/HydroStatic/makeIC.py +++ b/examples/DiscPatch/HydroStatic/makeIC.py @@ -1,158 +1,162 @@ ############################################################################### - # This file is part of SWIFT. - # Copyright (c) 2016 John A. Regan (john.a.regan@durham.ac.uk) - # Tom Theuns (tom.theuns@durham.ac.uk) - # - # This program is free software: you can redistribute it and/or modify - # it under the terms of the GNU Lesser General Public License as published - # by the Free Software Foundation, either version 3 of the License, or - # (at your option) any later version. - # - # This program is distributed in the hope that it will be useful, - # but WITHOUT ANY WARRANTY; without even the implied warranty of - # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - # GNU General Public License for more details. - # - # You should have received a copy of the GNU Lesser General Public License - # along with this program. If not, see <http://www.gnu.org/licenses/>. - # - ############################################################################## +# This file is part of SWIFT. +# Copyright (c) 2016 John A. Regan (john.a.regan@durham.ac.uk) +# Tom Theuns (tom.theuns@durham.ac.uk) +# 2017 Matthieu Schaller (matthieu.schaller@durham.ac.uk) +# Bert Vandenbroucke (bert.vandenbroucke@gmail.com) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +############################################################################## import h5py import sys -import numpy +import numpy as np import math import random -import matplotlib.pyplot as plt # Generates a disc-patch in hydrostatic equilibrium -# see Creasey, Theuns & Bower, 2013, for the equations: -# disc parameters are: surface density sigma -# scale height b -# density: rho(z) = (sigma/2b) sech^2(z/b) -# isothermal velocity dispersion = <v_z^2? = b pi G sigma -# grad potential = 2 pi G sigma tanh(z/b) -# potential = ln(cosh(z/b)) + const -# Dynamical time = sqrt(b / (G sigma)) -# to obtain the 1/ch^2(z/b) profile from a uniform profile (a glass, say, or a uniform random variable), note that, when integrating in z -# \int 0^z dz/ch^2(z) = tanh(z)-tanh(0) = \int_0^x dx = x (where the last integral refers to a uniform density distribution), so that z = atanh(x) -# usage: python makeIC.py 1000 - -# physical constants in cgs -NEWTON_GRAVITY_CGS = 6.672e-8 -SOLAR_MASS_IN_CGS = 1.9885e33 -PARSEC_IN_CGS = 3.0856776e18 -PROTON_MASS_IN_CGS = 1.6726231e24 -YEAR_IN_CGS = 3.154e+7 - -# choice of units -const_unit_length_in_cgs = (PARSEC_IN_CGS) -const_unit_mass_in_cgs = (SOLAR_MASS_IN_CGS) -const_unit_velocity_in_cgs = (1e5) +# +# See Creasey, Theuns & Bower, 2013, MNRAS, Volume 429, Issue 3, p.1922-1948 +# +# +# Disc parameters are: surface density -- sigma +# scale height -- b +# gas adiabatic index -- gamma +# +# Problem parameters are: Ratio height/width of the box -- z_factor +# Size of the patch -- side_length + +# Parameters of the gas disc +surface_density = 10. +scale_height = 100. +gas_gamma = 5./3. 
-print "UnitMass_in_cgs: ", const_unit_mass_in_cgs -print "UnitLength_in_cgs: ", const_unit_length_in_cgs -print "UnitVelocity_in_cgs: ", const_unit_velocity_in_cgs +# Parameters of the problem +x_factor = 2 +side_length = 400. +# File +fileName = "Disc-Patch.hdf5" -# parameters of potential -surface_density = 100. # surface density of all mass, which generates the gravitational potential -scale_height = 100. -gamma = 5./3. -fgas = 0.1 # gas fraction - -# derived units -const_unit_time_in_cgs = (const_unit_length_in_cgs / const_unit_velocity_in_cgs) -const_G = ((NEWTON_GRAVITY_CGS*const_unit_mass_in_cgs*const_unit_time_in_cgs*const_unit_time_in_cgs/(const_unit_length_in_cgs*const_unit_length_in_cgs*const_unit_length_in_cgs))) -print 'G=', const_G -utherm = math.pi * const_G * surface_density * scale_height / (gamma-1) -v_disp = numpy.sqrt(2 * utherm) -soundspeed = numpy.sqrt(utherm / (gamma * (gamma-1.))) -t_dyn = numpy.sqrt(scale_height / (const_G * surface_density)) -t_cross = scale_height / soundspeed -print 'dynamical time = ',t_dyn,' sound crossing time = ',t_cross,' sound speed= ',soundspeed,' 3D velocity dispersion = ',v_disp,' thermal_energy= ',utherm - - -# Parameters -periodic= 1 # 1 For periodic box -boxSize = 400. # [kpc] -Radius = 100. 
# maximum radius of particles [kpc] -G = const_G +#################################################################### -# File -fileName = "Disc-Patch.hdf5" - -#--------------------------------------------------- -mass = 1 - -#-------------------------------------------------- - - -# using glass ICs -# read glass file and generate gas positions and tile it ntile times in each dimension -ntile = 1 -inglass = 'glassCube_32.hdf5' -infile = h5py.File(inglass, "r") -one_glass_p = infile["/PartType0/Coordinates"][:,:] -one_glass_h = infile["/PartType0/SmoothingLength"][:] - -# scale in [-0.5,0.5]*BoxSize / ntile -one_glass_p[:,:] -= 0.5 -one_glass_p *= boxSize / ntile -one_glass_h *= boxSize / ntile -ndens_glass = (one_glass_h.shape[0]) / (boxSize/ntile)**3 -h_glass = numpy.amin(one_glass_h) * (boxSize/ntile) - -glass_p = [] -glass_h = [] -for ix in range(0,ntile): - for iy in range(0,ntile): - for iz in range(0,ntile): - shift = one_glass_p.copy() - shift[:,0] += (ix-(ntile-1)/2.) * boxSize / ntile - shift[:,1] += (iy-(ntile-1)/2.) * boxSize / ntile - shift[:,2] += (iz-(ntile-1)/2.) * boxSize / ntile - glass_p.append(shift) - glass_h.append(one_glass_h.copy()) - -glass_p = numpy.concatenate(glass_p, axis=0) -glass_h = numpy.concatenate(glass_h, axis=0) - -# random shuffle of glas ICs -numpy.random.seed(12345) -indx = numpy.random.rand(numpy.shape(glass_h)[0]) -indx = numpy.argsort(indx) -glass_p = glass_p[indx, :] -glass_h = glass_h[indx] - -# select numGas of them -numGas = 8192 -pos = glass_p[0:numGas,:] -h = glass_h[0:numGas] -numGas = numpy.shape(pos)[0] - -# compute furthe properties of ICs -column_density = fgas * surface_density * numpy.tanh(boxSize/2./scale_height) -enclosed_mass = column_density * boxSize * boxSize -pmass = enclosed_mass / numGas -meanrho = enclosed_mass / boxSize**3 -print 'pmass= ',pmass,' mean(rho) = ', meanrho,' entropy= ', (gamma-1) * utherm / meanrho**(gamma-1) - -# desired density -rho = surface_density / (2. 
* scale_height) / numpy.cosh(abs(pos[:,2])/scale_height)**2 -u = (1. + 0 * h) * utherm -entropy = (gamma-1) * u / rho**(gamma-1) -mass = 0.*h + pmass -entropy_flag = 0 -vel = 0 + 0 * pos - -# move centre of disc to middle of box -pos[:,:] += boxSize/2 - - -# create numPart dm particles -numPart = 0 +# physical constants in cgs +NEWTON_GRAVITY_CGS = 6.67408e-8 +SOLAR_MASS_IN_CGS = 1.9885e33 +PARSEC_IN_CGS = 3.08567758149e18 +PROTON_MASS_IN_CGS = 1.672621898e-24 +BOLTZMANN_IN_CGS = 1.38064852e-16 +YEAR_IN_CGS = 3.15569252e7 +# choice of units +unit_length_in_cgs = (PARSEC_IN_CGS) +unit_mass_in_cgs = (SOLAR_MASS_IN_CGS) +unit_velocity_in_cgs = (1e5) +unit_time_in_cgs = unit_length_in_cgs / unit_velocity_in_cgs + +print "UnitMass_in_cgs: %.5e"%unit_mass_in_cgs +print "UnitLength_in_cgs: %.5e"%unit_length_in_cgs +print "UnitVelocity_in_cgs: %.5e"%unit_velocity_in_cgs +print "UnitTime_in_cgs: %.5e"%unit_time_in_cgs +print "" + +# Derived units +const_G = NEWTON_GRAVITY_CGS * unit_mass_in_cgs * unit_time_in_cgs**2 * \ + unit_length_in_cgs**-3 +const_mp = PROTON_MASS_IN_CGS * unit_mass_in_cgs**-1 +const_kb = BOLTZMANN_IN_CGS * unit_mass_in_cgs**-1 * unit_length_in_cgs**-2 * \ + unit_time_in_cgs**2 + +print "--- Some constants [internal units] ---" +print "G_Newton: %.5e"%const_G +print "m_proton: %.5e"%const_mp +print "k_boltzmann: %.5e"%const_kb +print "" + +# derived quantities +temp = math.pi * const_G * surface_density * scale_height * const_mp / \ + const_kb +u_therm = const_kb * temp / ((gas_gamma-1) * const_mp) +v_disp = math.sqrt(2 * u_therm) +soundspeed = math.sqrt(u_therm / (gas_gamma * (gas_gamma-1.))) +t_dyn = math.sqrt(scale_height / (const_G * surface_density)) +t_cross = scale_height / soundspeed + +print "--- Properties of the gas [internal units] ---" +print "Gas temperature: %.5e"%temp +print "Gas thermal_energy: %.5e"%u_therm +print "Dynamical time: %.5e"%t_dyn +print "Sound crossing time: %.5e"%t_cross +print "Gas sound speed: %.5e"%soundspeed +print 
"Gas 3D vel_disp: %.5e"%v_disp +print "" + +# Problem properties +boxSize_x = side_length +boxSize_y = boxSize_x +boxSize_z = boxSize_x +boxSize_x *= x_factor +volume = boxSize_x * boxSize_y * boxSize_z +M_tot = boxSize_y * boxSize_z * surface_density * \ + math.tanh(boxSize_x / (2. * scale_height)) +density = M_tot / volume +entropy = (gas_gamma - 1.) * u_therm / density**(gas_gamma - 1.) + +print "--- Problem properties [internal units] ---" +print "Box: [%.1f, %.1f, %.1f]"%(boxSize_x, boxSize_y, boxSize_z) +print "Volume: %.5e"%volume +print "Total mass: %.5e"%M_tot +print "Density: %.5e"%density +print "Entropy: %.5e"%entropy +print "" + +#################################################################### + +# Read glass pre-ICs +infile = h5py.File('glassCube_32.hdf5', "r") +one_glass_pos = infile["/PartType0/Coordinates"][:,:] +one_glass_h = infile["/PartType0/SmoothingLength"][:] + +# Rescale to the problem size +one_glass_pos *= side_length +one_glass_h *= side_length + +# Now create enough copies to fill the volume in x +pos = np.copy(one_glass_pos) +h = np.copy(one_glass_h) +for i in range(1, x_factor): + one_glass_pos[:,0] += side_length + pos = np.append(pos, one_glass_pos, axis=0) + h = np.append(h, one_glass_h, axis=0) + +# Compute further properties of ICs +numPart = np.size(h) +mass = M_tot / numPart + +print "--- Particle properties [internal units] ---" +print "Number part.: ", numPart +print "Part. 
mass: %.5e"%mass +print "" + +# Create additional arrays +u = np.ones(numPart) * u_therm +mass = np.ones(numPart) * mass +vel = np.zeros((numPart, 3)) +ids = 1 + np.linspace(0, numPart, numPart, endpoint=False) + +#################################################################### # Create and write output file #File @@ -160,97 +164,45 @@ file = h5py.File(fileName, 'w') #Units grp = file.create_group("/Units") -grp.attrs["Unit length in cgs (U_L)"] = const_unit_length_in_cgs -grp.attrs["Unit mass in cgs (U_M)"] = const_unit_mass_in_cgs -grp.attrs["Unit time in cgs (U_t)"] = const_unit_length_in_cgs / const_unit_velocity_in_cgs +grp.attrs["Unit length in cgs (U_L)"] = unit_length_in_cgs +grp.attrs["Unit mass in cgs (U_M)"] = unit_mass_in_cgs +grp.attrs["Unit time in cgs (U_t)"] = unit_time_in_cgs grp.attrs["Unit current in cgs (U_I)"] = 1. grp.attrs["Unit temperature in cgs (U_T)"] = 1. # Header grp = file.create_group("/Header") -grp.attrs["BoxSize"] = boxSize -grp.attrs["NumPart_Total"] = [numGas, numPart, 0, 0, 0, 0] +grp.attrs["BoxSize"] = [boxSize_x, boxSize_y, boxSize_z] +grp.attrs["NumPart_Total"] = [numPart, 0, 0, 0, 0, 0] grp.attrs["NumPart_Total_HighWord"] = [0, 0, 0, 0, 0, 0] -grp.attrs["NumPart_ThisFile"] = [numGas, numPart, 0, 0, 0, 0] +grp.attrs["NumPart_ThisFile"] = [numPart, 0, 0, 0, 0, 0] grp.attrs["Time"] = 0.0 grp.attrs["NumFilesPerSnapshot"] = 1 grp.attrs["MassTable"] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0] -grp.attrs["Flag_Entropy_ICs"] = [entropy_flag] +grp.attrs["Flag_Entropy_ICs"] = [0, 0, 0, 0, 0, 0] grp.attrs["Dimension"] = 3 #Runtime parameters grp = file.create_group("/RuntimePars") -grp.attrs["PeriodicBoundariesOn"] = periodic - +grp.attrs["PeriodicBoundariesOn"] = 1 # write gas particles grp0 = file.create_group("/PartType0") -ds = grp0.create_dataset('Coordinates', (numGas, 3), 'f') -ds[()] = pos - -ds = grp0.create_dataset('Velocities', (numGas, 3), 'f') -ds[()] = vel - -ds = grp0.create_dataset('Masses', (numGas,), 'f') -ds[()] = mass - 
-ds = grp0.create_dataset('SmoothingLength', (numGas,), 'f') -ds[()] = h - -ds = grp0.create_dataset('InternalEnergy', (numGas,), 'f') -u = numpy.full((numGas, ), utherm) -if (entropy_flag == 1): - ds[()] = entropy -else: - ds[()] = u - -ids = 1 + numpy.linspace(0, numGas, numGas, endpoint=False) -ds = grp0.create_dataset('ParticleIDs', (numGas, ), 'L') -ds[()] = ids - -print "Internal energy:", u[0] - -# generate dark matter particles if needed -if(numPart > 0): - - # set seed for random number - numpy.random.seed(1234) - - grp1 = file.create_group("/PartType1") - - radius = Radius * (numpy.random.rand(N))**(1./3.) - ctheta = -1. + 2 * numpy.random.rand(N) - stheta = numpy.sqrt(1.-ctheta**2) - phi = 2 * math.pi * numpy.random.rand(N) - r = numpy.zeros((numPart, 3)) - - speed = vrot - v = numpy.zeros((numPart, 3)) - omega = speed / radius - period = 2.*math.pi/omega - print 'period = minimum = ',min(period), ' maximum = ',max(period) - - v[:,0] = -omega * r[:,1] - v[:,1] = omega * r[:,0] - - ds = grp1.create_dataset('Coordinates', (numPart, 3), 'd') - ds[()] = r - - ds = grp1.create_dataset('Velocities', (numPart, 3), 'f') - ds[()] = v - v = numpy.zeros(1) - - m = numpy.full((numPart, ),10) - ds = grp1.create_dataset('Masses', (numPart,), 'f') - ds[()] = m - m = numpy.zeros(1) - - ids = 1 + numpy.linspace(0, numPart, numPart, endpoint=False, dtype='L') - ds = grp1.create_dataset('ParticleIDs', (numPart, ), 'L') - ds[()] = ids - - -file.close() - -sys.exit() +ds = grp0.create_dataset('Coordinates', (numPart, 3), 'f', data=pos) +ds = grp0.create_dataset('Velocities', (numPart, 3), 'f') +ds = grp0.create_dataset('Masses', (numPart,), 'f', data=mass) +ds = grp0.create_dataset('SmoothingLength', (numPart,), 'f', data=h) +ds = grp0.create_dataset('InternalEnergy', (numPart,), 'f', data=u) +ds = grp0.create_dataset('ParticleIDs', (numPart, ), 'L', data=ids) + +#################################################################### + +print "--- Runtime parameters (YAML 
file): ---" +print "DiscPatchPotential:surface_density: ", surface_density +print "DiscPatchPotential:scale_height: ", scale_height +print "DiscPatchPotential:x_disc: ", 0.5 * boxSize_x +print "" + +print "--- Constant parameters: ---" +print "const_isothermal_internal_energy: %ef"%u_therm diff --git a/examples/DiscPatch/HydroStatic/plot.py b/examples/DiscPatch/HydroStatic/plotSolution.py similarity index 58% rename from examples/DiscPatch/HydroStatic/plot.py rename to examples/DiscPatch/HydroStatic/plotSolution.py index 2de749f9e3b3c287390218e09ea347d660f9ce8a..681f7d8ab3f2320b5de75e688edcb92efef9d883 100644 --- a/examples/DiscPatch/HydroStatic/plot.py +++ b/examples/DiscPatch/HydroStatic/plotSolution.py @@ -1,6 +1,7 @@ ################################################################################ # This file is part of SWIFT. # Copyright (c) 2017 Bert Vandenbroucke (bert.vandenbroucke@gmail.com) +# Matthieu Schaller (matthieu.schaller@durham.ac.uk) # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Lesser General Public License as published @@ -20,7 +21,7 @@ ## # This script plots the Disc-Patch_*.hdf5 snapshots. # It takes two (optional) parameters: the counter value of the first and last -# snapshot to plot (default: 0 81). +# snapshot to plot (default: 0 21). ## import numpy as np @@ -34,12 +35,14 @@ import sys # Parameters surface_density = 10. scale_height = 100. -z_disc = 200. -utherm = 20.2615290634 +x_disc = 400. +x_trunc = 300. +x_max = 350. +utherm = 20.2678457288 gamma = 5. / 3. 
start = 0 -stop = 81 +stop = 21 if len(sys.argv) > 1: start = int(sys.argv[1]) if len(sys.argv) > 2: @@ -48,14 +51,14 @@ if len(sys.argv) > 2: # Get the analytic solution for the density def get_analytic_density(x): return 0.5 * surface_density / scale_height / \ - np.cosh( (x - z_disc) / scale_height )**2 + np.cosh( (x - x_disc) / scale_height )**2 # Get the analytic solution for the (isothermal) pressure def get_analytic_pressure(x): return (gamma - 1.) * utherm * get_analytic_density(x) # Get the data fields to plot from the snapshot file with the given name: -# snapshot time, z-coord, density, pressure, velocity norm +# snapshot time, x-coord, density, pressure, velocity norm def get_data(name): file = h5py.File(name, "r") coords = np.array(file["/PartType0/Coordinates"]) @@ -67,7 +70,7 @@ def get_data(name): vtot = np.sqrt( v[:,0]**2 + v[:,1]**2 + v[:,2]**2 ) - return float(file["/Header"].attrs["Time"]), coords[:,2], rho, P, vtot + return float(file["/Header"].attrs["Time"]), coords[:,0], rho, P, vtot # scan the folder for snapshot files and plot all of them (within the requested # range) @@ -78,23 +81,38 @@ for f in sorted(glob.glob("Disc-Patch_*.hdf5")): print "processing", f, "..." - zrange = np.linspace(0., 400., 1000) - time, z, rho, P, v = get_data(f) + xrange = np.linspace(0., 2. 
* x_disc, 1000) + time, x, rho, P, v = get_data(f) fig, ax = pl.subplots(3, 1, sharex = True) - ax[0].plot(z, rho, "r.") - ax[0].plot(zrange, get_analytic_density(zrange), "k-") + ax[0].plot(x, rho, "r.") + ax[0].plot(xrange, get_analytic_density(xrange), "k-") + ax[0].plot([x_disc - x_max, x_disc - x_max], [0, 10], "k--", alpha=0.5) + ax[0].plot([x_disc + x_max, x_disc + x_max], [0, 10], "k--", alpha=0.5) + ax[0].plot([x_disc - x_trunc, x_disc - x_trunc], [0, 10], "k--", alpha=0.5) + ax[0].plot([x_disc + x_trunc, x_disc + x_trunc], [0, 10], "k--", alpha=0.5) + ax[0].set_ylim(0., 1.2 * get_analytic_density(x_disc)) ax[0].set_ylabel("density") - ax[1].plot(z, v, "r.") - ax[1].plot(zrange, np.zeros(len(zrange)), "k-") + ax[1].plot(x, v, "r.") + ax[1].plot(xrange, np.zeros(len(xrange)), "k-") + ax[1].plot([x_disc - x_max, x_disc - x_max], [0, 10], "k--", alpha=0.5) + ax[1].plot([x_disc + x_max, x_disc + x_max], [0, 10], "k--", alpha=0.5) + ax[1].plot([x_disc - x_trunc, x_disc - x_trunc], [0, 10], "k--", alpha=0.5) + ax[1].plot([x_disc + x_trunc, x_disc + x_trunc], [0, 10], "k--", alpha=0.5) + ax[1].set_ylim(-0.5, 10.) ax[1].set_ylabel("velocity norm") - ax[2].plot(z, P, "r.") - ax[2].plot(zrange, get_analytic_pressure(zrange), "k-") - ax[2].set_xlim(0., 400.) - ax[2].set_xlabel("z") + ax[2].plot(x, P, "r.") + ax[2].plot(xrange, get_analytic_pressure(xrange), "k-") + ax[2].plot([x_disc - x_max, x_disc - x_max], [0, 10], "k--", alpha=0.5) + ax[2].plot([x_disc + x_max, x_disc + x_max], [0, 10], "k--", alpha=0.5) + ax[2].plot([x_disc - x_trunc, x_disc - x_trunc], [0, 10], "k--", alpha=0.5) + ax[2].plot([x_disc + x_trunc, x_disc + x_trunc], [0, 10], "k--", alpha=0.5) + ax[2].set_xlim(0., 2. 
* x_disc) + ax[2].set_ylim(0., 1.2 * get_analytic_pressure(x_disc)) + ax[2].set_xlabel("x") ax[2].set_ylabel("pressure") pl.suptitle("t = {0:.2f}".format(time)) diff --git a/examples/DiscPatch/HydroStatic/run.sh b/examples/DiscPatch/HydroStatic/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..e1f47ecad54e7e171d78b7da080d56579e985d1e --- /dev/null +++ b/examples/DiscPatch/HydroStatic/run.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# Generate the initial conditions if they are not present. +if [ ! -e glassCube_32.hdf5 ] +then + echo "Fetching initial glass file for the disc patch example..." + ./getGlass.sh +fi +if [ ! -e Disc-Patch.hdf5 ] +then + echo "Generating initial conditions for the disc patch example..." + python makeIC.py +fi + +# Run SWIFT +../../swift -g -s -t 4 disc-patch-icc.yml 2>&1 | tee output.log + +python plotSolution.py diff --git a/examples/DiscPatch/HydroStatic/test.pro b/examples/DiscPatch/HydroStatic/test.pro deleted file mode 100644 index 950aebc65d7d34cd7aaeb2368734e5492902a912..0000000000000000000000000000000000000000 --- a/examples/DiscPatch/HydroStatic/test.pro +++ /dev/null @@ -1,142 +0,0 @@ -; -; test energy / angular momentum conservation of test problem -; - -iplot = 1 ; if iplot = 1, make plot of E/Lz conservation, else, simply compare final and initial energy - -; set physical constants -@physunits - -indir = './' -basefile = 'Disc-Patch_' - -; set properties of potential -uL = phys.pc ; unit of length -uM = phys.msun ; unit of mass -uV = 1d5 ; unit of velocity - -; properties of patch -surface_density = 10. -scale_height = 100. - -; derived units -constG = 10.^(alog10(phys.g)+alog10(uM)-2d0*alog10(uV)-alog10(uL)) ; -pcentre = [0.,0.,200.] 
* pc / uL - -; -infile = indir + basefile + '*' -spawn,'ls -1 '+infile,res -nfiles = n_elements(res) - - -; choose: calculate change of energy and Lz, comparing first and last -; snapshots for all particles, or do so for a subset - -; compare all -ifile = 0 -inf = indir + basefile + strtrim(string(ifile,'(i3.3)'),1) + '.hdf5' -id = h5rd(inf,'PartType0/ParticleIDs') -nfollow = n_elements(id) - -; follow a subset -; nfollow = min(4000, nfollow) ; number of particles to follow - -; -if (iplot eq 1) then begin - nskip = 1 - nsave = nfiles -endif else begin - nskip = nfiles - 2 - nsave = 2 -endelse - -; -lout = fltarr(nfollow, nsave) ; Lz -xout = fltarr(nfollow, nsave) ; x -yout = fltarr(nfollow, nsave) ; y -zout = fltarr(nfollow, nsave) ; z -vzout = fltarr(nfollow, nsave) ; z -rout = fltarr(nfollow, nsave) ; rho -hout = fltarr(nfollow, nsave) ; h -uout = fltarr(nfollow, nsave) ; thermal energy -eout = fltarr(nfollow, nsave) ; energies -ekin = fltarr(nfollow, nsave) -epot = fltarr(nfollow, nsave) ; 2 pi G Sigma b ln(cosh(z/b)) + const -tout = fltarr(nsave) - -ifile = 0 -isave = 0 -for ifile=0,nfiles-1,nskip do begin - inf = indir + basefile + strtrim(string(ifile,'(i3.3)'),1) + '.hdf5' - time = h5ra(inf, 'Header','Time') - p = h5rd(inf,'PartType0/Coordinates') - v = h5rd(inf,'PartType0/Velocities') - id = h5rd(inf,'PartType0/ParticleIDs') - rho = h5rd(inf,'PartType0/Density') - h = h5rd(inf,'PartType0/SmoothingLength') - utherm = h5rd(inf,'PartType0/InternalEnergy') - indx = sort(id) - -; if you want to sort particles by ID - id = id[indx] - rho = rho[indx] - utherm = utherm[indx] - h = h[indx] - for ic=0,2 do begin - tmp = reform(p[ic,*]) & p[ic,*] = tmp[indx] - tmp = reform(v[ic,*]) & v[ic,*] = tmp[indx] - endfor - -; calculate energy - dd = size(p,/dimen) & npart = dd[1] - ener = fltarr(npart) - dr = fltarr(npart) & dv = dr - for ic=0,2 do dr[*] = dr[*] + (p[ic,*]-pcentre[ic])^2 - for ic=0,2 do dv[*] = dv[*] + v[ic,*]^2 - xout[*,isave] = p[0,0:nfollow-1]-pcentre[0] - 
yout[*,isave] = p[1,0:nfollow-1]-pcentre[1] - zout[*,isave] = p[2,0:nfollow-1]-pcentre[2] - vzout[*,isave]= v[2,0:nfollow-1] - rout[*,isave] = rho[0:nfollow-1] - hout[*,isave] = h[0:nfollow-1] - uout[*,isave] = utherm[0:nfollow-1] - Lz = (p[0,*]-pcentre[0]) * v[1,*] - (p[1,*]-pcentre[1]) * v[0,*] - dz = reform(p[2,0:nfollow-1]-pcentre[2]) -; print,'time = ',time,p[0,0],v[0,0],id[0] - ek = 0.5 * dv - ep = fltarr(nfollow) - ep = 2 * !pi * constG * surface_density * scale_height * alog(cosh(abs(dz)/scale_height)) - ener = ek + ep - tout(isave) = time - lout[*,isave] = lz[0:nfollow-1] - eout(*,isave) = ener[0:nfollow-1] - ekin(*,isave) = ek[0:nfollow-1] - epot(*,isave) = ep[0:nfollow-1] - print,format='('' time= '',f7.1,'' E= '',f9.2,'' Lz= '',e9.2)', time,eout[0],lz[0] - isave = isave + 1 - -endfor - -x0 = reform(xout[0,*]) -y0 = reform(xout[1,*]) -z0 = reform(xout[2,*]) - - -; plot density profile and compare to analytic profile -nplot = nfollow - - ; plot density profile -wset,0 -xr = [0, 3*scale_height] -nbins = 100 -zpos = findgen(nbins)/float(nbins-1) * max(xr) -dens = (surface_density/(2.d0*scale_height)) * 1./cosh(zpos/scale_height)^2 -plot,[0],[0],xr=xr,/xs,yr=[0,max(dens)*1.4],/ys,/nodata,xtitle='|z|',ytitle='density' -oplot,zpos,dens,color=black,thick=3 -;oplot,abs(zout[*,1]),rout[*,1],psym=3 ; initial profile -oplot,abs(zout[*,nsave-1]),rout[*,nsave-1],psym=3,color=red - - -end - - diff --git a/examples/DiscPatch/HydroStatic_1D/disc-patch-icc.yml b/examples/DiscPatch/HydroStatic_1D/disc-patch-icc.yml new file mode 100644 index 0000000000000000000000000000000000000000..6f17cfbb1e0125faf8e47fe4e9e55bfdf4df7b71 --- /dev/null +++ b/examples/DiscPatch/HydroStatic_1D/disc-patch-icc.yml @@ -0,0 +1,46 @@ +# Define the system of units to use internally. 
+InternalUnitSystem: + UnitMass_in_cgs: 1.9885e33 # Grams + UnitLength_in_cgs: 3.08567758149e18 # Centimeters + UnitVelocity_in_cgs: 1e5 # Centimeters per second + UnitCurrent_in_cgs: 1 # Amperes + UnitTemp_in_cgs: 1 # Kelvin + +# Parameters governing the time integration +TimeIntegration: + time_begin: 0 # The starting time of the simulation (in internal units). + time_end: 968. # The end time of the simulation (in internal units). + dt_min: 1e-4 # The minimal time-step size of the simulation (in internal units). + dt_max: 10. # The maximal time-step size of the simulation (in internal units). + +# Parameters governing the conserved quantities statistics +Statistics: + delta_time: 12. # Time between statistics output + +# Parameters governing the snapshots +Snapshots: + basename: Disc-Patch # Common part of the name of output files + time_first: 0. # Time of the first output (in internal units) + delta_time: 48. # Time difference between outputs (in internal units) + +# Parameters for the hydrodynamics scheme +SPH: + resolution_eta: 1.2349 # Target smoothing length in units of the mean inter-particle separation (1.2349 == 48Ngbs with the cubic spline kernel). + delta_neighbours: 0.1 # The tolerance for the targetted number of neighbours. + CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. + max_ghost_iterations: 30 # Maximal number of iterations allowed to converge towards the smoothing length. + h_max: 60. # Maximal smoothing length allowed (in internal units). + +# Parameters related to the initial conditions +InitialConditions: + file_name: Disc-Patch.hdf5 # The file to read + +# External potential parameters +DiscPatchPotential: + surface_density: 10. + scale_height: 100. + x_disc: 400. + x_trunc: 300. + x_max: 350. + timestep_mult: 0.03 + growth_time: 5. 
diff --git a/examples/DiscPatch/HydroStatic_1D/makeIC.py b/examples/DiscPatch/HydroStatic_1D/makeIC.py new file mode 100644 index 0000000000000000000000000000000000000000..1589dfc8c73e5b9bf3c2cad4bcf3029654d9e67e --- /dev/null +++ b/examples/DiscPatch/HydroStatic_1D/makeIC.py @@ -0,0 +1,194 @@ +############################################################################### +# This file is part of SWIFT. +# Copyright (c) 2016 John A. Regan (john.a.regan@durham.ac.uk) +# Tom Theuns (tom.theuns@durham.ac.uk) +# 2017 Matthieu Schaller (matthieu.schaller@durham.ac.uk) +# Bert Vandenbroucke (bert.vandenbroucke@gmail.com) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +############################################################################## + +import h5py +import sys +import numpy as np +import math +import random + +# Generates a disc-patch in hydrostatic equilibrium +# +# See Creasey, Theuns & Bower, 2013, MNRAS, Volume 429, Issue 3, p.1922-1948 +# +# +# Disc parameters are: surface density -- sigma +# scale height -- b +# gas adiabatic index -- gamma +# +# Problem parameters are: Ratio height/width of the box -- z_factor +# Size of the patch -- side_length + +# Parameters of the gas disc +surface_density = 10. +scale_height = 100. +gas_gamma = 5./3. + +# Parameters of the problem +x_factor = 2 +side_length = 400. 
+numPart = 1000 + +# File +fileName = "Disc-Patch.hdf5" + +#################################################################### + +# physical constants in cgs +NEWTON_GRAVITY_CGS = 6.67408e-8 +SOLAR_MASS_IN_CGS = 1.9885e33 +PARSEC_IN_CGS = 3.08567758149e18 +PROTON_MASS_IN_CGS = 1.672621898e-24 +BOLTZMANN_IN_CGS = 1.38064852e-16 +YEAR_IN_CGS = 3.15569252e7 + +# choice of units +unit_length_in_cgs = (PARSEC_IN_CGS) +unit_mass_in_cgs = (SOLAR_MASS_IN_CGS) +unit_velocity_in_cgs = (1e5) +unit_time_in_cgs = unit_length_in_cgs / unit_velocity_in_cgs + +print "UnitMass_in_cgs: %.5e"%unit_mass_in_cgs +print "UnitLength_in_cgs: %.5e"%unit_length_in_cgs +print "UnitVelocity_in_cgs: %.5e"%unit_velocity_in_cgs +print "UnitTime_in_cgs: %.5e"%unit_time_in_cgs +print "" + +# Derived units +const_G = NEWTON_GRAVITY_CGS * unit_mass_in_cgs * unit_time_in_cgs**2 * \ + unit_length_in_cgs**-3 +const_mp = PROTON_MASS_IN_CGS * unit_mass_in_cgs**-1 +const_kb = BOLTZMANN_IN_CGS * unit_mass_in_cgs**-1 * unit_length_in_cgs**-2 * \ + unit_time_in_cgs**2 + +print "--- Some constants [internal units] ---" +print "G_Newton: %.5e"%const_G +print "m_proton: %.5e"%const_mp +print "k_boltzmann: %.5e"%const_kb +print "" + +# derived quantities +temp = math.pi * const_G * surface_density * scale_height * const_mp / \ + const_kb +u_therm = const_kb * temp / ((gas_gamma-1) * const_mp) +v_disp = math.sqrt(2 * u_therm) +soundspeed = math.sqrt(u_therm / (gas_gamma * (gas_gamma-1.))) +t_dyn = math.sqrt(scale_height / (const_G * surface_density)) +t_cross = scale_height / soundspeed + +print "--- Properties of the gas [internal units] ---" +print "Gas temperature: %.5e"%temp +print "Gas thermal_energy: %.5e"%u_therm +print "Dynamical time: %.5e"%t_dyn +print "Sound crossing time: %.5e"%t_cross +print "Gas sound speed: %.5e"%soundspeed +print "Gas 3D vel_disp: %.5e"%v_disp +print "" + +# Problem properties +boxSize_x = side_length +boxSize_x *= x_factor +volume = boxSize_x +M_tot = surface_density * 
math.tanh(boxSize_x / (2. * scale_height)) +density = M_tot / volume +entropy = (gas_gamma - 1.) * u_therm / density**(gas_gamma - 1.) + +print "--- Problem properties [internal units] ---" +print "Box: %.1f"%boxSize_x +print "Volume: %.5e"%volume +print "Total mass: %.5e"%M_tot +print "Density: %.5e"%density +print "Entropy: %.5e"%entropy +print "" + +#################################################################### + +# Now create enough copies to fill the volume in x +pos = np.zeros((numPart, 3)) +h = np.zeros(numPart) + 2. * boxSize_x / numPart +for i in range(numPart): + pos[i, 0] = (i + 0.5) * boxSize_x / numPart + +# Compute further properties of ICs +mass = M_tot / numPart + +print "--- Particle properties [internal units] ---" +print "Number part.: ", numPart +print "Part. mass: %.5e"%mass +print "" + +# Create additional arrays +u = np.ones(numPart) * u_therm +mass = np.ones(numPart) * mass +vel = np.zeros((numPart, 3)) +ids = 1 + np.linspace(0, numPart, numPart, endpoint=False) + +#################################################################### +# Create and write output file + +#File +file = h5py.File(fileName, 'w') + +#Units +grp = file.create_group("/Units") +grp.attrs["Unit length in cgs (U_L)"] = unit_length_in_cgs +grp.attrs["Unit mass in cgs (U_M)"] = unit_mass_in_cgs +grp.attrs["Unit time in cgs (U_t)"] = unit_time_in_cgs +grp.attrs["Unit current in cgs (U_I)"] = 1. +grp.attrs["Unit temperature in cgs (U_T)"] = 1. + +# Header +grp = file.create_group("/Header") +grp.attrs["BoxSize"] = [boxSize_x, 1., 1.] 
+grp.attrs["NumPart_Total"] = [numPart, 0, 0, 0, 0, 0] +grp.attrs["NumPart_Total_HighWord"] = [0, 0, 0, 0, 0, 0] +grp.attrs["NumPart_ThisFile"] = [numPart, 0, 0, 0, 0, 0] +grp.attrs["Time"] = 0.0 +grp.attrs["NumFilesPerSnapshot"] = 1 +grp.attrs["MassTable"] = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0] +grp.attrs["Flag_Entropy_ICs"] = [0, 0, 0, 0, 0, 0] +grp.attrs["Dimension"] = 1 + +#Runtime parameters +grp = file.create_group("/RuntimePars") +grp.attrs["PeriodicBoundariesOn"] = 1 + +# write gas particles +grp0 = file.create_group("/PartType0") + +ds = grp0.create_dataset('Coordinates', (numPart, 3), 'f', data=pos) +ds = grp0.create_dataset('Velocities', (numPart, 3), 'f') +ds = grp0.create_dataset('Masses', (numPart,), 'f', data=mass) +ds = grp0.create_dataset('SmoothingLength', (numPart,), 'f', data=h) +ds = grp0.create_dataset('InternalEnergy', (numPart,), 'f', data=u) +ds = grp0.create_dataset('ParticleIDs', (numPart, ), 'L', data=ids) + +#################################################################### + +print "--- Runtime parameters (YAML file): ---" +print "DiscPatchPotential:surface_density: ", surface_density +print "DiscPatchPotential:scale_height: ", scale_height +print "DiscPatchPotential:x_disc: ", 0.5 * boxSize_x +print "" + +print "--- Constant parameters: ---" +print "const_isothermal_internal_energy: %ef"%u_therm diff --git a/examples/DiscPatch/HydroStatic_1D/plotSolution.py b/examples/DiscPatch/HydroStatic_1D/plotSolution.py new file mode 100644 index 0000000000000000000000000000000000000000..681f7d8ab3f2320b5de75e688edcb92efef9d883 --- /dev/null +++ b/examples/DiscPatch/HydroStatic_1D/plotSolution.py @@ -0,0 +1,121 @@ +################################################################################ +# This file is part of SWIFT. 
+# Copyright (c) 2017 Bert Vandenbroucke (bert.vandenbroucke@gmail.com) +# Matthieu Schaller (matthieu.schaller@durham.ac.uk) +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +################################################################################ + +## +# This script plots the Disc-Patch_*.hdf5 snapshots. +# It takes two (optional) parameters: the counter value of the first and last +# snapshot to plot (default: 0 21). +## + +import numpy as np +import h5py +import matplotlib +matplotlib.use("Agg") +import pylab as pl +import glob +import sys + +# Parameters +surface_density = 10. +scale_height = 100. +x_disc = 400. +x_trunc = 300. +x_max = 350. +utherm = 20.2678457288 +gamma = 5. / 3. + +start = 0 +stop = 21 +if len(sys.argv) > 1: + start = int(sys.argv[1]) +if len(sys.argv) > 2: + stop = int(sys.argv[2]) + +# Get the analytic solution for the density +def get_analytic_density(x): + return 0.5 * surface_density / scale_height / \ + np.cosh( (x - x_disc) / scale_height )**2 + +# Get the analytic solution for the (isothermal) pressure +def get_analytic_pressure(x): + return (gamma - 1.) 
* utherm * get_analytic_density(x) + +# Get the data fields to plot from the snapshot file with the given name: +# snapshot time, x-coord, density, pressure, velocity norm +def get_data(name): + file = h5py.File(name, "r") + coords = np.array(file["/PartType0/Coordinates"]) + rho = np.array(file["/PartType0/Density"]) + u = np.array(file["/PartType0/InternalEnergy"]) + v = np.array(file["/PartType0/Velocities"]) + + P = (gamma - 1.) * rho * u + + vtot = np.sqrt( v[:,0]**2 + v[:,1]**2 + v[:,2]**2 ) + + return float(file["/Header"].attrs["Time"]), coords[:,0], rho, P, vtot + +# scan the folder for snapshot files and plot all of them (within the requested +# range) +for f in sorted(glob.glob("Disc-Patch_*.hdf5")): + num = int(f[-8:-5]) + if num < start or num > stop: + continue + + print "processing", f, "..." + + xrange = np.linspace(0., 2. * x_disc, 1000) + time, x, rho, P, v = get_data(f) + + fig, ax = pl.subplots(3, 1, sharex = True) + + ax[0].plot(x, rho, "r.") + ax[0].plot(xrange, get_analytic_density(xrange), "k-") + ax[0].plot([x_disc - x_max, x_disc - x_max], [0, 10], "k--", alpha=0.5) + ax[0].plot([x_disc + x_max, x_disc + x_max], [0, 10], "k--", alpha=0.5) + ax[0].plot([x_disc - x_trunc, x_disc - x_trunc], [0, 10], "k--", alpha=0.5) + ax[0].plot([x_disc + x_trunc, x_disc + x_trunc], [0, 10], "k--", alpha=0.5) + ax[0].set_ylim(0., 1.2 * get_analytic_density(x_disc)) + ax[0].set_ylabel("density") + + ax[1].plot(x, v, "r.") + ax[1].plot(xrange, np.zeros(len(xrange)), "k-") + ax[1].plot([x_disc - x_max, x_disc - x_max], [0, 10], "k--", alpha=0.5) + ax[1].plot([x_disc + x_max, x_disc + x_max], [0, 10], "k--", alpha=0.5) + ax[1].plot([x_disc - x_trunc, x_disc - x_trunc], [0, 10], "k--", alpha=0.5) + ax[1].plot([x_disc + x_trunc, x_disc + x_trunc], [0, 10], "k--", alpha=0.5) + ax[1].set_ylim(-0.5, 10.) 
+ ax[1].set_ylabel("velocity norm") + + ax[2].plot(x, P, "r.") + ax[2].plot(xrange, get_analytic_pressure(xrange), "k-") + ax[2].plot([x_disc - x_max, x_disc - x_max], [0, 10], "k--", alpha=0.5) + ax[2].plot([x_disc + x_max, x_disc + x_max], [0, 10], "k--", alpha=0.5) + ax[2].plot([x_disc - x_trunc, x_disc - x_trunc], [0, 10], "k--", alpha=0.5) + ax[2].plot([x_disc + x_trunc, x_disc + x_trunc], [0, 10], "k--", alpha=0.5) + ax[2].set_xlim(0., 2. * x_disc) + ax[2].set_ylim(0., 1.2 * get_analytic_pressure(x_disc)) + ax[2].set_xlabel("x") + ax[2].set_ylabel("pressure") + + pl.suptitle("t = {0:.2f}".format(time)) + + pl.savefig("{name}.png".format(name = f[:-5])) + pl.close() diff --git a/examples/DiscPatch/HydroStatic_1D/run.sh b/examples/DiscPatch/HydroStatic_1D/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..e9d073a6cc7a06ec9ebd9fdb556c44778d32c7f4 --- /dev/null +++ b/examples/DiscPatch/HydroStatic_1D/run.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +# Generate the initial conditions if they are not present. +if [ ! -e Disc-Patch.hdf5 ] +then + echo "Generating initial conditions for the disc patch example..." + python makeIC.py +fi + +# Run SWIFT +../../swift -g -s -t 4 disc-patch-icc.yml 2>&1 | tee output.log + +python plotSolution.py diff --git a/examples/EAGLE_100/eagle_100.yml b/examples/EAGLE_100/eagle_100.yml index a9b83b81f085e66b36d115c5265b66d6093ffdfb..1ea1518825debe56cb8462c4a1b398c03c257bfe 100644 --- a/examples/EAGLE_100/eagle_100.yml +++ b/examples/EAGLE_100/eagle_100.yml @@ -23,6 +23,12 @@ Snapshots: Statistics: delta_time: 1e-2 # Time between statistics output +# Parameters for the self-gravity scheme +Gravity: + eta: 0.025 # Constant dimensionless multiplier for time integration. + epsilon: 0.0001 # Softening length (in internal units). 
+ theta: 0.7 # Opening angle (Multipole acceptance criterion) + # Parameters for the hydrodynamics scheme SPH: resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). diff --git a/examples/EAGLE_12/eagle_12.yml b/examples/EAGLE_12/eagle_12.yml index 6afffed0f9d39b34588b89569a85ab56223fc548..f56c330590ac25cc5b3fe8f68ed68aa1e94d6490 100644 --- a/examples/EAGLE_12/eagle_12.yml +++ b/examples/EAGLE_12/eagle_12.yml @@ -12,9 +12,6 @@ TimeIntegration: time_end: 1e-2 # The end time of the simulation (in internal units). dt_min: 1e-10 # The minimal time-step size of the simulation (in internal units). dt_max: 1e-4 # The maximal time-step size of the simulation (in internal units). - -Scheduler: - cell_split_size: 50 # Parameters governing the snapshots Snapshots: @@ -29,8 +26,8 @@ Statistics: # Parameters for the self-gravity scheme Gravity: eta: 0.025 # Constant dimensionless multiplier for time integration. + epsilon: 0.001 # Softening length (in internal units). theta: 0.7 # Opening angle (Multipole acceptance criterion) - epsilon: 0.0001 # Softening length (in internal units). # Parameters for the hydrodynamics scheme SPH: diff --git a/examples/EAGLE_25/eagle_25.yml b/examples/EAGLE_25/eagle_25.yml index c755768bcfafebf3efe6307080e9e85d3a0a4bf5..5dee9dad0b5d7f694c61fa4c983ead0f1cd6e5e2 100644 --- a/examples/EAGLE_25/eagle_25.yml +++ b/examples/EAGLE_25/eagle_25.yml @@ -27,8 +27,7 @@ Statistics: Gravity: eta: 0.025 # Constant dimensionless multiplier for time integration. epsilon: 0.0001 # Softening length (in internal units). - a_smooth: 1000. - r_cut: 4. 
+ theta: 0.7 # Opening angle (Multipole acceptance criterion) # Parameters for the hydrodynamics scheme SPH: diff --git a/examples/EAGLE_50/eagle_50.yml b/examples/EAGLE_50/eagle_50.yml index b84b1eb7c362f85d8cd6a08ff2a15f72d1337396..898c28935abd02ec115ce107bdcfa4006c41dc48 100644 --- a/examples/EAGLE_50/eagle_50.yml +++ b/examples/EAGLE_50/eagle_50.yml @@ -23,6 +23,12 @@ Snapshots: Statistics: delta_time: 1e-2 # Time between statistics output +# Parameters for the self-gravity scheme +Gravity: + eta: 0.025 # Constant dimensionless multiplier for time integration. + epsilon: 0.0001 # Softening length (in internal units). + theta: 0.7 # Opening angle (Multipole acceptance criterion) + # Parameters for the hydrodynamics scheme SPH: resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). diff --git a/examples/EAGLE_6/README b/examples/EAGLE_6/README new file mode 100644 index 0000000000000000000000000000000000000000..9fe951252f1abf4e27264c6497ec14451080b01e --- /dev/null +++ b/examples/EAGLE_6/README @@ -0,0 +1,13 @@ +ICs extracted from the EAGLE suite of simulations. + +WARNING: These ICs correspond to a very small cosmological volume +and are not representative of actual load-balancing of large runs. + +The particle distribution here is the snapshot 27 (z=0.1) of the 6.25Mpc +Ref-model. h- and a- factors from the original Gadget code have been +corrected for. Variables not used in a pure hydro & gravity code have +been removed. +Everything is ready to be run without cosmological integration. + +MD5 checksum of the ICs: +a4efccd3646a60ad8600ac3a2895ea82 EAGLE_ICs_6.hdf5 diff --git a/examples/EAGLE_6/eagle_6.yml b/examples/EAGLE_6/eagle_6.yml new file mode 100644 index 0000000000000000000000000000000000000000..f55ecc856953d4cb60a86e3461625318a1757693 --- /dev/null +++ b/examples/EAGLE_6/eagle_6.yml @@ -0,0 +1,44 @@ +# Define the system of units to use internally. 
+InternalUnitSystem: + UnitMass_in_cgs: 1.989e43 # 10^10 M_sun in grams + UnitLength_in_cgs: 3.085678e24 # Mpc in centimeters + UnitVelocity_in_cgs: 1e5 # km/s in centimeters per second + UnitCurrent_in_cgs: 1 # Amperes + UnitTemp_in_cgs: 1 # Kelvin + +# Parameters governing the time integration +TimeIntegration: + time_begin: 0. # The starting time of the simulation (in internal units). + time_end: 1e-2 # The end time of the simulation (in internal units). + dt_min: 1e-10 # The minimal time-step size of the simulation (in internal units). + dt_max: 1e-4 # The maximal time-step size of the simulation (in internal units). + +Scheduler: + cell_split_size: 64 + +# Parameters governing the snapshots +Snapshots: + basename: eagle # Common part of the name of output files + time_first: 0. # Time of the first output (in internal units) + delta_time: 1e-3 # Time difference between consecutive outputs (in internal units) + +# Parameters governing the conserved quantities statistics +Statistics: + delta_time: 1e-2 # Time between statistics output + +# Parameters for the self-gravity scheme +Gravity: + eta: 0.025 # Constant dimensionless multiplier for time integration. + theta: 0.7 # Opening angle (Multipole acceptance criterion) + epsilon: 0.0001 # Softening length (in internal units). + +# Parameters for the hydrodynamics scheme +SPH: + resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). + delta_neighbours: 0.1 # The tolerance for the targetted number of neighbours. + CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. 
+ +# Parameters related to the initial conditions +InitialConditions: + file_name: ./EAGLE_ICs_6.hdf5 # The file to read + diff --git a/examples/EAGLE_6/getIC.sh b/examples/EAGLE_6/getIC.sh new file mode 100755 index 0000000000000000000000000000000000000000..08daa32a9b708532ab3e78924fb44f7c5dd06795 --- /dev/null +++ b/examples/EAGLE_6/getIC.sh @@ -0,0 +1,2 @@ +#!/bin/bash +wget http://virgodb.cosma.dur.ac.uk/swift-webstorage/ICs/EAGLE_ICs_6.hdf5 diff --git a/examples/EAGLE_6/run.sh b/examples/EAGLE_6/run.sh new file mode 100755 index 0000000000000000000000000000000000000000..d8e5592467a115460bb455ab31bb5e1f4017a948 --- /dev/null +++ b/examples/EAGLE_6/run.sh @@ -0,0 +1,11 @@ +#!/bin/bash + + # Generate the initial conditions if they are not present. +if [ ! -e EAGLE_ICs_6.hdf5 ] +then + echo "Fetching initial conditions for the EAGLE 6Mpc example..." + ./getIC.sh +fi + +../swift -s -t 16 eagle_6.yml 2>&1 | tee output.log + diff --git a/examples/ExternalPointMass/energy_plot.py b/examples/ExternalPointMass/energy_plot.py index 25640bcb5af2966dcd57efbe1a814bb18ac4f263..1863305614c226f64faac3d86fa2f809d49b9d74 100644 --- a/examples/ExternalPointMass/energy_plot.py +++ b/examples/ExternalPointMass/energy_plot.py @@ -34,7 +34,7 @@ import sys stats_filename = "./energy.txt" # First snapshot -snap_filename = "pointMass_000.hdf5" +snap_filename = "pointMass_0000.hdf5" f = h5.File(snap_filename,'r') # Read the units parameters from the snapshot @@ -71,7 +71,7 @@ Lz_snap = np.zeros(402) # Read all the particles from the snapshots for i in range(402): - snap_filename = "pointMass_%0.3d.hdf5"%i + snap_filename = "pointMass_%0.4d.hdf5"%i f = h5.File(snap_filename,'r') pos_x = f["PartType1/Coordinates"][:,0] diff --git a/examples/Gradients/run.sh b/examples/Gradients/run.sh index cc1adc676427b257445f64a011ed8ebee87285ab..44c25ac5695175c40483d9f8b3bbd160b2fcbc0a 100755 --- a/examples/Gradients/run.sh +++ b/examples/Gradients/run.sh @@ -2,12 +2,12 @@ python makeICs.py stretched 
../swift -s -t 2 gradientsStretched.yml -python plot.py gradients_stretched_001.hdf5 stretched +python plot.py gradients_stretched_0001.hdf5 stretched python makeICs.py cartesian ../swift -s -t 2 gradientsCartesian.yml -python plot.py gradients_cartesian_001.hdf5 cartesian +python plot.py gradients_cartesian_0001.hdf5 cartesian python makeICs.py random ../swift -s -t 2 gradientsRandom.yml -python plot.py gradients_random_001.hdf5 random +python plot.py gradients_random_0001.hdf5 random diff --git a/examples/GreshoVortex_2D/plotSolution.py b/examples/GreshoVortex_2D/plotSolution.py index 7a86daa6a4e5e1dd80888ceac9a6eb6b08dff443..d497a6b297bf38b39cf85a9107a769c20f815b77 100644 --- a/examples/GreshoVortex_2D/plotSolution.py +++ b/examples/GreshoVortex_2D/plotSolution.py @@ -83,7 +83,7 @@ solution_s = solution_P / solution_rho**gas_gamma solution_u = solution_P /((gas_gamma - 1.)*solution_rho) # Read the simulation data -sim = h5py.File("gresho_%03d.hdf5"%snap, "r") +sim = h5py.File("gresho_%04d.hdf5"%snap, "r") boxSize = sim["/Header"].attrs["BoxSize"][0] time = sim["/Header"].attrs["Time"][0] scheme = sim["/HydroScheme"].attrs["Scheme"] diff --git a/examples/HydrostaticHalo/density_profile.py b/examples/HydrostaticHalo/density_profile.py index 5248587ec343d3c0ffe2cef0cbd8716b9a1e055c..a28b4d56a911c10afba07fcb25b377428eb4f857 100644 --- a/examples/HydrostaticHalo/density_profile.py +++ b/examples/HydrostaticHalo/density_profile.py @@ -42,7 +42,7 @@ H_0_cgs = 100. 
* h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS) #read some header/parameter information from the first snapshot -filename = "Hydrostatic_000.hdf5" +filename = "Hydrostatic_0000.hdf5" f = h5.File(filename,'r') params = f["Parameters"] unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"]) @@ -63,7 +63,7 @@ M_vir_cgs = r_vir_cgs * v_c_cgs**2 / CONST_G_CGS for i in range(n_snaps): - filename = "Hydrostatic_%03d.hdf5" %i + filename = "Hydrostatic_%04d.hdf5" %i f = h5.File(filename,'r') coords_dset = f["PartType0/Coordinates"] coords = np.array(coords_dset) diff --git a/examples/HydrostaticHalo/internal_energy_profile.py b/examples/HydrostaticHalo/internal_energy_profile.py index f1be049adb8e972f89fd9ffe86106b1b9f3b19dc..f73fe4b70718054b29a7147b4ee3fa5b13539acf 100644 --- a/examples/HydrostaticHalo/internal_energy_profile.py +++ b/examples/HydrostaticHalo/internal_energy_profile.py @@ -60,7 +60,7 @@ H_0_cgs = 100. * h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS) #read some header/parameter information from the first snapshot -filename = "Hydrostatic_000.hdf5" +filename = "Hydrostatic_0000.hdf5" f = h5.File(filename,'r') params = f["Parameters"] unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"]) @@ -79,7 +79,7 @@ M_vir_cgs = r_vir_cgs * v_c_cgs**2 / CONST_G_CGS for i in range(n_snaps): - filename = "Hydrostatic_%03d.hdf5" %i + filename = "Hydrostatic_%04d.hdf5" %i f = h5.File(filename,'r') coords_dset = f["PartType0/Coordinates"] coords = np.array(coords_dset) diff --git a/examples/HydrostaticHalo/test_energy_conservation.py b/examples/HydrostaticHalo/test_energy_conservation.py index 8368d475813d248ca93c12e46737b062752ab779..cc3e3da38d714f103b5f89c7eb713b64ddc6a8ec 100644 --- a/examples/HydrostaticHalo/test_energy_conservation.py +++ b/examples/HydrostaticHalo/test_energy_conservation.py @@ -38,7 +38,7 @@ H_0_cgs = 100. 
* h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS) #read some header/parameter information from the first snapshot -filename = "Hydrostatic_000.hdf5" +filename = "Hydrostatic_0000.hdf5" f = h5.File(filename,'r') params = f["Parameters"] unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"]) @@ -62,7 +62,7 @@ time_array_cgs = [] for i in range(n_snaps): - filename = "Hydrostatic_%03d.hdf5" %i + filename = "Hydrostatic_%04d.hdf5" %i f = h5.File(filename,'r') coords_dset = f["PartType0/Coordinates"] coords = np.array(coords_dset) diff --git a/examples/HydrostaticHalo/velocity_profile.py b/examples/HydrostaticHalo/velocity_profile.py index f8f607362846a323937a9203dab8bc228f52a149..19ae4b9c3339a0fb2f2bf73fb6e60acb6d82ba7e 100644 --- a/examples/HydrostaticHalo/velocity_profile.py +++ b/examples/HydrostaticHalo/velocity_profile.py @@ -60,7 +60,7 @@ H_0_cgs = 100. * h * KM_PER_SEC_IN_CGS / (1.0e6 * PARSEC_IN_CGS) #read some header/parameter information from the first snapshot -filename = "Hydrostatic_000.hdf5" +filename = "Hydrostatic_0000.hdf5" f = h5.File(filename,'r') params = f["Parameters"] unit_mass_cgs = float(params.attrs["InternalUnitSystem:UnitMass_in_cgs"]) @@ -79,7 +79,7 @@ M_vir_cgs = r_vir_cgs * v_c_cgs**2 / CONST_G_CGS for i in range(n_snaps): - filename = "Hydrostatic_%03d.hdf5" %i + filename = "Hydrostatic_%04d.hdf5" %i f = h5.File(filename,'r') coords_dset = f["PartType0/Coordinates"] coords = np.array(coords_dset) diff --git a/examples/IsothermalPotential/energy_plot.py b/examples/IsothermalPotential/energy_plot.py index 0afa6fa93fa2a992e6ddeab3c9d33538c0b41de3..dab30715fbdaa0393f62c764ba552bbe4106325d 100644 --- a/examples/IsothermalPotential/energy_plot.py +++ b/examples/IsothermalPotential/energy_plot.py @@ -34,7 +34,7 @@ import sys stats_filename = "./energy.txt" # First snapshot -snap_filename = "Isothermal_000.hdf5" +snap_filename = "Isothermal_0000.hdf5" f = h5.File(snap_filename,'r') # Read the units parameters from the snapshot 
@@ -70,7 +70,7 @@ Lz_snap = np.zeros(402) # Read all the particles from the snapshots for i in range(402): - snap_filename = "Isothermal_%0.3d.hdf5"%i + snap_filename = "Isothermal_%0.4d.hdf5"%i f = h5.File(snap_filename,'r') pos_x = f["PartType1/Coordinates"][:,0] diff --git a/examples/KelvinHelmholtz_2D/plotSolution.py b/examples/KelvinHelmholtz_2D/plotSolution.py index 9191f3ac7ec75c61d5fdab5d347c86222f787fab..77ab6fb244da25d13760f90653fac7eac11a0ee7 100644 --- a/examples/KelvinHelmholtz_2D/plotSolution.py +++ b/examples/KelvinHelmholtz_2D/plotSolution.py @@ -63,7 +63,7 @@ rc('font',**{'family':'sans-serif','sans-serif':['Times']}) snap = int(sys.argv[1]) # Read the simulation data -sim = h5py.File("kelvinHelmholtz_%03d.hdf5"%snap, "r") +sim = h5py.File("kelvinHelmholtz_%04d.hdf5"%snap, "r") boxSize = sim["/Header"].attrs["BoxSize"][0] time = sim["/Header"].attrs["Time"][0] scheme = sim["/HydroScheme"].attrs["Scheme"] diff --git a/examples/Makefile.am b/examples/Makefile.am index 1dd240fb6015fe5fdd2465cccb1bb221706efeed..5501601f95bde15484142e994dbf3d6fa475da98 100644 --- a/examples/Makefile.am +++ b/examples/Makefile.am @@ -60,9 +60,11 @@ EXTRA_DIST = BigCosmoVolume/makeIC.py \ BigPerturbedBox/makeIC_fcc.py \ CosmoVolume/cosmoVolume.yml CosmoVolume/getIC.sh CosmoVolume/run.sh \ CoolingBox/coolingBox.yml CoolingBox/energy_plot.py CoolingBox/makeIC.py CoolingBox/run.sh \ + EAGLE_6/eagle_6.yml EAGLE_6/getIC.sh EAGLE_6/README EAGLE_6/run.sh \ EAGLE_12/eagle_12.yml EAGLE_12/getIC.sh EAGLE_12/README EAGLE_12/run.sh \ EAGLE_25/eagle_25.yml EAGLE_25/getIC.sh EAGLE_25/README EAGLE_25/run.sh \ EAGLE_50/eagle_50.yml EAGLE_50/getIC.sh EAGLE_50/README EAGLE_50/run.sh \ + EAGLE_100/eagle_100.yml EAGLE_100/getIC.sh EAGLE_100/README EAGLE_100/run.sh \ ExternalPointMass/externalPointMass.yml ExternalPointMass/makeIC.py ExternalPointMass/run.sh ExternalPointMass/energy_plot.py \ GreshoVortex_2D/getGlass.sh GreshoVortex_2D/gresho.yml GreshoVortex_2D/makeIC.py 
GreshoVortex_2D/plotSolution.py GreshoVortex_2D/run.sh \ HydrostaticHalo/README HydrostaticHalo/hydrostatic.yml HydrostaticHalo/makeIC.py HydrostaticHalo/run.sh \ @@ -70,11 +72,17 @@ EXTRA_DIST = BigCosmoVolume/makeIC.py \ IsothermalPotential/README IsothermalPotential/run.sh IsothermalPotential/energy_plot.py IsothermalPotential/isothermal.yml IsothermalPotential/makeIC.py \ KelvinHelmholtz_2D/kelvinHelmholtz.yml KelvinHelmholtz_2D/makeIC.py KelvinHelmholtz_2D/plotSolution.py KelvinHelmholtz_2D/run.sh \ MultiTypes/makeIC.py MultiTypes/multiTypes.yml MultiTypes/run.sh \ + Noh_1D/makeIC.py Noh_1D/noh.yml Noh_1D/plotSolution.py Noh_1D/run.sh \ + Noh_2D/makeIC.py Noh_2D/noh.yml Noh_2D/plotSolution.py Noh_2D/run.sh Noh_2D/getGlass.sh \ + Noh_3D/makeIC.py Noh_3D/noh.yml Noh_3D/plotSolution.py Noh_3D/run.sh Noh_3D/getGlass.sh \ PerturbedBox_2D/makeIC.py PerturbedBox_2D/perturbedPlane.yml \ PerturbedBox_3D/makeIC.py PerturbedBox_3D/perturbedBox.yml PerturbedBox_3D/run.sh \ SedovBlast_1D/makeIC.py SedovBlast_1D/plotSolution.py SedovBlast_1D/run.sh SedovBlast_1D/sedov.yml \ SedovBlast_2D/getGlass.sh SedovBlast_2D/makeIC.py SedovBlast_2D/plotSolution.py SedovBlast_2D/run.sh SedovBlast_2D/sedov.yml \ SedovBlast_3D/getGlass.sh SedovBlast_3D/makeIC.py SedovBlast_3D/plotSolution.py SedovBlast_3D/run.sh SedovBlast_3D/sedov.yml \ + SineWavePotential_1D/makeIC.py SineWavePotential_1D/plotSolution.py SineWavePotential_1D/run.sh SineWavePotential_1D/sineWavePotential.yml \ + SineWavePotential_2D/makeIC.py SineWavePotential_2D/plotSolution.py SineWavePotential_2D/run.sh SineWavePotential_2D/sineWavePotential.yml \ + SineWavePotential_3D/makeIC.py SineWavePotential_3D/plotSolution.py SineWavePotential_3D/run.sh SineWavePotential_3D/sineWavePotential.yml \ SodShock_1D/makeIC.py SodShock_1D/plotSolution.py SodShock_1D/run.sh SodShock_1D/sodShock.yml \ SodShock_2D/getGlass.sh SodShock_2D/makeIC.py SodShock_2D/plotSolution.py SodShock_2D/run.sh SodShock_2D/sodShock.yml \ 
SodShock_3D/getGlass.sh SodShock_3D/makeIC.py SodShock_3D/plotSolution.py SodShock_3D/run.sh SodShock_3D/sodShock.yml \ @@ -88,8 +96,15 @@ EXTRA_DIST += parameter_example.yml # Scripts to plot task graphs EXTRA_DIST += plot_tasks_MPI.py plot_tasks.py \ + analyse_tasks_MPI.py analyse_tasks.py \ process_plot_tasks_MPI process_plot_tasks +# Scripts to plot threadpool 'task' graphs +EXTRA_DIST += analyse_threadpool_tasks.py \ + plot_threadpool.py \ + process_plot_threadpool + # Script for scaling plot -EXTRA_DIST += plot_scaling_results.py +EXTRA_DIST += plot_scaling_results.py \ + plot_scaling_results_breakdown.py diff --git a/examples/Noh_1D/plotSolution.py b/examples/Noh_1D/plotSolution.py index f4916af6e6066d21f76c28b5acef41e1907a83fd..25b9b2f16b24cba5def592a5cf00dbae82195ef7 100644 --- a/examples/Noh_1D/plotSolution.py +++ b/examples/Noh_1D/plotSolution.py @@ -58,7 +58,7 @@ snap = int(sys.argv[1]) # Read the simulation data -sim = h5py.File("noh_%03d.hdf5"%snap, "r") +sim = h5py.File("noh_%04d.hdf5"%snap, "r") boxSize = sim["/Header"].attrs["BoxSize"][0] time = sim["/Header"].attrs["Time"][0] scheme = sim["/HydroScheme"].attrs["Scheme"] diff --git a/examples/Noh_2D/plotSolution.py b/examples/Noh_2D/plotSolution.py index a01a712efd412488aea09c3f3c4e8d68323fc916..775ddf4e8a7954c14034ad51a6b66622c41a6996 100644 --- a/examples/Noh_2D/plotSolution.py +++ b/examples/Noh_2D/plotSolution.py @@ -58,7 +58,7 @@ rc('font',**{'family':'sans-serif','sans-serif':['Times']}) snap = int(sys.argv[1]) # Read the simulation data -sim = h5py.File("noh_%03d.hdf5"%snap, "r") +sim = h5py.File("noh_%04d.hdf5"%snap, "r") boxSize = sim["/Header"].attrs["BoxSize"][0] time = sim["/Header"].attrs["Time"][0] scheme = sim["/HydroScheme"].attrs["Scheme"] diff --git a/examples/Noh_3D/plotSolution.py b/examples/Noh_3D/plotSolution.py index 1742e13a5daeff392690a9804fb2831ef4304963..386b9f728b5e8d8e38fb7ec9aeaa336d194e35dd 100644 --- a/examples/Noh_3D/plotSolution.py +++ 
b/examples/Noh_3D/plotSolution.py @@ -59,7 +59,7 @@ snap = int(sys.argv[1]) # Read the simulation data -sim = h5py.File("noh_%03d.hdf5"%snap, "r") +sim = h5py.File("noh_%04d.hdf5"%snap, "r") boxSize = sim["/Header"].attrs["BoxSize"][0] time = sim["/Header"].attrs["Time"][0] scheme = sim["/HydroScheme"].attrs["Scheme"] diff --git a/examples/PerturbedBox_2D/perturbedPlane.yml b/examples/PerturbedBox_2D/perturbedPlane.yml index b92e29f620edc6f72399111fbe73ba6bd1485e92..a0c6b6d9dbc7a677002dbce5abc6e5d268b56e97 100644 --- a/examples/PerturbedBox_2D/perturbedPlane.yml +++ b/examples/PerturbedBox_2D/perturbedPlane.yml @@ -9,7 +9,7 @@ InternalUnitSystem: # Parameters governing the time integration TimeIntegration: time_begin: 0. # The starting time of the simulation (in internal units). - time_end: 10. # The end time of the simulation (in internal units). + time_end: 1000. # The end time of the simulation (in internal units). dt_min: 1e-6 # The minimal time-step size of the simulation (in internal units). dt_max: 1e-2 # The maximal time-step size of the simulation (in internal units). @@ -21,12 +21,11 @@ Snapshots: # Parameters governing the conserved quantities statistics Statistics: - delta_time: 1e-3 # Time between statistics output + delta_time: 1. # Time between statistics output # Parameters for the hydrodynamics scheme SPH: resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). - delta_neighbours: 0.1 # The tolerance for the targetted number of neighbours. CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. 
# Parameters related to the initial conditions diff --git a/examples/PerturbedBox_3D/perturbedBox.yml b/examples/PerturbedBox_3D/perturbedBox.yml index 71c8dece4df5505eb44511ee92291feedd7ffab1..3148510979d0e349c0d6242bf11e1a0db94f9e1f 100644 --- a/examples/PerturbedBox_3D/perturbedBox.yml +++ b/examples/PerturbedBox_3D/perturbedBox.yml @@ -9,9 +9,9 @@ InternalUnitSystem: # Parameters governing the time integration TimeIntegration: time_begin: 0. # The starting time of the simulation (in internal units). - time_end: 1. # The end time of the simulation (in internal units). + time_end: 1000 # The end time of the simulation (in internal units). dt_min: 1e-6 # The minimal time-step size of the simulation (in internal units). - dt_max: 1e-3 # The maximal time-step size of the simulation (in internal units). + dt_max: 1e-2 # The maximal time-step size of the simulation (in internal units). # Parameters governing the snapshots Snapshots: @@ -21,12 +21,11 @@ Snapshots: # Parameters governing the conserved quantities statistics Statistics: - delta_time: 1e-3 # Time between statistics output + delta_time: 1. # Time between statistics output # Parameters for the hydrodynamics scheme SPH: resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). - delta_neighbours: 0.1 # The tolerance for the targetted number of neighbours. CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. 
# Parameters related to the initial conditions diff --git a/examples/SedovBlast_1D/plotSolution.py b/examples/SedovBlast_1D/plotSolution.py index a62775b012edda3217558031c266ed6e9b48f423..2738b7c8f301a7351d962ac0f29faccd0a770fc9 100644 --- a/examples/SedovBlast_1D/plotSolution.py +++ b/examples/SedovBlast_1D/plotSolution.py @@ -64,7 +64,7 @@ snap = int(sys.argv[1]) # Read the simulation data -sim = h5py.File("sedov_%03d.hdf5"%snap, "r") +sim = h5py.File("sedov_%04d.hdf5"%snap, "r") boxSize = sim["/Header"].attrs["BoxSize"][0] time = sim["/Header"].attrs["Time"][0] scheme = sim["/HydroScheme"].attrs["Scheme"] diff --git a/examples/SedovBlast_2D/plotSolution.py b/examples/SedovBlast_2D/plotSolution.py index d8c0c9791d1834cc2a5cf0103b46a49e20d2e8a3..2b5de6f32b8673bbc825fbb5236f4e2ab3b4f408 100644 --- a/examples/SedovBlast_2D/plotSolution.py +++ b/examples/SedovBlast_2D/plotSolution.py @@ -65,7 +65,7 @@ snap = int(sys.argv[1]) # Read the simulation data -sim = h5py.File("sedov_%03d.hdf5"%snap, "r") +sim = h5py.File("sedov_%04d.hdf5"%snap, "r") boxSize = sim["/Header"].attrs["BoxSize"][0] time = sim["/Header"].attrs["Time"][0] scheme = sim["/HydroScheme"].attrs["Scheme"] diff --git a/examples/SedovBlast_3D/plotSolution.py b/examples/SedovBlast_3D/plotSolution.py index 6e90a9a43524b3cdb279054764b71fd1b546b366..ad34695d36f1bf8e8985b883200f17d6e38a70c9 100644 --- a/examples/SedovBlast_3D/plotSolution.py +++ b/examples/SedovBlast_3D/plotSolution.py @@ -65,7 +65,7 @@ snap = int(sys.argv[1]) # Read the simulation data -sim = h5py.File("sedov_%03d.hdf5"%snap, "r") +sim = h5py.File("sedov_%04d.hdf5"%snap, "r") boxSize = sim["/Header"].attrs["BoxSize"][0] time = sim["/Header"].attrs["Time"][0] scheme = sim["/HydroScheme"].attrs["Scheme"] diff --git a/examples/SodShock_1D/plotSolution.py b/examples/SodShock_1D/plotSolution.py index 0a7720f4a6cf26e5a8acda1101bd438850d8d553..e001a8d87a03cb246be63ab10d245f95eb1a7ce7 100644 --- a/examples/SodShock_1D/plotSolution.py +++ 
b/examples/SodShock_1D/plotSolution.py @@ -67,7 +67,7 @@ snap = int(sys.argv[1]) # Read the simulation data -sim = h5py.File("sodShock_%03d.hdf5"%snap, "r") +sim = h5py.File("sodShock_%04d.hdf5"%snap, "r") boxSize = sim["/Header"].attrs["BoxSize"][0] time = sim["/Header"].attrs["Time"][0] scheme = sim["/HydroScheme"].attrs["Scheme"] diff --git a/examples/SodShock_2D/plotSolution.py b/examples/SodShock_2D/plotSolution.py index b4a203d93518d98ee87282f4ea46d045c4c3b38a..19cbe0ffb766845c051ffb6cea81bd918d890e36 100644 --- a/examples/SodShock_2D/plotSolution.py +++ b/examples/SodShock_2D/plotSolution.py @@ -68,7 +68,7 @@ snap = int(sys.argv[1]) # Read the simulation data -sim = h5py.File("sodShock_%03d.hdf5"%snap, "r") +sim = h5py.File("sodShock_%04d.hdf5"%snap, "r") boxSize = sim["/Header"].attrs["BoxSize"][0] time = sim["/Header"].attrs["Time"][0] scheme = sim["/HydroScheme"].attrs["Scheme"] diff --git a/examples/SodShock_3D/plotSolution.py b/examples/SodShock_3D/plotSolution.py index 3d9616af55a204db4be9df2e42b355e266944153..6da7193bcd3cdfb7c22a3fc6a14f91aea5cff5f7 100644 --- a/examples/SodShock_3D/plotSolution.py +++ b/examples/SodShock_3D/plotSolution.py @@ -68,7 +68,7 @@ snap = int(sys.argv[1]) # Read the simulation data -sim = h5py.File("sodShock_%03d.hdf5"%snap, "r") +sim = h5py.File("sodShock_%04d.hdf5"%snap, "r") boxSize = sim["/Header"].attrs["BoxSize"][0] time = sim["/Header"].attrs["Time"][0] scheme = sim["/HydroScheme"].attrs["Scheme"] diff --git a/examples/SquareTest_2D/plotSolution.py b/examples/SquareTest_2D/plotSolution.py index b9efe76de1e6c5993fa5333be76a13ba95bdab0f..f182b4d7437348d29065b51df79e5334aa26f9a4 100644 --- a/examples/SquareTest_2D/plotSolution.py +++ b/examples/SquareTest_2D/plotSolution.py @@ -63,7 +63,7 @@ rc('font',**{'family':'sans-serif','sans-serif':['Times']}) snap = int(sys.argv[1]) # Read the simulation data -sim = h5py.File("square_%03d.hdf5"%snap, "r") +sim = h5py.File("square_%04d.hdf5"%snap, "r") boxSize = 
sim["/Header"].attrs["BoxSize"][0] time = sim["/Header"].attrs["Time"][0] scheme = sim["/HydroScheme"].attrs["Scheme"] diff --git a/examples/UniformDMBox/uniformBox.yml b/examples/UniformDMBox/uniformBox.yml index cffd442a9a5b16d8e042e41caf9991fcf0e1202e..e59d677b308ca70f212f74c7e4d8b79f015c77a9 100644 --- a/examples/UniformDMBox/uniformBox.yml +++ b/examples/UniformDMBox/uniformBox.yml @@ -28,7 +28,7 @@ Gravity: eta: 0.025 # Constant dimensionless multiplier for time integration. theta: 0.7 # Opening angle (Multipole acceptance criterion) epsilon: 0.00001 # Softening length (in internal units). - + # Parameters governing the conserved quantities statistics Statistics: delta_time: 1e-2 # Time between statistics output diff --git a/examples/analyse_tasks.py b/examples/analyse_tasks.py index 04cd59feedba7ee41621ac0891d544c4aa294543..970c4a91042b8c61185727f27ef898f93af81fdc 100755 --- a/examples/analyse_tasks.py +++ b/examples/analyse_tasks.py @@ -50,12 +50,17 @@ infile = args.input TASKTYPES = ["none", "sort", "self", "pair", "sub_self", "sub_pair", "init_grav", "ghost", "extra_ghost", "drift_part", "drift_gpart", "kick1", "kick2", "timestep", "send", "recv", - "grav_top_level", "grav_long_range", "grav_mm", "grav_down", - "cooling", "sourceterms", "count"] + "grav_top_level", "grav_long_range", "grav_ghost", "grav_mm", + "grav_down", "cooling", "sourceterms", "count"] SUBTYPES = ["none", "density", "gradient", "force", "grav", "external_grav", "tend", "xv", "rho", "gpart", "multipole", "spart", "count"] +SIDS = ["(-1,-1,-1)", "(-1,-1, 0)", "(-1,-1, 1)", "(-1, 0,-1)", + "(-1, 0, 0)", "(-1, 0, 1)", "(-1, 1,-1)", "(-1, 1, 0)", + "(-1, 1, 1)", "( 0,-1,-1)", "( 0,-1, 0)", "( 0,-1, 1)", + "( 0, 0,-1)"] + # Read input. 
data = pl.loadtxt( infile ) @@ -66,11 +71,17 @@ print "# Maximum thread id:", maxthread full_step = data[0,:] tic_step = int(full_step[4]) toc_step = int(full_step[5]) +updates = int(full_step[6]) +g_updates = int(full_step[7]) +s_updates = int(full_step[8]) CPU_CLOCK = float(full_step[-1]) / 1000.0 data = data[1:,:] if args.verbose: - print "CPU frequency:", CPU_CLOCK * 1000.0 - + print "# CPU frequency:", CPU_CLOCK * 1000.0 +print "# updates:", updates +print "# g_updates:", g_updates +print "# s_updates:", s_updates + # Avoid start and end times of zero. data = data[data[:,4] != 0] data = data[data[:,5] != 0] @@ -78,6 +89,7 @@ data = data[data[:,5] != 0] # Calculate the time range. total_t = (toc_step - tic_step)/ CPU_CLOCK print "# Data range: ", total_t, "ms" +print # Correct times to relative values. start_t = float(tic_step) @@ -90,15 +102,16 @@ for i in range(maxthread): tasks[i] = [] # Gather into by thread data. -num_lines = pl.size(data) / 10 +num_lines = pl.size(data) / pl.size(full_step) for line in range(num_lines): thread = int(data[line,0]) tic = int(data[line,4]) / CPU_CLOCK toc = int(data[line,5]) / CPU_CLOCK tasktype = int(data[line,1]) subtype = int(data[line,2]) + sid = int(data[line, -1]) - tasks[thread].append([tic,toc,tasktype,subtype]) + tasks[thread].append([tic,toc,tasktype,subtype, sid]) # Sort by tic and gather used thread ids. threadids = [] @@ -109,10 +122,12 @@ for i in range(maxthread): # Times per task. 
print "# Task times:" -print "# {0:<16s}: {1:>7s} {2:>9s} {3:>9s} {4:>9s} {5:>9s} {6:>9s}"\ +print "# -----------" +print "# {0:<17s}: {1:>7s} {2:>9s} {3:>9s} {4:>9s} {5:>9s} {6:>9s}"\ .format("type/subtype", "count","minimum", "maximum", "sum", "mean", "percent") alltasktimes = {} +sidtimes = {} for i in threadids: tasktimes = {} for task in tasks[i]: @@ -126,12 +141,19 @@ for i in threadids: alltasktimes[key] = [] alltasktimes[key].append(dt) + my_sid = task[4] + if my_sid > -1: + if not my_sid in sidtimes: + sidtimes[my_sid] = [] + sidtimes[my_sid].append(dt) + + print "# Thread : ", i for key in sorted(tasktimes.keys()): taskmin = min(tasktimes[key]) taskmax = max(tasktimes[key]) tasksum = sum(tasktimes[key]) - print "{0:18s}: {1:7d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\ + print "{0:19s}: {1:7d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\ .format(key, len(tasktimes[key]), taskmin, taskmax, tasksum, tasksum / len(tasktimes[key]), tasksum / total_t * 100.0) print @@ -141,14 +163,118 @@ for key in sorted(alltasktimes.keys()): taskmin = min(alltasktimes[key]) taskmax = max(alltasktimes[key]) tasksum = sum(alltasktimes[key]) - print "{0:18s}: {1:7d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\ + print "{0:19s}: {1:7d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\ .format(key, len(alltasktimes[key]), taskmin, taskmax, tasksum, tasksum / len(alltasktimes[key]), tasksum / (len(threadids) * total_t) * 100.0) print +# For pairs, show stuf sorted by SID +print "# By SID (all threads): " +print "# {0:<17s}: {1:>7s} {2:>9s} {3:>9s} {4:>9s} {5:>9s} {6:>9s}"\ + .format("Pair/Sub-pair SID", "count","minimum", "maximum", + "sum", "mean", "percent") + +for sid in range(0,13): + if sid in sidtimes: + sidmin = min(sidtimes[sid]) + sidmax = max(sidtimes[sid]) + sidsum = sum(sidtimes[sid]) + sidcount = len(sidtimes[sid]) + sidmean = sidsum / sidcount + else: + sidmin = 0. + sidmax = 0. + sidsum = 0. + sidcount = 0 + sidmean = 0. 
+ print "{0:3d} {1:15s}: {2:7d} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.4f} {7:9.2f}"\ + .format(sid, SIDS[sid], sidcount, sidmin, sidmax, sidsum, + sidmean, sidsum / (len(threadids) * total_t) * 100.0) +print + # Dead times. -print "# Deadtimes:" +print "# Times not in tasks (deadtimes)" +print "# ------------------------------" +print "# Time before first task:" +print "# no. : {0:>9s} {1:>9s}".format("value", "percent") +predeadtimes = [] +for i in threadids: + predeadtime = tasks[i][0][0] + print "thread {0:2d}: {1:9.4f} {2:9.4f}"\ + .format(i, predeadtime, predeadtime / total_t * 100.0) + predeadtimes.append(predeadtime) + +predeadmin = min(predeadtimes) +predeadmax = max(predeadtimes) +predeadsum = sum(predeadtimes) +print "# : {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\ + .format("count", "minimum", "maximum", "sum", "mean", "percent") +print "all : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\ + .format(len(predeadtimes), predeadmin, predeadmax, predeadsum, + predeadsum / len(predeadtimes), + predeadsum / (len(threadids) * total_t ) * 100.0) +print + +print "# Time after last task:" +print "# no. : {0:>9s} {1:>9s}".format("value", "percent") +postdeadtimes = [] +for i in threadids: + postdeadtime = total_t - tasks[i][-1][1] + print "thread {0:2d}: {1:9.4f} {2:9.4f}"\ + .format(i, postdeadtime, postdeadtime / total_t * 100.0) + postdeadtimes.append(postdeadtime) + +postdeadmin = min(postdeadtimes) +postdeadmax = max(postdeadtimes) +postdeadsum = sum(postdeadtimes) +print "# : {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\ + .format("count", "minimum", "maximum", "sum", "mean", "percent") +print "all : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\ + .format(len(postdeadtimes), postdeadmin, postdeadmax, postdeadsum, + postdeadsum / len(postdeadtimes), + postdeadsum / (len(threadids) * total_t ) * 100.0) +print + +# Time in engine, i.e. from first to last tasks. +print "# Time between tasks (engine deadtime):" +print "# no. 
: {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\ + .format("count", "minimum", "maximum", "sum", "mean", "percent") +enginedeadtimes = [] +for i in threadids: + deadtimes = [] + last = tasks[i][0][0] + for task in tasks[i]: + dt = task[0] - last + deadtimes.append(dt) + last = task[1] + + # Drop first value, last value already gone. + if len(deadtimes) > 1: + deadtimes = deadtimes[1:] + else: + # Only one task, so no deadtime by definition. + deadtimes = [0.0] + + deadmin = min(deadtimes) + deadmax = max(deadtimes) + deadsum = sum(deadtimes) + print "thread {0:2d}: {1:9d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\ + .format(i, len(deadtimes), deadmin, deadmax, deadsum, + deadsum / len(deadtimes), deadsum / total_t * 100.0) + enginedeadtimes.extend(deadtimes) + +deadmin = min(enginedeadtimes) +deadmax = max(enginedeadtimes) +deadsum = sum(enginedeadtimes) +print "all : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\ + .format(len(enginedeadtimes), deadmin, deadmax, deadsum, + deadsum / len(enginedeadtimes), + deadsum / (len(threadids) * total_t ) * 100.0) +print + +# All times in step. +print "# All deadtimes:" print "# no. 
: {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\ .format("count", "minimum", "maximum", "sum", "mean", "percent") alldeadtimes = [] @@ -179,5 +305,4 @@ print "all : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\ deadsum / (len(threadids) * total_t ) * 100.0) print - sys.exit(0) diff --git a/examples/analyse_tasks_MPI.py b/examples/analyse_tasks_MPI.py index 9feffaf67ec393257d75428e310a2e8b807df39a..b78d73e879046b05b8a089f97c4c9c00a5f7bb79 100755 --- a/examples/analyse_tasks_MPI.py +++ b/examples/analyse_tasks_MPI.py @@ -42,6 +42,9 @@ parser.add_argument("input", help="Thread data file (-y output)") parser.add_argument("-v", "--verbose", dest="verbose", help="Verbose output (default: False)", default=False, action="store_true") +parser.add_argument("-r", "--rank", dest="rank", + help="Rank to process (default: all)", + default="all", action="store") args = parser.parse_args() infile = args.input @@ -56,17 +59,36 @@ TASKTYPES = ["none", "sort", "self", "pair", "sub_self", "sub_pair", SUBTYPES = ["none", "density", "gradient", "force", "grav", "external_grav", "tend", "xv", "rho", "gpart", "multipole", "spart", "count"] +SIDS = ["(-1,-1,-1)", "(-1,-1, 0)", "(-1,-1, 1)", "(-1, 0,-1)", + "(-1, 0, 0)", "(-1, 0, 1)", "(-1, 1,-1)", "(-1, 1, 0)", + "(-1, 1, 1)", "( 0,-1,-1)", "( 0,-1, 0)", "( 0,-1, 1)", + "( 0, 0,-1)"] + # Read input. data = pl.loadtxt( infile ) # Get the CPU clock to convert ticks into milliseconds. 
full_step = data[0,:] +updates = int(full_step[7]) +g_updates = int(full_step[8]) +s_updates = int(full_step[9]) CPU_CLOCK = float(full_step[-1]) / 1000.0 if args.verbose: print "# CPU frequency:", CPU_CLOCK * 1000.0 +print "# updates:", updates +print "# g_updates:", g_updates +print "# s_updates:", s_updates nranks = int(max(data[:,0])) + 1 print "# Number of ranks:", nranks +if args.rank == "all": + ranks = range(nranks) +else: + ranks = [int(args.rank)] + if ranks[0] >= nranks: + print "Error: maximum rank is " + str(nranks - 1) + sys.exit(1) + maxthread = int(max(data[:,1])) + 1 print "# Maximum thread id:", maxthread @@ -74,8 +96,8 @@ print "# Maximum thread id:", maxthread sdata = data[data[:,5] != 0] sdata = data[data[:,6] != 0] -# Now we process all the ranks. -for rank in range(nranks): +# Now we process the required ranks. +for rank in ranks: print "# Rank", rank data = sdata[sdata[:,0] == rank] @@ -92,6 +114,7 @@ for rank in range(nranks): # Calculate the time range. total_t = (toc_step - tic_step)/ CPU_CLOCK print "# Data range: ", total_t, "ms" + print # Correct times to relative values. start_t = float(tic_step) @@ -105,15 +128,16 @@ for rank in range(nranks): tasks[i] = [] # Gather into by thread data. - num_lines = pl.size(data) / 12 + num_lines = pl.shape(data)[0] for line in range(num_lines): thread = int(data[line,1]) tic = int(data[line,5]) / CPU_CLOCK toc = int(data[line,6]) / CPU_CLOCK tasktype = int(data[line,2]) subtype = int(data[line,3]) + sid = int(data[line, -1]) - tasks[thread].append([tic,toc,tasktype,subtype]) + tasks[thread].append([tic,toc,tasktype,subtype, sid]) # Sort by tic and gather used threads. threadids = [] @@ -123,10 +147,13 @@ for rank in range(nranks): # Times per task. 
print "# Task times:" - print "# {0:<16s}: {1:>7s} {2:>9s} {3:>9s} {4:>9s} {5:>9s} {6:>9s}"\ + print "# -----------" + print "# {0:<17s}: {1:>7s} {2:>9s} {3:>9s} {4:>9s} {5:>9s} {6:>9s}"\ .format("type/subtype", "count","minimum", "maximum", "sum", "mean", "percent") + alltasktimes = {} + sidtimes = {} for i in threadids: tasktimes = {} for task in tasks[i]: @@ -139,13 +166,19 @@ for rank in range(nranks): if not key in alltasktimes: alltasktimes[key] = [] alltasktimes[key].append(dt) + + my_sid = task[4] + if my_sid > -1: + if not my_sid in sidtimes: + sidtimes[my_sid] = [] + sidtimes[my_sid].append(dt) print "# Thread : ", i for key in sorted(tasktimes.keys()): taskmin = min(tasktimes[key]) taskmax = max(tasktimes[key]) tasksum = sum(tasktimes[key]) - print "{0:18s}: {1:7d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\ + print "{0:19s}: {1:7d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\ .format(key, len(tasktimes[key]), taskmin, taskmax, tasksum, tasksum / len(tasktimes[key]), tasksum / total_t * 100.0) print @@ -161,8 +194,121 @@ for rank in range(nranks): tasksum / (len(threadids) * total_t) * 100.0) print + # For pairs, show stuf sorted by SID + print "# By SID (all threads): " + print "# {0:<17s}: {1:>7s} {2:>9s} {3:>9s} {4:>9s} {5:>9s} {6:>9s}"\ + .format("Pair/Sub-pair SID", "count","minimum", "maximum", + "sum", "mean", "percent") + + for sid in range(0,13): + if sid in sidtimes: + sidmin = min(sidtimes[sid]) + sidmax = max(sidtimes[sid]) + sidsum = sum(sidtimes[sid]) + sidcount = len(sidtimes[sid]) + sidmean = sidsum / sidcount + else: + sidmin = 0. + sidmax = 0. + sidsum = 0. + sidcount = 0 + sidmean = 0. + print "{0:3d} {1:15s}: {2:7d} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.4f} {7:9.2f}"\ + .format(sid, SIDS[sid], sidcount, sidmin, sidmax, sidsum, + sidmean, sidsum / (len(threadids) * total_t) * 100.0) + print + # Dead times. 
- print "# Deadtimes:" + print "# Times not in tasks (deadtimes)" + print "# ------------------------------" + print "# Time before first task:" + print "# no. : {0:>9s} {1:>9s}".format("value", "percent") + predeadtimes = [] + for i in threadids: + if len(tasks[i]) > 0: + predeadtime = tasks[i][0][0] + print "thread {0:2d}: {1:9.4f} {2:9.4f}"\ + .format(i, predeadtime, predeadtime / total_t * 100.0) + predeadtimes.append(predeadtime) + else: + predeadtimes.append(0.0) + + predeadmin = min(predeadtimes) + predeadmax = max(predeadtimes) + predeadsum = sum(predeadtimes) + print "# : {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\ + .format("count", "minimum", "maximum", "sum", "mean", "percent") + print "all : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\ + .format(len(predeadtimes), predeadmin, predeadmax, predeadsum, + predeadsum / len(predeadtimes), + predeadsum / (len(threadids) * total_t ) * 100.0) + print + + print "# Time after last task:" + print "# no. : {0:>9s} {1:>9s}".format("value", "percent") + postdeadtimes = [] + for i in threadids: + if len(tasks[i]) > 0: + postdeadtime = total_t - tasks[i][-1][1] + print "thread {0:2d}: {1:9.4f} {2:9.4f}"\ + .format(i, postdeadtime, postdeadtime / total_t * 100.0) + postdeadtimes.append(postdeadtime) + else: + postdeadtimes.append(0.0) + + postdeadmin = min(postdeadtimes) + postdeadmax = max(postdeadtimes) + postdeadsum = sum(postdeadtimes) + print "# : {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\ + .format("count", "minimum", "maximum", "sum", "mean", "percent") + print "all : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\ + .format(len(postdeadtimes), postdeadmin, postdeadmax, postdeadsum, + postdeadsum / len(postdeadtimes), + postdeadsum / (len(threadids) * total_t ) * 100.0) + print + + # Time in engine, i.e. from first to last tasks. + print "# Time between tasks (engine deadtime):" + print "# no. 
: {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\ + .format("count", "minimum", "maximum", "sum", "mean", "percent") + enginedeadtimes = [] + for i in threadids: + deadtimes = [] + if len(tasks[i]) > 0: + last = tasks[i][0][0] + else: + last = 0.0 + for task in tasks[i]: + dt = task[0] - last + deadtimes.append(dt) + last = task[1] + + # Drop first value, last value already gone. + if len(deadtimes) > 1: + deadtimes = deadtimes[1:] + else: + # Only one or fewer tasks, so no deadtime by definition. + deadtimes = [0.0] + + deadmin = min(deadtimes) + deadmax = max(deadtimes) + deadsum = sum(deadtimes) + print "thread {0:2d}: {1:9d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\ + .format(i, len(deadtimes), deadmin, deadmax, deadsum, + deadsum / len(deadtimes), deadsum / total_t * 100.0) + enginedeadtimes.extend(deadtimes) + + deadmin = min(enginedeadtimes) + deadmax = max(enginedeadtimes) + deadsum = sum(enginedeadtimes) + print "all : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\ + .format(len(enginedeadtimes), deadmin, deadmax, deadsum, + deadsum / len(enginedeadtimes), + deadsum / (len(threadids) * total_t ) * 100.0) + print + + # All times in step. + print "# All deadtimes:" print "# no. 
: {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\ .format("count", "minimum", "maximum", "sum", "mean", "percent") alldeadtimes = [] @@ -181,7 +327,7 @@ for rank in range(nranks): deadsum = sum(deadtimes) print "thread {0:2d}: {1:9d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\ .format(i, len(deadtimes), deadmin, deadmax, deadsum, - deadsum / len(deadtimes), deadsum / total_t * 100.0) + deadsum / len(deadtimes), deadsum / total_t * 100.0) alldeadtimes.extend(deadtimes) deadmin = min(alldeadtimes) @@ -190,8 +336,7 @@ for rank in range(nranks): print "all : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\ .format(len(alldeadtimes), deadmin, deadmax, deadsum, deadsum / len(alldeadtimes), - deadsum / (len(threadids) * total_t ) * 100.0) + deadsum / (len(threadids) * total_t ) * 100.0) print - sys.exit(0) diff --git a/examples/analyse_threadpool_tasks.py b/examples/analyse_threadpool_tasks.py new file mode 100755 index 0000000000000000000000000000000000000000..609af363b4110e010d6714bef6862d40e5acb278 --- /dev/null +++ b/examples/analyse_threadpool_tasks.py @@ -0,0 +1,273 @@ +#!/usr/bin/env python +""" +Usage: + analsyse_threadpool_tasks.py [options] input.dat + +where input.dat is a threadpool dump for a step. Use the '-Y interval' flag +of the swift command to create these. + +The output is an analysis of the threadpool task timings, including deadtime +per thread and step, total amount of time spent for each task type, for the +whole step and per thread and the minimum and maximum times spent per task +type. + +This file is part of SWIFT. +Copyright (c) 2017 Peter W. Draper (p.w.draper@durham.ac.uk) + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published +by the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. 
+ +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. +""" + +import matplotlib +matplotlib.use("Agg") +import matplotlib.collections as collections +import matplotlib.ticker as plticker +import pylab as pl +import sys +import argparse + +# Handle the command line. +parser = argparse.ArgumentParser(description="Analyse task dumps") + +parser.add_argument("input", help="Threadpool data file (-y output)") +parser.add_argument("-v", "--verbose", dest="verbose", + help="Verbose output (default: False)", + default=False, action="store_true") + +args = parser.parse_args() +infile = args.input + +# Read header. First two lines. +with open(infile) as infid: + head = [next(infid) for x in xrange(2)] +header = head[1][2:].strip() +header = eval(header) +nthread = int(header['num_threads']) + 1 +CPU_CLOCK = float(header['cpufreq']) / 1000.0 +print "Number of threads: ", nthread - 1 +if args.verbose: + print "CPU frequency:", CPU_CLOCK * 1000.0 + +# Read input. +data = pl.genfromtxt(infile, dtype=None, delimiter=" ") + +# Mixed types, so need to separate. +tics = [] +tocs = [] +funcs = [] +threads = [] +chunks = [] +for i in data: + if i[0] != "#": + funcs.append(i[0].replace("_mapper", "")) + if i[1] < 0: + threads.append(nthread-1) + else: + threads.append(i[1]) + chunks.append(i[2]) + tics.append(i[3]) + tocs.append(i[4]) +tics = pl.array(tics) +tocs = pl.array(tocs) +funcs = pl.array(funcs) +threads = pl.array(threads) +chunks = pl.array(chunks) + +# Recover the start and end time +tic_step = min(tics) +toc_step = max(tocs) + +# Calculate the time range. 
+total_t = (toc_step - tic_step)/ CPU_CLOCK +print "# Data range: ", total_t, "ms" +print + +# Correct times to relative millisecs. +start_t = float(tic_step) +tics = (tics - start_t) / CPU_CLOCK +tocs = (tocs - start_t) / CPU_CLOCK + +tasks = {} +tasks[-1] = [] +for i in range(nthread): + tasks[i] = [] + +# Gather into by thread data. +for i in range(len(tics)): + tasks[threads[i]].append([tics[i],tocs[i],funcs[i]]) + +# Don't actually process the fake thread. +nthread = nthread - 1 + +# Sort by tic and gather used thread ids. +threadids = [] +for i in range(nthread): + if len(tasks[i]) > 0: + tasks[i] = sorted(tasks[i], key=lambda task: task[0]) + threadids.append(i) + +# Times per task. +print "# Task times:" +print "# -----------" +print "# {0:<31s}: {1:>7s} {2:>9s} {3:>9s} {4:>9s} {5:>9s} {6:>9s}"\ + .format("type/subtype", "count","minimum", "maximum", + "sum", "mean", "percent") +alltasktimes = {} +sidtimes = {} +for i in threadids: + tasktimes = {} + for task in tasks[i]: + key = task[2] + dt = task[1] - task[0] + if not key in tasktimes: + tasktimes[key] = [] + tasktimes[key].append(dt) + + if not key in alltasktimes: + alltasktimes[key] = [] + alltasktimes[key].append(dt) + + print "# Thread : ", i + for key in sorted(tasktimes.keys()): + taskmin = min(tasktimes[key]) + taskmax = max(tasktimes[key]) + tasksum = sum(tasktimes[key]) + print "{0:33s}: {1:7d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\ + .format(key, len(tasktimes[key]), taskmin, taskmax, tasksum, + tasksum / len(tasktimes[key]), tasksum / total_t * 100.0) + print + +print "# All threads : " +for key in sorted(alltasktimes.keys()): + taskmin = min(alltasktimes[key]) + taskmax = max(alltasktimes[key]) + tasksum = sum(alltasktimes[key]) + print "{0:33s}: {1:7d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\ + .format(key, len(alltasktimes[key]), taskmin, taskmax, tasksum, + tasksum / len(alltasktimes[key]), + tasksum / (len(threadids) * total_t) * 100.0) +print + +# Dead times. 
+print "# Times not in tasks (deadtimes)" +print "# ------------------------------" +print "# Time before first task:" +print "# no. : {0:>9s} {1:>9s}".format("value", "percent") +predeadtimes = [] +for i in threadids: + predeadtime = tasks[i][0][0] + print "thread {0:2d}: {1:9.4f} {2:9.4f}"\ + .format(i, predeadtime, predeadtime / total_t * 100.0) + predeadtimes.append(predeadtime) + +predeadmin = min(predeadtimes) +predeadmax = max(predeadtimes) +predeadsum = sum(predeadtimes) +print "# : {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\ + .format("count", "minimum", "maximum", "sum", "mean", "percent") +print "all : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\ + .format(len(predeadtimes), predeadmin, predeadmax, predeadsum, + predeadsum / len(predeadtimes), + predeadsum / (len(threadids) * total_t ) * 100.0) +print + +print "# Time after last task:" +print "# no. : {0:>9s} {1:>9s}".format("value", "percent") +postdeadtimes = [] +for i in threadids: + postdeadtime = total_t - tasks[i][-1][1] + print "thread {0:2d}: {1:9.4f} {2:9.4f}"\ + .format(i, postdeadtime, postdeadtime / total_t * 100.0) + postdeadtimes.append(postdeadtime) + +postdeadmin = min(postdeadtimes) +postdeadmax = max(postdeadtimes) +postdeadsum = sum(postdeadtimes) +print "# : {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\ + .format("count", "minimum", "maximum", "sum", "mean", "percent") +print "all : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\ + .format(len(postdeadtimes), postdeadmin, postdeadmax, postdeadsum, + postdeadsum / len(postdeadtimes), + postdeadsum / (len(threadids) * total_t ) * 100.0) +print + +# Time in threadpool, i.e. from first to last tasks. +print "# Time between tasks (threadpool deadtime):" +print "# no. 
: {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\ + .format("count", "minimum", "maximum", "sum", "mean", "percent") +threadpooldeadtimes = [] +for i in threadids: + deadtimes = [] + last = tasks[i][0][0] + for task in tasks[i]: + dt = task[0] - last + deadtimes.append(dt) + last = task[1] + + # Drop first value, last value already gone. + if len(deadtimes) > 1: + deadtimes = deadtimes[1:] + else: + # Only one task, so no deadtime by definition. + deadtimes = [0.0] + + deadmin = min(deadtimes) + deadmax = max(deadtimes) + deadsum = sum(deadtimes) + print "thread {0:2d}: {1:9d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\ + .format(i, len(deadtimes), deadmin, deadmax, deadsum, + deadsum / len(deadtimes), deadsum / total_t * 100.0) + threadpooldeadtimes.extend(deadtimes) + +deadmin = min(threadpooldeadtimes) +deadmax = max(threadpooldeadtimes) +deadsum = sum(threadpooldeadtimes) +print "all : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\ + .format(len(threadpooldeadtimes), deadmin, deadmax, deadsum, + deadsum / len(threadpooldeadtimes), + deadsum / (len(threadids) * total_t ) * 100.0) +print + +# All times in step. +print "# All deadtimes:" +print "# no. 
: {0:>9s} {1:>9s} {2:>9s} {3:>9s} {4:>9s} {5:>9s}"\ + .format("count", "minimum", "maximum", "sum", "mean", "percent") +alldeadtimes = [] +for i in threadids: + deadtimes = [] + last = 0 + for task in tasks[i]: + dt = task[0] - last + deadtimes.append(dt) + last = task[1] + dt = total_t - last + deadtimes.append(dt) + + deadmin = min(deadtimes) + deadmax = max(deadtimes) + deadsum = sum(deadtimes) + print "thread {0:2d}: {1:9d} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.4f} {6:9.2f}"\ + .format(i, len(deadtimes), deadmin, deadmax, deadsum, + deadsum / len(deadtimes), deadsum / total_t * 100.0) + alldeadtimes.extend(deadtimes) + +deadmin = min(alldeadtimes) +deadmax = max(alldeadtimes) +deadsum = sum(alldeadtimes) +print "all : {0:9d} {1:9.4f} {2:9.4f} {3:9.4f} {4:9.4f} {5:9.2f}"\ + .format(len(alldeadtimes), deadmin, deadmax, deadsum, + deadsum / len(alldeadtimes), + deadsum / (len(threadids) * total_t ) * 100.0) +print + +sys.exit(0) diff --git a/examples/main.c b/examples/main.c index 631117148addd3ab7ad49ed2760855b793757870..ee1253062409ec2e787e064a5fb50da2c830d35d 100644 --- a/examples/main.c +++ b/examples/main.c @@ -26,7 +26,9 @@ #include "../config.h" /* Some standard headers. */ +#include <errno.h> #include <fenv.h> +#include <libgen.h> #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -57,48 +59,53 @@ void print_help_message() { printf(" swift_mpi [OPTION]... PARAMFILE\n\n"); printf("Valid options are:\n"); - printf(" %2s %8s %s\n", "-a", "", "Pin runners using processor affinity."); - printf(" %2s %8s %s\n", "-c", "", "Run with cosmological time integration."); - printf(" %2s %8s %s\n", "-C", "", "Run with cooling."); + printf(" %2s %14s %s\n", "-a", "", "Pin runners using processor affinity."); + printf(" %2s %14s %s\n", "-c", "", + "Run with cosmological time integration."); + printf(" %2s %14s %s\n", "-C", "", "Run with cooling."); printf( - " %2s %8s %s\n", "-d", "", + " %2s %14s %s\n", "-d", "", "Dry run. 
Read the parameter file, allocate memory but does not read "); printf( - " %2s %8s %s\n", "", "", + " %2s %14s %s\n", "", "", "the particles from ICs and exit before the start of time integration."); - printf(" %2s %8s %s\n", "", "", + printf(" %2s %14s %s\n", "", "", "Allows user to check validy of parameter and IC files as well as " "memory limits."); - printf(" %2s %8s %s\n", "-D", "", + printf(" %2s %14s %s\n", "-D", "", "Always drift all particles even the ones far from active particles. " "This emulates"); - printf(" %2s %8s %s\n", "", "", + printf(" %2s %14s %s\n", "", "", "Gadget-[23] and GIZMO's default behaviours."); - printf(" %2s %8s %s\n", "-e", "", + printf(" %2s %14s %s\n", "-e", "", "Enable floating-point exceptions (debugging mode)."); - printf(" %2s %8s %s\n", "-f", "{int}", + printf(" %2s %14s %s\n", "-f", "{int}", "Overwrite the CPU frequency (Hz) to be used for time measurements."); - printf(" %2s %8s %s\n", "-g", "", + printf(" %2s %14s %s\n", "-g", "", "Run with an external gravitational potential."); - printf(" %2s %8s %s\n", "-F", "", "Run with feedback."); - printf(" %2s %8s %s\n", "-G", "", "Run with self-gravity."); - printf(" %2s %8s %s\n", "-M", "", + printf(" %2s %14s %s\n", "-G", "", "Run with self-gravity."); + printf(" %2s %14s %s\n", "-M", "", "Reconstruct the multipoles every time-step."); - printf(" %2s %8s %s\n", "-n", "{int}", + printf(" %2s %14s %s\n", "-n", "{int}", "Execute a fixed number of time steps. When unset use the time_end " "parameter to stop."); - printf(" %2s %8s %s\n", "-s", "", "Run with hydrodynamics."); - printf(" %2s %8s %s\n", "-S", "", "Run with stars."); - printf(" %2s %8s %s\n", "-t", "{int}", + printf(" %2s %14s %s\n", "-P", "{sec:par:val}", + "Set parameter value and overwrites values read from the parameters " + "file. 
Can be used more than once."); + printf(" %2s %14s %s\n", "-s", "", "Run with hydrodynamics."); + printf(" %2s %14s %s\n", "-S", "", "Run with stars."); + printf(" %2s %14s %s\n", "-t", "{int}", "The number of threads to use on each MPI rank. Defaults to 1 if not " "specified."); - printf(" %2s %8s %s\n", "-T", "", "Print timers every time-step."); - printf(" %2s %8s %s\n", "-v", "[12]", "Increase the level of verbosity."); - printf(" %2s %8s %s\n", "", "", "1: MPI-rank 0 writes "); - printf(" %2s %8s %s\n", "", "", "2: All MPI-ranks write"); - printf(" %2s %8s %s\n", "-y", "{int}", + printf(" %2s %14s %s\n", "-T", "", "Print timers every time-step."); + printf(" %2s %14s %s\n", "-v", "[12]", "Increase the level of verbosity:"); + printf(" %2s %14s %s\n", "", "", "1: MPI-rank 0 writes,"); + printf(" %2s %14s %s\n", "", "", "2: All MPI-ranks write."); + printf(" %2s %14s %s\n", "-y", "{int}", "Time-step frequency at which task graphs are dumped."); - printf(" %2s %8s %s\n", "-h", "", "Print this help message and exit."); + printf(" %2s %14s %s\n", "-Y", "{int}", + "Time-step frequency at which threadpool tasks are dumped."); + printf(" %2s %14s %s\n", "-h", "", "Print this help message and exit."); printf( "\nSee the file parameter_example.yml for an example of " "parameter file.\n"); @@ -135,7 +142,9 @@ int main(int argc, char *argv[]) { if ((res = MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN)) != MPI_SUCCESS) error("Call to MPI_Comm_set_errhandler failed with error %i.", res); - if (myrank == 0) message("MPI is up and running with %i node(s).", nr_nodes); + if (myrank == 0) + printf("[0000] [00000.0] main: MPI is up and running with %i node(s).\n\n", + nr_nodes); if (nr_nodes == 1) { message("WARNING: you are running with one MPI rank."); message("WARNING: you should use the non-MPI version of this program."); @@ -156,6 +165,7 @@ int main(int argc, char *argv[]) { int with_aff = 0; int dry_run = 0; int dump_tasks = 0; + int dump_threadpool = 0; int 
nsteps = -2; int with_cosmology = 0; int with_external_gravity = 0; @@ -170,15 +180,21 @@ int main(int argc, char *argv[]) { int verbose = 0; int nr_threads = 1; int with_verbose_timers = 0; + int nparams = 0; + char *cmdparams[PARSER_MAX_NO_OF_PARAMS]; char paramFileName[200] = ""; unsigned long long cpufreq = 0; /* Parse the parameters */ int c; - while ((c = getopt(argc, argv, "acCdDef:FgGhMn:sSt:Tv:y:")) != -1) + while ((c = getopt(argc, argv, "acCdDef:FgGhMn:P:sSt:Tv:y:Y:")) != -1) switch (c) { case 'a': +#if defined(HAVE_SETAFFINITY) && defined(HAVE_LIBNUMA) with_aff = 1; +#else + error("Need NUMA support for thread affinity"); +#endif break; case 'c': with_cosmology = 1; @@ -224,6 +240,10 @@ int main(int argc, char *argv[]) { return 1; } break; + case 'P': + cmdparams[nparams] = optarg; + nparams++; + break; case 's': with_hydro = 1; break; @@ -260,6 +280,21 @@ int main(int argc, char *argv[]) { "Task dumping is only possible if SWIFT was configured with the " "--enable-task-debugging option."); } +#endif + break; + case 'Y': + if (sscanf(optarg, "%d", &dump_threadpool) != 1) { + if (myrank == 0) printf("Error parsing dump_threadpool (-Y). \n"); + if (myrank == 0) print_help_message(); + return 1; + } +#ifndef SWIFT_DEBUG_THREADPOOL + if (dump_threadpool) { + error( + "Threadpool dumping is only possible if SWIFT was configured " + "with the " + "--enable-threadpool-debugging option."); + } #endif break; case '?': @@ -285,6 +320,14 @@ int main(int argc, char *argv[]) { if (myrank == 0) print_help_message(); return 1; } + if (with_stars && !with_external_gravity && !with_self_gravity) { + if (myrank == 0) + printf( + "Error: Cannot process stars without gravity, -g or -G must be " + "chosen.\n"); + if (myrank == 0) print_help_message(); + return 1; + } /* Genesis 1.1: And then, there was time ! 
*/ clocks_set_cpufreq(cpufreq); @@ -351,6 +394,16 @@ int main(int argc, char *argv[]) { if (myrank == 0) { message("Reading runtime parameters from file '%s'", paramFileName); parser_read_file(paramFileName, params); + + /* Handle any command-line overrides. */ + if (nparams > 0) { + message( + "Overwriting values read from the YAML file with command-line " + "values."); + for (int k = 0; k < nparams; k++) parser_set_param(params, cmdparams[k]); + } + + /* And dump the parameters as used. */ // parser_print_params(¶ms); parser_write_params_to_file(params, "used_parameters.yml"); } @@ -359,6 +412,15 @@ int main(int argc, char *argv[]) { MPI_Bcast(params, sizeof(struct swift_params), MPI_BYTE, 0, MPI_COMM_WORLD); #endif + /* Check that we can write the snapshots by testing if the output + * directory exists and is searchable and writable. */ + char basename[PARSER_MAX_LINE_SIZE]; + parser_get_param_string(params, "Snapshots:basename", basename); + const char *dirp = dirname(basename); + if (access(dirp, W_OK | X_OK) != 0) { + error("Cannot write snapshots in directory %s (%s)", dirp, strerror(errno)); + } + /* Prepare the domain decomposition scheme */ struct repartition reparttype; #ifdef WITH_MPI @@ -403,6 +465,8 @@ int main(int argc, char *argv[]) { parser_get_param_string(params, "InitialConditions:file_name", ICfileName); const int replicate = parser_get_opt_param_int(params, "InitialConditions:replicate", 1); + const int clean_h_values = + parser_get_opt_param_int(params, "InitialConditions:cleanup_h", 0); if (myrank == 0) message("Reading ICs from file '%s'", ICfileName); fflush(stdout); @@ -509,6 +573,11 @@ int main(int argc, char *argv[]) { message("nr of cells at depth %i is %i.", data[0], data[1]); } +/* Initialise the table of Ewald corrections for the gravity checks */ +#ifdef SWIFT_GRAVITY_FORCE_CHECKS + if (periodic) gravity_exact_force_ewald_init(dim[0]); +#endif + /* Initialise the external potential properties */ struct external_potential potential; 
if (with_external_gravity) @@ -604,7 +673,7 @@ int main(int argc, char *argv[]) { #endif /* Initialise the particles */ - engine_init_particles(&e, flag_entropy_ICs); + engine_init_particles(&e, flag_entropy_ICs, clean_h_values); /* Write the state of the system before starting time integration. */ engine_dump_snapshot(&e); @@ -656,14 +725,16 @@ int main(int argc, char *argv[]) { /* Open file and position at end. */ file_thread = fopen(dumpfile, "a"); - fprintf(file_thread, " %03i 0 0 0 0 %lli %lli 0 0 0 0 %lli\n", myrank, - e.tic_step, e.toc_step, cpufreq); + fprintf(file_thread, " %03i 0 0 0 0 %lli %lli %zi %zi %zi 0 0 %lli\n", + myrank, e.tic_step, e.toc_step, e.updates, e.g_updates, + e.s_updates, cpufreq); int count = 0; for (int l = 0; l < e.sched.nr_tasks; l++) { if (!e.sched.tasks[l].implicit && e.sched.tasks[l].toc != 0) { fprintf( - file_thread, " %03i %i %i %i %i %lli %lli %i %i %i %i %i\n", - myrank, e.sched.tasks[l].rid, e.sched.tasks[l].type, + file_thread, + " %03i %i %i %i %i %lli %lli %i %i %i %i %i %i\n", myrank, + e.sched.tasks[l].rid, e.sched.tasks[l].type, e.sched.tasks[l].subtype, (e.sched.tasks[l].cj == NULL), e.sched.tasks[l].tic, e.sched.tasks[l].toc, (e.sched.tasks[l].ci != NULL) ? e.sched.tasks[l].ci->count @@ -674,7 +745,7 @@ int main(int argc, char *argv[]) { : 0, (e.sched.tasks[l].cj != NULL) ? 
e.sched.tasks[l].cj->gcount : 0, - e.sched.tasks[l].flags); + e.sched.tasks[l].flags, e.sched.tasks[l].sid); } fflush(stdout); count++; @@ -692,25 +763,43 @@ int main(int argc, char *argv[]) { FILE *file_thread; file_thread = fopen(dumpfile, "w"); /* Add some information to help with the plots */ - fprintf(file_thread, " %i %i %i %i %lli %lli %i %i %i %lli\n", -2, -1, -1, - 1, e.tic_step, e.toc_step, 0, 0, 0, cpufreq); + fprintf(file_thread, " %i %i %i %i %lli %lli %zi %zi %zi %i %lli\n", -2, + -1, -1, 1, e.tic_step, e.toc_step, e.updates, e.g_updates, + e.s_updates, 0, cpufreq); for (int l = 0; l < e.sched.nr_tasks; l++) { if (!e.sched.tasks[l].implicit && e.sched.tasks[l].toc != 0) { fprintf( - file_thread, " %i %i %i %i %lli %lli %i %i %i %i\n", + file_thread, " %i %i %i %i %lli %lli %i %i %i %i %i\n", e.sched.tasks[l].rid, e.sched.tasks[l].type, e.sched.tasks[l].subtype, (e.sched.tasks[l].cj == NULL), e.sched.tasks[l].tic, e.sched.tasks[l].toc, (e.sched.tasks[l].ci == NULL) ? 0 : e.sched.tasks[l].ci->count, (e.sched.tasks[l].cj == NULL) ? 0 : e.sched.tasks[l].cj->count, (e.sched.tasks[l].ci == NULL) ? 0 : e.sched.tasks[l].ci->gcount, - (e.sched.tasks[l].cj == NULL) ? 0 : e.sched.tasks[l].cj->gcount); + (e.sched.tasks[l].cj == NULL) ? 0 : e.sched.tasks[l].cj->gcount, + e.sched.tasks[l].sid); } } fclose(file_thread); #endif // WITH_MPI } #endif // SWIFT_DEBUG_TASKS + +#ifdef SWIFT_DEBUG_THREADPOOL + /* Dump the task data using the given frequency. */ + if (dump_threadpool && (dump_threadpool == 1 || j % dump_threadpool == 1)) { + char dumpfile[40]; +#ifdef WITH_MPI + snprintf(dumpfile, 30, "threadpool_info-rank%d-step%d.dat", engine_rank, + j + 1); +#else + snprintf(dumpfile, 30, "threadpool_info-step%d.dat", j + 1); +#endif // WITH_MPI + threadpool_dump_log(&e.threadpool, dumpfile, 1); + } else { + threadpool_reset_log(&e.threadpool); + } +#endif // SWIFT_DEBUG_THREADPOOL } /* Print the values of the runner histogram. 
*/ diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml index 8006c1a325845d6e9fec655b809310a63daa9ddb..9c3cee7630edf1be1e161a3e70547f06e6108ebd 100644 --- a/examples/parameter_example.yml +++ b/examples/parameter_example.yml @@ -8,12 +8,12 @@ InternalUnitSystem: # Parameters for the task scheduling Scheduler: - nr_queues: 0 # (Optional) The number of task queues to use. Use 0 to let the system decide. - cell_max_size: 8000000 # (Optional) Maximal number of interactions per task if we force the split (this is the default value). - cell_sub_size: 64000000 # (Optional) Maximal number of interactions per sub-task (this is the default value). - cell_split_size: 400 # (Optional) Maximal number of particles per cell (this is the default value). - cell_max_count: 10000 # (Optional) Maximal number of particles per cell allowed before triggering a sanitizing (this is the default value). - max_top_level_cells: 12 # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value). + nr_queues: 0 # (Optional) The number of task queues to use. Use 0 to let the system decide. + cell_max_size: 8000000 # (Optional) Maximal number of interactions per task if we force the split (this is the default value). + cell_sub_size_pair: 256000000 # (Optional) Maximal number of interactions per sub-pair task (this is the default value). + cell_sub_size_self: 32000 # (Optional) Maximal number of interactions per sub-self task (this is the default value). + cell_split_size: 400 # (Optional) Maximal number of particles per cell (this is the default value). + max_top_level_cells: 12 # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value). # Parameters governing the time integration (Set dt_min and dt_max to the same value for a fixed time-step run.) 
TimeIntegration: @@ -43,23 +43,25 @@ Statistics: # Parameters for the hydrodynamics scheme SPH: resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). - delta_neighbours: 0.1 # The tolerance for the targetted number of neighbours. CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. - max_ghost_iterations: 30 # (Optional) Maximal number of iterations allowed to converge towards the smoothing length. - max_volume_change: 2. # (Optional) Maximal allowed change of kernel volume over one time-step + h_tolerance: 1e-4 # (Optional) Relative accuracy of the Netwon-Raphson scheme for the smoothing lengths. h_max: 10. # (Optional) Maximal allowed smoothing length in internal units. Defaults to FLT_MAX if unspecified. + max_volume_change: 1.4 # (Optional) Maximal allowed change of kernel volume over one time-step. + max_ghost_iterations: 30 # (Optional) Maximal number of iterations allowed to converge towards the smoothing length. # Parameters for the self-gravity scheme Gravity: - eta: 0.025 # Constant dimensionless multiplier for time integration. - theta: 0.7 # Opening angle (Multipole acceptance criterion) - epsilon: 0.1 # Softening length (in internal units). - a_smooth: 1.25 # (Optional) Smoothing scale in top-level cell sizes to smooth the long-range forces over (this is the default value). - r_cut: 4.5 # (Optional) Cut-off in number of top-level cells beyond which no FMM forces are computed (this is the default value). + eta: 0.025 # Constant dimensionless multiplier for time integration. + theta: 0.7 # Opening angle (Multipole acceptance criterion) + epsilon: 0.1 # Softening length (in internal units). + a_smooth: 1.25 # (Optional) Smoothing scale in top-level cell sizes to smooth the long-range forces over (this is the default value). 
+ r_cut_max: 4.5 # (Optional) Cut-off in number of top-level cells beyond which no FMM forces are computed (this is the default value). + r_cut_min: 0.1 # (Optional) Cut-off in number of top-level cells below which no truncation of FMM forces are performed (this is the default value). # Parameters related to the initial conditions InitialConditions: file_name: SedovBlast/sedov.hdf5 # The file to read + cleanup_h: 0 # (Optional) Clean the values of h that are read in. Set to 1 to activate. h_scaling: 1. # (Optional) A scaling factor to apply to all smoothing lengths in the ICs. shift_x: 0. # (Optional) A shift to apply to all particles read from the ICs (in internal units). shift_y: 0. @@ -103,7 +105,9 @@ IsothermalPotential: DiscPatchPotential: surface_density: 10. # Surface density of the disc (internal units) scale_height: 100. # Scale height of the disc (internal units) - z_disc: 200. # Position of the disc along the z-axis (internal units) + z_disc: 400. # Position of the disc along the z-axis (internal units) + z_trunc: 300. # (Optional) Distance from the disc along z-axis above which the potential gets truncated. + z_max: 380. # (Optional) Distance from the disc along z-axis above which the potential is set to 0. timestep_mult: 0.03 # Dimensionless pre-factor for the time-step condition growth_time: 5. 
# (Optional) Time for the disc to grow to its final size (multiple of the dynamical time) diff --git a/examples/plot_tasks.py b/examples/plot_tasks.py index 88f176687db8116cfd4370970769164985e4d366..c49020939cca8f744db352631b2ec47267d7bd20 100755 --- a/examples/plot_tasks.py +++ b/examples/plot_tasks.py @@ -78,7 +78,7 @@ PLOT_PARAMS = {"axes.labelsize": 10, "figure.figsize" : (args.width, args.height), "figure.subplot.left" : 0.03, "figure.subplot.right" : 0.995, - "figure.subplot.bottom" : 0.1, + "figure.subplot.bottom" : 0.09, "figure.subplot.top" : 0.99, "figure.subplot.wspace" : 0., "figure.subplot.hspace" : 0., @@ -91,17 +91,18 @@ pl.rcParams.update(PLOT_PARAMS) TASKTYPES = ["none", "sort", "self", "pair", "sub_self", "sub_pair", "init_grav", "ghost", "extra_ghost", "drift_part", "drift_gpart", "kick1", "kick2", "timestep", "send", "recv", - "grav_top_level", "grav_long_range", "grav_mm", "grav_down", - "cooling", "sourceterms", "count"] + "grav_top_level", "grav_long_range", "grav_ghost", "grav_mm", + "grav_down", "cooling", "sourceterms", "count"] SUBTYPES = ["none", "density", "gradient", "force", "grav", "external_grav", "tend", "xv", "rho", "gpart", "multipole", "spart", "count"] # Task/subtypes of interest. FULLTYPES = ["self/force", "self/density", "self/grav", "sub_self/force", - "sub_self/density", "pair/force", "pair/density", "pair/grav", - "sub_pair/force", - "sub_pair/density", "recv/xv", "send/xv", "recv/rho", "send/rho", + "sub_self/density", "sub_self/grav", "pair/force", "pair/density", + "pair/grav", "sub_pair/force", + "sub_pair/density", "sub_pair/grav", "recv/xv", "send/xv", + "recv/rho", "send/rho", "recv/tend", "send/tend"] # A number of colours for the various types. 
Recycled when there are @@ -109,7 +110,7 @@ FULLTYPES = ["self/force", "self/density", "self/grav", "sub_self/force", colours = ["cyan", "lightgray", "darkblue", "yellow", "tan", "dodgerblue", "sienna", "aquamarine", "bisque", "blue", "green", "lightgreen", "brown", "purple", "moccasin", "olivedrab", "chartreuse", - "darksage", "darkgreen", "green", "mediumseagreen", + "steelblue", "darkgreen", "green", "mediumseagreen", "mediumaquamarine", "darkslategrey", "mediumturquoise", "black", "cadetblue", "skyblue", "red", "slategray", "gold", "slateblue", "blueviolet", "mediumorchid", "firebrick", @@ -183,7 +184,7 @@ ecounter = [] for i in range(nthread): ecounter.append(0) -num_lines = pl.size(data) / 10 +num_lines = pl.size(data) / pl.size(full_step) for line in range(num_lines): thread = int(data[line,0]) @@ -243,21 +244,21 @@ for i in range(nthread): # Legend and room for it. nrow = len(typesseen) / 5 if not args.nolegend: - if len(typesseen) * 5 < nrow: - nrow = nrow + 1 ax.fill_between([0, 0], nthread+0.5, nthread + nrow + 0.5, facecolor="white") - ax.set_ylim(0, nthread + nrow + 1) - ax.legend(loc=1, shadow=True, mode="expand", ncol=5) - + ax.set_ylim(0, nthread + 0.5) + ax.legend(loc=1, shadow=True, bbox_to_anchor=(0., 1.05 ,1., 0.2), mode="expand", ncol=5) + box = ax.get_position() + ax.set_position([box.x0, box.y0, box.width, box.height*0.8]) + # Start and end of time-step ax.plot([0, 0], [0, nthread + nrow + 1], 'k--', linewidth=1) ax.plot([end_t, end_t], [0, nthread + nrow + 1], 'k--', linewidth=1) -ax.set_xlabel("Wall clock time [ms]") +ax.set_xlabel("Wall clock time [ms]", labelpad=0.) 
if expand == 1: - ax.set_ylabel("Thread ID" ) + ax.set_ylabel("Thread ID", labelpad=0 ) else: - ax.set_ylabel("Thread ID * " + str(expand) ) + ax.set_ylabel("Thread ID * " + str(expand), labelpad=0 ) ax.set_yticks(pl.array(range(nthread)), True) loc = plticker.MultipleLocator(base=expand) diff --git a/examples/plot_tasks_MPI.py b/examples/plot_tasks_MPI.py index 83465aee87e8b641775d760fa4db2f06b125dd8b..85d7c54567a66c9c2151732e0e7a11c6580f958b 100755 --- a/examples/plot_tasks_MPI.py +++ b/examples/plot_tasks_MPI.py @@ -278,12 +278,12 @@ for rank in range(nranks): # Legend and room for it. nrow = len(typesseen) / 5 - if len(typesseen) * 5 < nrow: - nrow = nrow + 1 ax.fill_between([0, 0], nethread+0.5, nethread + nrow + 0.5, facecolor="white") - ax.set_ylim(0, nethread + nrow + 1) + ax.set_ylim(0, nethread + 0.5) if data.size > 0: - ax.legend(loc=1, shadow=True, mode="expand", ncol=5) + ax.legend(loc=1, shadow=True, bbox_to_anchor=(0., 1.05 ,1., 0.2), mode="expand", ncol=5) + box = ax.get_position() + ax.set_position([box.x0, box.y0, box.width, box.height*0.8]) # Start and end of time-step ax.plot([0, 0], [0, nethread + nrow + 1], 'k--', linewidth=1) diff --git a/examples/plot_threadpool.py b/examples/plot_threadpool.py new file mode 100755 index 0000000000000000000000000000000000000000..495fc3e0c532d9cafbf96e622decbc8179869160 --- /dev/null +++ b/examples/plot_threadpool.py @@ -0,0 +1,271 @@ +#!/usr/bin/env python +""" +Usage: + plot_threadpool.py [options] input.dat output.png + +where input.dat is a threadpool info file for a step. Use the '-Y interval' +flag of the swift command to create these. The output plot will be called +'output.png'. The --limit option can be used to produce plots with the same +time span and the --expand option to expand each thread line into '*expand' +lines, so that adjacent tasks of the same type can be distinguished. Other +options can be seen using the --help flag. + +This file is part of SWIFT. 
+Copyright (c) 2015 Pedro Gonnet (pedro.gonnet@durham.ac.uk), + Bert Vandenbroucke (bert.vandenbroucke@ugent.be) + Matthieu Schaller (matthieu.schaller@durham.ac.uk) + (c) 2017 Peter W. Draper (p.w.draper@durham.ac.uk) + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published +by the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. +""" + +import matplotlib +matplotlib.use("Agg") +import matplotlib.collections as collections +import matplotlib.ticker as plticker +import pylab as pl +import sys +import argparse + +# Handle the command line. 
+parser = argparse.ArgumentParser(description="Plot threadpool function graphs") + +parser.add_argument("input", help="Threadpool data file (-Y output)") +parser.add_argument("outpng", help="Name for output graphic file (PNG)") +parser.add_argument("-l", "--limit", dest="limit", + help="Upper time limit in millisecs (def: depends on data)", + default=0, type=int) +parser.add_argument("-e", "--expand", dest="expand", + help="Thread expansion factor (def: 1)", + default=1, type=int) +parser.add_argument("--height", dest="height", + help="Height of plot in inches (def: 4)", + default=4., type=float) +parser.add_argument("--width", dest="width", + help="Width of plot in inches (def: 16)", + default=16., type=float) +parser.add_argument("--nolegend", dest="nolegend", + help="Whether to show the legend (def: False)", + default=False, action="store_true") +parser.add_argument("-v", "--verbose", dest="verbose", + help="Show colour assignments and other details (def: False)", + default=False, action="store_true") + +args = parser.parse_args() +infile = args.input +outpng = args.outpng +delta_t = args.limit +expand = args.expand + +# Basic plot configuration. +PLOT_PARAMS = {"axes.labelsize": 10, + "axes.titlesize": 10, + "font.size": 12, + "legend.fontsize": 12, + "xtick.labelsize": 10, + "ytick.labelsize": 10, + "figure.figsize" : (args.width, args.height), + "figure.subplot.left" : 0.03, + "figure.subplot.right" : 0.995, + "figure.subplot.bottom" : 0.09, + "figure.subplot.top" : 0.99, + "figure.subplot.wspace" : 0., + "figure.subplot.hspace" : 0., + "lines.markersize" : 6, + "lines.linewidth" : 3. + } +pl.rcParams.update(PLOT_PARAMS) + +# A number of colours for the various types. Recycled when there are +# more task types than colours... 
+colours = ["cyan", "lightgray", "darkblue", "yellow", "tan", "dodgerblue", + "sienna", "aquamarine", "bisque", "blue", "green", "lightgreen", + "brown", "purple", "moccasin", "olivedrab", "chartreuse", + "darksage", "darkgreen", "green", "mediumseagreen", + "mediumaquamarine", "darkslategrey", "mediumturquoise", + "black", "cadetblue", "skyblue", "red", "slategray", "gold", + "slateblue", "blueviolet", "mediumorchid", "firebrick", + "magenta", "hotpink", "pink", "orange", "lightgreen"] +maxcolours = len(colours) + +# Read header. First two lines. +with open(infile) as infid: + head = [next(infid) for x in xrange(2)] +header = head[1][2:].strip() +header = eval(header) +nthread = int(header['num_threads']) + 1 +CPU_CLOCK = float(header['cpufreq']) / 1000.0 +print "Number of threads: ", nthread +if args.verbose: + print "CPU frequency:", CPU_CLOCK * 1000.0 + +# Read input. +data = pl.genfromtxt(infile, dtype=None, delimiter=" ") + +# Mixed types, so need to separate. +tics = [] +tocs = [] +funcs = [] +threads = [] +chunks = [] +for i in data: + if i[0] != "#": + funcs.append(i[0].replace("_mapper", "")) + if i[1] < 0: + threads.append(nthread-1) + else: + threads.append(i[1]) + chunks.append(i[2]) + tics.append(i[3]) + tocs.append(i[4]) +tics = pl.array(tics) +tocs = pl.array(tocs) +funcs = pl.array(funcs) +threads = pl.array(threads) +chunks = pl.array(chunks) + + +# Recover the start and end time +tic_step = min(tics) +toc_step = max(tocs) + +# Not known. + +# Calculate the time range, if not given. +delta_t = delta_t * CPU_CLOCK +if delta_t == 0: + dt = toc_step - tic_step + if dt > delta_t: + delta_t = dt + print "Data range: ", delta_t / CPU_CLOCK, "ms" + +# Once more doing the real gather and plots this time. +start_t = float(tic_step) +tics -= tic_step +tocs -= tic_step +end_t = (toc_step - start_t) / CPU_CLOCK + +# Get all "task" names and assign colours. +TASKTYPES = pl.unique(funcs) +print TASKTYPES + +# Set colours of task/subtype. 
+TASKCOLOURS = {} +ncolours = 0 +for task in TASKTYPES: + TASKCOLOURS[task] = colours[ncolours] + ncolours = (ncolours + 1) % maxcolours + +# For fiddling with colours... +if args.verbose: + print "#Selected colours:" + for task in sorted(TASKCOLOURS.keys()): + print "# " + task + ": " + TASKCOLOURS[task] + for task in sorted(SUBCOLOURS.keys()): + print "# " + task + ": " + SUBCOLOURS[task] + +tasks = {} +tasks[-1] = [] +for i in range(nthread*expand): + tasks[i] = [] + +# Counters for each thread when expanding. +ecounter = [] +for i in range(nthread): + ecounter.append(0) + +for i in range(len(threads)): + thread = threads[i] + + # Expand to cover extra lines if expanding. + ethread = thread * expand + (ecounter[thread] % expand) + ecounter[thread] = ecounter[thread] + 1 + thread = ethread + + tasks[thread].append({}) + tasks[thread][-1]["type"] = funcs[i] + tic = tics[i] / CPU_CLOCK + toc = tocs[i] / CPU_CLOCK + tasks[thread][-1]["tic"] = tic + tasks[thread][-1]["toc"] = toc + tasks[thread][-1]["colour"] = TASKCOLOURS[funcs[i]] + +# Use expanded threads from now on. +nthread = nthread * expand + +typesseen = [] +fig = pl.figure() +ax = fig.add_subplot(1,1,1) +ax.set_xlim(-delta_t * 0.01 / CPU_CLOCK, delta_t * 1.01 / CPU_CLOCK) +ax.set_ylim(0, nthread) + +# Fake thread is used to colour the whole range, do that first. +tictocs = [] +colours = [] +j = 0 +for task in tasks[nthread - expand]: + tictocs.append((task["tic"], task["toc"] - task["tic"])) + colours.append(task["colour"]) +ax.broken_barh(tictocs, [0,(nthread-1)], facecolors = colours, linewidth=0, alpha=0.15) + +# And we don't plot the fake thread. +nthread = nthread - expand +for i in range(nthread): + + # Collect ranges and colours into arrays. + tictocs = [] + colours = [] + j = 0 + for task in tasks[i]: + tictocs.append((task["tic"], task["toc"] - task["tic"])) + colours.append(task["colour"]) + + # Legend support, collections don't add to this. 
+ qtask = task["type"] + if qtask not in typesseen: + pl.plot([], [], color=task["colour"], label=qtask) + typesseen.append(qtask) + + # Now plot. + ax.broken_barh(tictocs, [i+0.05,0.90], facecolors = colours, linewidth=0) + +# Legend and room for it. +nrow = len(typesseen) / 5 +if not args.nolegend: + ax.fill_between([0, 0], nthread+0.5, nthread + nrow + 0.5, facecolor="white") + ax.set_ylim(0, nthread + 0.5) + ax.legend(loc=1, shadow=True, bbox_to_anchor=(0., 1.05 ,1., 0.2), mode="expand", ncol=5) + box = ax.get_position() + ax.set_position([box.x0, box.y0, box.width, box.height*0.8]) + +# Start and end of time-step +ax.plot([0, 0], [0, nthread + nrow + 1], 'k--', linewidth=1) +ax.plot([end_t, end_t], [0, nthread + nrow + 1], 'k--', linewidth=1) + +ax.set_xlabel("Wall clock time [ms]", labelpad=0.) +if expand == 1: + ax.set_ylabel("Thread ID", labelpad=0 ) +else: + ax.set_ylabel("Thread ID * " + str(expand), labelpad=0 ) +ax.set_yticks(pl.array(range(nthread)), True) + +loc = plticker.MultipleLocator(base=expand) +ax.yaxis.set_major_locator(loc) +ax.grid(True, which='major', axis="y", linestyle="-") + +pl.show() +pl.savefig(outpng) +print "Graphics done, output written to", outpng + +sys.exit(0) diff --git a/examples/process_plot_tasks_MPI b/examples/process_plot_tasks_MPI index b2672b3711823eb87d0bede5b1ffd8945a735f98..691822ebc33b43450d69b06e49c2c95bb0683045 100755 --- a/examples/process_plot_tasks_MPI +++ b/examples/process_plot_tasks_MPI @@ -62,7 +62,9 @@ nrank=$(($nrank-1)) # And process them, echo "Processing thread info files..." echo $list | xargs -P $NPROCS -n 3 /bin/bash -c "./plot_tasks_MPI.py --expand 1 --limit $TIMERANGE \$0 \$2 " -echo $list | xargs -P $NPROCS -n 3 /bin/bash -c "./analyse_tasks_MPI.py \$0 > \$2.stats" +for i in $(seq 0 $nrank); do + echo $list | xargs -P $NPROCS -n 3 /bin/bash -c "./analyse_tasks_MPI.py -r $i \$0 > \$2${i}.stats" +done echo "Writing output index.html file" # Construct document - serial. 
@@ -93,7 +95,7 @@ EOF2 <img src="step${s}r${i}.png"> <pre> EOF2 -cat step${s}r.stats >> step${s}r${i}.html +cat step${s}r${i}.stats >> step${s}r${i}.html cat <<EOF2 >> step${s}r${i}.html </pre> </body> diff --git a/examples/process_plot_threadpool b/examples/process_plot_threadpool new file mode 100755 index 0000000000000000000000000000000000000000..343c1559ee37d6714ac32e5305457eddbb7e6414 --- /dev/null +++ b/examples/process_plot_threadpool @@ -0,0 +1,108 @@ +#!/bin/bash +# +# Usage: +# process_plot_threadpool nprocess [time-range-ms] +# +# Description: +# Process all the threadpool info files in the current directory +# creating function graphs for steps and threads. +# +# The input files are created by a run using the "-Y interval" flag and +# should be named "threadpool_info-step<n>.dat" in the current directory. +# All located files will be processed using "nprocess" concurrent +# processes and all plots will have the same time range if one is given. +# An output HTML file "index.html" will be created to view all the plots. +# +# +# This file is part of SWIFT: +# +# Copyright (C) 2017 Peter W. Draper (p.w.draper@durham.ac.uk) +# All Rights Reserved. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as published +# by the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ +# Handle command-line +if test "$1" = ""; then + echo "Usage: $0 nprocess [time-range-ms]" + exit 1 +fi +NPROCS=$1 +TIMERANGE=0 +LIMIT="(autoranged)" +if test "$2" != ""; then + TIMERANGE=$2 + LIMIT="" +fi + +# Find all thread info files. Use version sort to get into correct order. +files=$(ls -v threadpool_info-step*.dat) +if test $? != 0; then + echo "Failed to find any threadpool info files" + exit 1 +fi + +# Construct list of names, the step no and names for the graphics. +list="" +for f in $files; do + s=$(echo $f| sed 's,threadpool_info-step\(.*\).dat,\1,') + list="$list $f $s poolstep${s}r" +done + +# And process them, +echo "Processing threadpool info files..." +echo $list | xargs -P $NPROCS -n 3 /bin/bash -c "./plot_threadpool.py --expand 1 --limit $TIMERANGE --width 16 --height 4 \$0 \$2 " +echo $list | xargs -P $NPROCS -n 3 /bin/bash -c "./analyse_threadpool_tasks.py \$0 > \$2.stats" + +echo "Writing output threadpool-index.html file" +# Construct document - serial. +cat <<EOF > threadpool-index.html + <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> +<html> + <head> + <title>SWIFT threadpool tasks $LIMIT</title> + </head> + <body> + <h1>SWIFT threadpool tasks $LIMIT</h1> +EOF + +echo $list | xargs -n 3 | while read f s g; do + cat <<EOF >> threadpool-index.html +<h2>Step $s</h2> +EOF + cat <<EOF >> threadpool-index.html +<a href="poolstep${s}r${i}.html"><img src="poolstep${s}r${i}.png" width=400px/></a> +EOF + cat <<EOF > poolstep${s}r${i}.html + <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> +<html> +<body> +<img src="poolstep${s}r${i}.png"> +<pre> +EOF +cat poolstep${s}r${i}.stats >> poolstep${s}r${i}.html +cat <<EOF >> poolstep${s}r${i}.html +</body> +</html> +EOF + +done + +cat <<EOF >> threadpool-index.html + </body> +</html> +EOF + +echo "Finished" + +exit diff --git a/src/Makefile.am b/src/Makefile.am index 2ddcdb0908201c65053d7cc5380a4217277b5c13..ec01184928faf3d58b2d0890965a745d05718354 100644 --- 
a/src/Makefile.am +++ b/src/Makefile.am @@ -64,7 +64,7 @@ nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h kernel_long_gravity.h vector.h cache.h runner_doiact.h runner_doiact_vec.h runner_doiact_grav.h runner_doiact_fft.h \ runner_doiact_nosort.h units.h intrinsics.h minmax.h kick.h timestep.h drift.h adiabatic_index.h io_properties.h \ dimension.h equation_of_state.h part_type.h periodic.h \ - gravity.h gravity_io.h \ + gravity.h gravity_io.h gravity_cache.h \ gravity/Default/gravity.h gravity/Default/gravity_iact.h gravity/Default/gravity_io.h \ gravity/Default/gravity_debug.h gravity/Default/gravity_part.h \ sourceterms.h \ @@ -86,6 +86,7 @@ nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h hydro/Gizmo/hydro_slope_limiters_cell.h \ hydro/Gizmo/hydro_slope_limiters_face.h \ hydro/Gizmo/hydro_slope_limiters.h \ + hydro/Gizmo/hydro_flux_limiters.h \ hydro/Gizmo/hydro_unphysical.h \ hydro/Gizmo/hydro_velocities.h \ hydro/Shadowswift/hydro_debug.h \ diff --git a/src/align.h b/src/align.h index 915af33e6e2ba59be1a0849c4de0e2f1bd5b0d96..54435c4c9baa1ce9dc511e2903b7e2be2d6655de 100644 --- a/src/align.h +++ b/src/align.h @@ -23,9 +23,71 @@ * @brief The default struct alignment in SWIFT. */ #define SWIFT_STRUCT_ALIGNMENT 32 + /** * @brief Defines alignment of structures */ #define SWIFT_STRUCT_ALIGN __attribute__((aligned(SWIFT_STRUCT_ALIGNMENT))) +/** + * @brief The default cache alignment in SWIFT. + */ +#define SWIFT_CACHE_ALIGNMENT 64 + +/** + * @brief Defines alignment of caches + */ +#define SWIFT_CACHE_ALIGN __attribute__((aligned(SWIFT_CACHE_ALIGNMENT))) + +/** + * @brief Macro to tell the compiler that a given array has the specified + * alignment. + * + * Note that this turns into a no-op but gives information to the compiler. + * + * @param array The array. + * @param alignment The alignment in bytes of the array. 
+ */ +#if defined(__ICC) +#define swift_align_information(array, alignment) \ + __assume_aligned(array, alignment); +#elif defined(__GNUC__) +#define swift_align_information(array, alignment) \ + array = __builtin_assume_aligned(array, alignment); +#else +#define swift_align_information(array, alignment) ; +#endif + +/** + * @brief Macro to create a restrict pointer to an array and tell the compiler + * that the given array has the specified + * alignment. + * + * Note that this turns into a no-op but gives information to the compiler. + * + * @param array The array. + * @param ptr Pointer to array + * @param type Type of array + * @param alignment The alignment in bytes of the array. + */ +#define swift_declare_aligned_ptr(type, array, ptr, alignment) \ + type *restrict array = ptr; \ + swift_align_information(array, alignment); + +/** + * @brief Macro to tell the compiler that a given number is 0 modulo a given + * size. + * + * Note that this turns into a no-op but gives information to the compiler. + * GCC does not have the equivalent built-in so defaults to nothing. + * + * @param var The variable + * @param size The modulo of interest. 
+ */ +#if defined(__ICC) +#define swift_assume_size(var, size) __assume(var % size == 0); +#else +#define swift_assume_size(var, size) ; +#endif + #endif /* SWIFT_ALIGN_H */ diff --git a/src/approx_math.h b/src/approx_math.h index ad07adeb4f3b1b54ca5f33d80eabb6a004d2a3aa..48319ddfd7a86c132a1cd18b4a08fa849a36a15a 100644 --- a/src/approx_math.h +++ b/src/approx_math.h @@ -36,4 +36,17 @@ __attribute__((always_inline)) INLINE static float approx_expf(float x) { return 1.f + x * (1.f + x * (0.5f + x * (1.f / 6.f + 1.f / 24.f * x))); } +/** + * @brief Approximate version of expf(x) using a 6th order Taylor expansion + * + */ +__attribute__((always_inline)) INLINE static float good_approx_expf(float x) { + return 1.f + + x * (1.f + + x * (0.5f + + x * ((1.f / 6.f) + + x * ((1.f / 24.f) + + x * ((1.f / 120.f) + (1.f / 720.f) * x))))); +} + #endif /* SWIFT_APPROX_MATH_H */ diff --git a/src/cache.h b/src/cache.h index 6739c2020e897d54e6586c9d121490aaab5661bc..70c63f72a45d730c826f039f535e7e8c5d467f64 100644 --- a/src/cache.h +++ b/src/cache.h @@ -23,6 +23,7 @@ #include "../config.h" /* Local headers */ +#include "align.h" #include "cell.h" #include "error.h" #include "part.h" @@ -30,9 +31,7 @@ #include "vector.h" #define NUM_VEC_PROC 2 -#define CACHE_ALIGN 64 #define C2_CACHE_SIZE (NUM_VEC_PROC * VEC_SIZE * 6) + (NUM_VEC_PROC * VEC_SIZE) -#define C2_CACHE_ALIGN sizeof(float) * VEC_SIZE #ifdef WITH_VECTORIZATION /* Cache struct to hold a local copy of a cells' particle @@ -40,46 +39,46 @@ struct cache { /* Particle x position. */ - float *restrict x __attribute__((aligned(CACHE_ALIGN))); + float *restrict x SWIFT_CACHE_ALIGN; /* Particle y position. */ - float *restrict y __attribute__((aligned(CACHE_ALIGN))); + float *restrict y SWIFT_CACHE_ALIGN; /* Particle z position. */ - float *restrict z __attribute__((aligned(CACHE_ALIGN))); + float *restrict z SWIFT_CACHE_ALIGN; /* Particle smoothing length. 
*/ - float *restrict h __attribute__((aligned(CACHE_ALIGN))); + float *restrict h SWIFT_CACHE_ALIGN; /* Particle mass. */ - float *restrict m __attribute__((aligned(CACHE_ALIGN))); + float *restrict m SWIFT_CACHE_ALIGN; /* Particle x velocity. */ - float *restrict vx __attribute__((aligned(CACHE_ALIGN))); + float *restrict vx SWIFT_CACHE_ALIGN; /* Particle y velocity. */ - float *restrict vy __attribute__((aligned(CACHE_ALIGN))); + float *restrict vy SWIFT_CACHE_ALIGN; /* Particle z velocity. */ - float *restrict vz __attribute__((aligned(CACHE_ALIGN))); + float *restrict vz SWIFT_CACHE_ALIGN; + /* Maximum index into neighbouring cell for particles that are in range. */ + int *restrict max_index SWIFT_CACHE_ALIGN; + /* Particle density. */ - float *restrict rho __attribute__((aligned(CACHE_ALIGN))); + float *restrict rho SWIFT_CACHE_ALIGN; /* Particle smoothing length gradient. */ - float *restrict grad_h __attribute__((aligned(CACHE_ALIGN))); + float *restrict grad_h SWIFT_CACHE_ALIGN; /* Pressure over density squared. */ - float *restrict pOrho2 __attribute__((aligned(CACHE_ALIGN))); + float *restrict pOrho2 SWIFT_CACHE_ALIGN; /* Balsara switch. */ - float *restrict balsara __attribute__((aligned(CACHE_ALIGN))); + float *restrict balsara SWIFT_CACHE_ALIGN; /* Particle sound speed. */ - float *restrict soundspeed __attribute__((aligned(CACHE_ALIGN))); - - /* Maximum distance of particles into neighbouring cell. */ - float *restrict max_d __attribute__((aligned(CACHE_ALIGN))); + float *restrict soundspeed SWIFT_CACHE_ALIGN; /* Cache size. */ int count; @@ -90,46 +89,46 @@ struct cache { struct c2_cache { /* Separation between two particles squared. */ - float r2q[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float r2q[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* x separation between two particles. */ - float dxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float dxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* y separation between two particles. 
*/ - float dyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float dyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* z separation between two particles. */ - float dzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float dzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* Mass of particle pj. */ - float mq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float mq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* x velocity of particle pj. */ - float vxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float vxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* y velocity of particle pj. */ - float vyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float vyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* z velocity of particle pj. */ - float vzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float vzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* Density of particle pj. */ - float rhoq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float rhoq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* Smoothing length gradient of particle pj. */ - float grad_hq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float grad_hq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* Pressure over density squared of particle pj. */ - float pOrho2q[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float pOrho2q[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* Balsara switch of particle pj. */ - float balsaraq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float balsaraq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* Sound speed of particle pj. */ - float soundspeedq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float soundspeedq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; /* Inverse smoothing length of particle pj. 
*/ - float h_invq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN))); + float h_invq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN; }; /** @@ -144,9 +143,10 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c, /* Align cache on correct byte boundary and pad cache size to be a multiple of * the vector size * and include 2 vector lengths for remainder operations. */ - unsigned int pad = 2 * VEC_SIZE, rem = count % VEC_SIZE; + size_t pad = 2 * VEC_SIZE, rem = count % VEC_SIZE; if (rem > 0) pad += VEC_SIZE - rem; - unsigned int sizeBytes = (count + pad) * sizeof(float); + size_t sizeBytes = (count + pad) * sizeof(float); + size_t sizeIntBytes = (count + pad) * sizeof(int); int error = 0; /* Free memory if cache has already been allocated. */ @@ -159,28 +159,29 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c, free(c->vy); free(c->vz); free(c->h); + free(c->max_index); free(c->rho); free(c->grad_h); free(c->pOrho2); free(c->balsara); free(c->soundspeed); - free(c->max_d); } - error += posix_memalign((void **)&c->x, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->y, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->z, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->m, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->vx, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->vy, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->vz, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->h, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->max_d, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->rho, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->grad_h, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->pOrho2, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->balsara, CACHE_ALIGN, sizeBytes); - error += posix_memalign((void **)&c->soundspeed, CACHE_ALIGN, sizeBytes); + 
error += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->y, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->z, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->m, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->vx, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->vy, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->vz, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->h, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->max_index, SWIFT_CACHE_ALIGNMENT, + sizeIntBytes); + error += posix_memalign((void **)&c->rho, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->grad_h, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->pOrho2, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->balsara, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->soundspeed, SWIFT_CACHE_ALIGNMENT, sizeBytes); if (error != 0) error("Couldn't allocate cache, no. of particles: %d", (int)count); @@ -194,162 +195,100 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c, * @param ci_cache The cache. */ __attribute__((always_inline)) INLINE void cache_read_particles( - const struct cell *const ci, struct cache *const ci_cache) { + const struct cell *restrict const ci, + struct cache *restrict const ci_cache) { #if defined(GADGET2_SPH) -/* Shift the particles positions to a local frame so single precision can be - * used instead of double precision. */ -#if defined(WITH_VECTORIZATION) && defined(__ICC) -#pragma vector aligned -#endif + /* Let the compiler know that the data is aligned and create pointers to the + * arrays inside the cache. 
*/ + swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); + + const struct part *restrict parts = ci->parts; + double loc[3]; + loc[0] = ci->loc[0]; + loc[1] = ci->loc[1]; + loc[2] = ci->loc[2]; + + /* Shift the particles positions to a local frame so single precision can be + * used instead of double precision. */ for (int i = 0; i < ci->count; i++) { - ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0]; - ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1]; - ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2]; - ci_cache->h[i] = ci->parts[i].h; - - ci_cache->m[i] = ci->parts[i].mass; - ci_cache->vx[i] = ci->parts[i].v[0]; - ci_cache->vy[i] = ci->parts[i].v[1]; - ci_cache->vz[i] = ci->parts[i].v[2]; - - ci_cache->rho[i] = ci->parts[i].rho; - ci_cache->grad_h[i] = ci->parts[i].force.f; - ci_cache->pOrho2[i] = ci->parts[i].force.P_over_rho2; - ci_cache->balsara[i] = ci->parts[i].force.balsara; - ci_cache->soundspeed[i] = ci->parts[i].force.soundspeed; + x[i] = (float)(parts[i].x[0] - loc[0]); + y[i] = (float)(parts[i].x[1] - loc[1]); + z[i] = (float)(parts[i].x[2] - loc[2]); + h[i] = parts[i].h; + + m[i] = parts[i].mass; + vx[i] = parts[i].v[0]; + vy[i] = parts[i].v[1]; + vz[i] = parts[i].v[2]; } #endif } /** - * @brief Populate cache by reading in the particles from two cells in unsorted - * order. + * @brief Populate cache by reading in the particles in unsorted order. * - * @param ci The i #cell. - * @param cj The j #cell. 
- * @param ci_cache The cache for cell ci. - * @param cj_cache The cache for cell cj. - * @param shift The amount to shift the particle positions to account for BCs + * @param ci The #cell. + * @param ci_cache The cache. */ -__attribute__((always_inline)) INLINE void cache_read_two_cells( - const struct cell *const ci, const struct cell *const cj, - struct cache *const ci_cache, struct cache *const cj_cache, - const double *const shift) { - - /* Shift the particles positions to a local frame (ci frame) so single - * precision can be - * used instead of double precision. Also shift the cell ci, particles - * positions due to BCs but leave cell cj. */ - for (int i = 0; i < ci->count; i++) { - ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0] - shift[0]; - ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1] - shift[1]; - ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2] - shift[2]; - ci_cache->h[i] = ci->parts[i].h; - - ci_cache->m[i] = ci->parts[i].mass; - ci_cache->vx[i] = ci->parts[i].v[0]; - ci_cache->vy[i] = ci->parts[i].v[1]; - ci_cache->vz[i] = ci->parts[i].v[2]; - } - - for (int i = 0; i < cj->count; i++) { - cj_cache->x[i] = cj->parts[i].x[0] - ci->loc[0]; - cj_cache->y[i] = cj->parts[i].x[1] - ci->loc[1]; - cj_cache->z[i] = cj->parts[i].x[2] - ci->loc[2]; - cj_cache->h[i] = cj->parts[i].h; - - cj_cache->m[i] = cj->parts[i].mass; - cj_cache->vx[i] = cj->parts[i].v[0]; - cj_cache->vy[i] = cj->parts[i].v[1]; - cj_cache->vz[i] = cj->parts[i].v[2]; - } -} - -__attribute__((always_inline)) INLINE void cache_read_cell_sorted( - const struct cell *const ci, struct cache *const ci_cache, - const struct entry *restrict sort_i, double *const loc, - double *const shift) { - - int idx; -/* Shift the particles positions to a local frame (ci frame) so single precision - * can be - * used instead of double precision. Also shift the cell ci, particles positions - * due to BCs but leave cell cj. 
*/ -#if defined(WITH_VECTORIZATION) && defined(__ICC) -#pragma simd -#endif - for (int i = 0; i < ci->count; i++) { - idx = sort_i[i].i; - - ci_cache->x[i] = ci->parts[idx].x[0] - loc[0] - shift[0]; - ci_cache->y[i] = ci->parts[idx].x[1] - loc[1] - shift[1]; - ci_cache->z[i] = ci->parts[idx].x[2] - loc[2] - shift[2]; - ci_cache->h[i] = ci->parts[idx].h; - - ci_cache->m[i] = ci->parts[idx].mass; - ci_cache->vx[i] = ci->parts[idx].v[0]; - ci_cache->vy[i] = ci->parts[idx].v[1]; - ci_cache->vz[i] = ci->parts[idx].v[2]; - } -} +__attribute__((always_inline)) INLINE void cache_read_force_particles( + const struct cell *restrict const ci, + struct cache *restrict const ci_cache) { -/** - * @brief Populate cache by reading in the particles from two cells in sorted - * order. - * - * @param ci The i #cell. - * @param cj The j #cell. - * @param ci_cache The #cache for cell ci. - * @param cj_cache The #cache for cell cj. - * @param sort_i The array of sorted particle indices for cell ci. - * @param sort_j The array of sorted particle indices for cell ci. - * @param shift The amount to shift the particle positions to account for BCs - */ -__attribute__((always_inline)) INLINE void cache_read_two_cells_sorted( - const struct cell *const ci, const struct cell *const cj, - struct cache *const ci_cache, struct cache *const cj_cache, - const struct entry *restrict sort_i, const struct entry *restrict sort_j, - const double *const shift) { +#if defined(GADGET2_SPH) - int idx; -/* Shift the particles positions to a local frame (ci frame) so single precision - * can be - * used instead of double precision. Also shift the cell ci, particles positions - * due to BCs but leave cell cj. */ -#if defined(WITH_VECTORIZATION) && defined(__ICC) -#pragma simd -#endif + /* Let the compiler know that the data is aligned and create pointers to the + * arrays inside the cache. 
*/ + swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, rho, ci_cache->rho, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, grad_h, ci_cache->grad_h, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, pOrho2, ci_cache->pOrho2, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, balsara, ci_cache->balsara, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, soundspeed, ci_cache->soundspeed, SWIFT_CACHE_ALIGNMENT); + + const struct part *restrict parts = ci->parts; + double loc[3]; + loc[0] = ci->loc[0]; + loc[1] = ci->loc[1]; + loc[2] = ci->loc[2]; + + /* Shift the particles positions to a local frame so single precision can be + * used instead of double precision. 
*/ for (int i = 0; i < ci->count; i++) { - idx = sort_i[i].i; - ci_cache->x[i] = ci->parts[idx].x[0] - ci->loc[0] - shift[0]; - ci_cache->y[i] = ci->parts[idx].x[1] - ci->loc[1] - shift[1]; - ci_cache->z[i] = ci->parts[idx].x[2] - ci->loc[2] - shift[2]; - ci_cache->h[i] = ci->parts[idx].h; - - ci_cache->m[i] = ci->parts[idx].mass; - ci_cache->vx[i] = ci->parts[idx].v[0]; - ci_cache->vy[i] = ci->parts[idx].v[1]; - ci_cache->vz[i] = ci->parts[idx].v[2]; + x[i] = (float)(parts[i].x[0] - loc[0]); + y[i] = (float)(parts[i].x[1] - loc[1]); + z[i] = (float)(parts[i].x[2] - loc[2]); + h[i] = parts[i].h; + + m[i] = parts[i].mass; + vx[i] = parts[i].v[0]; + vy[i] = parts[i].v[1]; + vz[i] = parts[i].v[2]; + + rho[i] = parts[i].rho; + grad_h[i] = parts[i].force.f; + pOrho2[i] = parts[i].force.P_over_rho2; + balsara[i] = parts[i].force.balsara; + soundspeed[i] = parts[i].force.soundspeed; } -#if defined(WITH_VECTORIZATION) && defined(__ICC) -#pragma simd #endif - for (int i = 0; i < cj->count; i++) { - idx = sort_j[i].i; - cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0]; - cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1]; - cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2]; - cj_cache->h[i] = cj->parts[idx].h; - - cj_cache->m[i] = cj->parts[idx].mass; - cj_cache->vx[i] = cj->parts[idx].v[0]; - cj_cache->vy[i] = cj->parts[idx].v[1]; - cj_cache->vz[i] = cj->parts[idx].v[2]; - } } /** @@ -370,13 +309,13 @@ __attribute__((always_inline)) INLINE void cache_read_two_cells_sorted( * interaction. 
*/ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( - const struct cell *const ci, const struct cell *const cj, - struct cache *const ci_cache, struct cache *const cj_cache, - const struct entry *restrict sort_i, const struct entry *restrict sort_j, - const double *const shift, int *first_pi, int *last_pj, - const int num_vec_proc) { + const struct cell *restrict const ci, const struct cell *restrict const cj, + struct cache *restrict const ci_cache, + struct cache *restrict const cj_cache, const struct entry *restrict sort_i, + const struct entry *restrict sort_j, const double *restrict const shift, + int *first_pi, int *last_pj, const int num_vec_proc) { - int idx, ci_cache_idx; + int idx; /* Pad number of particles read to the vector size. */ int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE); if (rem != 0) { @@ -394,74 +333,97 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( int first_pi_align = *first_pi; int last_pj_align = *last_pj; - -/* Shift the particles positions to a local frame (ci frame) so single precision - * can be - * used instead of double precision. Also shift the cell ci, particles positions - * due to BCs but leave cell cj. */ -#if defined(WITH_VECTORIZATION) && defined(__ICC) -#pragma vector aligned -#endif - for (int i = first_pi_align; i < ci->count; i++) { - /* Make sure ci_cache is filled from the first element. 
*/ - ci_cache_idx = i - first_pi_align; - idx = sort_i[i].i; - ci_cache->x[ci_cache_idx] = ci->parts[idx].x[0] - ci->loc[0] - shift[0]; - ci_cache->y[ci_cache_idx] = ci->parts[idx].x[1] - ci->loc[1] - shift[1]; - ci_cache->z[ci_cache_idx] = ci->parts[idx].x[2] - ci->loc[2] - shift[2]; - ci_cache->h[ci_cache_idx] = ci->parts[idx].h; - - ci_cache->m[ci_cache_idx] = ci->parts[idx].mass; - ci_cache->vx[ci_cache_idx] = ci->parts[idx].v[0]; - ci_cache->vy[ci_cache_idx] = ci->parts[idx].v[1]; - ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2]; + const struct part *restrict parts_i = ci->parts; + const struct part *restrict parts_j = cj->parts; + double loc[3]; + loc[0] = ci->loc[0]; + loc[1] = ci->loc[1]; + loc[2] = ci->loc[2]; + + /* Let the compiler know that the data is aligned and create pointers to the + * arrays inside the cache. */ + swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); + + int ci_cache_count = ci->count - first_pi_align; + /* Shift the particles positions to a local frame (ci frame) so single + * precision + * can be + * used instead of double precision. Also shift the cell ci, particles + * positions + * due to BCs but leave cell cj. 
*/ + for (int i = 0; i < ci_cache_count; i++) { + idx = sort_i[i + first_pi_align].i; + x[i] = (float)(parts_i[idx].x[0] - loc[0] - shift[0]); + y[i] = (float)(parts_i[idx].x[1] - loc[1] - shift[1]); + z[i] = (float)(parts_i[idx].x[2] - loc[2] - shift[2]); + h[i] = parts_i[idx].h; + + m[i] = parts_i[idx].mass; + vx[i] = parts_i[idx].v[0]; + vy[i] = parts_i[idx].v[1]; + vz[i] = parts_i[idx].v[2]; } /* Pad cache with fake particles that exist outside the cell so will not * interact.*/ - float fake_pix = 2.0f * ci->parts[sort_i[ci->count - 1].i].x[0]; + float fake_pix = 2.0f * parts_i[sort_i[ci->count - 1].i].x[0]; for (int i = ci->count - first_pi_align; i < ci->count - first_pi_align + VEC_SIZE; i++) { - ci_cache->x[i] = fake_pix; - ci_cache->y[i] = 1.f; - ci_cache->z[i] = 1.f; - ci_cache->h[i] = 1.f; - - ci_cache->m[i] = 1.f; - ci_cache->vx[i] = 1.f; - ci_cache->vy[i] = 1.f; - ci_cache->vz[i] = 1.f; + x[i] = fake_pix; + y[i] = 1.f; + z[i] = 1.f; + h[i] = 1.f; + + m[i] = 1.f; + vx[i] = 1.f; + vy[i] = 1.f; + vz[i] = 1.f; } -#if defined(WITH_VECTORIZATION) && defined(__ICC) -#pragma vector aligned -#endif + /* Let the compiler know that the data is aligned and create pointers to the + * arrays inside the cache. 
*/ + swift_declare_aligned_ptr(float, xj, cj_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, yj, cj_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, zj, cj_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, hj, cj_cache->h, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, mj, cj_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vxj, cj_cache->vx, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vyj, cj_cache->vy, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT); + for (int i = 0; i <= last_pj_align; i++) { idx = sort_j[i].i; - cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0]; - cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1]; - cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2]; - cj_cache->h[i] = cj->parts[idx].h; - - cj_cache->m[i] = cj->parts[idx].mass; - cj_cache->vx[i] = cj->parts[idx].v[0]; - cj_cache->vy[i] = cj->parts[idx].v[1]; - cj_cache->vz[i] = cj->parts[idx].v[2]; + xj[i] = (float)(parts_j[idx].x[0] - loc[0]); + yj[i] = (float)(parts_j[idx].x[1] - loc[1]); + zj[i] = (float)(parts_j[idx].x[2] - loc[2]); + hj[i] = parts_j[idx].h; + + mj[i] = parts_j[idx].mass; + vxj[i] = parts_j[idx].v[0]; + vyj[i] = parts_j[idx].v[1]; + vzj[i] = parts_j[idx].v[2]; } /* Pad cache with fake particles that exist outside the cell so will not * interact.*/ float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0]; for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) { - cj_cache->x[i] = fake_pjx; - cj_cache->y[i] = 1.f; - cj_cache->z[i] = 1.f; - cj_cache->h[i] = 1.f; - - cj_cache->m[i] = 1.f; - cj_cache->vx[i] = 1.f; - cj_cache->vy[i] = 1.f; - cj_cache->vz[i] = 1.f; + xj[i] = fake_pjx; + yj[i] = 1.f; + zj[i] = 1.f; + hj[i] = 1.f; + + mj[i] = 1.f; + vxj[i] = 1.f; + vyj[i] = 1.f; + vzj[i] = 1.f; } } @@ -479,7 +441,7 @@ static INLINE void cache_clean(struct cache *c) { free(c->vy); free(c->vz); 
free(c->h); - free(c->max_d); + free(c->max_index); } } diff --git a/src/cell.c b/src/cell.c index dbccfd2f42cabf38417cd87de0450489240884be..4502f5d265dc68540e16ed0e51e681cf5733f842 100644 --- a/src/cell.c +++ b/src/cell.c @@ -941,53 +941,52 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, ptrdiff_t sparts_offset, * @brief Sanitizes the smoothing length values of cells by setting large * outliers to more sensible values. * - * We compute the mean and standard deviation of the smoothing lengths in - * logarithmic space and limit values to mean + 4 sigma. + * Each cell with <1000 particles will be processed. We limit h to be the size of + * the cell and replace 0s with a good estimate. * * @param c The cell. + * @param treated Has the cell already been sanitized at this level ? */ -void cell_sanitize(struct cell *c) { +void cell_sanitize(struct cell *c, int treated) { const int count = c->count; struct part *parts = c->parts; + float h_max = 0.f; - /* First collect some statistics */ - float h_mean = 0.f, h_mean2 = 0.f; - float h_min = FLT_MAX, h_max = 0.f; - for (int i = 0; i < count; ++i) { + /* Treat cells with <1000 particles */ + if (count < 1000 && !treated) { - const float h = logf(parts[i].h); - h_mean += h; - h_mean2 += h * h; - h_max = max(h_max, h); - h_min = min(h_min, h); - } - h_mean /= count; - h_mean2 /= count; - const float h_var = h_mean2 - h_mean * h_mean; - const float h_std = (h_var > 0.f) ? 
sqrtf(h_var) : 0.1f * h_mean; - - /* Choose a cut */ - const float h_limit = expf(h_mean + 4.f * h_std); - - /* Be verbose this is not innocuous */ - message("Cell properties: h_min= %f h_max= %f geometric mean= %f.", - expf(h_min), expf(h_max), expf(h_mean)); + /* Get an upper bound on h */ + const float upper_h_max = c->dmin / (1.2f * kernel_gamma); - if (c->h_max > h_limit) { + /* Apply it */ + for (int i = 0; i < count; ++i) { + if (parts[i].h == 0.f || parts[i].h > upper_h_max) + parts[i].h = upper_h_max; + } + } - message("Smoothing lengths will be limited to (mean + 4sigma)= %f.", - h_limit); + /* Recurse and gather the new h_max values */ + if (c->split) { - /* Apply the cut */ - for (int i = 0; i < count; ++i) parts->h = min(parts[i].h, h_limit); + for (int k = 0; k < 8; ++k) { + if (c->progeny[k] != NULL) { - c->h_max = h_limit; + /* Recurse */ + cell_sanitize(c->progeny[k], (count < 1000)); + /* And collect */ + h_max = max(h_max, c->progeny[k]->h_max); + } + } } else { - message("Smoothing lengths will not be limited."); + /* Get the new value of h_max */ + for (int i = 0; i < count; ++i) h_max = max(h_max, parts[i].h); } + + /* Record the change */ + c->h_max = h_max; } /** @@ -1280,7 +1279,11 @@ void cell_check_multipole(struct cell *c, void *data) { */ void cell_clean(struct cell *c) { - free(c->sort); + for (int i = 0; i < 13; i++) + if (c->sort[i] != NULL) { + free(c->sort[i]); + c->sort[i] = NULL; + } /* Recurse */ for (int k = 0; k < 8; k++) @@ -1316,6 +1319,355 @@ int cell_is_drift_needed(struct cell *c, const struct engine *e) { return 0; } +/** + * @brief Clear the drift flags on the given cell. + */ +void cell_clear_drift_flags(struct cell *c, void *data) { + c->do_drift = 0; + c->do_sub_drift = 0; +} + +/** + * @brief Activate the drifts on the given cell. + */ +void cell_activate_drift_part(struct cell *c, struct scheduler *s) { + + /* If this cell is already marked for drift, quit early. 
*/ + if (c->do_drift) return; + + /* Mark this cell for drifting. */ + c->do_drift = 1; + + /* Set the do_sub_drifts all the way up and activate the super drift + if this has not yet been done. */ + if (c == c->super) { + scheduler_activate(s, c->drift_part); + } else { + for (struct cell *parent = c->parent; + parent != NULL && !parent->do_sub_drift; parent = parent->parent) { + parent->do_sub_drift = 1; + if (parent == c->super) { + scheduler_activate(s, parent->drift_part); + break; + } + } + } +} + +/** + * @brief Activate the sorts up a cell hierarchy. + */ + +void cell_activate_sorts_up(struct cell *c, struct scheduler *s) { + if (c == c->super) { + scheduler_activate(s, c->sorts); + if (c->nodeID == engine_rank) cell_activate_drift_part(c, s); + } else { + for (struct cell *parent = c->parent; + parent != NULL && !parent->do_sub_sort; parent = parent->parent) { + parent->do_sub_sort = 1; + if (parent == c->super) { + scheduler_activate(s, parent->sorts); + if (parent->nodeID == engine_rank) cell_activate_drift_part(parent, s); + break; + } + } + } +} + +/** + * @brief Activate the sorts on a given cell, if needed. + */ +void cell_activate_sorts(struct cell *c, int sid, struct scheduler *s) { + + /* Do we need to re-sort? */ + if (c->dx_max_sort > space_maxreldx * c->dmin) { + + /* Climb up the tree to active the sorts in that direction */ + for (struct cell *finger = c; finger != NULL; finger = finger->parent) { + if (finger->requires_sorts) { + atomic_or(&finger->do_sort, finger->requires_sorts); + cell_activate_sorts_up(finger, s); + } + finger->sorted = 0; + } + } + + /* Has this cell been sorted at all for the given sid? */ + if (!(c->sorted & (1 << sid)) || c->nodeID != engine_rank) { + atomic_or(&c->do_sort, (1 << sid)); + cell_activate_sorts_up(c, s); + } +} + +/** + * @brief Traverse a sub-cell task and activate the sort tasks along the way. 
+ */ +void cell_activate_subcell_tasks(struct cell *ci, struct cell *cj, + struct scheduler *s) { + const struct engine *e = s->space->e; + + /* Store the current dx_max and h_max values. */ + ci->dx_max_old = ci->dx_max_part; + ci->h_max_old = ci->h_max; + if (cj != NULL) { + cj->dx_max_old = cj->dx_max_part; + cj->h_max_old = cj->h_max; + } + + /* Self interaction? */ + if (cj == NULL) { + /* Do anything? */ + if (!cell_is_active(ci, e)) return; + + /* Recurse? */ + if (cell_can_recurse_in_self_task(ci)) { + + /* Loop over all progenies and pairs of progenies */ + for (int j = 0; j < 8; j++) { + if (ci->progeny[j] != NULL) { + cell_activate_subcell_tasks(ci->progeny[j], NULL, s); + for (int k = j + 1; k < 8; k++) + if (ci->progeny[k] != NULL) + cell_activate_subcell_tasks(ci->progeny[j], ci->progeny[k], s); + } + } + } else { + + /* We have reached the bottom of the tree: activate drift */ + cell_activate_drift_part(ci, s); + } + } + + /* Otherwise, pair interaction, recurse? */ + else if (cell_can_recurse_in_pair_task(ci) && + cell_can_recurse_in_pair_task(cj)) { + + /* Get the type of pair if not specified explicitly. */ + double shift[3]; + int sid = space_getsid(s->space, &ci, &cj, shift); + + /* Different types of flags. */ + switch (sid) { + + /* Regular sub-cell interactions of a single cell. 
*/ + case 0: /* ( 1 , 1 , 1 ) */ + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[0], s); + break; + + case 1: /* ( 1 , 1 , 0 ) */ + if (ci->progeny[6] != NULL && cj->progeny[0] != NULL) + cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[0], s); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[1], s); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[0], s); + if (ci->progeny[7] != NULL && cj->progeny[1] != NULL) + cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[1], s); + break; + + case 2: /* ( 1 , 1 , -1 ) */ + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[1], s); + break; + + case 3: /* ( 1 , 0 , 1 ) */ + if (ci->progeny[5] != NULL && cj->progeny[0] != NULL) + cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[0], s); + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[2], s); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[0], s); + if (ci->progeny[7] != NULL && cj->progeny[2] != NULL) + cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[2], s); + break; + + case 4: /* ( 1 , 0 , 0 ) */ + if (ci->progeny[4] != NULL && cj->progeny[0] != NULL) + cell_activate_subcell_tasks(ci->progeny[4], cj->progeny[0], s); + if (ci->progeny[4] != NULL && cj->progeny[1] != NULL) + cell_activate_subcell_tasks(ci->progeny[4], cj->progeny[1], s); + if (ci->progeny[4] != NULL && cj->progeny[2] != NULL) + cell_activate_subcell_tasks(ci->progeny[4], cj->progeny[2], s); + if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) + cell_activate_subcell_tasks(ci->progeny[4], cj->progeny[3], s); + if (ci->progeny[5] != NULL && cj->progeny[0] != NULL) + 
cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[0], s); + if (ci->progeny[5] != NULL && cj->progeny[1] != NULL) + cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[1], s); + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[2], s); + if (ci->progeny[5] != NULL && cj->progeny[3] != NULL) + cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[3], s); + if (ci->progeny[6] != NULL && cj->progeny[0] != NULL) + cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[0], s); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[1], s); + if (ci->progeny[6] != NULL && cj->progeny[2] != NULL) + cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[2], s); + if (ci->progeny[6] != NULL && cj->progeny[3] != NULL) + cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[3], s); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[0], s); + if (ci->progeny[7] != NULL && cj->progeny[1] != NULL) + cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[1], s); + if (ci->progeny[7] != NULL && cj->progeny[2] != NULL) + cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[2], s); + if (ci->progeny[7] != NULL && cj->progeny[3] != NULL) + cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[3], s); + break; + + case 5: /* ( 1 , 0 , -1 ) */ + if (ci->progeny[4] != NULL && cj->progeny[1] != NULL) + cell_activate_subcell_tasks(ci->progeny[4], cj->progeny[1], s); + if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) + cell_activate_subcell_tasks(ci->progeny[4], cj->progeny[3], s); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[1], s); + if (ci->progeny[6] != NULL && cj->progeny[3] != NULL) + cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[3], s); + break; + + case 6: /* ( 1 , -1 , 1 ) */ + if (ci->progeny[5] 
!= NULL && cj->progeny[2] != NULL) + cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[2], s); + break; + + case 7: /* ( 1 , -1 , 0 ) */ + if (ci->progeny[4] != NULL && cj->progeny[2] != NULL) + cell_activate_subcell_tasks(ci->progeny[4], cj->progeny[2], s); + if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) + cell_activate_subcell_tasks(ci->progeny[4], cj->progeny[3], s); + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[2], s); + if (ci->progeny[5] != NULL && cj->progeny[3] != NULL) + cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[3], s); + break; + + case 8: /* ( 1 , -1 , -1 ) */ + if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) + cell_activate_subcell_tasks(ci->progeny[4], cj->progeny[3], s); + break; + + case 9: /* ( 0 , 1 , 1 ) */ + if (ci->progeny[3] != NULL && cj->progeny[0] != NULL) + cell_activate_subcell_tasks(ci->progeny[3], cj->progeny[0], s); + if (ci->progeny[3] != NULL && cj->progeny[4] != NULL) + cell_activate_subcell_tasks(ci->progeny[3], cj->progeny[4], s); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[0], s); + if (ci->progeny[7] != NULL && cj->progeny[4] != NULL) + cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[4], s); + break; + + case 10: /* ( 0 , 1 , 0 ) */ + if (ci->progeny[2] != NULL && cj->progeny[0] != NULL) + cell_activate_subcell_tasks(ci->progeny[2], cj->progeny[0], s); + if (ci->progeny[2] != NULL && cj->progeny[1] != NULL) + cell_activate_subcell_tasks(ci->progeny[2], cj->progeny[1], s); + if (ci->progeny[2] != NULL && cj->progeny[4] != NULL) + cell_activate_subcell_tasks(ci->progeny[2], cj->progeny[4], s); + if (ci->progeny[2] != NULL && cj->progeny[5] != NULL) + cell_activate_subcell_tasks(ci->progeny[2], cj->progeny[5], s); + if (ci->progeny[3] != NULL && cj->progeny[0] != NULL) + cell_activate_subcell_tasks(ci->progeny[3], cj->progeny[0], s); + if (ci->progeny[3] 
!= NULL && cj->progeny[1] != NULL) + cell_activate_subcell_tasks(ci->progeny[3], cj->progeny[1], s); + if (ci->progeny[3] != NULL && cj->progeny[4] != NULL) + cell_activate_subcell_tasks(ci->progeny[3], cj->progeny[4], s); + if (ci->progeny[3] != NULL && cj->progeny[5] != NULL) + cell_activate_subcell_tasks(ci->progeny[3], cj->progeny[5], s); + if (ci->progeny[6] != NULL && cj->progeny[0] != NULL) + cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[0], s); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[1], s); + if (ci->progeny[6] != NULL && cj->progeny[4] != NULL) + cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[4], s); + if (ci->progeny[6] != NULL && cj->progeny[5] != NULL) + cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[5], s); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[0], s); + if (ci->progeny[7] != NULL && cj->progeny[1] != NULL) + cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[1], s); + if (ci->progeny[7] != NULL && cj->progeny[4] != NULL) + cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[4], s); + if (ci->progeny[7] != NULL && cj->progeny[5] != NULL) + cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[5], s); + break; + + case 11: /* ( 0 , 1 , -1 ) */ + if (ci->progeny[2] != NULL && cj->progeny[1] != NULL) + cell_activate_subcell_tasks(ci->progeny[2], cj->progeny[1], s); + if (ci->progeny[2] != NULL && cj->progeny[5] != NULL) + cell_activate_subcell_tasks(ci->progeny[2], cj->progeny[5], s); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[1], s); + if (ci->progeny[6] != NULL && cj->progeny[5] != NULL) + cell_activate_subcell_tasks(ci->progeny[6], cj->progeny[5], s); + break; + + case 12: /* ( 0 , 0 , 1 ) */ + if (ci->progeny[1] != NULL && cj->progeny[0] != NULL) + cell_activate_subcell_tasks(ci->progeny[1], 
cj->progeny[0], s); + if (ci->progeny[1] != NULL && cj->progeny[2] != NULL) + cell_activate_subcell_tasks(ci->progeny[1], cj->progeny[2], s); + if (ci->progeny[1] != NULL && cj->progeny[4] != NULL) + cell_activate_subcell_tasks(ci->progeny[1], cj->progeny[4], s); + if (ci->progeny[1] != NULL && cj->progeny[6] != NULL) + cell_activate_subcell_tasks(ci->progeny[1], cj->progeny[6], s); + if (ci->progeny[3] != NULL && cj->progeny[0] != NULL) + cell_activate_subcell_tasks(ci->progeny[3], cj->progeny[0], s); + if (ci->progeny[3] != NULL && cj->progeny[2] != NULL) + cell_activate_subcell_tasks(ci->progeny[3], cj->progeny[2], s); + if (ci->progeny[3] != NULL && cj->progeny[4] != NULL) + cell_activate_subcell_tasks(ci->progeny[3], cj->progeny[4], s); + if (ci->progeny[3] != NULL && cj->progeny[6] != NULL) + cell_activate_subcell_tasks(ci->progeny[3], cj->progeny[6], s); + if (ci->progeny[5] != NULL && cj->progeny[0] != NULL) + cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[0], s); + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[2], s); + if (ci->progeny[5] != NULL && cj->progeny[4] != NULL) + cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[4], s); + if (ci->progeny[5] != NULL && cj->progeny[6] != NULL) + cell_activate_subcell_tasks(ci->progeny[5], cj->progeny[6], s); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[0], s); + if (ci->progeny[7] != NULL && cj->progeny[2] != NULL) + cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[2], s); + if (ci->progeny[7] != NULL && cj->progeny[4] != NULL) + cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[4], s); + if (ci->progeny[7] != NULL && cj->progeny[6] != NULL) + cell_activate_subcell_tasks(ci->progeny[7], cj->progeny[6], s); + break; + } + + } + + /* Otherwise, activate the sorts and drifts. 
*/ + else if (cell_is_active(ci, e) || cell_is_active(cj, e)) { + + /* Get the type of pair if not specified explicitly. */ + double shift[3]; + int sid = space_getsid(s->space, &ci, &cj, shift); + + /* We are going to interact this pair, so store some values. */ + atomic_or(&ci->requires_sorts, 1 << sid); + atomic_or(&cj->requires_sorts, 1 << sid); + ci->dx_max_sort_old = ci->dx_max_sort; + cj->dx_max_sort_old = cj->dx_max_sort; + + /* Activate the drifts if the cells are local. */ + if (ci->nodeID == engine_rank) cell_activate_drift_part(ci, s); + if (cj->nodeID == engine_rank) cell_activate_drift_part(cj, s); + + /* Do we need to sort the cells? */ + cell_activate_sorts(ci, sid, s); + cell_activate_sorts(cj, sid, s); + } +} + /** * @brief Un-skips all the tasks associated with a given cell and checks * if the space needs to be rebuilt. @@ -1327,10 +1679,7 @@ int cell_is_drift_needed(struct cell *c, const struct engine *e) { */ int cell_unskip_tasks(struct cell *c, struct scheduler *s) { -#ifdef WITH_MPI struct engine *e = s->space->e; -#endif - int rebuild = 0; /* Un-skip the density tasks involved with this cell. 
*/ @@ -1338,33 +1687,31 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) { struct task *t = l->t; struct cell *ci = t->ci; struct cell *cj = t->cj; - scheduler_activate(s, t); - /* Set the correct sorting flags */ - if (t->type == task_type_pair) { - if (ci->dx_max_sort > space_maxreldx * ci->dmin) { - for (struct cell *finger = ci; finger != NULL; finger = finger->parent) - finger->sorted = 0; - } - if (cj->dx_max_sort > space_maxreldx * cj->dmin) { - for (struct cell *finger = cj; finger != NULL; finger = finger->parent) - finger->sorted = 0; - } - if (!(ci->sorted & (1 << t->flags))) { -#ifdef SWIFT_DEBUG_CHECKS - if (!(ci->sorts->flags & (1 << t->flags))) - error("bad flags in sort task."); -#endif - scheduler_activate(s, ci->sorts); - if (ci->nodeID == engine_rank) scheduler_activate(s, ci->drift_part); + /* Only activate tasks that involve a local active cell. */ + if ((cell_is_active(ci, e) && ci->nodeID == engine_rank) || + (cj != NULL && cell_is_active(cj, e) && cj->nodeID == engine_rank)) { + scheduler_activate(s, t); + + /* Set the correct sorting flags */ + if (t->type == task_type_pair) { + /* Store some values. */ + atomic_or(&ci->requires_sorts, 1 << t->flags); + atomic_or(&cj->requires_sorts, 1 << t->flags); + ci->dx_max_sort_old = ci->dx_max_sort; + cj->dx_max_sort_old = cj->dx_max_sort; + + /* Activate the drift tasks. */ + if (ci->nodeID == engine_rank) cell_activate_drift_part(ci, s); + if (cj->nodeID == engine_rank) cell_activate_drift_part(cj, s); + + /* Check the sorts and activate them if needed. */ + cell_activate_sorts(ci, t->flags, s); + cell_activate_sorts(cj, t->flags, s); } - if (!(cj->sorted & (1 << t->flags))) { -#ifdef SWIFT_DEBUG_CHECKS - if (!(cj->sorts->flags & (1 << t->flags))) - error("bad flags in sort task."); -#endif - scheduler_activate(s, cj->sorts); - if (cj->nodeID == engine_rank) scheduler_activate(s, cj->drift_part); + /* Store current values of dx_max and h_max. 
*/ + else if (t->type == task_type_sub_pair || t->type == task_type_sub_self) { + cell_activate_subcell_tasks(t->ci, t->cj, s); } } @@ -1373,55 +1720,60 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) { /* Check whether there was too much particle motion, i.e. the cell neighbour conditions were violated. */ - if (max(ci->h_max, cj->h_max) + ci->dx_max_part + cj->dx_max_part > - cj->dmin) - rebuild = 1; + if (cell_need_rebuild_for_pair(ci, cj)) rebuild = 1; #ifdef WITH_MPI - /* Activate the send/recv flags. */ + /* Activate the send/recv tasks. */ if (ci->nodeID != engine_rank) { - /* Activate the tasks to recv foreign cell ci's data. */ - scheduler_activate(s, ci->recv_xv); - if (cell_is_active(ci, e)) { - scheduler_activate(s, ci->recv_rho); + /* If the local cell is active, receive data from the foreign cell. */ + if (cell_is_active(cj, e)) { + scheduler_activate(s, ci->recv_xv); + if (cell_is_active(ci, e)) { + scheduler_activate(s, ci->recv_rho); #ifdef EXTRA_HYDRO_LOOP - scheduler_activate(s, ci->recv_gradient); + scheduler_activate(s, ci->recv_gradient); #endif - scheduler_activate(s, ci->recv_ti); + } } - /* Look for the local cell cj's send tasks. */ - struct link *l = NULL; - for (l = cj->send_xv; l != NULL && l->t->cj->nodeID != ci->nodeID; - l = l->next) - ; - if (l == NULL) error("Missing link to send_xv task."); - scheduler_activate(s, l->t); - - /* Drift both cells, the foreign one at the level which it is sent. */ - if (l->t->ci->drift_part) - scheduler_activate(s, l->t->ci->drift_part); - else - error("Drift task missing !"); - if (t->type == task_type_pair) scheduler_activate(s, cj->drift_part); - - if (cell_is_active(cj, e)) { + /* If the foreign cell is active, we want its ti_end values. */ + if (cell_is_active(ci, e)) scheduler_activate(s, ci->recv_ti); - for (l = cj->send_rho; l != NULL && l->t->cj->nodeID != ci->nodeID; + /* Look for the local cell cj's send tasks. 
*/ + if (cell_is_active(ci, e)) { + struct link *l = NULL; + for (l = cj->send_xv; l != NULL && l->t->cj->nodeID != ci->nodeID; l = l->next) ; - if (l == NULL) error("Missing link to send_rho task."); + if (l == NULL) error("Missing link to send_xv task."); scheduler_activate(s, l->t); + /* Drift the cell which will be sent; note that not all sent + particles will be drifted, only those that are needed. */ + cell_activate_drift_part(cj, s); + + if (cell_is_active(cj, e)) { + struct link *l = NULL; + for (l = cj->send_rho; l != NULL && l->t->cj->nodeID != ci->nodeID; + l = l->next) + ; + if (l == NULL) error("Missing link to send_rho task."); + scheduler_activate(s, l->t); + #ifdef EXTRA_HYDRO_LOOP - for (l = cj->send_gradient; - l != NULL && l->t->cj->nodeID != ci->nodeID; l = l->next) - ; - if (l == NULL) error("Missing link to send_gradient task."); - scheduler_activate(s, l->t); + for (l = cj->send_gradient; + l != NULL && l->t->cj->nodeID != ci->nodeID; l = l->next) + ; + if (l == NULL) error("Missing link to send_gradient task."); + scheduler_activate(s, l->t); #endif + } + } + /* If the local cell is active, send its ti_end values. */ + if (cell_is_active(cj, e)) { + struct link *l = NULL; for (l = cj->send_ti; l != NULL && l->t->cj->nodeID != ci->nodeID; l = l->next) ; @@ -1431,87 +1783,92 @@ int cell_unskip_tasks(struct cell *c, struct scheduler *s) { } else if (cj->nodeID != engine_rank) { - /* Activate the tasks to recv foreign cell cj's data. */ - scheduler_activate(s, cj->recv_xv); - if (cell_is_active(cj, e)) { - scheduler_activate(s, cj->recv_rho); + /* If the local cell is active, receive data from the foreign cell. 
*/ + if (cell_is_active(ci, e)) { + scheduler_activate(s, cj->recv_xv); + if (cell_is_active(cj, e)) { + scheduler_activate(s, cj->recv_rho); #ifdef EXTRA_HYDRO_LOOP - scheduler_activate(s, cj->recv_gradient); + scheduler_activate(s, cj->recv_gradient); #endif - scheduler_activate(s, cj->recv_ti); + } } - /* Look for the local cell ci's send tasks. */ - struct link *l = NULL; - for (l = ci->send_xv; l != NULL && l->t->cj->nodeID != cj->nodeID; - l = l->next) - ; - if (l == NULL) error("Missing link to send_xv task."); - scheduler_activate(s, l->t); - - /* Drift both cells, the foreign one at the level which it is sent. */ - if (l->t->ci->drift_part) - scheduler_activate(s, l->t->ci->drift_part); - else - error("Drift task missing !"); - if (t->type == task_type_pair) scheduler_activate(s, ci->drift_part); - - if (cell_is_active(ci, e)) { + /* If the foreign cell is active, we want its ti_end values. */ + if (cell_is_active(cj, e)) scheduler_activate(s, cj->recv_ti); - for (l = ci->send_rho; l != NULL && l->t->cj->nodeID != cj->nodeID; + /* Look for the local cell ci's send tasks. */ + if (cell_is_active(cj, e)) { + struct link *l = NULL; + for (l = ci->send_xv; l != NULL && l->t->cj->nodeID != cj->nodeID; l = l->next) ; - if (l == NULL) error("Missing link to send_rho task."); + if (l == NULL) error("Missing link to send_xv task."); scheduler_activate(s, l->t); + /* Drift the cell which will be sent; note that not all sent + particles will be drifted, only those that are needed. 
*/ + cell_activate_drift_part(ci, s); + + if (cell_is_active(ci, e)) { + + struct link *l = NULL; + for (l = ci->send_rho; l != NULL && l->t->cj->nodeID != cj->nodeID; + l = l->next) + ; + if (l == NULL) error("Missing link to send_rho task."); + scheduler_activate(s, l->t); + #ifdef EXTRA_HYDRO_LOOP - for (l = ci->send_gradient; - l != NULL && l->t->cj->nodeID != cj->nodeID; l = l->next) - ; - if (l == NULL) error("Missing link to send_gradient task."); - scheduler_activate(s, l->t); + for (l = ci->send_gradient; + l != NULL && l->t->cj->nodeID != cj->nodeID; l = l->next) + ; + if (l == NULL) error("Missing link to send_gradient task."); + scheduler_activate(s, l->t); #endif + } + } + /* If the local cell is active, send its ti_end values. */ + if (cell_is_active(ci, e)) { + struct link *l = NULL; for (l = ci->send_ti; l != NULL && l->t->cj->nodeID != cj->nodeID; l = l->next) ; if (l == NULL) error("Missing link to send_ti task."); scheduler_activate(s, l->t); } - } else if (t->type == task_type_pair) { - scheduler_activate(s, ci->drift_part); - scheduler_activate(s, cj->drift_part); - } -#else - if (t->type == task_type_pair) { - scheduler_activate(s, ci->drift_part); - scheduler_activate(s, cj->drift_part); } #endif } } /* Unskip all the other task types. 
*/ - for (struct link *l = c->gradient; l != NULL; l = l->next) - scheduler_activate(s, l->t); - for (struct link *l = c->force; l != NULL; l = l->next) - scheduler_activate(s, l->t); - for (struct link *l = c->grav; l != NULL; l = l->next) - scheduler_activate(s, l->t); - if (c->extra_ghost != NULL) scheduler_activate(s, c->extra_ghost); - if (c->ghost != NULL) scheduler_activate(s, c->ghost); - if (c->init_grav != NULL) scheduler_activate(s, c->init_grav); - if (c->drift_part != NULL) scheduler_activate(s, c->drift_part); - if (c->drift_gpart != NULL) scheduler_activate(s, c->drift_gpart); - if (c->kick1 != NULL) scheduler_activate(s, c->kick1); - if (c->kick2 != NULL) scheduler_activate(s, c->kick2); - if (c->timestep != NULL) scheduler_activate(s, c->timestep); - if (c->grav_ghost[0] != NULL) scheduler_activate(s, c->grav_ghost[0]); - if (c->grav_ghost[1] != NULL) scheduler_activate(s, c->grav_ghost[1]); - if (c->grav_down != NULL) scheduler_activate(s, c->grav_down); - if (c->grav_long_range != NULL) scheduler_activate(s, c->grav_long_range); - if (c->cooling != NULL) scheduler_activate(s, c->cooling); - if (c->sourceterms != NULL) scheduler_activate(s, c->sourceterms); + if (c->nodeID == engine_rank && cell_is_active(c, e)) { + + for (struct link *l = c->gradient; l != NULL; l = l->next) + scheduler_activate(s, l->t); + for (struct link *l = c->force; l != NULL; l = l->next) + scheduler_activate(s, l->t); + for (struct link *l = c->grav; l != NULL; l = l->next) + scheduler_activate(s, l->t); + + if (c->extra_ghost != NULL) scheduler_activate(s, c->extra_ghost); + if (c->ghost_in != NULL) scheduler_activate(s, c->ghost_in); + if (c->ghost_out != NULL) scheduler_activate(s, c->ghost_out); + if (c->ghost != NULL) scheduler_activate(s, c->ghost); + if (c->init_grav != NULL) scheduler_activate(s, c->init_grav); + if (c->drift_gpart != NULL) scheduler_activate(s, c->drift_gpart); + if (c->kick1 != NULL) scheduler_activate(s, c->kick1); + if (c->kick2 != NULL) 
scheduler_activate(s, c->kick2); + if (c->timestep != NULL) scheduler_activate(s, c->timestep); + if (c->grav_ghost[0] != NULL) scheduler_activate(s, c->grav_ghost[0]); + if (c->grav_ghost[1] != NULL) scheduler_activate(s, c->grav_ghost[1]); + if (c->grav_down != NULL) scheduler_activate(s, c->grav_down); + if (c->grav_long_range != NULL) scheduler_activate(s, c->grav_long_range); + if (c->cooling != NULL) scheduler_activate(s, c->cooling); + if (c->sourceterms != NULL) scheduler_activate(s, c->sourceterms); + } return rebuild; } @@ -1536,13 +1893,21 @@ void cell_set_super(struct cell *c, struct cell *super) { if (c->progeny[k] != NULL) cell_set_super(c->progeny[k], super); } +void cell_set_super_mapper(void *map_data, int num_elements, void *extra_data) { + for (int ind = 0; ind < num_elements; ind++) { + struct cell *c = &((struct cell *)map_data)[ind]; + cell_set_super(c, NULL); + } +} + /** * @brief Recursively drifts the #part in a cell hierarchy. * * @param c The #cell. * @param e The #engine (to get ti_current). + * @param force Drift the particles irrespective of the #cell flags. */ -void cell_drift_part(struct cell *c, const struct engine *e) { +void cell_drift_part(struct cell *c, const struct engine *e, int force) { const float hydro_h_max = e->hydro_properties->h_max; const double timeBase = e->timeBase; @@ -1557,11 +1922,19 @@ void cell_drift_part(struct cell *c, const struct engine *e) { float dx_max_sort = 0.0f, dx2_max_sort = 0.f; float cell_h_max = 0.f; + /* Drift irrespective of cell flags? */ + force |= c->do_drift; + +#ifdef SWIFT_DEBUG_CHECKS + /* Check that we only drift local cells. */ + if (c->nodeID != engine_rank) error("Drifting a foreign cell is nope."); + /* Check that we are actually going to move forward. */ if (ti_current < ti_old_part) error("Attempt to drift to the past"); +#endif // SWIFT_DEBUG_CHECKS /* Are we not in a leaf ? 
*/ - if (c->split) { + if (c->split && (force || c->do_sub_drift)) { /* Loop over the progeny and collect their data. */ for (int k = 0; k < 8; k++) @@ -1569,7 +1942,7 @@ void cell_drift_part(struct cell *c, const struct engine *e) { struct cell *cp = c->progeny[k]; /* Collect */ - cell_drift_part(cp, e); + cell_drift_part(cp, e, force); /* Update */ dx_max = max(dx_max, cp->dx_max_part); @@ -1577,7 +1950,15 @@ void cell_drift_part(struct cell *c, const struct engine *e) { cell_h_max = max(cell_h_max, cp->h_max); } - } else if (ti_current > ti_old_part) { + /* Store the values */ + c->h_max = cell_h_max; + c->dx_max_part = dx_max; + c->dx_max_sort = dx_max_sort; + + /* Update the time of the last drift */ + c->ti_old_part = ti_current; + + } else if (!c->split && force && ti_current > ti_old_part) { /* Loop over all the gas particles in the cell */ const size_t nr_parts = c->count; @@ -1616,20 +1997,18 @@ void cell_drift_part(struct cell *c, const struct engine *e) { dx_max = sqrtf(dx2_max); dx_max_sort = sqrtf(dx2_max_sort); - } else { + /* Store the values */ + c->h_max = cell_h_max; + c->dx_max_part = dx_max; + c->dx_max_sort = dx_max_sort; - cell_h_max = c->h_max; - dx_max = c->dx_max_part; - dx_max_sort = c->dx_max_sort; + /* Update the time of the last drift */ + c->ti_old_part = ti_current; } - /* Store the values */ - c->h_max = cell_h_max; - c->dx_max_part = dx_max; - c->dx_max_sort = dx_max_sort; - - /* Update the time of the last drift */ - c->ti_old_part = ti_current; + /* Clear the drift flags. */ + c->do_drift = 0; + c->do_sub_drift = 0; } /** diff --git a/src/cell.h b/src/cell.h index 2e32533402110040310be88629d0fb33f0128c62..e97400623dbb7a66aee981d21883fe4d8f73406a 100644 --- a/src/cell.h +++ b/src/cell.h @@ -31,15 +31,16 @@ /* Local includes. 
*/ #include "align.h" +#include "kernel_hydro.h" #include "lock.h" #include "multipole.h" #include "part.h" +#include "space.h" #include "task.h" #include "timeline.h" /* Avoid cyclic inclusions */ struct engine; -struct space; struct scheduler; /* Max tag size set to 2^29 to take into account some MPI implementations @@ -122,7 +123,7 @@ struct cell { struct spart *sparts; /*! Pointer for the sorted indices. */ - struct entry *sort; + struct entry *sort[13]; /*! Pointers to the next level of cells. */ struct cell *progeny[8]; @@ -151,7 +152,9 @@ struct cell { /*! The multipole initialistation task */ struct task *init_grav; - /*! The ghost task */ + /*! The ghost tasks */ + struct task *ghost_in; + struct task *ghost_out; struct task *ghost; /*! The extra ghost task for complex hydro schemes */ @@ -236,9 +239,6 @@ struct cell { /*! Maximum beginning of (integer) time step in this cell. */ integertime_t ti_beg_max; - /*! Last (integer) time the cell's sort arrays were updated. */ - integertime_t ti_sort; - /*! Last (integer) time the cell's part were drifted forward in time. */ integertime_t ti_old_part; @@ -269,9 +269,6 @@ struct cell { /*! Nr of #spart in this cell. */ int scount; - /*! The size of the sort array */ - int sortsize; - /*! Bit-mask indicating the sorted directions */ unsigned int sorted; @@ -326,7 +323,30 @@ struct cell { /*! The maximal depth of this cell and its progenies */ char maxdepth; + /*! Values of dx_max and h_max before the drifts, used for sub-cell tasks. */ + float dx_max_old; + float h_max_old; + float dx_max_sort_old; + + /* Bit mask of sort directions that will be needed in the next timestep. */ + unsigned int requires_sorts; + + /*! Does this cell need to be drifted? */ + char do_drift; + + /*! Do any of this cell's sub-cells need to be drifted? */ + char do_sub_drift; + + /*! Bit mask of sorts that need to be computed for this cell. */ + unsigned int do_sort; + + /*! Do any of this cell's sub-cells need to be sorted? 
*/ + char do_sub_sort; + #ifdef SWIFT_DEBUG_CHECKS + /*! Last (integer) time the cell's sort arrays were updated. */ + integertime_t ti_sort; + /*! The list of tasks that have been executed on this cell */ char tasks_executed[64]; @@ -344,7 +364,7 @@ struct cell { void cell_split(struct cell *c, ptrdiff_t parts_offset, ptrdiff_t sparts_offset, struct cell_buff *buff, struct cell_buff *sbuff, struct cell_buff *gbuff); -void cell_sanitize(struct cell *c); +void cell_sanitize(struct cell *c, int treated); int cell_locktree(struct cell *c); void cell_unlocktree(struct cell *c); int cell_glocktree(struct cell *c); @@ -373,10 +393,103 @@ void cell_reset_task_counters(struct cell *c); int cell_is_drift_needed(struct cell *c, const struct engine *e); int cell_unskip_tasks(struct cell *c, struct scheduler *s); void cell_set_super(struct cell *c, struct cell *super); -void cell_drift_part(struct cell *c, const struct engine *e); +void cell_drift_part(struct cell *c, const struct engine *e, int force); void cell_drift_gpart(struct cell *c, const struct engine *e); void cell_drift_multipole(struct cell *c, const struct engine *e); void cell_drift_all_multipoles(struct cell *c, const struct engine *e); void cell_check_timesteps(struct cell *c); +void cell_store_pre_drift_values(struct cell *c); +void cell_activate_subcell_tasks(struct cell *ci, struct cell *cj, + struct scheduler *s); +void cell_activate_drift_part(struct cell *c, struct scheduler *s); +void cell_activate_sorts(struct cell *c, int sid, struct scheduler *s); +void cell_clear_drift_flags(struct cell *c, void *data); +void cell_set_super_mapper(void *map_data, int num_elements, void *extra_data); + +/* Inlined functions (for speed). */ + +/** + * @brief Can a sub-pair hydro task recurse to a lower level based + * on the status of the particles in the cell. + * + * @param c The #cell. 
+ */ +__attribute__((always_inline)) INLINE static int cell_can_recurse_in_pair_task( + const struct cell *c) { + + /* Is the cell split ? */ + /* If so, is the cut-off radius plus the max distance the parts have moved */ + /* smaller than the sub-cell sizes ? */ + /* Note: We use the _old values as these might have been updated by a drift */ + return c->split && + ((kernel_gamma * c->h_max_old + c->dx_max_old) < 0.5f * c->dmin); +} + +/** + * @brief Can a sub-self hydro task recurse to a lower level based + * on the status of the particles in the cell. + * + * @param c The #cell. + */ +__attribute__((always_inline)) INLINE static int cell_can_recurse_in_self_task( + const struct cell *c) { + + /* Is the cell split ? */ + /* Note: No need for more checks here as all the sub-pairs and sub-self */ + /* operations will be executed. So no need for the particle to be at exactly + */ + /* the right place. */ + return c->split; +} + +/** + * @brief Can a pair task associated with a cell be split into smaller + * sub-tasks. + * + * @param c The #cell. + */ +__attribute__((always_inline)) INLINE static int cell_can_split_pair_task( + const struct cell *c) { + + /* Is the cell split ? */ + /* If so, is the cut-off radius with some leeway smaller than */ + /* the sub-cell sizes ? */ + /* Note that since tasks are create after a rebuild no need to take */ + /* into account any part motion (i.e. dx_max == 0 here) */ + return c->split && (space_stretch * kernel_gamma * c->h_max < 0.5f * c->dmin); +} + +/** + * @brief Can a self task associated with a cell be split into smaller + * sub-tasks. + * + * @param c The #cell. + */ +__attribute__((always_inline)) INLINE static int cell_can_split_self_task( + const struct cell *c) { + + /* Is the cell split ? */ + /* Note: No need for more checks here as all the sub-pairs and sub-self */ + /* tasks will be created. 
So no need to check for h_max */ + return c->split && (space_stretch * kernel_gamma * c->h_max < 0.5f * c->dmin); +} + +/** + * @brief Have particles in a pair of cells moved too much and require a rebuild + * ? + * + * @param ci The first #cell. + * @param cj The second #cell. + */ +__attribute__((always_inline)) INLINE static int cell_need_rebuild_for_pair( + const struct cell *ci, const struct cell *cj) { + + /* Is the cut-off radius plus the max distance the parts in both cells have */ + /* moved larger than the cell size ? */ + /* Note ci->dmin == cj->dmin */ + return (kernel_gamma * max(ci->h_max, cj->h_max) + ci->dx_max_part + + cj->dx_max_part > + cj->dmin); +} #endif /* SWIFT_CELL_H */ diff --git a/src/collectgroup.c b/src/collectgroup.c index 0b4ddc405772a45a1e444ef48b65fcb7d37a248f..b7e5486b59a2ec5e47b7b864071a2bb1e5ce1850 100644 --- a/src/collectgroup.c +++ b/src/collectgroup.c @@ -170,7 +170,7 @@ static void doreduce1(struct mpicollectgroup1 *mpigrp11, } /** - * @brief MPI reduce operator for #mpicollectgroup structures. + * @brief MPI reduce operator for #mpicollectgroup1 structures. */ static void mpicollectgroup1_reduce(void *in, void *inout, int *len, MPI_Datatype *datatype) { diff --git a/src/const.h b/src/const.h index 141eb48acc633542aa98655caa8debdd2dbce530..c8060a2be51468a791e192a65a74f1a4d9bc8e30 100644 --- a/src/const.h +++ b/src/const.h @@ -37,7 +37,7 @@ #define const_max_u_change 0.1f /* Thermal energy per unit mass used as a constant for the isothermal EoS */ -#define const_isothermal_internal_energy 20.2615290634f +#define const_isothermal_internal_energy 20.2678457288f /* Type of gradients to use (GIZMO_SPH only) */ /* If no option is chosen, no gradients are used (first order scheme) */ @@ -49,6 +49,9 @@ #define SLOPE_LIMITER_PER_FACE #define SLOPE_LIMITER_CELL_WIDE +/* Types of flux limiter to use (GIZMO_SPH only) */ +#define GIZMO_FLUX_LIMITER + /* Options to control the movement of particles for GIZMO_SPH. 
*/ /* This option disables particle movement */ //#define GIZMO_FIX_PARTICLES diff --git a/src/debug.c b/src/debug.c index 601f63d6e11bbbf95f62eaef1ec6ec7ec06d3ad9..903d7e5a2e30bca8980078991c5155830f5e4c43 100644 --- a/src/debug.c +++ b/src/debug.c @@ -26,6 +26,7 @@ /* Some standard headers. */ #include <float.h> #include <stdio.h> +#include <unistd.h> /* This object's header. */ #include "debug.h" @@ -450,3 +451,69 @@ void dumpCellRanks(const char *prefix, struct cell *cells_top, int nr_cells) { } #endif /* HAVE_MPI */ + +/** + * @brief parse the process /proc/self/statm file to get the process + * memory use (in KB). Top field in (). + * + * @param size total virtual memory (VIRT) + * @param resident resident non-swapped memory (RES) + * @param share shared (mmap'd) memory (SHR) + * @param trs text (exe) resident set (CODE) + * @param lrs library resident set + * @param drs data+stack resident set (DATA) + * @param dt dirty pages (nDRT) + */ +void getProcMemUse(long *size, long *resident, long *share, long *trs, + long *lrs, long *drs, long *dt) { + + /* Open the file. */ + FILE *file = fopen("/proc/self/statm", "r"); + if (file != NULL) { + int nscan = fscanf(file, "%ld %ld %ld %ld %ld %ld %ld", size, resident, + share, trs, lrs, drs, dt); + + if (nscan == 7) { + /* Convert pages into bytes. Usually 4096, but could be 512 on some + * systems so take care in conversion to KB. */ + long sz = sysconf(_SC_PAGESIZE); + *size *= sz; + *resident *= sz; + *share *= sz; + *trs *= sz; + *lrs *= sz; + *drs *= sz; + *dt *= sz; + + *size /= 1024; + *resident /= 1024; + *share /= 1024; + *trs /= 1024; + *lrs /= 1024; + *drs /= 1024; + *dt /= 1024; + } else { + error("Failed to read sufficient fields from /proc/self/statm"); + } + fclose(file); + } else { + error("Failed to open /proc/self/statm"); + } +} + +/** + * @brief Print the current memory use of the process. A la "top". 
+ */ +void printProcMemUse() { + long size; + long resident; + long share; + long trs; + long lrs; + long drs; + long dt; + getProcMemUse(&size, &resident, &share, &trs, &lrs, &drs, &dt); + printf("## VIRT = %ld , RES = %ld , SHR = %ld , CODE = %ld, DATA = %ld\n", + size, resident, share, trs, drs); + fflush(stdout); +} diff --git a/src/debug.h b/src/debug.h index 7422a6f7f9815490966f08415e0312876ce0123f..7dca848b6bf4e44de5f40fa8e1c0849e8ee3d0b4 100644 --- a/src/debug.h +++ b/src/debug.h @@ -44,4 +44,7 @@ void dumpMETISGraph(const char *prefix, idx_t nvtxs, idx_t ncon, idx_t *xadj, void dumpCellRanks(const char *prefix, struct cell *cells_top, int nr_cells); #endif +void getProcMemUse(long *size, long *resident, long *share, long *trs, + long *lrs, long *drs, long *dt); +void printProcMemUse(); #endif /* SWIFT_DEBUG_H */ diff --git a/src/dimension.h b/src/dimension.h index 60c5208d846f9beebd7a1fd3e183fc771fbc5f91..0b2093d718a61c6ce850db1970412af3e2e462b9 100644 --- a/src/dimension.h +++ b/src/dimension.h @@ -118,6 +118,34 @@ __attribute__((always_inline)) INLINE static float pow_dimension_plus_one( #endif } +/** + * @brief Returns the argument to the power given by the dimension minus one + * + * Computes \f$x^{d-1}\f$. 
+ */ +__attribute__((always_inline)) INLINE static float pow_dimension_minus_one( + float x) { + +#if defined(HYDRO_DIMENSION_3D) + + return x * x; + +#elif defined(HYDRO_DIMENSION_2D) + + return x; + +#elif defined(HYDRO_DIMENSION_1D) + + return 1.f; + +#else + + error("The dimension is not defined !"); + return 0.f; + +#endif +} + /** * @brief Inverts the given dimension by dimension matrix (in place) * diff --git a/src/engine.c b/src/engine.c index 417c9f626d7e2f8d96d49d8d2bed942102b96e4f..93481aca3d25fd9755b7c7f69ef25ddb4d9d9d06 100644 --- a/src/engine.c +++ b/src/engine.c @@ -57,6 +57,7 @@ #include "error.h" #include "gravity.h" #include "hydro.h" +#include "map.h" #include "minmax.h" #include "parallel_io.h" #include "part.h" @@ -76,22 +77,23 @@ /* Particle cache size. */ #define CACHE_SIZE 512 -const char *engine_policy_names[16] = {"none", - "rand", - "steal", - "keep", - "block", - "cpu_tight", - "mpi", - "numa_affinity", - "hydro", - "self_gravity", - "external_gravity", - "cosmology_integration", - "drift_all", - "cooling", - "sourceterms", - "stars"}; +const char *engine_policy_names[] = {"none", + "rand", + "steal", + "keep", + "block", + "cpu_tight", + "mpi", + "numa_affinity", + "hydro", + "self_gravity", + "external_gravity", + "cosmology_integration", + "drift_all", + "reconstruct_mpoles", + "cooling", + "sourceterms", + "stars"}; /** The rank of the engine as a global variable (for messages). */ int engine_rank; @@ -119,6 +121,24 @@ void engine_addlink(struct engine *e, struct link **l, struct task *t) { res->next = atomic_swap(l, res); } +/** + * @brief Recursively add non-implicit ghost tasks to a cell hierarchy. 
+ */ +void engine_add_ghosts(struct engine *e, struct cell *c, struct task *ghost_in, + struct task *ghost_out) { + if (!c->split || c->count < engine_max_parts_per_ghost) { + struct scheduler *s = &e->sched; + c->ghost = + scheduler_addtask(s, task_type_ghost, task_subtype_none, 0, 0, c, NULL); + scheduler_addunlock(s, ghost_in, c->ghost); + scheduler_addunlock(s, c->ghost, ghost_out); + } else { + for (int k = 0; k < 8; k++) + if (c->progeny[k] != NULL) + engine_add_ghosts(e, c->progeny[k], ghost_in, ghost_out); + } +} + /** * @brief Generate the hydro hierarchical tasks for a hierarchy of cells - * i.e. all the O(Npart) tasks. @@ -134,7 +154,7 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) { struct scheduler *s = &e->sched; const int periodic = e->s->periodic; - const int is_hydro = (e->policy & engine_policy_hydro); + const int is_with_hydro = (e->policy & engine_policy_hydro); const int is_self_gravity = (e->policy & engine_policy_self_gravity); const int is_with_cooling = (e->policy & engine_policy_cooling); const int is_with_sourceterms = (e->policy & engine_policy_sourceterms); @@ -142,9 +162,21 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) { /* Are we in a super-cell ? */ if (c->super == c) { + /* Add the sort task. */ + if (is_with_hydro) { + c->sorts = scheduler_addtask(s, task_type_sort, task_subtype_none, 0, 0, + c, NULL); + } + /* Local tasks only... */ if (c->nodeID == e->nodeID) { + /* Add the drift task. */ + if (is_with_hydro) { + c->drift_part = scheduler_addtask(s, task_type_drift_part, + task_subtype_none, 0, 0, c, NULL); + } + /* Add the two half kicks */ c->kick1 = scheduler_addtask(s, task_type_kick1, task_subtype_none, 0, 0, c, NULL); @@ -179,17 +211,22 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) { scheduler_addunlock(s, c->grav_down, c->kick2); } - /* Generate the ghost task. 
*/ - if (is_hydro) - c->ghost = scheduler_addtask(s, task_type_ghost, task_subtype_none, 0, - 0, c, NULL); + /* Generate the ghost tasks. */ + if (is_with_hydro) { + c->ghost_in = + scheduler_addtask(s, task_type_ghost, task_subtype_none, 0, + /* implicit = */ 1, c, NULL); + c->ghost_out = + scheduler_addtask(s, task_type_ghost, task_subtype_none, 0, + /* implicit = */ 1, c, NULL); + engine_add_ghosts(e, c, c->ghost_in, c->ghost_out); #ifdef EXTRA_HYDRO_LOOP - /* Generate the extra ghost task. */ - if (is_hydro) + /* Generate the extra ghost task. */ c->extra_ghost = scheduler_addtask(s, task_type_extra_ghost, task_subtype_none, 0, 0, c, NULL); #endif + } /* Cooling task */ if (is_with_cooling) { @@ -220,6 +257,145 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) { } } +void engine_make_hierarchical_tasks_mapper(void *map_data, int num_elements, + void *extra_data) { + struct engine *e = (struct engine *)extra_data; + + for (int ind = 0; ind < num_elements; ind++) { + struct cell *c = &((struct cell *)map_data)[ind]; + engine_make_hierarchical_tasks(e, c); + } +} + +#ifdef WITH_MPI +/** + * Do the exchange of one type of particles with all the other nodes. + * + * @param counts 2D array with the counts of particles to exchange with + * each other node. + * @param parts the particle data to exchange + * @param new_nr_parts the number of particles this node will have after all + * exchanges have completed. + * @param sizeofparts sizeof the particle struct. + * @param alignsize the memory alignment required for this particle type. + * @param mpi_type the MPI_Datatype for these particles. + * @param nr_nodes the number of nodes to exchange with. + * @param nodeID the id of this node. + * + * @result new particle data constructed from all the exchanges with the + * given alignment. 
+ */ +static void *engine_do_redistribute(int *counts, char *parts, + size_t new_nr_parts, size_t sizeofparts, + size_t alignsize, MPI_Datatype mpi_type, + int nr_nodes, int nodeID) { + + /* Allocate a new particle array with some extra margin */ + char *parts_new = NULL; + if (posix_memalign( + (void **)&parts_new, alignsize, + sizeofparts * new_nr_parts * engine_redistribute_alloc_margin) != 0) + error("Failed to allocate new particle data."); + + /* Prepare MPI requests for the asynchronous communications */ + MPI_Request *reqs; + if ((reqs = (MPI_Request *)malloc(sizeof(MPI_Request) * 2 * nr_nodes)) == + NULL) + error("Failed to allocate MPI request list."); + + /* Only send and receive only "chunk" particles per request. So we need to + * loop as many times as necessary here. Make 2Gb/sizeofparts so we only + * send 2Gb packets. */ + const int chunk = INT_MAX / sizeofparts; + int sent = 0; + int recvd = 0; + + int activenodes = 1; + while (activenodes) { + + for (int k = 0; k < 2 * nr_nodes; k++) reqs[k] = MPI_REQUEST_NULL; + + /* Emit the sends and recvs for the data. */ + size_t offset_send = sent; + size_t offset_recv = recvd; + activenodes = 0; + + for (int k = 0; k < nr_nodes; k++) { + + /* Indices in the count arrays of the node of interest */ + const int ind_send = nodeID * nr_nodes + k; + const int ind_recv = k * nr_nodes + nodeID; + + /* Are we sending any data this loop? */ + int sending = counts[ind_send] - sent; + if (sending > 0) { + activenodes++; + if (sending > chunk) sending = chunk; + + /* If the send and receive is local then just copy. */ + if (k == nodeID) { + int receiving = counts[ind_recv] - recvd; + if (receiving > chunk) receiving = chunk; + memcpy(&parts_new[offset_recv * sizeofparts], + &parts[offset_send * sizeofparts], sizeofparts * receiving); + } else { + /* Otherwise send it. 
*/ + int res = + MPI_Isend(&parts[offset_send * sizeofparts], sending, mpi_type, k, + ind_send, MPI_COMM_WORLD, &reqs[2 * k + 0]); + if (res != MPI_SUCCESS) + mpi_error(res, "Failed to isend parts to node %i.", k); + } + } + + /* If we're sending to this node, then move past it to next. */ + if (counts[ind_send] > 0) offset_send += counts[ind_send]; + + /* Are we receiving any data from this node? Note already done if coming + * from this node. */ + if (k != nodeID) { + int receiving = counts[ind_recv] - recvd; + if (receiving > 0) { + activenodes++; + if (receiving > chunk) receiving = chunk; + int res = MPI_Irecv(&parts_new[offset_recv * sizeofparts], receiving, + mpi_type, k, ind_recv, MPI_COMM_WORLD, + &reqs[2 * k + 1]); + if (res != MPI_SUCCESS) + mpi_error(res, "Failed to emit irecv of parts from node %i.", k); + } + } + + /* If we're receiving from this node, then move past it to next. */ + if (counts[ind_recv] > 0) offset_recv += counts[ind_recv]; + } + + /* Wait for all the sends and recvs to tumble in. */ + MPI_Status stats[2 * nr_nodes]; + int res; + if ((res = MPI_Waitall(2 * nr_nodes, reqs, stats)) != MPI_SUCCESS) { + for (int k = 0; k < 2 * nr_nodes; k++) { + char buff[MPI_MAX_ERROR_STRING]; + MPI_Error_string(stats[k].MPI_ERROR, buff, &res); + message("request from source %i, tag %i has error '%s'.", + stats[k].MPI_SOURCE, stats[k].MPI_TAG, buff); + } + error("Failed during waitall for part data."); + } + + /* Move to next chunks. */ + sent += chunk; + recvd += chunk; + } + + /* Free temps. */ + free(reqs); + + /* And return new memory. */ + return parts_new; +} +#endif + /** * @brief Redistribute the particles amongst the nodes according * to their cell's node IDs. 
@@ -249,32 +425,20 @@ void engine_redistribute(struct engine *e) { const double iwidth[3] = {s->iwidth[0], s->iwidth[1], s->iwidth[2]}; const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]}; struct part *parts = s->parts; - struct xpart *xparts = s->xparts; struct gpart *gparts = s->gparts; struct spart *sparts = s->sparts; ticks tic = getticks(); /* Allocate temporary arrays to store the counts of particles to be sent - and the destination of each particle */ - int *counts, *g_counts, *s_counts; + * and the destination of each particle */ + int *counts; if ((counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL) error("Failed to allocate counts temporary buffer."); - if ((g_counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL) - error("Failed to allocate g_gcount temporary buffer."); - if ((s_counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL) - error("Failed to allocate s_counts temporary buffer."); bzero(counts, sizeof(int) * nr_nodes * nr_nodes); - bzero(g_counts, sizeof(int) * nr_nodes * nr_nodes); - bzero(s_counts, sizeof(int) * nr_nodes * nr_nodes); - /* Allocate the destination index arrays. 
*/ - int *dest, *g_dest, *s_dest; + int *dest; if ((dest = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL) error("Failed to allocate dest temporary buffer."); - if ((g_dest = (int *)malloc(sizeof(int) * s->nr_gparts)) == NULL) - error("Failed to allocate g_dest temporary buffer."); - if ((s_dest = (int *)malloc(sizeof(int) * s->nr_sparts)) == NULL) - error("Failed to allocate s_dest temporary buffer."); /* Get destination of each particle */ for (size_t k = 0; k < s->nr_parts; k++) { @@ -356,8 +520,18 @@ void engine_redistribute(struct engine *e) { } } } + free(dest); /* Get destination of each s-particle */ + int *s_counts; + if ((s_counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL) + error("Failed to allocate s_counts temporary buffer."); + bzero(s_counts, sizeof(int) * nr_nodes * nr_nodes); + + int *s_dest; + if ((s_dest = (int *)malloc(sizeof(int) * s->nr_sparts)) == NULL) + error("Failed to allocate s_dest temporary buffer."); + for (size_t k = 0; k < s->nr_sparts; k++) { /* Periodic boundary conditions */ @@ -372,7 +546,7 @@ void engine_redistribute(struct engine *e) { sparts[k].x[2] * iwidth[2]); #ifdef SWIFT_DEBUG_CHECKS if (cid < 0 || cid >= s->nr_cells) - error("Bad cell id %i for part %zu at [%.3e,%.3e,%.3e].", cid, k, + error("Bad cell id %i for spart %zu at [%.3e,%.3e,%.3e].", cid, k, sparts[k].x[0], sparts[k].x[1], sparts[k].x[2]); #endif @@ -438,7 +612,18 @@ void engine_redistribute(struct engine *e) { } } + free(s_dest); + /* Get destination of each g-particle */ + int *g_counts; + if ((g_counts = (int *)malloc(sizeof(int) * nr_nodes * nr_nodes)) == NULL) + error("Failed to allocate g_gcount temporary buffer."); + bzero(g_counts, sizeof(int) * nr_nodes * nr_nodes); + + int *g_dest; + if ((g_dest = (int *)malloc(sizeof(int) * s->nr_gparts)) == NULL) + error("Failed to allocate g_dest temporary buffer."); + for (size_t k = 0; k < s->nr_gparts; k++) { /* Periodic boundary conditions */ @@ -453,7 +638,7 @@ void 
engine_redistribute(struct engine *e) { gparts[k].x[2] * iwidth[2]); #ifdef SWIFT_DEBUG_CHECKS if (cid < 0 || cid >= s->nr_cells) - error("Bad cell id %i for part %zu at [%.3e,%.3e,%.3e].", cid, k, + error("Bad cell id %i for gpart %zu at [%.3e,%.3e,%.3e].", cid, k, gparts[k].x[0], gparts[k].x[1], gparts[k].x[2]); #endif @@ -482,7 +667,8 @@ void engine_redistribute(struct engine *e) { const int new_node = c->nodeID; if (g_dest[k] != new_node) - error("gpart's new node index not matching sorted index."); + error("gpart's new node index not matching sorted index (%d != %d).", + g_dest[k], new_node); if (gp->x[0] < c->loc[0] || gp->x[0] > c->loc[0] + c->width[0] || gp->x[1] < c->loc[1] || gp->x[1] > c->loc[1] + c->width[1] || @@ -491,6 +677,8 @@ void engine_redistribute(struct engine *e) { } #endif + free(g_dest); + /* Get all the counts from all the nodes. */ if (MPI_Allreduce(MPI_IN_PLACE, counts, nr_nodes * nr_nodes, MPI_INT, MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS) @@ -538,10 +726,9 @@ void engine_redistribute(struct engine *e) { } } - /* Each node knows how many parts, sparts and gparts will be transferred - to every other node. We can start preparing to receive data */ - - /* Get the new number of parts and gparts for this node */ + /* Now each node knows how many parts, sparts and gparts will be transferred + * to every other node. + * Get the new numbers of particles for this node. 
*/ size_t nr_parts = 0, nr_gparts = 0, nr_sparts = 0; for (int k = 0; k < nr_nodes; k++) nr_parts += counts[k * nr_nodes + nodeID]; for (int k = 0; k < nr_nodes; k++) @@ -549,162 +736,42 @@ void engine_redistribute(struct engine *e) { for (int k = 0; k < nr_nodes; k++) nr_sparts += s_counts[k * nr_nodes + nodeID]; - /* Allocate the new arrays with some extra margin */ - struct part *parts_new = NULL; - struct xpart *xparts_new = NULL; - struct gpart *gparts_new = NULL; - struct spart *sparts_new = NULL; - if (posix_memalign((void **)&parts_new, part_align, - sizeof(struct part) * nr_parts * - engine_redistribute_alloc_margin) != 0) - error("Failed to allocate new part data."); - if (posix_memalign((void **)&xparts_new, xpart_align, - sizeof(struct xpart) * nr_parts * - engine_redistribute_alloc_margin) != 0) - error("Failed to allocate new xpart data."); - if (posix_memalign((void **)&gparts_new, gpart_align, - sizeof(struct gpart) * nr_gparts * - engine_redistribute_alloc_margin) != 0) - error("Failed to allocate new gpart data."); - if (posix_memalign((void **)&sparts_new, spart_align, - sizeof(struct spart) * nr_sparts * - engine_redistribute_alloc_margin) != 0) - error("Failed to allocate new spart data."); - - /* Prepare MPI requests for the asynchronous communications */ - MPI_Request *reqs; - if ((reqs = (MPI_Request *)malloc(sizeof(MPI_Request) * 8 * nr_nodes)) == - NULL) - error("Failed to allocate MPI request list."); - for (int k = 0; k < 8 * nr_nodes; k++) reqs[k] = MPI_REQUEST_NULL; - - /* Emit the sends and recvs for the particle and gparticle data. */ - size_t offset_send = 0, offset_recv = 0; - size_t g_offset_send = 0, g_offset_recv = 0; - size_t s_offset_send = 0, s_offset_recv = 0; - for (int k = 0; k < nr_nodes; k++) { - - /* Indices in the count arrays of the node of interest */ - const int ind_send = nodeID * nr_nodes + k; - const int ind_recv = k * nr_nodes + nodeID; - - /* Are we sending any part/xpart ? 
*/ - if (counts[ind_send] > 0) { - - /* message("Sending %d part to node %d", counts[ind_send], k); */ - - /* If the send is to the same node, just copy */ - if (k == nodeID) { - memcpy(&parts_new[offset_recv], &s->parts[offset_send], - sizeof(struct part) * counts[ind_recv]); - memcpy(&xparts_new[offset_recv], &s->xparts[offset_send], - sizeof(struct xpart) * counts[ind_recv]); - offset_send += counts[ind_send]; - offset_recv += counts[ind_recv]; - - /* Else, emit some communications */ - } else { - if (MPI_Isend(&s->parts[offset_send], counts[ind_send], part_mpi_type, - k, 4 * ind_send + 0, MPI_COMM_WORLD, - &reqs[8 * k + 0]) != MPI_SUCCESS) - error("Failed to isend parts to node %i.", k); - if (MPI_Isend(&s->xparts[offset_send], counts[ind_send], xpart_mpi_type, - k, 4 * ind_send + 1, MPI_COMM_WORLD, - &reqs[8 * k + 1]) != MPI_SUCCESS) - error("Failed to isend xparts to node %i.", k); - offset_send += counts[ind_send]; - } - } - - /* Are we sending any gpart ? */ - if (g_counts[ind_send] > 0) { - - /* message("Sending %d gpart to node %d", g_counts[ind_send], k); */ - - /* If the send is to the same node, just copy */ - if (k == nodeID) { - memcpy(&gparts_new[g_offset_recv], &s->gparts[g_offset_send], - sizeof(struct gpart) * g_counts[ind_recv]); - g_offset_send += g_counts[ind_send]; - g_offset_recv += g_counts[ind_recv]; - - /* Else, emit some communications */ - } else { - if (MPI_Isend(&s->gparts[g_offset_send], g_counts[ind_send], - gpart_mpi_type, k, 4 * ind_send + 2, MPI_COMM_WORLD, - &reqs[8 * k + 2]) != MPI_SUCCESS) - error("Failed to isend gparts to node %i.", k); - g_offset_send += g_counts[ind_send]; - } - } - - /* Are we sending any spart ? */ - if (s_counts[ind_send] > 0) { - - /* message("Sending %d spart to node %d", s_counts[ind_send], k); */ + /* Now exchange the particles, type by type to keep the memory required + * under control. 
*/ - /* If the send is to the same node, just copy */ - if (k == nodeID) { - memcpy(&sparts_new[s_offset_recv], &s->sparts[s_offset_send], - sizeof(struct spart) * s_counts[ind_recv]); - s_offset_send += s_counts[ind_send]; - s_offset_recv += s_counts[ind_recv]; - - /* Else, emit some communications */ - } else { - if (MPI_Isend(&s->sparts[s_offset_send], s_counts[ind_send], - spart_mpi_type, k, 4 * ind_send + 3, MPI_COMM_WORLD, - &reqs[8 * k + 3]) != MPI_SUCCESS) - error("Failed to isend gparts to node %i.", k); - s_offset_send += s_counts[ind_send]; - } - } - - /* Now emit the corresponding Irecv() */ - - /* Are we receiving any part/xpart from this node ? */ - if (k != nodeID && counts[ind_recv] > 0) { - if (MPI_Irecv(&parts_new[offset_recv], counts[ind_recv], part_mpi_type, k, - 4 * ind_recv + 0, MPI_COMM_WORLD, - &reqs[8 * k + 4]) != MPI_SUCCESS) - error("Failed to emit irecv of parts from node %i.", k); - if (MPI_Irecv(&xparts_new[offset_recv], counts[ind_recv], xpart_mpi_type, - k, 4 * ind_recv + 1, MPI_COMM_WORLD, - &reqs[8 * k + 5]) != MPI_SUCCESS) - error("Failed to emit irecv of xparts from node %i.", k); - offset_recv += counts[ind_recv]; - } + /* SPH particles. */ + void *new_parts = engine_do_redistribute(counts, (char *)s->parts, nr_parts, + sizeof(struct part), part_align, + part_mpi_type, nr_nodes, nodeID); + free(s->parts); + s->parts = (struct part *)new_parts; + s->nr_parts = nr_parts; + s->size_parts = engine_redistribute_alloc_margin * nr_parts; - /* Are we receiving any gpart from this node ? */ - if (k != nodeID && g_counts[ind_recv] > 0) { - if (MPI_Irecv(&gparts_new[g_offset_recv], g_counts[ind_recv], - gpart_mpi_type, k, 4 * ind_recv + 2, MPI_COMM_WORLD, - &reqs[8 * k + 6]) != MPI_SUCCESS) - error("Failed to emit irecv of gparts from node %i.", k); - g_offset_recv += g_counts[ind_recv]; - } + /* Extra SPH particle properties. 
*/ + new_parts = engine_do_redistribute(counts, (char *)s->xparts, nr_parts, + sizeof(struct xpart), xpart_align, + xpart_mpi_type, nr_nodes, nodeID); + free(s->xparts); + s->xparts = (struct xpart *)new_parts; - /* Are we receiving any spart from this node ? */ - if (k != nodeID && s_counts[ind_recv] > 0) { - if (MPI_Irecv(&sparts_new[s_offset_recv], s_counts[ind_recv], - spart_mpi_type, k, 4 * ind_recv + 3, MPI_COMM_WORLD, - &reqs[8 * k + 7]) != MPI_SUCCESS) - error("Failed to emit irecv of sparts from node %i.", k); - s_offset_recv += s_counts[ind_recv]; - } - } + /* Gravity particles. */ + new_parts = engine_do_redistribute(g_counts, (char *)s->gparts, nr_gparts, + sizeof(struct gpart), gpart_align, + gpart_mpi_type, nr_nodes, nodeID); + free(s->gparts); + s->gparts = (struct gpart *)new_parts; + s->nr_gparts = nr_gparts; + s->size_gparts = engine_redistribute_alloc_margin * nr_gparts; - /* Wait for all the sends and recvs to tumble in. */ - MPI_Status stats[8 * nr_nodes]; - int res; - if ((res = MPI_Waitall(8 * nr_nodes, reqs, stats)) != MPI_SUCCESS) { - for (int k = 0; k < 8 * nr_nodes; k++) { - char buff[MPI_MAX_ERROR_STRING]; - MPI_Error_string(stats[k].MPI_ERROR, buff, &res); - message("request %i has error '%s'.", k, buff); - } - error("Failed during waitall for part data."); - } + /* Star particles. */ + new_parts = engine_do_redistribute(s_counts, (char *)s->sparts, nr_sparts, + sizeof(struct spart), spart_align, + spart_mpi_type, nr_nodes, nodeID); + free(s->sparts); + s->sparts = (struct spart *)new_parts; + s->nr_sparts = nr_sparts; + s->size_sparts = engine_redistribute_alloc_margin * nr_sparts; /* All particles have now arrived. Time for some final operations on the stuff we just received */ @@ -722,25 +789,25 @@ void engine_redistribute(struct engine *e) { for (size_t k = offset_gparts; k < offset_gparts + count_gparts; ++k) { /* Does this gpart have a gas partner ? 
*/ - if (gparts_new[k].type == swift_type_gas) { + if (s->gparts[k].type == swift_type_gas) { const ptrdiff_t partner_index = - offset_parts - gparts_new[k].id_or_neg_offset; + offset_parts - s->gparts[k].id_or_neg_offset; /* Re-link */ - gparts_new[k].id_or_neg_offset = -partner_index; - parts_new[partner_index].gpart = &gparts_new[k]; + s->gparts[k].id_or_neg_offset = -partner_index; + s->parts[partner_index].gpart = &s->gparts[k]; } /* Does this gpart have a star partner ? */ - if (gparts_new[k].type == swift_type_star) { + if (s->gparts[k].type == swift_type_star) { const ptrdiff_t partner_index = - offset_sparts - gparts_new[k].id_or_neg_offset; + offset_sparts - s->gparts[k].id_or_neg_offset; /* Re-link */ - gparts_new[k].id_or_neg_offset = -partner_index; - sparts_new[partner_index].gpart = &gparts_new[k]; + s->gparts[k].id_or_neg_offset = -partner_index; + s->sparts[partner_index].gpart = &s->gparts[k]; } } @@ -749,59 +816,43 @@ void engine_redistribute(struct engine *e) { offset_sparts += count_sparts; } + /* Clean up the counts now we done. */ + free(counts); + free(g_counts); + free(s_counts); + #ifdef SWIFT_DEBUG_CHECKS /* Verify that all parts are in the right place. 
*/ for (size_t k = 0; k < nr_parts; k++) { - const int cid = cell_getid(cdim, parts_new[k].x[0] * iwidth[0], - parts_new[k].x[1] * iwidth[1], - parts_new[k].x[2] * iwidth[2]); + const int cid = + cell_getid(cdim, s->parts[k].x[0] * iwidth[0], + s->parts[k].x[1] * iwidth[1], s->parts[k].x[2] * iwidth[2]); if (cells[cid].nodeID != nodeID) error("Received particle (%zu) that does not belong here (nodeID=%i).", k, cells[cid].nodeID); } for (size_t k = 0; k < nr_gparts; k++) { - const int cid = cell_getid(cdim, gparts_new[k].x[0] * iwidth[0], - gparts_new[k].x[1] * iwidth[1], - gparts_new[k].x[2] * iwidth[2]); + const int cid = cell_getid(cdim, s->gparts[k].x[0] * iwidth[0], + s->gparts[k].x[1] * iwidth[1], + s->gparts[k].x[2] * iwidth[2]); if (cells[cid].nodeID != nodeID) error("Received g-particle (%zu) that does not belong here (nodeID=%i).", k, cells[cid].nodeID); } for (size_t k = 0; k < nr_sparts; k++) { - const int cid = cell_getid(cdim, sparts_new[k].x[0] * iwidth[0], - sparts_new[k].x[1] * iwidth[1], - sparts_new[k].x[2] * iwidth[2]); + const int cid = cell_getid(cdim, s->sparts[k].x[0] * iwidth[0], + s->sparts[k].x[1] * iwidth[1], + s->sparts[k].x[2] * iwidth[2]); if (cells[cid].nodeID != nodeID) error("Received s-particle (%zu) that does not belong here (nodeID=%i).", k, cells[cid].nodeID); } /* Verify that the links are correct */ - part_verify_links(parts_new, gparts_new, sparts_new, nr_parts, nr_gparts, + part_verify_links(s->parts, s->gparts, s->sparts, nr_parts, nr_gparts, nr_sparts, e->verbose); #endif - /* Set the new part data, free the old. 
*/ - free(parts); - free(xparts); - free(gparts); - free(sparts); - s->parts = parts_new; - s->xparts = xparts_new; - s->gparts = gparts_new; - s->sparts = sparts_new; - s->nr_parts = nr_parts; - s->nr_gparts = nr_gparts; - s->nr_sparts = nr_sparts; - s->size_parts = engine_redistribute_alloc_margin * nr_parts; - s->size_gparts = engine_redistribute_alloc_margin * nr_gparts; - s->size_sparts = engine_redistribute_alloc_margin * nr_sparts; - - /* Clean up the temporary stuff. */ - free(reqs); - free(counts); - free(dest); - /* Be verbose about what just happened. */ if (e->verbose) { int my_cells = 0; @@ -851,6 +902,16 @@ void engine_repartition(struct engine *e) { partition_repartition(e->reparttype, e->nodeID, e->nr_nodes, e->s, e->sched.tasks, e->sched.nr_tasks); + /* Partitioning requires copies of the particles, so we need to reduce the + * memory in use to the minimum, we can free the sorting indices and the + * tasks as these will be regenerated at the next rebuild. */ + + /* Sorting indices. */ + if (e->s->cells_top != NULL) space_free_cells(e->s); + + /* Task arrays. */ + scheduler_free_tasks(&e->sched); + /* Now comes the tricky part: Exchange particles between all nodes. This is done in two steps, first allreducing a matrix of how many particles go from where to where, then re-allocating @@ -870,7 +931,11 @@ void engine_repartition(struct engine *e) { message("took %.3f %s.", clocks_from_ticks(getticks() - tic), clocks_getunit()); #else - error("SWIFT was not compiled with MPI and METIS support."); + if (e->reparttype->type != REPART_NONE) + error("SWIFT was not compiled with MPI and METIS support."); + + /* Clear the repartition flag. */ + e->forcerepart = 0; #endif } @@ -885,14 +950,15 @@ void engine_repartition_trigger(struct engine *e) { /* Do nothing if there have not been enough steps since the last * repartition, don't want to repeat this too often or immediately after - * a repartition step. 
*/ - if (e->step - e->last_repartition > 2) { + * a repartition step. Also nothing to do when requested. */ + if (e->step - e->last_repartition >= 2 && + e->reparttype->type != REPART_NONE) { /* Old style if trigger is >1 or this is the second step (want an early * repartition following the initial repartition). */ if (e->reparttype->trigger > 1 || e->step == 2) { if (e->reparttype->trigger > 1) { - if (e->step % (int)e->reparttype->trigger == 2) e->forcerepart = 1; + if ((e->step % (int)e->reparttype->trigger) == 0) e->forcerepart = 1; } else { e->forcerepart = 1; } @@ -947,8 +1013,9 @@ void engine_repartition_trigger(struct engine *e) { if (e->forcerepart) e->last_repartition = e->step; } - /* We always reset CPU time for next check. */ - e->cputime_last_step = clocks_get_cputime_used(); + /* We always reset CPU time for next check, unless it will not be used. */ + if (e->reparttype->type != REPART_NONE) + e->cputime_last_step = clocks_get_cputime_used(); #endif } @@ -1027,28 +1094,25 @@ void engine_addtasks_send(struct engine *e, struct cell *ci, struct cell *cj, scheduler_addunlock(s, t_rho, ci->super->extra_ghost); /* The send_rho task depends on the cell's ghost task. */ - scheduler_addunlock(s, ci->super->ghost, t_rho); + scheduler_addunlock(s, ci->super->ghost_out, t_rho); /* The send_xv task should unlock the super-cell's ghost task. */ - scheduler_addunlock(s, t_xv, ci->super->ghost); + scheduler_addunlock(s, t_xv, ci->super->ghost_in); #else /* The send_rho task should unlock the super-cell's kick task. */ scheduler_addunlock(s, t_rho, ci->super->kick2); /* The send_rho task depends on the cell's ghost task. */ - scheduler_addunlock(s, ci->super->ghost, t_rho); + scheduler_addunlock(s, ci->super->ghost_out, t_rho); /* The send_xv task should unlock the super-cell's ghost task. 
*/ - scheduler_addunlock(s, t_xv, ci->super->ghost); + scheduler_addunlock(s, t_xv, ci->super->ghost_in); #endif /* Drift before you send */ - if (ci->drift_part == NULL) - ci->drift_part = scheduler_addtask(s, task_type_drift_part, - task_subtype_none, 0, 0, ci, NULL); - scheduler_addunlock(s, ci->drift_part, t_xv); + scheduler_addunlock(s, ci->super->drift_part, t_xv); /* The super-cell's timestep task should unlock the send_ti task. */ scheduler_addunlock(s, ci->super->timestep, t_ti); @@ -1634,25 +1698,130 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts, /** * @brief Constructs the top-level tasks for the short-range gravity - * interactions. + * and long-range gravity interactions. * + * - One FTT task per MPI rank. + * - Multiple gravity ghosts for dependencies. * - All top-cells get a self task. * - All pairs within range according to the multipole acceptance * criterion get a pair task. - * - * @param e The #engine. */ -void engine_make_self_gravity_tasks(struct engine *e) { +void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements, + void *extra_data) { + + struct engine *e = ((struct engine **)extra_data)[0]; + struct task **ghosts = ((struct task ***)extra_data)[1]; struct space *s = e->s; struct scheduler *sched = &e->sched; const int nodeID = e->nodeID; const int periodic = s->periodic; + const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]}; const int cdim[3] = {s->cdim[0], s->cdim[1], s->cdim[2]}; const int cdim_ghost[3] = {s->cdim[0] / 4 + 1, s->cdim[1] / 4 + 1, s->cdim[2] / 4 + 1}; const double theta_crit_inv = e->gravity_properties->theta_crit_inv; struct cell *cells = s->cells_top; + const int n_ghosts = cdim_ghost[0] * cdim_ghost[1] * cdim_ghost[2] * 2; + + /* Loop through the elements, which are just byte offsets from NULL. */ + for (int ind = 0; ind < num_elements; ind++) { + + /* Get the cell index. 
*/ + const int cid = (size_t)(map_data) + ind; + const int i = cid / (cdim[1] * cdim[2]); + const int j = (cid / cdim[2]) % cdim[1]; + const int k = cid % cdim[2]; + + /* Get the cell */ + struct cell *ci = &cells[cid]; + + /* Skip cells without gravity particles */ + if (ci->gcount == 0) continue; + + /* Is that cell local ? */ + if (ci->nodeID != nodeID) continue; + + /* If the cells is local build a self-interaction */ + scheduler_addtask(sched, task_type_self, task_subtype_grav, 0, 0, ci, NULL); + + /* Deal with periodicity FFT task dependencies */ + const int ghost_id = cell_getid(cdim_ghost, i / 4, j / 4, k / 4); + if (ghost_id > n_ghosts) error("Invalid ghost_id"); + if (periodic) { + ci->grav_ghost[0] = ghosts[2 * ghost_id + 0]; + ci->grav_ghost[1] = ghosts[2 * ghost_id + 1]; + } + + /* Recover the multipole information */ + struct gravity_tensors *const multi_i = ci->multipole; + const double CoM_i[3] = {multi_i->CoM[0], multi_i->CoM[1], multi_i->CoM[2]}; + + /* Loop over every other cell */ + for (int ii = 0; ii < cdim[0]; ii++) { + for (int jj = 0; jj < cdim[1]; jj++) { + for (int kk = 0; kk < cdim[2]; kk++) { + + /* Get the cell */ + const int cjd = cell_getid(cdim, ii, jj, kk); + struct cell *cj = &cells[cjd]; + + /* Avoid duplicates */ + if (cid <= cjd) continue; + + /* Skip cells without gravity particles */ + if (cj->gcount == 0) continue; + + /* Is that neighbour local ? */ + if (cj->nodeID != nodeID) continue; // MATTHIEU + + /* Recover the multipole information */ + struct gravity_tensors *const multi_j = cj->multipole; + + /* Get the distance between the CoMs */ + double dx = CoM_i[0] - multi_j->CoM[0]; + double dy = CoM_i[1] - multi_j->CoM[1]; + double dz = CoM_i[2] - multi_j->CoM[2]; + + /* Apply BC */ + if (periodic) { + dx = nearest(dx, dim[0]); + dy = nearest(dy, dim[1]); + dz = nearest(dz, dim[2]); + } + const double r2 = dx * dx + dy * dy + dz * dz; + + /* Are the cells too close for a MM interaction ? 
*/ + if (!gravity_multipole_accept_rebuild(multi_i, multi_j, + theta_crit_inv, r2)) { + + /* Ok, we need to add a direct pair calculation */ + scheduler_addtask(sched, task_type_pair, task_subtype_grav, 0, 0, + ci, cj); + } + } + } + } + } +} + +/** + * @brief Constructs the top-level tasks for the short-range gravity + * interactions. + * + * - All top-cells get a self task. + * - All pairs within range according to the multipole acceptance + * criterion get a pair task. + * + * @param e The #engine. + */ +void engine_make_self_gravity_tasks(struct engine *e) { + + struct space *s = e->s; + struct scheduler *sched = &e->sched; + const int periodic = s->periodic; + const int cdim_ghost[3] = {s->cdim[0] / 4 + 1, s->cdim[1] / 4 + 1, + s->cdim[2] / 4 + 1}; struct task **ghosts = NULL; const int n_ghosts = cdim_ghost[0] * cdim_ghost[1] * cdim_ghost[2] * 2; @@ -1680,67 +1849,20 @@ void engine_make_self_gravity_tasks(struct engine *e) { } } - /* Run through the higher level cells */ - for (int i = 0; i < cdim[0]; i++) { - for (int j = 0; j < cdim[1]; j++) { - for (int k = 0; k < cdim[2]; k++) { - - /* Get the cell */ - const int cid = cell_getid(cdim, i, j, k); - struct cell *ci = &cells[cid]; - - /* Skip cells without gravity particles */ - if (ci->gcount == 0) continue; - - /* Is that cell local ? 
*/ - if (ci->nodeID != nodeID) continue; - - /* If the cells is local build a self-interaction */ - scheduler_addtask(sched, task_type_self, task_subtype_grav, 0, 0, ci, - NULL); - - /* Deal with periodicity dependencies */ - const int ghost_id = cell_getid(cdim_ghost, i / 4, j / 4, k / 4); - if (ghost_id > n_ghosts) error("Invalid ghost_id"); - if (periodic) { - ci->grav_ghost[0] = ghosts[2 * ghost_id + 0]; - ci->grav_ghost[1] = ghosts[2 * ghost_id + 1]; - } - - /* Loop over every other cell */ - for (int ii = 0; ii < cdim[0]; ii++) { - for (int jj = 0; jj < cdim[1]; jj++) { - for (int kk = 0; kk < cdim[2]; kk++) { - - /* Get the cell */ - const int cjd = cell_getid(cdim, ii, jj, kk); - struct cell *cj = &cells[cjd]; - - /* Avoid duplicates */ - if (cid <= cjd) continue; - - /* Skip cells without gravity particles */ - if (cj->gcount == 0) continue; + /* Cretae the multipole self and pair tasks. */ + void *extra_data[2] = {e, ghosts}; + threadpool_map(&e->threadpool, engine_make_self_gravity_tasks_mapper, NULL, + s->nr_cells, 1, 0, extra_data); - /* Is that neighbour local ? */ - if (cj->nodeID != nodeID) continue; // MATTHIEU - - /* Are the cells to close for a MM interaction ? */ - if (!gravity_multipole_accept(ci->multipole, cj->multipole, - theta_crit_inv, 1)) { - - scheduler_addtask(sched, task_type_pair, task_subtype_grav, 0, - 0, ci, cj); - } - } - } - } - } - } - } + /* Clean up. */ if (periodic) free(ghosts); } +/** + * @brief Constructs the top-level tasks for the external gravity. + * + * @param e The #engine. + */ void engine_make_external_gravity_tasks(struct engine *e) { struct space *s = e->s; @@ -1774,9 +1896,15 @@ void engine_make_external_gravity_tasks(struct engine *e) { * Additional loop over neighbours can later be added by simply duplicating * all the tasks created by this function. * - * @param e The #engine. + * @param map_data Offset of first two indices disguised as a pointer. + * @param num_elements Number of cells to traverse. 
+ * @param extra_data The #engine. */ -void engine_make_hydroloop_tasks(struct engine *e) { +void engine_make_hydroloop_tasks_mapper(void *map_data, int num_elements, + void *extra_data) { + + /* Extract the engine pointer. */ + struct engine *e = (struct engine *)extra_data; struct space *s = e->s; struct scheduler *sched = &e->sched; @@ -1784,53 +1912,53 @@ void engine_make_hydroloop_tasks(struct engine *e) { const int *cdim = s->cdim; struct cell *cells = s->cells_top; - /* Run through the highest level of cells and add pairs. */ - for (int i = 0; i < cdim[0]; i++) { - for (int j = 0; j < cdim[1]; j++) { - for (int k = 0; k < cdim[2]; k++) { - - /* Get the cell */ - const int cid = cell_getid(cdim, i, j, k); - struct cell *ci = &cells[cid]; - - /* Skip cells without hydro particles */ - if (ci->count == 0) continue; - - /* If the cells is local build a self-interaction */ - if (ci->nodeID == nodeID) - scheduler_addtask(sched, task_type_self, task_subtype_density, 0, 0, - ci, NULL); - - /* Now loop over all the neighbours of this cell */ - for (int ii = -1; ii < 2; ii++) { - int iii = i + ii; - if (!s->periodic && (iii < 0 || iii >= cdim[0])) continue; - iii = (iii + cdim[0]) % cdim[0]; - for (int jj = -1; jj < 2; jj++) { - int jjj = j + jj; - if (!s->periodic && (jjj < 0 || jjj >= cdim[1])) continue; - jjj = (jjj + cdim[1]) % cdim[1]; - for (int kk = -1; kk < 2; kk++) { - int kkk = k + kk; - if (!s->periodic && (kkk < 0 || kkk >= cdim[2])) continue; - kkk = (kkk + cdim[2]) % cdim[2]; - - /* Get the neighbouring cell */ - const int cjd = cell_getid(cdim, iii, jjj, kkk); - struct cell *cj = &cells[cjd]; - - /* Is that neighbour local and does it have particles ? 
*/ - if (cid >= cjd || cj->count == 0 || - (ci->nodeID != nodeID && cj->nodeID != nodeID)) - continue; - - /* Construct the pair task */ - const int sid = - sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))]; - scheduler_addtask(sched, task_type_pair, task_subtype_density, - sid, 0, ci, cj); - } - } + /* Loop through the elements, which are just byte offsets from NULL. */ + for (int ind = 0; ind < num_elements; ind++) { + + /* Get the cell index. */ + const int cid = (size_t)(map_data) + ind; + const int i = cid / (cdim[1] * cdim[2]); + const int j = (cid / cdim[2]) % cdim[1]; + const int k = cid % cdim[2]; + + /* Get the cell */ + struct cell *ci = &cells[cid]; + + /* Skip cells without hydro particles */ + if (ci->count == 0) continue; + + /* If the cells is local build a self-interaction */ + if (ci->nodeID == nodeID) + scheduler_addtask(sched, task_type_self, task_subtype_density, 0, 0, ci, + NULL); + + /* Now loop over all the neighbours of this cell */ + for (int ii = -1; ii < 2; ii++) { + int iii = i + ii; + if (!s->periodic && (iii < 0 || iii >= cdim[0])) continue; + iii = (iii + cdim[0]) % cdim[0]; + for (int jj = -1; jj < 2; jj++) { + int jjj = j + jj; + if (!s->periodic && (jjj < 0 || jjj >= cdim[1])) continue; + jjj = (jjj + cdim[1]) % cdim[1]; + for (int kk = -1; kk < 2; kk++) { + int kkk = k + kk; + if (!s->periodic && (kkk < 0 || kkk >= cdim[2])) continue; + kkk = (kkk + cdim[2]) % cdim[2]; + + /* Get the neighbouring cell */ + const int cjd = cell_getid(cdim, iii, jjj, kkk); + struct cell *cj = &cells[cjd]; + + /* Is that neighbour local and does it have particles ? 
*/ + if (cid >= cjd || cj->count == 0 || + (ci->nodeID != nodeID && cj->nodeID != nodeID)) + continue; + + /* Construct the pair task */ + const int sid = sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))]; + scheduler_addtask(sched, task_type_pair, task_subtype_density, sid, 0, + ci, cj); } } } @@ -1843,41 +1971,24 @@ void engine_make_hydroloop_tasks(struct engine *e) { * For each hydrodynamic and gravity task, construct the links with * the corresponding cell. Similarly, construct the dependencies for * all the sorting tasks. - * - * @param e The #engine. */ -void engine_count_and_link_tasks(struct engine *e) { +void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, + void *extra_data) { + struct engine *e = (struct engine *)extra_data; struct scheduler *const sched = &e->sched; - const int nr_tasks = sched->nr_tasks; - for (int ind = 0; ind < nr_tasks; ind++) { + for (int ind = 0; ind < num_elements; ind++) { + struct task *const t = &((struct task *)map_data)[ind]; - struct task *const t = &sched->tasks[ind]; struct cell *const ci = t->ci; struct cell *const cj = t->cj; - /* Link sort tasks to the next-higher sort task. */ + /* Link sort tasks to all the higher sort task. */ if (t->type == task_type_sort) { - struct cell *finger = t->ci->parent; - while (finger != NULL && finger->sorts == NULL) finger = finger->parent; - if (finger != NULL) scheduler_addunlock(sched, t, finger->sorts); - } - - /* Link drift tasks to the next-higher drift task. */ - else if (t->type == task_type_drift_part) { - struct cell *finger = ci->parent; - while (finger != NULL && finger->drift_part == NULL) - finger = finger->parent; - if (finger != NULL) scheduler_addunlock(sched, t, finger->drift_part); - } - - /* Link drift tasks to the next-higher drift task. 
*/ - else if (t->type == task_type_drift_gpart) { - struct cell *finger = ci->parent; - while (finger != NULL && finger->drift_gpart == NULL) - finger = finger->parent; - if (finger != NULL) scheduler_addunlock(sched, t, finger->drift_gpart); + for (struct cell *finger = t->ci->parent; finger != NULL; + finger = finger->parent) + if (finger->sorts != NULL) scheduler_addunlock(sched, t, finger->sorts); } /* Link self tasks to cells. */ @@ -2072,8 +2183,8 @@ static inline void engine_make_hydro_loops_dependencies( /* density loop --> ghost --> gradient loop --> extra_ghost */ /* extra_ghost --> force loop */ - scheduler_addunlock(sched, density, c->super->ghost); - scheduler_addunlock(sched, c->super->ghost, gradient); + scheduler_addunlock(sched, density, c->super->ghost_in); + scheduler_addunlock(sched, c->super->ghost_out, gradient); scheduler_addunlock(sched, gradient, c->super->extra_ghost); scheduler_addunlock(sched, c->super->extra_ghost, force); @@ -2103,8 +2214,8 @@ static inline void engine_make_hydro_loops_dependencies(struct scheduler *sched, struct cell *c, int with_cooling) { /* density loop --> ghost --> force loop */ - scheduler_addunlock(sched, density, c->super->ghost); - scheduler_addunlock(sched, c->super->ghost, force); + scheduler_addunlock(sched, density, c->super->ghost_in); + scheduler_addunlock(sched, c->super->ghost_out, force); if (with_cooling) { /* force loop --> cooling (--> kick2) */ @@ -2125,32 +2236,32 @@ static inline void engine_make_hydro_loops_dependencies(struct scheduler *sched, * corresponding to the second hydro loop over neighbours. * With all the relevant tasks for a given cell available, we construct * all the dependencies for that cell. - * - * @param e The #engine. 
*/ -void engine_make_extra_hydroloop_tasks(struct engine *e) { +void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, + void *extra_data) { + struct engine *e = (struct engine *)extra_data; struct scheduler *sched = &e->sched; - const int nr_tasks = sched->nr_tasks; const int nodeID = e->nodeID; const int with_cooling = (e->policy & engine_policy_cooling); - for (int ind = 0; ind < nr_tasks; ind++) { - struct task *t = &sched->tasks[ind]; + for (int ind = 0; ind < num_elements; ind++) { + struct task *t = &((struct task *)map_data)[ind]; /* Sort tasks depend on the drift of the cell. */ if (t->type == task_type_sort && t->ci->nodeID == engine_rank) { - scheduler_addunlock(sched, t->ci->drift_part, t); + scheduler_addunlock(sched, t->ci->super->drift_part, t); } /* Self-interaction? */ else if (t->type == task_type_self && t->subtype == task_subtype_density) { - /* Make all density tasks depend on the drift. */ - scheduler_addunlock(sched, t->ci->drift_part, t); + /* Make all density tasks depend on the drift and the sorts. */ + scheduler_addunlock(sched, t->ci->super->drift_part, t); + scheduler_addunlock(sched, t->ci->super->sorts, t); #ifdef EXTRA_HYDRO_LOOP - /* Start by constructing the task for the second and third hydro loop */ + /* Start by constructing the task for the second and third hydro loop. */ struct task *t2 = scheduler_addtask( sched, task_type_self, task_subtype_gradient, 0, 0, t->ci, NULL); struct task *t3 = scheduler_addtask( @@ -2181,11 +2292,15 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) { /* Otherwise, pair interaction? */ else if (t->type == task_type_pair && t->subtype == task_subtype_density) { - /* Make all density tasks depend on the drift. */ + /* Make all density tasks depend on the drift and the sorts. 
*/ if (t->ci->nodeID == engine_rank) - scheduler_addunlock(sched, t->ci->drift_part, t); - if (t->cj->nodeID == engine_rank) - scheduler_addunlock(sched, t->cj->drift_part, t); + scheduler_addunlock(sched, t->ci->super->drift_part, t); + scheduler_addunlock(sched, t->ci->super->sorts, t); + if (t->ci->super != t->cj->super) { + if (t->cj->nodeID == engine_rank) + scheduler_addunlock(sched, t->cj->super->drift_part, t); + scheduler_addunlock(sched, t->cj->super->sorts, t); + } #ifdef EXTRA_HYDRO_LOOP /* Start by constructing the task for the second and third hydro loop */ @@ -2238,6 +2353,10 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) { else if (t->type == task_type_sub_self && t->subtype == task_subtype_density) { + /* Make all density tasks depend on the drift and sorts. */ + scheduler_addunlock(sched, t->ci->super->drift_part, t); + scheduler_addunlock(sched, t->ci->super->sorts, t); + #ifdef EXTRA_HYDRO_LOOP /* Start by constructing the task for the second and third hydro loop */ @@ -2280,6 +2399,16 @@ void engine_make_extra_hydroloop_tasks(struct engine *e) { else if (t->type == task_type_sub_pair && t->subtype == task_subtype_density) { + /* Make all density tasks depend on the drift. */ + if (t->ci->nodeID == engine_rank) + scheduler_addunlock(sched, t->ci->super->drift_part, t); + scheduler_addunlock(sched, t->ci->super->sorts, t); + if (t->ci->super != t->cj->super) { + if (t->cj->nodeID == engine_rank) + scheduler_addunlock(sched, t->cj->super->drift_part, t); + scheduler_addunlock(sched, t->cj->super->sorts, t); + } + #ifdef EXTRA_HYDRO_LOOP /* Start by constructing the task for the second and third hydro loop */ @@ -2364,21 +2493,6 @@ void engine_make_gravityrecursive_tasks(struct engine *e) { /* } */ } -void engine_check_sort_tasks(struct engine *e, struct cell *c) { - - /* Find the parent sort task, if any, and copy its flags. 
*/ - if (c->sorts != NULL) { - struct cell *parent = c->parent; - while (parent != NULL && parent->sorts == NULL) parent = parent->parent; - if (parent != NULL) c->sorts->flags |= parent->sorts->flags; - } - - /* Recurse? */ - if (c->split) - for (int k = 0; k < 8; k++) - if (c->progeny[k] != NULL) engine_check_sort_tasks(e, c->progeny[k]); -} - /** * @brief Fill the #space's task list. * @@ -2396,7 +2510,10 @@ void engine_maketasks(struct engine *e) { scheduler_reset(sched, s->tot_cells * engine_maxtaskspercell); /* Construct the firt hydro loop over neighbours */ - if (e->policy & engine_policy_hydro) engine_make_hydroloop_tasks(e); + if (e->policy & engine_policy_hydro) { + threadpool_map(&e->threadpool, engine_make_hydroloop_tasks_mapper, NULL, + s->nr_cells, 1, 0, e); + } /* Add the self gravity tasks. */ if (e->policy & engine_policy_self_gravity) engine_make_self_gravity_tasks(e); @@ -2411,17 +2528,31 @@ void engine_maketasks(struct engine *e) { /* Split the tasks. */ scheduler_splittasks(sched); - /* Allocate the list of cell-task links. The maximum number of links is the - * number of cells (s->tot_cells) times the number of neighbours (26) times - * the number of interaction types, so 26 * 3 (density, force, grav) pairs - * and 4 (density, force, grav, ext_grav) self. - */ + /* Free the old list of cell-task links. */ if (e->links != NULL) free(e->links); + e->size_links = 0; + +/* The maximum number of links is the + * number of cells (s->tot_cells) times the number of neighbours (26) times + * the number of interaction types, so 26 * 2 (density, force) pairs + * and 2 (density, force) self. 
+ */ #ifdef EXTRA_HYDRO_LOOP - e->size_links = s->tot_cells * (26 * 4 + 4); + const int hydro_tasks_per_cell = 27 * 3; #else - e->size_links = s->tot_cells * (26 * 3 + 4); + const int hydro_tasks_per_cell = 27 * 2; #endif + const int self_grav_tasks_per_cell = 27 * 2; + const int ext_grav_tasks_per_cell = 1; + + if (e->policy & engine_policy_hydro) + e->size_links += s->tot_cells * hydro_tasks_per_cell; + if (e->policy & engine_policy_external_gravity) + e->size_links += s->tot_cells * ext_grav_tasks_per_cell; + if (e->policy & engine_policy_self_gravity) + e->size_links += s->tot_cells * self_grav_tasks_per_cell; + + /* Allocate the new list */ if ((e->links = malloc(sizeof(struct link) * e->size_links)) == NULL) error("Failed to allocate cell-task links."); e->nr_links = 0; @@ -2433,23 +2564,23 @@ void engine_maketasks(struct engine *e) { /* Count the number of tasks associated with each cell and store the density tasks in each cell, and make each sort depend on the sorts of its progeny. */ - engine_count_and_link_tasks(e); + threadpool_map(&e->threadpool, engine_count_and_link_tasks_mapper, + sched->tasks, sched->nr_tasks, sizeof(struct task), 0, e); /* Now that the self/pair tasks are at the right level, set the super * pointers. */ - for (int k = 0; k < nr_cells; k++) cell_set_super(&cells[k], NULL); - - /* Append hierarchical tasks to each cell. */ - for (int k = 0; k < nr_cells; k++) - engine_make_hierarchical_tasks(e, &cells[k]); + threadpool_map(&e->threadpool, cell_set_super_mapper, cells, nr_cells, + sizeof(struct cell), 0, NULL); /* Append hierarchical tasks to each cell. */ - for (int k = 0; k < nr_cells; k++) engine_check_sort_tasks(e, &cells[k]); + threadpool_map(&e->threadpool, engine_make_hierarchical_tasks_mapper, cells, + nr_cells, sizeof(struct cell), 0, e); /* Run through the tasks and make force tasks for each density task. Each force task depends on the cell ghosts and unlocks the kick task of its super-cell. 
*/ - if (e->policy & engine_policy_hydro) engine_make_extra_hydroloop_tasks(e); + threadpool_map(&e->threadpool, engine_make_extra_hydroloop_tasks_mapper, + sched->tasks, sched->nr_tasks, sizeof(struct task), 0, e); /* Add the dependencies for the gravity stuff */ if (e->policy & (engine_policy_self_gravity | engine_policy_external_gravity)) @@ -2523,6 +2654,11 @@ void engine_marktasks_mapper(void *map_data, int num_elements, /* Set this task's skip. */ if (cell_is_active(t->ci, e)) scheduler_activate(s, t); + + /* Store current values of dx_max and h_max. */ + if (t->type == task_type_sub_self && t->subtype == task_subtype_density) { + cell_activate_subcell_tasks(t->ci, NULL, s); + } } /* Pair? */ @@ -2532,168 +2668,169 @@ void engine_marktasks_mapper(void *map_data, int num_elements, struct cell *ci = t->ci; struct cell *cj = t->cj; - /* Set this task's skip, otherwise nothing to do. */ - if (cell_is_active(t->ci, e) || cell_is_active(t->cj, e)) + /* If this task does not involve any active cells, skip it. */ + if (!cell_is_active(t->ci, e) && !cell_is_active(t->cj, e)) continue; + + /* Only activate tasks that involve a local active cell. */ + if ((cell_is_active(ci, e) && ci->nodeID == engine_rank) || + (cj != NULL && cell_is_active(cj, e) && cj->nodeID == engine_rank)) { scheduler_activate(s, t); - else - continue; - - /* If this is not a density task, we don't have to do any of the below. */ - if (t->subtype != task_subtype_density) continue; - - /* Too much particle movement? 
*/ - if (max(ci->h_max, cj->h_max) + ci->dx_max_part + cj->dx_max_part > - cj->dmin) - *rebuild_space = 1; - - /* Set the correct sorting flags */ - if (t->type == task_type_pair) { - if (ci->dx_max_sort > space_maxreldx * ci->dmin) { - for (struct cell *finger = ci; finger != NULL; - finger = finger->parent) - finger->sorted = 0; - } - if (cj->dx_max_sort > space_maxreldx * cj->dmin) { - for (struct cell *finger = cj; finger != NULL; - finger = finger->parent) - finger->sorted = 0; - } - if (!(ci->sorted & (1 << t->flags))) { -#ifdef SWIFT_DEBUG_CHECKS - if (!(ci->sorts->flags & (1 << t->flags))) - error("bad flags in sort task."); -#endif - scheduler_activate(s, ci->sorts); - if (ci->nodeID == engine_rank) scheduler_activate(s, ci->drift_part); + + /* Set the correct sorting flags */ + if (t->type == task_type_pair && t->subtype == task_subtype_density) { + /* Store some values. */ + atomic_or(&ci->requires_sorts, 1 << t->flags); + atomic_or(&cj->requires_sorts, 1 << t->flags); + ci->dx_max_sort_old = ci->dx_max_sort; + cj->dx_max_sort_old = cj->dx_max_sort; + + /* Activate the drift tasks. */ + if (ci->nodeID == engine_rank) cell_activate_drift_part(ci, s); + if (cj->nodeID == engine_rank) cell_activate_drift_part(cj, s); + + /* Check the sorts and activate them if needed. */ + cell_activate_sorts(ci, t->flags, s); + cell_activate_sorts(cj, t->flags, s); } - if (!(cj->sorted & (1 << t->flags))) { -#ifdef SWIFT_DEBUG_CHECKS - if (!(cj->sorts->flags & (1 << t->flags))) - error("bad flags in sort task."); -#endif - scheduler_activate(s, cj->sorts); - if (cj->nodeID == engine_rank) scheduler_activate(s, cj->drift_part); + /* Store current values of dx_max and h_max. */ + else if (t->type == task_type_sub_pair && + t->subtype == task_subtype_density) { + cell_activate_subcell_tasks(t->ci, t->cj, s); } } -#ifdef WITH_MPI - /* Activate the send/recv flags. */ - if (ci->nodeID != engine_rank) { + /* Only interested in density tasks as of here. 
*/ + if (t->subtype == task_subtype_density) { + + /* Too much particle movement? */ + if (cell_need_rebuild_for_pair(ci, cj)) *rebuild_space = 1; - /* Activate the tasks to recv foreign cell ci's data. */ - scheduler_activate(s, ci->recv_xv); - if (cell_is_active(ci, e)) { - scheduler_activate(s, ci->recv_rho); +#ifdef WITH_MPI + /* Activate the send/recv tasks. */ + if (ci->nodeID != engine_rank) { + + /* If the local cell is active, receive data from the foreign cell. */ + if (cell_is_active(cj, e)) { + scheduler_activate(s, ci->recv_xv); + if (cell_is_active(ci, e)) { + scheduler_activate(s, ci->recv_rho); #ifdef EXTRA_HYDRO_LOOP - scheduler_activate(s, ci->recv_gradient); + scheduler_activate(s, ci->recv_gradient); #endif - scheduler_activate(s, ci->recv_ti); - } + } + } - /* Look for the local cell cj's send tasks. */ - struct link *l = NULL; - for (l = cj->send_xv; l != NULL && l->t->cj->nodeID != ci->nodeID; - l = l->next) - ; - if (l == NULL) error("Missing link to send_xv task."); - scheduler_activate(s, l->t); - - /* Drift both cells, the foreign one at the level which it is sent. */ - if (l->t->ci->drift_part) - scheduler_activate(s, l->t->ci->drift_part); - else - error("Drift task missing !"); - if (t->type == task_type_pair) scheduler_activate(s, cj->drift_part); - - if (cell_is_active(cj, e)) { - - for (l = cj->send_rho; l != NULL && l->t->cj->nodeID != ci->nodeID; - l = l->next) - ; - if (l == NULL) error("Missing link to send_rho task."); - scheduler_activate(s, l->t); + /* If the foreign cell is active, we want its ti_end values. */ + if (cell_is_active(ci, e)) scheduler_activate(s, ci->recv_ti); + + /* Look for the local cell cj's send tasks. 
*/ + if (cell_is_active(ci, e)) { + struct link *l = NULL; + for (l = cj->send_xv; l != NULL && l->t->cj->nodeID != ci->nodeID; + l = l->next) + ; + if (l == NULL) error("Missing link to send_xv task."); + scheduler_activate(s, l->t); + + /* Drift the cell which will be sent at the level at which it is + sent, i.e. drift the cell specified in the send task (l->t) + itself. */ + cell_activate_drift_part(l->t->ci, s); + + if (cell_is_active(cj, e)) { + struct link *l = NULL; + for (l = cj->send_rho; + l != NULL && l->t->cj->nodeID != ci->nodeID; l = l->next) + ; + if (l == NULL) error("Missing link to send_rho task."); + scheduler_activate(s, l->t); #ifdef EXTRA_HYDRO_LOOP - for (l = cj->send_gradient; - l != NULL && l->t->cj->nodeID != ci->nodeID; l = l->next) - ; - if (l == NULL) error("Missing link to send_gradient task."); - scheduler_activate(s, l->t); + for (l = cj->send_gradient; + l != NULL && l->t->cj->nodeID != ci->nodeID; l = l->next) + ; + if (l == NULL) error("Missing link to send_gradient task."); + scheduler_activate(s, l->t); #endif + } + } - for (l = cj->send_ti; l != NULL && l->t->cj->nodeID != ci->nodeID; - l = l->next) - ; - if (l == NULL) error("Missing link to send_ti task."); - scheduler_activate(s, l->t); - } + /* If the local cell is active, send its ti_end values. */ + if (cell_is_active(cj, e)) { + struct link *l = NULL; + for (l = cj->send_ti; l != NULL && l->t->cj->nodeID != ci->nodeID; + l = l->next) + ; + if (l == NULL) error("Missing link to send_ti task."); + scheduler_activate(s, l->t); + } - } else if (cj->nodeID != engine_rank) { + } else if (cj->nodeID != engine_rank) { - /* Activate the tasks to recv foreign cell cj's data. */ - scheduler_activate(s, cj->recv_xv); - if (cell_is_active(cj, e)) { - scheduler_activate(s, cj->recv_rho); + /* If the local cell is active, receive data from the foreign cell. 
*/ + if (cell_is_active(ci, e)) { + scheduler_activate(s, cj->recv_xv); + if (cell_is_active(cj, e)) { + scheduler_activate(s, cj->recv_rho); #ifdef EXTRA_HYDRO_LOOP - scheduler_activate(s, cj->recv_gradient); + scheduler_activate(s, cj->recv_gradient); #endif - scheduler_activate(s, cj->recv_ti); - } + } + } + + /* If the foreign cell is active, we want its ti_end values. */ + if (cell_is_active(cj, e)) scheduler_activate(s, cj->recv_ti); + + /* Look for the local cell ci's send tasks. */ + if (cell_is_active(cj, e)) { + struct link *l = NULL; + for (l = ci->send_xv; l != NULL && l->t->cj->nodeID != cj->nodeID; + l = l->next) + ; + if (l == NULL) error("Missing link to send_xv task."); + scheduler_activate(s, l->t); + + /* Drift the cell which will be sent at the level at which it is + sent, i.e. drift the cell specified in the send task (l->t) + itself. */ + cell_activate_drift_part(l->t->ci, s); - /* Look for the local cell ci's send tasks. */ - struct link *l = NULL; - for (l = ci->send_xv; l != NULL && l->t->cj->nodeID != cj->nodeID; - l = l->next) - ; - if (l == NULL) error("Missing link to send_xv task."); - scheduler_activate(s, l->t); - - /* Drift both cells, the foreign one at the level which it is sent. 
*/ - if (l->t->ci->drift_part) - scheduler_activate(s, l->t->ci->drift_part); - else - error("Drift task missing !"); - if (t->type == task_type_pair) scheduler_activate(s, ci->drift_part); - - if (cell_is_active(ci, e)) { - for (l = ci->send_rho; l != NULL && l->t->cj->nodeID != cj->nodeID; - l = l->next) - ; - if (l == NULL) error("Missing link to send_rho task."); - scheduler_activate(s, l->t); + if (cell_is_active(ci, e)) { + + struct link *l = NULL; + for (l = ci->send_rho; + l != NULL && l->t->cj->nodeID != cj->nodeID; l = l->next) + ; + if (l == NULL) error("Missing link to send_rho task."); + scheduler_activate(s, l->t); #ifdef EXTRA_HYDRO_LOOP - for (l = ci->send_gradient; - l != NULL && l->t->cj->nodeID != cj->nodeID; l = l->next) - ; - if (l == NULL) error("Missing link to send_gradient task."); - scheduler_activate(s, l->t); + for (l = ci->send_gradient; + l != NULL && l->t->cj->nodeID != cj->nodeID; l = l->next) + ; + if (l == NULL) error("Missing link to send_gradient task."); + scheduler_activate(s, l->t); #endif + } + } - for (l = ci->send_ti; l != NULL && l->t->cj->nodeID != cj->nodeID; - l = l->next) - ; - if (l == NULL) error("Missing link to send_ti task."); - scheduler_activate(s, l->t); + /* If the local cell is active, send its ti_end values. */ + if (cell_is_active(ci, e)) { + struct link *l = NULL; + for (l = ci->send_ti; l != NULL && l->t->cj->nodeID != cj->nodeID; + l = l->next) + ; + if (l == NULL) error("Missing link to send_ti task."); + scheduler_activate(s, l->t); + } } - - } else if (t->type == task_type_pair) { - scheduler_activate(s, ci->drift_part); - scheduler_activate(s, cj->drift_part); - } -#else - if (t->type == task_type_pair) { - scheduler_activate(s, ci->drift_part); - scheduler_activate(s, cj->drift_part); - } #endif + } } /* Kick/Drift/init ? 
*/ - else if (t->type == task_type_kick1 || t->type == task_type_kick2 || - t->type == task_type_drift_part || - t->type == task_type_drift_gpart || - t->type == task_type_init_grav) { + if (t->type == task_type_kick1 || t->type == task_type_kick2 || + t->type == task_type_drift_gpart || t->type == task_type_init_grav) { if (cell_is_active(t->ci, e)) scheduler_activate(s, t); } @@ -2733,7 +2870,7 @@ int engine_marktasks(struct engine *e) { /* Run through the tasks and mark as skip or not. */ size_t extra_data[3] = {(size_t)e, rebuild_space, (size_t)&e->sched}; threadpool_map(&e->threadpool, engine_marktasks_mapper, s->tasks, s->nr_tasks, - sizeof(struct task), 10000, extra_data); + sizeof(struct task), 0, extra_data); rebuild_space = extra_data[1]; if (e->verbose) @@ -2790,8 +2927,10 @@ void engine_print_task_counts(struct engine *e) { * @brief Rebuild the space and tasks. * * @param e The #engine. + * @param clean_h_values Are we cleaning up the values of h before building + * the tasks ? */ -void engine_rebuild(struct engine *e) { +void engine_rebuild(struct engine *e, int clean_h_values) { const ticks tic = getticks(); @@ -2802,7 +2941,7 @@ void engine_rebuild(struct engine *e) { space_rebuild(e->s, e->verbose); /* Initial cleaning up session ? */ - if (e->s->sanitized == 0) space_sanitize(e->s); + if (clean_h_values) space_sanitize(e->s); /* If in parallel, exchange the cell structure. */ #ifdef WITH_MPI @@ -2856,7 +2995,7 @@ void engine_prepare(struct engine *e) { if (e->forcerepart) engine_repartition(e); /* Do we need rebuilding ? */ - if (e->forcerebuild) engine_rebuild(e); + if (e->forcerebuild) engine_rebuild(e, 0); /* Unskip active tasks and check for rebuild */ engine_unskip(e); @@ -2878,39 +3017,14 @@ void engine_prepare(struct engine *e) { * @brief Implements a barrier for the #runner threads. * * @param e The #engine. - * @param tid The thread ID */ -void engine_barrier(struct engine *e, int tid) { - - /* First, get the barrier mutex. 
*/ - if (pthread_mutex_lock(&e->barrier_mutex) != 0) - error("Failed to get barrier mutex."); - - /* This thread is no longer running. */ - e->barrier_running -= 1; +void engine_barrier(struct engine *e) { - /* If all threads are in, send a signal... */ - if (e->barrier_running == 0) - if (pthread_cond_broadcast(&e->barrier_cond) != 0) - error("Failed to broadcast barrier full condition."); + /* Wait at the wait barrier. */ + pthread_barrier_wait(&e->wait_barrier); - /* Wait for the barrier to open. */ - while (e->barrier_launch == 0 || tid >= e->barrier_launchcount) - if (pthread_cond_wait(&e->barrier_cond, &e->barrier_mutex) != 0) - error("Error waiting for barrier to close."); - - /* This thread has been launched. */ - e->barrier_running += 1; - e->barrier_launch -= 1; - - /* If I'm the last one out, signal the condition again. */ - if (e->barrier_launch == 0) - if (pthread_cond_broadcast(&e->barrier_cond) != 0) - error("Failed to broadcast empty barrier condition."); - - /* Last but not least, release the mutex. */ - if (pthread_mutex_unlock(&e->barrier_mutex) != 0) - error("Failed to get unlock the barrier mutex."); + /* Wait at the run barrier. */ + pthread_barrier_wait(&e->run_barrier); } /** @@ -3145,6 +3259,9 @@ void engine_skip_force_and_kick(struct engine *e) { t->type == task_type_cooling || t->type == task_type_sourceterms) t->skip = 1; } + + /* Run through the cells and clear some flags. */ + space_map_cells_pre(e->s, 1, cell_clear_drift_flags, NULL); } /** @@ -3161,19 +3278,20 @@ void engine_skip_drift(struct engine *e) { struct task *t = &tasks[i]; - /* Skip everything that moves the particles */ - if (t->type == task_type_drift_part || t->type == task_type_drift_gpart) - t->skip = 1; + /* Skip everything that updates the particles */ + if (t->type == task_type_drift_part) t->skip = 1; } + + /* Run through the cells and clear some flags. */ + space_map_cells_pre(e->s, 1, cell_clear_drift_flags, NULL); } /** * @brief Launch the runners. 
* * @param e The #engine. - * @param nr_runners The number of #runner to let loose. */ -void engine_launch(struct engine *e, int nr_runners) { +void engine_launch(struct engine *e) { const ticks tic = getticks(); @@ -3186,15 +3304,10 @@ void engine_launch(struct engine *e, int nr_runners) { atomic_inc(&e->sched.waiting); /* Cry havoc and let loose the dogs of war. */ - e->barrier_launch = nr_runners; - e->barrier_launchcount = nr_runners; - if (pthread_cond_broadcast(&e->barrier_cond) != 0) - error("Failed to broadcast barrier open condition."); + pthread_barrier_wait(&e->run_barrier); /* Load the tasks. */ - pthread_mutex_unlock(&e->barrier_mutex); scheduler_start(&e->sched); - pthread_mutex_lock(&e->barrier_mutex); /* Remove the safeguard. */ pthread_mutex_lock(&e->sched.sleep_mutex); @@ -3203,9 +3316,7 @@ void engine_launch(struct engine *e, int nr_runners) { pthread_mutex_unlock(&e->sched.sleep_mutex); /* Sit back and wait for the runners to come home. */ - while (e->barrier_launch || e->barrier_running) - if (pthread_cond_wait(&e->barrier_cond, &e->barrier_mutex) != 0) - error("Error while waiting for barrier."); + pthread_barrier_wait(&e->wait_barrier); if (e->verbose) message("took %.3f %s.", clocks_from_ticks(getticks() - tic), @@ -3218,9 +3329,12 @@ void engine_launch(struct engine *e, int nr_runners) { * * @param e The #engine * @param flag_entropy_ICs Did the 'Internal Energy' of the particles actually - *contain entropy ? + * contain entropy ? + * @param clean_h_values Are we cleaning up the values of h before building + * the tasks ? */ -void engine_init_particles(struct engine *e, int flag_entropy_ICs) { +void engine_init_particles(struct engine *e, int flag_entropy_ICs, + int clean_h_values) { struct space *s = e->s; @@ -3237,7 +3351,7 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs) { } /* Construct all cells and tasks to start everything */ - engine_rebuild(e); + engine_rebuild(e, clean_h_values); /* No time integration. 
We just want the density and ghosts */ engine_skip_force_and_kick(e); @@ -3252,7 +3366,7 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs) { /* Now, launch the calculation */ TIMER_TIC; - engine_launch(e, e->nr_threads); + engine_launch(e); TIMER_TOC(timer_runners); /* Apply some conversions (e.g. internal energy -> entropy) */ @@ -3268,7 +3382,7 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs) { if (hydro_need_extra_init_loop) { engine_marktasks(e); engine_skip_force_and_kick(e); - engine_launch(e, e->nr_threads); + engine_launch(e); } } @@ -3310,7 +3424,7 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs) { #endif /* Run the 0th time-step */ - engine_launch(e, e->nr_threads); + engine_launch(e); #ifdef SWIFT_GRAVITY_FORCE_CHECKS /* Check the accuracy of the gravity calculation */ @@ -3321,6 +3435,69 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs) { /* Recover the (integer) end of the next time-step */ engine_collect_timestep_and_rebuild(e, 1); + /* Check if any particles have the same position. This is not + * allowed (/0) so we abort.*/ + if (s->nr_parts > 0) { + + /* Sorting should put the same positions next to each other... */ + int failed = 0; + double *prev_x = s->parts[0].x; + for (size_t k = 1; k < s->nr_parts; k++) { + if (prev_x[0] == s->parts[k].x[0] && prev_x[1] == s->parts[k].x[1] && + prev_x[2] == s->parts[k].x[2]) { + if (e->verbose) + message("Two particles occupy location: %f %f %f", prev_x[0], + prev_x[1], prev_x[2]); + failed++; + } + prev_x = s->parts[k].x; + } + if (failed > 0) + error( + "Have %d particle pairs with the same locations.\n" + "Cannot continue", + failed); + } + + /* Also check any gparts. This is not supposed to be fatal so only warn. 
*/ + if (s->nr_gparts > 0) { + int failed = 0; + double *prev_x = s->gparts[0].x; + for (size_t k = 1; k < s->nr_gparts; k++) { + if (prev_x[0] == s->gparts[k].x[0] && prev_x[1] == s->gparts[k].x[1] && + prev_x[2] == s->gparts[k].x[2]) { + if (e->verbose) + message("Two gparts occupy location: %f %f %f / %f %f %f", prev_x[0], + prev_x[1], prev_x[2], s->gparts[k].x[0], s->gparts[k].x[1], + s->gparts[k].x[2]); + failed++; + } + prev_x = s->gparts[k].x; + } + if (failed > 0) + message( + "WARNING: found %d gpart pairs at the same location. " + "That is not optimal", + failed); + } + + /* Check the top-level cell h_max matches the particles as these can be + * updated in the the ghost tasks (only a problem if the ICs estimates for h + * are too small). Note this must be followed by a rebuild as sub-cells will + * not be updated until that is done. */ + if (s->cells_top != NULL && s->nr_parts > 0) { + for (int i = 0; i < s->nr_cells; i++) { + struct cell *c = &s->cells_top[i]; + if (c->nodeID == engine_rank && c->count > 0) { + float part_h_max = c->parts[0].h; + for (int k = 1; k < c->count; k++) { + if (c->parts[k].h > part_h_max) part_h_max = c->parts[k].h; + } + c->h_max = max(part_h_max, c->h_max); + } + } + } + clocks_gettime(&time2); #ifdef SWIFT_DEBUG_CHECKS @@ -3421,7 +3598,7 @@ void engine_step(struct engine *e) { /* Start all the tasks. 
*/ TIMER_TIC; - engine_launch(e, e->nr_threads); + engine_launch(e); TIMER_TOC(timer_runners); #ifdef SWIFT_GRAVITY_FORCE_CHECKS @@ -3537,7 +3714,7 @@ void engine_do_drift_all_mapper(void *map_data, int num_elements, struct cell *c = &cells[ind]; if (c != NULL && c->nodeID == e->nodeID) { /* Drift all the particles */ - cell_drift_part(c, e); + cell_drift_part(c, e, 1); /* Drift all the g-particles */ cell_drift_gpart(c, e); @@ -3564,7 +3741,7 @@ void engine_drift_all(struct engine *e) { #endif threadpool_map(&e->threadpool, engine_do_drift_all_mapper, e->s->cells_top, - e->s->nr_cells, sizeof(struct cell), 1, e); + e->s->nr_cells, sizeof(struct cell), 0, e); /* Synchronize particle positions */ space_synchronize_particle_positions(e->s); @@ -3616,7 +3793,7 @@ void engine_drift_top_multipoles(struct engine *e) { const ticks tic = getticks(); threadpool_map(&e->threadpool, engine_do_drift_top_multipoles_mapper, - e->s->cells_top, e->s->nr_cells, sizeof(struct cell), 10, e); + e->s->cells_top, e->s->nr_cells, sizeof(struct cell), 0, e); #ifdef SWIFT_DEBUG_CHECKS /* Check that all cells have been drifted to the current time. */ @@ -3654,7 +3831,7 @@ void engine_reconstruct_multipoles(struct engine *e) { const ticks tic = getticks(); threadpool_map(&e->threadpool, engine_do_reconstruct_multipoles_mapper, - e->s->cells_top, e->s->nr_cells, sizeof(struct cell), 10, e); + e->s->cells_top, e->s->nr_cells, sizeof(struct cell), 0, e); if (e->verbose) message("took %.3f %s.", clocks_from_ticks(getticks() - tic), @@ -4054,7 +4231,7 @@ void engine_init(struct engine *e, struct space *s, e->parameter_file = params; #ifdef WITH_MPI e->cputime_last_step = 0; - e->last_repartition = -1; + e->last_repartition = 0; #endif engine_rank = nodeID; @@ -4228,7 +4405,7 @@ void engine_init(struct engine *e, struct space *s, "Version: %s \n# " "Number of threads: %d\n# Number of MPI ranks: %d\n# Hydrodynamic " "scheme: %s\n# Hydrodynamic kernel: %s\n# No. 
of neighbours: %.2f " - "+/- %.2f\n# Eta: %f\n", + "+/- %.4f\n# Eta: %f\n", hostname(), git_branch(), git_revision(), compiler_name(), compiler_version(), e->nr_threads, e->nr_nodes, SPH_IMPLEMENTATION, kernel_name, e->hydro_properties->target_neighbours, @@ -4322,20 +4499,14 @@ void engine_init(struct engine *e, struct space *s, threadpool_init(&e->threadpool, e->nr_threads); /* First of all, init the barrier and lock it. */ - if (pthread_mutex_init(&e->barrier_mutex, NULL) != 0) - error("Failed to initialize barrier mutex."); - if (pthread_cond_init(&e->barrier_cond, NULL) != 0) - error("Failed to initialize barrier condition variable."); - if (pthread_mutex_lock(&e->barrier_mutex) != 0) - error("Failed to lock barrier mutex."); - e->barrier_running = 0; - e->barrier_launch = 0; - e->barrier_launchcount = 0; + if (pthread_barrier_init(&e->wait_barrier, NULL, e->nr_threads + 1) != 0 || + pthread_barrier_init(&e->run_barrier, NULL, e->nr_threads + 1) != 0) + error("Failed to initialize barrier."); /* Init the scheduler with enough tasks for the initial sorting tasks. */ const int nr_tasks = 2 * s->tot_cells + 2 * e->nr_threads; - scheduler_init(&e->sched, e->s, nr_tasks, nr_queues, scheduler_flag_steal, - e->nodeID, &e->threadpool); + scheduler_init(&e->sched, e->s, nr_tasks, nr_queues, + (policy & scheduler_flag_steal), e->nodeID, &e->threadpool); /* Allocate and init the threads. */ if ((e->runners = (struct runner *)malloc(sizeof(struct runner) * @@ -4344,7 +4515,6 @@ void engine_init(struct engine *e, struct space *s, for (int k = 0; k < e->nr_threads; k++) { e->runners[k].id = k; e->runners[k].e = e; - e->barrier_running += 1; if (pthread_create(&e->runners[k].thread, NULL, &runner_main, &e->runners[k]) != 0) error("Failed to create runner thread."); @@ -4380,8 +4550,12 @@ void engine_init(struct engine *e, struct space *s, e->runners[k].qid = k * nr_queues / e->nr_threads; } -#ifdef WITH_VECTORIZATION /* Allocate particle caches. 
*/ + e->runners[k].ci_gravity_cache.count = 0; + e->runners[k].cj_gravity_cache.count = 0; + gravity_cache_init(&e->runners[k].ci_gravity_cache, space_splitsize); + gravity_cache_init(&e->runners[k].cj_gravity_cache, space_splitsize); +#ifdef WITH_VECTORIZATION e->runners[k].ci_cache.count = 0; e->runners[k].cj_cache.count = 0; cache_init(&e->runners[k].ci_cache, CACHE_SIZE); @@ -4407,9 +4581,7 @@ void engine_init(struct engine *e, struct space *s, #endif /* Wait for the runner threads to be in place. */ - while (e->barrier_running || e->barrier_launch) - if (pthread_cond_wait(&e->barrier_cond, &e->barrier_mutex) != 0) - error("Error while waiting for runner threads to get in place."); + pthread_barrier_wait(&e->wait_barrier); } /** @@ -4423,7 +4595,7 @@ void engine_print_policy(struct engine *e) { if (e->nodeID == 0) { printf("[0000] %s engine_policy: engine policies are [ ", clocks_get_timesincestart()); - for (int k = 1; k < 32; k++) + for (int k = 0; k <= engine_maxpolicy; k++) if (e->policy & (1 << k)) printf(" %s ", engine_policy_names[k + 1]); printf(" ]\n"); fflush(stdout); @@ -4431,7 +4603,7 @@ void engine_print_policy(struct engine *e) { #else printf("%s engine_policy: engine policies are [ ", clocks_get_timesincestart()); - for (int k = 1; k < 31; k++) + for (int k = 0; k <= engine_maxpolicy; k++) if (e->policy & (1 << k)) printf(" %s ", engine_policy_names[k + 1]); printf(" ]\n"); fflush(stdout); @@ -4474,8 +4646,12 @@ void engine_compute_next_snapshot_time(struct engine *e) { void engine_clean(struct engine *e) { #ifdef WITH_VECTORIZATION - for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].ci_cache); - for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].cj_cache); + for (int i = 0; i < e->nr_threads; ++i) { + cache_clean(&e->runners[i].ci_cache); + cache_clean(&e->runners[i].cj_cache); + gravity_cache_clean(&e->runners[i].ci_gravity_cache); + gravity_cache_clean(&e->runners[i].cj_gravity_cache); + } #endif 
free(e->runners); free(e->snapshotUnits); diff --git a/src/engine.h b/src/engine.h index e62b12332d3ac1b985b8f6d7181ea66824ec4f13..47a30a99b696304365a0ddf31d4499628a649a37 100644 --- a/src/engine.h +++ b/src/engine.h @@ -71,17 +71,18 @@ enum engine_policy { engine_policy_sourceterms = (1 << 14), engine_policy_stars = (1 << 15) }; - +#define engine_maxpolicy 15 extern const char *engine_policy_names[]; #define engine_queue_scale 1.2 #define engine_maxtaskspercell 96 #define engine_maxproxies 64 -#define engine_tasksreweight 10 +#define engine_tasksreweight 1 #define engine_parts_size_grow 1.05 #define engine_redistribute_alloc_margin 1.2 #define engine_default_energy_file_name "energy" #define engine_default_timesteps_file_name "timesteps" +#define engine_max_parts_per_ghost 1000 /* The rank of the engine as a global variable (for messages). */ extern int engine_rank; @@ -156,7 +157,7 @@ struct engine { double timeFirstSnapshot; double deltaTimeSnapshot; integertime_t ti_nextSnapshot; - char snapshotBaseName[200]; + char snapshotBaseName[PARSER_MAX_LINE_SIZE]; int snapshotCompression; struct unit_system *snapshotUnits; @@ -175,9 +176,8 @@ struct engine { int count_step; /* Data for the threads' barrier. */ - pthread_mutex_t barrier_mutex; - pthread_cond_t barrier_cond; - volatile int barrier_running, barrier_launch, barrier_launchcount; + pthread_barrier_t wait_barrier; + pthread_barrier_t run_barrier; /* ID of the node this engine lives on. */ int nr_nodes, nodeID; @@ -252,7 +252,7 @@ struct engine { }; /* Function prototypes. 
*/ -void engine_barrier(struct engine *e, int tid); +void engine_barrier(struct engine *e); void engine_compute_next_snapshot_time(struct engine *e); void engine_unskip(struct engine *e); void engine_drift_all(struct engine *e); @@ -270,9 +270,10 @@ void engine_init(struct engine *e, struct space *s, const struct external_potential *potential, const struct cooling_function_data *cooling_func, struct sourceterms *sourceterms); -void engine_launch(struct engine *e, int nr_runners); +void engine_launch(struct engine *e); void engine_prepare(struct engine *e); -void engine_init_particles(struct engine *e, int flag_entropy_ICs); +void engine_init_particles(struct engine *e, int flag_entropy_ICs, + int clean_h_values); void engine_step(struct engine *e); void engine_maketasks(struct engine *e); void engine_split(struct engine *e, struct partition *initial_partition); @@ -281,7 +282,7 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts, int *ind_gpart, size_t *Ngpart, size_t offset_sparts, int *ind_spart, size_t *Nspart); -void engine_rebuild(struct engine *e); +void engine_rebuild(struct engine *e, int clean_h_values); void engine_repartition(struct engine *e); void engine_repartition_trigger(struct engine *e); void engine_makeproxies(struct engine *e); diff --git a/src/gravity.c b/src/gravity.c index 97b2955b32e1513c3d86d1d1f4da2169130feb77..f58bc1b7456bc5dfc95b4c976ebda8e1999ff3e0 100644 --- a/src/gravity.c +++ b/src/gravity.c @@ -21,9 +21,15 @@ #include "../config.h" /* Some standard headers. */ +#include <float.h> #include <stdio.h> +#include <stdlib.h> #include <unistd.h> +#ifdef HAVE_HDF5 +#include <hdf5.h> +#endif + /* This object's header. 
*/ #include "gravity.h" @@ -39,6 +45,256 @@ struct exact_force_data { double const_G; }; +#ifdef SWIFT_GRAVITY_FORCE_CHECKS + +/* Size of the Ewald table */ +#define Newald 64 + +/* Components of the Ewald correction */ +static float fewald_x[Newald + 1][Newald + 1][Newald + 1]; +static float fewald_y[Newald + 1][Newald + 1][Newald + 1]; +static float fewald_z[Newald + 1][Newald + 1][Newald + 1]; + +/* Factor used to normalize the access to the Ewald table */ +float ewald_fac; +#endif + +/** + * @brief Allocates the memory and computes one octant of the + * Ewald correction table. + * + * We follow Hernquist, Bouchet & Suto, 1991, ApJS, Volume 75, p.231-240, + * equations (2.14a) and (2.14b) with alpha = 2. We consider all terms with + * |x - nL| < 4L and |h|^2 < 16. + * + * @param boxSize The side-length (L) of the volume. + */ +void gravity_exact_force_ewald_init(double boxSize) { + +#ifdef SWIFT_GRAVITY_FORCE_CHECKS + const ticks tic = getticks(); + message("Computing Ewald correction table..."); + + /* Level of correction (Hernquist et al. 
1991)*/ + const float alpha = 2.f; + + /* some useful constants */ + const float alpha2 = alpha * alpha; + const float factor_exp1 = 2.f * alpha / sqrt(M_PI); + const float factor_exp2 = -M_PI * M_PI / alpha2; + const float factor_sin = 2.f * M_PI; + const float boxSize_inv2 = 1.f / (boxSize * boxSize); + + /* Ewald factor to access the table */ + ewald_fac = (double)(2 * Newald) / boxSize; + + /* Zero everything */ + bzero(fewald_x, (Newald + 1) * (Newald + 1) * (Newald + 1) * sizeof(float)); + bzero(fewald_y, (Newald + 1) * (Newald + 1) * (Newald + 1) * sizeof(float)); + bzero(fewald_z, (Newald + 1) * (Newald + 1) * (Newald + 1) * sizeof(float)); + + /* Compute the values in one of the octants */ + for (int i = 0; i <= Newald; ++i) { + for (int j = 0; j <= Newald; ++j) { + for (int k = 0; k <= Newald; ++k) { + + if (i == 0 && j == 0 && k == 0) continue; + + /* Distance vector */ + const float r_x = 0.5f * ((float)i) / Newald; + const float r_y = 0.5f * ((float)j) / Newald; + const float r_z = 0.5f * ((float)k) / Newald; + + /* Norm of distance vector */ + const float r2 = r_x * r_x + r_y * r_y + r_z * r_z; + const float r_inv = 1.f / sqrtf(r2); + const float r_inv3 = r_inv * r_inv * r_inv; + + /* Normal gravity potential term */ + float f_x = r_x * r_inv3; + float f_y = r_y * r_inv3; + float f_z = r_z * r_inv3; + + for (int n_i = -4; n_i <= 4; ++n_i) { + for (int n_j = -4; n_j <= 4; ++n_j) { + for (int n_k = -4; n_k <= 4; ++n_k) { + + const float d_x = r_x - n_i; + const float d_y = r_y - n_j; + const float d_z = r_z - n_k; + + /* Discretised distance */ + const float r_tilde2 = d_x * d_x + d_y * d_y + d_z * d_z; + const float r_tilde_inv = 1.f / sqrtf(r_tilde2); + const float r_tilde = r_tilde_inv * r_tilde2; + const float r_tilde_inv3 = + r_tilde_inv * r_tilde_inv * r_tilde_inv; + + const float val = + erfcf(alpha * r_tilde) + + factor_exp1 * r_tilde * expf(-alpha2 * r_tilde2); + + /* First correction term */ + const float f = val * r_tilde_inv3; + f_x -= f * 
d_x; + f_y -= f * d_y; + f_z -= f * d_z; + } + } + } + + for (int h_i = -4; h_i <= 4; ++h_i) { + for (int h_j = -4; h_j <= 4; ++h_j) { + for (int h_k = -4; h_k <= 4; ++h_k) { + + const float h2 = h_i * h_i + h_j * h_j + h_k * h_k; + + const float h2_inv = 1.f / (h2 + FLT_MIN); + const float h_dot_x = h_i * r_x + h_j * r_y + h_k * r_z; + + const float val = 2.f * h2_inv * expf(h2 * factor_exp2) * + sinf(factor_sin * h_dot_x); + + /* Second correction term */ + f_x -= val * h_i; + f_y -= val * h_j; + f_z -= val * h_k; + } + } + } + + /* Save back to memory */ + fewald_x[i][j][k] = f_x; + fewald_y[i][j][k] = f_y; + fewald_z[i][j][k] = f_z; + } + } + } + +/* Dump the Ewald table to a file */ +#ifdef HAVE_HDF5 + hid_t h_file = + H5Fcreate("Ewald.hdf5", H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT); + if (h_file < 0) error("Error while opening file for Ewald dump."); + + /* Create dataspace */ + hsize_t dim[3] = {Newald + 1, Newald + 1, Newald + 1}; + hid_t h_space = H5Screate_simple(3, dim, NULL); + hid_t h_data; + h_data = H5Dcreate(h_file, "Ewald_x", H5T_NATIVE_FLOAT, h_space, H5P_DEFAULT, + H5P_DEFAULT, H5P_DEFAULT); + H5Dwrite(h_data, H5T_NATIVE_FLOAT, h_space, H5S_ALL, H5P_DEFAULT, + &(fewald_x[0][0][0])); + H5Dclose(h_data); + h_data = H5Dcreate(h_file, "Ewald_y", H5T_NATIVE_FLOAT, h_space, H5P_DEFAULT, + H5P_DEFAULT, H5P_DEFAULT); + H5Dwrite(h_data, H5T_NATIVE_FLOAT, h_space, H5S_ALL, H5P_DEFAULT, + &(fewald_y[0][0][0])); + H5Dclose(h_data); + h_data = H5Dcreate(h_file, "Ewald_z", H5T_NATIVE_FLOAT, h_space, H5P_DEFAULT, + H5P_DEFAULT, H5P_DEFAULT); + H5Dwrite(h_data, H5T_NATIVE_FLOAT, h_space, H5S_ALL, H5P_DEFAULT, + &(fewald_z[0][0][0])); + H5Dclose(h_data); + H5Sclose(h_space); + H5Fclose(h_file); +#endif + + /* Apply the box-size correction */ + for (int i = 0; i <= Newald; ++i) { + for (int j = 0; j <= Newald; ++j) { + for (int k = 0; k <= Newald; ++k) { + fewald_x[i][j][k] *= boxSize_inv2; + fewald_y[i][j][k] *= boxSize_inv2; + fewald_z[i][j][k] *= boxSize_inv2; 
+ } + } + } + + /* Say goodbye */ + message("Ewald correction table computed (took %.3f %s). ", + clocks_from_ticks(getticks() - tic), clocks_getunit()); +#else + error("Gravity checking function called without the corresponding flag."); +#endif +} + +#ifdef SWIFT_GRAVITY_FORCE_CHECKS +/** + * @brief Compute the Ewald correction for a given distance vector r. + * + * We interpolate the Ewald correction tables using a tri-linear interpolation + * similar to a CIC. + * + * @param rx x-coordinate of distance vector. + * @param ry y-coordinate of distance vector. + * @param rz z-coordinate of distance vector. + * @param corr (return) The Ewald correction. + */ +__attribute__((always_inline)) INLINE static void +gravity_exact_force_ewald_evaluate(double rx, double ry, double rz, + double corr[3]) { + + const double s_x = (rx < 0.) ? 1. : -1.; + const double s_y = (ry < 0.) ? 1. : -1.; + const double s_z = (rz < 0.) ? 1. : -1.; + rx = fabs(rx); + ry = fabs(ry); + rz = fabs(rz); + + int i = (int)(rx * ewald_fac); + if (i >= Newald) i = Newald - 1; + const double dx = rx * ewald_fac - i; + const double tx = 1. - dx; + + int j = (int)(ry * ewald_fac); + if (j >= Newald) j = Newald - 1; + const double dy = ry * ewald_fac - j; + const double ty = 1. - dy; + + int k = (int)(rz * ewald_fac); + if (k >= Newald) k = Newald - 1; + const double dz = rz * ewald_fac - k; + const double tz = 1. 
- dz; + + /* Interpolation in X */ + corr[0] = 0.; + corr[0] += fewald_x[i + 0][j + 0][k + 0] * tx * ty * tz; + corr[0] += fewald_x[i + 0][j + 0][k + 1] * tx * ty * dz; + corr[0] += fewald_x[i + 0][j + 1][k + 0] * tx * dy * tz; + corr[0] += fewald_x[i + 0][j + 1][k + 1] * tx * dy * dz; + corr[0] += fewald_x[i + 1][j + 0][k + 0] * dx * ty * tz; + corr[0] += fewald_x[i + 1][j + 0][k + 1] * dx * ty * dz; + corr[0] += fewald_x[i + 1][j + 1][k + 0] * dx * dy * tz; + corr[0] += fewald_x[i + 1][j + 1][k + 1] * dx * dy * dz; + corr[0] *= s_x; + + /* Interpolation in Y */ + corr[1] = 0.; + corr[1] += fewald_y[i + 0][j + 0][k + 0] * tx * ty * tz; + corr[1] += fewald_y[i + 0][j + 0][k + 1] * tx * ty * dz; + corr[1] += fewald_y[i + 0][j + 1][k + 0] * tx * dy * tz; + corr[1] += fewald_y[i + 0][j + 1][k + 1] * tx * dy * dz; + corr[1] += fewald_y[i + 1][j + 0][k + 0] * dx * ty * tz; + corr[1] += fewald_y[i + 1][j + 0][k + 1] * dx * ty * dz; + corr[1] += fewald_y[i + 1][j + 1][k + 0] * dx * dy * tz; + corr[1] += fewald_y[i + 1][j + 1][k + 1] * dx * dy * dz; + corr[1] *= s_y; + + /* Interpolation in Z */ + corr[2] = 0.; + corr[2] += fewald_z[i + 0][j + 0][k + 0] * tx * ty * tz; + corr[2] += fewald_z[i + 0][j + 0][k + 1] * tx * ty * dz; + corr[2] += fewald_z[i + 0][j + 1][k + 0] * tx * dy * tz; + corr[2] += fewald_z[i + 0][j + 1][k + 1] * tx * dy * dz; + corr[2] += fewald_z[i + 1][j + 0][k + 0] * dx * ty * tz; + corr[2] += fewald_z[i + 1][j + 0][k + 1] * dx * ty * dz; + corr[2] += fewald_z[i + 1][j + 1][k + 0] * dx * dy * tz; + corr[2] += fewald_z[i + 1][j + 1][k + 1] * dx * dy * dz; + corr[2] *= s_z; +} +#endif + /** * @brief Checks whether the file containing the exact accelerations for * the current choice of parameters already exists. 
@@ -63,7 +319,7 @@ int gravity_exact_force_file_exits(const struct engine *e) { char line[100]; char dummy1[10], dummy2[10]; double epsilon, newton_G; - int N; + int N, periodic; /* Reads file header */ if (fgets(line, 100, file) != line) error("Problem reading title"); if (fgets(line, 100, file) != line) error("Problem reading G"); @@ -72,10 +328,12 @@ int gravity_exact_force_file_exits(const struct engine *e) { sscanf(line, "%s %s %d", dummy1, dummy2, &N); if (fgets(line, 100, file) != line) error("Problem reading epsilon"); sscanf(line, "%s %s %le", dummy1, dummy2, &epsilon); + if (fgets(line, 100, file) != line) error("Problem reading BC"); + sscanf(line, "%s %s %d", dummy1, dummy2, &periodic); fclose(file); /* Check whether it matches the current parameters */ - if (N == SWIFT_GRAVITY_FORCE_CHECKS && + if (N == SWIFT_GRAVITY_FORCE_CHECKS && periodic == e->s->periodic && (fabs(epsilon - e->gravity_properties->epsilon) / epsilon < 1e-5) && (fabs(newton_G - e->physical_constants->const_newton_G) / newton_G < 1e-5)) { @@ -101,6 +359,8 @@ void gravity_exact_force_compute_mapper(void *map_data, int nr_gparts, struct exact_force_data *data = (struct exact_force_data *)extra_data; const struct space *s = data->s; const struct engine *e = data->e; + const int periodic = s->periodic; + const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]}; const double const_G = data->const_G; int counter = 0; @@ -112,6 +372,12 @@ void gravity_exact_force_compute_mapper(void *map_data, int nr_gparts, if (gpi->id_or_neg_offset % SWIFT_GRAVITY_FORCE_CHECKS == 0 && gpart_is_active(gpi, e)) { + /* Get some information about the particle */ + const double pix[3] = {gpi->x[0], gpi->x[1], gpi->x[2]}; + const double hi = gpi->epsilon; + const double hi_inv = 1. 
/ hi; + const double hi_inv3 = hi_inv * hi_inv * hi_inv; + /* Be ready for the calculation */ double a_grav[3] = {0., 0., 0.}; @@ -124,43 +390,53 @@ void gravity_exact_force_compute_mapper(void *map_data, int nr_gparts, if (gpi == gpj) continue; /* Compute the pairwise distance. */ - const double dx[3] = {gpi->x[0] - gpj->x[0], // x - gpi->x[1] - gpj->x[1], // y - gpi->x[2] - gpj->x[2]}; // z - const double r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]; + double dx = gpj->x[0] - pix[0]; + double dy = gpj->x[1] - pix[1]; + double dz = gpj->x[2] - pix[2]; + + /* Now apply periodic BC */ + if (periodic) { + dx = nearest(dx, dim[0]); + dy = nearest(dy, dim[1]); + dz = nearest(dz, dim[2]); + } - const double r = sqrt(r2); - const double ir = 1. / r; + const double r2 = dx * dx + dy * dy + dz * dz; + const double r_inv = 1. / sqrt(r2); + const double r = r2 * r_inv; const double mj = gpj->mass; - const double hi = gpi->epsilon; double f; - const double f_lr = 1.; if (r >= hi) { /* Get Newtonian gravity */ - f = mj * ir * ir * ir * f_lr; + f = mj * r_inv * r_inv * r_inv; } else { - const double hi_inv = 1. 
/ hi; - const double hi_inv3 = hi_inv * hi_inv * hi_inv; const double ui = r * hi_inv; double W; kernel_grav_eval_double(ui, &W); /* Get softened gravity */ - f = mj * hi_inv3 * W * f_lr; - - // printf("r=%e hi=%e W=%e fac=%e\n", r, hi, W, f); + f = mj * hi_inv3 * W; } - const double fdx[3] = {f * dx[0], f * dx[1], f * dx[2]}; + a_grav[0] += f * dx; + a_grav[1] += f * dy; + a_grav[2] += f * dz; - a_grav[0] -= fdx[0]; - a_grav[1] -= fdx[1]; - a_grav[2] -= fdx[2]; + /* Apply Ewald correction for periodic BC */ + if (periodic && r > 1e-5 * hi) { + + double corr[3]; + gravity_exact_force_ewald_evaluate(dx, dy, dz, corr); + + a_grav[0] += mj * corr[0]; + a_grav[1] += mj * corr[1]; + a_grav[2] += mj * corr[2]; + } } /* Store the exact answer */ @@ -207,7 +483,7 @@ void gravity_exact_force_compute(struct space *s, const struct engine *e) { data.const_G = e->physical_constants->const_newton_G; threadpool_map(&s->e->threadpool, gravity_exact_force_compute_mapper, - s->gparts, s->nr_gparts, sizeof(struct gpart), 1000, &data); + s->gparts, s->nr_gparts, sizeof(struct gpart), 0, &data); message("Computed exact gravity for %d gparts (took %.3f %s). 
", data.counter_global, clocks_from_ticks(getticks() - tic), @@ -245,8 +521,9 @@ void gravity_exact_force_check(struct space *s, const struct engine *e, fprintf(file_swift, "# Gravity accuracy test - SWIFT FORCES\n"); fprintf(file_swift, "# G= %16.8e\n", e->physical_constants->const_newton_G); fprintf(file_swift, "# N= %d\n", SWIFT_GRAVITY_FORCE_CHECKS); - fprintf(file_swift, "# epsilon=%16.8e\n", e->gravity_properties->epsilon); - fprintf(file_swift, "# theta=%16.8e\n", e->gravity_properties->theta_crit); + fprintf(file_swift, "# epsilon= %16.8e\n", e->gravity_properties->epsilon); + fprintf(file_swift, "# periodic= %d\n", s->periodic); + fprintf(file_swift, "# theta= %16.8e\n", e->gravity_properties->theta_crit); fprintf(file_swift, "# Git Branch: %s\n", git_branch()); fprintf(file_swift, "# Git Revision: %s\n", git_revision()); fprintf(file_swift, "# %16s %16s %16s %16s %16s %16s %16s\n", "id", "pos[0]", diff --git a/src/gravity.h b/src/gravity.h index 00b930c00fb2558f274feb2991b78e96dc8b990b..85e42370bc456dceb577c42ee609e3f0724e14ea 100644 --- a/src/gravity.h +++ b/src/gravity.h @@ -34,6 +34,8 @@ #include "./gravity/Default/gravity.h" #include "./gravity/Default/gravity_iact.h" +void gravity_exact_force_ewald_init(double boxSize); +void gravity_exact_force_ewald_free(); void gravity_exact_force_compute(struct space *s, const struct engine *e); void gravity_exact_force_check(struct space *s, const struct engine *e, float rel_tol); diff --git a/src/gravity/Default/gravity_iact.h b/src/gravity/Default/gravity_iact.h index eca5c2491cbdcf5f0eca01417c8e6b29efc53459..d4a95540de17631ad445075d672d03a1236e34e3 100644 --- a/src/gravity/Default/gravity_iact.h +++ b/src/gravity/Default/gravity_iact.h @@ -28,11 +28,11 @@ #include "vector.h" /** - * @brief Gravity forces between particles + * @brief Gravity forces between particles truncated by the long-range kernel */ -__attribute__((always_inline)) INLINE static void runner_iact_grav_pp( - float rlr_inv, float r2, const 
float *dx, struct gpart *gpi, - struct gpart *gpj) { +__attribute__((always_inline)) INLINE static void runner_iact_grav_pp_truncated( + float r2, const float *dx, struct gpart *gpi, struct gpart *gpj, + float rlr_inv) { /* Apply the gravitational acceleration. */ const float r = sqrtf(r2); @@ -41,7 +41,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_grav_pp( const float mj = gpj->mass; const float hi = gpi->epsilon; const float hj = gpj->epsilon; - const float u = r * rlr_inv; + const float u_lr = r * rlr_inv; float f_lr, fi, fj, W; #ifdef SWIFT_DEBUG_CHECKS @@ -49,7 +49,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_grav_pp( #endif /* Get long-range correction */ - kernel_long_grav_eval(u, &f_lr); + kernel_long_grav_eval(u_lr, &f_lr); if (r >= hi) { @@ -97,18 +97,84 @@ __attribute__((always_inline)) INLINE static void runner_iact_grav_pp( } /** - * @brief Gravity forces between particles (non-symmetric version) + * @brief Gravity forces between particles */ -__attribute__((always_inline)) INLINE static void runner_iact_grav_pp_nonsym( - float rlr_inv, float r2, const float *dx, struct gpart *gpi, - const struct gpart *gpj) { +__attribute__((always_inline)) INLINE static void runner_iact_grav_pp( + float r2, const float *dx, struct gpart *gpi, struct gpart *gpj) { + + /* Apply the gravitational acceleration. 
*/ + const float r = sqrtf(r2); + const float ir = 1.f / r; + const float mi = gpi->mass; + const float mj = gpj->mass; + const float hi = gpi->epsilon; + const float hj = gpj->epsilon; + float fi, fj, W; + +#ifdef SWIFT_DEBUG_CHECKS + if (r == 0.f) error("Interacting particles with 0 distance"); +#endif + + if (r >= hi) { + + /* Get Newtonian gravity */ + fi = mj * ir * ir * ir; + + } else { + + const float hi_inv = 1.f / hi; + const float hi_inv3 = hi_inv * hi_inv * hi_inv; + const float ui = r * hi_inv; + + kernel_grav_eval(ui, &W); + + /* Get softened gravity */ + fi = mj * hi_inv3 * W; + } + + if (r >= hj) { + + /* Get Newtonian gravity */ + fj = mi * ir * ir * ir; + + } else { + + const float hj_inv = 1.f / hj; + const float hj_inv3 = hj_inv * hj_inv * hj_inv; + const float uj = r * hj_inv; + + kernel_grav_eval(uj, &W); + + /* Get softened gravity */ + fj = mi * hj_inv3 * W; + } + + const float fidx[3] = {fi * dx[0], fi * dx[1], fi * dx[2]}; + gpi->a_grav[0] -= fidx[0]; + gpi->a_grav[1] -= fidx[1]; + gpi->a_grav[2] -= fidx[2]; + + const float fjdx[3] = {fj * dx[0], fj * dx[1], fj * dx[2]}; + gpj->a_grav[0] += fjdx[0]; + gpj->a_grav[1] += fjdx[1]; + gpj->a_grav[2] += fjdx[2]; +} + +/** + * @brief Gravity forces between particles truncated by the long-range kernel + * (non-symmetric version) + */ +__attribute__((always_inline)) INLINE static void +runner_iact_grav_pp_truncated_nonsym(float r2, const float *dx, + struct gpart *gpi, const struct gpart *gpj, + float rlr_inv) { /* Apply the gravitational acceleration. 
*/ const float r = sqrtf(r2); const float ir = 1.f / r; const float mj = gpj->mass; const float hi = gpi->epsilon; - const float u = r * rlr_inv; + const float u_lr = r * rlr_inv; float f_lr, f, W; #ifdef SWIFT_DEBUG_CHECKS @@ -116,7 +182,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_grav_pp_nonsym( #endif /* Get long-range correction */ - kernel_long_grav_eval(u, &f_lr); + kernel_long_grav_eval(u_lr, &f_lr); if (r >= hi) { @@ -143,13 +209,44 @@ __attribute__((always_inline)) INLINE static void runner_iact_grav_pp_nonsym( } /** - * @brief Gravity forces between particle and multipole + * @brief Gravity forces between particles (non-symmetric version) */ -__attribute__((always_inline)) INLINE static void runner_iact_grav_pm( - float rlr_inv, float r2, const float *dx, struct gpart *gp, - const struct multipole *multi) { +__attribute__((always_inline)) INLINE static void runner_iact_grav_pp_nonsym( + float r2, const float *dx, struct gpart *gpi, const struct gpart *gpj) { + + /* Apply the gravitational acceleration. 
*/ + const float r = sqrtf(r2); + const float ir = 1.f / r; + const float mj = gpj->mass; + const float hi = gpi->epsilon; + float f, W; - error("Dead function"); +#ifdef SWIFT_DEBUG_CHECKS + if (r == 0.f) error("Interacting particles with 0 distance"); +#endif + + if (r >= hi) { + + /* Get Newtonian gravity */ + f = mj * ir * ir * ir; + + } else { + + const float hi_inv = 1.f / hi; + const float hi_inv3 = hi_inv * hi_inv * hi_inv; + const float ui = r * hi_inv; + + kernel_grav_eval(ui, &W); + + /* Get softened gravity */ + f = mj * hi_inv3 * W; + } + + const float fdx[3] = {f * dx[0], f * dx[1], f * dx[2]}; + + gpi->a_grav[0] -= fdx[0]; + gpi->a_grav[1] -= fdx[1]; + gpi->a_grav[2] -= fdx[2]; } #endif /* SWIFT_DEFAULT_GRAVITY_IACT_H */ diff --git a/src/gravity_cache.h b/src/gravity_cache.h new file mode 100644 index 0000000000000000000000000000000000000000..14b672233aa9958ec39af32a87baead98c0bae04 --- /dev/null +++ b/src/gravity_cache.h @@ -0,0 +1,247 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (c) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ * + ******************************************************************************/ +#ifndef SWIFT_GRAVITY_CACHE_H +#define SWIFT_GRAVITY_CACHE_H + +/* Config parameters. */ +#include "../config.h" + +/* Local headers */ +#include "align.h" +#include "error.h" +#include "gravity.h" +#include "vector.h" + +/** + * @brief A SoA object for the #gpart of a cell. + * + * This is used to help vectorize the leaf-leaf gravity interactions. + */ +struct gravity_cache { + + /*! #gpart x position. */ + float *restrict x SWIFT_CACHE_ALIGN; + + /*! #gpart y position. */ + float *restrict y SWIFT_CACHE_ALIGN; + + /*! #gpart z position. */ + float *restrict z SWIFT_CACHE_ALIGN; + + /*! #gpart softening length. */ + float *restrict epsilon SWIFT_CACHE_ALIGN; + + /*! #gpart mass. */ + float *restrict m SWIFT_CACHE_ALIGN; + + /*! #gpart x acceleration. */ + float *restrict a_x SWIFT_CACHE_ALIGN; + + /*! #gpart y acceleration. */ + float *restrict a_y SWIFT_CACHE_ALIGN; + + /*! #gpart z acceleration. */ + float *restrict a_z SWIFT_CACHE_ALIGN; + + /*! Cache size */ + int count; +}; + +/** + * @brief Frees the memory allocated in a #gravity_cache + * + * @param c The #gravity_cache to free. + */ +static INLINE void gravity_cache_clean(struct gravity_cache *c) { + + if (c->count > 0) { + free(c->x); + free(c->y); + free(c->z); + free(c->epsilon); + free(c->m); + free(c->a_x); + free(c->a_y); + free(c->a_z); + } + c->count = 0; +} + +/** + * @brief Allocates memory for the #gpart caches used in the leaf-leaf + * interactions. + * + * The cache is padded for the vector size and aligned properly + * + * @param c The #gravity_cache to allocate. + * @param count The number of #gpart to allocated for (space_splitsize is a good + * choice). 
+ */ +static INLINE void gravity_cache_init(struct gravity_cache *c, int count) { + + /* Size of the gravity cache */ + const int padded_count = count - (count % VEC_SIZE) + VEC_SIZE; + const size_t sizeBytes = padded_count * sizeof(float); + + /* Delete old stuff if any */ + gravity_cache_clean(c); + + int error = 0; + error += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->y, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->z, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += + posix_memalign((void **)&c->epsilon, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->m, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->a_x, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->a_y, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += posix_memalign((void **)&c->a_z, SWIFT_CACHE_ALIGNMENT, sizeBytes); + + if (error != 0) + error("Couldn't allocate gravity cache, size: %d", padded_count); + + c->count = padded_count; +} + +/** + * @brief Fills a #gravity_cache structure with some #gpart and shift them. + * + * @param c The #gravity_cache to fill. + * @param gparts The #gpart array to read from. + * @param gcount The number of particles to read. + * @param gcount_padded The number of particle to read padded to the next + * multiple of the vector length. + * @param shift A shift to apply to all the particles. 
+ */ +__attribute__((always_inline)) INLINE void gravity_cache_populate( + struct gravity_cache *c, const struct gpart *restrict gparts, int gcount, + int gcount_padded, const double shift[3]) { + + /* Make the compiler understand we are in happy vectorization land */ + float *restrict x = c->x; + float *restrict y = c->y; + float *restrict z = c->z; + float *restrict m = c->m; + float *restrict epsilon = c->epsilon; + swift_align_information(x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(z, SWIFT_CACHE_ALIGNMENT); + swift_align_information(epsilon, SWIFT_CACHE_ALIGNMENT); + swift_align_information(m, SWIFT_CACHE_ALIGNMENT); + swift_assume_size(gcount_padded, VEC_SIZE); + + /* Fill the input caches */ + for (int i = 0; i < gcount; ++i) { + x[i] = (float)(gparts[i].x[0] - shift[0]); + y[i] = (float)(gparts[i].x[1] - shift[1]); + z[i] = (float)(gparts[i].x[2] - shift[2]); + epsilon[i] = gparts[i].epsilon; + m[i] = gparts[i].mass; + } + +#ifdef SWIFT_DEBUG_CHECKS + if (gcount_padded < gcount) error("Padded counter smaller than counter"); +#endif + + /* Pad the caches */ + for (int i = gcount; i < gcount_padded; ++i) { + x[i] = 0.f; + y[i] = 0.f; + z[i] = 0.f; + epsilon[i] = 0.f; + m[i] = 0.f; + } +} + +/** + * @brief Fills a #gravity_cache structure with some #gpart. + * + * @param c The #gravity_cache to fill. + * @param gparts The #gpart array to read from. + * @param gcount The number of particles to read. + * @param gcount_padded The number of particle to read padded to the next + * multiple of the vector length. 
+ */ +__attribute__((always_inline)) INLINE void gravity_cache_populate_no_shift( + struct gravity_cache *c, const struct gpart *restrict gparts, int gcount, + int gcount_padded) { + + /* Make the compiler understand we are in happy vectorization land */ + float *restrict x = c->x; + float *restrict y = c->y; + float *restrict z = c->z; + float *restrict m = c->m; + float *restrict epsilon = c->epsilon; + swift_align_information(x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(z, SWIFT_CACHE_ALIGNMENT); + swift_align_information(epsilon, SWIFT_CACHE_ALIGNMENT); + swift_align_information(m, SWIFT_CACHE_ALIGNMENT); + swift_assume_size(gcount_padded, VEC_SIZE); + + /* Fill the input caches */ + for (int i = 0; i < gcount; ++i) { + x[i] = (float)(gparts[i].x[0]); + y[i] = (float)(gparts[i].x[1]); + z[i] = (float)(gparts[i].x[2]); + epsilon[i] = gparts[i].epsilon; + m[i] = gparts[i].mass; + } + +#ifdef SWIFT_DEBUG_CHECKS + if (gcount_padded < gcount) error("Padded counter smaller than counter"); +#endif + + /* Pad the caches */ + for (int i = gcount; i < gcount_padded; ++i) { + x[i] = 0.f; + y[i] = 0.f; + z[i] = 0.f; + epsilon[i] = 0.f; + m[i] = 0.f; + } +} + +/** + * @brief Write the output cache values back to the #gpart. + * + * @param c The #gravity_cache to read from. + * @param gparts The #gpart array to write to. + * @param gcount The number of particles to write. 
+ */ +__attribute__((always_inline)) INLINE void gravity_cache_write_back( + const struct gravity_cache *c, struct gpart *restrict gparts, int gcount) { + + /* Make the compiler understand we are in happy vectorization land */ + float *restrict a_x = c->a_x; + float *restrict a_y = c->a_y; + float *restrict a_z = c->a_z; + swift_align_information(a_x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(a_y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(a_z, SWIFT_CACHE_ALIGNMENT); + + /* Write stuff back to the particles */ + for (int i = 0; i < gcount; ++i) { + gparts[i].a_grav[0] += a_x[i]; + gparts[i].a_grav[1] += a_y[i]; + gparts[i].a_grav[2] += a_z[i]; + } +} + +#endif /* SWIFT_GRAVITY_CACHE_H */ diff --git a/src/gravity_properties.c b/src/gravity_properties.c index b1098888b96cdef2205ed513e60a3799c63e8b9f..18cf044434f7840a5a76f483540bb924a2365e26 100644 --- a/src/gravity_properties.c +++ b/src/gravity_properties.c @@ -33,7 +33,8 @@ #include "kernel_gravity.h" #define gravity_props_default_a_smooth 1.25f -#define gravity_props_default_r_cut 4.5f +#define gravity_props_default_r_cut_max 4.5f +#define gravity_props_default_r_cut_min 0.1f void gravity_props_init(struct gravity_props *p, const struct swift_params *params) { @@ -41,8 +42,10 @@ void gravity_props_init(struct gravity_props *p, /* Tree-PM parameters */ p->a_smooth = parser_get_opt_param_float(params, "Gravity:a_smooth", gravity_props_default_a_smooth); - p->r_cut = parser_get_opt_param_float(params, "Gravity:r_cut", - gravity_props_default_r_cut); + p->r_cut_max = parser_get_opt_param_float(params, "Gravity:r_cut_max", + gravity_props_default_r_cut_max); + p->r_cut_min = parser_get_opt_param_float(params, "Gravity:r_cut_min", + gravity_props_default_r_cut_min); /* Time integration */ p->eta = parser_get_param_float(params, "Gravity:eta"); @@ -69,9 +72,10 @@ void gravity_props_print(const struct gravity_props *p) { message("Self-gravity softening: epsilon=%.4f (Plummer equivalent: %.4f)", p->epsilon, 
p->epsilon / 3.); - message("Self-gravity MM smoothing-scale: a_smooth=%f", p->a_smooth); + message("Self-gravity mesh smoothing-scale: a_smooth=%f", p->a_smooth); - message("Self-gravity MM cut-off: r_cut=%f", p->r_cut); + message("Self-gravity tree cut-off: r_cut_max=%f", p->r_cut_max); + message("Self-gravity truncation cut-off: r_cut_min=%f", p->r_cut_min); } #if defined(HAVE_HDF5) @@ -84,7 +88,8 @@ void gravity_props_print_snapshot(hid_t h_grpgrav, p->epsilon / 3.); io_write_attribute_f(h_grpgrav, "Opening angle", p->theta_crit); io_write_attribute_d(h_grpgrav, "MM order", SELF_GRAVITY_MULTIPOLE_ORDER); - io_write_attribute_f(h_grpgrav, "MM a_smooth", p->a_smooth); - io_write_attribute_f(h_grpgrav, "MM r_cut", p->r_cut); + io_write_attribute_f(h_grpgrav, "Mesh a_smooth", p->a_smooth); + io_write_attribute_f(h_grpgrav, "Mesh r_cut_max", p->r_cut_max); + io_write_attribute_f(h_grpgrav, "Mesh r_cut_min", p->r_cut_min); } #endif diff --git a/src/gravity_properties.h b/src/gravity_properties.h index be26f0d1d23b8cec71fa3cbbeedac9f61f337b2c..2a5e4cb1e07ea591e2e3821704ec55abe7980360 100644 --- a/src/gravity_properties.h +++ b/src/gravity_properties.h @@ -34,9 +34,16 @@ */ struct gravity_props { - /* Tree-PM parameters */ + /*! Mesh smoothing scale in units of top-level cell size */ float a_smooth; - float r_cut; + + /*! Distance below which the truncated mesh force is Newtonian in units of + * a_smooth */ + float r_cut_min; + + /*! Distance above which the truncated mesh force is negligible in units of + * a_smooth */ + float r_cut_max; /*! 
Time integration dimensionless multiplier */ float eta; diff --git a/src/hydro/Default/hydro.h b/src/hydro/Default/hydro.h index 051c22f46b3ecdff5d3de910e0f75409b0e78f02..31f0c4172099479abff9e1ed19487130a0a8938b 100644 --- a/src/hydro/Default/hydro.h +++ b/src/hydro/Default/hydro.h @@ -210,9 +210,6 @@ __attribute__((always_inline)) INLINE static void hydro_end_density( const float irho = 1.f / p->rho; - /* Compute the derivative term */ - p->rho_dh = 1.f / (1.f + hydro_dimension_inv * p->h * p->rho_dh * irho); - /* Finish calculation of the velocity curl components */ p->density.rot_v[0] *= h_inv_dim_plus_one * irho; p->density.rot_v[1] *= h_inv_dim_plus_one * irho; @@ -222,6 +219,31 @@ __attribute__((always_inline)) INLINE static void hydro_end_density( p->density.div_v *= h_inv_dim_plus_one * irho; } +/** + * @brief Sets all particle fields to sensible values when the #part has 0 ngbs. + * + * @param p The particle to act upon + * @param xp The extended particle data to act upon + */ +__attribute__((always_inline)) INLINE static void hydro_part_has_no_neighbours( + struct part *restrict p, struct xpart *restrict xp) { + + /* Some smoothing length multiples. */ + const float h = p->h; + const float h_inv = 1.0f / h; /* 1/h */ + const float h_inv_dim = pow_dimension(h_inv); /* 1/h^d */ + + /* Re-set problematic values */ + p->rho = p->mass * kernel_root * h_inv_dim; + p->density.wcount = kernel_root * kernel_norm * h_inv_dim; + p->rho_dh = 0.f; + p->density.wcount_dh = 0.f; + p->density.div_v = 0.f; + p->density.rot_v[0] = 0.f; + p->density.rot_v[1] = 0.f; + p->density.rot_v[2] = 0.f; +} + /** * @brief Prepare a particle for the force calculation. * @@ -249,6 +271,9 @@ __attribute__((always_inline)) INLINE static void hydro_prepare_force( const float fc = p->force.soundspeed = sqrtf(hydro_gamma * hydro_gamma_minus_one * u); + /* Compute the derivative term */ + p->rho_dh = 1.f / (1.f + hydro_dimension_inv * p->h * p->rho_dh / p->rho); + /* Compute the P/Omega/rho2. 
*/ xp->omega = 1.0f + hydro_dimension_inv * h * p->rho_dh / p->rho; p->force.P_over_rho2 = u * hydro_gamma_minus_one / (p->rho * xp->omega); diff --git a/src/hydro/Gadget2/hydro.h b/src/hydro/Gadget2/hydro.h index 91626749a89ede387547b6351dce59fa3569307a..66a475f32ec06eb40ff2bc890bc156f76e3b7b9f 100644 --- a/src/hydro/Gadget2/hydro.h +++ b/src/hydro/Gadget2/hydro.h @@ -206,12 +206,13 @@ __attribute__((always_inline)) INLINE static void hydro_end_density( p->rho += p->mass * kernel_root; p->density.rho_dh -= hydro_dimension * p->mass * kernel_root; p->density.wcount += kernel_root; + p->density.wcount_dh -= hydro_dimension * kernel_root; /* Finish the calculation by inserting the missing h-factors */ p->rho *= h_inv_dim; p->density.rho_dh *= h_inv_dim_plus_one; - p->density.wcount *= kernel_norm; - p->density.wcount_dh *= h_inv * kernel_gamma * kernel_norm; + p->density.wcount *= h_inv_dim; + p->density.wcount_dh *= h_inv_dim_plus_one; const float rho_inv = 1.f / p->rho; @@ -224,6 +225,31 @@ __attribute__((always_inline)) INLINE static void hydro_end_density( p->density.div_v *= h_inv_dim_plus_one * rho_inv; } +/** + * @brief Sets all particle fields to sensible values when the #part has 0 ngbs. + * + * @param p The particle to act upon + * @param xp The extended particle data to act upon + */ +__attribute__((always_inline)) INLINE static void hydro_part_has_no_neighbours( + struct part *restrict p, struct xpart *restrict xp) { + + /* Some smoothing length multiples. */ + const float h = p->h; + const float h_inv = 1.0f / h; /* 1/h */ + const float h_inv_dim = pow_dimension(h_inv); /* 1/h^d */ + + /* Re-set problematic values */ + p->rho = p->mass * kernel_root * h_inv_dim; + p->density.wcount = kernel_root * kernel_norm * h_inv_dim; + p->density.rho_dh = 0.f; + p->density.wcount_dh = 0.f; + p->density.div_v = 0.f; + p->density.rot_v[0] = 0.f; + p->density.rot_v[1] = 0.f; + p->density.rot_v[2] = 0.f; +} + /** * @brief Prepare a particle for the force calculation. 
* @@ -239,6 +265,9 @@ __attribute__((always_inline)) INLINE static void hydro_prepare_force( const float fac_mu = 1.f; /* Will change with cosmological integration */ + /* Inverse of the physical density */ + const float rho_inv = 1.f / p->rho; + /* Compute the norm of the curl */ const float curl_v = sqrtf(p->density.rot_v[0] * p->density.rot_v[0] + p->density.rot_v[1] * p->density.rot_v[1] + @@ -254,7 +283,6 @@ __attribute__((always_inline)) INLINE static void hydro_prepare_force( const float soundspeed = gas_soundspeed_from_pressure(p->rho, pressure); /* Divide the pressure by the density squared to get the SPH term */ - const float rho_inv = 1.f / p->rho; const float P_over_rho2 = pressure * rho_inv * rho_inv; /* Compute the Balsara switch */ @@ -262,11 +290,11 @@ __attribute__((always_inline)) INLINE static void hydro_prepare_force( abs_div_v / (abs_div_v + curl_v + 0.0001f * soundspeed / fac_mu / p->h); /* Compute the "grad h" term */ - const float grad_h_term = + const float omega_inv = 1.f / (1.f + hydro_dimension_inv * p->h * p->density.rho_dh * rho_inv); /* Update variables. 
*/ - p->force.f = grad_h_term; + p->force.f = omega_inv; p->force.P_over_rho2 = P_over_rho2; p->force.soundspeed = soundspeed; p->force.balsara = balsara; diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h index b117b5a08a82679d0a4311235b4ac32fd1379dd6..81b6381f277284468c22d64312866c2e39cd1f0d 100644 --- a/src/hydro/Gadget2/hydro_iact.h +++ b/src/hydro/Gadget2/hydro_iact.h @@ -64,7 +64,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_density( /* Compute contribution to the number of neighbours */ pi->density.wcount += wi; - pi->density.wcount_dh -= ui * wi_dx; + pi->density.wcount_dh -= (hydro_dimension * wi + ui * wi_dx); /* Compute the kernel function for pj */ const float hj_inv = 1.f / hj; @@ -77,7 +77,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_density( /* Compute contribution to the number of neighbours */ pj->density.wcount += wj; - pj->density.wcount_dh -= uj * wj_dx; + pj->density.wcount_dh -= (hydro_dimension * wj + uj * wj_dx); const float faci = mj * wi_dx * r_inv; const float facj = mi * wj_dx * r_inv; @@ -112,9 +112,9 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_density( float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { -#ifdef WITH_VECTORIZATION +#ifdef WITH_OLD_VECTORIZATION - vector r, ri, r2, xi, xj, hi, hj, hi_inv, hj_inv, wi, wj, wi_dx, wj_dx; + vector r, ri, r2, ui, uj, hi, hj, hi_inv, hj_inv, wi, wj, wi_dx, wj_dx; vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh; vector mi, mj; vector dx[3], dv[3]; @@ -161,15 +161,15 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_density( hi.v = vec_load(Hi); hi_inv = vec_reciprocal(hi); - xi.v = r.v * hi_inv.v; + ui.v = r.v * hi_inv.v; hj.v = vec_load(Hj); hj_inv = vec_reciprocal(hj); - xj.v = r.v * hj_inv.v; + uj.v = r.v * hj_inv.v; /* Compute the kernel function. 
*/ - kernel_deval_vec(&xi, &wi, &wi_dx); - kernel_deval_vec(&xj, &wj, &wj_dx); + kernel_deval_vec(&ui, &wi, &wi_dx); + kernel_deval_vec(&uj, &wj, &wj_dx); /* Compute dv. */ dv[0].v = vi[0].v - vj[0].v; @@ -188,17 +188,17 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_density( /* Compute density of pi. */ rhoi.v = mj.v * wi.v; - rhoi_dh.v = mj.v * (vec_set1(hydro_dimension) * wi.v + xi.v * wi_dx.v); + rhoi_dh.v = mj.v * (vec_set1(hydro_dimension) * wi.v + ui.v * wi_dx.v); wcounti.v = wi.v; - wcounti_dh.v = xi.v * wi_dx.v; + wcounti_dh.v = (vec_set1(hydro_dimension) * wi.v + ui.v * wi_dx.v); div_vi.v = mj.v * dvdr.v * wi_dx.v; for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v; /* Compute density of pj. */ rhoj.v = mi.v * wj.v; - rhoj_dh.v = mi.v * (vec_set1(hydro_dimension) * wj.v + xj.v * wj_dx.v); + rhoj_dh.v = mi.v * (vec_set1(hydro_dimension) * wj.v + uj.v * wj_dx.v); wcountj.v = wj.v; - wcountj_dh.v = xj.v * wj_dx.v; + wcountj_dh.v = (vec_set1(hydro_dimension) * wj.v + uj.v * wj_dx.v); div_vj.v = mi.v * dvdr.v * wj_dx.v; for (k = 0; k < 3; k++) curl_vj[k].v = mi.v * curlvr[k].v * wj_dx.v; @@ -241,7 +241,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density( /* Get r and r inverse. 
*/ const float r = sqrtf(r2); - const float ri = 1.0f / r; + const float r_inv = 1.0f / r; /* Compute the kernel function */ const float hi_inv = 1.0f / hi; @@ -254,9 +254,9 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density( /* Compute contribution to the number of neighbours */ pi->density.wcount += wi; - pi->density.wcount_dh -= ui * wi_dx; + pi->density.wcount_dh -= (hydro_dimension * wi + ui * wi_dx); - const float fac = mj * wi_dx * ri; + const float fac = mj * wi_dx * r_inv; /* Compute dv dot r */ dv[0] = pi->v[0] - pj->v[0]; @@ -282,9 +282,9 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { -#ifdef WITH_VECTORIZATION +#ifdef WITH_OLD_VECTORIZATION - vector r, ri, r2, xi, hi, hi_inv, wi, wi_dx; + vector r, ri, r2, ui, hi, hi_inv, wi, wi_dx; vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi; vector mj; vector dx[3], dv[3]; @@ -328,9 +328,9 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj, hi.v = vec_load(Hi); hi_inv = vec_reciprocal(hi); - xi.v = r.v * hi_inv.v; + ui.v = r.v * hi_inv.v; - kernel_deval_vec(&xi, &wi, &wi_dx); + kernel_deval_vec(&ui, &wi, &wi_dx); /* Compute dv. */ dv[0].v = vi[0].v - vj[0].v; @@ -349,9 +349,9 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj, /* Compute density of pi. 
*/ rhoi.v = mj.v * wi.v; - rhoi_dh.v = mj.v * (vec_set1(hydro_dimension) * wi.v + xi.v * wi_dx.v); + rhoi_dh.v = mj.v * (vec_set1(hydro_dimension) * wi.v + ui.v * wi_dx.v); wcounti.v = wi.v; - wcounti_dh.v = xi.v * wi_dx.v; + wcounti_dh.v = (vec_set1(hydro_dimension) * wi.v + ui.v * wi_dx.v); div_vi.v = mj.v * dvdr.v * wi_dx.v; for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v; @@ -390,7 +390,7 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, vector *curlvySum, vector *curlvzSum, mask_t mask) { - vector r, ri, xi, wi, wi_dx; + vector r, ri, ui, wi, wi_dx; vector mj; vector dvx, dvy, dvz; vector vjx, vjy, vjz; @@ -407,10 +407,10 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, ri = vec_reciprocal_sqrt(*r2); r.v = vec_mul(r2->v, ri.v); - xi.v = vec_mul(r.v, hi_inv.v); + ui.v = vec_mul(r.v, hi_inv.v); /* Calculate the kernel for two particles. */ - kernel_deval_1_vec(&xi, &wi, &wi_dx); + kernel_deval_1_vec(&ui, &wi, &wi_dx); /* Compute dv. */ dvx.v = vec_sub(vix.v, vjx.v); @@ -432,14 +432,16 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, curlvry.v = vec_mul(curlvry.v, ri.v); curlvrz.v = vec_mul(curlvrz.v, ri.v); + vector wcount_dh_update; + wcount_dh_update.v = + vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)); + /* Mask updates to intermediate vector sums for particle pi. 
*/ rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask); - rho_dhSum->v = vec_mask_sub( - rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(xi.v, wi_dx.v))), - mask); + rho_dhSum->v = + vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v), mask); wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask); - wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask); + wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update.v, mask); div_vSum->v = vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask); curlvxSum->v = vec_mask_add(curlvxSum->v, @@ -464,13 +466,14 @@ runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz, vector *curlvySum, vector *curlvzSum, mask_t mask, mask_t mask2, short mask_cond) { - vector r, ri, r2, xi, wi, wi_dx; + + vector r, ri, r2, ui, wi, wi_dx; vector mj; vector dx, dy, dz, dvx, dvy, dvz; vector vjx, vjy, vjz; vector dvdr; vector curlvrx, curlvry, curlvrz; - vector r_2, ri2, r2_2, xi2, wi2, wi_dx2; + vector r_2, ri2, r2_2, ui2, wi2, wi_dx2; vector mj2; vector dx2, dy2, dz2, dvx2, dvy2, dvz2; vector vjx2, vjy2, vjz2; @@ -501,11 +504,11 @@ runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz, r.v = vec_mul(r2.v, ri.v); r_2.v = vec_mul(r2_2.v, ri2.v); - xi.v = vec_mul(r.v, hi_inv.v); - xi2.v = vec_mul(r_2.v, hi_inv.v); + ui.v = vec_mul(r.v, hi_inv.v); + ui2.v = vec_mul(r_2.v, hi_inv.v); /* Calculate the kernel for two particles. */ - kernel_deval_2_vec(&xi, &wi, &wi_dx, &xi2, &wi2, &wi_dx2); + kernel_deval_2_vec(&ui, &wi, &wi_dx, &ui2, &wi2, &wi_dx2); /* Compute dv. 
*/ dvx.v = vec_sub(vix.v, vjx.v); @@ -542,25 +545,25 @@ runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz, curlvrz.v = vec_mul(curlvrz.v, ri.v); curlvrz2.v = vec_mul(curlvrz2.v, ri2.v); + vector wcount_dh_update, wcount_dh_update2; + wcount_dh_update.v = + vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(ui.v, wi_dx.v)); + wcount_dh_update2.v = + vec_fma(vec_set1(hydro_dimension), wi2.v, vec_mul(ui2.v, wi_dx2.v)); + /* Mask updates to intermediate vector sums for particle pi. */ /* Mask only when needed. */ if (mask_cond) { rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj.v, wi.v), mask); rhoSum->v = vec_mask_add(rhoSum->v, vec_mul(mj2.v, wi2.v), mask2); - rho_dhSum->v = vec_mask_sub( - rho_dhSum->v, vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(xi.v, wi_dx.v))), - mask); - rho_dhSum->v = vec_mask_sub( - rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v, - vec_mul(xi2.v, wi_dx2.v))), - mask2); + rho_dhSum->v = + vec_mask_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v), mask); + rho_dhSum->v = + vec_mask_sub(rho_dhSum->v, vec_mul(mj2.v, wcount_dh_update2.v), mask2); wcountSum->v = vec_mask_add(wcountSum->v, wi.v, mask); wcountSum->v = vec_mask_add(wcountSum->v, wi2.v, mask2); - wcount_dhSum->v = - vec_mask_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v), mask); - wcount_dhSum->v = - vec_mask_sub(wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v), mask2); + wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update.v, mask); + wcount_dhSum->v = vec_mask_sub(wcount_dhSum->v, wcount_dh_update2.v, mask2); div_vSum->v = vec_mask_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask); div_vSum->v = vec_mask_sub( @@ -580,22 +583,27 @@ runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz, } else { rhoSum->v = vec_add(rhoSum->v, vec_mul(mj.v, wi.v)); rhoSum->v = vec_add(rhoSum->v, vec_mul(mj2.v, wi2.v)); - rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul( - mj.v, 
vec_fma(vec_set1(hydro_dimension), wi.v, vec_mul(xi.v, wi_dx.v)))); - rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(mj2.v, vec_fma(vec_set1(hydro_dimension), wi2.v, - vec_mul(xi2.v, wi_dx2.v)))); + rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(mj.v, wcount_dh_update.v)); + rho_dhSum->v = vec_sub(rho_dhSum->v, vec_mul(mj2.v, wcount_dh_update2.v)); wcountSum->v = vec_add(wcountSum->v, wi.v); wcountSum->v = vec_add(wcountSum->v, wi2.v); - wcount_dhSum->v = vec_sub(wcount_dhSum->v, vec_mul(xi.v, wi_dx.v)); - wcount_dhSum->v = vec_sub(wcount_dhSum->v, vec_mul(xi2.v, wi_dx2.v)); + wcount_dhSum->v = vec_sub(wcount_dhSum->v, wcount_dh_update.v); + wcount_dhSum->v = vec_sub(wcount_dhSum->v, wcount_dh_update2.v); div_vSum->v = vec_sub(div_vSum->v, vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v))); - div_vSum->v = vec_sub(div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v))); - curlvxSum->v = vec_add(curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v))); - curlvxSum->v = vec_add(curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v))); - curlvySum->v = vec_add(curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v))); - curlvySum->v = vec_add(curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v))); - curlvzSum->v = vec_add(curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v))); - curlvzSum->v = vec_add(curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v))); + div_vSum->v = + vec_sub(div_vSum->v, vec_mul(mj2.v, vec_mul(dvdr2.v, wi_dx2.v))); + curlvxSum->v = + vec_add(curlvxSum->v, vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v))); + curlvxSum->v = + vec_add(curlvxSum->v, vec_mul(mj2.v, vec_mul(curlvrx2.v, wi_dx2.v))); + curlvySum->v = + vec_add(curlvySum->v, vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v))); + curlvySum->v = + vec_add(curlvySum->v, vec_mul(mj2.v, vec_mul(curlvry2.v, wi_dx2.v))); + curlvzSum->v = + vec_add(curlvzSum->v, vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v))); + curlvzSum->v = + vec_add(curlvzSum->v, vec_mul(mj2.v, vec_mul(curlvrz2.v, wi_dx2.v))); } } 
#endif @@ -703,7 +711,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force( float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { -#ifdef WITH_VECTORIZATION +#ifdef WITH_OLD_VECTORIZATION vector r, r2, ri; vector xi, xj; @@ -985,7 +993,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { -#ifdef WITH_VECTORIZATION +#ifdef WITH_OLD_VECTORIZATION vector r, r2, ri; vector xi, xj; diff --git a/src/hydro/Gizmo/hydro.h b/src/hydro/Gizmo/hydro.h index 6d39c54d2ddc3571ac34c54fc9eede6f7dee6ac5..2c2f54699bb380a491edf61a83ad8a031572c86c 100644 --- a/src/hydro/Gizmo/hydro.h +++ b/src/hydro/Gizmo/hydro.h @@ -49,17 +49,21 @@ __attribute__((always_inline)) INLINE static float hydro_compute_timestep( return CFL_condition; #endif - if (p->timestepvars.vmax == 0.) { - /* vmax can be zero in vacuum cells that only have vacuum neighbours */ - /* in this case, the time step should be limited by the maximally - allowed time step. Since we do not know what that value is here, we set - the time step to a very large value */ - return FLT_MAX; - } else { - const float psize = powf(p->geometry.volume / hydro_dimension_unit_sphere, - hydro_dimension_inv); - return 2. * CFL_condition * psize / fabsf(p->timestepvars.vmax); + float vrel[3]; + vrel[0] = p->primitives.v[0] - xp->v_full[0]; + vrel[1] = p->primitives.v[1] - xp->v_full[1]; + vrel[2] = p->primitives.v[2] - xp->v_full[2]; + float vmax = + sqrtf(vrel[0] * vrel[0] + vrel[1] * vrel[1] + vrel[2] * vrel[2]) + + sqrtf(hydro_gamma * p->primitives.P / p->primitives.rho); + vmax = max(vmax, p->timestepvars.vmax); + const float psize = powf(p->geometry.volume / hydro_dimension_unit_sphere, + hydro_dimension_inv); + float dt = FLT_MAX; + if (vmax > 0.) 
{ + dt = psize / vmax; } + return CFL_condition * dt; } /** @@ -225,14 +229,15 @@ __attribute__((always_inline)) INLINE static void hydro_end_density( /* Some smoothing length multiples. */ const float h = p->h; const float ih = 1.0f / h; + const float ihdim = pow_dimension(ih); + const float ihdim_plus_one = ihdim * ih; /* Final operation on the density. */ p->density.wcount += kernel_root; - p->density.wcount *= kernel_norm; + p->density.wcount *= ihdim; - p->density.wcount_dh *= ih * kernel_gamma * kernel_norm; - - const float ihdim = pow_dimension(ih); + p->density.wcount_dh -= hydro_dimension * kernel_root; + p->density.wcount_dh *= ihdim_plus_one; /* Final operation on the geometry. */ /* we multiply with the smoothing kernel normalization ih3 and calculate the @@ -366,6 +371,42 @@ __attribute__((always_inline)) INLINE static void hydro_end_density( p->density.wcount_dh *= p->density.wcorr; } +/** + * @brief Sets all particle fields to sensible values when the #part has 0 ngbs. + * + * @param p The particle to act upon + * @param xp The extended particle data to act upon + */ +__attribute__((always_inline)) INLINE static void hydro_part_has_no_neighbours( + struct part* restrict p, struct xpart* restrict xp) { + + /* Some smoothing length multiples. */ + const float h = p->h; + const float h_inv = 1.0f / h; /* 1/h */ + const float h_inv_dim = pow_dimension(h_inv); /* 1/h^d */ + + /* Re-set problematic values */ + p->density.wcount = kernel_root * kernel_norm * h_inv_dim; + p->density.wcount_dh = 0.f; + p->geometry.volume = 1.0f; + p->geometry.matrix_E[0][0] = 1.0f; + p->geometry.matrix_E[0][1] = 0.0f; + p->geometry.matrix_E[0][2] = 0.0f; + p->geometry.matrix_E[1][0] = 0.0f; + p->geometry.matrix_E[1][1] = 1.0f; + p->geometry.matrix_E[1][2] = 0.0f; + p->geometry.matrix_E[2][0] = 0.0f; + p->geometry.matrix_E[2][1] = 0.0f; + p->geometry.matrix_E[2][2] = 1.0f; + /* centroid is relative w.r.t. 
particle position */ + /* by setting the centroid to 0.0f, we make sure no velocity correction is + applied */ + p->geometry.centroid[0] = 0.0f; + p->geometry.centroid[1] = 0.0f; + p->geometry.centroid[2] = 0.0f; + p->geometry.Atot = 1.0f; +} + /** * @brief Prepare a particle for the gradient calculation. * @@ -384,7 +425,7 @@ __attribute__((always_inline)) INLINE static void hydro_prepare_force( struct part* restrict p, struct xpart* restrict xp) { /* Initialize time step criterion variables */ - p->timestepvars.vmax = 0.0f; + p->timestepvars.vmax = 0.; /* Set the actual velocity of the particle */ hydro_velocities_prepare_force(p, xp); @@ -601,24 +642,12 @@ __attribute__((always_inline)) INLINE static void hydro_kick_extra( a_grav[1] = p->gpart->a_grav[1]; a_grav[2] = p->gpart->a_grav[2]; - /* Store the gravitational acceleration for later use. */ - /* This is used for the prediction step. */ - p->gravity.old_a[0] = a_grav[0]; - p->gravity.old_a[1] = a_grav[1]; - p->gravity.old_a[2] = a_grav[2]; - /* Make sure the gpart knows the mass has changed. */ p->gpart->mass = p->conserved.mass; - /* Kick the momentum for half a time step */ - /* Note that this also affects the particle movement, as the velocity for - the particles is set after this. */ - p->conserved.momentum[0] += dt * p->conserved.mass * a_grav[0]; - p->conserved.momentum[1] += dt * p->conserved.mass * a_grav[1]; - p->conserved.momentum[2] += dt * p->conserved.mass * a_grav[2]; - #if !defined(EOS_ISOTHERMAL_GAS) - /* This part still needs to be tested! */ + /* If the energy needs to be updated, we need to do it before the momentum + is updated, as the old value of the momentum enters the equations. 
*/ p->conserved.energy += dt * (p->conserved.momentum[0] * a_grav[0] + p->conserved.momentum[1] * a_grav[1] + p->conserved.momentum[2] * a_grav[2]); @@ -627,6 +656,13 @@ __attribute__((always_inline)) INLINE static void hydro_kick_extra( a_grav[1] * p->gravity.mflux[1] + a_grav[2] * p->gravity.mflux[2]); #endif + + /* Kick the momentum for half a time step */ + /* Note that this also affects the particle movement, as the velocity for + the particles is set after this. */ + p->conserved.momentum[0] += dt * p->conserved.mass * a_grav[0]; + p->conserved.momentum[1] += dt * p->conserved.mass * a_grav[1]; + p->conserved.momentum[2] += dt * p->conserved.mass * a_grav[2]; } /* reset fluxes */ diff --git a/src/hydro/Gizmo/hydro_debug.h b/src/hydro/Gizmo/hydro_debug.h index a05ff9a7d96f04ca3354235540adc31386a2d2e3..17e7f8a08570e355a701f8e165ee8af745fa34ab 100644 --- a/src/hydro/Gizmo/hydro_debug.h +++ b/src/hydro/Gizmo/hydro_debug.h @@ -46,7 +46,7 @@ __attribute__((always_inline)) INLINE static void hydro_debug_particle( "volume=%.3e, " "matrix_E=[[%.3e,%.3e,%.3e],[%.3e,%.3e,%.3e],[%.3e,%.3e,%.3e]]}, " "timestepvars={" - "vmax=%.3e}, " + "vmax=%.3e}," "density={" "div_v=%.3e, " "wcount_dh=%.3e, " diff --git a/src/hydro/Gizmo/hydro_flux_limiters.h b/src/hydro/Gizmo/hydro_flux_limiters.h new file mode 100644 index 0000000000000000000000000000000000000000..dc91cf2808e02d903ff97efddc20c164db9c954e --- /dev/null +++ b/src/hydro/Gizmo/hydro_flux_limiters.h @@ -0,0 +1,81 @@ + +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (c) 2017 Bert Vandenbroucke (bert.vandenbroucke@gmail.com) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ + +#ifndef SWIFT_HYDRO_FLUX_LIMITERS_H +#define SWIFT_HYDRO_FLUX_LIMITERS_H + +#ifdef GIZMO_FLUX_LIMITER + +#define HYDRO_FLUX_LIMITER_IMPLEMENTATION "GIZMO flux limiter" + +/** + * @brief Limit the flux between two particles. + * + * @param flux Unlimited flux between the particles. + * @param pi Particle i. + * @param pj Particle j. + */ +__attribute__((always_inline)) INLINE static void hydro_flux_limiters_apply( + float* flux, struct part* pi, struct part* pj) { + + float flux_limit_factor = 1.; + const float timefac = max(pi->force.dt, pj->force.dt); + const float areafac = max(pi->geometry.Atot, pj->geometry.Atot); + const float totfac = timefac * areafac; + if (flux[0] * totfac > pi->conserved.mass) { + flux_limit_factor = pi->conserved.mass / (flux[0] * totfac); + } + if (flux[0] * totfac > pj->conserved.mass) { + flux_limit_factor = + min(pj->conserved.mass / (flux[0] * totfac), flux_limit_factor); + } + if (flux[4] * totfac > pi->conserved.energy) { + flux_limit_factor = + min(pi->conserved.energy / (flux[4] * totfac), flux_limit_factor); + } + if (flux[4] * totfac > pj->conserved.energy) { + flux_limit_factor = + min(pj->conserved.energy / (flux[4] * totfac), flux_limit_factor); + } + + flux[0] *= flux_limit_factor; + flux[1] *= flux_limit_factor; + flux[2] *= flux_limit_factor; + flux[3] *= flux_limit_factor; + flux[4] *= flux_limit_factor; +} + +#else + +#define HYDRO_FLUX_LIMITER_IMPLEMENTATION "No flux limiter" + +/** + * @brief Limit the flux between two particles. 
+ * + * @param flux Unlimited flux between the particles. + * @param pi Particle i. + * @param pj Particle j. + */ +__attribute__((always_inline)) INLINE static void hydro_flux_limiters_apply( + float* flux, struct part* pi, struct part* pj) {} + +#endif + +#endif // SWIFT_HYDRO_FLUX_LIMITERS_H diff --git a/src/hydro/Gizmo/hydro_gradients.h b/src/hydro/Gizmo/hydro_gradients.h index 5ad6d87619a7629a703a8b9c03d089e69ffbdf7d..896128bd45d7964c1f4c8d63564f6fced38db770 100644 --- a/src/hydro/Gizmo/hydro_gradients.h +++ b/src/hydro/Gizmo/hydro_gradients.h @@ -99,7 +99,6 @@ __attribute__((always_inline)) INLINE static void hydro_gradients_predict( float xij_j[3]; int k; float xfac; - float a_grav_i[3], a_grav_j[3]; /* perform gradient reconstruction in space and time */ /* space */ @@ -141,34 +140,6 @@ __attribute__((always_inline)) INLINE static void hydro_gradients_predict( pj->primitives.gradients.P[1] * xij_j[1] + pj->primitives.gradients.P[2] * xij_j[2]; - a_grav_i[0] = pi->gravity.old_a[0]; - a_grav_i[1] = pi->gravity.old_a[1]; - a_grav_i[2] = pi->gravity.old_a[2]; - - a_grav_i[0] += pi->gravity.grad_a[0][0] * xij_i[0] + - pi->gravity.grad_a[0][1] * xij_i[1] + - pi->gravity.grad_a[0][2] * xij_i[2]; - a_grav_i[1] += pi->gravity.grad_a[1][0] * xij_i[0] + - pi->gravity.grad_a[1][1] * xij_i[1] + - pi->gravity.grad_a[1][2] * xij_i[2]; - a_grav_i[2] += pi->gravity.grad_a[2][0] * xij_i[0] + - pi->gravity.grad_a[2][1] * xij_i[1] + - pi->gravity.grad_a[2][2] * xij_i[2]; - - a_grav_j[0] = pj->gravity.old_a[0]; - a_grav_j[1] = pj->gravity.old_a[1]; - a_grav_j[2] = pj->gravity.old_a[2]; - - a_grav_j[0] += pj->gravity.grad_a[0][0] * xij_j[0] + - pj->gravity.grad_a[0][1] * xij_j[1] + - pj->gravity.grad_a[0][2] * xij_j[2]; - a_grav_j[1] += pj->gravity.grad_a[1][0] * xij_j[0] + - pj->gravity.grad_a[1][1] * xij_j[1] + - pj->gravity.grad_a[1][2] * xij_j[2]; - a_grav_j[2] += pj->gravity.grad_a[2][0] * xij_j[0] + - pj->gravity.grad_a[2][1] * xij_j[1] + - pj->gravity.grad_a[2][2] * 
xij_j[2]; - hydro_slope_limit_face(Wi, Wj, dWi, dWj, xij_i, xij_j, r); /* time */ @@ -198,10 +169,6 @@ __attribute__((always_inline)) INLINE static void hydro_gradients_predict( hydro_gamma * Wi[4] * (pi->primitives.gradients.v[0][0] + pi->primitives.gradients.v[1][1] + pi->primitives.gradients.v[2][2])); - - dWi[1] += 0.5 * mindt * a_grav_i[0]; - dWi[2] += 0.5 * mindt * a_grav_i[1]; - dWi[3] += 0.5 * mindt * a_grav_i[2]; } if (Wj[0] > 0.0f) { @@ -230,10 +197,6 @@ __attribute__((always_inline)) INLINE static void hydro_gradients_predict( hydro_gamma * Wj[4] * (pj->primitives.gradients.v[0][0] + pj->primitives.gradients.v[1][1] + pj->primitives.gradients.v[2][2])); - - dWj[1] += 0.5 * mindt * a_grav_j[0]; - dWj[2] += 0.5 * mindt * a_grav_j[1]; - dWj[3] += 0.5 * mindt * a_grav_j[2]; } Wi[0] += dWi[0]; diff --git a/src/hydro/Gizmo/hydro_gradients_gizmo.h b/src/hydro/Gizmo/hydro_gradients_gizmo.h index ee3ad6919f81f042ceacc5db8b4e818d63c90266..bc50c10d84cdd6b444887a8bb5fdf7b49a004eb8 100644 --- a/src/hydro/Gizmo/hydro_gradients_gizmo.h +++ b/src/hydro/Gizmo/hydro_gradients_gizmo.h @@ -45,18 +45,6 @@ __attribute__((always_inline)) INLINE static void hydro_gradients_init( p->primitives.gradients.P[1] = 0.0f; p->primitives.gradients.P[2] = 0.0f; - p->gravity.grad_a[0][0] = 0.0f; - p->gravity.grad_a[0][1] = 0.0f; - p->gravity.grad_a[0][2] = 0.0f; - - p->gravity.grad_a[1][0] = 0.0f; - p->gravity.grad_a[1][1] = 0.0f; - p->gravity.grad_a[1][2] = 0.0f; - - p->gravity.grad_a[2][0] = 0.0f; - p->gravity.grad_a[2][1] = 0.0f; - p->gravity.grad_a[2][2] = 0.0f; - hydro_slope_limit_cell_init(p); } @@ -157,35 +145,6 @@ __attribute__((always_inline)) INLINE static void hydro_gradients_collect( (Wi[4] - Wj[4]) * wi * (Bi[2][0] * dx[0] + Bi[2][1] * dx[1] + Bi[2][2] * dx[2]); - pi->gravity.grad_a[0][0] += - (pi->gravity.old_a[0] - pj->gravity.old_a[0]) * wi * - (Bi[0][0] * dx[0] + Bi[0][1] * dx[1] + Bi[0][2] * dx[2]); - pi->gravity.grad_a[0][1] += - (pi->gravity.old_a[0] - 
pj->gravity.old_a[0]) * wi * - (Bi[1][0] * dx[0] + Bi[1][1] * dx[1] + Bi[1][2] * dx[2]); - pi->gravity.grad_a[0][2] += - (pi->gravity.old_a[0] - pj->gravity.old_a[0]) * wi * - (Bi[2][0] * dx[0] + Bi[2][1] * dx[1] + Bi[2][2] * dx[2]); - - pi->gravity.grad_a[1][0] += - (pi->gravity.old_a[1] - pj->gravity.old_a[1]) * wi * - (Bi[0][0] * dx[0] + Bi[0][1] * dx[1] + Bi[0][2] * dx[2]); - pi->gravity.grad_a[1][1] += - (pi->gravity.old_a[1] - pj->gravity.old_a[1]) * wi * - (Bi[1][0] * dx[0] + Bi[1][1] * dx[1] + Bi[1][2] * dx[2]); - pi->gravity.grad_a[1][2] += - (pi->gravity.old_a[1] - pj->gravity.old_a[1]) * wi * - (Bi[2][0] * dx[0] + Bi[2][1] * dx[1] + Bi[2][2] * dx[2]); - - pi->gravity.grad_a[2][0] += - (pi->gravity.old_a[2] - pj->gravity.old_a[2]) * wi * - (Bi[0][0] * dx[0] + Bi[0][1] * dx[1] + Bi[0][2] * dx[2]); - pi->gravity.grad_a[2][1] += - (pi->gravity.old_a[2] - pj->gravity.old_a[2]) * wi * - (Bi[1][0] * dx[0] + Bi[1][1] * dx[1] + Bi[1][2] * dx[2]); - pi->gravity.grad_a[2][2] += - (pi->gravity.old_a[2] - pj->gravity.old_a[2]) * wi * - (Bi[2][0] * dx[0] + Bi[2][1] * dx[1] + Bi[2][2] * dx[2]); } else { /* The gradient matrix was not well-behaved, switch to SPH gradients */ @@ -223,27 +182,6 @@ __attribute__((always_inline)) INLINE static void hydro_gradients_collect( wi_dx * dx[1] * (pi->primitives.P - pj->primitives.P) / r; pi->primitives.gradients.P[2] -= wi_dx * dx[2] * (pi->primitives.P - pj->primitives.P) / r; - - pi->gravity.grad_a[0][0] -= - wi_dx * dx[0] * (pi->gravity.old_a[0] - pj->gravity.old_a[0]) / r; - pi->gravity.grad_a[0][1] -= - wi_dx * dx[1] * (pi->gravity.old_a[0] - pj->gravity.old_a[0]) / r; - pi->gravity.grad_a[0][2] -= - wi_dx * dx[2] * (pi->gravity.old_a[0] - pj->gravity.old_a[0]) / r; - - pi->gravity.grad_a[1][0] -= - wi_dx * dx[0] * (pi->gravity.old_a[1] - pj->gravity.old_a[1]) / r; - pi->gravity.grad_a[1][1] -= - wi_dx * dx[1] * (pi->gravity.old_a[1] - pj->gravity.old_a[1]) / r; - pi->gravity.grad_a[1][2] -= - wi_dx * dx[2] * 
(pi->gravity.old_a[1] - pj->gravity.old_a[1]) / r; - - pi->gravity.grad_a[2][0] -= - wi_dx * dx[0] * (pi->gravity.old_a[2] - pj->gravity.old_a[2]) / r; - pi->gravity.grad_a[2][1] -= - wi_dx * dx[1] * (pi->gravity.old_a[2] - pj->gravity.old_a[2]) / r; - pi->gravity.grad_a[2][2] -= - wi_dx * dx[2] * (pi->gravity.old_a[2] - pj->gravity.old_a[2]) / r; } hydro_slope_limit_cell_collect(pi, pj, r); @@ -306,35 +244,6 @@ __attribute__((always_inline)) INLINE static void hydro_gradients_collect( (Wi[4] - Wj[4]) * wj * (Bj[2][0] * dx[0] + Bj[2][1] * dx[1] + Bj[2][2] * dx[2]); - pj->gravity.grad_a[0][0] += - (pi->gravity.old_a[0] - pj->gravity.old_a[0]) * wj * - (Bj[0][0] * dx[0] + Bj[0][1] * dx[1] + Bj[0][2] * dx[2]); - pj->gravity.grad_a[0][1] += - (pi->gravity.old_a[0] - pj->gravity.old_a[0]) * wj * - (Bj[1][0] * dx[0] + Bj[1][1] * dx[1] + Bj[1][2] * dx[2]); - pj->gravity.grad_a[0][2] += - (pi->gravity.old_a[0] - pj->gravity.old_a[0]) * wj * - (Bj[2][0] * dx[0] + Bj[2][1] * dx[1] + Bj[2][2] * dx[2]); - - pj->gravity.grad_a[1][0] += - (pi->gravity.old_a[1] - pj->gravity.old_a[1]) * wj * - (Bj[0][0] * dx[0] + Bj[0][1] * dx[1] + Bj[0][2] * dx[2]); - pj->gravity.grad_a[1][1] += - (pi->gravity.old_a[1] - pj->gravity.old_a[1]) * wj * - (Bj[1][0] * dx[0] + Bj[1][1] * dx[1] + Bj[1][2] * dx[2]); - pj->gravity.grad_a[1][2] += - (pi->gravity.old_a[1] - pj->gravity.old_a[1]) * wj * - (Bj[2][0] * dx[0] + Bj[2][1] * dx[1] + Bj[2][2] * dx[2]); - - pj->gravity.grad_a[2][0] += - (pi->gravity.old_a[2] - pj->gravity.old_a[2]) * wj * - (Bj[0][0] * dx[0] + Bj[0][1] * dx[1] + Bj[0][2] * dx[2]); - pj->gravity.grad_a[2][1] += - (pi->gravity.old_a[2] - pj->gravity.old_a[2]) * wj * - (Bj[1][0] * dx[0] + Bj[1][1] * dx[1] + Bj[1][2] * dx[2]); - pj->gravity.grad_a[2][2] += - (pi->gravity.old_a[2] - pj->gravity.old_a[2]) * wj * - (Bj[2][0] * dx[0] + Bj[2][1] * dx[1] + Bj[2][2] * dx[2]); } else { /* SPH gradients */ @@ -371,27 +280,6 @@ __attribute__((always_inline)) INLINE static void 
hydro_gradients_collect( wj_dx * dx[1] * (pi->primitives.P - pj->primitives.P) / r; pj->primitives.gradients.P[2] -= wj_dx * dx[2] * (pi->primitives.P - pj->primitives.P) / r; - - pj->gravity.grad_a[0][0] -= - wj_dx * dx[0] * (pi->gravity.old_a[0] - pj->gravity.old_a[0]) / r; - pj->gravity.grad_a[0][1] -= - wj_dx * dx[1] * (pi->gravity.old_a[0] - pj->gravity.old_a[0]) / r; - pj->gravity.grad_a[0][2] -= - wj_dx * dx[2] * (pi->gravity.old_a[0] - pj->gravity.old_a[0]) / r; - - pj->gravity.grad_a[1][0] -= - wj_dx * dx[0] * (pi->gravity.old_a[1] - pj->gravity.old_a[1]) / r; - pj->gravity.grad_a[1][1] -= - wj_dx * dx[1] * (pi->gravity.old_a[1] - pj->gravity.old_a[1]) / r; - pj->gravity.grad_a[1][2] -= - wj_dx * dx[2] * (pi->gravity.old_a[1] - pj->gravity.old_a[1]) / r; - - pj->gravity.grad_a[2][0] -= - wj_dx * dx[0] * (pi->gravity.old_a[2] - pj->gravity.old_a[2]) / r; - pj->gravity.grad_a[2][1] -= - wj_dx * dx[1] * (pi->gravity.old_a[2] - pj->gravity.old_a[2]) / r; - pj->gravity.grad_a[2][2] -= - wj_dx * dx[2] * (pi->gravity.old_a[2] - pj->gravity.old_a[2]) / r; } hydro_slope_limit_cell_collect(pj, pi, r); @@ -493,35 +381,6 @@ hydro_gradients_nonsym_collect(float r2, float *dx, float hi, float hj, (Wi[4] - Wj[4]) * wi * (Bi[2][0] * dx[0] + Bi[2][1] * dx[1] + Bi[2][2] * dx[2]); - pi->gravity.grad_a[0][0] += - (pi->gravity.old_a[0] - pj->gravity.old_a[0]) * wi * - (Bi[0][0] * dx[0] + Bi[0][1] * dx[1] + Bi[0][2] * dx[2]); - pi->gravity.grad_a[0][1] += - (pi->gravity.old_a[0] - pj->gravity.old_a[0]) * wi * - (Bi[1][0] * dx[0] + Bi[1][1] * dx[1] + Bi[1][2] * dx[2]); - pi->gravity.grad_a[0][2] += - (pi->gravity.old_a[0] - pj->gravity.old_a[0]) * wi * - (Bi[2][0] * dx[0] + Bi[2][1] * dx[1] + Bi[2][2] * dx[2]); - - pi->gravity.grad_a[1][0] += - (pi->gravity.old_a[1] - pj->gravity.old_a[1]) * wi * - (Bi[0][0] * dx[0] + Bi[0][1] * dx[1] + Bi[0][2] * dx[2]); - pi->gravity.grad_a[1][1] += - (pi->gravity.old_a[1] - pj->gravity.old_a[1]) * wi * - (Bi[1][0] * dx[0] + Bi[1][1] * dx[1] + 
Bi[1][2] * dx[2]); - pi->gravity.grad_a[1][2] += - (pi->gravity.old_a[1] - pj->gravity.old_a[1]) * wi * - (Bi[2][0] * dx[0] + Bi[2][1] * dx[1] + Bi[2][2] * dx[2]); - - pi->gravity.grad_a[2][0] += - (pi->gravity.old_a[2] - pj->gravity.old_a[2]) * wi * - (Bi[0][0] * dx[0] + Bi[0][1] * dx[1] + Bi[0][2] * dx[2]); - pi->gravity.grad_a[2][1] += - (pi->gravity.old_a[2] - pj->gravity.old_a[2]) * wi * - (Bi[1][0] * dx[0] + Bi[1][1] * dx[1] + Bi[1][2] * dx[2]); - pi->gravity.grad_a[2][2] += - (pi->gravity.old_a[2] - pj->gravity.old_a[2]) * wi * - (Bi[2][0] * dx[0] + Bi[2][1] * dx[1] + Bi[2][2] * dx[2]); } else { /* Gradient matrix is not well-behaved, switch to SPH gradients */ @@ -558,27 +417,6 @@ hydro_gradients_nonsym_collect(float r2, float *dx, float hi, float hj, wi_dx * dx[1] * (pi->primitives.P - pj->primitives.P) / r; pi->primitives.gradients.P[2] -= wi_dx * dx[2] * (pi->primitives.P - pj->primitives.P) / r; - - pi->gravity.grad_a[0][0] -= - wi_dx * dx[0] * (pi->gravity.old_a[0] - pj->gravity.old_a[0]) / r; - pi->gravity.grad_a[0][1] -= - wi_dx * dx[1] * (pi->gravity.old_a[0] - pj->gravity.old_a[0]) / r; - pi->gravity.grad_a[0][2] -= - wi_dx * dx[2] * (pi->gravity.old_a[0] - pj->gravity.old_a[0]) / r; - - pi->gravity.grad_a[1][0] -= - wi_dx * dx[0] * (pi->gravity.old_a[1] - pj->gravity.old_a[1]) / r; - pi->gravity.grad_a[1][1] -= - wi_dx * dx[1] * (pi->gravity.old_a[1] - pj->gravity.old_a[1]) / r; - pi->gravity.grad_a[1][2] -= - wi_dx * dx[2] * (pi->gravity.old_a[1] - pj->gravity.old_a[1]) / r; - - pi->gravity.grad_a[2][0] -= - wi_dx * dx[0] * (pi->gravity.old_a[2] - pj->gravity.old_a[2]) / r; - pi->gravity.grad_a[2][1] -= - wi_dx * dx[1] * (pi->gravity.old_a[2] - pj->gravity.old_a[2]) / r; - pi->gravity.grad_a[2][2] -= - wi_dx * dx[2] * (pi->gravity.old_a[2] - pj->gravity.old_a[2]) / r; } hydro_slope_limit_cell_collect(pi, pj, r); @@ -618,17 +456,6 @@ __attribute__((always_inline)) INLINE static void hydro_gradients_finalize( p->primitives.gradients.P[1] *= ihdim; 
p->primitives.gradients.P[2] *= ihdim; - p->gravity.grad_a[0][0] *= ihdim; - p->gravity.grad_a[0][1] *= ihdim; - p->gravity.grad_a[0][2] *= ihdim; - - p->gravity.grad_a[1][0] *= ihdim; - p->gravity.grad_a[1][1] *= ihdim; - p->gravity.grad_a[1][2] *= ihdim; - - p->gravity.grad_a[2][0] *= ihdim; - p->gravity.grad_a[2][1] *= ihdim; - p->gravity.grad_a[2][2] *= ihdim; } else { const float ihdimp1 = pow_dimension_plus_one(ih); @@ -653,18 +480,6 @@ __attribute__((always_inline)) INLINE static void hydro_gradients_finalize( p->primitives.gradients.P[0] *= ihdimp1 * volume; p->primitives.gradients.P[1] *= ihdimp1 * volume; p->primitives.gradients.P[2] *= ihdimp1 * volume; - - p->gravity.grad_a[0][0] *= ihdimp1 * volume; - p->gravity.grad_a[0][1] *= ihdimp1 * volume; - p->gravity.grad_a[0][2] *= ihdimp1 * volume; - - p->gravity.grad_a[1][0] *= ihdimp1 * volume; - p->gravity.grad_a[1][1] *= ihdimp1 * volume; - p->gravity.grad_a[1][2] *= ihdimp1 * volume; - - p->gravity.grad_a[2][0] *= ihdimp1 * volume; - p->gravity.grad_a[2][1] *= ihdimp1 * volume; - p->gravity.grad_a[2][2] *= ihdimp1 * volume; } hydro_slope_limit_cell(p); diff --git a/src/hydro/Gizmo/hydro_iact.h b/src/hydro/Gizmo/hydro_iact.h index 8798dc859a790a83ab7a3b6f1709b1302f574581..0c7c8251b7d1c105dfc0c4b1637724accadaa4ae 100644 --- a/src/hydro/Gizmo/hydro_iact.h +++ b/src/hydro/Gizmo/hydro_iact.h @@ -20,6 +20,7 @@ ******************************************************************************/ #include "adiabatic_index.h" +#include "hydro_flux_limiters.h" #include "hydro_gradients.h" #include "riemann.h" @@ -57,7 +58,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_density( kernel_deval(xi, &wi, &wi_dx); pi->density.wcount += wi; - pi->density.wcount_dh -= xi * wi_dx; + pi->density.wcount_dh -= (hydro_dimension * wi + xi * wi_dx); /* these are eqns. 
(1) and (2) in the summary */ pi->geometry.volume += wi; @@ -74,7 +75,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_density( kernel_deval(xj, &wj, &wj_dx); pj->density.wcount += wj; - pj->density.wcount_dh -= xj * wj_dx; + pj->density.wcount_dh -= (hydro_dimension * wj + xj * wj_dx); /* these are eqns. (1) and (2) in the summary */ pj->geometry.volume += wj; @@ -121,7 +122,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density( kernel_deval(xi, &wi, &wi_dx); pi->density.wcount += wi; - pi->density.wcount_dh -= xi * wi_dx; + pi->density.wcount_dh -= (hydro_dimension * wi + xi * wi_dx); /* these are eqns. (1) and (2) in the summary */ pi->geometry.volume += wi; @@ -346,8 +347,11 @@ __attribute__((always_inline)) INLINE static void runner_iact_fluxes_common( } dvdotdx = (Wi[1] - Wj[1]) * dx[0] + (Wi[2] - Wj[2]) * dx[1] + (Wi[3] - Wj[3]) * dx[2]; - if (dvdotdx > 0.) { - vmax -= dvdotdx / r; + dvdotdx = min(dvdotdx, (vi[0] - vj[0]) * dx[0] + (vi[1] - vj[1]) * dx[1] + + (vi[2] - vj[2]) * dx[2]); + if (dvdotdx < 0.) { + /* the magical factor 3 also appears in Gadget2 */ + vmax -= 3. 
* dvdotdx / r; } pi->timestepvars.vmax = max(pi->timestepvars.vmax, vmax); if (mode == 1) { @@ -487,36 +491,10 @@ __attribute__((always_inline)) INLINE static void runner_iact_fluxes_common( float totflux[5]; riemann_solve_for_flux(Wi, Wj, n_unit, vij, totflux); - /* Flux limiter */ - float flux_limit_factor = 1.; - float timefac = max(dti, dtj); - float areafac = max(pi->geometry.Atot, pj->geometry.Atot); - if (totflux[0] * areafac * timefac > pi->conserved.mass) { - flux_limit_factor = pi->conserved.mass / (totflux[0] * areafac * timefac); - } - if (totflux[0] * areafac * timefac > pj->conserved.mass) { - flux_limit_factor = - min(pj->conserved.mass / (totflux[0] * areafac * timefac), - flux_limit_factor); - } - if (totflux[4] * areafac * timefac > pi->conserved.energy) { - flux_limit_factor = - min(pi->conserved.energy / (totflux[4] * areafac * timefac), - flux_limit_factor); - } - if (totflux[4] * areafac * timefac > pj->conserved.energy) { - flux_limit_factor = - min(pj->conserved.energy / (totflux[4] * areafac * timefac), - flux_limit_factor); - } - totflux[0] *= flux_limit_factor; - totflux[1] *= flux_limit_factor; - totflux[2] *= flux_limit_factor; - totflux[3] *= flux_limit_factor; - totflux[4] *= flux_limit_factor; + hydro_flux_limiters_apply(totflux, pi, pj); /* Store mass flux */ - float mflux = mindt * Anorm * totflux[0]; + float mflux = Anorm * totflux[0]; pi->gravity.mflux[0] += mflux * dx[0]; pi->gravity.mflux[1] += mflux * dx[1]; pi->gravity.mflux[2] += mflux * dx[2]; @@ -554,7 +532,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_fluxes_common( if (mode == 1 || pj->force.active == 0) { /* Store mass flux */ - mflux = mindt * Anorm * totflux[0]; + mflux = Anorm * totflux[0]; pj->gravity.mflux[0] -= mflux * dx[0]; pj->gravity.mflux[1] -= mflux * dx[1]; pj->gravity.mflux[2] -= mflux * dx[2]; diff --git a/src/hydro/Gizmo/hydro_io.h b/src/hydro/Gizmo/hydro_io.h index 
3d58be2f47c4e1904aaac5f69d1862f1d453e488..d20f7e2eb1cf50be7690e15a9569d8e9c4605af5 100644 --- a/src/hydro/Gizmo/hydro_io.h +++ b/src/hydro/Gizmo/hydro_io.h @@ -18,6 +18,7 @@ ******************************************************************************/ #include "adiabatic_index.h" +#include "hydro_flux_limiters.h" #include "hydro_gradients.h" #include "hydro_slope_limiters.h" #include "io_properties.h" @@ -127,7 +128,7 @@ float convert_Etot(struct engine* e, struct part* p) { void hydro_write_particles(struct part* parts, struct io_props* list, int* num_fields) { - *num_fields = 11; + *num_fields = 10; /* List what we want to write */ list[0] = io_make_output_field("Coordinates", DOUBLE, 3, UNIT_CONV_LENGTH, @@ -152,8 +153,6 @@ void hydro_write_particles(struct part* parts, struct io_props* list, list[9] = io_make_output_field_convert_part("TotEnergy", FLOAT, 1, UNIT_CONV_ENERGY, parts, conserved.energy, convert_Etot); - list[10] = io_make_output_field("GravAcceleration", FLOAT, 3, - UNIT_CONV_ACCELERATION, parts, gravity.old_a); } /** @@ -171,6 +170,10 @@ void writeSPHflavour(hid_t h_grpsph) { io_write_attribute_s(h_grpsph, "Piecewise slope limiter model", HYDRO_SLOPE_LIMITER_FACE_IMPLEMENTATION); + /* Flux limiter information */ + io_write_attribute_s(h_grpsph, "Flux limiter model", + HYDRO_FLUX_LIMITER_IMPLEMENTATION); + /* Riemann solver information */ io_write_attribute_s(h_grpsph, "Riemann solver type", RIEMANN_SOLVER_IMPLEMENTATION); diff --git a/src/hydro/Gizmo/hydro_part.h b/src/hydro/Gizmo/hydro_part.h index 6c96004847ae23b46ec3f5182f742e0e84f1118d..47f722c5a2dcce2f3ce603ade3029821d6686067 100644 --- a/src/hydro/Gizmo/hydro_part.h +++ b/src/hydro/Gizmo/hydro_part.h @@ -153,10 +153,13 @@ struct part { } geometry; - /* Variables used for timestep calculation (currently not used). */ + /* Variables used for timestep calculation. */ struct { - /* Maximum fluid velocity among all neighbours. 
*/ + /* Maximum signal velocity among all the neighbours of the particle. The + * signal velocity encodes information about the relative fluid velocities + * AND particle velocities of the neighbour and this particle, as well as + * the sound speed of both particles. */ float vmax; } timestepvars; @@ -201,14 +204,6 @@ struct part { /* Specific stuff for the gravity-hydro coupling. */ struct { - /* Previous value of the gravitational acceleration. */ - float old_a[3]; - - float grad_a[3][3]; - - /* Previous value of the mass flux vector. */ - float old_mflux[3]; - /* Current value of the mass flux vector. */ float mflux[3]; diff --git a/src/hydro/Minimal/hydro.h b/src/hydro/Minimal/hydro.h index 8f216a550ae061d01a594ff23d57575e754f85dc..4d8ca5b05547467c973e17983774b64736060471 100644 --- a/src/hydro/Minimal/hydro.h +++ b/src/hydro/Minimal/hydro.h @@ -219,12 +219,34 @@ __attribute__((always_inline)) INLINE static void hydro_end_density( p->rho += p->mass * kernel_root; p->density.rho_dh -= hydro_dimension * p->mass * kernel_root; p->density.wcount += kernel_root; + p->density.wcount_dh -= hydro_dimension * kernel_root; /* Finish the calculation by inserting the missing h-factors */ p->rho *= h_inv_dim; p->density.rho_dh *= h_inv_dim_plus_one; p->density.wcount *= kernel_norm; - p->density.wcount_dh *= h_inv * kernel_gamma * kernel_norm; + p->density.wcount_dh *= h_inv_dim_plus_one; +} + +/** + * @brief Sets all particle fields to sensible values when the #part has 0 ngbs. + * + * @param p The particle to act upon + * @param xp The extended particle data to act upon + */ +__attribute__((always_inline)) INLINE static void hydro_part_has_no_neighbours( + struct part *restrict p, struct xpart *restrict xp) { + + /* Some smoothing length multiples. 
*/ + const float h = p->h; + const float h_inv = 1.0f / h; /* 1/h */ + const float h_inv_dim = pow_dimension(h_inv); /* 1/h^d */ + + /* Re-set problematic values */ + p->rho = p->mass * kernel_root * h_inv_dim; + p->density.wcount = kernel_root * kernel_norm * h_inv_dim; + p->density.rho_dh = 0.f; + p->density.wcount_dh = 0.f; } /** diff --git a/src/hydro/Minimal/hydro_iact.h b/src/hydro/Minimal/hydro_iact.h index 169947b99e92d9bd1b0870d502a49e311820ff81..621177a3363e651e12dd728ad96ddadce3812f0e 100644 --- a/src/hydro/Minimal/hydro_iact.h +++ b/src/hydro/Minimal/hydro_iact.h @@ -51,23 +51,23 @@ __attribute__((always_inline)) INLINE static void runner_iact_density( /* Compute density of pi. */ const float hi_inv = 1.f / hi; - const float xi = r * hi_inv; - kernel_deval(xi, &wi, &wi_dx); + const float ui = r * hi_inv; + kernel_deval(ui, &wi, &wi_dx); pi->rho += mj * wi; - pi->density.rho_dh -= mj * (hydro_dimension * wi + xi * wi_dx); + pi->density.rho_dh -= mj * (hydro_dimension * wi + ui * wi_dx); pi->density.wcount += wi; - pi->density.wcount_dh -= xi * wi_dx; + pi->density.wcount_dh -= (hydro_dimension * wi + ui * wi_dx); /* Compute density of pj. 
*/ const float hj_inv = 1.f / hj; - const float xj = r * hj_inv; - kernel_deval(xj, &wj, &wj_dx); + const float uj = r * hj_inv; + kernel_deval(uj, &wj, &wj_dx); pj->rho += mi * wj; - pj->density.rho_dh -= mi * (hydro_dimension * wj + xj * wj_dx); + pj->density.rho_dh -= mi * (hydro_dimension * wj + uj * wj_dx); pj->density.wcount += wj; - pj->density.wcount_dh -= xj * wj_dx; + pj->density.wcount_dh -= (hydro_dimension * wj + uj * wj_dx); } /** @@ -96,13 +96,13 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density( const float r = sqrtf(r2); const float h_inv = 1.f / hi; - const float xi = r * h_inv; - kernel_deval(xi, &wi, &wi_dx); + const float ui = r * h_inv; + kernel_deval(ui, &wi, &wi_dx); pi->rho += mj * wi; - pi->density.rho_dh -= mj * (hydro_dimension * wi + xi * wi_dx); + pi->density.rho_dh -= mj * (hydro_dimension * wi + ui * wi_dx); pi->density.wcount += wi; - pi->density.wcount_dh -= xi * wi_dx; + pi->density.wcount_dh -= (hydro_dimension * wi + ui * wi_dx); } /** diff --git a/src/hydro/PressureEntropy/hydro.h b/src/hydro/PressureEntropy/hydro.h index 4c4868cd3703e5ec5466d4878749a61284b19344..080b796b21d7f3b48191cd375574ae1de6d11d1a 100644 --- a/src/hydro/PressureEntropy/hydro.h +++ b/src/hydro/PressureEntropy/hydro.h @@ -212,14 +212,15 @@ __attribute__((always_inline)) INLINE static void hydro_end_density( p->density.pressure_dh -= hydro_dimension * p->mass * p->entropy_one_over_gamma * kernel_root; p->density.wcount += kernel_root; + p->density.wcount_dh -= hydro_dimension * kernel_root; /* Finish the calculation by inserting the missing h-factors */ p->rho *= h_inv_dim; p->rho_bar *= h_inv_dim; p->density.rho_dh *= h_inv_dim_plus_one; p->density.pressure_dh *= h_inv_dim_plus_one; - p->density.wcount *= kernel_norm; - p->density.wcount_dh *= h_inv * kernel_gamma * kernel_norm; + p->density.wcount *= h_inv_dim; + p->density.wcount_dh *= h_inv_dim_plus_one; const float rho_inv = 1.f / p->rho; const float 
entropy_minus_one_over_gamma = 1.f / p->entropy_one_over_gamma; @@ -236,6 +237,33 @@ __attribute__((always_inline)) INLINE static void hydro_end_density( p->density.div_v *= h_inv_dim_plus_one * rho_inv; } +/** + * @brief Sets all particle fields to sensible values when the #part has 0 ngbs. + * + * @param p The particle to act upon + * @param xp The extended particle data to act upon + */ +__attribute__((always_inline)) INLINE static void hydro_part_has_no_neighbours( + struct part *restrict p, struct xpart *restrict xp) { + + /* Some smoothing length multiples. */ + const float h = p->h; + const float h_inv = 1.0f / h; /* 1/h */ + const float h_inv_dim = pow_dimension(h_inv); /* 1/h^d */ + + /* Re-set problematic values */ + p->rho = p->mass * kernel_root * h_inv_dim; + p->rho_bar = p->mass * kernel_root * h_inv_dim; + p->density.wcount = kernel_root * kernel_norm * h_inv_dim; + p->density.rho_dh = 0.f; + p->density.wcount_dh = 0.f; + p->density.pressure_dh = 0.f; + p->density.div_v = 0.f; + p->density.rot_v[0] = 0.f; + p->density.rot_v[1] = 0.f; + p->density.rot_v[2] = 0.f; +} + /** * @brief Prepare a particle for the force calculation. 
* diff --git a/src/hydro/PressureEntropy/hydro_iact.h b/src/hydro/PressureEntropy/hydro_iact.h index ce1c38ca69954252dc804af9181b9060a14afcb9..37a9f2b01af16fe598b414a9f67123849bee1442 100644 --- a/src/hydro/PressureEntropy/hydro_iact.h +++ b/src/hydro/PressureEntropy/hydro_iact.h @@ -59,7 +59,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_density( /* Compute contribution to the number of neighbours */ pi->density.wcount += wi; - pi->density.wcount_dh -= ui * wi_dx; + pi->density.wcount_dh -= (hydro_dimension * wi + ui * wi_dx); /* Compute contribution to the weighted density */ pi->rho_bar += mj * pj->entropy_one_over_gamma * wi; @@ -77,7 +77,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_density( /* Compute contribution to the number of neighbours */ pj->density.wcount += wj; - pj->density.wcount_dh -= uj * wj_dx; + pj->density.wcount_dh -= (hydro_dimension * wj + uj * wj_dx); /* Compute contribution to the weighted density */ pj->rho_bar += mi * pi->entropy_one_over_gamma * wj; @@ -147,7 +147,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density( /* Compute contribution to the number of neighbours */ pi->density.wcount += wi; - pi->density.wcount_dh -= ui * wi_dx; + pi->density.wcount_dh -= (hydro_dimension * wi + ui * wi_dx); /* Compute contribution to the weighted density */ pi->rho_bar += mj * pj->entropy_one_over_gamma * wi; diff --git a/src/hydro/Shadowswift/hydro.h b/src/hydro/Shadowswift/hydro.h index 0568d47ee7ed33c59790cbca943cccbf1ceda58f..abbcdcd2f7879d8063a906e44ab2fe6a3e675828 100644 --- a/src/hydro/Shadowswift/hydro.h +++ b/src/hydro/Shadowswift/hydro.h @@ -238,6 +238,25 @@ __attribute__((always_inline)) INLINE static void hydro_end_density( #endif } +/** + * @brief Sets all particle fields to sensible values when the #part has 0 ngbs. 
+ * + * @param p The particle to act upon + * @param xp The extended particle data to act upon + */ +__attribute__((always_inline)) INLINE static void hydro_part_has_no_neighbours( + struct part* restrict p, struct xpart* restrict xp) { + + /* Some smoothing length multiples. */ + const float h = p->h; + const float h_inv = 1.0f / h; /* 1/h */ + const float h_inv_dim = pow_dimension(h_inv); /* 1/h^d */ + + /* Re-set problematic values */ + p->density.wcount = kernel_root * kernel_norm * h_inv_dim; + p->density.wcount_dh = 0.f; +} + /** * @brief Prepare a particle for the gradient calculation. * diff --git a/src/hydro_properties.c b/src/hydro_properties.c index 818c1b6349192ed73b28cd4c3ae771f89a3754cd..1e7554f7d84220b8c962d60cc4538c685b5bad52 100644 --- a/src/hydro_properties.c +++ b/src/hydro_properties.c @@ -33,16 +33,26 @@ #include "kernel_hydro.h" #define hydro_props_default_max_iterations 30 -#define hydro_props_default_volume_change 2.0f +#define hydro_props_default_volume_change 1.4f #define hydro_props_default_h_max FLT_MAX +#define hydro_props_default_h_tolerance 1e-4 void hydro_props_init(struct hydro_props *p, const struct swift_params *params) { /* Kernel properties */ p->eta_neighbours = parser_get_param_float(params, "SPH:resolution_eta"); + + /* Tolerance for the smoothing length Newton-Raphson scheme */ + p->h_tolerance = parser_get_opt_param_float(params, "SPH:h_tolerance", + hydro_props_default_h_tolerance); + + /* Get derived properties */ p->target_neighbours = pow_dimension(p->eta_neighbours) * kernel_norm; - p->delta_neighbours = parser_get_param_float(params, "SPH:delta_neighbours"); + const float delta_eta = p->eta_neighbours * (1.f + p->h_tolerance); + p->delta_neighbours = + (pow_dimension(delta_eta) - pow_dimension(p->eta_neighbours)) * + kernel_norm; #ifdef SHADOWFAX_SPH /* change the meaning of target_neighbours and delta_neighbours */ @@ -81,9 +91,11 @@ void hydro_props_print(const struct hydro_props *p) { message("Hydrodynamic scheme: 
%s in %dD.", SPH_IMPLEMENTATION, (int)hydro_dimension); - message("Hydrodynamic kernel: %s with %.2f +/- %.2f neighbours (eta=%f).", - kernel_name, p->target_neighbours, p->delta_neighbours, - p->eta_neighbours); + message("Hydrodynamic kernel: %s with eta=%f (%.2f neighbours).", kernel_name, + p->eta_neighbours, p->target_neighbours); + + message("Hydrodynamic relative tolerance in h: %.5f (+/- %.4f neighbours).", + p->h_tolerance, p->delta_neighbours); message("Hydrodynamic integration: CFL parameter: %.4f.", p->CFL_condition); @@ -110,6 +122,7 @@ void hydro_props_print_snapshot(hid_t h_grpsph, const struct hydro_props *p) { io_write_attribute_f(h_grpsph, "Kernel target N_ngb", p->target_neighbours); io_write_attribute_f(h_grpsph, "Kernel delta N_ngb", p->delta_neighbours); io_write_attribute_f(h_grpsph, "Kernel eta", p->eta_neighbours); + io_write_attribute_f(h_grpsph, "Smoothing length tolerance", p->h_tolerance); io_write_attribute_f(h_grpsph, "Maximal smoothing length", p->h_max); io_write_attribute_f(h_grpsph, "CFL parameter", p->CFL_condition); io_write_attribute_f(h_grpsph, "Volume log(max(delta h))", diff --git a/src/hydro_properties.h b/src/hydro_properties.h index 716c4c060c21eb95d05f9d50e13d4681a958a6fd..a887ccb6df13b649cd1ef1009059c6f08908669c 100644 --- a/src/hydro_properties.h +++ b/src/hydro_properties.h @@ -16,10 +16,14 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. * ******************************************************************************/ - #ifndef SWIFT_HYDRO_PROPERTIES #define SWIFT_HYDRO_PROPERTIES +/** + * @file hydro_properties.h + * @brief Contains all the constants and parameters of the hydro scheme + */ + /* Config parameters. */ #include "../config.h" @@ -35,19 +39,28 @@ */ struct hydro_props { - /* Kernel properties */ + /*! Resolution parameter */ float eta_neighbours; + + /*! Target weightd number of neighbours (for info only)*/ float target_neighbours; + + /*! 
Smoothing length tolerance */ + float h_tolerance; + + /*! Tolerance on neighbour number (for info only)*/ float delta_neighbours; - /* Maximal smoothing length */ + /*! Maximal smoothing length */ float h_max; - /* Number of iterations to converge h */ + /*! Maximal number of iterations to converge h */ int max_smoothing_iterations; - /* Time integration properties */ + /*! Time integration properties */ float CFL_condition; + + /*! Maximal change of h over one time-step */ float log_max_h_change; }; diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h index 45384e1aabb0189fd69a6a3cff122df95706af85..2e0f457d05c926fc1efa4fd334e7c8cc69189133 100644 --- a/src/kernel_hydro.h +++ b/src/kernel_hydro.h @@ -341,20 +341,7 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx( /* ------------------------------------------------------------------------- */ -#ifdef WITH_VECTORIZATION - -static const vector kernel_gamma_inv_vec = FILL_VEC((float)kernel_gamma_inv); - -static const vector kernel_ivals_vec = FILL_VEC((float)kernel_ivals); - -static const vector kernel_constant_vec = FILL_VEC((float)kernel_constant); - -static const vector kernel_gamma_inv_dim_vec = - FILL_VEC((float)kernel_gamma_inv_dim); - -static const vector kernel_gamma_inv_dim_plus_one_vec = - FILL_VEC((float)kernel_gamma_inv_dim_plus_one); - +#ifdef WITH_OLD_VECTORIZATION /** * @brief Computes the kernel function and its derivative (Vectorised version). * @@ -373,7 +360,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec( /* Load x and get the interval id. */ vector ind; - ind.m = vec_ftoi(vec_fmin(vec_mul(x.v, kernel_ivals_vec.v), kernel_ivals_vec.v)); + ind.m = + vec_ftoi(vec_fmin(vec_mul(x.v, kernel_ivals_vec.v), kernel_ivals_vec.v)); /* load the coefficients. 
*/ vector c[kernel_degree + 1]; @@ -392,9 +380,26 @@ __attribute__((always_inline)) INLINE static void kernel_deval_vec( } /* Return everything */ - w->v = vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); - dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v)); + w->v = + vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); + dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, + kernel_gamma_inv_dim_plus_one_vec.v)); } +#endif + +#ifdef WITH_VECTORIZATION + +static const vector kernel_gamma_inv_vec = FILL_VEC((float)kernel_gamma_inv); + +static const vector kernel_ivals_vec = FILL_VEC((float)kernel_ivals); + +static const vector kernel_constant_vec = FILL_VEC((float)kernel_constant); + +static const vector kernel_gamma_inv_dim_vec = + FILL_VEC((float)kernel_gamma_inv_dim); + +static const vector kernel_gamma_inv_dim_plus_one_vec = + FILL_VEC((float)kernel_gamma_inv_dim_plus_one); /* Define constant vectors for the Wendland C2 and Cubic Spline kernel * coefficients. */ @@ -468,14 +473,15 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec( w->v = vec_fma(x.v, w->v, wendland_const_c5.v); #elif defined(CUBIC_SPLINE_KERNEL) vector w2, dw_dx2; - mask_t mask_reg1, mask_reg2; + mask_t mask_reg; - /* Form a mask for each part of the kernel. */ - vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v)); /* 0 < x < 0.5 */ - vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ + /* Form a mask for one part of the kernel. */ + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ /* Work out w for both regions of the kernel and combine the results together - * using masks. */ + * using a mask. */ /* Init the iteration for Horner's scheme. 
*/ w->v = vec_fma(cubic_1_const_c0.v, x.v, cubic_1_const_c1.v); @@ -494,20 +500,17 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec( w->v = vec_fma(x.v, w->v, cubic_1_const_c3.v); w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v); - /* Mask out unneeded values. */ - w->v = vec_and_mask(w->v, mask_reg1); - w2.v = vec_and_mask(w2.v, mask_reg2); - dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1); - dw_dx2.v = vec_and_mask(dw_dx2.v, mask_reg2); + /* Blend both kernel regions into one vector (mask out unneeded values). */ + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + w->v = vec_blend(mask_reg, w->v, w2.v); + dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v); - /* Added both w and w2 together to form complete result. */ - w->v = vec_add(w->v, w2.v); - dw_dx->v = vec_add(dw_dx->v, dw_dx2.v); #else #error "Vectorisation not supported for this kernel!!!" #endif - /* Return everything */ + /* Return everyting */ w->v = vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, @@ -579,13 +582,13 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec( #elif defined(CUBIC_SPLINE_KERNEL) vector w_2, dw_dx_2; vector w2_2, dw_dx2_2; - mask_t mask_reg1, mask_reg2, mask_reg1_v2, mask_reg2_v2; + mask_t mask_reg, mask_reg_v2; - /* Form a mask for each part of the kernel. */ - vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v)); /* 0 < x < 0.5 */ - vec_create_mask(mask_reg1_v2, vec_cmp_lt(x2.v, cond.v)); /* 0 < x < 0.5 */ - vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ - vec_create_mask(mask_reg2_v2, vec_cmp_gte(x2.v, cond.v)); /* 0.5 < x < 1 */ + /* Form a mask for one part of the kernel for each vector. 
*/ + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ + vec_create_mask(mask_reg_v2, vec_cmp_gte(x2.v, cond.v)); /* 0.5 < x < 1 */ /* Work out w for both regions of the kernel and combine the results together * using masks. */ @@ -619,29 +622,23 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec( w_2.v = vec_fma(x.v, w_2.v, cubic_2_const_c3.v); w2_2.v = vec_fma(x2.v, w2_2.v, cubic_2_const_c3.v); - /* Mask out unneeded values. */ - w->v = vec_and_mask(w->v, mask_reg1); - w2->v = vec_and_mask(w2->v, mask_reg1_v2); - w_2.v = vec_and_mask(w_2.v, mask_reg2); - w2_2.v = vec_and_mask(w2_2.v, mask_reg2_v2); - dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1); - dw_dx2->v = vec_and_mask(dw_dx2->v, mask_reg1_v2); - dw_dx_2.v = vec_and_mask(dw_dx_2.v, mask_reg2); - dw_dx2_2.v = vec_and_mask(dw_dx2_2.v, mask_reg2_v2); - - /* Added both w and w2 together to form complete result. */ - w->v = vec_add(w->v, w_2.v); - w2->v = vec_add(w2->v, w2_2.v); - dw_dx->v = vec_add(dw_dx->v, dw_dx_2.v); - dw_dx2->v = vec_add(dw_dx2->v, dw_dx2_2.v); + /* Blend both kernel regions into one vector (mask out unneeded values). 
*/ + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + w->v = vec_blend(mask_reg, w->v, w_2.v); + w2->v = vec_blend(mask_reg_v2, w2->v, w2_2.v); + dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx_2.v); + dw_dx2->v = vec_blend(mask_reg_v2, dw_dx2->v, dw_dx2_2.v); /* Return everything */ - w->v = vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); - w2->v = vec_mul(w2->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); - dw_dx->v = - vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v)); - dw_dx2->v = - vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v)); + w->v = + vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); + w2->v = vec_mul(w2->v, + vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); + dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, + kernel_gamma_inv_dim_plus_one_vec.v)); + dw_dx2->v = vec_mul(dw_dx2->v, vec_mul(kernel_constant_vec.v, + kernel_gamma_inv_dim_plus_one_vec.v)); #endif } @@ -672,12 +669,13 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u, w->v = vec_fma(x.v, w->v, wendland_const_c5.v); #elif defined(CUBIC_SPLINE_KERNEL) vector w2; - mask_t mask_reg1, mask_reg2; + mask_t mask_reg; /* Form a mask for each part of the kernel. */ - vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v)); /* 0 < x < 0.5 */ - vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ - + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ + /* Work out w for both regions of the kernel and combine the results together * using masks. 
*/ @@ -693,11 +691,10 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u, w2.v = vec_fma(x.v, w2.v, cubic_2_const_c3.v); /* Mask out unneeded values. */ - w->v = vec_and_mask(w->v, mask_reg1); - w2.v = vec_and_mask(w2.v, mask_reg2); + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + w->v = vec_blend(mask_reg, w->v, w2.v); - /* Added both w and w2 together to form complete result. */ - w->v = vec_add(w->v, w2.v); #else #error "Vectorisation not supported for this kernel!!!" #endif @@ -796,11 +793,12 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_vec( #elif defined(CUBIC_SPLINE_KERNEL) vector dw_dx2; - mask_t mask_reg1, mask_reg2; + mask_t mask_reg; /* Form a mask for each part of the kernel. */ - vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v)); /* 0 < x < 0.5 */ - vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ /* Work out w for both regions of the kernel and combine the results together * using masks. */ @@ -814,18 +812,17 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_vec( dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v); /* Mask out unneeded values. */ - dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1); - dw_dx2.v = vec_and_mask(dw_dx2.v, mask_reg2); + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v); - /* Added both dwdx and dwdx2 together to form complete result. */ - dw_dx->v = vec_add(dw_dx->v, dw_dx2.v); #else #error "Vectorisation not supported for this kernel!!!" #endif /* Mask out result for particles that lie outside of the kernel function. 
*/ mask_t mask; - vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f))); /* x < 1 */ + vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f))); /* x < 1 */ dw_dx->v = vec_and_mask(dw_dx->v, mask); @@ -842,6 +839,10 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_vec( * * @param u The ratio of the distance to the smoothing length $u = x/h$. * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$. + * @param u_2 The ratio of the distance to the smoothing length $u = x/h$ for + * second particle. + * @param dw_dx_2 (return) The norm of the gradient of $|\\nabla W(x,h)|$ for + * second particle. */ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec( vector *u, vector *dw_dx, vector *u_2, vector *dw_dx_2) { @@ -869,15 +870,15 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec( #elif defined(CUBIC_SPLINE_KERNEL) vector dw_dx2, dw_dx2_2; - mask_t mask_reg1, mask_reg2; - mask_t mask_reg1_2, mask_reg2_2; + mask_t mask_reg; + mask_t mask_reg_v2; + + /* Form a mask for one part of the kernel. */ + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + vec_create_mask(mask_reg, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ + vec_create_mask(mask_reg_v2, vec_cmp_gte(x_2.v, cond.v)); /* 0.5 < x < 1 */ - /* Form a mask for each part of the kernel. */ - vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v)); /* 0 < x < 0.5 */ - vec_create_mask(mask_reg1_2, vec_cmp_lt(x_2.v, cond.v)); /* 0 < x < 0.5 */ - vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ - vec_create_mask(mask_reg2_2, vec_cmp_gte(x_2.v, cond.v)); /* 0.5 < x < 1 */ - /* Work out w for both regions of the kernel and combine the results together * using masks. 
*/ @@ -894,22 +895,19 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec( dw_dx2_2.v = vec_fma(dw_dx2_2.v, x_2.v, cubic_2_dwdx_const_c2.v); /* Mask out unneeded values. */ - dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1); - dw_dx_2->v = vec_and_mask(dw_dx_2->v, mask_reg1_2); - dw_dx2.v = vec_and_mask(dw_dx2.v, mask_reg2); - dw_dx2_2.v = vec_and_mask(dw_dx2_2.v, mask_reg2_2); + /* Only need the mask for one region as the vec_blend defaults to the vector + * when the mask is 0.*/ + dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v); + dw_dx_2->v = vec_blend(mask_reg_v2, dw_dx_2->v, dw_dx2_2.v); - /* Added both dwdx and dwdx2 together to form complete result. */ - dw_dx->v = vec_add(dw_dx->v, dw_dx2.v); - dw_dx_2->v = vec_add(dw_dx_2->v, dw_dx2_2.v); #else #error "Vectorisation not supported for this kernel!!!" #endif /* Mask out result for particles that lie outside of the kernel function. */ mask_t mask, mask_2; - vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f))); /* x < 1 */ - vec_create_mask(mask_2, vec_cmp_lt(x_2.v, vec_set1(1.f))); /* x < 1 */ + vec_create_mask(mask, vec_cmp_lt(x.v, vec_set1(1.f))); /* x < 1 */ + vec_create_mask(mask_2, vec_cmp_lt(x_2.v, vec_set1(1.f))); /* x < 1 */ dw_dx->v = vec_and_mask(dw_dx->v, mask); dw_dx_2->v = vec_and_mask(dw_dx_2->v, mask_2); diff --git a/src/kernel_long_gravity.h b/src/kernel_long_gravity.h index 7b1c5984647c3be232770dc32fc1b112ad8bee94..ec31c2743079da22d1f3dd0c8683adf674aca1e3 100644 --- a/src/kernel_long_gravity.h +++ b/src/kernel_long_gravity.h @@ -19,33 +19,67 @@ #ifndef SWIFT_KERNEL_LONG_GRAVITY_H #define SWIFT_KERNEL_LONG_GRAVITY_H -#include <math.h> +/* Config parameters. */ +#include "../config.h" -/* Includes. */ +/* Local headers. 
*/ +#include "approx_math.h" #include "const.h" #include "inline.h" -#include "vector.h" -#define one_over_sqrt_pi ((float)(M_2_SQRTPI * 0.5)) +/* Standard headers */ +#include <math.h> /** * @brief Computes the long-range correction term for the FFT calculation. * - * @param u The ratio of the distance to the FFT cell scale $u = x/A$. + * @param u The ratio of the distance to the FFT cell scale \f$u = r/r_s\f$. * @param W (return) The value of the kernel function. */ __attribute__((always_inline)) INLINE static void kernel_long_grav_eval( float u, float *const W) { - /* const float arg1 = u * 0.5f; */ - /* const float arg2 = u * one_over_sqrt_pi; */ - /* const float arg3 = -arg1 * arg1; */ +#ifdef GADGET2_LONG_RANGE_CORRECTION + + const float one_over_sqrt_pi = ((float)(M_2_SQRTPI * 0.5)); + + const float arg1 = u * 0.5f; + const float arg2 = u * one_over_sqrt_pi; + const float arg3 = -arg1 * arg1; + + const float term1 = erfcf(arg1); + const float term2 = arg2 * expf(arg3); + + *W = term1 + term2; +#else + + const float arg = 2.f * u; + const float exp_arg = good_approx_expf(arg); + const float term = 1.f / (1.f + exp_arg); - /* const float term1 = erfcf(arg1); */ - /* const float term2 = arg2 * expf(arg3); */ + *W = arg * exp_arg * term * term - exp_arg * term + 1.f; + *W *= 2.f; +#endif +} + +/** + * @brief Returns the long-range truncation of the Poisson potential in Fourier + * space. + * + * @param u2 The square of the Fourier mode times the cell scale + * \f$u^2 = k^2r_s^2\f$. + * @param W (return) The value of the kernel function. 
+ */ +__attribute__((always_inline)) INLINE static void fourier_kernel_long_grav_eval( + double u2, double *const W) { - /* *W = term1 + term2; */ - *W = 1.f; +#ifdef GADGET2_LONG_RANGE_CORRECTION + *W = exp(-u2); +#else + const double u = sqrt(u2); + const double arg = M_PI_2 * u; + *W = arg / sinh(arg); +#endif } #endif // SWIFT_KERNEL_LONG_GRAVITY_H diff --git a/src/multipole.h b/src/multipole.h index 23f5194a30b7316aac15073cba36dc404efa21c1..004757924cccb6bc2f450c19f1ccd600f50e1990 100644 --- a/src/multipole.h +++ b/src/multipole.h @@ -1498,23 +1498,28 @@ INLINE static void gravity_M2M(struct multipole *m_a, * @param pos_a The position of the multipole. * @param props The #gravity_props of this calculation. * @param periodic Is the calculation periodic ? + * @param dim The size of the simulation box. */ INLINE static void gravity_M2L(struct grav_tensor *l_b, const struct multipole *m_a, const double pos_b[3], const double pos_a[3], - const struct gravity_props *props, - int periodic) { + const struct gravity_props *props, int periodic, + const double dim[3]) { /* Recover some constants */ const double eps2 = props->epsilon2; /* Compute distance vector */ - const double dx = - periodic ? box_wrap(pos_b[0] - pos_a[0], 0., 1.) : pos_b[0] - pos_a[0]; - const double dy = - periodic ? box_wrap(pos_b[1] - pos_a[1], 0., 1.) : pos_b[1] - pos_a[1]; - const double dz = - periodic ? box_wrap(pos_b[2] - pos_a[2], 0., 1.) : pos_b[2] - pos_a[2]; + double dx = pos_b[0] - pos_a[0]; + double dy = pos_b[1] - pos_a[1]; + double dz = pos_b[2] - pos_a[2]; + + /* Apply BC */ + if (periodic) { + dx = nearest(dx, dim[0]); + dy = nearest(dy, dim[1]); + dz = nearest(dz, dim[2]); + } /* Compute distance */ const double r2 = dx * dx + dy * dy + dz * dz; @@ -2174,12 +2179,10 @@ INLINE static void gravity_M2L(struct grav_tensor *l_b, * @param lb The #grav_tensor to shift. * @param pos_a The position to which m_b will be shifted. * @param pos_b The current postion of the multipole to shift. 
- * @param periodic Is the calculation periodic ? */ INLINE static void gravity_L2L(struct grav_tensor *la, const struct grav_tensor *lb, - const double pos_a[3], const double pos_b[3], - int periodic) { + const double pos_a[3], const double pos_b[3]) { /* Initialise everything to zero */ gravity_field_tensors_init(la); @@ -2636,31 +2639,50 @@ INLINE static void gravity_L2P(const struct grav_tensor *lb, /** * @brief Checks whether a cell-cell interaction can be appromixated by a M-M - * interaction. + * interaction using the CoM and cell radius at rebuild. + * + * We use the multipole acceptance criterion of Dehnen, 2002, JCoPh, Volume 179, + * Issue 1, pp.27-42, equation 10. * * @param ma The #multipole of the first #cell. * @param mb The #multipole of the second #cell. * @param theta_crit_inv The inverse of the critical opening angle. - * @param rebuild Are we using the current value of CoM or the ones from - * the last rebuild ? + * @param r2 Square of the distance (periodically wrapped) between the + * multipoles. */ -__attribute__((always_inline)) INLINE static int gravity_multipole_accept( - const struct gravity_tensors *ma, const struct gravity_tensors *mb, - double theta_crit_inv, int rebuild) { +__attribute__((always_inline)) INLINE static int +gravity_multipole_accept_rebuild(const struct gravity_tensors *const ma, + const struct gravity_tensors *const mb, + double theta_crit_inv, double r2) { - const double r_crit_a = - (rebuild ? ma->r_max_rebuild : ma->r_max) * theta_crit_inv; - const double r_crit_b = - (rebuild ? mb->r_max_rebuild : mb->r_max) * theta_crit_inv; + const double r_crit_a = ma->r_max_rebuild * theta_crit_inv; + const double r_crit_b = mb->r_max_rebuild * theta_crit_inv; - const double dx = rebuild ? ma->CoM_rebuild[0] - mb->CoM_rebuild[0] - : ma->CoM[0] - mb->CoM[0]; - const double dy = rebuild ? ma->CoM_rebuild[1] - mb->CoM_rebuild[1] - : ma->CoM[1] - mb->CoM[1]; - const double dz = rebuild ? 
ma->CoM_rebuild[2] - mb->CoM_rebuild[2] - : ma->CoM[2] - mb->CoM[2]; + // MATTHIEU: Make this mass-dependent ? - const double r2 = dx * dx + dy * dy + dz * dz; + /* Multipole acceptance criterion (Dehnen 2002, eq.10) */ + return (r2 > (r_crit_a + r_crit_b) * (r_crit_a + r_crit_b)); +} + +/** + * @brief Checks whether a cell-cell interaction can be appromixated by a M-M + * interaction using the CoM and cell radius at the current time. + * + * We use the multipole acceptance criterion of Dehnen, 2002, JCoPh, Volume 179, + * Issue 1, pp.27-42, equation 10. + * + * @param ma The #multipole of the first #cell. + * @param mb The #multipole of the second #cell. + * @param theta_crit_inv The inverse of the critical opening angle. + * @param r2 Square of the distance (periodically wrapped) between the + * multipoles. + */ +__attribute__((always_inline)) INLINE static int gravity_multipole_accept( + const struct gravity_tensors *const ma, + const struct gravity_tensors *const mb, double theta_crit_inv, double r2) { + + const double r_crit_a = ma->r_max * theta_crit_inv; + const double r_crit_b = mb->r_max * theta_crit_inv; // MATTHIEU: Make this mass-dependent ? 
diff --git a/src/parallel_io.c b/src/parallel_io.c index b857fd76a53738b19e5b26b8717881e71c424b6e..65f8fc9c20b1856a9c2f72625fb3bba0c8f7be8e 100644 --- a/src/parallel_io.c +++ b/src/parallel_io.c @@ -667,7 +667,7 @@ void write_output_parallel(struct engine* e, const char* baseName, /* File name */ char fileName[FILENAME_BUFFER_SIZE]; - snprintf(fileName, FILENAME_BUFFER_SIZE, "%s_%03i.hdf5", baseName, + snprintf(fileName, FILENAME_BUFFER_SIZE, "%s_%04i.hdf5", baseName, outputCount); /* First time, we need to create the XMF file */ diff --git a/src/parser.c b/src/parser.c index 41a3e8637630eceb3beb9383acb3344028d38659..0b608b29263342240af68fd99d2fdd3241e2a1e6 100644 --- a/src/parser.c +++ b/src/parser.c @@ -1,6 +1,7 @@ /******************************************************************************* * This file is part of SWIFT. * Copyright (c) 2016 James Willis (james.s.willis@durham.ac.uk) + * 2017 Peter W. Draper (p.w.draper@durham.ac.uk) * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published @@ -89,6 +90,64 @@ void parser_read_file(const char *file_name, struct swift_params *params) { fclose(file); } +/** + * @brief Set or update a parameter using a compressed format. + * + * The compressed format allows a value to be given as a single + * string and has the format "section:parameter:value", with all + * names as would be given in the parameter file. + * + * @param params Structure that holds the parameters. + * @param namevalue the parameter name and value as described. + */ +void parser_set_param(struct swift_params *params, const char *namevalue) { + + /* Get the various parts. */ + char name[PARSER_MAX_LINE_SIZE]; + char value[PARSER_MAX_LINE_SIZE]; + name[0] = '\0'; + value[0] = '\0'; + + /* Name is part until second colon. 
*/ + char *p1 = strchr(namevalue, ':'); + if (p1 != NULL) { + char *p2 = strchr(p1 + 1, ':'); + if (p2 != NULL) { + memcpy(name, namevalue, p2 - namevalue); + name[p2 - namevalue] = '\0'; + + /* Value is rest after second colon. */ + p2++; + strcpy(value, p2); + } + } + + /* Sanity check. */ + if (strlen(name) == 0 || strlen(value) == 0 || strchr(value, ':') != NULL) + error( + "Cannot parse compressed parameter string: '%s', check syntax " + "should be section:parameter:value", + namevalue); + + /* And update or set. */ + int updated = 0; + for (int i = 0; i < params->paramCount; i++) { + if (strcmp(name, params->data[i].name) == 0) { + message("Value of '%s' changed from '%s' to '%s'", params->data[i].name, + params->data[i].value, value); + strcpy(params->data[i].value, value); + updated = 1; + } + } + if (!updated) { + strcpy(params->data[params->paramCount].name, name); + strcpy(params->data[params->paramCount].value, value); + params->paramCount++; + if (params->paramCount == PARSER_MAX_NO_OF_PARAMS) + error("Too many parameters, current maximum is %d.", params->paramCount); + } +} + /** * @brief Counts the number of times a specific character appears in a string. * @@ -238,7 +297,7 @@ static void parse_value(char *line, struct swift_params *params) { /* Check for more than one value on the same line. */ if (count_char(line, PARSER_VALUE_CHAR) > 1) { - error("Inavlid line:%d '%s', only one value allowed per line.", lineNumber, + error("Invalid line:%d '%s', only one value allowed per line.", lineNumber, line); } diff --git a/src/parser.h b/src/parser.h index b78e21194d256ed7b50b8a09718c9725d52a1e0b..bab6d8b25f5334546ac2aaf39a3f25ef7fb6ff57 100644 --- a/src/parser.h +++ b/src/parser.h @@ -1,6 +1,7 @@ /******************************************************************************* * This file is part of SWIFT. * Copyright (c) 2016 James Willis (james.s.willis@durham.ac.uk) + * 2017 Peter W. 
Draper (p.w.draper@durham.ac.uk) * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published @@ -55,6 +56,7 @@ void parser_read_file(const char *file_name, struct swift_params *params); void parser_print_params(const struct swift_params *params); void parser_write_params_to_file(const struct swift_params *params, const char *file_name); +void parser_set_param(struct swift_params *params, const char *desc); char parser_get_param_char(const struct swift_params *params, const char *name); int parser_get_param_int(const struct swift_params *params, const char *name); diff --git a/src/partition.c b/src/partition.c index c57918745c11d2858b40eefc218e2551e635d6fb..f30e5d0ad3c9ce8750a39891b2527729d9ad3b5d 100644 --- a/src/partition.c +++ b/src/partition.c @@ -897,27 +897,7 @@ void partition_initial_partition(struct partition *initial_partition, bzero(weights, sizeof(int) * s->nr_cells); /* Check each particle and accumilate the counts per cell. */ - struct part *parts = s->parts; - int *cdim = s->cdim; - double iwidth[3], dim[3]; - iwidth[0] = s->iwidth[0]; - iwidth[1] = s->iwidth[1]; - iwidth[2] = s->iwidth[2]; - dim[0] = s->dim[0]; - dim[1] = s->dim[1]; - dim[2] = s->dim[2]; - for (size_t k = 0; k < s->nr_parts; k++) { - for (int j = 0; j < 3; j++) { - if (parts[k].x[j] < 0.0) - parts[k].x[j] += dim[j]; - else if (parts[k].x[j] >= dim[j]) - parts[k].x[j] -= dim[j]; - } - const int cid = - cell_getid(cdim, parts[k].x[0] * iwidth[0], - parts[k].x[1] * iwidth[1], parts[k].x[2] * iwidth[2]); - weights[cid]++; - } + accumulate_counts(s, weights); /* Get all the counts from all the nodes. 
*/ if (MPI_Allreduce(MPI_IN_PLACE, weights, s->nr_cells, MPI_INT, MPI_SUM, @@ -1090,6 +1070,10 @@ void partition_init(struct partition *partition, parser_get_opt_param_float(params, "DomainDecomposition:trigger", 0.05f); if (repartition->trigger <= 0) error("Invalid DomainDecomposition:trigger, must be greater than zero"); + if (repartition->trigger < 2 && repartition->trigger >= 1) + error( + "Invalid DomainDecomposition:trigger, must be 2 or greater or less" + " than 1"); /* Fraction of particles that should be updated before a repartition * based on CPU time is considered. */ diff --git a/src/potential/disc_patch/potential.h b/src/potential/disc_patch/potential.h index 8fa40ecd4e6503cde8be00db8c6fb8a70c84ebdf..ab229d009c692db727e8f2341c3c49813f74f2b8 100644 --- a/src/potential/disc_patch/potential.h +++ b/src/potential/disc_patch/potential.h @@ -30,6 +30,7 @@ /* Local includes. */ #include "const.h" #include "error.h" +#include "minmax.h" #include "parser.h" #include "part.h" #include "physical_constants.h" @@ -39,34 +40,63 @@ /** * @brief External Potential Properties - Disc patch case * - * See Creasey, Theuns & Bower, 2013, MNRAS, Volume 429, Issue 3, p.1922-1948 + * See Creasey, Theuns & Bower, 2013, MNRAS, Volume 429, Issue 3, p.1922-1948. + * + * We truncate the accelerations beyond z_trunc using a 1-cos(z) function + * that smoothly brings the accelerations to 0 at z_max. */ struct external_potential { - /*! Surface density of the disc */ - double surface_density; + /*! Surface density of the disc (sigma) */ + float surface_density; + + /*! Disc scale-height (b) */ + float scale_height; + + /*! Inverse of disc scale-height (1/b) */ + float scale_height_inv; + + /*! Position of the disc along the x-axis */ + float x_disc; - /*! Disc scale-height */ - double scale_height; + /*! Position above which the accelerations get truncated */ + float x_trunc; - /*! Position of the disc along the z-axis */ - double z_disc; + /*! 
Position above which the accelerations are zero */ + float x_max; + + /*! The truncated transition regime */ + float x_trans; + + /*! Inverse of the truncated transition regime */ + float x_trans_inv; /*! Dynamical time of the system */ - double dynamical_time; + float dynamical_time; - /*! Time over which to grow the disk in units of the dynamical time */ - double growth_time; + /*! Time over which to grow the disk */ + float growth_time; + + /*! Inverse of the growth time */ + float growth_time_inv; /*! Time-step condition pre-factor */ - double timestep_mult; + float timestep_mult; + + /*! Constant pre-factor (2 pi G sigma) */ + float norm; + + /*! Constant pre-factor (2 pi sigma)*/ + float norm_over_G; }; /** * @brief Computes the time-step from the acceleration due to a hydrostatic * disc. * - * See Creasey, Theuns & Bower, 2013, MNRAS, Volume 429, Issue 3, p.1922-1948 + * See Creasey, Theuns & Bower, 2013, MNRAS, Volume 429, Issue 3, p.1922-1948, + * equations 17 and 20. + * We do not use the truncated potential here. * * @param time The current time. * @param potential The properties of the potential. 
@@ -80,39 +110,41 @@ __attribute__((always_inline)) INLINE static float external_gravity_timestep( /* initilize time step to disc dynamical time */ const float dt_dyn = potential->dynamical_time; - float dt = dt_dyn; + const float b = potential->scale_height; + const float b_inv = potential->scale_height_inv; + const float norm = potential->norm; /* absolute value of height above disc */ - const float dz = fabsf(g->x[2] - potential->z_disc); + const float dx = fabsf(g->x[0] - potential->x_disc); /* vertical acceleration */ - const float z_accel = 2.f * M_PI * phys_const->const_newton_G * - potential->surface_density * - tanhf(dz / potential->scale_height); + const float x_accel = norm * tanhf(dx * b_inv); + + float dt = dt_dyn; /* demand that dt * velocity < fraction of scale height of disc */ - float dt1 = FLT_MAX; - if (g->v_full[2] != 0.f) { - dt1 = potential->scale_height / fabsf(g->v_full[2]); - if (dt1 < dt) dt = dt1; + if (g->v_full[0] != 0.f) { + + const float dt1 = b / fabsf(g->v_full[0]); + dt = min(dt1, dt); } /* demand that dt^2 * acceleration < fraction of scale height of disc */ - float dt2 = FLT_MAX; - if (z_accel != 0.f) { - dt2 = potential->scale_height / fabsf(z_accel); + if (x_accel != 0.f) { + + const float dt2 = b / fabsf(x_accel); if (dt2 < dt * dt) dt = sqrtf(dt2); } /* demand that dt^3 * jerk < fraction of scale height of disc */ - float dt3 = FLT_MAX; - if (g->v_full[2] != 0.f) { - const float dz_accel_over_dt = - 2.f * M_PI * phys_const->const_newton_G * potential->surface_density / - potential->scale_height / coshf(dz / potential->scale_height) / - coshf(dz / potential->scale_height) * fabsf(g->v_full[2]); - - dt3 = potential->scale_height / fabsf(dz_accel_over_dt); + if (g->v_full[0] != 0.f) { + + const float cosh_dx_inv = 1.f / coshf(dx * b_inv); + const float cosh_dx_inv2 = cosh_dx_inv * cosh_dx_inv; + const float dx_accel_over_dt = + norm * cosh_dx_inv2 * b_inv * fabsf(g->v_full[0]); + + const float dt3 = b / fabsf(dx_accel_over_dt); 
if (dt3 < dt * dt * dt) dt = cbrtf(dt3); } @@ -120,11 +152,13 @@ __attribute__((always_inline)) INLINE static float external_gravity_timestep( } /** - * @brief Computes the gravitational acceleration along z due to a hydrostatic + * @brief Computes the gravitational acceleration along x due to a hydrostatic * disc * * See Creasey, Theuns & Bower, 2013, MNRAS, Volume 429, Issue 3, p.1922-1948, * equation 17. + * We truncate the accelerations beyond x_trunc using a 1-cos(x) function + * that smoothly brings the accelerations to 0 at x_max. * * @param time The current time in internal units. * @param potential The properties of the potential. @@ -135,20 +169,40 @@ __attribute__((always_inline)) INLINE static void external_gravity_acceleration( double time, const struct external_potential* restrict potential, const struct phys_const* restrict phys_const, struct gpart* restrict g) { - const float dz = g->x[2] - potential->z_disc; - const float t_dyn = potential->dynamical_time; - - float reduction_factor = 1.f; - if (time < potential->growth_time * t_dyn) - reduction_factor = time / (potential->growth_time * t_dyn); - - /* Accelerations. Note that they are multiplied by G later on */ - const float z_accel = reduction_factor * 2.f * M_PI * - potential->surface_density * - tanhf(fabsf(dz) / potential->scale_height); + const float dx = g->x[0] - potential->x_disc; + const float abs_dx = fabsf(dx); + const float t_growth = potential->growth_time; + const float t_growth_inv = potential->growth_time_inv; + const float b_inv = potential->scale_height_inv; + const float x_trunc = potential->x_trunc; + const float x_max = potential->x_max; + const float x_trans_inv = potential->x_trans_inv; + const float norm_over_G = potential->norm_over_G; + + /* Are we still growing the disc ? */ + const float reduction_factor = time < t_growth ? time * t_growth_inv : 1.f; + + /* Truncated or not ? */ + float a_x; + if (abs_dx < x_trunc) { + + /* Acc. 
2 pi sigma tanh(x/b) */ + a_x = reduction_factor * norm_over_G * tanhf(abs_dx * b_inv); + } else if (abs_dx < x_max) { + + /* Acc. 2 pi sigma tanh(x/b) [1/2 + 1/2cos((x-xmax)/(pi x_trans))] */ + a_x = + reduction_factor * norm_over_G * tanhf(abs_dx * b_inv) * + (0.5f + 0.5f * cosf((float)(M_PI) * (abs_dx - x_trunc) * x_trans_inv)); + } else { + + /* Acc. 0 */ + a_x = 0.f; + } - if (dz > 0) g->a_grav[2] -= z_accel; - if (dz < 0) g->a_grav[2] += z_accel; + /* Get the correct sign. Recall G is multipiled in later on */ + if (dx > 0) g->a_grav[0] -= a_x; + if (dx < 0) g->a_grav[0] += a_x; } /** @@ -156,7 +210,9 @@ __attribute__((always_inline)) INLINE static void external_gravity_acceleration( * disc patch potential. * * See Creasey, Theuns & Bower, 2013, MNRAS, Volume 429, Issue 3, p.1922-1948, - * equation 24. + * equation 22. + * We truncate the accelerations beyond x_trunc using a 1-cos(x) function + * that smoothly brings the accelerations to 0 at x_max. * * @param time The current time. * @param potential The #external_potential used in the run. @@ -168,17 +224,36 @@ external_gravity_get_potential_energy( double time, const struct external_potential* potential, const struct phys_const* const phys_const, const struct gpart* gp) { - const float dz = gp->x[2] - potential->z_disc; - const float t_dyn = potential->dynamical_time; + const float dx = gp->x[0] - potential->x_disc; + const float abs_dx = fabsf(dx); + const float t_growth = potential->growth_time; + const float t_growth_inv = potential->growth_time_inv; + const float b = potential->scale_height; + const float b_inv = potential->scale_height_inv; + const float norm = potential->norm; + const float x_trunc = potential->x_trunc; + const float x_max = potential->x_max; + + /* Are we still growing the disc ? */ + const float reduction_factor = time < t_growth ? time * t_growth_inv : 1.f; + + /* Truncated or not ? 
*/ + float pot; + if (abs_dx < x_trunc) { - float reduction_factor = 1.f; - if (time < potential->growth_time * t_dyn) - reduction_factor = time / (potential->growth_time * t_dyn); + /* Potential (2 pi G sigma b ln(cosh(x/b)) */ + pot = b * logf(coshf(dx * b_inv)); + } else if (abs_dx < x_max) { - /* Accelerations. Note that they are multiplied by G later on */ - return reduction_factor * 2.f * M_PI * phys_const->const_newton_G * - potential->surface_density * potential->scale_height * - logf(coshf(dz / potential->scale_height)); + /* Potential. At x>>b, phi(x) = norm * x / b */ + pot = 0.f; + + } else { + + pot = 0.f; + } + + return pot * reduction_factor * norm; } /** @@ -202,15 +277,49 @@ static INLINE void potential_init_backend( parameter_file, "DiscPatchPotential:surface_density"); potential->scale_height = parser_get_param_double( parameter_file, "DiscPatchPotential:scale_height"); - potential->z_disc = - parser_get_param_double(parameter_file, "DiscPatchPotential:z_disc"); + potential->x_disc = + parser_get_param_double(parameter_file, "DiscPatchPotential:x_disc"); + potential->x_trunc = parser_get_opt_param_double( + parameter_file, "DiscPatchPotential:x_trunc", FLT_MAX); + potential->x_max = parser_get_opt_param_double( + parameter_file, "DiscPatchPotential:x_max", FLT_MAX); + potential->x_disc = + parser_get_param_double(parameter_file, "DiscPatchPotential:x_disc"); potential->timestep_mult = parser_get_param_double( parameter_file, "DiscPatchPotential:timestep_mult"); potential->growth_time = parser_get_opt_param_double( parameter_file, "DiscPatchPotential:growth_time", 0.); + + /* Compute the dynamical time */ potential->dynamical_time = sqrt(potential->scale_height / (phys_const->const_newton_G * potential->surface_density)); + + /* Convert the growth time multiplier to physical time */ + potential->growth_time *= potential->dynamical_time; + + /* Some cross-checks */ + if (potential->x_trunc > potential->x_max) + error("Potential truncation x larger 
than maximal z"); + if (potential->x_trunc < potential->scale_height) + error("Potential truncation x smaller than scale height"); + + /* Compute derived quantities */ + potential->scale_height_inv = 1. / potential->scale_height; + potential->norm = + 2. * M_PI * phys_const->const_newton_G * potential->surface_density; + potential->norm_over_G = 2 * M_PI * potential->surface_density; + potential->x_trans = potential->x_max - potential->x_trunc; + + if (potential->x_trans != 0.f) + potential->x_trans_inv = 1. / potential->x_trans; + else + potential->x_trans_inv = FLT_MAX; + + if (potential->growth_time != 0.) + potential->growth_time_inv = 1. / potential->growth_time; + else + potential->growth_time_inv = FLT_MAX; } /** @@ -222,13 +331,19 @@ static INLINE void potential_print_backend( const struct external_potential* potential) { message( - "External potential is 'Disk-patch' with properties surface_density = %e " - "disc height= %e scale height = %e timestep multiplier = %e.", - potential->surface_density, potential->z_disc, potential->scale_height, + "External potential is 'Disk-patch' with Sigma=%f, x_disc=%f, b=%f and " + "dt_mult=%f.", + potential->surface_density, potential->x_disc, potential->scale_height, potential->timestep_mult); + if (potential->x_max < FLT_MAX) + message("Potential will be truncated at x_trunc=%f and zeroed at x_max=%f", + potential->x_trunc, potential->x_max); + if (potential->growth_time > 0.) - message("Disc will grow for %f dynamical times.", potential->growth_time); + message("Disc will grow for %f [time_units]. 
(%f dynamical time)", + potential->growth_time, + potential->growth_time / potential->dynamical_time); } #endif /* SWIFT_DISC_PATCH_H */ diff --git a/src/queue.h b/src/queue.h index 951a3e5a056d7ad0c3935f98341a0d93c805e3ad..c85cf0cabe30a03d163e2564fdc216c19495761a 100644 --- a/src/queue.h +++ b/src/queue.h @@ -29,7 +29,7 @@ #define queue_sizeinit 100 #define queue_sizegrow 2 #define queue_search_window 8 -#define queue_incoming_size 1024 +#define queue_incoming_size 10240 #define queue_struct_align 64 /* Counters. */ diff --git a/src/runner.c b/src/runner.c index 54039609621945f7c529ef945c05e2ac2fe3f17c..ec08b743452508364a7f1900963aae73061a944d 100644 --- a/src/runner.c +++ b/src/runner.c @@ -316,23 +316,35 @@ void runner_check_sorts(struct cell *c, int flags) { * @param r The #runner. * @param c The #cell. * @param flags Cell flag. + * @param cleanup If true, re-build the sorts for the selected flags instead + * of just adding them. * @param clock Flag indicating whether to record the timing or not, needed * for recursive calls. */ -void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) { +void runner_do_sort(struct runner *r, struct cell *c, int flags, int cleanup, + int clock) { struct entry *finger; struct entry *fingers[8]; struct part *parts = c->parts; struct xpart *xparts = c->xparts; - struct entry *sort; const int count = c->count; float buff[8]; TIMER_TIC; + /* We need to do the local sorts plus whatever was requested further up. */ + flags |= c->do_sort; + if (cleanup) { + c->sorted = 0; + } else { + flags &= ~c->sorted; + } + if (flags == 0 && !c->do_sub_sort) return; + /* Check that the particles have been moved to the current time */ - if (!cell_are_part_drifted(c, r->e)) error("Sorting un-drifted cell"); + if (flags && !cell_are_part_drifted(c, r->e)) + error("Sorting un-drifted cell"); #ifdef SWIFT_DEBUG_CHECKS /* Make sure the sort flags are consistent (downward). 
*/ @@ -343,44 +355,40 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) { finger = finger->parent) { if (finger->sorted & ~c->sorted) error("Inconsistent sort flags (upward)."); } + + /* Update the sort timer which represents the last time the sorts + were re-set. */ + if (c->sorted == 0) c->ti_sort = r->e->ti_current; #endif - /* Clean-up the flags, i.e. filter out what's already been sorted, but - only if the sorts are recent. */ - if (c->ti_sort == r->e->ti_current) { - /* Ignore dimensions that have been sorted in this timestep. */ - // flags &= ~c->sorted; - } else { - /* Clean old (stale) sorts. */ - flags |= c->sorted; - c->sorted = 0; - } - if (flags == 0) return; - - /* start by allocating the entry arrays. */ - if (c->sort == NULL || c->sortsize < count) { - if (c->sort != NULL) free(c->sort); - c->sortsize = count * 1.1; - if ((c->sort = (struct entry *)malloc(sizeof(struct entry) * - (c->sortsize + 1) * 13)) == NULL) - error("Failed to allocate sort memory."); + /* start by allocating the entry arrays in the requested dimensions. */ + for (int j = 0; j < 13; j++) { + if ((flags & (1 << j)) && c->sort[j] == NULL) { + if ((c->sort[j] = (struct entry *)malloc(sizeof(struct entry) * + (count + 1))) == NULL) + error("Failed to allocate sort memory."); + } } - sort = c->sort; /* Does this cell have any progeny? */ if (c->split) { /* Fill in the gaps within the progeny. */ float dx_max_sort = 0.0f; + float dx_max_sort_old = 0.0f; for (int k = 0; k < 8; k++) { if (c->progeny[k] != NULL) { - if (flags & ~c->progeny[k]->sorted || - c->progeny[k]->dx_max_sort > c->dmin * space_maxreldx) - runner_do_sort(r, c->progeny[k], flags, 0); + /* Only propagate cleanup if the progeny is stale. 
*/ + runner_do_sort(r, c->progeny[k], flags, + cleanup && (c->progeny[k]->dx_max_sort > + space_maxreldx * c->progeny[k]->dmin), + 0); dx_max_sort = max(dx_max_sort, c->progeny[k]->dx_max_sort); + dx_max_sort_old = max(dx_max_sort_old, c->progeny[k]->dx_max_sort_old); } } c->dx_max_sort = dx_max_sort; + c->dx_max_sort_old = dx_max_sort_old; /* Loop over the 13 different sort arrays. */ for (int j = 0; j < 13; j++) { @@ -402,7 +410,7 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) { for (int k = 0; k < 8; k++) { inds[k] = k; if (c->progeny[k] != NULL && c->progeny[k]->count > 0) { - fingers[k] = &c->progeny[k]->sort[j * (c->progeny[k]->count + 1)]; + fingers[k] = c->progeny[k]->sort[j]; buff[k] = fingers[k]->d; off[k] = off[k]; } else @@ -419,7 +427,7 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) { } /* For each entry in the new sort list. */ - finger = &sort[j * (count + 1)]; + finger = c->sort[j]; for (int ind = 0; ind < count; ind++) { /* Copy the minimum into the new sort array. */ @@ -440,11 +448,11 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) { } /* Merge. */ /* Add a sentinel. */ - sort[j * (count + 1) + count].d = FLT_MAX; - sort[j * (count + 1) + count].i = 0; + c->sort[j][count].d = FLT_MAX; + c->sort[j][count].i = 0; /* Mark as sorted. */ - c->sorted |= (1 << j); + atomic_or(&c->sorted, 1 << j); } /* loop over sort arrays. */ @@ -453,13 +461,23 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) { /* Otherwise, just sort. 
*/ else { - /* Reset the sort distance if we are in a local cell */ - if (xparts != NULL) { - for (int k = 0; k < count; k++) { - xparts[k].x_diff_sort[0] = 0.0f; - xparts[k].x_diff_sort[1] = 0.0f; - xparts[k].x_diff_sort[2] = 0.0f; + /* Reset the sort distance */ + if (c->sorted == 0) { +#ifdef SWIFT_DEBUG_CHECKS + if (xparts != NULL && c->nodeID != engine_rank) + error("Have non-NULL xparts in foreign cell"); +#endif + + /* And the individual sort distances if we are a local cell */ + if (xparts != NULL) { + for (int k = 0; k < count; k++) { + xparts[k].x_diff_sort[0] = 0.0f; + xparts[k].x_diff_sort[1] = 0.0f; + xparts[k].x_diff_sort[2] = 0.0f; + } } + c->dx_max_sort_old = 0.f; + c->dx_max_sort = 0.f; } /* Fill the sort array. */ @@ -467,40 +485,28 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) { const double px[3] = {parts[k].x[0], parts[k].x[1], parts[k].x[2]}; for (int j = 0; j < 13; j++) if (flags & (1 << j)) { - sort[j * (count + 1) + k].i = k; - sort[j * (count + 1) + k].d = px[0] * runner_shift[j][0] + - px[1] * runner_shift[j][1] + - px[2] * runner_shift[j][2]; + c->sort[j][k].i = k; + c->sort[j][k].d = px[0] * runner_shift[j][0] + + px[1] * runner_shift[j][1] + + px[2] * runner_shift[j][2]; } } /* Add the sentinel and sort. */ for (int j = 0; j < 13; j++) if (flags & (1 << j)) { - sort[j * (count + 1) + count].d = FLT_MAX; - sort[j * (count + 1) + count].i = 0; - runner_do_sort_ascending(&sort[j * (count + 1)], count); - c->sorted |= (1 << j); + c->sort[j][count].d = FLT_MAX; + c->sort[j][count].i = 0; + runner_do_sort_ascending(c->sort[j], count); + atomic_or(&c->sorted, 1 << j); } - - /* Finally, clear the dx_max_sort field of this cell. */ - c->dx_max_sort = 0.f; - - /* If this was not just an update, invalidate the sorts above this one. */ - if (c->ti_sort < r->e->ti_current) - for (struct cell *finger = c->parent; finger != NULL; - finger = finger->parent) - finger->sorted = 0; } - /* Update the sort timer. 
*/ - c->ti_sort = r->e->ti_current; - #ifdef SWIFT_DEBUG_CHECKS /* Verify the sorting. */ for (int j = 0; j < 13; j++) { if (!(flags & (1 << j))) continue; - finger = &sort[j * (count + 1)]; + finger = c->sort[j]; for (int k = 1; k < count; k++) { if (finger[k].d < finger[k - 1].d) error("Sorting failed, ascending array."); @@ -518,6 +524,11 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) { } #endif + /* Clear the cell's sort flags. */ + c->do_sort = 0; + c->do_sub_sort = 0; + c->requires_sorts = 0; + if (clock) TIMER_TOC(timer_dosort); } @@ -621,11 +632,9 @@ void runner_do_ghost(struct runner *r, struct cell *c, int timer) { const struct engine *e = r->e; const struct space *s = e->s; const float hydro_h_max = e->hydro_properties->h_max; - const float target_wcount = e->hydro_properties->target_neighbours; - const float max_wcount = - target_wcount + e->hydro_properties->delta_neighbours; - const float min_wcount = - target_wcount - e->hydro_properties->delta_neighbours; + const float eps = e->hydro_properties->h_tolerance; + const float hydro_eta_dim = + pow_dimension(e->hydro_properties->eta_neighbours); const int max_smoothing_iter = e->hydro_properties->max_smoothing_iterations; int redo = 0, count = 0; @@ -669,28 +678,47 @@ void runner_do_ghost(struct runner *r, struct cell *c, int timer) { if (!part_is_active(p, e)) error("Ghost applied to inactive particle"); #endif - /* Finish the density calculation */ - hydro_end_density(p); + /* Get some useful values */ + const float h_old = p->h; + const float h_old_dim = pow_dimension(h_old); + const float h_old_dim_minus_one = pow_dimension_minus_one(h_old); + float h_new; - /* Did we get the right number of neighbours? 
*/ - if (p->density.wcount > max_wcount || p->density.wcount < min_wcount) { + if (p->density.wcount == 0.f) { /* No neighbours case */ - float h_corr = 0.f; + /* Double h and try again */ + h_new = 2.f * h_old; + } else { - /* If no derivative, double the smoothing length. */ - if (p->density.wcount_dh == 0.0f) h_corr = p->h; + /* Finish the density calculation */ + hydro_end_density(p); - /* Otherwise, compute the smoothing length update (Newton step). */ - else { - h_corr = (target_wcount - p->density.wcount) / p->density.wcount_dh; + /* Compute one step of the Newton-Raphson scheme */ + const float n_sum = p->density.wcount * h_old_dim; + const float n_target = hydro_eta_dim; + const float f = n_sum - n_target; + const float f_prime = + p->density.wcount_dh * h_old_dim + + hydro_dimension * p->density.wcount * h_old_dim_minus_one; - /* Truncate to the range [ -p->h/2 , p->h ]. */ - h_corr = (h_corr < p->h) ? h_corr : p->h; - h_corr = (h_corr > -0.5f * p->h) ? h_corr : -0.5f * p->h; - } + h_new = h_old - f / f_prime; + +#ifdef SWIFT_DEBUG_CHECKS + if ((f > 0.f && h_new > h_old) || (f < 0.f && h_new < h_old)) + error( + "Smoothing length correction not going in the right direction"); +#endif + + /* Safety check: truncate to the range [ h_old/2 , 2h_old ]. */ + h_new = min(h_new, 2.f * h_old); + h_new = max(h_new, 0.5f * h_old); + } + + /* Check whether the particle has an inappropriate smoothing length */ + if (fabsf(h_new - h_old) > eps * h_old) { /* Ok, correct then */ - p->h += h_corr; + p->h = h_new; /* If below the absolute maximum, try again */ if (p->h < hydro_h_max) { @@ -708,6 +736,10 @@ void runner_do_ghost(struct runner *r, struct cell *c, int timer) { /* Ok, this particle is a lost cause... 
*/ p->h = hydro_h_max; + + /* Do some damage control if no neighbours at all were found */ + if (p->density.wcount == kernel_root * kernel_norm) + hydro_part_has_no_neighbours(p, xp); } } @@ -738,6 +770,11 @@ void runner_do_ghost(struct runner *r, struct cell *c, int timer) { /* Run through this cell's density interactions. */ for (struct link *l = finger->density; l != NULL; l = l->next) { +#ifdef SWIFT_DEBUG_CHECKS + if (l->t->ti_run < r->e->ti_current) + error("Density task should have been run."); +#endif + /* Self-interaction? */ if (l->t->type == task_type_self) runner_doself_subset_density(r, finger, parts, pid, count); @@ -782,7 +819,7 @@ void runner_do_ghost(struct runner *r, struct cell *c, int timer) { } #else if (count) - message("Smoothing length failed to converge on %i particles.", count); + error("Smoothing length failed to converge on %i particles.", count); #endif /* Be clean */ @@ -850,7 +887,7 @@ void runner_do_drift_part(struct runner *r, struct cell *c, int timer) { TIMER_TIC; - cell_drift_part(c, r->e); + cell_drift_part(c, r->e, 0); if (timer) TIMER_TOC(timer_drift_part); } @@ -1492,6 +1529,10 @@ void runner_do_recv_part(struct runner *r, struct cell *c, int clear_sorts, timebin_t time_bin_max = 0; float h_max = 0.f; +#ifdef SWIFT_DEBUG_CHECKS + if (c->nodeID == engine_rank) error("Updating a local cell!"); +#endif + /* Clear this cell's sorted mask. 
*/ if (clear_sorts) c->sorted = 0; @@ -1504,11 +1545,6 @@ void runner_do_recv_part(struct runner *r, struct cell *c, int clear_sorts, time_bin_min = min(time_bin_min, parts[k].time_bin); time_bin_max = max(time_bin_max, parts[k].time_bin); h_max = max(h_max, parts[k].h); - -#ifdef SWIFT_DEBUG_CHECKS - if (parts[k].ti_drift != ti_current) - error("Received un-drifted particle !"); -#endif } /* Convert into a time */ @@ -1571,6 +1607,10 @@ void runner_do_recv_gpart(struct runner *r, struct cell *c, int timer) { timebin_t time_bin_min = num_time_bins; timebin_t time_bin_max = 0; +#ifdef SWIFT_DEBUG_CHECKS + if (c->nodeID == engine_rank) error("Updating a local cell!"); +#endif + /* If this cell is a leaf, collect the particle data. */ if (!c->split) { @@ -1644,6 +1684,10 @@ void runner_do_recv_spart(struct runner *r, struct cell *c, int timer) { timebin_t time_bin_min = num_time_bins; timebin_t time_bin_max = 0; +#ifdef SWIFT_DEBUG_CHECKS + if (c->nodeID == engine_rank) error("Updating a local cell!"); +#endif + /* If this cell is a leaf, collect the particle data. */ if (!c->split) { @@ -1710,7 +1754,7 @@ void *runner_main(void *data) { while (1) { /* Wait at the barrier. */ - engine_barrier(e, r->id); + engine_barrier(e); /* Re-set the pointer to the previous task, as there is none. 
*/ struct task *t = NULL; @@ -1735,9 +1779,19 @@ void *runner_main(void *data) { struct cell *ci = t->ci; struct cell *cj = t->cj; -/* Mark the thread we run on */ #ifdef SWIFT_DEBUG_TASKS + /* Mark the thread we run on */ t->rid = r->cpuid; + + /* And recover the pair direction */ + if (t->type == task_type_pair || t->type == task_type_sub_pair) { + struct cell *ci_temp = ci; + struct cell *cj_temp = cj; + double shift[3]; + t->sid = space_getsid(e->s, &ci_temp, &cj_temp, shift); + } else { + t->sid = -1; + } #endif /* Check that we haven't scheduled an inactive task */ @@ -1764,7 +1818,7 @@ void *runner_main(void *data) { /* Special case for sorts */ if (!cell_is_active(ci, e) && t->type == task_type_sort && - t->flags == 0) + !(ci->do_sort || ci->do_sub_sort)) error( "Task (type='%s/%s') should have been skipped ti_current=%lld " "c->ti_end_min=%lld t->flags=%d", @@ -1822,16 +1876,11 @@ void *runner_main(void *data) { break; case task_type_pair: - if (t->subtype == task_subtype_density) { -#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) - runner_dopair1_density_vec(r, ci, cj); -#else - runner_dopair1_density(r, ci, cj); -#endif - } + if (t->subtype == task_subtype_density) + runner_dopair1_branch_density(r, ci, cj); #ifdef EXTRA_HYDRO_LOOP else if (t->subtype == task_subtype_gradient) - runner_dopair1_gradient(r, ci, cj); + runner_dopair1_branch_gradient(r, ci, cj); #endif else if (t->subtype == task_subtype_force) runner_dopair2_force(r, ci, cj); @@ -1874,7 +1923,11 @@ void *runner_main(void *data) { break; case task_type_sort: - runner_do_sort(r, ci, t->flags, 1); + /* Cleanup only if any of the indices went stale. */ + runner_do_sort(r, ci, t->flags, + ci->dx_max_sort_old > space_maxreldx * ci->dmin, 1); + /* Reset the sort flags as our work here is done. 
*/ + t->flags = 0; break; case task_type_init_grav: runner_do_init_grav(r, ci, 1); @@ -1917,9 +1970,9 @@ void *runner_main(void *data) { } else if (t->subtype == task_subtype_xv) { runner_do_recv_part(r, ci, 1, 1); } else if (t->subtype == task_subtype_rho) { - runner_do_recv_part(r, ci, 1, 1); + runner_do_recv_part(r, ci, 0, 1); } else if (t->subtype == task_subtype_gradient) { - runner_do_recv_part(r, ci, 1, 1); + runner_do_recv_part(r, ci, 0, 1); } else if (t->subtype == task_subtype_gpart) { runner_do_recv_gpart(r, ci, 1); } else if (t->subtype == task_subtype_spart) { diff --git a/src/runner.h b/src/runner.h index 0c6edc3c0c1406855ac79c96617bbdaa310bb46d..e33a3e380e6097a67258d116d617483caca35086 100644 --- a/src/runner.h +++ b/src/runner.h @@ -28,6 +28,7 @@ /* Includes. */ #include "cache.h" +#include "gravity_cache.h" struct cell; struct engine; @@ -49,7 +50,14 @@ struct runner { /*! The engine owing this runner. */ struct engine *e; + /*! The particle gravity_cache of cell ci. */ + struct gravity_cache ci_gravity_cache; + + /*! The particle gravity_cache of cell cj. */ + struct gravity_cache cj_gravity_cache; + #ifdef WITH_VECTORIZATION + /*! The particle cache of cell ci. */ struct cache ci_cache; @@ -61,7 +69,8 @@ struct runner { /* Function prototypes. 
*/ void runner_do_ghost(struct runner *r, struct cell *c, int timer); void runner_do_extra_ghost(struct runner *r, struct cell *c, int timer); -void runner_do_sort(struct runner *r, struct cell *c, int flag, int clock); +void runner_do_sort(struct runner *r, struct cell *c, int flag, int cleanup, + int clock); void runner_do_drift_part(struct runner *r, struct cell *c, int timer); void runner_do_drift_gpart(struct runner *r, struct cell *c, int timer); void runner_do_kick1(struct runner *r, struct cell *c, int timer); diff --git a/src/runner_doiact.h b/src/runner_doiact.h index 9e6adb9e267f0ee48d28cde937f280b51ca372dc..c07d70f3e48bb6f1c9e7e343a50cdbba71da0785 100644 --- a/src/runner_doiact.h +++ b/src/runner_doiact.h @@ -634,15 +634,13 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, const int flipped = runner_flip[sid]; sid = sortlistID[sid]; - /* Have the cells been sorted? */ + /* Has the cell cj been sorted? */ if (!(cj->sorted & (1 << sid)) || - cj->dx_max_sort > space_maxreldx * cj->dmin) { - DOPAIR_SUBSET_NAIVE(r, ci, parts_i, ind, count, cj); - return; - } + cj->dx_max_sort_old > space_maxreldx * cj->dmin) + error("Interacting unsorted cells."); /* Pick-out the sorted lists. */ - const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)]; + const struct entry *restrict sort_j = cj->sort[sid]; const float dxj = cj->dx_max_sort; /* Parts are on the left? */ @@ -884,8 +882,11 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci, * @param r The #runner. * @param ci The first #cell. * @param cj The second #cell. + * @param sid The direction of the pair + * @param shift The shift vector to apply to the particles in ci. 
*/ -void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) { +void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid, + const double *shift) { const struct engine *restrict e = r->e; @@ -900,29 +901,13 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) { TIMER_TIC; - /* Anything to do here? */ - if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; - - if (!cell_are_part_drifted(ci, e) || !cell_are_part_drifted(cj, e)) - error("Interacting undrifted cells."); - - /* Get the sort ID. */ - double shift[3] = {0.0, 0.0, 0.0}; - const int sid = space_getsid(e->s, &ci, &cj, shift); - - /* Have the cells been sorted? */ - if (!(ci->sorted & (1 << sid)) || ci->dx_max_sort > space_maxreldx * ci->dmin) - runner_do_sort(r, ci, (1 << sid), 1); - if (!(cj->sorted & (1 << sid)) || cj->dx_max_sort > space_maxreldx * cj->dmin) - runner_do_sort(r, cj, (1 << sid), 1); - /* Get the cutoff shift. */ double rshift = 0.0; for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k]; /* Pick-out the sorted lists. */ - const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)]; - const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)]; + const struct entry *restrict sort_i = ci->sort[sid]; + const struct entry *restrict sort_j = cj->sort[sid]; #ifdef SWIFT_DEBUG_CHECKS /* Check that the dx_max_sort values in the cell are indeed an upper @@ -933,8 +918,13 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) { p->x[1] * runner_shift[sid][1] + p->x[2] * runner_shift[sid][2]; if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort > - 1.0e-6 * max(fabsf(d), ci->dx_max_sort)) - error("particle shift diff exceeds dx_max_sort."); + 1.0e-4 * max(fabsf(d), ci->dx_max_sort_old)) + error( + "particle shift diff exceeds dx_max_sort in cell ci. 
ci->nodeID=%d " + "cj->nodeID=%d d=%e sort_i[pid].d=%e ci->dx_max_sort=%e " + "ci->dx_max_sort_old=%e", + ci->nodeID, cj->nodeID, d, sort_i[pid].d, ci->dx_max_sort, + ci->dx_max_sort_old); } for (int pjd = 0; pjd < cj->count; pjd++) { const struct part *p = &cj->parts[sort_j[pjd].i]; @@ -942,8 +932,13 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) { p->x[1] * runner_shift[sid][1] + p->x[2] * runner_shift[sid][2]; if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort > - 1.0e-6 * max(fabsf(d), cj->dx_max_sort)) - error("particle shift diff exceeds dx_max_sort."); + 1.0e-4 * max(fabsf(d), cj->dx_max_sort_old)) + error( + "particle shift diff exceeds dx_max_sort in cell cj. cj->nodeID=%d " + "ci->nodeID=%d d=%e sort_j[pjd].d=%e cj->dx_max_sort=%e " + "cj->dx_max_sort_old=%e", + cj->nodeID, ci->nodeID, d, sort_j[pjd].d, cj->dx_max_sort, + cj->dx_max_sort_old); } #endif /* SWIFT_DEBUG_CHECKS */ @@ -1042,9 +1037,9 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) { struct part *restrict pj = &parts_j[sort_j[pjd].i]; if (!part_is_active(pj, e)) continue; const float hj = pj->h; - const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; - if (dj > di_max) continue; - + const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max + rshift; + if (dj - rshift > di_max) continue; + double pjx[3]; for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k]; const float hjg2 = hj * hj * kernel_gamma2; @@ -1116,6 +1111,49 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) { TIMER_TOC(TIMER_DOPAIR); } +/** + * @brief Determine which version of DOPAIR1 needs to be called depending on the + * orientation of the cells or whether DOPAIR1 needs to be called at all. + * + * @param r #runner + * @param ci #cell ci + * @param cj #cell cj + * + */ +void DOPAIR1_BRANCH(struct runner *r, struct cell *ci, struct cell *cj) { + + const struct engine *restrict e = r->e; + + /* Anything to do here? 
*/ + if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; + + /* Check that cells are drifted. */ + if (!cell_are_part_drifted(ci, e) || !cell_are_part_drifted(cj, e)) + error("Interacting undrifted cells."); + + /* Get the sort ID. */ + double shift[3] = {0.0, 0.0, 0.0}; + const int sid = space_getsid(e->s, &ci, &cj, shift); + + /* Have the cells been sorted? */ + if (!(ci->sorted & (1 << sid)) || + ci->dx_max_sort_old > space_maxreldx * ci->dmin) + error("Interacting unsorted cells."); + if (!(cj->sorted & (1 << sid)) || + cj->dx_max_sort_old > space_maxreldx * cj->dmin) + error("Interacting unsorted cells."); + +#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) && \ + (DOPAIR1_BRANCH == runner_dopair1_density_branch) + if (!sort_is_corner(sid)) + runner_dopair1_density_vec(r, ci, cj, sid, shift); + else + DOPAIR1(r, ci, cj, sid, shift); +#else + DOPAIR1(r, ci, cj, sid, shift); +#endif +} + /** * @brief Compute the interactions between a cell pair (symmetric) * @@ -1155,18 +1193,20 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { const int sid = space_getsid(e->s, &ci, &cj, shift); /* Have the cells been sorted? */ - if (!(ci->sorted & (1 << sid)) || ci->dx_max_sort > space_maxreldx * ci->dmin) - runner_do_sort(r, ci, (1 << sid), 1); - if (!(cj->sorted & (1 << sid)) || cj->dx_max_sort > space_maxreldx * cj->dmin) - runner_do_sort(r, cj, (1 << sid), 1); + if (!(ci->sorted & (1 << sid)) || + ci->dx_max_sort_old > space_maxreldx * ci->dmin) + error("Interacting unsorted cells."); + if (!(cj->sorted & (1 << sid)) || + cj->dx_max_sort_old > space_maxreldx * cj->dmin) + error("Interacting unsorted cells."); /* Get the cutoff shift. */ double rshift = 0.0; for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k]; /* Pick-out the sorted lists. 
*/ - struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)]; - struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)]; + struct entry *restrict sort_i = ci->sort[sid]; + struct entry *restrict sort_j = cj->sort[sid]; #ifdef SWIFT_DEBUG_CHECKS /* Check that the dx_max_sort values in the cell are indeed an upper @@ -1177,8 +1217,13 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { p->x[1] * runner_shift[sid][1] + p->x[2] * runner_shift[sid][2]; if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort > - 1.0e-6 * max(fabsf(d), ci->dx_max_sort)) - error("particle shift diff exceeds dx_max_sort."); + 1.0e-4 * max(fabsf(d), ci->dx_max_sort_old)) + error( + "particle shift diff exceeds dx_max_sort in cell ci. ci->nodeID=%d " + "cj->nodeID=%d d=%e sort_i[pid].d=%e ci->dx_max_sort=%e " + "ci->dx_max_sort_old=%e", + ci->nodeID, cj->nodeID, d, sort_i[pid].d, ci->dx_max_sort, + ci->dx_max_sort_old); } for (int pjd = 0; pjd < cj->count; pjd++) { const struct part *p = &cj->parts[sort_j[pjd].i]; @@ -1186,8 +1231,13 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { p->x[1] * runner_shift[sid][1] + p->x[2] * runner_shift[sid][2]; if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort > - 1.0e-6 * max(fabsf(d), cj->dx_max_sort)) - error("particle shift diff exceeds dx_max_sort."); + 1.0e-4 * max(fabsf(d), cj->dx_max_sort_old)) + error( + "particle shift diff exceeds dx_max_sort in cell cj. cj->nodeID=%d " + "ci->nodeID=%d d=%e sort_j[pjd].d=%e cj->dx_max_sort=%e " + "cj->dx_max_sort_old=%e", + cj->nodeID, ci->nodeID, d, sort_j[pjd].d, cj->dx_max_sort, + cj->dx_max_sort_old); } #endif /* SWIFT_DEBUG_CHECKS */ @@ -1399,9 +1449,9 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { /* Get a hold of the jth part in cj. 
*/ struct part *restrict pj = &parts_j[sort_j[pjd].i]; const float hj = pj->h; - const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; - if (dj > di_max) continue; - + const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max + rshift; + if (dj - rshift > di_max) continue; + double pjx[3]; for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k]; const float hjg2 = hj * hj * kernel_gamma2; @@ -2063,19 +2113,12 @@ void DOSUB_PAIR1(struct runner *r, struct cell *ci, struct cell *cj, int sid, if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; if (ci->count == 0 || cj->count == 0) return; - /* Get the cell dimensions. */ - const float h = min(ci->width[0], min(ci->width[1], ci->width[2])); - /* Get the type of pair if not specified explicitly. */ - // if ( sid < 0 ) double shift[3]; sid = space_getsid(s, &ci, &cj, shift); /* Recurse? */ - if (ci->split && cj->split && - max(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max_sort + - cj->dx_max_sort < - h / 2) { + if (cell_can_recurse_in_pair_task(ci) && cell_can_recurse_in_pair_task(cj)) { /* Different types of flags. */ switch (sid) { @@ -2279,24 +2322,19 @@ void DOSUB_PAIR1(struct runner *r, struct cell *ci, struct cell *cj, int sid, else if (cell_is_active(ci, e) || cell_is_active(cj, e)) { /* Make sure both cells are drifted to the current timestep. */ - if (!cell_are_part_drifted(ci, e)) cell_drift_part(ci, e); - if (!cell_are_part_drifted(cj, e)) cell_drift_part(cj, e); + if (!cell_are_part_drifted(ci, e) || !cell_are_part_drifted(cj, e)) + error("Interacting undrifted cells."); /* Do any of the cells need to be sorted first? 
*/ if (!(ci->sorted & (1 << sid)) || - ci->dx_max_sort > ci->dmin * space_maxreldx) - runner_do_sort(r, ci, (1 << sid), 1); + ci->dx_max_sort_old > ci->dmin * space_maxreldx) + error("Interacting unsorted cell."); if (!(cj->sorted & (1 << sid)) || - cj->dx_max_sort > cj->dmin * space_maxreldx) - runner_do_sort(r, cj, (1 << sid), 1); + cj->dx_max_sort_old > cj->dmin * space_maxreldx) + error("Interacting unsorted cell."); -/* Compute the interactions. */ -#if (DOPAIR1 == runner_dopair1_density) && defined(WITH_VECTORIZATION) && \ - defined(GADGET2_SPH) - runner_dopair1_density_vec(r, ci, cj); -#else - DOPAIR1(r, ci, cj); -#endif + /* Compute the interactions. */ + DOPAIR1_BRANCH(r, ci, cj); } if (gettimer) TIMER_TOC(TIMER_DOSUB_PAIR); @@ -2317,7 +2355,7 @@ void DOSUB_SELF1(struct runner *r, struct cell *ci, int gettimer) { if (ci->count == 0 || !cell_is_active(ci, r->e)) return; /* Recurse? */ - if (ci->split) { + if (cell_can_recurse_in_self_task(ci)) { /* Loop over all progeny. */ for (int k = 0; k < 8; k++) @@ -2333,7 +2371,7 @@ void DOSUB_SELF1(struct runner *r, struct cell *ci, int gettimer) { else { /* Drift the cell to the current timestep if needed. */ - if (!cell_are_part_drifted(ci, r->e)) cell_drift_part(ci, r->e); + if (!cell_are_part_drifted(ci, r->e)) error("Interacting undrifted cell."); #if (DOSELF1 == runner_doself1_density) && defined(WITH_VECTORIZATION) && \ defined(GADGET2_SPH) @@ -2370,19 +2408,12 @@ void DOSUB_PAIR2(struct runner *r, struct cell *ci, struct cell *cj, int sid, if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; if (ci->count == 0 || cj->count == 0) return; - /* Get the cell dimensions. */ - const float h = min(ci->width[0], min(ci->width[1], ci->width[2])); - /* Get the type of pair if not specified explicitly. */ - // if ( sid < 0 ) double shift[3]; sid = space_getsid(s, &ci, &cj, shift); /* Recurse? 
*/ - if (ci->split && cj->split && - max(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max_sort + - cj->dx_max_sort < - h / 2) { + if (cell_can_recurse_in_pair_task(ci) && cell_can_recurse_in_pair_task(cj)) { /* Different types of flags. */ switch (sid) { @@ -2586,16 +2617,16 @@ void DOSUB_PAIR2(struct runner *r, struct cell *ci, struct cell *cj, int sid, else if (cell_is_active(ci, e) || cell_is_active(cj, e)) { /* Make sure both cells are drifted to the current timestep. */ - if (!cell_are_part_drifted(ci, e)) cell_drift_part(ci, e); - if (!cell_are_part_drifted(cj, e)) cell_drift_part(cj, e); + if (!cell_are_part_drifted(ci, e) || !cell_are_part_drifted(cj, e)) + error("Interacting undrifted cells."); /* Do any of the cells need to be sorted first? */ if (!(ci->sorted & (1 << sid)) || - ci->dx_max_sort > ci->dmin * space_maxreldx) - runner_do_sort(r, ci, (1 << sid), 1); + ci->dx_max_sort_old > ci->dmin * space_maxreldx) + error("Interacting unsorted cells."); if (!(cj->sorted & (1 << sid)) || - cj->dx_max_sort > cj->dmin * space_maxreldx) - runner_do_sort(r, cj, (1 << sid), 1); + cj->dx_max_sort_old > cj->dmin * space_maxreldx) + error("Interacting unsorted cells."); /* Compute the interactions. */ DOPAIR2(r, ci, cj); @@ -2619,7 +2650,7 @@ void DOSUB_SELF2(struct runner *r, struct cell *ci, int gettimer) { if (ci->count == 0 || !cell_is_active(ci, r->e)) return; /* Recurse? */ - if (ci->split) { + if (cell_can_recurse_in_self_task(ci)) { /* Loop over all progeny. */ for (int k = 0; k < 8; k++) @@ -2652,22 +2683,29 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts, TIMER_TIC; + /* Should we even bother? */ + if (!cell_is_active(ci, e) && (cj == NULL || !cell_is_active(cj, e))) return; + if (ci->count == 0 || (cj != NULL && cj->count == 0)) return; + /* Find out in which sub-cell of ci the parts are. 
*/ struct cell *sub = NULL; - for (int k = 0; k < 8; k++) - if (ci->progeny[k] != NULL) { - if (&parts[ind[0]] >= &ci->progeny[k]->parts[0] && - &parts[ind[0]] < &ci->progeny[k]->parts[ci->progeny[k]->count]) { - sub = ci->progeny[k]; - break; + if (ci->split) { + for (int k = 0; k < 8; k++) { + if (ci->progeny[k] != NULL) { + if (&parts[ind[0]] >= &ci->progeny[k]->parts[0] && + &parts[ind[0]] < &ci->progeny[k]->parts[ci->progeny[k]->count]) { + sub = ci->progeny[k]; + break; + } } } + } /* Is this a single cell? */ if (cj == NULL) { /* Recurse? */ - if (ci->split) { + if (cell_can_recurse_in_self_task(ci)) { /* Loop over all progeny. */ DOSUB_SUBSET(r, sub, parts, ind, count, NULL, -1, 0); @@ -2686,14 +2724,9 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts, /* Otherwise, it's a pair interaction. */ else { - /* Get the cell dimensions. */ - const float h = min(ci->width[0], min(ci->width[1], ci->width[2])); - /* Recurse? */ - if (ci->split && cj->split && - max(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max_sort + - cj->dx_max_sort < - h / 2) { + if (cell_can_recurse_in_pair_task(ci) && + cell_can_recurse_in_pair_task(cj)) { /* Get the type of pair if not specified explicitly. */ double shift[3] = {0.0, 0.0, 0.0}; @@ -3204,26 +3237,8 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts, /* Otherwise, compute the pair directly. */ else if (cell_is_active(ci, e) || cell_is_active(cj, e)) { - /* Get the relative distance between the pairs, wrapping. */ - double shift[3] = {0.0, 0.0, 0.0}; - for (int k = 0; k < 3; k++) { - if (cj->loc[k] - ci->loc[k] < -s->dim[k] / 2) - shift[k] = s->dim[k]; - else if (cj->loc[k] - ci->loc[k] > s->dim[k] / 2) - shift[k] = -s->dim[k]; - } - - /* Get the sorting index. */ - int new_sid = 0; - for (int k = 0; k < 3; k++) - new_sid = 3 * new_sid + - ((cj->loc[k] - ci->loc[k] + shift[k] < 0) - ? 0 - : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 
2 : 1); - new_sid = sortlistID[new_sid]; - /* Do any of the cells need to be drifted first? */ - if (!cell_are_part_drifted(cj, e)) cell_drift_part(cj, e); + if (!cell_are_part_drifted(cj, e)) error("Cell should be drifted!"); DOPAIR_SUBSET(r, ci, parts, ind, count, cj); } diff --git a/src/runner_doiact_fft.c b/src/runner_doiact_fft.c index a3e3f38fba920c0c58d600bb25feda88d4a3cf84..26b59f9f6b864445df9190c6041ee684c456ba22 100644 --- a/src/runner_doiact_fft.c +++ b/src/runner_doiact_fft.c @@ -20,9 +20,6 @@ /* Config parameters. */ #include "../config.h" -/* Some standard headers. */ -#include <pthread.h> - #ifdef HAVE_FFTW #include <fftw3.h> #endif @@ -33,6 +30,7 @@ /* Local includes. */ #include "engine.h" #include "error.h" +#include "kernel_long_gravity.h" #include "runner.h" #include "space.h" #include "timers.h" @@ -179,11 +177,12 @@ void runner_do_grav_fft(struct runner* r, int timer) { // error("Top-level multipole %d not drifted", i); /* Allocates some memory for the density mesh */ - double* restrict rho = fftw_alloc_real(N * N * N); + double* restrict rho = fftw_malloc(sizeof(double) * N * N * N); if (rho == NULL) error("Error allocating memory for density mesh"); /* Allocates some memory for the mesh in Fourier space */ - fftw_complex* restrict frho = fftw_alloc_complex(N * N * (N_half + 1)); + fftw_complex* restrict frho = + fftw_malloc(sizeof(fftw_complex) * N * N * (N_half + 1)); if (frho == NULL) error("Error allocating memory for transform of density mesh"); @@ -241,7 +240,9 @@ void runner_do_grav_fft(struct runner* r, int timer) { if (k2 == 0.) 
continue; /* Green function */ - const double green_cor = green_fac * exp(-k2 * a_smooth2) / k2; + double W; + fourier_kernel_long_grav_eval(k2 * a_smooth2, &W); + const double green_cor = green_fac * W / k2; /* Deconvolution of CIC */ const double CIC_cor = sinc_kx_inv * sinc_ky_inv * sinc_kz_inv; diff --git a/src/runner_doiact_grav.h b/src/runner_doiact_grav.h index a66cc5e0c9ed241aba3bb1b4329016b8e505e280..01ea6a073211a08430e77721f4c2e60ef7adfd04 100644 --- a/src/runner_doiact_grav.h +++ b/src/runner_doiact_grav.h @@ -36,8 +36,10 @@ */ void runner_do_grav_down(struct runner *r, struct cell *c, int timer) { + /* Some constants */ const struct engine *e = r->e; - const int periodic = e->s->periodic; + + /* Cell properties */ struct gpart *gparts = c->gparts; const int gcount = c->gcount; @@ -52,7 +54,6 @@ void runner_do_grav_down(struct runner *r, struct cell *c, int timer) { /* Add the field-tensor to all the 8 progenitors */ for (int k = 0; k < 8; ++k) { struct cell *cp = c->progeny[k]; - struct grav_tensor temp; /* Do we have a progenitor with any active g-particles ? 
*/ if (cp != NULL && cell_is_active(cp, e)) { @@ -61,13 +62,14 @@ void runner_do_grav_down(struct runner *r, struct cell *c, int timer) { if (cp->ti_old_multipole != e->ti_current) error("cp->multipole not drifted."); #endif + struct grav_tensor shifted_tensor; /* Shift the field tensor */ - gravity_L2L(&temp, &c->multipole->pot, cp->multipole->CoM, - c->multipole->CoM, 0 * periodic); + gravity_L2L(&shifted_tensor, &c->multipole->pot, cp->multipole->CoM, + c->multipole->CoM); /* Add it to this level's tensor */ - gravity_field_tensors_add(&cp->multipole->pot, &temp); + gravity_field_tensors_add(&cp->multipole->pot, &shifted_tensor); /* Recurse */ runner_do_grav_down(r, cp, 0); @@ -91,6 +93,7 @@ void runner_do_grav_down(struct runner *r, struct cell *c, int timer) { error("gpart not drifted to current time"); #endif + /* Apply the kernel */ gravity_L2P(&c->multipole->pot, c->multipole->CoM, gp); } } @@ -110,10 +113,12 @@ void runner_do_grav_down(struct runner *r, struct cell *c, int timer) { void runner_dopair_grav_mm(const struct runner *r, struct cell *restrict ci, struct cell *restrict cj) { + /* Some constants */ const struct engine *e = r->e; + const struct space *s = e->s; + const int periodic = s->periodic; + const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]}; const struct gravity_props *props = e->gravity_properties; - const int periodic = e->s->periodic; - const struct multipole *multi_j = &cj->multipole->m_pole; // const float a_smooth = e->gravity_properties->a_smooth; // const float rlr_inv = 1. / (a_smooth * ci->super->width[0]); @@ -122,6 +127,9 @@ void runner_dopair_grav_mm(const struct runner *r, struct cell *restrict ci, /* Anything to do here? 
*/ if (!cell_is_active(ci, e)) return; + /* Short-cut to the multipole */ + const struct multipole *multi_j = &cj->multipole->m_pole; + #ifdef SWIFT_DEBUG_CHECKS if (ci == cj) error("Interacting a cell with itself using M2L"); @@ -136,202 +144,1133 @@ void runner_dopair_grav_mm(const struct runner *r, struct cell *restrict ci, /* Let's interact at this level */ gravity_M2L(&ci->multipole->pot, multi_j, ci->multipole->CoM, - cj->multipole->CoM, props, periodic * 0); + cj->multipole->CoM, props, periodic, dim); TIMER_TOC(timer_dopair_grav_mm); } /** - * @brief Computes the interaction of all the particles in a cell with the - * multipole of another cell. + * @brief Computes the interaction of all the particles in a cell with all the + * particles of another cell using the full Newtonian potential * * @param r The #runner. - * @param ci The #cell with particles to interct. - * @param cj The #cell with the multipole. + * @param ci The first #cell. + * @param cj The other #cell. + * @param shift The distance vector (periodically wrapped) between the cell + * centres. 
*/ -void runner_dopair_grav_pm(const struct runner *r, - const struct cell *restrict ci, - const struct cell *restrict cj) { +void runner_dopair_grav_pp_full(struct runner *r, struct cell *ci, + struct cell *cj, double shift[3]) { + + /* Some constants */ + const struct engine *const e = r->e; + struct gravity_cache *const ci_cache = &r->ci_gravity_cache; + struct gravity_cache *const cj_cache = &r->cj_gravity_cache; + + /* Cell properties */ + const int gcount_i = ci->gcount; + const int gcount_j = cj->gcount; + struct gpart *restrict gparts_i = ci->gparts; + struct gpart *restrict gparts_j = cj->gparts; + const int ci_active = cell_is_active(ci, e); + const int cj_active = cell_is_active(cj, e); + const double loc_i[3] = {ci->loc[0], ci->loc[1], ci->loc[2]}; + const double loc_j[3] = {cj->loc[0], cj->loc[1], cj->loc[2]}; + const double loc_mean[3] = {0.5 * (loc_i[0] + loc_j[0]), + 0.5 * (loc_i[1] + loc_j[1]), + 0.5 * (loc_i[2] + loc_j[2])}; + + /* Anything to do here ?*/ + if (!ci_active && !cj_active) return; + + /* Check that we fit in cache */ + if (gcount_i > ci_cache->count || gcount_j > cj_cache->count) + error("Not enough space in the caches! gcount_i=%d gcount_j=%d", gcount_i, + gcount_j); + + /* Computed the padded counts */ + const int gcount_padded_i = gcount_i - (gcount_i % VEC_SIZE) + VEC_SIZE; + const int gcount_padded_j = gcount_j - (gcount_j % VEC_SIZE) + VEC_SIZE; + + /* Fill the caches */ + gravity_cache_populate(ci_cache, gparts_i, gcount_i, gcount_padded_i, + loc_mean); + gravity_cache_populate(cj_cache, gparts_j, gcount_j, gcount_padded_j, + loc_mean); + + /* Ok... Here we go ! */ + + if (ci_active) { + + /* Loop over all particles in ci... 
*/ + for (int pid = 0; pid < gcount_i; pid++) { + + /* Skip inactive particles */ + if (!gpart_is_active(&gparts_i[pid], e)) continue; + + const float x_i = ci_cache->x[pid]; + const float y_i = ci_cache->y[pid]; + const float z_i = ci_cache->z[pid]; + + /* Some powers of the softening length */ + const float h_i = ci_cache->epsilon[pid]; + const float h2_i = h_i * h_i; + const float h_inv_i = 1.f / h_i; + const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i; + + /* Local accumulators for the acceleration */ + float a_x = 0.f, a_y = 0.f, a_z = 0.f; + + /* Make the compiler understand we are in happy vectorization land */ + swift_align_information(cj_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(cj_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(cj_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_align_information(cj_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_assume_size(gcount_padded_j, VEC_SIZE); + + /* Loop over every particle in the other cell. */ + for (int pjd = 0; pjd < gcount_padded_j; pjd++) { + + /* Get info about j */ + const float x_j = cj_cache->x[pjd]; + const float y_j = cj_cache->y[pjd]; + const float z_j = cj_cache->z[pjd]; + const float mass_j = cj_cache->m[pjd]; + + /* Compute the pairwise (square) distance. 
*/ + const float dx = x_i - x_j; + const float dy = y_i - y_j; + const float dz = z_i - z_j; + const float r2 = dx * dx + dy * dy + dz * dz; + +#ifdef SWIFT_DEBUG_CHECKS + if (r2 == 0.f) error("Interacting particles with 0 distance"); + + /* Check that particles have been drifted to the current time */ + if (gparts_i[pid].ti_drift != e->ti_current) + error("gpi not drifted to current time"); + if (pjd < gcount_j && gparts_j[pjd].ti_drift != e->ti_current) + error("gpj not drifted to current time"); +#endif + + /* Get the inverse distance */ + const float r_inv = 1.f / sqrtf(r2); + + float f_ij, W_ij; + + if (r2 >= h2_i) { + + /* Get Newtonian gravity */ + f_ij = mass_j * r_inv * r_inv * r_inv; + + } else { + + const float r = r2 * r_inv; + const float ui = r * h_inv_i; + + kernel_grav_eval(ui, &W_ij); + + /* Get softened gravity */ + f_ij = mass_j * h_inv3_i * W_ij; + } + + /* Store it back */ + a_x -= f_ij * dx; + a_y -= f_ij * dy; + a_z -= f_ij * dz; + +#ifdef SWIFT_DEBUG_CHECKS + /* Update the interaction counter if it's not a padded gpart */ + if (pjd < gcount_j) gparts_i[pid].num_interacted++; +#endif + } + + /* Store everything back in cache */ + ci_cache->a_x[pid] = a_x; + ci_cache->a_y[pid] = a_y; + ci_cache->a_z[pid] = a_z; + } + } + + /* Now do the opposite loop */ + if (cj_active) { + + /* Loop over all particles in ci... 
*/ + for (int pjd = 0; pjd < gcount_j; pjd++) { + + /* Skip inactive particles */ + if (!gpart_is_active(&gparts_j[pjd], e)) continue; + + const float x_j = cj_cache->x[pjd]; + const float y_j = cj_cache->y[pjd]; + const float z_j = cj_cache->z[pjd]; + + /* Some powers of the softening length */ + const float h_j = cj_cache->epsilon[pjd]; + const float h2_j = h_j * h_j; + const float h_inv_j = 1.f / h_j; + const float h_inv3_j = h_inv_j * h_inv_j * h_inv_j; + + /* Local accumulators for the acceleration */ + float a_x = 0.f, a_y = 0.f, a_z = 0.f; + + /* Make the compiler understand we are in happy vectorization land */ + swift_align_information(ci_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_assume_size(gcount_padded_i, VEC_SIZE); + + /* Loop over every particle in the other cell. */ + for (int pid = 0; pid < gcount_padded_i; pid++) { + + /* Get info about j */ + const float x_i = ci_cache->x[pid]; + const float y_i = ci_cache->y[pid]; + const float z_i = ci_cache->z[pid]; + const float mass_i = ci_cache->m[pid]; + + /* Compute the pairwise (square) distance. 
*/ + const float dx = x_j - x_i; + const float dy = y_j - y_i; + const float dz = z_j - z_i; + const float r2 = dx * dx + dy * dy + dz * dz; + +#ifdef SWIFT_DEBUG_CHECKS + if (r2 == 0.f) error("Interacting particles with 0 distance"); + + /* Check that particles have been drifted to the current time */ + if (gparts_j[pjd].ti_drift != e->ti_current) + error("gpj not drifted to current time"); + if (pid < gcount_i && gparts_i[pid].ti_drift != e->ti_current) + error("gpi not drifted to current time"); +#endif + + /* Get the inverse distance */ + const float r_inv = 1.f / sqrtf(r2); + + float f_ji, W_ji; + + if (r2 >= h2_j) { + + /* Get Newtonian gravity */ + f_ji = mass_i * r_inv * r_inv * r_inv; + + } else { + + const float r = r2 * r_inv; + const float uj = r * h_inv_j; + + kernel_grav_eval(uj, &W_ji); + + /* Get softened gravity */ + f_ji = mass_i * h_inv3_j * W_ji; + } + + /* Store it back */ + a_x -= f_ji * dx; + a_y -= f_ji * dy; + a_z -= f_ji * dz; + +#ifdef SWIFT_DEBUG_CHECKS + /* Update the interaction counter if it's not a padded gpart */ + if (pid < gcount_i) gparts_j[pjd].num_interacted++; +#endif + } + + /* Store everything back in cache */ + cj_cache->a_x[pjd] = a_x; + cj_cache->a_y[pjd] = a_y; + cj_cache->a_z[pjd] = a_z; + } + } + + /* Write back to the particles */ + if (ci_active) gravity_cache_write_back(ci_cache, gparts_i, gcount_i); + if (cj_active) gravity_cache_write_back(cj_cache, gparts_j, gcount_j); + +#ifdef MATTHIEU_OLD_STUFF + + /* Some constants */ + const struct engine *const e = r->e; - error("Function should not be called"); + /* Cell properties */ + const int gcount_i = ci->gcount; + const int gcount_j = cj->gcount; + struct gpart *restrict gparts_i = ci->gparts; + struct gpart *restrict gparts_j = cj->gparts; + + /* MATTHIEU: Should we use local DP accumulators ? */ + + /* Loop over all particles in ci... */ + if (cell_is_active(ci, e)) { + for (int pid = 0; pid < gcount_i; pid++) { + + /* Get a hold of the ith part in ci. 
*/ + struct gpart *restrict gpi = &gparts_i[pid]; + + if (!gpart_is_active(gpi, e)) continue; + + /* Apply boundary condition */ + const double pix[3] = {gpi->x[0] - shift[0], gpi->x[1] - shift[1], + gpi->x[2] - shift[2]}; + + /* Loop over every particle in the other cell. */ + for (int pjd = 0; pjd < gcount_j; pjd++) { + + /* Get a hold of the jth part in cj. */ + const struct gpart *restrict gpj = &gparts_j[pjd]; + + /* Compute the pairwise distance. */ + const float dx[3] = {pix[0] - gpj->x[0], // x + pix[1] - gpj->x[1], // y + pix[2] - gpj->x[2]}; // z + const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]; + +#ifdef SWIFT_DEBUG_CHECKS + /* Check that particles have been drifted to the current time */ + if (gpi->ti_drift != e->ti_current) + error("gpi not drifted to current time"); + if (gpj->ti_drift != e->ti_current) + error("gpj not drifted to current time"); +#endif + + /* Interact ! */ + runner_iact_grav_pp_nonsym(r2, dx, gpi, gpj); + +#ifdef SWIFT_DEBUG_CHECKS + gpi->num_interacted++; +#endif + } + } + } + + /* Loop over all particles in cj... */ + if (cell_is_active(cj, e)) { + for (int pjd = 0; pjd < gcount_j; pjd++) { + + /* Get a hold of the ith part in ci. */ + struct gpart *restrict gpj = &gparts_j[pjd]; + + if (!gpart_is_active(gpj, e)) continue; + + /* Apply boundary condition */ + const double pjx[3] = {gpj->x[0] + shift[0], gpj->x[1] + shift[1], + gpj->x[2] + shift[2]}; + + /* Loop over every particle in the other cell. */ + for (int pid = 0; pid < gcount_i; pid++) { + + /* Get a hold of the ith part in ci. */ + const struct gpart *restrict gpi = &gparts_i[pid]; + + /* Compute the pairwise distance. 
*/ + const float dx[3] = {pjx[0] - gpi->x[0], // x + pjx[1] - gpi->x[1], // y + pjx[2] - gpi->x[2]}; // z + const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]; + +#ifdef SWIFT_DEBUG_CHECKS + /* Check that particles have been drifted to the current time */ + if (gpi->ti_drift != e->ti_current) + error("gpi not drifted to current time"); + if (gpj->ti_drift != e->ti_current) + error("gpj not drifted to current time"); +#endif + + /* Interact ! */ + runner_iact_grav_pp_nonsym(r2, dx, gpj, gpi); + +#ifdef SWIFT_DEBUG_CHECKS + gpj->num_interacted++; +#endif + } + } + } +#endif } /** * @brief Computes the interaction of all the particles in a cell with all the - * particles of another cell. + * particles of another cell using the truncated Newtonian potential + * + * @param r The #runner. + * @param ci The first #cell. + * @param cj The other #cell. + * @param shift The distance vector (periodically wrapped) between the cell + * centres. + */ +void runner_dopair_grav_pp_truncated(struct runner *r, struct cell *ci, + struct cell *cj, double shift[3]) { + + /* Some constants */ + const struct engine *const e = r->e; + const struct space *s = e->s; + const double cell_width = s->width[0]; + const double a_smooth = e->gravity_properties->a_smooth; + const double rlr = cell_width * a_smooth; + const float rlr_inv = 1. 
/ rlr; + + /* Caches to play with */ + struct gravity_cache *const ci_cache = &r->ci_gravity_cache; + struct gravity_cache *const cj_cache = &r->cj_gravity_cache; + + /* Cell properties */ + const int gcount_i = ci->gcount; + const int gcount_j = cj->gcount; + struct gpart *restrict gparts_i = ci->gparts; + struct gpart *restrict gparts_j = cj->gparts; + const int ci_active = cell_is_active(ci, e); + const int cj_active = cell_is_active(cj, e); + const double loc_i[3] = {ci->loc[0], ci->loc[1], ci->loc[2]}; + const double loc_j[3] = {cj->loc[0], cj->loc[1], cj->loc[2]}; + const double loc_mean[3] = {0.5 * (loc_i[0] + loc_j[0]), + 0.5 * (loc_i[1] + loc_j[1]), + 0.5 * (loc_i[2] + loc_j[2])}; + + /* Anything to do here ?*/ + if (!ci_active && !cj_active) return; + + /* Check that we fit in cache */ + if (gcount_i > ci_cache->count || gcount_j > cj_cache->count) + error("Not enough space in the caches! gcount_i=%d gcount_j=%d", gcount_i, + gcount_j); + + /* Computed the padded counts */ + const int gcount_padded_i = gcount_i - (gcount_i % VEC_SIZE) + VEC_SIZE; + const int gcount_padded_j = gcount_j - (gcount_j % VEC_SIZE) + VEC_SIZE; + + /* Fill the caches */ + gravity_cache_populate(ci_cache, gparts_i, gcount_i, gcount_padded_i, + loc_mean); + gravity_cache_populate(cj_cache, gparts_j, gcount_j, gcount_padded_j, + loc_mean); + + /* Ok... Here we go ! */ + + if (ci_active) { + + /* Loop over all particles in ci... 
*/ + for (int pid = 0; pid < gcount_i; pid++) { + + /* Skip inactive particles */ + if (!gpart_is_active(&gparts_i[pid], e)) continue; + + const float x_i = ci_cache->x[pid]; + const float y_i = ci_cache->y[pid]; + const float z_i = ci_cache->z[pid]; + + /* Some powers of the softening length */ + const float h_i = ci_cache->epsilon[pid]; + const float h2_i = h_i * h_i; + const float h_inv_i = 1.f / h_i; + const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i; + + /* Local accumulators for the acceleration */ + float a_x = 0.f, a_y = 0.f, a_z = 0.f; + + /* Make the compiler understand we are in happy vectorization land */ + swift_align_information(cj_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(cj_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(cj_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_align_information(cj_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_assume_size(gcount_padded_j, VEC_SIZE); + + /* Loop over every particle in the other cell. */ + for (int pjd = 0; pjd < gcount_padded_j; pjd++) { + + /* Get info about j */ + const float x_j = cj_cache->x[pjd]; + const float y_j = cj_cache->y[pjd]; + const float z_j = cj_cache->z[pjd]; + const float mass_j = cj_cache->m[pjd]; + + /* Compute the pairwise (square) distance. 
*/ + const float dx = x_i - x_j; + const float dy = y_i - y_j; + const float dz = z_i - z_j; + const float r2 = dx * dx + dy * dy + dz * dz; + +#ifdef SWIFT_DEBUG_CHECKS + if (r2 == 0.f) error("Interacting particles with 0 distance"); + + /* Check that particles have been drifted to the current time */ + if (gparts_i[pid].ti_drift != e->ti_current) + error("gpi not drifted to current time"); + if (pjd < gcount_j && gparts_j[pjd].ti_drift != e->ti_current) + error("gpj not drifted to current time"); +#endif + + /* Get the inverse distance */ + const float r_inv = 1.f / sqrtf(r2); + const float r = r2 * r_inv; + + float f_ij, W_ij, corr_lr; + + if (r2 >= h2_i) { + + /* Get Newtonian gravity */ + f_ij = mass_j * r_inv * r_inv * r_inv; + + } else { + + const float ui = r * h_inv_i; + + kernel_grav_eval(ui, &W_ij); + + /* Get softened gravity */ + f_ij = mass_j * h_inv3_i * W_ij; + } + + /* Get long-range correction */ + const float u_lr = r * rlr_inv; + kernel_long_grav_eval(u_lr, &corr_lr); + f_ij *= corr_lr; + + /* Store it back */ + a_x -= f_ij * dx; + a_y -= f_ij * dy; + a_z -= f_ij * dz; + +#ifdef SWIFT_DEBUG_CHECKS + /* Update the interaction counter if it's not a padded gpart */ + if (pjd < gcount_j) gparts_i[pid].num_interacted++; +#endif + } + + /* Store everything back in cache */ + ci_cache->a_x[pid] = a_x; + ci_cache->a_y[pid] = a_y; + ci_cache->a_z[pid] = a_z; + } + } + + /* Now do the opposite loop */ + if (cj_active) { + + /* Loop over all particles in ci... 
*/ + for (int pjd = 0; pjd < gcount_j; pjd++) { + + /* Skip inactive particles */ + if (!gpart_is_active(&gparts_j[pjd], e)) continue; + + const float x_j = cj_cache->x[pjd]; + const float y_j = cj_cache->y[pjd]; + const float z_j = cj_cache->z[pjd]; + + /* Some powers of the softening length */ + const float h_j = cj_cache->epsilon[pjd]; + const float h2_j = h_j * h_j; + const float h_inv_j = 1.f / h_j; + const float h_inv3_j = h_inv_j * h_inv_j * h_inv_j; + + /* Local accumulators for the acceleration */ + float a_x = 0.f, a_y = 0.f, a_z = 0.f; + + /* Make the compiler understand we are in happy vectorization land */ + swift_align_information(ci_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_assume_size(gcount_padded_i, VEC_SIZE); + + /* Loop over every particle in the other cell. */ + for (int pid = 0; pid < gcount_padded_i; pid++) { + + /* Get info about j */ + const float x_i = ci_cache->x[pid]; + const float y_i = ci_cache->y[pid]; + const float z_i = ci_cache->z[pid]; + const float mass_i = ci_cache->m[pid]; + + /* Compute the pairwise (square) distance. 
*/ + const float dx = x_j - x_i; + const float dy = y_j - y_i; + const float dz = z_j - z_i; + const float r2 = dx * dx + dy * dy + dz * dz; + +#ifdef SWIFT_DEBUG_CHECKS + if (r2 == 0.f) error("Interacting particles with 0 distance"); + + /* Check that particles have been drifted to the current time */ + if (gparts_j[pjd].ti_drift != e->ti_current) + error("gpj not drifted to current time"); + if (pid < gcount_i && gparts_i[pid].ti_drift != e->ti_current) + error("gpi not drifted to current time"); +#endif + + /* Get the inverse distance */ + const float r_inv = 1.f / sqrtf(r2); + const float r = r2 * r_inv; + + float f_ji, W_ji, corr_lr; + + if (r2 >= h2_j) { + + /* Get Newtonian gravity */ + f_ji = mass_i * r_inv * r_inv * r_inv; + + } else { + + const float uj = r * h_inv_j; + + kernel_grav_eval(uj, &W_ji); + + /* Get softened gravity */ + f_ji = mass_i * h_inv3_j * W_ji; + } + + /* Get long-range correction */ + const float u_lr = r * rlr_inv; + kernel_long_grav_eval(u_lr, &corr_lr); + f_ji *= corr_lr; + + /* Store it back */ + a_x -= f_ji * dx; + a_y -= f_ji * dy; + a_z -= f_ji * dz; + +#ifdef SWIFT_DEBUG_CHECKS + /* Update the interaction counter if it's not a padded gpart */ + if (pid < gcount_i) gparts_j[pjd].num_interacted++; +#endif + } + + /* Store everything back in cache */ + cj_cache->a_x[pjd] = a_x; + cj_cache->a_y[pjd] = a_y; + cj_cache->a_z[pjd] = a_z; + } + } + + /* Write back to the particles */ + if (ci_active) gravity_cache_write_back(ci_cache, gparts_i, gcount_i); + if (cj_active) gravity_cache_write_back(cj_cache, gparts_j, gcount_j); + +#ifdef MATTHIEU_OLD_STUFF + /* Some constants */ + const struct engine *const e = r->e; + const struct space *s = e->s; + const double cell_width = s->width[0]; + const double a_smooth = e->gravity_properties->a_smooth; + const double rlr = cell_width * a_smooth; + const float rlr_inv = 1. 
/ rlr; + + /* Cell properties */ + const int gcount_i = ci->gcount; + const int gcount_j = cj->gcount; + struct gpart *restrict gparts_i = ci->gparts; + struct gpart *restrict gparts_j = cj->gparts; + + /* MATTHIEU: Should we use local DP accumulators ? */ + + /* Loop over all particles in ci... */ + if (cell_is_active(ci, e)) { + for (int pid = 0; pid < gcount_i; pid++) { + + /* Get a hold of the ith part in ci. */ + struct gpart *restrict gpi = &gparts_i[pid]; + + if (!gpart_is_active(gpi, e)) continue; + + /* Apply boundary condition */ + const double pix[3] = {gpi->x[0] - shift[0], gpi->x[1] - shift[1], + gpi->x[2] - shift[2]}; + + /* Loop over every particle in the other cell. */ + for (int pjd = 0; pjd < gcount_j; pjd++) { + + /* Get a hold of the jth part in cj. */ + const struct gpart *restrict gpj = &gparts_j[pjd]; + + /* Compute the pairwise distance. */ + const float dx[3] = {pix[0] - gpj->x[0], // x + pix[1] - gpj->x[1], // y + pix[2] - gpj->x[2]}; // z + const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]; + +#ifdef SWIFT_DEBUG_CHECKS + /* Check that particles have been drifted to the current time */ + if (gpi->ti_drift != e->ti_current) + error("gpi not drifted to current time"); + if (gpj->ti_drift != e->ti_current) + error("gpj not drifted to current time"); +#endif + + /* Interact ! */ + runner_iact_grav_pp_truncated_nonsym(r2, dx, gpi, gpj, rlr_inv); + +#ifdef SWIFT_DEBUG_CHECKS + gpi->num_interacted++; +#endif + } + } + } + + /* Loop over all particles in cj... */ + if (cell_is_active(cj, e)) { + for (int pjd = 0; pjd < gcount_j; pjd++) { + + /* Get a hold of the ith part in ci. */ + struct gpart *restrict gpj = &gparts_j[pjd]; + + if (!gpart_is_active(gpj, e)) continue; + + /* Apply boundary condition */ + const double pjx[3] = {gpj->x[0] + shift[0], gpj->x[1] + shift[1], + gpj->x[2] + shift[2]}; + + /* Loop over every particle in the other cell. */ + for (int pid = 0; pid < gcount_i; pid++) { + + /* Get a hold of the ith part in ci. 
*/ + const struct gpart *restrict gpi = &gparts_i[pid]; + + /* Compute the pairwise distance. */ + const float dx[3] = {pjx[0] - gpi->x[0], // x + pjx[1] - gpi->x[1], // y + pjx[2] - gpi->x[2]}; // z + const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]; + +#ifdef SWIFT_DEBUG_CHECKS + /* Check that particles have been drifted to the current time */ + if (gpi->ti_drift != e->ti_current) + error("gpi not drifted to current time"); + if (gpj->ti_drift != e->ti_current) + error("gpj not drifted to current time"); +#endif + + /* Interact ! */ + runner_iact_grav_pp_truncated_nonsym(r2, dx, gpj, gpi, rlr_inv); + +#ifdef SWIFT_DEBUG_CHECKS + gpj->num_interacted++; +#endif + } + } + } + +#endif +} + +/** + * @brief Computes the interaction of all the particles in a cell with all the + * particles of another cell (switching function between full and truncated). * * @param r The #runner. * @param ci The first #cell. * @param cj The other #cell. + */ +void runner_dopair_grav_pp(struct runner *r, struct cell *ci, struct cell *cj) { + + /* Some properties of the space */ + const struct engine *e = r->e; + const struct space *s = e->s; + const int periodic = s->periodic; + const double cell_width = s->width[0]; + const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]}; + const double a_smooth = e->gravity_properties->a_smooth; + const double r_cut_min = e->gravity_properties->r_cut_min; + const double min_trunc = cell_width * r_cut_min * a_smooth; + double shift[3] = {0.0, 0.0, 0.0}; + + TIMER_TIC; + + /* Anything to do here? */ + if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; + + /* Let's start by drifting things */ + if (!cell_are_gpart_drifted(ci, e)) cell_drift_gpart(ci, e); + if (!cell_are_gpart_drifted(cj, e)) cell_drift_gpart(cj, e); + + /* Can we use the Newtonian version or do we need the truncated one ? */ + if (!periodic) { + runner_dopair_grav_pp_full(r, ci, cj, shift); + } else { + + /* Get the relative distance between the pairs, wrapping. 
*/ + shift[0] = nearest(cj->loc[0] - ci->loc[0], dim[0]); + shift[1] = nearest(cj->loc[1] - ci->loc[1], dim[1]); + shift[2] = nearest(cj->loc[2] - ci->loc[2], dim[2]); + const double r2 = + shift[0] * shift[0] + shift[1] * shift[1] + shift[2] * shift[2]; + + /* Get the maximal distance between any two particles */ + const double max_r = sqrt(r2) + ci->multipole->r_max + cj->multipole->r_max; + + /* Do we need to use the truncated interactions ? */ + if (max_r > min_trunc) + runner_dopair_grav_pp_truncated(r, ci, cj, shift); + else + runner_dopair_grav_pp_full(r, ci, cj, shift); + } + + TIMER_TOC(timer_dopair_grav_pp); +} + +/** + * @brief Computes the interaction of all the particles in a cell using the + * full Newtonian potential. + * + * @param r The #runner. + * @param c The #cell. * * @todo Use a local cache for the particles. */ -void runner_dopair_grav_pp(struct runner *r, struct cell *ci, struct cell *cj) { +void runner_doself_grav_pp_full(struct runner *r, struct cell *c) { + + /* Some constants */ + const struct engine *const e = r->e; + struct gravity_cache *const ci_cache = &r->ci_gravity_cache; + + /* Cell properties */ + const int gcount = c->gcount; + struct gpart *restrict gparts = c->gparts; + const int c_active = cell_is_active(c, e); + const double loc[3] = {c->loc[0] + 0.5 * c->width[0], + c->loc[1] + 0.5 * c->width[1], + c->loc[2] + 0.5 * c->width[2]}; + + /* Anything to do here ?*/ + if (!c_active) return; + + /* Check that we fit in cache */ + if (gcount > ci_cache->count) + error("Not enough space in the cache! gcount=%d", gcount); + + /* Computed the padded counts */ + const int gcount_padded = gcount - (gcount % VEC_SIZE) + VEC_SIZE; + + gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc); + + /* Ok... Here we go ! */ + + /* Loop over all particles in ci... 
*/ + for (int pid = 0; pid < gcount; pid++) { - const struct engine *e = r->e; - const int gcount_i = ci->gcount; - const int gcount_j = cj->gcount; - struct gpart *restrict gparts_i = ci->gparts; - struct gpart *restrict gparts_j = cj->gparts; - const float a_smooth = e->gravity_properties->a_smooth; - const float rlr_inv = 1. / (a_smooth * ci->super->width[0]); + /* Skip inactive particles */ + if (!gpart_is_active(&gparts[pid], e)) continue; - TIMER_TIC; + const float x_i = ci_cache->x[pid]; + const float y_i = ci_cache->y[pid]; + const float z_i = ci_cache->z[pid]; - /* Anything to do here? */ - if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; + /* Some powers of the softening length */ + const float h_i = ci_cache->epsilon[pid]; + const float h2_i = h_i * h_i; + const float h_inv_i = 1.f / h_i; + const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i; - /* Let's start by drifting things */ - if (!cell_are_gpart_drifted(ci, e)) cell_drift_gpart(ci, e); - if (!cell_are_gpart_drifted(cj, e)) cell_drift_gpart(cj, e); + /* Local accumulators for the acceleration */ + float a_x = 0.f, a_y = 0.f, a_z = 0.f; -#if ICHECK > 0 - for (int pid = 0; pid < gcount_i; pid++) { + /* Make the compiler understand we are in happy vectorization land */ + swift_align_information(ci_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_assume_size(gcount_padded, VEC_SIZE); - /* Get a hold of the ith part in ci. */ - struct gpart *restrict gp = &gparts_i[pid]; + /* Loop over every other particle in the cell. 
*/ + for (int pjd = 0; pjd < gcount_padded; pjd++) { - if (gp->id_or_neg_offset == ICHECK) - message("id=%lld loc=[ %f %f %f ] size= %f count= %d", - gp->id_or_neg_offset, cj->loc[0], cj->loc[1], cj->loc[2], - cj->width[0], cj->gcount); - } + /* No self interaction */ + if (pid == pjd) continue; - for (int pid = 0; pid < gcount_j; pid++) { + /* Get info about j */ + const float x_j = ci_cache->x[pjd]; + const float y_j = ci_cache->y[pjd]; + const float z_j = ci_cache->z[pjd]; + const float mass_j = ci_cache->m[pjd]; - /* Get a hold of the ith part in ci. */ - struct gpart *restrict gp = &gparts_j[pid]; + /* Compute the pairwise (square) distance. */ + const float dx = x_i - x_j; + const float dy = y_i - y_j; + const float dz = z_i - z_j; + const float r2 = dx * dx + dy * dy + dz * dz; - if (gp->id_or_neg_offset == ICHECK) - message("id=%lld loc=[ %f %f %f ] size= %f count=%d", - gp->id_or_neg_offset, ci->loc[0], ci->loc[1], ci->loc[2], - ci->width[0], ci->gcount); - } +#ifdef SWIFT_DEBUG_CHECKS + if (r2 == 0.f) error("Interacting particles with 0 distance"); + + /* Check that particles have been drifted to the current time */ + if (gparts[pid].ti_drift != e->ti_current) + error("gpi not drifted to current time"); + if (pjd < gcount && gparts[pjd].ti_drift != e->ti_current) + error("gpj not drifted to current time"); #endif - /* MATTHIEU: Should we use local DP accumulators ? */ + /* Get the inverse distance */ + const float r_inv = 1.f / sqrtf(r2); - /* Loop over all particles in ci... */ - if (cell_is_active(ci, e)) { - for (int pid = 0; pid < gcount_i; pid++) { + float f_ij, W_ij; - /* Get a hold of the ith part in ci. */ - struct gpart *restrict gpi = &gparts_i[pid]; + if (r2 >= h2_i) { - if (!gpart_is_active(gpi, e)) continue; + /* Get Newtonian gravity */ + f_ij = mass_j * r_inv * r_inv * r_inv; - /* Loop over every particle in the other cell. */ - for (int pjd = 0; pjd < gcount_j; pjd++) { + } else { - /* Get a hold of the jth part in cj. 
*/ - const struct gpart *restrict gpj = &gparts_j[pjd]; + const float r = r2 * r_inv; + const float ui = r * h_inv_i; - /* Compute the pairwise distance. */ - const float dx[3] = {gpi->x[0] - gpj->x[0], // x - gpi->x[1] - gpj->x[1], // y - gpi->x[2] - gpj->x[2]}; // z - const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]; + kernel_grav_eval(ui, &W_ij); -#ifdef SWIFT_DEBUG_CHECKS - /* Check that particles have been drifted to the current time */ - if (gpi->ti_drift != e->ti_current) - error("gpi not drifted to current time"); - if (gpj->ti_drift != e->ti_current) - error("gpj not drifted to current time"); -#endif + /* Get softened gravity */ + f_ij = mass_j * h_inv3_i * W_ij; + } - /* Interact ! */ - runner_iact_grav_pp_nonsym(rlr_inv, r2, dx, gpi, gpj); + /* Store it back */ + a_x -= f_ij * dx; + a_y -= f_ij * dy; + a_z -= f_ij * dz; #ifdef SWIFT_DEBUG_CHECKS - gpi->num_interacted++; + /* Update the interaction counter if it's not a padded gpart */ + if (pjd < gcount) gparts[pid].num_interacted++; #endif - } } + + /* Store everything back in cache */ + ci_cache->a_x[pid] = a_x; + ci_cache->a_y[pid] = a_y; + ci_cache->a_z[pid] = a_z; } - /* Loop over all particles in cj... */ - if (cell_is_active(cj, e)) { - for (int pjd = 0; pjd < gcount_j; pjd++) { + /* Write back to the particles */ + gravity_cache_write_back(ci_cache, gparts, gcount); - /* Get a hold of the ith part in ci. */ - struct gpart *restrict gpj = &gparts_j[pjd]; +#ifdef MATTHIEU_OLD_STUFF - if (!gpart_is_active(gpj, e)) continue; + /* Some constants */ + const struct engine *const e = r->e; - /* Loop over every particle in the other cell. */ - for (int pid = 0; pid < gcount_i; pid++) { + /* Cell properties */ + const int gcount = c->gcount; + struct gpart *restrict gparts = c->gparts; - /* Get a hold of the ith part in ci. */ - const struct gpart *restrict gpi = &gparts_i[pid]; + /* MATTHIEU: Should we use local DP accumulators ? */ - /* Compute the pairwise distance. 
*/ - const float dx[3] = {gpj->x[0] - gpi->x[0], // x - gpj->x[1] - gpi->x[1], // y - gpj->x[2] - gpi->x[2]}; // z - const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]; + /* Loop over all particles in ci... */ + for (int pid = 0; pid < gcount; pid++) { + + /* Get a hold of the ith part in ci. */ + struct gpart *restrict gpi = &gparts[pid]; + + /* Loop over every particle in the other cell. */ + for (int pjd = pid + 1; pjd < gcount; pjd++) { + + /* Get a hold of the jth part in ci. */ + struct gpart *restrict gpj = &gparts[pjd]; + + /* Compute the pairwise distance. */ + float dx[3] = {gpi->x[0] - gpj->x[0], // x + gpi->x[1] - gpj->x[1], // y + gpi->x[2] - gpj->x[2]}; // z + const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]; #ifdef SWIFT_DEBUG_CHECKS - /* Check that particles have been drifted to the current time */ - if (gpi->ti_drift != e->ti_current) - error("gpi not drifted to current time"); - if (gpj->ti_drift != e->ti_current) - error("gpj not drifted to current time"); + /* Check that particles have been drifted to the current time */ + if (gpi->ti_drift != e->ti_current) + error("gpi not drifted to current time"); + if (gpj->ti_drift != e->ti_current) + error("gpj not drifted to current time"); #endif - /* Interact ! */ - runner_iact_grav_pp_nonsym(rlr_inv, r2, dx, gpj, gpi); + /* Interact ! 
*/ + if (gpart_is_active(gpi, e) && gpart_is_active(gpj, e)) { + + runner_iact_grav_pp(r2, dx, gpi, gpj); #ifdef SWIFT_DEBUG_CHECKS + gpi->num_interacted++; gpj->num_interacted++; #endif + + } else { + + if (gpart_is_active(gpi, e)) { + + runner_iact_grav_pp_nonsym(r2, dx, gpi, gpj); + +#ifdef SWIFT_DEBUG_CHECKS + gpi->num_interacted++; +#endif + + } else if (gpart_is_active(gpj, e)) { + + dx[0] = -dx[0]; + dx[1] = -dx[1]; + dx[2] = -dx[2]; + runner_iact_grav_pp_nonsym(r2, dx, gpj, gpi); + +#ifdef SWIFT_DEBUG_CHECKS + gpj->num_interacted++; +#endif + } } } } - TIMER_TOC(timer_dopair_grav_pp); +#endif } /** - * @brief Computes the interaction of all the particles in a cell directly + * @brief Computes the interaction of all the particles in a cell using the + * truncated Newtonian potential. * * @param r The #runner. * @param c The #cell. * * @todo Use a local cache for the particles. */ -void runner_doself_grav_pp(struct runner *r, struct cell *c) { +void runner_doself_grav_pp_truncated(struct runner *r, struct cell *c) { - const struct engine *e = r->e; + /* Some constants */ + const struct engine *const e = r->e; + const struct space *s = e->s; + const double cell_width = s->width[0]; + const double a_smooth = e->gravity_properties->a_smooth; + const double rlr = cell_width * a_smooth; + const float rlr_inv = 1. / rlr; + + /* Caches to play with */ + struct gravity_cache *const ci_cache = &r->ci_gravity_cache; + + /* Cell properties */ const int gcount = c->gcount; struct gpart *restrict gparts = c->gparts; - const float a_smooth = e->gravity_properties->a_smooth; - const float rlr_inv = 1. / (a_smooth * c->super->width[0]); + const int c_active = cell_is_active(c, e); + const double loc[3] = {c->loc[0] + 0.5 * c->width[0], + c->loc[1] + 0.5 * c->width[1], + c->loc[2] + 0.5 * c->width[2]}; - TIMER_TIC; + /* Anything to do here ?*/ + if (!c_active) return; + + /* Check that we fit in cache */ + if (gcount > ci_cache->count) + error("Not enough space in the caches! 
gcount=%d", gcount); + + /* Computed the padded counts */ + const int gcount_padded = gcount - (gcount % VEC_SIZE) + VEC_SIZE; + + gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc); + + /* Ok... Here we go ! */ + + /* Loop over all particles in ci... */ + for (int pid = 0; pid < gcount; pid++) { + + /* Skip inactive particles */ + if (!gpart_is_active(&gparts[pid], e)) continue; + + const float x_i = ci_cache->x[pid]; + const float y_i = ci_cache->y[pid]; + const float z_i = ci_cache->z[pid]; + + /* Some powers of the softening length */ + const float h_i = ci_cache->epsilon[pid]; + const float h2_i = h_i * h_i; + const float h_inv_i = 1.f / h_i; + const float h_inv3_i = h_inv_i * h_inv_i * h_inv_i; + + /* Local accumulators for the acceleration */ + float a_x = 0.f, a_y = 0.f, a_z = 0.f; + + /* Make the compiler understand we are in happy vectorization land */ + swift_align_information(ci_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_align_information(ci_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_assume_size(gcount_padded, VEC_SIZE); + + /* Loop over every other particle in the cell. */ + for (int pjd = 0; pjd < gcount_padded; pjd++) { + + /* No self interaction */ + if (pid == pjd) continue; + + /* Get info about j */ + const float x_j = ci_cache->x[pjd]; + const float y_j = ci_cache->y[pjd]; + const float z_j = ci_cache->z[pjd]; + const float mass_j = ci_cache->m[pjd]; + + /* Compute the pairwise (square) distance. 
*/ + const float dx = x_i - x_j; + const float dy = y_i - y_j; + const float dz = z_i - z_j; + const float r2 = dx * dx + dy * dy + dz * dz; #ifdef SWIFT_DEBUG_CHECKS - if (c->gcount == 0) error("Doing self gravity on an empty cell !"); + if (r2 == 0.f) error("Interacting particles with 0 distance"); + + /* Check that particles have been drifted to the current time */ + if (gparts[pid].ti_drift != e->ti_current) + error("gpi not drifted to current time"); + if (pjd < gcount && gparts[pjd].ti_drift != e->ti_current) + error("gpj not drifted to current time"); #endif - /* Anything to do here? */ - if (!cell_is_active(c, e)) return; + /* Get the inverse distance */ + const float r_inv = 1.f / sqrtf(r2); + const float r = r2 * r_inv; - /* Do we need to start by drifting things ? */ - if (!cell_are_gpart_drifted(c, e)) cell_drift_gpart(c, e); + float f_ij, W_ij, corr_lr; -#if ICHECK > 0 - for (int pid = 0; pid < gcount; pid++) { + if (r2 >= h2_i) { - /* Get a hold of the ith part in ci. */ - struct gpart *restrict gp = &gparts[pid]; + /* Get Newtonian gravity */ + f_ij = mass_j * r_inv * r_inv * r_inv; - if (gp->id_or_neg_offset == ICHECK) - message("id=%lld loc=[ %f %f %f ] size= %f count= %d", - gp->id_or_neg_offset, c->loc[0], c->loc[1], c->loc[2], - c->width[0], c->gcount); - } + } else { + + const float ui = r * h_inv_i; + + kernel_grav_eval(ui, &W_ij); + + /* Get softened gravity */ + f_ij = mass_j * h_inv3_i * W_ij; + } + + /* Get long-range correction */ + const float u_lr = r * rlr_inv; + kernel_long_grav_eval(u_lr, &corr_lr); + f_ij *= corr_lr; + + /* Store it back */ + a_x -= f_ij * dx; + a_y -= f_ij * dy; + a_z -= f_ij * dz; + +#ifdef SWIFT_DEBUG_CHECKS + /* Update the interaction counter if it's not a padded gpart */ + if (pjd < gcount) gparts[pid].num_interacted++; #endif + } + + /* Store everything back in cache */ + ci_cache->a_x[pid] = a_x; + ci_cache->a_y[pid] = a_y; + ci_cache->a_z[pid] = a_z; + } + + /* Write back to the particles */ + 
gravity_cache_write_back(ci_cache, gparts, gcount); + +#ifdef MATTHIEU_OLD_STUFF + /* Some constants */ + const struct engine *const e = r->e; + const struct space *s = e->s; + const double cell_width = s->width[0]; + const double a_smooth = e->gravity_properties->a_smooth; + const double rlr = cell_width * a_smooth; + const float rlr_inv = 1. / rlr; + + /* Cell properties */ + const int gcount = c->gcount; + struct gpart *restrict gparts = c->gparts; /* MATTHIEU: Should we use local DP accumulators ? */ @@ -364,7 +1303,7 @@ void runner_doself_grav_pp(struct runner *r, struct cell *c) { /* Interact ! */ if (gpart_is_active(gpi, e) && gpart_is_active(gpj, e)) { - runner_iact_grav_pp(rlr_inv, r2, dx, gpi, gpj); + runner_iact_grav_pp_truncated(r2, dx, gpi, gpj, rlr_inv); #ifdef SWIFT_DEBUG_CHECKS gpi->num_interacted++; @@ -375,7 +1314,7 @@ void runner_doself_grav_pp(struct runner *r, struct cell *c) { if (gpart_is_active(gpi, e)) { - runner_iact_grav_pp_nonsym(rlr_inv, r2, dx, gpi, gpj); + runner_iact_grav_pp_truncated_nonsym(r2, dx, gpi, gpj, rlr_inv); #ifdef SWIFT_DEBUG_CHECKS gpi->num_interacted++; @@ -386,7 +1325,7 @@ void runner_doself_grav_pp(struct runner *r, struct cell *c) { dx[0] = -dx[0]; dx[1] = -dx[1]; dx[2] = -dx[2]; - runner_iact_grav_pp_nonsym(rlr_inv, r2, dx, gpj, gpi); + runner_iact_grav_pp_truncated_nonsym(r2, dx, gpj, gpi, rlr_inv); #ifdef SWIFT_DEBUG_CHECKS gpj->num_interacted++; @@ -395,6 +1334,53 @@ void runner_doself_grav_pp(struct runner *r, struct cell *c) { } } } +#endif +} + +/** + * @brief Computes the interaction of all the particles in a cell directly + * (Switching function between truncated and full) + * + * @param r The #runner. + * @param c The #cell. 
+ */ +void runner_doself_grav_pp(struct runner *r, struct cell *c) { + + /* Some properties of the space */ + const struct engine *e = r->e; + const struct space *s = e->s; + const int periodic = s->periodic; + const double cell_width = s->width[0]; + const double a_smooth = e->gravity_properties->a_smooth; + const double r_cut_min = e->gravity_properties->r_cut_min; + const double min_trunc = cell_width * r_cut_min * a_smooth; + + TIMER_TIC; + +#ifdef SWIFT_DEBUG_CHECKS + if (c->gcount == 0) error("Doing self gravity on an empty cell !"); +#endif + + /* Anything to do here? */ + if (!cell_is_active(c, e)) return; + + /* Do we need to start by drifting things ? */ + if (!cell_are_gpart_drifted(c, e)) cell_drift_gpart(c, e); + + /* Can we use the Newtonian version or do we need the truncated one ? */ + if (!periodic) { + runner_doself_grav_pp_full(r, c); + } else { + + /* Get the maximal distance between any two particles */ + const double max_r = 2 * c->multipole->r_max; + + /* Do we need to use the truncated interactions ? 
*/ + if (max_r > min_trunc) + runner_doself_grav_pp_truncated(r, c); + else + runner_doself_grav_pp_full(r, c); + } TIMER_TOC(timer_doself_grav_pp); } @@ -415,8 +1401,14 @@ void runner_dopair_grav(struct runner *r, struct cell *ci, struct cell *cj, /* Some constants */ const struct engine *e = r->e; + const struct space *s = e->s; + const int periodic = s->periodic; + const double cell_width = s->width[0]; + const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]}; const struct gravity_props *props = e->gravity_properties; const double theta_crit_inv = props->theta_crit_inv; + const double max_distance = props->a_smooth * props->r_cut_max * cell_width; + const double max_distance2 = max_distance * max_distance; #ifdef SWIFT_DEBUG_CHECKS @@ -436,35 +1428,47 @@ void runner_dopair_grav(struct runner *r, struct cell *ci, struct cell *cj, error("cj->multipole not drifted."); #endif -#if ICHECK > 0 - for (int pid = 0; pid < ci->gcount; pid++) { - - /* Get a hold of the ith part in ci. */ - struct gpart *restrict gp = &ci->gparts[pid]; + TIMER_TIC; - if (gp->id_or_neg_offset == ICHECK) - message("id=%lld loc=[ %f %f %f ] size= %f count= %d", - gp->id_or_neg_offset, cj->loc[0], cj->loc[1], cj->loc[2], - cj->width[0], cj->gcount); - } + /* Anything to do here? */ + if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; - for (int pid = 0; pid < cj->gcount; pid++) { + /* Recover the multipole information */ + struct gravity_tensors *const multi_i = ci->multipole; + struct gravity_tensors *const multi_j = cj->multipole; - /* Get a hold of the ith part in ci. 
*/ - struct gpart *restrict gp = &cj->gparts[pid]; + /* Get the distance between the CoMs */ + double dx = multi_i->CoM[0] - multi_j->CoM[0]; + double dy = multi_i->CoM[1] - multi_j->CoM[1]; + double dz = multi_i->CoM[2] - multi_j->CoM[2]; - if (gp->id_or_neg_offset == ICHECK) - message("id=%lld loc=[ %f %f %f ] size= %f count= %d", - gp->id_or_neg_offset, ci->loc[0], ci->loc[1], ci->loc[2], - ci->width[0], ci->gcount); + /* Apply BC */ + if (periodic) { + dx = nearest(dx, dim[0]); + dy = nearest(dy, dim[1]); + dz = nearest(dz, dim[2]); } + const double r2 = dx * dx + dy * dy + dz * dz; + + /* Are we beyond the distance where the truncated forces are 0? */ + if (periodic && r2 > max_distance2) { + +#ifdef SWIFT_DEBUG_CHECKS + /* Need to account for the interactions we missed */ + if (cell_is_active(ci, e)) + multi_i->pot.num_interacted += multi_j->m_pole.num_gpart; + if (cell_is_active(cj, e)) + multi_j->pot.num_interacted += multi_i->m_pole.num_gpart; #endif + return; + } - TIMER_TIC; + /* OK, we actually need to compute this pair. Let's find the cheapest + * option... */ /* Can we use M-M interactions ? */ - if (gravity_multipole_accept(ci->multipole, cj->multipole, theta_crit_inv, - 0)) { + if (gravity_multipole_accept(multi_i, multi_j, theta_crit_inv, r2)) { + /* MATTHIEU: make a symmetric M-M interaction function ! */ runner_dopair_grav_mm(r, ci, cj); runner_dopair_grav_mm(r, cj, ci); @@ -476,8 +1480,8 @@ void runner_dopair_grav(struct runner *r, struct cell *ci, struct cell *cj, /* Alright, we'll have to split and recurse. 
*/ else { - const double ri_max = ci->multipole->r_max; - const double rj_max = cj->multipole->r_max; + const double ri_max = multi_i->r_max; + const double rj_max = multi_j->r_max; /* Split the larger of the two cells and start over again */ if (ri_max > rj_max) { @@ -543,6 +1547,9 @@ void runner_dopair_grav(struct runner *r, struct cell *ci, struct cell *cj, */ void runner_doself_grav(struct runner *r, struct cell *c, int gettimer) { + /* Some constants */ + const struct engine *e = r->e; + #ifdef SWIFT_DEBUG_CHECKS /* Early abort? */ if (c->gcount == 0) error("Doing self gravity on an empty cell !"); @@ -550,6 +1557,9 @@ void runner_doself_grav(struct runner *r, struct cell *c, int gettimer) { TIMER_TIC; + /* Anything to do here? */ + if (!cell_is_active(c, e)) return; + /* If the cell is split, interact each progeny with itself, and with each of its siblings. */ if (c->split) { @@ -617,8 +1627,14 @@ void runner_do_grav_long_range(struct runner *r, struct cell *ci, int timer) { /* Some constants */ const struct engine *e = r->e; + const struct space *s = e->s; const struct gravity_props *props = e->gravity_properties; + const int periodic = s->periodic; + const double cell_width = s->width[0]; + const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]}; const double theta_crit_inv = props->theta_crit_inv; + const double max_distance = props->a_smooth * props->r_cut_max * cell_width; + const double max_distance2 = max_distance * max_distance; TIMER_TIC; @@ -627,38 +1643,86 @@ void runner_do_grav_long_range(struct runner *r, struct cell *ci, int timer) { const int nr_cells = e->s->nr_cells; /* Anything to do here? 
*/ - if (!cell_is_active(ci, e)) return; // MATTHIEU (should never happen) + if (!cell_is_active(ci, e)) return; /* Check multipole has been drifted */ if (ci->ti_old_multipole != e->ti_current) error("Interacting un-drifted multipole"); + /* Recover the local multipole */ + struct gravity_tensors *const multi_i = ci->multipole; + const double CoM_i[3] = {multi_i->CoM[0], multi_i->CoM[1], multi_i->CoM[2]}; + const double CoM_rebuild_i[3] = {multi_i->CoM_rebuild[0], + multi_i->CoM_rebuild[1], + multi_i->CoM_rebuild[2]}; + /* Loop over all the top-level cells and go for a M-M interaction if * well-separated */ for (int i = 0; i < nr_cells; ++i) { - /* Handle on the top-level cell */ + /* Handle on the top-level cell and it's gravity business*/ struct cell *cj = &cells[i]; + const struct gravity_tensors *const multi_j = cj->multipole; /* Avoid stupid cases */ if (ci == cj || cj->gcount == 0) continue; + /* Get the distance between the CoMs */ + double dx = CoM_i[0] - multi_j->CoM[0]; + double dy = CoM_i[1] - multi_j->CoM[1]; + double dz = CoM_i[2] - multi_j->CoM[2]; + + /* Apply BC */ + if (periodic) { + dx = nearest(dx, dim[0]); + dy = nearest(dy, dim[1]); + dz = nearest(dz, dim[2]); + } + const double r2 = dx * dx + dy * dy + dz * dz; + + /* Are we beyond the distance where the truncated forces are 0 ?*/ + if (periodic && r2 > max_distance2) { + +#ifdef SWIFT_DEBUG_CHECKS + /* Need to account for the interactions we missed */ + multi_i->pot.num_interacted += multi_j->m_pole.num_gpart; +#endif + continue; + } + /* Check the multipole acceptance criterion */ - if (gravity_multipole_accept(ci->multipole, cj->multipole, theta_crit_inv, - 0)) { + if (gravity_multipole_accept(multi_i, multi_j, theta_crit_inv, r2)) { /* Go for a (non-symmetric) M-M calculation */ runner_dopair_grav_mm(r, ci, cj); - } - /* Is the criterion violated now but was OK at the last rebuild ? 
*/ - else if (gravity_multipole_accept(ci->multipole, cj->multipole, - theta_crit_inv, 1)) { - /* Alright, we have to take charge of that pair in a different way. */ - // MATTHIEU: We should actually open the tree-node here and recurse. - runner_dopair_grav_mm(r, ci, cj); + } else { + + /* Let's check whether we need to still operate on this pair */ + + /* Get the distance between the CoMs at the last rebuild*/ + double dx = CoM_rebuild_i[0] - multi_j->CoM_rebuild[0]; + double dy = CoM_rebuild_i[1] - multi_j->CoM_rebuild[1]; + double dz = CoM_rebuild_i[2] - multi_j->CoM_rebuild[2]; + + /* Apply BC */ + if (periodic) { + dx = nearest(dx, dim[0]); + dy = nearest(dy, dim[1]); + dz = nearest(dz, dim[2]); + } + const double r2_rebuild = dx * dx + dy * dy + dz * dz; + + /* Is the criterion violated now but was OK at the last rebuild ? */ + if (gravity_multipole_accept_rebuild(multi_i, multi_j, theta_crit_inv, + r2_rebuild)) { + + /* Alright, we have to take charge of that pair in a different way. */ + // MATTHIEU: We should actually open the tree-node here and recurse. + runner_dopair_grav_mm(r, ci, cj); + } } - } + } /* Loop over top-level cells */ if (timer) TIMER_TOC(timer_dograv_long_range); } diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index acf83b95d564ba81db8586fc0fbd3e10c0bc6cd5..552fb91d099c4d10c847abcad1fc5b33e61b8799 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -20,13 +20,12 @@ /* Config parameters. */ #include "../config.h" -#include "swift.h" - -#include "active.h" - /* This object's header. */ #include "runner_doiact_vec.h" +/* Local headers. */ +#include "active.h" + #ifdef WITH_VECTORIZATION static const vector kernel_gamma2_vec = FILL_VEC(kernel_gamma2); @@ -76,8 +75,8 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( *icount_align += pad; /* Initialise masks to true. 
*/ - vec_init_mask(int_mask); - vec_init_mask(int_mask2); + vec_init_mask_true(int_mask); + vec_init_mask_true(int_mask2); /* Pad secondary cache so that there are no contributions in the interaction * function. */ @@ -124,10 +123,6 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( * @param v_dx #vector of the x separation between two particles. * @param v_dy #vector of the y separation between two particles. * @param v_dz #vector of the z separation between two particles. - * @param v_mj #vector of the mass of particle pj. - * @param v_vjx #vector of x velocity of pj. - * @param v_vjy #vector of y velocity of pj. - * @param v_vjz #vector of z velocity of pj. * @param cell_cache #cache of all particles in the cell. * @param int_cache (return) secondary #cache of interactions between two * particles. @@ -212,8 +207,8 @@ __attribute__((always_inline)) INLINE static void storeInteractions( v_hi_inv, v_vix, v_viy, v_viz, &icount_align); mask_t int_mask, int_mask2; - vec_init_mask(int_mask); - vec_init_mask(int_mask2); + vec_init_mask_true(int_mask); + vec_init_mask_true(int_mask2); /* Perform interactions. */ for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) { @@ -310,7 +305,6 @@ __attribute__((always_inline)) INLINE static void calcRemForceInteractions( /* Perform remainder interaction and remove remainder from aligned * interaction count. 
*/ *icount_align = icount - rem; - runner_iact_nonsym_2_vec_force( &int_cache->r2q[*icount_align], &int_cache->dxq[*icount_align], &int_cache->dyq[*icount_align], &int_cache->dzq[*icount_align], v_vix, @@ -370,7 +364,7 @@ __attribute__((always_inline)) INLINE static void storeForceInteractions( vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz, vector v_rhoi, vector v_grad_hi, - vector v_pOrhoi2, vector v_balsara_i, vector v_ci) { + vector v_pOrhoi2, vector v_balsara_i, vector v_ci, int num_vec_proc) { /* Left-pack values needed into the secondary cache using the interaction mask. */ @@ -437,7 +431,7 @@ __attribute__((always_inline)) INLINE static void storeForceInteractions( #endif /* defined(HAVE_AVX2) || defined(HAVE_AVX512_F) */ /* Flush the c2 cache if it has reached capacity. */ - if (*icount >= (C2_CACHE_SIZE - (2 * VEC_SIZE))) { + if (*icount >= (C2_CACHE_SIZE - (num_vec_proc * VEC_SIZE))) { int icount_align = *icount; @@ -454,7 +448,7 @@ __attribute__((always_inline)) INLINE static void storeForceInteractions( vec_init_mask(int_mask2); /* Perform interactions. */ - for (int pjd = 0; pjd < icount_align; pjd += (2 * VEC_SIZE)) { + for (int pjd = 0; pjd < icount_align; pjd += (num_vec_proc * VEC_SIZE)) { runner_iact_nonsym_2_vec_force( &int_cache->r2q[pjd], &int_cache->dxq[pjd], &int_cache->dyq[pjd], @@ -473,92 +467,138 @@ __attribute__((always_inline)) INLINE static void storeForceInteractions( } } -/* @brief Populates the arrays max_di and max_dj with the maximum distances of +/** + * @brief Populates the arrays max_index_i and max_index_j with the maximum + * indices of * particles into their neighbouring cells. Also finds the first pi that * interacts with any particle in cj and the last pj that interacts with any * particle in ci. 
+ * * @param ci #cell pointer to ci * @param cj #cell pointer to cj * @param sort_i #entry array for particle distance in ci * @param sort_j #entry array for particle distance in cj - * @param ci_cache #cache for cell ci - * @param cj_cache #cache for cell cj * @param dx_max maximum particle movement allowed in cell * @param rshift cutoff shift - * @param max_di array to hold the maximum distances of pi particles into cell + * @param hi_max Maximal smoothing length in cell ci + * @param hj_max Maximal smoothing length in cell cj + * @param di_max Maximal position on the axis that can interact in cell ci + * @param dj_min Minimal position on the axis that can interact in cell ci + * @param max_index_i array to hold the maximum distances of pi particles into + * cell * cj - * @param max_dj array to hold the maximum distances of pj particles into cell + * @param max_index_j array to hold the maximum distances of pj particles into + * cell * cj * @param init_pi first pi to interact with a pj particle * @param init_pj last pj to interact with a pi particle + * @param e The #engine. 
*/ -__attribute__((always_inline)) INLINE static void populate_max_d_no_cache( +__attribute__((always_inline)) INLINE static void populate_max_index_no_cache( const struct cell *ci, const struct cell *cj, const struct entry *restrict sort_i, const struct entry *restrict sort_j, - const float dx_max, const float rshift, float *max_di, float *max_dj, - int *init_pi, int *init_pj, const struct engine *e) { + const float dx_max, const float rshift, const double hi_max, + const double hj_max, const double di_max, const double dj_min, + int *max_index_i, int *max_index_j, int *init_pi, int *init_pj, + const struct engine *e) { - struct part *restrict parts_i = ci->parts; - struct part *restrict parts_j = cj->parts; - struct part *p = &parts_i[sort_i[0].i]; + const struct part *restrict parts_i = ci->parts; + const struct part *restrict parts_j = cj->parts; - float h, d; + int first_pi = 0, last_pj = cj->count - 1; + int temp; + + /* Find the leftmost active particle in cell i that interacts with any + * particle in cell j. */ + first_pi = ci->count; + int active_id = first_pi - 1; + while (first_pi > 0 && sort_i[first_pi - 1].d + dx_max + hi_max > dj_min) { + first_pi--; + /* Store the index of the particle if it is active. */ + if (part_is_active(&parts_i[sort_i[first_pi].i], e)) active_id = first_pi; + } - /* Get the distance of the last pi and the first pj on the sorted axis.*/ - const float di_max = sort_i[ci->count - 1].d - rshift; - const float dj_min = sort_j[0].d; + /* Set the first active pi in range of any particle in cell j. */ + first_pi = active_id; - int first_pi = 0, last_pj = cj->count - 1; + /* Find the maximum index into cell j for each particle in range in cell i. */ + if (first_pi < ci->count) { - /* Find the first active particle in ci to interact with any particle in cj. - */ - /* Populate max_di with distances. 
*/ - int active_id = ci->count - 1; - for (int k = ci->count - 1; k >= 0; k--) { - p = &parts_i[sort_i[k].i]; - h = p->h; - d = sort_i[k].d + h * kernel_gamma + dx_max - rshift; - - max_di[k] = d; - - /* If the particle is out of range set the index to - * the last active particle within range. */ - if (d < dj_min) { - first_pi = active_id; - break; - } else { - if (part_is_active(p, e)) active_id = k; + /* Start from the first particle in cell j. */ + temp = 0; + + const struct part *pi = &parts_i[sort_i[first_pi].i]; + + /* Loop through particles in cell j until they are not in range of pi. */ + while (temp <= cj->count && + (sort_i[first_pi].d + (pi->h * kernel_gamma + dx_max - rshift) > + sort_j[temp].d)) + temp++; + + max_index_i[first_pi] = temp; + + /* Populate max_index_i for remaining particles that are within range. */ + for (int i = first_pi + 1; i < ci->count; i++) { + temp = max_index_i[i - 1]; + pi = &parts_i[sort_i[i].i]; + + while (temp <= cj->count && + (sort_i[i].d + (pi->h * kernel_gamma + dx_max - rshift) > + sort_j[temp].d)) + temp++; + + max_index_i[i] = temp; } + } else { + /* Make sure that max index is set to first particle in cj.*/ + max_index_i[ci->count - 1] = 0; } - /* Find the maximum distance of pi particles into cj.*/ - for (int k = first_pi + 1; k < ci->count; k++) { - max_di[k] = fmaxf(max_di[k - 1], max_di[k]); + /* Find the rightmost active particle in cell j that interacts with any + * particle in cell i. */ + last_pj = -1; + active_id = last_pj; + while (last_pj < cj->count && + sort_j[last_pj + 1].d - hj_max - dx_max < di_max) { + last_pj++; + /* Store the index of the particle if it is active. */ + if (part_is_active(&parts_j[sort_j[last_pj].i], e)) active_id = last_pj; } - /* Find the last particle in cj to interact with any particle in ci. */ - /* Populate max_dj with distances. 
*/ - active_id = 0; - for (int k = 0; k < cj->count; k++) { - p = &parts_j[sort_j[k].i]; - h = p->h; - d = sort_j[k].d - h * kernel_gamma - dx_max - rshift; + /* Set the last active pj in range of any particle in cell i. */ + last_pj = active_id; - max_dj[k] = d; + /* Find the maximum index into cell i for each particle in range in cell j. */ + if (last_pj > 0) { - /* If the particle is out of range set the index to - * the last active particle within range. */ - if (d > di_max) { - last_pj = active_id; - break; - } else { - if (part_is_active(p, e)) active_id = k; - } - } + /* Start from the last particle in cell i. */ + temp = ci->count - 1; + + const struct part *pj = &parts_j[sort_j[last_pj].i]; - /* Find the maximum distance of pj particles into ci.*/ - for (int k = 1; k <= last_pj; k++) { - max_dj[k] = fmaxf(max_dj[k - 1], max_dj[k]); + /* Loop through particles in cell i until they are not in range of pj. */ + while (temp > 0 && + sort_j[last_pj].d - dx_max - (pj->h * kernel_gamma) < + sort_i[temp].d - rshift) + temp--; + + max_index_j[last_pj] = temp; + + /* Populate max_index_j for remaining particles that are within range. */ + for (int i = last_pj - 1; i >= 0; i--) { + temp = max_index_j[i + 1]; + pj = &parts_j[sort_j[i].i]; + + while (temp > 0 && + sort_j[i].d - dx_max - (pj->h * kernel_gamma) < + sort_i[temp].d - rshift) + temp--; + + max_index_j[i] = temp; + } + } else { + /* Make sure that max index is set to last particle in ci.*/ + max_index_j[0] = ci->count - 1; } *init_pi = first_pi; @@ -703,7 +743,8 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v); /* Form a mask from r2 < hig2 and r2 > 0.*/ - mask_t v_doi_mask, v_doi_mask_self_check, v_doi_mask2, v_doi_mask2_self_check; + mask_t v_doi_mask, v_doi_mask_self_check, v_doi_mask2, + v_doi_mask2_self_check; int doi_mask, doi_mask_self_check, doi_mask2, doi_mask2_self_check; /* Form r2 > 0 mask and r2 < hig2 mask. 
*/ @@ -711,7 +752,8 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v)); /* Form r2 > 0 mask and r2 < hig2 mask. */ - vec_create_mask(v_doi_mask2_self_check, vec_cmp_gt(v_r2_2.v, vec_setzero())); + vec_create_mask(v_doi_mask2_self_check, + vec_cmp_gt(v_r2_2.v, vec_setzero())); vec_create_mask(v_doi_mask2, vec_cmp_lt(v_r2_2.v, v_hig2.v)); /* Form integer masks. */ @@ -720,7 +762,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( doi_mask2_self_check = vec_form_int_mask(v_doi_mask2_self_check); doi_mask2 = vec_form_int_mask(v_doi_mask2); - + /* Combine the two masks. */ doi_mask = doi_mask & doi_mask_self_check; doi_mask2 = doi_mask2 & doi_mask2_self_check; @@ -752,8 +794,8 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( /* Initialise masks to true in case remainder interactions have been * performed. */ mask_t int_mask, int_mask2; - vec_init_mask(int_mask); - vec_init_mask(int_mask2); + vec_init_mask_true(int_mask); + vec_init_mask_true(int_mask2); /* Perform interaction with 2 vectors. */ for (int pjd = 0; pjd < icount_align; pjd += (num_vec_proc * VEC_SIZE)) { @@ -954,7 +996,7 @@ for (int pid = 0; pid < count; pid++) { doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp, cell_cache, &int_cache, &icount, &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, &h_dtSum, &v_sigSum, &entropy_dtSum, v_hi_inv, v_vix, v_viy, - v_viz, v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci); + v_viz, v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci, 2); } } /* Loop over all other particles. */ @@ -968,8 +1010,8 @@ for (int pid = 0; pid < count; pid++) { /* Initialise masks to true in case remainder interactions have been * performed. */ mask_t int_mask, int_mask2; - vec_init_mask(int_mask); - vec_init_mask(int_mask2); + vec_init_mask_true(int_mask); + vec_init_mask_true(int_mask2); /* Perform interaction with 2 vectors. 
*/ for (int pjd = 0; pjd < icount_align; pjd += (2 * VEC_SIZE)) { @@ -1007,9 +1049,12 @@ TIMER_TOC(timer_doself_force); * @param r The #runner. * @param ci The first #cell. * @param cj The second #cell. + * @param sid The direction of the pair + * @param shift The shift vector to apply to the particles in ci. */ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, - struct cell *cj) { + struct cell *cj, const int sid, + const double *shift) { #ifdef WITH_VECTORIZATION const struct engine *restrict e = r->e; @@ -1018,29 +1063,13 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, TIMER_TIC; - /* Anything to do here? */ - if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; - - if (!cell_are_part_drifted(ci, e) || !cell_are_part_drifted(cj, e)) - error("Interacting undrifted cells."); - - /* Get the sort ID. */ - double shift[3] = {0.0, 0.0, 0.0}; - const int sid = space_getsid(e->s, &ci, &cj, shift); - - /* Have the cells been sorted? */ - if (!(ci->sorted & (1 << sid)) || ci->dx_max_sort > space_maxreldx * ci->dmin) - runner_do_sort(r, ci, (1 << sid), 1); - if (!(cj->sorted & (1 << sid)) || cj->dx_max_sort > space_maxreldx * cj->dmin) - runner_do_sort(r, cj, (1 << sid), 1); - /* Get the cutoff shift. */ double rshift = 0.0; for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k]; /* Pick-out the sorted lists. 
*/ - const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)]; - const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)]; + const struct entry *restrict sort_i = ci->sort[sid]; + const struct entry *restrict sort_j = cj->sort[sid]; #ifdef SWIFT_DEBUG_CHECKS /* Check that the dx_max_sort values in the cell are indeed an upper @@ -1051,8 +1080,13 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, p->x[1] * runner_shift[sid][1] + p->x[2] * runner_shift[sid][2]; if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort > - 1.0e-6 * max(fabsf(d), ci->dx_max_sort)) - error("particle shift diff exceeds dx_max_sort."); + 1.0e-4 * max(fabsf(d), ci->dx_max_sort_old)) + error( + "particle shift diff exceeds dx_max_sort in cell ci. ci->nodeID=%d " + "cj->nodeID=%d d=%e sort_i[pid].d=%e ci->dx_max_sort=%e " + "ci->dx_max_sort_old=%e", + ci->nodeID, cj->nodeID, d, sort_i[pid].d, ci->dx_max_sort, + ci->dx_max_sort_old); } for (int pjd = 0; pjd < cj->count; pjd++) { const struct part *p = &cj->parts[sort_j[pjd].i]; @@ -1060,8 +1094,13 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, p->x[1] * runner_shift[sid][1] + p->x[2] * runner_shift[sid][2]; if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort > - 1.0e-6 * max(fabsf(d), cj->dx_max_sort)) - error("particle shift diff exceeds dx_max_sort."); + 1.0e-4 * max(fabsf(d), cj->dx_max_sort_old)) + error( + "particle shift diff exceeds dx_max_sort in cell cj. 
cj->nodeID=%d " + "ci->nodeID=%d d=%e sort_j[pjd].d=%e cj->dx_max_sort=%e " + "cj->dx_max_sort_old=%e", + cj->nodeID, ci->nodeID, d, sort_j[pjd].d, cj->dx_max_sort, + cj->dx_max_sort_old); } #endif /* SWIFT_DEBUG_CHECKS */ @@ -1113,37 +1152,19 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, } int first_pi, last_pj; - float *max_di __attribute__((aligned(sizeof(float) * VEC_SIZE))); - float *max_dj __attribute__((aligned(sizeof(float) * VEC_SIZE))); + int *max_index_i __attribute__((aligned(sizeof(int) * VEC_SIZE))); + int *max_index_j __attribute__((aligned(sizeof(int) * VEC_SIZE))); - max_di = r->ci_cache.max_d; - max_dj = r->cj_cache.max_d; + max_index_i = r->ci_cache.max_index; + max_index_j = r->cj_cache.max_index; - /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */ + /* Find particles maximum index into cj, max_index_i[] and ci, max_index_j[]. + */ /* Also find the first pi that interacts with any particle in cj and the last * pj that interacts with any particle in ci. */ - populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, max_di, - max_dj, &first_pi, &last_pj, e); - - /* Find the maximum index into cj that is required by a particle in ci. */ - /* Find the maximum index into ci that is required by a particle in cj. */ - float di, dj; - int max_ind_j = count_j - 1; - int max_ind_i = 0; - - dj = sort_j[max_ind_j].d; - while (max_ind_j > 0 && max_di[count_i - 1] < dj) { - max_ind_j--; - - dj = sort_j[max_ind_j].d; - } - - di = sort_i[max_ind_i].d; - while (max_ind_i < count_i - 1 && max_dj[0] > di) { - max_ind_i++; - - di = sort_i[max_ind_i].d; - } + populate_max_index_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, hi_max, + hj_max, di_max, dj_min, max_index_i, max_index_j, + &first_pi, &last_pj, e); /* Limits of the outer loops. 
*/ int first_pi_loop = first_pi; @@ -1151,8 +1172,8 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, /* Take the max/min of both values calculated to work out how many particles * to read into the cache. */ - last_pj = max(last_pj, max_ind_j); - first_pi = min(first_pi, max_ind_i); + last_pj = max(last_pj, max_index_i[count_i - 1]); + first_pi = min(first_pi, max_index_j[0]); /* Read the needed particles into the two caches. */ int first_pi_align = first_pi; @@ -1166,26 +1187,25 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, if (cell_is_active(ci, e)) { - /* Loop over the parts in ci. */ - for (int pid = count_i - 1; pid >= first_pi_loop && max_ind_j >= 0; pid--) { + /* Loop over the parts in ci until nothing is within range in cj. */ + for (int pid = count_i - 1; pid >= first_pi_loop; pid--) { /* Get a hold of the ith part in ci. */ struct part *restrict pi = &parts_i[sort_i[pid].i]; if (!part_is_active(pi, e)) continue; - /* Determine the exit iteration of the interaction loop. */ - dj = sort_j[max_ind_j].d; - while (max_ind_j > 0 && max_di[pid] < dj) { - max_ind_j--; - - dj = sort_j[max_ind_j].d; - } - int exit_iteration = max_ind_j + 1; - /* Set the cache index. */ int ci_cache_idx = pid - first_pi_align; + /* Skip this particle if no particle in cj is within range of it. */ const float hi = ci_cache->h[ci_cache_idx]; + const double di_test = + sort_i[pid].d + hi * kernel_gamma + dx_max - rshift; + if (di_test < dj_min) continue; + + /* Determine the exit iteration of the interaction loop. */ + int exit_iteration = max_index_i[pid]; + const float hig2 = hi * hi * kernel_gamma2; vector pix, piy, piz; @@ -1294,26 +1314,27 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, } if (cell_is_active(cj, e)) { - /* Loop over the parts in cj. */ - for (int pjd = 0; pjd <= last_pj_loop && max_ind_i < count_i; pjd++) { + + /* Loop over the parts in cj until nothing is within range in ci. 
*/ + for (int pjd = 0; pjd <= last_pj_loop; pjd++) { /* Get a hold of the jth part in cj. */ struct part *restrict pj = &parts_j[sort_j[pjd].i]; if (!part_is_active(pj, e)) continue; - /* Determine the exit iteration of the interaction loop. */ - di = sort_i[max_ind_i].d; - while (max_ind_i < count_i - 1 && max_dj[pjd] > di) { - max_ind_i++; - - di = sort_i[max_ind_i].d; - } - int exit_iteration = max_ind_i; - /* Set the cache index. */ int cj_cache_idx = pjd; + /*TODO: rshift term. */ + /* Skip this particle if no particle in ci is within range of it. */ const float hj = cj_cache->h[cj_cache_idx]; + const double dj_test = + sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; + if (dj_test > di_max) continue; + + /* Determine the exit iteration of the interaction loop. */ + int exit_iteration = max_index_j[pjd]; + const float hjg2 = hj * hj * kernel_gamma2; vector pjx, pjy, pjz; diff --git a/src/runner_doiact_vec.h b/src/runner_doiact_vec.h index 50d0722d577c38a4cb3cce35a339795b399161fe..09dc76ef04df5d29ea32f4af24efdc09e433aa73 100644 --- a/src/runner_doiact_vec.h +++ b/src/runner_doiact_vec.h @@ -37,6 +37,7 @@ void runner_doself1_density_vec(struct runner *r, struct cell *restrict c); void runner_doself2_force_vec(struct runner *r, struct cell *restrict c); void runner_dopair1_density_vec(struct runner *r, struct cell *restrict ci, - struct cell *restrict cj); + struct cell *restrict cj, const int sid, + const double *shift); #endif /* SWIFT_RUNNER_VEC_H */ diff --git a/src/scheduler.c b/src/scheduler.c index b07c403e4ecd960b22b51f24372ca0a3420a453f..4081cde0489b1b439ceb46fc9b4e191541f15bef 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -127,7 +127,8 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) { redo = 0; /* Non-splittable task? 
*/ - if ((t->ci == NULL) || (t->type == task_type_pair && t->cj == NULL)) { + if ((t->ci == NULL) || (t->type == task_type_pair && t->cj == NULL) || + t->ci->count == 0 || (t->cj != NULL && t->cj->count == 0)) { t->type = task_type_none; t->subtype = task_subtype_none; t->cj = NULL; @@ -140,7 +141,6 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) { /* Get a handle on the cell involved. */ struct cell *ci = t->ci; - const double width = ci->dmin; /* Foreign task? */ if (ci->nodeID != s->nodeID) { @@ -149,18 +149,14 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) { } /* Is this cell even split and the task does not violate h ? */ - if (ci->split && 2.f * kernel_gamma * ci->h_max * space_stretch < width) { + if (cell_can_split_self_task(ci)) { /* Make a sub? */ - if (scheduler_dosub && /* Note division here to avoid overflow */ - (ci->count > 0 && ci->count < space_subsize / ci->count)) { + if (scheduler_dosub && ci->count < space_subsize_self) { /* convert to a self-subtask. */ t->type = task_type_sub_self; - /* Depend on local sorts on this cell. */ - if (ci->sorts != NULL) scheduler_addunlock(s, ci->sorts, t); - /* Otherwise, make tasks explicitly. 
*/ } else { @@ -172,7 +168,7 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) { while (ci->progeny[first_child] == NULL) first_child++; t->ci = ci->progeny[first_child]; for (int k = first_child + 1; k < 8; k++) - if (ci->progeny[k] != NULL) + if (ci->progeny[k] != NULL && ci->progeny[k]->count) scheduler_splittask_hydro( scheduler_addtask(s, task_type_self, t->subtype, 0, 0, ci->progeny[k], NULL), @@ -180,9 +176,9 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) { /* Make a task for each pair of progeny */ for (int j = 0; j < 8; j++) - if (ci->progeny[j] != NULL) + if (ci->progeny[j] != NULL && ci->progeny[j]->count) for (int k = j + 1; k < 8; k++) - if (ci->progeny[k] != NULL) + if (ci->progeny[k] != NULL && ci->progeny[k]->count) scheduler_splittask_hydro( scheduler_addtask(s, task_type_pair, t->subtype, sub_sid_flag[j][k], 0, ci->progeny[j], @@ -191,16 +187,6 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) { } } /* Cell is split */ - /* Otherwise, make sure the self task has a drift task */ - else { - - lock_lock(&ci->lock); - - if (ci->drift_part == NULL) - ci->drift_part = scheduler_addtask(s, task_type_drift_part, - task_subtype_none, 0, 0, ci, NULL); - lock_unlock_blind(&ci->lock); - } } /* Self interaction */ /* Pair interaction? */ @@ -221,26 +207,17 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) { double shift[3]; const int sid = space_getsid(s->space, &ci, &cj, shift); - const double width_i = ci->dmin; - const double width_j = cj->dmin; - /* Should this task be split-up? */ - if (ci->split && cj->split && - 2.f * kernel_gamma * space_stretch * ci->h_max < width_i && - 2.f * kernel_gamma * space_stretch * cj->h_max < width_j) { + if (cell_can_split_pair_task(ci) && cell_can_split_pair_task(cj)) { /* Replace by a single sub-task? 
*/ - if (scheduler_dosub && - ci->count * sid_scale[sid] < space_subsize / cj->count && + if (scheduler_dosub && /* Use division to avoid integer overflow. */ + ci->count * sid_scale[sid] < space_subsize_pair / cj->count && !sort_is_corner(sid)) { /* Make this task a sub task. */ t->type = task_type_sub_pair; - /* Depend on the sort tasks of both cells. */ - if (ci->sorts != NULL) scheduler_addunlock(s, ci->sorts, t); - if (cj->sorts != NULL) scheduler_addunlock(s, cj->sorts, t); - /* Otherwise, split it. */ } else { @@ -593,44 +570,15 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) { t->type = task_type_none; for (int j = 0; j < 8; j++) - if (ci->progeny[j] != NULL) + if (ci->progeny[j] != NULL && ci->progeny[j]->count) for (int k = 0; k < 8; k++) - if (cj->progeny[k] != NULL) { + if (cj->progeny[k] != NULL && cj->progeny[k]->count) { struct task *tl = scheduler_addtask(s, task_type_pair, t->subtype, 0, 0, ci->progeny[j], cj->progeny[k]); scheduler_splittask_hydro(tl, s); tl->flags = space_getsid(s->space, &t->ci, &t->cj, shift); } - - /* Otherwise, if not spilt, stitch-up the sorting. */ - } else { - - /* Create the drift and sort for ci. */ - lock_lock(&ci->lock); - if (ci->drift_part == NULL && ci->nodeID == engine_rank) - ci->drift_part = scheduler_addtask(s, task_type_drift_part, - task_subtype_none, 0, 0, ci, NULL); - if (ci->sorts == NULL) - ci->sorts = scheduler_addtask(s, task_type_sort, task_subtype_none, - 1 << sid, 0, ci, NULL); - else - ci->sorts->flags |= (1 << sid); - lock_unlock_blind(&ci->lock); - scheduler_addunlock(s, ci->sorts, t); - - /* Create the drift and sort for cj. 
*/ - lock_lock(&cj->lock); - if (cj->drift_part == NULL && cj->nodeID == engine_rank) - cj->drift_part = scheduler_addtask(s, task_type_drift_part, - task_subtype_none, 0, 0, cj, NULL); - if (cj->sorts == NULL) - cj->sorts = scheduler_addtask(s, task_type_sort, task_subtype_none, - 1 << sid, 0, cj, NULL); - else - cj->sorts->flags |= (1 << sid); - lock_unlock_blind(&cj->lock); - scheduler_addunlock(s, cj->sorts, t); } } /* pair interaction? */ } /* iterate over the current task. */ @@ -672,54 +620,36 @@ static void scheduler_splittask_gravity(struct task *t, struct scheduler *s) { break; } - /* Is this cell even split? */ - if (ci->split) { - - /* Make a sub? */ - if (scheduler_dosub && /* Note division here to avoid overflow */ - (ci->gcount > 0 && ci->gcount < space_subsize / ci->gcount)) { - - /* convert to a self-subtask. */ - t->type = task_type_sub_self; - - /* Make sure we have a drift task (MATTHIEU temp. fix) */ - lock_lock(&ci->lock); - if (ci->drift_gpart == NULL) - ci->drift_gpart = scheduler_addtask( - s, task_type_drift_gpart, task_subtype_none, 0, 0, ci, NULL); - lock_unlock_blind(&ci->lock); - - /* Otherwise, make tasks explicitly. */ - } else { - - /* Take a step back (we're going to recycle the current task)... */ - redo = 1; - - /* Add the self tasks. 
*/ - int first_child = 0; - while (ci->progeny[first_child] == NULL) first_child++; - t->ci = ci->progeny[first_child]; - for (int k = first_child + 1; k < 8; k++) - if (ci->progeny[k] != NULL) - scheduler_splittask_gravity( - scheduler_addtask(s, task_type_self, t->subtype, 0, 0, - ci->progeny[k], NULL), - s); - - /* Make a task for each pair of progeny */ - if (t->subtype != task_subtype_external_grav) { - for (int j = 0; j < 8; j++) - if (ci->progeny[j] != NULL) - for (int k = j + 1; k < 8; k++) - if (ci->progeny[k] != NULL) - scheduler_splittask_gravity( - scheduler_addtask(s, task_type_pair, t->subtype, - sub_sid_flag[j][k], 0, ci->progeny[j], - ci->progeny[k]), - s); - } + /* Should we split this task? */ + if (ci->split && ci->gcount > space_subsize_self_grav) { + + /* Take a step back (we're going to recycle the current task)... */ + redo = 1; + + /* Add the self tasks. */ + int first_child = 0; + while (ci->progeny[first_child] == NULL) first_child++; + t->ci = ci->progeny[first_child]; + for (int k = first_child + 1; k < 8; k++) + if (ci->progeny[k] != NULL) + scheduler_splittask_gravity( + scheduler_addtask(s, task_type_self, t->subtype, 0, 0, + ci->progeny[k], NULL), + s); + + /* Make a task for each pair of progeny */ + if (t->subtype != task_subtype_external_grav) { + for (int j = 0; j < 8; j++) + if (ci->progeny[j] != NULL) + for (int k = j + 1; k < 8; k++) + if (ci->progeny[k] != NULL) + scheduler_splittask_gravity( + scheduler_addtask(s, task_type_pair, t->subtype, + sub_sid_flag[j][k], 0, ci->progeny[j], + ci->progeny[k]), + s); } - } /* Cell is split */ + } /* Otherwise, make sure the self task has a drift task */ else { @@ -747,7 +677,7 @@ static void scheduler_splittask_gravity(struct task *t, struct scheduler *s) { } /* Should this task be split-up? 
*/ - if (ci->split && cj->split) { + if (0 && ci->split && cj->split) { // MATTHIEU: nothing here for now @@ -813,7 +743,7 @@ void scheduler_splittasks(struct scheduler *s) { /* Call the mapper on each current task. */ threadpool_map(s->threadpool, scheduler_splittasks_mapper, s->tasks, - s->nr_tasks, sizeof(struct task), 1000, s); + s->nr_tasks, sizeof(struct task), 0, s); } /** @@ -823,13 +753,14 @@ void scheduler_splittasks(struct scheduler *s) { * @param type The type of the task. * @param subtype The sub-type of the task. * @param flags The flags of the task. - * @param wait The number of unsatisfied dependencies of this task. + * @param implicit If true, only use this task to unlock dependencies, i.e. + * this task is never enqueued. * @param ci The first cell to interact. * @param cj The second cell to interact. */ struct task *scheduler_addtask(struct scheduler *s, enum task_types type, - enum task_subtypes subtype, int flags, int wait, - struct cell *ci, struct cell *cj) { + enum task_subtypes subtype, int flags, + int implicit, struct cell *ci, struct cell *cj) { #ifdef SWIFT_DEBUG_CHECKS if (ci == NULL && cj != NULL) @@ -850,11 +781,11 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type, t->type = type; t->subtype = subtype; t->flags = flags; - t->wait = wait; + t->wait = 0; t->ci = ci; t->cj = cj; t->skip = 1; /* Mark tasks as skip by default. */ - t->implicit = 0; + t->implicit = implicit; t->weight = 0; t->rank = 0; t->nr_unlock_tasks = 0; @@ -1035,9 +966,7 @@ void scheduler_reset(struct scheduler *s, int size) { if (size > s->size) { /* Free existing task lists if necessary. */ - if (s->tasks != NULL) free(s->tasks); - if (s->tasks_ind != NULL) free(s->tasks_ind); - if (s->tid_active != NULL) free(s->tid_active); + scheduler_free_tasks(s); /* Allocate the new lists. 
*/ if (posix_memalign((void *)&s->tasks, task_align, @@ -1184,11 +1113,6 @@ void scheduler_rewait_mapper(void *map_data, int num_elements, if (t->wait < 0) error("Task unlocked by more than %d tasks!", (1 << (8 * sizeof(t->wait) - 1)) - 1); - - /* Skip sort tasks that have already been performed */ - if (t->type == task_type_sort && t->flags == 0) { - error("Empty sort task encountered."); - } #endif /* Sets the waits of the dependances */ @@ -1232,7 +1156,7 @@ void scheduler_start(struct scheduler *s) { /* Re-wait the tasks. */ if (s->active_count > 1000) { threadpool_map(s->threadpool, scheduler_rewait_mapper, s->tid_active, - s->active_count, sizeof(int), 1000, s); + s->active_count, sizeof(int), 0, s); } else { scheduler_rewait_mapper(s->tid_active, s->active_count, s); } @@ -1277,14 +1201,14 @@ void scheduler_start(struct scheduler *s) { ci->ti_end_min); /* Special treatment for sort tasks */ - if (ci->ti_end_min == ti_current && t->skip && + /* if (ci->ti_end_min == ti_current && t->skip && t->type == task_type_sort && t->flags == 0) error( "Task (type='%s/%s') should not have been skipped " "ti_current=%lld " "c->ti_end_min=%lld t->flags=%d", taskID_names[t->type], subtaskID_names[t->subtype], ti_current, - ci->ti_end_min, t->flags); + ci->ti_end_min, t->flags); */ } else { /* pair */ @@ -1308,7 +1232,7 @@ void scheduler_start(struct scheduler *s) { /* Loop over the tasks and enqueue whoever is ready. */ if (s->active_count > 1000) { threadpool_map(s->threadpool, scheduler_enqueue_mapper, s->tid_active, - s->active_count, sizeof(int), 1000, s); + s->active_count, sizeof(int), 0, s); } else { scheduler_enqueue_mapper(s->tid_active, s->active_count, s); } @@ -1338,6 +1262,10 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { /* If this is an implicit task, just pretend it's done. 
*/ if (t->implicit) { +#ifdef SWIFT_DEBUG_CHECKS + t->ti_run = s->space->e->ti_current; +#endif + t->skip = 1; for (int j = 0; j < t->nr_unlock_tasks; j++) { struct task *t2 = t->unlock_tasks[j]; if (atomic_dec(&t2->wait) == 1) scheduler_enqueue(s, t2); @@ -1417,11 +1345,6 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { } else if (t->subtype == task_subtype_xv || t->subtype == task_subtype_rho || t->subtype == task_subtype_gradient) { -#ifdef SWIFT_DEBUG_CHECKS - for (int k = 0; k < t->ci->count; k++) - if (t->ci->parts[k].ti_drift != s->space->e->ti_current) - error("Sending un-drifted particle !"); -#endif err = MPI_Isend(t->ci->parts, t->ci->count, part_mpi_type, t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); // message( "sending %i parts with tag=%i from %i to %i." , @@ -1719,11 +1642,29 @@ void scheduler_print_tasks(const struct scheduler *s, const char *fileName) { */ void scheduler_clean(struct scheduler *s) { - free(s->tasks); - free(s->tasks_ind); + scheduler_free_tasks(s); free(s->unlocks); free(s->unlock_ind); - free(s->tid_active); for (int i = 0; i < s->nr_queues; ++i) queue_clean(&s->queues[i]); free(s->queues); } + +/** + * @brief Free the task arrays allocated by this #scheduler. + */ +void scheduler_free_tasks(struct scheduler *s) { + + if (s->tasks != NULL) { + free(s->tasks); + s->tasks = NULL; + } + if (s->tasks_ind != NULL) { + free(s->tasks_ind); + s->tasks_ind = NULL; + } + if (s->tid_active != NULL) { + free(s->tid_active); + s->tid_active = NULL; + } + s->size = 0; +} diff --git a/src/scheduler.h b/src/scheduler.h index 7bf9a40e7cec89eb25dfa6ce7a56912bf3a9e639..ac654580b2af2ffb506dc3fd9f0b988b89effbd0 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -52,7 +52,7 @@ /* Flags . */ #define scheduler_flag_none 0 -#define scheduler_flag_steal 1 +#define scheduler_flag_steal (1 << 1) /* Data of a scheduler. 
*/ struct scheduler { @@ -133,8 +133,8 @@ void scheduler_reset(struct scheduler *s, int nr_tasks); void scheduler_ranktasks(struct scheduler *s); void scheduler_reweight(struct scheduler *s, int verbose); struct task *scheduler_addtask(struct scheduler *s, enum task_types type, - enum task_subtypes subtype, int flags, int wait, - struct cell *ci, struct cell *cj); + enum task_subtypes subtype, int flags, + int implicit, struct cell *ci, struct cell *cj); void scheduler_splittasks(struct scheduler *s); struct task *scheduler_done(struct scheduler *s, struct task *t); struct task *scheduler_unlock(struct scheduler *s, struct task *t); @@ -143,5 +143,6 @@ void scheduler_set_unlocks(struct scheduler *s); void scheduler_dump_queue(struct scheduler *s); void scheduler_print_tasks(const struct scheduler *s, const char *fileName); void scheduler_clean(struct scheduler *s); +void scheduler_free_tasks(struct scheduler *s); #endif /* SWIFT_SCHEDULER_H */ diff --git a/src/serial_io.c b/src/serial_io.c index a7e342f0a90fcf4c57f334526ff91b1923de4585..eb1e0e23fb34fd8d6a21230d9e38cfe82c47df1d 100644 --- a/src/serial_io.c +++ b/src/serial_io.c @@ -59,19 +59,15 @@ * @brief Reads a data array from a given HDF5 group. * * @param grp The group from which to read. - * @param name The name of the array to read. - * @param type The #DATA_TYPE of the attribute. - * @param N The number of particles. - * @param dim The dimension of the data (1 for scalar, 3 for vector) - * @param part_c A (char*) pointer on the first occurrence of the field of - *interest in the parts array - * @param partSize The size in bytes of the particle structure. - * @param importance If COMPULSORY, the data must be present in the IC file. If - *OPTIONAL, the array will be zeroed when the data is not present. + * @param props The #io_props of the field to read + * @param N The number of particles to read on this rank. + * @param N_total The total number of particles on all ranks. 
+ * @param offset The offset position where this rank starts reading. + * @param internal_units The #unit_system used internally + * @param ic_units The #unit_system used in the ICs * * @todo A better version using HDF5 hyper-slabs to read the file directly into - *the part array - * will be written once the structures have been stabilized. + * the part array will be written once the structures have been stabilized. */ void readArray(hid_t grp, const struct io_props props, size_t N, long long N_total, long long offset, @@ -274,16 +270,17 @@ void prepareArray(struct engine* e, hid_t grp, char* fileName, FILE* xmfFile, * @param fileName The name of the file in which the data is written * @param xmfFile The FILE used to write the XMF description * @param partTypeGroupName The name of the group containing the particles in - *the HDF5 file. - * @param name The name of the array to write. - * @param type The #DATA_TYPE of the array. + * the HDF5 file. + * @param props The #io_props of the field to read * @param N The number of particles to write. - * @param dim The dimension of the data (1 for scalar, 3 for vector) - * @param part_c A (char*) pointer on the first occurrence of the field of - *interest in the parts array - * @param partSize The size in bytes of the particle structure. - * @param us The unit_system currently in use - * @param convFactor The UnitConversionFactor for this arrayo + * @param N_total The total number of particles on all ranks. + * @param offset The offset position where this rank starts writing. + * @param mpi_rank The MPI rank of this node + * @param internal_units The #unit_system used internally + * @param snapshot_units The #unit_system used in the snapshots + * + * @todo A better version using HDF5 hyper-slabs to write the file directly from + * the part array will be written once the structures have been stabilized. 
*/ void writeArray(struct engine* e, hid_t grp, char* fileName, FILE* xmfFile, char* partTypeGroupName, const struct io_props props, size_t N, @@ -741,7 +738,7 @@ void write_output_serial(struct engine* e, const char* baseName, /* File name */ char fileName[FILENAME_BUFFER_SIZE]; - snprintf(fileName, FILENAME_BUFFER_SIZE, "%s_%03i.hdf5", baseName, + snprintf(fileName, FILENAME_BUFFER_SIZE, "%s_%04i.hdf5", baseName, outputCount); /* Compute offset in the file and total number of particles */ diff --git a/src/single_io.c b/src/single_io.c index 0b091a5997504e5f5a4cc3b8af7ca06c994e993c..194563352dff5570b8703f828fac95bccbf7409f 100644 --- a/src/single_io.c +++ b/src/single_io.c @@ -64,8 +64,7 @@ * @param ic_units The #unit_system used in the ICs * * @todo A better version using HDF5 hyper-slabs to read the file directly into - *the part array - * will be written once the structures have been stabilized. + * the part array will be written once the structures have been stabilized. */ void readArray(hid_t h_grp, const struct io_props prop, size_t N, const struct unit_system* internal_units, @@ -607,7 +606,7 @@ void write_output_single(struct engine* e, const char* baseName, /* File name */ char fileName[FILENAME_BUFFER_SIZE]; - snprintf(fileName, FILENAME_BUFFER_SIZE, "%s_%03i.hdf5", baseName, + snprintf(fileName, FILENAME_BUFFER_SIZE, "%s_%04i.hdf5", baseName, outputCount); /* First time, we need to create the XMF file */ diff --git a/src/space.c b/src/space.c index b1612876b6339fb29648a87e9aec93a1d8f64664..52a34248cd9bf38e03e476e4937fa601b0ee9222 100644 --- a/src/space.c +++ b/src/space.c @@ -60,9 +60,10 @@ /* Split size. 
*/ int space_splitsize = space_splitsize_default; -int space_subsize = space_subsize_default; +int space_subsize_pair = space_subsize_pair_default; +int space_subsize_self = space_subsize_self_default; +int space_subsize_self_grav = space_subsize_self_grav_default; int space_maxsize = space_maxsize_default; -int space_maxcount = space_maxcount_default; /** * @brief Interval stack necessary for parallel particle sorting. @@ -214,6 +215,8 @@ void space_rebuild_recycle_mapper(void *map_data, int num_elements, c->scount = 0; c->init_grav = NULL; c->extra_ghost = NULL; + c->ghost_in = NULL; + c->ghost_out = NULL; c->ghost = NULL; c->kick1 = NULL; c->kick2 = NULL; @@ -227,10 +230,15 @@ void space_rebuild_recycle_mapper(void *map_data, int num_elements, c->grav_long_range = NULL; c->grav_down = NULL; c->super = c; - if (c->sort != NULL) { - free(c->sort); - c->sort = NULL; - } + c->parts = NULL; + c->xparts = NULL; + c->gparts = NULL; + c->sparts = NULL; + for (int i = 0; i < 13; i++) + if (c->sort[i] != NULL) { + free(c->sort[i]); + c->sort[i] = NULL; + } #if WITH_MPI c->recv_xv = NULL; c->recv_rho = NULL; @@ -245,6 +253,15 @@ void space_rebuild_recycle_mapper(void *map_data, int num_elements, } } +/** + * @brief Free up any allocated cells. + */ +void space_free_cells(struct space *s) { + threadpool_map(&s->e->threadpool, space_rebuild_recycle_mapper, s->cells_top, + s->nr_cells, sizeof(struct cell), 0, s); + s->maxdepth = 0; +} + /** * @brief Re-build the top-level cell grid. * @@ -308,14 +325,22 @@ void space_regrid(struct space *s, int verbose) { "small,\n" " - the (minimal) time-step is too large leading to particles with " "predicted smoothing lengths too large for the box size,\n" - " - particle with velocities so large that they move by more than two " + " - particles with velocities so large that they move by more than two " "box sizes per time-step.\n"); - /* Check if we have enough cells for gravity. 
*/ - if (s->gravity && (cdim[0] < 8 || cdim[1] < 8 || cdim[2] < 8)) + /* Check if we have enough cells for periodic gravity. */ + if (s->gravity && s->periodic && (cdim[0] < 8 || cdim[1] < 8 || cdim[2] < 8)) error( - "Must have at least 8 cells in each spatial dimension when gravity " - "is switched on."); + "Must have at least 8 cells in each spatial dimension when periodic " + "gravity is switched on.\nThis error is often caused by any of the " + "followings:\n" + " - too few particles to generate a sensible grid,\n" + " - the initial value of 'Scheduler:max_top_level_cells' is too " + "small,\n" + " - the (minimal) time-step is too large leading to particles with " + "predicted smoothing lengths too large for the box size,\n" + " - particles with velocities so large that they move by more than two " + "box sizes per time-step.\n"); /* In MPI-Land, changing the top-level cell size requires that the * global partition is recomputed and the particles redistributed. @@ -357,19 +382,21 @@ void space_regrid(struct space *s, int verbose) { /* Be verbose about this. */ #ifdef SWIFT_DEBUG_CHECKS - message("re)griding space cdim=(%d %d %d)", cdim[0], cdim[1], cdim[2]); + message("(re)griding space cdim=(%d %d %d)", cdim[0], cdim[1], cdim[2]); fflush(stdout); #endif /* Free the old cells, if they were allocated. */ if (s->cells_top != NULL) { - threadpool_map(&s->e->threadpool, space_rebuild_recycle_mapper, - s->cells_top, s->nr_cells, sizeof(struct cell), 100, s); + space_free_cells(s); free(s->cells_top); free(s->multipoles_top); - s->maxdepth = 0; } + /* Also free the task arrays, these will be regenerated and we can use the + * memory while copying the particle arrays. */ + if (s->e != NULL) scheduler_free_tasks(&s->e->sched); + /* Set the new cell dimensions only if smaller. */ for (int k = 0; k < 3; k++) { s->cdim[k] = cdim[k]; @@ -476,9 +503,7 @@ void space_regrid(struct space *s, int verbose) { else { /* Otherwise, just clean up the cells. 
*/ /* Free the old cells, if they were allocated. */ - threadpool_map(&s->e->threadpool, space_rebuild_recycle_mapper, - s->cells_top, s->nr_cells, sizeof(struct cell), 100, s); - s->maxdepth = 0; + space_free_cells(s); } if (verbose) @@ -499,7 +524,7 @@ void space_rebuild(struct space *s, int verbose) { /* Be verbose about this. */ #ifdef SWIFT_DEBUG_CHECKS - if (s->e->nodeID == 0 || verbose) message("re)building space"); + if (s->e->nodeID == 0 || verbose) message("(re)building space"); fflush(stdout); #endif @@ -910,14 +935,16 @@ void space_rebuild(struct space *s, int verbose) { c->ti_old_part = ti_old; c->ti_old_gpart = ti_old; c->ti_old_multipole = ti_old; - c->parts = finger; - c->xparts = xfinger; - c->gparts = gfinger; - c->sparts = sfinger; - finger = &finger[c->count]; - xfinger = &xfinger[c->count]; - gfinger = &gfinger[c->gcount]; - sfinger = &sfinger[c->scount]; + if (c->nodeID == engine_rank) { + c->parts = finger; + c->xparts = xfinger; + c->gparts = gfinger; + c->sparts = sfinger; + finger = &finger[c->count]; + xfinger = &xfinger[c->count]; + gfinger = &gfinger[c->gcount]; + sfinger = &sfinger[c->scount]; + } } // message( "hooking up cells took %.3f %s." , // clocks_from_ticks(getticks() - tic), clocks_getunit()); @@ -954,7 +981,7 @@ void space_split(struct space *s, struct cell *cells, int nr_cells, const ticks tic = getticks(); threadpool_map(&s->e->threadpool, space_split_mapper, cells, nr_cells, - sizeof(struct cell), 1, s); + sizeof(struct cell), 0, s); if (verbose) message("took %.3f %s.", clocks_from_ticks(getticks() - tic), @@ -962,28 +989,33 @@ void space_split(struct space *s, struct cell *cells, int nr_cells, } /** - * @brief Runs through the top-level cells and checks whether tasks associated - * with them can be split. If not, try to sanitize the cells. + * @brief #threadpool mapper function to sanitize the cells * - * @param s The #space to act upon. + * @param map_data Pointers towards the top-level cells. 
+ * @param num_cells The number of top-level cells. + * @param extra_data Unused parameters. */ -void space_sanitize(struct space *s) { - - s->sanitized = 1; +void space_sanitize_mapper(void *map_data, int num_cells, void *extra_data) { + /* Unpack the inputs. */ + struct cell *cells_top = (struct cell *)map_data; - for (int k = 0; k < s->nr_cells; k++) { + for (int ind = 0; ind < num_cells; ind++) { + struct cell *c = &cells_top[ind]; + cell_sanitize(c, 0); + } +} - struct cell *c = &s->cells_top[k]; - const double min_width = c->dmin; +/** + * @brief Runs through the top-level cells and sanitize their h values + * + * @param s The #space to act upon. + */ +void space_sanitize(struct space *s) { - /* Do we have a problem ? */ - if (c->h_max * kernel_gamma * space_stretch > min_width * 0.5 && - c->count > space_maxcount) { + if (s->e->nodeID == 0) message("Cleaning up unreasonable values of h"); - /* Ok, clean-up the mess */ - cell_sanitize(c); - } - } + threadpool_map(&s->e->threadpool, space_sanitize_mapper, s->cells_top, + s->nr_cells, sizeof(struct cell), 0, NULL); } /** @@ -1166,7 +1198,7 @@ void space_parts_get_cell_index(struct space *s, int *ind, struct cell *cells, data.ind = ind; threadpool_map(&s->e->threadpool, space_parts_get_cell_index_mapper, s->parts, - s->nr_parts, sizeof(struct part), 1000, &data); + s->nr_parts, sizeof(struct part), 0, &data); if (verbose) message("took %.3f %s.", clocks_from_ticks(getticks() - tic), @@ -1193,7 +1225,7 @@ void space_gparts_get_cell_index(struct space *s, int *gind, struct cell *cells, data.ind = gind; threadpool_map(&s->e->threadpool, space_gparts_get_cell_index_mapper, - s->gparts, s->nr_gparts, sizeof(struct gpart), 1000, &data); + s->gparts, s->nr_gparts, sizeof(struct gpart), 0, &data); if (verbose) message("took %.3f %s.", clocks_from_ticks(getticks() - tic), @@ -1220,7 +1252,7 @@ void space_sparts_get_cell_index(struct space *s, int *sind, struct cell *cells, data.ind = sind; 
threadpool_map(&s->e->threadpool, space_sparts_get_cell_index_mapper, - s->sparts, s->nr_sparts, sizeof(struct spart), 1000, &data); + s->sparts, s->nr_sparts, sizeof(struct spart), 0, &data); if (verbose) message("took %.3f %s.", clocks_from_ticks(getticks() - tic), @@ -1783,10 +1815,11 @@ void space_gparts_sort_mapper(void *map_data, int num_elements, */ void space_map_clearsort(struct cell *c, void *data) { - if (c->sort != NULL) { - free(c->sort); - c->sort = NULL; - } + for (int i = 0; i < 13; i++) + if (c->sort[i] != NULL) { + free(c->sort[i]); + c->sort[i] = NULL; + } } /** @@ -2284,7 +2317,8 @@ void space_recycle(struct space *s, struct cell *c) { error("Failed to destroy spinlocks."); /* Clear this cell's sort arrays. */ - if (c->sort != NULL) free(c->sort); + for (int i = 0; i < 13; i++) + if (c->sort[i] != NULL) free(c->sort[i]); /* Lock the space. */ lock_lock(&s->lock); @@ -2336,7 +2370,8 @@ void space_recycle_list(struct space *s, struct cell *cell_list_begin, error("Failed to destroy spinlocks."); /* Clear this cell's sort arrays. */ - if (c->sort != NULL) free(c->sort); + for (int i = 0; i < 13; i++) + if (c->sort[i] != NULL) free(c->sort[i]); /* Count this cell. 
*/ count += 1; @@ -2480,7 +2515,7 @@ void space_synchronize_particle_positions(struct space *s) { (s->nr_gparts > 0 && s->nr_sparts > 0)) threadpool_map(&s->e->threadpool, space_synchronize_particle_positions_mapper, s->gparts, - s->nr_gparts, sizeof(struct gpart), 1000, (void *)s); + s->nr_gparts, sizeof(struct gpart), 0, (void *)s); } /** @@ -2630,7 +2665,6 @@ void space_init(struct space *s, const struct swift_params *params, s->dim[0] = dim[0]; s->dim[1] = dim[1]; s->dim[2] = dim[2]; - s->sanitized = 0; s->periodic = periodic; s->gravity = gravity; s->nr_parts = Npart; @@ -2677,15 +2711,21 @@ void space_init(struct space *s, const struct swift_params *params, /* Get the constants for the scheduler */ space_maxsize = parser_get_opt_param_int(params, "Scheduler:cell_max_size", space_maxsize_default); - space_subsize = parser_get_opt_param_int(params, "Scheduler:cell_sub_size", - space_subsize_default); + space_subsize_pair = parser_get_opt_param_int( + params, "Scheduler:cell_sub_size_pair", space_subsize_pair_default); + space_subsize_self = parser_get_opt_param_int( + params, "Scheduler:cell_sub_size_self", space_subsize_self_default); + space_subsize_self_grav = + parser_get_opt_param_int(params, "Scheduler:cell_sub_size_self_grav", + space_subsize_self_grav_default); space_splitsize = parser_get_opt_param_int( params, "Scheduler:cell_split_size", space_splitsize_default); - space_maxcount = parser_get_opt_param_int(params, "Scheduler:cell_max_count", - space_maxcount_default); + if (verbose) - message("max_size set to %d, sub_size set to %d, split_size set to %d", - space_maxsize, space_subsize, space_splitsize); + message( + "max_size set to %d, sub_size_pair set to %d, sub_size_self set to %d, " + "split_size set to %d", + space_maxsize, space_subsize_pair, space_subsize_self, space_splitsize); /* Apply h scaling */ const double scaling = diff --git a/src/space.h b/src/space.h index 
e8e8600349c97ff8a60f0fcf2964d6ec514a7589..dbbba714c2b3c9841905b2ba54e4f2d854b820a6 100644 --- a/src/space.h +++ b/src/space.h @@ -30,19 +30,22 @@ #include <stddef.h> /* Includes. */ -#include "cell.h" #include "hydro_space.h" #include "lock.h" #include "parser.h" #include "part.h" #include "space.h" +/* Avoid cyclic inclusions */ +struct cell; + /* Some constants. */ #define space_cellallocchunk 1000 #define space_splitsize_default 400 #define space_maxsize_default 8000000 -#define space_subsize_default 64000000 -#define space_maxcount_default 10000 +#define space_subsize_pair_default 256000000 +#define space_subsize_self_default 32000 +#define space_subsize_self_grav_default 32000 #define space_max_top_level_cells_default 12 #define space_stretch 1.10f #define space_maxreldx 0.1f @@ -53,8 +56,9 @@ /* Split size. */ extern int space_splitsize; extern int space_maxsize; -extern int space_subsize; -extern int space_maxcount; +extern int space_subsize_pair; +extern int space_subsize_self; +extern int space_subsize_self_grav; /** * @brief The space in which the cells and particles reside. @@ -139,9 +143,6 @@ struct space { /*! Number of queues in the system. */ int nr_queues; - /*! Has this space already been sanitized ? */ - int sanitized; - /*! The associated engine. 
*/ struct engine *e; @@ -225,5 +226,6 @@ void space_check_timesteps(struct space *s); void space_replicate(struct space *s, int replicate, int verbose); void space_reset_task_counters(struct space *s); void space_clean(struct space *s); +void space_free_cells(struct space *s); #endif /* SWIFT_SPACE_H */ diff --git a/src/statistics.c b/src/statistics.c index 57d60bcb1b247c9616c859b7ac8a475acdcd878f..5a3f1ff4f9a2232a14817e7e55fd2cff5bdcd80e 100644 --- a/src/statistics.c +++ b/src/statistics.c @@ -271,12 +271,12 @@ void stats_collect(const struct space *s, struct statistics *stats) { /* Run parallel collection of statistics for parts */ if (s->nr_parts > 0) threadpool_map(&s->e->threadpool, stats_collect_part_mapper, s->parts, - s->nr_parts, sizeof(struct part), 10000, &extra_data); + s->nr_parts, sizeof(struct part), 0, &extra_data); /* Run parallel collection of statistics for gparts */ if (s->nr_gparts > 0) threadpool_map(&s->e->threadpool, stats_collect_gpart_mapper, s->gparts, - s->nr_gparts, sizeof(struct gpart), 10000, &extra_data); + s->nr_gparts, sizeof(struct gpart), 0, &extra_data); } /** diff --git a/src/swift.h b/src/swift.h index 20397eb24df478cba65a0e35d686b402f1d8ee70..1d1a7c7d04b3662c524504c292aa7d9eee2c3d09 100644 --- a/src/swift.h +++ b/src/swift.h @@ -57,6 +57,7 @@ #include "sourceterms.h" #include "space.h" #include "task.h" +#include "threadpool.h" #include "timeline.h" #include "timers.h" #include "tools.h" diff --git a/src/task.h b/src/task.h index 052f3e8036381441e283d3f7847d09e98ec1dac2..dee888c9f16dd69785a31371da15078af4e0af0c 100644 --- a/src/task.h +++ b/src/task.h @@ -36,6 +36,7 @@ * @brief The different task types. * * Be sure to update the taskID_names array in tasks.c if you modify this list! + * Also update the python task plotting scripts! */ enum task_types { task_type_none = 0, @@ -162,6 +163,9 @@ struct task { /*! ID of the queue or runner owning this task */ short int rid; + /*! 
Information about the direction of the pair task */ + short int sid; + /*! Start and end time of this task */ ticks tic, toc; #endif diff --git a/src/threadpool.c b/src/threadpool.c index c11fd8121bb02f36fce1796d79a7eb55a38102c4..465756f71d88df81921a880edf8cdb1ee17f6026 100644 --- a/src/threadpool.c +++ b/src/threadpool.c @@ -26,13 +26,139 @@ #include <math.h> #include <stdlib.h> #include <string.h> +#ifdef SWIFT_DEBUG_THREADPOOL +#include <dlfcn.h> +#endif /* This object's header. */ #include "threadpool.h" /* Local headers. */ #include "atomic.h" +#include "clocks.h" #include "error.h" +#include "minmax.h" + +#ifdef SWIFT_DEBUG_THREADPOOL +/** + * @brief Store a log entry of the given chunk. + */ +void threadpool_log(struct threadpool *tp, int tid, size_t chunk_size, + ticks tic, ticks toc) { + struct mapper_log *log = &tp->logs[tid > 0 ? tid : 0]; + + /* Check if we need to re-allocate the log buffer. */ + if (log->count == log->size) { + log->size *= 2; + struct mapper_log_entry *new_log; + if ((new_log = (struct mapper_log_entry *)malloc( + sizeof(struct mapper_log_entry) * log->size)) == NULL) + error("Failed to re-allocate mapper log."); + memcpy(new_log, log->log, sizeof(struct mapper_log_entry) * log->count); + free(log->log); + log->log = new_log; + } + + /* Store the new entry. */ + struct mapper_log_entry *entry = &log->log[log->count]; + entry->tid = tid; + entry->chunk_size = chunk_size; + entry->tic = tic; + entry->toc = toc; + entry->map_function = tp->map_function; + log->count++; +} + +void threadpool_dump_log(struct threadpool *tp, const char *filename, + int reset) { + + /* Open the output file. */ + FILE *fd; + if ((fd = fopen(filename, "w")) == NULL) + error("Failed to create log file '%s'.", filename); + + /* Create a buffer of function names. 
*/ + const int max_names = 100; + struct name_entry { + threadpool_map_function map_function; + const char *name; + }; + struct name_entry names[max_names]; + bzero(names, sizeof(struct name_entry) * max_names); + + /* Write a header. */ + fprintf(fd, "# map_function thread_id chunk_size tic toc\n"); + fprintf(fd, "# {'num_threads': %i, 'cpufreq': %lli}\n", tp->num_threads, + clocks_get_cpufreq()); + + /* Loop over the per-tid logs and dump them. */ + for (int k = 0; k < tp->num_threads; k++) { + struct mapper_log *log = &tp->logs[k]; + + /* Loop over the log entries and dump them. */ + for (int i = 0; i < log->count; i++) { + + struct mapper_log_entry *entry = &log->log[i]; + + /* Look for the function pointer in the buffer. */ + int nid = 0; + while (nid < max_names && names[nid].map_function != entry->map_function) + nid++; + + /* If the name was not found, make a new entry. */ + if (nid == max_names) { + for (int j = 1; j < max_names; j++) names[j - 1] = names[j]; + names[0].map_function = entry->map_function; + Dl_info dl_info; + dladdr(entry->map_function, &dl_info); + names[0].name = dl_info.dli_sname; + nid = 0; + } + + /* Log a line to the file. */ + fprintf(fd, "%s %i %i %lli %lli\n", names[nid].name, entry->tid, + entry->chunk_size, entry->tic, entry->toc); + } + + /* Clear the log if requested. */ + if (reset) log->count = 0; + } + + /* Close the file. */ + fclose(fd); +} +#endif // SWIFT_DEBUG_THREADPOOL + +/** + * @brief Runner main loop, get a chunk and call the mapper function. + */ +void threadpool_chomp(struct threadpool *tp, int tid) { + + /* Loop until we can't get a chunk. */ + while (1) { + /* Desired chunk size. */ + size_t chunk_size = + (tp->map_data_size - tp->map_data_count) / (2 * tp->num_threads); + if (chunk_size > tp->map_data_chunk) chunk_size = tp->map_data_chunk; + if (chunk_size < 1) chunk_size = 1; + + /* Get a chunk and check its size. 
*/ + size_t task_ind = atomic_add(&tp->map_data_count, chunk_size); + if (task_ind >= tp->map_data_size) break; + if (task_ind + chunk_size > tp->map_data_size) + chunk_size = tp->map_data_size - task_ind; + +/* Call the mapper function. */ +#ifdef SWIFT_DEBUG_THREADPOOL + ticks tic = getticks(); +#endif + tp->map_function((char *)tp->map_data + (tp->map_data_stride * task_ind), + chunk_size, tp->map_extra_data); +#ifdef SWIFT_DEBUG_THREADPOOL + threadpool_log(tp, tid, chunk_size, tic, getticks()); +#endif + } +} void *threadpool_runner(void *data) { @@ -43,39 +169,17 @@ void *threadpool_runner(void *data) { while (1) { /* Let the controller know that this thread is waiting. */ - pthread_mutex_lock(&tp->thread_mutex); - tp->num_threads_waiting += 1; - if (tp->num_threads_waiting == tp->num_threads) { - pthread_cond_signal(&tp->control_cond); - } + pthread_barrier_wait(&tp->wait_barrier); /* Wait for the controller. */ - pthread_cond_wait(&tp->thread_cond, &tp->thread_mutex); - tp->num_threads_waiting -= 1; - tp->num_threads_running += 1; - if (tp->num_threads_running == tp->num_threads) { - pthread_cond_signal(&tp->control_cond); - } - pthread_mutex_unlock(&tp->thread_mutex); - - /* The index of the mapping task we will work on next. */ - while (1) { - /* Desired chunk size. */ - size_t chunk_size = - (tp->map_data_size - tp->map_data_count) / (2 * tp->num_threads); - if (chunk_size > tp->map_data_chunk) chunk_size = tp->map_data_chunk; - if (chunk_size < 1) chunk_size = 1; - - /* Get a chunk and check its size. */ - size_t task_ind = atomic_add(&tp->map_data_count, chunk_size); - if (task_ind >= tp->map_data_size) break; - if (task_ind + chunk_size > tp->map_data_size) - chunk_size = tp->map_data_size - task_ind; - - /* Call the mapper function. */ - tp->map_function((char *)tp->map_data + (tp->map_data_stride * task_ind), - chunk_size, tp->map_extra_data); - } + pthread_barrier_wait(&tp->run_barrier); + + /* If no map function is specified, just die. 
We use this as a mechanism + to shut down threads without leaving the barriers in an invalid state. */ + if (tp->map_function == NULL) pthread_exit(NULL); + + /* Do actual work. */ + threadpool_chomp(tp, atomic_inc(&tp->num_threads_running)); } } @@ -89,18 +193,28 @@ void threadpool_init(struct threadpool *tp, int num_threads) { /* Initialize the thread counters. */ tp->num_threads = num_threads; - tp->num_threads_waiting = 0; + +#ifdef SWIFT_DEBUG_THREADPOOL + if ((tp->logs = (struct mapper_log *)malloc(sizeof(struct mapper_log) * + num_threads)) == NULL) + error("Failed to allocate mapper logs."); + for (int k = 0; k < num_threads; k++) { + tp->logs[k].size = threadpool_log_initial_size; + tp->logs[k].count = 0; + if ((tp->logs[k].log = (struct mapper_log_entry *)malloc( + sizeof(struct mapper_log_entry) * tp->logs[k].size)) == NULL) + error("Failed to allocate mapper log."); + } +#endif /* If there is only a single thread, do nothing more as of here as we will just do work in the (blocked) calling thread. */ if (num_threads == 1) return; - /* Init the threadpool mutexes. */ - if (pthread_mutex_init(&tp->thread_mutex, NULL) != 0) - error("Failed to initialize mutexex."); - if (pthread_cond_init(&tp->control_cond, NULL) != 0 || - pthread_cond_init(&tp->thread_cond, NULL) != 0) - error("Failed to initialize condition variables."); + /* Init the barriers. */ + if (pthread_barrier_init(&tp->wait_barrier, NULL, num_threads) != 0 || + pthread_barrier_init(&tp->run_barrier, NULL, num_threads) != 0) + error("Failed to initialize barriers."); /* Set the task counter to zero. */ tp->map_data_size = 0; @@ -109,24 +223,21 @@ void threadpool_init(struct threadpool *tp, int num_threads) { tp->map_data_chunk = 0; tp->map_function = NULL; - /* Allocate the threads. */ - if ((tp->threads = (pthread_t *)malloc(sizeof(pthread_t) * num_threads)) == - NULL) { + /* Allocate the threads, one less than requested since the calling thread + works as well. 
*/ + if ((tp->threads = (pthread_t *)malloc(sizeof(pthread_t) * + (num_threads - 1))) == NULL) { error("Failed to allocate thread array."); } /* Create and start the threads. */ - pthread_mutex_lock(&tp->thread_mutex); - for (int k = 0; k < num_threads; k++) { + for (int k = 0; k < num_threads - 1; k++) { if (pthread_create(&tp->threads[k], NULL, &threadpool_runner, tp) != 0) error("Failed to create threadpool runner thread."); } /* Wait for all the threads to be up and running. */ - while (tp->num_threads_waiting < tp->num_threads) { - pthread_cond_wait(&tp->control_cond, &tp->thread_mutex); - } - pthread_mutex_unlock(&tp->thread_mutex); + pthread_barrier_wait(&tp->wait_barrier); } /** @@ -140,7 +251,8 @@ void threadpool_init(struct threadpool *tp, int num_threads) { * @param map_data The data on which the mapping function will be called. * @param N Number of elements in @c map_data. * @param stride Size, in bytes, of each element of @c map_data. - * @param chunk Number of map data elements to pass to the function at a time. + * @param chunk Number of map data elements to pass to the function at a time, + * or zero to choose the number automatically. * @param extra_data Addtitional pointer that will be passed to the mapping * function, may contain additional data. */ @@ -148,37 +260,86 @@ void threadpool_map(struct threadpool *tp, threadpool_map_function map_function, void *map_data, size_t N, int stride, int chunk, void *extra_data) { +#ifdef SWIFT_DEBUG_THREADPOOL + ticks tic = getticks(); +#endif + /* If we just have a single thread, call the map function directly. */ if (tp->num_threads == 1) { map_function(map_data, N, extra_data); +#ifdef SWIFT_DEBUG_THREADPOOL + tp->map_function = map_function; + threadpool_log(tp, 0, N, tic, getticks()); +#endif return; } /* Set the map data and signal the threads. 
*/ - pthread_mutex_lock(&tp->thread_mutex); tp->map_data_stride = stride; tp->map_data_size = N; tp->map_data_count = 0; - tp->map_data_chunk = chunk; + tp->map_data_chunk = + chunk ? chunk + : max((int)(N / (tp->num_threads * threadpool_default_chunk_ratio)), + 1); tp->map_function = map_function; tp->map_data = map_data; tp->map_extra_data = extra_data; tp->num_threads_running = 0; - pthread_cond_broadcast(&tp->thread_cond); /* Wait for all the threads to be up and running. */ - while (tp->num_threads_running < tp->num_threads) { - pthread_cond_wait(&tp->control_cond, &tp->thread_mutex); - } + pthread_barrier_wait(&tp->run_barrier); + + /* Do some work while I'm at it. */ + threadpool_chomp(tp, tp->num_threads - 1); /* Wait for all threads to be done. */ - while (tp->num_threads_waiting < tp->num_threads) { - pthread_cond_wait(&tp->control_cond, &tp->thread_mutex); - } - pthread_mutex_unlock(&tp->thread_mutex); + pthread_barrier_wait(&tp->wait_barrier); + +#ifdef SWIFT_DEBUG_THREADPOOL + /* Log the total call time to thread id -1. */ + threadpool_log(tp, -1, N, tic, getticks()); +#endif } +/** + * @brief Re-sets the log for this #threadpool. + */ +#ifdef SWIFT_DEBUG_THREADPOOL +void threadpool_reset_log(struct threadpool *tp) { + for (int k = 0; k < tp->num_threads; k++) tp->logs[k].count = 0; +} +#endif + /** * @brief Frees up the memory allocated for this #threadpool. */ -void threadpool_clean(struct threadpool *tp) { free(tp->threads); } +void threadpool_clean(struct threadpool *tp) { + + if (tp->num_threads > 1) { + /* Destroy the runner threads by calling them with a NULL mapper function + * and waiting for all the threads to terminate. This ensures that no + * thread is still waiting at a barrier. */ + tp->map_function = NULL; + pthread_barrier_wait(&tp->run_barrier); + for (int k = 0; k < tp->num_threads - 1; k++) { + void *retval; + pthread_join(tp->threads[k], &retval); + } + + /* Release the barriers. 
*/ + if (pthread_barrier_destroy(&tp->wait_barrier) != 0 || + pthread_barrier_destroy(&tp->run_barrier) != 0) + error("Failed to destroy threadpool barriers."); + + /* Clean up memory. */ + free(tp->threads); + } + +#ifdef SWIFT_DEBUG_THREADPOOL + for (int k = 0; k < tp->num_threads; k++) { + free(tp->logs[k].log); + } + free(tp->logs); +#endif +} diff --git a/src/threadpool.h b/src/threadpool.h index f9c7eeffb700adc579ec05902193b888cdd6363d..019403f658a22d36c4a6e1ec1ae1fdc47c62658d 100644 --- a/src/threadpool.h +++ b/src/threadpool.h @@ -25,10 +25,44 @@ /* Some standard headers. */ #include <pthread.h> +/* Local includes. */ +#include "cycle.h" + +/* Local defines. */ +#define threadpool_log_initial_size 1000 +#define threadpool_default_chunk_ratio 7 + /* Function type for mappings. */ typedef void (*threadpool_map_function)(void *map_data, int num_elements, void *extra_data); +/* Data for threadpool logging. */ +struct mapper_log_entry { + + /* ID of the thread executing the chunk. */ + int tid; + + /* Size of the chunk processed. */ + int chunk_size; + + /* Pointer to the mapper function. */ + threadpool_map_function map_function; + + /*! Start and end time of this task */ + ticks tic, toc; +}; + +struct mapper_log { + /* Log of threadpool mapper calls. */ + struct mapper_log_entry *log; + + /* Size of the allocated log. */ + int size; + + /* Number of entries in the log. */ + int count; +}; + /* Data of a threadpool. */ struct threadpool { @@ -36,8 +70,8 @@ struct threadpool { pthread_t *threads; /* This is where threads go to rest. */ - pthread_mutex_t thread_mutex; - pthread_cond_t control_cond, thread_cond; + pthread_barrier_t wait_barrier; + pthread_barrier_t run_barrier; /* Current map data and count. */ void *map_data, *map_extra_data; @@ -49,7 +83,11 @@ struct threadpool { int num_threads; /* Counter for the number of threads that are done. 
*/ - volatile int num_threads_waiting, num_threads_running; + volatile int num_threads_running; + +#ifdef SWIFT_DEBUG_THREADPOOL + struct mapper_log *logs; +#endif }; /* Function prototypes. */ @@ -58,5 +96,10 @@ void threadpool_map(struct threadpool *tp, threadpool_map_function map_function, void *map_data, size_t N, int stride, int chunk, void *extra_data); void threadpool_clean(struct threadpool *tp); +#ifdef SWIFT_DEBUG_THREADPOOL +void threadpool_reset_log(struct threadpool *tp); +void threadpool_dump_log(struct threadpool *tp, const char *filename, + int reset); +#endif #endif /* SWIFT_THREADPOOL_H */ diff --git a/src/tools.c b/src/tools.c index 73684c82662870d368f7dd360c84635654f06434..7d69ebc6c476312081d8a8c34c76c6592da5cab0 100644 --- a/src/tools.c +++ b/src/tools.c @@ -32,11 +32,13 @@ #include "tools.h" /* Local includes. */ +#include "active.h" #include "cell.h" #include "error.h" #include "gravity.h" #include "hydro.h" #include "part.h" +#include "periodic.h" #include "runner.h" /** @@ -181,6 +183,8 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) { float r2, hi, hj, hig2, hjg2, dx[3]; struct part *pi, *pj; + const double dim[3] = {r->e->s->dim[0], r->e->s->dim[1], r->e->s->dim[2]}; + const struct engine *e = r->e; /* Implements a double-for loop and checks every interaction */ for (int i = 0; i < ci->count; ++i) { @@ -189,6 +193,9 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) { hi = pi->h; hig2 = hi * hi * kernel_gamma2; + /* Skip inactive particles. 
*/ + if (!part_is_active(pi, e)) continue; + for (int j = 0; j < cj->count; ++j) { pj = &cj->parts[j]; @@ -197,6 +204,7 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) { r2 = 0.0f; for (int k = 0; k < 3; k++) { dx[k] = ci->parts[i].x[k] - cj->parts[j].x[k]; + dx[k] = nearest(dx[k], dim[k]); r2 += dx[k] * dx[k]; } @@ -216,6 +224,9 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) { hj = pj->h; hjg2 = hj * hj * kernel_gamma2; + /* Skip inactive particles. */ + if (!part_is_active(pj, e)) continue; + for (int i = 0; i < ci->count; ++i) { pi = &ci->parts[i]; @@ -224,6 +235,7 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) { r2 = 0.0f; for (int k = 0; k < 3; k++) { dx[k] = cj->parts[j].x[k] - ci->parts[i].x[k]; + dx[k] = nearest(dx[k], dim[k]); r2 += dx[k] * dx[k]; } @@ -241,6 +253,7 @@ void pairs_all_force(struct runner *r, struct cell *ci, struct cell *cj) { float r2, hi, hj, hig2, hjg2, dx[3]; struct part *pi, *pj; + const double dim[3] = {r->e->s->dim[0], r->e->s->dim[1], r->e->s->dim[2]}; /* Implements a double-for loop and checks every interaction */ for (int i = 0; i < ci->count; ++i) { @@ -259,6 +272,7 @@ void pairs_all_force(struct runner *r, struct cell *ci, struct cell *cj) { r2 = 0.0f; for (int k = 0; k < 3; k++) { dx[k] = ci->parts[i].x[k] - cj->parts[j].x[k]; + dx[k] = nearest(dx[k], dim[k]); r2 += dx[k] * dx[k]; } @@ -288,6 +302,7 @@ void pairs_all_force(struct runner *r, struct cell *ci, struct cell *cj) { r2 = 0.0f; for (int k = 0; k < 3; k++) { dx[k] = cj->parts[j].x[k] - ci->parts[i].x[k]; + dx[k] = nearest(dx[k], dim[k]); r2 += dx[k] * dx[k]; } @@ -304,6 +319,7 @@ void pairs_all_force(struct runner *r, struct cell *ci, struct cell *cj) { void self_all_density(struct runner *r, struct cell *ci) { float r2, hi, hj, hig2, hjg2, dxi[3]; //, dxj[3]; struct part *pi, *pj; + const struct engine *e = r->e; /* Implements a double-for loop and checks every interaction */ 
for (int i = 0; i < ci->count; ++i) { @@ -328,14 +344,14 @@ void self_all_density(struct runner *r, struct cell *ci) { } /* Hit or miss? */ - if (r2 < hig2) { + if (r2 < hig2 && part_is_active(pi, e)) { /* Interact */ runner_iact_nonsym_density(r2, dxi, hi, hj, pi, pj); } /* Hit or miss? */ - if (r2 < hjg2) { + if (r2 < hjg2 && part_is_active(pj, e)) { dxi[0] = -dxi[0]; dxi[1] = -dxi[1]; @@ -423,7 +439,7 @@ void pairs_single_grav(double *dim, long long int pid, fdx[i] = dx[i]; } r2 = fdx[0] * fdx[0] + fdx[1] * fdx[1] + fdx[2] * fdx[2]; - runner_iact_grav_pp(0.f, r2, fdx, &pi, &pj); + runner_iact_grav_pp(r2, fdx, &pi, &pj); a[0] += pi.a_grav[0]; a[1] += pi.a_grav[1]; a[2] += pi.a_grav[2]; @@ -748,7 +764,7 @@ void gravity_n2(struct gpart *gparts, const int gcount, const struct gravity_props *gravity_properties, float rlr) { const float rlr_inv = 1. / rlr; - const float r_cut = gravity_properties->r_cut; + const float r_cut = gravity_properties->r_cut_max; const float max_d = r_cut * rlr; const float max_d2 = max_d * max_d; @@ -783,7 +799,7 @@ void gravity_n2(struct gpart *gparts, const int gcount, if (r2 < max_d2 || 1) { /* Apply the gravitational acceleration. */ - runner_iact_grav_pp(rlr_inv, r2, dx, gpi, gpj); + runner_iact_grav_pp(r2, dx, gpi, gpj); } } } diff --git a/src/vector.h b/src/vector.h index 48b9af924b64219f6e7d85292b23a87c348f9ea4..6a7c6837989025785c1f9134004f2ebcc226a205 100644 --- a/src/vector.h +++ b/src/vector.h @@ -23,8 +23,12 @@ /* Have I already read this file? */ #ifndef VEC_MACRO +/* Config parameters. 
*/ #include "../config.h" +/* Local headers */ +#include "inline.h" + #ifdef WITH_VECTORIZATION /* Need to check whether compiler supports this (IBM does not) @@ -64,7 +68,9 @@ #define vec_sub(a, b) _mm512_sub_ps(a, b) #define vec_mask_sub(a, b, mask) _mm512_mask_sub_ps(a, mask, a, b) #define vec_mul(a, b) _mm512_mul_ps(a, b) +#define vec_div(a, b) _mm512_div_ps(a, b) #define vec_fma(a, b, c) _mm512_fmadd_ps(a, b, c) +#define vec_fnma(a, b, c) _mm512_fnmadd_ps(a, b, c) #define vec_sqrt(a) _mm512_sqrt_ps(a) #define vec_rcp(a) _mm512_rcp14_ps(a) #define vec_rsqrt(a) _mm512_rsqrt14_ps(a) @@ -77,15 +83,16 @@ #define vec_cmp_lt(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ) #define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ) #define vec_cmp_gte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ) -#define vec_cmp_result(a) a -#define vec_form_int_mask(a) a +#define vec_cmp_result(a) ({ a; }) +#define vec_form_int_mask(a) ({ a; }) #define vec_and(a, b) _mm512_and_ps(a, b) -#define vec_mask_and(a, b) a &b -#define vec_and_mask(a, mask) _mm512_maskz_expand_ps(mask, a) -#define vec_init_mask(mask) mask = 0xFFFF -#define vec_zero_mask(mask) mask = 0 -#define vec_create_mask(mask, cond) mask = cond -#define vec_pad_mask(mask, pad) mask = mask >> (pad) +#define vec_mask_and(a, b) _mm512_kand(a, b) +#define vec_and_mask(a, mask) _mm512_maskz_mov_ps(mask, a) +#define vec_init_mask_true(mask) ({ mask = 0xFFFF; }) +#define vec_zero_mask(mask) ({ mask = 0; }) +#define vec_create_mask(mask, cond) ({ mask = cond; }) +#define vec_pad_mask(mask, pad) ({ mask = mask >> (pad); }) +#define vec_blend(mask, a, b) _mm512_mask_blend_ps(mask, a, b) #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0)) #define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1)) #define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1) @@ -159,6 +166,7 @@ #define vec_sub(a, b) _mm256_sub_ps(a, b) #define vec_mask_sub(a, b, mask) vec_sub(a, vec_and(b, mask.v)) #define 
vec_mul(a, b) _mm256_mul_ps(a, b) +#define vec_div(a, b) _mm256_div_ps(a, b) #define vec_sqrt(a) _mm256_sqrt_ps(a) #define vec_rcp(a) _mm256_rcp_ps(a) #define vec_rsqrt(a) _mm256_rsqrt_ps(a) @@ -176,11 +184,12 @@ #define vec_and(a, b) _mm256_and_ps(a, b) #define vec_mask_and(a, b) _mm256_and_ps(a.v, b.v) #define vec_and_mask(a, mask) _mm256_and_ps(a, mask.v) -#define vec_init_mask(mask) mask.m = vec_setint1(0xFFFFFFFF) +#define vec_init_mask_true(mask) mask.m = vec_setint1(0xFFFFFFFF) #define vec_create_mask(mask, cond) mask.v = cond #define vec_zero_mask(mask) mask.v = vec_setzero() #define vec_pad_mask(mask, pad) \ for (int i = VEC_SIZE - (pad); i < VEC_SIZE; i++) mask.i[i] = 0 +#define vec_blend(mask, a, b) _mm256_blendv_ps(a, b, mask.v) #define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0)) #define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1)) #define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1) @@ -222,6 +231,7 @@ /* Check if we have AVX2 intrinsics alongside AVX */ #ifdef HAVE_AVX2 #define vec_fma(a, b, c) _mm256_fmadd_ps(a, b, c) +#define vec_fnma(a, b, c) _mm256_fnmadd_ps(a, b, c) /* Used in VEC_FORM_PACKED_MASK */ #define identity_indices 0x0706050403020100 @@ -250,6 +260,11 @@ #define vec_fma(a, b, c) vec_add(vec_mul(a, b), c) #endif +/* Create a negated FMA using vec_sub and vec_mul if AVX2 is not present. */ +#ifndef vec_fnma +#define vec_fnma(a, b, c) vec_sub(c, vec_mul(a, b)) +#endif + /* Form a packed mask without intrinsics if AVX2 is not present. 
*/ #ifndef VEC_FORM_PACKED_MASK @@ -313,6 +328,7 @@ #define vec_add(a, b) _mm_add_ps(a, b) #define vec_sub(a, b) _mm_sub_ps(a, b) #define vec_mul(a, b) _mm_mul_ps(a, b) +#define vec_div(a, b) _mm_div_ps(a, b) #define vec_sqrt(a) _mm_sqrt_ps(a) #define vec_rcp(a) _mm_rcp_ps(a) #define vec_rsqrt(a) _mm_rsqrt_ps(a) diff --git a/src/version.c b/src/version.c index 54a416f6b0745a523382f338fa838018e5254b1e..46c31103c953ce2ff70b9e346f88470008dd8266 100644 --- a/src/version.c +++ b/src/version.c @@ -142,10 +142,7 @@ const char *configuration_options(void) { static int initialised = 0; static const char *config = SWIFT_CONFIG_FLAGS; if (!initialised) { - if (strlen(config) < 1024 - 2) - sprintf(buf, "'%s'", config); - else - error("SWIFT_CONFIG_FLAGS string longer than buffer"); + snprintf(buf, 1024, "'%s'", config); initialised = 1; } return buf; @@ -161,10 +158,7 @@ const char *compilation_cflags(void) { static int initialised = 0; static const char *cflags = SWIFT_CFLAGS; if (!initialised) { - if (strlen(cflags) < 1024 - 2) - sprintf(buf, "'%s'", cflags); - else - error("SWIFT_CFLAGS string longer than buffer"); + snprintf(buf, 1024, "'%s'", cflags); initialised = 1; } return buf; diff --git a/src/xmf.c b/src/xmf.c index ca4ffe5157599dd5a45295dcfa59f9420753f5cf..67682b4794ade773c39a748eddf765e392c74865 100644 --- a/src/xmf.c +++ b/src/xmf.c @@ -1,6 +1,7 @@ /******************************************************************************* * This file is part of SWIFT. * Copyright (c) 2017 Matthieu Schaller (matthieu.schaller@durham.ac.uk). + * Peter W. Draper (p.w.draper@durham.ac.uk) * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published @@ -21,7 +22,9 @@ #include "../config.h" /* Some standard headers. */ +#include <libgen.h> #include <stdio.h> +#include <string.h> /* This object's header. 
*/ #include "xmf.h" @@ -30,6 +33,21 @@ #include "common_io.h" #include "error.h" +/** + * @brief Return the basename of an HDF5 path. + * + * Need basename as XML paths are relative to the container, and XMF file is + * written with the same baseName as the HDF5 snapshots. + * + * @param hdfFileName + * @return the basename part of hdfFileName. + */ +static const char* xmf_basename(const char* hdfFileName) { + static char buffer[FILENAME_BUFFER_SIZE]; + strcpy(buffer, hdfFileName); + return basename(buffer); +} + /** * @brief Prepare the XMF file corresponding to a snapshot. * @@ -135,7 +153,7 @@ void xmf_write_outputfooter(FILE* xmfFile, int output, float time) { /* Write end of the section of this time step */ fprintf(xmfFile, - "\n</Grid> <!-- End of meta-data for output=%03i, time=%f -->\n", + "\n</Grid> <!-- End of meta-data for output=%04i, time=%f -->\n", output, time); fprintf(xmfFile, "\n</Grid> <!-- timeSeries -->\n"); fprintf(xmfFile, "</Domain>\n"); @@ -154,6 +172,7 @@ void xmf_write_outputfooter(FILE* xmfFile, int output, float time) { */ void xmf_write_groupheader(FILE* xmfFile, char* hdfFileName, size_t N, enum part_type ptype) { + fprintf(xmfFile, "\n<Grid Name=\"%s\" GridType=\"Uniform\">\n", part_type_names[ptype]); fprintf(xmfFile, @@ -163,7 +182,7 @@ void xmf_write_groupheader(FILE* xmfFile, char* hdfFileName, size_t N, "<DataItem Dimensions=\"%zu 3\" NumberType=\"Double\" " "Precision=\"8\" " "Format=\"HDF\">%s:/PartType%d/Coordinates</DataItem>\n", - N, hdfFileName, (int)ptype); + N, xmf_basename(hdfFileName), (int)ptype); fprintf(xmfFile, "</Geometry>\n <!-- Done geometry for %s, start of particle fields " "list -->\n", @@ -251,13 +270,13 @@ void xmf_write_line(FILE* xmfFile, const char* fileName, fprintf(xmfFile, "<DataItem Dimensions=\"%zu\" NumberType=\"%s\" " "Precision=\"%d\" Format=\"HDF\">%s:%s/%s</DataItem>\n", - N, xmf_type(type), xmf_precision(type), fileName, partTypeGroupName, - name); + N, xmf_type(type), xmf_precision(type), 
xmf_basename(fileName), + partTypeGroupName, name); else fprintf(xmfFile, "<DataItem Dimensions=\"%zu %d\" NumberType=\"%s\" " "Precision=\"%d\" Format=\"HDF\">%s:%s/%s</DataItem>\n", - N, dim, xmf_type(type), xmf_precision(type), fileName, + N, dim, xmf_type(type), xmf_precision(type), xmf_basename(fileName), partTypeGroupName, name); fprintf(xmfFile, "</Attribute>\n"); } diff --git a/tests/Makefile.am b/tests/Makefile.am index 7c45ead22f77da7e0aa53e03051c7351cc97f550..9cd6e9ab9e09935d39bf416dfbb65b83a874b382 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -21,20 +21,24 @@ AM_LDFLAGS = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS) $(FFTW_LIBS # List of programs and scripts to run in the test suite TESTS = testGreetings testMaths testReading.sh testSingle testKernel testSymmetry \ - testPair.sh testPairPerturbed.sh test27cells.sh test27cellsPerturbed.sh \ + testActivePair.sh test27cells.sh test27cellsPerturbed.sh \ testParser.sh testSPHStep test125cells.sh test125cellsPerturbed.sh testFFT \ testAdiabaticIndex testRiemannExact testRiemannTRRS testRiemannHLLC \ - testMatrixInversion testThreadpool testDump testLogger \ - testVoronoi1D testVoronoi2D testVoronoi3D + testMatrixInversion testThreadpool testDump testLogger testInteractions.sh \ + testVoronoi1D testVoronoi2D testVoronoi3D \ + testPeriodicBC.sh testPeriodicBCPerturbed.sh # List of test programs to compile check_PROGRAMS = testGreetings testReading testSingle testTimeIntegration \ - testSPHStep testPair test27cells test125cells testParser \ + testSPHStep testActivePair test27cells test125cells testParser \ testKernel testFFT testInteractions testMaths \ testSymmetry testThreadpool benchmarkInteractions \ testAdiabaticIndex testRiemannExact testRiemannTRRS \ testRiemannHLLC testMatrixInversion testDump testLogger \ - testVoronoi1D testVoronoi2D testVoronoi3D + testVoronoi1D testVoronoi2D testVoronoi3D testPeriodicBC + +# Rebuild tests when SWIFT is updated. 
+$(check_PROGRAMS): ../src/.libs/libswiftsim.a # Sources for the individual programs testGreetings_SOURCES = testGreetings.c @@ -51,10 +55,12 @@ testSPHStep_SOURCES = testSPHStep.c testSingle_SOURCES = testSingle.c -testPair_SOURCES = testPair.c +testActivePair_SOURCES = testActivePair.c test27cells_SOURCES = test27cells.c +testPeriodicBC_SOURCES = testPeriodicBC.c + test125cells_SOURCES = test125cells.c testParser_SOURCES = testParser.c @@ -90,10 +96,10 @@ testDump_SOURCES = testDump.c testLogger_SOURCES = testLogger.c # Files necessary for distribution -EXTRA_DIST = testReading.sh makeInput.py testPair.sh testPairPerturbed.sh \ - test27cells.sh test27cellsPerturbed.sh testParser.sh \ - test125cells.sh test125cellsPerturbed.sh testParserInput.yaml difffloat.py \ - tolerance_125_normal.dat tolerance_125_perturbed.dat \ - tolerance_27_normal.dat tolerance_27_perturbed.dat \ - tolerance_pair_normal.dat tolerance_pair_perturbed.dat \ - fft_params.yml +EXTRA_DIST = testReading.sh makeInput.py testActivePair.sh \ + test27cells.sh test27cellsPerturbed.sh testParser.sh testPeriodicBC.sh \ + testPeriodicBCPerturbed.sh test125cells.sh test125cellsPerturbed.sh testParserInput.yaml \ + difffloat.py tolerance_125_normal.dat tolerance_125_perturbed.dat \ + tolerance_27_normal.dat tolerance_27_perturbed.dat tolerance_27_perturbed_h.dat tolerance_27_perturbed_h2.dat \ + tolerance_testInteractions.dat tolerance_pair_active.dat \ + fft_params.yml tolerance_periodic_BC_normal.dat tolerance_periodic_BC_perturbed.dat diff --git a/tests/benchmarkInteractions.c b/tests/benchmarkInteractions.c index ec3710e05e0151cdff13f2205bcd06bda45a34be..2cc1f830f9827a4805d8f201294e20e8334f4b09 100644 --- a/tests/benchmarkInteractions.c +++ b/tests/benchmarkInteractions.c @@ -31,6 +31,7 @@ #define IACT runner_iact_nonsym_density #define IACT_VEC runner_iact_nonsym_2_vec_density #define IACT_NAME "test_nonsym_density" +#define NUM_VEC_PROC_INT 2 #endif #ifdef SYM_DENSITY @@ -53,8 +54,9 @@ #ifndef IACT 
#define IACT runner_iact_nonsym_density -#define IACT_VEC runner_iact_nonsym_2_vec_density +#define IACT_VEC runner_iact_nonsym_1_vec_density #define IACT_NAME "test_nonsym_density" +#define NUM_VEC_PROC_INT 1 #endif /** @@ -125,7 +127,7 @@ struct part *make_particles(size_t count, double *offset, double spacing, */ void prepare_force(struct part *parts, size_t count) { -#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH) +#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH) && !defined(MINIMAL_SPH) struct part *p; for (size_t i = 0; i < count; ++i) { p = &parts[i]; @@ -389,19 +391,35 @@ void test_interactions(struct part test_part, struct part *parts, size_t count, hi_inv_vec = vec_reciprocal(hi_vec); - mask_t mask, mask2; - vec_init_mask(mask); - vec_init_mask(mask2); - + mask_t mask; + vec_init_mask_true(mask); +#if (NUM_VEC_PROC_INT == 2) + mask_t mask2; + vec_init_mask_true(mask2); +#endif const ticks vec_tic = getticks(); - for (size_t i = 0; i < count; i += 2 * VEC_SIZE) { + for (size_t i = 0; i < count; i += NUM_VEC_PROC_INT * VEC_SIZE) { +/* Interleave two vectors for interaction. */ +#if (NUM_VEC_PROC_INT == 2) IACT_VEC(&(r2q[i]), &(dxq[i]), &(dyq[i]), &(dzq[i]), (hi_inv_vec), (vix_vec), (viy_vec), (viz_vec), &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]), &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, mask, mask2, 0); +#else /* Only use one vector for interaction. 
*/ + vector r2, dx, dy, dz; + r2.v = vec_load(&(r2q[i])); + dx.v = vec_load(&(dxq[i])); + dy.v = vec_load(&(dyq[i])); + dz.v = vec_load(&(dzq[i])); + + IACT_VEC(&r2, &dx, &dy, &dz, (hi_inv_vec), (vix_vec), (viy_vec), + (viz_vec), &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]), + &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, + &curlvxSum, &curlvySum, &curlvzSum, mask); +#endif } VEC_HADD(rhoSum, piq[0]->rho); diff --git a/tests/difffloat.py b/tests/difffloat.py index 0bdc706a1c44ee6c42c54ad37e93f634742e06bc..ddcf7bcb29758afa3429dea8bcf50e1c5c0477dc 100644 --- a/tests/difffloat.py +++ b/tests/difffloat.py @@ -42,11 +42,12 @@ if len(sys.argv) >= 4: if len(sys.argv) >= 5: number_to_check = int(sys.argv[4]) -if len(sys.argv) == 6: - ignoreSmallRhoDh = int(sys.argv[5]) -else: - ignoreSmallRhoDh = 0 - +# Get the particle properties being compared from the header. +with open(file1, 'r') as f: + line = f.readline() + if 'ID' in line: + part_props = line.split()[1:] + data1 = loadtxt(file1) data2 = loadtxt(file2) if fileTol != "": @@ -63,7 +64,7 @@ n_lines = shape(data1)[0] n_columns = shape(data1)[1] if fileTol != "": - if n_linesTol != 2: + if n_linesTol != 3: print "Incorrect number of lines in tolerance file '%s'."%fileTol if n_columnsTol != n_columns: print "Incorrect number of columns in tolerance file '%s'."%fileTol @@ -73,10 +74,12 @@ if fileTol == "": print "Relative difference tolerance:", rel_tol absTol = ones(n_columns) * abs_tol relTol = ones(n_columns) * rel_tol + limTol = zeros(n_columns) else: print "Tolerances read from file" absTol = dataTol[0,:] relTol = dataTol[1,:] + limTol = dataTol[2,:] n_lines_to_check = 0 if number_to_check > 0: @@ -100,20 +103,17 @@ for i in range(n_lines_to_check): rel_diff = 0. 
if( abs_diff > 1.1*absTol[j]): - print "Absolute difference larger than tolerance (%e) for particle %d, column %d:"%(absTol[j], i,j) + print "Absolute difference larger than tolerance (%e) for particle %d, column %s:"%(absTol[j], data1[i,0], part_props[j]) print "%10s: a = %e"%("File 1", data1[i,j]) print "%10s: b = %e"%("File 2", data2[i,j]) print "%10s: |a-b| = %e"%("Difference", abs_diff) print "" error = True - if abs(data1[i,j]) < 4e-6 and abs(data2[i,j]) < 4e-6 : continue + if abs(data1[i,j]) + abs(data2[i,j]) < limTol[j] : continue - # Ignore pathological cases with rho_dh - if ignoreSmallRhoDh and j == 8 and abs(data1[i,j]) < 2e-4: continue - if( rel_diff > 1.1*relTol[j]): - print "Relative difference larger than tolerance (%e) for particle %d, column %d:"%(relTol[j], i,j) + print "Relative difference larger than tolerance (%e) for particle %d, column %s:"%(relTol[j], data1[i,0], part_props[j]) print "%10s: a = %e"%("File 1", data1[i,j]) print "%10s: b = %e"%("File 2", data2[i,j]) print "%10s: |a-b|/|a+b| = %e"%("Difference", rel_diff) diff --git a/tests/test125cells.c b/tests/test125cells.c index 5cd8c82a3fb1850b34d157befa70ae75240c7012..023ce145846a30baf79a42877199e6a3028cd75c 100644 --- a/tests/test125cells.c +++ b/tests/test125cells.c @@ -349,13 +349,11 @@ struct cell *make_cell(size_t n, const double offset[3], double size, double h, cell->ti_old_part = 8; cell->ti_end_min = 8; cell->ti_end_max = 8; - cell->ti_sort = 0; // shuffle_particles(cell->parts, cell->count); cell->sorted = 0; - cell->sort = NULL; - cell->sortsize = 0; + for (int k = 0; k < 13; k++) cell->sort[k] = NULL; return cell; } @@ -363,7 +361,8 @@ struct cell *make_cell(size_t n, const double offset[3], double size, double h, void clean_up(struct cell *ci) { free(ci->parts); free(ci->xparts); - free(ci->sort); + for (int k = 0; k < 13; k++) + if (ci->sort[k] != NULL) free(ci->sort[k]); free(ci); } @@ -445,6 +444,8 @@ void dump_particle_fields(char *fileName, struct cell *main_cell, /* 
Just a forward declaration... */ void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj); +void runner_dopair1_branch_density(struct runner *r, struct cell *ci, + struct cell *cj); void runner_doself1_density(struct runner *r, struct cell *ci); void runner_dopair2_force(struct runner *r, struct cell *ci, struct cell *cj); void runner_doself2_force(struct runner *r, struct cell *ci); @@ -565,8 +566,8 @@ int main(int argc, char *argv[]) { prog_const.const_newton_G = 1.f; struct hydro_props hp; - hp.target_neighbours = pow_dimension(h) * kernel_norm; - hp.delta_neighbours = 4.; + hp.eta_neighbours = h; + hp.h_tolerance = 1e0; hp.h_max = FLT_MAX; hp.max_smoothing_iterations = 1; hp.CFL_condition = 0.1; @@ -637,11 +638,20 @@ int main(int argc, char *argv[]) { } /* First, sort stuff */ - for (int j = 0; j < 125; ++j) runner_do_sort(&runner, cells[j], 0x1FFF, 0); + for (int j = 0; j < 125; ++j) + runner_do_sort(&runner, cells[j], 0x1FFF, 0, 0); /* Do the density calculation */ #if !(defined(MINIMAL_SPH) && defined(WITH_VECTORIZATION)) +/* Initialise the particle cache. */ +#ifdef WITH_VECTORIZATION + runner.ci_cache.count = 0; + cache_init(&runner.ci_cache, 512); + runner.cj_cache.count = 0; + cache_init(&runner.cj_cache, 512); +#endif + /* Run all the pairs (only once !)*/ for (int i = 0; i < 5; i++) { for (int j = 0; j < 5; j++) { @@ -664,7 +674,7 @@ int main(int argc, char *argv[]) { struct cell *cj = cells[iii * 25 + jjj * 5 + kkk]; - if (cj > ci) runner_dopair1_density(&runner, ci, cj); + if (cj > ci) runner_dopair1_branch_density(&runner, ci, cj); } } } diff --git a/tests/test27cells.c b/tests/test27cells.c index a0f541d17100a13079580aabbef065fa5adbd5e1..7ba1eec9ad279f09f63021e332dac1cfd5cc1505 100644 --- a/tests/test27cells.c +++ b/tests/test27cells.c @@ -30,11 +30,9 @@ /* Local headers. 
*/ #include "swift.h" -#define ACC_THRESHOLD 1e-5 - #if defined(WITH_VECTORIZATION) #define DOSELF1 runner_doself1_density_vec -#define DOPAIR1 runner_dopair1_density_vec +#define DOPAIR1 runner_dopair1_branch_density #define DOSELF1_NAME "runner_doself1_density_vec" #define DOPAIR1_NAME "runner_dopair1_density_vec" #endif @@ -45,7 +43,7 @@ #endif #ifndef DOPAIR1 -#define DOPAIR1 runner_dopair1_density +#define DOPAIR1 runner_dopair1_branch_density #define DOPAIR1_NAME "runner_dopair1_density" #endif @@ -64,18 +62,20 @@ enum velocity_types { * @param offset The position of the cell offset from (0,0,0). * @param size The cell size. * @param h The smoothing length of the particles in units of the inter-particle - *separation. + * separation. * @param density The density of the fluid. * @param partId The running counter of IDs. * @param pert The perturbation to apply to the particles in the cell in units - *of the inter-particle separation. + * of the inter-particle separation. * @param vel The type of velocity field (0, random, divergent, rotating) + * @param h_pert The perturbation to apply to the smoothing length. 
*/ struct cell *make_cell(size_t n, double *offset, double size, double h, double density, long long *partId, double pert, - enum velocity_types vel) { + enum velocity_types vel, double h_pert) { const size_t count = n * n * n; const double volume = size * size * size; + float h_max = 0.f; struct cell *cell = malloc(sizeof(struct cell)); bzero(cell, sizeof(struct cell)); @@ -121,7 +121,11 @@ struct cell *make_cell(size_t n, double *offset, double size, double h, part->v[2] = 0.f; break; } - part->h = size * h / (float)n; + if (h_pert) + part->h = size * h * random_uniform(1.f, h_pert) / (float)n; + else + part->h = size * h / (float)n; + h_max = fmaxf(h_max, part->h); part->id = ++(*partId); #if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) @@ -156,7 +160,7 @@ struct cell *make_cell(size_t n, double *offset, double size, double h, /* Cell properties */ cell->split = 0; - cell->h_max = h; + cell->h_max = h_max; cell->count = count; cell->dx_max_part = 0.; cell->dx_max_sort = 0.; @@ -170,20 +174,19 @@ struct cell *make_cell(size_t n, double *offset, double size, double h, cell->ti_old_part = 8; cell->ti_end_min = 8; cell->ti_end_max = 8; - cell->ti_sort = 8; shuffle_particles(cell->parts, cell->count); cell->sorted = 0; - cell->sort = NULL; - cell->sortsize = 0; + for (int k = 0; k < 13; k++) cell->sort[k] = NULL; return cell; } void clean_up(struct cell *ci) { free(ci->parts); - free(ci->sort); + for (int k = 0; k < 13; k++) + if (ci->sort[k] != NULL) free(ci->sort[k]); free(ci); } @@ -202,6 +205,10 @@ void zero_particle_fields(struct cell *c) { void end_calculation(struct cell *c) { for (int pid = 0; pid < c->count; pid++) { hydro_end_density(&c->parts[pid]); + + /* Recover the common "Neighbour number" definition */ + c->parts[pid].density.wcount *= pow_dimension(c->parts[pid].h); + c->parts[pid].density.wcount *= kernel_norm; } } @@ -288,33 +295,11 @@ void dump_particle_fields(char *fileName, struct cell *main_cell, fclose(file); } -/** - * @brief Compares the 
vectorised result against - * the serial result of the interaction. - * - * @param serial_parts Particle array that has been interacted serially - * @param vec_parts Particle array to be interacted using vectors - * @param count No. of particles that have been interacted - * @param threshold Level of accuracy needed - * - * @return Non-zero value if difference found, 0 otherwise - */ -int check_results(struct part *serial_parts, struct part *vec_parts, int count, - double threshold) { - int result = 0; - - for (int i = 0; i < count; i++) - result += compare_particles(serial_parts[i], vec_parts[i], threshold); - - return result; -} - /* Just a forward declaration... */ void runner_doself1_density(struct runner *r, struct cell *ci); void runner_doself1_density_vec(struct runner *r, struct cell *ci); -void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj); -void runner_dopair1_density_vec(struct runner *r, struct cell *ci, - struct cell *cj); +void runner_dopair1_branch_density(struct runner *r, struct cell *ci, + struct cell *cj); /* And go... 
*/ int main(int argc, char *argv[]) { @@ -322,8 +307,7 @@ int main(int argc, char *argv[]) { engine_pin(); size_t runs = 0, particles = 0; double h = 1.23485, size = 1., rho = 1.; - double perturbation = 0.; - double threshold = ACC_THRESHOLD; + double perturbation = 0., h_pert = 0.; char outputFileNameExtension[200] = ""; char outputFileName[200] = ""; enum velocity_types vel = velocity_zero; @@ -339,11 +323,14 @@ int main(int argc, char *argv[]) { srand(0); char c; - while ((c = getopt(argc, argv, "m:s:h:n:r:t:d:f:v:a:")) != -1) { + while ((c = getopt(argc, argv, "m:s:h:p:n:r:t:d:f:v:")) != -1) { switch (c) { case 'h': sscanf(optarg, "%lf", &h); break; + case 'p': + sscanf(optarg, "%lf", &h_pert); + break; case 's': sscanf(optarg, "%lf", &size); break; @@ -365,9 +352,6 @@ int main(int argc, char *argv[]) { case 'v': sscanf(optarg, "%d", (int *)&vel); break; - case 'a': - sscanf(optarg, "%lf", &threshold); - break; case '?': error("Unknown option."); break; @@ -382,6 +366,7 @@ int main(int argc, char *argv[]) { "runner_doself1_density()." 
"\n\nOptions:" "\n-h DISTANCE=1.2348 - Smoothing length in units of <x>" + "\n-p - Random fractional change in h, h=h*random(1,p)" "\n-m rho - Physical density in the cell" "\n-s size - Physical size of the cell" "\n-d pert - Perturbation to apply to the particles [0,1[" @@ -415,7 +400,11 @@ int main(int argc, char *argv[]) { space.dim[2] = 3.; struct hydro_props hp; + hp.eta_neighbours = h; + hp.h_tolerance = 1e0; hp.h_max = FLT_MAX; + hp.max_smoothing_iterations = 1; + hp.CFL_condition = 0.1; struct engine engine; engine.s = &space; @@ -435,12 +424,13 @@ int main(int argc, char *argv[]) { for (int j = 0; j < 3; ++j) { for (int k = 0; k < 3; ++k) { double offset[3] = {i * size, j * size, k * size}; - cells[i * 9 + j * 3 + k] = make_cell(particles, offset, size, h, rho, - &partId, perturbation, vel); + cells[i * 9 + j * 3 + k] = + make_cell(particles, offset, size, h, rho, &partId, perturbation, + vel, h_pert); runner_do_drift_part(&runner, cells[i * 9 + j * 3 + k], 0); - runner_do_sort(&runner, cells[i * 9 + j * 3 + k], 0x1FFF, 0); + runner_do_sort(&runner, cells[i * 9 + j * 3 + k], 0x1FFF, 0, 0); } } } @@ -504,10 +494,6 @@ int main(int argc, char *argv[]) { } } - /* Store the vectorised particle results. */ - struct part vec_parts[main_cell->count]; - for (int i = 0; i < main_cell->count; i++) vec_parts[i] = main_cell->parts[i]; - /* Output timing */ ticks corner_time = timings[0] + timings[2] + timings[6] + timings[8] + timings[18] + timings[20] + timings[24] + timings[26]; @@ -552,10 +538,6 @@ int main(int argc, char *argv[]) { sprintf(outputFileName, "brute_force_27_%s.dat", outputFileNameExtension); dump_particle_fields(outputFileName, main_cell, cells); - /* Check serial results against the vectorised results. 
*/ - if (check_results(main_cell->parts, vec_parts, main_cell->count, threshold)) - message("Differences found..."); - /* Output timing */ message("Brute force calculation took : %15lli ticks.", toc - tic); diff --git a/tests/test27cells.sh.in b/tests/test27cells.sh.in index 4312ce55e13097d4ae40c289b9c5caa885ff37cc..059a7a208aa8e570ad5035fac16ffd201bf3dddd 100755 --- a/tests/test27cells.sh.in +++ b/tests/test27cells.sh.in @@ -1,13 +1,14 @@ #!/bin/bash +# Test for particles with the same smoothing length for v in {0..3} do echo "" rm -f brute_force_27_standard.dat swift_dopair_27_standard.dat - echo "Running ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -a 1e-4" - ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -a 1e-4 + echo "Running ./test27cells -n 6 -r 1 -d 0 -f standard -v $v" + ./test27cells -n 6 -r 1 -d 0 -f standard -v $v if [ -e brute_force_27_standard.dat ] then @@ -27,4 +28,60 @@ do done +# Test for particles with random smoothing lengths +for v in {0..3} +do + echo "" + + rm -f brute_force_27_standard.dat swift_dopair_27_standard.dat + + echo "Running ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -p 1.1" + ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -p 1.1 + + if [ -e brute_force_27_standard.dat ] + then + if python @srcdir@/difffloat.py brute_force_27_standard.dat swift_dopair_27_standard.dat @srcdir@/tolerance_27_perturbed_h.dat 6 + then + echo "Accuracy test passed" + else + echo "Accuracy test failed" + exit 1 + fi + else + echo "Error Missing test output file" + exit 1 + fi + + echo "------------" + +done + +# Test for particles with random smoothing lengths +for v in {0..3} +do + echo "" + + rm -f brute_force_27_standard.dat swift_dopair_27_standard.dat + + echo "Running ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -p 1.3" + ./test27cells -n 6 -r 1 -d 0 -f standard -v $v -p 1.3 + + if [ -e brute_force_27_standard.dat ] + then + if python @srcdir@/difffloat.py brute_force_27_standard.dat swift_dopair_27_standard.dat 
@srcdir@/tolerance_27_perturbed_h2.dat 6 + then + echo "Accuracy test passed" + else + echo "Accuracy test failed" + exit 1 + fi + else + echo "Error Missing test output file" + exit 1 + fi + + echo "------------" + +done + exit $? diff --git a/tests/test27cellsPerturbed.sh.in b/tests/test27cellsPerturbed.sh.in index 2f2e1db76346ca8f0ea4c2365ee349e232a1ce53..f875504e541588377ca6e40fe55681ebec3466f6 100755 --- a/tests/test27cellsPerturbed.sh.in +++ b/tests/test27cellsPerturbed.sh.in @@ -1,17 +1,18 @@ #!/bin/bash +# Test for particles with the same smoothing length for v in {0..3} do echo "" rm -f brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat - echo "Running ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -a 5e-4" - ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -a 5e-4 + echo "Running ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v" + ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v if [ -e brute_force_27_perturbed.dat ] then - if python @srcdir@/difffloat.py brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat @srcdir@/tolerance_27_perturbed.dat 6 1 + if python @srcdir@/difffloat.py brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat @srcdir@/tolerance_27_perturbed.dat 6 then echo "Accuracy test passed" else @@ -27,4 +28,59 @@ do done +# Test for particles with random smoothing lengths +for v in {0..3} +do + echo "" + + rm -f brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat + + echo "Running ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -p 1.1" + ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -p 1.1 + + if [ -e brute_force_27_perturbed.dat ] + then + if python @srcdir@/difffloat.py brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat @srcdir@/tolerance_27_perturbed_h.dat 6 + then + echo "Accuracy test passed" + else + echo "Accuracy test failed" + exit 1 + fi + else + echo "Error Missing test output file" + exit 1 + fi + + echo "------------" + +done + +# Test for particles with random smoothing lengths +for 
v in {0..3} +do + echo "" + + rm -f brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat + + echo "Running ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -p 1.3" + ./test27cells -n 6 -r 1 -d 0.1 -f perturbed -v $v -p 1.3 + + if [ -e brute_force_27_perturbed.dat ] + then + if python @srcdir@/difffloat.py brute_force_27_perturbed.dat swift_dopair_27_perturbed.dat @srcdir@/tolerance_27_perturbed_h2.dat 6 + then + echo "Accuracy test passed" + else + echo "Accuracy test failed" + exit 1 + fi + else + echo "Error Missing test output file" + exit 1 + fi + + echo "------------" + +done exit $? diff --git a/tests/testActivePair.c b/tests/testActivePair.c new file mode 100644 index 0000000000000000000000000000000000000000..1e0111b4f0e480d0f66463b4c2264cdd89bd28c8 --- /dev/null +++ b/tests/testActivePair.c @@ -0,0 +1,510 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (C) 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ +#include "../config.h" + +/* Some standard headers. */ +#include <fenv.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> + +/* Local headers. 
*/ +#include "swift.h" + +/** + * @brief Constructs a cell and all of its particles in a valid state prior to + * a DOPAIR or DOSELF calculation. + * + * @param n The cube root of the number of particles. + * @param offset The position of the cell offset from (0,0,0). + * @param size The cell size. + * @param h The smoothing length of the particles in units of the inter-particle + * separation. + * @param density The density of the fluid. + * @param partId The running counter of IDs. + * @param pert The perturbation to apply to the particles in the cell in units + * of the inter-particle separation. + * @param h_pert The perturbation to apply to the smoothing length. + * @param fraction_active The fraction of particles that should be active in the + * cell. + */ +struct cell *make_cell(size_t n, double *offset, double size, double h, + double density, long long *partId, double pert, + double h_pert, double fraction_active) { + const size_t count = n * n * n; + const double volume = size * size * size; + float h_max = 0.f; + struct cell *cell = malloc(sizeof(struct cell)); + bzero(cell, sizeof(struct cell)); + + if (posix_memalign((void **)&cell->parts, part_align, + count * sizeof(struct part)) != 0) { + error("couldn't allocate particles, no.
of particles: %d", (int)count); + } + bzero(cell->parts, count * sizeof(struct part)); + + /* Construct the parts */ + struct part *part = cell->parts; + for (size_t x = 0; x < n; ++x) { + for (size_t y = 0; y < n; ++y) { + for (size_t z = 0; z < n; ++z) { + part->x[0] = + offset[0] + + size * (x + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n; + part->x[1] = + offset[1] + + size * (y + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n; + part->x[2] = + offset[2] + + size * (z + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n; + part->v[0] = random_uniform(-0.05, 0.05); + part->v[1] = random_uniform(-0.05, 0.05); + part->v[2] = random_uniform(-0.05, 0.05); + + if (h_pert) + part->h = size * h * random_uniform(1.f, h_pert) / (float)n; + else + part->h = size * h / (float)n; + h_max = fmaxf(h_max, part->h); + part->id = ++(*partId); + +#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) + part->conserved.mass = density * volume / count; + +#ifdef SHADOWFAX_SPH + double anchor[3] = {0., 0., 0.}; + double side[3] = {1., 1., 1.}; + voronoi_cell_init(&part->cell, part->x, anchor, side); +#endif + +#else + part->mass = density * volume / count; +#endif + +#if defined(HOPKINS_PE_SPH) + part->entropy = 1.f; + part->entropy_one_over_gamma = 1.f; +#endif + if (random_uniform(0, 1.f) < fraction_active) + part->time_bin = 1; + else + part->time_bin = num_time_bins + 1; + +#ifdef SWIFT_DEBUG_CHECKS + part->ti_drift = 8; + part->ti_kick = 8; +#endif + + ++part; + } + } + } + + /* Cell properties */ + cell->split = 0; + cell->h_max = h_max; + cell->count = count; + cell->dx_max_part = 0.; + cell->dx_max_sort = 0.; + cell->width[0] = size; + cell->width[1] = size; + cell->width[2] = size; + cell->loc[0] = offset[0]; + cell->loc[1] = offset[1]; + cell->loc[2] = offset[2]; + + cell->ti_old_part = 8; + cell->ti_end_min = 8; + cell->ti_end_max = 8; + + shuffle_particles(cell->parts, cell->count); + + cell->sorted = 0; + for (int k = 0; k < 13; k++) cell->sort[k] = NULL; + + 
return cell; +} + +void clean_up(struct cell *ci) { + free(ci->parts); + for (int k = 0; k < 13; k++) + if (ci->sort[k] != NULL) free(ci->sort[k]); + free(ci); +} + +/** + * @brief Initializes all particles field to be ready for a density calculation + */ +void zero_particle_fields(struct cell *c) { + for (int pid = 0; pid < c->count; pid++) { + hydro_init_part(&c->parts[pid], NULL); + } +} + +/** + * @brief Ends the loop by adding the appropriate coefficients + */ +void end_calculation(struct cell *c) { + for (int pid = 0; pid < c->count; pid++) { + hydro_end_density(&c->parts[pid]); + + /* Recover the common "Neighbour number" definition */ + c->parts[pid].density.wcount *= pow_dimension(c->parts[pid].h); + c->parts[pid].density.wcount *= kernel_norm; + } +} + +/** + * @brief Dump all the particles to a file + */ +void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) { + FILE *file = fopen(fileName, "a"); + + /* Write header */ + fprintf(file, "# %4s %13s\n", "ID", "wcount"); + + fprintf(file, "# ci --------------------------------------------\n"); + + for (int pid = 0; pid < ci->count; pid++) { + fprintf(file, "%6llu %13e\n", ci->parts[pid].id, + ci->parts[pid].density.wcount); + } + + fprintf(file, "# cj --------------------------------------------\n"); + + for (int pjd = 0; pjd < cj->count; pjd++) { + fprintf(file, "%6llu %13e\n", cj->parts[pjd].id, + cj->parts[pjd].density.wcount); + } + + fclose(file); +} + +/* Just a forward declaration... */ +void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj); +void runner_doself1_density_vec(struct runner *r, struct cell *ci); +void runner_dopair1_branch_density(struct runner *r, struct cell *ci, + struct cell *cj); + +/** + * @brief Computes the pair interactions of two cells using SWIFT and a brute + * force implementation. 
+ */ +void test_pair_interactions(struct runner *runner, struct cell **ci, + struct cell **cj, char *swiftOutputFileName, + char *bruteForceOutputFileName) { + + runner_do_sort(runner, *ci, 0x1FFF, 0, 0); + runner_do_sort(runner, *cj, 0x1FFF, 0, 0); + + /* Zero the fields */ + zero_particle_fields(*ci); + zero_particle_fields(*cj); + + /* Run the test */ + runner_dopair1_branch_density(runner, *ci, *cj); + + /* Let's get physical ! */ + end_calculation(*ci); + end_calculation(*cj); + + /* Dump if necessary */ + dump_particle_fields(swiftOutputFileName, *ci, *cj); + + /* Now perform a brute-force version for accuracy tests */ + + /* Zero the fields */ + zero_particle_fields(*ci); + zero_particle_fields(*cj); + + /* Run the brute-force test */ + pairs_all_density(runner, *ci, *cj); + + /* Let's get physical ! */ + end_calculation(*ci); + end_calculation(*cj); + + dump_particle_fields(bruteForceOutputFileName, *ci, *cj); +} + +/** + * @brief Computes the pair interactions of two cells in various configurations. + */ +void test_all_pair_interactions(struct runner *runner, double *offset2, + size_t particles, double size, double h, + double rho, long long *partId, + double perturbation, double h_pert, + char *swiftOutputFileName, + char *bruteForceOutputFileName) { + + double offset1[3] = {0, 0, 0}; + struct cell *ci, *cj; + + /* All active particles. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 1.); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 1.); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* Half particles are active. 
*/ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 0.5); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 0.5); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* All particles inactive. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 0.); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 0.); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* 10% of particles active. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 0.1); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 0.1); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* One active cell one inactive cell. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 1.0); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 0.); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* One active cell one inactive cell. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 0.); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 1.0); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* Smaller cells, all active. 
*/ + ci = make_cell(2, offset1, size, h, rho, partId, perturbation, h_pert, 1.0); + cj = make_cell(2, offset2, size, h, rho, partId, perturbation, h_pert, 1.0); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* Different numbers of particles in each cell. */ + ci = make_cell(10, offset1, size, h, rho, partId, perturbation, h_pert, 0.5); + cj = make_cell(3, offset2, size, h, rho, partId, perturbation, h_pert, 0.75); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* One cell inactive and the other only half active. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 0.5); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 0.); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + clean_up(ci); + clean_up(cj); + + /* One cell inactive and the other only half active. */ + ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, + 0.); + cj = make_cell(particles, offset2, size, h, rho, partId, perturbation, h_pert, + 0.5); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName); + + /* Clean things to make the sanitizer happy ... */ + clean_up(ci); + clean_up(cj); +} + +int main(int argc, char *argv[]) { + size_t particles = 0, runs = 0, type = 0; + double h = 1.23485, size = 1., rho = 1.; + double perturbation = 0.1, h_pert = 1.1; + struct space space; + struct engine engine; + struct runner *runner; + char c; + static long long partId = 0; + char outputFileNameExtension[200] = ""; + char swiftOutputFileName[200] = ""; + char bruteForceOutputFileName[200] = ""; + + /* Initialize CPU frequency, this also starts time. 
*/ + unsigned long long cpufreq = 0; + clocks_set_cpufreq(cpufreq); + + /* Choke on FP-exceptions */ + feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); + + /* Generate a RNG seed from time. */ + unsigned int seed = time(NULL); + + while ((c = getopt(argc, argv, "h:p:n:r:t:d:s:f:")) != -1) { + switch (c) { + case 'h': + sscanf(optarg, "%lf", &h); + break; + case 'p': + sscanf(optarg, "%lf", &h_pert); + break; + case 'n': + sscanf(optarg, "%zu", &particles); + break; + case 'r': + sscanf(optarg, "%zu", &runs); + break; + case 't': + sscanf(optarg, "%zu", &type); + break; + case 'd': + sscanf(optarg, "%lf", &perturbation); + break; + case 's': + sscanf(optarg, "%u", &seed); + break; + case 'f': + strcpy(outputFileNameExtension, optarg); + break; + case '?': + error("Unknown option."); + break; + } + } + + if (h < 0 || particles == 0 || runs == 0 || type > 2) { + printf( + "\nUsage: %s -n PARTICLES_PER_AXIS -r NUMBER_OF_RUNS [OPTIONS...]\n" + "\nGenerates a cell pair, filled with particles on a Cartesian grid." + "\nThese are then interacted using runner_dopair1_density." + "\n\nOptions:" + "\n-t TYPE=0 - cells share face (0), edge (1) or corner (2)" + "\n-h DISTANCE=1.2348 - smoothing length" + "\n-p - Random fractional change in h, h=h*random(1,p)" + "\n-d pert - perturbation to apply to the particles [0,1[" + "\n-s seed - seed for RNG" + "\n-f fileName - part of the file name used to save the dumps\n", + argv[0]); + exit(1); + } + + /* Seed RNG. */ + message("Seed used for RNG: %d", seed); + srand(seed); + + space.periodic = 0; + space.dim[0] = 3.; + space.dim[1] = 3.; + space.dim[2] = 3.; + + engine.s = &space; + engine.time = 0.1f; + engine.ti_current = 8; + engine.max_active_bin = num_time_bins; + + if (posix_memalign((void **)&runner, SWIFT_STRUCT_ALIGNMENT, + sizeof(struct runner)) != 0) { + error("couldn't allocate runner"); + } + + runner->e = &engine; + + /* Create output file names. 
*/ + sprintf(swiftOutputFileName, "swift_dopair_%s.dat", outputFileNameExtension); + sprintf(bruteForceOutputFileName, "brute_force_%s.dat", + outputFileNameExtension); + + /* Delete files if they already exist. */ + remove(swiftOutputFileName); + remove(bruteForceOutputFileName); + +#ifdef WITH_VECTORIZATION + runner->ci_cache.count = 0; + cache_init(&runner->ci_cache, 512); + runner->cj_cache.count = 0; + cache_init(&runner->cj_cache, 512); +#endif + + double offset[3] = {1., 0., 0.}; + + /* Test a pair of cells face-on. */ + test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId, + perturbation, h_pert, swiftOutputFileName, + bruteForceOutputFileName); + + /* Test a pair of cells edge-on. */ + offset[0] = 1.; + offset[1] = 1.; + offset[2] = 0.; + test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId, + perturbation, h_pert, swiftOutputFileName, + bruteForceOutputFileName); + + /* Test a pair of cells corner-on. */ + offset[0] = 1.; + offset[1] = 1.; + offset[2] = 1.; + test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId, + perturbation, h_pert, swiftOutputFileName, + bruteForceOutputFileName); + return 0; +} diff --git a/tests/testActivePair.sh.in b/tests/testActivePair.sh.in new file mode 100755 index 0000000000000000000000000000000000000000..ff8d027a469bd9bc78286b843cf2dffd3ef27ad3 --- /dev/null +++ b/tests/testActivePair.sh.in @@ -0,0 +1,11 @@ +#!/bin/bash + +echo "" + +rm -f brute_force_pair_active.dat swift_dopair_active.dat + +./testActivePair -n 6 -r 1 -d 0 -f active + +python @srcdir@/difffloat.py brute_force_active.dat swift_dopair_active.dat @srcdir@/tolerance_pair_active.dat + +exit $? 
diff --git a/tests/testInteractions.c b/tests/testInteractions.c index 4ce7fe40554d24551750629fa47c0bee7acdb6da..54d1f38733a1f1647331166f1a37b40ed3511419 100644 --- a/tests/testInteractions.c +++ b/tests/testInteractions.c @@ -17,12 +17,6 @@ * ******************************************************************************/ -#include "../config.h" - -#ifndef WITH_VECTORIZATION -int main() { return 0; } -#else - #include <fenv.h> #include <stdio.h> #include <stdlib.h> @@ -30,15 +24,17 @@ int main() { return 0; } #include <unistd.h> #include "swift.h" +#ifdef WITH_VECTORIZATION + #define array_align sizeof(float) * VEC_SIZE #define ACC_THRESHOLD 1e-5 -/* Typdef function pointers for serial and vectorised versions of the - * interaction functions. */ -typedef void (*serial_interaction)(float, float *, float, float, struct part *, - struct part *); -typedef void (*vec_interaction)(float *, float *, float *, float *, - struct part **, struct part **); +#ifndef IACT +#define IACT runner_iact_nonsym_density +#define IACT_VEC runner_iact_nonsym_1_vec_density +#define IACT_NAME "test_nonsym_density" +#define NUM_VEC_PROC_INT 1 +#endif /** * @brief Constructs an array of particles in a valid state prior to @@ -74,7 +70,10 @@ struct part *make_particles(size_t count, double *offset, double spacing, p->h = h; p->id = ++(*partId); + +#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH) p->mass = 1.0f; +#endif /* Place rest of particles around the test particle * with random position within a unit sphere. 
*/ @@ -93,7 +92,9 @@ struct part *make_particles(size_t count, double *offset, double spacing, p->h = h; p->id = ++(*partId); +#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH) p->mass = 1.0f; +#endif } return particles; } @@ -103,6 +104,7 @@ struct part *make_particles(size_t count, double *offset, double spacing, */ void prepare_force(struct part *parts, size_t count) { +#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH) && !defined(MINIMAL_SPH) struct part *p; for (size_t i = 0; i < count; ++i) { p = &parts[i]; @@ -113,6 +115,7 @@ void prepare_force(struct part *parts, size_t count) { p->force.v_sig = 0.0f; p->force.h_dt = 0.0f; } +#endif } /** @@ -122,25 +125,26 @@ void dump_indv_particle_fields(char *fileName, struct part *p) { FILE *file = fopen(fileName, "a"); + /* Write header */ fprintf(file, - "%6llu %10f %10f %10f %10f %10f %10f %10e %10e %10e %13e %13e %13e " - "%13e %13e %13e %13e " - "%13e %13e %13e %10f\n", + "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e " + "%13e %13e %13e\n", p->id, p->x[0], p->x[1], p->x[2], p->v[0], p->v[1], p->v[2], - p->a_hydro[0], p->a_hydro[1], p->a_hydro[2], p->rho, - p->density.rho_dh, p->density.wcount, p->density.wcount_dh, - p->force.h_dt, p->force.v_sig, -#if defined(GADGET2_SPH) - p->density.div_v, p->density.rot_v[0], p->density.rot_v[1], - p->density.rot_v[2], p->entropy_dt -#elif defined(DEFAULT_SPH) - p->density.div_v, p->density.rot_v[0], p->density.rot_v[1], - p->density.rot_v[2], 0. + hydro_get_density(p), +#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) + 0.f, #else + p->density.rho_dh, +#endif + p->density.wcount, p->density.wcount_dh, +#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH) p->density.div_v, p->density.rot_v[0], p->density.rot_v[1], p->density.rot_v[2] +#else + 0., 0., 0., 0. 
#endif ); + fclose(file); } @@ -152,13 +156,10 @@ void write_header(char *fileName) { FILE *file = fopen(fileName, "w"); /* Write header */ fprintf(file, - "# %4s %10s %10s %10s %10s %10s %10s %10s %10s %10s %13s %13s %13s " - "%13s %13s %13s %13s" - "%13s %13s %13s %13s\n", - "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "a_x", "a_y", - "a_z", "rho", "rho_dh", "wcount", "wcount_dh", "dh/dt", "v_sig", - "div_v", "curl_vx", "curl_vy", "curl_vz", "dS/dt"); - fprintf(file, "\n# PARTICLES BEFORE INTERACTION:\n"); + "# %4s %10s %10s %10s %10s %10s %10s %13s %13s %13s %13s %13s " + "%13s %13s %13s\n", + "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "rho", "rho_dh", + "wcount", "wcount_dh", "div_v", "curl_vx", "curl_vy", "curl_vz"); fclose(file); } @@ -187,8 +188,8 @@ int check_results(struct part serial_test_part, struct part *serial_parts, } /* - * @brief Calls the serial and vectorised version of an interaction - * function given by the function pointers. + * @brief Calls the serial and vectorised version of the non-symmetrical density + * interaction. * * @param test_part Particle that will be updated * @param parts Particle array to be interacted @@ -196,16 +197,15 @@ int check_results(struct part serial_test_part, struct part *serial_parts, * @param serial_inter_func Serial interaction function to be called * @param vec_inter_func Vectorised interaction function to be called * @param runs No. of times to call interactions + * @param num_vec_proc No. 
of vectors to use to process interaction * */ void test_interactions(struct part test_part, struct part *parts, size_t count, - serial_interaction serial_inter_func, - vec_interaction vec_inter_func, char *filePrefix, - size_t runs) { + char *filePrefix, int runs, int num_vec_proc) { - ticks serial_time = 0, vec_time = 0; + ticks serial_time = 0; + ticks vec_time = 0; - FILE *file; char serial_filename[200] = ""; char vec_filename[200] = ""; @@ -217,64 +217,68 @@ void test_interactions(struct part test_part, struct part *parts, size_t count, write_header(serial_filename); write_header(vec_filename); - /* Test particle at the center of a unit sphere. */ struct part pi_serial, pi_vec; - - /* Remaining particles in the sphere that will interact with test particle. */ struct part pj_serial[count], pj_vec[count]; - /* Stores the separation, smoothing length and pointers to particles - * needed for the vectorised interaction. */ + float r2[count] __attribute__((aligned(array_align))); + float dx[3 * count] __attribute__((aligned(array_align))); + + struct part *piq[count], *pjq[count]; + for (size_t k = 0; k < count; k++) { + piq[k] = NULL; + pjq[k] = NULL; + } + float r2q[count] __attribute__((aligned(array_align))); float hiq[count] __attribute__((aligned(array_align))); - float hjq[count] __attribute__((aligned(array_align))); - float dxq[3 * count] __attribute__((aligned(array_align))); - struct part *piq[count], *pjq[count]; + float dxq[count] __attribute__((aligned(array_align))); + + float dyq[count] __attribute__((aligned(array_align))); + float dzq[count] __attribute__((aligned(array_align))); + float mjq[count] __attribute__((aligned(array_align))); + float vixq[count] __attribute__((aligned(array_align))); + float viyq[count] __attribute__((aligned(array_align))); + float vizq[count] __attribute__((aligned(array_align))); + float vjxq[count] __attribute__((aligned(array_align))); + float vjyq[count] __attribute__((aligned(array_align))); + float vjzq[count] 
__attribute__((aligned(array_align))); /* Call serial interaction a set number of times. */ - for (size_t k = 0; k < runs; k++) { + for (int k = 0; k < runs; k++) { /* Reset particle to initial setup */ pi_serial = test_part; for (size_t i = 0; i < count; i++) pj_serial[i] = parts[i]; - /* Only dump data on first run. */ - if (k == 0) { - /* Dump state of particles before serial interaction. */ - dump_indv_particle_fields(serial_filename, &pi_serial); - for (size_t i = 0; i < count; i++) - dump_indv_particle_fields(serial_filename, &pj_serial[i]); - } - /* Perform serial interaction */ for (size_t i = 0; i < count; i++) { /* Compute the pairwise distance. */ - float r2 = 0.0f; - float dx[3]; - for (size_t k = 0; k < 3; k++) { - dx[k] = pi_serial.x[k] - pj_serial[i].x[k]; - r2 += dx[k] * dx[k]; + r2[i] = 0.0f; + for (int k = 0; k < 3; k++) { + int ind = (3 * i) + k; + dx[ind] = pi_serial.x[k] - pj_serial[i].x[k]; + r2[i] += dx[ind] * dx[ind]; } + } - const ticks tic = getticks(); - - serial_inter_func(r2, dx, pi_serial.h, pj_serial[i].h, &pi_serial, - &pj_serial[i]); - - serial_time += getticks() - tic; + const ticks tic = getticks(); +/* Perform serial interaction */ +#ifdef __ICC +#pragma novector +#endif + for (size_t i = 0; i < count; i++) { + IACT(r2[i], &(dx[3 * i]), pi_serial.h, pj_serial[i].h, &pi_serial, + &pj_serial[i]); } + serial_time += getticks() - tic; } - file = fopen(serial_filename, "a"); - fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n"); - fclose(file); - /* Dump result of serial interaction. */ dump_indv_particle_fields(serial_filename, &pi_serial); for (size_t i = 0; i < count; i++) dump_indv_particle_fields(serial_filename, &pj_serial[i]); /* Call vector interaction a set number of times. 
*/ - for (size_t k = 0; k < runs; k++) { + for (int k = 0; k < runs; k++) { /* Reset particle to initial setup */ pi_vec = test_part; for (size_t i = 0; i < count; i++) pj_vec[i] = parts[i]; @@ -284,45 +288,92 @@ void test_interactions(struct part test_part, struct part *parts, size_t count, /* Compute the pairwise distance. */ float r2 = 0.0f; float dx[3]; - for (size_t k = 0; k < 3; k++) { + for (int k = 0; k < 3; k++) { dx[k] = pi_vec.x[k] - pj_vec[i].x[k]; r2 += dx[k] * dx[k]; } r2q[i] = r2; - dxq[3 * i + 0] = dx[0]; - dxq[3 * i + 1] = dx[1]; - dxq[3 * i + 2] = dx[2]; + dxq[i] = dx[0]; hiq[i] = pi_vec.h; - hjq[i] = pj_vec[i].h; piq[i] = &pi_vec; pjq[i] = &pj_vec[i]; - } - /* Only dump data on first run. */ - if (k == 0) { - /* Dump state of particles before vector interaction. */ - dump_indv_particle_fields(vec_filename, piq[0]); - for (size_t i = 0; i < count; i++) - dump_indv_particle_fields(vec_filename, pjq[i]); + dyq[i] = dx[1]; + dzq[i] = dx[2]; + mjq[i] = pj_vec[i].mass; + vixq[i] = pi_vec.v[0]; + viyq[i] = pi_vec.v[1]; + vizq[i] = pi_vec.v[2]; + vjxq[i] = pj_vec[i].v[0]; + vjyq[i] = pj_vec[i].v[1]; + vjzq[i] = pj_vec[i].v[2]; } + /* Perform vector interaction. */ + vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec; + vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, + curlvySum, curlvzSum; + mask_t mask, mask2; + + rhoSum.v = vec_set1(0.f); + rho_dhSum.v = vec_set1(0.f); + wcountSum.v = vec_set1(0.f); + wcount_dhSum.v = vec_set1(0.f); + div_vSum.v = vec_set1(0.f); + curlvxSum.v = vec_set1(0.f); + curlvySum.v = vec_set1(0.f); + curlvzSum.v = vec_set1(0.f); + + hi_vec.v = vec_load(&hiq[0]); + vix_vec.v = vec_load(&vixq[0]); + viy_vec.v = vec_load(&viyq[0]); + viz_vec.v = vec_load(&vizq[0]); + + hi_inv_vec = vec_reciprocal(hi_vec); + vec_init_mask_true(mask); + vec_init_mask_true(mask2); + const ticks vec_tic = getticks(); - /* Perform vector interaction. 
*/ - for (size_t i = 0; i < count; i += VEC_SIZE) { - vec_inter_func(&(r2q[i]), &(dxq[3 * i]), &(hiq[i]), &(hjq[i]), &(piq[i]), - &(pjq[i])); + for (size_t i = 0; i < count; i += num_vec_proc * VEC_SIZE) { + + /* Interleave two vectors for interaction. */ + if (num_vec_proc == 2) { + runner_iact_nonsym_2_vec_density( + &(r2q[i]), &(dxq[i]), &(dyq[i]), &(dzq[i]), (hi_inv_vec), (vix_vec), + (viy_vec), (viz_vec), &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]), + &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, + &curlvxSum, &curlvySum, &curlvzSum, mask, mask2, 0); + } else { /* Only use one vector for interaction. */ + + vector r2, dx, dy, dz; + r2.v = vec_load(&(r2q[i])); + dx.v = vec_load(&(dxq[i])); + dy.v = vec_load(&(dyq[i])); + dz.v = vec_load(&(dzq[i])); + + runner_iact_nonsym_1_vec_density( + &r2, &dx, &dy, &dz, (hi_inv_vec), (vix_vec), (viy_vec), (viz_vec), + &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]), &rhoSum, &rho_dhSum, + &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, + &curlvzSum, mask); + } } + VEC_HADD(rhoSum, piq[0]->rho); + VEC_HADD(rho_dhSum, piq[0]->density.rho_dh); + VEC_HADD(wcountSum, piq[0]->density.wcount); + VEC_HADD(wcount_dhSum, piq[0]->density.wcount_dh); + VEC_HADD(div_vSum, piq[0]->density.div_v); + VEC_HADD(curlvxSum, piq[0]->density.rot_v[0]); + VEC_HADD(curlvySum, piq[0]->density.rot_v[1]); + VEC_HADD(curlvzSum, piq[0]->density.rot_v[2]); + vec_time += getticks() - vec_tic; } - file = fopen(vec_filename, "a"); - fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n"); - fclose(file); - - /* Dump result of vector interaction. */ + /* Dump result of serial interaction. 
*/ dump_indv_particle_fields(vec_filename, piq[0]); for (size_t i = 0; i < count; i++) dump_indv_particle_fields(vec_filename, pjq[i]); @@ -334,6 +385,7 @@ void test_interactions(struct part test_part, struct part *parts, size_t count, message("The serial interactions took : %15lli ticks.", serial_time / runs); message("The vectorised interactions took : %15lli ticks.", vec_time / runs); + message("Speed up: %15fx.", (double)(serial_time) / vec_time); } /* And go... */ @@ -386,62 +438,22 @@ int main(int argc, char *argv[]) { /* Build the infrastructure */ static long long partId = 0; - struct part density_test_particle, force_test_particle; - struct part *density_particles = - make_particles(count, offset, spacing, h, &partId); - struct part *force_particles = - make_particles(count, offset, spacing, h, &partId); - prepare_force(force_particles, count); - - /* Define which interactions to call */ - serial_interaction serial_inter_func = &runner_iact_nonsym_density; - vec_interaction vec_inter_func = &runner_iact_nonsym_vec_density; - - density_test_particle = density_particles[0]; + struct part test_particle; + struct part *particles = make_particles(count, offset, spacing, h, &partId); + + test_particle = particles[0]; /* Call the non-sym density test. */ - message("Testing non-symmetrical density interaction..."); - test_interactions(density_test_particle, &density_particles[1], count - 1, - serial_inter_func, vec_inter_func, "test_nonsym_density", - runs); - - density_particles = make_particles(count, offset, spacing, h, &partId); - - /* Re-assign function pointers. */ - serial_inter_func = &runner_iact_density; - vec_inter_func = &runner_iact_vec_density; - - density_test_particle = density_particles[0]; - /* Call the symmetrical density test. 
*/ - message("Testing symmetrical density interaction..."); - test_interactions(density_test_particle, &density_particles[1], count - 1, - serial_inter_func, vec_inter_func, "test_sym_density", - runs); - - /* Re-assign function pointers. */ - serial_inter_func = &runner_iact_nonsym_force; - vec_inter_func = &runner_iact_nonsym_vec_force; - - force_test_particle = force_particles[0]; - /* Call the test non-sym force test. */ - message("Testing non-symmetrical force interaction..."); - test_interactions(force_test_particle, &force_particles[1], count - 1, - serial_inter_func, vec_inter_func, "test_nonsym_force", - runs); - - force_particles = make_particles(count, offset, spacing, h, &partId); - prepare_force(force_particles, count); - - /* Re-assign function pointers. */ - serial_inter_func = &runner_iact_force; - vec_inter_func = &runner_iact_vec_force; - - force_test_particle = force_particles[0]; - /* Call the test symmetrical force test. */ - message("Testing symmetrical force interaction..."); - test_interactions(force_test_particle, &force_particles[1], count - 1, - serial_inter_func, vec_inter_func, "test_sym_force", runs); + message("Testing %s interaction...", IACT_NAME); + test_interactions(test_particle, &particles[1], count - 1, IACT_NAME, runs, + 1); + test_interactions(test_particle, &particles[1], count - 1, IACT_NAME, runs, + 2); return 0; } -#endif /* WITH_VECTORIZATION */ +#else + +int main() { return 1; } + +#endif diff --git a/tests/testInteractions.sh.in b/tests/testInteractions.sh.in new file mode 100644 index 0000000000000000000000000000000000000000..4b002c56e37eff417c673ddac2e44b3edf17683a --- /dev/null +++ b/tests/testInteractions.sh.in @@ -0,0 +1,29 @@ +#!/bin/bash + +echo "" + +rm -f test_nonsym_density_serial.dat test_nonsym_density_vec.dat + +echo "Running ./testInteractions" + +./testInteractions + +if [ $? 
!= 0 ]; then + echo "testInteractions is redundant when vectorisation is disabled" +else + if [ -e test_nonsym_density_serial.dat ] + then + if python @srcdir@/difffloat.py test_nonsym_density_serial.dat test_nonsym_density_vec.dat @srcdir@/tolerance_testInteractions.dat + then + echo "Accuracy test passed" + else + echo "Accuracy test failed" + exit 1 + fi + else + echo "Error Missing test output file" + exit 1 + fi +fi + +echo "------------" diff --git a/tests/testKernel.c b/tests/testKernel.c index a3731188e51b1235fe84f36eab7c270c788f7dea..0658639070526f28ce1bceefc54d3f2d7a3ae765 100644 --- a/tests/testKernel.c +++ b/tests/testKernel.c @@ -68,7 +68,7 @@ int main() { vx.f[j] = (i + j) * 2.25f / numPoints; } - vx_h.v = vec_mul(vx.v, vec_set1(1.f/h)); + vx_h.v = vec_mul(vx.v, vec_set1(1.f / h)); kernel_deval_1_vec(&vx_h, &W_vec, &dW_vec); @@ -106,8 +106,8 @@ int main() { vx_2.f[j] = (i + j) * 2.25f / numPoints; } - vx_h.v = vec_mul(vx.v, vec_set1(1.f/h)); - vx_h_2.v = vec_mul(vx_2.v, vec_set1(1.f/h)); + vx_h.v = vec_mul(vx.v, vec_set1(1.f / h)); + vx_h_2.v = vec_mul(vx_2.v, vec_set1(1.f / h)); kernel_deval_2_vec(&vx_h, &W_vec, &dW_vec, &vx_h_2, &W_vec_2, &dW_vec_2); diff --git a/tests/testPair.c b/tests/testPair.c deleted file mode 100644 index 92987d2fdb625fec6e186a280837f145787f599b..0000000000000000000000000000000000000000 --- a/tests/testPair.c +++ /dev/null @@ -1,322 +0,0 @@ -/******************************************************************************* - * This file is part of SWIFT. - * Copyright (C) 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - ******************************************************************************/ - -#include <fenv.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <unistd.h> -#include "swift.h" - -/* n is both particles per axis and box size: - * particles are generated on a mesh with unit spacing - */ -struct cell *make_cell(size_t n, double *offset, double size, double h, - double density, unsigned long long *partId, - double pert) { - const size_t count = n * n * n; - const double volume = size * size * size; - struct cell *cell = malloc(sizeof(struct cell)); - bzero(cell, sizeof(struct cell)); - - if (posix_memalign((void **)&cell->parts, part_align, - count * sizeof(struct part)) != 0) { - error("couldn't allocate particles, no. 
of particles: %d", (int)count); - } - bzero(cell->parts, count * sizeof(struct part)); - - /* Construct the parts */ - struct part *part = cell->parts; - for (size_t x = 0; x < n; ++x) { - for (size_t y = 0; y < n; ++y) { - for (size_t z = 0; z < n; ++z) { - part->x[0] = - offset[0] + - size * (x + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n; - part->x[1] = - offset[1] + - size * (y + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n; - part->x[2] = - offset[2] + - size * (z + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n; - // part->v[0] = part->x[0] - 1.5; - // part->v[1] = part->x[1] - 1.5; - // part->v[2] = part->x[2] - 1.5; - part->v[0] = random_uniform(-0.05, 0.05); - part->v[1] = random_uniform(-0.05, 0.05); - part->v[2] = random_uniform(-0.05, 0.05); - part->h = size * h / (float)n; - part->id = ++(*partId); -#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) - part->conserved.mass = density * volume / count; -#else - part->mass = density * volume / count; -#endif - part->time_bin = 1; - -#ifdef SWIFT_DEBUG_CHECKS - part->ti_drift = 8; - part->ti_kick = 8; -#endif - - ++part; - } - } - } - - /* Cell properties */ - cell->split = 0; - cell->h_max = h; - cell->count = count; - cell->dx_max_part = 0.; - cell->dx_max_sort = 0.; - cell->width[0] = n; - cell->width[1] = n; - cell->width[2] = n; - cell->loc[0] = offset[0]; - cell->loc[1] = offset[1]; - cell->loc[2] = offset[2]; - - cell->ti_old_part = 8; - cell->ti_end_min = 8; - cell->ti_end_max = 8; - - shuffle_particles(cell->parts, cell->count); - - cell->sorted = 0; - cell->sort = NULL; - cell->sortsize = 0; - - return cell; -} - -void clean_up(struct cell *ci) { - free(ci->parts); - free(ci->sort); - free(ci); -} - -/** - * @brief Initializes all particles field to be ready for a density calculation - */ -void zero_particle_fields(struct cell *c) { - for (int pid = 0; pid < c->count; pid++) { - hydro_init_part(&c->parts[pid], NULL); - } -} - -/** - * @brief Dump all the particles to a file - */ 
-void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) { - FILE *file = fopen(fileName, "w"); - - /* Write header */ - fprintf(file, - "# %4s %10s %10s %10s %10s %10s %10s %13s %13s %13s %13s %13s " - "%13s %13s %13s\n", - "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "rho", "rho_dh", - "wcount", "wcount_dh", "div_v", "curl_vx", "curl_vy", "curl_vz"); - - fprintf(file, "# ci --------------------------------------------\n"); - - for (int pid = 0; pid < ci->count; pid++) { - fprintf(file, - "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e " - "%13e %13e %13e\n", - ci->parts[pid].id, ci->parts[pid].x[0], ci->parts[pid].x[1], - ci->parts[pid].x[2], ci->parts[pid].v[0], ci->parts[pid].v[1], - ci->parts[pid].v[2], hydro_get_density(&ci->parts[pid]), -#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) - 0.f, -#else - ci->parts[pid].density.rho_dh, -#endif - ci->parts[pid].density.wcount, ci->parts[pid].density.wcount_dh, -#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH) - ci->parts[pid].density.div_v, ci->parts[pid].density.rot_v[0], - ci->parts[pid].density.rot_v[1], ci->parts[pid].density.rot_v[2] -#else - 0., 0., 0., 0. 
-#endif - ); - } - - fprintf(file, "# cj --------------------------------------------\n"); - - for (int pjd = 0; pjd < cj->count; pjd++) { - fprintf(file, - "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e " - "%13e %13e %13e\n", - cj->parts[pjd].id, cj->parts[pjd].x[0], cj->parts[pjd].x[1], - cj->parts[pjd].x[2], cj->parts[pjd].v[0], cj->parts[pjd].v[1], - cj->parts[pjd].v[2], hydro_get_density(&cj->parts[pjd]), -#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) - 0.f, -#else - cj->parts[pjd].density.rho_dh, -#endif - cj->parts[pjd].density.wcount, cj->parts[pjd].density.wcount_dh, -#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH) - cj->parts[pjd].density.div_v, cj->parts[pjd].density.rot_v[0], - cj->parts[pjd].density.rot_v[1], cj->parts[pjd].density.rot_v[2] -#else - 0., 0., 0., 0. -#endif - ); - } - - fclose(file); -} - -/* Just a forward declaration... */ -void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj); - -int main(int argc, char *argv[]) { - size_t particles = 0, runs = 0, volume, type = 0; - double offset[3] = {0, 0, 0}, h = 1.1255, size = 1., rho = 1.; - double perturbation = 0.; - struct cell *ci, *cj; - struct space space; - struct engine engine; - struct runner runner; - char c; - static unsigned long long partId = 0; - char outputFileNameExtension[200] = ""; - char outputFileName[200] = ""; - ticks tic, toc, time; - - /* Initialize CPU frequency, this also starts time. 
*/ - unsigned long long cpufreq = 0; - clocks_set_cpufreq(cpufreq); - - srand(0); - - while ((c = getopt(argc, argv, "h:p:r:t:d:f:")) != -1) { - switch (c) { - case 'h': - sscanf(optarg, "%lf", &h); - break; - case 'p': - sscanf(optarg, "%zu", &particles); - break; - case 'r': - sscanf(optarg, "%zu", &runs); - break; - case 't': - sscanf(optarg, "%zu", &type); - break; - case 'd': - sscanf(optarg, "%lf", &perturbation); - break; - case 'f': - strcpy(outputFileNameExtension, optarg); - break; - case '?': - error("Unknown option."); - break; - } - } - - if (h < 0 || particles == 0 || runs == 0 || type > 2) { - printf( - "\nUsage: %s -p PARTICLES_PER_AXIS -r NUMBER_OF_RUNS [OPTIONS...]\n" - "\nGenerates a cell pair, filled with particles on a Cartesian grid." - "\nThese are then interacted using runner_dopair1_density." - "\n\nOptions:" - "\n-t TYPE=0 - cells share face (0), edge (1) or corner (2)" - "\n-h DISTANCE=1.1255 - smoothing length" - "\n-d pert - perturbation to apply to the particles [0,1[" - "\n-f fileName - part of the file name used to save the dumps\n", - argv[0]); - exit(1); - } - - space.periodic = 0; - - engine.s = &space; - engine.time = 0.1f; - engine.ti_current = 8; - engine.max_active_bin = num_time_bins; - runner.e = &engine; - - volume = particles * particles * particles; - message("particles: %zu B\npositions: 0 B", 2 * volume * sizeof(struct part)); - - ci = make_cell(particles, offset, size, h, rho, &partId, perturbation); - for (size_t i = 0; i < type + 1; ++i) offset[i] = 1.; - cj = make_cell(particles, offset, size, h, rho, &partId, perturbation); - - runner_do_sort(&runner, ci, 0x1FFF, 0); - runner_do_sort(&runner, cj, 0x1FFF, 0); - - time = 0; - for (size_t i = 0; i < runs; ++i) { - /* Zero the fields */ - zero_particle_fields(ci); - zero_particle_fields(cj); - - tic = getticks(); - -#if defined(DEFAULT_SPH) || !defined(WITH_VECTORIZATION) - /* Run the test */ - runner_dopair1_density(&runner, ci, cj); -#endif - - toc = getticks(); - 
time += toc - tic; - - /* Dump if necessary */ - if (i % 50 == 0) { - sprintf(outputFileName, "swift_dopair_%s.dat", outputFileNameExtension); - dump_particle_fields(outputFileName, ci, cj); - } - } - - /* Output timing */ - message("SWIFT calculation took %lli ticks.", time / runs); - - /* Now perform a brute-force version for accuracy tests */ - - /* Zero the fields */ - zero_particle_fields(ci); - zero_particle_fields(cj); - - tic = getticks(); - -#if defined(DEFAULT_SPH) || !defined(WITH_VECTORIZATION) - /* Run the brute-force test */ - pairs_all_density(&runner, ci, cj); -#endif - - toc = getticks(); - - /* Dump */ - sprintf(outputFileName, "brute_force_%s.dat", outputFileNameExtension); - dump_particle_fields(outputFileName, ci, cj); - - /* Output timing */ - message("Brute force calculation took %lli ticks.", toc - tic); - - /* Clean things to make the sanitizer happy ... */ - clean_up(ci); - clean_up(cj); - - return 0; -} diff --git a/tests/testPair.sh.in b/tests/testPair.sh.in deleted file mode 100755 index bd7051b060c4acab6cf5a164af1914715856849b..0000000000000000000000000000000000000000 --- a/tests/testPair.sh.in +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -echo "" - -rm -f brute_force_standard.dat swift_dopair_standard.dat - -./testPair -p 6 -r 1 -d 0 -f standard - -python @srcdir@/difffloat.py brute_force_standard.dat swift_dopair_standard.dat @srcdir@/tolerance_pair_normal.dat - -exit $? diff --git a/tests/testPairPerturbed.sh.in b/tests/testPairPerturbed.sh.in deleted file mode 100755 index 9f214e25a098448a906f9da307ea569e327cfdea..0000000000000000000000000000000000000000 --- a/tests/testPairPerturbed.sh.in +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -echo "" - -rm -f brute_force_perturbed.dat swift_dopair_perturbed.dat - -./testPair -p 6 -r 1 -d 0.1 -f perturbed - -python @srcdir@/difffloat.py brute_force_perturbed.dat swift_dopair_perturbed.dat @srcdir@/tolerance_pair_perturbed.dat - -exit $? 
diff --git a/tests/testPeriodicBC.c b/tests/testPeriodicBC.c new file mode 100644 index 0000000000000000000000000000000000000000..6fa2dc607b996b9e8508338a9806633c5a4d1a89 --- /dev/null +++ b/tests/testPeriodicBC.c @@ -0,0 +1,587 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (C) 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk). + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ + +/* Config parameters. */ +#include "../config.h" + +/* Some standard headers. */ +#include <fenv.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +/* Local headers. 
*/ +#include "swift.h" + +#define ACC_THRESHOLD 1e-5 + +#if defined(WITH_VECTORIZATION) +#define DOSELF1 runner_doself1_density_vec +#define DOPAIR1 runner_dopair1_branch_density +#define DOSELF1_NAME "runner_doself1_density_vec" +#define DOPAIR1_NAME "runner_dopair1_density_vec" +#endif + +#ifndef DOSELF1 +#define DOSELF1 runner_doself1_density +#define DOSELF1_NAME "runner_doself1_density" +#endif + +#ifndef DOPAIR1 +#define DOPAIR1 runner_dopair1_branch_density +#define DOPAIR1_NAME "runner_dopair1_density" +#endif + +enum velocity_types { + velocity_zero, + velocity_random, + velocity_divergent, + velocity_rotating +}; + +/** + * @brief Constructs a cell and all of its particle in a valid state prior to + * a DOPAIR or DOSELF calcuation. + * + * @param n The cube root of the number of particles. + * @param offset The position of the cell offset from (0,0,0). + * @param size The cell size. + * @param h The smoothing length of the particles in units of the inter-particle + *separation. + * @param density The density of the fluid. + * @param partId The running counter of IDs. + * @param pert The perturbation to apply to the particles in the cell in units + *of the inter-particle separation. + * @param vel The type of velocity field (0, random, divergent, rotating) + */ +struct cell *make_cell(size_t n, double *offset, double size, double h, + double density, long long *partId, double pert, + enum velocity_types vel) { + const size_t count = n * n * n; + const double volume = size * size * size; + struct cell *cell = malloc(sizeof(struct cell)); + bzero(cell, sizeof(struct cell)); + + if (posix_memalign((void **)&cell->parts, part_align, + count * sizeof(struct part)) != 0) { + error("couldn't allocate particles, no. 
of particles: %d", (int)count); + } + bzero(cell->parts, count * sizeof(struct part)); + + float h_max = 0.f; + + /* Construct the parts */ + struct part *part = cell->parts; + for (size_t x = 0; x < n; ++x) { + for (size_t y = 0; y < n; ++y) { + for (size_t z = 0; z < n; ++z) { + part->x[0] = + offset[0] + + size * (x + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n; + part->x[1] = + offset[1] + + size * (y + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n; + part->x[2] = + offset[2] + + size * (z + 0.5 + random_uniform(-0.5, 0.5) * pert) / (float)n; + switch (vel) { + case velocity_zero: + part->v[0] = 0.f; + part->v[1] = 0.f; + part->v[2] = 0.f; + break; + case velocity_random: + part->v[0] = random_uniform(-0.05, 0.05); + part->v[1] = random_uniform(-0.05, 0.05); + part->v[2] = random_uniform(-0.05, 0.05); + break; + case velocity_divergent: + part->v[0] = part->x[0] - 1.5 * size; + part->v[1] = part->x[1] - 1.5 * size; + part->v[2] = part->x[2] - 1.5 * size; + break; + case velocity_rotating: + part->v[0] = part->x[1]; + part->v[1] = -part->x[0]; + part->v[2] = 0.f; + break; + } + part->h = size * h / (float)n; + h_max = fmax(h_max, part->h); + part->id = ++(*partId); + +#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) + part->conserved.mass = density * volume / count; + +#ifdef SHADOWFAX_SPH + double anchor[3] = {0., 0., 0.}; + double side[3] = {1., 1., 1.}; + voronoi_cell_init(&part->cell, part->x, anchor, side); +#endif + +#else + part->mass = density * volume / count; +#endif + +#if defined(HOPKINS_PE_SPH) + part->entropy = 1.f; + part->entropy_one_over_gamma = 1.f; +#endif + + part->time_bin = 1; + +#ifdef SWIFT_DEBUG_CHECKS + part->ti_drift = 8; + part->ti_kick = 8; +#endif + + ++part; + } + } + } + + /* Cell properties */ + cell->split = 0; + cell->h_max = h_max; + cell->count = count; + cell->dx_max_part = 0.; + cell->dx_max_sort = 0.; + cell->width[0] = size; + cell->width[1] = size; + cell->width[2] = size; + cell->loc[0] = offset[0]; + 
cell->loc[1] = offset[1]; + cell->loc[2] = offset[2]; + + cell->ti_old_part = 8; + cell->ti_end_min = 8; + cell->ti_end_max = 8; + + shuffle_particles(cell->parts, cell->count); + + cell->sorted = 0; + for (int k = 0; k < 13; k++) cell->sort[k] = NULL; + + return cell; +} + +void clean_up(struct cell *ci) { + free(ci->parts); + for (int k = 0; k < 13; k++) + if (ci->sort[k] != NULL) free(ci->sort[k]); + free(ci); +} + +/** + * @brief Initializes all particles field to be ready for a density calculation + */ +void zero_particle_fields(struct cell *c) { + for (int pid = 0; pid < c->count; pid++) { + hydro_init_part(&c->parts[pid], NULL); + } +} + +/** + * @brief Ends the loop by adding the appropriate coefficients + */ +void end_calculation(struct cell *c) { + for (int pid = 0; pid < c->count; pid++) { + hydro_end_density(&c->parts[pid]); + } +} + +/** + * @brief Dump all the particles to a file + */ +void dump_particle_fields(char *fileName, struct cell *main_cell, int i, int j, + int k) { + FILE *file = fopen(fileName, "a"); + + /* Write header */ + fprintf(file, + "# %4s %10s %10s %10s %10s %10s %10s %13s %13s %13s %13s %13s " + "%13s %13s %13s\n", + "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "rho", "rho_dh", + "wcount", "wcount_dh", "div_v", "curl_vx", "curl_vy", "curl_vz"); + + fprintf(file, "# Centre cell at (i,j,k)=(%d, %d, %d) ---------------------\n", + i, j, k); + + /* Write main cell */ + for (int pid = 0; pid < main_cell->count; pid++) { + fprintf(file, + "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e " + "%13e %13e %13e\n", + main_cell->parts[pid].id, main_cell->parts[pid].x[0], + main_cell->parts[pid].x[1], main_cell->parts[pid].x[2], + main_cell->parts[pid].v[0], main_cell->parts[pid].v[1], + main_cell->parts[pid].v[2], + hydro_get_density(&main_cell->parts[pid]), +#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) + 0.f, +#else + main_cell->parts[pid].density.rho_dh, +#endif + main_cell->parts[pid].density.wcount, + 
main_cell->parts[pid].density.wcount_dh, +#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH) + main_cell->parts[pid].density.div_v, + main_cell->parts[pid].density.rot_v[0], + main_cell->parts[pid].density.rot_v[1], + main_cell->parts[pid].density.rot_v[2] +#else + 0., 0., 0., 0. +#endif + ); + } + fclose(file); +} + +/** + * @brief Compares the vectorised result against + * the serial result of the interaction. + * + * @param serial_parts Particle array that has been interacted serially + * @param vec_parts Particle array to be interacted using vectors + * @param count No. of particles that have been interacted + * @param threshold Level of accuracy needed + * + * @return Non-zero value if difference found, 0 otherwise + */ +int check_results(struct part *serial_parts, struct part *vec_parts, int count, + double threshold) { + int result = 0; + + for (int i = 0; i < count; i++) + result += compare_particles(serial_parts[i], vec_parts[i], threshold); + + return result; +} + +/* Just a forward declaration... 
*/ +void runner_doself1_density(struct runner *r, struct cell *ci); +void runner_doself1_density_vec(struct runner *r, struct cell *ci); +void runner_dopair1_branch_density(struct runner *r, struct cell *ci, + struct cell *cj); + +void test_boundary_conditions(struct cell **cells, struct runner runner, + const int loc_i, const int loc_j, const int loc_k, + const int dim, char *swiftOutputFileName, + char *bruteForceOutputFileName) { + + /* Store the main cell for future use */ + struct cell *main_cell = cells[loc_i * (dim * dim) + loc_j * dim + loc_k]; + + /* Zero the fields */ + for (int j = 0; j < 512; ++j) zero_particle_fields(cells[j]); + +/* Run all the pairs */ +#if !(defined(MINIMAL_SPH) && defined(WITH_VECTORIZATION)) + +#ifdef WITH_VECTORIZATION + runner.ci_cache.count = 0; + cache_init(&runner.ci_cache, 512); + runner.cj_cache.count = 0; + cache_init(&runner.cj_cache, 512); +#endif + + /* Now loop over all the neighbours of this cell + * and perform the pair interactions. */ + for (int ii = -1; ii < 2; ii++) { + int iii = loc_i + ii; + iii = (iii + dim) % dim; + for (int jj = -1; jj < 2; jj++) { + int jjj = loc_j + jj; + jjj = (jjj + dim) % dim; + for (int kk = -1; kk < 2; kk++) { + int kkk = loc_k + kk; + kkk = (kkk + dim) % dim; + + /* Get the neighbouring cell */ + struct cell *cj = cells[iii * (dim * dim) + jjj * dim + kkk]; + + if (cj != main_cell) DOPAIR1(&runner, main_cell, cj); + } + } + } + + /* And now the self-interaction */ + + DOSELF1(&runner, main_cell); + +#endif + + /* Let's get physical ! */ + end_calculation(main_cell); + + /* Dump particles from the main cell. 
*/ + dump_particle_fields(swiftOutputFileName, main_cell, loc_i, loc_j, loc_k); + + /* Now perform a brute-force version for accuracy tests */ + + /* Zero the fields */ + for (int i = 0; i < 512; ++i) zero_particle_fields(cells[i]); + +#if !(defined(MINIMAL_SPH) && defined(WITH_VECTORIZATION)) + + /* Now loop over all the neighbours of this cell + * and perform the pair interactions. */ + for (int ii = -1; ii < 2; ii++) { + int iii = loc_i + ii; + iii = (iii + dim) % dim; + for (int jj = -1; jj < 2; jj++) { + int jjj = loc_j + jj; + jjj = (jjj + dim) % dim; + for (int kk = -1; kk < 2; kk++) { + int kkk = loc_k + kk; + kkk = (kkk + dim) % dim; + + /* Get the neighbouring cell */ + struct cell *cj = cells[iii * (dim * dim) + jjj * dim + kkk]; + + if (cj != main_cell) pairs_all_density(&runner, main_cell, cj); + } + } + } + + /* And now the self-interaction */ + self_all_density(&runner, main_cell); + +#endif + + /* Let's get physical ! */ + end_calculation(main_cell); + + /* Dump */ + dump_particle_fields(bruteForceOutputFileName, main_cell, loc_i, loc_j, + loc_k); +} + +/* And go... */ +int main(int argc, char *argv[]) { + + engine_pin(); + size_t runs = 0, particles = 0; + double h = 1.23485, size = 1., rho = 1.; + double perturbation = 0.; + double threshold = ACC_THRESHOLD; + char outputFileNameExtension[200] = ""; + char swiftOutputFileName[200] = ""; + char bruteForceOutputFileName[200] = ""; + enum velocity_types vel = velocity_zero; + + /* Initialize CPU frequency, this also starts time. 
*/ + unsigned long long cpufreq = 0; + clocks_set_cpufreq(cpufreq); + + /* Choke on FP-exceptions */ + feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); + + /* Get some randomness going */ + srand(0); + + char c; + while ((c = getopt(argc, argv, "m:s:h:n:r:t:d:f:v:a:")) != -1) { + switch (c) { + case 'h': + sscanf(optarg, "%lf", &h); + break; + case 's': + sscanf(optarg, "%lf", &size); + break; + case 'n': + sscanf(optarg, "%zu", &particles); + break; + case 'r': + sscanf(optarg, "%zu", &runs); + break; + case 'd': + sscanf(optarg, "%lf", &perturbation); + break; + case 'm': + sscanf(optarg, "%lf", &rho); + break; + case 'f': + strcpy(outputFileNameExtension, optarg); + break; + case 'v': + sscanf(optarg, "%d", (int *)&vel); + break; + case 'a': + sscanf(optarg, "%lf", &threshold); + break; + case '?': + error("Unknown option."); + break; + } + } + + if (h < 0 || particles == 0 || runs == 0) { + printf( + "\nUsage: %s -n PARTICLES_PER_AXIS -r NUMBER_OF_RUNS [OPTIONS...]\n" + "\nGenerates 27 cells, filled with particles on a Cartesian grid." + "\nThese are then interacted using runner_dopair1_density() and " + "runner_doself1_density()." + "\n\nOptions:" + "\n-h DISTANCE=1.2348 - Smoothing length in units of <x>" + "\n-m rho - Physical density in the cell" + "\n-s size - Physical size of the cell" + "\n-d pert - Perturbation to apply to the particles [0,1[" + "\n-v type (0,1,2,3) - Velocity field: (zero, random, divergent, " + "rotating)" + "\n-f fileName - Part of the file name used to save the dumps\n", + argv[0]); + exit(1); + } + + /* Help users... 
*/ + message("DOSELF1 function called: %s", DOSELF1_NAME); + message("DOPAIR1 function called: %s", DOPAIR1_NAME); + message("Vector size: %d", VEC_SIZE); + message("Adiabatic index: ga = %f", hydro_gamma); + message("Hydro implementation: %s", SPH_IMPLEMENTATION); + message("Smoothing length: h = %f", h * size); + message("Kernel: %s", kernel_name); + message("Neighbour target: N = %f", pow_dimension(h) * kernel_norm); + message("Density target: rho = %f", rho); + message("div_v target: div = %f", vel == 2 ? 3.f : 0.f); + message("curl_v target: curl = [0., 0., %f]", vel == 3 ? -2.f : 0.f); + + printf("\n"); + + /* Build the infrastructure */ + struct space space; + space.periodic = 1; + space.dim[0] = 8.; + space.dim[1] = 8.; + space.dim[2] = 8.; + + struct hydro_props hp; + hp.h_max = FLT_MAX; + + struct engine engine; + engine.s = &space; + engine.time = 0.1f; + engine.ti_current = 8; + engine.max_active_bin = num_time_bins; + engine.hydro_properties = &hp; + + struct runner runner; + runner.e = &engine; + + /* Construct some cells */ + struct cell *cells[512]; + const int dim = 8; + static long long partId = 0; + for (int i = 0; i < dim; ++i) { + for (int j = 0; j < dim; ++j) { + for (int k = 0; k < dim; ++k) { + double offset[3] = {i * size, j * size, k * size}; + cells[i * (dim * dim) + j * dim + k] = make_cell( + particles, offset, size, h, rho, &partId, perturbation, vel); + + runner_do_drift_part(&runner, cells[i * (dim * dim) + j * dim + k], 0); + + runner_do_sort(&runner, cells[i * (dim * dim) + j * dim + k], 0x1FFF, 0, + 0); + } + } + } + + /* Create output file names. */ + sprintf(swiftOutputFileName, "swift_periodic_BC_%s.dat", + outputFileNameExtension); + sprintf(bruteForceOutputFileName, "brute_force_periodic_BC_%s.dat", + outputFileNameExtension); + + /* Delete files if they already exist. 
*/ + remove(swiftOutputFileName); + remove(bruteForceOutputFileName); + + const int half_dim = (dim - 1) / 2; + + /* Test the periodic boundary conditions for each of the 8 corners. Interact + * each corner with all of its 26 neighbours.*/ + test_boundary_conditions(cells, runner, 0, 0, 0, dim, swiftOutputFileName, + bruteForceOutputFileName); + test_boundary_conditions(cells, runner, dim - 1, 0, 0, dim, + swiftOutputFileName, bruteForceOutputFileName); + test_boundary_conditions(cells, runner, dim - 1, 0, dim - 1, dim, + swiftOutputFileName, bruteForceOutputFileName); + test_boundary_conditions(cells, runner, 0, 0, dim - 1, dim, + swiftOutputFileName, bruteForceOutputFileName); + test_boundary_conditions(cells, runner, 0, dim - 1, 0, dim, + swiftOutputFileName, bruteForceOutputFileName); + test_boundary_conditions(cells, runner, dim - 1, dim - 1, 0, dim, + swiftOutputFileName, bruteForceOutputFileName); + test_boundary_conditions(cells, runner, 0, dim - 1, dim - 1, dim, + swiftOutputFileName, bruteForceOutputFileName); + test_boundary_conditions(cells, runner, dim - 1, dim - 1, dim - 1, dim, + swiftOutputFileName, bruteForceOutputFileName); + + /* Test the boundary conditions for cells at the centre of each face of the + * box. 
*/ + test_boundary_conditions(cells, runner, half_dim, half_dim, 0, dim, + swiftOutputFileName, bruteForceOutputFileName); + test_boundary_conditions(cells, runner, dim - 1, half_dim, half_dim, dim, + swiftOutputFileName, bruteForceOutputFileName); + test_boundary_conditions(cells, runner, half_dim, half_dim, dim - 1, dim, + swiftOutputFileName, bruteForceOutputFileName); + test_boundary_conditions(cells, runner, 0, half_dim, half_dim, dim, + swiftOutputFileName, bruteForceOutputFileName); + test_boundary_conditions(cells, runner, half_dim, 0, half_dim, dim, + swiftOutputFileName, bruteForceOutputFileName); + test_boundary_conditions(cells, runner, half_dim, dim - 1, half_dim, dim, + swiftOutputFileName, bruteForceOutputFileName); + + /* Test the boundary conditions for cells at the centre of each edge of the + * box. */ + test_boundary_conditions(cells, runner, half_dim, dim - 1, 0, dim, + swiftOutputFileName, bruteForceOutputFileName); + test_boundary_conditions(cells, runner, dim - 1, dim - 1, half_dim, dim, + swiftOutputFileName, bruteForceOutputFileName); + test_boundary_conditions(cells, runner, half_dim, dim - 1, dim - 1, dim, + swiftOutputFileName, bruteForceOutputFileName); + test_boundary_conditions(cells, runner, 0, dim - 1, half_dim, dim, + swiftOutputFileName, bruteForceOutputFileName); + + test_boundary_conditions(cells, runner, 0, half_dim, 0, dim, + swiftOutputFileName, bruteForceOutputFileName); + test_boundary_conditions(cells, runner, dim - 1, half_dim, 0, dim, + swiftOutputFileName, bruteForceOutputFileName); + test_boundary_conditions(cells, runner, dim - 1, half_dim, dim - 1, dim, + swiftOutputFileName, bruteForceOutputFileName); + test_boundary_conditions(cells, runner, 0, half_dim, dim - 1, dim, + swiftOutputFileName, bruteForceOutputFileName); + + test_boundary_conditions(cells, runner, half_dim, 0, 0, dim, + swiftOutputFileName, bruteForceOutputFileName); + test_boundary_conditions(cells, runner, dim - 1, 0, half_dim, dim, + 
swiftOutputFileName, bruteForceOutputFileName); + test_boundary_conditions(cells, runner, half_dim, 0, dim - 1, dim, + swiftOutputFileName, bruteForceOutputFileName); + test_boundary_conditions(cells, runner, 0, 0, half_dim, dim, + swiftOutputFileName, bruteForceOutputFileName); + + /* Clean things to make the sanitizer happy ... */ + for (int i = 0; i < 512; ++i) clean_up(cells[i]); + + return 0; +} diff --git a/tests/testPeriodicBC.sh.in b/tests/testPeriodicBC.sh.in new file mode 100755 index 0000000000000000000000000000000000000000..075acc0b68686bd2f418cf457140b3d6b93093d5 --- /dev/null +++ b/tests/testPeriodicBC.sh.in @@ -0,0 +1,30 @@ +#!/bin/bash + +for v in {0..3} +do + echo "" + + rm -f brute_force_periodic_BC_standard.dat swift_periodic_BC_standard.dat + + echo "Running ./testPeriodicBC -n 6 -r 1 -d 0 -f standard -v $v" + ./testPeriodicBC -n 6 -r 1 -d 0 -f standard -v $v + + if [ -e brute_force_periodic_BC_standard.dat ] + then + if python @srcdir@/difffloat.py brute_force_periodic_BC_standard.dat swift_periodic_BC_standard.dat @srcdir@/tolerance_periodic_BC_normal.dat + then + echo "Accuracy test passed" + else + echo "Accuracy test failed" + exit 1 + fi + else + echo "Error Missing test output file" + exit 1 + fi + + echo "------------" + +done + +exit $? 
diff --git a/tests/testPeriodicBCPerturbed.sh.in b/tests/testPeriodicBCPerturbed.sh.in new file mode 100755 index 0000000000000000000000000000000000000000..ac190d5a80654154dcd329e69c1c9cc9fe45833a --- /dev/null +++ b/tests/testPeriodicBCPerturbed.sh.in @@ -0,0 +1,30 @@ +#!/bin/bash + +for v in {0..3} +do + echo "" + + rm -f brute_force_periodic_BC_perturbed.dat swift_periodic_BC_perturbed.dat + + echo "Running ./testPeriodicBC -n 6 -r 1 -d 0.1 -f perturbed -v $v" + ./testPeriodicBC -n 6 -r 1 -d 0.1 -f perturbed -v $v + + if [ -e brute_force_periodic_BC_perturbed.dat ] + then + if python @srcdir@/difffloat.py brute_force_periodic_BC_perturbed.dat swift_periodic_BC_perturbed.dat @srcdir@/tolerance_periodic_BC_perturbed.dat + then + echo "Accuracy test passed" + else + echo "Accuracy test failed" + exit 1 + fi + else + echo "Error Missing test output file" + exit 1 + fi + + echo "------------" + +done + +exit $? diff --git a/tests/testSPHStep.c b/tests/testSPHStep.c index 014dacd1eb62040b03e6038b2c23183a24ec4850..e890c7c1a834ec7ca13ed2e8a509b7ea42db28fd 100644 --- a/tests/testSPHStep.c +++ b/tests/testSPHStep.c @@ -81,8 +81,7 @@ struct cell *make_cell(size_t N, float cellSize, int offset[3], int id_offset) { cell->ti_end_max = 1; cell->sorted = 0; - cell->sort = NULL; - cell->sortsize = 0; + for (int k = 0; k < 13; k++) cell->sort[k] = NULL; return cell; } @@ -212,7 +211,8 @@ int main() { for (int j = 0; j < 27; ++j) { free(cells[j]->parts); free(cells[j]->xparts); - free(cells[j]->sort); + for (int k = 0; k < 13; k++) + if (cells[j]->sort[k] != NULL) free(cells[j]->sort[k]); free(cells[j]); } diff --git a/tests/testThreadpool.c b/tests/testThreadpool.c index aa65d533a29afbe4e7e8384fb887281822a31e58..6b39991e1620fa90cfea0b7103d6e3e2ce4ed286 100644 --- a/tests/testThreadpool.c +++ b/tests/testThreadpool.c @@ -17,6 +17,8 @@ * ******************************************************************************/ +#include "../config.h" + // Standard includes. 
#include <stdio.h> #include <stdlib.h> @@ -31,7 +33,7 @@ void map_function_first(void *map_data, int num_elements, void *extra_data) { for (int ind = 0; ind < num_elements; ind++) { int input = inputs[ind]; usleep(rand() % 1000000); - printf("map_function_first: got input %i.\n", input); + printf(" map_function_first: got input %i.\n", input); fflush(stdout); } } @@ -41,7 +43,7 @@ void map_function_second(void *map_data, int num_elements, void *extra_data) { for (int ind = 0; ind < num_elements; ind++) { int input = inputs[ind]; usleep(rand() % 1000000); - printf("map_function_second: got input %i.\n", input); + printf(" map_function_second: got input %i.\n", input); fflush(stdout); } } @@ -49,37 +51,49 @@ void map_function_second(void *map_data, int num_elements, void *extra_data) { int main(int argc, char *argv[]) { // Some constants for this test. - const int num_threads = 16; const int N = 20; const int num_runs = 2; - // Create a threadpool with 8 threads. - struct threadpool tp; - threadpool_init(&tp, num_threads); + // Create threadpools with different numbers of threads. + for (int num_thread = 1; num_thread <= 16; num_thread *= 4) { + printf("# Creating threadpool with %d threads\n", num_thread); + struct threadpool tp; + threadpool_init(&tp, num_thread); - // Main loop. - for (int run = 0; run < num_runs; run++) { + // Main loop. + for (int run = 0; run < num_runs; run++) { - // Run over a set of integers and print them. - int data[N]; - for (int k = 0; k < N; k++) data[k] = k; - printf("processing integers from 0..%i.\n", N); - fflush(stdout); - threadpool_map(&tp, map_function_first, data, N, sizeof(int), 1, NULL); + // Run over a set of integers and print them. + int data[N]; + for (int k = 0; k < N; k++) data[k] = k; + printf("1..processing integers from 0..%i.\n", N); + fflush(stdout); + threadpool_map(&tp, map_function_first, data, N, sizeof(int), 1, NULL); - // Do the same thing again, with less jobs than threads. 
- printf("processing integers from 0..%i.\n", N / 2); - fflush(stdout); - threadpool_map(&tp, map_function_second, data, N / 2, sizeof(int), 1, NULL); + // Do the same thing again, with less jobs than threads. + printf("2..processing integers from 0..%i.\n", N / 2); + fflush(stdout); + threadpool_map(&tp, map_function_second, data, N / 2, sizeof(int), 1, + NULL); - // Do the same thing again, with a chunk size of two. - printf("processing integers from 0..%i.\n", N); - fflush(stdout); - threadpool_map(&tp, map_function_first, data, N, sizeof(int), 2, NULL); - } + // Do the same thing again, with a chunk size of two. + printf("3..processing integers from 0..%i.\n", N); + fflush(stdout); + threadpool_map(&tp, map_function_first, data, N, sizeof(int), 2, NULL); + } + +/* If logging was enabled, dump the log. */ +#ifdef SWIFT_DEBUG_THREADPOOL + char filename[80]; + sprintf(filename, "threadpool_log-%d.txt", num_thread); + printf("# Dumping log\n"); + threadpool_dump_log(&tp, filename, 1); +#endif - /* Be clean */ - threadpool_clean(&tp); + /* Be clean */ + threadpool_clean(&tp); + printf("\n"); + } return 0; } diff --git a/tests/tolerance_125_normal.dat b/tests/tolerance_125_normal.dat index c9ad23d4472c46e64e8418e46c5fe71f813b23b5..0f11d03507b23c76b5703e118eede1359fe2afba 100644 --- a/tests/tolerance_125_normal.dat +++ b/tests/tolerance_125_normal.dat @@ -1,3 +1,4 @@ # ID pos_x pos_y pos_z v_x v_y v_z h rho div_v S u P c a_x a_y a_z h_dt v_sig dS/dt du/dt 0 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 0 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-5 1e-5 1e-5 1e-5 1e-5 1e-5 1e-5 diff --git a/tests/tolerance_125_perturbed.dat b/tests/tolerance_125_perturbed.dat index 04e642b28cb3729cb81f8183c3e69595ac651876..349f68c1ad6393ba2ffba675126edc3de11a487e 100644 --- 
a/tests/tolerance_125_perturbed.dat +++ b/tests/tolerance_125_perturbed.dat @@ -1,3 +1,4 @@ # ID pos_x pos_y pos_z v_x v_y v_z h rho div_v S u P c a_x a_y a_z h_dt v_sig dS/dt du/dt 0 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 - 0 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 5e-3 5e-3 5e-3 1e-4 1e-4 1e-4 1e-4 + 0 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 2e-3 2e-3 2e-3 1e-4 1e-4 1e-4 1e-4 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2e-4 2e-4 2e-4 1e-6 1e-6 1e-6 1e-6 diff --git a/tests/tolerance_27_normal.dat b/tests/tolerance_27_normal.dat index 31ee002bb9c73ff8d74cce545aff715476b33507..0fe55e84a42e7541068744e1e554afff1731ed3f 100644 --- a/tests/tolerance_27_normal.dat +++ b/tests/tolerance_27_normal.dat @@ -1,3 +1,4 @@ # ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2e-6 4e-5 2e-4 2e-3 1e-5 6e-6 6e-6 6e-6 - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.2e-4 1e-4 1e-4 2e-4 1e-4 1e-4 1e-4 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2e-6 4e-5 4e-4 1e-2 1e-5 6e-6 6e-6 6e-6 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.2e-4 1e-4 2e-4 2e-4 1e-4 1e-4 1e-4 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 diff --git a/tests/tolerance_27_perturbed.dat b/tests/tolerance_27_perturbed.dat index 9c6ee8c77cc6d53e67db9dbb86be197d49149b10..aa86962b733e2da73211bceeb30b2345af808bb5 100644 --- a/tests/tolerance_27_perturbed.dat +++ b/tests/tolerance_27_perturbed.dat @@ -1,3 +1,4 @@ # ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.2e-6 1e-4 5e-5 2e-3 4e-6 3e-6 3e-6 3e-6 - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2e-3 1e-5 1e-4 4e-5 2e-3 2e-3 2e-3 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2e-6 1e-4 2e-4 1e-2 1e-5 3e-6 3e-6 7e-6 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2e-3 1e-5 2e-3 6e-5 2e-3 2e-3 2e-3 + 0 1e-6 1e-6 
1e-6 1e-6 1e-6 1e-6 1e-6 4e-4 1e-6 1e0 1e-6 2e-6 2e-6 2e-6 diff --git a/tests/tolerance_27_perturbed_h.dat b/tests/tolerance_27_perturbed_h.dat new file mode 100644 index 0000000000000000000000000000000000000000..5142c2a2090e15381a19b2bc71e253a482973b11 --- /dev/null +++ b/tests/tolerance_27_perturbed_h.dat @@ -0,0 +1,4 @@ +# ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2.4e-6 1e-4 5e-4 1.2e-2 1e-5 3e-6 3e-6 8e-6 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.2e-6 1.4e-2 1e-5 2e-3 2.5e-4 3e-3 3e-3 3e-3 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e0 1e-6 4e-6 4e-6 4e-6 diff --git a/tests/tolerance_27_perturbed_h2.dat b/tests/tolerance_27_perturbed_h2.dat new file mode 100644 index 0000000000000000000000000000000000000000..23f6a5006124f6233aebd111005760a5dcc5b6a3 --- /dev/null +++ b/tests/tolerance_27_perturbed_h2.dat @@ -0,0 +1,4 @@ +# ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 3e-6 1e-4 5e-4 1.5e-2 1.4e-5 3e-6 3e-6 9e-6 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.5e-6 1.57e-2 1e-5 4.74e-3 3.89e-4 3e-3 3e-3 3e-3 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e0 1e-6 4e-6 4e-6 4e-6 diff --git a/tests/tolerance_pair_active.dat b/tests/tolerance_pair_active.dat new file mode 100644 index 0000000000000000000000000000000000000000..b07697a686eb7801326ceaf77cf93fb3a1491c2e --- /dev/null +++ b/tests/tolerance_pair_active.dat @@ -0,0 +1,4 @@ +# ID wcount + 0 1e-2 + 0 1e-2 + 0 1e-2 diff --git a/tests/tolerance_periodic_BC_normal.dat b/tests/tolerance_periodic_BC_normal.dat new file mode 100644 index 0000000000000000000000000000000000000000..823e4af488b343f57e3c90e89ee2d4f13d3ca94b --- /dev/null +++ b/tests/tolerance_periodic_BC_normal.dat @@ -0,0 +1,4 @@ +# ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 4e-6 4e-5 1e-3 1e-2 2e-4 2e-4 2e-4 2e-4 + 0 
1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2e-6 2e-4 1e-4 2e-4 6e-4 2e-3 2e-3 2e-3 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-4 1e-6 1e-4 5e-4 2e-4 2e-4 2e-4 diff --git a/tests/tolerance_pair_normal.dat b/tests/tolerance_periodic_BC_perturbed.dat similarity index 53% rename from tests/tolerance_pair_normal.dat rename to tests/tolerance_periodic_BC_perturbed.dat index f5031c5f47dfa203300ebcc9a47fbac42f854d26..df5ee6458ba05eed08006586514467fcdb715990 100644 --- a/tests/tolerance_pair_normal.dat +++ b/tests/tolerance_periodic_BC_perturbed.dat @@ -1,3 +1,4 @@ # ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-5 1e-5 2e-5 3e-2 1e-5 1e-5 1e-5 1e-5 - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-5 1.2e-5 1e-5 1e-2 1e-4 1e-4 1e-4 1e-4 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 3e-6 4e-5 1e-3 1e-2 2e-4 1e-4 1e-4 1e-4 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2e-6 6e-3 1e-4 3e-3 1e-2 6e-3 6e-3 6e-3 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2e-3 1e-6 1e0 5e-4 3e-3 3e-3 3e-3 diff --git a/tests/tolerance_pair_perturbed.dat b/tests/tolerance_testInteractions.dat similarity index 52% rename from tests/tolerance_pair_perturbed.dat rename to tests/tolerance_testInteractions.dat index ca58ff45995158e031eca6b60eec498aa6c627ef..ebb376bf26bfdc0fb2107ab720bbf9eca5a35bce 100644 --- a/tests/tolerance_pair_perturbed.dat +++ b/tests/tolerance_testInteractions.dat @@ -1,3 +1,4 @@ # ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-5 1e-5 2e-5 3e-2 1e-5 1e-5 1e-5 1e-5 - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-3 4e-4 8e-3 2e-2 1e-4 1.6e-4 1.6e-4 1.6e-4 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-5 4e-5 4e-4 1e-2 1e-5 6e-6 6e-6 6e-6 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.2e-4 1e-4 2e-4 2e-4 1e-4 1e-4 1e-4 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 diff --git a/theory/Multipoles/bibliography.bib b/theory/Multipoles/bibliography.bib index 
12e274dd63093ba1e14750249f2538c092e5268a..193db42ea4947e49930b79cbd663562d971ec2d4 100644 --- a/theory/Multipoles/bibliography.bib +++ b/theory/Multipoles/bibliography.bib @@ -96,3 +96,69 @@ doi="10.1007/BF02123482", url="http://dx.doi.org/10.1007/BF02123482" } + + +@article{Greengard1987, +title = "A fast algorithm for particle simulations", +journal = "Journal of Computational Physics", +volume = "73", +number = "2", +pages = "325 - 348", +year = "1987", +note = "", +issn = "0021-9991", +doi = "http://dx.doi.org/10.1016/0021-9991(87)90140-9", +url = "http://www.sciencedirect.com/science/article/pii/0021999187901409", +author = "L Greengard and V Rokhlin", +} + +@article{Cheng1999, +title = "A Fast Adaptive Multipole Algorithm in Three Dimensions", +journal = "Journal of Computational Physics", +volume = "155", +number = "2", +pages = "468 - 498", +year = "1999", +note = "", +issn = "0021-9991", +doi = "http://dx.doi.org/10.1006/jcph.1999.6355", +url = "http://www.sciencedirect.com/science/article/pii/S0021999199963556", +author = "H. Cheng and L. Greengard and V. 
Rokhlin", +keywords = "Laplace equation", +keywords = "translation operators", +keywords = "fast multipole method", +keywords = "adaptive algorithms" +} + + +@ARTICLE{Dehnen2000, + author = {{Dehnen}, W.}, + title = "{A Very Fast and Momentum-conserving Tree Code}", + journal = {\apjl}, + eprint = {astro-ph/0003209}, + keywords = {Celestial Mechanics, Stellar Dynamics, Methods: n-Body Simulations, Methods: Numerical}, + year = 2000, + month = jun, + volume = 536, + pages = {L39-L42}, + doi = {10.1086/312724}, + adsurl = {http://adsabs.harvard.edu/abs/2000ApJ...536L..39D}, + adsnote = {Provided by the SAO/NASA Astrophysics Data System} +} + +@ARTICLE{Dehnen2002, + author = {{Dehnen}, W.}, + title = "{A Hierarchical $\lt$E10$\gt$O$\lt$/E10$\gt$(N) Force Calculation Algorithm}", + journal = {Journal of Computational Physics}, + eprint = {astro-ph/0202512}, + year = 2002, + month = jun, + volume = 179, + pages = {27-42}, + doi = {10.1006/jcph.2002.7026}, + adsurl = {http://adsabs.harvard.edu/abs/2002JCoPh.179...27D}, + adsnote = {Provided by the SAO/NASA Astrophysics Data System} +} + + + diff --git a/theory/Multipoles/cells.odg b/theory/Multipoles/cells.odg new file mode 100644 index 0000000000000000000000000000000000000000..ada8fd7a1a6e746fca93f2b1ed04b78a6b7f9097 Binary files /dev/null and b/theory/Multipoles/cells.odg differ diff --git a/theory/Multipoles/cells.pdf b/theory/Multipoles/cells.pdf new file mode 100644 index 0000000000000000000000000000000000000000..d621f6f1023d71503f698b69694d980ef27814e6 Binary files /dev/null and b/theory/Multipoles/cells.pdf differ diff --git a/theory/Multipoles/fmm_standalone.tex b/theory/Multipoles/fmm_standalone.tex index fcd727a89abe95bba69b23c58ce5067c8cc53211..dc4266a23110873ff38ccbec4d71345e2780d6b2 100644 --- a/theory/Multipoles/fmm_standalone.tex +++ b/theory/Multipoles/fmm_standalone.tex @@ -4,27 +4,45 @@ \usepackage{times} \newcommand{\swift}{{\sc Swift}\xspace} +\newcommand{\nbody}{$N$-body\xspace} %opening \title{FMM 
in SWIFT} \author{Matthieu Schaller} - \begin{document} +\date{\today} + +\pagerange{\pageref{firstpage}--\pageref{lastpage}} \pubyear{2014} + \maketitle -We use the multi-index notation of \cite{Dehnen2014} to simplify expressions. +\label{firstpage} + +\begin{abstract} +Making gravity great again. +\end{abstract} + +\begin{keywords} +\end{keywords} + +\section{Gravity in \swift} +\label{sec:gravity} \input{potential_softening} +\input{fmm_summary} \input{gravity_derivatives} +\input{mesh_summary} \bibliographystyle{mnras} \bibliography{./bibliography.bib} \appendix +\input{vector_notation} \onecolumn \input{potential_derivatives} +\label{lastpage} \end{document} diff --git a/theory/Multipoles/fmm_summary.tex b/theory/Multipoles/fmm_summary.tex new file mode 100644 index 0000000000000000000000000000000000000000..1ff9ab88ada6836d6118c7cfd74e39f4d1c504b3 --- /dev/null +++ b/theory/Multipoles/fmm_summary.tex @@ -0,0 +1,182 @@ +\subsection{Evaluating the forces using the Fast Multipole Method} +\label{ssec:fmm_summary} + +The algorithmically challenging aspect of the \nbody problem is to +evaluate for each particle in a system the potential and associated +forces generated by all the other particles. Mathematically, this means +evaluate +\begin{equation} + \phi(\mathbf{x}_a) = \sum_{b \neq a} G m_b\varphi(\mathbf{x}_a - + \mathbf{x}_b)\qquad \forall~a\in N + \label{eq:fmm:n_body} +\end{equation} +efficiently for large numbers of particles $N$. In the case of collisionless +dynamics, the particles are a mere Monte-Carlo sampling of the +underlying coarse-grained phase-space distribution which justifies the +use of approximate method to evaluate Eq.~\ref{eq:fmm:n_body}. 
The +\emph{Fast Multipole Method} (FMM) \citep{Greengard1987, Cheng1999}, +popularized in the field and adapted specifically for gravity solvers +by \cite{Dehnen2000, Dehnen2002}, is an $\mathcal{O}(N)$ method +designed to solve Eq.~\ref{eq:fmm:n_body} by expanding the potential both +around $\mathbf{x}_a$ and $\mathbf{x}_b$ and grouping similar terms +arising from nearby particles. \\ + +In what follows, we use the compact multi-index notation of +\cite{Dehnen2014} (repeated in appendix \ref{sec:multi_index_notation} +for completeness) to simplify expressions and ease +comparisons. $\mathbf{k}$, $\mathbf{m}$ and $\mathbf{n}$ are +multi-indices and $\mathbf{r}$, $\mathbf{R}$, $\mathbf{x}$, +$\mathbf{y}$ and $\mathbf{z}$ are vectors, whilst $a$ and $b$ are +particle indices.\\ + +\begin{figure} +\includegraphics[width=\columnwidth]{cells.pdf} +\caption{The basics of the FMM: The potential generated by a particle + at position $\mathbf{x}_b$ on a particle at location + $\mathbf{x}_a$ is replaced by a Taylor expansion of the potential + around the distance vector $\mathbf{R}$ linking the two centres of mass + ($\mathbf{z}_A$ and $\mathbf{z}_B$) of cell $A$ and $B$.
The + expansion converges towards the exact expression provided + $|\mathbf{R}|<|\mathbf{r}_a + \mathbf{r}_b|$.} +\label{fig:fmm:cells} +\end{figure} + + +For a single pair of particles $a$ and $b$ located in cell $A$ and $B$ +with centres of mass $\mathbf{z}_A$ and $\mathbf{z}_B$ +respectively, as shown on Fig.~\ref{fig:fmm:cells}, the potential +generated by $b$ at the location of $a$ can be rewritten as +\begin{align} + \varphi(\mathbf{x}_a - \mathbf{x}_b) + &= \varphi\left(\mathbf{x}_a - \mathbf{z}_A - \mathbf{x}_b + + \mathbf{z}_B + \mathbf{z}_A - \mathbf{z}_B\right) \nonumber \\ + &= \varphi\left(\mathbf{r}_a - \mathbf{r}_b + \mathbf{R}\right) + \nonumber \\ + &= \sum_\mathbf{k} \frac{1}{\mathbf{k}!} \left(\mathbf{r}_a - + \mathbf{r}_b\right)^{\mathbf{k}} \nabla^{\mathbf{k}}\varphi(\mathbf{R}) + \nonumber \\ + &= \sum_\mathbf{k} \frac{1}{\mathbf{k}!} \sum_{\mathbf{n} < + \mathbf{k}} \binom{\mathbf{k}}{\mathbf{n}} \mathbf{r}_a^{\mathbf{n}} + \left(-\mathbf{r}_b\right)^{\mathbf{k} - \mathbf{n}} + \nabla^{\mathbf{k}}\varphi(\mathbf{R})\nonumber \\ + &= \sum_\mathbf{n} \frac{1}{\mathbf{n}!} \mathbf{r}_a^{\mathbf{n}} + \sum_\mathbf{m} \frac{1}{\mathbf{m}!} + \left(-\mathbf{r}_b\right)^\mathbf{m} \nabla^{\mathbf{n}+\mathbf{m}} \varphi(\mathbf{R}), +\end{align} +where we used the Taylor expansion of $\varphi$ around $\mathbf{R} \equiv +\mathbf{z}_A - \mathbf{z}_B$ on the third line, used $\mathbf{r}_a +\equiv \mathbf{x}_a - \mathbf{z}_A$, $\mathbf{r}_b \equiv \mathbf{x}_b +- \mathbf{z}_B$ throughout and defined $\mathbf{m} \equiv +\mathbf{k}-\mathbf{n}$ on the last line. 
Expanding the series only up +to order $p$, we get +\begin{equation} + \varphi(\mathbf{x}_a - \mathbf{x}_b) \approx \sum_{\mathbf{n}}^{p} + \frac{1}{\mathbf{n}!} \mathbf{r}_a^{\mathbf{n}} \sum_{\mathbf{m}}^{p + -|\mathbf{n}|} + \frac{1}{\mathbf{m}!} \left(-\mathbf{r}_b\right)^\mathbf{m} + \nabla^{\mathbf{n}+\mathbf{m}} \varphi(\mathbf{R}), + \label{eq:fmm:fmm_one_part} +\end{equation} +with the approximation converging as $p\rightarrow\infty$ towards the +correct value provided $|\mathbf{R}|<|\mathbf{r}_a + +\mathbf{r}_b|$. If we now consider all the particles within $B$ and +combine their contributions to the potential at location +$\mathbf{x}_a$ in cell $A$, we get +\begin{align} + \phi_{BA}(\mathbf{x}_a) &= \sum_{b\in B}G m_b\varphi(\mathbf{x}_a - + \mathbf{x}_b) \label{eq:fmm:fmm_one_cell} \\ + &\approx G\sum_{\mathbf{n}}^{p} + \frac{1}{\mathbf{n}!} \mathbf{r}_a^{\mathbf{n}} \sum_{\mathbf{m}} + ^{p -|\mathbf{n}|} + \frac{1}{\mathbf{m}!} \sum_{b\in B} m_b\left(-\mathbf{r}_b\right)^\mathbf{m} + \nabla^{\mathbf{n}+\mathbf{m}} \varphi(\mathbf{R}) \nonumber. +\end{align} +This last equation forms the basis of the FMM. The algorithm +decomposes the equation into three separated sums evaluated at +different stages.\\ + +In a first step, multipoles are constructed from the +innermost sum. For each cell, we compute all the terms +\begin{equation} + \mathsf{M}_{\mathbf{m}}(\mathbf{z}_B) = \frac{1}{\mathbf{m}!} + \sum_{b\in B} m_b\left(-\mathbf{r}_b\right)^\mathbf{m} \label{eq:fmm:P2M} +\end{equation} +up to order $p$. This is the first kernel of the method, commonly +labelled as \textsc{P2M} (particle to multipole). 
In a second step, we +compute the second kernel, \textsc{M2L} (multipole to local +expansion), which corresponds to the interaction of a cell with +another one: +\begin{equation} + \mathsf{F}_{\mathbf{n}}(\mathbf{z}_A) = G\sum_{\mathbf{m}}^{p -|\mathbf{n}|} + \mathsf{M}_{\mathbf{m}}(\mathbf{z}_B) + \mathsf{D}_{\mathbf{n}+\mathbf{m}}(\mathbf{R}), \label{eq:fmm:M2L} +\end{equation} +where $\mathsf{D}_{\mathbf{n}+\mathbf{m}}(\mathbf{R}) \equiv +\nabla^{\mathbf{n}+\mathbf{m}} \varphi(\mathbf{R})$ is an order $n+m$ +derivative of the potential. This is the computationally expensive +step of the FMM algorithm as the number of operations in a naive +implementation using cartesian coordinates scales as +$\mathcal{O}(p^6)$. More advanced techniques +\citep[e.g.][]{Dehnen2014} can bring the cost down to +$\mathcal{O}(p^3)$, albeit at a considerable algebraic cost. For +collisionless dynamics, high accuracy is not required and low values +of $p$ are sufficient, which maintains the computational cost of the +M2L kernel at a reasonable level. +Finally, in the last step, the potential is propagated from the local +expansion centre to the particles (L2P kernel) using +\begin{equation} + \phi_{BA}(\mathbf{x}_a) = \sum_{\mathbf{n}}^{p} + \frac{1}{\mathbf{n}!} \mathbf{r}_a^{\mathbf{n}} + \mathsf{F}_{\mathbf{n}}(\mathbf{z}_A). \label{eq:fmm:L2P} +\end{equation} +In summary, the potential generated by a cell $B$ on the particles in +cell $A$ is obtained by the successive application of the P2M, M2L and +L2P kernels. The P2M and L2P kernels are applied only once per +particle, whilst one M2L calculation has to be performed for each pair +of cells. The forces applied to the particles are obtained by the same +procedure using an extra order in the Taylor expansion. 
For instance, +for the acceleration along $x$, we have: +\begin{equation} + a_x(\mathbf{x}_a) = \sum_{\mathbf{n}}^{p-1} + \frac{1}{\mathbf{n}!} \mathbf{r}_a^{\mathbf{n}} + \mathsf{F}_{\mathbf{n}+\left(1,0,0\right)}(\mathbf{z}_A). \label{eq:fmm:L2P_force} +\end{equation} + +In practice, the multipoles can be constructed recursively from the +leaves of the tree to the root and the local expansions from the root +to the leaves by shifting the $\mathsf{M}$ and $\mathsf{F}$ tensors +and adding their contributions to their parent or daughter cell's +tensors respectively. The shifting formulas (M2M and L2L kernels) +read: + +\begin{align} + \mathsf{M}_{\mathbf{m}}(\mathbf{x} + \mathbf{y}) &= + \sum_{\mathbf{n}}^{\mathbf{m}} + \frac{\mathbf{y}^\mathbf{n}}{\mathbf{n}!}\mathsf{M}_{\mathbf{m} - + \mathbf{n}}(\mathbf{x}), \label{eq:fmm:M2M} \\ + \mathsf{F}_{\mathbf{n}}(\mathbf{x} + \mathbf{y}) &= + \sum_{\mathbf{m}}^{p-|\mathbf{n}|} + \frac{\mathbf{y}^\mathbf{m}}{\mathbf{m}!}\mathsf{F}_{\mathbf{m} + + \mathbf{n}}(\mathbf{x}). \label{eq:fmm:L2L} +\end{align} + +All the kernels (Eqs.~\ref{eq:fmm:P2M}-\ref{eq:fmm:L2L}) are rather +straightforward to evaluate as they are only made of additions and +multiplications (provided $\mathsf{D}$ can be evaluated quickly, see +Sec.~\ref{ssec:grav_derivatives}), which are extremely efficient +instructions on modern architectures. However, the fully expanded sums +can lead to rather large and typo-prone expressions. To avoid any +mishaps, we use a \texttt{python} script to generate C code in which +all the sums are unrolled and correct by construction. In \swift, we +implemented the kernels up to order $p=5$, as it proved to be accurate +enough for our purpose, but this could be extended to higher order +easily. This implies storing $56$ numbers per cell for each +$\textsf{M}$ and $\textsf{F}$ plus three numbers for the location of +the centre of mass.
For leaf-cells with large numbers of particles, as +in \swift, this is a small memory overhead. One further small +improvement consists in choosing $\mathbf{z}_A$ to be the centre of +mass of cell $A$ rather than its geometrical centre. The first order +multipoles ($\mathsf{M}_{100},\mathsf{M}_{010},\mathsf{M}_{001}$) then +vanish by construction. This allows us to simplify some of the +expressions and helps reduce, albeit by a small fraction, the memory +footprint of the tree structure. diff --git a/theory/Multipoles/gravity_derivatives.py b/theory/Multipoles/generate_multipoles/gravity_derivatives.py similarity index 100% rename from theory/Multipoles/gravity_derivatives.py rename to theory/Multipoles/generate_multipoles/gravity_derivatives.py diff --git a/theory/Multipoles/multipoles.py b/theory/Multipoles/generate_multipoles/multipoles.py similarity index 100% rename from theory/Multipoles/multipoles.py rename to theory/Multipoles/generate_multipoles/multipoles.py diff --git a/theory/Multipoles/vector_powers.py b/theory/Multipoles/generate_multipoles/vector_powers.py similarity index 100% rename from theory/Multipoles/vector_powers.py rename to theory/Multipoles/generate_multipoles/vector_powers.py diff --git a/theory/Multipoles/gravity_derivatives.tex b/theory/Multipoles/gravity_derivatives.tex index e4c7b1565ab6c82de5623d5a643c3a8bd1fa513f..e4569ef960fae5e92343f1d99902a5c14fd6ee5c 100644 --- a/theory/Multipoles/gravity_derivatives.tex +++ b/theory/Multipoles/gravity_derivatives.tex @@ -1,52 +1,55 @@ -\subsection{Derivatives of the gravitational potential} +\subsection{Notes on the derivatives of the gravitational potential} +\label{ssec:grav_derivatives} The calculation of all the -$D_\mathbf{n}(x,y,z) \equiv \nabla^{\mathbf{n}}\phi(x,y,z)$ terms up +$\mathsf{D}_\mathbf{n}(x,y,z) \equiv \nabla^{\mathbf{n}}\varphi(x,y,z)$ terms up to the relevent order can be quite tedious and it is beneficial to automatize the whole setup. 
Ideally, one would like to have an -expression for each of this term that is only made of multiplications +expression for each of these terms that is only made of multiplications and additions of each of the coordinates and the inverse distance. We -achieve this by writing $\phi$ as a composition of functions -$\phi(u(x,y,z))$ and apply the \textit{Fa\`a di Bruno} +achieve this by writing $\varphi$ as a composition of functions +$\varphi(u(x,y,z))$ and apply the \textit{Fa\`a di Bruno} formula \citep[i.e. the ``chain rule'' for higher order derivatives, -e.g.][]{Hardy2006} to construct our terms: - + see e.g.][]{Hardy2006} to construct our terms: \begin{equation} \label{eq:faa_di_bruno} -\frac{\partial^n}{\partial x_1 \cdots \partial x_n} \phi(u) -= \sum_{A} \phi^{(|A|)}(u) \prod_{B \in +\frac{\partial^n}{\partial x_1 \cdots \partial x_n} \varphi(u) += \sum_{A} \varphi^{(|A|)}(u) \prod_{B \in A} \frac{\partial^{|B|}}{\prod_{c\in B}\partial x_c} u(x,y,z), \end{equation} where $A$ is the set of all partitions of $\lbrace1,\cdots, n\rbrace$, -$B$ is a block of a partition $A$ and $|\cdot|$ denotes the -cardinality of a set. For generic functions $\phi$ and $u$ this +$B$ is a block of a partition in the set $A$ and $|\cdot|$ denotes the +cardinality of a set. For generic functions $\varphi$ and $u$ this formula yields an untracktable number of terms; an 8th-order derivative will have $4140$ (!) terms in the sum\footnote{The number -of terms in the sum is given by the Bell number of the same order}. \\ -We choose to write + of terms in the sum is given by the Bell number of the same + order.}. \\ For the un-softened gravitational potential, we choose to write \begin{align} - \phi(x,y,z) &= 1 / \sqrt{u(x,y,z)}, \\ + \varphi(x,y,z) &= 1 / \sqrt{u(x,y,z)}, \\ u(x,y,z) &= x^2 + y^2 + z^2. 
\end{align} -This choice allows to have derivatives of any order of $\phi(u)$ that -only depend on powers of $u$: - +This choice allows to have derivatives of any order of $\varphi(u)$ that +can be easily expressed and only depend on powers of $u$: \begin{equation} -f^{(n)}(u) = \frac{\Gamma(\frac{1}{2})}{\Gamma(\frac{1}{2} - -n)}\frac{1}{u^{n+\frac{1}{2}}}. +\varphi^{(n)}(u) = (-1)^n\cdot\frac{(2n-1)!!}{2^n}\cdot\frac{1}{u^{n+\frac{1}{2}}}, \end{equation} -More importantly, this choice of decomposition allows us to have -derivatives of $u$ only up to second order in $x$, $y$ or $z$. The -number of non-zero terms in eq. \ref{eq:faa_di_bruno} is hence -drastically reduced. For instance, when computing -$D_{(4,1,3)} \equiv \frac{\partial^8}{\partial x^4 \partial y \partial -z^3} \phi$, $4100$ of the $4140$ terms will involve at least one +where $!!$ denotes the semi-factorial. More importantly, this +choice of decomposition allows us to have non-zero derivatives of $u$ +only up to second order in $x$, $y$ or $z$. The number of non-zero +terms in eq. \ref{eq:faa_di_bruno} is hence drastically reduced. For +instance, when computing $\mathsf{D}_{(4,1,3)}(\mathbf{r}) \equiv +\frac{\partial^8}{\partial x^4 \partial y \partial z^3} +\varphi(u(x,y,z))$, $4100$ of the $4140$ terms will involve at least one zero-valued derivative (e.g. $\partial^3/\partial x^3$ or $\partial^2/\partial x\partial y$) of $u$. Furthermore, among the 40 -remaining terms, many will involve the same derivatives and can be -grouped together, leaving us with a sum of six products of $x$,$y$ and -$z$. This is generally the case for most of the $D_\mathbf{n}$'s and -figuring out which terms are identical in a given set of partitions of -$\lbrace1,\cdots, n\rbrace$ is an interesting exercise in -combinatorics left for the reader \citep[see also][]{Hardy2006}. 
+remaining terms, many will involve the same combination of derivatives +of $u$ and can be grouped together, leaving us with a sum of six +products of $x$,$y$ and $z$. This is generally the case for most of +the $\mathsf{D}_\mathbf{n}$'s and figuring out which terms are identical in a +given set of partitions of $\lbrace1,\cdots, n\rbrace$ is an +interesting exercise in combinatorics left for the reader \citep[see + also][]{Hardy2006}. We use a \texttt{python} script based on this +technique to generate the actual C routines used within \swift. Some +examples of these terms are given in Appendix +\ref{sec:pot_derivatives}. diff --git a/theory/Multipoles/mesh_summary.tex b/theory/Multipoles/mesh_summary.tex new file mode 100644 index 0000000000000000000000000000000000000000..3069257c8845804d9a307cc54fffec5e36e4ae8c --- /dev/null +++ b/theory/Multipoles/mesh_summary.tex @@ -0,0 +1,39 @@ +\subsection{Coupling the FMM to a mesh for periodic long-range forces} +\label{ssec:mesh_summary} + +\begin{equation} + S(x) = \frac{e^x}{1 + e^x} +\end{equation} + +\begin{align} + \varphi_s(r) &= \frac{1}{r}\left[2 - 2S\left(\frac{2r}{r_s}\right)\right] \nonumber\\ + &= \frac{1}{r}\left[2 - \frac{2e^{\frac{2r}{r_s}}}{1+e^{\frac{2r}{r_s}}}\right] +\end{align} +\begin{align} + |\mathbf{f}_s(r)| &= \frac{1}{r^2}\left[\frac{4r}{r_s}S'\left(\frac{2r}{r_s}\right) - 2S\left(\frac{2r}{r_s}\right) + 2\right] \nonumber \\ + &= \frac{1}{r^2}\left[\frac{4r}{r_s}\frac{e^{\frac{2r}{r_s}}}{(1+e^{\frac{2r}{r_s}})^2} - \frac{2e^{\frac{2r}{r_s}}}{1+e^{\frac{2r}{r_s}}} + 2\right] +\end{align} + +\begin{equation} + \tilde\varphi_l(k) = \frac{1}{k^2}\left[\frac{\upi}{2}kr_s\textrm{csch}\left(\frac{\upi}{2}kr_s\right) \right] +\end{equation} + +\begin{figure} +\includegraphics[width=\columnwidth]{potential_short.pdf} +\caption{aa} +\label{fig:fmm:potential_short} +\end{figure} + + +\begin{figure} +\includegraphics[width=\columnwidth]{force_short.pdf} +\caption{bb} +\label{fig:fmm:force_short} 
+\end{figure} + + +\begin{figure} +\includegraphics[width=\columnwidth]{potential_long.pdf} +\caption{cc} +\label{fig:fmm:potential_long} +\end{figure} diff --git a/theory/Multipoles/plot_mesh.py b/theory/Multipoles/plot_mesh.py new file mode 100644 index 0000000000000000000000000000000000000000..6706016f73b4b6251c6d517ec89eacbb7a469417 --- /dev/null +++ b/theory/Multipoles/plot_mesh.py @@ -0,0 +1,267 @@ +############################################################################### + # This file is part of SWIFT. + # Copyright (c) 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk) + # + # This program is free software: you can redistribute it and/or modify + # it under the terms of the GNU Lesser General Public License as published + # by the Free Software Foundation, either version 3 of the License, or + # (at your option) any later version. + # + # This program is distributed in the hope that it will be useful, + # but WITHOUT ANY WARRANTY; without even the implied warranty of + # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + # GNU General Public License for more details. + # + # You should have received a copy of the GNU Lesser General Public License + # along with this program. If not, see <http://www.gnu.org/licenses/>. + # + ############################################################################## +import matplotlib +matplotlib.use("Agg") +from pylab import * +from scipy import integrate +from scipy import special +from scipy.optimize import curve_fit +from scipy.optimize import fsolve +from matplotlib.font_manager import FontProperties +import numpy +import math + +params = {'axes.labelsize': 9, +'axes.titlesize': 10, +'font.size': 10, +'legend.fontsize': 10, +'xtick.labelsize': 8, +'ytick.labelsize': 8, +'text.usetex': True, +'figure.figsize' : (3.15,3.15), +'figure.subplot.left' : 0.12, +'figure.subplot.right' : 0.99 , +'figure.subplot.bottom' : 0.09 , +'figure.subplot.top' : 0.99 , +'figure.subplot.wspace' : 0. 
, +'figure.subplot.hspace' : 0. , +'lines.markersize' : 6, +'lines.linewidth' : 3., +'text.latex.unicode': True +} +rcParams.update(params) +rc('font',**{'family':'sans-serif','sans-serif':['Times']}) +colors=['#4477AA', '#CC6677', '#DDCC77', '#117733'] + + +# Parameters +r_s = 2. +r_min = 1e-2 +r_max = 1.5e2 + +# Radius +r = logspace(log10(r_min), log10(r_max), 401) +r_rs = r / r_s + +k = logspace(log10(r_min/r_s**2), log10(r_max/r_s**2), 401) +k_rs = k * r_s + +# Newtonian solution +phi_newton = 1. / r +phit_newton = 1. / k**2 +force_newton = 1. / r**2 + +def my_exp(x): + return 1. + x + (x**2 / 2.) + (x**3 / 6.) + (x**4 / 24.) + (x**5 / 120.)# + (x**6 / 720.) + #return exp(x) + +def csch(x): # hyperbolic cosecant + return 1. / sinh(x) + +def sigmoid(x): + return my_exp(x) / (my_exp(x) + 1.) + +def d_sigmoid(x): + return my_exp(x) / ((my_exp(x) + 1)**2) + +def swift_corr(x): + return 2 * sigmoid( 4 * x ) - 1 + +#figure() +#x = linspace(-4, 4, 100) +#plot(x, special.erf(x), '-', color=colors[0]) +#plot(x, swift_corr(x), '-', color=colors[1]) +#plot(x, x, '-', color=colors[2]) +#ylim(-1.1, 1.1) +#xlim(-4.1, 4.1) +#savefig("temp.pdf") + +# Correction in real space +corr_short_gadget2 = special.erf(r / (2.*r_s)) +corr_short_swift = swift_corr(r / (2.*r_s)) +eta_short_gadget2 = special.erfc(r / 2.*r_s) + (r / (r_s * math.sqrt(math.pi))) * exp(-r**2 / (4.*r_s**2)) +eta_short_swift = 4. * (r / r_s) * d_sigmoid(2. * r / r_s) - 2. * sigmoid(2 * r / r_s) + 2. + +# Corection in Fourier space +corr_long_gadget2 = exp(-k**2*r_s**2) +corr_long_swift = math.pi * k * r_s * csch(0.5 * math.pi * r_s * k) / 2. + +# Shortrange term +phi_short_gadget2 = (1. / r ) * (1. - corr_short_gadget2) +phi_short_swift = (1. / r ) * (1. - corr_short_swift) +force_short_gadget2 = (1. / r**2) * eta_short_gadget2 +force_short_swift = (1. / r**2) * eta_short_swift + +# Long-range term +phi_long_gadget2 = (1. / r ) * corr_short_gadget2 +phi_long_swift = (1. 
/ r ) * corr_short_swift +phit_long_gadget2 = corr_long_gadget2 / k**2 +phit_long_swift = corr_long_swift / k**2 + + + + +figure() + +# Potential +subplot(311, xscale="log", yscale="log") + +plot(r_rs, phi_newton, '--', lw=1.4, label="${\\rm Newtonian}$", color=colors[0]) +plot(r_rs, phi_short_gadget2, '-', lw=1.4, label="${\\rm Gadget}$", color=colors[2]) +plot(r_rs, phi_short_swift, '-', lw=1.4, label="${\\rm SWIFT}$", color=colors[3]) +plot([1., 1.], [1e-5, 1e5], 'k-', alpha=0.5, lw=0.5) + +xlim(1.1*r_min/ r_s, 0.9*r_max / r_s) +ylim(1.1/r_max, 0.9/r_min) +ylabel("$\\varphi_s(r)$", labelpad=-3) + +legend(loc="upper right", frameon=True, handletextpad=0.1, handlelength=3.2, fontsize=8) + +# Correction +subplot(312, xscale="log", yscale="log") +plot(r_rs, np.ones(np.size(r)), '--', lw=1.4, color=colors[0]) +plot(r_rs, 1. - corr_short_gadget2, '-', lw=1.4, color=colors[2]) +plot(r_rs, 1. - corr_short_swift, '-', lw=1.4, color=colors[3]) +plot(r_rs, np.ones(np.size(r))*0.01, 'k:', alpha=0.5, lw=0.5) +plot([1., 1.], [-1e5, 1e5], 'k-', alpha=0.5, lw=0.5) + +yticks([1e-2, 1e-1, 1], ["$0.01$", "$0.1$", "$1$"]) +xlim(1.1*r_min/r_s, 0.9*r_max/r_s) +ylim(3e-3, 1.5) +#ylabel("$\\chi_s(r)$", labelpad=-3) +ylabel("$\\varphi_s(r) \\times r$", labelpad=-2) + +# 1 - Correction +subplot(313, xscale="log", yscale="log") +plot(r_rs, corr_short_gadget2, '-', lw=1.4, color=colors[2]) +plot(r_rs, corr_short_swift, '-', lw=1.4, color=colors[3]) + +plot([1., 1.], [1e-5, 1e5], 'k-', alpha=0.5, lw=0.5) +plot(r_rs, np.ones(np.size(r)), 'k:', alpha=0.5, lw=0.5) +plot(r_rs, np.ones(np.size(r))*0.01, 'k:', alpha=0.5, lw=0.5) + +xlim(1.1*r_min/r_s, 0.9*r_max/r_s) +ylim(3e-3, 1.5) +#ylabel("$1 - \\chi_s(r)$", labelpad=-2) +ylabel("$1 - \\varphi_s(r) \\times r$", labelpad=-2) +yticks([1e-2, 1e-1, 1], ["$0.01$", "$0.1$", "$1$"]) +xlabel("$r / r_s$", labelpad=-3) + +savefig("potential_short.pdf") + +################################################################################################## 
+ + +# Force +figure() +subplot(311, xscale="log", yscale="log") + +plot(r_rs, force_newton, '--', lw=1.4, label="${\\rm Newtonian}$", color=colors[0]) +plot(r_rs, force_short_gadget2, '-', lw=1.4, label="${\\rm Gadget}$", color=colors[2]) +plot(r_rs, force_short_swift, '-', lw=1.4, label="${\\rm SWIFT}$", color=colors[3]) +plot([1., 1.], [1e-5, 1e5], 'k-', alpha=0.5, lw=0.5) + +xlim(1.1*r_min/ r_s, 0.9*r_max / r_s) +ylim(1.1/r_max**2, 0.9/r_min**2) +ylabel("$|\\mathbf{f}_s(r)|$", labelpad=-3) +yticks([1e-4, 1e-2, 1e0, 1e2], ["$10^{-4}$", "$10^{-2}$", "$10^{0}$", "$10^{2}$"]) + +legend(loc="upper right", frameon=True, handletextpad=0.1, handlelength=3.2, fontsize=8) + +# Correction +subplot(312, xscale="log", yscale="log") +plot(r_rs, np.ones(np.size(r)), '--', lw=1.4, color=colors[0]) +plot(r_rs, eta_short_gadget2, '-', lw=1.4, color=colors[2]) +plot(r_rs, eta_short_swift, '-', lw=1.4, color=colors[3]) +plot(r_rs, np.ones(np.size(r))*0.01, 'k:', alpha=0.5, lw=0.5) +plot([1., 1.], [-1e5, 1e5], 'k-', alpha=0.5, lw=0.5) + +yticks([1e-2, 1e-1, 1], ["$0.01$", "$0.1$", "$1$"]) +xlim(1.1*r_min/r_s, 0.9*r_max/r_s) +ylim(3e-3, 1.5) +#ylabel("$\\eta_s(r)$", labelpad=-3) +ylabel("$|\\mathbf{f}_s(r)|\\times r^2$", labelpad=-2) + +# 1 - Correction +subplot(313, xscale="log", yscale="log") +plot(r_rs, 1. - eta_short_gadget2, '-', lw=1.4, color=colors[2]) +plot(r_rs, 1. 
- eta_short_swift, '-', lw=1.4, color=colors[3]) + +plot([1., 1.], [1e-5, 1e5], 'k-', alpha=0.5, lw=0.5) +plot(r_rs, np.ones(np.size(r)), 'k:', alpha=0.5, lw=0.5) +plot(r_rs, np.ones(np.size(r))*0.01, 'k:', alpha=0.5, lw=0.5) + +xlim(1.1*r_min/r_s, 0.9*r_max/r_s) +ylim(3e-3, 1.5) +#ylabel("$1 - \\eta_s(r)$", labelpad=-2) +ylabel("$1 - |\\mathbf{f}_s(r)|\\times r^2$", labelpad=-3) +yticks([1e-2, 1e-1, 1], ["$0.01$", "$0.1$", "$1$"]) +xlabel("$r / r_s$", labelpad=-3) + +savefig("force_short.pdf") + +################################################################################################## + +figure() +subplot(311, xscale="log", yscale="log") + +# Potential +plot(k_rs, phit_newton, '--', lw=1.4, label="${\\rm Newtonian}$", color=colors[0]) +plot(k_rs, phit_long_gadget2, '-', lw=1.4, label="${\\rm Gadget}$", color=colors[2]) +plot(k_rs, phit_long_swift, '-', lw=1.4, label="${\\rm SWIFT}$", color=colors[3]) +plot(k_rs, -phit_long_swift, ':', lw=1.4, color=colors[3]) +plot([1., 1.], [1e-5, 1e5], 'k-', alpha=0.5, lw=0.5) + +legend(loc="lower left", frameon=True, handletextpad=0.1, handlelength=3.2, fontsize=8) + +xlim(1.1*r_min/ r_s, 0.9*r_max / r_s) +ylim(1.1/r_max**2, 0.9/r_min**2) +ylabel("$\\tilde{\\varphi_l}(k)$", labelpad=-3) +yticks([1e-4, 1e-2, 1e0, 1e2], ["$10^{-4}$", "$10^{-2}$", "$10^{0}$", "$10^{2}$"]) + +subplot(312, xscale="log", yscale="log") + +# Potential normalized +plot(k_rs, phit_newton * k**2, '--', lw=1.4, label="${\\rm Newtonian}$", color=colors[0]) +plot(k_rs, phit_long_gadget2 * k**2, '-', lw=1.4, label="${\\rm Gadget}$", color=colors[2]) +plot(k_rs, phit_long_swift * k**2, '-', lw=1.4, label="${\\rm SWIFT}$", color=colors[3]) +plot([1., 1.], [1e-5, 1e5], 'k-', alpha=0.5, lw=0.5) +plot(r_rs, np.ones(np.size(r))*0.01, 'k:', alpha=0.5, lw=0.5) + +xlim(1.1*r_min/ r_s, 0.9*r_max / r_s) +ylim(3e-3, 1.5) +ylabel("$k^2 \\times \\tilde{\\varphi_l}(k)$", labelpad=-3) +yticks([1e-2, 1e-1, 1], ["$0.01$", "$0.1$", "$1$"]) + +subplot(313, xscale="log", 
yscale="log") + +plot(k_rs, 1. - phit_long_gadget2 * k**2, '-', lw=1.4, label="${\\rm Gadget}$", color=colors[2]) +plot(k_rs, 1. - phit_long_swift * k**2, '-', lw=1.4, label="${\\rm SWIFT}$", color=colors[3]) +plot([1., 1.], [1e-5, 1e5], 'k-', alpha=0.5, lw=0.5) +plot(r_rs, np.ones(np.size(r)), 'k:', alpha=0.5, lw=0.5) +plot(r_rs, np.ones(np.size(r))*0.01, 'k:', alpha=0.5, lw=0.5) + +xlim(1.1*r_min/ r_s, 0.9*r_max / r_s) +ylim(3e-3, 1.5) +ylabel("$1 - k^2 \\times \\tilde{\\varphi_l}(k)$", labelpad=-3) +yticks([1e-2, 1e-1, 1], ["$0.01$", "$0.1$", "$1$"]) + +xlabel("$k \\times r_s$", labelpad=0) + +savefig("potential_long.pdf") diff --git a/theory/Multipoles/potential.py b/theory/Multipoles/plot_potential.py similarity index 92% rename from theory/Multipoles/potential.py rename to theory/Multipoles/plot_potential.py index 559f590762a3cbef171c5dd584cbc517879a2cec..8761314572cdbda1304cdf882f920651b58be08e 100644 --- a/theory/Multipoles/potential.py +++ b/theory/Multipoles/plot_potential.py @@ -141,7 +141,7 @@ plot([epsilon, epsilon], [-10, 10], 'k-', alpha=0.5, lw=0.5) plot([epsilon/plummer_equivalent_factor, epsilon/plummer_equivalent_factor], [0, 10], 'k-', alpha=0.5, lw=0.5) ylim(0, 2.3) -ylabel("$|\\phi(r)|$", labelpad=1) +ylabel("$\\varphi(r)$", labelpad=1) #yticks([0., 0.5, 1., 1.5, 2., 2.5], ["$%.1f$"%(0.*epsilon), "$%.1f$"%(0.5*epsilon), "$%.1f$"%(1.*epsilon), "$%.1f$"%(1.5*epsilon), "$%.1f$"%(2.*epsilon)]) xlim(0,r_max_plot) @@ -163,19 +163,6 @@ xticks([0., 0.5, 1., 1.5, 2., 2.5], ["$%.1f$"%(0./epsilon), "", "$%.1f$"%(1./eps xlabel("$r/H$", labelpad=-7) ylim(0, 0.95) -ylabel("$|\\overrightarrow{\\nabla}\\phi(r)|$", labelpad=0) +ylabel("$|\\overrightarrow{\\nabla}\\varphi(r)|$", labelpad=0) savefig("potential.pdf") - - - - -#Construct potential -# phi = np.zeros(np.size(r)) -# for i in range(np.size(r)): -# if r[i] > 2*epsilon: -# phi[i] = 1./ r[i] -# elif r[i] > epsilon: -# phi[i] = -(1./epsilon) * ((32./3.)*u[i]**2 - (48./3.)*u[i]**3 + (38.4/4.)*u[i]**4 - 
(32./15.)*u[i]**5 + (2./30.)*u[i]**(-1) - (9/5.)) -# else: -# phi[i] = -(1./epsilon) * ((32./6.)*u[i]**2 - (38.4/4.)*u[i]**4 + (32./5.)*u[i]**4 - (7./5.)) diff --git a/theory/Multipoles/potential_derivatives.tex b/theory/Multipoles/potential_derivatives.tex index 56184ce98902d76ad53ce1d49e3d6d67dfc33ac4..5c7b1e6566d7d51b5d27ea3c24d785571e1ad692 100644 --- a/theory/Multipoles/potential_derivatives.tex +++ b/theory/Multipoles/potential_derivatives.tex @@ -1,4 +1,5 @@ -\subsection{Derivatives of the potential} +\section{Derivatives of the potential} +\label{sec:pot_derivatives} For completeness, we give here the full expression for the first few derivatives of the potential that are used in our FMM scheme. We use @@ -6,7 +7,7 @@ the notation $\mathbf{r}=(r_x, r_y, r_z)$, $r = |\mathbf{r}|$ and $u=r/H$. Starting from the potential (Eq. \ref{eq:fmm:potential}, reproduced here for clarity), \begin{align} -D_{000}(\mathbf{r}) = \phi (\mathbf{r},H) = +\mathsf{D}_{000}(\mathbf{r}) = \varphi (\mathbf{r},H) = \left\lbrace\begin{array}{rcl} \frac{1}{H} \left(-3u^7 + 15u^6 - 28u^5 + 21u^4 - 7u^2 + 3\right) & \mbox{if} & u < 1,\\ \frac{1}{r} & \mbox{if} & u \geq 1, @@ -14,10 +15,11 @@ D_{000}(\mathbf{r}) = \phi (\mathbf{r},H) = \right.\nonumber \end{align} we can construct the higher order terms by successively applying the -"chain rule". We show examples of the first few relevant ones here. +"chain rule". We show representative examples of the first few +relevant ones here split by order. 
\begin{align} -D_{100}(\mathbf{r}) = \frac{\partial}{\partial r_x} \phi (\mathbf{r},H) = +\mathsf{D}_{100}(\mathbf{r}) = \frac{\partial}{\partial r_x} \varphi (\mathbf{r},H) = \left\lbrace\begin{array}{rcl} -\frac{r_x}{H^3} \left(21u^5 - 90u^4 + 140u^3 - 84u^2 + 14\right) & \mbox{if} & u < 1,\\ -\frac{r_x}{r^3} & \mbox{if} & u \geq 1, @@ -25,8 +27,10 @@ D_{100}(\mathbf{r}) = \frac{\partial}{\partial r_x} \phi (\mathbf{r},H) = \right.\nonumber \end{align} +\noindent\rule{6cm}{0.4pt} + \begin{align} -D_{200}(\mathbf{r}) = \frac{\partial^2}{\partial r_x^2} \phi (\mathbf{r},H) = +\mathsf{D}_{200}(\mathbf{r}) = \frac{\partial^2}{\partial r_x^2} \varphi (\mathbf{r},H) = \left\lbrace\begin{array}{rcl} \frac{r_x^2}{H^5}\left(-105u^3+360u^2-420u+168\right) - \frac{1}{H^3} \left(21u^5 - 90u^4 + 140u^3 - 84u^2 + 14\right) & \mbox{if} & u < 1,\\ @@ -36,7 +40,7 @@ D_{200}(\mathbf{r}) = \frac{\partial^2}{\partial r_x^2} \phi (\mathbf{r},H) = \end{align} \begin{align} -D_{110}(\mathbf{r}) = \frac{\partial^2}{\partial r_x\partial r_y} \phi (\mathbf{r},H) = +\mathsf{D}_{110}(\mathbf{r}) = \frac{\partial^2}{\partial r_x\partial r_y} \varphi (\mathbf{r},H) = \left\lbrace\begin{array}{rcl} \frac{r_xr_y}{H^5}\left(-105u^3+360u^2-420u+168\right) & \mbox{if} & u < 1,\\ 3\frac{r_xr_y}{r^5} & \mbox{if} & u \geq 1, @@ -44,8 +48,10 @@ D_{110}(\mathbf{r}) = \frac{\partial^2}{\partial r_x\partial r_y} \phi (\mathbf{ \right.\nonumber \end{align} +\noindent\rule{6cm}{0.4pt} + \begin{align} -D_{300}(\mathbf{r}) = \frac{\partial^3}{\partial r_x^3} \phi (\mathbf{r},H) = +\mathsf{D}_{300}(\mathbf{r}) = \frac{\partial^3}{\partial r_x^3} \varphi (\mathbf{r},H) = \left\lbrace\begin{array}{rcl} -\frac{r_x^3}{H^7} \left(315u - 720 + 420u^{-1}\right) + \frac{3r_x}{H^5}\left(-105u^3+360u^2-420u+168\right) & \mbox{if} & u < 1,\\ @@ -55,7 +61,7 @@ D_{300}(\mathbf{r}) = \frac{\partial^3}{\partial r_x^3} \phi (\mathbf{r},H) = \end{align} \begin{align} -D_{210}(\mathbf{r}) = \frac{\partial^3}{\partial r_x^3} 
\phi (\mathbf{r},H) = +\mathsf{D}_{210}(\mathbf{r}) = \frac{\partial^3}{\partial r_x^2\partial r_y} \varphi (\mathbf{r},H) = \left\lbrace\begin{array}{rcl} -\frac{r_x^2r_y}{H^7} \left(315u - 720 + 420u^{-1}\right) + \frac{r_y}{H^5}\left(-105u^3+360u^2-420u+168\right) & \mbox{if} & u < 1,\\ @@ -66,10 +72,32 @@ D_{210}(\mathbf{r}) = \frac{\partial^3}{\partial r_x^3} \phi (\mathbf{r},H) = \begin{align} -D_{111}(\mathbf{r}) = \frac{\partial^3}{\partial r_x\partial r_y\partial r_z} \phi (\mathbf{r},H) = +\mathsf{D}_{111}(\mathbf{r}) = \frac{\partial^3}{\partial r_x\partial r_y\partial r_z} \varphi (\mathbf{r},H) = \left\lbrace\begin{array}{rcl} -\frac{r_xr_yr_z}{H^7} \left(315u - 720 + 420u^{-1}\right) & \mbox{if} & u < 1,\\ -15\frac{r_xr_yr_z}{r^7} & \mbox{if} & u \geq 1, \end{array} \right.\nonumber \end{align} + +\noindent\rule{6cm}{0.4pt} + +\begin{align} + \mathsf{D}_{400}(\mathbf{r}) &= + \nonumber +\end{align} + +\begin{align} + \mathsf{D}_{310}(\mathbf{r}) &= + \nonumber +\end{align} + +\begin{align} + \mathsf{D}_{220}(\mathbf{r}) &= + \nonumber +\end{align} + +\begin{align} + \mathsf{D}_{211}(\mathbf{r}) &= + \nonumber +\end{align} diff --git a/theory/Multipoles/potential_softening.tex b/theory/Multipoles/potential_softening.tex index 1186a9cec377fd8daa94e14d024115f95ecfdc99..aa9ee12340a3492a19dcf9048548952ef7e141e1 100644 --- a/theory/Multipoles/potential_softening.tex +++ b/theory/Multipoles/potential_softening.tex @@ -1,4 +1,5 @@ \subsection{Gravitational softening} +\label{ssec:potential_softening} To avoid artificial two-body relaxation, the Dirac $\delta$-distribution of particles is convolved with a softening @@ -6,9 +7,13 @@ kernel of a given fixed, but time-variable, scale-length $\epsilon$. Instead of the commonly used spline kernel of \cite{Monaghan1985} (e.g. in \textsc{Gadget}), we use a C2 kernel \citep{Wendland1995} which leads to an expression for the force that -is cheaper to compute and has a very similar overall shape. 
We set -$\tilde\delta(\mathbf{x}) = \rho(|\mathbf{x}|) = W(|\mathbf{x}|, -3\epsilon_{\rm Plummer})$, with $W(r, H)$ given by +is cheaper to compute and has a very similar overall shape. The C2 +kernel has the advantage of being branch-free leading to an expression +which is faster to evaluate using vector units available on modern +architectures; it also does not require any divisions to evaluate the +softened forces. We set $\tilde\delta(\mathbf{x}) = +\rho(|\mathbf{x}|) = W(|\mathbf{x}|, 3\epsilon_{\rm Plummer})$, with +$W(r, H)$ given by \begin{align} W(r,H) &= \frac{21}{2\pi H^3} \times \nonumber \\ @@ -18,9 +23,9 @@ W(r,H) &= \frac{21}{2\pi H^3} \times \nonumber \\ \end{array} \right. \end{align} -and $u = r/H$. The potential $\phi(r,H)$ corresponding to this density distribution reads +and $u = r/H$. The potential $\varphi(r,H)$ corresponding to this density distribution reads \begin{align} -\phi = +\varphi = \left\lbrace\begin{array}{rcl} \frac{1}{H} (-3u^7 + 15u^6 - 28u^5 + 21u^4 - 7u^2 + 3) & \mbox{if} & u < 1,\\ \frac{1}{r} & \mbox{if} & u \geq 1. @@ -41,12 +46,13 @@ details see Sec. 2 of~\cite{Price2007}). \begin{figure} \includegraphics[width=\columnwidth]{potential.pdf} -\caption{The density (top), potential (middle) and forces (bottom) of -generated py a point mass in our softened gravitational scheme (for -completeness, we chose $\epsilon=2$). A -Plummer-equivalent sphere is shown for comparison. The spline kernel -of \citet{Monaghan1985}, used in \textsc{Gadget}, is shown for -comparison but note that it has not been re-scaled to match the -Plummer-sphere potential at $r=0$.} +\caption{The density (top), potential (middle) and forces (bottom) + generated by a point mass in our softened gravitational scheme. + A Plummer-equivalent sphere is shown for comparison. The spline + kernel of \citet{Monaghan1985}, used in \textsc{Gadget}, is shown + for comparison but note that it has not been re-scaled to match the + Plummer-sphere potential at $r=0$. 
%(for completeness, + %$\epsilon=2$). + } \label{fig:fmm:softening} \end{figure} diff --git a/theory/Multipoles/run.sh b/theory/Multipoles/run.sh index f25d407cd4ffe679a272f352798817f7c0c4e55a..fc376188ad2e69d2879ce963ddc7069c736fc8b7 100755 --- a/theory/Multipoles/run.sh +++ b/theory/Multipoles/run.sh @@ -1,5 +1,15 @@ #!/bin/bash -python potential.py +if [ ! -e potential.pdf ] +then + echo "Generating 1st figure..." + python plot_potential.py +fi +if [ ! -e potential_short.pdf ] +then + echo "Generating 2nd figures..." + python plot_mesh.py +fi +echo "Generating PDF..." pdflatex -jobname=fmm fmm_standalone.tex bibtex fmm.aux pdflatex -jobname=fmm fmm_standalone.tex diff --git a/theory/Multipoles/vector_notation.tex b/theory/Multipoles/vector_notation.tex new file mode 100644 index 0000000000000000000000000000000000000000..4c17a1b92ad7576ac3aaa02b8d02993acfcd795a --- /dev/null +++ b/theory/Multipoles/vector_notation.tex @@ -0,0 +1,34 @@ +\section{Multi-index notation} +\label{sec:multi_index_notation} + +We define a multi-index $\mathbf{n}$ as a triplet of +non-negative integers: +\begin{equation} + \mathbf{n} \equiv \left(n_x, n_y, n_z\right), \qquad n_i \in \mathbb{N}, +\end{equation} +with a norm $n$ given by +\begin{equation} + n = |\mathbf{n}| \equiv n_x + n_y + n_z. +\end{equation} +We also define the exponentiation of a vector +$\mathbf{r}=(r_x,r_y,r_z)$ by a multi-index $\mathbf{n}$ as +\begin{equation} + \mathbf{r}^\mathbf{n} \equiv r_x^{n_x} \cdot r_y^{n_y} \cdot r_z^{n_z}, +\end{equation} +which for a scalar $\alpha$ reduces to +\begin{equation} + \alpha^\mathbf{n} = \alpha^{n}. +\end{equation} +Finally, the factorial of a multi-index is defined to be +\begin{equation} + \mathbf{n}! \equiv n_x! \cdot n_y!
\cdot n_z!, +\end{equation} +which leads to a simple expression for the binomial coefficients of +two multi-indices entering Taylor expansions: +\begin{equation} + \binom{\mathbf{n}}{\mathbf{k}} = \binom{n_x}{k_x}\binom{n_y}{k_y}\binom{n_z}{k_z}. +\end{equation} +When appearing as the index in a sum, a multi-index represents all +values that the triplet can take up to a given norm. For instance, +$\sum_{\mathbf{n}}^{p}$ indicates that the sum runs over all possible +multi-indices whose norm is $\leq p$.