diff --git a/README b/README index b0a21c610f7ab4c380e523b51c6f74b81f98a2f2..cd2a397a18e872e7914b24fd58cc588ec1d6c8c0 100644 --- a/README +++ b/README @@ -26,6 +26,8 @@ Valid options are: -f {int} Overwrite the CPU frequency (Hz) to be used for time measurements -g Run with an external gravitational potential -G Run with self-gravity + -n {int} Execute a fixed number of time steps. When unset use the time_end + parameter to stop. -s Run with SPH -t {int} The number of threads to use on each MPI rank. Defaults to 1 if not specified. -v [12] Increase the level of verbosity 1: MPI-rank 0 writes diff --git a/configure.ac b/configure.ac index 497107121753f212dd8b07f5a8e8eed7acdf82b5..23fbf5c6e4c4211379986a4a9c951892d044713c 100644 --- a/configure.ac +++ b/configure.ac @@ -72,9 +72,6 @@ if test "$enable_ipo" = "yes"; then fi fi -# Add libtool support. -LT_INIT - # Check for MPI. Need to do this before characterising the compiler (C99 mode), # as this changes the compiler. # We should consider using AX_PROG_CC_MPI to replace AC_PROG_CC when compiling @@ -151,6 +148,9 @@ AM_CONDITIONAL([HAVEMPI],[test $enable_mpi = "yes"]) # Indicate that MPIRUN can be modified by an environement variable AC_ARG_VAR(MPIRUN, Path to the mpirun command if non-standard) +# Add libtool support (now that CC is defined). +LT_INIT + # Need C99 and inline support. AC_PROG_CC_C99 AC_C_INLINE @@ -179,7 +179,7 @@ if test "$enable_opt" = "yes" ; then ac_test_CFLAGS="yes" CFLAGS="$old_CFLAGS $CFLAGS" - # Check SSE & AVX support (some overlap with AX_CC_MAXOPT). + # Check SSE & AVX support (some overlap with AX_CC_MAXOPT). # Don't use the SIMD_FLAGS result with Intel compilers. The -x<code> # value from AX_CC_MAXOPT should be sufficient. AX_EXT @@ -287,12 +287,75 @@ AC_SUBST([METIS_LIBS]) AC_SUBST([METIS_INCS]) AM_CONDITIONAL([HAVEMETIS],[test -n "$METIS_LIBS"]) -# # Check for zlib. -# AC_CHECK_LIB([z],[gzopen],[ -# AC_DEFINE([HAVE_LIBZ],[1],[Set to 1 if zlib is installed.]) -# LDFLAGS="$LDFLAGS -lz" -# ],[]) +# Check for tcmalloc a fast malloc that is part of the gperftools. +have_tcmalloc="no" +AC_ARG_WITH([tcmalloc], + [AS_HELP_STRING([--with-tcmalloc], + [use tcmalloc library or specify the directory with lib @<:@yes/no@:>@] + )], + [with_tcmalloc="$withval"], + [with_tcmalloc="no"] +) +if test "x$with_tcmalloc" != "xno"; then + if test "x$with_tcmalloc" != "xyes" && test "x$with_tcmalloc" != "x"; then + tclibs="-L$with_tcmalloc -ltcmalloc" + else + tclibs="-ltcmalloc" + fi + AC_CHECK_LIB([tcmalloc],[tc_cfree],[have_tcmalloc="yes"],[have_tcmalloc="no"], + $tclibs) + + # Could just have the minimal version. + if test "$have_tcmalloc" = "no"; then + if test "x$with_tcmalloc" != "xyes" && test "x$with_tcmalloc" != "x"; then + tclibs="-L$with_tcmalloc -ltcmalloc_minimal" + else + tclibs="-ltcmalloc_minimal" + fi + AC_CHECK_LIB([tcmalloc],[tc_cfree],[have_tcmalloc="yes"],[have_tcmalloc="no"], + $tclibs) + fi + if test "$have_tcmalloc" = "yes"; then + TCMALLOC_LIBS="$tclibs" + + # These are recommended for GCC. + if test "$ax_cv_c_compiler_vendor" = "gnu"; then + CFLAGS="$CFLAGS -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free" + fi + else + TCMALLOC_LIBS="" + fi +fi +AC_SUBST([TCMALLOC_LIBS]) +AM_CONDITIONAL([HAVETCMALLOC],[test -n "$TCMALLOC_LIBS"]) + +# Check for -lprofiler usually part of the gpreftools along with tcmalloc. +have_profiler="no" +AC_ARG_WITH([profiler], + [AS_HELP_STRING([--with-profiler], + [use cpu profiler library or specify the directory with lib @<:@yes/no@:>@] + )], + [with_profiler="$withval"], + [with_profiler="yes"] +) +if test "x$with_profiler" != "xno"; then + if test "x$with_profiler" != "xyes" && test "x$with_profiler" != "x"; then + proflibs="-L$with_profiler -lprofiler" + else + proflibs="-lprofiler" + fi + AC_CHECK_LIB([profiler],[ProfilerFlush],[have_profiler="yes"],[have_profiler="no"], + $proflibs) + + if test "$have_profiler" = "yes"; then + PROFILER_LIBS="$proflibs" + else + PROFILER_LIBS="" + fi +fi +AC_SUBST([PROFILER_LIBS]) +AM_CONDITIONAL([HAVEPROFILER],[test -n "$PROFILER_LIBS"]) # Check for HDF5. This is required. AX_LIB_HDF5 @@ -410,6 +473,8 @@ AC_MSG_RESULT([ - parallel : $have_parallel_hdf5 Metis enabled : $have_metis libNUMA enabled : $have_numa + Using tcmalloc : $have_tcmalloc + CPU profiler : $have_profiler ]) # Generate output. diff --git a/doc/Makefile.am b/doc/Makefile.am index 1736a4ad81a674e68932a45e2f8bea8ef82305f3..dc88489ca4ac4ef2fe856d910420fee6a7e87c8a 100644 --- a/doc/Makefile.am +++ b/doc/Makefile.am @@ -1,3 +1,19 @@ +# This file is part of SWIFT. +# Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk), +# Matthieu Schaller (matthieu.schaller@durham.ac.uk). +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. doxyfile.stamp: if HAVE_DOXYGEN diff --git a/examples/CosmoVolume/cosmoVolume.yml b/examples/CosmoVolume/cosmoVolume.yml index d33de8170284b3251a00a43c1a8acaaf10e35cc8..acfd449cbf57408718a7c2a50d8a16bf5ccd5b4e 100644 --- a/examples/CosmoVolume/cosmoVolume.yml +++ b/examples/CosmoVolume/cosmoVolume.yml @@ -37,10 +37,8 @@ Statistics: SPH: resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). delta_neighbours: 0.1 # The tolerance for the targetted number of neighbours. - max_ghost_iterations: 30 # Maximal number of iterations allowed to converge towards the smoothing length. max_smoothing_length: 0.705 # Maximal smoothing length allowed (in internal units). CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. - max_volume_change: 2. # Maximal allowed change of volume over one time-step # Parameters related to the initial conditions InitialConditions: diff --git a/examples/ExternalPointMass/externalPointMass.yml b/examples/ExternalPointMass/externalPointMass.yml index e4c3abc28362801ee53320f9e547bf6d01670ec7..52330163caa13609fd7674a5cdf2921743fbe227 100644 --- a/examples/ExternalPointMass/externalPointMass.yml +++ b/examples/ExternalPointMass/externalPointMass.yml @@ -32,10 +32,8 @@ Statistics: SPH: resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). delta_neighbours: 0.1 # The tolerance for the targetted number of neighbours. - max_ghost_iterations: 30 # Maximal number of iterations allowed to converge towards the smoothing length. max_smoothing_length: 10. # Maximal smoothing length allowed (in internal units). CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. - max_volume_change: 2. # Maximal allowed change of volume over one time-step # Parameters related to the initial conditions InitialConditions: diff --git a/examples/Makefile.am b/examples/Makefile.am index fcb6d3ac4158e140ed5bf227f9215d488f608dd7..9ad4d06f8764f8f57f68d0e501f4a726941c40e6 100644 --- a/examples/Makefile.am +++ b/examples/Makefile.am @@ -1,4 +1,3 @@ - # This file is part of SWIFT. # Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk), # Matthieu Schaller (matthieu.schaller@durham.ac.uk). @@ -22,13 +21,17 @@ MYFLAGS = -DTIMER # Add the source directory and debug to CFLAGS AM_CFLAGS = -I../src $(HDF5_CPPFLAGS) -AM_LDFLAGS = +AM_LDFLAGS = $(HDF5_LDFLAGS) + +# Extra libraries. +EXTRA_LIBS = $(HDF5_LIBS) $(PROFILER_LIBS) $(TCMALLOC_LIBS) -MPI_THREAD_LIBS = @MPI_THREAD_LIBS@ +# MPI libraries. MPI_LIBS = $(METIS_LIBS) $(MPI_THREAD_LIBS) MPI_FLAGS = -DWITH_MPI $(METIS_INCS) -# Set-up the library + +# Programs. bin_PROGRAMS = swift swift_fixdt # Build MPI versions as well? @@ -46,20 +49,20 @@ ENGINE_POLICY_SETAFFINITY= # Sources for swift swift_SOURCES = main.c swift_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -swift_LDADD = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS) +swift_LDADD = ../src/.libs/libswiftsim.a $(EXTRA_LIBS) swift_fixdt_SOURCES = main.c swift_fixdt_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) -DENGINE_POLICY="engine_policy_fixdt | engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -swift_fixdt_LDADD = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS) +swift_fixdt_LDADD = ../src/.libs/libswiftsim.a $(EXTRA_LIBS) # Sources for swift_mpi, do we need an affinity policy for MPI? swift_mpi_SOURCES = main.c swift_mpi_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) -DENGINE_POLICY="engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -swift_mpi_LDADD = ../src/.libs/libswiftsim_mpi.a $(HDF5_LDFLAGS) $(HDF5_LIBS) $(MPI_LIBS) +swift_mpi_LDADD = ../src/.libs/libswiftsim_mpi.a $(MPI_LIBS) $(EXTRA_LIBS) swift_fixdt_mpi_SOURCES = main.c swift_fixdt_mpi_CFLAGS = $(MYFLAGS) $(AM_CFLAGS) $(MPI_FLAGS) -DENGINE_POLICY="engine_policy_fixdt | engine_policy_keep $(ENGINE_POLICY_SETAFFINITY)" -swift_fixdt_mpi_LDADD = ../src/.libs/libswiftsim_mpi.a $(HDF5_LDFLAGS) $(HDF5_LIBS) $(MPI_LIBS) +swift_fixdt_mpi_LDADD = ../src/.libs/libswiftsim_mpi.a $(MPI_LIBS) $(EXTRA_LIBS) # Scripts to generate ICs EXTRA_DIST = UniformBox/makeIC.py UniformBox/run.sh UniformBox/uniformBox.yml \ diff --git a/examples/SedovBlast/sedov.yml b/examples/SedovBlast/sedov.yml index 96b16c5b85b2e534e5ac8b7680f61ed9912da3d6..9fbabb4969b6accdb7323d8270b735951ac0693a 100644 --- a/examples/SedovBlast/sedov.yml +++ b/examples/SedovBlast/sedov.yml @@ -32,10 +32,8 @@ Statistics: SPH: resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). delta_neighbours: 0.1 # The tolerance for the targetted number of neighbours. - max_ghost_iterations: 30 # Maximal number of iterations allowed to converge towards the smoothing length. max_smoothing_length: 1. # Maximal smoothing length allowed (in internal units). CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. - max_volume_change: 2. # Maximal allowed change of volume over one time-step # Parameters related to the initial conditions InitialConditions: diff --git a/examples/SodShock/sodShock.yml b/examples/SodShock/sodShock.yml index 62ed0c68cad8e063f49ab0655aee0e10f57e6314..003b7286777230ceb3b84ee01c6a1f335aeb9476 100644 --- a/examples/SodShock/sodShock.yml +++ b/examples/SodShock/sodShock.yml @@ -32,10 +32,8 @@ Statistics: SPH: resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). delta_neighbours: 0.1 # The tolerance for the targetted number of neighbours. - max_ghost_iterations: 30 # Maximal number of iterations allowed to converge towards the smoothing length. max_smoothing_length: 0.01 # Maximal smoothing length allowed (in internal units). CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. - max_volume_change: 2. # Maximal allowed change of volume over one time-step # Parameters related to the initial conditions InitialConditions: diff --git a/examples/UniformBox/uniformBox.yml b/examples/UniformBox/uniformBox.yml index 64f9a135fff6ecb2e4500ade95d1500a0712f2c8..50afbee02ddc8a801ca77ed4900b7d2e0e3b50b5 100644 --- a/examples/UniformBox/uniformBox.yml +++ b/examples/UniformBox/uniformBox.yml @@ -32,10 +32,8 @@ Statistics: SPH: resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). delta_neighbours: 0.1 # The tolerance for the targetted number of neighbours. - max_ghost_iterations: 30 # Maximal number of iterations allowed to converge towards the smoothing length. max_smoothing_length: 0.1 # Maximal smoothing length allowed (in internal units). CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. - max_volume_change: 2. # Maximal allowed change of volume over one time-step # Parameters related to the initial conditions InitialConditions: diff --git a/examples/main.c b/examples/main.c index 6422ccceef0a5f5c5f3b5acce9ba60b957986aac..016f948bb1b031ba756a344d020d4b26c8a42bed 100644 --- a/examples/main.c +++ b/examples/main.c @@ -74,6 +74,8 @@ void print_help_message() { printf(" %2s %8s %s\n", "-g", "", "Run with an external gravitational potential"); printf(" %2s %8s %s\n", "-G", "", "Run with self-gravity"); + printf(" %2s %8s %s\n", "-n", "{int}", + "Execute a fixed number of time steps"); printf(" %2s %8s %s\n", "-s", "", "Run with SPH"); printf(" %2s %8s %s\n", "-t", "{int}", "The number of threads to use on each MPI rank. Defaults to 1 if not " @@ -93,7 +95,6 @@ void print_help_message() { * @brief Main routine that loads a few particles and generates some output. * */ - int main(int argc, char *argv[]) { struct clocks_time tic, toc; @@ -139,6 +140,7 @@ int main(int argc, char *argv[]) { int with_aff = 0; int dry_run = 0; int dump_tasks = 0; + int nsteps = -2; int with_cosmology = 0; int with_external_gravity = 0; int with_self_gravity = 0; @@ -151,7 +153,7 @@ int main(int argc, char *argv[]) { /* Parse the parameters */ int c; - while ((c = getopt(argc, argv, "acdef:gGhst:v:y:")) != -1) switch (c) { + while ((c = getopt(argc, argv, "acdef:gGhn:st:v:y:")) != -1) switch (c) { case 'a': with_aff = 1; break; @@ -180,6 +182,13 @@ int main(int argc, char *argv[]) { case 'h': if (myrank == 0) print_help_message(); return 0; + case 'n': + if (sscanf(optarg, "%d", &nsteps) != 1) { + if (myrank == 0) printf("Error parsing fixed number of steps.\n"); + if (myrank == 0) print_help_message(); + return 1; + } + break; case 's': with_hydro = 1; break; @@ -323,18 +332,21 @@ int main(int argc, char *argv[]) { size_t Ngas = 0, Ngpart = 0; double dim[3] = {0., 0., 0.}; int periodic = 0; + int flag_entropy_ICs = 0; if (myrank == 0) clocks_gettime(&tic); #if defined(WITH_MPI) #if defined(HAVE_PARALLEL_HDF5) read_ic_parallel(ICfileName, dim, &parts, &gparts, &Ngas, &Ngpart, &periodic, - myrank, nr_nodes, MPI_COMM_WORLD, MPI_INFO_NULL, dry_run); + &flag_entropy_ICs, myrank, nr_nodes, MPI_COMM_WORLD, + MPI_INFO_NULL, dry_run); #else read_ic_serial(ICfileName, dim, &parts, &gparts, &Ngas, &Ngpart, &periodic, - myrank, nr_nodes, MPI_COMM_WORLD, MPI_INFO_NULL, dry_run); + &flag_entropy_ICs, myrank, nr_nodes, MPI_COMM_WORLD, + MPI_INFO_NULL, dry_run); #endif #else read_ic_single(ICfileName, dim, &parts, &gparts, &Ngas, &Ngpart, &periodic, - dry_run); + &flag_entropy_ICs, dry_run); #endif if (myrank == 0) { clocks_gettime(&toc); @@ -355,7 +367,7 @@ int main(int argc, char *argv[]) { free(parts); parts = NULL; for (size_t k = 0; k < Ngpart; ++k) - if (gparts[k].id > 0) error("Linking problem"); + if (gparts[k].id_or_neg_offset < 0) error("Linking problem"); Ngas = 0; } @@ -467,7 +479,7 @@ int main(int argc, char *argv[]) { #endif /* Initialise the particles */ - engine_init_particles(&e); + engine_init_particles(&e, flag_entropy_ICs); /* Legend */ if (myrank == 0) @@ -475,17 +487,15 @@ int main(int argc, char *argv[]) { "Updates", "g-Updates", "Wall-clock time", clocks_getunit()); /* Main simulation loop */ - for (int j = 0; !engine_is_done(&e); j++) { + for (int j = 0; !engine_is_done(&e) && e.step != nsteps; j++) { /* Repartition the space amongst the nodes? */ #ifdef WITH_MPI if (j % 100 == 2) e.forcerepart = reparttype; #endif + /* Reset timers */ timers_reset(timers_mask_all); -#ifdef COUNTER - for (k = 0; k < runner_counter_count; k++) runner_counter[k] = 0; -#endif /* Take a step. */ engine_step(&e); diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml index 9b2ebd028a3506e231aa36ed2078bb44faf4eb49..fb1e6ff6931b2fdee00792eed7f178872ee6d950 100644 --- a/examples/parameter_example.yml +++ b/examples/parameter_example.yml @@ -39,10 +39,10 @@ Statistics: SPH: resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel). delta_neighbours: 0.1 # The tolerance for the targetted number of neighbours. - max_ghost_iterations: 30 # Maximal number of iterations allowed to converge towards the smoothing length. + max_ghost_iterations: 30 # (Optional) Maximal number of iterations allowed to converge towards the smoothing length. max_smoothing_length: 0.1 # Maximal smoothing length allowed (in internal units). CFL_condition: 0.1 # Courant-Friedrich-Levy condition for time integration. - max_volume_change: 2. # Maximal allowed change of kernel volume over one time-step + max_volume_change: 2. # (Optional) Maximal allowed change of kernel volume over one time-step # Parameters related to the initial conditions InitialConditions: diff --git a/examples/plot_tasks.py b/examples/plot_tasks.py index e070bdb565d6d8305b7974137292910dd720afe2..f2d0aa95d1f35f30476e1989349a07be8d9e5b0a 100755 --- a/examples/plot_tasks.py +++ b/examples/plot_tasks.py @@ -55,8 +55,8 @@ PLOT_PARAMS = {"axes.labelsize": 10, pl.rcParams.update(PLOT_PARAMS) # Tasks and subtypes. Indexed as in tasks.h. -TASKTYPES = ["none", "sort", "self", "pair", "sub", "init", "ghost", "drift", - "kick", "kick_fixdt", "send", "recv", "grav_pp", "grav_mm", +TASKTYPES = ["none", "sort", "self", "pair", "sub_self", "sub_pair", "init", "ghost", + "drift", "kick", "kick_fixdt", "send", "recv", "grav_pp", "grav_mm", "grav_up", "grav_down", "grav_external", "part_sort", "gpart_sort", "split_cell", "rewait", "count"] @@ -64,7 +64,8 @@ TASKCOLOURS = {"none": "black", "sort": "lightblue", "self": "greenyellow", "pair": "navy", - "sub": "hotpink", + "sub_self": "greenyellow", + "sub_pair": "navy", "init": "indigo", "ghost": "cyan", "drift": "maroon", diff --git a/examples/plot_tasks_MPI.py b/examples/plot_tasks_MPI.py index 44705a0164034f8a22f1f3aa860a64d3e7656d2e..9a92faf9417c9a302831eb8cb2f4471eb672d59c 100755 --- a/examples/plot_tasks_MPI.py +++ b/examples/plot_tasks_MPI.py @@ -61,8 +61,8 @@ PLOT_PARAMS = {"axes.labelsize": 10, pl.rcParams.update(PLOT_PARAMS) # Tasks and subtypes. Indexed as in tasks.h. -TASKTYPES = ["none", "sort", "self", "pair", "sub", "init", "ghost", "drift", - "kick", "kick_fixdt", "send", "recv", "grav_pp", "grav_mm", +TASKTYPES = ["none", "sort", "self", "pair", "sub_self", "sub_pair", "init", "ghost", + "drift", "kick", "kick_fixdt", "send", "recv", "grav_pp", "grav_mm", "grav_up", "grav_down", "grav_external", "part_sort", "gpart_sort", "split_cell", "rewait", "count"] @@ -70,7 +70,8 @@ TASKCOLOURS = {"none": "black", "sort": "lightblue", "self": "greenyellow", "pair": "navy", - "sub": "hotpink", + "sub_self": "greenyellow", + "sub_pair": "navy", "init": "indigo", "ghost": "cyan", "drift": "maroon", diff --git a/src/Makefile.am b/src/Makefile.am index 9ed6f9f19d4f659119271338e6673d7a14b8f9ee..ba9e2373a72ddd21a747337edca9a5c22a2cd26d 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,4 +1,3 @@ - # This file is part of SWIFT. # Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk), # Matthieu Schaller (matthieu.schaller@durham.ac.uk). @@ -17,14 +16,21 @@ # along with this program. If not, see <http://www.gnu.org/licenses/>. # Add the debug flag to the whole thing -AM_CFLAGS = -DTIMER -DCOUNTER $(HDF5_CPPFLAGS) +AM_CFLAGS = -DTIMER $(HDF5_CPPFLAGS) # Assign a "safe" version number -AM_LDFLAGS = $(LAPACK_LIBS) $(BLAS_LIBS) $(HDF5_LDFLAGS) -version-info 0:0:0 # -fsanitize=address +AM_LDFLAGS = $(HDF5_LDFLAGS) -version-info 0:0:0 # The git command, if available. GIT_CMD = @GIT_CMD@ +# Additional dependencies for shared libraries. +EXTRA_LIBS = $(HDF5_LIBS) $(PROFILER_LIBS) $(TCMALLOC_LIBS) + +# MPI libraries. +MPI_LIBS = $(METIS_LIBS) $(MPI_THREAD_LIBS) +MPI_FLAGS = -DWITH_MPI $(METIS_INCS) + # Build the libswiftsim library lib_LTLIBRARIES = libswiftsim.la # Build a MPI-enabled version too? @@ -67,10 +73,13 @@ nobase_noinst_HEADERS = approx_math.h atomic.h cycle.h error.h inline.h kernel_h # Sources and flags for regular library libswiftsim_la_SOURCES = $(AM_SOURCES) +libswiftsim_la_CFLAGS = $(AM_CFLAGS) +libswiftsim_la_LDFLAGS = $(AM_LDFLAGS) $(EXTRA_LIBS) # Sources and flags for MPI library libswiftsim_mpi_la_SOURCES = $(AM_SOURCES) -libswiftsim_mpi_la_CFLAGS = $(AM_CFLAGS) -DWITH_MPI $(METIS_INCS) +libswiftsim_mpi_la_CFLAGS = $(AM_CFLAGS) $(MPI_FLAGS) +libswiftsim_mpi_la_LDFLAGS = $(AM_LDFLAGS) $(MPI_LIBS) $(EXTRA_LIBS) libswiftsim_mpi_la_SHORTNAME = mpi diff --git a/src/approx_math.h b/src/approx_math.h index ef93ea63c383c74caa3eaff65446962872389a35..cbca602b3fafcc5044b0939b2207b8f9d50a7446 100644 --- a/src/approx_math.h +++ b/src/approx_math.h @@ -19,6 +19,8 @@ #ifndef SWIFT_APPROX_MATH_H #define SWIFT_APPROX_MATH_H +#include "inline.h" + /** * @brief Approximate version of expf(x) using a 4th order Taylor expansion * diff --git a/src/cell.c b/src/cell.c index 97a67583a7872999d7b7e7c5a0eb9787c83b4bd3..cd83ddab570a3c848e0502629534743e8f383a4f 100644 --- a/src/cell.c +++ b/src/cell.c @@ -128,6 +128,7 @@ int cell_unpack(struct pcell *pc, struct cell *c, struct space *s) { } /* Return the total number of unpacked cells. */ + c->pcell_size = count; return count; } @@ -213,6 +214,39 @@ int cell_pack(struct cell *c, struct pcell *pc) { pc->progeny[k] = -1; /* Return the number of packed cells used. */ + c->pcell_size = count; + return count; +} + +int cell_pack_ti_ends(struct cell *c, int *ti_ends) { + + /* Pack this cell's data. */ + ti_ends[0] = c->ti_end_min; + + /* Fill in the progeny, depth-first recursion. */ + int count = 1; + for (int k = 0; k < 8; k++) + if (c->progeny[k] != NULL) { + count += cell_pack_ti_ends(c->progeny[k], &ti_ends[count]); + } + + /* Return the number of packed values. */ + return count; +} + +int cell_unpack_ti_ends(struct cell *c, int *ti_ends) { + + /* Unpack this cell's data. */ + c->ti_end_min = ti_ends[0]; + + /* Fill in the progeny, depth-first recursion. */ + int count = 1; + for (int k = 0; k < 8; k++) + if (c->progeny[k] != NULL) { + count += cell_unpack_ti_ends(c->progeny[k], &ti_ends[count]); + } + + /* Return the number of packed values. */ return count; } @@ -269,7 +303,7 @@ int cell_locktree(struct cell *c) { /* Undo the holds up to finger. */ for (struct cell *finger2 = c->parent; finger2 != finger; finger2 = finger2->parent) - __sync_fetch_and_sub(&finger2->hold, 1); + atomic_dec(&finger2->hold); /* Unlock this cell. */ if (lock_unlock(&c->lock) != 0) error("Failed to unlock cell."); @@ -309,7 +343,7 @@ int cell_glocktree(struct cell *c) { if (lock_trylock(&finger->glock) != 0) break; /* Increment the hold. */ - __sync_fetch_and_add(&finger->ghold, 1); + atomic_inc(&finger->ghold); /* Unlock the cell. */ if (lock_unlock(&finger->glock) != 0) error("Failed to unlock cell."); @@ -327,7 +361,7 @@ int cell_glocktree(struct cell *c) { /* Undo the holds up to finger. */ for (struct cell *finger2 = c->parent; finger2 != finger; finger2 = finger2->parent) - __sync_fetch_and_sub(&finger2->ghold, 1); + atomic_dec(&finger2->ghold); /* Unlock this cell. */ if (lock_unlock(&c->glock) != 0) error("Failed to unlock cell."); @@ -353,7 +387,7 @@ void cell_unlocktree(struct cell *c) { /* Climb up the tree and unhold the parents. */ for (struct cell *finger = c->parent; finger != NULL; finger = finger->parent) - __sync_fetch_and_sub(&finger->hold, 1); + atomic_dec(&finger->hold); TIMER_TOC(timer_locktree); } @@ -367,7 +401,7 @@ void cell_gunlocktree(struct cell *c) { /* Climb up the tree and unhold the parents. */ for (struct cell *finger = c->parent; finger != NULL; finger = finger->parent) - __sync_fetch_and_sub(&finger->ghold, 1); + atomic_dec(&finger->ghold); TIMER_TOC(timer_locktree); } @@ -376,9 +410,11 @@ void cell_gunlocktree(struct cell *c) { * @brief Sort the parts into eight bins along the given pivots. * * @param c The #cell array to be sorted. + * @param parts_offset Offset of the cell parts array relative to the + * space's parts array, i.e. c->parts - s->parts. */ -void cell_split(struct cell *c) { +void cell_split(struct cell *c, ptrdiff_t parts_offset) { int i, j; const int count = c->count, gcount = c->gcount; @@ -406,12 +442,14 @@ void cell_split(struct cell *c) { xparts[j] = xtemp; } } - /* for ( k = 0 ; k <= j ; k++ ) - if ( parts[k].x[0] > pivot[0] ) - error( "cell_split: sorting failed." ); - for ( k = i ; k < count ; k++ ) - if ( parts[k].x[0] < pivot[0] ) - error( "cell_split: sorting failed." ); */ + +#ifdef SWIFT_DEBUG_CHECKS + for (int k = 0; k <= j; k++) + if (parts[k].x[0] > pivot[0]) error("cell_split: sorting failed."); + for (int k = i; k < count; k++) + if (parts[k].x[0] < pivot[0]) error("cell_split: sorting failed."); +#endif + left[1] = i; right[1] = count - 1; left[0] = 0; @@ -433,14 +471,17 @@ void cell_split(struct cell *c) { xparts[j] = xtemp; } } - /* for ( int kk = left[k] ; kk <= j ; kk++ ) - if ( parts[kk].x[1] > pivot[1] ) { - message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j ); - error( "sorting failed (left)." ); - } - for ( int kk = i ; kk <= right[k] ; kk++ ) - if ( parts[kk].x[1] < pivot[1] ) - error( "sorting failed (right)." ); */ + +#ifdef SWIFT_DEBUG_CHECKS + for (int kk = left[k]; kk <= j; kk++) + if (parts[kk].x[1] > pivot[1]) { + message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j); + error("sorting failed (left)."); + } + for (int kk = i; kk <= right[k]; kk++) + if (parts[kk].x[1] < pivot[1]) error("sorting failed (right)."); +#endif + left[2 * k + 1] = i; right[2 * k + 1] = right[k]; left[2 * k] = left[k]; @@ -463,16 +504,20 @@ void cell_split(struct cell *c) { xparts[j] = xtemp; } } - /* for ( int kk = left[k] ; kk <= j ; kk++ ) - if ( parts[kk].x[2] > pivot[2] ) { - message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j ); - error( "sorting failed (left)." ); - } - for ( int kk = i ; kk <= right[k] ; kk++ ) - if ( parts[kk].x[2] < pivot[2] ) { - message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j ); - error( "sorting failed (right)." ); - } */ + +#ifdef SWIFT_DEBUG_CHECKS + for (int kk = left[k]; kk <= j; kk++) + if (parts[kk].x[2] > pivot[2]) { + message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j); + error("sorting failed (left)."); + } + for (int kk = i; kk <= right[k]; kk++) + if (parts[kk].x[2] < pivot[2]) { + message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j); + error("sorting failed (right)."); + } +#endif + left[2 * k + 1] = i; right[2 * k + 1] = right[k]; left[2 * k] = left[k]; @@ -487,35 +532,36 @@ void cell_split(struct cell *c) { } /* Re-link the gparts. */ - for (int k = 0; k < count; k++) - if (parts[k].gpart != NULL) parts[k].gpart->part = &parts[k]; + part_relink_gparts(parts, count, parts_offset); +#ifdef SWIFT_DEBUG_CHECKS /* Verify that _all_ the parts have been assigned to a cell. */ - /* for ( k = 1 ; k < 8 ; k++ ) - if ( &c->progeny[k-1]->parts[ c->progeny[k-1]->count ] != - c->progeny[k]->parts ) - error( "Particle sorting failed (internal consistency)." ); - if ( c->progeny[0]->parts != c->parts ) - error( "Particle sorting failed (left edge)." ); - if ( &c->progeny[7]->parts[ c->progeny[7]->count ] != &c->parts[ count ] ) - error( "Particle sorting failed (right edge)." ); */ + for (int k = 1; k < 8; k++) + if (&c->progeny[k - 1]->parts[c->progeny[k - 1]->count] != + c->progeny[k]->parts) + error("Particle sorting failed (internal consistency)."); + if (c->progeny[0]->parts != c->parts) + error("Particle sorting failed (left edge)."); + if (&c->progeny[7]->parts[c->progeny[7]->count] != &c->parts[count]) + error("Particle sorting failed (right edge)."); /* Verify a few sub-cells. */ - /* for (int k = 0 ; k < c->progeny[0]->count ; k++ ) - if ( c->progeny[0]->parts[k].x[0] > pivot[0] || - c->progeny[0]->parts[k].x[1] > pivot[1] || - c->progeny[0]->parts[k].x[2] > pivot[2] ) - error( "Sorting failed (progeny=0)." ); - for (int k = 0 ; k < c->progeny[1]->count ; k++ ) - if ( c->progeny[1]->parts[k].x[0] > pivot[0] || - c->progeny[1]->parts[k].x[1] > pivot[1] || - c->progeny[1]->parts[k].x[2] <= pivot[2] ) - error( "Sorting failed (progeny=1)." ); - for (int k = 0 ; k < c->progeny[2]->count ; k++ ) - if ( c->progeny[2]->parts[k].x[0] > pivot[0] || - c->progeny[2]->parts[k].x[1] <= pivot[1] || - c->progeny[2]->parts[k].x[2] > pivot[2] ) - error( "Sorting failed (progeny=2)." ); */ + for (int k = 0; k < c->progeny[0]->count; k++) + if (c->progeny[0]->parts[k].x[0] > pivot[0] || + c->progeny[0]->parts[k].x[1] > pivot[1] || + c->progeny[0]->parts[k].x[2] > pivot[2]) + error("Sorting failed (progeny=0)."); + for (int k = 0; k < c->progeny[1]->count; k++) + if (c->progeny[1]->parts[k].x[0] > pivot[0] || + c->progeny[1]->parts[k].x[1] > pivot[1] || + c->progeny[1]->parts[k].x[2] <= pivot[2]) + error("Sorting failed (progeny=1)."); + for (int k = 0; k < c->progeny[2]->count; k++) + if (c->progeny[2]->parts[k].x[0] > pivot[0] || + c->progeny[2]->parts[k].x[1] <= pivot[1] || + c->progeny[2]->parts[k].x[2] > pivot[2]) + error("Sorting failed (progeny=2)."); +#endif /* Now do the same song and dance for the gparts. */ @@ -581,8 +627,7 @@ void cell_split(struct cell *c) { } /* Re-link the parts. */ - for (int k = 0; k < gcount; k++) - if (gparts[k].id > 0) gparts[k].part->gpart = &gparts[k]; + part_relink_parts(gparts, gcount, parts - parts_offset); } /** diff --git a/src/cell.h b/src/cell.h index 65665c9dcfcd652cc688f652008044d618b10790..b5cc49d05c1cee6e0a884cdb708abbaff4281829 100644 --- a/src/cell.h +++ b/src/cell.h @@ -24,11 +24,15 @@ #define SWIFT_CELL_H /* Includes. */ +#include <stddef.h> + +/* Local includes. */ #include "lock.h" #include "multipole.h" #include "part.h" +#include "task.h" -/* Forward declaration of space, needed for cell_unpack. */ +/* Avoid cyclic inclusions */ struct space; /* Max tag size set to 2^29 to take into account some MPI implementations @@ -121,7 +125,10 @@ struct cell { struct task *ghost, *init, *drift, *kick; /* Task receiving data. */ - struct task *recv_xv, *recv_rho; + struct task *recv_xv, *recv_rho, *recv_ti; + + /* Task send data. */ + struct link *send_xv, *send_rho, *send_ti; /* Tasks for gravity tree. */ struct task *grav_up, *grav_down; @@ -174,13 +181,15 @@ struct cell { ((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i))) /* Function prototypes. */ -void cell_split(struct cell *c); +void cell_split(struct cell *c, ptrdiff_t parts_offset); int cell_locktree(struct cell *c); void cell_unlocktree(struct cell *c); int cell_glocktree(struct cell *c); void cell_gunlocktree(struct cell *c); int cell_pack(struct cell *c, struct pcell *pc); int cell_unpack(struct pcell *pc, struct cell *c, struct space *s); +int cell_pack_ti_ends(struct cell *c, int *ti_ends); +int cell_unpack_ti_ends(struct cell *c, int *ti_ends); int cell_getsize(struct cell *c); int cell_link_parts(struct cell *c, struct part *parts); int cell_link_gparts(struct cell *c, struct gpart *gparts); diff --git a/src/common_io.c b/src/common_io.c index 4399c83f7ebd2bb5355b7df9cc1fdecd301e36aa..971fe6b01c682c489d3444ec1d47d6a902250bb8 100644 --- a/src/common_io.c +++ b/src/common_io.c @@ -43,6 +43,8 @@ #include "const.h" #include "error.h" #include "kernel_hydro.h" +#include "part.h" +#include "units.h" #include "version.h" const char* particle_type_names[NUM_PARTICLE_TYPES] = { @@ -514,12 +516,10 @@ void prepare_dm_gparts(struct gpart* const gparts, size_t Ndm) { /* Let's give all these gparts a negative id */ for (size_t i = 0; i < Ndm; ++i) { - /* 0 or negative ids are not allowed */ - if (gparts[i].id <= 0) - error("0 or negative ID for DM particle %zd: ID=%lld", i, gparts[i].id); - - gparts[i].id = -gparts[i].id; + if (gparts[i].id_or_neg_offset <= 0) + error("0 or negative ID for DM particle %zd: ID=%lld", i, + gparts[i].id_or_neg_offset); } } @@ -553,7 +553,7 @@ void duplicate_hydro_gparts(struct part* const parts, gparts[i + Ndm].mass = parts[i].mass; /* Link the particles */ - gparts[i + Ndm].part = &parts[i]; + gparts[i + Ndm].id_or_neg_offset = -i; parts[i].gpart = &gparts[i + Ndm]; } } @@ -578,9 +578,8 @@ void collect_dm_gparts(const struct gpart* const gparts, size_t Ntot, * gparts[i].part); */ /* And collect the DM ones */ - if (gparts[i].id < 0LL) { - memcpy(&dmparts[count], &gparts[i], sizeof(struct gpart)); - dmparts[count].id = -dmparts[count].id; + if (gparts[i].id_or_neg_offset > 0) { + dmparts[count] = gparts[i]; count++; } } diff --git a/src/common_io.h b/src/common_io.h index ed1c96801c904d9952c7b3ca995b847afdc3fb43..fa6811a26671a2ed85c12ada7bb380094f55795d 100644 --- a/src/common_io.h +++ b/src/common_io.h @@ -23,13 +23,11 @@ /* Config parameters. */ #include "../config.h" -/* Includes. */ -#include "kernel_hydro.h" +#if defined(HAVE_HDF5) + #include "part.h" #include "units.h" -#if defined(HAVE_HDF5) - /** * @brief The different types of data used in the GADGET IC files. * @@ -107,6 +105,6 @@ void writeXMFline(FILE* xmfFile, char* fileName, char* partTypeGroupName, void writeCodeDescription(hid_t h_file); void writeUnitSystem(hid_t h_file, struct UnitSystem* us); -#endif +#endif /* defined HDF5 */ #endif /* SWIFT_COMMON_IO_H */ diff --git a/src/const.h b/src/const.h index 6432ef6a9e8107d94c93a32967e291d7e1e4d24d..673aa3bf0093c1c426938a384fd017c1c1d18310 100644 --- a/src/const.h +++ b/src/const.h @@ -60,4 +60,7 @@ /* Gravity properties */ #define EXTERNAL_POTENTIAL_POINTMASS +/* Are we debugging ? */ +//#define SWIFT_DEBUG_CHECKS + #endif /* SWIFT_CONST_H */ diff --git a/src/debug.c b/src/debug.c index a3647915d96a9456cb6d8177d144dab56fd0b97e..e7b60c52f92fdb64fdefc8ac5c5e353f1ab57e6e 100644 --- a/src/debug.c +++ b/src/debug.c @@ -20,11 +20,19 @@ * ******************************************************************************/ +/* Config parameters. */ +#include "../config.h" + +/* Some standard headers. */ #include <stdio.h> +/* This object's header. */ +#include "debug.h" + +/* Local includes. */ #include "config.h" #include "const.h" -#include "debug.h" +#include "inline.h" #include "part.h" /* Import the right hydro definition */ @@ -51,9 +59,8 @@ * * (Should be used for debugging only as it runs in O(N).) */ - -void printParticle(struct part *parts, struct xpart *xparts, long long int id, - size_t N) { +void printParticle(const struct part *parts, struct xpart *xparts, + long long int id, size_t N) { int found = 0; @@ -69,19 +76,33 @@ void printParticle(struct part *parts, struct xpart *xparts, long long int id, if (!found) printf("## Particles[???] id=%lld not found\n", id); } -void printgParticle(struct gpart *gparts, long long int id, size_t N) { +/** + * @brief Looks for the g-particle with the given id and prints its information + * to + * the standard output. + * + * @param gparts The array of g-particles. + * @param parts The array of particles. + * @param id The id too look for. + * @param N The size of the array of g-particles. + * + * (Should be used for debugging only as it runs in O(N).) + */ +void printgParticle(const struct gpart *gparts, const struct part *parts, + long long int id, size_t N) { int found = 0; /* Look for the particle. */ for (size_t i = 0; i < N; i++) - if (gparts[i].id == -id) { - printf("## gParticle[%zd] (DM) :\n id=%lld ", i, -gparts[i].id); + if (gparts[i].id_or_neg_offset == id) { + printf("## gParticle[%zd] (DM) :\n id=%lld", i, id); gravity_debug_particle(&gparts[i]); found = 1; break; - } else if (gparts[i].id > 0 && gparts[i].part->id == id) { - printf("## gParticle[%zd] (hydro) :\n id=%lld ", i, gparts[i].id); + } else if (gparts[i].id_or_neg_offset < 0 && + parts[-gparts[i].id_or_neg_offset].id == id) { + printf("## gParticle[%zd] (hydro) :\n id=%lld", i, id); gravity_debug_particle(&gparts[i]); found = 1; break; @@ -95,16 +116,26 @@ void printgParticle(struct gpart *gparts, long long int id, size_t N) { * * @param p The particle to print * @param xp The extended data ot the particle to print - * */ +void printParticle_single(const struct part *p, const struct xpart *xp) { -void printParticle_single(struct part *p, struct xpart *xp) { - - printf("## Particle: id=%lld ", p->id); + printf("## Particle: id=%lld", p->id); hydro_debug_particle(p, xp); printf("\n"); } +/** + * @brief Prints the details of a given particle to stdout + * + * @param gp The g-particle to print + */ +void printgParticle_single(struct gpart *gp) { + + printf("## g-Particle: id=%lld ", gp->id_or_neg_offset); + gravity_debug_particle(gp); + printf("\n"); +} + #ifdef HAVE_METIS /** diff --git a/src/debug.h b/src/debug.h index fba5cb68680472715025f9fee39a5bf06abacf81..367241201977d9b79a8c2913dbae5d08f1148529 100644 --- a/src/debug.h +++ b/src/debug.h @@ -23,10 +23,12 @@ #include "cell.h" #include "part.h" -void printParticle(struct part *parts, struct xpart *xparts, long long int id, - size_t N); -void printgParticle(struct gpart *parts, long long int id, size_t N); -void printParticle_single(struct part *p, struct xpart *xp); +void printParticle(const struct part *parts, struct xpart *xparts, + long long int id, size_t N); +void printgParticle(const struct gpart *gparts, const struct part *parts, + long long int id, size_t N); +void printParticle_single(const struct part *p, const struct xpart *xp); +void printgParticle_single(struct gpart *gp); #ifdef HAVE_METIS #include "metis.h" diff --git a/src/engine.c b/src/engine.c index 56d5e1cdd97285e35bb76d7c96c873885d3febc2..8267b52adb2013263c54103892956e5eb8aee01a 100644 --- a/src/engine.c +++ b/src/engine.c @@ -59,15 +59,26 @@ #include "parallel_io.h" #include "part.h" #include "partition.h" +#include "proxy.h" +#include "runner.h" #include "serial_io.h" #include "single_io.h" #include "timers.h" - -const char *engine_policy_names[13] = { - "none", "rand", "steal", "keep", - "block", "fix_dt", "cpu_tight", "mpi", - "numa_affinity", "hydro", "self_gravity", "external_gravity", - "cosmology_integration"}; +#include "units.h" + +const char *engine_policy_names[13] = {"none", + "rand", + "steal", + "keep", + "block", + "fix_dt", + "cpu_tight", + "mpi", + "numa_affinity", + "hydro", + "self_gravity", + "external_gravity", + "cosmology_integration"}; /** The rank of the engine as a global variable (for messages). */ int engine_rank; @@ -87,7 +98,7 @@ static cpu_set_t entry_affinity; * @return The new #link pointer. */ -void engine_addlink(struct engine *e, struct link **l, struct task *t) { +struct link *engine_addlink(struct engine *e, struct link *l, struct task *t) { /* Get the next free link. */ const int ind = atomic_inc(&e->nr_links); @@ -102,8 +113,8 @@ void engine_addlink(struct engine *e, struct link **l, struct task *t) { } /** - * @brief Generate the hierarchical tasks for a hierarchy of cells - i.e. all - * the O(Npart) tasks. + * @brief Generate the gravity hierarchical tasks for a hierarchy of cells - + * i.e. all the O(Npart) tasks. * * Tasks are only created here. The dependencies will be added later on. * @@ -111,9 +122,8 @@ void engine_addlink(struct engine *e, struct link **l, struct task *t) { * @param c The #cell. * @param super The super #cell. */ - -void engine_make_hierarchical_tasks(struct engine *e, struct cell *c, - struct cell *super) { +void engine_make_gravity_hierarchical_tasks(struct engine *e, struct cell *c, + struct cell *super) { struct scheduler *s = &e->sched; const int is_with_external_gravity = @@ -122,45 +132,102 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c, const int is_fixdt = (e->policy & engine_policy_fixdt) == engine_policy_fixdt; /* Am I the super-cell? */ - if (super == NULL && (c->count > 0 || c->gcount > 0)) { + /* TODO(pedro): Add a condition for gravity tasks as well. */ + if (super == NULL && + (c->density != NULL || (!c->split && (c->count > 0 || c->gcount > 0)))) { - /* Remember me. */ + /* This is the super cell, i.e. the first with density tasks attached. */ super = c; /* Local tasks only... */ if (c->nodeID == e->nodeID) { /* Add the init task. */ - c->init = scheduler_addtask(s, task_type_init, task_subtype_none, 0, 0, c, - NULL, 0); + if (c->init == NULL) + c->init = scheduler_addtask(s, task_type_init, task_subtype_none, 0, 0, + c, NULL, 0); /* Add the drift task. */ - c->drift = scheduler_addtask(s, task_type_drift, task_subtype_none, 0, 0, - c, NULL, 0); + if (c->drift == NULL) + c->drift = scheduler_addtask(s, task_type_drift, task_subtype_none, 0, + 0, c, NULL, 0); /* Add the kick task that matches the policy. */ if (is_fixdt) { - c->kick = scheduler_addtask(s, task_type_kick_fixdt, task_subtype_none, - 0, 0, c, NULL, 0); + if (c->kick == NULL) + c->kick = scheduler_addtask(s, task_type_kick_fixdt, + task_subtype_none, 0, 0, c, NULL, 0); } else { - c->kick = scheduler_addtask(s, task_type_kick, task_subtype_none, 0, 0, - c, NULL, 0); + if (c->kick == NULL) + c->kick = scheduler_addtask(s, task_type_kick, task_subtype_none, 0, + 0, c, NULL, 0); } - if (c->count > 0) { + if (is_with_external_gravity) + c->grav_external = scheduler_addtask( + s, task_type_grav_external, task_subtype_none, 0, 0, c, NULL, 0); + } + } + + /* Set the super-cell. */ + c->super = super; - /* Generate the ghost task. */ - c->ghost = scheduler_addtask(s, task_type_ghost, task_subtype_none, 0, - 0, c, NULL, 0); - } + /* Recurse. */ + if (c->split) + for (int k = 0; k < 8; k++) + if (c->progeny[k] != NULL) + engine_make_gravity_hierarchical_tasks(e, c->progeny[k], super); +} + +/** + * @brief Generate the hydro hierarchical tasks for a hierarchy of cells - + * i.e. all the O(Npart) tasks. + * + * Tasks are only created here. The dependencies will be added later on. + * + * @param e The #engine. + * @param c The #cell. + * @param super The super #cell. + */ +void engine_make_hydro_hierarchical_tasks(struct engine *e, struct cell *c, + struct cell *super) { + + struct scheduler *s = &e->sched; + const int is_fixdt = (e->policy & engine_policy_fixdt) == engine_policy_fixdt; - if (c->gcount > 0) { + /* Is this the super-cell? */ + if (super == NULL && (c->density != NULL || (c->count > 0 && !c->split))) { + + /* This is the super cell, i.e. the first with density tasks attached. */ + super = c; - /* Add the external gravity tasks */ - if (is_with_external_gravity) - c->grav_external = scheduler_addtask( - s, task_type_grav_external, task_subtype_none, 0, 0, c, NULL, 0); + /* Local tasks only... */ + if (c->nodeID == e->nodeID) { + + /* Add the init task. */ + if (c->init == NULL) + c->init = scheduler_addtask(s, task_type_init, task_subtype_none, 0, 0, + c, NULL, 0); + + /* Add the drift task. */ + if (c->drift == NULL) + c->drift = scheduler_addtask(s, task_type_drift, task_subtype_none, 0, + 0, c, NULL, 0); + + /* Add the kick task that matches the policy. */ + if (is_fixdt) { + if (c->kick == NULL) + c->kick = scheduler_addtask(s, task_type_kick_fixdt, + task_subtype_none, 0, 0, c, NULL, 0); + } else { + if (c->kick == NULL) + c->kick = scheduler_addtask(s, task_type_kick, task_subtype_none, 0, + 0, c, NULL, 0); } + + /* Generate the ghost task. */ + c->ghost = scheduler_addtask(s, task_type_ghost, task_subtype_none, 0, 0, + c, NULL, 0); } } @@ -171,7 +238,7 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c, if (c->split) for (int k = 0; k < 8; k++) if (c->progeny[k] != NULL) - engine_make_hierarchical_tasks(e, c->progeny[k], super); + engine_make_hydro_hierarchical_tasks(e, c->progeny[k], super); } /** @@ -236,9 +303,12 @@ void engine_redistribute(struct engine *e) { } const int cid = cell_getid(cdim, parts[k].x[0] * ih[0], parts[k].x[1] * ih[1], parts[k].x[2] * ih[2]); - /* if (cid < 0 || cid >= s->nr_cells) - error("Bad cell id %i for part %i at [%.3e,%.3e,%.3e].", - cid, k, parts[k].x[0], parts[k].x[1], parts[k].x[2]); */ +#ifdef SWIFT_DEBUG_CHECKS + if (cid < 0 || cid >= s->nr_cells) + error("Bad cell id %i for part %zi at [%.3e,%.3e,%.3e].", cid, k, + parts[k].x[0], parts[k].x[1], parts[k].x[2]); +#endif + dest[k] = cells[cid].nodeID; /* The counts array is indexed as count[from * nr_nodes + to]. */ @@ -265,11 +335,12 @@ void engine_redistribute(struct engine *e) { count_this_dest = 0; } - /* Debug */ - /* if(s->parts[k].gpart->id < 0) */ - /* error("Trying to link a partnerless gpart !"); */ +#ifdef SWIFT_DEBUG_CHECKS + if (s->parts[k].gpart->id_or_neg_offset >= 0) + error("Trying to link a partnerless gpart !"); +#endif - s->parts[k].gpart->id = count_this_dest; + s->parts[k].gpart->id_or_neg_offset = -count_this_dest; count_this_dest++; } } @@ -287,9 +358,12 @@ void engine_redistribute(struct engine *e) { } const int cid = cell_getid(cdim, gparts[k].x[0] * ih[0], gparts[k].x[1] * ih[1], gparts[k].x[2] * ih[2]); - /* if (cid < 0 || cid >= s->nr_cells) - error("Bad cell id %i for part %i at [%.3e,%.3e,%.3e].", - cid, k, g_parts[k].x[0], g_parts[k].x[1], g_parts[k].x[2]); */ +#ifdef SWIFT_DEBUG_CHECKS + if (cid < 0 || cid >= s->nr_cells) + error("Bad cell id %i for part %zi at [%.3e,%.3e,%.3e].", cid, k, + gparts[k].x[0], gparts[k].x[1], gparts[k].x[2]); +#endif + g_dest[k] = cells[cid].nodeID; /* The counts array is indexed as count[from * nr_nodes + to]. */ @@ -450,13 +524,14 @@ void engine_redistribute(struct engine *e) { for (size_t k = offset_gparts; k < offset_gparts + count_gparts; ++k) { /* Does this gpart have a partner ? */ - if (gparts_new[k].id >= 0) { + if (gparts_new[k].id_or_neg_offset <= 0) { - const size_t partner_index = offset_parts + gparts_new[k].id; + const ptrdiff_t partner_index = + offset_parts - gparts_new[k].id_or_neg_offset; /* Re-link */ - gparts_new[k].part = &parts_new[partner_index]; - gparts_new[k].part->gpart = &gparts_new[k]; + gparts_new[k].id_or_neg_offset = -partner_index; + parts_new[partner_index].gpart = &gparts_new[k]; } } @@ -464,37 +539,39 @@ void engine_redistribute(struct engine *e) { offset_gparts += count_gparts; } +#ifdef SWIFT_DEBUG_CHECKS /* Verify that all parts are in the right place. */ - /* for ( int k = 0 ; k < nr_parts ; k++ ) { - int cid = cell_getid( cdim , parts_new[k].x[0]*ih[0], - parts_new[k].x[1]*ih[1], parts_new[k].x[2]*ih[2] ); - if ( cells[ cid ].nodeID != nodeID ) - error( "Received particle (%i) that does not belong here - (nodeID=%i).", k , cells[ cid ].nodeID ); - } */ + for (int k = 0; k < nr_parts; k++) { + int cid = cell_getid(cdim, parts_new[k].x[0] * ih[0], + parts_new[k].x[1] * ih[1], parts_new[k].x[2] * ih[2]); + if (cells[cid].nodeID != nodeID) + error("Received particle (%i) that does not belong here (nodeID=%i).", k, + cells[cid].nodeID); + } /* Verify that the links are correct */ /* MATTHIEU: To be commented out once we are happy */ for (size_t k = 0; k < nr_gparts; ++k) { - if (gparts_new[k].id > 0) { + if (gparts_new[k].id_or_neg_offset <= 0) { - if (gparts_new[k].part->gpart != &gparts_new[k]) - error("Linking problem !"); + struct part *part = &parts_new[-gparts_new[k].id_or_neg_offset]; - if (gparts_new[k].x[0] != gparts_new[k].part->x[0] || - gparts_new[k].x[1] != gparts_new[k].part->x[1] || - gparts_new[k].x[2] != gparts_new[k].part->x[2]) + if (part->gpart != &gparts_new[k]) error("Linking problem !"); + + if (gparts_new[k].x[0] != part->x[0] || + gparts_new[k].x[1] != part->x[1] || gparts_new[k].x[2] != part->x[2]) error("Linked particles are not at the same position !"); } } for (size_t k = 0; k < nr_parts; ++k) { - if (parts_new[k].gpart != NULL) { - - if (parts_new[k].gpart->part != &parts_new[k]) error("Linking problem !"); + if (parts_new[k].gpart != NULL && + parts_new[k].gpart->id_or_neg_offset != -k) { + error("Linking problem !"); } } +#endif /* Set the new part data, free the old. */ free(parts); @@ -605,48 +682,63 @@ void engine_addtasks_grav(struct engine *e, struct cell *c, struct task *up, * * @param e The #engine. * @param ci The sending #cell. - * @param cj The receiving #cell + * @param cj Dummy cell containing the nodeID of the receiving node. + * @param t_xv The send_xv #task, if it has already been created. + * @param t_rho The send_rho #task, if it has already been created. + * @param t_ti The send_ti #task, if required and has already been created. */ - -void engine_addtasks_send(struct engine *e, struct cell *ci, struct cell *cj) { +void engine_addtasks_send(struct engine *e, struct cell *ci, struct cell *cj, + struct task *t_xv, struct task *t_rho, + struct task *t_ti) { #ifdef WITH_MPI struct link *l = NULL; struct scheduler *s = &e->sched; + const int nodeID = cj->nodeID; /* Check if any of the density tasks are for the target node. */ for (l = ci->density; l != NULL; l = l->next) - if (l->t->ci->nodeID == cj->nodeID || - (l->t->cj != NULL && l->t->cj->nodeID == cj->nodeID)) + if (l->t->ci->nodeID == nodeID || + (l->t->cj != NULL && l->t->cj->nodeID == nodeID)) break; /* If so, attach send tasks. */ if (l != NULL) { - /* Create the tasks. */ - struct task *t_xv = scheduler_addtask(s, task_type_send, task_subtype_none, - 2 * ci->tag, 0, ci, cj, 0); - struct task *t_rho = scheduler_addtask(s, task_type_send, task_subtype_none, - 2 * ci->tag + 1, 0, ci, cj, 0); + /* Create the tasks and their dependencies? */ + if (t_xv == NULL) { + t_xv = scheduler_addtask(s, task_type_send, task_subtype_none, + 3 * ci->tag, 0, ci, cj, 0); + t_rho = scheduler_addtask(s, task_type_send, task_subtype_none, + 3 * ci->tag + 1, 0, ci, cj, 0); + if (!(e->policy & engine_policy_fixdt)) + t_ti = scheduler_addtask(s, task_type_send, task_subtype_tend, + 3 * ci->tag + 2, 0, ci, cj, 0); - /* The first send should depend on the engine's send_root. */ - scheduler_addunlock(s, e->send_root, t_xv); + /* The send_rho task depends on the cell's ghost task. */ + scheduler_addunlock(s, ci->super->ghost, t_rho); - /* The send_rho task depends on the cell's ghost task. */ - scheduler_addunlock(s, ci->super->ghost, t_rho); + /* The send_rho task should unlock the super-cell's kick task. */ + scheduler_addunlock(s, t_rho, ci->super->kick); - /* The send_rho task should unlock the super-cell's kick task. */ - scheduler_addunlock(s, t_rho, ci->super->kick); + /* The send_xv task should unlock the super-cell's ghost task. */ + scheduler_addunlock(s, t_xv, ci->super->ghost); - /* The send_xv task should unlock the super-cell's ghost task. */ - scheduler_addunlock(s, t_xv, ci->super->ghost); + /* The super-cell's kick task should unlock the send_ti task. */ + if (t_ti != NULL) scheduler_addunlock(s, ci->super->kick, t_ti); + } + /* Add them to the local cell. */ + ci->send_xv = engine_addlink(e, ci->send_xv, t_xv); + ci->send_rho = engine_addlink(e, ci->send_rho, t_rho); + if (t_ti != NULL) ci->send_ti = engine_addlink(e, ci->send_ti, t_ti); } /* Recurse? */ - else if (ci->split) + if (ci->split) for (int k = 0; k < 8; k++) - if (ci->progeny[k] != NULL) engine_addtasks_send(e, ci->progeny[k], cj); + if (ci->progeny[k] != NULL) + engine_addtasks_send(e, ci->progeny[k], cj, t_xv, t_rho, t_ti); #else error("SWIFT was not compiled with MPI support."); @@ -657,44 +749,52 @@ void engine_addtasks_send(struct engine *e, struct cell *ci, struct cell *cj) { * @brief Add recv tasks to a hierarchy of cells. * * @param e The #engine. - * @param c The #cell. + * @param c The foreign #cell. * @param t_xv The recv_xv #task, if it has already been created. * @param t_rho The recv_rho #task, if it has already been created. + * @param t_ti The recv_ti #task, if required and has already been created. */ void engine_addtasks_recv(struct engine *e, struct cell *c, struct task *t_xv, - struct task *t_rho) { + struct task *t_rho, struct task *t_ti) { #ifdef WITH_MPI struct scheduler *s = &e->sched; - /* Do we need to construct a recv task? */ - if (t_xv == NULL && c->nr_density > 0) { + /* Do we need to construct a recv task? + Note that since c is a foreign cell, all its density tasks will involve + only the current rank, and thus we don't have to check them.*/ + if (t_xv == NULL && c->density != NULL) { /* Create the tasks. */ - t_xv = c->recv_xv = scheduler_addtask(s, task_type_recv, task_subtype_none, - 2 * c->tag, 0, c, NULL, 0); - t_rho = c->recv_rho = scheduler_addtask( - s, task_type_recv, task_subtype_none, 2 * c->tag + 1, 0, c, NULL, 0); - - /* The first recv should depend on the engine's recv_root. */ - scheduler_addunlock(s, e->recv_root, t_xv); - } + t_xv = scheduler_addtask(s, task_type_recv, task_subtype_none, 3 * c->tag, + 0, c, NULL, 0); + t_rho = scheduler_addtask(s, task_type_recv, task_subtype_none, + 3 * c->tag + 1, 0, c, NULL, 0); + if (!(e->policy & engine_policy_fixdt)) + t_ti = scheduler_addtask(s, task_type_recv, task_subtype_tend, + 3 * c->tag + 2, 0, c, NULL, 0); + } + c->recv_xv = t_xv; + c->recv_rho = t_rho; + c->recv_ti = t_ti; /* Add dependencies. */ for (struct link *l = c->density; l != NULL; l = l->next) { scheduler_addunlock(s, t_xv, l->t); scheduler_addunlock(s, l->t, t_rho); } - for (struct link *l = c->force; l != NULL; l = l->next) + for (struct link *l = c->force; l != NULL; l = l->next) { scheduler_addunlock(s, t_rho, l->t); + if (t_ti != NULL) scheduler_addunlock(s, l->t, t_ti); + } if (c->sorts != NULL) scheduler_addunlock(s, t_xv, c->sorts); /* Recurse? */ if (c->split) for (int k = 0; k < 8; k++) if (c->progeny[k] != NULL) - engine_addtasks_recv(e, c->progeny[k], t_xv, t_rho); + engine_addtasks_recv(e, c->progeny[k], t_xv, t_rho, t_ti); #else error("SWIFT was not compiled with MPI support."); @@ -888,7 +988,8 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts, /* Re-link the associated gpart with the buffer offset of the part. */ if (s->parts[offset_parts + k].gpart != NULL) { - s->parts[offset_parts + k].gpart->id = e->proxies[pid].nr_parts_out; + s->parts[offset_parts + k].gpart->id_or_neg_offset = + -e->proxies[pid].nr_parts_out; } /* Load the part and xpart into the proxy. */ @@ -904,7 +1005,7 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts, error( "Do not have a proxy for the requested nodeID %i for part with " "id=%lli, x=[%e,%e,%e].", - node_id, s->gparts[offset_parts + k].id, + node_id, s->gparts[offset_parts + k].id_or_neg_offset, s->gparts[offset_gparts + k].x[0], s->gparts[offset_parts + k].x[1], s->gparts[offset_gparts + k].x[2]); proxy_gparts_load(&e->proxies[pid], &s->gparts[offset_gparts + k], 1); @@ -964,7 +1065,7 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts, s->xparts = xparts_new; for (size_t k = 0; k < offset_parts; k++) { if (s->parts[k].gpart != NULL) { - s->parts[k].gpart->part = &s->parts[k]; + s->parts[k].gpart->id_or_neg_offset = -k; } } } @@ -979,8 +1080,8 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts, free(s->gparts); s->gparts = gparts_new; for (size_t k = 0; k < offset_gparts; k++) { - if (s->gparts[k].id > 0) { - s->gparts[k].part->gpart = &s->gparts[k]; + if (s->gparts[k].id_or_neg_offset < 0) { + s->parts[-s->gparts[k].id_or_neg_offset].gpart = &s->gparts[k]; } } } @@ -1053,9 +1154,10 @@ void engine_exchange_strays(struct engine *e, size_t offset_parts, /* Re-link the gparts. */ for (int k = 0; k < p->nr_gparts_in; k++) { struct gpart *gp = &s->gparts[offset_gparts + count_gparts + k]; - if (gp->id >= 0) { - struct part *p = &s->parts[offset_parts + count_parts + gp->id]; - gp->part = p; + if (gp->id_or_neg_offset <= 0) { + struct part *p = + &s->parts[offset_gparts + count_parts - gp->id_or_neg_offset]; + gp->id_or_neg_offset = s->parts - p; p->gpart = gp; } } @@ -1172,6 +1274,7 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, struct engine *e = (struct engine *)extra_data; struct task *tasks = (struct task *)map_data; struct scheduler *sched = &e->sched; + const int nr_tasks = sched->nr_tasks; for (int ind = 0; ind < num_elements; ind++) { @@ -1203,20 +1306,23 @@ void engine_count_and_link_tasks_mapper(void *map_data, int num_elements, engine_addlink(e, &t->cj->density, t); atomic_inc(&t->cj->nr_density); } - } else if (t->type == task_type_sub) { + } else if (t->type == task_type_sub_self) { atomic_inc(&t->ci->nr_tasks); if (t->cj != NULL) atomic_inc(&t->cj->nr_tasks); if (t->subtype == task_subtype_density) { engine_addlink(e, &t->ci->density, t); atomic_inc(&t->ci->nr_density); - if (t->cj != NULL) { - engine_addlink(e, &t->cj->density, t); - atomic_inc(&t->cj->nr_density); - } + } + } else if (t->type == task_type_sub_pair) { + atomic_inc(&t->ci->nr_tasks); + atomic_inc(&t->cj->nr_tasks); + if (t->subtype == task_subtype_density) { + t->ci->density = engine_addlink(e, t->ci->density, t); + atomic_inc(&t->ci->nr_density); + t->cj->density = engine_addlink(e, t->cj->density, t); + atomic_inc(&t->cj->nr_density); } } - } -} void engine_count_and_link_tasks_serial(struct engine *e) { @@ -1252,16 +1358,21 @@ void engine_count_and_link_tasks_serial(struct engine *e) { engine_addlink(e, &t->cj->density, t); atomic_inc(&t->cj->nr_density); } - } else if (t->type == task_type_sub) { + } else if (t->type == task_type_sub_self) { atomic_inc(&t->ci->nr_tasks); if (t->cj != NULL) atomic_inc(&t->cj->nr_tasks); if (t->subtype == task_subtype_density) { engine_addlink(e, &t->ci->density, t); atomic_inc(&t->ci->nr_density); - if (t->cj != NULL) { - engine_addlink(e, &t->cj->density, t); - atomic_inc(&t->cj->nr_density); - } + } + } else if (t->type == task_type_sub_pair) { + atomic_inc(&t->ci->nr_tasks); + atomic_inc(&t->cj->nr_tasks); + if (t->subtype == task_subtype_density) { + t->ci->density = engine_addlink(e, t->ci->density, t); + atomic_inc(&t->ci->nr_density); + t->cj->density = engine_addlink(e, t->cj->density, t); + atomic_inc(&t->cj->nr_density); } } } @@ -1443,29 +1554,47 @@ void engine_make_extra_hydroloop_tasks_serial(struct engine *e) { } } - /* Otherwise, sub interaction? */ - else if (t->type == task_type_sub && t->subtype == task_subtype_density) { + /* Otherwise, sub-self interaction? */ + else if (t->type == task_type_sub_self && + t->subtype == task_subtype_density) { /* Start by constructing the task for the second hydro loop */ struct task *t2 = - scheduler_addtask(sched, task_type_sub, task_subtype_force, t->flags, - 0, t->ci, t->cj, 0); + scheduler_addtask(sched, task_type_sub_self, task_subtype_force, + t->flags, 0, t->ci, t->cj, 0); - /* Add the link between the new loop and both cells */ - engine_addlink(e, &t->ci->force, t2); + /* Add the link between the new loop and the cell */ + t->ci->force = engine_addlink(e, t->ci->force, t2); atomic_inc(&t->ci->nr_force); - if (t->cj != NULL) { - engine_addlink(e, &t->cj->force, t2); - atomic_inc(&t->cj->nr_force); + + /* Now, build all the dependencies for the hydro for the cells */ + /* that are local and are not descendant of the same super-cells */ + if (t->ci->nodeID == nodeID) { + engine_make_hydro_loops_dependencies(sched, t, t2, t->ci); } + } + + /* Otherwise, sub-pair interaction? */ + else if (t->type == task_type_sub_pair && + t->subtype == task_subtype_density) { + + /* Start by constructing the task for the second hydro loop */ + struct task *t2 = + scheduler_addtask(sched, task_type_sub_pair, task_subtype_force, + t->flags, 0, t->ci, t->cj, 0); + + /* Add the link between the new loop and both cells */ + t->ci->force = engine_addlink(e, t->ci->force, t2); + atomic_inc(&t->ci->nr_force); + t->cj->force = engine_addlink(e, t->cj->force, t2); + atomic_inc(&t->cj->nr_force); /* Now, build all the dependencies for the hydro for the cells */ /* that are local and are not descendant of the same super-cells */ if (t->ci->nodeID == nodeID) { engine_make_hydro_loops_dependencies(sched, t, t2, t->ci); } - if (t->cj != NULL && t->cj->nodeID == nodeID && - t->ci->super != t->cj->super) { + if (t->cj->nodeID == nodeID && t->ci->super != t->cj->super) { engine_make_hydro_loops_dependencies(sched, t, t2, t->cj); } } @@ -1599,8 +1728,14 @@ void engine_maketasks(struct engine *e) { engine_count_and_link_tasks_serial(e); /* Append hierarchical tasks to each cells */ - for (int k = 0; k < nr_cells; k++) - engine_make_hierarchical_tasks(e, &cells[k], NULL); + if (e->policy & engine_policy_hydro) + for (int k = 0; k < nr_cells; k++) + engine_make_hydro_hierarchical_tasks(e, &cells[k], NULL); + + if ((e->policy & engine_policy_self_gravity) || + (e->policy & engine_policy_external_gravity)) + for (int k = 0; k < nr_cells; k++) + engine_make_gravity_hierarchical_tasks(e, &cells[k], NULL); /* Run through the tasks and make force tasks for each density task. Each force task depends on the cell ghosts and unlocks the kick task @@ -1627,12 +1762,13 @@ void engine_maketasks(struct engine *e) { /* Loop through the proxy's incoming cells and add the recv tasks. */ for (int k = 0; k < p->nr_cells_in; k++) - engine_addtasks_recv(e, p->cells_in[k], NULL, NULL); + engine_addtasks_recv(e, p->cells_in[k], NULL, NULL, NULL); /* Loop through the proxy's outgoing cells and add the send tasks. */ for (int k = 0; k < p->nr_cells_out; k++) - engine_addtasks_send(e, p->cells_out[k], p->cells_in[0]); + engine_addtasks_send(e, p->cells_out[k], p->cells_in[0], NULL, NULL, + NULL); } } @@ -1670,7 +1806,7 @@ void engine_marktasks_fixdt_mapper(void *map_data, int num_elements, /* Pair? */ if (t->type == task_type_pair || - (t->type == task_type_sub && t->cj != NULL)) { + (t->type == task_type_sub_pair)) { /* Local pointers. */ const struct cell *ci = t->ci; @@ -1727,7 +1863,7 @@ void engine_marktasks_mapper(void *map_data, int num_elements, /* Pair? */ else if (t->type == task_type_pair || - (t->type == task_type_sub && t->cj != NULL)) { + (t->type == task_type_sub_pair)) { /* Local pointers. */ const struct cell *ci = t->ci; @@ -2114,7 +2250,11 @@ void engine_collect_drift(struct cell *c) { c->ang_mom[1] = ang_mom[1]; c->ang_mom[2] = ang_mom[2]; } - +/** + * @brief Print the conserved quantities statistics to a log file + * + * @param e The #engine. + */ void engine_print_stats(struct engine *e) { const struct space *s = e->s; @@ -2228,8 +2368,10 @@ void engine_launch(struct engine *e, int nr_runners, unsigned int mask, *forward in time. * * @param e The #engine + * @param flag_entropy_ICs Did the 'Internal Energy' of the particles actually + *contain entropy ? */ -void engine_init_particles(struct engine *e) { +void engine_init_particles(struct engine *e, int flag_entropy_ICs) { struct space *s = e->s; @@ -2288,6 +2430,7 @@ void engine_init_particles(struct engine *e) { mask |= 1 << task_type_send; mask |= 1 << task_type_recv; + submask |= 1 << task_subtype_tend; } /* Now, launch the calculation */ @@ -2296,7 +2439,7 @@ void engine_init_particles(struct engine *e) { TIMER_TOC(timer_runners); /* Apply some conversions (e.g. internal energy -> entropy) */ - space_map_cells_pre(s, 0, cell_convert_hydro, NULL); + if (!flag_entropy_ICs) space_map_cells_pre(s, 0, cell_convert_hydro, NULL); clocks_gettime(&time2); @@ -2394,7 +2537,8 @@ void engine_step(struct engine *e) { mask |= 1 << task_type_self; mask |= 1 << task_type_pair; - mask |= 1 << task_type_sub; + mask |= 1 << task_type_sub_self; + mask |= 1 << task_type_sub_pair; mask |= 1 << task_type_ghost; submask |= 1 << task_subtype_density; @@ -2417,6 +2561,7 @@ void engine_step(struct engine *e) { mask |= 1 << task_type_send; mask |= 1 << task_type_recv; + submask |= 1 << task_subtype_tend; } /* Send off the runners. */ @@ -2582,8 +2727,7 @@ void engine_split(struct engine *e, struct partition *initial_partition) { s->xparts = xparts_new; /* Re-link the gparts. */ - for (size_t k = 0; k < s->nr_parts; k++) - if (s->parts[k].gpart != NULL) s->parts[k].gpart->part = &s->parts[k]; + part_relink_gparts(s->parts, s->nr_parts, 0); /* Re-allocate the local gparts. */ if (e->verbose) @@ -2599,30 +2743,29 @@ void engine_split(struct engine *e, struct partition *initial_partition) { s->gparts = gparts_new; /* Re-link the parts. */ - for (size_t k = 0; k < s->nr_gparts; k++) - if (s->gparts[k].id > 0) s->gparts[k].part->gpart = &s->gparts[k]; + part_relink_parts(s->gparts, s->nr_gparts, s->parts); +#ifdef SWIFT_DEBUG_CHECKS /* Verify that the links are correct */ - /* MATTHIEU: To be commented out once we are happy */ for (size_t k = 0; k < s->nr_gparts; ++k) { - if (s->gparts[k].id > 0) { + if (s->gparts[k].id_or_neg_offset <= 0) { + + struct part *part = &s->parts[-s->gparts[k].id_or_neg_offset]; - if (s->gparts[k].part->gpart != &s->gparts[k]) error("Linking problem !"); + if (part->gpart != &s->gparts[k]) error("Linking problem !"); - if (s->gparts[k].x[0] != s->gparts[k].part->x[0] || - s->gparts[k].x[1] != s->gparts[k].part->x[1] || - s->gparts[k].x[2] != s->gparts[k].part->x[2]) + if (s->gparts[k].x[0] != part->x[0] || s->gparts[k].x[1] != part->x[1] || + s->gparts[k].x[2] != part->x[2]) error("Linked particles are not at the same position !"); } } for (size_t k = 0; k < s->nr_parts; ++k) { - if (s->parts[k].gpart != NULL) { - - if (s->parts[k].gpart->part != &s->parts[k]) error("Linking problem !"); - } + if (s->parts[k].gpart != NULL && s->parts[k].gpart->id_or_neg_offset != -k) + error("Linking problem !"); } +#endif #else error("SWIFT was not compiled with MPI support."); @@ -3015,9 +3158,6 @@ void engine_init(struct engine *e, struct space *s, part_create_mpi_types(); #endif - /* Initialize the threadpool. */ - threadpool_init(&e->threadpool, e->nr_threads); - /* First of all, init the barrier and lock it. */ if (pthread_mutex_init(&e->barrier_mutex, NULL) != 0) error("Failed to initialize barrier mutex."); @@ -3032,7 +3172,16 @@ void engine_init(struct engine *e, struct space *s, /* Init the scheduler with enough tasks for the initial sorting tasks. */ const int nr_tasks = 2 * s->tot_cells + 2 * e->nr_threads; scheduler_init(&e->sched, e->s, nr_tasks, nr_queues, scheduler_flag_steal, - e->nodeID, &e->threadpool); + e->nodeID); + + /* Create the sorting tasks. */ + for (int i = 0; i < e->nr_threads; i++) { + scheduler_addtask(&e->sched, task_type_part_sort, task_subtype_none, i, 0, + NULL, NULL, 0); + + scheduler_addtask(&e->sched, task_type_gpart_sort, task_subtype_none, i, 0, + NULL, NULL, 0); + } scheduler_ranktasks(&e->sched); diff --git a/src/engine.h b/src/engine.h index f1fea143e6252aa218ad69210a7e105aefdcc13c..6b00d7a4b8be8305bd8d0377db37a6dba2c7c7a7 100644 --- a/src/engine.h +++ b/src/engine.h @@ -37,13 +37,10 @@ #include <stdio.h> /* Includes. */ -#include "hydro_properties.h" -#include "lock.h" +#include "clocks.h" #include "parser.h" #include "partition.h" -#include "physical_constants.h" #include "potentials.h" -#include "proxy.h" #include "runner.h" #include "scheduler.h" #include "space.h" @@ -223,7 +220,7 @@ void engine_launch(struct engine *e, int nr_runners, unsigned int mask, unsigned int submask); void engine_prepare(struct engine *e); void engine_print(struct engine *e); -void engine_init_particles(struct engine *e); +void engine_init_particles(struct engine *e, int flag_entropy_ICs); void engine_step(struct engine *e); void engine_maketasks(struct engine *e); void engine_split(struct engine *e, struct partition *initial_partition); diff --git a/src/gravity/Default/gravity_debug.h b/src/gravity/Default/gravity_debug.h index 21d775d703c8bd7000ad03386fe818124642a8f1..7cf375a1fdf7bccc4131dc415ab2d4acbbf2d3bc 100644 --- a/src/gravity/Default/gravity_debug.h +++ b/src/gravity/Default/gravity_debug.h @@ -18,7 +18,7 @@ ******************************************************************************/ __attribute__((always_inline)) INLINE static void gravity_debug_particle( - struct gpart* p) { + const struct gpart* p) { printf( "x=[%.3e,%.3e,%.3e], " "v_full=[%.3e,%.3e,%.3e] \n a=[%.3e,%.3e,%.3e],\n " diff --git a/src/gravity/Default/gravity_io.h b/src/gravity/Default/gravity_io.h index 129c4b39828ca73d2d80d79edbdaa8ec4d5a9e01..74f364dd97361f0513755bedec83fe7cb277c36b 100644 --- a/src/gravity/Default/gravity_io.h +++ b/src/gravity/Default/gravity_io.h @@ -39,8 +39,8 @@ __attribute__((always_inline)) INLINE static void darkmatter_read_particles( COMPULSORY); readArray(h_grp, "Velocities", FLOAT, N, 3, gparts, N_total, offset, v_full, COMPULSORY); - readArray(h_grp, "ParticleIDs", ULONGLONG, N, 1, gparts, N_total, offset, id, - COMPULSORY); + readArray(h_grp, "ParticleIDs", ULONGLONG, N, 1, gparts, N_total, offset, + id_or_neg_offset, COMPULSORY); } /** @@ -75,6 +75,6 @@ __attribute__((always_inline)) INLINE static void darkmatter_write_particles( Ndm, 3, gparts, Ndm_total, mpi_rank, offset, v_full, us, UNIT_CONV_SPEED); writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "ParticleIDs", - ULONGLONG, Ndm, 1, gparts, Ndm_total, mpi_rank, offset, id, us, - UNIT_CONV_NO_UNITS); + ULONGLONG, Ndm, 1, gparts, Ndm_total, mpi_rank, offset, + id_or_neg_offset, us, UNIT_CONV_NO_UNITS); } diff --git a/src/gravity/Default/gravity_part.h b/src/gravity/Default/gravity_part.h index a54fc1d18e7e84a1ca7a7e7913117ed207549271..b6c9e62559207cb25323c8a108e4bffc87ea0fcf 100644 --- a/src/gravity/Default/gravity_part.h +++ b/src/gravity/Default/gravity_part.h @@ -47,14 +47,8 @@ struct gpart { /* float tx; */ /* float tv; */ - /* Anonymous union for id/part. */ - union { - - /* Particle ID. */ - long long id; - - /* Pointer to corresponding SPH part. */ - struct part* part; - }; + /* Particle ID. If negative, it is the negative offset of the #part with + which this gpart is linked. */ + long long id_or_neg_offset; } __attribute__((aligned(gpart_align))); diff --git a/src/hydro/Default/hydro_debug.h b/src/hydro/Default/hydro_debug.h index 79ee392d46ca75a2c097bf045b2d82c9f3dc96c0..9b8136e8349398e4e9e804459a5de23d21043564 100644 --- a/src/hydro/Default/hydro_debug.h +++ b/src/hydro/Default/hydro_debug.h @@ -18,7 +18,7 @@ ******************************************************************************/ __attribute__((always_inline)) INLINE static void hydro_debug_particle( - struct part* p, struct xpart* xp) { + const struct part* p, const struct xpart* xp) { printf( "x=[%.3e,%.3e,%.3e], " "v=[%.3e,%.3e,%.3e],v_full=[%.3e,%.3e,%.3e] \n a=[%.3e,%.3e,%.3e],\n " diff --git a/src/hydro/Default/hydro_iact.h b/src/hydro/Default/hydro_iact.h index 4d651b61bd934388414d54fa813820111d26e682..0a577931b5e0ca67ab07dfe414a548da66e82cdd 100644 --- a/src/hydro/Default/hydro_iact.h +++ b/src/hydro/Default/hydro_iact.h @@ -20,12 +20,6 @@ #ifndef SWIFT_RUNNER_IACT_H #define SWIFT_RUNNER_IACT_H -/* Includes. */ -#include "const.h" -#include "kernel_hydro.h" -#include "part.h" -#include "vector.h" - /** * @brief SPH interaction functions following the Gadget-2 version of SPH. * @@ -44,7 +38,6 @@ /** * @brief Density loop */ - __attribute__((always_inline)) INLINE static void runner_iact_density( float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { @@ -218,7 +211,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_density( /** * @brief Density loop (non-symmetric version) */ - __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density( float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { @@ -267,7 +259,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density( /** * @brief Density loop (non-symmetric vectorized version) */ - __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { @@ -360,7 +351,6 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj, /** * @brief Force loop */ - __attribute__((always_inline)) INLINE static void runner_iact_force( float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { @@ -456,7 +446,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_force( /** * @brief Force loop (Vectorized version) */ - __attribute__((always_inline)) INLINE static void runner_iact_vec_force( float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { @@ -675,7 +664,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force( /** * @brief Force loop (non-symmetric version) */ - __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force( float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { @@ -766,7 +754,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force( /** * @brief Force loop (Vectorized non-symmetric version) */ - __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { diff --git a/src/hydro/Default/hydro_io.h b/src/hydro/Default/hydro_io.h index de71963b05149fa0b2df6222424478a6ad9b1f44..fb1f7af922b2272779c7251b15e304880a2af520 100644 --- a/src/hydro/Default/hydro_io.h +++ b/src/hydro/Default/hydro_io.h @@ -122,3 +122,10 @@ void writeSPHflavour(hid_t h_grpsph) { writeAttribute_f(h_grpsph, "Maximal Delta u change over dt", const_max_u_change); } + +/** + * @brief Are we writing entropy in the internal energy field ? + * + * @return 1 if entropy is in 'internal energy', 0 otherwise. + */ +int writeEntropyFlag() { return 0; } diff --git a/src/hydro/Gadget2/hydro_debug.h b/src/hydro/Gadget2/hydro_debug.h index 31e89d438bc96c0f0f2ba56249664d28036cb607..b67e79182ccaaee7c0421c57a91ec9fa2adae65c 100644 --- a/src/hydro/Gadget2/hydro_debug.h +++ b/src/hydro/Gadget2/hydro_debug.h @@ -18,7 +18,7 @@ ******************************************************************************/ __attribute__((always_inline)) INLINE static void hydro_debug_particle( - struct part* p, struct xpart* xp) { + const struct part* p, const struct xpart* xp) { printf( "x=[%.3e,%.3e,%.3e], " "v=[%.3e,%.3e,%.3e],v_full=[%.3e,%.3e,%.3e] \n a=[%.3e,%.3e,%.3e],\n " diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h index 8fcae293280e55bb838edf3c243f4e322914216f..8738b4be09931df4c938f1dff3adeed11468dcfc 100644 --- a/src/hydro/Gadget2/hydro_iact.h +++ b/src/hydro/Gadget2/hydro_iact.h @@ -20,12 +20,6 @@ #ifndef SWIFT_RUNNER_IACT_LEGACY_H #define SWIFT_RUNNER_IACT_LEGACY_H -/* Includes. */ -#include "const.h" -#include "kernel_hydro.h" -#include "part.h" -#include "vector.h" - /** * @brief SPH interaction functions following the Gadget-2 version of SPH. * @@ -42,7 +36,6 @@ /** * @brief Density loop */ - __attribute__((always_inline)) INLINE static void runner_iact_density( float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { @@ -113,7 +106,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_density( /** * @brief Density loop (non-symmetric version) */ - __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density( float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { @@ -162,7 +154,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density( /** * @brief Force loop */ - __attribute__((always_inline)) INLINE static void runner_iact_force( float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { @@ -260,7 +251,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_force( /** * @brief Force loop (non-symmetric version) */ - __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force( float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { diff --git a/src/hydro/Gadget2/hydro_io.h b/src/hydro/Gadget2/hydro_io.h index b977f25386fab0787c925635297a48fa85a8df24..f7ee04cbbad07ebb06d00ee39a614b9fed5c4dfd 100644 --- a/src/hydro/Gadget2/hydro_io.h +++ b/src/hydro/Gadget2/hydro_io.h @@ -83,8 +83,8 @@ __attribute__((always_inline)) INLINE static void hydro_write_particles( writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "SmoothingLength", FLOAT, N, 1, parts, N_total, mpi_rank, offset, h, us, UNIT_CONV_LENGTH); - writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "Entropy", FLOAT, N, - 1, parts, N_total, mpi_rank, offset, entropy, us, + writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "InternalEnergy", + FLOAT, N, 1, parts, N_total, mpi_rank, offset, entropy, us, UNIT_CONV_ENTROPY_PER_UNIT_MASS); writeArray(h_grp, fileName, xmfFile, partTypeGroupName, "ParticleIDs", ULONGLONG, N, 1, parts, N_total, mpi_rank, offset, id, us, @@ -113,3 +113,10 @@ void writeSPHflavour(hid_t h_grpsph) { writeAttribute_f(h_grpsph, "Viscosity alpha", const_viscosity_alpha); writeAttribute_f(h_grpsph, "Viscosity beta", 3.f); } + +/** + * @brief Are we writing entropy in the internal energy field ? + * + * @return 1 if entropy is in 'internal energy', 0 otherwise. + */ +int writeEntropyFlag() { return 1; } diff --git a/src/hydro/Minimal/hydro_debug.h b/src/hydro/Minimal/hydro_debug.h index 127ba75e99418b6a5dc197a44ccdc77de3cdef15..1e0e3f1f111149a3babeef431893dff2cbe6bb3c 100644 --- a/src/hydro/Minimal/hydro_debug.h +++ b/src/hydro/Minimal/hydro_debug.h @@ -18,7 +18,7 @@ ******************************************************************************/ __attribute__((always_inline)) INLINE static void hydro_debug_particle( - struct part* p, struct xpart* xp) { + const struct part* p, const struct xpart* xp) { printf( "x=[%.3e,%.3e,%.3e], " "v=[%.3e,%.3e,%.3e],v_full=[%.3e,%.3e,%.3e] \n a=[%.3e,%.3e,%.3e], " diff --git a/src/hydro/Minimal/hydro_iact.h b/src/hydro/Minimal/hydro_iact.h index f453d8fa1c495472af27ccf98356a3dab0894e98..c9da185b8a29eafe2a58420ae5de3a05ff043225 100644 --- a/src/hydro/Minimal/hydro_iact.h +++ b/src/hydro/Minimal/hydro_iact.h @@ -19,12 +19,6 @@ #ifndef SWIFT_RUNNER_IACT_MINIMAL_H #define SWIFT_RUNNER_IACT_MINIMAL_H -/* Includes. */ -#include "const.h" -#include "kernel_hydro.h" -#include "part.h" -#include "vector.h" - /** * @brief Minimal conservative implementation of SPH * @@ -70,7 +64,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_density( /** * @brief Density loop (non-symmetric version) */ - __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density( float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { @@ -95,7 +88,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density( /** * @brief Force loop */ - __attribute__((always_inline)) INLINE static void runner_iact_force( float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { @@ -168,7 +160,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_force( /** * @brief Force loop (non-symmetric version) */ - __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force( float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { diff --git a/src/hydro/Minimal/hydro_io.h b/src/hydro/Minimal/hydro_io.h index 1bbfe1065358017e0b27e2131ff717848221fe9c..bed6b8f8221dc3a6e94c65d101c68aa9b3bdea91 100644 --- a/src/hydro/Minimal/hydro_io.h +++ b/src/hydro/Minimal/hydro_io.h @@ -115,3 +115,10 @@ void writeSPHflavour(hid_t h_grpsph) { writeAttribute_f(h_grpsph, "Maximal Delta u change over dt", const_max_u_change); } + +/** + * @brief Are we writing entropy in the internal energy field ? + * + * @return 1 if entropy is in 'internal energy', 0 otherwise. + */ +int writeEntropyFlag() { return 0; } diff --git a/src/hydro_properties.c b/src/hydro_properties.c index b4e37d672408ad3c0adf1f37fcd4fcbd50095d92..16216f81a5b505fc3a887e86ca4898bc4179e4d5 100644 --- a/src/hydro_properties.c +++ b/src/hydro_properties.c @@ -29,6 +29,9 @@ #include "hydro.h" #include "kernel_hydro.h" +#define hydro_props_default_max_iterations 30 +#define hydro_props_default_volume_change 2.0f + void hydro_props_init(struct hydro_props *p, const struct swift_params *params) { @@ -39,13 +42,13 @@ void hydro_props_init(struct hydro_props *p, p->delta_neighbours = parser_get_param_float(params, "SPH:delta_neighbours"); /* Ghost stuff */ - p->max_smoothing_iterations = - parser_get_param_int(params, "SPH:max_ghost_iterations"); + p->max_smoothing_iterations = parser_get_opt_param_int( + params, "SPH:max_ghost_iterations", hydro_props_default_max_iterations); /* Time integration properties */ p->CFL_condition = parser_get_param_float(params, "SPH:CFL_condition"); - const float max_volume_change = - parser_get_param_float(params, "SPH:max_volume_change"); + const float max_volume_change = parser_get_opt_param_float( + params, "SPH:max_volume_change", hydro_props_default_volume_change); p->log_max_h_change = logf(powf(max_volume_change, 0.33333333333f)); } @@ -60,4 +63,8 @@ void hydro_props_print(const struct hydro_props *p) { "Hydrodynamic integration: Max change of volume: %.2f " "(max|dlog(h)/dt|=%f).", powf(expf(p->log_max_h_change), 3.f), p->log_max_h_change); + + if (p->max_smoothing_iterations != hydro_props_default_max_iterations) + message("Maximal iterations in ghost task set to %d (default is %d)", + p->max_smoothing_iterations, hydro_props_default_max_iterations); } diff --git a/src/lock.h b/src/lock.h index 31f80605efb29bfaaadc65e25ba4f31679f8fbe8..b2dd2eac9d0ca5d7807907e31cf3fa31894f9aed 100644 --- a/src/lock.h +++ b/src/lock.h @@ -23,7 +23,7 @@ #include <pthread.h> /* Includes. */ -#include "inline.h" +#include "atomic.h" #ifdef PTHREAD_SPINLOCK #include <pthread.h> @@ -50,14 +50,14 @@ #define lock_init(l) (*(l) = 0) #define lock_destroy(l) 0 INLINE static int lock_lock(volatile int *l) { - while (__sync_val_compare_and_swap(l, 0, 1) != 0) + while (atomic_cas(l, 0, 1) != 0) ; // while( *l ); return 0; } -#define lock_trylock(l) ((*(l)) ? 1 : __sync_val_compare_and_swap(l, 0, 1)) -#define lock_unlock(l) (__sync_val_compare_and_swap(l, 1, 0) != 1) -#define lock_unlock_blind(l) __sync_val_compare_and_swap(l, 1, 0) +#define lock_trylock(l) ((*(l)) ? 1 : atomic_cas(l, 0, 1)) +#define lock_unlock(l) (atomic_cas(l, 1, 0) != 1) +#define lock_unlock_blind(l) atomic_cas(l, 1, 0) #endif #endif /* SWIFT_LOCK_H */ diff --git a/src/map.c b/src/map.c index fbe57fde7b1e29c49b0f27d86d177245dd9a27e2..f4f9ac7cfa7141606b578739517b66e951e65eab 100644 --- a/src/map.c +++ b/src/map.c @@ -18,15 +18,23 @@ * ******************************************************************************/ -#include "map.h" +/* Config parameters. */ +#include "../config.h" + +/* Some standard headers. */ #include <stdio.h> #include <stdlib.h> + +/* This object's header. */ +#include "map.h" + +/* Local headers. */ +#include "atomic.h" #include "error.h" /** * @brief Mapping function to draw a specific cell (gnuplot). */ - void map_cells_plot(struct cell *c, void *data) { int depth = *(int *)data; @@ -80,24 +88,21 @@ void map_cells_plot(struct cell *c, void *data) { /** * @brief Mapping function for checking if each part is in its box. */ +void map_check(struct part *p, struct cell *c, void *data) { -/* void map_check ( struct part *p , struct cell *c , void *data ) { - - if ( p->x[0] < c->loc[0] || p->x[0] > c->loc[0]+c->h[0] || - p->x[0] < c->loc[0] || p->x[0] > c->loc[0]+c->h[0] || - p->x[0] < c->loc[0] || p->x[0] > c->loc[0]+c->h[0] ) - printf( "map_check: particle %i is outside of its box.\n" , p->id ); - - } */ + if (p->x[0] < c->loc[0] || p->x[0] > c->loc[0] + c->h[0] || + p->x[0] < c->loc[0] || p->x[0] > c->loc[0] + c->h[0] || + p->x[0] < c->loc[0] || p->x[0] > c->loc[0] + c->h[0]) + printf("map_check: particle %lld is outside of its box.\n", p->id); +} /** * @brief Mapping function for neighbour count. */ - void map_cellcheck(struct cell *c, void *data) { int *count = (int *)data; - __sync_fetch_and_add(count, c->count); + atomic_add(count, c->count); /* Loop over all parts and check if they are in the cell. */ for (int k = 0; k < c->count; k++) { @@ -133,7 +138,6 @@ void map_cellcheck(struct cell *c, void *data) { /** * @brief Mapping function for maxdepth cell count. */ - void map_maxdepth(struct cell *c, void *data) { int maxdepth = ((int *)data)[0]; @@ -147,7 +151,6 @@ void map_maxdepth(struct cell *c, void *data) { /** * @brief Mapping function for neighbour count. */ - void map_count(struct part *p, struct cell *c, void *data) { double *wcount = (double *)data; @@ -156,7 +159,6 @@ void map_count(struct part *p, struct cell *c, void *data) { *wcount += p->density.wcount; } - void map_wcount_min(struct part *p, struct cell *c, void *data) { struct part **p2 = (struct part **)data; @@ -188,7 +190,6 @@ void map_h_max(struct part *p, struct cell *c, void *data) { /** * @brief Mapping function for neighbour count. */ - void map_icount(struct part *p, struct cell *c, void *data) { // int *count = (int *)data; @@ -201,7 +202,6 @@ void map_icount(struct part *p, struct cell *c, void *data) { /** * @brief Mapping function to print the particle position. */ - void map_dump(struct part *p, struct cell *c, void *data) { double *shift = (double *)data; diff --git a/src/parallel_io.c b/src/parallel_io.c index e779b56a85da3db72ea860fb336fa42037fbcd0e..66b02d2bff2bc0d6f2dd69d994a93576e8b96583 100644 --- a/src/parallel_io.c +++ b/src/parallel_io.c @@ -37,7 +37,11 @@ /* Local includes. */ #include "common_io.h" +#include "engine.h" #include "error.h" +#include "kernel_hydro.h" +#include "part.h" +#include "units.h" /** * @brief Reads a data array from a given HDF5 group. @@ -363,8 +367,8 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, */ void read_ic_parallel(char* fileName, double dim[3], struct part** parts, struct gpart** gparts, size_t* Ngas, size_t* Ngparts, - int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm, - MPI_Info info, int dry_run) { + int* periodic, int* flag_entropy, int mpi_rank, + int mpi_size, MPI_Comm comm, MPI_Info info, int dry_run) { hid_t h_file = 0, h_grp = 0; /* GADGET has only cubic boxes (in cosmological mode) */ double boxSize[3] = {0.0, -1.0, -1.0}; @@ -400,6 +404,7 @@ void read_ic_parallel(char* fileName, double dim[3], struct part** parts, if (h_grp < 0) error("Error while opening file header\n"); /* Read the relevant information and print status */ + readAttribute(h_grp, "Flag_Entropy_ICs", INT, flag_entropy); readAttribute(h_grp, "BoxSize", DOUBLE, boxSize); readAttribute(h_grp, "NumPart_Total", UINT, numParticles); readAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticles_highWord); @@ -622,6 +627,7 @@ void write_output_parallel(struct engine* e, const char* baseName, double MassTable[6] = {0., 0., 0., 0., 0., 0.}; writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, NUM_PARTICLE_TYPES); unsigned int flagEntropy[NUM_PARTICLE_TYPES] = {0}; + flagEntropy[0] = writeEntropyFlag(); writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy, NUM_PARTICLE_TYPES); writeAttribute(h_grp, "NumFilesPerSnapshot", INT, &numFiles, 1); diff --git a/src/parallel_io.h b/src/parallel_io.h index 970ad8c41dcbc2df3a85b178f836e16926147788..709479fab029e862caddb36c4216b85077e96cec 100644 --- a/src/parallel_io.h +++ b/src/parallel_io.h @@ -19,6 +19,9 @@ #ifndef SWIFT_PARALLEL_IO_H #define SWIFT_PARALLEL_IO_H +/* Config parameters. */ +#include "../config.h" + /* MPI headers. */ #ifdef WITH_MPI #include <mpi.h> @@ -33,8 +36,8 @@ void read_ic_parallel(char* fileName, double dim[3], struct part** parts, struct gpart** gparts, size_t* Ngas, size_t* Ngparts, - int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm, - MPI_Info info, int dry_run); + int* periodic, int* flag_entropy, int mpi_rank, + int mpi_size, MPI_Comm comm, MPI_Info info, int dry_run); void write_output_parallel(struct engine* e, const char* baseName, struct UnitSystem* us, int mpi_rank, int mpi_size, diff --git a/src/part.c b/src/part.c index b89abdde40fe8c7a57d1e9ac9e18fece83ba1f21..b00eaccaae0e86f7c4e8019a307f0bf455687b7c 100644 --- a/src/part.c +++ b/src/part.c @@ -29,6 +29,36 @@ #include "error.h" #include "part.h" +/** + * @brief Re-link the #gpart%s associated with the list of #part%s. + * + * @param parts The list of #part. + * @param N The number of particles to re-link; + * @param offset The offset of #part%s relative to the global parts list. + */ +void part_relink_gparts(struct part *parts, size_t N, ptrdiff_t offset) { + for (size_t k = 0; k < N; k++) { + if (parts[k].gpart) { + parts[k].gpart->id_or_neg_offset = -(k + offset); + } + } +} + +/** + * @brief Re-link the #gpart%s associated with the list of #part%s. + * + * @param gparts The list of #gpart. + * @param N The number of particles to re-link; + * @param parts The global part array in which to find the #gpart offsets. + */ +void part_relink_parts(struct gpart *gparts, size_t N, struct part *parts) { + for (size_t k = 0; k < N; k++) { + if (gparts[k].id_or_neg_offset <= 0) { + parts[-gparts[k].id_or_neg_offset].gpart = &gparts[k]; + } + } +} + #ifdef WITH_MPI /* MPI data type for the particle transfers */ MPI_Datatype part_mpi_type; diff --git a/src/part.h b/src/part.h index 5d4c9c88a1acadea3d23a3df618c04da389fb61d..efca7b6b5bef49f20df1e2c45b30f65ecbbf4960 100644 --- a/src/part.h +++ b/src/part.h @@ -22,8 +22,8 @@ /* Config parameters. */ #include "../config.h" -/* Some standard headers. */ -#include <stdlib.h> +/* Standard headers. */ +#include <stddef.h> /* MPI headers. */ #ifdef WITH_MPI @@ -38,7 +38,7 @@ #define xpart_align 32 #define gpart_align 32 -/* Import the right particle definition */ +/* Import the right hydro particle definition */ #if defined(MINIMAL_SPH) #include "./hydro/Minimal/hydro_part.h" #elif defined(GADGET2_SPH) @@ -49,8 +49,11 @@ #error "Invalid choice of SPH variant" #endif +/* Import the right gravity particle definition */ #include "./gravity/Default/gravity_part.h" +void part_relink_gparts(struct part *parts, size_t N, ptrdiff_t offset); +void part_relink_parts(struct gpart *gparts, size_t N, struct part *parts); #ifdef WITH_MPI /* MPI data type for the particle transfers */ extern MPI_Datatype part_mpi_type; diff --git a/src/partition.c b/src/partition.c index b79dfd6b81d727b6e53f9e0906c7c59eee6607f1..6df437826de796c05143b2003dbdacb971d9b7be 100644 --- a/src/partition.c +++ b/src/partition.c @@ -453,9 +453,9 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID, /* Skip un-interesting tasks. */ if (t->type != task_type_self && t->type != task_type_pair && - t->type != task_type_sub && t->type != task_type_ghost && - t->type != task_type_drift && t->type != task_type_kick && - t->type != task_type_init) + t->type != task_type_sub_self && t->type != task_type_sub_self && + t->type != task_type_ghost && t->type != task_type_drift && + t->type != task_type_kick && t->type != task_type_init) continue; /* Get the task weight. */ @@ -496,15 +496,15 @@ static void repart_edge_metis(int partweights, int bothweights, int nodeID, /* Self interaction? */ else if ((t->type == task_type_self && ci->nodeID == nodeID) || - (t->type == task_type_sub && cj == NULL && ci->nodeID == nodeID)) { + (t->type == task_type_sub_self && cj == NULL && + ci->nodeID == nodeID)) { /* Self interactions add only to vertex weight. */ if (taskvweights) weights_v[cid] += w; } /* Pair? */ - else if (t->type == task_type_pair || - (t->type == task_type_sub && cj != NULL)) { + else if (t->type == task_type_pair || (t->type == task_type_sub_pair)) { /* In-cell pair? */ if (ci == cj) { /* Add weight to vertex for ci. */ diff --git a/src/queue.c b/src/queue.c index a62ded2561fc683bf2f44abcf1110b0fb7eebd0c..9883d77e66421eda6093331e0c9a8f6ac0155ded 100644 --- a/src/queue.c +++ b/src/queue.c @@ -38,22 +38,11 @@ #include "const.h" #include "error.h" -/* Counter macros. */ -#ifdef COUNTER -#define COUNT(c) (__sync_add_and_fetch(&queue_counter[c], 1)) -#else -#define COUNT(c) -#endif - -/* The counters. */ -int queue_counter[queue_counter_count]; - /** * @brief Enqueue all tasks in the incoming DEQ. * * @param q The #queue, assumed to be locked. */ - void queue_get_incoming(struct queue *q) { int *tid = q->tid; @@ -108,7 +97,6 @@ void queue_get_incoming(struct queue *q) { * @param q The #queue. * @param t The #task. */ - void queue_insert(struct queue *q, struct task *t) { /* Get an index in the DEQ. */ const int ind = atomic_inc(&q->last_incoming) % queue_incoming_size; @@ -140,7 +128,6 @@ void queue_insert(struct queue *q, struct task *t) { * @param q The #queue. * @param tasks List of tasks to which the queue indices refer to. */ - void queue_init(struct queue *q, struct task *tasks) { /* Allocate the task list if needed. */ @@ -176,7 +163,6 @@ void queue_init(struct queue *q, struct task *tasks) { * @param prev The previous #task extracted from this #queue. * @param blocking Block until access to the queue is granted. */ - struct task *queue_gettask(struct queue *q, const struct task *prev, int blocking) { diff --git a/src/runner.c b/src/runner.c index b51cb0898b8a21d185eca9bc7ff99b25cfbce700..78c183de8922235cf5729b3e16e7e7b37ed9a826 100644 --- a/src/runner.c +++ b/src/runner.c @@ -40,6 +40,7 @@ /* Local headers. */ #include "approx_math.h" #include "atomic.h" +#include "cell.h" #include "const.h" #include "debug.h" #include "drift.h" @@ -86,9 +87,6 @@ const char runner_flip[27] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, #define FUNCTION force #include "runner_doiact.h" -/* Import the gravity loop functions. */ -#include "runner_doiact_grav.h" - /** * @brief Calculate gravity acceleration from external potential * @@ -98,8 +96,8 @@ const char runner_flip[27] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, */ void runner_do_grav_external(struct runner *r, struct cell *c, int timer) { - struct gpart *g, *gparts = c->gparts; - int i, k, gcount = c->gcount; + struct gpart *restrict gparts = c->gparts; + const int gcount = c->gcount; const int ti_current = r->e->ti_current; const struct external_potential *potential = r->e->external_potential; const struct phys_const *constants = r->e->physical_constants; @@ -108,7 +106,7 @@ void runner_do_grav_external(struct runner *r, struct cell *c, int timer) { /* Recurse? */ if (c->split) { - for (k = 0; k < 8; k++) + for (int k = 0; k < 8; k++) if (c->progeny[k] != NULL) runner_do_grav_external(r, c->progeny[k], 0); return; } @@ -118,10 +116,10 @@ void runner_do_grav_external(struct runner *r, struct cell *c, int timer) { #endif /* Loop over the parts in this cell. */ - for (i = 0; i < gcount; i++) { + for (int i = 0; i < gcount; i++) { /* Get a direct pointer on the part. */ - g = &gparts[i]; + struct gpart *const g = &gparts[i]; /* Is this part within the time step? */ if (g->ti_end <= ti_current) { @@ -138,7 +136,6 @@ void runner_do_grav_external(struct runner *r, struct cell *c, int timer) { * @param sort The entries * @param N The number of entries. */ - void runner_do_sort_ascending(struct entry *sort, int N) { struct { @@ -220,7 +217,6 @@ void runner_do_sort_ascending(struct entry *sort, int N) { * @param clock Flag indicating whether to record the timing or not, needed * for recursive calls. */ - void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) { struct entry *finger; @@ -350,18 +346,18 @@ void runner_do_sort(struct runner *r, struct cell *c, int flags, int clock) { } } +#ifdef SWIFT_DEBUG_CHECKS /* Verify the sorting. */ - /* for ( j = 0 ; j < 13 ; j++ ) { - if ( !( flags & (1 << j) ) ) - continue; - finger = &sort[ j*(count + 1) ]; - for ( k = 1 ; k < count ; k++ ) { - if ( finger[k].d < finger[k-1].d ) - error( "Sorting failed, ascending array." ); - if ( finger[k].i >= count ) - error( "Sorting failed, indices borked." ); - } - } */ + for (j = 0; j < 13; j++) { + if (!(flags & (1 << j))) continue; + finger = &sort[j * (count + 1)]; + for (k = 1; k < count; k++) { + if (finger[k].d < finger[k - 1].d) + error("Sorting failed, ascending array."); + if (finger[k].i >= count) error("Sorting failed, indices borked."); + } + } +#endif if (clock) TIMER_TOC(timer_dosort); } @@ -426,7 +422,6 @@ void runner_do_init(struct runner *r, struct cell *c, int timer) { * @param r The runner thread. * @param c The cell. */ - void runner_do_ghost(struct runner *r, struct cell *c) { struct part *p, *parts = c->parts; @@ -553,8 +548,13 @@ void runner_do_ghost(struct runner *r, struct cell *c) { } - /* Otherwise, sub interaction? */ - else if (l->t->type == task_type_sub) { + /* Otherwise, sub-self interaction? */ + else if (l->t->type == task_type_sub_self) + runner_dosub_subset_density(r, finger, parts, pid, count, NULL, -1, + 1); + + /* Otherwise, sub-pair interaction? */ + else if (l->t->type == task_type_sub_pair) { /* Left or right? */ if (l->t->ci == finger) @@ -760,7 +760,7 @@ void runner_do_kick_fixdt(struct runner *r, struct cell *c, int timer) { struct gpart *const gp = &gparts[k]; /* If the g-particle has no counterpart */ - if (gp->id < 0) { + if (gp->id_or_neg_offset > 0) { /* First, finish the force calculation */ gravity_end_force(gp); @@ -872,7 +872,7 @@ void runner_do_kick(struct runner *r, struct cell *c, int timer) { struct gpart *const gp = &gparts[k]; /* If the g-particle has no counterpart and needs to be kicked */ - if (gp->id < 0) { + if (gp->id_or_neg_offset > 0) { if (gp->ti_end <= ti_current) { @@ -980,19 +980,36 @@ void runner_do_recv_cell(struct runner *r, struct cell *c, int timer) { int ti_end_max = 0; float h_max = 0.f; - /* Collect everything... */ - for (size_t k = 0; k < nr_parts; k++) { - const int ti_end = parts[k].ti_end; - // if(ti_end < ti_current) error("Received invalid particle !"); - ti_end_min = min(ti_end_min, ti_end); - ti_end_max = max(ti_end_max, ti_end); - h_max = fmaxf(h_max, parts[k].h); + /* If this cell is a leaf, collect the particle data. */ + if (!c->split) { + + /* Collect everything... */ + for (size_t k = 0; k < nr_parts; k++) { + const int ti_end = parts[k].ti_end; + // if(ti_end < ti_current) error("Received invalid particle !"); + ti_end_min = min(ti_end_min, ti_end); + ti_end_max = max(ti_end_max, ti_end); + h_max = fmaxf(h_max, parts[k].h); + } + for (size_t k = 0; k < nr_gparts; k++) { + const int ti_end = gparts[k].ti_end; + // if(ti_end < ti_current) error("Received invalid particle !"); + ti_end_min = min(ti_end_min, ti_end); + ti_end_max = max(ti_end_max, ti_end); + } + } - for (size_t k = 0; k < nr_gparts; k++) { - const int ti_end = gparts[k].ti_end; - // if(ti_end < ti_current) error("Received invalid particle !"); - ti_end_min = min(ti_end_min, ti_end); - ti_end_max = max(ti_end_max, ti_end); + + /* Otherwise, recurse and collect. */ + else { + for (int k = 0; k < 8; k++) { + if (c->progeny[k] != NULL) { + runner_do_recv_cell(r, c->progeny[k], 0); + ti_end_min = min(ti_end_min, c->progeny[k]->ti_end_min); + ti_end_max = max(ti_end_max, c->progeny[k]->ti_end_max); + h_max = fmaxf(h_max, c->progeny[k]->h_max); + } + } } /* ... and store. */ @@ -1066,13 +1083,19 @@ void *runner_main(void *data) { case task_type_sort: runner_do_sort(r, ci, t->flags, 1); break; - case task_type_sub: + case task_type_sub_self: + if (t->subtype == task_subtype_density) + runner_dosub_self1_density(r, ci, 1); + else if (t->subtype == task_subtype_force) + runner_dosub_self2_force(r, ci, 1); + else + error("Unknown task subtype."); + break; + case task_type_sub_pair: if (t->subtype == task_subtype_density) - runner_dosub1_density(r, ci, cj, t->flags, 1); + runner_dosub_pair1_density(r, ci, cj, t->flags, 1); else if (t->subtype == task_subtype_force) - runner_dosub2_force(r, ci, cj, t->flags, 1); - else if (t->subtype == task_subtype_grav) - runner_dosub_grav(r, ci, cj, 1); + runner_dosub_pair2_force(r, ci, cj, t->flags, 1); else error("Unknown task subtype."); break; @@ -1092,24 +1115,17 @@ void *runner_main(void *data) { runner_do_kick_fixdt(r, ci, 1); break; case task_type_send: + if (t->subtype == task_subtype_tend) { + free(t->buff); + } break; case task_type_recv: - runner_do_recv_cell(r, ci, 1); - break; - case task_type_grav_pp: - if (t->cj == NULL) - runner_doself_grav(r, t->ci); - else - runner_dopair_grav(r, t->ci, t->cj); - break; - case task_type_grav_mm: - runner_dograv_mm(r, t->ci, t->cj); - break; - case task_type_grav_up: - runner_dograv_up(r, t->ci); - break; - case task_type_grav_down: - runner_dograv_down(r, t->ci); + if (t->subtype == task_subtype_tend) { + cell_unpack_ti_ends(ci, t->buff); + free(t->buff); + } else { + runner_do_recv_cell(r, ci, 1); + } break; case task_type_grav_external: runner_do_grav_external(r, t->ci, 1); diff --git a/src/runner.h b/src/runner.h index 758b6cf57dbc1a46b6fc068a23615f3b28c8707e..6838b959955c4e54e208b8d2d16339e7fdb1740f 100644 --- a/src/runner.h +++ b/src/runner.h @@ -23,13 +23,12 @@ #ifndef SWIFT_RUNNER_H #define SWIFT_RUNNER_H -/* Includes. */ -#include "cell.h" -#include "inline.h" - extern const double runner_shift[13][3]; extern const char runner_flip[27]; +struct cell; +struct engine; + /* A struct representing a runner's thread and its data. */ struct runner { diff --git a/src/runner_doiact.h b/src/runner_doiact.h index e0c13e4e0adb101d9a8975651f24225551277f43..4da83b940d55460c4bf8d2f650534aedf94dbd5d 100644 --- a/src/runner_doiact.h +++ b/src/runner_doiact.h @@ -1,7 +1,7 @@ - /******************************************************************************* * This file is part of SWIFT. * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) + * 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk) * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published @@ -18,10 +18,6 @@ * ******************************************************************************/ -/* Includes. */ -#include "cell.h" -#include "part.h" - /* Before including this file, define FUNCTION, which is the name of the interaction function. This creates the interaction functions runner_dopair_FUNCTION, runner_dopair_FUNCTION_naive, runner_doself_FUNCTION, @@ -57,11 +53,17 @@ #define _DOSELF_SUBSET(f) PASTE(runner_doself_subset, f) #define DOSELF_SUBSET _DOSELF_SUBSET(FUNCTION) -#define _DOSUB1(f) PASTE(runner_dosub1, f) -#define DOSUB1 _DOSUB1(FUNCTION) +#define _DOSUB_SELF1(f) PASTE(runner_dosub_self1, f) +#define DOSUB_SELF1 _DOSUB_SELF1(FUNCTION) + +#define _DOSUB_PAIR1(f) PASTE(runner_dosub_pair1, f) +#define DOSUB_PAIR1 _DOSUB_PAIR1(FUNCTION) + +#define _DOSUB_SELF2(f) PASTE(runner_dosub_self2, f) +#define DOSUB_SELF2 _DOSUB_SELF2(FUNCTION) -#define _DOSUB2(f) PASTE(runner_dosub2, f) -#define DOSUB2 _DOSUB2(FUNCTION) +#define _DOSUB_PAIR2(f) PASTE(runner_dosub_pair2, f) +#define DOSUB_PAIR2 _DOSUB_PAIR2(FUNCTION) #define _DOSUB_SUBSET(f) PASTE(runner_dosub_subset, f) #define DOSUB_SUBSET _DOSUB_SUBSET(FUNCTION) @@ -72,14 +74,23 @@ #define _IACT(f) PASTE(runner_iact, f) #define IACT _IACT(FUNCTION) +#define _IACT_NONSYM_VEC(f) PASTE(runner_iact_nonsym_vec, f) +#define IACT_NONSYM_VEC _IACT_NONSYM_VEC(FUNCTION) + +#define _IACT_VEC(f) PASTE(runner_iact_vec, f) +#define IACT_VEC _IACT_VEC(FUNCTION) + #define _TIMER_DOSELF(f) PASTE(timer_doself, f) #define TIMER_DOSELF _TIMER_DOSELF(FUNCTION) #define _TIMER_DOPAIR(f) PASTE(timer_dopair, f) #define TIMER_DOPAIR _TIMER_DOPAIR(FUNCTION) -#define _TIMER_DOSUB(f) PASTE(timer_dosub, f) -#define TIMER_DOSUB _TIMER_DOSUB(FUNCTION) +#define _TIMER_DOSUB_SELF(f) PASTE(timer_dosub_self, f) +#define TIMER_DOSUB_SELF _TIMER_DOSUB_SELF(FUNCTION) + +#define _TIMER_DOSUB_PAIR(f) PASTE(timer_dosub_pair, f) +#define TIMER_DOSUB_PAIR _TIMER_DOSUB_PAIR(FUNCTION) #define _TIMER_DOSELF_SUBSET(f) PASTE(timer_doself_subset, f) #define TIMER_DOSELF_SUBSET _TIMER_DOSELF_SUBSET(FUNCTION) @@ -87,12 +98,6 @@ #define _TIMER_DOPAIR_SUBSET(f) PASTE(timer_dopair_subset, f) #define TIMER_DOPAIR_SUBSET _TIMER_DOPAIR_SUBSET(FUNCTION) -#define _IACT_NONSYM_VEC(f) PASTE(runner_iact_nonsym_vec, f) -#define IACT_NONSYM_VEC _IACT_NONSYM_VEC(FUNCTION) - -#define _IACT_VEC(f) PASTE(runner_iact_vec, f) -#define IACT_VEC _IACT_VEC(FUNCTION) - /** * @brief Compute the interactions between a cell pair. * @@ -100,18 +105,14 @@ * @param ci The first #cell. * @param cj The second #cell. */ - void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci, struct cell *restrict cj) { - struct engine *e = r->e; - int pid, pjd, k, count_i = ci->count, count_j = cj->count; - double shift[3] = {0.0, 0.0, 0.0}; - struct part *restrict parts_i = ci->parts, *restrict parts_j = cj->parts; - struct part *restrict pi, *restrict pj; - double pix[3]; - float dx[3], hi, hig2, r2; + const struct engine *e = r->e; const int ti_current = e->ti_current; + + error("Don't use in actual runs ! Slow code !"); + #ifdef VECTORIZE int icount = 0; float r2q[VEC_SIZE] __attribute__((aligned(16))); @@ -120,44 +121,47 @@ void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci, float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; #endif + TIMER_TIC /* Anything to do here? */ if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return; + const int count_i = ci->count; + const int count_j = cj->count; + struct part *restrict parts_i = ci->parts; + struct part *restrict parts_j = cj->parts; + /* Get the relative distance between the pairs, wrapping. */ - for (k = 0; k < 3; k++) { + double shift[3] = {0.0, 0.0, 0.0}; + for (int k = 0; k < 3; k++) { if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2) shift[k] = e->s->dim[k]; else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2) shift[k] = -e->s->dim[k]; } - /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with - %i/%i parts and shift = [ %g %g %g ].\n" , - ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , - cj->loc[2] , - ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); - tic = getticks(); */ - /* Loop over the parts in ci. */ - for (pid = 0; pid < count_i; pid++) { + for (int pid = 0; pid < count_i; pid++) { /* Get a hold of the ith part in ci. */ - pi = &parts_i[pid]; - for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; + struct part *restrict pi = &parts_i[pid]; + const float hi = pi->h; + + double pix[3]; + for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + const float hig2 = hi * hi * kernel_gamma2; /* Loop over the parts in cj. */ - for (pjd = 0; pjd < count_j; pjd++) { + for (int pjd = 0; pjd < count_j; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts_j[pjd]; + struct part *restrict pj = &parts_j[pjd]; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pix[k] - pj->x[k]; r2 += dx[k] * dx[k]; } @@ -198,7 +202,7 @@ void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci, #ifdef VECTORIZE /* Pick up any leftovers. */ if (icount > 0) - for (k = 0; k < icount; k++) + for (int k = 0; k < icount; k++) IACT(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]); #endif @@ -207,12 +211,10 @@ void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci, void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) { - int pid, pjd, k, count = c->count; - struct part *restrict parts = c->parts; - struct part *restrict pi, *restrict pj; - double pix[3] = {0.0, 0.0, 0.0}; - float dx[3], hi, hig2, r2; const int ti_current = r->e->ti_current; + + error("Don't use in actual runs ! Slow code !"); + #ifdef VECTORIZE int icount = 0; float r2q[VEC_SIZE] __attribute__((aligned(16))); @@ -221,38 +223,34 @@ void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) { float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; #endif + TIMER_TIC /* Anything to do here? */ if (c->ti_end_min > ti_current) return; - /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with - %i/%i parts and shift = [ %g %g %g ].\n" , - ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , - cj->loc[2] , - ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); - tic = getticks(); */ + const int count = c->count; + struct part *restrict parts = c->parts; /* Loop over the parts in ci. */ - for (pid = 0; pid < count; pid++) { + for (int pid = 0; pid < count; pid++) { /* Get a hold of the ith part in ci. */ - pi = &parts[pid]; - pix[0] = pi->x[0]; - pix[1] = pi->x[1]; - pix[2] = pi->x[2]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; + struct part *restrict pi = &parts[pid]; + const double pix[3] = {pi->x[0], pi->x[1], pi->x[2]}; + const float hi = pi->h; + const float hig2 = hi * hi * kernel_gamma2; /* Loop over the parts in cj. */ - for (pjd = pid + 1; pjd < count; pjd++) { + for (int pjd = pid + 1; pjd < count; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts[pjd]; + struct part *restrict pj = &parts[pjd]; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pix[k] - pj->x[k]; r2 += dx[k] * dx[k]; } @@ -293,7 +291,7 @@ void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) { #ifdef VECTORIZE /* Pick up any leftovers. */ if (icount > 0) - for (k = 0; k < icount; k++) + for (int k = 0; k < icount; k++) IACT(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]); #endif @@ -311,18 +309,121 @@ void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) { * @param count The number of particles in @c ind. * @param cj The second #cell. */ +void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci, + struct part *restrict parts_i, int *restrict ind, + int count, struct cell *restrict cj) { + + struct engine *e = r->e; + + error("Don't use in actual runs ! Slow code !"); + +#ifdef VECTORIZE + int icount = 0; + float r2q[VEC_SIZE] __attribute__((aligned(16))); + float hiq[VEC_SIZE] __attribute__((aligned(16))); + float hjq[VEC_SIZE] __attribute__((aligned(16))); + float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); + struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; +#endif + + TIMER_TIC + + const int count_j = cj->count; + struct part *restrict parts_j = cj->parts; + + /* Get the relative distance between the pairs, wrapping. */ + double shift[3] = {0.0, 0.0, 0.0}; + for (int k = 0; k < 3; k++) { + if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2) + shift[k] = e->s->dim[k]; + else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2) + shift[k] = -e->s->dim[k]; + } + + /* Loop over the parts_i. */ + for (int pid = 0; pid < count; pid++) { + + /* Get a hold of the ith part in ci. */ + struct part *restrict pi = &parts_i[ind[pid]]; + double pix[3]; + for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + const float hi = pi->h; + const float hig2 = hi * hi * kernel_gamma2; + + /* Loop over the parts in cj. */ + for (int pjd = 0; pjd < count_j; pjd++) { + + /* Get a pointer to the jth particle. */ + struct part *restrict pj = &parts_j[pjd]; + + /* Compute the pairwise distance. */ + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { + dx[k] = pix[k] - pj->x[k]; + r2 += dx[k] * dx[k]; + } + + /* Hit or miss? */ + if (r2 < hig2) { + +#ifndef VECTORIZE + + IACT_NONSYM(r2, dx, hi, pj->h, pi, pj); + +#else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3 * icount + 0] = dx[0]; + dxq[3 * icount + 1] = dx[1]; + dxq[3 * icount + 2] = dx[2]; + hiq[icount] = hi; + hjq[icount] = pj->h; + piq[icount] = pi; + pjq[icount] = pj; + icount += 1; + + /* Flush? */ + if (icount == VEC_SIZE) { + IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq); + icount = 0; + } + +#endif + } + + } /* loop over the parts in cj. */ + + } /* loop over the parts in ci. */ + +#ifdef VECTORIZE + /* Pick up any leftovers. */ + if (icount > 0) + for (int k = 0; k < icount; k++) + IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]); +#endif + + TIMER_TOC(timer_dopair_subset); +} +/** + * @brief Compute the interactions between a cell pair, but only for the + * given indices in ci. + * + * @param r The #runner. + * @param ci The first #cell. + * @param parts_i The #part to interact with @c cj. + * @param ind The list of indices of particles in @c ci to interact with. + * @param count The number of particles in @c ind. + * @param cj The second #cell. + */ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, struct part *restrict parts_i, int *restrict ind, int count, struct cell *restrict cj) { struct engine *e = r->e; - int pid, pjd, sid, k, count_j = cj->count, flipped; - double shift[3] = {0.0, 0.0, 0.0}; - struct part *restrict pi, *restrict pj, *restrict parts_j = cj->parts; - double pix[3]; - float dx[3], hi, hig2, r2, di, dxj; - struct entry *sort_j; + #ifdef VECTORIZE int icount = 0; float r2q[VEC_SIZE] __attribute__((aligned(16))); @@ -331,10 +432,15 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; #endif + TIMER_TIC + const int count_j = cj->count; + struct part *restrict parts_j = cj->parts; + /* Get the relative distance between the pairs, wrapping. */ - for (k = 0; k < 3; k++) { + double shift[3] = {0.0, 0.0, 0.0}; + for (int k = 0; k < 3; k++) { if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2) shift[k] = e->s->dim[k]; else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2) @@ -342,52 +448,50 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, } /* Get the sorting index. */ - for (sid = 0, k = 0; k < 3; k++) + int sid = 0; + for (int k = 0; k < 3; k++) sid = 3 * sid + ((cj->loc[k] - ci->loc[k] + shift[k] < 0) ? 0 : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1); /* Switch the cells around? */ - flipped = runner_flip[sid]; + const int flipped = runner_flip[sid]; sid = sortlistID[sid]; /* Have the cells been sorted? */ if (!(cj->sorted & (1 << sid))) error("Trying to interact unsorted cells."); - /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with - %i/%i parts and shift = [ %g %g %g ].\n" , - ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , - cj->loc[2] , - ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); - tic = getticks(); */ - /* Pick-out the sorted lists. */ - sort_j = &cj->sort[sid * (cj->count + 1)]; - dxj = cj->dx_max; + const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)]; + const float dxj = cj->dx_max; /* Parts are on the left? */ if (!flipped) { /* Loop over the parts_i. */ - for (pid = 0; pid < count; pid++) { + for (int pid = 0; pid < count; pid++) { /* Get a hold of the ith part in ci. */ - pi = &parts_i[ind[pid]]; - for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; - di = hi * kernel_gamma + dxj + pix[0] * runner_shift[sid][0] + - pix[1] * runner_shift[sid][1] + pix[2] * runner_shift[sid][2]; + struct part *restrict pi = &parts_i[ind[pid]]; + double pix[3]; + for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + + const float hi = pi->h; + const float hig2 = hi * hi * kernel_gamma2; + const float di = hi * kernel_gamma + dxj + pix[0] * runner_shift[sid][0] + + pix[1] * runner_shift[sid][1] + + pix[2] * runner_shift[sid][2]; /* Loop over the parts in cj. */ - for (pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) { + for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts_j[sort_j[pjd].i]; + struct part *restrict pj = &parts_j[sort_j[pjd].i]; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pix[k] - pj->x[k]; r2 += dx[k] * dx[k]; } @@ -431,25 +535,28 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, else { /* Loop over the parts_i. */ - for (pid = 0; pid < count; pid++) { + for (int pid = 0; pid < count; pid++) { /* Get a hold of the ith part in ci. */ - pi = &parts_i[ind[pid]]; - for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; - di = -hi * kernel_gamma - dxj + pix[0] * runner_shift[sid][0] + - pix[1] * runner_shift[sid][1] + pix[2] * runner_shift[sid][2]; + struct part *restrict pi = &parts_i[ind[pid]]; + double pix[3]; + for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + const float hi = pi->h; + const float hig2 = hi * hi * kernel_gamma2; + const float di = + -hi * kernel_gamma - dxj + pix[0] * runner_shift[sid][0] + + pix[1] * runner_shift[sid][1] + pix[2] * runner_shift[sid][2]; /* Loop over the parts in cj. */ - for (pjd = count_j - 1; pjd >= 0 && di < sort_j[pjd].d; pjd--) { + for (int pjd = count_j - 1; pjd >= 0 && di < sort_j[pjd].d; pjd--) { /* Get a pointer to the jth particle. */ - pj = &parts_j[sort_j[pjd].i]; + struct part *restrict pj = &parts_j[sort_j[pjd].i]; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pix[k] - pj->x[k]; r2 += dx[k] * dx[k]; } @@ -491,119 +598,7 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, #ifdef VECTORIZE /* Pick up any leftovers. */ if (icount > 0) - for (k = 0; k < icount; k++) - IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]); -#endif - - TIMER_TOC(timer_dopair_subset); -} - -/** - * @brief Compute the interactions between a cell pair, but only for the - * given indices in ci. - * - * @param r The #runner. - * @param ci The first #cell. - * @param parts_i The #part to interact with @c cj. - * @param ind The list of indices of particles in @c ci to interact with. - * @param count The number of particles in @c ind. - * @param cj The second #cell. - */ - -void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci, - struct part *restrict parts_i, int *restrict ind, - int count, struct cell *restrict cj) { - - struct engine *e = r->e; - int pid, pjd, k, count_j = cj->count; - double shift[3] = {0.0, 0.0, 0.0}; - struct part *restrict pi, *restrict pj, *restrict parts_j = cj->parts; - double pix[3]; - float dx[3], hi, hig2, r2; -#ifdef VECTORIZE - int icount = 0; - float r2q[VEC_SIZE] __attribute__((aligned(16))); - float hiq[VEC_SIZE] __attribute__((aligned(16))); - float hjq[VEC_SIZE] __attribute__((aligned(16))); - float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); - struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; -#endif - TIMER_TIC - - /* Get the relative distance between the pairs, wrapping. */ - for (k = 0; k < 3; k++) { - if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2) - shift[k] = e->s->dim[k]; - else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2) - shift[k] = -e->s->dim[k]; - } - - /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with - %i/%i parts and shift = [ %g %g %g ].\n" , - ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , - cj->loc[2] , - ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); - tic = getticks(); */ - - /* Loop over the parts_i. */ - for (pid = 0; pid < count; pid++) { - - /* Get a hold of the ith part in ci. */ - pi = &parts_i[ind[pid]]; - for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; - - /* Loop over the parts in cj. */ - for (pjd = 0; pjd < count_j; pjd++) { - - /* Get a pointer to the jth particle. */ - pj = &parts_j[pjd]; - - /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { - dx[k] = pix[k] - pj->x[k]; - r2 += dx[k] * dx[k]; - } - - /* Hit or miss? */ - if (r2 < hig2) { - -#ifndef VECTORIZE - - IACT_NONSYM(r2, dx, hi, pj->h, pi, pj); - -#else - - /* Add this interaction to the queue. */ - r2q[icount] = r2; - dxq[3 * icount + 0] = dx[0]; - dxq[3 * icount + 1] = dx[1]; - dxq[3 * icount + 2] = dx[2]; - hiq[icount] = hi; - hjq[icount] = pj->h; - piq[icount] = pi; - pjq[icount] = pj; - icount += 1; - - /* Flush? */ - if (icount == VEC_SIZE) { - IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq); - icount = 0; - } - -#endif - } - - } /* loop over the parts in cj. */ - - } /* loop over the parts in ci. */ - -#ifdef VECTORIZE - /* Pick up any leftovers. */ - if (icount > 0) - for (k = 0; k < icount; k++) + for (int k = 0; k < icount; k++) IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]); #endif @@ -620,15 +615,9 @@ void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci, * @param ind The list of indices of particles in @c ci to interact with. * @param count The number of particles in @c ind. */ - void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci, struct part *restrict parts, int *restrict ind, int count) { - int pid, pjd, k, count_i = ci->count; - struct part *restrict parts_j = ci->parts; - struct part *restrict pi, *restrict pj; - double pix[3] = {0.0, 0.0, 0.0}; - float dx[3], hi, hig2, r2; #ifdef VECTORIZE int icount = 0; float r2q[VEC_SIZE] __attribute__((aligned(16))); @@ -637,35 +626,31 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci, float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; #endif + TIMER_TIC - /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with - %i/%i parts and shift = [ %g %g %g ].\n" , - ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , - cj->loc[2] , - ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); - tic = getticks(); */ + const int count_i = ci->count; + struct part *restrict parts_j = ci->parts; /* Loop over the parts in ci. */ - for (pid = 0; pid < count; pid++) { + for (int pid = 0; pid < count; pid++) { /* Get a hold of the ith part in ci. */ - pi = &parts[ind[pid]]; - pix[0] = pi->x[0]; - pix[1] = pi->x[1]; - pix[2] = pi->x[2]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; + struct part *restrict pi = &parts[ind[pid]]; + const double pix[3] = {pi->x[0], pi->x[1], pi->x[2]}; + const float hi = pi->h; + const float hig2 = hi * hi * kernel_gamma2; /* Loop over the parts in cj. */ - for (pjd = 0; pjd < count_i; pjd++) { + for (int pjd = 0; pjd < count_i; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts_j[pjd]; + struct part *restrict pj = &parts_j[pjd]; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pix[k] - pj->x[k]; r2 += dx[k] * dx[k]; } @@ -706,7 +691,7 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci, #ifdef VECTORIZE /* Pick up any leftovers. */ if (icount > 0) - for (k = 0; k < icount; k++) + for (int k = 0; k < icount; k++) IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]); #endif @@ -714,26 +699,17 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci, } /** - * @brief Compute the interactions between a cell pair. + * @brief Compute the interactions between a cell pair (non-symmetric). * * @param r The #runner. * @param ci The first #cell. * @param cj The second #cell. */ - void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) { struct engine *restrict e = r->e; - int pid, pjd, k, sid; - double rshift, shift[3] = {0.0, 0.0, 0.0}; - struct entry *restrict sort_i, *restrict sort_j; - struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j; - double pix[3], pjx[3], di, dj; - float dx[3], hi, hig2, hj, hjg2, r2, dx_max; - double hi_max, hj_max; - double di_max, dj_min; - int count_i, count_j; const int ti_current = e->ti_current; + #ifdef VECTORIZE int icount = 0; float r2q[VEC_SIZE] __attribute__((aligned(16))); @@ -742,60 +718,64 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) { float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; #endif + TIMER_TIC /* Anything to do here? */ if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return; /* Get the sort ID. */ - sid = space_getsid(e->s, &ci, &cj, shift); + double shift[3] = {0.0, 0.0, 0.0}; + const int sid = space_getsid(e->s, &ci, &cj, shift); /* Have the cells been sorted? */ if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid))) error("Trying to interact unsorted cells."); /* Get the cutoff shift. */ - for (rshift = 0.0, k = 0; k < 3; k++) - rshift += shift[k] * runner_shift[sid][k]; + double rshift = 0.0; + for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k]; /* Pick-out the sorted lists. */ - sort_i = &ci->sort[sid * (ci->count + 1)]; - sort_j = &cj->sort[sid * (cj->count + 1)]; + const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)]; + const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)]; /* Get some other useful values. */ - hi_max = ci->h_max * kernel_gamma - rshift; - hj_max = cj->h_max * kernel_gamma; - count_i = ci->count; - count_j = cj->count; - parts_i = ci->parts; - parts_j = cj->parts; - di_max = sort_i[count_i - 1].d - rshift; - dj_min = sort_j[0].d; - dx_max = (ci->dx_max + cj->dx_max); + const double hi_max = ci->h_max * kernel_gamma - rshift; + const double hj_max = cj->h_max * kernel_gamma; + const int count_i = ci->count; + const int count_j = cj->count; + struct part *restrict parts_i = ci->parts; + struct part *restrict parts_j = cj->parts; + const double di_max = sort_i[count_i - 1].d - rshift; + const double dj_min = sort_j[0].d; + const float dx_max = (ci->dx_max + cj->dx_max); /* Loop over the parts in ci. */ - for (pid = count_i - 1; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; - pid--) { + for (int pid = count_i - 1; + pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) { /* Get a hold of the ith part in ci. */ - pi = &parts_i[sort_i[pid].i]; + struct part *restrict pi = &parts_i[sort_i[pid].i]; if (pi->ti_end > ti_current) continue; - hi = pi->h; - di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift; + const float hi = pi->h; + const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift; if (di < dj_min) continue; - hig2 = hi * hi * kernel_gamma2; - for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + double pix[3]; + for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + const float hig2 = hi * hi * kernel_gamma2; /* Loop over the parts in cj. */ - for (pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) { + for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts_j[sort_j[pjd].i]; + struct part *restrict pj = &parts_j[sort_j[pjd].i]; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pix[k] - pj->x[k]; r2 += dx[k] * dx[k]; } @@ -833,33 +813,31 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) { } /* loop over the parts in ci. */ - /* printf( "runner_dopair: first half took %.3f %s...\n" , - clocks_from_ticks(getticks() - tic), clocks_getunit()); - tic = getticks(); */ - /* Loop over the parts in cj. */ - for (pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max; + for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max; pjd++) { /* Get a hold of the jth part in cj. */ - pj = &parts_j[sort_j[pjd].i]; + struct part *restrict pj = &parts_j[sort_j[pjd].i]; if (pj->ti_end > ti_current) continue; - hj = pj->h; - dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; + const float hj = pj->h; + const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; if (dj > di_max) continue; - for (k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k]; - hjg2 = hj * hj * kernel_gamma2; + double pjx[3]; + for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k]; + const float hjg2 = hj * hj * kernel_gamma2; /* Loop over the parts in ci. */ - for (pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) { + for (int pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) { /* Get a pointer to the jth particle. */ - pi = &parts_i[sort_i[pid].i]; + struct part *restrict pi = &parts_i[sort_i[pid].i]; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pjx[k] - pi->x[k]; r2 += dx[k] * dx[k]; } @@ -900,28 +878,25 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) { #ifdef VECTORIZE /* Pick up any leftovers. */ if (icount > 0) - for (k = 0; k < icount; k++) + for (int k = 0; k < icount; k++) IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]); #endif TIMER_TOC(TIMER_DOPAIR); } +/** + * @brief Compute the interactions between a cell pair (symmetric) + * + * @param r The #runner. + * @param ci The first #cell. + * @param cj The second #cell. + */ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { struct engine *restrict e = r->e; - int pid, pjd, k, sid; - double rshift, shift[3] = {0.0, 0.0, 0.0}; - struct entry *sort_i, *sort_j; - struct entry *sortdt_i = NULL, *sortdt_j = NULL; - int countdt_i = 0, countdt_j = 0; - struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j; - double pix[3], pjx[3], di, dj; - float dx[3], hi, hig2, hj, hjg2, r2, dx_max; - double hi_max, hj_max; - double di_max, dj_min; - int count_i, count_j; const int ti_current = e->ti_current; + #ifdef VECTORIZE int icount1 = 0; float r2q1[VEC_SIZE] __attribute__((aligned(16))); @@ -936,38 +911,42 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { float dxq2[3 * VEC_SIZE] __attribute__((aligned(16))); struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE]; #endif + TIMER_TIC /* Anything to do here? */ if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return; /* Get the shift ID. */ - sid = space_getsid(e->s, &ci, &cj, shift); + double shift[3] = {0.0, 0.0, 0.0}; + const int sid = space_getsid(e->s, &ci, &cj, shift); /* Have the cells been sorted? */ if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid))) error("Trying to interact unsorted cells."); /* Get the cutoff shift. */ - for (rshift = 0.0, k = 0; k < 3; k++) - rshift += shift[k] * runner_shift[sid][k]; + double rshift = 0.0; + for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k]; /* Pick-out the sorted lists. */ - sort_i = &ci->sort[sid * (ci->count + 1)]; - sort_j = &cj->sort[sid * (cj->count + 1)]; + struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)]; + struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)]; /* Get some other useful values. */ - hi_max = ci->h_max * kernel_gamma - rshift; - hj_max = cj->h_max * kernel_gamma; - count_i = ci->count; - count_j = cj->count; - parts_i = ci->parts; - parts_j = cj->parts; - di_max = sort_i[count_i - 1].d - rshift; - dj_min = sort_j[0].d; - dx_max = (ci->dx_max + cj->dx_max); + const double hi_max = ci->h_max * kernel_gamma - rshift; + const double hj_max = cj->h_max * kernel_gamma; + const int count_i = ci->count; + const int count_j = cj->count; + struct part *restrict parts_i = ci->parts; + struct part *restrict parts_j = cj->parts; + const double di_max = sort_i[count_i - 1].d - rshift; + const double dj_min = sort_j[0].d; + const double dx_max = (ci->dx_max + cj->dx_max); /* Collect the number of parts left and right below dt. */ + int countdt_i = 0, countdt_j = 0; + struct entry *restrict sortdt_i = NULL, *restrict sortdt_j = NULL; if (ci->ti_end_max <= ti_current) { sortdt_i = sort_i; countdt_i = count_i; @@ -975,7 +954,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { if ((sortdt_i = (struct entry *)alloca(sizeof(struct entry) * count_i)) == NULL) error("Failed to allocate dt sortlists."); - for (k = 0; k < count_i; k++) + for (int k = 0; k < count_i; k++) if (parts_i[sort_i[k].i].ti_end <= ti_current) { sortdt_i[countdt_i] = sort_i[k]; countdt_i += 1; @@ -988,7 +967,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { if ((sortdt_j = (struct entry *)alloca(sizeof(struct entry) * count_j)) == NULL) error("Failed to allocate dt sortlists."); - for (k = 0; k < count_j; k++) + for (int k = 0; k < count_j; k++) if (parts_j[sort_j[k].i].ti_end <= ti_current) { sortdt_j[countdt_j] = sort_j[k]; countdt_j += 1; @@ -996,31 +975,33 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { } /* Loop over the parts in ci. */ - for (pid = count_i - 1; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; - pid--) { + for (int pid = count_i - 1; + pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) { /* Get a hold of the ith part in ci. */ - pi = &parts_i[sort_i[pid].i]; - hi = pi->h; - di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift; + struct part *restrict pi = &parts_i[sort_i[pid].i]; + const float hi = pi->h; + const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift; if (di < dj_min) continue; - hig2 = hi * hi * kernel_gamma2; - for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + double pix[3]; + for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + const float hig2 = hi * hi * kernel_gamma2; /* Look at valid dt parts only? */ if (pi->ti_end > ti_current) { /* Loop over the parts in cj within dt. */ - for (pjd = 0; pjd < countdt_j && sortdt_j[pjd].d < di; pjd++) { + for (int pjd = 0; pjd < countdt_j && sortdt_j[pjd].d < di; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts_j[sortdt_j[pjd].i]; - hj = pj->h; + struct part *restrict pj = &parts_j[sortdt_j[pjd].i]; + const float hj = pj->h; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pj->x[k] - pix[k]; r2 += dx[k] * dx[k]; } @@ -1062,15 +1043,16 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { else { /* Loop over the parts in cj. */ - for (pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) { + for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts_j[sort_j[pjd].i]; - hj = pj->h; + struct part *restrict pj = &parts_j[sort_j[pjd].i]; + const float hj = pj->h; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pix[k] - pj->x[k]; r2 += dx[k] * dx[k]; } @@ -1136,36 +1118,34 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { } /* loop over the parts in ci. */ - /* printf( "runner_dopair: first half took %.3f %s...\n" , - clocks_from_ticks(getticks() - tic), clocks_getunit()); - tic = getticks(); */ - /* Loop over the parts in cj. */ - for (pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max; + for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max; pjd++) { /* Get a hold of the jth part in cj. */ - pj = &parts_j[sort_j[pjd].i]; - hj = pj->h; - dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; + struct part *restrict pj = &parts_j[sort_j[pjd].i]; + const float hj = pj->h; + const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; if (dj > di_max) continue; - for (k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k]; - hjg2 = hj * hj * kernel_gamma2; + double pjx[3]; + for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k]; + const float hjg2 = hj * hj * kernel_gamma2; /* Is this particle outside the dt? */ if (pj->ti_end > ti_current) { /* Loop over the parts in ci. */ - for (pid = countdt_i - 1; pid >= 0 && sortdt_i[pid].d > dj; pid--) { + for (int pid = countdt_i - 1; pid >= 0 && sortdt_i[pid].d > dj; pid--) { /* Get a pointer to the jth particle. */ - pi = &parts_i[sortdt_i[pid].i]; - hi = pi->h; + struct part *restrict pi = &parts_i[sortdt_i[pid].i]; + const float hi = pi->h; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pi->x[k] - pjx[k]; r2 += dx[k] * dx[k]; } @@ -1206,15 +1186,16 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { else { /* Loop over the parts in ci. */ - for (pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) { + for (int pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) { /* Get a pointer to the jth particle. */ - pi = &parts_i[sort_i[pid].i]; - hi = pi->h; + struct part *restrict pi = &parts_i[sort_i[pid].i]; + const float hi = pi->h; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pjx[k] - pi->x[k]; r2 += dx[k] * dx[k]; } @@ -1283,10 +1264,10 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { #ifdef VECTORIZE /* Pick up any leftovers. */ if (icount1 > 0) - for (k = 0; k < icount1; k++) + for (int k = 0; k < icount1; k++) IACT_NONSYM(r2q1[k], &dxq1[3 * k], hiq1[k], hjq1[k], piq1[k], pjq1[k]); if (icount2 > 0) - for (k = 0; k < icount2; k++) + for (int k = 0; k < icount2; k++) IACT(r2q2[k], &dxq2[3 * k], hiq2[k], hjq2[k], piq2[k], pjq2[k]); #endif @@ -1294,20 +1275,15 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { } /** - * @brief Compute the cell self-interaction. + * @brief Compute the cell self-interaction (non-symmetric). * * @param r The #runner. * @param c The #cell. */ - void DOSELF1(struct runner *r, struct cell *restrict c) { - int k, pid, pjd, count = c->count; - double pix[3]; - float dx[3], hi, hj, hig2, r2; - struct part *restrict parts = c->parts, *restrict pi, *restrict pj; const int ti_current = r->e->ti_current; - int firstdt = 0, countdt = 0, *indt = NULL, doj; + #ifdef VECTORIZE int icount1 = 0; float r2q1[VEC_SIZE] __attribute__((aligned(16))); @@ -1324,43 +1300,49 @@ void DOSELF1(struct runner *r, struct cell *restrict c) { #endif TIMER_TIC - /* Set up indt if needed. */ - if (c->ti_end_min > ti_current) - return; - else if (c->ti_end_max > ti_current) { - if ((indt = (int *)alloca(sizeof(int) * count)) == NULL) - error("Failed to allocate indt."); - for (k = 0; k < count; k++) - if (parts[k].ti_end <= ti_current) { - indt[countdt] = k; - countdt += 1; - } - } + if (c->ti_end_min > ti_current) return; + if (c->ti_end_max < ti_current) error("Cell in an impossible time-zone"); + + struct part *restrict parts = c->parts; + const int count = c->count; + + /* Set up indt. */ + int *indt = NULL; + int countdt = 0, firstdt = 0; + if ((indt = (int *)alloca(sizeof(int) * count)) == NULL) + error("Failed to allocate indt."); + for (int k = 0; k < count; k++) + if (parts[k].ti_end <= ti_current) { + indt[countdt] = k; + countdt += 1; + } /* Loop over the particles in the cell. */ - for (pid = 0; pid < count; pid++) { + for (int pid = 0; pid < count; pid++) { /* Get a pointer to the ith particle. */ - pi = &parts[pid]; + struct part *restrict pi = &parts[pid]; /* Get the particle position and radius. */ - for (k = 0; k < 3; k++) pix[k] = pi->x[k]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; + double pix[3]; + for (int k = 0; k < 3; k++) pix[k] = pi->x[k]; + const float hi = pi->h; + const float hig2 = hi * hi * kernel_gamma2; /* Is the ith particle inactive? */ if (pi->ti_end > ti_current) { /* Loop over the other particles .*/ - for (pjd = firstdt; pjd < countdt; pjd++) { + for (int pjd = firstdt; pjd < countdt; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts[indt[pjd]]; - hj = pj->h; + struct part *restrict pj = &parts[indt[pjd]]; + const float hj = pj->h; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pj->x[k] - pix[k]; r2 += dx[k] * dx[k]; } @@ -1405,19 +1387,21 @@ void DOSELF1(struct runner *r, struct cell *restrict c) { firstdt += 1; /* Loop over the other particles .*/ - for (pjd = pid + 1; pjd < count; pjd++) { + for (int pjd = pid + 1; pjd < count; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts[pjd]; - hj = pj->h; + struct part *restrict pj = &parts[pjd]; + const float hj = pj->h; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pix[k] - pj->x[k]; r2 += dx[k] * dx[k]; } - doj = (pj->ti_end <= ti_current) && (r2 < hj * hj * kernel_gamma2); + const int doj = + (pj->ti_end <= ti_current) && (r2 < hj * hj * kernel_gamma2); /* Hit or miss? */ if (r2 < hig2 || doj) { @@ -1508,24 +1492,26 @@ void DOSELF1(struct runner *r, struct cell *restrict c) { #ifdef VECTORIZE /* Pick up any leftovers. */ if (icount1 > 0) - for (k = 0; k < icount1; k++) + for (int k = 0; k < icount1; k++) IACT_NONSYM(r2q1[k], &dxq1[3 * k], hiq1[k], hjq1[k], piq1[k], pjq1[k]); if (icount2 > 0) - for (k = 0; k < icount2; k++) + for (int k = 0; k < icount2; k++) IACT(r2q2[k], &dxq2[3 * k], hiq2[k], hjq2[k], piq2[k], pjq2[k]); #endif TIMER_TOC(TIMER_DOSELF); } +/** + * @brief Compute the cell self-interaction (symmetric). + * + * @param r The #runner. + * @param c The #cell. + */ void DOSELF2(struct runner *r, struct cell *restrict c) { - int k, pid, pjd, count = c->count; - double pix[3]; - float dx[3], hi, hj, hig2, r2; - struct part *restrict parts = c->parts, *restrict pi, *restrict pj; const int ti_current = r->e->ti_current; - int firstdt = 0, countdt = 0, *indt = NULL; + #ifdef VECTORIZE int icount1 = 0; float r2q1[VEC_SIZE] __attribute__((aligned(16))); @@ -1542,43 +1528,49 @@ void DOSELF2(struct runner *r, struct cell *restrict c) { #endif TIMER_TIC - /* Set up indt if needed. */ - if (c->ti_end_min > ti_current) - return; - else if (c->ti_end_max > ti_current) { - if ((indt = (int *)alloca(sizeof(int) * count)) == NULL) - error("Failed to allocate indt."); - for (k = 0; k < count; k++) - if (parts[k].ti_end <= ti_current) { - indt[countdt] = k; - countdt += 1; - } - } + if (c->ti_end_min > ti_current) return; + if (c->ti_end_max < ti_current) error("Cell in an impossible time-zone"); + + struct part *restrict parts = c->parts; + const int count = c->count; + + /* Set up indt. */ + int *indt = NULL; + int countdt = 0, firstdt = 0; + if ((indt = (int *)alloca(sizeof(int) * count)) == NULL) + error("Failed to allocate indt."); + for (int k = 0; k < count; k++) + if (parts[k].ti_end <= ti_current) { + indt[countdt] = k; + countdt += 1; + } /* Loop over the particles in the cell. */ - for (pid = 0; pid < count; pid++) { + for (int pid = 0; pid < count; pid++) { /* Get a pointer to the ith particle. */ - pi = &parts[pid]; + struct part *restrict pi = &parts[pid]; /* Get the particle position and radius. */ - for (k = 0; k < 3; k++) pix[k] = pi->x[k]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; + double pix[3]; + for (int k = 0; k < 3; k++) pix[k] = pi->x[k]; + const float hi = pi->h; + const float hig2 = hi * hi * kernel_gamma2; /* Is the ith particle not active? */ if (pi->ti_end > ti_current) { /* Loop over the other particles .*/ - for (pjd = firstdt; pjd < countdt; pjd++) { + for (int pjd = firstdt; pjd < countdt; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts[indt[pjd]]; - hj = pj->h; + struct part *restrict pj = &parts[indt[pjd]]; + const float hj = pj->h; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pj->x[k] - pix[k]; r2 += dx[k] * dx[k]; } @@ -1623,15 +1615,16 @@ void DOSELF2(struct runner *r, struct cell *restrict c) { firstdt += 1; /* Loop over the other particles .*/ - for (pjd = pid + 1; pjd < count; pjd++) { + for (int pjd = pid + 1; pjd < count; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts[pjd]; - hj = pj->h; + struct part *restrict pj = &parts[pjd]; + const float hj = pj->h; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pix[k] - pj->x[k]; r2 += dx[k] * dx[k]; } @@ -1700,10 +1693,10 @@ void DOSELF2(struct runner *r, struct cell *restrict c) { #ifdef VECTORIZE /* Pick up any leftovers. */ if (icount1 > 0) - for (k = 0; k < icount1; k++) + for (int k = 0; k < icount1; k++) IACT_NONSYM(r2q1[k], &dxq1[3 * k], hiq1[k], hjq1[k], piq1[k], pjq1[k]); if (icount2 > 0) - for (k = 0; k < icount2; k++) + for (int k = 0; k < icount2; k++) IACT(r2q2[k], &dxq2[3 * k], hiq2[k], hjq2[k], piq2[k], pjq2[k]); #endif @@ -1711,7 +1704,7 @@ void DOSELF2(struct runner *r, struct cell *restrict c) { } /** - * @brief Compute grouped sub-cell interactions + * @brief Compute grouped sub-cell interactions for pairs * * @param r The #runner. * @param ci The first #cell. @@ -1722,559 +1715,574 @@ void DOSELF2(struct runner *r, struct cell *restrict c) { * @todo Hard-code the sid on the recursive calls to avoid the * redundant computations to find the sid on-the-fly. */ +void DOSUB_PAIR1(struct runner *r, struct cell *ci, struct cell *cj, int sid, + int gettimer) { -void DOSUB1(struct runner *r, struct cell *ci, struct cell *cj, int sid, - int gettimer) { - - int j = 0, k; - double shift[3]; - float h; struct space *s = r->e->s; const int ti_current = r->e->ti_current; TIMER_TIC - /* Is this a single cell? */ - if (cj == NULL) { - - /* Should we even bother? */ - if (ci->ti_end_min > ti_current) return; + /* Should we even bother? */ + if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return; - /* Recurse? */ - if (ci->split) { + /* Get the cell dimensions. */ + const float h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2])); - /* Loop over all progeny. */ - for (k = 0; k < 8; k++) - if (ci->progeny[k] != NULL) { - DOSUB1(r, ci->progeny[k], NULL, -1, 0); - for (j = k + 1; j < 8; j++) - if (ci->progeny[j] != NULL) - DOSUB1(r, ci->progeny[k], ci->progeny[j], -1, 0); - } + /* Get the type of pair if not specified explicitly. */ + // if ( sid < 0 ) + double shift[3]; + sid = space_getsid(s, &ci, &cj, shift); - } + /* Recurse? */ + if (ci->split && cj->split && + fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max < + h / 2) { - /* Otherwise, compute self-interaction. */ - else - DOSELF1(r, ci); + /* Different types of flags. */ + switch (sid) { - } /* self-interaction. */ + /* Regular sub-cell interactions of a single cell. */ + case 0: /* ( 1 , 1 , 1 ) */ + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0); + break; - /* Otherwise, it's a pair interaction. */ - else { + case 1: /* ( 1 , 1 , 0 ) */ + if (ci->progeny[6] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[0], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[1], -1, 0); + break; - /* Should we even bother? */ - if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return; + case 2: /* ( 1 , 1 , -1 ) */ + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0); + break; - /* Get the cell dimensions. */ - h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2])); + case 3: /* ( 1 , 0 , 1 ) */ + if (ci->progeny[5] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[0], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[2], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[2], -1, 0); + break; - /* Get the type of pair if not specified explicitly. */ - // if ( sid < 0 ) - sid = space_getsid(s, &ci, &cj, shift); + case 4: /* ( 1 , 0 , 0 ) */ + if (ci->progeny[4] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[0], -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[1], -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[2], -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) + DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[3], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[0], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[1], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[2], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[3] != NULL) + DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[3], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[0], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[2], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[3] != NULL) + DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[3], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[1], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[2], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[3] != NULL) + DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[3], -1, 0); + break; - /* Recurse? */ - if (ci->split && cj->split && - fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max < - h / 2) { + case 5: /* ( 1 , 0 , -1 ) */ + if (ci->progeny[4] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[1], -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) + DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[3], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[3] != NULL) + DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[3], -1, 0); + break; - /* Different types of flags. */ - switch (sid) { + case 6: /* ( 1 , -1 , 1 ) */ + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[2], -1, 0); + break; - /* Regular sub-cell interactions of a single cell. */ - case 0: /* ( 1 , 1 , 1 ) */ - if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) - DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0); - break; + case 7: /* ( 1 , -1 , 0 ) */ + if (ci->progeny[4] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[2], -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) + DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[3], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[2], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[3] != NULL) + DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[3], -1, 0); + break; - case 1: /* ( 1 , 1 , 0 ) */ - if (ci->progeny[6] != NULL && cj->progeny[0] != NULL) - DOSUB1(r, ci->progeny[6], cj->progeny[0], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) - DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) - DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[1] != NULL) - DOSUB1(r, ci->progeny[7], cj->progeny[1], -1, 0); - break; + case 8: /* ( 1 , -1 , -1 ) */ + if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) + DOSUB_PAIR1(r, ci->progeny[4], cj->progeny[3], -1, 0); + break; - case 2: /* ( 1 , 1 , -1 ) */ - if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) - DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0); - break; + case 9: /* ( 0 , 1 , 1 ) */ + if (ci->progeny[3] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[0], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[4] != NULL) + DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[4], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[4] != NULL) + DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[4], -1, 0); + break; - case 3: /* ( 1 , 0 , 1 ) */ - if (ci->progeny[5] != NULL && cj->progeny[0] != NULL) - DOSUB1(r, ci->progeny[5], cj->progeny[0], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) - DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) - DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[2] != NULL) - DOSUB1(r, ci->progeny[7], cj->progeny[2], -1, 0); - break; + case 10: /* ( 0 , 1 , 0 ) */ + if (ci->progeny[2] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[0], -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[1], -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[4] != NULL) + DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[4], -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[5] != NULL) + DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[5], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[0], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[1], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[4] != NULL) + DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[4], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[5] != NULL) + DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[5], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[0], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[4] != NULL) + DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[4], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[5] != NULL) + DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[5], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[1], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[4] != NULL) + DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[4], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[5] != NULL) + DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[5], -1, 0); + break; - case 4: /* ( 1 , 0 , 0 ) */ - if (ci->progeny[4] != NULL && cj->progeny[0] != NULL) - DOSUB1(r, ci->progeny[4], cj->progeny[0], -1, 0); - if (ci->progeny[4] != NULL && cj->progeny[1] != NULL) - DOSUB1(r, ci->progeny[4], cj->progeny[1], -1, 0); - if (ci->progeny[4] != NULL && cj->progeny[2] != NULL) - DOSUB1(r, ci->progeny[4], cj->progeny[2], -1, 0); - if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) - DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[0] != NULL) - DOSUB1(r, ci->progeny[5], cj->progeny[0], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[1] != NULL) - DOSUB1(r, ci->progeny[5], cj->progeny[1], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) - DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[3] != NULL) - DOSUB1(r, ci->progeny[5], cj->progeny[3], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[0] != NULL) - DOSUB1(r, ci->progeny[6], cj->progeny[0], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) - DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[2] != NULL) - DOSUB1(r, ci->progeny[6], cj->progeny[2], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[3] != NULL) - DOSUB1(r, ci->progeny[6], cj->progeny[3], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) - DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[1] != NULL) - DOSUB1(r, ci->progeny[7], cj->progeny[1], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[2] != NULL) - DOSUB1(r, ci->progeny[7], cj->progeny[2], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[3] != NULL) - DOSUB1(r, ci->progeny[7], cj->progeny[3], -1, 0); - break; + case 11: /* ( 0 , 1 , -1 ) */ + if (ci->progeny[2] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[1], -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[5] != NULL) + DOSUB_PAIR1(r, ci->progeny[2], cj->progeny[5], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[1], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[5] != NULL) + DOSUB_PAIR1(r, ci->progeny[6], cj->progeny[5], -1, 0); + break; - case 5: /* ( 1 , 0 , -1 ) */ - if (ci->progeny[4] != NULL && cj->progeny[1] != NULL) - DOSUB1(r, ci->progeny[4], cj->progeny[1], -1, 0); - if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) - DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) - DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[3] != NULL) - DOSUB1(r, ci->progeny[6], cj->progeny[3], -1, 0); - break; + case 12: /* ( 0 , 0 , 1 ) */ + if (ci->progeny[1] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR1(r, ci->progeny[1], cj->progeny[0], -1, 0); + if (ci->progeny[1] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR1(r, ci->progeny[1], cj->progeny[2], -1, 0); + if (ci->progeny[1] != NULL && cj->progeny[4] != NULL) + DOSUB_PAIR1(r, ci->progeny[1], cj->progeny[4], -1, 0); + if (ci->progeny[1] != NULL && cj->progeny[6] != NULL) + DOSUB_PAIR1(r, ci->progeny[1], cj->progeny[6], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[0], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[2], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[4] != NULL) + DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[4], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[6] != NULL) + DOSUB_PAIR1(r, ci->progeny[3], cj->progeny[6], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[0], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[2], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[4] != NULL) + DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[4], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[6] != NULL) + DOSUB_PAIR1(r, ci->progeny[5], cj->progeny[6], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[2], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[4] != NULL) + DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[4], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[6] != NULL) + DOSUB_PAIR1(r, ci->progeny[7], cj->progeny[6], -1, 0); + break; + } - case 6: /* ( 1 , -1 , 1 ) */ - if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) - DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0); - break; + } - case 7: /* ( 1 , -1 , 0 ) */ - if (ci->progeny[4] != NULL && cj->progeny[2] != NULL) - DOSUB1(r, ci->progeny[4], cj->progeny[2], -1, 0); - if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) - DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) - DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[3] != NULL) - DOSUB1(r, ci->progeny[5], cj->progeny[3], -1, 0); - break; + /* Otherwise, compute the pair directly. */ + else if (ci->ti_end_min <= ti_current || cj->ti_end_min <= ti_current) { - case 8: /* ( 1 , -1 , -1 ) */ - if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) - DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0); - break; + /* Do any of the cells need to be sorted first? */ + if (!(ci->sorted & (1 << sid))) runner_do_sort(r, ci, (1 << sid), 1); + if (!(cj->sorted & (1 << sid))) runner_do_sort(r, cj, (1 << sid), 1); - case 9: /* ( 0 , 1 , 1 ) */ - if (ci->progeny[3] != NULL && cj->progeny[0] != NULL) - DOSUB1(r, ci->progeny[3], cj->progeny[0], -1, 0); - if (ci->progeny[3] != NULL && cj->progeny[4] != NULL) - DOSUB1(r, ci->progeny[3], cj->progeny[4], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) - DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[4] != NULL) - DOSUB1(r, ci->progeny[7], cj->progeny[4], -1, 0); - break; + /* Compute the interactions. */ + DOPAIR1(r, ci, cj); + } - case 10: /* ( 0 , 1 , 0 ) */ - if (ci->progeny[2] != NULL && cj->progeny[0] != NULL) - DOSUB1(r, ci->progeny[2], cj->progeny[0], -1, 0); - if (ci->progeny[2] != NULL && cj->progeny[1] != NULL) - DOSUB1(r, ci->progeny[2], cj->progeny[1], -1, 0); - if (ci->progeny[2] != NULL && cj->progeny[4] != NULL) - DOSUB1(r, ci->progeny[2], cj->progeny[4], -1, 0); - if (ci->progeny[2] != NULL && cj->progeny[5] != NULL) - DOSUB1(r, ci->progeny[2], cj->progeny[5], -1, 0); - if (ci->progeny[3] != NULL && cj->progeny[0] != NULL) - DOSUB1(r, ci->progeny[3], cj->progeny[0], -1, 0); - if (ci->progeny[3] != NULL && cj->progeny[1] != NULL) - DOSUB1(r, ci->progeny[3], cj->progeny[1], -1, 0); - if (ci->progeny[3] != NULL && cj->progeny[4] != NULL) - DOSUB1(r, ci->progeny[3], cj->progeny[4], -1, 0); - if (ci->progeny[3] != NULL && cj->progeny[5] != NULL) - DOSUB1(r, ci->progeny[3], cj->progeny[5], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[0] != NULL) - DOSUB1(r, ci->progeny[6], cj->progeny[0], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) - DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[4] != NULL) - DOSUB1(r, ci->progeny[6], cj->progeny[4], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[5] != NULL) - DOSUB1(r, ci->progeny[6], cj->progeny[5], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) - DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[1] != NULL) - DOSUB1(r, ci->progeny[7], cj->progeny[1], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[4] != NULL) - DOSUB1(r, ci->progeny[7], cj->progeny[4], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[5] != NULL) - DOSUB1(r, ci->progeny[7], cj->progeny[5], -1, 0); - break; + if (gettimer) TIMER_TOC(TIMER_DOSUB_PAIR); +} - case 11: /* ( 0 , 1 , -1 ) */ - if (ci->progeny[2] != NULL && cj->progeny[1] != NULL) - DOSUB1(r, ci->progeny[2], cj->progeny[1], -1, 0); - if (ci->progeny[2] != NULL && cj->progeny[5] != NULL) - DOSUB1(r, ci->progeny[2], cj->progeny[5], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) - DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[5] != NULL) - DOSUB1(r, ci->progeny[6], cj->progeny[5], -1, 0); - break; +/** + * @brief Compute grouped sub-cell interactions for self tasks + * + * @param r The #runner. + * @param ci The first #cell. + * @param gettimer Do we have a timer ? + */ +void DOSUB_SELF1(struct runner *r, struct cell *ci, int gettimer) { - case 12: /* ( 0 , 0 , 1 ) */ - if (ci->progeny[1] != NULL && cj->progeny[0] != NULL) - DOSUB1(r, ci->progeny[1], cj->progeny[0], -1, 0); - if (ci->progeny[1] != NULL && cj->progeny[2] != NULL) - DOSUB1(r, ci->progeny[1], cj->progeny[2], -1, 0); - if (ci->progeny[1] != NULL && cj->progeny[4] != NULL) - DOSUB1(r, ci->progeny[1], cj->progeny[4], -1, 0); - if (ci->progeny[1] != NULL && cj->progeny[6] != NULL) - DOSUB1(r, ci->progeny[1], cj->progeny[6], -1, 0); - if (ci->progeny[3] != NULL && cj->progeny[0] != NULL) - DOSUB1(r, ci->progeny[3], cj->progeny[0], -1, 0); - if (ci->progeny[3] != NULL && cj->progeny[2] != NULL) - DOSUB1(r, ci->progeny[3], cj->progeny[2], -1, 0); - if (ci->progeny[3] != NULL && cj->progeny[4] != NULL) - DOSUB1(r, ci->progeny[3], cj->progeny[4], -1, 0); - if (ci->progeny[3] != NULL && cj->progeny[6] != NULL) - DOSUB1(r, ci->progeny[3], cj->progeny[6], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[0] != NULL) - DOSUB1(r, ci->progeny[5], cj->progeny[0], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) - DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[4] != NULL) - DOSUB1(r, ci->progeny[5], cj->progeny[4], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[6] != NULL) - DOSUB1(r, ci->progeny[5], cj->progeny[6], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) - DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[2] != NULL) - DOSUB1(r, ci->progeny[7], cj->progeny[2], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[4] != NULL) - DOSUB1(r, ci->progeny[7], cj->progeny[4], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[6] != NULL) - DOSUB1(r, ci->progeny[7], cj->progeny[6], -1, 0); - break; - } + const int ti_current = r->e->ti_current; - } + TIMER_TIC - /* Otherwise, compute the pair directly. */ - else if (ci->ti_end_min <= ti_current || cj->ti_end_min <= ti_current) { + /* Should we even bother? */ + if (ci->ti_end_min > ti_current) return; - /* Do any of the cells need to be sorted first? */ - if (!(ci->sorted & (1 << sid))) runner_do_sort(r, ci, (1 << sid), 1); - if (!(cj->sorted & (1 << sid))) runner_do_sort(r, cj, (1 << sid), 1); + /* Recurse? */ + if (ci->split) { - /* Compute the interactions. */ - DOPAIR1(r, ci, cj); - } + /* Loop over all progeny. */ + for (int k = 0; k < 8; k++) + if (ci->progeny[k] != NULL) { + DOSUB_SELF1(r, ci->progeny[k], 0); + for (int j = k + 1; j < 8; j++) + if (ci->progeny[j] != NULL) + DOSUB_PAIR1(r, ci->progeny[k], ci->progeny[j], -1, 0); + } + } - } /* otherwise, pair interaction. */ + /* Otherwise, compute self-interaction. */ + else + DOSELF1(r, ci); - if (gettimer) TIMER_TOC(TIMER_DOSUB); + if (gettimer) TIMER_TOC(TIMER_DOSUB_SELF); } -void DOSUB2(struct runner *r, struct cell *ci, struct cell *cj, int sid, - int gettimer) { +/** + * @brief Compute grouped sub-cell interactions for pairs (symmetric case) + * + * @param r The #runner. + * @param ci The first #cell. + * @param cj The second #cell. + * @param sid The direction linking the cells + * @param gettimer Do we have a timer ? + * + * @todo Hard-code the sid on the recursive calls to avoid the + * redundant computations to find the sid on-the-fly. + */ +void DOSUB_PAIR2(struct runner *r, struct cell *ci, struct cell *cj, int sid, + int gettimer) { - int j, k; - double shift[3]; - float h; struct space *s = r->e->s; const int ti_current = r->e->ti_current; TIMER_TIC - /* Is this a single cell? */ - if (cj == NULL) { + /* Should we even bother? */ + if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return; - /* Should we even bother? */ - if (ci->ti_end_min > ti_current) return; + /* Get the cell dimensions. */ + const float h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2])); - /* Recurse? */ - if (ci->split) { + /* Get the type of pair if not specified explicitly. */ + // if ( sid < 0 ) + double shift[3]; + sid = space_getsid(s, &ci, &cj, shift); - /* Loop over all progeny. */ - for (k = 0; k < 8; k++) - if (ci->progeny[k] != NULL) { - DOSUB2(r, ci->progeny[k], NULL, -1, 0); - for (j = k + 1; j < 8; j++) - if (ci->progeny[j] != NULL) - DOSUB2(r, ci->progeny[k], ci->progeny[j], -1, 0); - } + /* Recurse? */ + if (ci->split && cj->split && + fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max < + h / 2) { - } + /* Different types of flags. */ + switch (sid) { - /* Otherwise, compute self-interaction. */ - else - DOSELF2(r, ci); + /* Regular sub-cell interactions of a single cell. */ + case 0: /* ( 1 , 1 , 1 ) */ + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0); + break; - } /* self-interaction. */ + case 1: /* ( 1 , 1 , 0 ) */ + if (ci->progeny[6] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[0], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[1], -1, 0); + break; - /* Otherwise, it's a pair interaction. */ - else { + case 2: /* ( 1 , 1 , -1 ) */ + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0); + break; - /* Should we even bother? */ - if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return; + case 3: /* ( 1 , 0 , 1 ) */ + if (ci->progeny[5] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[0], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[2], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[2], -1, 0); + break; - /* Get the cell dimensions. */ - h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2])); + case 4: /* ( 1 , 0 , 0 ) */ + if (ci->progeny[4] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[0], -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[1], -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[2], -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) + DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[3], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[0], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[1], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[2], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[3] != NULL) + DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[3], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[0], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[2], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[3] != NULL) + DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[3], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[1], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[2], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[3] != NULL) + DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[3], -1, 0); + break; - /* Get the type of pair if not specified explicitly. */ - // if ( sid < 0 ) - sid = space_getsid(s, &ci, &cj, shift); + case 5: /* ( 1 , 0 , -1 ) */ + if (ci->progeny[4] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[1], -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) + DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[3], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[3] != NULL) + DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[3], -1, 0); + break; - /* Recurse? */ - if (ci->split && cj->split && - fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max < - h / 2) { + case 6: /* ( 1 , -1 , 1 ) */ + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[2], -1, 0); + break; - /* Different types of flags. */ - switch (sid) { + case 7: /* ( 1 , -1 , 0 ) */ + if (ci->progeny[4] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[2], -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) + DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[3], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[2], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[3] != NULL) + DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[3], -1, 0); + break; - /* Regular sub-cell interactions of a single cell. */ - case 0: /* ( 1 , 1 , 1 ) */ - if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) - DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0); - break; + case 8: /* ( 1 , -1 , -1 ) */ + if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) + DOSUB_PAIR2(r, ci->progeny[4], cj->progeny[3], -1, 0); + break; - case 1: /* ( 1 , 1 , 0 ) */ - if (ci->progeny[6] != NULL && cj->progeny[0] != NULL) - DOSUB2(r, ci->progeny[6], cj->progeny[0], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) - DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) - DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[1] != NULL) - DOSUB2(r, ci->progeny[7], cj->progeny[1], -1, 0); - break; + case 9: /* ( 0 , 1 , 1 ) */ + if (ci->progeny[3] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[0], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[4] != NULL) + DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[4], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[4] != NULL) + DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[4], -1, 0); + break; - case 2: /* ( 1 , 1 , -1 ) */ - if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) - DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0); - break; + case 10: /* ( 0 , 1 , 0 ) */ + if (ci->progeny[2] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[0], -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[1], -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[4] != NULL) + DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[4], -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[5] != NULL) + DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[5], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[0], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[1], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[4] != NULL) + DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[4], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[5] != NULL) + DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[5], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[0], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[4] != NULL) + DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[4], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[5] != NULL) + DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[5], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[1], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[4] != NULL) + DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[4], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[5] != NULL) + DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[5], -1, 0); + break; - case 3: /* ( 1 , 0 , 1 ) */ - if (ci->progeny[5] != NULL && cj->progeny[0] != NULL) - DOSUB2(r, ci->progeny[5], cj->progeny[0], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) - DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) - DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[2] != NULL) - DOSUB2(r, ci->progeny[7], cj->progeny[2], -1, 0); - break; + case 11: /* ( 0 , 1 , -1 ) */ + if (ci->progeny[2] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[1], -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[5] != NULL) + DOSUB_PAIR2(r, ci->progeny[2], cj->progeny[5], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[1], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[5] != NULL) + DOSUB_PAIR2(r, ci->progeny[6], cj->progeny[5], -1, 0); + break; - case 4: /* ( 1 , 0 , 0 ) */ - if (ci->progeny[4] != NULL && cj->progeny[0] != NULL) - DOSUB2(r, ci->progeny[4], cj->progeny[0], -1, 0); - if (ci->progeny[4] != NULL && cj->progeny[1] != NULL) - DOSUB2(r, ci->progeny[4], cj->progeny[1], -1, 0); - if (ci->progeny[4] != NULL && cj->progeny[2] != NULL) - DOSUB2(r, ci->progeny[4], cj->progeny[2], -1, 0); - if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) - DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[0] != NULL) - DOSUB2(r, ci->progeny[5], cj->progeny[0], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[1] != NULL) - DOSUB2(r, ci->progeny[5], cj->progeny[1], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) - DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[3] != NULL) - DOSUB2(r, ci->progeny[5], cj->progeny[3], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[0] != NULL) - DOSUB2(r, ci->progeny[6], cj->progeny[0], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) - DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[2] != NULL) - DOSUB2(r, ci->progeny[6], cj->progeny[2], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[3] != NULL) - DOSUB2(r, ci->progeny[6], cj->progeny[3], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) - DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[1] != NULL) - DOSUB2(r, ci->progeny[7], cj->progeny[1], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[2] != NULL) - DOSUB2(r, ci->progeny[7], cj->progeny[2], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[3] != NULL) - DOSUB2(r, ci->progeny[7], cj->progeny[3], -1, 0); - break; + case 12: /* ( 0 , 0 , 1 ) */ + if (ci->progeny[1] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR2(r, ci->progeny[1], cj->progeny[0], -1, 0); + if (ci->progeny[1] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR2(r, ci->progeny[1], cj->progeny[2], -1, 0); + if (ci->progeny[1] != NULL && cj->progeny[4] != NULL) + DOSUB_PAIR2(r, ci->progeny[1], cj->progeny[4], -1, 0); + if (ci->progeny[1] != NULL && cj->progeny[6] != NULL) + DOSUB_PAIR2(r, ci->progeny[1], cj->progeny[6], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[0], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[2], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[4] != NULL) + DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[4], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[6] != NULL) + DOSUB_PAIR2(r, ci->progeny[3], cj->progeny[6], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[0], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[2], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[4] != NULL) + DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[4], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[6] != NULL) + DOSUB_PAIR2(r, ci->progeny[5], cj->progeny[6], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[2] != NULL) + DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[2], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[4] != NULL) + DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[4], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[6] != NULL) + DOSUB_PAIR2(r, ci->progeny[7], cj->progeny[6], -1, 0); + break; + } - case 5: /* ( 1 , 0 , -1 ) */ - if (ci->progeny[4] != NULL && cj->progeny[1] != NULL) - DOSUB2(r, ci->progeny[4], cj->progeny[1], -1, 0); - if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) - DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) - DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[3] != NULL) - DOSUB2(r, ci->progeny[6], cj->progeny[3], -1, 0); - break; + } - case 6: /* ( 1 , -1 , 1 ) */ - if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) - DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0); - break; + /* Otherwise, compute the pair directly. */ + else if (ci->ti_end_min <= ti_current || cj->ti_end_min <= ti_current) { - case 7: /* ( 1 , -1 , 0 ) */ - if (ci->progeny[4] != NULL && cj->progeny[2] != NULL) - DOSUB2(r, ci->progeny[4], cj->progeny[2], -1, 0); - if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) - DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) - DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[3] != NULL) - DOSUB2(r, ci->progeny[5], cj->progeny[3], -1, 0); - break; + /* Do any of the cells need to be sorted first? */ + if (!(ci->sorted & (1 << sid))) runner_do_sort(r, ci, (1 << sid), 1); + if (!(cj->sorted & (1 << sid))) runner_do_sort(r, cj, (1 << sid), 1); - case 8: /* ( 1 , -1 , -1 ) */ - if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) - DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0); - break; + /* Compute the interactions. */ + DOPAIR2(r, ci, cj); + } - case 9: /* ( 0 , 1 , 1 ) */ - if (ci->progeny[3] != NULL && cj->progeny[0] != NULL) - DOSUB2(r, ci->progeny[3], cj->progeny[0], -1, 0); - if (ci->progeny[3] != NULL && cj->progeny[4] != NULL) - DOSUB2(r, ci->progeny[3], cj->progeny[4], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) - DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[4] != NULL) - DOSUB2(r, ci->progeny[7], cj->progeny[4], -1, 0); - break; + if (gettimer) TIMER_TOC(TIMER_DOSUB_PAIR); +} - case 10: /* ( 0 , 1 , 0 ) */ - if (ci->progeny[2] != NULL && cj->progeny[0] != NULL) - DOSUB2(r, ci->progeny[2], cj->progeny[0], -1, 0); - if (ci->progeny[2] != NULL && cj->progeny[1] != NULL) - DOSUB2(r, ci->progeny[2], cj->progeny[1], -1, 0); - if (ci->progeny[2] != NULL && cj->progeny[4] != NULL) - DOSUB2(r, ci->progeny[2], cj->progeny[4], -1, 0); - if (ci->progeny[2] != NULL && cj->progeny[5] != NULL) - DOSUB2(r, ci->progeny[2], cj->progeny[5], -1, 0); - if (ci->progeny[3] != NULL && cj->progeny[0] != NULL) - DOSUB2(r, ci->progeny[3], cj->progeny[0], -1, 0); - if (ci->progeny[3] != NULL && cj->progeny[1] != NULL) - DOSUB2(r, ci->progeny[3], cj->progeny[1], -1, 0); - if (ci->progeny[3] != NULL && cj->progeny[4] != NULL) - DOSUB2(r, ci->progeny[3], cj->progeny[4], -1, 0); - if (ci->progeny[3] != NULL && cj->progeny[5] != NULL) - DOSUB2(r, ci->progeny[3], cj->progeny[5], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[0] != NULL) - DOSUB2(r, ci->progeny[6], cj->progeny[0], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) - DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[4] != NULL) - DOSUB2(r, ci->progeny[6], cj->progeny[4], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[5] != NULL) - DOSUB2(r, ci->progeny[6], cj->progeny[5], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) - DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[1] != NULL) - DOSUB2(r, ci->progeny[7], cj->progeny[1], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[4] != NULL) - DOSUB2(r, ci->progeny[7], cj->progeny[4], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[5] != NULL) - DOSUB2(r, ci->progeny[7], cj->progeny[5], -1, 0); - break; +/** + * @brief Compute grouped sub-cell interactions for self tasks (symmetric case) + * + * @param r The #runner. + * @param ci The first #cell. + * @param gettimer Do we have a timer ? + */ +void DOSUB_SELF2(struct runner *r, struct cell *ci, int gettimer) { - case 11: /* ( 0 , 1 , -1 ) */ - if (ci->progeny[2] != NULL && cj->progeny[1] != NULL) - DOSUB2(r, ci->progeny[2], cj->progeny[1], -1, 0); - if (ci->progeny[2] != NULL && cj->progeny[5] != NULL) - DOSUB2(r, ci->progeny[2], cj->progeny[5], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) - DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0); - if (ci->progeny[6] != NULL && cj->progeny[5] != NULL) - DOSUB2(r, ci->progeny[6], cj->progeny[5], -1, 0); - break; + const int ti_current = r->e->ti_current; - case 12: /* ( 0 , 0 , 1 ) */ - if (ci->progeny[1] != NULL && cj->progeny[0] != NULL) - DOSUB2(r, ci->progeny[1], cj->progeny[0], -1, 0); - if (ci->progeny[1] != NULL && cj->progeny[2] != NULL) - DOSUB2(r, ci->progeny[1], cj->progeny[2], -1, 0); - if (ci->progeny[1] != NULL && cj->progeny[4] != NULL) - DOSUB2(r, ci->progeny[1], cj->progeny[4], -1, 0); - if (ci->progeny[1] != NULL && cj->progeny[6] != NULL) - DOSUB2(r, ci->progeny[1], cj->progeny[6], -1, 0); - if (ci->progeny[3] != NULL && cj->progeny[0] != NULL) - DOSUB2(r, ci->progeny[3], cj->progeny[0], -1, 0); - if (ci->progeny[3] != NULL && cj->progeny[2] != NULL) - DOSUB2(r, ci->progeny[3], cj->progeny[2], -1, 0); - if (ci->progeny[3] != NULL && cj->progeny[4] != NULL) - DOSUB2(r, ci->progeny[3], cj->progeny[4], -1, 0); - if (ci->progeny[3] != NULL && cj->progeny[6] != NULL) - DOSUB2(r, ci->progeny[3], cj->progeny[6], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[0] != NULL) - DOSUB2(r, ci->progeny[5], cj->progeny[0], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) - DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[4] != NULL) - DOSUB2(r, ci->progeny[5], cj->progeny[4], -1, 0); - if (ci->progeny[5] != NULL && cj->progeny[6] != NULL) - DOSUB2(r, ci->progeny[5], cj->progeny[6], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) - DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[2] != NULL) - DOSUB2(r, ci->progeny[7], cj->progeny[2], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[4] != NULL) - DOSUB2(r, ci->progeny[7], cj->progeny[4], -1, 0); - if (ci->progeny[7] != NULL && cj->progeny[6] != NULL) - DOSUB2(r, ci->progeny[7], cj->progeny[6], -1, 0); - break; - } + TIMER_TIC - } + /* Should we even bother? */ + if (ci->ti_end_min > ti_current) return; - /* Otherwise, compute the pair directly. */ - else if (ci->ti_end_min <= ti_current || cj->ti_end_min <= ti_current) { + /* Recurse? */ + if (ci->split) { - /* Do any of the cells need to be sorted first? */ - if (!(ci->sorted & (1 << sid))) runner_do_sort(r, ci, (1 << sid), 1); - if (!(cj->sorted & (1 << sid))) runner_do_sort(r, cj, (1 << sid), 1); + /* Loop over all progeny. */ + for (int k = 0; k < 8; k++) + if (ci->progeny[k] != NULL) { + DOSUB_SELF2(r, ci->progeny[k], 0); + for (int j = k + 1; j < 8; j++) + if (ci->progeny[j] != NULL) + DOSUB_PAIR2(r, ci->progeny[k], ci->progeny[j], -1, 0); + } - /* Compute the interactions. */ - DOPAIR2(r, ci, cj); - } + } - } /* otherwise, pair interaction. */ + /* Otherwise, compute self-interaction. */ + else + DOSELF2(r, ci); - if (gettimer) TIMER_TOC(TIMER_DOSUB); + if (gettimer) TIMER_TOC(TIMER_DOSUB_SELF); } void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts, int *ind, int count, struct cell *cj, int sid, int gettimer) { - int j, k; - double shift[3]; - float h; struct space *s = r->e->s; - struct cell *sub = NULL; const int ti_current = r->e->ti_current; TIMER_TIC /* Find out in which sub-cell of ci the parts are. */ - for (k = 0; k < 8; k++) + struct cell *sub = NULL; + for (int k = 0; k < 8; k++) if (ci->progeny[k] != NULL) { // if ( parts[ ind[ 0 ] ].x[0] >= ci->progeny[k]->loc[0] && // parts[ ind[ 0 ] ].x[0] <= ci->progeny[k]->loc[0] + @@ -2300,7 +2308,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts, /* Loop over all progeny. */ DOSUB_SUBSET(r, sub, parts, ind, count, NULL, -1, 0); - for (j = 0; j < 8; j++) + for (int j = 0; j < 8; j++) if (ci->progeny[j] != sub && ci->progeny[j] != NULL) DOSUB_SUBSET(r, sub, parts, ind, count, ci->progeny[j], -1, 0); @@ -2316,7 +2324,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts, else { /* Get the cell dimensions. */ - h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2])); + const float h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2])); /* Recurse? */ if (ci->split && cj->split && @@ -2324,6 +2332,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts, h / 2) { /* Get the type of pair if not specified explicitly. */ + double shift[3] = {0.0, 0.0, 0.0}; sid = space_getsid(s, &ci, &cj, shift); /* Different types of flags. */ @@ -2335,7 +2344,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts, DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[0], -1, 0); if (ci->progeny[7] != NULL && cj->progeny[0] == sub) - DOSUB_SUBSET(r, ci->progeny[0], parts, ind, count, cj->progeny[7], + DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[7], -1, 0); break; @@ -2832,7 +2841,8 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts, else if (ci->ti_end_min <= ti_current || cj->ti_end_min <= ti_current) { /* Get the relative distance between the pairs, wrapping. */ - for (k = 0; k < 3; k++) { + double shift[3] = {0.0, 0.0, 0.0}; + for (int k = 0; k < 3; k++) { if (cj->loc[k] - ci->loc[k] < -s->dim[k] / 2) shift[k] = s->dim[k]; else if (cj->loc[k] - ci->loc[k] > s->dim[k] / 2) @@ -2840,7 +2850,8 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts, } /* Get the sorting index. */ - for (sid = 0, k = 0; k < 3; k++) + int sid = 0; + for (int k = 0; k < 3; k++) sid = 3 * sid + ((cj->loc[k] - ci->loc[k] + shift[k] < 0) ? 0 @@ -2856,5 +2867,5 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts, } /* otherwise, pair interaction. */ - if (gettimer) TIMER_TOC(TIMER_DOSUB); + if (gettimer) TIMER_TOC(TIMER_DOSUB_PAIR); } diff --git a/src/runner_doiact_grav.h b/src/runner_doiact_grav.h index 13e6fe68ca223fa95a307fd063208e1f5d5efa6c..e3788dfa1123584c913bca6baa6fc2db6698e6d0 100644 --- a/src/runner_doiact_grav.h +++ b/src/runner_doiact_grav.h @@ -19,11 +19,6 @@ #ifndef SWIFT_RUNNER_DOIACT_GRAV_H #define SWIFT_RUNNER_DOIACT_GRAV_H -/* Includes. */ -#include "cell.h" -#include "clocks.h" -#include "part.h" - /** * @brief Compute the sorted gravity interactions between a cell pair. * diff --git a/src/scheduler.c b/src/scheduler.c index cae4132e0fadc3bf2a8c179092a2d875edfd86e1..05f699557cc7aafb641112bf9d9493188ffde52c 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -44,6 +44,9 @@ #include "error.h" #include "intrinsics.h" #include "kernel_hydro.h" +#include "queue.h" +#include "space.h" +#include "task.h" #include "timers.h" /** @@ -106,17 +109,14 @@ void scheduler_addunlock(struct scheduler *s, struct task *ta, void scheduler_splittasks_mapper(void *map_data, int num_elements, void *extra_data) { - /* Static constants. */ - const static int pts[7][8] = {{-1, 12, 10, 9, 4, 3, 1, 0}, - {-1, -1, 11, 10, 5, 4, 2, 1}, - {-1, -1, -1, 12, 7, 6, 4, 3}, - {-1, -1, -1, -1, 8, 7, 5, 4}, - {-1, -1, -1, -1, -1, 12, 10, 9}, - {-1, -1, -1, -1, -1, -1, 11, 10}, - {-1, -1, -1, -1, -1, -1, -1, 12}}; - const static float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788, - 0.4025, 0.1897, 0.4025, 0.1897, 0.4025, - 0.5788, 0.4025, 0.5788}; + const int pts[7][8] = { + {-1, 12, 10, 9, 4, 3, 1, 0}, {-1, -1, 11, 10, 5, 4, 2, 1}, + {-1, -1, -1, 12, 7, 6, 4, 3}, {-1, -1, -1, -1, 8, 7, 5, 4}, + {-1, -1, -1, -1, -1, 12, 10, 9}, {-1, -1, -1, -1, -1, -1, 11, 10}, + {-1, -1, -1, -1, -1, -1, -1, 12}}; + const float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788, + 0.4025, 0.1897, 0.4025, 0.1897, 0.4025, + 0.5788, 0.4025, 0.5788}; /* Extract the parameters. */ struct scheduler *s = (struct scheduler *)extra_data; @@ -681,9 +681,6 @@ struct task *scheduler_addtask(struct scheduler *s, int type, int subtype, t->rid = -1; t->last_rid = -1; - /* Init the lock. */ - lock_init(&t->lock); - /* Add an index for it. */ // lock_lock( &s->lock ); s->tasks_ind[atomic_inc(&s->nr_tasks)] = ind; @@ -742,42 +739,11 @@ void scheduler_set_unlocks(struct scheduler *s) { t->unlock_tasks = &s->unlocks[offsets[k]]; } - /* Verify that there are no duplicate unlocks. */ - /* for (int k = 0; k < s->nr_tasks; k++) { - struct task *t = &s->tasks[k]; - for (int i = 0; i < t->nr_unlock_tasks; i++) { - for (int j = i + 1; j < t->nr_unlock_tasks; j++) { - if (t->unlock_tasks[i] == t->unlock_tasks[j]) - error("duplicate unlock!"); - } - } - } */ - /* Clean up. */ free(counts); free(offsets); } -/** - * @brief #threadpool_map function which runs through the task - * graph and re-computes the task wait counters. - */ - -void scheduler_simple_rewait_mapper(void *map_data, int num_elements, - void *extra_data) { - - struct task *tasks = (struct task *)map_data; - for (int ind = 0; ind < num_elements; ind++) { - struct task *t = &tasks[ind]; - - /* Increment the waits of the dependances */ - for (int k = 0; k < t->nr_unlock_tasks; k++) { - struct task *u = t->unlock_tasks[k]; - atomic_inc(&u->wait); - } - } -} - /** * @brief Sort the tasks in topological order over all queues. * @@ -829,10 +795,12 @@ void scheduler_ranktasks(struct scheduler *s) { j = left_old; } +#ifdef SWIFT_DEBUG_CHECKS /* Verify that the tasks were ranked correctly. */ - /* for (int k = 1; k < s->nr_tasks; k++) + for (int k = 1; k < s->nr_tasks; k++) if (tasks[tid[k - 1]].rank > tasks[tid[k - 1]].rank) - error("Task ranking failed."); */ + error("Task ranking failed."); +#endif } /** @@ -917,23 +885,23 @@ void scheduler_reweight(struct scheduler *s) { t->weight += 2 * wscale * t->ci->count * t->cj->count * sid_scale[t->flags]; break; - case task_type_sub: - if (t->cj != NULL) { - if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) { - if (t->flags < 0) - t->weight += 3 * wscale * t->ci->count * t->cj->count; - else - t->weight += 3 * wscale * t->ci->count * t->cj->count * - sid_scale[t->flags]; - } else { - if (t->flags < 0) - t->weight += 2 * wscale * t->ci->count * t->cj->count; - else - t->weight += 2 * wscale * t->ci->count * t->cj->count * - sid_scale[t->flags]; - } - } else - t->weight += 1 * wscale * t->ci->count * t->ci->count; + case task_type_sub_pair: + if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) { + if (t->flags < 0) + t->weight += 3 * wscale * t->ci->count * t->cj->count; + else + t->weight += 3 * wscale * t->ci->count * t->cj->count * + sid_scale[t->flags]; + } else { + if (t->flags < 0) + t->weight += 2 * wscale * t->ci->count * t->cj->count; + else + t->weight += 2 * wscale * t->ci->count * t->cj->count * + sid_scale[t->flags]; + } + break; + case task_type_sub_self: + t->weight += 1 * wscale * t->ci->count * t->ci->count; break; case task_type_ghost: if (t->ci == t->ci->super) t->weight += wscale * t->ci->count; @@ -950,8 +918,6 @@ void scheduler_reweight(struct scheduler *s) { default: break; } - if (t->type == task_type_send) t->weight = INT_MAX / 8; - if (t->type == task_type_recv) t->weight *= 1.41; } // message( "weighting tasks took %.3f %s." , // clocks_from_ticks( getticks() - tic ), clocks_getunit()); @@ -1075,6 +1041,7 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { if (t->implicit) { for (int j = 0; j < t->nr_unlock_tasks; j++) { struct task *t2 = t->unlock_tasks[j]; + if (atomic_dec(&t2->wait) == 1) scheduler_enqueue(s, t2); } } @@ -1097,17 +1064,22 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { qid = t->ci->super->owner; break; case task_type_pair: - case task_type_sub: + case task_type_sub_pair: qid = t->ci->super->owner; - if (t->cj != NULL && - (qid < 0 || - s->queues[qid].count > s->queues[t->cj->super->owner].count)) + if (qid < 0 || + s->queues[qid].count > s->queues[t->cj->super->owner].count) qid = t->cj->super->owner; break; case task_type_recv: #ifdef WITH_MPI - err = MPI_Irecv(t->ci->parts, t->ci->count, part_mpi_type, - t->ci->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + if (t->subtype == task_subtype_tend) { + t->buff = malloc(sizeof(int) * t->ci->pcell_size); + err = MPI_Irecv(t->buff, t->ci->pcell_size, MPI_INT, t->ci->nodeID, + t->flags, MPI_COMM_WORLD, &t->req); + } else { + err = MPI_Irecv(t->ci->parts, t->ci->count, part_mpi_type, + t->ci->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + } if (err != MPI_SUCCESS) { mpi_error(err, "Failed to emit irecv for particle data."); } @@ -1121,8 +1093,15 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) { break; case task_type_send: #ifdef WITH_MPI - err = MPI_Isend(t->ci->parts, t->ci->count, part_mpi_type, - t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + if (t->subtype == task_subtype_tend) { + t->buff = malloc(sizeof(int) * t->ci->pcell_size); + cell_pack_ti_ends(t->ci, t->buff); + err = MPI_Isend(t->buff, t->ci->pcell_size, MPI_INT, t->cj->nodeID, + t->flags, MPI_COMM_WORLD, &t->req); + } else { + err = MPI_Isend(t->ci->parts, t->ci->count, part_mpi_type, + t->cj->nodeID, t->flags, MPI_COMM_WORLD, &t->req); + } if (err != MPI_SUCCESS) { mpi_error(err, "Failed to emit isend for particle data."); } @@ -1326,8 +1305,7 @@ struct task *scheduler_gettask(struct scheduler *s, int qid, */ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks, - int nr_queues, unsigned int flags, int nodeID, - struct threadpool *tp) { + int nr_queues, unsigned int flags, int nodeID) { /* Init the lock. */ lock_init(&s->lock); @@ -1347,7 +1325,7 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks, /* Init the unlocks. */ if ((s->unlocks = (struct task **)malloc( - sizeof(struct task *) *scheduler_init_nr_unlocks)) == NULL || + sizeof(struct task *) * scheduler_init_nr_unlocks)) == NULL || (s->unlock_ind = (int *)malloc(sizeof(int) * scheduler_init_nr_unlocks)) == NULL) error("Failed to allocate unlocks."); @@ -1359,7 +1337,6 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks, s->flags = flags; s->space = space; s->nodeID = nodeID; - s->threadpool = tp; /* Init the tasks array. */ s->size = 0; @@ -1392,3 +1369,31 @@ void scheduler_print_tasks(const struct scheduler *s, const char *fileName) { fclose(file); } + +/** + * @brief Sets the waits of the dependants of a range of task + * + * @param t_begin Beginning of the #task range + * @param t_end End of the #task range + * @param mask The scheduler task mask + * @param submask The scheduler subtask mask + */ +void scheduler_do_rewait(struct task *t_begin, struct task *t_end, + unsigned int mask, unsigned int submask) { + for (struct task *t2 = t_begin; t2 != t_end; t2++) { + + if (t2->skip) continue; + + /* Skip tasks not in the mask */ + if (!((1 << t2->type) & mask) || !((1 << t2->subtype) & submask)) continue; + + /* Skip sort tasks that have already been performed */ + if (t2->type == task_type_sort && t2->flags == 0) continue; + + /* Sets the waits of the dependances */ + for (int k = 0; k < t2->nr_unlock_tasks; k++) { + struct task *t3 = t2->unlock_tasks[k]; + atomic_inc(&t3->wait); + } + } +} diff --git a/src/scheduler.h b/src/scheduler.h index 7528e13c2f7901ba9979a3ef7567bef1e23d8ac4..37ddb704ac8adc19f870916103b691715c820c4e 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -35,7 +35,6 @@ #include "cell.h" #include "lock.h" #include "queue.h" -#include "space.h" #include "task.h" #include "threadpool.h" diff --git a/src/serial_io.c b/src/serial_io.c index 5abd3ebc28672d68c4135efe5753dc4713c2d3c6..03d3c38850e0a7800c83d7dd0942025632bf1d0b 100644 --- a/src/serial_io.c +++ b/src/serial_io.c @@ -26,22 +26,22 @@ /* Some standard headers. */ #include <hdf5.h> #include <math.h> +#include <mpi.h> #include <stddef.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -/* MPI headers. */ -#ifdef WITH_MPI -#include <mpi.h> -#endif - /* This object's header. */ #include "serial_io.h" /* Local includes. */ #include "common_io.h" +#include "engine.h" #include "error.h" +#include "kernel_hydro.h" +#include "part.h" +#include "units.h" /*----------------------------------------------------------------------------- * Routines reading an IC file @@ -424,8 +424,8 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, */ void read_ic_serial(char* fileName, double dim[3], struct part** parts, struct gpart** gparts, size_t* Ngas, size_t* Ngparts, - int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm, - MPI_Info info, int dry_run) { + int* periodic, int* flag_entropy, int mpi_rank, + int mpi_size, MPI_Comm comm, MPI_Info info, int dry_run) { hid_t h_file = 0, h_grp = 0; /* GADGET has only cubic boxes (in cosmological mode) */ double boxSize[3] = {0.0, -1.0, -1.0}; @@ -462,6 +462,7 @@ void read_ic_serial(char* fileName, double dim[3], struct part** parts, if (h_grp < 0) error("Error while opening file header\n"); /* Read the relevant information and print status */ + readAttribute(h_grp, "Flag_Entropy_ICs", INT, flag_entropy); readAttribute(h_grp, "BoxSize", DOUBLE, boxSize); readAttribute(h_grp, "NumPart_Total", UINT, numParticles); readAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticles_highWord); @@ -488,6 +489,7 @@ void read_ic_serial(char* fileName, double dim[3], struct part** parts, } /* Now need to broadcast that information to all ranks. */ + MPI_Bcast(flag_entropy, 1, MPI_INT, 0, comm); MPI_Bcast(periodic, 1, MPI_INT, 0, comm); MPI_Bcast(&N_total, NUM_PARTICLE_TYPES, MPI_LONG_LONG, 0, comm); MPI_Bcast(dim, 3, MPI_DOUBLE, 0, comm); @@ -699,6 +701,7 @@ void write_output_serial(struct engine* e, const char* baseName, double MassTable[6] = {0., 0., 0., 0., 0., 0.}; writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, NUM_PARTICLE_TYPES); unsigned int flagEntropy[NUM_PARTICLE_TYPES] = {0}; + flagEntropy[0] = writeEntropyFlag(); writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy, NUM_PARTICLE_TYPES); writeAttribute(h_grp, "NumFilesPerSnapshot", INT, &numFiles, 1); diff --git a/src/serial_io.h b/src/serial_io.h index b7ed6eb62d823829a473f828696c291e552effa3..fa88d5cb87edfba2c101b6096b8a57b04b7178cd 100644 --- a/src/serial_io.h +++ b/src/serial_io.h @@ -19,6 +19,9 @@ #ifndef SWIFT_SERIAL_IO_H #define SWIFT_SERIAL_IO_H +/* Config parameters. */ +#include "../config.h" + /* MPI headers. */ #ifdef WITH_MPI #include <mpi.h> @@ -33,8 +36,8 @@ void read_ic_serial(char* fileName, double dim[3], struct part** parts, struct gpart** gparts, size_t* Ngas, size_t* Ngparts, - int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm, - MPI_Info info, int dry_run); + int* periodic, int* flag_entropy, int mpi_rank, + int mpi_size, MPI_Comm comm, MPI_Info info, int dry_run); void write_output_serial(struct engine* e, const char* baseName, struct UnitSystem* us, int mpi_rank, int mpi_size, diff --git a/src/single_io.c b/src/single_io.c index fb3bf4368feed9892b098228d85c99f0bc3e724b..3893f1c86784e07e5cd78afc32a49339e60840ba 100644 --- a/src/single_io.c +++ b/src/single_io.c @@ -36,8 +36,11 @@ /* Local includes. */ #include "common_io.h" -#include "const.h" +#include "engine.h" #include "error.h" +#include "kernel_hydro.h" +#include "part.h" +#include "units.h" /*----------------------------------------------------------------------------- * Routines reading an IC file @@ -319,6 +322,8 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, * @param Ngas (output) number of Gas particles read. * @param Ngparts (output) The number of #gpart read. * @param periodic (output) 1 if the volume is periodic, 0 if not. + * @param flag_entropy 1 if the ICs contained Entropy in the InternalEnergy + * field * @param dry_run If 1, don't read the particle. Only allocates the arrays. * * Opens the HDF5 file fileName and reads the particles contained @@ -331,7 +336,7 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, */ void read_ic_single(char* fileName, double dim[3], struct part** parts, struct gpart** gparts, size_t* Ngas, size_t* Ngparts, - int* periodic, int dry_run) { + int* periodic, int* flag_entropy, int dry_run) { hid_t h_file = 0, h_grp = 0; /* GADGET has only cubic boxes (in cosmological mode) */ double boxSize[3] = {0.0, -1.0, -1.0}; @@ -365,6 +370,7 @@ void read_ic_single(char* fileName, double dim[3], struct part** parts, if (h_grp < 0) error("Error while opening file header\n"); /* Read the relevant information and print status */ + readAttribute(h_grp, "Flag_Entropy_ICs", INT, flag_entropy); readAttribute(h_grp, "BoxSize", DOUBLE, boxSize); readAttribute(h_grp, "NumPart_Total", UINT, numParticles); readAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticles_highWord); @@ -546,6 +552,7 @@ void write_output_single(struct engine* e, const char* baseName, double MassTable[NUM_PARTICLE_TYPES] = {0}; writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, NUM_PARTICLE_TYPES); unsigned int flagEntropy[NUM_PARTICLE_TYPES] = {0}; + flagEntropy[0] = writeEntropyFlag(); writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy, NUM_PARTICLE_TYPES); writeAttribute(h_grp, "NumFilesPerSnapshot", INT, &numFiles, 1); diff --git a/src/single_io.h b/src/single_io.h index adfc5b43941b2c0d69d9ce0924164aff56864a23..4936aa3eb35a5c76a5a8f242c9cc79bbc86161a0 100644 --- a/src/single_io.h +++ b/src/single_io.h @@ -19,6 +19,9 @@ #ifndef SWIFT_SINGLE_IO_H #define SWIFT_SINGLE_IO_H +/* Config parameters. */ +#include "../config.h" + /* Includes. */ #include "engine.h" #include "part.h" @@ -28,7 +31,7 @@ void read_ic_single(char* fileName, double dim[3], struct part** parts, struct gpart** gparts, size_t* Ngas, size_t* Ndm, - int* periodic, int dry_run); + int* periodic, int* flag_entropy, int dry_run); void write_output_single(struct engine* e, const char* baseName, struct UnitSystem* us); diff --git a/src/space.c b/src/space.c index 4f6a3362534b1454b5864bfde25a2a497ce72697..04214e49e0c66fc468c91f0098f75c10ef59c252 100644 --- a/src/space.c +++ b/src/space.c @@ -439,10 +439,10 @@ void space_rebuild(struct space *s, double cell_max, int verbose) { s->parts[k] = s->parts[nr_parts]; s->parts[nr_parts] = tp; if (s->parts[k].gpart != NULL) { - s->parts[k].gpart->part = &s->parts[k]; + s->parts[k].gpart->id_or_neg_offset = -k; } if (s->parts[nr_parts].gpart != NULL) { - s->parts[nr_parts].gpart->part = &s->parts[nr_parts]; + s->parts[nr_parts].gpart->id_or_neg_offset = -nr_parts; } const struct xpart txp = s->xparts[k]; s->xparts[k] = s->xparts[nr_parts]; @@ -456,8 +456,9 @@ void space_rebuild(struct space *s, double cell_max, int verbose) { } } +#ifdef SWIFT_DEBUG_CHECKS /* Check that all parts are in the correct places. */ - /* for (size_t k = 0; k < nr_parts; k++) { + for (size_t k = 0; k < nr_parts; k++) { if (cells[ind[k]].nodeID != local_nodeID) { error("Failed to move all non-local parts to send list"); } @@ -466,7 +467,8 @@ void space_rebuild(struct space *s, double cell_max, int verbose) { if (cells[ind[k]].nodeID == local_nodeID) { error("Failed to remove local parts from send list"); } - }*/ + } +#endif /* Move non-local gparts to the end of the list. */ for (int k = 0; k < nr_gparts;) { @@ -476,11 +478,12 @@ void space_rebuild(struct space *s, double cell_max, int verbose) { const struct gpart tp = s->gparts[k]; s->gparts[k] = s->gparts[nr_gparts]; s->gparts[nr_gparts] = tp; - if (s->gparts[k].id > 0) { - s->gparts[k].part->gpart = &s->gparts[k]; + if (s->gparts[k].id_or_neg_offset <= 0) { + s->parts[-s->gparts[k].id_or_neg_offset].gpart = &s->gparts[k]; } - if (s->gparts[nr_gparts].id > 0) { - s->gparts[nr_gparts].part->gpart = &s->gparts[nr_gparts]; + if (s->gparts[nr_gparts].id_or_neg_offset <= 0) { + s->parts[-s->gparts[nr_gparts].id_or_neg_offset].gpart = + &s->gparts[nr_gparts]; } const int t = gind[k]; gind[k] = gind[nr_gparts]; @@ -491,8 +494,8 @@ void space_rebuild(struct space *s, double cell_max, int verbose) { } } +#ifdef SWIFT_DEBUG_CHECKS /* Check that all gparts are in the correct place (untested). */ - /* for (size_t k = 0; k < nr_gparts; k++) { if (cells[gind[k]].nodeID != local_nodeID) { error("Failed to move all non-local gparts to send list"); @@ -502,7 +505,8 @@ void space_rebuild(struct space *s, double cell_max, int verbose) { if (cells[gind[k]].nodeID == local_nodeID) { error("Failed to remove local gparts from send list"); } - }*/ + } +#endif /* Exchange the strays, note that this potentially re-allocates the parts arrays. */ @@ -531,29 +535,34 @@ void space_rebuild(struct space *s, double cell_max, int verbose) { ind[k] = cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]); cells[ind[k]].count += 1; - /* if ( cells[ ind[k] ].nodeID != nodeID ) - error( "Received part that does not belong to me (nodeID=%i)." , cells[ - ind[k] ].nodeID ); */ +#ifdef SWIFT_DEBUG_CHECKS + if (cells[ind[k]].nodeID != local_nodeID) + error("Received part that does not belong to me (nodeID=%i).", + cells[ind[k]].nodeID); +#endif } nr_parts = s->nr_parts; -#endif + +#endif /* WITH_MPI */ /* Sort the parts according to their cells. */ space_parts_sort(s, ind, nr_parts, 0, s->nr_cells - 1, verbose); /* Re-link the gparts. */ - for (size_t k = 0; k < nr_parts; k++) - if (s->parts[k].gpart != NULL) s->parts[k].gpart->part = &s->parts[k]; - - /* Verify sort_struct. */ - /* for ( k = 1 ; k < nr_parts ; k++ ) { - if ( ind[k-1] > ind[k] ) { - error( "Sort failed!" ); - } - else if ( ind[k] != cell_getid( cdim , parts[k].x[0]*ih[0] , - parts[k].x[1]*ih[1] , parts[k].x[2]*ih[2] ) ) - error( "Incorrect indices!" ); - } */ + part_relink_gparts(s->parts, nr_parts, 0); + +#ifdef SWIFT_DEBUG_CHECKS + /* Verify space_sort_struct. */ + for (size_t k = 1; k < nr_parts; k++) { + if (ind[k - 1] > ind[k]) { + error("Sort failed!"); + } else if (ind[k] != cell_getid(cdim, s->parts[k].x[0] * ih[0], + s->parts[k].x[1] * ih[1], + s->parts[k].x[2] * ih[2])) { + error("Incorrect indices!"); + } + } +#endif /* We no longer need the indices as of here. */ free(ind); @@ -588,33 +597,34 @@ void space_rebuild(struct space *s, double cell_max, int verbose) { space_gparts_sort(s, gind, nr_gparts, 0, s->nr_cells - 1, verbose); /* Re-link the parts. */ - for (int k = 0; k < nr_gparts; k++) - if (s->gparts[k].id > 0) s->gparts[k].part->gpart = &s->gparts[k]; + part_relink_parts(s->gparts, nr_gparts, s->parts); /* We no longer need the indices as of here. */ free(gind); +#ifdef SWIFT_DEBUG_CHECKS /* Verify that the links are correct */ - /* MATTHIEU: To be commented out once we are happy */ for (size_t k = 0; k < nr_gparts; ++k) { - if (s->gparts[k].id > 0) { + if (s->gparts[k].id_or_neg_offset < 0) { + + const struct part *part = &s->parts[-s->gparts[k].id_or_neg_offset]; - if (s->gparts[k].part->gpart != &s->gparts[k]) error("Linking problem !"); + if (part->gpart != &s->gparts[k]) error("Linking problem !"); - if (s->gparts[k].x[0] != s->gparts[k].part->x[0] || - s->gparts[k].x[1] != s->gparts[k].part->x[1] || - s->gparts[k].x[2] != s->gparts[k].part->x[2]) + if (s->gparts[k].x[0] != part->x[0] || s->gparts[k].x[1] != part->x[1] || + s->gparts[k].x[2] != part->x[2]) error("Linked particles are not at the same position !"); } } for (size_t k = 0; k < nr_parts; ++k) { - if (s->parts[k].gpart != NULL) { - - if (s->parts[k].gpart->part != &s->parts[k]) error("Linking problem !"); + if (s->parts[k].gpart != NULL && + s->parts[k].gpart->id_or_neg_offset != -k) { + error("Linking problem !"); } } +#endif /* Hook the cells up to the parts. */ // tic = getticks(); @@ -705,13 +715,14 @@ void space_parts_sort(struct space *s, int *ind, size_t N, int min, int max, threadpool_map(&s->e->threadpool, space_parts_sort_mapper, &sort_struct, s->e->threadpool.num_threads, 0, 1, NULL); - /* Verify sort_struct. */ - /* for (int i = 1; i < N; i++) +#ifdef SWIFT_DEBUG_CHECKS + /* Verify space_sort_struct. */ + for (int i = 1; i < N; i++) if (ind[i - 1] > ind[i]) error("Sorting failed (ind[%i]=%i,ind[%i]=%i), min=%i, max=%i.", i - 1, - ind[i - 1], i, - ind[i], min, max); - message("Sorting succeeded."); */ + ind[i - 1], i, ind[i], min, max); + message("Sorting succeeded."); +#endif /* Clean up. */ free(sort_struct.stack); @@ -775,19 +786,21 @@ void space_parts_sort_mapper(void *map_data, int num_elements, void *extra_data) } } - /* Verify sort_struct. */ - /* for (int k = i; k <= jj; k++) +#ifdef SWIFT_DEBUG_CHECKS + /* Verify space_sort_struct. */ + for (int k = i; k <= jj; k++) if (ind[k] > pivot) { - message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i.", k, - ind[k], pivot, i, j); + message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li.", + k, ind[k], pivot, i, j); error("Partition failed (<=pivot)."); } for (int k = jj + 1; k <= j; k++) if (ind[k] <= pivot) { - message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i.", k, - ind[k], pivot, i, j); + message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li.", + k, ind[k], pivot, i, j); error("Partition failed (>pivot)."); - } */ + } +#endif /* Split-off largest interval. */ if (jj - i > j - jj + 1) { @@ -886,13 +899,14 @@ void space_gparts_sort(struct space *s, int *ind, size_t N, int min, int max, threadpool_map(&s->e->threadpool, space_gparts_sort_mapper, &sort_struct, s->e->threadpool.num_threads, 0, 1, NULL); - /* Verify sort_struct. */ - /* for (int i = 1; i < N; i++) +#ifdef SWIFT_DEBUG_CHECKS + /* Verify space_sort_struct. */ + for (int i = 1; i < N; i++) if (ind[i - 1] > ind[i]) error("Sorting failed (ind[%i]=%i,ind[%i]=%i), min=%i, max=%i.", i - 1, - ind[i - 1], i, - ind[i], min, max); - message("Sorting succeeded."); */ + ind[i - 1], i, ind[i], min, max); + message("Sorting succeeded."); +#endif /* Clean up. */ free(sort_struct.stack); @@ -952,19 +966,21 @@ void space_gparts_sort_mapper(void *map_data, int num_elements, void *extra_data } } - /* Verify sort_struct. */ - /* for (int k = i; k <= jj; k++) +#ifdef SWIFT_DEBUG_CHECKS + /* Verify space_sort_struct. */ + for (int k = i; k <= jj; k++) if (ind[k] > pivot) { - message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i.", k, - ind[k], pivot, i, j); + message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li.", + k, ind[k], pivot, i, j); error("Partition failed (<=pivot)."); } for (int k = jj + 1; k <= j; k++) if (ind[k] <= pivot) { - message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i.", k, - ind[k], pivot, i, j); + message("sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%li, j=%li.", + k, ind[k], pivot, i, j); error("Partition failed (>pivot)."); - } */ + } +#endif /* Split-off largest interval. */ if (jj - i > j - jj + 1) { diff --git a/src/space.h b/src/space.h index d6802db02fccc398f15ad5637d24c5ab7140d147..f6a4c30250e8e737365a338530f003a9846f5cee 100644 --- a/src/space.h +++ b/src/space.h @@ -23,16 +23,18 @@ #ifndef SWIFT_SPACE_H #define SWIFT_SPACE_H -/* Includes. */ +/* Config parameters. */ +#include "../config.h" + +/* Some standard headers. */ #include <stddef.h> -/* Local includes. */ +/* Includes. */ #include "cell.h" +#include "lock.h" #include "parser.h" #include "part.h" - -/* Forward-declare the engine to avoid cyclic includes. */ -struct engine; +#include "space.h" /* Some constants. */ #define space_maxdepth 10 @@ -159,4 +161,5 @@ void space_split_mapper(void *map_data, int num_elements, void *extra_data); void space_do_parts_sort(); void space_do_gparts_sort(); void space_link_cleanup(struct space *s); + #endif /* SWIFT_SPACE_H */ diff --git a/src/task.c b/src/task.c index b93dfc78adeba446d0828202b92c5477a474611f..cb39501a96d22809051ba694285e694c92ff9562 100644 --- a/src/task.c +++ b/src/task.c @@ -112,13 +112,14 @@ void task_unlock(struct task *t) { /* Act based on task type. */ switch (t->type) { case task_type_self: + case task_type_sub_self: case task_type_sort: cell_unlocktree(t->ci); break; case task_type_pair: - case task_type_sub: + case task_type_sub_pair: cell_unlocktree(t->ci); - if (t->cj != NULL) cell_unlocktree(t->cj); + cell_unlocktree(t->cj); break; case task_type_grav_pp: case task_type_grav_mm: @@ -139,174 +140,74 @@ void task_unlock(struct task *t) { int task_lock(struct task *t) { - int type = t->type; + const int type = t->type; + const int subtype = t->subtype; struct cell *ci = t->ci, *cj = t->cj; +#ifdef WITH_MPI + int res = 0, err = 0; + MPI_Status stat; +#endif - /* Communication task? */ - if (type == task_type_recv || type == task_type_send) { + switch (type) { + /* Communication task? */ + case task_type_recv: + case task_type_send: #ifdef WITH_MPI - /* Check the status of the MPI request. */ - int res = 0, err = 0; - MPI_Status stat; - if ((err = MPI_Test(&t->req, &res, &stat)) != MPI_SUCCESS) { - char buff[MPI_MAX_ERROR_STRING]; - int len; - MPI_Error_string(err, buff, &len); - error("Failed to test request on send/recv task (tag=%i, %s).", t->flags, - buff); - } - return res; + /* Check the status of the MPI request. */ + if ((err = MPI_Test(&t->req, &res, &stat)) != MPI_SUCCESS) { + char buff[MPI_MAX_ERROR_STRING]; + int len; + MPI_Error_string(err, buff, &len); + error("Failed to test request on send/recv task (tag=%i, %s).", + t->flags, buff); + } + return res; #else - error("SWIFT was not compiled with MPI support."); + error("SWIFT was not compiled with MPI support."); #endif + break; - } + case task_type_sort: + if (cell_locktree(ci) != 0) return 0; + break; - /* Unary lock? */ - else if (type == task_type_self || type == task_type_sort || - (type == task_type_sub && cj == NULL)) { - if (cell_locktree(ci) != 0) return 0; - } + case task_type_self: + case task_type_sub_self: + if (subtype == task_subtype_grav) { + if (cell_glocktree(ci) != 0) return 0; + } else { + if (cell_locktree(ci) != 0) return 0; + } + break; - /* Otherwise, binary lock. */ - else if (type == task_type_pair || (type == task_type_sub && cj != NULL)) { - if (ci->hold || cj->hold) return 0; - if (cell_locktree(ci) != 0) return 0; - if (cell_locktree(cj) != 0) { - cell_unlocktree(ci); - return 0; - } - } + case task_type_pair: + case task_type_sub_pair: + if (subtype == task_subtype_grav) { + if (ci->ghold || cj->ghold) return 0; + if (cell_glocktree(ci) != 0) return 0; + if (cell_glocktree(cj) != 0) { + cell_gunlocktree(ci); + return 0; + } + } else { + if (ci->hold || cj->hold) return 0; + if (cell_locktree(ci) != 0) return 0; + if (cell_locktree(cj) != 0) { + cell_unlocktree(ci); + return 0; + } + } + break; - /* Gravity tasks? */ - else if (type == task_type_grav_mm || type == task_type_grav_pp || - type == task_type_grav_down) { - if (ci->ghold || (cj != NULL && cj->ghold)) return 0; - if (cell_glocktree(ci) != 0) return 0; - if (cj != NULL && cell_glocktree(cj) != 0) { - cell_gunlocktree(ci); - return 0; - } + default: + break; } /* If we made it this far, we've got a lock. */ return 1; } -/** - * @brief Remove all unlocks to tasks that are of the given type. - * - * @param t The #task. - * @param type The task type ID to remove. - */ - -void task_cleanunlock(struct task *t, int type) { - - int k; - - lock_lock(&t->lock); - - for (k = 0; k < t->nr_unlock_tasks; k++) - if (t->unlock_tasks[k]->type == type) { - t->nr_unlock_tasks -= 1; - t->unlock_tasks[k] = t->unlock_tasks[t->nr_unlock_tasks]; - } - - lock_unlock_blind(&t->lock); -} - -/** - * @brief Remove an unlock_task from the given task. - * - * @param ta The unlocking #task. - * @param tb The #task that will be unlocked. - */ - -void task_rmunlock(struct task *ta, struct task *tb) { - - int k; - - lock_lock(&ta->lock); - - for (k = 0; k < ta->nr_unlock_tasks; k++) - if (ta->unlock_tasks[k] == tb) { - ta->nr_unlock_tasks -= 1; - ta->unlock_tasks[k] = ta->unlock_tasks[ta->nr_unlock_tasks]; - lock_unlock_blind(&ta->lock); - return; - } - error("Task not found."); -} - -/** - * @brief Remove an unlock_task from the given task. - * - * @param ta The unlocking #task. - * @param tb The #task that will be unlocked. - * - * Differs from #task_rmunlock in that it will not fail if - * the task @c tb is not in the unlocks of @c ta. - */ - -void task_rmunlock_blind(struct task *ta, struct task *tb) { - - int k; - - lock_lock(&ta->lock); - - for (k = 0; k < ta->nr_unlock_tasks; k++) - if (ta->unlock_tasks[k] == tb) { - ta->nr_unlock_tasks -= 1; - ta->unlock_tasks[k] = ta->unlock_tasks[ta->nr_unlock_tasks]; - break; - } - - lock_unlock_blind(&ta->lock); -} - -/** - * @brief Add an unlock_task to the given task. - * - * @param ta The unlocking #task. - * @param tb The #task that will be unlocked. - */ - -void task_addunlock(struct task *ta, struct task *tb) { - - error("Use sched_addunlock instead."); - - /* Add the lock atomically. */ - ta->unlock_tasks[atomic_inc(&ta->nr_unlock_tasks)] = tb; - - /* Check a posteriori if we did not overshoot. */ - if (ta->nr_unlock_tasks > task_maxunlock) - error("Too many unlock_tasks in task."); -} - -void task_addunlock_old(struct task *ta, struct task *tb) { - - int k; - - lock_lock(&ta->lock); - - /* Check if ta already unlocks tb. */ - for (k = 0; k < ta->nr_unlock_tasks; k++) - if (ta->unlock_tasks[k] == tb) { - error("Duplicate unlock."); - lock_unlock_blind(&ta->lock); - return; - } - - if (ta->nr_unlock_tasks == task_maxunlock) - error("Too many unlock_tasks in task."); - - ta->unlock_tasks[ta->nr_unlock_tasks] = tb; - ta->nr_unlock_tasks += 1; - - lock_unlock_blind(&ta->lock); -} - /** * @brief Prints the list of tasks contained in a given mask * diff --git a/src/task.h b/src/task.h index a949e02fce1e9e0fa6b86f27e1c192a45b9073f3..db7305238dc85351f39fb127197b318d3b04c1ee 100644 --- a/src/task.h +++ b/src/task.h @@ -37,7 +37,8 @@ enum task_types { task_type_sort, task_type_self, task_type_pair, - task_type_sub, + task_type_sub_self, + task_type_sub_pair, task_type_init, task_type_ghost, task_type_drift, @@ -62,6 +63,7 @@ enum task_subtypes { task_subtype_density, task_subtype_force, task_subtype_grav, + task_subtype_tend, task_subtype_count }; @@ -75,10 +77,10 @@ struct task { char skip, tight, implicit; int flags, wait, rank, weight; - swift_lock_type lock; - struct cell *ci, *cj; + void *buff; + #ifdef WITH_MPI MPI_Request req; #endif @@ -91,10 +93,6 @@ struct task { }; /* Function prototypes. */ -void task_rmunlock(struct task *ta, struct task *tb); -void task_rmunlock_blind(struct task *ta, struct task *tb); -void task_cleanunlock(struct task *t, int type); -void task_addunlock(struct task *ta, struct task *tb); void task_unlock(struct task *t); float task_overlap(const struct task *ta, const struct task *tb); int task_lock(struct task *t); diff --git a/src/timers.h b/src/timers.h index 92b685ebe9b11d49c4703e5837d35cffdca81c4d..c48961b39737f23eb936d7283f76651d33892991 100644 --- a/src/timers.h +++ b/src/timers.h @@ -23,6 +23,7 @@ #define SWIFT_TIMERS_H /* Includes. */ +#include "atomic.h" #include "cycle.h" #include "inline.h" @@ -41,9 +42,12 @@ enum { timer_dopair_force, timer_dopair_grav, timer_dograv_external, - timer_dosub_density, - timer_dosub_force, - timer_dosub_grav, + timer_dosub_self_density, + timer_dosub_self_force, + timer_dosub_self_grav, + timer_dosub_pair_density, + timer_dosub_pair_force, + timer_dosub_pair_grav, timer_dopair_subset, timer_do_ghost, timer_dorecv_cell, @@ -71,7 +75,7 @@ extern ticks timers[timer_count]; #define TIMER_TOC2(t) timers_toc(t, tic2) INLINE static ticks timers_toc(int t, ticks tic) { ticks d = (getticks() - tic); - __sync_add_and_fetch(&timers[t], d); + atomic_add(&timers[t], d); return d; } #else diff --git a/src/tools.c b/src/tools.c index 0363100331ad5b298279e9ebadcf87eab5f9e896..710efd7e4dd54784ed735492b4e1af0b466ef682 100644 --- a/src/tools.c +++ b/src/tools.c @@ -19,16 +19,25 @@ * ******************************************************************************/ +/* Config parameters. */ +#include "../config.h" + +/* Some standard headers. */ #include <math.h> #include <stddef.h> #include <stdio.h> #include <stdlib.h> +/* This object's header. */ +#include "tools.h" + +/* Local includes. */ #include "cell.h" #include "error.h" +#include "gravity.h" +#include "hydro.h" #include "part.h" -#include "swift.h" -#include "tools.h" +#include "runner.h" /** * Factorize a given integer, attempts to keep larger pair of factors. @@ -56,9 +65,7 @@ void factor(int value, int *f1, int *f2) { * @param N The number of parts. * @param periodic Periodic boundary conditions flag. */ - -void pairs_n2(double *dim, struct part *__restrict__ parts, int N, - int periodic) { +void pairs_n2(double *dim, struct part *restrict parts, int N, int periodic) { int i, j, k, count = 0; // int mj, mk; // double maxratio = 1.0; @@ -121,8 +128,7 @@ void pairs_n2(double *dim, struct part *__restrict__ parts, int N, } void pairs_single_density(double *dim, long long int pid, - struct part *__restrict__ parts, int N, - int periodic) { + struct part *restrict parts, int N, int periodic) { int i, k; // int mj, mk; // double maxratio = 1.0; @@ -175,16 +181,19 @@ void pairs_single_density(double *dim, long long int pid, } void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) { + float r2, hi, hj, hig2, hjg2, dx[3]; struct part *pi, *pj; /* Implements a double-for loop and checks every interaction */ for (int i = 0; i < ci->count; ++i) { + pi = &ci->parts[i]; hi = pi->h; hig2 = hi * hi * kernel_gamma2; for (int j = 0; j < cj->count; ++j) { + pj = &cj->parts[j]; /* Pairwise distance */ @@ -196,6 +205,7 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) { /* Hit or miss? */ if (r2 < hig2) { + /* Interact */ runner_iact_nonsym_density(r2, dx, hi, pj->h, pi, pj); } @@ -204,11 +214,13 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) { /* Reverse double-for loop and checks every interaction */ for (int j = 0; j < cj->count; ++j) { + pj = &cj->parts[j]; hj = pj->h; hjg2 = hj * hj * kernel_gamma2; for (int i = 0; i < ci->count; ++i) { + pi = &ci->parts[i]; /* Pairwise distance */ @@ -220,6 +232,7 @@ void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj) { /* Hit or miss? */ if (r2 < hjg2) { + /* Interact */ runner_iact_nonsym_density(r2, dx, hj, pi->h, pj, pi); } @@ -233,11 +246,13 @@ void self_all_density(struct runner *r, struct cell *ci) { /* Implements a double-for loop and checks every interaction */ for (int i = 0; i < ci->count; ++i) { + pi = &ci->parts[i]; hi = pi->h; hig2 = hi * hi * kernel_gamma2; for (int j = i + 1; j < ci->count; ++j) { + pj = &ci->parts[j]; hj = pj->h; hjg2 = hj * hj * kernel_gamma2; @@ -253,12 +268,14 @@ void self_all_density(struct runner *r, struct cell *ci) { /* Hit or miss? */ if (r2 < hig2) { + /* Interact */ runner_iact_nonsym_density(r2, dxi, hi, hj, pi, pj); } /* Hit or miss? */ if (r2 < hjg2) { + dxi[0] = -dxi[0]; dxi[1] = -dxi[1]; dxi[2] = -dxi[2]; @@ -271,7 +288,9 @@ void self_all_density(struct runner *r, struct cell *ci) { } void pairs_single_grav(double *dim, long long int pid, - struct gpart *__restrict__ parts, int N, int periodic) { + struct gpart *restrict gparts, const struct part *parts, + int N, int periodic) { + int i, k; // int mj, mk; // double maxratio = 1.0; @@ -282,18 +301,20 @@ void pairs_single_grav(double *dim, long long int pid, /* Find "our" part. */ for (k = 0; k < N; k++) - if ((parts[k].id > 0 && parts[k].part->id == pid) || parts[k].id == -pid) + if ((gparts[k].id_or_neg_offset < 0 && + parts[-gparts[k].id_or_neg_offset].id == pid) || + gparts[k].id_or_neg_offset == pid) break; if (k == N) error("Part not found."); - pi = parts[k]; + pi = gparts[k]; pi.a_grav[0] = 0.0f; pi.a_grav[1] = 0.0f; pi.a_grav[2] = 0.0f; /* Loop over all particle pairs. */ for (k = 0; k < N; k++) { - if (parts[k].id == pi.id) continue; - pj = parts[k]; + if (gparts[k].id_or_neg_offset == pi.id_or_neg_offset) continue; + pj = gparts[k]; for (i = 0; i < 3; i++) { dx[i] = pi.x[i] - pj.x[i]; if (periodic) { @@ -320,7 +341,8 @@ void pairs_single_grav(double *dim, long long int pid, /* Dump the result. */ message( "acceleration on gpart %lli is a=[ %e %e %e ], |a|=[ %.2e %.2e %.2e ].\n", - pi.part->id, a[0], a[1], a[2], aabs[0], aabs[1], aabs[2]); + parts[-pi.id_or_neg_offset].id, a[0], a[1], a[2], aabs[0], aabs[1], + aabs[2]); } /** @@ -328,8 +350,8 @@ void pairs_single_grav(double *dim, long long int pid, * * @param N number of intervals in [0,1]. */ - void density_dump(int N) { + int k; float r2[4] = {0.0f, 0.0f, 0.0f, 0.0f}, hi[4], hj[4]; struct part /**pi[4], *pj[4],*/ Pi[4], Pj[4]; @@ -363,10 +385,8 @@ void density_dump(int N) { /** * @brief Compute the force on a single particle brute-force. */ - void engine_single_density(double *dim, long long int pid, - struct part *__restrict__ parts, int N, - int periodic) { + struct part *restrict parts, int N, int periodic) { int i, k; double r2, dx[3]; float fdx[3], ih; @@ -412,7 +432,7 @@ void engine_single_density(double *dim, long long int pid, } void engine_single_force(double *dim, long long int pid, - struct part *__restrict__ parts, int N, int periodic) { + struct part *restrict parts, int N, int periodic) { int i, k; double r2, dx[3]; float fdx[3]; diff --git a/src/tools.h b/src/tools.h index 5f9f41d033ab03983bde3bb37e87f8a39d2deecd..97f036994bc0d1e2b4a95d806cfdbd253664a260 100644 --- a/src/tools.h +++ b/src/tools.h @@ -22,21 +22,26 @@ #ifndef SWIFT_TOOL_H #define SWIFT_TOOL_H +/* Config parameters. */ +#include "../config.h" + +/* Includes. */ #include "cell.h" +#include "part.h" #include "runner.h" void factor(int value, int *f1, int *f2); void density_dump(int N); void pairs_single_grav(double *dim, long long int pid, - struct gpart *__restrict__ parts, int N, int periodic); + struct gpart *restrict gparts, const struct part *parts, + int N, int periodic); void pairs_single_density(double *dim, long long int pid, - struct part *__restrict__ parts, int N, int periodic); + struct part *restrict parts, int N, int periodic); void pairs_all_density(struct runner *r, struct cell *ci, struct cell *cj); void self_all_density(struct runner *r, struct cell *ci); -void pairs_n2(double *dim, struct part *__restrict__ parts, int N, - int periodic); +void pairs_n2(double *dim, struct part *restrict parts, int N, int periodic); double random_uniform(double a, double b); void shuffle_particles(struct part *parts, const int count);