Commit fb28100d authored by Peter W. Draper's avatar Peter W. Draper
Browse files

Merge remote-tracking branch 'origin/master' into repart-by-ticks-with-means

parents 4adcd862 ec378b32
...@@ -7,8 +7,8 @@ General information for adding new schemes ...@@ -7,8 +7,8 @@ General information for adding new schemes
========================================== ==========================================
The following steps are required for any new options (such as new The following steps are required for any new options (such as new
:ref:`hydro`, :ref:`chemistry`, :ref:`cooling`, :ref:`hydro`, chemistry, cooling,
:ref:`equation_of_state`, :ref:`stars` or :ref:`gravity`) :ref:`equation_of_state`, stars, or gravity)
In order to add a new scheme, you will need to: In order to add a new scheme, you will need to:
......
This diff is collapsed.
This diff is collapsed.
...@@ -50,8 +50,10 @@ HDF5 library, not a parallel build. ...@@ -50,8 +50,10 @@ HDF5 library, not a parallel build.
Compiling SWIFT Compiling SWIFT
--------------- ---------------
The next part is compiling SWIFT with VELOCIraptor and assumes you already The next part is compiling SWIFT with VELOCIraptor and assumes you already
downloaded SWIFT from the GitLab_, this can be done by running:: downloaded SWIFT from the GitLab_, this can be done by running
.. code:: bash
./autogen.sh ./autogen.sh
./configure --with-velociraptor=/path/to/VELOCIraptor-STF/src ./configure --with-velociraptor=/path/to/VELOCIraptor-STF/src
make make
...@@ -60,16 +62,16 @@ In which ``./autogen.sh`` only needs to be run once after the code is cloned ...@@ -60,16 +62,16 @@ In which ``./autogen.sh`` only needs to be run once after the code is cloned
from the GitLab_, and ``/path/to/`` is the path to the ``VELOCIraptor-STF`` from the GitLab_, and ``/path/to/`` is the path to the ``VELOCIraptor-STF``
directory on your machine. In general ``./configure`` can be run with other directory on your machine. In general ``./configure`` can be run with other
options as desired. After this we can run SWIFT with VELOCIraptor, but for this options as desired. After this we can run SWIFT with VELOCIraptor, but for this
we first need to add several lines to the yaml file of our simulation:: we first need to add several lines to the yaml file of our simulation
#structure finding options .. code:: YAML
StructureFinding:
config_file_name: stf_input_6dfof_dmonly_sub.cfg StructureFinding:
basename: ./stf config_file_name: stf_input_6dfof_dmonly_sub.cfg
output_time_format: 1 basename: ./stf
scale_factor_first: 0.02 scale_factor_first: 0.02
delta_time: 1.02 delta_time: 1.02
In which we specify the ``.cfg`` file that is used by VELOCIraptor and the In which we specify the ``.cfg`` file that is used by VELOCIraptor and the
other parameters which SWIFT needs to use. In the case of other parameters which SWIFT needs to use. In the case of
......
...@@ -87,7 +87,7 @@ html_theme = 'sphinx_rtd_theme' ...@@ -87,7 +87,7 @@ html_theme = 'sphinx_rtd_theme'
# Add any paths that contain custom static files (such as style sheets) here, # Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files, # relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css". # so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['.static'] # html_static_path = ['.static']
# Custom sidebar templates, must be a dictionary that maps document names # Custom sidebar templates, must be a dictionary that maps document names
# to template names. # to template names.
......
...@@ -10,10 +10,8 @@ InternalUnitSystem: ...@@ -10,10 +10,8 @@ InternalUnitSystem:
StructureFinding: StructureFinding:
config_file_name: stf_input.cfg # Name of the STF config file. config_file_name: stf_input.cfg # Name of the STF config file.
basename: ./stf # Common part of the name of output files. basename: ./stf # Common part of the name of output files.
output_time_format: 0 # Specifies the frequency format of structure finding. 0 for simulation steps (delta_step) and 1 for simulation time intervals (delta_time).
scale_factor_first: 0.92 # Scale-factor of the first snapshot (cosmological run) scale_factor_first: 0.92 # Scale-factor of the first snapshot (cosmological run)
time_first: 0.01 # Time of the first structure finding output (in internal units). time_first: 0.01 # Time of the first structure finding output (in internal units).
delta_step: 1000 # Time difference between consecutive structure finding outputs (in internal units) in simulation steps.
delta_time: 1.10 # Time difference between consecutive structure finding outputs (in internal units) in simulation time intervals. delta_time: 1.10 # Time difference between consecutive structure finding outputs (in internal units) in simulation time intervals.
# Cosmological parameters # Cosmological parameters
......
...@@ -10,10 +10,8 @@ InternalUnitSystem: ...@@ -10,10 +10,8 @@ InternalUnitSystem:
StructureFinding: StructureFinding:
config_file_name: stf_input.cfg # Name of the STF config file. config_file_name: stf_input.cfg # Name of the STF config file.
basename: ./stf # Common part of the name of output files. basename: ./stf # Common part of the name of output files.
output_time_format: 0 # Specifies the frequency format of structure finding. 0 for simulation steps (delta_step) and 1 for simulation time intervals (delta_time).
scale_factor_first: 0.92 # Scale-factor of the first snapshot (cosmological run) scale_factor_first: 0.92 # Scale-factor of the first snapshot (cosmological run)
time_first: 0.01 # Time of the first structure finding output (in internal units). time_first: 0.01 # Time of the first structure finding output (in internal units).
delta_step: 1000 # Time difference between consecutive structure finding outputs (in internal units) in simulation steps.
delta_time: 1.10 # Time difference between consecutive structure finding outputs (in internal units) in simulation time intervals. delta_time: 1.10 # Time difference between consecutive structure finding outputs (in internal units) in simulation time intervals.
# Cosmological parameters # Cosmological parameters
......
...@@ -10,10 +10,8 @@ InternalUnitSystem: ...@@ -10,10 +10,8 @@ InternalUnitSystem:
StructureFinding: StructureFinding:
config_file_name: stf_input.cfg # Name of the STF config file. config_file_name: stf_input.cfg # Name of the STF config file.
basename: ./stf # Common part of the name of output files. basename: ./stf # Common part of the name of output files.
output_time_format: 0 # Specifies the frequency format of structure finding. 0 for simulation steps (delta_step) and 1 for simulation time intervals (delta_time).
scale_factor_first: 0.92 # Scale-factor of the first snapshot (cosmological run) scale_factor_first: 0.92 # Scale-factor of the first snapshot (cosmological run)
time_first: 0.01 # Time of the first structure finding output (in internal units). time_first: 0.01 # Time of the first structure finding output (in internal units).
delta_step: 1000 # Time difference between consecutive structure finding outputs (in internal units) in simulation steps.
delta_time: 1.10 # Time difference between consecutive structure finding outputs (in internal units) in simulation time intervals. delta_time: 1.10 # Time difference between consecutive structure finding outputs (in internal units) in simulation time intervals.
# Cosmological parameters # Cosmological parameters
......
...@@ -10,7 +10,6 @@ InternalUnitSystem: ...@@ -10,7 +10,6 @@ InternalUnitSystem:
StructureFinding: StructureFinding:
config_file_name: stf_input_6dfof_dmonly_sub.cfg config_file_name: stf_input_6dfof_dmonly_sub.cfg
basename: ./stf basename: ./stf
output_time_format: 1
scale_factor_first: 0.02 scale_factor_first: 0.02
delta_time: 1.02 delta_time: 1.02
......
...@@ -37,8 +37,9 @@ SPH: ...@@ -37,8 +37,9 @@ SPH:
# Parameters governing the snapshots # Parameters governing the snapshots
Snapshots: Snapshots:
basename: snap basename: snap
delta_time: 1.02 delta_time: 1.05
scale_factor_first: 0.02 scale_factor_first: 0.02
invoke_stf: 1
# Parameters governing the conserved quantities statistics # Parameters governing the conserved quantities statistics
Statistics: Statistics:
...@@ -52,16 +53,16 @@ Scheduler: ...@@ -52,16 +53,16 @@ Scheduler:
# Parameters related to the initial conditions # Parameters related to the initial conditions
InitialConditions: InitialConditions:
file_name: small_cosmo_volume.hdf5 file_name: small_cosmo_volume.hdf5
periodic: 1
cleanup_h_factors: 1 cleanup_h_factors: 1
cleanup_velocity_factors: 1 cleanup_velocity_factors: 1
generate_gas_in_ics: 1 # Generate gas particles from the DM-only ICs generate_gas_in_ics: 1 # Generate gas particles from the DM-only ICs
cleanup_smoothing_lengths: 1 # Since we generate gas, make use of the (expensive) cleaning-up procedure. cleanup_smoothing_lengths: 1 # Since we generate gas, make use of the (expensive) cleaning-up procedure.
# Structure finding options (requires velociraptor) # Structure finding options (requires velociraptor)
StructureFinding: StructureFinding:
config_file_name: stfconfig_input.cfg config_file_name: stfconfig_input.cfg
basename: ./stf basename: ./stf
output_time_format: 1
scale_factor_first: 0.02 scale_factor_first: 0.02
delta_time: 1.02 delta_time: 1.02
...@@ -923,6 +923,10 @@ int main(int argc, char *argv[]) { ...@@ -923,6 +923,10 @@ int main(int argc, char *argv[]) {
fflush(stdout); fflush(stdout);
} }
#ifdef HAVE_VELOCIRAPTOR
if (with_structure_finding) velociraptor_init(&e);
#endif
/* Get some info to the user. */ /* Get some info to the user. */
if (myrank == 0) { if (myrank == 0) {
long long N_DM = N_total[1] - N_total[2] - N_total[0]; long long N_DM = N_total[1] - N_total[2] - N_total[0];
...@@ -1123,14 +1127,6 @@ int main(int argc, char *argv[]) { ...@@ -1123,14 +1127,6 @@ int main(int argc, char *argv[]) {
#endif #endif
// write a final snapshot with logger, in order to facilitate a restart // write a final snapshot with logger, in order to facilitate a restart
engine_dump_snapshot(&e); engine_dump_snapshot(&e);
#ifdef HAVE_VELOCIRAPTOR
/* Call VELOCIraptor at the end of the run to find groups. */
if (e.policy & engine_policy_structure_finding) {
velociraptor_init(&e);
velociraptor_invoke(&e);
}
#endif
} }
#ifdef WITH_MPI #ifdef WITH_MPI
......
...@@ -85,6 +85,7 @@ Snapshots: ...@@ -85,6 +85,7 @@ Snapshots:
scale_factor_first: 0.1 # (Optional) Scale-factor of the first snapshot if cosmological time-integration. scale_factor_first: 0.1 # (Optional) Scale-factor of the first snapshot if cosmological time-integration.
time_first: 0. # (Optional) Time of the first output if non-cosmological time-integration (in internal units) time_first: 0. # (Optional) Time of the first output if non-cosmological time-integration (in internal units)
delta_time: 0.01 # Time difference between consecutive outputs (in internal units) delta_time: 0.01 # Time difference between consecutive outputs (in internal units)
invoke_stf: 0 # (Optional) Call VELOCIraptor every time a snapshot is written irrespective of the VELOCIraptor output strategy.
compression: 0 # (Optional) Set the level of compression of the HDF5 datasets [0-9]. 0 does no compression. compression: 0 # (Optional) Set the level of compression of the HDF5 datasets [0-9]. 0 does no compression.
int_time_label_on: 0 # (Optional) Enable to label the snapshots using the time rounded to an integer (in internal units) int_time_label_on: 0 # (Optional) Enable to label the snapshots using the time rounded to an integer (in internal units)
UnitMass_in_cgs: 1 # (Optional) Unit system for the outputs (Grams) UnitMass_in_cgs: 1 # (Optional) Unit system for the outputs (Grams)
...@@ -158,6 +159,16 @@ DomainDecomposition: ...@@ -158,6 +159,16 @@ DomainDecomposition:
# task weights in first repartition, if 0 only use task timings, if > 1 only use # task weights in first repartition, if 0 only use task timings, if > 1 only use
# fixed costs, unless none are available. # fixed costs, unless none are available.
# Structure finding options (requires velociraptor)
StructureFinding:
config_file_name: stf_input.cfg # Name of the STF config file.
basename: ./stf # Common part of the name of output files.
scale_factor_first: 0.92 # (Optional) Scale-factor of the first snapshot (cosmological run)
time_first: 0.01 # (Optional) Time of the first structure finding output (in internal units).
delta_time: 1.10 # (Optional) Time difference between consecutive structure finding outputs (in internal units) in simulation time intervals.
output_list_on: 0 # (Optional) Enable the output list
output_list: stflist.txt # (Optional) File containing the output times (see documentation in "Parameter File" section)
# Parameters related to the equation of state ------------------------------------------ # Parameters related to the equation of state ------------------------------------------
EoS: EoS:
...@@ -288,15 +299,3 @@ EAGLEChemistry: ...@@ -288,15 +299,3 @@ EAGLEChemistry:
init_abundance_Magnesium: 0.000 # Initial fraction of particle mass in Magnesium init_abundance_Magnesium: 0.000 # Initial fraction of particle mass in Magnesium
init_abundance_Silicon: 0.000 # Initial fraction of particle mass in Silicon init_abundance_Silicon: 0.000 # Initial fraction of particle mass in Silicon
init_abundance_Iron: 0.000 # Initial fraction of particle mass in Iron init_abundance_Iron: 0.000 # Initial fraction of particle mass in Iron
# Structure finding options (requires velociraptor)
StructureFinding:
config_file_name: stf_input.cfg # Name of the STF config file.
basename: ./stf # Common part of the name of output files.
output_time_format: 0 # Specifies the frequency format of structure finding. 0 for simulation steps (delta_step) and 1 for simulation time intervals (delta_time).
scale_factor_first: 0.92 # Scale-factor of the first snapshot (cosmological run)
time_first: 0.01 # Time of the first structure finding output (in internal units).
delta_step: 1000 # Time difference between consecutive structure finding outputs (in internal units) in simulation steps.
delta_time: 1.10 # Time difference between consecutive structure finding outputs (in internal units) in simulation time intervals.
output_list_on: 0 # (Optional) Enable the output list
output_list: stflist.txt # (Optional) File containing the output times (see documentation in "Parameter File" section)
...@@ -49,7 +49,7 @@ include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \ ...@@ -49,7 +49,7 @@ include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \
gravity_softened_derivatives.h vector_power.h collectgroup.h hydro_space.h sort_part.h \ gravity_softened_derivatives.h vector_power.h collectgroup.h hydro_space.h sort_part.h \
chemistry.h chemistry_io.h chemistry_struct.h cosmology.h restart.h space_getsid.h utilities.h \ chemistry.h chemistry_io.h chemistry_struct.h cosmology.h restart.h space_getsid.h utilities.h \
mesh_gravity.h cbrt.h exp10.h velociraptor_interface.h swift_velociraptor_part.h outputlist.h \ mesh_gravity.h cbrt.h exp10.h velociraptor_interface.h swift_velociraptor_part.h outputlist.h \
logger_io.h tracers_io.h tracers.h tracers_struct.h logger_io.h tracers_io.h tracers.h tracers_struct.h velociraptor_struct.h velociraptor_io.h
# source files for EAGLE cooling # source files for EAGLE cooling
EAGLE_COOLING_SOURCES = EAGLE_COOLING_SOURCES =
......
...@@ -179,8 +179,9 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c, ...@@ -179,8 +179,9 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
* *
* @param ci The #cell. * @param ci The #cell.
* @param ci_cache The cache. * @param ci_cache The cache.
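* @return The number of cache entries filled: the cell's particle count (inhibited particles included, as padded entries) rounded up to a multiple of NUM_VEC_PROC * VEC_SIZE.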
*/ */
__attribute__((always_inline)) INLINE void cache_read_particles( __attribute__((always_inline)) INLINE int cache_read_particles(
const struct cell *restrict const ci, const struct cell *restrict const ci,
struct cache *restrict const ci_cache) { struct cache *restrict const ci_cache) {
...@@ -197,12 +198,29 @@ __attribute__((always_inline)) INLINE void cache_read_particles( ...@@ -197,12 +198,29 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT); swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
const int count = ci->hydro.count;
const struct part *restrict parts = ci->hydro.parts; const struct part *restrict parts = ci->hydro.parts;
const double loc[3] = {ci->loc[0], ci->loc[1], ci->loc[2]}; const double loc[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};
const double max_dx = ci->hydro.dx_max_part;
const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
-(2. * ci->width[1] + max_dx),
-(2. * ci->width[2] + max_dx)};
const float h_padded = ci->hydro.h_max / 4.;
/* Shift the particles positions to a local frame so single precision can be /* Shift the particles positions to a local frame so single precision can be
* used instead of double precision. */ * used instead of double precision. */
for (int i = 0; i < ci->hydro.count; i++) { for (int i = 0; i < count; i++) {
/* Pad inhibited particles. */
if (parts[i].time_bin >= time_bin_inhibited) {
x[i] = pos_padded[0];
y[i] = pos_padded[1];
z[i] = pos_padded[2];
h[i] = h_padded;
continue;
}
x[i] = (float)(parts[i].x[0] - loc[0]); x[i] = (float)(parts[i].x[0] - loc[0]);
y[i] = (float)(parts[i].x[1] - loc[1]); y[i] = (float)(parts[i].x[1] - loc[1]);
z[i] = (float)(parts[i].x[2] - loc[2]); z[i] = (float)(parts[i].x[2] - loc[2]);
...@@ -213,6 +231,26 @@ __attribute__((always_inline)) INLINE void cache_read_particles( ...@@ -213,6 +231,26 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
vz[i] = parts[i].v[2]; vz[i] = parts[i].v[2];
} }
/* Pad cache if the no. of particles is not a multiple of double the vector
* length. */
int count_align = count;
const int rem = count % (NUM_VEC_PROC * VEC_SIZE);
if (rem != 0) {
count_align += (NUM_VEC_PROC * VEC_SIZE) - rem;
/* Set positions to something outside of the range of any particle */
for (int i = count; i < count_align; i++) {
x[i] = pos_padded[0];
y[i] = pos_padded[1];
z[i] = pos_padded[2];
}
}
return count_align;
#else
error("Can't call the cache reading function with this flavour of SPH!");
return 0;
#endif #endif
} }
...@@ -261,10 +299,32 @@ __attribute__((always_inline)) INLINE void cache_read_particles_subset( ...@@ -261,10 +299,32 @@ __attribute__((always_inline)) INLINE void cache_read_particles_subset(
if (*last_pi + pad < ci->hydro.count) *last_pi += pad; if (*last_pi + pad < ci->hydro.count) *last_pi += pad;
} }
const double max_dx = ci->hydro.dx_max_part;
const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
-(2. * ci->width[1] + max_dx),
-(2. * ci->width[2] + max_dx)};
const float h_padded = ci->hydro.h_max / 4.;
/* Shift the particles positions to a local frame so single precision can be /* Shift the particles positions to a local frame so single precision can be
* used instead of double precision. */ * used instead of double precision. */
for (int i = 0; i < *last_pi; i++) { for (int i = 0; i < *last_pi; i++) {
const int idx = sort_i[i].i; const int idx = sort_i[i].i;
/* Put inhibited particles out of range. */
if (parts[idx].time_bin >= time_bin_inhibited) {
x[i] = pos_padded[0];
y[i] = pos_padded[1];
z[i] = pos_padded[2];
h[i] = h_padded;
m[i] = 1.f;
vx[i] = 1.f;
vy[i] = 1.f;
vz[i] = 1.f;
continue;
}
x[i] = (float)(parts[idx].x[0] - loc[0]); x[i] = (float)(parts[idx].x[0] - loc[0]);
y[i] = (float)(parts[idx].x[1] - loc[1]); y[i] = (float)(parts[idx].x[1] - loc[1]);
z[i] = (float)(parts[idx].x[2] - loc[2]); z[i] = (float)(parts[idx].x[2] - loc[2]);
...@@ -278,12 +338,6 @@ __attribute__((always_inline)) INLINE void cache_read_particles_subset( ...@@ -278,12 +338,6 @@ __attribute__((always_inline)) INLINE void cache_read_particles_subset(
/* Pad cache with fake particles that exist outside the cell so will not /* Pad cache with fake particles that exist outside the cell so will not
* interact. We use values of the same magnitude (but negative!) as the real * interact. We use values of the same magnitude (but negative!) as the real
* particles to avoid overflow problems. */ * particles to avoid overflow problems. */
const double max_dx = ci->hydro.dx_max_part;
const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
-(2. * ci->width[1] + max_dx),
-(2. * ci->width[2] + max_dx)};
const float h_padded = ci->hydro.parts[0].h;
for (int i = *last_pi; i < *last_pi + VEC_SIZE; i++) { for (int i = *last_pi; i < *last_pi + VEC_SIZE; i++) {
x[i] = pos_padded[0]; x[i] = pos_padded[0];
y[i] = pos_padded[1]; y[i] = pos_padded[1];
...@@ -308,11 +362,32 @@ __attribute__((always_inline)) INLINE void cache_read_particles_subset( ...@@ -308,11 +362,32 @@ __attribute__((always_inline)) INLINE void cache_read_particles_subset(
} }
const int ci_cache_count = ci->hydro.count - *first_pi; const int ci_cache_count = ci->hydro.count - *first_pi;
const double max_dx = ci->hydro.dx_max_part;
const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
-(2. * ci->width[1] + max_dx),
-(2. * ci->width[2] + max_dx)};
const float h_padded = ci->hydro.h_max / 4.;
/* Shift the particles positions to a local frame so single precision can be /* Shift the particles positions to a local frame so single precision can be
* used instead of double precision. */ * used instead of double precision. */
for (int i = 0; i < ci_cache_count; i++) { for (int i = 0; i < ci_cache_count; i++) {
const int idx = sort_i[i + *first_pi].i; const int idx = sort_i[i + *first_pi].i;
/* Put inhibited particles out of range. */
if (parts[idx].time_bin >= time_bin_inhibited) {
x[i] = pos_padded[0];
y[i] = pos_padded[1];
z[i] = pos_padded[2];
h[i] = h_padded;
m[i] = 1.f;
vx[i] = 1.f;
vy[i] = 1.f;
vz[i] = 1.f;
continue;
}
x[i] = (float)(parts[idx].x[0] - loc[0]); x[i] = (float)(parts[idx].x[0] - loc[0]);
y[i] = (float)(parts[idx].x[1] - loc[1]); y[i] = (float)(parts[idx].x[1] - loc[1]);
z[i] = (float)(parts[idx].x[2] - loc[2]); z[i] = (float)(parts[idx].x[2] - loc[2]);
...@@ -326,12 +401,6 @@ __attribute__((always_inline)) INLINE void cache_read_particles_subset( ...@@ -326,12 +401,6 @@ __attribute__((always_inline)) INLINE void cache_read_particles_subset(
/* Pad cache with fake particles that exist outside the cell so will not /* Pad cache with fake particles that exist outside the cell so will not
* interact. We use values of the same magnitude (but negative!) as the real * interact. We use values of the same magnitude (but negative!) as the real
* particles to avoid overflow problems. */ * particles to avoid overflow problems. */
const double max_dx = ci->hydro.dx_max_part;
const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
-(2. * ci->width[1] + max_dx),
-(2. * ci->width[2] + max_dx)};
const float h_padded = ci->hydro.parts[0].h;
for (int i = ci->hydro.count - *first_pi; for (int i = ci->hydro.count - *first_pi;
i < ci->hydro.count - *first_pi + VEC_SIZE; i++) { i < ci->hydro.count - *first_pi + VEC_SIZE; i++) {
x[i] = pos_padded[0]; x[i] = pos_padded[0];
...@@ -355,8 +424,9 @@ __attribute__((always_inline)) INLINE void cache_read_particles_subset( ...@@ -355,8 +424,9 @@ __attribute__((always_inline)) INLINE void cache_read_particles_subset(
* *
* @param ci The #cell. * @param ci The #cell.
* @param ci_cache The cache. * @param ci_cache The cache.
* @return The number of cache entries filled: the cell's particle count (inhibited particles included, as padded entries) rounded up to a multiple of VEC_SIZE.
*/ */
__attribute__((always_inline)) INLINE void cache_read_force_particles( __attribute__((always_inline)) INLINE int cache_read_force_particles(
const struct cell *restrict const ci, const struct cell *restrict const ci,
struct cache *restrict const ci_cache) { struct cache *restrict const ci_cache) {
...@@ -382,12 +452,34 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles( ...@@ -382,12 +452,34 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles(
swift_declare_aligned_ptr(float, soundspeed, ci_cache->soundspeed, swift_declare_aligned_ptr(float, soundspeed, ci_cache->soundspeed,
SWIFT_CACHE_ALIGNMENT); SWIFT_CACHE_ALIGNMENT);
const int count = ci->hydro.count;
const struct part *restrict parts = ci->hydro.parts; const struct part *restrict parts = ci->hydro.parts;
const double loc[3] = {ci->loc[0], ci->loc[1], ci->loc[2]}; const double loc[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};
const double max_dx = ci->hydro.dx_max_part;
const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
-(2. * ci->width[1] + max_dx),
-(2. * ci->width[2] + max_dx)};
const float h_padded = ci->hydro.h_max / 4.;
/* Shift the particles positions to a local frame so single precision can be /* Shift the particles positions to a local frame so single precision can be
* used instead of double precision. */ * used instead of double precision. */
for (int i = 0; i < ci->hydro.count; i++) { for (int i = 0; i < count; i++) {
/* Skip inhibited particles. */
if (parts[i].time_bin >= time_bin_inhibited) {
x[i] = pos_padded[0];
y[i] = pos_padded[1];
z[i] = pos_padded[2];
h[i] = h_padded;
rho[i] = 1.f;
grad_h[i] = 1.f;
pOrho2[i] = 1.f;
balsara[i] = 1.f;
soundspeed[i] = 1.f;
continue;
}
x[i] = (float)(parts[i].x[0] - loc[0]); x[i] = (float)(parts[i].x[0] - loc[0]);
y[i] = (float)(parts[i].x[1] - loc[1]); y[i] = (float)(parts[i].x[1] - loc[1]);
z[i] = (float)(parts[i].x[2] - loc[2]); z[i] = (float)(parts[i].x[2] - loc[2]);
...@@ -403,6 +495,32 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles( ...@@ -403,6 +495,32 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles(
soundspeed[i] = parts[i].force.soundspeed; soundspeed[i] = parts[i].force.soundspeed;
} }
/* Pad cache if there is a serial remainder. */
int count_align = count;
const int rem = count % VEC_SIZE;
if (rem != 0) {
count_align += VEC_SIZE - rem;
/* Place the padding entries outside the range of any particle and give
* them harmless non-zero properties so they never contribute. */
for (int i = count; i < count_align; i++) {
x[i] = pos_padded[0];
y[i] = pos_padded[1];
z[i] = pos_padded[2];
h[i] = h_padded;
rho[i] = 1.f;
grad_h[i] = 1.f;
pOrho2[i] = 1.f;
balsara[i] = 1.f;
soundspeed[i] = 1.f;
}
}
return count_align;
#else
error("Can't call the cache reading function with this flavour of SPH!");
return 0;
#endif #endif
} }
...@@ -472,11 +590,32 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( ...@@ -472,11 +590,32 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); <