Commit 699d0b20 authored by Peter W. Draper's avatar Peter W. Draper
Browse files

Merge remote-tracking branch 'origin/master' into cells-per-task

parents 92b0509c 06771ad2
......@@ -34,7 +34,7 @@ examples/*/*/*.txt
examples/*/*/used_parameters.yml
examples/*/gravity_checks_*.dat
tests/testPair
tests/testActivePair
tests/brute_force_periodic_BC_standard.dat
tests/swift_periodic_BC_standard.dat
tests/brute_force_periodic_BC_pertrubed.dat
......@@ -54,6 +54,11 @@ tests/brute_force_125_standard.dat
tests/swift_dopair_125_standard.dat
tests/brute_force_125_perturbed.dat
tests/swift_dopair_125_perturbed.dat
tests/brute_force_active.dat
tests/brute_force_periodic_BC_perturbed.dat
tests/swift_dopair_active.dat
tests/test_nonsym_density_serial.dat
tests/test_nonsym_density_vec.dat
tests/testGreetings
tests/testReading
tests/input.hdf5
......@@ -75,8 +80,6 @@ tests/test27cells.sh
tests/test27cellsPerturbed.sh
tests/test125cells.sh
tests/test125cellsPerturbed.sh
tests/testPair.sh
tests/testPairPerturbed.sh
tests/testParser.sh
tests/testReading.sh
tests/testAdiabaticIndex
......
......@@ -861,14 +861,14 @@ AM_CONDITIONAL([HAVE_DOXYGEN], [test "$ac_cv_path_ac_pt_DX_DOXYGEN" != ""])
# Handle .in files.
AC_CONFIG_FILES([Makefile src/Makefile examples/Makefile doc/Makefile doc/Doxyfile tests/Makefile])
AC_CONFIG_FILES([tests/testReading.sh], [chmod +x tests/testReading.sh])
AC_CONFIG_FILES([tests/testPair.sh], [chmod +x tests/testPair.sh])
AC_CONFIG_FILES([tests/testPairPerturbed.sh], [chmod +x tests/testPairPerturbed.sh])
AC_CONFIG_FILES([tests/testActivePair.sh], [chmod +x tests/testActivePair.sh])
AC_CONFIG_FILES([tests/test27cells.sh], [chmod +x tests/test27cells.sh])
AC_CONFIG_FILES([tests/test27cellsPerturbed.sh], [chmod +x tests/test27cellsPerturbed.sh])
AC_CONFIG_FILES([tests/test125cells.sh], [chmod +x tests/test125cells.sh])
AC_CONFIG_FILES([tests/test125cellsPerturbed.sh], [chmod +x tests/test125cellsPerturbed.sh])
AC_CONFIG_FILES([tests/testPeriodicBC.sh], [chmod +x tests/testPeriodicBC.sh])
AC_CONFIG_FILES([tests/testPeriodicBCPerturbed.sh], [chmod +x tests/testPeriodicBCPerturbed.sh])
AC_CONFIG_FILES([tests/testInteractions.sh], [chmod +x tests/testInteractions.sh])
AC_CONFIG_FILES([tests/testParser.sh], [chmod +x tests/testParser.sh])
# Save the compilation options
......
......@@ -26,7 +26,7 @@ Statistics:
# Parameters for the self-gravity scheme
Gravity:
eta: 0.025 # Constant dimensionless multiplier for time integration.
epsilon: 0.0001 # Softening length (in internal units).
epsilon: 0.001 # Softening length (in internal units).
theta: 0.7 # Opening angle (Multipole acceptance criterion)
# Parameters for the hydrodynamics scheme
......
......@@ -12,6 +12,9 @@ TimeIntegration:
time_end: 1e-2 # The end time of the simulation (in internal units).
dt_min: 1e-10 # The minimal time-step size of the simulation (in internal units).
dt_max: 1e-4 # The maximal time-step size of the simulation (in internal units).
Scheduler:
cell_split_size: 64
# Parameters governing the snapshots
Snapshots:
......
......@@ -9,7 +9,7 @@ InternalUnitSystem:
# Parameters governing the time integration
TimeIntegration:
time_begin: 0. # The starting time of the simulation (in internal units).
time_end: 10. # The end time of the simulation (in internal units).
time_end: 1000. # The end time of the simulation (in internal units).
dt_min: 1e-6 # The minimal time-step size of the simulation (in internal units).
dt_max: 1e-2 # The maximal time-step size of the simulation (in internal units).
......@@ -21,12 +21,11 @@ Snapshots:
# Parameters governing the conserved quantities statistics
Statistics:
delta_time: 1e-3 # Time between statistics output
delta_time: 1. # Time between statistics output
# Parameters for the hydrodynamics scheme
SPH:
resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
delta_neighbours: 0.1 # The tolerance for the targeted number of neighbours.
CFL_condition: 0.1 # Courant-Friedrichs-Lewy condition for time integration.
# Parameters related to the initial conditions
......
......@@ -9,9 +9,9 @@ InternalUnitSystem:
# Parameters governing the time integration
TimeIntegration:
time_begin: 0. # The starting time of the simulation (in internal units).
time_end: 1. # The end time of the simulation (in internal units).
time_end: 1000 # The end time of the simulation (in internal units).
dt_min: 1e-6 # The minimal time-step size of the simulation (in internal units).
dt_max: 1e-3 # The maximal time-step size of the simulation (in internal units).
dt_max: 1e-2 # The maximal time-step size of the simulation (in internal units).
# Parameters governing the snapshots
Snapshots:
......@@ -21,12 +21,11 @@ Snapshots:
# Parameters governing the conserved quantities statistics
Statistics:
delta_time: 1e-3 # Time between statistics output
delta_time: 1. # Time between statistics output
# Parameters for the hydrodynamics scheme
SPH:
resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
delta_neighbours: 0.1 # The tolerance for the targeted number of neighbours.
CFL_condition: 0.1 # Courant-Friedrichs-Lewy condition for time integration.
# Parameters related to the initial conditions
......
......@@ -190,7 +190,11 @@ int main(int argc, char *argv[]) {
while ((c = getopt(argc, argv, "acCdDef:FgGhMn:P:sSt:Tv:y:Y:")) != -1)
switch (c) {
case 'a':
#if defined(HAVE_SETAFFINITY) && defined(HAVE_LIBNUMA)
with_aff = 1;
#else
error("Need NUMA support for thread affinity");
#endif
break;
case 'c':
with_cosmology = 1;
......@@ -392,8 +396,12 @@ int main(int argc, char *argv[]) {
parser_read_file(paramFileName, params);
/* Handle any command-line overrides. */
if (nparams > 0)
if (nparams > 0) {
message(
"Overwriting values read from the YAML file with command-line "
"values.");
for (int k = 0; k < nparams; k++) parser_set_param(params, cmdparams[k]);
}
/* And dump the parameters as used. */
// parser_print_params(&params);
......@@ -565,6 +573,11 @@ int main(int argc, char *argv[]) {
message("nr of cells at depth %i is %i.", data[0], data[1]);
}
/* Initialise the table of Ewald corrections for the gravity checks */
#ifdef SWIFT_GRAVITY_FORCE_CHECKS
if (periodic) gravity_exact_force_ewald_init(dim[0]);
#endif
/* Initialise the external potential properties */
struct external_potential potential;
if (with_external_gravity)
......
......@@ -64,7 +64,7 @@ nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h
kernel_long_gravity.h vector.h cache.h runner_doiact.h runner_doiact_vec.h runner_doiact_grav.h runner_doiact_fft.h \
runner_doiact_nosort.h units.h intrinsics.h minmax.h kick.h timestep.h drift.h adiabatic_index.h io_properties.h \
dimension.h equation_of_state.h part_type.h periodic.h \
gravity.h gravity_io.h \
gravity.h gravity_io.h gravity_cache.h \
gravity/Default/gravity.h gravity/Default/gravity_iact.h gravity/Default/gravity_io.h \
gravity/Default/gravity_debug.h gravity/Default/gravity_part.h \
sourceterms.h \
......
......@@ -23,9 +23,71 @@
* @brief The default struct alignment in SWIFT.
*/
#define SWIFT_STRUCT_ALIGNMENT 32
/**
* @brief Defines alignment of structures
*
* Expands to a GNU-style `aligned` attribute using SWIFT_STRUCT_ALIGNMENT.
*/
#define SWIFT_STRUCT_ALIGN __attribute__((aligned(SWIFT_STRUCT_ALIGNMENT)))
/**
* @brief The default cache alignment in SWIFT.
*
* This value is also what gets passed as the alignment argument when the
* cache arrays are allocated and when they are declared as aligned pointers.
*/
#define SWIFT_CACHE_ALIGNMENT 64
/**
* @brief Defines alignment of caches
*
* Expands to a GNU-style `aligned` attribute using SWIFT_CACHE_ALIGNMENT.
*/
#define SWIFT_CACHE_ALIGN __attribute__((aligned(SWIFT_CACHE_ALIGNMENT)))
/**
* @brief Macro to tell the compiler that a given array has the specified
* alignment.
*
* Note that this turns into a no-op but gives information to the compiler.
*
* Three expansions exist, selected at compile time:
*  - __ICC defined: a call to __assume_aligned(array, alignment).
*  - otherwise, __GNUC__ defined: array is re-assigned from
*    __builtin_assume_aligned(array, alignment), so the argument must be a
*    modifiable pointer lvalue.
*  - anything else: an empty statement.
*
* Every expansion already ends in a semicolon, so the macro always forms a
* complete statement on its own.
*
* @param array The array.
* @param alignment The alignment in bytes of the array.
*/
#if defined(__ICC)
#define swift_align_information(array, alignment) \
__assume_aligned(array, alignment);
#elif defined(__GNUC__)
#define swift_align_information(array, alignment) \
array = __builtin_assume_aligned(array, alignment);
#else
#define swift_align_information(array, alignment) ;
#endif
/**
* @brief Macro to create a restrict pointer to an array and tell the compiler
* that the given array has the specified
* alignment.
*
* Unlike swift_align_information() this is not a pure no-op: it expands to a
* declaration of a new variable `type *restrict array` initialised from ptr,
* followed by swift_align_information(array, alignment). It therefore
* introduces the name `array` into the enclosing scope and expands to more
* than one statement.
*
* @param type Type of the array elements.
* @param array Name of the restrict pointer that gets declared.
* @param ptr Pointer to the existing array.
* @param alignment The alignment in bytes of the array.
*/
#define swift_declare_aligned_ptr(type, array, ptr, alignment) \
type *restrict array = ptr; \
swift_align_information(array, alignment);
/**
* @brief Macro to tell the compiler that a given number is 0 modulo a given
* size.
*
* Note that this turns into a no-op but gives information to the compiler.
* GCC does not have the equivalent built-in so defaults to nothing.
*
* Both arguments are parenthesised in the expansion so that expressions
* (e.g. `count + pad`) are evaluated as a whole before the modulo is taken.
* The expansion already ends in a semicolon.
*
* @param var The variable
* @param size The modulo of interest.
*/
#if defined(__ICC)
#define swift_assume_size(var, size) __assume((var) % (size) == 0);
#else
#define swift_assume_size(var, size) ;
#endif
#endif /* SWIFT_ALIGN_H */
......@@ -23,6 +23,7 @@
#include "../config.h"
/* Local headers */
#include "align.h"
#include "cell.h"
#include "error.h"
#include "part.h"
......@@ -30,9 +31,7 @@
#include "vector.h"
#define NUM_VEC_PROC 2
#define CACHE_ALIGN 64
#define C2_CACHE_SIZE (NUM_VEC_PROC * VEC_SIZE * 6) + (NUM_VEC_PROC * VEC_SIZE)
#define C2_CACHE_ALIGN sizeof(float) * VEC_SIZE
#ifdef WITH_VECTORIZATION
/* Cache struct to hold a local copy of a cells' particle
......@@ -40,31 +39,31 @@
struct cache {
/* Particle x position. */
float *restrict x __attribute__((aligned(CACHE_ALIGN)));
float *restrict x SWIFT_CACHE_ALIGN;
/* Particle y position. */
float *restrict y __attribute__((aligned(CACHE_ALIGN)));
float *restrict y SWIFT_CACHE_ALIGN;
/* Particle z position. */
float *restrict z __attribute__((aligned(CACHE_ALIGN)));
float *restrict z SWIFT_CACHE_ALIGN;
/* Particle smoothing length. */
float *restrict h __attribute__((aligned(CACHE_ALIGN)));
float *restrict h SWIFT_CACHE_ALIGN;
/* Particle mass. */
float *restrict m __attribute__((aligned(CACHE_ALIGN)));
float *restrict m SWIFT_CACHE_ALIGN;
/* Particle x velocity. */
float *restrict vx __attribute__((aligned(CACHE_ALIGN)));
float *restrict vx SWIFT_CACHE_ALIGN;
/* Particle y velocity. */
float *restrict vy __attribute__((aligned(CACHE_ALIGN)));
float *restrict vy SWIFT_CACHE_ALIGN;
/* Particle z velocity. */
float *restrict vz __attribute__((aligned(CACHE_ALIGN)));
float *restrict vz SWIFT_CACHE_ALIGN;
/* Maximum distance of particles into neighbouring cell. */
float *restrict max_d __attribute__((aligned(CACHE_ALIGN)));
/* Maximum index into neighbouring cell for particles that are in range. */
int *restrict max_index SWIFT_CACHE_ALIGN;
/* Cache size. */
int count;
......@@ -75,28 +74,28 @@ struct cache {
struct c2_cache {
/* Separation between two particles squared. */
float r2q[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
float r2q[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
/* x separation between two particles. */
float dxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
float dxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
/* y separation between two particles. */
float dyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
float dyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
/* z separation between two particles. */
float dzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
float dzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
/* Mass of particle pj. */
float mq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
float mq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
/* x velocity of particle pj. */
float vxq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
float vxq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
/* y velocity of particle pj. */
float vyq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
float vyq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
/* z velocity of particle pj. */
float vzq[C2_CACHE_SIZE] __attribute__((aligned(C2_CACHE_ALIGN)));
float vzq[C2_CACHE_SIZE] SWIFT_CACHE_ALIGN;
};
/**
......@@ -111,9 +110,10 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
/* Align cache on correct byte boundary and pad cache size to be a multiple of
* the vector size
* and include 2 vector lengths for remainder operations. */
unsigned int pad = 2 * VEC_SIZE, rem = count % VEC_SIZE;
size_t pad = 2 * VEC_SIZE, rem = count % VEC_SIZE;
if (rem > 0) pad += VEC_SIZE - rem;
unsigned int sizeBytes = (count + pad) * sizeof(float);
size_t sizeBytes = (count + pad) * sizeof(float);
size_t sizeIntBytes = (count + pad) * sizeof(int);
int error = 0;
/* Free memory if cache has already been allocated. */
......@@ -126,18 +126,19 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
free(c->vy);
free(c->vz);
free(c->h);
free(c->max_d);
free(c->max_index);
}
error += posix_memalign((void **)&c->x, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->y, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->z, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->m, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->vx, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->vy, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->vz, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->h, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->max_d, CACHE_ALIGN, sizeBytes);
error += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytes);
error += posix_memalign((void **)&c->y, SWIFT_CACHE_ALIGNMENT, sizeBytes);
error += posix_memalign((void **)&c->z, SWIFT_CACHE_ALIGNMENT, sizeBytes);
error += posix_memalign((void **)&c->m, SWIFT_CACHE_ALIGNMENT, sizeBytes);
error += posix_memalign((void **)&c->vx, SWIFT_CACHE_ALIGNMENT, sizeBytes);
error += posix_memalign((void **)&c->vy, SWIFT_CACHE_ALIGNMENT, sizeBytes);
error += posix_memalign((void **)&c->vz, SWIFT_CACHE_ALIGNMENT, sizeBytes);
error += posix_memalign((void **)&c->h, SWIFT_CACHE_ALIGNMENT, sizeBytes);
error += posix_memalign((void **)&c->max_index, SWIFT_CACHE_ALIGNMENT,
sizeIntBytes);
if (error != 0)
error("Couldn't allocate cache, no. of particles: %d", (int)count);
......@@ -151,156 +152,43 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c,
* @param ci_cache The cache.
*/
__attribute__((always_inline)) INLINE void cache_read_particles(
const struct cell *const ci, struct cache *const ci_cache) {
const struct cell *restrict const ci,
struct cache *restrict const ci_cache) {
#if defined(GADGET2_SPH)
/* Shift the particles positions to a local frame so single precision can be
* used instead of double precision. */
#if defined(WITH_VECTORIZATION) && defined(__ICC)
#pragma vector aligned
#endif
for (int i = 0; i < ci->count; i++) {
ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0];
ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1];
ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2];
ci_cache->h[i] = ci->parts[i].h;
ci_cache->m[i] = ci->parts[i].mass;
ci_cache->vx[i] = ci->parts[i].v[0];
ci_cache->vy[i] = ci->parts[i].v[1];
ci_cache->vz[i] = ci->parts[i].v[2];
}
#endif
}
/**
* @brief Populate cache by reading in the particles from two cells in unsorted
* order.
*
* @param ci The i #cell.
* @param cj The j #cell.
* @param ci_cache The cache for cell ci.
* @param cj_cache The cache for cell cj.
* @param shift The amount to shift the particle positions to account for BCs
*/
__attribute__((always_inline)) INLINE void cache_read_two_cells(
const struct cell *const ci, const struct cell *const cj,
struct cache *const ci_cache, struct cache *const cj_cache,
const double *const shift) {
/* Shift the particles positions to a local frame (ci frame) so single
* precision can be
* used instead of double precision. Also shift the cell ci, particles
* positions due to BCs but leave cell cj. */
for (int i = 0; i < ci->count; i++) {
ci_cache->x[i] = ci->parts[i].x[0] - ci->loc[0] - shift[0];
ci_cache->y[i] = ci->parts[i].x[1] - ci->loc[1] - shift[1];
ci_cache->z[i] = ci->parts[i].x[2] - ci->loc[2] - shift[2];
ci_cache->h[i] = ci->parts[i].h;
ci_cache->m[i] = ci->parts[i].mass;
ci_cache->vx[i] = ci->parts[i].v[0];
ci_cache->vy[i] = ci->parts[i].v[1];
ci_cache->vz[i] = ci->parts[i].v[2];
}
for (int i = 0; i < cj->count; i++) {
cj_cache->x[i] = cj->parts[i].x[0] - ci->loc[0];
cj_cache->y[i] = cj->parts[i].x[1] - ci->loc[1];
cj_cache->z[i] = cj->parts[i].x[2] - ci->loc[2];
cj_cache->h[i] = cj->parts[i].h;
cj_cache->m[i] = cj->parts[i].mass;
cj_cache->vx[i] = cj->parts[i].v[0];
cj_cache->vy[i] = cj->parts[i].v[1];
cj_cache->vz[i] = cj->parts[i].v[2];
}
}
__attribute__((always_inline)) INLINE void cache_read_cell_sorted(
const struct cell *const ci, struct cache *const ci_cache,
const struct entry *restrict sort_i, double *const loc,
double *const shift) {
int idx;
/* Shift the particles positions to a local frame (ci frame) so single precision
* can be
* used instead of double precision. Also shift the cell ci, particles positions
* due to BCs but leave cell cj. */
#if defined(WITH_VECTORIZATION) && defined(__ICC)
#pragma simd
#endif
for (int i = 0; i < ci->count; i++) {
idx = sort_i[i].i;
ci_cache->x[i] = ci->parts[idx].x[0] - loc[0] - shift[0];
ci_cache->y[i] = ci->parts[idx].x[1] - loc[1] - shift[1];
ci_cache->z[i] = ci->parts[idx].x[2] - loc[2] - shift[2];
ci_cache->h[i] = ci->parts[idx].h;
ci_cache->m[i] = ci->parts[idx].mass;
ci_cache->vx[i] = ci->parts[idx].v[0];
ci_cache->vy[i] = ci->parts[idx].v[1];
ci_cache->vz[i] = ci->parts[idx].v[2];
}
}
/**
* @brief Populate cache by reading in the particles from two cells in sorted
* order.
*
* @param ci The i #cell.
* @param cj The j #cell.
* @param ci_cache The #cache for cell ci.
* @param cj_cache The #cache for cell cj.
* @param sort_i The array of sorted particle indices for cell ci.
* @param sort_j The array of sorted particle indices for cell cj.
* @param shift The amount to shift the particle positions to account for BCs
*/
__attribute__((always_inline)) INLINE void cache_read_two_cells_sorted(
const struct cell *const ci, const struct cell *const cj,
struct cache *const ci_cache, struct cache *const cj_cache,
const struct entry *restrict sort_i, const struct entry *restrict sort_j,
const double *const shift) {
int idx;
/* Shift the particles positions to a local frame (ci frame) so single precision
* can be
* used instead of double precision. Also shift the cell ci, particles positions
* due to BCs but leave cell cj. */
#if defined(WITH_VECTORIZATION) && defined(__ICC)
#pragma simd
#endif
/* Let the compiler know that the data is aligned and create pointers to the
* arrays inside the cache. */
swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
const struct part *restrict parts = ci->parts;
double loc[3];
loc[0] = ci->loc[0];
loc[1] = ci->loc[1];
loc[2] = ci->loc[2];
/* Shift the particles positions to a local frame so single precision can be
* used instead of double precision. */
for (int i = 0; i < ci->count; i++) {
idx = sort_i[i].i;
ci_cache->x[i] = ci->parts[idx].x[0] - ci->loc[0] - shift[0];
ci_cache->y[i] = ci->parts[idx].x[1] - ci->loc[1] - shift[1];
ci_cache->z[i] = ci->parts[idx].x[2] - ci->loc[2] - shift[2];
ci_cache->h[i] = ci->parts[idx].h;
ci_cache->m[i] = ci->parts[idx].mass;
ci_cache->vx[i] = ci->parts[idx].v[0];
ci_cache->vy[i] = ci->parts[idx].v[1];
ci_cache->vz[i] = ci->parts[idx].v[2];
x[i] = (float)(parts[i].x[0] - loc[0]);
y[i] = (float)(parts[i].x[1] - loc[1]);
z[i] = (float)(parts[i].x[2] - loc[2]);
h[i] = parts[i].h;
m[i] = parts[i].mass;
vx[i] = parts[i].v[0];
vy[i] = parts[i].v[1];
vz[i] = parts[i].v[2];
}
#if defined(WITH_VECTORIZATION) && defined(__ICC)
#pragma simd
#endif
for (int i = 0; i < cj->count; i++) {
idx = sort_j[i].i;
cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0];
cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1];
cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2];
cj_cache->h[i] = cj->parts[idx].h;
cj_cache->m[i] = cj->parts[idx].mass;
cj_cache->vx[i] = cj->parts[idx].v[0];
cj_cache->vy[i] = cj->parts[idx].v[1];
cj_cache->vz[i] = cj->parts[idx].v[2];
}
}
/**
......@@ -321,13 +209,13 @@ __attribute__((always_inline)) INLINE void cache_read_two_cells_sorted(
* interaction.
*/
__attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
const struct cell *const ci, const struct cell *const cj,
struct cache *const ci_cache, struct cache *const cj_cache,
const struct entry *restrict sort_i, const struct entry *restrict sort_j,
const double *const shift, int *first_pi, int *last_pj,
const int num_vec_proc) {
const struct cell *restrict const ci, const struct cell *restrict const cj,
struct cache *restrict const ci_cache,
struct cache *restrict const cj_cache, const struct entry *restrict sort_i,
const struct entry *restrict sort_j, const double *restrict const shift,
int *first_pi, int *last_pj, const int num_vec_proc) {
int idx, ci_cache_idx;
int idx;
/* Pad number of particles read to the vector size. */
int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE);
if (rem != 0) {
......@@ -345,74 +233,97 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
int first_pi_align = *first_pi;
int last_pj_align = *last_pj;
/* Shift the particles positions to a local frame (ci frame) so single precision
* can be
* used instead of double precision. Also shift the cell ci, particles positions
* due to BCs but leave cell cj. */
#if defined(WITH_VECTORIZATION) && defined(__ICC)
#pragma vector aligned
#endif
for (int i = first_pi_align; i < ci->count; i++) {
/* Make sure ci_cache is filled from the first element. */
ci_cache_idx = i - first_pi_align;
idx = sort_i[i].i;
ci_cache->x[ci_cache_idx] = ci->parts[idx].x[0] - ci->loc[0] - shift[0];
ci_cache->y[ci_cache_idx] = ci->parts[idx].x[1] - ci->loc[1] - shift[1];
ci_cache->z[ci_cache_idx] = ci->parts[idx].x[2] - ci->loc[2] - shift[2];
ci_cache->h[ci_cache_idx] = ci->parts[idx].h;
ci_cache->m[ci_cache_idx] = ci->parts[idx].mass;
ci_cache->vx[ci_cache_idx] = ci->parts[idx].v[0];
ci_cache->vy[ci_cache_idx] = ci->parts[idx].v[1];
ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2];
const struct part *restrict parts_i = ci->parts;
const struct part *restrict parts_j = cj->parts;
double loc[3];
loc[0] = ci->loc[0];