Commit f617772d authored by James Willis's avatar James Willis
Browse files

Merge branch 'master' into do_subset_vec

parents 174e8e0c b0b0b877
...@@ -55,7 +55,9 @@ tests/brute_force_125_standard.dat ...@@ -55,7 +55,9 @@ tests/brute_force_125_standard.dat
tests/swift_dopair_125_standard.dat tests/swift_dopair_125_standard.dat
tests/brute_force_125_perturbed.dat tests/brute_force_125_perturbed.dat
tests/swift_dopair_125_perturbed.dat tests/swift_dopair_125_perturbed.dat
tests/brute_force_active.dat tests/brute_force_pair_active.dat
tests/brute_force_dopair2_active.dat
tests/swift_dopair2_force_active.dat
tests/brute_force_periodic_BC_perturbed.dat tests/brute_force_periodic_BC_perturbed.dat
tests/swift_dopair_active.dat tests/swift_dopair_active.dat
tests/test_nonsym_density_serial.dat tests/test_nonsym_density_serial.dat
......
...@@ -198,10 +198,7 @@ __attribute__((always_inline)) INLINE void cache_read_particles( ...@@ -198,10 +198,7 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
const struct part *restrict parts = ci->parts; const struct part *restrict parts = ci->parts;
double loc[3]; const double loc[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};
loc[0] = ci->loc[0];
loc[1] = ci->loc[1];
loc[2] = ci->loc[2];
/* Shift the particles positions to a local frame so single precision can be /* Shift the particles positions to a local frame so single precision can be
* used instead of double precision. */ * used instead of double precision. */
...@@ -210,7 +207,6 @@ __attribute__((always_inline)) INLINE void cache_read_particles( ...@@ -210,7 +207,6 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
y[i] = (float)(parts[i].x[1] - loc[1]); y[i] = (float)(parts[i].x[1] - loc[1]);
z[i] = (float)(parts[i].x[2] - loc[2]); z[i] = (float)(parts[i].x[2] - loc[2]);
h[i] = parts[i].h; h[i] = parts[i].h;
m[i] = parts[i].mass; m[i] = parts[i].mass;
vx[i] = parts[i].v[0]; vx[i] = parts[i].v[0];
vy[i] = parts[i].v[1]; vy[i] = parts[i].v[1];
...@@ -254,10 +250,7 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles( ...@@ -254,10 +250,7 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles(
SWIFT_CACHE_ALIGNMENT); SWIFT_CACHE_ALIGNMENT);
const struct part *restrict parts = ci->parts; const struct part *restrict parts = ci->parts;
double loc[3]; const double loc[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};
loc[0] = ci->loc[0];
loc[1] = ci->loc[1];
loc[2] = ci->loc[2];
/* Shift the particles positions to a local frame so single precision can be /* Shift the particles positions to a local frame so single precision can be
* used instead of double precision. */ * used instead of double precision. */
...@@ -266,12 +259,10 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles( ...@@ -266,12 +259,10 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles(
y[i] = (float)(parts[i].x[1] - loc[1]); y[i] = (float)(parts[i].x[1] - loc[1]);
z[i] = (float)(parts[i].x[2] - loc[2]); z[i] = (float)(parts[i].x[2] - loc[2]);
h[i] = parts[i].h; h[i] = parts[i].h;
m[i] = parts[i].mass; m[i] = parts[i].mass;
vx[i] = parts[i].v[0]; vx[i] = parts[i].v[0];
vy[i] = parts[i].v[1]; vy[i] = parts[i].v[1];
vz[i] = parts[i].v[2]; vz[i] = parts[i].v[2];
rho[i] = parts[i].rho; rho[i] = parts[i].rho;
grad_h[i] = parts[i].force.f; grad_h[i] = parts[i].force.f;
pOrho2[i] = parts[i].force.P_over_rho2; pOrho2[i] = parts[i].force.P_over_rho2;
...@@ -296,7 +287,6 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles( ...@@ -296,7 +287,6 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles(
* @param shift The amount to shift the particle positions to account for BCs * @param shift The amount to shift the particle positions to account for BCs
* @param first_pi The first particle in cell ci that is in range. * @param first_pi The first particle in cell ci that is in range.
* @param last_pj The last particle in cell cj that is in range. * @param last_pj The last particle in cell cj that is in range.
* @param num_vec_proc Number of vectors that will be used to process the
* interaction. * interaction.
*/ */
__attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
...@@ -304,38 +294,39 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( ...@@ -304,38 +294,39 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
struct cache *restrict const ci_cache, struct cache *restrict const ci_cache,
struct cache *restrict const cj_cache, const struct entry *restrict sort_i, struct cache *restrict const cj_cache, const struct entry *restrict sort_i,
const struct entry *restrict sort_j, const double *restrict const shift, const struct entry *restrict sort_j, const double *restrict const shift,
int *first_pi, int *last_pj, const int num_vec_proc) { int *first_pi, int *last_pj) {
/* Make the number of particles to be read a multiple of the vector size.
* This eliminates serial remainder loops where possible when populating the
* cache. */
int idx; /* Is the number of particles to read a multiple of the vector size? */
/* Pad number of particles read to the vector size. */ int rem = (ci->count - *first_pi) % VEC_SIZE;
int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE);
if (rem != 0) { if (rem != 0) {
int pad = (num_vec_proc * VEC_SIZE) - rem; int pad = VEC_SIZE - rem;
/* Decrease first_pi if there are particles in the cell left to read. */
if (*first_pi - pad >= 0) *first_pi -= pad; if (*first_pi - pad >= 0) *first_pi -= pad;
} }
rem = *last_pj % (num_vec_proc * VEC_SIZE); rem = (*last_pj + 1) % VEC_SIZE;
if (rem != 0) { if (rem != 0) {
int pad = (num_vec_proc * VEC_SIZE) - rem; int pad = VEC_SIZE - rem;
/* Increase last_pj if there are particles in the cell left to read. */
if (*last_pj + pad < cj->count) *last_pj += pad; if (*last_pj + pad < cj->count) *last_pj += pad;
} }
int first_pi_align = *first_pi; /* Get some local pointers */
int last_pj_align = *last_pj; const int first_pi_align = *first_pi;
const int last_pj_align = *last_pj;
const struct part *restrict parts_i = ci->parts; const struct part *restrict parts_i = ci->parts;
const struct part *restrict parts_j = cj->parts; const struct part *restrict parts_j = cj->parts;
double loc[3];
loc[0] = cj->loc[0];
loc[1] = cj->loc[1];
loc[2] = cj->loc[2];
/* Shift ci particles for boundary conditions and location of cell.*/ /* Shift particles to the local frame and account for boundary conditions.*/
double total_ci_shift[3]; const double total_ci_shift[3] = {
total_ci_shift[0] = loc[0] + shift[0]; cj->loc[0] + shift[0], cj->loc[1] + shift[1], cj->loc[2] + shift[2]};
total_ci_shift[1] = loc[1] + shift[1]; const double total_cj_shift[3] = {cj->loc[0], cj->loc[1], cj->loc[2]};
total_ci_shift[2] = loc[2] + shift[2];
/* Let the compiler know that the data is aligned and create pointers to the /* Let the compiler know that the data is aligned and create pointers to the
* arrays inside the cache. */ * arrays inside the cache. */
...@@ -349,19 +340,15 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( ...@@ -349,19 +340,15 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
int ci_cache_count = ci->count - first_pi_align; int ci_cache_count = ci->count - first_pi_align;
/* Shift the particles positions to a local frame (ci frame) so single /* Shift the particles positions to a local frame (ci frame) so single
* precision * precision can be used instead of double precision. */
* can be
* used instead of double precision. Also shift the cell ci, particles
* positions
* due to BCs but leave cell cj. */
for (int i = 0; i < ci_cache_count; i++) { for (int i = 0; i < ci_cache_count; i++) {
idx = sort_i[i + first_pi_align].i; const int idx = sort_i[i + first_pi_align].i;
x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]); x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]);
y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]); y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]);
z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]); z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]);
h[i] = parts_i[idx].h; h[i] = parts_i[idx].h;
m[i] = parts_i[idx].mass; m[i] = parts_i[idx].mass;
vx[i] = parts_i[idx].v[0]; vx[i] = parts_i[idx].v[0];
vy[i] = parts_i[idx].v[1]; vy[i] = parts_i[idx].v[1];
...@@ -384,36 +371,42 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( ...@@ -384,36 +371,42 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
"is not within " "is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + " "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
"2*space_maxreldx)]. x=%f, ci->width[0]=%f", "2*space_maxreldx)]. x=%f, ci->width[0]=%f",
ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, x[i], ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
ci->width[0]); cj->loc[2], i, x[i], ci->width[0]);
if (y[i] > shift_threshold_y || y[i] < -shift_threshold_y) if (y[i] > shift_threshold_y || y[i] < -shift_threshold_y)
error( error(
"Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d y pos " "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d y pos "
"is not within " "is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + " "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
"2*space_maxreldx)]. y=%f, ci->width[1]=%f", "2*space_maxreldx)]. y=%f, ci->width[1]=%f",
ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, y[i], ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
ci->width[1]); cj->loc[2], i, y[i], ci->width[1]);
if (z[i] > shift_threshold_z || z[i] < -shift_threshold_z) if (z[i] > shift_threshold_z || z[i] < -shift_threshold_z)
error( error(
"Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d z pos " "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d z pos "
"is not within " "is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + " "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
"2*space_maxreldx)]. z=%f, ci->width[2]=%f", "2*space_maxreldx)]. z=%f, ci->width[2]=%f",
ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, z[i], ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
ci->width[2]); cj->loc[2], i, z[i], ci->width[2]);
} }
#endif #endif
/* Pad cache with fake particles that exist outside the cell so will not /* Pad cache with fake particles that exist outside the cell so will not
* interact.*/ * interact. We use values of the same magnitude (but negative!) as the real
float fake_pix = 2.0f * parts_i[sort_i[ci->count - 1].i].x[0]; * particles to avoid overflow problems. */
const double max_dx = max(ci->dx_max_part, cj->dx_max_part);
const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
-(2. * ci->width[1] + max_dx),
-(2. * ci->width[2] + max_dx)};
const float h_padded = ci->parts[0].h;
for (int i = ci->count - first_pi_align; for (int i = ci->count - first_pi_align;
i < ci->count - first_pi_align + VEC_SIZE; i++) { i < ci->count - first_pi_align + VEC_SIZE; i++) {
x[i] = fake_pix; x[i] = pos_padded[0];
y[i] = 1.f; y[i] = pos_padded[1];
z[i] = 1.f; z[i] = pos_padded[2];
h[i] = 1.f; h[i] = h_padded;
m[i] = 1.f; m[i] = 1.f;
vx[i] = 1.f; vx[i] = 1.f;
...@@ -433,12 +426,11 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( ...@@ -433,12 +426,11 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT); swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT);
for (int i = 0; i <= last_pj_align; i++) { for (int i = 0; i <= last_pj_align; i++) {
idx = sort_j[i].i; const int idx = sort_j[i].i;
xj[i] = (float)(parts_j[idx].x[0] - loc[0]); xj[i] = (float)(parts_j[idx].x[0] - total_cj_shift[0]);
yj[i] = (float)(parts_j[idx].x[1] - loc[1]); yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]);
zj[i] = (float)(parts_j[idx].x[2] - loc[2]); zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]);
hj[i] = parts_j[idx].h; hj[i] = parts_j[idx].h;
mj[i] = parts_j[idx].mass; mj[i] = parts_j[idx].mass;
vxj[i] = parts_j[idx].v[0]; vxj[i] = parts_j[idx].v[0];
vyj[i] = parts_j[idx].v[1]; vyj[i] = parts_j[idx].v[1];
...@@ -454,40 +446,228 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( ...@@ -454,40 +446,228 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
"pos is not within " "pos is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + " "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
"2*space_maxreldx)]. xj=%f, ci->width[0]=%f", "2*space_maxreldx)]. xj=%f, ci->width[0]=%f",
ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, xj[i], ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
ci->width[0]); cj->loc[2], i, xj[i], ci->width[0]);
if (yj[i] > shift_threshold_y || yj[i] < -shift_threshold_y) if (yj[i] > shift_threshold_y || yj[i] < -shift_threshold_y)
error( error(
"Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d yj " "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d yj "
"pos is not within " "pos is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + " "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
"2*space_maxreldx)]. yj=%f, ci->width[1]=%f", "2*space_maxreldx)]. yj=%f, ci->width[1]=%f",
ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, yj[i], ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
ci->width[1]); cj->loc[2], i, yj[i], ci->width[1]);
if (zj[i] > shift_threshold_z || zj[i] < -shift_threshold_z) if (zj[i] > shift_threshold_z || zj[i] < -shift_threshold_z)
error( error(
"Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d zj " "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d zj "
"pos is not within " "pos is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + " "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
"2*space_maxreldx)]. zj=%f, ci->width[2]=%f", "2*space_maxreldx)]. zj=%f, ci->width[2]=%f",
ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, zj[i], ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
ci->width[2]); cj->loc[2], i, zj[i], ci->width[2]);
} }
#endif #endif
/* Pad cache with fake particles that exist outside the cell so will not /* Pad cache with fake particles that exist outside the cell so will not
* interact.*/ * interact. We use values of the same magnitude (but negative!) as the real
float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0]; * particles to avoid overflow problems. */
const float pos_padded_j[3] = {-(2. * cj->width[0] + max_dx),
-(2. * cj->width[1] + max_dx),
-(2. * cj->width[2] + max_dx)};
const float h_padded_j = cj->parts[0].h;
for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) { for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
xj[i] = fake_pjx; xj[i] = pos_padded_j[0];
yj[i] = 1.f; yj[i] = pos_padded_j[1];
zj[i] = 1.f; zj[i] = pos_padded_j[2];
hj[i] = 1.f; hj[i] = h_padded_j;
mj[i] = 1.f;
vxj[i] = 1.f;
vyj[i] = 1.f;
vzj[i] = 1.f;
}
}
/**
* @brief Populate caches by only reading particles that are within range of
* each other within the adjoining cell.Also read the particles into the cache
* in sorted order.
*
* @param ci The i #cell.
* @param cj The j #cell.
* @param ci_cache The #cache for cell ci.
* @param cj_cache The #cache for cell cj.
* @param sort_i The array of sorted particle indices for cell ci.
* @param sort_j The array of sorted particle indices for cell ci.
* @param shift The amount to shift the particle positions to account for BCs
* @param first_pi The first particle in cell ci that is in range.
* @param last_pj The last particle in cell cj that is in range.
* interaction.
*/
__attribute__((always_inline)) INLINE void
cache_read_two_partial_cells_sorted_force(
const struct cell *const ci, const struct cell *const cj,
struct cache *const ci_cache, struct cache *const cj_cache,
const struct entry *restrict sort_i, const struct entry *restrict sort_j,
const double *const shift, int *first_pi, int *last_pj) {
/* Make the number of particles to be read a multiple of the vector size.
* This eliminates serial remainder loops where possible when populating the
* cache. */
/* Is the number of particles to read a multiple of the vector size? */
int rem = (ci->count - *first_pi) % VEC_SIZE;
if (rem != 0) {
int pad = VEC_SIZE - rem;
/* Decrease first_pi if there are particles in the cell left to read. */
if (*first_pi - pad >= 0) *first_pi -= pad;
}
rem = (*last_pj + 1) % VEC_SIZE;
if (rem != 0) {
int pad = VEC_SIZE - rem;
/* Increase last_pj if there are particles in the cell left to read. */
if (*last_pj + pad < cj->count) *last_pj += pad;
}
/* Get some local pointers */
const int first_pi_align = *first_pi;
const int last_pj_align = *last_pj;
const struct part *restrict parts_i = ci->parts;
const struct part *restrict parts_j = cj->parts;
/* Shift particles to the local frame and account for boundary conditions.*/
const double total_ci_shift[3] = {
cj->loc[0] + shift[0], cj->loc[1] + shift[1], cj->loc[2] + shift[2]};
const double total_cj_shift[3] = {cj->loc[0], cj->loc[1], cj->loc[2]};
/* Let the compiler know that the data is aligned and create pointers to the
* arrays inside the cache. */
swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, rho, ci_cache->rho, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, grad_h, ci_cache->grad_h,
SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, pOrho2, ci_cache->pOrho2,
SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, balsara, ci_cache->balsara,
SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, soundspeed, ci_cache->soundspeed,
SWIFT_CACHE_ALIGNMENT);
int ci_cache_count = ci->count - first_pi_align;
/* Shift the particles positions to a local frame (ci frame) so single
* precision can be used instead of double precision. */
for (int i = 0; i < ci_cache_count; i++) {
const int idx = sort_i[i + first_pi_align].i;
x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]);
y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]);
z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]);
h[i] = parts_i[idx].h;
m[i] = parts_i[idx].mass;
vx[i] = parts_i[idx].v[0];
vy[i] = parts_i[idx].v[1];
vz[i] = parts_i[idx].v[2];
rho[i] = parts_i[idx].rho;
grad_h[i] = parts_i[idx].force.f;
pOrho2[i] = parts_i[idx].force.P_over_rho2;
balsara[i] = parts_i[idx].force.balsara;
soundspeed[i] = parts_i[idx].force.soundspeed;
}
/* Pad cache with fake particles that exist outside the cell so will not
* interact. We use values of the same magnitude (but negative!) as the real
* particles to avoid overflow problems. */
const double max_dx = max(ci->dx_max_part, cj->dx_max_part);
const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
-(2. * ci->width[1] + max_dx),
-(2. * ci->width[2] + max_dx)};
const float h_padded = ci->parts[0].h;
for (int i = ci->count - first_pi_align;
i < ci->count - first_pi_align + VEC_SIZE; i++) {
x[i] = pos_padded[0];
y[i] = pos_padded[1];
z[i] = pos_padded[2];
h[i] = h_padded;
m[i] = 1.f;
vx[i] = 1.f;
vy[i] = 1.f;
vz[i] = 1.f;
rho[i] = 1.f;
grad_h[i] = 1.f;
pOrho2[i] = 1.f;
balsara[i] = 1.f;
soundspeed[i] = 1.f;
}
/* Let the compiler know that the data is aligned and create pointers to the
* arrays inside the cache. */
swift_declare_aligned_ptr(float, xj, cj_cache->x, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, yj, cj_cache->y, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, zj, cj_cache->z, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, hj, cj_cache->h, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, mj, cj_cache->m, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vxj, cj_cache->vx, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vyj, cj_cache->vy, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, rhoj, cj_cache->rho, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, grad_hj, cj_cache->grad_h,
SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, pOrho2j, cj_cache->pOrho2,
SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, balsaraj, cj_cache->balsara,
SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, soundspeedj, cj_cache->soundspeed,
SWIFT_CACHE_ALIGNMENT);
for (int i = 0; i <= last_pj_align; i++) {
const int idx = sort_j[i].i;
xj[i] = (float)(parts_j[idx].x[0] - total_cj_shift[0]);
yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]);
zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]);
hj[i] = parts_j[idx].h;
mj[i] = parts_j[idx].mass;
vxj[i] = parts_j[idx].v[0];
vyj[i] = parts_j[idx].v[1];
vzj[i] = parts_j[idx].v[2];
rhoj[i] = parts_j[idx].rho;
grad_hj[i] = parts_j[idx].force.f;
pOrho2j[i] = parts_j[idx].force.P_over_rho2;
balsaraj[i] = parts_j[idx].force.balsara;
soundspeedj[i] = parts_j[idx].force.soundspeed;
}
/* Pad cache with fake particles that exist outside the cell so will not
* interact. We use values of the same magnitude (but negative!) as the real
* particles to avoid overflow problems. */
const float pos_padded_j[3] = {-(2. * cj->width[0] + max_dx),
-(2. * cj->width[1] + max_dx),
-(2. * cj->width[2] + max_dx)};
const float h_padded_j = cj->parts[0].h;
for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
xj[i] = pos_padded_j[0];
yj[i] = pos_padded_j[1];
zj[i] = pos_padded_j[2];
hj[i] = h_padded_j;
mj[i] = 1.f; mj[i] = 1.f;
vxj[i] = 1.f; vxj[i] = 1.f;
vyj[i] = 1.f; vyj[i] = 1.f;
vzj[i] = 1.f; vzj[i] = 1.f;
rhoj[i] = 1.f;
grad_hj[i] = 1.f;
pOrho2j[i] = 1.f;
balsaraj[i] = 1.f;
soundspeedj[i] = 1.f;
} }
} }
...@@ -506,6 +686,11 @@ static INLINE void cache_clean(struct cache *c) { ...@@ -506,6 +686,11 @@ static INLINE void cache_clean(struct cache *c) {
free(c->vz); free(c->vz);
free(c->h); free(c->h);
free(c->max_index); free(c->max_index);
free(c->rho);
free(c->grad_h);
free(c->pOrho2);
free(c->balsara);
free(c->soundspeed);
} }
} }
......
...@@ -170,17 +170,15 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, ...@@ -170,17 +170,15 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
mask_t mask) { mask_t mask) {
vector r, ri, ui, wi, wi_dx; vector r, ri, ui, wi, wi_dx;
vector mj;
vector dvx, dvy, dvz; vector dvx, dvy, dvz;
vector vjx, vjy, vjz;
vector dvdr; vector dvdr;
vector curlvrx, curlvry, curlvrz; vector curlvrx, curlvry, curlvrz;
/* Fill the vectors. */ /* Fill the vectors. */
mj.v = vec_load(Mj); const vector mj = vector_load(Mj);
vjx.v =