Commit f617772d authored by James Willis's avatar James Willis
Browse files

Merge branch 'master' into do_subset_vec

parents 174e8e0c b0b0b877
......@@ -55,7 +55,9 @@ tests/brute_force_125_standard.dat
tests/swift_dopair_125_standard.dat
tests/brute_force_125_perturbed.dat
tests/swift_dopair_125_perturbed.dat
tests/brute_force_active.dat
tests/brute_force_pair_active.dat
tests/brute_force_dopair2_active.dat
tests/swift_dopair2_force_active.dat
tests/brute_force_periodic_BC_perturbed.dat
tests/swift_dopair_active.dat
tests/test_nonsym_density_serial.dat
......
......@@ -198,10 +198,7 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
const struct part *restrict parts = ci->parts;
double loc[3];
loc[0] = ci->loc[0];
loc[1] = ci->loc[1];
loc[2] = ci->loc[2];
const double loc[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};
/* Shift the particles positions to a local frame so single precision can be
* used instead of double precision. */
......@@ -210,7 +207,6 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
y[i] = (float)(parts[i].x[1] - loc[1]);
z[i] = (float)(parts[i].x[2] - loc[2]);
h[i] = parts[i].h;
m[i] = parts[i].mass;
vx[i] = parts[i].v[0];
vy[i] = parts[i].v[1];
......@@ -254,10 +250,7 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles(
SWIFT_CACHE_ALIGNMENT);
const struct part *restrict parts = ci->parts;
double loc[3];
loc[0] = ci->loc[0];
loc[1] = ci->loc[1];
loc[2] = ci->loc[2];
const double loc[3] = {ci->loc[0], ci->loc[1], ci->loc[2]};
/* Shift the particles positions to a local frame so single precision can be
* used instead of double precision. */
......@@ -266,12 +259,10 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles(
y[i] = (float)(parts[i].x[1] - loc[1]);
z[i] = (float)(parts[i].x[2] - loc[2]);
h[i] = parts[i].h;
m[i] = parts[i].mass;
vx[i] = parts[i].v[0];
vy[i] = parts[i].v[1];
vz[i] = parts[i].v[2];
rho[i] = parts[i].rho;
grad_h[i] = parts[i].force.f;
pOrho2[i] = parts[i].force.P_over_rho2;
......@@ -296,7 +287,6 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles(
* @param shift The amount to shift the particle positions to account for BCs
* @param first_pi The first particle in cell ci that is in range.
* @param last_pj The last particle in cell cj that is in range.
* @param num_vec_proc Number of vectors that will be used to process the
* interaction.
*/
__attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
......@@ -304,38 +294,39 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
struct cache *restrict const ci_cache,
struct cache *restrict const cj_cache, const struct entry *restrict sort_i,
const struct entry *restrict sort_j, const double *restrict const shift,
int *first_pi, int *last_pj, const int num_vec_proc) {
int *first_pi, int *last_pj) {
/* Make the number of particles to be read a multiple of the vector size.
* This eliminates serial remainder loops where possible when populating the
* cache. */
int idx;
/* Pad number of particles read to the vector size. */
int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE);
/* Is the number of particles to read a multiple of the vector size? */
int rem = (ci->count - *first_pi) % VEC_SIZE;
if (rem != 0) {
int pad = (num_vec_proc * VEC_SIZE) - rem;
int pad = VEC_SIZE - rem;
/* Decrease first_pi if there are particles in the cell left to read. */
if (*first_pi - pad >= 0) *first_pi -= pad;
}
rem = *last_pj % (num_vec_proc * VEC_SIZE);
rem = (*last_pj + 1) % VEC_SIZE;
if (rem != 0) {
int pad = (num_vec_proc * VEC_SIZE) - rem;
int pad = VEC_SIZE - rem;
/* Increase last_pj if there are particles in the cell left to read. */
if (*last_pj + pad < cj->count) *last_pj += pad;
}
int first_pi_align = *first_pi;
int last_pj_align = *last_pj;
/* Get some local pointers */
const int first_pi_align = *first_pi;
const int last_pj_align = *last_pj;
const struct part *restrict parts_i = ci->parts;
const struct part *restrict parts_j = cj->parts;
double loc[3];
loc[0] = cj->loc[0];
loc[1] = cj->loc[1];
loc[2] = cj->loc[2];
/* Shift ci particles for boundary conditions and location of cell.*/
double total_ci_shift[3];
total_ci_shift[0] = loc[0] + shift[0];
total_ci_shift[1] = loc[1] + shift[1];
total_ci_shift[2] = loc[2] + shift[2];
/* Shift particles to the local frame and account for boundary conditions.*/
const double total_ci_shift[3] = {
cj->loc[0] + shift[0], cj->loc[1] + shift[1], cj->loc[2] + shift[2]};
const double total_cj_shift[3] = {cj->loc[0], cj->loc[1], cj->loc[2]};
/* Let the compiler know that the data is aligned and create pointers to the
* arrays inside the cache. */
......@@ -349,19 +340,15 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
int ci_cache_count = ci->count - first_pi_align;
/* Shift the particles positions to a local frame (ci frame) so single
* precision
* can be
* used instead of double precision. Also shift the cell ci, particles
* positions
* due to BCs but leave cell cj. */
* precision can be used instead of double precision. */
for (int i = 0; i < ci_cache_count; i++) {
idx = sort_i[i + first_pi_align].i;
const int idx = sort_i[i + first_pi_align].i;
x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]);
y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]);
z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]);
h[i] = parts_i[idx].h;
m[i] = parts_i[idx].mass;
vx[i] = parts_i[idx].v[0];
vy[i] = parts_i[idx].v[1];
......@@ -384,36 +371,42 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
"is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
"2*space_maxreldx)]. x=%f, ci->width[0]=%f",
ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, x[i],
ci->width[0]);
ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
cj->loc[2], i, x[i], ci->width[0]);
if (y[i] > shift_threshold_y || y[i] < -shift_threshold_y)
error(
"Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d y pos "
"is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
"2*space_maxreldx)]. y=%f, ci->width[1]=%f",
ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, y[i],
ci->width[1]);
ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
cj->loc[2], i, y[i], ci->width[1]);
if (z[i] > shift_threshold_z || z[i] < -shift_threshold_z)
error(
"Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d z pos "
"is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
"2*space_maxreldx)]. z=%f, ci->width[2]=%f",
ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, z[i],
ci->width[2]);
ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
cj->loc[2], i, z[i], ci->width[2]);
}
#endif
/* Pad cache with fake particles that exist outside the cell so will not
* interact.*/
float fake_pix = 2.0f * parts_i[sort_i[ci->count - 1].i].x[0];
* interact. We use values of the same magnitude (but negative!) as the real
* particles to avoid overflow problems. */
const double max_dx = max(ci->dx_max_part, cj->dx_max_part);
const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
-(2. * ci->width[1] + max_dx),
-(2. * ci->width[2] + max_dx)};
const float h_padded = ci->parts[0].h;
for (int i = ci->count - first_pi_align;
i < ci->count - first_pi_align + VEC_SIZE; i++) {
x[i] = fake_pix;
y[i] = 1.f;
z[i] = 1.f;
h[i] = 1.f;
x[i] = pos_padded[0];
y[i] = pos_padded[1];
z[i] = pos_padded[2];
h[i] = h_padded;
m[i] = 1.f;
vx[i] = 1.f;
......@@ -433,12 +426,11 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT);
for (int i = 0; i <= last_pj_align; i++) {
idx = sort_j[i].i;
xj[i] = (float)(parts_j[idx].x[0] - loc[0]);
yj[i] = (float)(parts_j[idx].x[1] - loc[1]);
zj[i] = (float)(parts_j[idx].x[2] - loc[2]);
const int idx = sort_j[i].i;
xj[i] = (float)(parts_j[idx].x[0] - total_cj_shift[0]);
yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]);
zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]);
hj[i] = parts_j[idx].h;
mj[i] = parts_j[idx].mass;
vxj[i] = parts_j[idx].v[0];
vyj[i] = parts_j[idx].v[1];
......@@ -454,40 +446,228 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
"pos is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
"2*space_maxreldx)]. xj=%f, ci->width[0]=%f",
ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, xj[i],
ci->width[0]);
ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
cj->loc[2], i, xj[i], ci->width[0]);
if (yj[i] > shift_threshold_y || yj[i] < -shift_threshold_y)
error(
"Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d yj "
"pos is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
"2*space_maxreldx)]. yj=%f, ci->width[1]=%f",
ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, yj[i],
ci->width[1]);
ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
cj->loc[2], i, yj[i], ci->width[1]);
if (zj[i] > shift_threshold_z || zj[i] < -shift_threshold_z)
error(
"Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d zj "
"pos is not within "
"[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + "
"2*space_maxreldx)]. zj=%f, ci->width[2]=%f",
ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, zj[i],
ci->width[2]);
ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1],
cj->loc[2], i, zj[i], ci->width[2]);
}
#endif
/* Pad cache with fake particles that exist outside the cell so will not
* interact.*/
float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0];
* interact. We use values of the same magnitude (but negative!) as the real
* particles to avoid overflow problems. */
const float pos_padded_j[3] = {-(2. * cj->width[0] + max_dx),
-(2. * cj->width[1] + max_dx),
-(2. * cj->width[2] + max_dx)};
const float h_padded_j = cj->parts[0].h;
for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
xj[i] = fake_pjx;
yj[i] = 1.f;
zj[i] = 1.f;
hj[i] = 1.f;
xj[i] = pos_padded_j[0];
yj[i] = pos_padded_j[1];
zj[i] = pos_padded_j[2];
hj[i] = h_padded_j;
mj[i] = 1.f;
vxj[i] = 1.f;
vyj[i] = 1.f;
vzj[i] = 1.f;
}
}
/**
* @brief Populate caches by only reading particles that are within range of
* each other within the adjoining cell.Also read the particles into the cache
* in sorted order.
*
* @param ci The i #cell.
* @param cj The j #cell.
* @param ci_cache The #cache for cell ci.
* @param cj_cache The #cache for cell cj.
* @param sort_i The array of sorted particle indices for cell ci.
* @param sort_j The array of sorted particle indices for cell ci.
* @param shift The amount to shift the particle positions to account for BCs
* @param first_pi The first particle in cell ci that is in range.
* @param last_pj The last particle in cell cj that is in range.
* interaction.
*/
__attribute__((always_inline)) INLINE void
cache_read_two_partial_cells_sorted_force(
const struct cell *const ci, const struct cell *const cj,
struct cache *const ci_cache, struct cache *const cj_cache,
const struct entry *restrict sort_i, const struct entry *restrict sort_j,
const double *const shift, int *first_pi, int *last_pj) {
/* Make the number of particles to be read a multiple of the vector size.
* This eliminates serial remainder loops where possible when populating the
* cache. */
/* Is the number of particles to read a multiple of the vector size? */
int rem = (ci->count - *first_pi) % VEC_SIZE;
if (rem != 0) {
int pad = VEC_SIZE - rem;
/* Decrease first_pi if there are particles in the cell left to read. */
if (*first_pi - pad >= 0) *first_pi -= pad;
}
rem = (*last_pj + 1) % VEC_SIZE;
if (rem != 0) {
int pad = VEC_SIZE - rem;
/* Increase last_pj if there are particles in the cell left to read. */
if (*last_pj + pad < cj->count) *last_pj += pad;
}
/* Get some local pointers */
const int first_pi_align = *first_pi;
const int last_pj_align = *last_pj;
const struct part *restrict parts_i = ci->parts;
const struct part *restrict parts_j = cj->parts;
/* Shift particles to the local frame and account for boundary conditions.*/
const double total_ci_shift[3] = {
cj->loc[0] + shift[0], cj->loc[1] + shift[1], cj->loc[2] + shift[2]};
const double total_cj_shift[3] = {cj->loc[0], cj->loc[1], cj->loc[2]};
/* Let the compiler know that the data is aligned and create pointers to the
* arrays inside the cache. */
swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, rho, ci_cache->rho, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, grad_h, ci_cache->grad_h,
SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, pOrho2, ci_cache->pOrho2,
SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, balsara, ci_cache->balsara,
SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, soundspeed, ci_cache->soundspeed,
SWIFT_CACHE_ALIGNMENT);
int ci_cache_count = ci->count - first_pi_align;
/* Shift the particles positions to a local frame (ci frame) so single
* precision can be used instead of double precision. */
for (int i = 0; i < ci_cache_count; i++) {
const int idx = sort_i[i + first_pi_align].i;
x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]);
y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]);
z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]);
h[i] = parts_i[idx].h;
m[i] = parts_i[idx].mass;
vx[i] = parts_i[idx].v[0];
vy[i] = parts_i[idx].v[1];
vz[i] = parts_i[idx].v[2];
rho[i] = parts_i[idx].rho;
grad_h[i] = parts_i[idx].force.f;
pOrho2[i] = parts_i[idx].force.P_over_rho2;
balsara[i] = parts_i[idx].force.balsara;
soundspeed[i] = parts_i[idx].force.soundspeed;
}
/* Pad cache with fake particles that exist outside the cell so will not
* interact. We use values of the same magnitude (but negative!) as the real
* particles to avoid overflow problems. */
const double max_dx = max(ci->dx_max_part, cj->dx_max_part);
const float pos_padded[3] = {-(2. * ci->width[0] + max_dx),
-(2. * ci->width[1] + max_dx),
-(2. * ci->width[2] + max_dx)};
const float h_padded = ci->parts[0].h;
for (int i = ci->count - first_pi_align;
i < ci->count - first_pi_align + VEC_SIZE; i++) {
x[i] = pos_padded[0];
y[i] = pos_padded[1];
z[i] = pos_padded[2];
h[i] = h_padded;
m[i] = 1.f;
vx[i] = 1.f;
vy[i] = 1.f;
vz[i] = 1.f;
rho[i] = 1.f;
grad_h[i] = 1.f;
pOrho2[i] = 1.f;
balsara[i] = 1.f;
soundspeed[i] = 1.f;
}
/* Let the compiler know that the data is aligned and create pointers to the
* arrays inside the cache. */
swift_declare_aligned_ptr(float, xj, cj_cache->x, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, yj, cj_cache->y, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, zj, cj_cache->z, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, hj, cj_cache->h, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, mj, cj_cache->m, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vxj, cj_cache->vx, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vyj, cj_cache->vy, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, rhoj, cj_cache->rho, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, grad_hj, cj_cache->grad_h,
SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, pOrho2j, cj_cache->pOrho2,
SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, balsaraj, cj_cache->balsara,
SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, soundspeedj, cj_cache->soundspeed,
SWIFT_CACHE_ALIGNMENT);
for (int i = 0; i <= last_pj_align; i++) {
const int idx = sort_j[i].i;
xj[i] = (float)(parts_j[idx].x[0] - total_cj_shift[0]);
yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]);
zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]);
hj[i] = parts_j[idx].h;
mj[i] = parts_j[idx].mass;
vxj[i] = parts_j[idx].v[0];
vyj[i] = parts_j[idx].v[1];
vzj[i] = parts_j[idx].v[2];
rhoj[i] = parts_j[idx].rho;
grad_hj[i] = parts_j[idx].force.f;
pOrho2j[i] = parts_j[idx].force.P_over_rho2;
balsaraj[i] = parts_j[idx].force.balsara;
soundspeedj[i] = parts_j[idx].force.soundspeed;
}
/* Pad cache with fake particles that exist outside the cell so will not
* interact. We use values of the same magnitude (but negative!) as the real
* particles to avoid overflow problems. */
const float pos_padded_j[3] = {-(2. * cj->width[0] + max_dx),
-(2. * cj->width[1] + max_dx),
-(2. * cj->width[2] + max_dx)};
const float h_padded_j = cj->parts[0].h;
for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
xj[i] = pos_padded_j[0];
yj[i] = pos_padded_j[1];
zj[i] = pos_padded_j[2];
hj[i] = h_padded_j;
mj[i] = 1.f;
vxj[i] = 1.f;
vyj[i] = 1.f;
vzj[i] = 1.f;
rhoj[i] = 1.f;
grad_hj[i] = 1.f;
pOrho2j[i] = 1.f;
balsaraj[i] = 1.f;
soundspeedj[i] = 1.f;
}
}
......@@ -506,6 +686,11 @@ static INLINE void cache_clean(struct cache *c) {
free(c->vz);
free(c->h);
free(c->max_index);
free(c->rho);
free(c->grad_h);
free(c->pOrho2);
free(c->balsara);
free(c->soundspeed);
}
}
......
......@@ -170,17 +170,15 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz,
mask_t mask) {
vector r, ri, ui, wi, wi_dx;
vector mj;
vector dvx, dvy, dvz;
vector vjx, vjy, vjz;
vector dvdr;
vector curlvrx, curlvry, curlvrz;
/* Fill the vectors. */
mj.v = vec_load(Mj);
vjx.v = vec_load(Vjx);
vjy.v = vec_load(Vjy);
vjz.v = vec_load(Vjz);
const vector mj = vector_load(Mj);
const vector vjx = vector_load(Vjx);
const vector vjy = vector_load(Vjy);
const vector vjz = vector_load(Vjz);
/* Get the radius and inverse radius. */
ri = vec_reciprocal_sqrt(*r2);
......@@ -245,38 +243,34 @@ runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz,
vector *curlvySum, vector *curlvzSum,
mask_t mask, mask_t mask2, short mask_cond) {
vector r, ri, r2, ui, wi, wi_dx;
vector mj;
vector dx, dy, dz, dvx, dvy, dvz;
vector vjx, vjy, vjz;
vector r, ri, ui, wi, wi_dx;
vector dvx, dvy, dvz;
vector dvdr;
vector curlvrx, curlvry, curlvrz;
vector r_2, ri2, r2_2, ui2, wi2, wi_dx2;
vector mj2;
vector dx2, dy2, dz2, dvx2, dvy2, dvz2;
vector vjx2, vjy2, vjz2;
vector r_2, ri2, ui2, wi2, wi_dx2;
vector dvx2, dvy2, dvz2;
vector dvdr2;
vector curlvrx2, curlvry2, curlvrz2;
/* Fill the vectors. */
mj.v = vec_load(Mj);
mj2.v = vec_load(&Mj[VEC_SIZE]);
vjx.v = vec_load(Vjx);
vjx2.v = vec_load(&Vjx[VEC_SIZE]);
vjy.v = vec_load(Vjy);
vjy2.v = vec_load(&Vjy[VEC_SIZE]);
vjz.v = vec_load(Vjz);
vjz2.v = vec_load(&Vjz[VEC_SIZE]);
dx.v = vec_load(Dx);
dx2.v = vec_load(&Dx[VEC_SIZE]);
dy.v = vec_load(Dy);
dy2.v = vec_load(&Dy[VEC_SIZE]);
dz.v = vec_load(Dz);
dz2.v = vec_load(&Dz[VEC_SIZE]);
const vector mj = vector_load(Mj);
const vector mj2 = vector_load(&Mj[VEC_SIZE]);
const vector vjx = vector_load(Vjx);
const vector vjx2 = vector_load(&Vjx[VEC_SIZE]);
const vector vjy = vector_load(Vjy);
const vector vjy2 = vector_load(&Vjy[VEC_SIZE]);
const vector vjz = vector_load(Vjz);
const vector vjz2 = vector_load(&Vjz[VEC_SIZE]);
const vector dx = vector_load(Dx);
const vector dx2 = vector_load(&Dx[VEC_SIZE]);
const vector dy = vector_load(Dy);
const vector dy2 = vector_load(&Dy[VEC_SIZE]);
const vector dz = vector_load(Dz);
const vector dz2 = vector_load(&Dz[VEC_SIZE]);
/* Get the radius and inverse radius. */
r2.v = vec_load(R2);
r2_2.v = vec_load(&R2[VEC_SIZE]);
const vector r2 = vector_load(R2);
const vector r2_2 = vector_load(&R2[VEC_SIZE]);
ri = vec_reciprocal_sqrt(r2);
ri2 = vec_reciprocal_sqrt(r2_2);
r.v = vec_mul(r2.v, ri.v);
......@@ -592,30 +586,29 @@ runner_iact_nonsym_1_vec_force(
#ifdef WITH_VECTORIZATION
vector r, ri;
vector vjx, vjy, vjz, dvx, dvy, dvz;
vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj;
vector dvx, dvy, dvz;
vector xi, xj;
vector hid_inv, hjd_inv;
vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr;
vector piax, piay, piaz;
vector pih_dt;
vector v_sig;
vector omega_ij, mu_ij, fac_mu, balsara;
vector omega_ij, mu_ij, balsara;
vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt;
/* Fill vectors. */
vjx.v = vec_load(Vjx);
vjy.v = vec_load(Vjy);
vjz.v = vec_load(Vjz);
mj.v = vec_load(Mj);
pjrho.v = vec_load(Pjrho);
grad_hj.v = vec_load(Grad_hj);
pjPOrho2.v = vec_load(PjPOrho2);
balsara_j.v = vec_load(Balsara_j);
cj.v = vec_load(Cj);
fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */
const vector vjx = vector_load(Vjx);
const vector vjy = vector_load(Vjy);
const vector vjz = vector_load(Vjz);
const vector mj = vector_load(Mj);