Commit 63c236da authored by James Willis's avatar James Willis
Browse files

Created a vectorised version of runner_dopair2_force that computes the...

Created a vectorised version of runner_dopair2_force that computes the interaction and masks at the same time as the neighbour search.
parent 7b68d177
......@@ -465,6 +465,144 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
}
}
/**
* @brief Populate caches by only reading particles that are within range of
* each other within the adjoining cell.Also read the particles into the cache
* in sorted order.
*
* @param ci The i #cell.
* @param cj The j #cell.
* @param ci_cache The #cache for cell ci.
* @param cj_cache The #cache for cell cj.
* @param sort_i The array of sorted particle indices for cell ci.
* @param sort_j The array of sorted particle indices for cell ci.
* @param shift The amount to shift the particle positions to account for BCs
* @param first_pi The first particle in cell ci that is in range.
* @param last_pj The last particle in cell cj that is in range.
* @param num_vec_proc Number of vectors that will be used to process the
* interaction.
*/
__attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted_force(
const struct cell *const ci, const struct cell *const cj,
struct cache *const ci_cache, struct cache *const cj_cache,
const struct entry *restrict sort_i, const struct entry *restrict sort_j,
const double *const shift, int *first_pi, int *last_pj,
const int num_vec_proc) {
int idx, ci_cache_idx;
/* Pad number of particles read to the vector size. */
int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE);
if (rem != 0) {
int pad = (num_vec_proc * VEC_SIZE) - rem;
if (*first_pi - pad >= 0) *first_pi -= pad;
}
rem = *last_pj % (num_vec_proc * VEC_SIZE);
if (rem != 0) {
int pad = (num_vec_proc * VEC_SIZE) - rem;
if (*last_pj + pad < cj->count) *last_pj += pad;
}
int first_pi_align = *first_pi;
int last_pj_align = *last_pj;
/* Shift the particles positions to a local frame (ci frame) so single precision
* can be
* used instead of double precision. Also shift the cell ci, particles positions
* due to BCs but leave cell cj. */
#if defined(WITH_VECTORIZATION) && defined(__ICC)
#pragma vector aligned
#endif
for (int i = first_pi_align; i < ci->count; i++) {
/* Make sure ci_cache is filled from the first element. */
ci_cache_idx = i - first_pi_align;
idx = sort_i[i].i;
ci_cache->x[ci_cache_idx] = ci->parts[idx].x[0] - ci->loc[0] - shift[0];
ci_cache->y[ci_cache_idx] = ci->parts[idx].x[1] - ci->loc[1] - shift[1];
ci_cache->z[ci_cache_idx] = ci->parts[idx].x[2] - ci->loc[2] - shift[2];
ci_cache->h[ci_cache_idx] = ci->parts[idx].h;
ci_cache->m[ci_cache_idx] = ci->parts[idx].mass;
ci_cache->vx[ci_cache_idx] = ci->parts[idx].v[0];
ci_cache->vy[ci_cache_idx] = ci->parts[idx].v[1];
ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2];
ci_cache->rho[ci_cache_idx] = ci->parts[idx].rho;
ci_cache->grad_h[ci_cache_idx] = ci->parts[idx].force.f;
ci_cache->pOrho2[ci_cache_idx] = ci->parts[idx].force.P_over_rho2;
ci_cache->balsara[ci_cache_idx] = ci->parts[idx].force.balsara;
ci_cache->soundspeed[ci_cache_idx] = ci->parts[idx].force.soundspeed;
}
/* Pad cache with fake particles that exist outside the cell so will not
* interact.*/
float fake_pix = 2.0f * ci->parts[sort_i[ci->count - 1].i].x[0];
for (int i = ci->count - first_pi_align;
i < ci->count - first_pi_align + VEC_SIZE; i++) {
ci_cache->x[i] = fake_pix;
ci_cache->y[i] = 1.f;
ci_cache->z[i] = 1.f;
ci_cache->h[i] = 1.f;
ci_cache->m[i] = 1.f;
ci_cache->vx[i] = 1.f;
ci_cache->vy[i] = 1.f;
ci_cache->vz[i] = 1.f;
ci_cache->rho[i] = 1.f;
ci_cache->grad_h[i] = 1.f;
ci_cache->pOrho2[i] = 1.f;
ci_cache->balsara[i] = 1.f;
ci_cache->soundspeed[i] = 1.f;
}
#if defined(WITH_VECTORIZATION) && defined(__ICC)
#pragma vector aligned
#endif
for (int i = 0; i <= last_pj_align; i++) {
idx = sort_j[i].i;
cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0];
cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1];
cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2];
cj_cache->h[i] = cj->parts[idx].h;
cj_cache->m[i] = cj->parts[idx].mass;
cj_cache->vx[i] = cj->parts[idx].v[0];
cj_cache->vy[i] = cj->parts[idx].v[1];
cj_cache->vz[i] = cj->parts[idx].v[2];
cj_cache->rho[i] = cj->parts[idx].rho;
cj_cache->grad_h[i] = cj->parts[idx].force.f;
cj_cache->pOrho2[i] = cj->parts[idx].force.P_over_rho2;
cj_cache->balsara[i] = cj->parts[idx].force.balsara;
cj_cache->soundspeed[i] = cj->parts[idx].force.soundspeed;
}
/* Pad cache with fake particles that exist outside the cell so will not
* interact.*/
float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0];
for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
cj_cache->x[i] = fake_pjx;
cj_cache->y[i] = 1.f;
cj_cache->z[i] = 1.f;
cj_cache->h[i] = 1.f;
cj_cache->m[i] = 1.f;
cj_cache->vx[i] = 1.f;
cj_cache->vy[i] = 1.f;
cj_cache->vz[i] = 1.f;
cj_cache->rho[i] = 1.f;
cj_cache->grad_h[i] = 1.f;
cj_cache->pOrho2[i] = 1.f;
cj_cache->balsara[i] = 1.f;
cj_cache->soundspeed[i] = 1.f;
}
}
/* @brief Clean the memory allocated by a #cache object.
*
* @param c The #cache to clean.
......
......@@ -1163,18 +1163,17 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force(
#ifdef WITH_VECTORIZATION
__attribute__((always_inline)) INLINE static void
runner_iact_nonsym_1_vec_force(
float *R2, float *Dx, float *Dy, float *Dz, vector *vix, vector *viy,
vector *r2, vector *dx, vector *dy, vector *dz, vector *vix, vector *viy,
vector *viz, vector *pirho, vector *grad_hi, vector *piPOrho2,
vector *balsara_i, vector *ci, float *Vjx, float *Vjy, float *Vjz,
vector *balsara_i, vector *ci, float *Vjx, float *Vjy, float *Vjz, float *Mj,
float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj,
float *Mj, vector *hi_inv, float *Hj_inv, vector *a_hydro_xSum,
vector *hi_inv, vector *hj, vector *a_hydro_xSum,
vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum,
vector *v_sigSum, vector *entropy_dtSum, mask_t mask) {
#ifdef WITH_VECTORIZATION
vector r, r2, ri;
vector dx, dy, dz;
vector r, ri;
vector vjx, vjy, vjz;
vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj, hj_inv;
vector xi, xj;
......@@ -1187,11 +1186,6 @@ runner_iact_nonsym_1_vec_force(
vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt;
/* Fill vectors. */
r2.v = vec_load(R2);
dx.v = vec_load(Dx);
dy.v = vec_load(Dy);
dz.v = vec_load(Dz);
vjx.v = vec_load(Vjx);
vjy.v = vec_load(Vjy);
vjz.v = vec_load(Vjz);
......@@ -1202,7 +1196,7 @@ runner_iact_nonsym_1_vec_force(
pjPOrho2.v = vec_load(PjPOrho2);
balsara_j.v = vec_load(Balsara_j);
cj.v = vec_load(Cj);
hj_inv.v = vec_load(Hj_inv);
hj_inv = vec_reciprocal(*hj);
fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */
......@@ -1210,8 +1204,8 @@ runner_iact_nonsym_1_vec_force(
balsara.v = balsara_i->v + balsara_j.v;
/* Get the radius and inverse radius. */
ri = vec_reciprocal_sqrt(r2);
r.v = r2.v * ri.v;
ri = vec_reciprocal_sqrt(*r2);
r.v = r2->v * ri.v;
/* Get the kernel for hi. */
hid_inv = pow_dimension_plus_one_vec(*hi_inv);
......@@ -1223,14 +1217,14 @@ runner_iact_nonsym_1_vec_force(
hjd_inv = pow_dimension_plus_one_vec(hj_inv);
xj.v = r.v * hj_inv.v;
/* Calculate the kernel for two particles. */
/* Calculate the kernel. */
kernel_eval_dWdx_force_vec(&xj, &wj_dx);
wj_dr.v = hjd_inv.v * wj_dx.v;
/* Compute dv dot r. */
dvdr.v = ((vix->v - vjx.v) * dx.v) + ((viy->v - vjy.v) * dy.v) +
((viz->v - vjz.v) * dz.v);
dvdr.v = ((vix->v - vjx.v) * dx->v) + ((viy->v - vjy.v) * dy->v) +
((viz->v - vjz.v) * dz->v);
/* Compute the relative velocity. (This is 0 if the particles move away from
* each other and negative otherwise) */
......@@ -1255,9 +1249,9 @@ runner_iact_nonsym_1_vec_force(
acc.v = visc_term.v + sph_term.v;
/* Use the force, Luke! */
piax.v = mj.v * dx.v * acc.v;
piay.v = mj.v * dy.v * acc.v;
piaz.v = mj.v * dz.v * acc.v;
piax.v = mj.v * dx->v * acc.v;
piay.v = mj.v * dy->v * acc.v;
piaz.v = mj.v * dz->v * acc.v;
/* Get the time derivative for h. */
pih_dt.v = mj.v * dvdr.v * ri.v / pjrho.v * wi_dr.v;
......
......@@ -1424,3 +1424,464 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci,
#endif /* WITH_VECTORIZATION */
}
/**
* @brief Compute the force interactions between a cell pair (non-symmetric)
* using vector intrinsics.
*
* @param r The #runner.
* @param ci The first #cell.
* @param cj The second #cell.
*/
void runner_dopair2_force_vec(struct runner *r, struct cell *ci,
struct cell *cj) {
#ifdef WITH_VECTORIZATION
const struct engine *restrict e = r->e;
vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2;
vector v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci;
TIMER_TIC;
/* Anything to do here? */
if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
if (!cell_are_part_drifted(ci, e) || !cell_are_part_drifted(cj, e))
error("Interacting undrifted cells.");
/* Get the sort ID. */
double shift[3] = {0.0, 0.0, 0.0};
const int sid = space_getsid(e->s, &ci, &cj, shift);
/* Have the cells been sorted? */
if (!(ci->sorted & (1 << sid)) || ci->dx_max_sort > space_maxreldx * ci->dmin)
runner_do_sort(r, ci, (1 << sid), 1);
if (!(cj->sorted & (1 << sid)) || cj->dx_max_sort > space_maxreldx * cj->dmin)
runner_do_sort(r, cj, (1 << sid), 1);
/* Get the cutoff shift. */
double rshift = 0.0;
for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
/* Pick-out the sorted lists. */
const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
#ifdef SWIFT_DEBUG_CHECKS
/* Check that the dx_max_sort values in the cell are indeed an upper
bound on particle movement. */
for (int pid = 0; pid < ci->count; pid++) {
const struct part *p = &ci->parts[sort_i[pid].i];
const float d = p->x[0] * runner_shift[sid][0] +
p->x[1] * runner_shift[sid][1] +
p->x[2] * runner_shift[sid][2];
if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort >
1.0e-6 * max(fabsf(d), ci->dx_max_sort))
error("particle shift diff exceeds dx_max_sort.");
}
for (int pjd = 0; pjd < cj->count; pjd++) {
const struct part *p = &cj->parts[sort_j[pjd].i];
const float d = p->x[0] * runner_shift[sid][0] +
p->x[1] * runner_shift[sid][1] +
p->x[2] * runner_shift[sid][2];
if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort >
1.0e-6 * max(fabsf(d), cj->dx_max_sort))
error("particle shift diff exceeds dx_max_sort.");
}
#endif /* SWIFT_DEBUG_CHECKS */
/* Get some other useful values. */
const int count_i = ci->count;
const int count_j = cj->count;
//const double hi_max = ci->h_max * kernel_gamma - rshift;
//const double hj_max = cj->h_max * kernel_gamma;
struct part *restrict parts_i = ci->parts;
struct part *restrict parts_j = cj->parts;
//const double di_max = sort_i[count_i - 1].d - rshift;
//const double dj_min = sort_j[0].d;
const float dx_max = (ci->dx_max_sort + cj->dx_max_sort);
/* Check if any particles are active and return if there are not. */
//int numActive = 0;
//for (int pid = count_i - 1;
// pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) {
// struct part *restrict pi = &parts_i[sort_i[pid].i];
// if (part_is_active(pi, e)) {
// numActive++;
// break;
// }
//}
//if (!numActive) {
// for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max;
// pjd++) {
// struct part *restrict pj = &parts_j[sort_j[pjd].i];
// if (part_is_active(pj, e)) {
// numActive++;
// break;
// }
// }
//}
//if (numActive == 0) return;
/* Get both particle caches from the runner and re-allocate
* them if they are not big enough for the cells. */
struct cache *restrict ci_cache = &r->ci_cache;
struct cache *restrict cj_cache = &r->cj_cache;
if (ci_cache->count < count_i) {
cache_init(ci_cache, count_i);
}
if (cj_cache->count < count_j) {
cache_init(cj_cache, count_j);
}
int first_pi, last_pj;
float *max_di __attribute__((aligned(sizeof(float) * VEC_SIZE)));
float *max_dj __attribute__((aligned(sizeof(float) * VEC_SIZE)));
max_di = r->ci_cache.max_d;
max_dj = r->cj_cache.max_d;
/* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */
/* Also find the first pi that interacts with any particle in cj and the last
* pj that interacts with any particle in ci. */
populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, max_di,
max_dj, &first_pi, &last_pj, e);
/* Find the maximum index into cj that is required by a particle in ci. */
/* Find the maximum index into ci that is required by a particle in cj. */
float di, dj;
int max_ind_j = count_j - 1;
int max_ind_i = 0;
dj = sort_j[max_ind_j].d;
while (max_ind_j > 0 && max_di[count_i - 1] < dj) {
max_ind_j--;
dj = sort_j[max_ind_j].d;
}
di = sort_i[max_ind_i].d;
while (max_ind_i < count_i - 1 && max_dj[0] > di) {
max_ind_i++;
di = sort_i[max_ind_i].d;
}
/* Limits of the outer loops. */
//int first_pi_loop = first_pi;
//int last_pj_loop = last_pj;
/* Take the max/min of both values calculated to work out how many particles
* to read into the cache. */
last_pj = max(last_pj, max_ind_j);
first_pi = min(first_pi, max_ind_i);
last_pj = count_j - 1;
first_pi = 0;
/* Read the needed particles into the two caches. */
int first_pi_align = first_pi;
int last_pj_align = last_pj;
cache_read_two_partial_cells_sorted_force(ci, cj, ci_cache, cj_cache, sort_i,
sort_j, shift, &first_pi_align,
&last_pj_align, 1);
last_pj_align = count_j - 1;
first_pi_align = 0;
/* Get the number of particles read into the ci cache. */
int ci_cache_count = count_i - first_pi_align;
if (cell_is_active(ci, e)) {
/* Loop over the parts in ci. */
//for (int pid = count_i - 1; pid >= first_pi_loop && max_ind_j >= 0; pid--) {
for (int pid = count_i - 1; pid >= 0; pid--) {
/* Get a hold of the ith part in ci. */
struct part *restrict pi = &parts_i[sort_i[pid].i];
if (!part_is_active(pi, e)) continue;
/* Determine the exit iteration of the interaction loop. */
//dj = sort_j[max_ind_j].d;
//while (max_ind_j > 0 && max_di[pid] < dj) {
// max_ind_j--;
// dj = sort_j[max_ind_j].d;
//}
//int exit_iteration = max_ind_j + 1;
int exit_iteration = count_j;
/* Set the cache index. */
int ci_cache_idx = pid - first_pi_align;
const float hi = ci_cache->h[ci_cache_idx];
const float hig2 = hi * hi * kernel_gamma2;
vector pix, piy, piz;
/* Fill particle pi vectors. */
pix.v = vec_set1(ci_cache->x[ci_cache_idx]);
piy.v = vec_set1(ci_cache->y[ci_cache_idx]);
piz.v = vec_set1(ci_cache->z[ci_cache_idx]);
v_hi.v = vec_set1(hi);
v_vix.v = vec_set1(ci_cache->vx[ci_cache_idx]);
v_viy.v = vec_set1(ci_cache->vy[ci_cache_idx]);
v_viz.v = vec_set1(ci_cache->vz[ci_cache_idx]);
v_rhoi.v = vec_set1(ci_cache->rho[ci_cache_idx]);
v_grad_hi.v = vec_set1(ci_cache->grad_h[ci_cache_idx]);
v_pOrhoi2.v = vec_set1(ci_cache->pOrho2[ci_cache_idx]);
v_balsara_i.v = vec_set1(ci_cache->balsara[ci_cache_idx]);
v_ci.v = vec_set1(ci_cache->soundspeed[ci_cache_idx]);
v_hig2.v = vec_set1(hig2);
/* Reset cumulative sums of update vectors. */
vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum,
entropy_dtSum;
/* Get the inverse of hi. */
vector v_hi_inv;
v_hi_inv = vec_reciprocal(v_hi);
a_hydro_xSum.v = vec_setzero();
a_hydro_ySum.v = vec_setzero();
a_hydro_zSum.v = vec_setzero();
h_dtSum.v = vec_setzero();
v_sigSum.v = vec_set1(pi->force.v_sig);
entropy_dtSum.v = vec_setzero();
/* Pad the exit iteration if there is a serial remainder. */
int exit_iteration_align = exit_iteration;
int rem = exit_iteration % VEC_SIZE;
if (rem != 0) {
int pad = VEC_SIZE - rem;
if (exit_iteration_align + pad <= last_pj_align + 1)
exit_iteration_align += pad;
}
vector pjx, pjy, pjz, hj, hjg2;
/* Loop over the parts in cj. */
for (int pjd = 0; pjd < exit_iteration_align; pjd += VEC_SIZE) {
/* Get the cache index to the jth particle. */
int cj_cache_idx = pjd;
vector v_dx, v_dy, v_dz;
#ifdef SWIFT_DEBUG_CHECKS
if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0) {
error("Unaligned read!!! cj_cache_idx=%d", cj_cache_idx);
}
#endif
/* Load 2 sets of vectors from the particle cache. */
pjx.v = vec_load(&cj_cache->x[cj_cache_idx]);
pjy.v = vec_load(&cj_cache->y[cj_cache_idx]);
pjz.v = vec_load(&cj_cache->z[cj_cache_idx]);
hj.v = vec_load(&cj_cache->h[cj_cache_idx]);
hjg2.v = vec_mul(vec_mul(hj.v, hj.v), kernel_gamma2_vec.v);
/* Compute the pairwise distance. */
v_dx.v = vec_sub(pix.v, pjx.v);
v_dy.v = vec_sub(piy.v, pjy.v);
v_dz.v = vec_sub(piz.v, pjz.v);
v_r2.v = vec_mul(v_dx.v, v_dx.v);
v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v);
v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v);
mask_t v_doi_mask;
int doi_mask;
/* Form a mask from r2 < hig2 mask and r2 < hjg2 mask. */
vector v_h2;
v_h2.v = vec_fmax(v_hig2.v, hjg2.v);
vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_h2.v));
/* Form integer masks. */
doi_mask = vec_form_int_mask(v_doi_mask);
/* If there are any interactions perform them. */
if (doi_mask) {
runner_iact_nonsym_1_vec_force(
&v_r2, &v_dx, &v_dy, &v_dz, &v_vix, &v_viy, &v_viz,
&v_rhoi, &v_grad_hi, &v_pOrhoi2, &v_balsara_i, &v_ci,
&cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx],
&cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx],
&cj_cache->rho[cj_cache_idx], &cj_cache->grad_h[cj_cache_idx],
&cj_cache->pOrho2[cj_cache_idx], &cj_cache->balsara[cj_cache_idx],
&cj_cache->soundspeed[cj_cache_idx], &v_hi_inv, &hj,
&a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum,
&h_dtSum, &v_sigSum, &entropy_dtSum, v_doi_mask);
}
} /* loop over the parts in cj. */
/* Perform horizontal adds on vector sums and store result in particle pi.
*/
VEC_HADD(a_hydro_xSum, pi->a_hydro[0]);
VEC_HADD(a_hydro_ySum, pi->a_hydro[1]);
VEC_HADD(a_hydro_zSum, pi->a_hydro[2]);
VEC_HADD(h_dtSum, pi->force.h_dt);
VEC_HMAX(v_sigSum, pi->force.v_sig);
VEC_HADD(entropy_dtSum, pi->entropy_dt);
} /* loop over the parts in ci. */
}
if (cell_is_active(cj, e)) {
/* Loop over the parts in cj. */
//for (int pjd = 0; pjd <= last_pj_loop && max_ind_i < count_i; pjd++) {
for (int pjd = 0; pjd < count_j; pjd++) {
/* Get a hold of the jth part in cj. */
struct part *restrict pj = &parts_j[sort_j[pjd].i];
if (!part_is_active(pj, e)) continue;
/* Determine the exit iteration of the interaction loop. */
//di = sort_i[max_ind_i].d;
//while (max_ind_i < count_i - 1 && max_dj[pjd] > di) {
// max_ind_i++;
// di = sort_i[max_ind_i].d;
//}
//int exit_iteration = max_ind_i;
int exit_iteration = 0;
/* Set the cache index. */
int cj_cache_idx = pjd;
const float hj = cj_cache->h[cj_cache_idx];
const float hjg2 = hj * hj * kernel_gamma2;
vector pjx, pjy, pjz;
vector v_hj, v_vjx, v_vjy, v_vjz, v_hjg2;
vector v_rhoj, v_grad_hj, v_pOrhoj2, v_balsara_j, v_cj;
/* Fill particle pi vectors. */
pjx.v = vec_set1(cj_cache->x[cj_cache_idx]);
pjy.v = vec_set1(cj_cache->y[cj_cache_idx]);
pjz.v = vec_set1(cj_cache->z[cj_cache_idx]);
v_hj.v = vec_set1(hj);
v_vjx.v = vec_set1(cj_cache->vx[cj_cache_idx]);
v_vjy.v = vec_set1(cj_cache->vy[cj_cache_idx]);
v_vjz.v = vec_set1(cj_cache->vz[cj_cache_idx]);
v_rhoj.v = vec_set1(cj_cache->rho[cj_cache_idx]);
v_grad_hj.v = vec_set1(cj_cache->grad_h[cj_cache_idx]);
v_pOrhoj2.v = vec_set1(cj_cache->pOrho2[cj_cache_idx]);
v_balsara_j.v = vec_set1(cj_cache->balsara[cj_cache_idx]);
v_cj.v = vec_set1(cj_cache->soundspeed[cj_cache_idx]);
v_hjg2.v = vec_set1