Commit 5b0080ee authored by James Willis's avatar James Willis
Browse files

Refactoring.

parent a0fe87bb
......@@ -198,11 +198,8 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
const struct part *restrict parts = ci->parts;
double loc[3];
loc[0] = ci->loc[0];
loc[1] = ci->loc[1];
loc[2] = ci->loc[2];
const double loc[3] = {ci->loc[0], ci->loc[1],ci->loc[2]};
/* Shift the particles positions to a local frame so single precision can be
* used instead of double precision. */
for (int i = 0; i < ci->count; i++) {
......@@ -210,7 +207,6 @@ __attribute__((always_inline)) INLINE void cache_read_particles(
y[i] = (float)(parts[i].x[1] - loc[1]);
z[i] = (float)(parts[i].x[2] - loc[2]);
h[i] = parts[i].h;
m[i] = parts[i].mass;
vx[i] = parts[i].v[0];
vy[i] = parts[i].v[1];
......@@ -254,10 +250,7 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles(
SWIFT_CACHE_ALIGNMENT);
const struct part *restrict parts = ci->parts;
double loc[3];
loc[0] = ci->loc[0];
loc[1] = ci->loc[1];
loc[2] = ci->loc[2];
const double loc[3] = {ci->loc[0], ci->loc[1],ci->loc[2]};
/* Shift the particles positions to a local frame so single precision can be
* used instead of double precision. */
......@@ -266,12 +259,10 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles(
y[i] = (float)(parts[i].x[1] - loc[1]);
z[i] = (float)(parts[i].x[2] - loc[2]);
h[i] = parts[i].h;
m[i] = parts[i].mass;
vx[i] = parts[i].v[0];
vy[i] = parts[i].v[1];
vz[i] = parts[i].v[2];
rho[i] = parts[i].rho;
grad_h[i] = parts[i].force.f;
pOrho2[i] = parts[i].force.P_over_rho2;
......@@ -325,17 +316,13 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
const int last_pj_align = *last_pj;
const struct part *restrict parts_i = ci->parts;
const struct part *restrict parts_j = cj->parts;
double loc[3];
loc[0] = cj->loc[0];
loc[1] = cj->loc[1];
loc[2] = cj->loc[2];
/* Shift ci particles for boundary conditions and location of cell.*/
double total_ci_shift[3];
total_ci_shift[0] = loc[0] + shift[0];
total_ci_shift[1] = loc[1] + shift[1];
total_ci_shift[2] = loc[2] + shift[2];
/* Shift particles to the local frame and account for boundary conditions.*/
const double total_ci_shift[3] = {cj->loc[0] + shift[0], cj->loc[1] + shift[1],
cj->loc[2] + shift[2]};
const double total_cj_shift[3] = {cj->loc[0], cj->loc[1],
cj->loc[2]};
/* Let the compiler know that the data is aligned and create pointers to the
* arrays inside the cache. */
swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
......@@ -348,6 +335,7 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
int ci_cache_count = ci->count - first_pi_align;
/* Shift the particles positions to a local frame (ci frame) so single
* precision
* can be
......@@ -355,12 +343,12 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
* positions
* due to BCs but leave cell cj. */
for (int i = 0; i < ci_cache_count; i++) {
/* Make sure ci_cache is filled from the first element. */
idx = sort_i[i + first_pi_align].i;
x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]);
y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]);
z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]);
h[i] = parts_i[idx].h;
m[i] = parts_i[idx].mass;
vx[i] = parts_i[idx].v[0];
vy[i] = parts_i[idx].v[1];
......@@ -437,11 +425,10 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
for (int i = 0; i <= last_pj_align; i++) {
idx = sort_j[i].i;
xj[i] = (float)(parts_j[idx].x[0] - loc[0]);
yj[i] = (float)(parts_j[idx].x[1] - loc[1]);
zj[i] = (float)(parts_j[idx].x[2] - loc[2]);
xj[i] = (float)(parts_j[idx].x[0] - total_cj_shift[0]);
yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]);
zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]);
hj[i] = parts_j[idx].h;
mj[i] = parts_j[idx].mass;
vxj[i] = parts_j[idx].v[0];
vyj[i] = parts_j[idx].v[1];
......@@ -488,7 +475,6 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted(
yj[i] = pos_padded_j[1];
zj[i] = pos_padded_j[2];
hj[i] = 1.f;
mj[i] = 1.f;
vxj[i] = 1.f;
vyj[i] = 1.f;
......@@ -539,10 +525,12 @@ cache_read_two_partial_cells_sorted_force(
const int last_pj_align = *last_pj;
const struct part *restrict parts_i = ci->parts;
const struct part *restrict parts_j = cj->parts;
double loc[3];
loc[0] = ci->loc[0];
loc[1] = ci->loc[1];
loc[2] = ci->loc[2];
/* Shift particles to the local frame and account for boundary conditions.*/
const double total_ci_shift[3] = {cj->loc[0] + shift[0], cj->loc[1] + shift[1],
cj->loc[2] + shift[2]};
const double total_cj_shift[3] = {cj->loc[0], cj->loc[1],
cj->loc[2]};
/* Let the compiler know that the data is aligned and create pointers to the
* arrays inside the cache. */
......@@ -573,18 +561,15 @@ cache_read_two_partial_cells_sorted_force(
* due to BCs but leave cell cj. */
for (int i = 0; i < ci_cache_count; i++) {
/* Make sure ci_cache is filled from the first element. */
idx = sort_i[i + first_pi_align].i;
x[i] = (float)(parts_i[idx].x[0] - loc[0] - shift[0]);
y[i] = (float)(parts_i[idx].x[1] - loc[1] - shift[1]);
z[i] = (float)(parts_i[idx].x[2] - loc[2] - shift[2]);
x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]);
y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]);
z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]);
h[i] = parts_i[idx].h;
m[i] = parts_i[idx].mass;
vx[i] = parts_i[idx].v[0];
vy[i] = parts_i[idx].v[1];
vz[i] = parts_i[idx].v[2];
rho[i] = parts_i[idx].rho;
grad_h[i] = parts_i[idx].force.f;
pOrho2[i] = parts_i[idx].force.P_over_rho2;
......@@ -606,12 +591,10 @@ cache_read_two_partial_cells_sorted_force(
y[i] = pos_padded[1];
z[i] = pos_padded[2];
h[i] = h_padded;
m[i] = 1.f;
vx[i] = 1.f;
vy[i] = 1.f;
vz[i] = 1.f;
rho[i] = 1.f;
grad_h[i] = 1.f;
pOrho2[i] = 1.f;
......@@ -641,16 +624,14 @@ cache_read_two_partial_cells_sorted_force(
for (int i = 0; i <= last_pj_align; i++) {
idx = sort_j[i].i;
xj[i] = (float)(parts_j[idx].x[0] - loc[0]);
yj[i] = (float)(parts_j[idx].x[1] - loc[1]);
zj[i] = (float)(parts_j[idx].x[2] - loc[2]);
xj[i] = (float)(parts_j[idx].x[0] - total_cj_shift[0]);
yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]);
zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]);
hj[i] = parts_j[idx].h;
mj[i] = parts_j[idx].mass;
vxj[i] = parts_j[idx].v[0];
vyj[i] = parts_j[idx].v[1];
vzj[i] = parts_j[idx].v[2];
rhoj[i] = parts_j[idx].rho;
grad_hj[i] = parts_j[idx].force.f;
pOrho2j[i] = parts_j[idx].force.P_over_rho2;
......@@ -670,12 +651,10 @@ cache_read_two_partial_cells_sorted_force(
yj[i] = pos_padded_j[1];
zj[i] = pos_padded_j[2];
hj[i] = h_padded_j;
mj[i] = 1.f;
vxj[i] = 1.f;
vyj[i] = 1.f;
vzj[i] = 1.f;
rhoj[i] = 1.f;
grad_hj[i] = 1.f;
pOrho2j[i] = 1.f;
......@@ -699,6 +678,11 @@ static INLINE void cache_clean(struct cache *c) {
free(c->vz);
free(c->h);
free(c->max_index);
free(c->rho);
free(c->grad_h);
free(c->pOrho2);
free(c->balsara);
free(c->soundspeed);
}
}
......
......@@ -1072,37 +1072,6 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj, const int sid,
struct entry *restrict sort_j = cj->sort[sid];
#ifdef SWIFT_DEBUG_CHECKS
/* Check that the dx_max_sort values in the cell are indeed an upper
bound on particle movement. */
for (int pid = 0; pid < ci->count; pid++) {
const struct part *p = &ci->parts[sort_i[pid].i];
const float d = p->x[0] * runner_shift[sid][0] +
p->x[1] * runner_shift[sid][1] +
p->x[2] * runner_shift[sid][2];
if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort >
1.0e-4 * max(fabsf(d), ci->dx_max_sort_old))
error(
"particle shift diff exceeds dx_max_sort in cell ci. ci->nodeID=%d "
"cj->nodeID=%d d=%e sort_i[pid].d=%e ci->dx_max_sort=%e "
"ci->dx_max_sort_old=%e",
ci->nodeID, cj->nodeID, d, sort_i[pid].d, ci->dx_max_sort,
ci->dx_max_sort_old);
}
for (int pjd = 0; pjd < cj->count; pjd++) {
const struct part *p = &cj->parts[sort_j[pjd].i];
const float d = p->x[0] * runner_shift[sid][0] +
p->x[1] * runner_shift[sid][1] +
p->x[2] * runner_shift[sid][2];
if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort >
1.0e-4 * max(fabsf(d), cj->dx_max_sort_old))
error(
"particle shift diff exceeds dx_max_sort in cell cj. cj->nodeID=%d "
"ci->nodeID=%d d=%e sort_j[pjd].d=%e cj->dx_max_sort=%e "
"cj->dx_max_sort_old=%e",
cj->nodeID, ci->nodeID, d, sort_j[pjd].d, cj->dx_max_sort,
cj->dx_max_sort_old);
}
/* Some constants used to checks that the parts are in the right frame */
const float shift_threshold_x =
2. * ci->width[0] + 2. * max(ci->dx_max_part, cj->dx_max_part);
......@@ -1110,7 +1079,6 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj, const int sid,
2. * ci->width[1] + 2. * max(ci->dx_max_part, cj->dx_max_part);
const float shift_threshold_z =
2. * ci->width[2] + 2. * max(ci->dx_max_part, cj->dx_max_part);
#endif /* SWIFT_DEBUG_CHECKS */
/* Get some other useful values. */
......@@ -1514,7 +1482,6 @@ void DOPAIR2_BRANCH(struct runner *r, struct cell *ci, struct cell *cj) {
error("Interacting unsorted cells.");
#ifdef SWIFT_DEBUG_CHECKS
/* Pick-out the sorted lists. */
const struct entry *restrict sort_i = ci->sort[sid];
const struct entry *restrict sort_j = cj->sort[sid];
......
......@@ -35,21 +35,21 @@ static const vector kernel_gamma2_vec = FILL_VEC(kernel_gamma2);
* @param int_cache (return) secondary #cache of interactions between two
* particles.
* @param icount Interaction count.
* @param rhoSum (return) #vector holding the cumulative sum of the density
* @param v_rhoSum (return) #vector holding the cumulative sum of the density
* update on pi.
* @param rho_dhSum (return) #vector holding the cumulative sum of the density
* @param v_rho_dhSum (return) #vector holding the cumulative sum of the density
* gradient update on pi.
* @param wcountSum (return) #vector holding the cumulative sum of the wcount
* @param v_wcountSum (return) #vector holding the cumulative sum of the wcount
* update on pi.
* @param wcount_dhSum (return) #vector holding the cumulative sum of the wcount
* @param v_wcount_dhSum (return) #vector holding the cumulative sum of the wcount
* gradient update on pi.
* @param div_vSum (return) #vector holding the cumulative sum of the divergence
* @param v_div_vSum (return) #vector holding the cumulative sum of the divergence
* update on pi.
* @param curlvxSum (return) #vector holding the cumulative sum of the curl of
* @param v_curlvxSum (return) #vector holding the cumulative sum of the curl of
* vx update on pi.
* @param curlvySum (return) #vector holding the cumulative sum of the curl of
* @param v_curlvySum (return) #vector holding the cumulative sum of the curl of
* vy update on pi.
* @param curlvzSum (return) #vector holding the cumulative sum of the curl of
* @param v_curlvzSum (return) #vector holding the cumulative sum of the curl of
* vz update on pi.
* @param v_hi_inv #vector of 1/h for pi.
* @param v_vix #vector of x velocity of pi.
......@@ -59,9 +59,9 @@ static const vector kernel_gamma2_vec = FILL_VEC(kernel_gamma2);
* interactions have been performed, should be a multiple of the vector length.
*/
__attribute__((always_inline)) INLINE static void calcRemInteractions(
struct c2_cache *const int_cache, const int icount, vector *rhoSum,
vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
struct c2_cache *const int_cache, const int icount, vector *v_rhoSum,
vector *v_rho_dhSum, vector *v_wcountSum, vector *v_wcount_dhSum,
vector *v_div_vSum, vector *v_curlvxSum, vector *v_curlvySum, vector *v_curlvzSum,
vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz,
int *icount_align) {
......@@ -107,8 +107,8 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
&int_cache->dyq[*icount_align], &int_cache->dzq[*icount_align],
v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[*icount_align],
&int_cache->vyq[*icount_align], &int_cache->vzq[*icount_align],
&int_cache->mq[*icount_align], rhoSum, rho_dhSum, wcountSum,
wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask,
&int_cache->mq[*icount_align], v_rhoSum, v_rho_dhSum, v_wcountSum,
v_wcount_dhSum, v_div_vSum, v_curlvxSum, v_curlvySum, v_curlvzSum, int_mask,
int_mask2, 1);
}
}
......@@ -127,20 +127,20 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
* @param int_cache (return) secondary #cache of interactions between two
* particles.
* @param icount Interaction count.
* @param rhoSum #vector holding the cumulative sum of the density update on pi.
* @param rho_dhSum #vector holding the cumulative sum of the density gradient
* @param v_rhoSum #vector holding the cumulative sum of the density update on pi.
* @param v_rho_dhSum #vector holding the cumulative sum of the density gradient
* update on pi.
* @param wcountSum #vector holding the cumulative sum of the wcount update on
* @param v_wcountSum #vector holding the cumulative sum of the wcount update on
* pi.
* @param wcount_dhSum #vector holding the cumulative sum of the wcount gradient
* @param v_wcount_dhSum #vector holding the cumulative sum of the wcount gradient
* update on pi.
* @param div_vSum #vector holding the cumulative sum of the divergence update
* @param v_div_vSum #vector holding the cumulative sum of the divergence update
* on pi.
* @param curlvxSum #vector holding the cumulative sum of the curl of vx update
* @param v_curlvxSum #vector holding the cumulative sum of the curl of vx update
* on pi.
* @param curlvySum #vector holding the cumulative sum of the curl of vy update
* @param v_curlvySum #vector holding the cumulative sum of the curl of vy update
* on pi.
* @param curlvzSum #vector holding the cumulative sum of the curl of vz update
* @param v_curlvzSum #vector holding the cumulative sum of the curl of vz update
* on pi.
* @param v_hi_inv #vector of 1/h for pi.
* @param v_vix #vector of x velocity of pi.
......@@ -150,9 +150,9 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions(
__attribute__((always_inline)) INLINE static void storeInteractions(
const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy,
vector *v_dz, const struct cache *const cell_cache,
struct c2_cache *const int_cache, int *icount, vector *rhoSum,
vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum,
vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum,
struct c2_cache *const int_cache, int *icount, vector *v_rhoSum,
vector *v_rho_dhSum, vector *v_wcountSum, vector *v_wcount_dhSum,
vector *v_div_vSum, vector *v_curlvxSum, vector *v_curlvySum, vector *v_curlvzSum,
vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz) {
/* Left-pack values needed into the secondary cache using the interaction mask.
......@@ -202,8 +202,8 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
int icount_align = *icount;
/* Peform remainder interactions. */
calcRemInteractions(int_cache, *icount, rhoSum, rho_dhSum, wcountSum,
wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum,
calcRemInteractions(int_cache, *icount, v_rhoSum, v_rho_dhSum, v_wcountSum,
v_wcount_dhSum, v_div_vSum, v_curlvxSum, v_curlvySum, v_curlvzSum,
v_hi_inv, v_vix, v_viy, v_viz, &icount_align);
mask_t int_mask, int_mask2;
......@@ -215,9 +215,9 @@ __attribute__((always_inline)) INLINE static void storeInteractions(
runner_iact_nonsym_2_vec_density(
&int_cache->r2q[j], &int_cache->dxq[j], &int_cache->dyq[j],
&int_cache->dzq[j], v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[j],
&int_cache->vyq[j], &int_cache->vzq[j], &int_cache->mq[j], rhoSum,
rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, curlvySum,
curlvzSum, int_mask, int_mask2, 0);
&int_cache->vyq[j], &int_cache->vzq[j], &int_cache->mq[j], v_rhoSum,
v_rho_dhSum, v_wcountSum, v_wcount_dhSum, v_div_vSum, v_curlvxSum, v_curlvySum,
v_curlvzSum, int_mask, int_mask2, 0);
}
/* Reset interaction count. */
......@@ -358,6 +358,33 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache(
*init_pj = last_pj;
}
/**
* @brief Populates the arrays max_index_i and max_index_j with the maximum
* indices of
* particles into their neighbouring cells. Also finds the first pi that
* interacts with any particle in cj and the last pj that interacts with any
* particle in ci.
*
* @param ci #cell pointer to ci
* @param cj #cell pointer to cj
* @param sort_i #entry array for particle distance in ci
* @param sort_j #entry array for particle distance in cj
* @param dx_max maximum particle movement allowed in cell
* @param rshift cutoff shift
* @param hi_max_raw Maximal smoothing length in cell ci
* @param hj_max_raw Maximal smoothing length in cell cj
* @param hi_max Maximal smoothing length in cell ci scaled by kernel_gamma
* @param hj_max Maximal smoothing length in cell cj scaled by kernel_gamma
* @param di_max Maximal position on the axis that can interact in cell ci
* @param dj_min Minimal position on the axis that can interact in cell ci
* @param max_index_i array to hold the maximum distances of pi particles into
* #cell cj
* @param max_index_j array to hold the maximum distances of pj particles into
* #cell cj
* @param init_pi first pi to interact with a pj particle
* @param init_pj last pj to interact with a pi particle
* @param max_active_bin The largest time-bin active during this step.
*/
__attribute__((always_inline)) INLINE static void
populate_max_index_no_cache_force(const struct cell *ci, const struct cell *cj,
const struct entry *restrict sort_i,
......@@ -530,14 +557,14 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
/* Is the ith particle active? */
if (!part_is_active_no_debug(pi, max_active_bin)) continue;
vector pix, piy, piz;
vector v_pix, v_piy, v_piz;
const float hi = cell_cache->h[pid];
/* Fill particle pi vectors. */
pix.v = vec_set1(cell_cache->x[pid]);
piy.v = vec_set1(cell_cache->y[pid]);
piz.v = vec_set1(cell_cache->z[pid]);
v_pix.v = vec_set1(cell_cache->x[pid]);
v_piy.v = vec_set1(cell_cache->y[pid]);
v_piz.v = vec_set1(cell_cache->z[pid]);
v_hi.v = vec_set1(hi);
v_vix.v = vec_set1(cell_cache->vx[pid]);
v_viy.v = vec_set1(cell_cache->vy[pid]);
......@@ -547,22 +574,22 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
v_hig2.v = vec_set1(hig2);
/* Reset cumulative sums of update vectors. */
vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum,
curlvySum, curlvzSum;
vector v_rhoSum, v_rho_dhSum, v_wcountSum, v_wcount_dhSum, v_div_vSum, v_curlvxSum,
v_curlvySum, v_curlvzSum;
/* Get the inverse of hi. */
vector v_hi_inv;
v_hi_inv = vec_reciprocal(v_hi);
rhoSum.v = vec_setzero();
rho_dhSum.v = vec_setzero();
wcountSum.v = vec_setzero();
wcount_dhSum.v = vec_setzero();
div_vSum.v = vec_setzero();
curlvxSum.v = vec_setzero();
curlvySum.v = vec_setzero();
curlvzSum.v = vec_setzero();
v_rhoSum.v = vec_setzero();
v_rho_dhSum.v = vec_setzero();
v_wcountSum.v = vec_setzero();
v_wcount_dhSum.v = vec_setzero();
v_div_vSum.v = vec_setzero();
v_curlvxSum.v = vec_setzero();
v_curlvySum.v = vec_setzero();
v_curlvzSum.v = vec_setzero();
/* Pad cache if there is a serial remainder. */
count_align = count;
......@@ -575,38 +602,38 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
/* Set positions to the same as particle pi so when the r2 > 0 mask is
* applied these extra contributions are masked out.*/
for (int i = count; i < count_align; i++) {
cell_cache->x[i] = pix.f[0];
cell_cache->y[i] = piy.f[0];
cell_cache->z[i] = piz.f[0];
cell_cache->x[i] = v_pix.f[0];
cell_cache->y[i] = v_piy.f[0];
cell_cache->z[i] = v_piz.f[0];
}
}
vector pjx, pjy, pjz;
vector pjx2, pjy2, pjz2;
vector v_pjx, v_pjy, v_pjz;
vector v_pjx2, v_pjy2, v_pjz2;
/* Find all of particle pi's interacions and store needed values in the
* secondary cache.*/
for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) {
/* Load 2 sets of vectors from the particle cache. */
pjx.v = vec_load(&cell_cache->x[pjd]);
pjy.v = vec_load(&cell_cache->y[pjd]);
pjz.v = vec_load(&cell_cache->z[pjd]);
v_pjx.v = vec_load(&cell_cache->x[pjd]);
v_pjy.v = vec_load(&cell_cache->y[pjd]);
v_pjz.v = vec_load(&cell_cache->z[pjd]);
pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]);
pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]);
pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]);
v_pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]);
v_pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]);
v_pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]);
/* Compute the pairwise distance. */
vector v_dx, v_dy, v_dz;
vector v_dx_2, v_dy_2, v_dz_2, v_r2_2;
v_dx.v = vec_sub(pix.v, pjx.v);
v_dx_2.v = vec_sub(pix.v, pjx2.v);
v_dy.v = vec_sub(piy.v, pjy.v);
v_dy_2.v = vec_sub(piy.v, pjy2.v);
v_dz.v = vec_sub(piz.v, pjz.v);
v_dz_2.v = vec_sub(piz.v, pjz2.v);
v_dx.v = vec_sub(v_pix.v, v_pjx.v);
v_dx_2.v = vec_sub(v_pix.v, v_pjx2.v);
v_dy.v = vec_sub(v_piy.v, v_pjy.v);
v_dy_2.v = vec_sub(v_piy.v, v_pjy2.v);
v_dz.v = vec_sub(v_piz.v, v_pjz.v);
v_dz_2.v = vec_sub(v_piz.v, v_pjz2.v);
v_r2.v = vec_mul(v_dx.v, v_dx.v);
v_r2_2.v = vec_mul(v_dx_2.v, v_dx_2.v);
......@@ -644,23 +671,23 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
* cache. */
if (doi_mask) {
storeInteractions(doi_mask, pjd, &v_r2, &v_dx, &v_dy, &v_dz, cell_cache,
&int_cache, &icount, &rhoSum, &rho_dhSum, &wcountSum,
&wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
&curlvzSum, v_hi_inv, v_vix, v_viy, v_viz);
&int_cache, &icount, &v_rhoSum, &v_rho_dhSum, &v_wcountSum,
&v_wcount_dhSum, &v_div_vSum, &v_curlvxSum, &v_curlvySum,
&v_curlvzSum, v_hi_inv, v_vix, v_viy, v_viz);
}
if (doi_mask2) {
storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_2, &v_dy_2,
&v_dz_2, cell_cache, &int_cache, &icount, &rhoSum,
&rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum,
&curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix,
&v_dz_2, cell_cache, &int_cache, &icount, &v_rhoSum,
&v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum, &v_div_vSum,
&v_curlvxSum, &v_curlvySum, &v_curlvzSum, v_hi_inv, v_vix,
v_viy, v_viz);
}
}
/* Perform padded vector remainder interactions if any are present. */
calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum, &wcountSum,
&wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum,
&curlvzSum, v_hi_inv, v_vix, v_viy, v_viz,
calcRemInteractions(&int_cache, icount, &v_rhoSum, &v_rho_dhSum, &v_wcountSum,
&v_wcount_dhSum, &v_div_vSum, &v_curlvxSum, &v_curlvySum,
&v_curlvzSum, v_hi_inv, v_vix, v_viy, v_viz,
&icount_align);
/* Initialise masks to true in case remainder interactions have been
......@@ -675,21 +702,21 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec(
&int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd],
&int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz,
&int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd],
&int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum,
&div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2,
&int_cache.mq[pjd], &v_rhoSum, &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum,
&v_div_vSum, &v_curlvxSum, &v_curlvySum, &v_curlvzSum, int_mask, int_mask2,
0);
}
/* Perform horizontal adds on vector sums and store result in particle pi.
*/
VEC_HADD(rhoSum, pi->rho);
VEC_HADD(rho_dhSum, pi->density.rho_dh);
VEC_HADD(wcountSum, pi->density.wcount);
VEC_HADD(wcount_dhSum, pi->density.wcount_dh);
VEC_HADD(div_vSum, pi->density.div_v);
VEC_HADD(curlvxSum, pi->density.rot_v[0]);
VEC_HADD(curlvySum, pi->density.rot_v[1]);
VEC_HADD(curlvzSum, pi->density.rot_v[2]);
VEC_HADD(v_rhoSum, pi->rho);
VEC_HADD(v_rho_dhSum, pi->density.rho_dh);
VEC_HADD(v_wcountSum, pi->density.wcount);
VEC_HADD(v_wcount_dhSum, pi->density.wcount_dh);
VEC_HADD(v_div_vSum, pi->density.div_v);
VEC_HADD(v_curlvxSum, pi->density.rot_v[0]);
VEC_HADD(v_curlvySum, pi->density.rot_v[1]);
VEC_HADD(v_curlvzSum, pi->density.rot_v[2]);
/* Reset interaction count. */
icount = 0;
......@@ -758,14 +785,14 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec(
/* Is the ith particle active? */
if (!part_is_active_no_debug(pi, max_active_bin)) continue;
vector pix, piy, piz;
vector v_pix, v_piy, v_piz;
const float hi = cell_cache->h[pid];
/* Fill particle pi vectors. */
pix.v = vec_set1(cell_cache->x[pid]);
piy.v = vec_set1(cell_cache->y[pid]);
piz.v = vec_set1(cell_cache->z[pid]);
v_pix.v = vec_set1(cell_cache->x[pid]);
v_piy.v = vec_set1(cell_cache->y[pid]);