Skip to content
Snippets Groups Projects
Commit dcbe6d03 authored by James Willis
Browse files

Help the compiler auto-vectorise the reading of cache for force interactions with hints.

parent b759fd04
No related branches found
No related tags found
1 merge request: !440 "Dopair2 vectorisation"
...@@ -442,7 +442,7 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted_f ...@@ -442,7 +442,7 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted_f
const double *const shift, int *first_pi, int *last_pj, const double *const shift, int *first_pi, int *last_pj,
const int num_vec_proc) { const int num_vec_proc) {
int idx, ci_cache_idx; int idx;
/* Pad number of particles read to the vector size. */ /* Pad number of particles read to the vector size. */
int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE); int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE);
if (rem != 0) { if (rem != 0) {
...@@ -460,33 +460,53 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted_f ...@@ -460,33 +460,53 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted_f
int first_pi_align = *first_pi; int first_pi_align = *first_pi;
int last_pj_align = *last_pj; int last_pj_align = *last_pj;
const struct part *restrict parts_i = ci->parts;
const struct part *restrict parts_j = cj->parts;
double loc[3];
loc[0] = ci->loc[0];
loc[1] = ci->loc[1];
loc[2] = ci->loc[2];
/* Shift the particles positions to a local frame (ci frame) so single precision /* Let the compiler know that the data is aligned and create pointers to the
* can be * arrays inside the cache. */
* used instead of double precision. Also shift the cell ci, particles positions swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT);
* due to BCs but leave cell cj. */ swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT);
#if defined(WITH_VECTORIZATION) && defined(__ICC) swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT);
#pragma vector aligned swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT);
#endif swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT);
for (int i = first_pi_align; i < ci->count; i++) { swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, rho, ci_cache->rho, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, grad_h, ci_cache->grad_h, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, pOrho2, ci_cache->pOrho2, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, balsara, ci_cache->balsara, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, soundspeed, ci_cache->soundspeed, SWIFT_CACHE_ALIGNMENT);
int ci_cache_count = ci->count - first_pi_align;
/* Shift the particles positions to a local frame (ci frame) so single precision
* can be
* used instead of double precision. Also shift the cell ci, particles positions
* due to BCs but leave cell cj. */
for (int i = 0; i < ci_cache_count; i++) {
/* Make sure ci_cache is filled from the first element. */ /* Make sure ci_cache is filled from the first element. */
ci_cache_idx = i - first_pi_align;
idx = sort_i[i].i;
ci_cache->x[ci_cache_idx] = ci->parts[idx].x[0] - ci->loc[0] - shift[0];
ci_cache->y[ci_cache_idx] = ci->parts[idx].x[1] - ci->loc[1] - shift[1];
ci_cache->z[ci_cache_idx] = ci->parts[idx].x[2] - ci->loc[2] - shift[2];
ci_cache->h[ci_cache_idx] = ci->parts[idx].h;
ci_cache->m[ci_cache_idx] = ci->parts[idx].mass;
ci_cache->vx[ci_cache_idx] = ci->parts[idx].v[0];
ci_cache->vy[ci_cache_idx] = ci->parts[idx].v[1];
ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2];
ci_cache->rho[ci_cache_idx] = ci->parts[idx].rho; idx = sort_i[i + first_pi_align].i;
ci_cache->grad_h[ci_cache_idx] = ci->parts[idx].force.f; x[i] = (float)(parts_i[idx].x[0] - loc[0] - shift[0]);
ci_cache->pOrho2[ci_cache_idx] = ci->parts[idx].force.P_over_rho2; y[i] = (float)(parts_i[idx].x[1] - loc[1] - shift[1]);
ci_cache->balsara[ci_cache_idx] = ci->parts[idx].force.balsara; z[i] = (float)(parts_i[idx].x[2] - loc[2] - shift[2]);
ci_cache->soundspeed[ci_cache_idx] = ci->parts[idx].force.soundspeed; h[i] = parts_i[idx].h;
m[i] = parts_i[idx].mass;
vx[i] = parts_i[idx].v[0];
vy[i] = parts_i[idx].v[1];
vz[i] = parts_i[idx].v[2];
rho[i] = parts_i[idx].rho;
grad_h[i] = parts_i[idx].force.f;
pOrho2[i] = parts_i[idx].force.P_over_rho2;
balsara[i] = parts_i[idx].force.balsara;
soundspeed[i] = parts_i[idx].force.soundspeed;
} }
/* Pad cache with fake particles that exist outside the cell so will not /* Pad cache with fake particles that exist outside the cell so will not
...@@ -494,65 +514,77 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted_f ...@@ -494,65 +514,77 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted_f
float fake_pix = 2.0f * ci->parts[sort_i[ci->count - 1].i].x[0]; float fake_pix = 2.0f * ci->parts[sort_i[ci->count - 1].i].x[0];
for (int i = ci->count - first_pi_align; for (int i = ci->count - first_pi_align;
i < ci->count - first_pi_align + VEC_SIZE; i++) { i < ci->count - first_pi_align + VEC_SIZE; i++) {
ci_cache->x[i] = fake_pix; x[i] = fake_pix;
ci_cache->y[i] = 1.f; y[i] = 1.f;
ci_cache->z[i] = 1.f; z[i] = 1.f;
ci_cache->h[i] = 1.f; h[i] = 1.f;
ci_cache->m[i] = 1.f; m[i] = 1.f;
ci_cache->vx[i] = 1.f; vx[i] = 1.f;
ci_cache->vy[i] = 1.f; vy[i] = 1.f;
ci_cache->vz[i] = 1.f; vz[i] = 1.f;
ci_cache->rho[i] = 1.f; rho[i] = 1.f;
ci_cache->grad_h[i] = 1.f; grad_h[i] = 1.f;
ci_cache->pOrho2[i] = 1.f; pOrho2[i] = 1.f;
ci_cache->balsara[i] = 1.f; balsara[i] = 1.f;
ci_cache->soundspeed[i] = 1.f; soundspeed[i] = 1.f;
} }
#if defined(WITH_VECTORIZATION) && defined(__ICC) /* Let the compiler know that the data is aligned and create pointers to the
#pragma vector aligned * arrays inside the cache. */
#endif swift_declare_aligned_ptr(float, xj, cj_cache->x, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, yj, cj_cache->y, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, zj, cj_cache->z, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, hj, cj_cache->h, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, mj, cj_cache->m, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vxj, cj_cache->vx, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vyj, cj_cache->vy, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, rhoj, cj_cache->rho, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, grad_hj, cj_cache->grad_h, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, pOrho2j, cj_cache->pOrho2, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, balsaraj, cj_cache->balsara, SWIFT_CACHE_ALIGNMENT);
swift_declare_aligned_ptr(float, soundspeedj, cj_cache->soundspeed, SWIFT_CACHE_ALIGNMENT);
for (int i = 0; i <= last_pj_align; i++) { for (int i = 0; i <= last_pj_align; i++) {
idx = sort_j[i].i; idx = sort_j[i].i;
cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0]; xj[i] = (float)(parts_j[idx].x[0] - loc[0]);
cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1]; yj[i] = (float)(parts_j[idx].x[1] - loc[1]);
cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2]; zj[i] = (float)(parts_j[idx].x[2] - loc[2]);
cj_cache->h[i] = cj->parts[idx].h; hj[i] = parts_j[idx].h;
cj_cache->m[i] = cj->parts[idx].mass; mj[i] = parts_j[idx].mass;
cj_cache->vx[i] = cj->parts[idx].v[0]; vxj[i] = parts_j[idx].v[0];
cj_cache->vy[i] = cj->parts[idx].v[1]; vyj[i] = parts_j[idx].v[1];
cj_cache->vz[i] = cj->parts[idx].v[2]; vzj[i] = parts_j[idx].v[2];
cj_cache->rho[i] = cj->parts[idx].rho; rhoj[i] = parts_j[idx].rho;
cj_cache->grad_h[i] = cj->parts[idx].force.f; grad_hj[i] = parts_j[idx].force.f;
cj_cache->pOrho2[i] = cj->parts[idx].force.P_over_rho2; pOrho2j[i] = parts_j[idx].force.P_over_rho2;
cj_cache->balsara[i] = cj->parts[idx].force.balsara; balsaraj[i] = parts_j[idx].force.balsara;
cj_cache->soundspeed[i] = cj->parts[idx].force.soundspeed; soundspeedj[i] = parts_j[idx].force.soundspeed;
} }
/* Pad cache with fake particles that exist outside the cell so will not /* Pad cache with fake particles that exist outside the cell so will not
* interact.*/ * interact.*/
float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0]; float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0];
for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) { for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) {
cj_cache->x[i] = fake_pjx; xj[i] = fake_pjx;
cj_cache->y[i] = 1.f; yj[i] = 1.f;
cj_cache->z[i] = 1.f; zj[i] = 1.f;
cj_cache->h[i] = 1.f; hj[i] = 1.f;
cj_cache->m[i] = 1.f;
cj_cache->vx[i] = 1.f;
cj_cache->vy[i] = 1.f;
cj_cache->vz[i] = 1.f;
cj_cache->rho[i] = 1.f;
cj_cache->grad_h[i] = 1.f;
cj_cache->pOrho2[i] = 1.f;
cj_cache->balsara[i] = 1.f;
cj_cache->soundspeed[i] = 1.f;
mj[i] = 1.f;
vxj[i] = 1.f;
vyj[i] = 1.f;
vzj[i] = 1.f;
rhoj[i] = 1.f;
grad_hj[i] = 1.f;
pOrho2j[i] = 1.f;
balsaraj[i] = 1.f;
soundspeedj[i] = 1.f;
} }
} }
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment