Commit c34b12e8 authored by James Willis's avatar James Willis
Browse files

Implemented Pedro's max_d algorithm improvement to intrinsic version of dopair1.

parent f54a189f
...@@ -943,8 +943,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell * ...@@ -943,8 +943,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)]; const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
/* Get some other useful values. */ /* Get some other useful values. */
const double hi_max = ci->h_max * kernel_gamma - rshift;
const double hj_max = cj->h_max * kernel_gamma;
const int count_i = ci->count; const int count_i = ci->count;
const int count_j = cj->count; const int count_j = cj->count;
struct part *restrict parts_i = ci->parts; struct part *restrict parts_i = ci->parts;
...@@ -967,14 +965,29 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell * ...@@ -967,14 +965,29 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
//cache_read_two_cells(ci, cj, ci_cache, &cj_cache, shift); //cache_read_two_cells(ci, cj, ci_cache, &cj_cache, shift);
cache_read_two_cells_sorted(ci, cj, ci_cache, &cj_cache, sort_i, sort_j, shift); cache_read_two_cells_sorted(ci, cj, ci_cache, &cj_cache, sort_i, sort_j, shift);
/* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */
/* For particles in ci */
populate_max_d(ci, cj, sort_i, sort_j, ci_cache, &cj_cache, dx_max, rshift, max_di, max_dj);
float di, dj;
int max_ind_j = count_j - 1;
/* Loop over the parts in ci. */ /* Loop over the parts in ci. */
for (int pid = count_i - 1; for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid--) {
pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) {
/* Get a hold of the ith part in ci. */ /* Get a hold of the ith part in ci. */
struct part *restrict pi = &parts_i[sort_i[pid].i]; struct part *restrict pi = &parts_i[sort_i[pid].i];
if (!part_is_active(pi, e)) continue; if (!part_is_active(pi, e)) continue;
dj = sort_j[max_ind_j].d;
while(max_ind_j > 0 && max_di[pid] < dj) {
max_ind_j--;
dj = sort_j[max_ind_j].d;
}
int exit_iteration = max_ind_j;
int ci_cache_idx = pid; //sort_i[pid].i; int ci_cache_idx = pid; //sort_i[pid].i;
const float hi = ci_cache->h[ci_cache_idx]; const float hi = ci_cache->h[ci_cache_idx];
...@@ -1014,14 +1027,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell * ...@@ -1014,14 +1027,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
curlvySum.v = vec_setzero(); curlvySum.v = vec_setzero();
curlvzSum.v = vec_setzero(); curlvzSum.v = vec_setzero();
int exit_iteration = count_j;
for (int pjd = 0; pjd < count_j ; pjd++) {
if(sort_j[pjd].d > di) {
exit_iteration = pjd;
break;
}
}
/* Pad cache if there is a serial remainder. */ /* Pad cache if there is a serial remainder. */
int exit_iteration_align = exit_iteration; int exit_iteration_align = exit_iteration;
int rem = exit_iteration % (num_vec_proc * VEC_SIZE); int rem = exit_iteration % (num_vec_proc * VEC_SIZE);
...@@ -1129,14 +1134,22 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell * ...@@ -1129,14 +1134,22 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
} /* loop over the parts in ci. */ } /* loop over the parts in ci. */
int max_ind_i = 0;
/* Loop over the parts in cj. */ /* Loop over the parts in cj. */
for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max; for (int pjd = 0; pjd < count_j && max_ind_i < count_i; pjd++) {
pjd++) {
/* Get a hold of the jth part in cj. */ /* Get a hold of the jth part in cj. */
struct part *restrict pj = &parts_j[sort_j[pjd].i]; struct part *restrict pj = &parts_j[sort_j[pjd].i];
if (!part_is_active(pj, e)) continue; if (!part_is_active(pj, e)) continue;
di = sort_i[max_ind_i].d;
while(max_ind_i < count_i - 1 && max_dj[pjd] > di) {
max_ind_i++;
di = sort_i[max_ind_i].d;
}
int exit_iteration = max_ind_i;
int cj_cache_idx = pjd; int cj_cache_idx = pjd;
const float hj = cj_cache.h[cj_cache_idx]; const float hj = cj_cache.h[cj_cache_idx];
...@@ -1177,14 +1190,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell * ...@@ -1177,14 +1190,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
curlvySum.v = vec_setzero(); curlvySum.v = vec_setzero();
curlvzSum.v = vec_setzero(); curlvzSum.v = vec_setzero();
int exit_iteration = 0;
for (int pid = count_i - 1; pid >= 0; pid--) {
if(sort_i[pid].d < dj) {
exit_iteration = pid;
break;
}
}
/* Pad cache if there is a serial remainder. */ /* Pad cache if there is a serial remainder. */
int exit_iteration_align = exit_iteration; int exit_iteration_align = exit_iteration;
int rem = exit_iteration % (num_vec_proc * VEC_SIZE); int rem = exit_iteration % (num_vec_proc * VEC_SIZE);
...@@ -1198,7 +1203,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell * ...@@ -1198,7 +1203,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
vector pivx, pivy, pivz, mi; vector pivx, pivy, pivz, mi;
/* Loop over the parts in ci. */ /* Loop over the parts in ci. */
//for (int pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) {
for (int pid = count_i - 1; pid >= 0; pid -= VEC_SIZE) { for (int pid = count_i - 1; pid >= 0; pid -= VEC_SIZE) {
/* Get the cache index to the ith particle. */ /* Get the cache index to the ith particle. */
...@@ -1306,191 +1310,191 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell * ...@@ -1306,191 +1310,191 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *
* @param ci The first #cell. * @param ci The first #cell.
* @param cj The second #cell. * @param cj The second #cell.
*/ */
void runner_dopair1_density_auto_vec(struct runner *r, struct cell *ci, struct cell *cj) { //void runner_dopair1_density_auto_vec(struct runner *r, struct cell *ci, struct cell *cj) {
//
#ifdef WITH_VECTORIZATION //#ifdef WITH_VECTORIZATION
const struct engine *restrict e = r->e; // const struct engine *restrict e = r->e;
//
TIMER_TIC; // TIMER_TIC;
//
/* Anything to do here? */ // /* Anything to do here? */
if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; // if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return;
//
#ifdef SWIFT_DEBUG_CHECKS //#ifdef SWIFT_DEBUG_CHECKS
cell_is_drifted(ci, e); // cell_is_drifted(ci, e);
cell_is_drifted(cj, e); // cell_is_drifted(cj, e);
#endif //#endif
//
/* Get the sort ID. */ // /* Get the sort ID. */
double shift[3] = {0.0, 0.0, 0.0}; // double shift[3] = {0.0, 0.0, 0.0};
const int sid = space_getsid(e->s, &ci, &cj, shift); // const int sid = space_getsid(e->s, &ci, &cj, shift);
//
/* Have the cells been sorted? */ // /* Have the cells been sorted? */
if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid))) // if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid)))
error("Trying to interact unsorted cells."); // error("Trying to interact unsorted cells.");
//
/* Get the cutoff shift. */ // /* Get the cutoff shift. */
double rshift = 0.0; // double rshift = 0.0;
for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k]; // for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k];
//
/* Pick-out the sorted lists. */ // /* Pick-out the sorted lists. */
const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)]; // const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)];
const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)]; // const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)];
//
/* Get some other useful values. */ // /* Get some other useful values. */
const int count_i = ci->count; // const int count_i = ci->count;
const int count_j = cj->count; // const int count_j = cj->count;
struct part *restrict parts_i = ci->parts; // struct part *restrict parts_i = ci->parts;
struct part *restrict parts_j = cj->parts; // struct part *restrict parts_j = cj->parts;
const double di_max = sort_i[count_i - 1].d - rshift; // const double di_max = sort_i[count_i - 1].d - rshift;
const double dj_min = sort_j[0].d; // const double dj_min = sort_j[0].d;
const float dx_max = (ci->dx_max + cj->dx_max); // const float dx_max = (ci->dx_max + cj->dx_max);
//
/* Get the particle cache from the runner and re-allocate // /* Get the particle cache from the runner and re-allocate
* the cache if it is not big enough for the cell. */ // * the cache if it is not big enough for the cell. */
//struct cache *restrict ci_cache = &r->par_cache; // //struct cache *restrict ci_cache = &r->par_cache;
//
//if (ci_cache->count < count_i) { // //if (ci_cache->count < count_i) {
// cache_init(ci_cache, count_i); // // cache_init(ci_cache, count_i);
//} // //}
//if (cj_cache.count < count_j) { // //if (cj_cache.count < count_j) {
// cache_init(&cj_cache, count_j); // // cache_init(&cj_cache, count_j);
//} // //}
//
//cache_read_two_cells(ci, cj, ci_cache, &cj_cache, shift); // //cache_read_two_cells(ci, cj, ci_cache, &cj_cache, shift);
cache_read_two_cells_sorted(ci, cj, &ci_cache, &cj_cache, sort_i, sort_j, shift); // cache_read_two_cells_sorted(ci, cj, &ci_cache, &cj_cache, sort_i, sort_j, shift);
//
/* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */ // /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */
/* For particles in ci */ // /* For particles in ci */
populate_max_d(ci, cj, sort_i, sort_j, &ci_cache, &cj_cache, dx_max, rshift, max_di, max_dj); // populate_max_d(ci, cj, sort_i, sort_j, &ci_cache, &cj_cache, dx_max, rshift, max_di, max_dj);
//
float di, dj; // float di, dj;
//
int max_ind_j = count_j - 1; // int max_ind_j = count_j - 1;
//
/* Loop over the parts in ci. */ // /* Loop over the parts in ci. */
for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid--) { // for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid--) {
//
/* Get a hold of the ith part in ci. */ // /* Get a hold of the ith part in ci. */
struct part *restrict pi = &parts_i[sort_i[pid].i]; // struct part *restrict pi = &parts_i[sort_i[pid].i];
if (!part_is_active(pi, e)) continue; // if (!part_is_active(pi, e)) continue;
//
dj = sort_j[max_ind_j].d; // dj = sort_j[max_ind_j].d;
while(max_ind_j > 0 && max_di[pid] < dj) { // while(max_ind_j > 0 && max_di[pid] < dj) {
max_ind_j--; // max_ind_j--;
//
dj = sort_j[max_ind_j].d; // dj = sort_j[max_ind_j].d;
} // }
int exit_iteration = max_ind_j; // int exit_iteration = max_ind_j;
//
int ci_cache_idx = pid; //sort_i[pid].i; // int ci_cache_idx = pid; //sort_i[pid].i;
//
const float hi = ci_cache.h[ci_cache_idx]; // const float hi = ci_cache.h[ci_cache_idx];
const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift; // const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift;
if (di < dj_min) continue; // if (di < dj_min) continue;
//
const float hig2 = hi * hi * kernel_gamma2; // const float hig2 = hi * hi * kernel_gamma2;
//
float pix, piy, piz; // float pix, piy, piz;
float vix, viy, viz; // float vix, viy, viz;
float hi_inv; // float hi_inv;
//
/* Fill particle pi vectors. */ // /* Fill particle pi vectors. */
pix = ci_cache.x[ci_cache_idx]; // pix = ci_cache.x[ci_cache_idx];
piy = ci_cache.y[ci_cache_idx]; // piy = ci_cache.y[ci_cache_idx];
piz = ci_cache.z[ci_cache_idx]; // piz = ci_cache.z[ci_cache_idx];
vix = ci_cache.vx[ci_cache_idx]; // vix = ci_cache.vx[ci_cache_idx];
viy = ci_cache.vy[ci_cache_idx]; // viy = ci_cache.vy[ci_cache_idx];
viz = ci_cache.vz[ci_cache_idx]; // viz = ci_cache.vz[ci_cache_idx];
//
/* Get the inverse of hi. */ // /* Get the inverse of hi. */
hi_inv = 1.0f / hi; // hi_inv = 1.0f / hi;
//
/* Loop over the parts in cj. */ // /* Loop over the parts in cj. */
for (int pjd = 0; pjd <= exit_iteration; pjd++) { // for (int pjd = 0; pjd <= exit_iteration; pjd++) {
//
/* Get the cache index to the jth particle. */ // /* Get the cache index to the jth particle. */
//int cj_cache_idx = pjd; //sort_j[pjd].i; // //int cj_cache_idx = pjd; //sort_j[pjd].i;
//
float dx, dy, dz, r2; // float dx, dy, dz, r2;
//
/* Compute the pairwise distance. */ // /* Compute the pairwise distance. */
dx = pix - cj_cache.x[pjd]; // dx = pix - cj_cache.x[pjd];
dy = piy - cj_cache.y[pjd]; // dy = piy - cj_cache.y[pjd];
dz = piz - cj_cache.z[pjd]; // dz = piz - cj_cache.z[pjd];
//
r2 = dx*dx + dy*dy + dz*dz; // r2 = dx*dx + dy*dy + dz*dz;
//
runner_iact_nonsym_density_jsw(r2, hig2, dx, dy, dz, hi_inv, cj_cache.h[pjd], vix, viy, viz, cj_cache.vx[pjd], cj_cache.vy[pjd], cj_cache.vz[pjd], cj_cache.m[pjd], &ci_cache.rho[pid], &ci_cache.rho_dh[pid], &ci_cache.wcount[pid], &ci_cache.wcount_dh[pid], &ci_cache.div_v[pid], &ci_cache.curl_vx[pid], &ci_cache.curl_vy[pid], &ci_cache.curl_vz[pid]); // runner_iact_nonsym_density_jsw(r2, hig2, dx, dy, dz, hi_inv, cj_cache.h[pjd], vix, viy, viz, cj_cache.vx[pjd], cj_cache.vy[pjd], cj_cache.vz[pjd], cj_cache.m[pjd], &ci_cache.rho[pid], &ci_cache.rho_dh[pid], &ci_cache.wcount[pid], &ci_cache.wcount_dh[pid], &ci_cache.div_v[pid], &ci_cache.curl_vx[pid], &ci_cache.curl_vy[pid], &ci_cache.curl_vz[pid]);
//
} /* loop over the parts in cj. */ // } /* loop over the parts in cj. */
//
} /* loop over the parts in ci. */ // } /* loop over the parts in ci. */
//
int max_ind_i = 0; // int max_ind_i = 0;
/* Loop over the parts in cj. */ // /* Loop over the parts in cj. */
for (int pjd = 0; pjd < count_j && max_ind_i < count_i; pjd++) { // for (int pjd = 0; pjd < count_j && max_ind_i < count_i; pjd++) {
//
/* Get a hold of the jth part in cj. */ // /* Get a hold of the jth part in cj. */
struct part *restrict pj = &parts_j[sort_j[pjd].i]; // struct part *restrict pj = &parts_j[sort_j[pjd].i];
if (!part_is_active(pj, e)) continue; // if (!part_is_active(pj, e)) continue;
//
di = sort_i[max_ind_i].d; // di = sort_i[max_ind_i].d;
//
while(max_ind_i < count_i - 1 && max_dj[pjd] > di) { // while(max_ind_i < count_i - 1 && max_dj[pjd] > di) {
max_ind_i++; // max_ind_i++;
//
di = sort_i[max_ind_i].d; // di = sort_i[max_ind_i].d;
} // }
int exit_iteration = max_ind_i; // int exit_iteration = max_ind_i;
//
int cj_cache_idx = pjd; // int cj_cache_idx = pjd;
//
const float hj = cj_cache.h[cj_cache_idx]; // const float hj = cj_cache.h[cj_cache_idx];
const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; // const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift;
if (dj > di_max) continue; // if (dj > di_max) continue;
//
const float hjg2 = hj * hj * kernel_gamma2; // const float hjg2 = hj * hj * kernel_gamma2;
//
float pjx, pjy, pjz; // float pjx, pjy, pjz;
float vjx, vjy, vjz; // float vjx, vjy, vjz;
float hj_inv; // float hj_inv;
//
/* Fill particle pi vectors. */ // /* Fill particle pi vectors. */
pjx = cj_cache.x[cj_cache_idx]; // pjx = cj_cache.x[cj_cache_idx];
pjy = cj_cache.y[cj_cache_idx]; // pjy = cj_cache.y[cj_cache_idx];
pjz = cj_cache.z[cj_cache_idx]; // pjz = cj_cache.z[cj_cache_idx];
vjx = cj_cache.vx[cj_cache_idx]; // vjx = cj_cache.vx[cj_cache_idx];
vjy = cj_cache.vy[cj_cache_idx]; // vjy = cj_cache.vy[cj_cache_idx];
vjz = cj_cache.vz[cj_cache_idx]; // vjz = cj_cache.vz[cj_cache_idx];
//
/* Get the inverse of hj. */ // /* Get the inverse of hj. */
hj_inv = 1.0f / hj; // hj_inv = 1.0f / hj;
//
/* Loop over the parts in ci. */ // /* Loop over the parts in ci. */
for (int pid = exit_iteration; pid < count_i; pid++) { // for (int pid = exit_iteration; pid < count_i; pid++) {
//
/* Get the cache index to the ith particle. */ // /* Get the cache index to the ith particle. */
int ci_cache_idx = pid; //sort_i[pid].i; // int ci_cache_idx = pid; //sort_i[pid].i;
//
float dx, dy, dz, r2; // float dx, dy, dz, r2;
//
/* Compute the pairwise distance. */ // /* Compute the pairwise distance. */
dx = pjx - ci_cache.x[ci_cache_idx]; // dx = pjx - ci_cache.x[ci_cache_idx];
dy = pjy - ci_cache.y[ci_cache_idx]; // dy = pjy - ci_cache.y[ci_cache_idx];
dz = pjz - ci_cache.z[ci_cache_idx]; // dz = pjz - ci_cache.z[ci_cache_idx];
//
r2 = dx*dx + dy*dy + dz*dz; // r2 = dx*dx + dy*dy + dz*dz;
//
runner_iact_nonsym_density_jsw(r2, hjg2, dx, dy, dz, hj_inv, ci_cache.h[ci_cache_idx], vjx, vjy, vjz, ci_cache.vx[ci_cache_idx], ci_cache.vy[ci_cache_idx], ci_cache.vz[ci_cache_idx], ci_cache.m[ci_cache_idx], &cj_cache.rho[cj_cache_idx], &cj_cache.rho_dh[cj_cache_idx], &cj_cache.wcount[cj_cache_idx], &cj_cache.wcount_dh[cj_cache_idx], &cj_cache.div_v[cj_cache_idx], &cj_cache.curl_vx[cj_cache_idx], &cj_cache.curl_vy[cj_cache_idx], &cj_cache.curl_vz[cj_cache_idx]); // runner_iact_nonsym_density_jsw(r2, hjg2, dx, dy, dz, hj_inv, ci_cache.h[ci_cache_idx], vjx, vjy, vjz, ci_cache.vx[ci_cache_idx], ci_cache.vy[ci_cache_idx], ci_cache.vz[ci_cache_idx], ci_cache.m[ci_cache_idx], &cj_cache.rho[cj_cache_idx], &cj_cache.rho_dh[cj_cache_idx], &cj_cache.wcount[cj_cache_idx], &cj_cache.wcount_dh[cj_cache_idx], &cj_cache.div_v[cj_cache_idx], &cj_cache.curl_vx[cj_cache_idx], &cj_cache.curl_vy[cj_cache_idx], &cj_cache.curl_vz[cj_cache_idx]);
//
} /* loop over the parts in ci. */ // } /* loop over the parts in ci. */
//
} /* loop over the parts in cj. */ // } /* loop over the parts in cj. */
//
cache_write_sorted_particles(&ci_cache, &cj_cache, ci, cj, sort_i, sort_j); // cache_write_sorted_particles(&ci_cache, &cj_cache, ci, cj, sort_i, sort_j);
//
TIMER_TOC(timer_dopair_density); // TIMER_TOC(timer_dopair_density);
//
#endif /* WITH_VECTORIZATION */ //#endif /* WITH_VECTORIZATION */
} //}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment