diff --git a/src/cache.h b/src/cache.h index 6739c2020e897d54e6586c9d121490aaab5661bc..c4ca563a0a87cd7e55f8a72f9c84421ac9dfdbd2 100644 --- a/src/cache.h +++ b/src/cache.h @@ -465,6 +465,144 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( } } +/** + * @brief Populate caches by only reading particles that are within range of + * each other within the adjoining cell.Also read the particles into the cache + * in sorted order. + * + * @param ci The i #cell. + * @param cj The j #cell. + * @param ci_cache The #cache for cell ci. + * @param cj_cache The #cache for cell cj. + * @param sort_i The array of sorted particle indices for cell ci. + * @param sort_j The array of sorted particle indices for cell ci. + * @param shift The amount to shift the particle positions to account for BCs + * @param first_pi The first particle in cell ci that is in range. + * @param last_pj The last particle in cell cj that is in range. + * @param num_vec_proc Number of vectors that will be used to process the + * interaction. + */ +__attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted_force( + const struct cell *const ci, const struct cell *const cj, + struct cache *const ci_cache, struct cache *const cj_cache, + const struct entry *restrict sort_i, const struct entry *restrict sort_j, + const double *const shift, int *first_pi, int *last_pj, + const int num_vec_proc) { + + int idx, ci_cache_idx; + /* Pad number of particles read to the vector size. */ + int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE); + if (rem != 0) { + int pad = (num_vec_proc * VEC_SIZE) - rem; + + if (*first_pi - pad >= 0) *first_pi -= pad; + } + + rem = *last_pj % (num_vec_proc * VEC_SIZE); + if (rem != 0) { + int pad = (num_vec_proc * VEC_SIZE) - rem; + + if (*last_pj + pad < cj->count) *last_pj += pad; + } + + int first_pi_align = *first_pi; + int last_pj_align = *last_pj; + +/* Shift the particles positions to a local frame (ci frame) so single precision + * can be + * used instead of double precision. Also shift the cell ci, particles positions + * due to BCs but leave cell cj. */ +#if defined(WITH_VECTORIZATION) && defined(__ICC) +#pragma vector aligned +#endif + for (int i = first_pi_align; i < ci->count; i++) { + /* Make sure ci_cache is filled from the first element. */ + ci_cache_idx = i - first_pi_align; + idx = sort_i[i].i; + ci_cache->x[ci_cache_idx] = ci->parts[idx].x[0] - ci->loc[0] - shift[0]; + ci_cache->y[ci_cache_idx] = ci->parts[idx].x[1] - ci->loc[1] - shift[1]; + ci_cache->z[ci_cache_idx] = ci->parts[idx].x[2] - ci->loc[2] - shift[2]; + ci_cache->h[ci_cache_idx] = ci->parts[idx].h; + + ci_cache->m[ci_cache_idx] = ci->parts[idx].mass; + ci_cache->vx[ci_cache_idx] = ci->parts[idx].v[0]; + ci_cache->vy[ci_cache_idx] = ci->parts[idx].v[1]; + ci_cache->vz[ci_cache_idx] = ci->parts[idx].v[2]; + + ci_cache->rho[ci_cache_idx] = ci->parts[idx].rho; + ci_cache->grad_h[ci_cache_idx] = ci->parts[idx].force.f; + ci_cache->pOrho2[ci_cache_idx] = ci->parts[idx].force.P_over_rho2; + ci_cache->balsara[ci_cache_idx] = ci->parts[idx].force.balsara; + ci_cache->soundspeed[ci_cache_idx] = ci->parts[idx].force.soundspeed; + } + + /* Pad cache with fake particles that exist outside the cell so will not + * interact.*/ + float fake_pix = 2.0f * ci->parts[sort_i[ci->count - 1].i].x[0]; + for (int i = ci->count - first_pi_align; + i < ci->count - first_pi_align + VEC_SIZE; i++) { + ci_cache->x[i] = fake_pix; + ci_cache->y[i] = 1.f; + ci_cache->z[i] = 1.f; + ci_cache->h[i] = 1.f; + + ci_cache->m[i] = 1.f; + ci_cache->vx[i] = 1.f; + ci_cache->vy[i] = 1.f; + ci_cache->vz[i] = 1.f; + + ci_cache->rho[i] = 1.f; + ci_cache->grad_h[i] = 1.f; + ci_cache->pOrho2[i] = 1.f; + ci_cache->balsara[i] = 1.f; + ci_cache->soundspeed[i] = 1.f; + } + +#if defined(WITH_VECTORIZATION) && defined(__ICC) +#pragma vector aligned +#endif + for (int i = 0; i <= last_pj_align; i++) { + idx = sort_j[i].i; + cj_cache->x[i] = cj->parts[idx].x[0] - ci->loc[0]; + cj_cache->y[i] = cj->parts[idx].x[1] - ci->loc[1]; + cj_cache->z[i] = cj->parts[idx].x[2] - ci->loc[2]; + cj_cache->h[i] = cj->parts[idx].h; + + cj_cache->m[i] = cj->parts[idx].mass; + cj_cache->vx[i] = cj->parts[idx].v[0]; + cj_cache->vy[i] = cj->parts[idx].v[1]; + cj_cache->vz[i] = cj->parts[idx].v[2]; + + cj_cache->rho[i] = cj->parts[idx].rho; + cj_cache->grad_h[i] = cj->parts[idx].force.f; + cj_cache->pOrho2[i] = cj->parts[idx].force.P_over_rho2; + cj_cache->balsara[i] = cj->parts[idx].force.balsara; + cj_cache->soundspeed[i] = cj->parts[idx].force.soundspeed; + } + + /* Pad cache with fake particles that exist outside the cell so will not + * interact.*/ + float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0]; + for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) { + cj_cache->x[i] = fake_pjx; + cj_cache->y[i] = 1.f; + cj_cache->z[i] = 1.f; + cj_cache->h[i] = 1.f; + + cj_cache->m[i] = 1.f; + cj_cache->vx[i] = 1.f; + cj_cache->vy[i] = 1.f; + cj_cache->vz[i] = 1.f; + + cj_cache->rho[i] = 1.f; + cj_cache->grad_h[i] = 1.f; + cj_cache->pOrho2[i] = 1.f; + cj_cache->balsara[i] = 1.f; + cj_cache->soundspeed[i] = 1.f; + + } +} + /* @brief Clean the memory allocated by a #cache object. * * @param c The #cache to clean. diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h index fc62845ce1788a85d90e6e748e648b7887be2065..dbf04a4625e142bf7771b7153f30078194ea27c8 100644 --- a/src/hydro/Gadget2/hydro_iact.h +++ b/src/hydro/Gadget2/hydro_iact.h @@ -1163,18 +1163,17 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( #ifdef WITH_VECTORIZATION __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force( - float *R2, float *Dx, float *Dy, float *Dz, vector *vix, vector *viy, + vector *r2, vector *dx, vector *dy, vector *dz, vector *vix, vector *viy, vector *viz, vector *pirho, vector *grad_hi, vector *piPOrho2, - vector *balsara_i, vector *ci, float *Vjx, float *Vjy, float *Vjz, + vector *balsara_i, vector *ci, float *Vjx, float *Vjy, float *Vjz, float *Mj, float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj, - float *Mj, vector *hi_inv, float *Hj_inv, vector *a_hydro_xSum, + vector *hi_inv, vector *hj, vector *a_hydro_xSum, vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, vector *entropy_dtSum, mask_t mask) { #ifdef WITH_VECTORIZATION - vector r, r2, ri; - vector dx, dy, dz; + vector r, ri; vector vjx, vjy, vjz; vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj, hj_inv; vector xi, xj; @@ -1187,11 +1186,6 @@ runner_iact_nonsym_1_vec_force( vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt; /* Fill vectors. */ - r2.v = vec_load(R2); - dx.v = vec_load(Dx); - dy.v = vec_load(Dy); - dz.v = vec_load(Dz); - vjx.v = vec_load(Vjx); vjy.v = vec_load(Vjy); vjz.v = vec_load(Vjz); @@ -1202,7 +1196,7 @@ runner_iact_nonsym_1_vec_force( pjPOrho2.v = vec_load(PjPOrho2); balsara_j.v = vec_load(Balsara_j); cj.v = vec_load(Cj); - hj_inv.v = vec_load(Hj_inv); + hj_inv = vec_reciprocal(*hj); fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */ @@ -1210,8 +1204,8 @@ runner_iact_nonsym_1_vec_force( balsara.v = balsara_i->v + balsara_j.v; /* Get the radius and inverse radius. */ - ri = vec_reciprocal_sqrt(r2); - r.v = r2.v * ri.v; + ri = vec_reciprocal_sqrt(*r2); + r.v = r2->v * ri.v; /* Get the kernel for hi. */ hid_inv = pow_dimension_plus_one_vec(*hi_inv); @@ -1223,14 +1217,14 @@ runner_iact_nonsym_1_vec_force( hjd_inv = pow_dimension_plus_one_vec(hj_inv); xj.v = r.v * hj_inv.v; - /* Calculate the kernel for two particles. */ + /* Calculate the kernel. */ kernel_eval_dWdx_force_vec(&xj, &wj_dx); wj_dr.v = hjd_inv.v * wj_dx.v; /* Compute dv dot r. */ - dvdr.v = ((vix->v - vjx.v) * dx.v) + ((viy->v - vjy.v) * dy.v) + - ((viz->v - vjz.v) * dz.v); + dvdr.v = ((vix->v - vjx.v) * dx->v) + ((viy->v - vjy.v) * dy->v) + + ((viz->v - vjz.v) * dz->v); /* Compute the relative velocity. (This is 0 if the particles move away from * each other and negative otherwise) */ @@ -1255,9 +1249,9 @@ runner_iact_nonsym_1_vec_force( acc.v = visc_term.v + sph_term.v; /* Use the force, Luke! */ - piax.v = mj.v * dx.v * acc.v; - piay.v = mj.v * dy.v * acc.v; - piaz.v = mj.v * dz.v * acc.v; + piax.v = mj.v * dx->v * acc.v; + piay.v = mj.v * dy->v * acc.v; + piaz.v = mj.v * dz->v * acc.v; /* Get the time derivative for h. */ pih_dt.v = mj.v * dvdr.v * ri.v / pjrho.v * wi_dr.v; diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index 91f49703224a6b114aa61547964c1acab216a4fb..b4decbc208c02a8771fceefd2c1fb13f1c63424d 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -1424,3 +1424,464 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, #endif /* WITH_VECTORIZATION */ } + +/** + * @brief Compute the force interactions between a cell pair (non-symmetric) + * using vector intrinsics. + * + * @param r The #runner. + * @param ci The first #cell. + * @param cj The second #cell. + */ +void runner_dopair2_force_vec(struct runner *r, struct cell *ci, + struct cell *cj) { + +#ifdef WITH_VECTORIZATION + const struct engine *restrict e = r->e; + + vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2; + vector v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci; + + TIMER_TIC; + + /* Anything to do here? */ + if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; + + if (!cell_are_part_drifted(ci, e) || !cell_are_part_drifted(cj, e)) + error("Interacting undrifted cells."); + + /* Get the sort ID. */ + double shift[3] = {0.0, 0.0, 0.0}; + const int sid = space_getsid(e->s, &ci, &cj, shift); + + /* Have the cells been sorted? */ + if (!(ci->sorted & (1 << sid)) || ci->dx_max_sort > space_maxreldx * ci->dmin) + runner_do_sort(r, ci, (1 << sid), 1); + if (!(cj->sorted & (1 << sid)) || cj->dx_max_sort > space_maxreldx * cj->dmin) + runner_do_sort(r, cj, (1 << sid), 1); + + /* Get the cutoff shift. */ + double rshift = 0.0; + for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k]; + + /* Pick-out the sorted lists. */ + const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)]; + const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)]; + +#ifdef SWIFT_DEBUG_CHECKS + /* Check that the dx_max_sort values in the cell are indeed an upper + bound on particle movement. */ + for (int pid = 0; pid < ci->count; pid++) { + const struct part *p = &ci->parts[sort_i[pid].i]; + const float d = p->x[0] * runner_shift[sid][0] + + p->x[1] * runner_shift[sid][1] + + p->x[2] * runner_shift[sid][2]; + if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort > + 1.0e-6 * max(fabsf(d), ci->dx_max_sort)) + error("particle shift diff exceeds dx_max_sort."); + } + for (int pjd = 0; pjd < cj->count; pjd++) { + const struct part *p = &cj->parts[sort_j[pjd].i]; + const float d = p->x[0] * runner_shift[sid][0] + + p->x[1] * runner_shift[sid][1] + + p->x[2] * runner_shift[sid][2]; + if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort > + 1.0e-6 * max(fabsf(d), cj->dx_max_sort)) + error("particle shift diff exceeds dx_max_sort."); + } +#endif /* SWIFT_DEBUG_CHECKS */ + + /* Get some other useful values. */ + const int count_i = ci->count; + const int count_j = cj->count; + //const double hi_max = ci->h_max * kernel_gamma - rshift; + //const double hj_max = cj->h_max * kernel_gamma; + struct part *restrict parts_i = ci->parts; + struct part *restrict parts_j = cj->parts; + //const double di_max = sort_i[count_i - 1].d - rshift; + //const double dj_min = sort_j[0].d; + const float dx_max = (ci->dx_max_sort + cj->dx_max_sort); + + /* Check if any particles are active and return if there are not. */ + //int numActive = 0; + //for (int pid = count_i - 1; + // pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) { + // struct part *restrict pi = &parts_i[sort_i[pid].i]; + // if (part_is_active(pi, e)) { + // numActive++; + // break; + // } + //} + + //if (!numActive) { + // for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max; + // pjd++) { + // struct part *restrict pj = &parts_j[sort_j[pjd].i]; + // if (part_is_active(pj, e)) { + // numActive++; + // break; + // } + // } + //} + + //if (numActive == 0) return; + + /* Get both particle caches from the runner and re-allocate + * them if they are not big enough for the cells. */ + struct cache *restrict ci_cache = &r->ci_cache; + struct cache *restrict cj_cache = &r->cj_cache; + + if (ci_cache->count < count_i) { + cache_init(ci_cache, count_i); + } + if (cj_cache->count < count_j) { + cache_init(cj_cache, count_j); + } + + int first_pi, last_pj; + float *max_di __attribute__((aligned(sizeof(float) * VEC_SIZE))); + float *max_dj __attribute__((aligned(sizeof(float) * VEC_SIZE))); + + max_di = r->ci_cache.max_d; + max_dj = r->cj_cache.max_d; + + /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */ + /* Also find the first pi that interacts with any particle in cj and the last + * pj that interacts with any particle in ci. */ + populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, max_di, + max_dj, &first_pi, &last_pj, e); + + /* Find the maximum index into cj that is required by a particle in ci. */ + /* Find the maximum index into ci that is required by a particle in cj. */ + float di, dj; + int max_ind_j = count_j - 1; + int max_ind_i = 0; + + dj = sort_j[max_ind_j].d; + while (max_ind_j > 0 && max_di[count_i - 1] < dj) { + max_ind_j--; + + dj = sort_j[max_ind_j].d; + } + + di = sort_i[max_ind_i].d; + while (max_ind_i < count_i - 1 && max_dj[0] > di) { + max_ind_i++; + + di = sort_i[max_ind_i].d; + } + + /* Limits of the outer loops. */ + //int first_pi_loop = first_pi; + //int last_pj_loop = last_pj; + + /* Take the max/min of both values calculated to work out how many particles + * to read into the cache. */ + last_pj = max(last_pj, max_ind_j); + first_pi = min(first_pi, max_ind_i); + + last_pj = count_j - 1; + first_pi = 0; + + /* Read the needed particles into the two caches. */ + int first_pi_align = first_pi; + int last_pj_align = last_pj; + cache_read_two_partial_cells_sorted_force(ci, cj, ci_cache, cj_cache, sort_i, + sort_j, shift, &first_pi_align, + &last_pj_align, 1); + + last_pj_align = count_j - 1; + first_pi_align = 0; + + /* Get the number of particles read into the ci cache. */ + int ci_cache_count = count_i - first_pi_align; + + if (cell_is_active(ci, e)) { + + /* Loop over the parts in ci. */ + //for (int pid = count_i - 1; pid >= first_pi_loop && max_ind_j >= 0; pid--) { + for (int pid = count_i - 1; pid >= 0; pid--) { + + /* Get a hold of the ith part in ci. */ + struct part *restrict pi = &parts_i[sort_i[pid].i]; + if (!part_is_active(pi, e)) continue; + + /* Determine the exit iteration of the interaction loop. */ + //dj = sort_j[max_ind_j].d; + //while (max_ind_j > 0 && max_di[pid] < dj) { + // max_ind_j--; + + // dj = sort_j[max_ind_j].d; + //} + //int exit_iteration = max_ind_j + 1; + int exit_iteration = count_j; + + /* Set the cache index. */ + int ci_cache_idx = pid - first_pi_align; + + const float hi = ci_cache->h[ci_cache_idx]; + const float hig2 = hi * hi * kernel_gamma2; + + vector pix, piy, piz; + + /* Fill particle pi vectors. */ + pix.v = vec_set1(ci_cache->x[ci_cache_idx]); + piy.v = vec_set1(ci_cache->y[ci_cache_idx]); + piz.v = vec_set1(ci_cache->z[ci_cache_idx]); + v_hi.v = vec_set1(hi); + v_vix.v = vec_set1(ci_cache->vx[ci_cache_idx]); + v_viy.v = vec_set1(ci_cache->vy[ci_cache_idx]); + v_viz.v = vec_set1(ci_cache->vz[ci_cache_idx]); + + v_rhoi.v = vec_set1(ci_cache->rho[ci_cache_idx]); + v_grad_hi.v = vec_set1(ci_cache->grad_h[ci_cache_idx]); + v_pOrhoi2.v = vec_set1(ci_cache->pOrho2[ci_cache_idx]); + v_balsara_i.v = vec_set1(ci_cache->balsara[ci_cache_idx]); + v_ci.v = vec_set1(ci_cache->soundspeed[ci_cache_idx]); + + v_hig2.v = vec_set1(hig2); + + /* Reset cumulative sums of update vectors. */ + vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, + entropy_dtSum; + + /* Get the inverse of hi. */ + vector v_hi_inv; + v_hi_inv = vec_reciprocal(v_hi); + + a_hydro_xSum.v = vec_setzero(); + a_hydro_ySum.v = vec_setzero(); + a_hydro_zSum.v = vec_setzero(); + h_dtSum.v = vec_setzero(); + v_sigSum.v = vec_set1(pi->force.v_sig); + entropy_dtSum.v = vec_setzero(); + + /* Pad the exit iteration if there is a serial remainder. */ + int exit_iteration_align = exit_iteration; + int rem = exit_iteration % VEC_SIZE; + if (rem != 0) { + int pad = VEC_SIZE - rem; + + if (exit_iteration_align + pad <= last_pj_align + 1) + exit_iteration_align += pad; + } + + vector pjx, pjy, pjz, hj, hjg2; + + /* Loop over the parts in cj. */ + for (int pjd = 0; pjd < exit_iteration_align; pjd += VEC_SIZE) { + + /* Get the cache index to the jth particle. */ + int cj_cache_idx = pjd; + + vector v_dx, v_dy, v_dz; + +#ifdef SWIFT_DEBUG_CHECKS + if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0) { + error("Unaligned read!!! cj_cache_idx=%d", cj_cache_idx); + } +#endif + + /* Load 2 sets of vectors from the particle cache. */ + pjx.v = vec_load(&cj_cache->x[cj_cache_idx]); + pjy.v = vec_load(&cj_cache->y[cj_cache_idx]); + pjz.v = vec_load(&cj_cache->z[cj_cache_idx]); + hj.v = vec_load(&cj_cache->h[cj_cache_idx]); + hjg2.v = vec_mul(vec_mul(hj.v, hj.v), kernel_gamma2_vec.v); + + /* Compute the pairwise distance. */ + v_dx.v = vec_sub(pix.v, pjx.v); + v_dy.v = vec_sub(piy.v, pjy.v); + v_dz.v = vec_sub(piz.v, pjz.v); + + v_r2.v = vec_mul(v_dx.v, v_dx.v); + v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); + v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); + + mask_t v_doi_mask; + int doi_mask; + + /* Form a mask from r2 < hig2 mask and r2 < hjg2 mask. */ + vector v_h2; + v_h2.v = vec_fmax(v_hig2.v, hjg2.v); + vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_h2.v)); + + /* Form integer masks. */ + doi_mask = vec_form_int_mask(v_doi_mask); + + /* If there are any interactions perform them. */ + if (doi_mask) { + + runner_iact_nonsym_1_vec_force( + &v_r2, &v_dx, &v_dy, &v_dz, &v_vix, &v_viy, &v_viz, + &v_rhoi, &v_grad_hi, &v_pOrhoi2, &v_balsara_i, &v_ci, + &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx], + &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], + &cj_cache->rho[cj_cache_idx], &cj_cache->grad_h[cj_cache_idx], + &cj_cache->pOrho2[cj_cache_idx], &cj_cache->balsara[cj_cache_idx], + &cj_cache->soundspeed[cj_cache_idx], &v_hi_inv, &hj, + &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, + &h_dtSum, &v_sigSum, &entropy_dtSum, v_doi_mask); + + } + + } /* loop over the parts in cj. */ + + /* Perform horizontal adds on vector sums and store result in particle pi. + */ + VEC_HADD(a_hydro_xSum, pi->a_hydro[0]); + VEC_HADD(a_hydro_ySum, pi->a_hydro[1]); + VEC_HADD(a_hydro_zSum, pi->a_hydro[2]); + VEC_HADD(h_dtSum, pi->force.h_dt); + VEC_HMAX(v_sigSum, pi->force.v_sig); + VEC_HADD(entropy_dtSum, pi->entropy_dt); + + } /* loop over the parts in ci. */ + } + + if (cell_is_active(cj, e)) { + /* Loop over the parts in cj. */ + //for (int pjd = 0; pjd <= last_pj_loop && max_ind_i < count_i; pjd++) { + for (int pjd = 0; pjd < count_j; pjd++) { + + /* Get a hold of the jth part in cj. */ + struct part *restrict pj = &parts_j[sort_j[pjd].i]; + if (!part_is_active(pj, e)) continue; + + /* Determine the exit iteration of the interaction loop. */ + //di = sort_i[max_ind_i].d; + //while (max_ind_i < count_i - 1 && max_dj[pjd] > di) { + // max_ind_i++; + + // di = sort_i[max_ind_i].d; + //} + //int exit_iteration = max_ind_i; + int exit_iteration = 0; + + /* Set the cache index. */ + int cj_cache_idx = pjd; + + const float hj = cj_cache->h[cj_cache_idx]; + const float hjg2 = hj * hj * kernel_gamma2; + + vector pjx, pjy, pjz; + vector v_hj, v_vjx, v_vjy, v_vjz, v_hjg2; + vector v_rhoj, v_grad_hj, v_pOrhoj2, v_balsara_j, v_cj; + + /* Fill particle pi vectors. */ + pjx.v = vec_set1(cj_cache->x[cj_cache_idx]); + pjy.v = vec_set1(cj_cache->y[cj_cache_idx]); + pjz.v = vec_set1(cj_cache->z[cj_cache_idx]); + v_hj.v = vec_set1(hj); + v_vjx.v = vec_set1(cj_cache->vx[cj_cache_idx]); + v_vjy.v = vec_set1(cj_cache->vy[cj_cache_idx]); + v_vjz.v = vec_set1(cj_cache->vz[cj_cache_idx]); + + v_rhoj.v = vec_set1(cj_cache->rho[cj_cache_idx]); + v_grad_hj.v = vec_set1(cj_cache->grad_h[cj_cache_idx]); + v_pOrhoj2.v = vec_set1(cj_cache->pOrho2[cj_cache_idx]); + v_balsara_j.v = vec_set1(cj_cache->balsara[cj_cache_idx]); + v_cj.v = vec_set1(cj_cache->soundspeed[cj_cache_idx]); + + v_hjg2.v = vec_set1(hjg2); + + /* Reset cumulative sums of update vectors. */ + vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, + entropy_dtSum; + + /* Get the inverse of hj. */ + vector v_hj_inv; + + v_hj_inv = vec_reciprocal(v_hj); + + a_hydro_xSum.v = vec_setzero(); + a_hydro_ySum.v = vec_setzero(); + a_hydro_zSum.v = vec_setzero(); + h_dtSum.v = vec_setzero(); + v_sigSum.v = vec_set1(pj->force.v_sig); + entropy_dtSum.v = vec_setzero(); + + /* Convert exit iteration to cache indices. */ + int exit_iteration_align = exit_iteration - first_pi_align; + + /* Pad the exit iteration align so cache reads are aligned. */ + int rem = exit_iteration_align % VEC_SIZE; + if (exit_iteration_align < VEC_SIZE) { + exit_iteration_align = 0; + } else + exit_iteration_align -= rem; + + vector pix, piy, piz, hi, hig2; + + /* Loop over the parts in ci. */ + for (int ci_cache_idx = exit_iteration_align; + ci_cache_idx < ci_cache_count; ci_cache_idx += VEC_SIZE) { + +#ifdef SWIFT_DEBUG_CHECKS + if (ci_cache_idx % VEC_SIZE != 0 || ci_cache_idx < 0) { + error("Unaligned read!!! ci_cache_idx=%d", ci_cache_idx); + } +#endif + + vector v_dx, v_dy, v_dz, v_r2; + + /* Load 2 sets of vectors from the particle cache. */ + pix.v = vec_load(&ci_cache->x[ci_cache_idx]); + piy.v = vec_load(&ci_cache->y[ci_cache_idx]); + piz.v = vec_load(&ci_cache->z[ci_cache_idx]); + hi.v = vec_load(&ci_cache->h[ci_cache_idx]); + hig2.v = vec_mul(vec_mul(hi.v, hi.v), kernel_gamma2_vec.v); + + /* Compute the pairwise distance. */ + v_dx.v = vec_sub(pjx.v, pix.v); + v_dy.v = vec_sub(pjy.v, piy.v); + v_dz.v = vec_sub(pjz.v, piz.v); + + v_r2.v = vec_mul(v_dx.v, v_dx.v); + v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); + v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); + + mask_t v_doj_mask; + int doj_mask; + + /* Form a mask from r2 < hig2 mask and r2 < hjg2 mask. */ + vector v_h2; + v_h2.v = vec_fmax(v_hjg2.v, hig2.v); + vec_create_mask(v_doj_mask, vec_cmp_lt(v_r2.v, v_h2.v)); + + /* Form integer masks. */ + doj_mask = vec_form_int_mask(v_doj_mask); + + /* If there are any interactions perform them. */ + if (doj_mask) { + + runner_iact_nonsym_1_vec_force( + &v_r2, &v_dx, &v_dy, &v_dz, &v_vjx, &v_vjy, &v_vjz, + &v_rhoj, &v_grad_hj, &v_pOrhoj2, &v_balsara_j, &v_cj, + &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx], + &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], + &ci_cache->rho[ci_cache_idx], &ci_cache->grad_h[ci_cache_idx], + &ci_cache->pOrho2[ci_cache_idx], &ci_cache->balsara[ci_cache_idx], + &ci_cache->soundspeed[ci_cache_idx], &v_hj_inv, &hi, + &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, + &h_dtSum, &v_sigSum, &entropy_dtSum, v_doj_mask); + + } + } /* loop over the parts in ci. */ + + /* Perform horizontal adds on vector sums and store result in particle pj. + */ + VEC_HADD(a_hydro_xSum, pj->a_hydro[0]); + VEC_HADD(a_hydro_ySum, pj->a_hydro[1]); + VEC_HADD(a_hydro_zSum, pj->a_hydro[2]); + VEC_HADD(h_dtSum, pj->force.h_dt); + VEC_HMAX(v_sigSum, pj->force.v_sig); + VEC_HADD(entropy_dtSum, pj->entropy_dt); + + } /* loop over the parts in cj. */ + + TIMER_TOC(timer_dopair_density); + } + +#endif /* WITH_VECTORIZATION */ +} diff --git a/src/runner_doiact_vec.h b/src/runner_doiact_vec.h index 50d0722d577c38a4cb3cce35a339795b399161fe..49b3a506ba312c39c8b0a408cffa2287a95a245b 100644 --- a/src/runner_doiact_vec.h +++ b/src/runner_doiact_vec.h @@ -38,5 +38,6 @@ void runner_doself1_density_vec(struct runner *r, struct cell *restrict c); void runner_doself2_force_vec(struct runner *r, struct cell *restrict c); void runner_dopair1_density_vec(struct runner *r, struct cell *restrict ci, struct cell *restrict cj); - +void runner_dopair2_force_vec(struct runner *r, struct cell *restrict ci, + struct cell *restrict cj); #endif /* SWIFT_RUNNER_VEC_H */