diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index a8b7ecb18a3b4615f1ff9b9618bec3284cb5b9ad..553843f5303446499474a8285a3fbe234fac02a5 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -889,6 +889,184 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( } #endif +/* Loop over the particles in the cell. */ +for (int pid = 0; pid < count; pid++) { + + /* Get a pointer to the ith particle. */ + pi = &parts[pid]; + + /* Is the ith particle active? */ + if (!part_is_active(pi, e)) continue; + + vector pix, piy, piz; + + const float hi = cell_cache->h[pid]; + + /* Fill particle pi vectors. */ + pix.v = vec_set1(cell_cache->x[pid]); + piy.v = vec_set1(cell_cache->y[pid]); + piz.v = vec_set1(cell_cache->z[pid]); + v_hi.v = vec_set1(hi); + v_vix.v = vec_set1(cell_cache->vx[pid]); + v_viy.v = vec_set1(cell_cache->vy[pid]); + v_viz.v = vec_set1(cell_cache->vz[pid]); + + v_rhoi.v = vec_set1(cell_cache->rho[pid]); + v_grad_hi.v = vec_set1(cell_cache->grad_h[pid]); + v_pOrhoi2.v = vec_set1(cell_cache->pOrho2[pid]); + v_balsara_i.v = vec_set1(cell_cache->balsara[pid]); + v_ci.v = vec_set1(cell_cache->soundspeed[pid]); + + const float hig2 = hi * hi * kernel_gamma2; + v_hig2.v = vec_set1(hig2); + + /* Reset cumulative sums of update vectors. */ + vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, + entropy_dtSum; + + /* Get the inverse of hi. */ + vector v_hi_inv; + + v_hi_inv = vec_reciprocal(v_hi); + + a_hydro_xSum.v = vec_setzero(); + a_hydro_ySum.v = vec_setzero(); + a_hydro_zSum.v = vec_setzero(); + h_dtSum.v = vec_setzero(); + v_sigSum.v = vec_set1(pi->force.v_sig); + entropy_dtSum.v = vec_setzero(); + + /* Pad cache if there is a serial remainder. */ + count_align = count; + int rem = count % (num_vec_proc * VEC_SIZE); + if (rem != 0) { + int pad = (num_vec_proc * VEC_SIZE) - rem; + + count_align += pad; + + /* Set positions to the same as particle pi so when the r2 > 0 mask is + * applied these extra contributions are masked out.*/ + for (int i = count; i < count_align; i++) { + cell_cache->x[i] = pix.f[0]; + cell_cache->y[i] = piy.f[0]; + cell_cache->z[i] = piz.f[0]; + cell_cache->h[i] = 1.f; + } + } + + vector pjx, pjy, pjz, hj, hjg2; + + /* Find all of particle pi's interacions and store needed values in the + * secondary cache.*/ + for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) { + + int cj_cache_idx = pjd; + /* Load 2 sets of vectors from the particle cache. */ + pjx.v = vec_load(&cell_cache->x[pjd]); + pjy.v = vec_load(&cell_cache->y[pjd]); + pjz.v = vec_load(&cell_cache->z[pjd]); + hj.v = vec_load(&cell_cache->h[pjd]); + hjg2.v = vec_mul(vec_mul(hj.v, hj.v), kernel_gamma2_vec.v); + + /* Compute the pairwise distance. */ + vector v_dx_tmp, v_dy_tmp, v_dz_tmp; + + v_dx_tmp.v = vec_sub(pix.v, pjx.v); + v_dy_tmp.v = vec_sub(piy.v, pjy.v); + v_dz_tmp.v = vec_sub(piz.v, pjz.v); + + v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v); + v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v); + v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v); + + /* Form r2 > 0 mask, r2 < hig2 mask and r2 < hjg2 mask. */ + mask_t v_doi_mask, v_doi_mask_self_check; + int doi_mask; + + /* Form r2 > 0 mask.*/ + vec_create_mask(v_doi_mask_self_check, vec_cmp_gt(v_r2.v, vec_setzero())); + + /* Form a mask from r2 < hig2 mask and r2 < hjg2 mask. */ + vector v_h2; + v_h2.v = vec_fmax(v_hig2.v, hjg2.v); + vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_h2.v)); + + /* Combine all 3 masks and form integer mask. */ + v_doi_mask.v = vec_and(v_doi_mask.v, v_doi_mask_self_check.v); + doi_mask = vec_form_int_mask(v_doi_mask); + + /* If there are any interactions left pack interaction values into c2 + * cache. */ + if (doi_mask) { + vector v_hj, v_hj_inv; + v_hj.v = vec_load(&cell_cache->h[cj_cache_idx]); + v_hj_inv = vec_reciprocal(v_hj); + + runner_iact_nonsym_1_vec_force( + &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp, v_vix, v_viy, v_viz, + v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci, + &cell_cache->vx[cj_cache_idx], &cell_cache->vy[cj_cache_idx], + &cell_cache->vz[cj_cache_idx], &cell_cache->rho[cj_cache_idx], &cell_cache->grad_h[cj_cache_idx], + &cell_cache->pOrho2[cj_cache_idx], &cell_cache->balsara[cj_cache_idx], &cell_cache->soundspeed[cj_cache_idx], &cell_cache->m[cj_cache_idx], v_hi_inv, v_hj_inv, &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, + &h_dtSum, &v_sigSum, &entropy_dtSum, v_doi_mask); + + } + + } /* Loop over all other particles. */ + + VEC_HADD(a_hydro_xSum, pi->a_hydro[0]); + VEC_HADD(a_hydro_ySum, pi->a_hydro[1]); + VEC_HADD(a_hydro_zSum, pi->a_hydro[2]); + VEC_HADD(h_dtSum, pi->force.h_dt); + VEC_HMAX(v_sigSum, pi->force.v_sig); + VEC_HADD(entropy_dtSum, pi->entropy_dt); + +} /* loop over all particles. */ + +TIMER_TOC(timer_doself_force); +#endif /* WITH_VECTORIZATION */ +} +__attribute__((always_inline)) INLINE void runner_doself2_force_vec_2( + struct runner *r, struct cell *restrict c) { +#ifdef WITH_VECTORIZATION + const struct engine *e = r->e; + struct part *restrict pi; + int count_align; + const int num_vec_proc = 1;//2; + + struct part *restrict parts = c->parts; + const int count = c->count; + + vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2; + vector v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci; + + TIMER_TIC + + if (!cell_is_active(c, e)) return; + + if (!cell_are_part_drifted(c, e)) error("Interacting undrifted cell."); + + /* Get the particle cache from the runner and re-allocate + * the cache if it is not big enough for the cell. */ + struct cache *restrict cell_cache = &r->ci_cache; + + if (cell_cache->count < count) { + cache_init(cell_cache, count); + } + + /* Read the particles from the cell and store them locally in the cache. */ + cache_read_force_particles(c, cell_cache); + +#ifdef SWIFT_DEBUG_CHECKS + for (int i = 0; i < count; i++) { + pi = &c->parts[i]; + /* Check that particles have been drifted to the current time */ + if (pi->ti_drift != e->ti_current) + error("Particle pi not drifted to current time"); + } +} +#endif + /* Create secondary cache to store particle interactions. */ struct c2_cache int_cache; int icount = 0, icount_align = 0; @@ -959,6 +1137,7 @@ for (int pid = 0; pid < count; pid++) { } vector pjx, pjy, pjz, hj, hjg2; + //vector pjx_2, pjy_2, pjz_2, hj_2, hjg2_2, v_r2_2; /* Find all of particle pi's interacions and store needed values in the * secondary cache.*/ @@ -966,40 +1145,61 @@ for (int pid = 0; pid < count; pid++) { /* Load 2 sets of vectors from the particle cache. */ pjx.v = vec_load(&cell_cache->x[pjd]); + //pjx_2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]); pjy.v = vec_load(&cell_cache->y[pjd]); + //pjy_2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]); pjz.v = vec_load(&cell_cache->z[pjd]); + //pjz_2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]); hj.v = vec_load(&cell_cache->h[pjd]); + //hj_2.v = vec_load(&cell_cache->h[pjd + VEC_SIZE]); hjg2.v = vec_mul(vec_mul(hj.v, hj.v), kernel_gamma2_vec.v); + //hjg2_2.v = vec_mul(vec_mul(hj_2.v, hj_2.v), kernel_gamma2_vec.v); /* Compute the pairwise distance. */ vector v_dx_tmp, v_dy_tmp, v_dz_tmp; + //vector v_dx_tmp_2, v_dy_tmp_2, v_dz_tmp_2; v_dx_tmp.v = vec_sub(pix.v, pjx.v); + //v_dx_tmp_2.v = vec_sub(pix.v, pjx_2.v); v_dy_tmp.v = vec_sub(piy.v, pjy.v); + //v_dy_tmp_2.v = vec_sub(piy.v, pjy_2.v); v_dz_tmp.v = vec_sub(piz.v, pjz.v); + //v_dz_tmp_2.v = vec_sub(piz.v, pjz_2.v); v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v); + //v_r2_2.v = vec_mul(v_dx_tmp_2.v, v_dx_tmp_2.v); v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v); + //v_r2_2.v = vec_mul(v_dy_tmp_2.v, v_dy_tmp_2.v); v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v); + //v_r2_2.v = vec_mul(v_dz_tmp_2.v, v_dz_tmp_2.v); /* Form r2 > 0 mask, r2 < hig2 mask and r2 < hjg2 mask. */ mask_t v_doi_mask, v_doi_mask_self_check; + //mask_t v_doi_mask_2, v_doi_mask_self_check_2; int doi_mask, doi_mask_self_check; + //int doi_mask_2, doi_mask_self_check_2; /* Form r2 > 0 mask.*/ vec_create_mask(v_doi_mask_self_check, vec_cmp_gt(v_r2.v, vec_setzero())); + //vec_create_mask(v_doi_mask_self_check_2, vec_cmp_gt(v_r2_2.v, vec_setzero())); /* Form a mask from r2 < hig2 mask and r2 < hjg2 mask. */ vector v_h2; + //vector v_h2_2; v_h2.v = vec_fmax(v_hig2.v, hjg2.v); + //v_h2_2.v = vec_fmax(v_hig2.v, hjg2_2.v); vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_h2.v)); + //vec_create_mask(v_doi_mask_2, vec_cmp_lt(v_r2_2.v, v_h2_2.v)); /* Form integer masks. */ doi_mask_self_check = vec_form_int_mask(v_doi_mask_self_check); + //doi_mask_self_check_2 = vec_form_int_mask(v_doi_mask_self_check_2); doi_mask = vec_form_int_mask(v_doi_mask); + //doi_mask_2 = vec_form_int_mask(v_doi_mask_2); /* Combine all 3 masks. */ doi_mask = doi_mask & doi_mask_self_check; + //doi_mask_2 = doi_mask_2 & doi_mask_self_check_2; /* If there are any interactions left pack interaction values into c2 * cache. */ @@ -1011,6 +1211,15 @@ for (int pid = 0; pid < count; pid++) { &h_dtSum, &v_sigSum, &entropy_dtSum, v_hi_inv, v_vix, v_viy, v_viz, v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci, 2); } + + //if (doi_mask_2) { + + // storeForceInteractions( + // doi_mask_2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp_2, &v_dy_tmp_2, &v_dz_tmp_2, cell_cache, + // &int_cache, &icount, &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, + // &h_dtSum, &v_sigSum, &entropy_dtSum, v_hi_inv, v_vix, v_viy, v_viz, + // v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci, 2); + //} } /* Loop over all other particles. */