diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h index 4cc0ac273941664be6aa43c92c1d7c30e7d3db30..0104868879aaffd3e9bd6549a82199fadfd9f755 100644 --- a/src/hydro/Gadget2/hydro_iact.h +++ b/src/hydro/Gadget2/hydro_iact.h @@ -275,56 +275,6 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density( pi->density.rot_v[2] += fac * curlvr[2]; } -__attribute__((always_inline)) INLINE static void -runner_iact_nonsym_density_jsw( - const float r2, const float hig2, const float dx, const float dy, - const float dz, const float h_inv, const float hj, const float vi_x, - const float vi_y, const float vi_z, const float vj_x, const float vj_y, - const float vj_z, const float mj, float *const restrict rho, - float *const restrict rho_dh, float *const restrict wcount, - float *const restrict wcount_dh, float *const restrict div_v, - float *const restrict curl_vx, float *const restrict curl_vy, - float *const restrict curl_vz) { - - if (r2 < hig2) { - - float wi, wi_dx; - - /* Get r and r inverse. */ - const float r = sqrtf(r2); - const float ri = 1.0f / r; - - /* Compute kernel function */ - const float u = r * h_inv; - kernel_deval(u, &wi, &wi_dx); - - const float fac = mj * wi_dx * ri; - - /* Compute dv dot r */ - const float dv_x = vi_x - vj_x; - const float dv_y = vi_y - vj_y; - const float dv_z = vi_z - vj_z; - const float dvdr = dv_x * dx + dv_y * dy + dv_z * dz; - *div_v -= fac * dvdr; - - /* Compute dv cross r */ - const float curlvr_x = dv_y * dz - dv_z * dy; - const float curlvr_y = dv_z * dx - dv_x * dz; - const float curlvr_z = dv_x * dy - dv_y * dx; - - /* Compute contribution to the density */ - *rho += mj * wi; - *rho_dh -= mj * (3.0f * wi + u * wi_dx); - - /* Compute contribution to the number of neighbours */ - *wcount += wi; - *wcount_dh -= u * wi_dx; - *curl_vx += fac * curlvr_x; - *curl_vy += fac * curlvr_y; - *curl_vz += fac * curlvr_z; - } -} - /** * @brief Density loop (non-symmetric vectorized version) */ @@ -425,8 +375,13 @@ runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj, } #ifdef WITH_VECTORIZATION + +/** + * @brief Density interaction computed using 1 vector + * (non-symmetric vectorized version). + */ __attribute__((always_inline)) INLINE static void -runner_iact_nonsym_intrinsic_vec_density( +runner_iact_nonsym_1_vec_density( vector *r2, vector *dx, vector *dy, vector *dz, vector hi_inv, vector vix, vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj, vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, @@ -518,220 +473,6 @@ runner_iact_nonsym_intrinsic_vec_density( #endif } -__attribute__((always_inline)) INLINE static void -runner_iact_nonsym_intrinsic_vec_2_density( - const struct cache *const cj_cache, const int *const indices, vector *r2, - vector *dx, vector *dy, vector *dz, vector hi_inv, vector vix, vector viy, - vector viz, vector *rhoSum, vector *rho_dhSum, vector *wcountSum, - vector *wcount_dhSum, vector *div_vSum, vector *curlvxSum, - vector *curlvySum, vector *curlvzSum, vector mask, int knlMask) { - - // vector r, ri, r2, xi, wi, wi_dx; - vector r, ri, xi, wi, wi_dx; - vector mj; - // vector dx, dy, dz, dvx, dvy, dvz; - vector dvx, dvy, dvz; - vector vjx, vjy, vjz; - vector dvdr; - vector curlvrx, curlvry, curlvrz; - - /* Fill the vectors. */ - mj.v = vec_set(cj_cache->m[indices[0]], cj_cache->m[indices[1]], - cj_cache->m[indices[2]], cj_cache->m[indices[3]], - cj_cache->m[indices[4]], cj_cache->m[indices[5]], - cj_cache->m[indices[6]], cj_cache->m[indices[7]]); - vjx.v = vec_set(cj_cache->vx[indices[0]], cj_cache->vx[indices[1]], - cj_cache->vx[indices[2]], cj_cache->vx[indices[3]], - cj_cache->vx[indices[4]], cj_cache->vx[indices[5]], - cj_cache->vx[indices[6]], cj_cache->vx[indices[7]]); - vjy.v = vec_set(cj_cache->vy[indices[0]], cj_cache->vy[indices[1]], - cj_cache->vy[indices[2]], cj_cache->vy[indices[3]], - cj_cache->vy[indices[4]], cj_cache->vy[indices[5]], - cj_cache->vy[indices[6]], cj_cache->vy[indices[7]]); - vjz.v = vec_set(cj_cache->vz[indices[0]], cj_cache->vz[indices[1]], - cj_cache->vz[indices[2]], cj_cache->vz[indices[3]], - cj_cache->vz[indices[4]], cj_cache->vz[indices[5]], - cj_cache->vz[indices[6]], cj_cache->vz[indices[7]]); - // dx.v = vec_load(Dx); - // dy.v = vec_load(Dy); - // dz.v = vec_load(Dz); - - /* Get the radius and inverse radius. */ - // r2.v = vec_load(R2); - ri = vec_reciprocal_sqrt(*r2); - r.v = vec_mul(r2->v, ri.v); - - xi.v = vec_mul(r.v, hi_inv.v); - - /* Calculate the kernel for two particles. */ - kernel_deval_1_vec(&xi, &wi, &wi_dx); - - /* Compute dv. */ - dvx.v = vec_sub(vix.v, vjx.v); - dvy.v = vec_sub(viy.v, vjy.v); - dvz.v = vec_sub(viz.v, vjz.v); - - /* Compute dv dot r */ - dvdr.v = vec_fma(dvx.v, dx->v, vec_fma(dvy.v, dy->v, vec_mul(dvz.v, dz->v))); - dvdr.v = vec_mul(dvdr.v, ri.v); - - /* Compute dv cross r */ - curlvrx.v = - vec_fma(dvy.v, dz->v, vec_mul(vec_set1(-1.0f), vec_mul(dvz.v, dy->v))); - curlvry.v = - vec_fma(dvz.v, dx->v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz->v))); - curlvrz.v = - vec_fma(dvx.v, dy->v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx->v))); - curlvrx.v = vec_mul(curlvrx.v, ri.v); - curlvry.v = vec_mul(curlvry.v, ri.v); - curlvrz.v = vec_mul(curlvrz.v, ri.v); - -/* Mask updates to intermediate vector sums for particle pi. */ -#ifdef HAVE_AVX512_F - rhoSum->v = - _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v); - - rho_dhSum->v = - _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v, - vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(xi.v, wi_dx.v)))); - - wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v); - - wcount_dhSum->v = _mm512_mask_sub_ps(wcount_dhSum->v, knlMask, - wcount_dhSum->v, vec_mul(xi.v, wi_dx.v)); - - div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v, - vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v))); - - curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask, - vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), - curlvxSum->v); - - curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask, - vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), - curlvySum->v); - - curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask, - vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), - curlvzSum->v); -#else - rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v); - rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(xi.v, wi_dx.v))), - mask.v); - wcountSum->v += vec_and(wi.v, mask.v); - wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v), mask.v); - div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v); - curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v); - curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v); - curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v); -#endif -} - -/** - * @brief Density interaction computed using 2 interleaved vectors - * (non-symmetric vectorized version). - */ -__attribute__((always_inline)) INLINE static void -runner_iact_nonsym_1_vec_density( - float *R2, float *Dx, float *Dy, float *Dz, vector hi_inv, vector vix, - vector viy, vector viz, float *Vjx, float *Vjy, float *Vjz, float *Mj, - vector *rhoSum, vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, - vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, - vector mask, vector mask2, int knlMask, int knlMask2) { - - vector r, ri, r2, xi, wi, wi_dx; - vector mj; - vector dx, dy, dz, dvx, dvy, dvz; - vector vjx, vjy, vjz; - vector dvdr; - vector curlvrx, curlvry, curlvrz; - - /* Fill the vectors. */ - mj.v = vec_load(Mj); - vjx.v = vec_load(Vjx); - vjy.v = vec_load(Vjy); - vjz.v = vec_load(Vjz); - dx.v = vec_load(Dx); - dy.v = vec_load(Dy); - dz.v = vec_load(Dz); - - /* Get the radius and inverse radius. */ - r2.v = vec_load(R2); - ri = vec_reciprocal_sqrt(r2); - r.v = vec_mul(r2.v, ri.v); - - xi.v = vec_mul(r.v, hi_inv.v); - - /* Calculate the kernel for two particles. */ - kernel_deval_1_vec(&xi, &wi, &wi_dx); - - /* Compute dv. */ - dvx.v = vec_sub(vix.v, vjx.v); - dvy.v = vec_sub(viy.v, vjy.v); - dvz.v = vec_sub(viz.v, vjz.v); - - /* Compute dv dot r */ - dvdr.v = vec_fma(dvx.v, dx.v, vec_fma(dvy.v, dy.v, vec_mul(dvz.v, dz.v))); - dvdr.v = vec_mul(dvdr.v, ri.v); - - /* Compute dv cross r */ - curlvrx.v = - vec_fma(dvy.v, dz.v, vec_mul(vec_set1(-1.0f), vec_mul(dvz.v, dy.v))); - curlvry.v = - vec_fma(dvz.v, dx.v, vec_mul(vec_set1(-1.0f), vec_mul(dvx.v, dz.v))); - curlvrz.v = - vec_fma(dvx.v, dy.v, vec_mul(vec_set1(-1.0f), vec_mul(dvy.v, dx.v))); - curlvrx.v = vec_mul(curlvrx.v, ri.v); - curlvry.v = vec_mul(curlvry.v, ri.v); - curlvrz.v = vec_mul(curlvrz.v, ri.v); - -/* Mask updates to intermediate vector sums for particle pi. */ -#ifdef HAVE_AVX512_F - rhoSum->v = - _mm512_mask_add_ps(rhoSum->v, knlMask, vec_mul(mj.v, wi.v), rhoSum->v); - - rho_dhSum->v = - _mm512_mask_sub_ps(rho_dhSum->v, knlMask, rho_dhSum->v, - vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(xi.v, wi_dx.v)))); - - wcountSum->v = _mm512_mask_add_ps(wcountSum->v, knlMask, wi.v, wcountSum->v); - - wcount_dhSum->v = _mm512_mask_sub_ps(wcount_dhSum->v, knlMask, - wcount_dhSum->v, vec_mul(xi.v, wi_dx.v)); - - div_vSum->v = _mm512_mask_sub_ps(div_vSum->v, knlMask, div_vSum->v, - vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v))); - - curlvxSum->v = _mm512_mask_add_ps(curlvxSum->v, knlMask, - vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), - curlvxSum->v); - - curlvySum->v = _mm512_mask_add_ps(curlvySum->v, knlMask, - vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), - curlvySum->v); - - curlvzSum->v = _mm512_mask_add_ps(curlvzSum->v, knlMask, - vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), - curlvzSum->v); -#else - rhoSum->v += vec_and(vec_mul(mj.v, wi.v), mask.v); - rho_dhSum->v -= vec_and(vec_mul(mj.v, vec_fma(vec_set1(hydro_dimension), wi.v, - vec_mul(xi.v, wi_dx.v))), - mask.v); - wcountSum->v += vec_and(wi.v, mask.v); - wcount_dhSum->v -= vec_and(vec_mul(xi.v, wi_dx.v), mask.v); - div_vSum->v -= vec_and(vec_mul(mj.v, vec_mul(dvdr.v, wi_dx.v)), mask.v); - curlvxSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrx.v, wi_dx.v)), mask.v); - curlvySum->v += vec_and(vec_mul(mj.v, vec_mul(curlvry.v, wi_dx.v)), mask.v); - curlvzSum->v += vec_and(vec_mul(mj.v, vec_mul(curlvrz.v, wi_dx.v)), mask.v); -#endif -} -#endif - -#ifdef WITH_VECTORIZATION /** * @brief Density interaction computed using 2 interleaved vectors * (non-symmetric vectorized version). diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index 9f9a9c49a2a18c4905713b5472fa3c6d423c9259..33e6fc06b4537a72bf361fc2e4599e5f3dce9657 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -260,52 +260,6 @@ __attribute__((always_inline)) INLINE static void storeInteractions( } } -/* @brief Populates the arrays max_di and max_dj with the maximum distances of - * particles into their neighbouring cells. - * @param ci #cell pointer to ci - * @param cj #cell pointer to cj - * @param sort_i #entry array for particle distance in ci - * @param sort_j #entry array for particle distance in cj - * @param ci_cache #cache for cell ci - * @param cj_cache #cache for cell cj - * @param dx_max maximum particle movement allowed in cell - * @param rshift cutoff shift - * @param max_di array to hold the maximum distances of pi particles into cell - * cj - * @param max_dj array to hold the maximum distances of pj particles into cell - * cj - */ -__attribute__((always_inline)) INLINE static void populate_max_d( - const struct cell *ci, const struct cell *cj, - const struct entry *restrict sort_i, const struct entry *restrict sort_j, - const struct cache *ci_cache, const struct cache *cj_cache, - const float dx_max, const float rshift, float *max_di, float *max_dj) { - - float h = ci_cache->h[0]; - float d; - - /* For particles in ci */ - max_di[0] = sort_i[0].d + h * kernel_gamma + dx_max - rshift; - - for (int k = 1; k < ci->count; k++) { - h = ci_cache->h[k]; - d = sort_i[k].d + h * kernel_gamma + dx_max - rshift; - - max_di[k] = fmaxf(max_di[k - 1], d); - } - - /* For particles in cj */ - h = cj_cache->h[0]; - max_dj[0] = sort_j[0].d - h * kernel_gamma - dx_max - rshift; - - for (int k = 1; k < cj->count; k++) { - h = cj_cache->h[k]; - d = sort_j[k].d - h * kernel_gamma - dx_max - rshift; - - max_dj[k] = fmaxf(max_dj[k - 1], d); - } -} - /* @brief Populates the arrays max_di and max_dj with the maximum distances of * particles into their neighbouring cells. Also finds the first pi that * interacts with any particle in cj and the last pj that interacts with any @@ -1224,7 +1178,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, /* If there are any interactions perform them. */ if (doi_mask) - runner_iact_nonsym_intrinsic_vec_density( + runner_iact_nonsym_1_vec_density( &v_r2, &v_dx, &v_dy, &v_dz, v_hi_inv, v_vix, v_viy, v_viz, &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx], &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], &rhoSum, @@ -1353,7 +1307,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, /* If there are any interactions perform them. */ if (doj_mask) - runner_iact_nonsym_intrinsic_vec_density( + runner_iact_nonsym_1_vec_density( &v_r2, &v_dx, &v_dy, &v_dz, v_hj_inv, v_vjx, v_vjy, v_vjz, &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx], &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &rhoSum, @@ -1384,2439 +1338,3 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, #endif /* WITH_VECTORIZATION */ } - -FILE *faceIntFile; -FILE *edgeIntFile; -FILE *cornerIntFile; - -/** C2_CACHE VERSION - * @brief Compute the interactions between a cell pair (non-symmetric). - * - * @param r The #runner. - * @param ci The first #cell. - * @param cj The second #cell. - */ -void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, - struct cell *cj) { - -#ifdef WITH_VECTORIZATION - const struct engine *restrict e = r->e; - - static int faceIntCount = 0; - static int faceCtr = 0; - static int edgeIntCount = 0; - static int edgeCtr = 0; - static int cornerIntCount = 0; - static int cornerCtr = 0; - static int numFaceTested = 0; - static int numEdgeTested = 0; - static int numCornerTested = 0; - int icount = 0, icount_align = 0; - struct c2_cache int_cache; - int num_vec_proc = 1; - - vector v_hi, v_vix, v_viy, v_viz, v_hig2; - - TIMER_TIC; - - if (faceCtr + edgeCtr + cornerCtr == 0) { - faceIntFile = fopen("particle_interactions_face.dat", "w"); - edgeIntFile = fopen("particle_interactions_edge.dat", "w"); - cornerIntFile = fopen("particle_interactions_corner.dat", "w"); - } - - /* Anything to do here? */ - if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; - -#ifdef SWIFT_DEBUG_CHECKS - cell_is_drifted(ci, e); - cell_is_drifted(cj, e); -#endif - - /* Get the sort ID. */ - double shift[3] = {0.0, 0.0, 0.0}; - const int sid = space_getsid(e->s, &ci, &cj, shift); - - int face = (sid == 4 || sid == 10 || sid == 12); - int edge = - (sid == 1 || sid == 3 || sid == 5 || sid == 7 || sid == 9 || sid == 11); - int corner = (sid == 0 || sid == 2 || sid == 6 || sid == 8); - - /* Have the cells been sorted? */ - if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid))) - error("Trying to interact unsorted cells."); - - /* Get the cutoff shift. */ - double rshift = 0.0; - for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k]; - - /* Pick-out the sorted lists. */ - const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)]; - const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)]; - - /* Get some other useful values. */ - const int count_i = ci->count; - const int count_j = cj->count; - struct part *restrict parts_i = ci->parts; - struct part *restrict parts_j = cj->parts; - const double di_max = sort_i[count_i - 1].d - rshift; - const double dj_min = sort_j[0].d; - const float dx_max = (ci->dx_max + cj->dx_max); - - /* Get the particle cache from the runner and re-allocate - * the cache if it is not big enough for the cell. */ - struct cache *restrict ci_cache = &r->ci_cache; - struct cache *restrict cj_cache = &r->cj_cache; - - if (ci_cache->count < count_i) { - cache_init(ci_cache, count_i); - } - if (cj_cache->count < count_j) { - cache_init(cj_cache, count_j); - } - - cache_read_two_cells_sorted(ci, cj, ci_cache, cj_cache, sort_i, sort_j, - shift); - - float *max_di __attribute__((aligned(sizeof(float) * VEC_SIZE))); - float *max_dj __attribute__((aligned(sizeof(float) * VEC_SIZE))); - - max_di = r->ci_cache.max_d; - max_dj = r->cj_cache.max_d; - - /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */ - /* For particles in ci */ - populate_max_d(ci, cj, sort_i, sort_j, ci_cache, cj_cache, dx_max, rshift, - max_di, max_dj); - - float di, dj; - - int max_ind_j = count_j - 1; - - /* Loop over the parts in ci. */ - for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid--) { - - /* Get a hold of the ith part in ci. */ - struct part *restrict pi = &parts_i[sort_i[pid].i]; - if (!part_is_active(pi, e)) continue; - - dj = sort_j[max_ind_j].d; - while (max_ind_j > 0 && max_di[pid] < dj) { - max_ind_j--; - - dj = sort_j[max_ind_j].d; - } - int exit_iteration = max_ind_j + 1; - - int ci_cache_idx = pid; // sort_i[pid].i; - - const float hi = ci_cache->h[ci_cache_idx]; - const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift; - if (di < dj_min) continue; - - const float hig2 = hi * hi * kernel_gamma2; - - vector pix, piy, piz; - - /* Fill particle pi vectors. */ - pix.v = vec_set1(ci_cache->x[ci_cache_idx]); - piy.v = vec_set1(ci_cache->y[ci_cache_idx]); - piz.v = vec_set1(ci_cache->z[ci_cache_idx]); - v_hi.v = vec_set1(hi); - v_vix.v = vec_set1(ci_cache->vx[ci_cache_idx]); - v_viy.v = vec_set1(ci_cache->vy[ci_cache_idx]); - v_viz.v = vec_set1(ci_cache->vz[ci_cache_idx]); - - v_hig2.v = vec_set1(hig2); - - /* Reset cumulative sums of update vectors. */ - vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, - curlvySum, curlvzSum; - - /* Get the inverse of hi. */ - vector v_hi_inv; - - v_hi_inv = vec_reciprocal(v_hi); - - rhoSum.v = vec_setzero(); - rho_dhSum.v = vec_setzero(); - wcountSum.v = vec_setzero(); - wcount_dhSum.v = vec_setzero(); - div_vSum.v = vec_setzero(); - curlvxSum.v = vec_setzero(); - curlvySum.v = vec_setzero(); - curlvzSum.v = vec_setzero(); - - /* Pad cache if there is a serial remainder. */ - int exit_iteration_align = exit_iteration; - int rem = exit_iteration % (num_vec_proc * VEC_SIZE); - if (rem != 0) { - int pad = (num_vec_proc * VEC_SIZE) - rem; - - exit_iteration_align += pad; - } - - vector pjx, pjy, pjz; - vector pjvx, pjvy, pjvz, mj; - - /* Loop over the parts in cj. */ - for (int pjd = 0; pjd < exit_iteration_align; pjd += VEC_SIZE) { - - /* Get the cache index to the jth particle. */ - int cj_cache_idx = pjd; // sort_j[pjd].i; - - vector v_dx, v_dy, v_dz, v_r2; - - /* Load 2 sets of vectors from the particle cache. */ - pjx.v = vec_load(&cj_cache->x[cj_cache_idx]); - pjy.v = vec_load(&cj_cache->y[cj_cache_idx]); - pjz.v = vec_load(&cj_cache->z[cj_cache_idx]); - pjvx.v = vec_load(&cj_cache->vx[cj_cache_idx]); - pjvy.v = vec_load(&cj_cache->vy[cj_cache_idx]); - pjvz.v = vec_load(&cj_cache->vz[cj_cache_idx]); - mj.v = vec_load(&cj_cache->m[cj_cache_idx]); - - /* Compute the pairwise distance. */ - v_dx.v = vec_sub(pix.v, pjx.v); - v_dy.v = vec_sub(piy.v, pjy.v); - v_dz.v = vec_sub(piz.v, pjz.v); - - v_r2.v = vec_mul(v_dx.v, v_dx.v); - v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); - v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); - - vector v_doi_mask; - int doi_mask; - - /* Form r2 < hig2 mask. */ - v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v); - - /* Form integer mask. */ - doi_mask = vec_cmp_result(v_doi_mask.v); - - /* If there are any interactions left pack interaction values into c2 - * cache. */ - if (doi_mask) - storeInteractions(doi_mask, cj_cache_idx, &v_r2, &v_dx, &v_dy, &v_dz, - &mj, &pjvx, &pjvy, &pjvz, cj_cache, &int_cache, - &icount, &rhoSum, &rho_dhSum, &wcountSum, - &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, - &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); - - } /* loop over the parts in cj. */ - - /* Perform padded vector remainder interactions if any are present. */ - calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum, &wcountSum, - &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, - &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz, - &icount_align); - - /* Initialise masks to true in case remainder interactions have been - * performed. */ - vector int_mask, int_mask2; -#ifdef HAVE_AVX512_F - KNL_MASK_16 knl_mask = 0xFFFF; - KNL_MASK_16 knl_mask2 = 0xFFFF; - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#else - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#endif - - /* Perform interaction with 2 vectors. */ - for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) { - runner_iact_nonsym_2_vec_density( - &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd], - &int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz, - &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd], - &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, - &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2, -#ifdef HAVE_AVX512_F - knl_mask, knl_mask2); -#else - 0, 0); -#endif - } - - /* Perform horizontal adds on vector sums and store result in particle pi. - */ - VEC_HADD(rhoSum, pi->rho); - VEC_HADD(rho_dhSum, pi->density.rho_dh); - VEC_HADD(wcountSum, pi->density.wcount); - VEC_HADD(wcount_dhSum, pi->density.wcount_dh); - VEC_HADD(div_vSum, pi->density.div_v); - VEC_HADD(curlvxSum, pi->density.rot_v[0]); - VEC_HADD(curlvySum, pi->density.rot_v[1]); - VEC_HADD(curlvzSum, pi->density.rot_v[2]); - - if (face) { - faceIntCount += icount; - fprintf(faceIntFile, "%d\n", icount); - numFaceTested++; - } else if (edge) { - edgeIntCount += icount; - fprintf(edgeIntFile, "%d\n", icount); - numEdgeTested++; - } else if (corner) { - cornerIntCount += icount; - fprintf(cornerIntFile, "%d\n", icount); - numCornerTested++; - } - - icount = 0; - - } /* loop over the parts in ci. */ - - if (face) { - faceCtr++; - message( - "Total number of face interactions: %d, average per particle: %f, " - "number tested: %d.", - faceIntCount, ((float)faceIntCount) / ((float)numFaceTested), - numFaceTested); - } else if (edge) { - edgeCtr++; - message( - "Total number of edge interactions: %d, average per particle: %f, " - "number tested: %d", - edgeIntCount, ((float)edgeIntCount) / ((float)numEdgeTested), - numEdgeTested); - } else if (corner) { - cornerCtr++; - message( - "Total number of corner interactions: %d, average per particle: %f, " - "number tested: %d", - cornerIntCount, ((float)cornerIntCount) / ((float)numCornerTested), - numCornerTested); - } - - int max_ind_i = 0; - /* Loop over the parts in cj. */ - for (int pjd = 0; pjd < count_j && max_ind_i < count_i; pjd++) { - - /* Get a hold of the jth part in cj. */ - struct part *restrict pj = &parts_j[sort_j[pjd].i]; - if (!part_is_active(pj, e)) continue; - - di = sort_i[max_ind_i].d; - while (max_ind_i < count_i - 1 && max_dj[pjd] > di) { - max_ind_i++; - - di = sort_i[max_ind_i].d; - } - int exit_iteration = max_ind_i - 1; - - int cj_cache_idx = pjd; - - const float hj = cj_cache->h[cj_cache_idx]; - const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; - if (dj > di_max) continue; - - const float hjg2 = hj * hj * kernel_gamma2; - - vector pjx, pjy, pjz; - vector v_hj, v_vjx, v_vjy, v_vjz, v_hjg2; - - /* Fill particle pi vectors. */ - pjx.v = vec_set1(cj_cache->x[cj_cache_idx]); - pjy.v = vec_set1(cj_cache->y[cj_cache_idx]); - pjz.v = vec_set1(cj_cache->z[cj_cache_idx]); - v_hj.v = vec_set1(hj); - v_vjx.v = vec_set1(cj_cache->vx[cj_cache_idx]); - v_vjy.v = vec_set1(cj_cache->vy[cj_cache_idx]); - v_vjz.v = vec_set1(cj_cache->vz[cj_cache_idx]); - - v_hjg2.v = vec_set1(hjg2); - - /* Reset cumulative sums of update vectors. */ - vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, - curlvySum, curlvzSum; - - /* Get the inverse of hj. */ - vector v_hj_inv; - - v_hj_inv = vec_reciprocal(v_hj); - - rhoSum.v = vec_setzero(); - rho_dhSum.v = vec_setzero(); - wcountSum.v = vec_setzero(); - wcount_dhSum.v = vec_setzero(); - div_vSum.v = vec_setzero(); - curlvxSum.v = vec_setzero(); - curlvySum.v = vec_setzero(); - curlvzSum.v = vec_setzero(); - - /* Pad cache if there is a serial remainder. */ - int exit_iteration_align = exit_iteration; - int rem = exit_iteration % (num_vec_proc * VEC_SIZE); - if (rem != 0) { - int pad = (num_vec_proc * VEC_SIZE) - rem; - - exit_iteration_align -= pad; - } - - vector pix, piy, piz; - vector pivx, pivy, pivz, mi; - - /* Loop over the parts in ci. */ - // for (int pid = count_i - 1; pid >= 0; pid -= VEC_SIZE) { - for (int pid = exit_iteration_align; pid < count_i; - pid += (num_vec_proc * VEC_SIZE)) { - - /* Get the cache index to the ith particle. */ - int ci_cache_idx = pid; // sort_i[pid].i; - - vector v_dx, v_dy, v_dz, v_r2; - - /* Load 2 sets of vectors from the particle cache. */ - pix.v = vec_load(&ci_cache->x[ci_cache_idx]); - piy.v = vec_load(&ci_cache->y[ci_cache_idx]); - piz.v = vec_load(&ci_cache->z[ci_cache_idx]); - pivx.v = vec_load(&ci_cache->vx[ci_cache_idx]); - pivy.v = vec_load(&ci_cache->vy[ci_cache_idx]); - pivz.v = vec_load(&ci_cache->vz[ci_cache_idx]); - mi.v = vec_load(&ci_cache->m[ci_cache_idx]); - - /* Compute the pairwise distance. */ - v_dx.v = vec_sub(pjx.v, pix.v); - v_dy.v = vec_sub(pjy.v, piy.v); - v_dz.v = vec_sub(pjz.v, piz.v); - - v_r2.v = vec_mul(v_dx.v, v_dx.v); - v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); - v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); - - vector v_doj_mask; - int doj_mask; - - /* Form r2 < hig2 mask. */ - v_doj_mask.v = vec_cmp_lt(v_r2.v, v_hjg2.v); - - /* Form integer mask. */ - doj_mask = vec_cmp_result(v_doj_mask.v); - - /* If there are any interactions left pack interaction values into c2 - * cache. */ - if (doj_mask) - storeInteractions(doj_mask, ci_cache_idx, &v_r2, &v_dx, &v_dy, &v_dz, - &mi, &pivx, &pivy, &pivz, ci_cache, &int_cache, - &icount, &rhoSum, &rho_dhSum, &wcountSum, - &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, - &curlvzSum, v_hj_inv, v_vjx, v_vjy, v_vjz); - - } /* loop over the parts in cj. */ - - /* Perform padded vector remainder interactions if any are present. */ - calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum, &wcountSum, - &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, - &curlvzSum, v_hj_inv, v_vjx, v_vjy, v_vjz, - &icount_align); - - /* Initialise masks to true in case remainder interactions have been - * performed. */ - vector int_mask, int_mask2; -#ifdef HAVE_AVX512_F - KNL_MASK_16 knl_mask = 0xFFFF; - KNL_MASK_16 knl_mask2 = 0xFFFF; - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#else - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#endif - - /* Perform interaction with 2 vectors. */ - for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) { - runner_iact_nonsym_2_vec_density( - &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd], - &int_cache.dzq[pjd], v_hj_inv, v_vjx, v_vjy, v_vjz, - &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd], - &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, - &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2, -#ifdef HAVE_AVX512_F - knl_mask, knl_mask2); -#else - 0, 0); -#endif - } - - /* Perform horizontal adds on vector sums and store result in particle pi. - */ - VEC_HADD(rhoSum, pj->rho); - VEC_HADD(rho_dhSum, pj->density.rho_dh); - VEC_HADD(wcountSum, pj->density.wcount); - VEC_HADD(wcount_dhSum, pj->density.wcount_dh); - VEC_HADD(div_vSum, pj->density.div_v); - VEC_HADD(curlvxSum, pj->density.rot_v[0]); - VEC_HADD(curlvySum, pj->density.rot_v[1]); - VEC_HADD(curlvzSum, pj->density.rot_v[2]); - - icount = 0; - - } /* loop over the parts in ci. */ - - if (face) { - faceCtr++; - message( - "Total number of face interactions: %d, average per particle: %f, " - "number tested: %d.", - faceIntCount, ((float)faceIntCount) / ((float)numFaceTested), - numFaceTested); - } else if (edge) { - edgeCtr++; - message( - "Total number of edge interactions: %d, average per particle: %f, " - "number tested: %d", - edgeIntCount, ((float)edgeIntCount) / ((float)numEdgeTested), - numEdgeTested); - } else if (corner) { - cornerCtr++; - message( - "Total number of corner interactions: %d, average per particle: %f, " - "number tested: %d", - cornerIntCount, ((float)cornerIntCount) / ((float)numCornerTested), - numCornerTested); - } - TIMER_TOC(timer_dopair_density); - -#endif /* WITH_VECTORIZATION */ -} - -/* Similar to AUTO-VEC but process 2 pi at a time and use two vectors in - * interaction loop. */ -void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, - struct cell *cj) { - -#ifdef WITH_VECTORIZATION - const struct engine *restrict e = r->e; - - const int num_vec_proc = 2; - - vector v_hi, v_vix, v_viy, v_viz, v_hig2; - vector v_hi_2, v_vix2, v_viy2, v_viz2, v_hig2_2; - - TIMER_TIC; - - /* Anything to do here? */ - if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; - -#ifdef SWIFT_DEBUG_CHECKS - cell_is_drifted(ci, e); - cell_is_drifted(cj, e); -#endif - - /* Get the sort ID. */ - double shift[3] = {0.0, 0.0, 0.0}; - const int sid = space_getsid(e->s, &ci, &cj, shift); - - /* Have the cells been sorted? */ - if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid))) - error("Trying to interact unsorted cells."); - - /* Get the cutoff shift. */ - double rshift = 0.0; - for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k]; - - /* Pick-out the sorted lists. */ - const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)]; - const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)]; - - /* Get some other useful values. */ - const int count_i = ci->count; - const int count_j = cj->count; - struct part *restrict parts_i = ci->parts; - struct part *restrict parts_j = cj->parts; - const double di_max = sort_i[count_i - 1].d - rshift; - const double dj_min = sort_j[0].d; - const float dx_max = (ci->dx_max + cj->dx_max); - - /* Get the particle cache from the runner and re-allocate - * the cache if it is not big enough for the cell. */ - struct cache *restrict ci_cache = &r->ci_cache; - struct cache *restrict cj_cache = &r->cj_cache; - - if (ci_cache->count < count_i) { - cache_init(ci_cache, count_i); - } - if (cj_cache->count < count_j) { - cache_init(cj_cache, count_j); - } - - float *max_di __attribute__((aligned(sizeof(float) * VEC_SIZE))); - float *max_dj __attribute__((aligned(sizeof(float) * VEC_SIZE))); - - max_di = r->ci_cache.max_d; - max_dj = r->cj_cache.max_d; - - int first_pi, last_pj; - /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */ - /* For particles in ci */ - populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, max_di, - max_dj, &first_pi, &last_pj); - - float di, dj; - - int max_ind_j = count_j - 1; - int max_ind_i = 0; - - dj = sort_j[max_ind_j].d; - while (max_ind_j > 0 && max_di[count_i - 1] < dj) { - max_ind_j--; - - dj = sort_j[max_ind_j].d; - } - - di = sort_i[max_ind_i].d; - while (max_ind_i < count_i - 1 && max_dj[0] > di) { - max_ind_i++; - - di = sort_i[max_ind_i].d; - } - - last_pj = max(last_pj, max_ind_j); - first_pi = min(first_pi, max_ind_i); - - cache_read_two_partial_cells_sorted(ci, cj, ci_cache, cj_cache, sort_i, - sort_j, shift, &first_pi, &last_pj, - num_vec_proc); - - /* Loop over the parts in ci. */ - for (int pid = count_i - 1; pid >= first_pi && max_ind_j >= 0; pid -= 2) { - - /* Get a hold of the ith part in ci. */ - struct part *restrict pi = &parts_i[sort_i[pid].i]; - struct part *restrict pi2 = &parts_i[sort_i[pid - 1].i]; - if (!part_is_active(pi, e)) continue; - - dj = sort_j[max_ind_j].d; - while (max_ind_j > 0 && max_di[pid] < dj) { - max_ind_j--; - - dj = sort_j[max_ind_j].d; - } - int exit_iteration = max_ind_j; - - int ci_cache_idx = pid; // sort_i[pid].i; - - const float hi = ci_cache->h[ci_cache_idx]; - const float hi_2 = ci_cache->h[ci_cache_idx - 1]; - const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift; - if (di < dj_min) continue; - - const float hig2 = hi * hi * kernel_gamma2; - const float hig2_2 = hi_2 * hi_2 * kernel_gamma2; - - vector pix, piy, piz; - vector pix2, piy2, piz2; - - /* Fill particle pi vectors. */ - pix.v = vec_set1(ci_cache->x[ci_cache_idx]); - piy.v = vec_set1(ci_cache->y[ci_cache_idx]); - piz.v = vec_set1(ci_cache->z[ci_cache_idx]); - v_hi.v = vec_set1(hi); - v_vix.v = vec_set1(ci_cache->vx[ci_cache_idx]); - v_viy.v = vec_set1(ci_cache->vy[ci_cache_idx]); - v_viz.v = vec_set1(ci_cache->vz[ci_cache_idx]); - - v_hig2.v = vec_set1(hig2); - - pix2.v = vec_set1(ci_cache->x[ci_cache_idx - 1]); - piy2.v = vec_set1(ci_cache->y[ci_cache_idx - 1]); - piz2.v = vec_set1(ci_cache->z[ci_cache_idx - 1]); - v_hi_2.v = vec_set1(hi_2); - v_vix2.v = vec_set1(ci_cache->vx[ci_cache_idx - 1]); - v_viy2.v = vec_set1(ci_cache->vy[ci_cache_idx - 1]); - v_viz2.v = vec_set1(ci_cache->vz[ci_cache_idx - 1]); - - v_hig2_2.v = vec_set1(hig2_2); - - /* Reset cumulative sums of update vectors. */ - vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, - curlvySum, curlvzSum; - - vector rhoSum2, rho_dhSum2, wcountSum2, wcount_dhSum2, div_vSum2, - curlvxSum2, curlvySum2, curlvzSum2; - - /* Get the inverse of hi. */ - vector v_hi_inv; - vector v_hi_inv_2; - - v_hi_inv = vec_reciprocal(v_hi); - v_hi_inv_2 = vec_reciprocal(v_hi_2); - - rhoSum.v = vec_setzero(); - rho_dhSum.v = vec_setzero(); - wcountSum.v = vec_setzero(); - wcount_dhSum.v = vec_setzero(); - div_vSum.v = vec_setzero(); - curlvxSum.v = vec_setzero(); - curlvySum.v = vec_setzero(); - curlvzSum.v = vec_setzero(); - - rhoSum2.v = vec_setzero(); - rho_dhSum2.v = vec_setzero(); - wcountSum2.v = vec_setzero(); - wcount_dhSum2.v = vec_setzero(); - div_vSum2.v = vec_setzero(); - curlvxSum2.v = vec_setzero(); - curlvySum2.v = vec_setzero(); - curlvzSum2.v = vec_setzero(); - - /* Pad cache if there is a serial remainder. */ - int exit_iteration_align = exit_iteration; - int rem = exit_iteration % (num_vec_proc * VEC_SIZE); - if (rem != 0) { - int pad = (num_vec_proc * VEC_SIZE) - rem; - - exit_iteration_align += pad; - } - - vector pjx, pjy, pjz; - vector pjx2, pjy2, pjz2; - - /* Loop over the parts in cj. */ - for (int pjd = 0; pjd < exit_iteration_align; - pjd += (num_vec_proc * VEC_SIZE)) { - - /* Get the cache index to the jth particle. */ - int cj_cache_idx = pjd; // sort_j[pjd].i; - - vector v_dx, v_dy, v_dz, v_r2; - vector v_dx2, v_dy2, v_dz2, v_r2_2; - vector v2_dx, v2_dy, v2_dz, v2_r2; - vector v2_dx2, v2_dy2, v2_dz2, v2_r2_2; - - /* Load 2 sets of vectors from the particle cache. */ - pjx.v = vec_load(&cj_cache->x[cj_cache_idx]); - pjx2.v = vec_load(&cj_cache->x[cj_cache_idx + VEC_SIZE]); - pjy.v = vec_load(&cj_cache->y[cj_cache_idx]); - pjy2.v = vec_load(&cj_cache->y[cj_cache_idx + VEC_SIZE]); - pjz.v = vec_load(&cj_cache->z[cj_cache_idx]); - pjz2.v = vec_load(&cj_cache->z[cj_cache_idx + VEC_SIZE]); - // pjvx.v = vec_load(&cj_cache.vx[cj_cache_idx]); - // pjvy.v = vec_load(&cj_cache.vy[cj_cache_idx]); - // pjvz.v = vec_load(&cj_cache.vz[cj_cache_idx]); - // mj.v = vec_load(&cj_cache.m[cj_cache_idx]); - - /* Compute the pairwise distance. */ - v_dx.v = vec_sub(pix.v, pjx.v); - v_dx2.v = vec_sub(pix.v, pjx2.v); - v2_dx.v = vec_sub(pix2.v, pjx.v); - v2_dx2.v = vec_sub(pix2.v, pjx2.v); - - v_dy.v = vec_sub(piy.v, pjy.v); - v_dy2.v = vec_sub(piy.v, pjy2.v); - v2_dy.v = vec_sub(piy2.v, pjy.v); - v2_dy2.v = vec_sub(piy2.v, pjy2.v); - - v_dz.v = vec_sub(piz.v, pjz.v); - v_dz2.v = vec_sub(piz.v, pjz2.v); - v2_dz.v = vec_sub(piz2.v, pjz.v); - v2_dz2.v = vec_sub(piz2.v, pjz2.v); - - v_r2.v = vec_mul(v_dx.v, v_dx.v); - v_r2_2.v = vec_mul(v_dx2.v, v_dx2.v); - v2_r2.v = vec_mul(v2_dx.v, v2_dx.v); - v2_r2_2.v = vec_mul(v2_dx2.v, v2_dx2.v); - - v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); - v_r2_2.v = vec_fma(v_dy2.v, v_dy2.v, v_r2_2.v); - v2_r2.v = vec_fma(v2_dy.v, v2_dy.v, v2_r2.v); - v2_r2_2.v = vec_fma(v2_dy2.v, v2_dy2.v, v2_r2_2.v); - - v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); - v_r2_2.v = vec_fma(v_dz2.v, v_dz2.v, v_r2_2.v); - v2_r2.v = vec_fma(v2_dz.v, v2_dz.v, v2_r2.v); - v2_r2_2.v = vec_fma(v2_dz2.v, v2_dz2.v, v2_r2_2.v); - - vector v_doi_mask, v_doi_mask2; - int doi_mask, doi_mask2; - - vector v2_doi_mask, v2_doi_mask2; - int doi2_mask, doi2_mask2; - - /* Form r2 < hig2 mask. */ - v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v); - v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v); - v2_doi_mask.v = vec_cmp_lt(v2_r2.v, v_hig2_2.v); - v2_doi_mask2.v = vec_cmp_lt(v2_r2_2.v, v_hig2_2.v); - - /* Form integer mask. */ - doi_mask = vec_cmp_result(v_doi_mask.v); - doi_mask2 = vec_cmp_result(v_doi_mask2.v); - doi2_mask = vec_cmp_result(v2_doi_mask.v); - doi2_mask2 = vec_cmp_result(v2_doi_mask2.v); - - if (doi_mask) - runner_iact_nonsym_intrinsic_vec_density( - &v_r2, &v_dx, &v_dy, &v_dz, v_hi_inv, v_vix, v_viy, v_viz, - &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx], - &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], &rhoSum, - &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, - &curlvySum, &curlvzSum, v_doi_mask, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif - if (doi_mask2) - runner_iact_nonsym_intrinsic_vec_density( - &v_r2_2, &v_dx2, &v_dy2, &v_dz2, v_hi_inv, v_vix, v_viy, v_viz, - &cj_cache->vx[cj_cache_idx + VEC_SIZE], - &cj_cache->vy[cj_cache_idx + VEC_SIZE], - &cj_cache->vz[cj_cache_idx + VEC_SIZE], - &cj_cache->m[cj_cache_idx + VEC_SIZE], &rhoSum, &rho_dhSum, - &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, - &curlvzSum, v_doi_mask2, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif - if (doi2_mask) - runner_iact_nonsym_intrinsic_vec_density( - &v2_r2, &v2_dx, &v2_dy, &v2_dz, v_hi_inv_2, v_vix2, v_viy2, v_viz2, - &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx], - &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], &rhoSum2, - &rho_dhSum2, &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2, - &curlvySum2, &curlvzSum2, v2_doi_mask, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif - if (doi2_mask2) - runner_iact_nonsym_intrinsic_vec_density( - &v2_r2_2, &v2_dx2, &v2_dy2, &v2_dz2, v_hi_inv_2, v_vix2, v_viy2, - v_viz2, &cj_cache->vx[cj_cache_idx + VEC_SIZE], - &cj_cache->vy[cj_cache_idx + VEC_SIZE], - &cj_cache->vz[cj_cache_idx + VEC_SIZE], - &cj_cache->m[cj_cache_idx + VEC_SIZE], &rhoSum2, &rho_dhSum2, - &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2, &curlvySum2, - &curlvzSum2, v2_doi_mask2, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif - - } /* loop over the parts in cj. */ - - /* Perform horizontal adds on vector sums and store result in particle pi. - */ - VEC_HADD(rhoSum, pi->rho); - VEC_HADD(rho_dhSum, pi->density.rho_dh); - VEC_HADD(wcountSum, pi->density.wcount); - VEC_HADD(wcount_dhSum, pi->density.wcount_dh); - VEC_HADD(div_vSum, pi->density.div_v); - VEC_HADD(curlvxSum, pi->density.rot_v[0]); - VEC_HADD(curlvySum, pi->density.rot_v[1]); - VEC_HADD(curlvzSum, pi->density.rot_v[2]); - - VEC_HADD(rhoSum2, pi2->rho); - VEC_HADD(rho_dhSum2, pi2->density.rho_dh); - VEC_HADD(wcountSum2, pi2->density.wcount); - VEC_HADD(wcount_dhSum2, pi2->density.wcount_dh); - VEC_HADD(div_vSum2, pi2->density.div_v); - VEC_HADD(curlvxSum2, pi2->density.rot_v[0]); - VEC_HADD(curlvySum2, pi2->density.rot_v[1]); - VEC_HADD(curlvzSum2, pi2->density.rot_v[2]); - - } /* loop over the parts in ci. */ - - /* Loop over the parts in cj. */ - for (int pjd = 0; pjd <= last_pj && max_ind_i < count_i; pjd += 2) { - - /* Get a hold of the jth part in cj. */ - struct part *restrict pj = &parts_j[sort_j[pjd].i]; - struct part *restrict pj2 = &parts_j[sort_j[pjd + 1].i]; - if (!part_is_active(pj, e)) continue; - - di = sort_i[max_ind_i].d; - while (max_ind_i < count_i - 1 && max_dj[pjd] > di) { - max_ind_i++; - - di = sort_i[max_ind_i].d; - } - int exit_iteration = max_ind_i; - - int cj_cache_idx = pjd; - - const float hj = cj_cache->h[cj_cache_idx]; - const float hj_2 = cj_cache->h[cj_cache_idx + 1]; - const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; - if (dj > di_max) continue; - - const float hjg2 = hj * hj * kernel_gamma2; - const float hjg2_2 = hj_2 * hj_2 * kernel_gamma2; - - vector pjx, pjy, pjz; - vector pjx2, pjy2, pjz2; - vector v_hj, v_vjx, v_vjy, v_vjz, v_hjg2; - vector v_hj_2, v_vjx2, v_vjy2, v_vjz2, v_hjg2_2; - - /* Fill particle pi vectors. */ - pjx.v = vec_set1(cj_cache->x[cj_cache_idx]); - pjy.v = vec_set1(cj_cache->y[cj_cache_idx]); - pjz.v = vec_set1(cj_cache->z[cj_cache_idx]); - v_hj.v = vec_set1(hj); - v_vjx.v = vec_set1(cj_cache->vx[cj_cache_idx]); - v_vjy.v = vec_set1(cj_cache->vy[cj_cache_idx]); - v_vjz.v = vec_set1(cj_cache->vz[cj_cache_idx]); - - v_hjg2.v = vec_set1(hjg2); - - pjx2.v = vec_set1(cj_cache->x[cj_cache_idx + 1]); - pjy2.v = vec_set1(cj_cache->y[cj_cache_idx + 1]); - pjz2.v = vec_set1(cj_cache->z[cj_cache_idx + 1]); - v_hj_2.v = vec_set1(hj_2); - v_vjx2.v = vec_set1(cj_cache->vx[cj_cache_idx + 1]); - v_vjy2.v = vec_set1(cj_cache->vy[cj_cache_idx + 1]); - v_vjz2.v = vec_set1(cj_cache->vz[cj_cache_idx + 1]); - - v_hjg2_2.v = vec_set1(hjg2_2); - - /* Reset cumulative sums of update vectors. */ - vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, - curlvySum, curlvzSum; - - vector rhoSum2, rho_dhSum2, wcountSum2, wcount_dhSum2, div_vSum2, - curlvxSum2, curlvySum2, curlvzSum2; - - /* Get the inverse of hj. */ - vector v_hj_inv; - vector v_hj_inv_2; - - v_hj_inv = vec_reciprocal(v_hj); - v_hj_inv_2 = vec_reciprocal(v_hj_2); - - rhoSum.v = vec_setzero(); - rho_dhSum.v = vec_setzero(); - wcountSum.v = vec_setzero(); - wcount_dhSum.v = vec_setzero(); - div_vSum.v = vec_setzero(); - curlvxSum.v = vec_setzero(); - curlvySum.v = vec_setzero(); - curlvzSum.v = vec_setzero(); - - rhoSum2.v = vec_setzero(); - rho_dhSum2.v = vec_setzero(); - wcountSum2.v = vec_setzero(); - wcount_dhSum2.v = vec_setzero(); - div_vSum2.v = vec_setzero(); - curlvxSum2.v = vec_setzero(); - curlvySum2.v = vec_setzero(); - curlvzSum2.v = vec_setzero(); - - /* Pad cache if there is a serial remainder. */ - int exit_iteration_align = exit_iteration; - int rem = exit_iteration % (num_vec_proc * VEC_SIZE); - if (rem != 0) { - int pad = (num_vec_proc * VEC_SIZE) - rem; - - exit_iteration_align -= pad; - } - - vector pix, piy, piz; - vector pix2, piy2, piz2; - // vector pivx, pivy, pivz, mi; - - /* Loop over the parts in ci. */ - for (int pid = exit_iteration_align; pid < count_i; - pid += (num_vec_proc * VEC_SIZE)) { - - /* Get the cache index to the ith particle. */ - int ci_cache_idx = pid; // sort_i[pid].i; - int ci2_cache_idx = pid + VEC_SIZE; - - vector v_dx, v_dy, v_dz, v_r2; - vector v_dx2, v_dy2, v_dz2, v_r2_2; - vector v2_dx, v2_dy, v2_dz, v2_r2; - vector v2_dx2, v2_dy2, v2_dz2, v2_r2_2; - - /* Load 2 sets of vectors from the particle cache. */ - pix.v = vec_load(&ci_cache->x[ci_cache_idx]); - pix2.v = vec_load(&ci_cache->x[ci2_cache_idx]); - piy.v = vec_load(&ci_cache->y[ci_cache_idx]); - piy2.v = vec_load(&ci_cache->y[ci2_cache_idx]); - piz.v = vec_load(&ci_cache->z[ci_cache_idx]); - piz2.v = vec_load(&ci_cache->z[ci2_cache_idx]); - // pivx.v = vec_load(&ci_cache->vx[ci_cache_idx]); - // pivy.v = vec_load(&ci_cache->vy[ci_cache_idx]); - // pivz.v = vec_load(&ci_cache->vz[ci_cache_idx]); - // mi.v = vec_load(&ci_cache->m[ci_cache_idx]); - - /* Compute the pairwise distance. */ - v_dx.v = vec_sub(pjx.v, pix.v); - v_dx2.v = vec_sub(pjx.v, pix2.v); - v2_dx.v = vec_sub(pjx2.v, pix.v); - v2_dx2.v = vec_sub(pjx2.v, pix2.v); - - v_dy.v = vec_sub(pjy.v, piy.v); - v_dy2.v = vec_sub(pjy.v, piy2.v); - v2_dy.v = vec_sub(pjy2.v, piy.v); - v2_dy2.v = vec_sub(pjy2.v, piy2.v); - - v_dz.v = vec_sub(pjz.v, piz.v); - v_dz2.v = vec_sub(pjz.v, piz2.v); - v2_dz.v = vec_sub(pjz2.v, piz.v); - v2_dz2.v = vec_sub(pjz2.v, piz2.v); - - v_r2.v = vec_mul(v_dx.v, v_dx.v); - v_r2_2.v = vec_mul(v_dx2.v, v_dx2.v); - v2_r2.v = vec_mul(v2_dx.v, v2_dx.v); - v2_r2_2.v = vec_mul(v2_dx2.v, v2_dx2.v); - - v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); - v_r2_2.v = vec_fma(v_dy2.v, v_dy2.v, v_r2_2.v); - v2_r2.v = vec_fma(v2_dy.v, v2_dy.v, v2_r2.v); - v2_r2_2.v = vec_fma(v2_dy2.v, v2_dy2.v, v2_r2_2.v); - - v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); - v_r2_2.v = vec_fma(v_dz2.v, v_dz2.v, v_r2_2.v); - v2_r2.v = vec_fma(v2_dz.v, v2_dz.v, v2_r2.v); - v2_r2_2.v = vec_fma(v2_dz2.v, v2_dz2.v, v2_r2_2.v); - - vector v_doj_mask, v_doj_mask2; - int doj_mask, doj_mask2; - - vector v2_doj_mask, v2_doj_mask2; - int doj2_mask, doj2_mask2; - - /* Form r2 < hig2 mask. */ - v_doj_mask.v = vec_cmp_lt(v_r2.v, v_hjg2.v); - v_doj_mask2.v = vec_cmp_lt(v_r2_2.v, v_hjg2.v); - v2_doj_mask.v = vec_cmp_lt(v2_r2.v, v_hjg2_2.v); - v2_doj_mask2.v = vec_cmp_lt(v2_r2_2.v, v_hjg2_2.v); - - /* Form integer mask. */ - doj_mask = vec_cmp_result(v_doj_mask.v); - doj_mask2 = vec_cmp_result(v_doj_mask2.v); - doj2_mask = vec_cmp_result(v2_doj_mask.v); - doj2_mask2 = vec_cmp_result(v2_doj_mask2.v); - - /* Perform interaction with 2 vectors. */ - if (doj_mask) - runner_iact_nonsym_intrinsic_vec_density( - &v_r2, &v_dx, &v_dy, &v_dz, v_hj_inv, v_vjx, v_vjy, v_vjz, - &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx], - &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &rhoSum, - &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, - &curlvySum, &curlvzSum, v_doj_mask, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif - if (doj_mask2) - runner_iact_nonsym_intrinsic_vec_density( - &v_r2_2, &v_dx2, &v_dy2, &v_dz2, v_hj_inv, v_vjx, v_vjy, v_vjz, - &ci_cache->vx[ci2_cache_idx], &ci_cache->vy[ci2_cache_idx], - &ci_cache->vz[ci2_cache_idx], &ci_cache->m[ci2_cache_idx], &rhoSum, - &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, - &curlvySum, &curlvzSum, v_doj_mask2, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif - if (doj2_mask) - runner_iact_nonsym_intrinsic_vec_density( - &v2_r2, &v2_dx, &v2_dy, &v2_dz, v_hj_inv_2, v_vjx2, v_vjy2, v_vjz2, - &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx], - &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &rhoSum2, - &rho_dhSum2, &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2, - &curlvySum2, &curlvzSum2, v2_doj_mask, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif - if (doj2_mask2) - runner_iact_nonsym_intrinsic_vec_density( - &v2_r2_2, &v2_dx2, &v2_dy2, &v2_dz2, v_hj_inv_2, v_vjx2, v_vjy2, - v_vjz2, &ci_cache->vx[ci2_cache_idx], &ci_cache->vy[ci2_cache_idx], - &ci_cache->vz[ci2_cache_idx], &ci_cache->m[ci2_cache_idx], &rhoSum2, - &rho_dhSum2, &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2, - &curlvySum2, &curlvzSum2, v2_doj_mask2, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif - } /* loop over the parts in cj. */ - - /* Perform horizontal adds on vector sums and store result in particle pi. - */ - VEC_HADD(rhoSum, pj->rho); - VEC_HADD(rho_dhSum, pj->density.rho_dh); - VEC_HADD(wcountSum, pj->density.wcount); - VEC_HADD(wcount_dhSum, pj->density.wcount_dh); - VEC_HADD(div_vSum, pj->density.div_v); - VEC_HADD(curlvxSum, pj->density.rot_v[0]); - VEC_HADD(curlvySum, pj->density.rot_v[1]); - VEC_HADD(curlvzSum, pj->density.rot_v[2]); - - VEC_HADD(rhoSum2, pj2->rho); - VEC_HADD(rho_dhSum2, pj2->density.rho_dh); - VEC_HADD(wcountSum2, pj2->density.wcount); - VEC_HADD(wcount_dhSum2, pj2->density.wcount_dh); - VEC_HADD(div_vSum2, pj2->density.div_v); - VEC_HADD(curlvxSum2, pj2->density.rot_v[0]); - VEC_HADD(curlvySum2, pj2->density.rot_v[1]); - VEC_HADD(curlvzSum2, pj2->density.rot_v[2]); - - } /* loop over the parts in ci. */ - - TIMER_TOC(timer_dopair_density); - -#endif /* WITH_VECTORIZATION */ -} - -/* Use unsorted cache. */ -void runner_dopair1_density_vec_3(struct runner *r, struct cell *ci, - struct cell *cj) { - -#ifdef WITH_VECTORIZATION - const struct engine *restrict e = r->e; - - int num_vec_proc = 2; - - vector v_hi, v_vix, v_viy, v_viz, v_hig2; - // vector v_hi_2, v_vix2, v_viy2, v_viz2, v_hig2_2; - - TIMER_TIC; - - /* Anything to do here? */ - if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; - -#ifdef SWIFT_DEBUG_CHECKS - cell_is_drifted(ci, e); - cell_is_drifted(cj, e); -#endif - - /* Get the sort ID. */ - double shift[3] = {0.0, 0.0, 0.0}; - const int sid = space_getsid(e->s, &ci, &cj, shift); - - /* Have the cells been sorted? */ - if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid))) - error("Trying to interact unsorted cells."); - - /* Get the cutoff shift. */ - double rshift = 0.0; - for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k]; - - /* Pick-out the sorted lists. */ - const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)]; - const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)]; - - /* Get some other useful values. */ - const int count_i = ci->count; - const int count_j = cj->count; - struct part *restrict parts_i = ci->parts; - struct part *restrict parts_j = cj->parts; - const double di_max = sort_i[count_i - 1].d - rshift; - const double dj_min = sort_j[0].d; - const float dx_max = (ci->dx_max + cj->dx_max); - - /* Get the particle cache from the runner and re-allocate - * the cache if it is not big enough for the cell. */ - struct cache *restrict ci_cache = &r->ci_cache; - struct cache *restrict cj_cache = &r->cj_cache; - - if (ci_cache->count < count_i) { - cache_init(ci_cache, count_i); - } - if (cj_cache->count < count_j) { - cache_init(cj_cache, count_j); - } - - cache_read_two_cells(ci, cj, ci_cache, cj_cache, shift); - // cache_read_two_cells_sorted(ci, cj, ci_cache, &cj_cache, sort_i, sort_j, - // shift); - - float *max_di __attribute__((aligned(sizeof(float) * VEC_SIZE))); - float *max_dj __attribute__((aligned(sizeof(float) * VEC_SIZE))); - - max_di = r->ci_cache.max_d; - max_dj = r->cj_cache.max_d; - - /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */ - /* For particles in ci */ - populate_max_d(ci, cj, sort_i, sort_j, ci_cache, cj_cache, dx_max, rshift, - max_di, max_dj); - - float di, dj; - - int max_ind_j = count_j - 1; - - /* Loop over the parts in ci. */ - for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid--) { - // for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid-=2) { - - /* Get a hold of the ith part in ci. */ - struct part *restrict pi = &parts_i[sort_i[pid].i]; - // struct part *restrict pi2 = &parts_i[sort_i[pid - 1].i]; - if (!part_is_active(pi, e)) continue; - - dj = sort_j[max_ind_j].d; - while (max_ind_j > 0 && max_di[pid] < dj) { - max_ind_j--; - - dj = sort_j[max_ind_j].d; - } - int exit_iteration = max_ind_j; - - int ci_cache_idx = sort_i[pid].i; - - const float hi = ci_cache->h[ci_cache_idx]; - // const float hi_2 = ci_cache->h[ci_cache_idx - 1]; - const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift; - if (di < dj_min) continue; - - const float hig2 = hi * hi * kernel_gamma2; - // const float hig2_2 = hi_2 * hi_2 * kernel_gamma2; - - vector pix, piy, piz; - // vector pix2, piy2, piz2; - - /* Fill particle pi vectors. */ - pix.v = vec_set1(ci_cache->x[ci_cache_idx]); - piy.v = vec_set1(ci_cache->y[ci_cache_idx]); - piz.v = vec_set1(ci_cache->z[ci_cache_idx]); - v_hi.v = vec_set1(hi); - v_vix.v = vec_set1(ci_cache->vx[ci_cache_idx]); - v_viy.v = vec_set1(ci_cache->vy[ci_cache_idx]); - v_viz.v = vec_set1(ci_cache->vz[ci_cache_idx]); - - v_hig2.v = vec_set1(hig2); - - // pix2.v = vec_set1(ci_cache->x[ci_cache_idx - 1]); - // piy2.v = vec_set1(ci_cache->y[ci_cache_idx - 1]); - // piz2.v = vec_set1(ci_cache->z[ci_cache_idx - 1]); - // v_hi_2.v = vec_set1(hi_2); - // v_vix2.v = vec_set1(ci_cache->vx[ci_cache_idx - 1]); - // v_viy2.v = vec_set1(ci_cache->vy[ci_cache_idx - 1]); - // v_viz2.v = vec_set1(ci_cache->vz[ci_cache_idx - 1]); - - // v_hig2_2.v = vec_set1(hig2_2); - - /* Reset cumulative sums of update vectors. */ - vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, - curlvySum, curlvzSum; - - // vector rhoSum2, rho_dhSum2, wcountSum2, wcount_dhSum2, div_vSum2, - // curlvxSum2, - // curlvySum2, curlvzSum2; - - /* Get the inverse of hi. */ - vector v_hi_inv; - // vector v_hi_inv_2; - - v_hi_inv = vec_reciprocal(v_hi); - // v_hi_inv_2 = vec_reciprocal(v_hi_2); - - rhoSum.v = vec_setzero(); - rho_dhSum.v = vec_setzero(); - wcountSum.v = vec_setzero(); - wcount_dhSum.v = vec_setzero(); - div_vSum.v = vec_setzero(); - curlvxSum.v = vec_setzero(); - curlvySum.v = vec_setzero(); - curlvzSum.v = vec_setzero(); - - // rhoSum2.v = vec_setzero(); - // rho_dhSum2.v = vec_setzero(); - // wcountSum2.v = vec_setzero(); - // wcount_dhSum2.v = vec_setzero(); - // div_vSum2.v = vec_setzero(); - // curlvxSum2.v = vec_setzero(); - // curlvySum2.v = vec_setzero(); - // curlvzSum2.v = vec_setzero(); - - /* Pad cache if there is a serial remainder. */ - int exit_iteration_align = exit_iteration; - int rem = exit_iteration % (num_vec_proc * VEC_SIZE); - if (rem != 0) { - int pad = (num_vec_proc * VEC_SIZE) - rem; - - exit_iteration_align += pad; - } - - vector pjx, pjy, pjz; - vector pjx2, pjy2, pjz2; - - /* Loop over the parts in cj. */ - for (int pjd = 0; pjd < exit_iteration_align; - pjd += (num_vec_proc * VEC_SIZE)) { - - /* Get the cache index to the jth particle. */ - // int cj_cache_idx = sort_j[pjd].i; - int indices[2 * VEC_SIZE]; - - for (int i = 0; i < 2 * VEC_SIZE; i++) indices[i] = sort_j[pjd + i].i; - - vector v_dx, v_dy, v_dz, v_r2; - vector v_dx2, v_dy2, v_dz2, v_r2_2; - // vector v2_dx, v2_dy, v2_dz, v2_r2; - // vector v2_dx2, v2_dy2, v2_dz2, v2_r2_2; - - /* Load 2 sets of vectors from the particle cache. */ - // pjx.v = vec_load(&cj_cache.x[cj_cache_idx]); - // pjx2.v = vec_load(&cj_cache.x[cj_cache_idx + VEC_SIZE]); - // pjy.v = vec_load(&cj_cache.y[cj_cache_idx]); - // pjy2.v = vec_load(&cj_cache.y[cj_cache_idx + VEC_SIZE]); - // pjz.v = vec_load(&cj_cache.z[cj_cache_idx]); - // pjz2.v = vec_load(&cj_cache.z[cj_cache_idx + VEC_SIZE]); - // pjvx.v = vec_load(&cj_cache.vx[cj_cache_idx]); - // pjvy.v = vec_load(&cj_cache.vy[cj_cache_idx]); - // pjvz.v = vec_load(&cj_cache.vz[cj_cache_idx]); - // mj.v = vec_load(&cj_cache.m[cj_cache_idx]); - - pjx.v = vec_set(cj_cache->x[indices[0]], cj_cache->x[indices[1]], - cj_cache->x[indices[2]], cj_cache->x[indices[3]], - cj_cache->x[indices[4]], cj_cache->x[indices[5]], - cj_cache->x[indices[6]], cj_cache->x[indices[7]]); - pjx2.v = vec_set(cj_cache->x[indices[8]], cj_cache->x[indices[9]], - cj_cache->x[indices[10]], cj_cache->x[indices[11]], - cj_cache->x[indices[12]], cj_cache->x[indices[13]], - cj_cache->x[indices[14]], cj_cache->x[indices[15]]); - pjy.v = vec_set(cj_cache->y[indices[0]], cj_cache->y[indices[1]], - cj_cache->y[indices[2]], cj_cache->y[indices[3]], - cj_cache->y[indices[4]], cj_cache->y[indices[5]], - cj_cache->y[indices[6]], cj_cache->y[indices[7]]); - pjy2.v = vec_set(cj_cache->y[indices[8]], cj_cache->y[indices[9]], - cj_cache->y[indices[10]], cj_cache->y[indices[11]], - cj_cache->y[indices[12]], cj_cache->y[indices[13]], - cj_cache->y[indices[14]], cj_cache->y[indices[15]]); - pjz.v = vec_set(cj_cache->z[indices[0]], cj_cache->z[indices[1]], - cj_cache->z[indices[2]], cj_cache->z[indices[3]], - cj_cache->z[indices[4]], cj_cache->z[indices[5]], - cj_cache->z[indices[6]], cj_cache->z[indices[7]]); - pjz2.v = vec_set(cj_cache->z[indices[8]], cj_cache->z[indices[9]], - cj_cache->z[indices[10]], cj_cache->z[indices[11]], - cj_cache->z[indices[12]], cj_cache->z[indices[13]], - cj_cache->z[indices[14]], cj_cache->z[indices[15]]); - - /* Compute the pairwise distance. */ - v_dx.v = vec_sub(pix.v, pjx.v); - v_dx2.v = vec_sub(pix.v, pjx2.v); - // v2_dx.v = vec_sub(pix2.v, pjx.v); - // v2_dx2.v = vec_sub(pix2.v, pjx2.v); - - v_dy.v = vec_sub(piy.v, pjy.v); - v_dy2.v = vec_sub(piy.v, pjy2.v); - // v2_dy.v = vec_sub(piy2.v, pjy.v); - // v2_dy2.v = vec_sub(piy2.v, pjy2.v); - - v_dz.v = vec_sub(piz.v, pjz.v); - v_dz2.v = vec_sub(piz.v, pjz2.v); - // v2_dz.v = vec_sub(piz2.v, pjz.v); - // v2_dz2.v = vec_sub(piz2.v, pjz2.v); - - v_r2.v = vec_mul(v_dx.v, v_dx.v); - v_r2_2.v = vec_mul(v_dx2.v, v_dx2.v); - // v2_r2.v = vec_mul(v2_dx.v, v2_dx.v); - // v2_r2_2.v = vec_mul(v2_dx2.v, v2_dx2.v); - - v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); - v_r2_2.v = vec_fma(v_dy2.v, v_dy2.v, v_r2_2.v); - // v2_r2.v = vec_fma(v2_dy.v, v2_dy.v, v2_r2.v); - // v2_r2_2.v = vec_fma(v2_dy2.v, v2_dy2.v, v2_r2_2.v); - - v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); - v_r2_2.v = vec_fma(v_dz2.v, v_dz2.v, v_r2_2.v); - // v2_r2.v = vec_fma(v2_dz.v, v2_dz.v, v2_r2.v); - // v2_r2_2.v = vec_fma(v2_dz2.v, v2_dz2.v, v2_r2_2.v); - - vector v_doi_mask, v_doi_mask2; - int doi_mask, doi_mask2; - - // vector v2_doi_mask, v2_doi_mask2; - // int doi2_mask, doi2_mask2; - - /* Form r2 < hig2 mask. */ - v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v); - v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v); - // v2_doi_mask.v = vec_cmp_lt(v2_r2.v, v_hig2_2.v); - // v2_doi_mask2.v = vec_cmp_lt(v2_r2_2.v, v_hig2_2.v); - - /* Form integer mask. */ - doi_mask = vec_cmp_result(v_doi_mask.v); - doi_mask2 = vec_cmp_result(v_doi_mask2.v); - // doi2_mask = vec_cmp_result(v2_doi_mask.v); - // doi2_mask2 = vec_cmp_result(v2_doi_mask2.v); - - if (doi_mask) - runner_iact_nonsym_intrinsic_vec_2_density( - cj_cache, &indices[0], &v_r2, &v_dx, &v_dy, &v_dz, v_hi_inv, v_vix, - v_viy, v_viz, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, - &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_doi_mask, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif - if (doi_mask2) - runner_iact_nonsym_intrinsic_vec_2_density( - cj_cache, &indices[VEC_SIZE], &v_r2_2, &v_dx2, &v_dy2, &v_dz2, - v_hi_inv, v_vix, v_viy, v_viz, &rhoSum, &rho_dhSum, &wcountSum, - &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, - v_doi_mask2, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif - // if(doi2_mask) - // runner_iact_nonsym_intrinsic_vec_density( - // &v2_r2, &v2_dx, &v2_dy, &v2_dz, v_hi_inv_2, v_vix2, v_viy2, - // v_viz2, - // &cj_cache.vx[cj_cache_idx], &cj_cache.vy[cj_cache_idx], - // &cj_cache.vz[cj_cache_idx], - // &cj_cache.m[cj_cache_idx], &rhoSum2, &rho_dhSum2, &wcountSum2, - // &wcount_dhSum2, - // &div_vSum2, &curlvxSum2, &curlvySum2, &curlvzSum2, - // v2_doi_mask, - //#ifdef HAVE_AVX512_F - // knl_mask); - //#else - // 0); - //#endif - // if(doi2_mask2) - // runner_iact_nonsym_intrinsic_vec_density( - // &v2_r2_2, &v2_dx2, &v2_dy2, &v2_dz2, v_hi_inv_2, v_vix2, - // v_viy2, v_viz2, - // &cj_cache.vx[cj_cache_idx + VEC_SIZE], - // &cj_cache.vy[cj_cache_idx + VEC_SIZE], - // &cj_cache.vz[cj_cache_idx + VEC_SIZE], - // &cj_cache.m[cj_cache_idx + VEC_SIZE], &rhoSum2, &rho_dhSum2, - // &wcountSum2, &wcount_dhSum2, - // &div_vSum2, &curlvxSum2, &curlvySum2, &curlvzSum2, - // v2_doi_mask2, - //#ifdef HAVE_AVX512_F - // knl_mask); - //#else - // 0); - //#endif - - } /* loop over the parts in cj. */ - - /* Perform horizontal adds on vector sums and store result in particle pi. - */ - VEC_HADD(rhoSum, pi->rho); - VEC_HADD(rho_dhSum, pi->density.rho_dh); - VEC_HADD(wcountSum, pi->density.wcount); - VEC_HADD(wcount_dhSum, pi->density.wcount_dh); - VEC_HADD(div_vSum, pi->density.div_v); - VEC_HADD(curlvxSum, pi->density.rot_v[0]); - VEC_HADD(curlvySum, pi->density.rot_v[1]); - VEC_HADD(curlvzSum, pi->density.rot_v[2]); - - // VEC_HADD(rhoSum2, pi2->rho); - // VEC_HADD(rho_dhSum2, pi2->density.rho_dh); - // VEC_HADD(wcountSum2, pi2->density.wcount); - // VEC_HADD(wcount_dhSum2, pi2->density.wcount_dh); - // VEC_HADD(div_vSum2, pi2->density.div_v); - // VEC_HADD(curlvxSum2, pi2->density.rot_v[0]); - // VEC_HADD(curlvySum2, pi2->density.rot_v[1]); - // VEC_HADD(curlvzSum2, pi2->density.rot_v[2]); - - } /* loop over the parts in ci. */ - - int max_ind_i = 0; - /* Loop over the parts in cj. */ - for (int pjd = 0; pjd < count_j && max_ind_i < count_i; pjd++) { - - /* Get a hold of the jth part in cj. */ - struct part *restrict pj = &parts_j[sort_j[pjd].i]; - if (!part_is_active(pj, e)) continue; - - di = sort_i[max_ind_i].d; - while (max_ind_i < count_i - 1 && max_dj[pjd] > di) { - max_ind_i++; - - di = sort_i[max_ind_i].d; - } - int exit_iteration = max_ind_i; - - int cj_cache_idx = pjd; - - const float hj = cj_cache->h[cj_cache_idx]; - const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; - if (dj > di_max) continue; - - const float hjg2 = hj * hj * kernel_gamma2; - - vector pjx, pjy, pjz; - vector v_hj, v_vjx, v_vjy, v_vjz, v_hjg2; - - /* Fill particle pi vectors. */ - pjx.v = vec_set1(cj_cache->x[cj_cache_idx]); - pjy.v = vec_set1(cj_cache->y[cj_cache_idx]); - pjz.v = vec_set1(cj_cache->z[cj_cache_idx]); - v_hj.v = vec_set1(hj); - v_vjx.v = vec_set1(cj_cache->vx[cj_cache_idx]); - v_vjy.v = vec_set1(cj_cache->vy[cj_cache_idx]); - v_vjz.v = vec_set1(cj_cache->vz[cj_cache_idx]); - - v_hjg2.v = vec_set1(hjg2); - - /* Reset cumulative sums of update vectors. */ - vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, - curlvySum, curlvzSum; - - /* Get the inverse of hj. */ - vector v_hj_inv; - - v_hj_inv = vec_reciprocal(v_hj); - - rhoSum.v = vec_setzero(); - rho_dhSum.v = vec_setzero(); - wcountSum.v = vec_setzero(); - wcount_dhSum.v = vec_setzero(); - div_vSum.v = vec_setzero(); - curlvxSum.v = vec_setzero(); - curlvySum.v = vec_setzero(); - curlvzSum.v = vec_setzero(); - - /* Pad cache if there is a serial remainder. */ - int exit_iteration_align = exit_iteration; - int rem = exit_iteration % (num_vec_proc * VEC_SIZE); - if (rem != 0) { - int pad = (num_vec_proc * VEC_SIZE) - rem; - - exit_iteration_align -= pad; - } - - vector pix, piy, piz; - vector pix2, piy2, piz2; - // vector pivx, pivy, pivz, mi; - - /* Loop over the parts in ci. */ - for (int pid = count_i - 1; pid >= 0; pid -= (num_vec_proc * VEC_SIZE)) { - - int indices[2 * VEC_SIZE]; - - for (int i = (2 * VEC_SIZE) - 1; i >= 0; i--) - indices[i] = sort_j[pid - i].i; - - /* Get the cache index to the ith particle. */ - // int ci_cache_idx = sort_i[pid].i; - - vector v_dx, v_dy, v_dz, v_r2; - vector v_dx2, v_dy2, v_dz2, v_r2_2; - - /* Load 2 sets of vectors from the particle cache. */ - // pix.v = vec_load(&ci_cache->x[ci_cache_idx]); - // pix2.v = vec_load(&ci_cache->x[ci_cache_idx - VEC_SIZE]); - // piy.v = vec_load(&ci_cache->y[ci_cache_idx]); - // piy2.v = vec_load(&ci_cache->y[ci_cache_idx - VEC_SIZE]); - // piz.v = vec_load(&ci_cache->z[ci_cache_idx]); - // piz2.v = vec_load(&ci_cache->z[ci_cache_idx - VEC_SIZE]); - // pivx.v = vec_load(&ci_cache->vx[ci_cache_idx]); - // pivy.v = vec_load(&ci_cache->vy[ci_cache_idx]); - // pivz.v = vec_load(&ci_cache->vz[ci_cache_idx]); - // mi.v = vec_load(&ci_cache->m[ci_cache_idx]); - - pix.v = vec_set(ci_cache->x[indices[0]], ci_cache->x[indices[1]], - ci_cache->x[indices[2]], ci_cache->x[indices[3]], - ci_cache->x[indices[4]], ci_cache->x[indices[5]], - ci_cache->x[indices[6]], ci_cache->x[indices[7]]); - pix2.v = vec_set(ci_cache->x[indices[8]], ci_cache->x[indices[9]], - ci_cache->x[indices[10]], ci_cache->x[indices[11]], - ci_cache->x[indices[12]], ci_cache->x[indices[13]], - ci_cache->x[indices[14]], ci_cache->x[indices[15]]); - piy.v = vec_set(ci_cache->y[indices[0]], ci_cache->y[indices[1]], - ci_cache->y[indices[2]], ci_cache->y[indices[3]], - ci_cache->y[indices[4]], ci_cache->y[indices[5]], - ci_cache->y[indices[6]], ci_cache->y[indices[7]]); - piy2.v = vec_set(ci_cache->y[indices[8]], ci_cache->y[indices[9]], - ci_cache->y[indices[10]], ci_cache->y[indices[11]], - ci_cache->y[indices[12]], ci_cache->y[indices[13]], - ci_cache->y[indices[14]], ci_cache->y[indices[15]]); - piz.v = vec_set(ci_cache->z[indices[0]], ci_cache->z[indices[1]], - ci_cache->z[indices[2]], ci_cache->z[indices[3]], - ci_cache->z[indices[4]], ci_cache->z[indices[5]], - ci_cache->z[indices[6]], ci_cache->z[indices[7]]); - piz2.v = vec_set(ci_cache->z[indices[8]], ci_cache->z[indices[9]], - ci_cache->z[indices[10]], ci_cache->z[indices[11]], - ci_cache->z[indices[12]], ci_cache->z[indices[13]], - ci_cache->z[indices[14]], ci_cache->z[indices[15]]); - - /* Compute the pairwise distance. */ - v_dx.v = vec_sub(pjx.v, pix.v); - v_dx2.v = vec_sub(pjx.v, pix2.v); - v_dy.v = vec_sub(pjy.v, piy.v); - v_dy2.v = vec_sub(pjy.v, piy2.v); - v_dz.v = vec_sub(pjz.v, piz.v); - v_dz2.v = vec_sub(pjz.v, piz2.v); - - v_r2.v = vec_mul(v_dx.v, v_dx.v); - v_r2_2.v = vec_mul(v_dx2.v, v_dx2.v); - v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); - v_r2_2.v = vec_fma(v_dy2.v, v_dy2.v, v_r2_2.v); - v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); - v_r2_2.v = vec_fma(v_dz2.v, v_dz2.v, v_r2_2.v); - - vector v_doj_mask, v_doj_mask2; - int doj_mask, doj_mask2; - - /* Form r2 < hig2 mask. */ - v_doj_mask.v = vec_cmp_lt(v_r2.v, v_hjg2.v); - v_doj_mask2.v = vec_cmp_lt(v_r2_2.v, v_hjg2.v); - - /* Form integer mask. */ - doj_mask = vec_cmp_result(v_doj_mask.v); - doj_mask2 = vec_cmp_result(v_doj_mask2.v); - - /* Perform interaction with 2 vectors. */ - if (doj_mask) - runner_iact_nonsym_intrinsic_vec_2_density( - ci_cache, &indices[0], &v_r2, &v_dx, &v_dy, &v_dz, v_hj_inv, v_vjx, - v_vjy, v_vjz, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, - &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_doj_mask, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif - if (doj_mask2) - runner_iact_nonsym_intrinsic_vec_2_density( - ci_cache, &indices[VEC_SIZE], &v_r2_2, &v_dx2, &v_dy2, &v_dz2, - v_hj_inv, v_vjx, v_vjy, v_vjz, &rhoSum, &rho_dhSum, &wcountSum, - &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, - v_doj_mask2, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif - } /* loop over the parts in cj. */ - - /* Perform horizontal adds on vector sums and store result in particle pi. - */ - VEC_HADD(rhoSum, pj->rho); - VEC_HADD(rho_dhSum, pj->density.rho_dh); - VEC_HADD(wcountSum, pj->density.wcount); - VEC_HADD(wcount_dhSum, pj->density.wcount_dh); - VEC_HADD(div_vSum, pj->density.div_v); - VEC_HADD(curlvxSum, pj->density.rot_v[0]); - VEC_HADD(curlvySum, pj->density.rot_v[1]); - VEC_HADD(curlvzSum, pj->density.rot_v[2]); - - } /* loop over the parts in ci. */ - - TIMER_TOC(timer_dopair_density); - -#endif /* WITH_VECTORIZATION */ -} - -/* Read one cell at a time. */ -void runner_dopair1_density_vec_4(struct runner *r, struct cell *ci, - struct cell *cj) { - -#ifdef WITH_VECTORIZATION - const struct engine *restrict e = r->e; - - int num_vec_proc = 2; - - vector v_hi, v_vix, v_viy, v_viz, v_hig2; - vector v_hi_2, v_vix2, v_viy2, v_viz2, v_hig2_2; - - TIMER_TIC; - - /* Anything to do here? */ - if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; - -#ifdef SWIFT_DEBUG_CHECKS - cell_is_drifted(ci, e); - cell_is_drifted(cj, e); -#endif - - /* Get the sort ID. */ - double shift[3] = {0.0, 0.0, 0.0}; - const int sid = space_getsid(e->s, &ci, &cj, shift); - - /* Have the cells been sorted? */ - if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid))) - error("Trying to interact unsorted cells."); - - /* Get the cutoff shift. */ - double rshift = 0.0; - for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k]; - - /* Pick-out the sorted lists. */ - const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)]; - const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)]; - - /* Get some other useful values. */ - const int count_i = ci->count; - const int count_j = cj->count; - struct part *restrict parts_i = ci->parts; - struct part *restrict parts_j = cj->parts; - const double di_max = sort_i[count_i - 1].d - rshift; - const double dj_min = sort_j[0].d; - const float dx_max = (ci->dx_max + cj->dx_max); - - /* Get the particle cache from the runner and re-allocate - * the cache if it is not big enough for the cell. */ - struct cache *restrict ci_cache = &r->ci_cache; - - if (ci_cache->count < count_i) { - cache_init(ci_cache, count_i); - } - - double loc[3]; - loc[0] = ci->loc[0]; - loc[1] = ci->loc[1]; - loc[2] = ci->loc[2]; - - double shift_cj[3] = {0.0, 0.0, 0.0}; - - cache_read_cell_sorted(cj, ci_cache, sort_j, loc, shift_cj); - - float *max_di __attribute__((aligned(sizeof(float) * VEC_SIZE))); - float *max_dj __attribute__((aligned(sizeof(float) * VEC_SIZE))); - - max_di = r->ci_cache.max_d; - max_dj = r->cj_cache.max_d; - - /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */ - /* For particles in ci */ - float h = parts_i[sort_i[0].i].h; - float d; - - /* For particles in ci */ - max_di[0] = sort_i[0].d + h * kernel_gamma + dx_max - rshift; - - for (int k = 1; k < ci->count; k++) { - h = parts_i[sort_i[k].i].h; - d = sort_i[k].d + h * kernel_gamma + dx_max - rshift; - - max_di[k] = fmaxf(max_di[k - 1], d); - } - - float di, dj; - - int max_ind_j = count_j - 1; - - /* Loop over the parts in ci. */ - for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid -= 2) { - - /* Get a hold of the ith part in ci. */ - struct part *restrict pi = &parts_i[sort_i[pid].i]; - struct part *restrict pi2 = &parts_i[sort_i[pid - 1].i]; - if (!part_is_active(pi, e)) continue; - - dj = sort_j[max_ind_j].d; - while (max_ind_j > 0 && max_di[pid] < dj) { - max_ind_j--; - - dj = sort_j[max_ind_j].d; - } - int exit_iteration = max_ind_j; - - const float hi = pi->h; - const float hi_2 = pi2->h; - const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift; - if (di < dj_min) continue; - - const float hig2 = hi * hi * kernel_gamma2; - const float hig2_2 = hi_2 * hi_2 * kernel_gamma2; - - vector pix, piy, piz; - vector pix2, piy2, piz2; - - /* Fill particle pi vectors. */ - pix.v = vec_set1(pi->x[0] - loc[0] - shift[0]); - piy.v = vec_set1(pi->x[1] - loc[1] - shift[1]); - piz.v = vec_set1(pi->x[2] - loc[2] - shift[2]); - v_hi.v = vec_set1(hi); - v_vix.v = vec_set1(pi->v[0]); - v_viy.v = vec_set1(pi->v[1]); - v_viz.v = vec_set1(pi->v[2]); - - v_hig2.v = vec_set1(hig2); - - pix2.v = vec_set1(pi2->x[0] - loc[0] - shift[0]); - piy2.v = vec_set1(pi2->x[1] - loc[1] - shift[1]); - piz2.v = vec_set1(pi2->x[2] - loc[2] - shift[2]); - v_hi_2.v = vec_set1(hi_2); - v_vix2.v = vec_set1(pi2->v[0]); - v_viy2.v = vec_set1(pi2->v[1]); - v_viz2.v = vec_set1(pi2->v[2]); - - v_hig2_2.v = vec_set1(hig2_2); - - /* Reset cumulative sums of update vectors. */ - vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, - curlvySum, curlvzSum; - - vector rhoSum2, rho_dhSum2, wcountSum2, wcount_dhSum2, div_vSum2, - curlvxSum2, curlvySum2, curlvzSum2; - - /* Get the inverse of hi. */ - vector v_hi_inv; - vector v_hi_inv_2; - - v_hi_inv = vec_reciprocal(v_hi); - v_hi_inv_2 = vec_reciprocal(v_hi_2); - - rhoSum.v = vec_setzero(); - rho_dhSum.v = vec_setzero(); - wcountSum.v = vec_setzero(); - wcount_dhSum.v = vec_setzero(); - div_vSum.v = vec_setzero(); - curlvxSum.v = vec_setzero(); - curlvySum.v = vec_setzero(); - curlvzSum.v = vec_setzero(); - - rhoSum2.v = vec_setzero(); - rho_dhSum2.v = vec_setzero(); - wcountSum2.v = vec_setzero(); - wcount_dhSum2.v = vec_setzero(); - div_vSum2.v = vec_setzero(); - curlvxSum2.v = vec_setzero(); - curlvySum2.v = vec_setzero(); - curlvzSum2.v = vec_setzero(); - - // exit_iteration = count_j; - /* Pad cache if there is a serial remainder. */ - int exit_iteration_align = exit_iteration; - int rem = exit_iteration % (num_vec_proc * VEC_SIZE); - if (rem != 0) { - int pad = (num_vec_proc * VEC_SIZE) - rem; - - exit_iteration_align += pad; - } - - vector pjx, pjy, pjz; - vector pjx2, pjy2, pjz2; - - /* Loop over the parts in cj. */ - for (int pjd = 0; pjd < exit_iteration_align; - pjd += (num_vec_proc * VEC_SIZE)) { - - /* Get the cache index to the jth particle. */ - int cj_cache_idx = pjd; // sort_j[pjd].i; - - vector v_dx, v_dy, v_dz, v_r2; - vector v_dx2, v_dy2, v_dz2, v_r2_2; - vector v2_dx, v2_dy, v2_dz, v2_r2; - vector v2_dx2, v2_dy2, v2_dz2, v2_r2_2; - - /* Load 2 sets of vectors from the particle cache. */ - pjx.v = vec_load(&ci_cache->x[cj_cache_idx]); - pjx2.v = vec_load(&ci_cache->x[cj_cache_idx + VEC_SIZE]); - pjy.v = vec_load(&ci_cache->y[cj_cache_idx]); - pjy2.v = vec_load(&ci_cache->y[cj_cache_idx + VEC_SIZE]); - pjz.v = vec_load(&ci_cache->z[cj_cache_idx]); - pjz2.v = vec_load(&ci_cache->z[cj_cache_idx + VEC_SIZE]); - // pjvx.v = vec_load(&ci_cache->vx[cj_cache_idx]); - // pjvy.v = vec_load(&ci_cache->vy[cj_cache_idx]); - // pjvz.v = vec_load(&ci_cache->vz[cj_cache_idx]); - // mj.v = vec_load(&ci_cache->m[cj_cache_idx]); - - /* Compute the pairwise distance. */ - v_dx.v = vec_sub(pix.v, pjx.v); - v_dx2.v = vec_sub(pix.v, pjx2.v); - v2_dx.v = vec_sub(pix2.v, pjx.v); - v2_dx2.v = vec_sub(pix2.v, pjx2.v); - - v_dy.v = vec_sub(piy.v, pjy.v); - v_dy2.v = vec_sub(piy.v, pjy2.v); - v2_dy.v = vec_sub(piy2.v, pjy.v); - v2_dy2.v = vec_sub(piy2.v, pjy2.v); - - v_dz.v = vec_sub(piz.v, pjz.v); - v_dz2.v = vec_sub(piz.v, pjz2.v); - v2_dz.v = vec_sub(piz2.v, pjz.v); - v2_dz2.v = vec_sub(piz2.v, pjz2.v); - - v_r2.v = vec_mul(v_dx.v, v_dx.v); - v_r2_2.v = vec_mul(v_dx2.v, v_dx2.v); - v2_r2.v = vec_mul(v2_dx.v, v2_dx.v); - v2_r2_2.v = vec_mul(v2_dx2.v, v2_dx2.v); - - v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); - v_r2_2.v = vec_fma(v_dy2.v, v_dy2.v, v_r2_2.v); - v2_r2.v = vec_fma(v2_dy.v, v2_dy.v, v2_r2.v); - v2_r2_2.v = vec_fma(v2_dy2.v, v2_dy2.v, v2_r2_2.v); - - v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); - v_r2_2.v = vec_fma(v_dz2.v, v_dz2.v, v_r2_2.v); - v2_r2.v = vec_fma(v2_dz.v, v2_dz.v, v2_r2.v); - v2_r2_2.v = vec_fma(v2_dz2.v, v2_dz2.v, v2_r2_2.v); - - vector v_doi_mask, v_doi_mask2; - int doi_mask, doi_mask2; - - vector v2_doi_mask, v2_doi_mask2; - int doi2_mask, doi2_mask2; - - /* Form r2 < hig2 mask. */ - v_doi_mask.v = vec_cmp_lt(v_r2.v, v_hig2.v); - v_doi_mask2.v = vec_cmp_lt(v_r2_2.v, v_hig2.v); - v2_doi_mask.v = vec_cmp_lt(v2_r2.v, v_hig2_2.v); - v2_doi_mask2.v = vec_cmp_lt(v2_r2_2.v, v_hig2_2.v); - - /* Form integer mask. */ - doi_mask = vec_cmp_result(v_doi_mask.v); - doi_mask2 = vec_cmp_result(v_doi_mask2.v); - doi2_mask = vec_cmp_result(v2_doi_mask.v); - doi2_mask2 = vec_cmp_result(v2_doi_mask2.v); - - if (doi_mask) - runner_iact_nonsym_intrinsic_vec_density( - &v_r2, &v_dx, &v_dy, &v_dz, v_hi_inv, v_vix, v_viy, v_viz, - &ci_cache->vx[cj_cache_idx], &ci_cache->vy[cj_cache_idx], - &ci_cache->vz[cj_cache_idx], &ci_cache->m[cj_cache_idx], &rhoSum, - &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, - &curlvySum, &curlvzSum, v_doi_mask, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif - if (doi_mask2) - runner_iact_nonsym_intrinsic_vec_density( - &v_r2_2, &v_dx2, &v_dy2, &v_dz2, v_hi_inv, v_vix, v_viy, v_viz, - &ci_cache->vx[cj_cache_idx + VEC_SIZE], - &ci_cache->vy[cj_cache_idx + VEC_SIZE], - &ci_cache->vz[cj_cache_idx + VEC_SIZE], - &ci_cache->m[cj_cache_idx + VEC_SIZE], &rhoSum, &rho_dhSum, - &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, - &curlvzSum, v_doi_mask2, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif - if (doi2_mask) - runner_iact_nonsym_intrinsic_vec_density( - &v2_r2, &v2_dx, &v2_dy, &v2_dz, v_hi_inv_2, v_vix2, v_viy2, v_viz2, - &ci_cache->vx[cj_cache_idx], &ci_cache->vy[cj_cache_idx], - &ci_cache->vz[cj_cache_idx], &ci_cache->m[cj_cache_idx], &rhoSum2, - &rho_dhSum2, &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2, - &curlvySum2, &curlvzSum2, v2_doi_mask, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif - if (doi2_mask2) - runner_iact_nonsym_intrinsic_vec_density( - &v2_r2_2, &v2_dx2, &v2_dy2, &v2_dz2, v_hi_inv_2, v_vix2, v_viy2, - v_viz2, &ci_cache->vx[cj_cache_idx + VEC_SIZE], - &ci_cache->vy[cj_cache_idx + VEC_SIZE], - &ci_cache->vz[cj_cache_idx + VEC_SIZE], - &ci_cache->m[cj_cache_idx + VEC_SIZE], &rhoSum2, &rho_dhSum2, - &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2, &curlvySum2, - &curlvzSum2, v2_doi_mask2, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif - - } /* loop over the parts in cj. */ - - /* Perform horizontal adds on vector sums and store result in particle pi. - */ - VEC_HADD(rhoSum, pi->rho); - VEC_HADD(rho_dhSum, pi->density.rho_dh); - VEC_HADD(wcountSum, pi->density.wcount); - VEC_HADD(wcount_dhSum, pi->density.wcount_dh); - VEC_HADD(div_vSum, pi->density.div_v); - VEC_HADD(curlvxSum, pi->density.rot_v[0]); - VEC_HADD(curlvySum, pi->density.rot_v[1]); - VEC_HADD(curlvzSum, pi->density.rot_v[2]); - - VEC_HADD(rhoSum2, pi2->rho); - VEC_HADD(rho_dhSum2, pi2->density.rho_dh); - VEC_HADD(wcountSum2, pi2->density.wcount); - VEC_HADD(wcount_dhSum2, pi2->density.wcount_dh); - VEC_HADD(div_vSum2, pi2->density.div_v); - VEC_HADD(curlvxSum2, pi2->density.rot_v[0]); - VEC_HADD(curlvySum2, pi2->density.rot_v[1]); - VEC_HADD(curlvzSum2, pi2->density.rot_v[2]); - - } /* loop over the parts in ci. */ - - cache_read_cell_sorted(ci, ci_cache, sort_i, loc, shift); - - h = parts_j[sort_j[0].i].h; - max_dj[0] = sort_j[0].d - h * kernel_gamma - dx_max - rshift; - - for (int k = 1; k < cj->count; k++) { - h = parts_j[sort_j[k].i].h; - d = sort_j[k].d - h * kernel_gamma - dx_max - rshift; - - max_dj[k] = fmaxf(max_dj[k - 1], d); - } - - int max_ind_i = 0; - /* Loop over the parts in cj. */ - for (int pjd = 0; pjd < count_j && max_ind_i < count_i; pjd += 2) { - - /* Get a hold of the jth part in cj. */ - struct part *restrict pj = &parts_j[sort_j[pjd].i]; - struct part *restrict pj2 = &parts_j[sort_j[pjd + 1].i]; - if (!part_is_active(pj, e)) continue; - - di = sort_i[max_ind_i].d; - while (max_ind_i < count_i - 1 && max_dj[pjd] > di) { - max_ind_i++; - - di = sort_i[max_ind_i].d; - } - int exit_iteration = max_ind_i; - - const float hj = pj->h; - const float hj_2 = pj2->h; - const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; - if (dj > di_max) continue; - - const float hjg2 = hj * hj * kernel_gamma2; - const float hjg2_2 = hj_2 * hj_2 * kernel_gamma2; - - vector pjx, pjy, pjz; - vector pjx2, pjy2, pjz2; - vector v_hj, v_vjx, v_vjy, v_vjz, v_hjg2; - vector v_hj_2, v_vjx2, v_vjy2, v_vjz2, v_hjg2_2; - - /* Fill particle pi vectors. */ - pjx.v = vec_set1(pj->x[0] - loc[0]); - pjy.v = vec_set1(pj->x[1] - loc[1]); - pjz.v = vec_set1(pj->x[2] - loc[2]); - v_hj.v = vec_set1(hj); - v_vjx.v = vec_set1(pj->v[0]); - v_vjy.v = vec_set1(pj->v[1]); - v_vjz.v = vec_set1(pj->v[2]); - - v_hjg2.v = vec_set1(hjg2); - - pjx2.v = vec_set1(pj2->x[0] - loc[0]); - pjy2.v = vec_set1(pj2->x[1] - loc[1]); - pjz2.v = vec_set1(pj2->x[2] - loc[2]); - v_hj_2.v = vec_set1(hj_2); - v_vjx2.v = vec_set1(pj2->v[0]); - v_vjy2.v = vec_set1(pj2->v[1]); - v_vjz2.v = vec_set1(pj2->v[2]); - - v_hjg2_2.v = vec_set1(hjg2_2); - - /* Reset cumulative sums of update vectors. */ - vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, - curlvySum, curlvzSum; - - vector rhoSum2, rho_dhSum2, wcountSum2, wcount_dhSum2, div_vSum2, - curlvxSum2, curlvySum2, curlvzSum2; - - /* Get the inverse of hj. */ - vector v_hj_inv; - vector v_hj_inv_2; - - v_hj_inv = vec_reciprocal(v_hj); - v_hj_inv_2 = vec_reciprocal(v_hj_2); - - rhoSum.v = vec_setzero(); - rho_dhSum.v = vec_setzero(); - wcountSum.v = vec_setzero(); - wcount_dhSum.v = vec_setzero(); - div_vSum.v = vec_setzero(); - curlvxSum.v = vec_setzero(); - curlvySum.v = vec_setzero(); - curlvzSum.v = vec_setzero(); - - rhoSum2.v = vec_setzero(); - rho_dhSum2.v = vec_setzero(); - wcountSum2.v = vec_setzero(); - wcount_dhSum2.v = vec_setzero(); - div_vSum2.v = vec_setzero(); - curlvxSum2.v = vec_setzero(); - curlvySum2.v = vec_setzero(); - curlvzSum2.v = vec_setzero(); - - // exit_iteration = 0; - /* Pad cache if there is a serial remainder. */ - int exit_iteration_align = exit_iteration; - int rem = exit_iteration % (num_vec_proc * VEC_SIZE); - // if (rem != 0) { - // int pad = (num_vec_proc * VEC_SIZE) - rem; - - // exit_iteration_align -= pad; - //} - - vector pix, piy, piz; - vector pix2, piy2, piz2; - // vector pivx, pivy, pivz, mi; - - /* Loop over the parts in ci. */ - // for (int pid = count_i - 1; pid >= 0; pid -= (num_vec_proc * VEC_SIZE)) { - // for (int pid = count_i - 1; pid >= exit_iteration_align; pid -= - // (num_vec_proc * VEC_SIZE)) { - for (int pid = exit_iteration_align; - pid < (count_i + (num_vec_proc * VEC_SIZE) - rem); - pid += (num_vec_proc * VEC_SIZE)) { - - /* Get the cache index to the ith particle. */ - int ci_cache_idx = pid; // sort_i[pid].i; - int ci2_cache_idx = pid + VEC_SIZE; - - vector v_dx, v_dy, v_dz, v_r2; - vector v_dx2, v_dy2, v_dz2, v_r2_2; - vector v2_dx, v2_dy, v2_dz, v2_r2; - vector v2_dx2, v2_dy2, v2_dz2, v2_r2_2; - - /* Load 2 sets of vectors from the particle cache. */ - pix.v = vec_load(&ci_cache->x[ci_cache_idx]); - pix2.v = vec_load(&ci_cache->x[ci2_cache_idx]); - piy.v = vec_load(&ci_cache->y[ci_cache_idx]); - piy2.v = vec_load(&ci_cache->y[ci2_cache_idx]); - piz.v = vec_load(&ci_cache->z[ci_cache_idx]); - piz2.v = vec_load(&ci_cache->z[ci2_cache_idx]); - // pivx.v = vec_load(&ci_cache->vx[ci_cache_idx]); - // pivy.v = vec_load(&ci_cache->vy[ci_cache_idx]); - // pivz.v = vec_load(&ci_cache->vz[ci_cache_idx]); - // mi.v = vec_load(&ci_cache->m[ci_cache_idx]); - - /* Compute the pairwise distance. */ - v_dx.v = vec_sub(pjx.v, pix.v); - v_dx2.v = vec_sub(pjx.v, pix2.v); - v2_dx.v = vec_sub(pjx2.v, pix.v); - v2_dx2.v = vec_sub(pjx2.v, pix2.v); - - v_dy.v = vec_sub(pjy.v, piy.v); - v_dy2.v = vec_sub(pjy.v, piy2.v); - v2_dy.v = vec_sub(pjy2.v, piy.v); - v2_dy2.v = vec_sub(pjy2.v, piy2.v); - - v_dz.v = vec_sub(pjz.v, piz.v); - v_dz2.v = vec_sub(pjz.v, piz2.v); - v2_dz.v = vec_sub(pjz2.v, piz.v); - v2_dz2.v = vec_sub(pjz2.v, piz2.v); - - v_r2.v = vec_mul(v_dx.v, v_dx.v); - v_r2_2.v = vec_mul(v_dx2.v, v_dx2.v); - v2_r2.v = vec_mul(v2_dx.v, v2_dx.v); - v2_r2_2.v = vec_mul(v2_dx2.v, v2_dx2.v); - - v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); - v_r2_2.v = vec_fma(v_dy2.v, v_dy2.v, v_r2_2.v); - v2_r2.v = vec_fma(v2_dy.v, v2_dy.v, v2_r2.v); - v2_r2_2.v = vec_fma(v2_dy2.v, v2_dy2.v, v2_r2_2.v); - - v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); - v_r2_2.v = vec_fma(v_dz2.v, v_dz2.v, v_r2_2.v); - v2_r2.v = vec_fma(v2_dz.v, v2_dz.v, v2_r2.v); - v2_r2_2.v = vec_fma(v2_dz2.v, v2_dz2.v, v2_r2_2.v); - - vector v_doj_mask, v_doj_mask2; - int doj_mask, doj_mask2; - - vector v2_doj_mask, v2_doj_mask2; - int doj2_mask, doj2_mask2; - - /* Form r2 < hig2 mask. */ - v_doj_mask.v = vec_cmp_lt(v_r2.v, v_hjg2.v); - v_doj_mask2.v = vec_cmp_lt(v_r2_2.v, v_hjg2.v); - v2_doj_mask.v = vec_cmp_lt(v2_r2.v, v_hjg2_2.v); - v2_doj_mask2.v = vec_cmp_lt(v2_r2_2.v, v_hjg2_2.v); - - /* Form integer mask. */ - doj_mask = vec_cmp_result(v_doj_mask.v); - doj_mask2 = vec_cmp_result(v_doj_mask2.v); - doj2_mask = vec_cmp_result(v2_doj_mask.v); - doj2_mask2 = vec_cmp_result(v2_doj_mask2.v); - - /* Perform interaction with 2 vectors. */ - if (doj_mask) - runner_iact_nonsym_intrinsic_vec_density( - &v_r2, &v_dx, &v_dy, &v_dz, v_hj_inv, v_vjx, v_vjy, v_vjz, - &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx], - &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &rhoSum, - &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, - &curlvySum, &curlvzSum, v_doj_mask, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif - if (doj_mask2) - runner_iact_nonsym_intrinsic_vec_density( - &v_r2_2, &v_dx2, &v_dy2, &v_dz2, v_hj_inv, v_vjx, v_vjy, v_vjz, - &ci_cache->vx[ci2_cache_idx], &ci_cache->vy[ci2_cache_idx], - &ci_cache->vz[ci2_cache_idx], &ci_cache->m[ci2_cache_idx], &rhoSum, - &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, - &curlvySum, &curlvzSum, v_doj_mask2, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif - if (doj2_mask) - runner_iact_nonsym_intrinsic_vec_density( - &v2_r2, &v2_dx, &v2_dy, &v2_dz, v_hj_inv_2, v_vjx2, v_vjy2, v_vjz2, - &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx], - &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &rhoSum2, - &rho_dhSum2, &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2, - &curlvySum2, &curlvzSum2, v2_doj_mask, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif - if (doj2_mask2) - runner_iact_nonsym_intrinsic_vec_density( - &v2_r2_2, &v2_dx2, &v2_dy2, &v2_dz2, v_hj_inv_2, v_vjx2, v_vjy2, - v_vjz2, &ci_cache->vx[ci2_cache_idx], &ci_cache->vy[ci2_cache_idx], - &ci_cache->vz[ci2_cache_idx], &ci_cache->m[ci2_cache_idx], &rhoSum2, - &rho_dhSum2, &wcountSum2, &wcount_dhSum2, &div_vSum2, &curlvxSum2, - &curlvySum2, &curlvzSum2, v2_doj_mask2, -#ifdef HAVE_AVX512_F - knl_mask); -#else - 0); -#endif - } /* loop over the parts in cj. */ - - /* Perform horizontal adds on vector sums and store result in particle pi. - */ - VEC_HADD(rhoSum, pj->rho); - VEC_HADD(rho_dhSum, pj->density.rho_dh); - VEC_HADD(wcountSum, pj->density.wcount); - VEC_HADD(wcount_dhSum, pj->density.wcount_dh); - VEC_HADD(div_vSum, pj->density.div_v); - VEC_HADD(curlvxSum, pj->density.rot_v[0]); - VEC_HADD(curlvySum, pj->density.rot_v[1]); - VEC_HADD(curlvzSum, pj->density.rot_v[2]); - - VEC_HADD(rhoSum2, pj2->rho); - VEC_HADD(rho_dhSum2, pj2->density.rho_dh); - VEC_HADD(wcountSum2, pj2->density.wcount); - VEC_HADD(wcount_dhSum2, pj2->density.wcount_dh); - VEC_HADD(div_vSum2, pj2->density.div_v); - VEC_HADD(curlvxSum2, pj2->density.rot_v[0]); - VEC_HADD(curlvySum2, pj2->density.rot_v[1]); - VEC_HADD(curlvzSum2, pj2->density.rot_v[2]); - - } /* loop over the parts in ci. */ - - TIMER_TOC(timer_dopair_density); - -#endif /* WITH_VECTORIZATION */ -} -/** - * @brief Compute the interactions between a cell pair (non-symmetric). - * - * @param r The #runner. - * @param ci The first #cell. - * @param cj The second #cell. - */ -void runner_dopair1_density_auto_vec(struct runner *r, struct cell *ci, - struct cell *cj) { - -#if defined(WITH_VECTORIZATION) && defined(DOPAIR1_AUTO_VEC) - const struct engine *restrict e = r->e; - - TIMER_TIC; - - /* Anything to do here? */ - if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; - -#ifdef SWIFT_DEBUG_CHECKS - cell_is_drifted(ci, e); - cell_is_drifted(cj, e); -#endif - - /* Get the sort ID. */ - double shift[3] = {0.0, 0.0, 0.0}; - const int sid = space_getsid(e->s, &ci, &cj, shift); - - /* Have the cells been sorted? */ - if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid))) - error("Trying to interact unsorted cells."); - - /* Get the cutoff shift. */ - double rshift = 0.0; - for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k]; - - /* Pick-out the sorted lists. */ - const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)]; - const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)]; - - /* Get some other useful values. */ - const int count_i = ci->count; - const int count_j = cj->count; - struct part *restrict parts_i = ci->parts; - struct part *restrict parts_j = cj->parts; - const double di_max = sort_i[count_i - 1].d - rshift; - const double dj_min = sort_j[0].d; - const float dx_max = (ci->dx_max + cj->dx_max); - - /* Get the particle cache from the runner and re-allocate - * the cache if it is not big enough for the cell. */ - // struct cache *restrict ci_cache = &r->par_cache; - - // if (ci_cache->count < count_i) { - // cache_init(ci_cache, count_i); - //} - // if (cj_cache.count < count_j) { - // cache_init(&cj_cache, count_j); - //} - - // cache_read_two_cells(ci, cj, ci_cache, &cj_cache, shift); - cache_read_two_cells_sorted(ci, cj, &ci_cache, &cj_cache, sort_i, sort_j, - shift); - - /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */ - /* For particles in ci */ - populate_max_d(ci, cj, sort_i, sort_j, &ci_cache, &cj_cache, dx_max, rshift, - max_di, max_dj); - - float di, dj; - - int max_ind_j = count_j - 1; - - /* Loop over the parts in ci. */ - for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid--) { - - /* Get a hold of the ith part in ci. */ - struct part *restrict pi = &parts_i[sort_i[pid].i]; - if (!part_is_active(pi, e)) continue; - - dj = sort_j[max_ind_j].d; - while (max_ind_j > 0 && max_di[pid] < dj) { - max_ind_j--; - - dj = sort_j[max_ind_j].d; - } - int exit_iteration = max_ind_j; - - int ci_cache_idx = pid; // sort_i[pid].i; - - const float hi = ci_cache.h[ci_cache_idx]; - const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift; - if (di < dj_min) continue; - - const float hig2 = hi * hi * kernel_gamma2; - - float pix, piy, piz; - float vix, viy, viz; - float hi_inv; - - /* Fill particle pi vectors. */ - pix = ci_cache.x[ci_cache_idx]; - piy = ci_cache.y[ci_cache_idx]; - piz = ci_cache.z[ci_cache_idx]; - vix = ci_cache.vx[ci_cache_idx]; - viy = ci_cache.vy[ci_cache_idx]; - viz = ci_cache.vz[ci_cache_idx]; - - /* Get the inverse of hi. */ - hi_inv = 1.0f / hi; - - /* Loop over the parts in cj. */ - for (int pjd = 0; pjd <= exit_iteration; pjd++) { - - /* Get the cache index to the jth particle. */ - // int cj_cache_idx = pjd; //sort_j[pjd].i; - - float dx, dy, dz, r2; - - /* Compute the pairwise distance. */ - dx = pix - cj_cache->x[pjd]; - dy = piy - cj_cache->y[pjd]; - dz = piz - cj_cache->z[pjd]; - - r2 = dx * dx + dy * dy + dz * dz; - - runner_iact_nonsym_density_jsw( - r2, hig2, dx, dy, dz, hi_inv, cj_cache->h[pjd], vix, viy, viz, - cj_cache->vx[pjd], cj_cache->vy[pjd], cj_cache->vz[pjd], - cj_cache->m[pjd], &ci_cache.rho[pid], &ci_cache.rho_dh[pid], - &ci_cache.wcount[pid], &ci_cache.wcount_dh[pid], &ci_cache.div_v[pid], - &ci_cache.curl_vx[pid], &ci_cache.curl_vy[pid], - &ci_cache.curl_vz[pid]); - - } /* loop over the parts in cj. */ - - } /* loop over the parts in ci. */ - - int max_ind_i = 0; - /* Loop over the parts in cj. */ - for (int pjd = 0; pjd < count_j && max_ind_i < count_i; pjd++) { - - /* Get a hold of the jth part in cj. */ - struct part *restrict pj = &parts_j[sort_j[pjd].i]; - if (!part_is_active(pj, e)) continue; - - di = sort_i[max_ind_i].d; - while (max_ind_i < count_i - 1 && max_dj[pjd] > di) { - max_ind_i++; - - di = sort_i[max_ind_i].d; - } - int exit_iteration = max_ind_i; - - int cj_cache_idx = pjd; - - const float hj = cj_cache->h[cj_cache_idx]; - const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; - if (dj > di_max) continue; - - const float hjg2 = hj * hj * kernel_gamma2; - - float pjx, pjy, pjz; - float vjx, vjy, vjz; - float hj_inv; - - /* Fill particle pi vectors. */ - pjx = cj_cache->x[cj_cache_idx]; - pjy = cj_cache->y[cj_cache_idx]; - pjz = cj_cache->z[cj_cache_idx]; - vjx = cj_cache->vx[cj_cache_idx]; - vjy = cj_cache->vy[cj_cache_idx]; - vjz = cj_cache->vz[cj_cache_idx]; - - /* Get the inverse of hj. */ - hj_inv = 1.0f / hj; - - /* Loop over the parts in ci. */ - for (int pid = exit_iteration; pid < count_i; pid++) { - - /* Get the cache index to the ith particle. */ - int ci_cache_idx = pid; // sort_i[pid].i; - - float dx, dy, dz, r2; - - /* Compute the pairwise distance. */ - dx = pjx - ci_cache.x[ci_cache_idx]; - dy = pjy - ci_cache.y[ci_cache_idx]; - dz = pjz - ci_cache.z[ci_cache_idx]; - - r2 = dx * dx + dy * dy + dz * dz; - - runner_iact_nonsym_density_jsw( - r2, hjg2, dx, dy, dz, hj_inv, ci_cache.h[ci_cache_idx], vjx, vjy, vjz, - ci_cache.vx[ci_cache_idx], ci_cache.vy[ci_cache_idx], - ci_cache.vz[ci_cache_idx], ci_cache.m[ci_cache_idx], - &cj_cache.rho[cj_cache_idx], &cj_cache.rho_dh[cj_cache_idx], - &cj_cache.wcount[cj_cache_idx], &cj_cache.wcount_dh[cj_cache_idx], - &cj_cache.div_v[cj_cache_idx], &cj_cache.curl_vx[cj_cache_idx], - &cj_cache.curl_vy[cj_cache_idx], &cj_cache.curl_vz[cj_cache_idx]); - - } /* loop over the parts in ci. */ - - } /* loop over the parts in cj. */ - - cache_write_sorted_particles(&ci_cache, &cj_cache, ci, cj, sort_i, sort_j); - - TIMER_TOC(timer_dopair_density); - -#endif /* WITH_VECTORIZATION */ -} diff --git a/src/runner_doiact_vec.h b/src/runner_doiact_vec.h index 0a583aec27cc3d34cec08c1e1da9b5b9a01f40a7..b538f2ab2ebceac69c8a49c29de37f2343abeb7b 100644 --- a/src/runner_doiact_vec.h +++ b/src/runner_doiact_vec.h @@ -39,7 +39,5 @@ void runner_doself1_density_vec(struct runner *r, struct cell *restrict c); void runner_doself1_density_vec_2(struct runner *r, struct cell *restrict c); void runner_dopair1_density_vec(struct runner *r, struct cell *restrict ci, struct cell *restrict cj); -void runner_dopair1_density_vec_2(struct runner *r, struct cell *restrict ci, - struct cell *restrict cj); #endif /* SWIFT_RUNNER_VEC_H */ diff --git a/src/vector.h b/src/vector.h index 6c18b3590f34b55388dbc4a5151e633eb4a5673d..5582eecf93b6e7a9f82b885cda1f6c70d6bf059a 100644 --- a/src/vector.h +++ b/src/vector.h @@ -23,8 +23,6 @@ /* Have I already read this file? */ #ifndef VEC_MACRO -/*TODO: Tidy this file up with comments.*/ - #include "../config.h" #ifdef WITH_VECTORIZATION diff --git a/tests/test27cells.c b/tests/test27cells.c index feb13f1952461a73f32ae59d29e917766796ff11..9a8eb1c561a9c68dc6e53b219dc442c0aa1259e1 100644 --- a/tests/test27cells.c +++ b/tests/test27cells.c @@ -34,7 +34,9 @@ #if defined(WITH_VECTORIZATION) #define DOSELF1 runner_doself1_density_vec +#define DOPAIR1 runner_dopair1_density_vec #define DOSELF1_NAME "runner_doself1_density_vec" +#define DOPAIR1_NAME "runner_dopair1_density_vec" #endif #ifndef DOSELF1 @@ -42,41 +44,6 @@ #define DOSELF1_NAME "runner_doself1_density" #endif -#if defined(WITH_VECTORIZATION) && defined(DOPAIR1_VEC) -#define DOPAIR1 runner_dopair1_density_vec -#define DOPAIR1_NAME "runner_dopair1_density_vec" -#endif - -#if defined(WITH_VECTORIZATION) && defined(DOPAIR1_VEC_1) -#define DOPAIR1 runner_dopair1_density_vec_1 -#define DOPAIR1_NAME "runner_dopair1_density_vec_1" -#endif - -#if defined(WITH_VECTORIZATION) && defined(DOPAIR1_VEC_2) -#define DOPAIR1 runner_dopair1_density_vec_2 -#define DOPAIR1_NAME "runner_dopair1_density_vec_2" -#endif - -#if defined(WITH_VECTORIZATION) && defined(DOPAIR1_VEC_3) -#define DOPAIR1 runner_dopair1_density_vec_3 -#define DOPAIR1_NAME "runner_dopair1_density_vec_3" -#endif - -#if defined(WITH_VECTORIZATION) && defined(DOPAIR1_VEC_4) -#define DOPAIR1 runner_dopair1_density_vec_4 -#define DOPAIR1_NAME "runner_dopair1_density_vec_4" -#endif - -#if defined(WITH_VECTORIZATION) && defined(DOPAIR1_AUTO_VEC) -#define DOPAIR1 runner_dopair1_density_auto_vec -#define DOPAIR1_NAME "runner_dopair1_density_auto_vec" -#endif - -#if defined(DOPAIR1_NOSORT_JSW) -#define DOPAIR1 runner_dopair1_nosort_density -#define DOPAIR1_NAME "runner_dopair1_nosort_density" -#endif - #ifndef DOPAIR1 #define DOPAIR1 runner_dopair1_density #define DOPAIR1_NAME "runner_dopair1_density" @@ -334,23 +301,11 @@ int check_results(struct part *serial_parts, struct part *vec_parts, int count, } /* Just a forward declaration... */ +void runner_doself1_density(struct runner *r, struct cell *ci); +void runner_doself1_density_vec(struct runner *r, struct cell *ci); void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj); -void runner_dopair1_nosort_density(struct runner *r, struct cell *ci, - struct cell *cj); void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *cj); -void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, - struct cell *cj); -void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, - struct cell *cj); -void runner_dopair1_density_vec_3(struct runner *r, struct cell *ci, - struct cell *cj); -void runner_dopair1_density_vec_4(struct runner *r, struct cell *ci, - struct cell *cj); -void runner_dopair1_density_auto_vec(struct runner *r, struct cell *ci, - struct cell *cj); -void runner_doself1_density(struct runner *r, struct cell *ci); -void runner_doself1_density_vec(struct runner *r, struct cell *ci); /* And go... */ int main(int argc, char *argv[]) { @@ -492,8 +447,6 @@ int main(int argc, char *argv[]) { cache_init(&runner.ci_cache, 512); runner.cj_cache.count = 0; cache_init(&runner.cj_cache, 512); -// cj_cache.count = 0; -// cache_init(&cj_cache, 512); #endif /* Run all the pairs */ @@ -582,9 +535,9 @@ int main(int argc, char *argv[]) { dump_particle_fields(outputFileName, main_cell, cells); /* Check serial results against the vectorised results. */ - // if (check_results(main_cell->parts, vec_parts, main_cell->count, - // threshold)) - // message("Differences found..."); + if (check_results(main_cell->parts, vec_parts, main_cell->count, + threshold)) + message("Differences found..."); /* Output timing */ message("Brute force calculation took : %15lli ticks.", toc - tic);