diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index 0b44a08ebcdb4426120c69c7587a4757da5fc2ff..552aa8e9deef12c10f0561e12816e1b49af0266b 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -975,45 +975,17 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec_2( float max_di[MAX_NO_OF_PARTS] __attribute__((aligned(sizeof(VEC_SIZE * sizeof(float))))); /* max distance into ci */ float max_dj[MAX_NO_OF_PARTS] __attribute__((aligned(sizeof(VEC_SIZE * sizeof(float))))); /* max distance into cj */ -FILE *faceIntFile; -FILE *edgeIntFile; -FILE *cornerIntFile; - -/** C2_CACHE VERSION - * @brief Compute the interactions between a cell pair (non-symmetric). - * - * @param r The #runner. - * @param ci The first #cell. - * @param cj The second #cell. - */ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell *cj) { #ifdef WITH_VECTORIZATION const struct engine *restrict e = r->e; - static int faceIntCount = 0; - static int faceCtr = 0; - static int edgeIntCount = 0; - static int edgeCtr = 0; - static int cornerIntCount = 0; - static int cornerCtr = 0; - static int numFaceTested = 0; - static int numEdgeTested = 0; - static int numCornerTested = 0; - int icount = 0, icount_align = 0; - struct c2_cache int_cache; int num_vec_proc = 1; vector v_hi, v_vix, v_viy, v_viz, v_hig2; TIMER_TIC; - if(faceCtr + edgeCtr + cornerCtr == 0) { - faceIntFile = fopen("particle_interactions_face.dat","w"); - edgeIntFile = fopen("particle_interactions_edge.dat","w"); - cornerIntFile = fopen("particle_interactions_corner.dat","w"); - } - /* Anything to do here? */ if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; @@ -1026,10 +998,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell * double shift[3] = {0.0, 0.0, 0.0}; const int sid = space_getsid(e->s, &ci, &cj, shift); - int face = (sid == 4 || sid == 10 || sid == 12); - int edge = (sid == 1 || sid == 3 || sid == 5 || sid == 7 || sid == 9 || sid == 11); - int corner = (sid == 0 || sid == 2 || sid == 6 || sid == 8); - /* Have the cells been sorted? */ if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid))) error("Trying to interact unsorted cells."); @@ -1062,18 +1030,38 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell * cache_init(&cj_cache, count_j); } - cache_read_two_cells_sorted(ci, cj, ci_cache, &cj_cache, sort_i, sort_j, shift); - + int first_pi, last_pj; + /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */ /* For particles in ci */ - populate_max_d(ci, cj, sort_i, sort_j, ci_cache, &cj_cache, dx_max, rshift, max_di, max_dj); + populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, max_di, max_dj, &first_pi, &last_pj); float di, dj; int max_ind_j = count_j - 1; + int max_ind_i = 0; + + dj = sort_j[max_ind_j].d; + while(max_ind_j > 0 && max_di[count_i - 1] < dj) { + max_ind_j--; + + dj = sort_j[max_ind_j].d; + } + + di = sort_i[max_ind_i].d; + while(max_ind_i < count_i - 1 && max_dj[0] > di) { + max_ind_i++; + + di = sort_i[max_ind_i].d; + } + + last_pj = max(last_pj, max_ind_j); + first_pi = min(first_pi, max_ind_i); + + cache_read_two_cells_sorted_2(ci, cj, ci_cache, &cj_cache, sort_i, sort_j, shift, first_pi, last_pj, num_vec_proc); /* Loop over the parts in ci. */ - for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid--) { + for (int pid = count_i - 1; pid >= first_pi && max_ind_j >= 0; pid--) { /* Get a hold of the ith part in ci. */ struct part *restrict pi = &parts_i[sort_i[pid].i]; @@ -1136,7 +1124,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell * } vector pjx, pjy, pjz; - vector pjvx, pjvy, pjvz, mj; /* Loop over the parts in cj. */ for (int pjd = 0; pjd < exit_iteration_align; pjd += VEC_SIZE) { @@ -1150,10 +1137,10 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell * pjx.v = vec_load(&cj_cache.x[cj_cache_idx]); pjy.v = vec_load(&cj_cache.y[cj_cache_idx]); pjz.v = vec_load(&cj_cache.z[cj_cache_idx]); - pjvx.v = vec_load(&cj_cache.vx[cj_cache_idx]); - pjvy.v = vec_load(&cj_cache.vy[cj_cache_idx]); - pjvz.v = vec_load(&cj_cache.vz[cj_cache_idx]); - mj.v = vec_load(&cj_cache.m[cj_cache_idx]); + //pjvx.v = vec_load(&cj_cache.vx[cj_cache_idx]); + //pjvy.v = vec_load(&cj_cache.vy[cj_cache_idx]); + //pjvz.v = vec_load(&cj_cache.vz[cj_cache_idx]); + //mj.v = vec_load(&cj_cache.m[cj_cache_idx]); /* Compute the pairwise distance. */ v_dx.v = vec_sub(pix.v, pjx.v); @@ -1173,50 +1160,19 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell * /* Form integer mask. */ doi_mask = vec_cmp_result(v_doi_mask.v); - /* If there are any interactions left pack interaction values into c2 - * cache. */ - if (doi_mask) - storeInteractions(doi_mask, cj_cache_idx, &v_r2, &v_dx, &v_dy, &v_dz, - &mj, &pjvx, &pjvy, &pjvz, &cj_cache, &int_cache, - &icount, &rhoSum, &rho_dhSum, &wcountSum, - &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, - &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); - - } /* loop over the parts in cj. */ - - /* Perform padded vector remainder interactions if any are present. */ - calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum, - &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, - &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz, - &icount_align); - - /* Initialise masks to true in case remainder interactions have been - * performed. */ - vector int_mask, int_mask2; -#ifdef HAVE_AVX512_F - KNL_MASK_16 knl_mask = 0xFFFF; - KNL_MASK_16 knl_mask2 = 0xFFFF; - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#else - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#endif - - /* Perform interaction with 2 vectors. */ - for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) { - runner_iact_nonsym_2_vec_density( - &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd], - &int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz, - &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd], - &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, - &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2, + if(doi_mask) + runner_iact_nonsym_intrinsic_vec_density( + &v_r2, &v_dx, &v_dy,&v_dz, v_hi_inv, v_vix, v_viy, v_viz, + &cj_cache.vx[cj_cache_idx], &cj_cache.vy[cj_cache_idx], &cj_cache.vz[cj_cache_idx], + &cj_cache.m[cj_cache_idx], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, + &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_doi_mask, #ifdef HAVE_AVX512_F - knl_mask, knl_mask2); + knl_mask); #else - 0, 0); + 0); #endif - } + + } /* loop over the parts in cj. */ /* Perform horizontal adds on vector sums and store result in particle pi. */ @@ -1229,42 +1185,10 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell * VEC_HADD(curlvySum, pi->density.rot_v[1]); VEC_HADD(curlvzSum, pi->density.rot_v[2]); - if(face) { - faceIntCount += icount; - fprintf(faceIntFile,"%d\n",icount); - numFaceTested++; - } - else if(edge) { - edgeIntCount += icount; - fprintf(edgeIntFile,"%d\n",icount); - numEdgeTested++; - } - else if(corner) { - cornerIntCount += icount; - fprintf(cornerIntFile,"%d\n",icount); - numCornerTested++; - } - - icount = 0; - } /* loop over the parts in ci. */ - if(face) { - faceCtr++; - message("Total number of face interactions: %d, average per particle: %f, number tested: %d.", faceIntCount, ((float)faceIntCount) / ((float)numFaceTested), numFaceTested); - } - else if(edge) { - edgeCtr++; - message("Total number of edge interactions: %d, average per particle: %f, number tested: %d", edgeIntCount, ((float)edgeIntCount) / ((float)numEdgeTested), numEdgeTested); - } - else if(corner) { - cornerCtr++; - message("Total number of corner interactions: %d, average per particle: %f, number tested: %d", cornerIntCount, ((float)cornerIntCount) / ((float)numCornerTested), numCornerTested); - } - - int max_ind_i = 0; /* Loop over the parts in cj. */ - for (int pjd = 0; pjd < count_j && max_ind_i < count_i; pjd++) { + for (int pjd = 0; pjd <= last_pj && max_ind_i < count_i; pjd++) { /* Get a hold of the jth part in cj. */ struct part *restrict pj = &parts_j[sort_j[pjd].i]; @@ -1328,11 +1252,10 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell * } vector pix, piy, piz; - vector pivx, pivy, pivz, mi; + //vector pivx, pivy, pivz, mi; /* Loop over the parts in ci. */ - //for (int pid = count_i - 1; pid >= 0; pid -= VEC_SIZE) { - for (int pid = exit_iteration_align; pid < count_i; pid += (num_vec_proc * VEC_SIZE)) { + for (int pid = exit_iteration_align; pid < count_i; pid += VEC_SIZE) { /* Get the cache index to the ith particle. */ int ci_cache_idx = pid; //sort_i[pid].i; @@ -1343,10 +1266,10 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell * pix.v = vec_load(&ci_cache->x[ci_cache_idx]); piy.v = vec_load(&ci_cache->y[ci_cache_idx]); piz.v = vec_load(&ci_cache->z[ci_cache_idx]); - pivx.v = vec_load(&ci_cache->vx[ci_cache_idx]); - pivy.v = vec_load(&ci_cache->vy[ci_cache_idx]); - pivz.v = vec_load(&ci_cache->vz[ci_cache_idx]); - mi.v = vec_load(&ci_cache->m[ci_cache_idx]); + //pivx.v = vec_load(&ci_cache->vx[ci_cache_idx]); + //pivy.v = vec_load(&ci_cache->vy[ci_cache_idx]); + //pivz.v = vec_load(&ci_cache->vz[ci_cache_idx]); + //mi.v = vec_load(&ci_cache->m[ci_cache_idx]); /* Compute the pairwise distance. */ v_dx.v = vec_sub(pjx.v, pix.v); @@ -1366,50 +1289,20 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell * /* Form integer mask. */ doj_mask = vec_cmp_result(v_doj_mask.v); - /* If there are any interactions left pack interaction values into c2 - * cache. */ + /* Perform interaction with 2 vectors. */ if (doj_mask) - storeInteractions(doj_mask, ci_cache_idx, &v_r2, &v_dx, &v_dy, &v_dz, - &mi, &pivx, &pivy, &pivz, ci_cache, &int_cache, - &icount, &rhoSum, &rho_dhSum, &wcountSum, - &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, - &curlvzSum, v_hj_inv, v_vjx, v_vjy, v_vjz); - - } /* loop over the parts in cj. */ - - /* Perform padded vector remainder interactions if any are present. */ - calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum, - &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, - &curlvySum, &curlvzSum, v_hj_inv, v_vjx, v_vjy, v_vjz, - &icount_align); - - /* Initialise masks to true in case remainder interactions have been - * performed. */ - vector int_mask, int_mask2; -#ifdef HAVE_AVX512_F - KNL_MASK_16 knl_mask = 0xFFFF; - KNL_MASK_16 knl_mask2 = 0xFFFF; - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#else - int_mask.m = vec_setint1(0xFFFFFFFF); - int_mask2.m = vec_setint1(0xFFFFFFFF); -#endif - - /* Perform interaction with 2 vectors. */ - for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) { - runner_iact_nonsym_2_vec_density( - &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd], - &int_cache.dzq[pjd], v_hj_inv, v_vjx, v_vjy, v_vjz, - &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd], - &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, - &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2, + runner_iact_nonsym_intrinsic_vec_density( + &v_r2, &v_dx, &v_dy, &v_dz, v_hj_inv, v_vjx, v_vjy, v_vjz, + &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx], &ci_cache->vz[ci_cache_idx], + &ci_cache->m[ci_cache_idx], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, + &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_doj_mask, #ifdef HAVE_AVX512_F - knl_mask, knl_mask2); + knl_mask); #else - 0, 0); + 0); #endif - } + + } /* loop over the parts in cj. */ /* Perform horizontal adds on vector sums and store result in particle pi. */ @@ -1422,8 +1315,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell * VEC_HADD(curlvySum, pj->density.rot_v[1]); VEC_HADD(curlvzSum, pj->density.rot_v[2]); - icount = 0; - } /* loop over the parts in ci. */ TIMER_TOC(timer_dopair_density); @@ -1431,17 +1322,45 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell * #endif /* WITH_VECTORIZATION */ } +FILE *faceIntFile; +FILE *edgeIntFile; +FILE *cornerIntFile; + +/** C2_CACHE VERSION + * @brief Compute the interactions between a cell pair (non-symmetric). + * + * @param r The #runner. + * @param ci The first #cell. + * @param cj The second #cell. + */ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell *cj) { #ifdef WITH_VECTORIZATION const struct engine *restrict e = r->e; + static int faceIntCount = 0; + static int faceCtr = 0; + static int edgeIntCount = 0; + static int edgeCtr = 0; + static int cornerIntCount = 0; + static int cornerCtr = 0; + static int numFaceTested = 0; + static int numEdgeTested = 0; + static int numCornerTested = 0; + int icount = 0, icount_align = 0; + struct c2_cache int_cache; int num_vec_proc = 1; vector v_hi, v_vix, v_viy, v_viz, v_hig2; TIMER_TIC; + if(faceCtr + edgeCtr + cornerCtr == 0) { + faceIntFile = fopen("particle_interactions_face.dat","w"); + edgeIntFile = fopen("particle_interactions_edge.dat","w"); + cornerIntFile = fopen("particle_interactions_corner.dat","w"); + } + /* Anything to do here? */ if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; @@ -1454,6 +1373,10 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell double shift[3] = {0.0, 0.0, 0.0}; const int sid = space_getsid(e->s, &ci, &cj, shift); + int face = (sid == 4 || sid == 10 || sid == 12); + int edge = (sid == 1 || sid == 3 || sid == 5 || sid == 7 || sid == 9 || sid == 11); + int corner = (sid == 0 || sid == 2 || sid == 6 || sid == 8); + /* Have the cells been sorted? */ if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid))) error("Trying to interact unsorted cells."); @@ -1486,38 +1409,18 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell cache_init(&cj_cache, count_j); } - int first_pi, last_pj; - + cache_read_two_cells_sorted(ci, cj, ci_cache, &cj_cache, sort_i, sort_j, shift); + /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */ /* For particles in ci */ - populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, max_di, max_dj, &first_pi, &last_pj); + populate_max_d(ci, cj, sort_i, sort_j, ci_cache, &cj_cache, dx_max, rshift, max_di, max_dj); float di, dj; int max_ind_j = count_j - 1; - int max_ind_i = 0; - - dj = sort_j[max_ind_j].d; - while(max_ind_j > 0 && max_di[count_i - 1] < dj) { - max_ind_j--; - - dj = sort_j[max_ind_j].d; - } - - di = sort_i[max_ind_i].d; - while(max_ind_i < count_i - 1 && max_dj[0] > di) { - max_ind_i++; - - di = sort_i[max_ind_i].d; - } - - last_pj = max(last_pj, max_ind_j); - first_pi = min(first_pi, max_ind_i); - - cache_read_two_cells_sorted_2(ci, cj, ci_cache, &cj_cache, sort_i, sort_j, shift, first_pi, last_pj, num_vec_proc); /* Loop over the parts in ci. */ - for (int pid = count_i - 1; pid >= first_pi && max_ind_j >= 0; pid--) { + for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid--) { /* Get a hold of the ith part in ci. */ struct part *restrict pi = &parts_i[sort_i[pid].i]; @@ -1580,6 +1483,7 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell } vector pjx, pjy, pjz; + vector pjvx, pjvy, pjvz, mj; /* Loop over the parts in cj. */ for (int pjd = 0; pjd < exit_iteration_align; pjd += VEC_SIZE) { @@ -1593,10 +1497,10 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell pjx.v = vec_load(&cj_cache.x[cj_cache_idx]); pjy.v = vec_load(&cj_cache.y[cj_cache_idx]); pjz.v = vec_load(&cj_cache.z[cj_cache_idx]); - //pjvx.v = vec_load(&cj_cache.vx[cj_cache_idx]); - //pjvy.v = vec_load(&cj_cache.vy[cj_cache_idx]); - //pjvz.v = vec_load(&cj_cache.vz[cj_cache_idx]); - //mj.v = vec_load(&cj_cache.m[cj_cache_idx]); + pjvx.v = vec_load(&cj_cache.vx[cj_cache_idx]); + pjvy.v = vec_load(&cj_cache.vy[cj_cache_idx]); + pjvz.v = vec_load(&cj_cache.vz[cj_cache_idx]); + mj.v = vec_load(&cj_cache.m[cj_cache_idx]); /* Compute the pairwise distance. */ v_dx.v = vec_sub(pix.v, pjx.v); @@ -1616,19 +1520,50 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell /* Form integer mask. */ doi_mask = vec_cmp_result(v_doi_mask.v); - if(doi_mask) - runner_iact_nonsym_intrinsic_vec_density( - &v_r2, &v_dx, &v_dy,&v_dz, v_hi_inv, v_vix, v_viy, v_viz, - &cj_cache.vx[cj_cache_idx], &cj_cache.vy[cj_cache_idx], &cj_cache.vz[cj_cache_idx], - &cj_cache.m[cj_cache_idx], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, - &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_doi_mask, + /* If there are any interactions left pack interaction values into c2 + * cache. */ + if (doi_mask) + storeInteractions(doi_mask, cj_cache_idx, &v_r2, &v_dx, &v_dy, &v_dz, + &mj, &pjvx, &pjvy, &pjvz, &cj_cache, &int_cache, + &icount, &rhoSum, &rho_dhSum, &wcountSum, + &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, + &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); + + } /* loop over the parts in cj. */ + + /* Perform padded vector remainder interactions if any are present. */ + calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum, + &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, + &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz, + &icount_align); + + /* Initialise masks to true in case remainder interactions have been + * performed. */ + vector int_mask, int_mask2; #ifdef HAVE_AVX512_F - knl_mask); + KNL_MASK_16 knl_mask = 0xFFFF; + KNL_MASK_16 knl_mask2 = 0xFFFF; + int_mask.m = vec_setint1(0xFFFFFFFF); + int_mask2.m = vec_setint1(0xFFFFFFFF); #else - 0); + int_mask.m = vec_setint1(0xFFFFFFFF); + int_mask2.m = vec_setint1(0xFFFFFFFF); #endif - - } /* loop over the parts in cj. */ + + /* Perform interaction with 2 vectors. */ + for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) { + runner_iact_nonsym_2_vec_density( + &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd], + &int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz, + &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd], + &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, + &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2, +#ifdef HAVE_AVX512_F + knl_mask, knl_mask2); +#else + 0, 0); +#endif + } /* Perform horizontal adds on vector sums and store result in particle pi. */ @@ -1641,10 +1576,42 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell VEC_HADD(curlvySum, pi->density.rot_v[1]); VEC_HADD(curlvzSum, pi->density.rot_v[2]); + if(face) { + faceIntCount += icount; + fprintf(faceIntFile,"%d\n",icount); + numFaceTested++; + } + else if(edge) { + edgeIntCount += icount; + fprintf(edgeIntFile,"%d\n",icount); + numEdgeTested++; + } + else if(corner) { + cornerIntCount += icount; + fprintf(cornerIntFile,"%d\n",icount); + numCornerTested++; + } + + icount = 0; + } /* loop over the parts in ci. */ + if(face) { + faceCtr++; + message("Total number of face interactions: %d, average per particle: %f, number tested: %d.", faceIntCount, ((float)faceIntCount) / ((float)numFaceTested), numFaceTested); + } + else if(edge) { + edgeCtr++; + message("Total number of edge interactions: %d, average per particle: %f, number tested: %d", edgeIntCount, ((float)edgeIntCount) / ((float)numEdgeTested), numEdgeTested); + } + else if(corner) { + cornerCtr++; + message("Total number of corner interactions: %d, average per particle: %f, number tested: %d", cornerIntCount, ((float)cornerIntCount) / ((float)numCornerTested), numCornerTested); + } + + int max_ind_i = 0; /* Loop over the parts in cj. */ - for (int pjd = 0; pjd <= last_pj && max_ind_i < count_i; pjd++) { + for (int pjd = 0; pjd < count_j && max_ind_i < count_i; pjd++) { /* Get a hold of the jth part in cj. */ struct part *restrict pj = &parts_j[sort_j[pjd].i]; @@ -1708,10 +1675,11 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell } vector pix, piy, piz; - //vector pivx, pivy, pivz, mi; + vector pivx, pivy, pivz, mi; /* Loop over the parts in ci. */ - for (int pid = exit_iteration_align; pid < count_i; pid += VEC_SIZE) { + //for (int pid = count_i - 1; pid >= 0; pid -= VEC_SIZE) { + for (int pid = exit_iteration_align; pid < count_i; pid += (num_vec_proc * VEC_SIZE)) { /* Get the cache index to the ith particle. */ int ci_cache_idx = pid; //sort_i[pid].i; @@ -1722,10 +1690,10 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell pix.v = vec_load(&ci_cache->x[ci_cache_idx]); piy.v = vec_load(&ci_cache->y[ci_cache_idx]); piz.v = vec_load(&ci_cache->z[ci_cache_idx]); - //pivx.v = vec_load(&ci_cache->vx[ci_cache_idx]); - //pivy.v = vec_load(&ci_cache->vy[ci_cache_idx]); - //pivz.v = vec_load(&ci_cache->vz[ci_cache_idx]); - //mi.v = vec_load(&ci_cache->m[ci_cache_idx]); + pivx.v = vec_load(&ci_cache->vx[ci_cache_idx]); + pivy.v = vec_load(&ci_cache->vy[ci_cache_idx]); + pivz.v = vec_load(&ci_cache->vz[ci_cache_idx]); + mi.v = vec_load(&ci_cache->m[ci_cache_idx]); /* Compute the pairwise distance. */ v_dx.v = vec_sub(pjx.v, pix.v); @@ -1745,20 +1713,50 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell /* Form integer mask. */ doj_mask = vec_cmp_result(v_doj_mask.v); - /* Perform interaction with 2 vectors. */ + /* If there are any interactions left pack interaction values into c2 + * cache. */ if (doj_mask) - runner_iact_nonsym_intrinsic_vec_density( - &v_r2, &v_dx, &v_dy, &v_dz, v_hj_inv, v_vjx, v_vjy, v_vjz, - &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx], &ci_cache->vz[ci_cache_idx], - &ci_cache->m[ci_cache_idx], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, - &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_doj_mask, + storeInteractions(doj_mask, ci_cache_idx, &v_r2, &v_dx, &v_dy, &v_dz, + &mi, &pivx, &pivy, &pivz, ci_cache, &int_cache, + &icount, &rhoSum, &rho_dhSum, &wcountSum, + &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, + &curlvzSum, v_hj_inv, v_vjx, v_vjy, v_vjz); + + } /* loop over the parts in cj. */ + + /* Perform padded vector remainder interactions if any are present. */ + calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum, + &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, + &curlvySum, &curlvzSum, v_hj_inv, v_vjx, v_vjy, v_vjz, + &icount_align); + + /* Initialise masks to true in case remainder interactions have been + * performed. */ + vector int_mask, int_mask2; #ifdef HAVE_AVX512_F - knl_mask); + KNL_MASK_16 knl_mask = 0xFFFF; + KNL_MASK_16 knl_mask2 = 0xFFFF; + int_mask.m = vec_setint1(0xFFFFFFFF); + int_mask2.m = vec_setint1(0xFFFFFFFF); #else - 0); + int_mask.m = vec_setint1(0xFFFFFFFF); + int_mask2.m = vec_setint1(0xFFFFFFFF); #endif - - } /* loop over the parts in cj. */ + + /* Perform interaction with 2 vectors. */ + for (int pjd = 0; pjd < icount_align; pjd += (NUM_VEC_PROC * VEC_SIZE)) { + runner_iact_nonsym_2_vec_density( + &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd], + &int_cache.dzq[pjd], v_hj_inv, v_vjx, v_vjy, v_vjz, + &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd], + &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, + &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2, +#ifdef HAVE_AVX512_F + knl_mask, knl_mask2); +#else + 0, 0); +#endif + } /* Perform horizontal adds on vector sums and store result in particle pi. */ @@ -1771,6 +1769,8 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell VEC_HADD(curlvySum, pj->density.rot_v[1]); VEC_HADD(curlvzSum, pj->density.rot_v[2]); + icount = 0; + } /* loop over the parts in ci. */ TIMER_TOC(timer_dopair_density); diff --git a/src/runner_doiact_vec.h b/src/runner_doiact_vec.h index 42b5cc2587fa325f880f74e7e0e213f9d4ba0137..3c07645dffe7e761087b5e1d0dca0d86711acf95 100644 --- a/src/runner_doiact_vec.h +++ b/src/runner_doiact_vec.h @@ -38,5 +38,6 @@ void runner_doself1_density_vec(struct runner *r, struct cell *restrict c); void runner_doself1_density_vec_2(struct runner *r, struct cell *restrict c); void runner_dopair1_density_vec(struct runner *r, struct cell *restrict ci, struct cell *restrict cj); +void runner_dopair1_density_vec_2(struct runner *r, struct cell *restrict ci, struct cell *restrict cj); #endif /* SWIFT_RUNNER_VEC_H */