diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index 5ecb0f7cd630162c6b34fd17f9bcc5264c23e76a..0c179bdcf41fad2916433c147d3cdd5a9dffbbe6 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -287,6 +287,71 @@ __attribute__((always_inline)) INLINE static void populate_max_d(const struct ce } } +__attribute__((always_inline)) INLINE static void populate_max_d_no_cache(const struct cell *ci, const struct cell *cj, const struct entry *restrict sort_i, const struct entry *restrict sort_j, const float dx_max, const float rshift, float *max_di, float *max_dj, int *init_ci, int *init_cj) { + + struct part *restrict parts_i = ci->parts; + struct part *restrict parts_j = cj->parts; + struct part *p = &parts_i[sort_i[0].i]; + + float h = p->h; + float d = sort_i[0].d; + + const float di_max = sort_i[ci->count - 1].d - rshift; + const float dj_min = sort_j[0].d; + + int first_pi = 0, last_pj = cj->count - 1; + int found_pi = 0, found_pj = 0; + + /* For particles in ci */ + max_di[0] = d + h * kernel_gamma + dx_max - rshift; + + if(max_di[0] >= dj_min) found_pi = 1; + + for (int k = 1; k < ci->count; k++) { + p = &parts_i[sort_i[k].i]; + h = p->h; + d = sort_i[k].d + h * kernel_gamma + dx_max - rshift; + + max_di[k] = fmaxf(max_di[k - 1], d); + + /* Find the first particle in ci to interact with any particle in cj. */ + if(!found_pi) { + if(d >= dj_min) { + first_pi = k; + found_pi = 1; + } + } + } + + /* For particles in cj */ + p = &parts_j[sort_j[0].i]; + h = p->h; + max_dj[0] = sort_j[0].d - h * kernel_gamma - dx_max - rshift; + + for (int k = 1; k < cj->count; k++) { + p = &parts_j[sort_j[k].i]; + h = p->h; + d = sort_j[k].d - h * kernel_gamma - dx_max - rshift; + + max_dj[k] = fmaxf(max_dj[k - 1], d); + } + + /* Find the last particle in cj to interact with any particle in ci. */ + for (int k = cj->count - 1; k >= 0; k--) { + p = &parts_j[sort_j[k].i]; + h = p->h; + d = sort_j[k].d - h * kernel_gamma - dx_max - rshift; + + if(d <= di_max) { + last_pj = k; + found_pj = 1; + break; + } + } + + *init_ci = first_pi; + *init_cj = last_pj; +} #endif /* WITH_VECTORIZATION */ /** @@ -537,7 +602,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( icount = 0; } /* loop over all particles. */ - message("Total number of self interactions: %d, average per particle: %f.", intCount, ((float)intCount) / ((float)count)); + //message("Total number of self interactions: %d, average per particle: %f.", intCount, ((float)intCount) / ((float)count)); TIMER_TOC(timer_doself_density); #endif /* WITH_VECTORIZATION */ @@ -997,7 +1062,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, struct cell * cache_init(&cj_cache, count_j); } - //cache_read_two_cells(ci, cj, ci_cache, &cj_cache, shift); cache_read_two_cells_sorted(ci, cj, ci_cache, &cj_cache, sort_i, sort_j, shift); /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */ @@ -1422,19 +1486,38 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell cache_init(&cj_cache, count_j); } - //cache_read_two_cells(ci, cj, ci_cache, &cj_cache, shift); - cache_read_two_cells_sorted(ci, cj, ci_cache, &cj_cache, sort_i, sort_j, shift); - + int first_pi, last_pj; + /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */ /* For particles in ci */ - populate_max_d(ci, cj, sort_i, sort_j, ci_cache, &cj_cache, dx_max, rshift, max_di, max_dj); + populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, max_di, max_dj, &first_pi, &last_pj); float di, dj; int max_ind_j = count_j - 1; + int max_ind_i = 0; + + dj = sort_j[max_ind_j].d; + while(max_ind_j > 0 && max_di[count_i - 1] < dj) { + max_ind_j--; + + dj = sort_j[max_ind_j].d; + } + + di = sort_i[max_ind_i].d; + while(max_ind_i < count_i - 1 && max_dj[0] > di) { + max_ind_i++; + + di = sort_i[max_ind_i].d; + } + + last_pj = max(last_pj, max_ind_j); + first_pi = min(first_pi, max_ind_i); + + cache_read_two_cells_sorted_2(ci, cj, ci_cache, &cj_cache, sort_i, sort_j, shift, first_pi, last_pj, num_vec_proc); /* Loop over the parts in ci. */ - for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid--) { + for (int pid = count_i - 1; pid >= first_pi && max_ind_j >= 0; pid--) { /* Get a hold of the ith part in ci. */ struct part *restrict pi = &parts_i[sort_i[pid].i]; @@ -1560,9 +1643,8 @@ void runner_dopair1_density_vec_1(struct runner *r, struct cell *ci, struct cell } /* loop over the parts in ci. */ - int max_ind_i = 0; /* Loop over the parts in cj. */ - for (int pjd = 0; pjd < count_j && max_ind_i < count_i; pjd++) { + for (int pjd = 0; pjd <= last_pj && max_ind_i < count_i; pjd++) { /* Get a hold of the jth part in cj. */ struct part *restrict pj = &parts_j[sort_j[pjd].i]; @@ -1746,16 +1828,6 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell * the cache if it is not big enough for the cell. */ struct cache *restrict ci_cache = &r->par_cache; - //for(int i=0; i<ci->count; i++) - // message("i:%d di: %f",i,sort_i[i].d); - // - //message("Max distance into cj: %f", sort_i[ci->count - 1].d + ci->h_max); - - //for(int i=0; i<cj->count; i++) - // message("i:%d dj: %f",i,sort_j[i].d); - // - //message("Max distance into ci: %f", sort_j[0].d - cj->h_max); - if (ci_cache->count < count_i) { cache_init(ci_cache, count_i); } @@ -1763,19 +1835,37 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell cache_init(&cj_cache, count_j); } - //cache_read_two_cells(ci, cj, ci_cache, &cj_cache, shift); - cache_read_two_cells_sorted(ci, cj, ci_cache, &cj_cache, sort_i, sort_j, shift); - + int first_pi, last_pj; /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */ /* For particles in ci */ - populate_max_d(ci, cj, sort_i, sort_j, ci_cache, &cj_cache, dx_max, rshift, max_di, max_dj); + populate_max_d_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, max_di, max_dj, &first_pi, &last_pj); float di, dj; int max_ind_j = count_j - 1; + int max_ind_i = 0; + + dj = sort_j[max_ind_j].d; + while(max_ind_j > 0 && max_di[count_i - 1] < dj) { + max_ind_j--; + + dj = sort_j[max_ind_j].d; + } + + di = sort_i[max_ind_i].d; + while(max_ind_i < count_i - 1 && max_dj[0] > di) { + max_ind_i++; + + di = sort_i[max_ind_i].d; + } + + last_pj = max(last_pj, max_ind_j); + first_pi = min(first_pi, max_ind_i); + + cache_read_two_cells_sorted_2(ci, cj, ci_cache, &cj_cache, sort_i, sort_j, shift, first_pi, last_pj, num_vec_proc); /* Loop over the parts in ci. */ - for (int pid = count_i - 1; pid >= 0 && max_ind_j >= 0; pid-=2) { + for (int pid = count_i - 1; pid >= first_pi && max_ind_j >= 0; pid-=2) { /* Get a hold of the ith part in ci. */ struct part *restrict pi = &parts_i[sort_i[pid].i]; @@ -2009,9 +2099,8 @@ void runner_dopair1_density_vec_2(struct runner *r, struct cell *ci, struct cell } /* loop over the parts in ci. */ - int max_ind_i = 0; /* Loop over the parts in cj. */ - for (int pjd = 0; pjd < count_j && max_ind_i < count_i; pjd+=2) { + for (int pjd = 0; pjd <= last_pj && max_ind_i < count_i; pjd+=2) { /* Get a hold of the jth part in cj. */ struct part *restrict pj = &parts_j[sort_j[pjd].i];