diff --git a/src/runner_doiact.h b/src/runner_doiact.h index 7fe19af853601e49de553a55ee205c654167d67e..c603d4ae4927183ad492244138e4217f833e1fea 100644 --- a/src/runner_doiact.h +++ b/src/runner_doiact.h @@ -1,6 +1,7 @@ /******************************************************************************* * This file is part of SWIFT. * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) + * 2016 Matthieu Schaller (matthieu.schaller@durham.ac.uk) * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published @@ -77,6 +78,12 @@ #define _IACT(f) PASTE(runner_iact, f) #define IACT _IACT(FUNCTION) +#define _IACT_NONSYM_VEC(f) PASTE(runner_iact_nonsym_vec, f) +#define IACT_NONSYM_VEC _IACT_NONSYM_VEC(FUNCTION) + +#define _IACT_VEC(f) PASTE(runner_iact_vec, f) +#define IACT_VEC _IACT_VEC(FUNCTION) + #define _TIMER_DOSELF(f) PASTE(timer_doself, f) #define TIMER_DOSELF _TIMER_DOSELF(FUNCTION) @@ -95,12 +102,6 @@ #define _TIMER_DOPAIR_SUBSET(f) PASTE(timer_dopair_subset, f) #define TIMER_DOPAIR_SUBSET _TIMER_DOPAIR_SUBSET(FUNCTION) -#define _IACT_NONSYM_VEC(f) PASTE(runner_iact_nonsym_vec, f) -#define IACT_NONSYM_VEC _IACT_NONSYM_VEC(FUNCTION) - -#define _IACT_VEC(f) PASTE(runner_iact_vec, f) -#define IACT_VEC _IACT_VEC(FUNCTION) - /** * @brief Compute the interactions between a cell pair. * @@ -108,18 +109,14 @@ * @param ci The first #cell. * @param cj The second #cell. */ - void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci, struct cell *restrict cj) { - struct engine *e = r->e; - int pid, pjd, k, count_i = ci->count, count_j = cj->count; - double shift[3] = {0.0, 0.0, 0.0}; - struct part *restrict parts_i = ci->parts, *restrict parts_j = cj->parts; - struct part *restrict pi, *restrict pj; - double pix[3]; - float dx[3], hi, hig2, r2; + const struct engine *e = r->e; const int ti_current = e->ti_current; + + error("Don't use in actual runs ! Slow code !"); + #ifdef VECTORIZE int icount = 0; float r2q[VEC_SIZE] __attribute__((aligned(16))); @@ -128,44 +125,47 @@ void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci, float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; #endif + TIMER_TIC /* Anything to do here? */ if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return; + const int count_i = ci->count; + const int count_j = cj->count; + struct part *restrict parts_i = ci->parts; + struct part *restrict parts_j = cj->parts; + /* Get the relative distance between the pairs, wrapping. */ - for (k = 0; k < 3; k++) { + double shift[3] = {0.0, 0.0, 0.0}; + for (int k = 0; k < 3; k++) { if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2) shift[k] = e->s->dim[k]; else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2) shift[k] = -e->s->dim[k]; } - /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with - %i/%i parts and shift = [ %g %g %g ].\n" , - ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , - cj->loc[2] , - ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); - tic = getticks(); */ - /* Loop over the parts in ci. */ - for (pid = 0; pid < count_i; pid++) { + for (int pid = 0; pid < count_i; pid++) { /* Get a hold of the ith part in ci. */ - pi = &parts_i[pid]; - for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; + struct part *restrict pi = &parts_i[pid]; + const float hi = pi->h; + + double pix[3]; + for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + const float hig2 = hi * hi * kernel_gamma2; /* Loop over the parts in cj. */ - for (pjd = 0; pjd < count_j; pjd++) { + for (int pjd = 0; pjd < count_j; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts_j[pjd]; + struct part *restrict pj = &parts_j[pjd]; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pix[k] - pj->x[k]; r2 += dx[k] * dx[k]; } @@ -206,7 +206,7 @@ void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci, #ifdef VECTORIZE /* Pick up any leftovers. */ if (icount > 0) - for (k = 0; k < icount; k++) + for (int k = 0; k < icount; k++) IACT(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]); #endif @@ -215,12 +215,10 @@ void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci, void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) { - int pid, pjd, k, count = c->count; - struct part *restrict parts = c->parts; - struct part *restrict pi, *restrict pj; - double pix[3] = {0.0, 0.0, 0.0}; - float dx[3], hi, hig2, r2; const int ti_current = r->e->ti_current; + + error("Don't use in actual runs ! Slow code !"); + #ifdef VECTORIZE int icount = 0; float r2q[VEC_SIZE] __attribute__((aligned(16))); @@ -229,38 +227,34 @@ void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) { float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; #endif + TIMER_TIC /* Anything to do here? */ if (c->ti_end_min > ti_current) return; - /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with - %i/%i parts and shift = [ %g %g %g ].\n" , - ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , - cj->loc[2] , - ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); - tic = getticks(); */ + const int count = c->count; + struct part *restrict parts = c->parts; /* Loop over the parts in ci. */ - for (pid = 0; pid < count; pid++) { + for (int pid = 0; pid < count; pid++) { /* Get a hold of the ith part in ci. */ - pi = &parts[pid]; - pix[0] = pi->x[0]; - pix[1] = pi->x[1]; - pix[2] = pi->x[2]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; + struct part *restrict pi = &parts[pid]; + const double pix[3] = {pi->x[0], pi->x[1], pi->x[2]}; + const float hi = pi->h; + const float hig2 = hi * hi * kernel_gamma2; /* Loop over the parts in cj. */ - for (pjd = pid + 1; pjd < count; pjd++) { + for (int pjd = pid + 1; pjd < count; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts[pjd]; + struct part *restrict pj = &parts[pjd]; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pix[k] - pj->x[k]; r2 += dx[k] * dx[k]; } @@ -301,7 +295,7 @@ void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) { #ifdef VECTORIZE /* Pick up any leftovers. */ if (icount > 0) - for (k = 0; k < icount; k++) + for (int k = 0; k < icount; k++) IACT(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]); #endif @@ -319,18 +313,121 @@ void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) { * @param count The number of particles in @c ind. * @param cj The second #cell. */ +void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci, + struct part *restrict parts_i, int *restrict ind, + int count, struct cell *restrict cj) { + + struct engine *e = r->e; + + error("Don't use in actual runs ! Slow code !"); + +#ifdef VECTORIZE + int icount = 0; + float r2q[VEC_SIZE] __attribute__((aligned(16))); + float hiq[VEC_SIZE] __attribute__((aligned(16))); + float hjq[VEC_SIZE] __attribute__((aligned(16))); + float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); + struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; +#endif + + TIMER_TIC + + const int count_j = cj->count; + struct part *restrict parts_j = cj->parts; + + /* Get the relative distance between the pairs, wrapping. */ + double shift[3] = {0.0, 0.0, 0.0}; + for (int k = 0; k < 3; k++) { + if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2) + shift[k] = e->s->dim[k]; + else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2) + shift[k] = -e->s->dim[k]; + } + + /* Loop over the parts_i. */ + for (int pid = 0; pid < count; pid++) { + + /* Get a hold of the ith part in ci. */ + struct part *restrict pi = &parts_i[ind[pid]]; + double pix[3]; + for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + const float hi = pi->h; + const float hig2 = hi * hi * kernel_gamma2; + + /* Loop over the parts in cj. */ + for (int pjd = 0; pjd < count_j; pjd++) { + + /* Get a pointer to the jth particle. */ + struct part *restrict pj = &parts_j[pjd]; + + /* Compute the pairwise distance. */ + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { + dx[k] = pix[k] - pj->x[k]; + r2 += dx[k] * dx[k]; + } + /* Hit or miss? */ + if (r2 < hig2) { + +#ifndef VECTORIZE + + IACT_NONSYM(r2, dx, hi, pj->h, pi, pj); + +#else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3 * icount + 0] = dx[0]; + dxq[3 * icount + 1] = dx[1]; + dxq[3 * icount + 2] = dx[2]; + hiq[icount] = hi; + hjq[icount] = pj->h; + piq[icount] = pi; + pjq[icount] = pj; + icount += 1; + + /* Flush? */ + if (icount == VEC_SIZE) { + IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq); + icount = 0; + } + +#endif + } + + } /* loop over the parts in cj. */ + + } /* loop over the parts in ci. */ + +#ifdef VECTORIZE + /* Pick up any leftovers. */ + if (icount > 0) + for (int k = 0; k < icount; k++) + IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]); +#endif + + TIMER_TOC(timer_dopair_subset); +} + +/** + * @brief Compute the interactions between a cell pair, but only for the + * given indices in ci. + * + * @param r The #runner. + * @param ci The first #cell. + * @param parts_i The #part to interact with @c cj. + * @param ind The list of indices of particles in @c ci to interact with. + * @param count The number of particles in @c ind. + * @param cj The second #cell. + */ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, struct part *restrict parts_i, int *restrict ind, int count, struct cell *restrict cj) { struct engine *e = r->e; - int pid, pjd, sid, k, count_j = cj->count, flipped; - double shift[3] = {0.0, 0.0, 0.0}; - struct part *restrict pi, *restrict pj, *restrict parts_j = cj->parts; - double pix[3]; - float dx[3], hi, hig2, r2, di, dxj; - struct entry *sort_j; + #ifdef VECTORIZE int icount = 0; float r2q[VEC_SIZE] __attribute__((aligned(16))); @@ -339,10 +436,15 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; #endif + TIMER_TIC + const int count_j = cj->count; + struct part *restrict parts_j = cj->parts; + /* Get the relative distance between the pairs, wrapping. */ - for (k = 0; k < 3; k++) { + double shift[3] = {0.0, 0.0, 0.0}; + for (int k = 0; k < 3; k++) { if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2) shift[k] = e->s->dim[k]; else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2) @@ -350,52 +452,50 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, } /* Get the sorting index. */ - for (sid = 0, k = 0; k < 3; k++) + int sid = 0; + for (int k = 0; k < 3; k++) sid = 3 * sid + ((cj->loc[k] - ci->loc[k] + shift[k] < 0) ? 0 : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1); /* Switch the cells around? */ - flipped = runner_flip[sid]; + const int flipped = runner_flip[sid]; sid = sortlistID[sid]; /* Have the cells been sorted? */ if (!(cj->sorted & (1 << sid))) error("Trying to interact unsorted cells."); - /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with - %i/%i parts and shift = [ %g %g %g ].\n" , - ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , - cj->loc[2] , - ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); - tic = getticks(); */ - /* Pick-out the sorted lists. */ - sort_j = &cj->sort[sid * (cj->count + 1)]; - dxj = cj->dx_max; + const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)]; + const float dxj = cj->dx_max; /* Parts are on the left? */ if (!flipped) { /* Loop over the parts_i. */ - for (pid = 0; pid < count; pid++) { + for (int pid = 0; pid < count; pid++) { /* Get a hold of the ith part in ci. */ - pi = &parts_i[ind[pid]]; - for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; - di = hi * kernel_gamma + dxj + pix[0] * runner_shift[sid][0] + - pix[1] * runner_shift[sid][1] + pix[2] * runner_shift[sid][2]; + struct part *restrict pi = &parts_i[ind[pid]]; + double pix[3]; + for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + + const float hi = pi->h; + const float hig2 = hi * hi * kernel_gamma2; + const float di = hi * kernel_gamma + dxj + pix[0] * runner_shift[sid][0] + + pix[1] * runner_shift[sid][1] + + pix[2] * runner_shift[sid][2]; /* Loop over the parts in cj. */ - for (pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) { + for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts_j[sort_j[pjd].i]; + struct part *restrict pj = &parts_j[sort_j[pjd].i]; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pix[k] - pj->x[k]; r2 += dx[k] * dx[k]; } @@ -439,25 +539,28 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, else { /* Loop over the parts_i. */ - for (pid = 0; pid < count; pid++) { + for (int pid = 0; pid < count; pid++) { /* Get a hold of the ith part in ci. */ - pi = &parts_i[ind[pid]]; - for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; - di = -hi * kernel_gamma - dxj + pix[0] * runner_shift[sid][0] + - pix[1] * runner_shift[sid][1] + pix[2] * runner_shift[sid][2]; + struct part *restrict pi = &parts_i[ind[pid]]; + double pix[3]; + for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + const float hi = pi->h; + const float hig2 = hi * hi * kernel_gamma2; + const float di = + -hi * kernel_gamma - dxj + pix[0] * runner_shift[sid][0] + + pix[1] * runner_shift[sid][1] + pix[2] * runner_shift[sid][2]; /* Loop over the parts in cj. */ - for (pjd = count_j - 1; pjd >= 0 && di < sort_j[pjd].d; pjd--) { + for (int pjd = count_j - 1; pjd >= 0 && di < sort_j[pjd].d; pjd--) { /* Get a pointer to the jth particle. */ - pj = &parts_j[sort_j[pjd].i]; + struct part *restrict pj = &parts_j[sort_j[pjd].i]; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pix[k] - pj->x[k]; r2 += dx[k] * dx[k]; } @@ -499,119 +602,7 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, #ifdef VECTORIZE /* Pick up any leftovers. */ if (icount > 0) - for (k = 0; k < icount; k++) - IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]); -#endif - - TIMER_TOC(timer_dopair_subset); -} - -/** - * @brief Compute the interactions between a cell pair, but only for the - * given indices in ci. - * - * @param r The #runner. - * @param ci The first #cell. - * @param parts_i The #part to interact with @c cj. - * @param ind The list of indices of particles in @c ci to interact with. - * @param count The number of particles in @c ind. - * @param cj The second #cell. - */ - -void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci, - struct part *restrict parts_i, int *restrict ind, - int count, struct cell *restrict cj) { - - struct engine *e = r->e; - int pid, pjd, k, count_j = cj->count; - double shift[3] = {0.0, 0.0, 0.0}; - struct part *restrict pi, *restrict pj, *restrict parts_j = cj->parts; - double pix[3]; - float dx[3], hi, hig2, r2; -#ifdef VECTORIZE - int icount = 0; - float r2q[VEC_SIZE] __attribute__((aligned(16))); - float hiq[VEC_SIZE] __attribute__((aligned(16))); - float hjq[VEC_SIZE] __attribute__((aligned(16))); - float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); - struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; -#endif - TIMER_TIC - - /* Get the relative distance between the pairs, wrapping. */ - for (k = 0; k < 3; k++) { - if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2) - shift[k] = e->s->dim[k]; - else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2) - shift[k] = -e->s->dim[k]; - } - - /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with - %i/%i parts and shift = [ %g %g %g ].\n" , - ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , - cj->loc[2] , - ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); - tic = getticks(); */ - - /* Loop over the parts_i. */ - for (pid = 0; pid < count; pid++) { - - /* Get a hold of the ith part in ci. */ - pi = &parts_i[ind[pid]]; - for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; - - /* Loop over the parts in cj. */ - for (pjd = 0; pjd < count_j; pjd++) { - - /* Get a pointer to the jth particle. */ - pj = &parts_j[pjd]; - - /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { - dx[k] = pix[k] - pj->x[k]; - r2 += dx[k] * dx[k]; - } - - /* Hit or miss? */ - if (r2 < hig2) { - -#ifndef VECTORIZE - - IACT_NONSYM(r2, dx, hi, pj->h, pi, pj); - -#else - - /* Add this interaction to the queue. */ - r2q[icount] = r2; - dxq[3 * icount + 0] = dx[0]; - dxq[3 * icount + 1] = dx[1]; - dxq[3 * icount + 2] = dx[2]; - hiq[icount] = hi; - hjq[icount] = pj->h; - piq[icount] = pi; - pjq[icount] = pj; - icount += 1; - - /* Flush? */ - if (icount == VEC_SIZE) { - IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq); - icount = 0; - } - -#endif - } - - } /* loop over the parts in cj. */ - - } /* loop over the parts in ci. */ - -#ifdef VECTORIZE - /* Pick up any leftovers. */ - if (icount > 0) - for (k = 0; k < icount; k++) + for (int k = 0; k < icount; k++) IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]); #endif @@ -628,15 +619,9 @@ void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci, * @param ind The list of indices of particles in @c ci to interact with. * @param count The number of particles in @c ind. */ - void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci, struct part *restrict parts, int *restrict ind, int count) { - int pid, pjd, k, count_i = ci->count; - struct part *restrict parts_j = ci->parts; - struct part *restrict pi, *restrict pj; - double pix[3] = {0.0, 0.0, 0.0}; - float dx[3], hi, hig2, r2; #ifdef VECTORIZE int icount = 0; float r2q[VEC_SIZE] __attribute__((aligned(16))); @@ -645,35 +630,31 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci, float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; #endif + TIMER_TIC - /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with - %i/%i parts and shift = [ %g %g %g ].\n" , - ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , - cj->loc[2] , - ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); - tic = getticks(); */ + const int count_i = ci->count; + struct part *restrict parts_j = ci->parts; /* Loop over the parts in ci. */ - for (pid = 0; pid < count; pid++) { + for (int pid = 0; pid < count; pid++) { /* Get a hold of the ith part in ci. */ - pi = &parts[ind[pid]]; - pix[0] = pi->x[0]; - pix[1] = pi->x[1]; - pix[2] = pi->x[2]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; + struct part *restrict pi = &parts[ind[pid]]; + const double pix[3] = {pi->x[0], pi->x[1], pi->x[2]}; + const float hi = pi->h; + const float hig2 = hi * hi * kernel_gamma2; /* Loop over the parts in cj. */ - for (pjd = 0; pjd < count_i; pjd++) { + for (int pjd = 0; pjd < count_i; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts_j[pjd]; + struct part *restrict pj = &parts_j[pjd]; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pix[k] - pj->x[k]; r2 += dx[k] * dx[k]; } @@ -714,7 +695,7 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci, #ifdef VECTORIZE /* Pick up any leftovers. */ if (icount > 0) - for (k = 0; k < icount; k++) + for (int k = 0; k < icount; k++) IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]); #endif @@ -722,26 +703,17 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci, } /** - * @brief Compute the interactions between a cell pair. + * @brief Compute the interactions between a cell pair (non-symmetric). * * @param r The #runner. * @param ci The first #cell. * @param cj The second #cell. */ - void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) { struct engine *restrict e = r->e; - int pid, pjd, k, sid; - double rshift, shift[3] = {0.0, 0.0, 0.0}; - struct entry *restrict sort_i, *restrict sort_j; - struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j; - double pix[3], pjx[3], di, dj; - float dx[3], hi, hig2, hj, hjg2, r2, dx_max; - double hi_max, hj_max; - double di_max, dj_min; - int count_i, count_j; const int ti_current = e->ti_current; + #ifdef VECTORIZE int icount = 0; float r2q[VEC_SIZE] __attribute__((aligned(16))); @@ -750,60 +722,64 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) { float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; #endif + TIMER_TIC /* Anything to do here? */ if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return; /* Get the sort ID. */ - sid = space_getsid(e->s, &ci, &cj, shift); + double shift[3] = {0.0, 0.0, 0.0}; + const int sid = space_getsid(e->s, &ci, &cj, shift); /* Have the cells been sorted? */ if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid))) error("Trying to interact unsorted cells."); /* Get the cutoff shift. */ - for (rshift = 0.0, k = 0; k < 3; k++) - rshift += shift[k] * runner_shift[sid][k]; + double rshift = 0.0; + for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k]; /* Pick-out the sorted lists. */ - sort_i = &ci->sort[sid * (ci->count + 1)]; - sort_j = &cj->sort[sid * (cj->count + 1)]; + const struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)]; + const struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)]; /* Get some other useful values. */ - hi_max = ci->h_max * kernel_gamma - rshift; - hj_max = cj->h_max * kernel_gamma; - count_i = ci->count; - count_j = cj->count; - parts_i = ci->parts; - parts_j = cj->parts; - di_max = sort_i[count_i - 1].d - rshift; - dj_min = sort_j[0].d; - dx_max = (ci->dx_max + cj->dx_max); + const double hi_max = ci->h_max * kernel_gamma - rshift; + const double hj_max = cj->h_max * kernel_gamma; + const int count_i = ci->count; + const int count_j = cj->count; + struct part *restrict parts_i = ci->parts; + struct part *restrict parts_j = cj->parts; + const double di_max = sort_i[count_i - 1].d - rshift; + const double dj_min = sort_j[0].d; + const float dx_max = (ci->dx_max + cj->dx_max); /* Loop over the parts in ci. */ - for (pid = count_i - 1; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; - pid--) { + for (int pid = count_i - 1; + pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) { /* Get a hold of the ith part in ci. */ - pi = &parts_i[sort_i[pid].i]; + struct part *restrict pi = &parts_i[sort_i[pid].i]; if (pi->ti_end > ti_current) continue; - hi = pi->h; - di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift; + const float hi = pi->h; + const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift; if (di < dj_min) continue; - hig2 = hi * hi * kernel_gamma2; - for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + double pix[3]; + for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + const float hig2 = hi * hi * kernel_gamma2; /* Loop over the parts in cj. */ - for (pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) { + for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts_j[sort_j[pjd].i]; + struct part *restrict pj = &parts_j[sort_j[pjd].i]; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pix[k] - pj->x[k]; r2 += dx[k] * dx[k]; } @@ -841,33 +817,31 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) { } /* loop over the parts in ci. */ - /* printf( "runner_dopair: first half took %.3f %s...\n" , - clocks_from_ticks(getticks() - tic), clocks_getunit()); - tic = getticks(); */ - /* Loop over the parts in cj. */ - for (pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max; + for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max; pjd++) { /* Get a hold of the jth part in cj. */ - pj = &parts_j[sort_j[pjd].i]; + struct part *restrict pj = &parts_j[sort_j[pjd].i]; if (pj->ti_end > ti_current) continue; - hj = pj->h; - dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; + const float hj = pj->h; + const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; if (dj > di_max) continue; - for (k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k]; - hjg2 = hj * hj * kernel_gamma2; + double pjx[3]; + for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k]; + const float hjg2 = hj * hj * kernel_gamma2; /* Loop over the parts in ci. */ - for (pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) { + for (int pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) { /* Get a pointer to the jth particle. */ - pi = &parts_i[sort_i[pid].i]; + struct part *restrict pi = &parts_i[sort_i[pid].i]; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pjx[k] - pi->x[k]; r2 += dx[k] * dx[k]; } @@ -908,28 +882,25 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) { #ifdef VECTORIZE /* Pick up any leftovers. */ if (icount > 0) - for (k = 0; k < icount; k++) + for (int k = 0; k < icount; k++) IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]); #endif TIMER_TOC(TIMER_DOPAIR); } +/** + * @brief Compute the interactions between a cell pair (symmetric) + * + * @param r The #runner. + * @param ci The first #cell. + * @param cj The second #cell. + */ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { struct engine *restrict e = r->e; - int pid, pjd, k, sid; - double rshift, shift[3] = {0.0, 0.0, 0.0}; - struct entry *sort_i, *sort_j; - struct entry *sortdt_i = NULL, *sortdt_j = NULL; - int countdt_i = 0, countdt_j = 0; - struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j; - double pix[3], pjx[3], di, dj; - float dx[3], hi, hig2, hj, hjg2, r2, dx_max; - double hi_max, hj_max; - double di_max, dj_min; - int count_i, count_j; const int ti_current = e->ti_current; + #ifdef VECTORIZE int icount1 = 0; float r2q1[VEC_SIZE] __attribute__((aligned(16))); @@ -944,38 +915,42 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { float dxq2[3 * VEC_SIZE] __attribute__((aligned(16))); struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE]; #endif + TIMER_TIC /* Anything to do here? */ if (ci->ti_end_min > ti_current && cj->ti_end_min > ti_current) return; /* Get the shift ID. */ - sid = space_getsid(e->s, &ci, &cj, shift); + double shift[3] = {0.0, 0.0, 0.0}; + const int sid = space_getsid(e->s, &ci, &cj, shift); /* Have the cells been sorted? */ if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid))) error("Trying to interact unsorted cells."); /* Get the cutoff shift. */ - for (rshift = 0.0, k = 0; k < 3; k++) - rshift += shift[k] * runner_shift[sid][k]; + double rshift = 0.0; + for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k]; /* Pick-out the sorted lists. */ - sort_i = &ci->sort[sid * (ci->count + 1)]; - sort_j = &cj->sort[sid * (cj->count + 1)]; + struct entry *restrict sort_i = &ci->sort[sid * (ci->count + 1)]; + struct entry *restrict sort_j = &cj->sort[sid * (cj->count + 1)]; /* Get some other useful values. */ - hi_max = ci->h_max * kernel_gamma - rshift; - hj_max = cj->h_max * kernel_gamma; - count_i = ci->count; - count_j = cj->count; - parts_i = ci->parts; - parts_j = cj->parts; - di_max = sort_i[count_i - 1].d - rshift; - dj_min = sort_j[0].d; - dx_max = (ci->dx_max + cj->dx_max); + const double hi_max = ci->h_max * kernel_gamma - rshift; + const double hj_max = cj->h_max * kernel_gamma; + const int count_i = ci->count; + const int count_j = cj->count; + struct part *restrict parts_i = ci->parts; + struct part *restrict parts_j = cj->parts; + const double di_max = sort_i[count_i - 1].d - rshift; + const double dj_min = sort_j[0].d; + const double dx_max = (ci->dx_max + cj->dx_max); /* Collect the number of parts left and right below dt. */ + int countdt_i = 0, countdt_j = 0; + struct entry *restrict sortdt_i = NULL, *restrict sortdt_j = NULL; if (ci->ti_end_max <= ti_current) { sortdt_i = sort_i; countdt_i = count_i; @@ -983,7 +958,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { if ((sortdt_i = (struct entry *)alloca(sizeof(struct entry) * count_i)) == NULL) error("Failed to allocate dt sortlists."); - for (k = 0; k < count_i; k++) + for (int k = 0; k < count_i; k++) if (parts_i[sort_i[k].i].ti_end <= ti_current) { sortdt_i[countdt_i] = sort_i[k]; countdt_i += 1; @@ -996,7 +971,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { if ((sortdt_j = (struct entry *)alloca(sizeof(struct entry) * count_j)) == NULL) error("Failed to allocate dt sortlists."); - for (k = 0; k < count_j; k++) + for (int k = 0; k < count_j; k++) if (parts_j[sort_j[k].i].ti_end <= ti_current) { sortdt_j[countdt_j] = sort_j[k]; countdt_j += 1; @@ -1004,31 +979,33 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { } /* Loop over the parts in ci. */ - for (pid = count_i - 1; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; - pid--) { + for (int pid = count_i - 1; + pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) { /* Get a hold of the ith part in ci. */ - pi = &parts_i[sort_i[pid].i]; - hi = pi->h; - di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift; + struct part *restrict pi = &parts_i[sort_i[pid].i]; + const float hi = pi->h; + const double di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift; if (di < dj_min) continue; - hig2 = hi * hi * kernel_gamma2; - for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + double pix[3]; + for (int k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + const float hig2 = hi * hi * kernel_gamma2; /* Look at valid dt parts only? */ if (pi->ti_end > ti_current) { /* Loop over the parts in cj within dt. */ - for (pjd = 0; pjd < countdt_j && sortdt_j[pjd].d < di; pjd++) { + for (int pjd = 0; pjd < countdt_j && sortdt_j[pjd].d < di; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts_j[sortdt_j[pjd].i]; - hj = pj->h; + struct part *restrict pj = &parts_j[sortdt_j[pjd].i]; + const float hj = pj->h; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pj->x[k] - pix[k]; r2 += dx[k] * dx[k]; } @@ -1070,15 +1047,16 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { else { /* Loop over the parts in cj. */ - for (pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) { + for (int pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts_j[sort_j[pjd].i]; - hj = pj->h; + struct part *restrict pj = &parts_j[sort_j[pjd].i]; + const float hj = pj->h; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pix[k] - pj->x[k]; r2 += dx[k] * dx[k]; } @@ -1144,36 +1122,34 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { } /* loop over the parts in ci. */ - /* printf( "runner_dopair: first half took %.3f %s...\n" , - clocks_from_ticks(getticks() - tic), clocks_getunit()); - tic = getticks(); */ - /* Loop over the parts in cj. */ - for (pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max; + for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max; pjd++) { /* Get a hold of the jth part in cj. */ - pj = &parts_j[sort_j[pjd].i]; - hj = pj->h; - dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; + struct part *restrict pj = &parts_j[sort_j[pjd].i]; + const float hj = pj->h; + const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; if (dj > di_max) continue; - for (k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k]; - hjg2 = hj * hj * kernel_gamma2; + double pjx[3]; + for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k]; + const float hjg2 = hj * hj * kernel_gamma2; /* Is this particle outside the dt? */ if (pj->ti_end > ti_current) { /* Loop over the parts in ci. */ - for (pid = countdt_i - 1; pid >= 0 && sortdt_i[pid].d > dj; pid--) { + for (int pid = countdt_i - 1; pid >= 0 && sortdt_i[pid].d > dj; pid--) { /* Get a pointer to the jth particle. */ - pi = &parts_i[sortdt_i[pid].i]; - hi = pi->h; + struct part *restrict pi = &parts_i[sortdt_i[pid].i]; + const float hi = pi->h; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pi->x[k] - pjx[k]; r2 += dx[k] * dx[k]; } @@ -1214,15 +1190,16 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { else { /* Loop over the parts in ci. */ - for (pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) { + for (int pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) { /* Get a pointer to the jth particle. */ - pi = &parts_i[sort_i[pid].i]; - hi = pi->h; + struct part *restrict pi = &parts_i[sort_i[pid].i]; + const float hi = pi->h; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pjx[k] - pi->x[k]; r2 += dx[k] * dx[k]; } @@ -1291,10 +1268,10 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { #ifdef VECTORIZE /* Pick up any leftovers. */ if (icount1 > 0) - for (k = 0; k < icount1; k++) + for (int k = 0; k < icount1; k++) IACT_NONSYM(r2q1[k], &dxq1[3 * k], hiq1[k], hjq1[k], piq1[k], pjq1[k]); if (icount2 > 0) - for (k = 0; k < icount2; k++) + for (int k = 0; k < icount2; k++) IACT(r2q2[k], &dxq2[3 * k], hiq2[k], hjq2[k], piq2[k], pjq2[k]); #endif @@ -1302,20 +1279,15 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { } /** - * @brief Compute the cell self-interaction. + * @brief Compute the cell self-interaction (non-symmetric). * * @param r The #runner. * @param c The #cell. */ - void DOSELF1(struct runner *r, struct cell *restrict c) { - int k, pid, pjd, count = c->count; - double pix[3]; - float dx[3], hi, hj, hig2, r2; - struct part *restrict parts = c->parts, *restrict pi, *restrict pj; const int ti_current = r->e->ti_current; - int firstdt = 0, countdt = 0, *indt = NULL, doj; + #ifdef VECTORIZE int icount1 = 0; float r2q1[VEC_SIZE] __attribute__((aligned(16))); @@ -1332,43 +1304,49 @@ void DOSELF1(struct runner *r, struct cell *restrict c) { #endif TIMER_TIC - /* Set up indt if needed. */ - if (c->ti_end_min > ti_current) - return; - else if (c->ti_end_max > ti_current) { - if ((indt = (int *)alloca(sizeof(int) * count)) == NULL) - error("Failed to allocate indt."); - for (k = 0; k < count; k++) - if (parts[k].ti_end <= ti_current) { - indt[countdt] = k; - countdt += 1; - } - } + if (c->ti_end_min > ti_current) return; + if (c->ti_end_max < ti_current) error("Cell in an impossible time-zone"); + + struct part *restrict parts = c->parts; + const int count = c->count; + + /* Set up indt. */ + int *indt = NULL; + int countdt = 0, firstdt = 0; + if ((indt = (int *)alloca(sizeof(int) * count)) == NULL) + error("Failed to allocate indt."); + for (int k = 0; k < count; k++) + if (parts[k].ti_end <= ti_current) { + indt[countdt] = k; + countdt += 1; + } /* Loop over the particles in the cell. */ - for (pid = 0; pid < count; pid++) { + for (int pid = 0; pid < count; pid++) { /* Get a pointer to the ith particle. */ - pi = &parts[pid]; + struct part *restrict pi = &parts[pid]; /* Get the particle position and radius. */ - for (k = 0; k < 3; k++) pix[k] = pi->x[k]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; + double pix[3]; + for (int k = 0; k < 3; k++) pix[k] = pi->x[k]; + const float hi = pi->h; + const float hig2 = hi * hi * kernel_gamma2; /* Is the ith particle inactive? */ if (pi->ti_end > ti_current) { /* Loop over the other particles .*/ - for (pjd = firstdt; pjd < countdt; pjd++) { + for (int pjd = firstdt; pjd < countdt; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts[indt[pjd]]; - hj = pj->h; + struct part *restrict pj = &parts[indt[pjd]]; + const float hj = pj->h; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pj->x[k] - pix[k]; r2 += dx[k] * dx[k]; } @@ -1413,19 +1391,21 @@ void DOSELF1(struct runner *r, struct cell *restrict c) { firstdt += 1; /* Loop over the other particles .*/ - for (pjd = pid + 1; pjd < count; pjd++) { + for (int pjd = pid + 1; pjd < count; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts[pjd]; - hj = pj->h; + struct part *restrict pj = &parts[pjd]; + const float hj = pj->h; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pix[k] - pj->x[k]; r2 += dx[k] * dx[k]; } - doj = (pj->ti_end <= ti_current) && (r2 < hj * hj * kernel_gamma2); + const int doj = + (pj->ti_end <= ti_current) && (r2 < hj * hj * kernel_gamma2); /* Hit or miss? */ if (r2 < hig2 || doj) { @@ -1516,24 +1496,26 @@ void DOSELF1(struct runner *r, struct cell *restrict c) { #ifdef VECTORIZE /* Pick up any leftovers. */ if (icount1 > 0) - for (k = 0; k < icount1; k++) + for (int k = 0; k < icount1; k++) IACT_NONSYM(r2q1[k], &dxq1[3 * k], hiq1[k], hjq1[k], piq1[k], pjq1[k]); if (icount2 > 0) - for (k = 0; k < icount2; k++) + for (int k = 0; k < icount2; k++) IACT(r2q2[k], &dxq2[3 * k], hiq2[k], hjq2[k], piq2[k], pjq2[k]); #endif TIMER_TOC(TIMER_DOSELF); } +/** + * @brief Compute the cell self-interaction (symmetric). + * + * @param r The #runner. + * @param c The #cell. + */ void DOSELF2(struct runner *r, struct cell *restrict c) { - int k, pid, pjd, count = c->count; - double pix[3]; - float dx[3], hi, hj, hig2, r2; - struct part *restrict parts = c->parts, *restrict pi, *restrict pj; const int ti_current = r->e->ti_current; - int firstdt = 0, countdt = 0, *indt = NULL; + #ifdef VECTORIZE int icount1 = 0; float r2q1[VEC_SIZE] __attribute__((aligned(16))); @@ -1550,43 +1532,49 @@ void DOSELF2(struct runner *r, struct cell *restrict c) { #endif TIMER_TIC - /* Set up indt if needed. */ - if (c->ti_end_min > ti_current) - return; - else if (c->ti_end_max > ti_current) { - if ((indt = (int *)alloca(sizeof(int) * count)) == NULL) - error("Failed to allocate indt."); - for (k = 0; k < count; k++) - if (parts[k].ti_end <= ti_current) { - indt[countdt] = k; - countdt += 1; - } - } + if (c->ti_end_min > ti_current) return; + if (c->ti_end_max < ti_current) error("Cell in an impossible time-zone"); + + struct part *restrict parts = c->parts; + const int count = c->count; + + /* Set up indt. */ + int *indt = NULL; + int countdt = 0, firstdt = 0; + if ((indt = (int *)alloca(sizeof(int) * count)) == NULL) + error("Failed to allocate indt."); + for (int k = 0; k < count; k++) + if (parts[k].ti_end <= ti_current) { + indt[countdt] = k; + countdt += 1; + } /* Loop over the particles in the cell. */ - for (pid = 0; pid < count; pid++) { + for (int pid = 0; pid < count; pid++) { /* Get a pointer to the ith particle. */ - pi = &parts[pid]; + struct part *restrict pi = &parts[pid]; /* Get the particle position and radius. */ - for (k = 0; k < 3; k++) pix[k] = pi->x[k]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; + double pix[3]; + for (int k = 0; k < 3; k++) pix[k] = pi->x[k]; + const float hi = pi->h; + const float hig2 = hi * hi * kernel_gamma2; /* Is the ith particle not active? */ if (pi->ti_end > ti_current) { /* Loop over the other particles .*/ - for (pjd = firstdt; pjd < countdt; pjd++) { + for (int pjd = firstdt; pjd < countdt; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts[indt[pjd]]; - hj = pj->h; + struct part *restrict pj = &parts[indt[pjd]]; + const float hj = pj->h; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pj->x[k] - pix[k]; r2 += dx[k] * dx[k]; } @@ -1631,15 +1619,16 @@ void DOSELF2(struct runner *r, struct cell *restrict c) { firstdt += 1; /* Loop over the other particles .*/ - for (pjd = pid + 1; pjd < count; pjd++) { + for (int pjd = pid + 1; pjd < count; pjd++) { /* Get a pointer to the jth particle. */ - pj = &parts[pjd]; - hj = pj->h; + struct part *restrict pj = &parts[pjd]; + const float hj = pj->h; /* Compute the pairwise distance. */ - r2 = 0.0f; - for (k = 0; k < 3; k++) { + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { dx[k] = pix[k] - pj->x[k]; r2 += dx[k] * dx[k]; } @@ -1708,10 +1697,10 @@ void DOSELF2(struct runner *r, struct cell *restrict c) { #ifdef VECTORIZE /* Pick up any leftovers. */ if (icount1 > 0) - for (k = 0; k < icount1; k++) + for (int k = 0; k < icount1; k++) IACT_NONSYM(r2q1[k], &dxq1[3 * k], hiq1[k], hjq1[k], piq1[k], pjq1[k]); if (icount2 > 0) - for (k = 0; k < icount2; k++) + for (int k = 0; k < icount2; k++) IACT(r2q2[k], &dxq2[3 * k], hiq2[k], hjq2[k], piq2[k], pjq2[k]); #endif @@ -2290,17 +2279,14 @@ void DOSUB_SELF2(struct runner *r, struct cell *ci, int gettimer) { void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts, int *ind, int count, struct cell *cj, int sid, int gettimer) { - int j, k; - double shift[3]; - float h; struct space *s = r->e->s; - struct cell *sub = NULL; const int ti_current = r->e->ti_current; TIMER_TIC /* Find out in which sub-cell of ci the parts are. */ - for (k = 0; k < 8; k++) + struct cell *sub = NULL; + for (int k = 0; k < 8; k++) if (ci->progeny[k] != NULL) { // if ( parts[ ind[ 0 ] ].x[0] >= ci->progeny[k]->loc[0] && // parts[ ind[ 0 ] ].x[0] <= ci->progeny[k]->loc[0] + @@ -2326,7 +2312,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts, /* Loop over all progeny. */ DOSUB_SUBSET(r, sub, parts, ind, count, NULL, -1, 0); - for (j = 0; j < 8; j++) + for (int j = 0; j < 8; j++) if (ci->progeny[j] != sub && ci->progeny[j] != NULL) DOSUB_SUBSET(r, sub, parts, ind, count, ci->progeny[j], -1, 0); @@ -2342,7 +2328,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts, else { /* Get the cell dimensions. */ - h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2])); + const float h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2])); /* Recurse? */ if (ci->split && cj->split && @@ -2350,6 +2336,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts, h / 2) { /* Get the type of pair if not specified explicitly. */ + double shift[3] = {0.0, 0.0, 0.0}; sid = space_getsid(s, &ci, &cj, shift); /* Different types of flags. */ @@ -2858,7 +2845,8 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts, else if (ci->ti_end_min <= ti_current || cj->ti_end_min <= ti_current) { /* Get the relative distance between the pairs, wrapping. */ - for (k = 0; k < 3; k++) { + double shift[3] = {0.0, 0.0, 0.0}; + for (int k = 0; k < 3; k++) { if (cj->loc[k] - ci->loc[k] < -s->dim[k] / 2) shift[k] = s->dim[k]; else if (cj->loc[k] - ci->loc[k] > s->dim[k] / 2) @@ -2866,7 +2854,8 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts, } /* Get the sorting index. */ - for (sid = 0, k = 0; k < 3; k++) + int sid = 0; + for (int k = 0; k < 3; k++) sid = 3 * sid + ((cj->loc[k] - ci->loc[k] + shift[k] < 0) ? 0