diff --git a/src/cell.h b/src/cell.h index 23565c57694672fd5dd61e331c5d046c1b4727d7..27c4242ffdeac5312863d561f58daa8ead16d93b 100644 --- a/src/cell.h +++ b/src/cell.h @@ -41,6 +41,9 @@ struct cell { /* Max radii in this cell. */ double h_max; + /* Minimum and maximum dt in this cell. */ + double dt_min, dt_max; + /* The depth of this cell in the tree. */ int depth, split; diff --git a/src/engine.c b/src/engine.c index c1af8aa610e4c8822de35d2edacdeece251b4a03..53b62c81e16d953f144e7806c72c8c625e776452 100644 --- a/src/engine.c +++ b/src/engine.c @@ -76,6 +76,10 @@ void engine_prepare ( struct engine *e , int force ) { /* Clear the queues. */ for ( k = 0 ; k < e->nr_queues ; k++ ) e->queues[k].count = 0; + + /* Re-allocate the queue buffers? */ + for ( k = 0 ; k < e->nr_queues ; k++ ) + queue_init( &e->queues[k] , s->nr_tasks , s->tasks ); /* Fill the queues (round-robin). */ for ( k = 0 ; k < s->nr_tasks ; k++ ) { @@ -213,7 +217,7 @@ void engine_barrier( struct engine *e ) { * @param sort_queues Flag to try to sort the queues topologically. */ -void engine_run ( struct engine *e , int sort_queues ) { +void engine_run ( struct engine *e , int sort_queues , float dt_max ) { int k; @@ -225,6 +229,10 @@ void engine_run ( struct engine *e , int sort_queues ) { e->queues[k].next = 0; } } + + /* Set the maximum dt. */ + e->dt_max = dt_max; + e->s->dt_max = dt_max; /* Cry havoc and let loose the dogs of war. */ e->barrier_count = -e->barrier_count; diff --git a/src/engine.h b/src/engine.h index e7ad72d52cb788814094b7818bcdc3a3d32e032c..d3319f2d81ed91c0ddfae02710fc6827ff95f7e5 100644 --- a/src/engine.h +++ b/src/engine.h @@ -50,6 +50,9 @@ struct engine { /* The queues. */ struct queue *queues; + /* The maximum dt to step. */ + float dt_max; + /* Data for the threads' barrier. */ pthread_mutex_t barrier_mutex; pthread_cond_t barrier_cond; @@ -63,4 +66,4 @@ void engine_barrier( struct engine *e ); void engine_init ( struct engine *e , struct space *s , int nr_threads , int nr_queues , int policy ); void engine_prepare ( struct engine *e , int force ); void engine_ranktasks ( struct engine *e ); -void engine_run ( struct engine *e , int sort_queues ); +void engine_run ( struct engine *e , int sort_queues , float dt_max ); diff --git a/src/ic.c b/src/ic.c index 853311484b75b299d81b62839d88a7515d5a699c..c8cdfe8880b1d364314417c17db1528a88698a21 100644 --- a/src/ic.c +++ b/src/ic.c @@ -278,6 +278,7 @@ void read_ic ( char* fileName, double dim[3], struct part **parts, int* N, int* /* Allocate memory to store particles */ if(posix_memalign( (void*)parts , 32 , *N * sizeof(struct part)) != 0) error("Error while allocating memory for particles"); + bzero( *parts , *N * sizeof(struct part) ); printf("read_ic: Allocated %8.2f MB for particles.\n", *N * sizeof(struct part) / (1024.*1024.)); diff --git a/src/part.h b/src/part.h index 4caf61ae29f493597ce9ee46183d4fad8b6ae857..32adceb9d058d0f52e91633689c8751bba680a00 100644 --- a/src/part.h +++ b/src/part.h @@ -48,9 +48,6 @@ struct part { /* Particle velocity. */ float v[3]; - /* Particle acceleration. */ - float a[3]; - /* Particle density. */ float rho; @@ -69,6 +66,9 @@ struct part { /* Derivative of the density with respect to this particle's smoothing length. */ float rho_dh; + /* Particle acceleration. */ + float a[3]; + /* Particle number density. */ // int icount; float wcount; diff --git a/src/queue.c b/src/queue.c index a507828bf6e88cd010cb1ab4db005a19f1b6bb77..272582519a7e08dc21ba45887c25c8d98cca8bdd 100644 --- a/src/queue.c +++ b/src/queue.c @@ -124,10 +124,16 @@ void queue_insert ( struct queue *q , struct task *t ) { void queue_init ( struct queue *q , int size , struct task *tasks ) { - /* Allocate the task list. */ - q->size = size; - if ( ( q->tid = (int *)malloc( sizeof(int) * size ) ) == NULL ) - error( "Failed to allocate queue tids." ); + /* Allocate the task list if needed. */ + if ( q->tid == NULL || q->size < size ) { + if ( q->tid != NULL ) + free( q->tid ); + q->size = size; + if ( ( q->tid = (int *)malloc( sizeof(int) * size ) ) == NULL ) + error( "Failed to allocate queue tids." ); + } + + /* Set the tasks pointer. */ q->tasks = tasks; /* Init counters. */ diff --git a/src/runner.c b/src/runner.c index 353bb90a7b8e83987a8da80b670e6f6e557650b1..21c4135f44f2b394f248c8e5aae875c993cc33e7 100644 --- a/src/runner.c +++ b/src/runner.c @@ -325,6 +325,7 @@ void runner_doghost ( struct runner *r , struct cell *c ) { int i, k, redo, count = c->count; int *pid; float ihg, ihg2; + float dt_max = r->e->dt_max; TIMER_TIC /* Recurse? */ @@ -352,28 +353,42 @@ void runner_doghost ( struct runner *r , struct cell *c ) { /* Get a direct pointer on the part. */ p = &c->parts[ pid[i] ]; - - /* Adjust the computed rho. */ - ihg = kernel_igamma / p->h; - ihg2 = ihg * ihg; - p->rho *= ihg * ihg2; - p->rho_dh *= ihg2 * ihg2; - /* Update the smoothing length. */ - p->h -= ( p->wcount + kernel_root - const_nwneigh ) / p->wcount_dh; - - /* Did we get the right number density? */ - if ( p->wcount + kernel_root > const_nwneigh + 1 || - p->wcount + kernel_root < const_nwneigh - 1 ) { - // printf( "runner_doghost: particle %lli (h=%e,depth=%i) has bad wcount=%f.\n" , p->id , p->h , c->depth , p->wcount + kernel_root ); fflush(stdout); - // p->h += ( p->wcount + kernel_root - const_nwneigh ) / p->wcount_dh; - pid[redo] = pid[i]; - redo += 1; - p->wcount = 0.0; - p->wcount_dh = 0.0; - p->rho = 0.0; - p->rho_dh = 0.0; - continue; + /* Is this part within the timestep? */ + if ( p->dt <= dt_max ) { + + /* Adjust the computed rho. */ + ihg = kernel_igamma / p->h; + ihg2 = ihg * ihg; + p->rho *= ihg * ihg2; + p->rho_dh *= ihg2 * ihg2; + + /* Update the smoothing length. */ + p->h -= ( p->wcount + kernel_root - const_nwneigh ) / p->wcount_dh; + + /* Did we get the right number density? */ + if ( p->wcount + kernel_root > const_nwneigh + 1 || + p->wcount + kernel_root < const_nwneigh - 1 ) { + // printf( "runner_doghost: particle %lli (h=%e,depth=%i) has bad wcount=%f.\n" , p->id , p->h , c->depth , p->wcount + kernel_root ); fflush(stdout); + // p->h += ( p->wcount + kernel_root - const_nwneigh ) / p->wcount_dh; + pid[redo] = pid[i]; + redo += 1; + p->wcount = 0.0; + p->wcount_dh = 0.0; + p->rho = 0.0; + p->rho_dh = 0.0; + continue; + } + + /* Compute this particle's time step. */ + p->dt = const_cfl * p->h / sqrtf( const_gamma * ( const_gamma - 1.0f ) * p->u ); + + /* Compute the pressure. */ + // p->P = p->rho * p->u * ( const_gamma - 1.0f ); + + /* Compute the P/Omega/rho2. */ + p->POrho2 = p->u * ( const_gamma - 1.0f ) / ( p->rho + p->h * p->rho_dh / 3.0f ); + } /* Reset the acceleration. */ @@ -384,15 +399,6 @@ void runner_doghost ( struct runner *r , struct cell *c ) { p->u_dt = 0.0f; p->h_dt = 0.0f; - /* Compute this particle's time step. */ - p->dt = const_cfl * p->h / sqrtf( const_gamma * ( const_gamma - 1.0f ) * p->u ); - - /* Compute the pressure. */ - // p->P = p->rho * p->u * ( const_gamma - 1.0f ); - - /* Compute the P/Omega/rho2. */ - p->POrho2 = p->u * ( const_gamma - 1.0f ) / ( p->rho + p->h * p->rho_dh / 3.0f ); - } /* Re-set the counter for the next loop (potentially). */ @@ -570,18 +576,18 @@ void *runner_main ( void *data ) { switch ( t->type ) { case task_type_self: if ( t->subtype == task_subtype_density ) - runner_doself_density( r , ci ); + runner_doself1_density( r , ci ); else if ( t->subtype == task_subtype_force ) - runner_doself_force( r , ci ); + runner_doself2_force( r , ci ); else error( "Unknown task subtype." ); cell_unlocktree( ci ); break; case task_type_pair: if ( t->subtype == task_subtype_density ) - runner_dopair_density( r , ci , cj ); + runner_dopair1_density( r , ci , cj ); else if ( t->subtype == task_subtype_force ) - runner_dopair_force( r , ci , cj ); + runner_dopair2_force( r , ci , cj ); else error( "Unknown task subtype." ); cell_unlocktree( ci ); @@ -592,9 +598,9 @@ void *runner_main ( void *data ) { break; case task_type_sub: if ( t->subtype == task_subtype_density ) - runner_dosub_density( r , ci , cj , t->flags ); + runner_dosub1_density( r , ci , cj , t->flags ); else if ( t->subtype == task_subtype_force ) - runner_dosub_force( r , ci , cj , t->flags ); + runner_dosub2_force( r , ci , cj , t->flags ); else error( "Unknown task subtype." ); cell_unlocktree( ci ); diff --git a/src/runner.h b/src/runner.h index 05906de43a650f83cb952b6d92d06987178ba424..9763a2ae50351427aabe8c01c28ae37d47a6b074 100644 --- a/src/runner.h +++ b/src/runner.h @@ -75,6 +75,7 @@ enum { runner_counter_steal_stall, runner_counter_steal_empty, runner_counter_keep, + runner_counter_iact, runner_counter_count, }; extern int runner_counter[ runner_counter_count ]; diff --git a/src/runner_doiact.h b/src/runner_doiact.h index d97728ff685e5ce4e23acd0e3be568ff29e34e23..fd4dd880116c06a225311df865775f4b765dcbc5 100644 --- a/src/runner_doiact.h +++ b/src/runner_doiact.h @@ -26,53 +26,65 @@ #define PASTE(x,y) x ## _ ## y -#define DOPAIR2(f) PASTE(runner_dopair,f) -#define DOPAIR DOPAIR2(FUNCTION) +#define _DOPAIR1(f) PASTE(runner_dopair1,f) +#define DOPAIR1 _DOPAIR1(FUNCTION) -#define DOPAIR_SUBSET2(f) PASTE(runner_dopair_subset,f) -#define DOPAIR_SUBSET DOPAIR_SUBSET2(FUNCTION) +#define _DOPAIR2(f) PASTE(runner_dopair2,f) +#define DOPAIR2 _DOPAIR2(FUNCTION) -#define DOPAIR_NAIVE2(f) PASTE(runner_dopair_naive,f) -#define DOPAIR_NAIVE DOPAIR_NAIVE2(FUNCTION) +#define _DOPAIR_SUBSET(f) PASTE(runner_dopair_subset,f) +#define DOPAIR_SUBSET _DOPAIR_SUBSET(FUNCTION) -#define DOSELF2(f) PASTE(runner_doself,f) -#define DOSELF DOSELF2(FUNCTION) +#define _DOPAIR_NAIVE(f) PASTE(runner_dopair_naive,f) +#define DOPAIR_NAIVE _DOPAIR_NAIVE(FUNCTION) -#define DOSELF_SUBSET2(f) PASTE(runner_doself_subset,f) -#define DOSELF_SUBSET DOSELF_SUBSET2(FUNCTION) +#define _DOSELF_NAIVE(f) PASTE(runner_doself_naive,f) +#define DOSELF_NAIVE _DOSELF_NAIVE(FUNCTION) -#define DOSUB2(f) PASTE(runner_dosub,f) -#define DOSUB DOSUB2(FUNCTION) +#define _DOSELF1(f) PASTE(runner_doself1,f) +#define DOSELF1 _DOSELF1(FUNCTION) -#define DOSUB_SUBSET2(f) PASTE(runner_dosub_subset,f) -#define DOSUB_SUBSET DOSUB_SUBSET2(FUNCTION) +#define _DOSELF2(f) PASTE(runner_doself2,f) +#define DOSELF2 _DOSELF2(FUNCTION) -#define IACT_NONSYM2(f) PASTE(runner_iact_nonsym,f) -#define IACT_NONSYM IACT_NONSYM2(FUNCTION) +#define _DOSELF_SUBSET(f) PASTE(runner_doself_subset,f) +#define DOSELF_SUBSET _DOSELF_SUBSET(FUNCTION) -#define IACT2(f) PASTE(runner_iact,f) -#define IACT IACT2(FUNCTION) +#define _DOSUB1(f) PASTE(runner_dosub1,f) +#define DOSUB1 _DOSUB1(FUNCTION) -#define TIMER_DOSELF2(f) PASTE(runner_timer_doself,f) -#define TIMER_DOSELF TIMER_DOSELF2(FUNCTION) +#define _DOSUB2(f) PASTE(runner_dosub2,f) +#define DOSUB2 _DOSUB2(FUNCTION) -#define TIMER_DOPAIR2(f) PASTE(runner_timer_dopair,f) -#define TIMER_DOPAIR TIMER_DOPAIR2(FUNCTION) +#define _DOSUB_SUBSET(f) PASTE(runner_dosub_subset,f) +#define DOSUB_SUBSET _DOSUB_SUBSET(FUNCTION) -#define TIMER_DOSUB2(f) PASTE(runner_timer_dosub,f) -#define TIMER_DOSUB TIMER_DOSUB2(FUNCTION) +#define _IACT_NONSYM(f) PASTE(runner_iact_nonsym,f) +#define IACT_NONSYM _IACT_NONSYM(FUNCTION) -#define TIMER_DOSELF_SUBSET2(f) PASTE(runner_timer_doself_subset,f) -#define TIMER_DOSELF_SUBSET TIMER_DOSELF_SUBSET2(FUNCTION) +#define _IACT(f) PASTE(runner_iact,f) +#define IACT _IACT(FUNCTION) -#define TIMER_DOPAIR_SUBSET2(f) PASTE(runner_timer_dopair_subset,f) -#define TIMER_DOPAIR_SUBSET TIMER_DOPAIR_SUBSET2(FUNCTION) +#define _TIMER_DOSELF(f) PASTE(runner_timer_doself,f) +#define TIMER_DOSELF _TIMER_DOSELF(FUNCTION) -#define IACT_NONSYM_VEC2(f) PASTE(runner_iact_nonsym_vec,f) -#define IACT_NONSYM_VEC IACT_NONSYM_VEC2(FUNCTION) +#define _TIMER_DOPAIR(f) PASTE(runner_timer_dopair,f) +#define TIMER_DOPAIR _TIMER_DOPAIR(FUNCTION) -#define IACT_VEC2(f) PASTE(runner_iact_vec,f) -#define IACT_VEC IACT_VEC2(FUNCTION) +#define _TIMER_DOSUB(f) PASTE(runner_timer_dosub,f) +#define TIMER_DOSUB _TIMER_DOSUB(FUNCTION) + +#define _TIMER_DOSELF_SUBSET(f) PASTE(runner_timer_doself_subset,f) +#define TIMER_DOSELF_SUBSET _TIMER_DOSELF_SUBSET(FUNCTION) + +#define _TIMER_DOPAIR_SUBSET(f) PASTE(runner_timer_dopair_subset,f) +#define TIMER_DOPAIR_SUBSET _TIMER_DOPAIR_SUBSET(FUNCTION) + +#define _IACT_NONSYM_VEC(f) PASTE(runner_iact_nonsym_vec,f) +#define IACT_NONSYM_VEC _IACT_NONSYM_VEC(FUNCTION) + +#define _IACT_VEC(f) PASTE(runner_iact_vec,f) +#define IACT_VEC _IACT_VEC(FUNCTION) @@ -94,6 +106,7 @@ void DOPAIR_NAIVE ( struct runner *r , struct cell *restrict ci , struct cell *r struct cpart *restrict cpi, *restrict cparts_i = ci->cparts; double pix[3]; float dx[3], hi, hi2, r2; + float dt_max = e->dt_max; #ifdef VECTORIZE int icount = 0; float r2q[VEC_SIZE] __attribute__ ((aligned (16))); @@ -104,6 +117,10 @@ void DOPAIR_NAIVE ( struct runner *r , struct cell *restrict ci , struct cell *r #endif TIMER_TIC + /* Anything to do here? */ + if ( ci->dt_min > dt_max && cj->dt_min > dt_max ) + return; + /* Get the relative distance between the pairs, wrapping. */ for ( k = 0 ; k < 3 ; k++ ) { if ( cj->loc[k] - ci->loc[k] < -e->s->dim[k]/2 ) @@ -191,6 +208,106 @@ void DOPAIR_NAIVE ( struct runner *r , struct cell *restrict ci , struct cell *r } +void DOSELF_NAIVE ( struct runner *r , struct cell *restrict c ) { + + int pid, pjd, k, count = c->count; + struct part *restrict parts = c->parts; + struct cpart *restrict cpi, *restrict cpj,*restrict cparts = c->cparts; + double pix[3]; + float dx[3], hi, hi2, r2; + float dt_max = r->e->dt_max; + #ifdef VECTORIZE + int icount = 0; + float r2q[VEC_SIZE] __attribute__ ((aligned (16))); + float hiq[VEC_SIZE] __attribute__ ((aligned (16))); + float hjq[VEC_SIZE] __attribute__ ((aligned (16))); + float dxq[3*VEC_SIZE] __attribute__ ((aligned (16))); + struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; + #endif + TIMER_TIC + + /* Anything to do here? */ + if ( c->dt_min > dt_max ) + return; + + /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" , + ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , cj->loc[2] , + ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); + tic = getticks(); */ + + /* Loop over the parts in ci. */ + for ( pid = 0 ; pid < count ; pid++ ) { + + /* Get a hold of the ith part in ci. */ + cpi = &cparts[ pid ]; + for ( k = 0 ; k < 3 ; k++ ) + pix[k] = cpi->x[k]; + hi = cpi->h; + hi2 = hi * hi; + + /* Loop over the parts in cj. */ + for ( pjd = pid+1 ; pjd < count ; pjd++ ) { + + /* Get a pointer to the jth particle. */ + cpj = &cparts[ pjd ]; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for ( k = 0 ; k < 3 ; k++ ) { + dx[k] = pix[k] - cpj->x[k]; + r2 += dx[k]*dx[k]; + } + + /* Hit or miss? */ + if ( r2 < hi2 || r2 < cpj->h*cpj->h ) { + + #ifndef VECTORIZE + + IACT( r2 , dx , hi , cpj->h , &parts[ pid ] , &parts[pjd] ); + + #else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3*icount+0] = dx[0]; + dxq[3*icount+1] = dx[1]; + dxq[3*icount+2] = dx[2]; + hiq[icount] = hi; + hjq[icount] = cpj->h; + piq[icount] = &parts[ pid ]; + pjq[icount] = &parts[ pjd ]; + icount += 1; + + /* Flush? */ + if ( icount == VEC_SIZE ) { + IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq ); + icount = 0; + } + + #endif + + } + + } /* loop over the parts in cj. */ + + } /* loop over the parts in ci. */ + + #ifdef VECTORIZE + /* Pick up any leftovers. */ + if ( icount > 0 ) + for ( k = 0 ; k < icount ; k++ ) + IACT( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] ); + #endif + + #ifdef TIMER_VERBOSE + printf( "runner_doself[%02i]: %i parts at depth %i took %.3f ms.\n" , r->id , count , c->depth , ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000 ); + #else + TIMER_TOC(TIMER_DOSELF); + #endif + + } + + /** * @brief Compute the interactions between a cell pair, but only for the * given indices in ci. @@ -507,7 +624,7 @@ void DOSELF_SUBSET ( struct runner *r , struct cell *restrict ci , struct part * * @param cj The second #cell. */ -void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restrict cj ) { +void DOPAIR1 ( struct runner *r , struct cell *restrict ci , struct cell *restrict cj ) { struct engine *restrict e = r->e; int pid, pjd, k, sid; @@ -522,6 +639,7 @@ void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restric double hi_max, hj_max; double di_max, dj_min; int count_i, count_j; + float dt_max = e->dt_max; #ifdef VECTORIZE int icount = 0; float r2q[VEC_SIZE] __attribute__ ((aligned (16))); @@ -532,6 +650,10 @@ void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restric #endif TIMER_TIC + /* Anything to do here? */ + if ( ci->dt_min > dt_max && cj->dt_min > dt_max ) + return; + /* Get the relative distance between the pairs, wrapping. */ for ( k = 0 ; k < 3 ; k++ ) { if ( cj->loc[k] - ci->loc[k] < -e->s->dim[k]/2 ) @@ -594,6 +716,8 @@ void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restric /* Get a hold of the ith part in ci. */ pi = &parts_i[ sort_i[ pid ].i ]; cpi = &cparts_i[ sort_i[ pid ].i ]; + if ( cpi->dt > dt_max ) + continue; hi = cpi->h; di = sort_i[pid].d + hi - rshift; if ( di < dj_min ) @@ -621,7 +745,7 @@ void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restric #ifndef VECTORIZE - IACT( r2 , dx , hi , cpj->h , &parts_i[ sort_i[ pid ].i ] , &parts_j[ sort_j[pjd].i ] ); + IACT_NONSYM( r2 , dx , hi , cpj->h , pi , &parts_j[ sort_j[pjd].i ] ); #else @@ -638,7 +762,7 @@ void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restric /* Flush? */ if ( icount == VEC_SIZE ) { - IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq ); + IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq ); icount = 0; } @@ -659,6 +783,8 @@ void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restric /* Get a hold of the jth part in cj. */ pj = &parts_j[ sort_j[ pjd ].i ]; cpj = &cparts_j[ sort_j[ pjd ].i ]; + if ( cpj->dt > dt_max ) + continue; hj = cpj->h; dj = sort_j[pjd].d - hj - rshift; if ( dj > di_max ) @@ -682,11 +808,11 @@ void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restric } /* Hit or miss? */ - if ( r2 < hj2 && r2 > cpi->h*cpi->h ) { + if ( r2 < hj2 ) { #ifndef VECTORIZE - IACT( r2 , dx , hj , cpi->h , &parts_j[ sort_j[ pjd ].i ] , &parts_i[ sort_i[pid].i ] ); + IACT_NONSYM( r2 , dx , hj , cpi->h , pj , &parts_i[ sort_i[pid].i ] ); #else @@ -703,7 +829,7 @@ void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restric /* Flush? */ if ( icount == VEC_SIZE ) { - IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq ); + IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq ); icount = 0; } @@ -719,7 +845,7 @@ void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restric /* Pick up any leftovers. */ if ( icount > 0 ) for ( k = 0 ; k < icount ; k++ ) - IACT( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] ); + IACT_NONSYM( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] ); #endif #ifdef TIMER_VERBOSE @@ -731,20 +857,24 @@ void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restric } -/** - * @brief Compute the cell self-interaction. - * - * @param r The #runner. - * @param c The #cell. - */ - -void DOSELF ( struct runner *r , struct cell *restrict c ) { +void DOPAIR2 ( struct runner *r , struct cell *restrict ci , struct cell *restrict cj ) { - int k, pid, pjd, count = c->count; - double pix[3]; - float dx[3], hi, hi2, r2; - struct part *restrict parts = c->parts; - struct cpart *restrict cpi, *restrict cpj, *restrict cparts = c->cparts; + struct engine *restrict e = r->e; + int pid, pjd, k, sid; + double rshift, shift[3] = { 0.0 , 0.0 , 0.0 }; + struct cell *temp; + struct entry *restrict sort_i, *restrict sort_j; + struct entry *restrict sortdt_i = NULL, *restrict sortdt_j = NULL; + int countdt_i = 0, countdt_j = 0; + struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j; + struct cpart *restrict cpi, *restrict cparts_i; + struct cpart *restrict cpj, *restrict cparts_j; + double pix[3], pjx[3], di, dj; + float dx[3], hi, hi2, hj, hj2, r2; + double hi_max, hj_max; + double di_max, dj_min; + int count_i, count_j; + float dt_max = e->dt_max; #ifdef VECTORIZE int icount = 0; float r2q[VEC_SIZE] __attribute__ ((aligned (16))); @@ -755,64 +885,606 @@ void DOSELF ( struct runner *r , struct cell *restrict c ) { #endif TIMER_TIC - /* Loop over the particles in the cell. */ - for ( pid = 0 ; pid < count ; pid++ ) { - - /* Get a pointer to the ith particle. */ - cpi = &cparts[pid]; + /* Anything to do here? */ + if ( ci->dt_min > dt_max && cj->dt_min > dt_max ) + return; - /* Get the particle position and radius. */ + /* Get the relative distance between the pairs, wrapping. */ + for ( k = 0 ; k < 3 ; k++ ) { + if ( cj->loc[k] - ci->loc[k] < -e->s->dim[k]/2 ) + shift[k] = e->s->dim[k]; + else if ( cj->loc[k] - ci->loc[k] > e->s->dim[k]/2 ) + shift[k] = -e->s->dim[k]; + } + + /* Get the sorting index. */ + for ( sid = 0 , k = 0 ; k < 3 ; k++ ) + sid = 3*sid + ( (cj->loc[k] - ci->loc[k] + shift[k] < 0) ? 0 : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1 ); + + /* Switch the cells around? */ + if ( runner_flip[sid] ) { + temp = ci; ci = cj; cj = temp; for ( k = 0 ; k < 3 ; k++ ) - pix[k] = cpi->x[k]; + shift[k] = -shift[k]; + } + sid = sortlistID[sid]; + + /* Get the cutoff shift. */ + for ( rshift = 0.0 , k = 0 ; k < 3 ; k++ ) + rshift += shift[k]*runner_shift[ 3*sid + k ]; + + /* for ( k = 0 ; k < ci->count ; k++ ) + if ( ci->parts[k].id == 561590 ) + break; + if ( k == ci->count ) + for ( k = 0 ; k < cj->count ; k++ ) + if ( cj->parts[k].id == 561590 ) + break; + if ( k < cj->count ) + printf( "runner_dopair: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts, h_max=%g/%g, and shift = [ %g %g %g ] (rshift=%g).\n" , + ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , cj->loc[2] , + ci->count , cj->count , ci->h_max , cj->h_max , shift[0] , shift[1] , shift[2] , rshift ); fflush(stdout); */ + /* for ( hi = 0 , k = 0 ; k < ci->count ; k++ ) + hi += ci->parts[k].r; + for ( hj = 0 , k = 0 ; k < cj->count ; k++ ) + hj += cj->parts[k].r; + printf( "runner_dopair: avg. radii %g/%g for h=%g at depth=%i.\n" , hi/ci->count , hj/cj->count , ci->h[0] , ci->depth ); fflush(stdout); */ + + /* Pick-out the sorted lists. */ + sort_i = &ci->sort[ sid*(ci->count + 1) ]; + sort_j = &cj->sort[ sid*(cj->count + 1) ]; + + /* Get some other useful values. */ + hi_max = ci->h_max - rshift; hj_max = cj->h_max; + count_i = ci->count; count_j = cj->count; + parts_i = ci->parts; parts_j = cj->parts; + cparts_i = ci->cparts; cparts_j = cj->cparts; + di_max = sort_i[count_i-1].d - rshift; + dj_min = sort_j[0].d; + + /* Collect the number of parts left and right below dt. */ + if ( cj->dt_min > dt_max ) { + sortdt_i = sort_i; + countdt_i = count_i; + } + else if ( cj->dt_max > dt_max ) { + if ( ( sortdt_i = (struct entry *)alloca( sizeof(struct entry) * count_i ) ) == NULL ) + error( "Failed to allocate dt sortlists." ); + for ( k = 0 ; k < count_i ; k++ ) + if ( cparts_i[ sort_i[ k ].i ].dt <= dt_max ) { + sortdt_i[ countdt_i ] = sort_i[k]; + countdt_i += 1; + } + } + if ( ci->dt_min > dt_max ) { + sortdt_j = sort_j; + countdt_j = count_j; + } + else if ( ci->dt_max > dt_max ) { + if ( ( sortdt_j = (struct entry *)alloca( sizeof(struct entry) * count_j ) ) == NULL ) + error( "Failed to allocate dt sortlists." ); + for ( k = 0 ; k < count_j ; k++ ) + if ( cparts_j[ sort_j[ k ].i ].dt <= dt_max ) { + sortdt_j[ countdt_j ] = sort_j[k]; + countdt_j += 1; + } + } + + /* if ( ci->split && cj->split && sid == 4 ) + printf( "boing!\n" ); */ + + /* Loop over the parts in ci. */ + for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d + hi_max > dj_min ; pid-- ) { + + /* Get a hold of the ith part in ci. */ + pi = &parts_i[ sort_i[ pid ].i ]; + cpi = &cparts_i[ sort_i[ pid ].i ]; hi = cpi->h; + di = sort_i[pid].d + hi - rshift; + if ( di < dj_min ) + continue; + hi2 = hi * hi; + for ( k = 0 ; k < 3 ; k++ ) + pix[k] = cpi->x[k] - shift[k]; - /* Loop over the other particles .*/ - for ( pjd = pid+1 ; pjd < count ; pjd++ ) { - - /* Get a pointer to the jth particle. */ - cpj = &cparts[pjd]; + /* Look at valid dt parts only? */ + if ( cpi->dt > dt_max ) { - /* Compute the pairwise distance. */ - r2 = 0.0f; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pix[k] - cpj->x[k]; - r2 += dx[k]*dx[k]; - } - - /* Hit or miss? */ - if ( r2 < hi2 || r2 < cpj->h*cpj->h ) { - - #ifndef VECTORIZE - - IACT( r2 , dx , hi , cpj->h , &parts[pid] , &parts[pjd] ); - - #else - - /* Add this interaction to the queue. */ - r2q[icount] = r2; - dxq[3*icount+0] = dx[0]; - dxq[3*icount+1] = dx[1]; - dxq[3*icount+2] = dx[2]; - hiq[icount] = hi; - hjq[icount] = cpj->h; - piq[icount] = &parts[pid]; - pjq[icount] = &parts[pjd]; - icount += 1; + /* Loop over the parts in cj within dt. */ + for ( pjd = 0 ; pjd < countdt_j && sortdt_j[pjd].d < di ; pjd++ ) { - /* Flush? */ - if ( icount == VEC_SIZE ) { - IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq ); - icount = 0; - } + /* Get a pointer to the jth particle. */ + cpj = &cparts_j[ sortdt_j[pjd].i ]; - #endif - - } - - } /* loop over all other particles. */ - - } /* loop over all particles. */ + /* Compute the pairwise distance. */ + r2 = 0.0f; + for ( k = 0 ; k < 3 ; k++ ) { + dx[k] = pix[k] - cpj->x[k]; + r2 += dx[k]*dx[k]; + } + + /* Hit or miss? */ + if ( r2 < hi2 ) { + + #ifndef VECTORIZE + + IACT( r2 , dx , hi , cpj->h , pi , &parts_j[ sortdt_j[pjd].i ] ); + + #else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3*icount+0] = dx[0]; + dxq[3*icount+1] = dx[1]; + dxq[3*icount+2] = dx[2]; + hiq[icount] = hi; + hjq[icount] = cpj->h; + piq[icount] = pi; + pjq[icount] = &parts_j[ sortdt_j[ pjd ].i ]; + icount += 1; + + /* Flush? */ + if ( icount == VEC_SIZE ) { + IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq ); + icount = 0; + } + + #endif + + } + + } /* loop over the parts in cj. */ + + } + + /* Otherwise, look at all parts. */ + else { + + /* Loop over the parts in cj. */ + for ( pjd = 0 ; pjd < count_j && sort_j[pjd].d < di ; pjd++ ) { + + /* Get a pointer to the jth particle. */ + cpj = &cparts_j[ sort_j[pjd].i ]; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for ( k = 0 ; k < 3 ; k++ ) { + dx[k] = pix[k] - cpj->x[k]; + r2 += dx[k]*dx[k]; + } + + /* Hit or miss? */ + if ( r2 < hi2 ) { + + #ifndef VECTORIZE + + IACT( r2 , dx , hi , cpj->h , pi , &parts_j[ sort_j[pjd].i ] ); + + #else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3*icount+0] = dx[0]; + dxq[3*icount+1] = dx[1]; + dxq[3*icount+2] = dx[2]; + hiq[icount] = hi; + hjq[icount] = cpj->h; + piq[icount] = pi; + pjq[icount] = &parts_j[ sort_j[ pjd ].i ]; + icount += 1; + + /* Flush? */ + if ( icount == VEC_SIZE ) { + IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq ); + icount = 0; + } + + #endif + + } + + } /* loop over the parts in cj. */ + + } + + } /* loop over the parts in ci. */ + + /* printf( "runner_dopair: first half took %.3f ms...\n" , ((double)(getticks() - tic)) / CPU_TPS * 1000 ); + tic = getticks(); */ + + /* Loop over the parts in cj. */ + for ( pjd = 0 ; pjd < count_j && sort_j[pjd].d - hj_max < di_max ; pjd++ ) { + + /* Get a hold of the jth part in cj. */ + pj = &parts_j[ sort_j[ pjd ].i ]; + cpj = &cparts_j[ sort_j[ pjd ].i ]; + hj = cpj->h; + dj = sort_j[pjd].d - hj - rshift; + if ( dj > di_max ) + continue; + + for ( k = 0 ; k < 3 ; k++ ) + pjx[k] = cpj->x[k] + shift[k]; + hj2 = hj * hj; + + /* Is this particle outside the dt? */ + if ( cpj->dt > dt_max ) { + + /* Loop over the parts in ci. */ + for ( pid = countdt_i-1 ; pid >= 0 && sortdt_i[pid].d > dj ; pid-- ) { + + /* Get a pointer to the jth particle. */ + cpi = &cparts_i[ sortdt_i[pid].i ]; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for ( k = 0 ; k < 3 ; k++ ) { + dx[k] = cpi->x[k] - pjx[k]; + r2 += dx[k]*dx[k]; + } + + /* Hit or miss? */ + if ( r2 < hj2 && r2 > cpi->h*cpi->h ) { + + #ifndef VECTORIZE + + IACT( r2 , dx , hj , cpi->h , pj , &parts_i[ sortdt_i[pid].i ] ); + + #else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3*icount+0] = dx[0]; + dxq[3*icount+1] = dx[1]; + dxq[3*icount+2] = dx[2]; + hiq[icount] = hj; + hjq[icount] = cpi->h; + piq[icount] = pj; + pjq[icount] = &parts_i[ sortdt_i[ pid ].i ]; + icount += 1; + + /* Flush? */ + if ( icount == VEC_SIZE ) { + IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq ); + icount = 0; + } + + #endif + + } + + } /* loop over the parts in cj. */ + } + + /* Otherwise, interact with all particles in cj. */ + else { + + /* Loop over the parts in ci. */ + for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d > dj ; pid-- ) { + + /* Get a pointer to the jth particle. */ + cpi = &cparts_i[ sort_i[pid].i ]; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for ( k = 0 ; k < 3 ; k++ ) { + dx[k] = cpi->x[k] - pjx[k]; + r2 += dx[k]*dx[k]; + } + + /* Hit or miss? */ + if ( r2 < hj2 && r2 > cpi->h*cpi->h ) { + + #ifndef VECTORIZE + + IACT( r2 , dx , hj , cpi->h , pj , &parts_i[ sort_i[pid].i ] ); + + #else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3*icount+0] = dx[0]; + dxq[3*icount+1] = dx[1]; + dxq[3*icount+2] = dx[2]; + hiq[icount] = hj; + hjq[icount] = cpi->h; + piq[icount] = pj; + pjq[icount] = &parts_i[ sort_i[ pid ].i ]; + icount += 1; + + /* Flush? */ + if ( icount == VEC_SIZE ) { + IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq ); + icount = 0; + } + + #endif + + } + + } /* loop over the parts in cj. */ + + } + + } /* loop over the parts in ci. */ + + #ifdef VECTORIZE + /* Pick up any leftovers. */ + if ( icount > 0 ) + for ( k = 0 ; k < icount ; k++ ) + IACT( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] ); + #endif + + #ifdef TIMER_VERBOSE + printf( "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , fmax(ci->h[0],fmax(ci->h[1],ci->h[2])) , ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000 ); + #else + TIMER_TOC(TIMER_DOPAIR); + #endif + + } + + +/** + * @brief Compute the cell self-interaction. + * + * @param r The #runner. + * @param c The #cell. + */ + +void DOSELF1 ( struct runner *r , struct cell *restrict c ) { + + int k, pid, pjd, count = c->count; + double pix[3]; + float dx[3], hi, hi2, r2; + struct part *restrict parts = c->parts, *restrict pi; + struct cpart *restrict cpi, *restrict cpj, *restrict cparts = c->cparts; + float dt_max = r->e->dt_max; + #ifdef VECTORIZE + int icount = 0; + float r2q[VEC_SIZE] __attribute__ ((aligned (16))); + float hiq[VEC_SIZE] __attribute__ ((aligned (16))); + float hjq[VEC_SIZE] __attribute__ ((aligned (16))); + float dxq[3*VEC_SIZE] __attribute__ ((aligned (16))); + struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; + #endif + TIMER_TIC + + /* Any work here? */ + if ( c->dt_min > dt_max ) + return; + + /* Loop over the particles in the cell. */ + for ( pid = 0 ; pid < count ; pid++ ) { + + /* Get a pointer to the ith particle. */ + cpi = &cparts[pid]; + pi = &parts[pid]; + if ( cpi->dt > dt_max ) + continue; + + /* Get the particle position and radius. */ + for ( k = 0 ; k < 3 ; k++ ) + pix[k] = cpi->x[k]; + hi = cpi->h; + hi2 = hi * hi; + + /* Loop over the other particles .*/ + for ( pjd = 0 ; pjd < count ; pjd++ ) { + + /* Skip self-interactions. */ + if ( pid == pjd ) + continue; + + /* Get a pointer to the jth particle. */ + cpj = &cparts[pjd]; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for ( k = 0 ; k < 3 ; k++ ) { + dx[k] = pix[k] - cpj->x[k]; + r2 += dx[k]*dx[k]; + } + + /* Hit or miss? */ + if ( r2 < hi2 ) { + + #ifndef VECTORIZE + + IACT_NONSYM( r2 , dx , hi , cpj->h , pi , &parts[pjd] ); + + #else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3*icount+0] = dx[0]; + dxq[3*icount+1] = dx[1]; + dxq[3*icount+2] = dx[2]; + hiq[icount] = hi; + hjq[icount] = cpj->h; + piq[icount] = pi; + pjq[icount] = &parts[pjd]; + icount += 1; + + /* Flush? */ + if ( icount == VEC_SIZE ) { + IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq ); + icount = 0; + } + + #endif + + } + + } /* loop over all other particles. */ + + } /* loop over all particles. */ + + #ifdef VECTORIZE + /* Pick up any leftovers. */ + if ( icount > 0 ) + for ( k = 0 ; k < icount ; k++ ) + IACT_NONSYM( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] ); + #endif + + #ifdef TIMER_VERBOSE + printf( "runner_doself1[%02i]: %i parts at depth %i took %.3f ms.\n" , r->id , count , c->depth , ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000 ); + #else + TIMER_TOC(TIMER_DOSELF); + #endif + + } + + +void DOSELF2 ( struct runner *r , struct cell *restrict c ) { + + int k, pid, pjd, count = c->count; + double pix[3]; + float dx[3], hi, hi2, r2; + struct part *restrict parts = c->parts, *restrict pi; + struct cpart *restrict cpi, *restrict cpj, *restrict cparts = c->cparts; + float dt_max = r->e->dt_max; + int firstdt = 0, countdt = 0, *indt = NULL; + #ifdef VECTORIZE + int icount = 0; + float r2q[VEC_SIZE] __attribute__ ((aligned (16))); + float hiq[VEC_SIZE] __attribute__ ((aligned (16))); + float hjq[VEC_SIZE] __attribute__ ((aligned (16))); + float dxq[3*VEC_SIZE] __attribute__ ((aligned (16))); + struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; + #endif + TIMER_TIC + + /* Set up indt if needed. */ + if ( c->dt_min > dt_max ) + return; + else if ( c->dt_max > dt_max ) { + if ( ( indt = (int *)alloca( sizeof(int) * count ) ) == NULL ) + error( "Failed to allocate indt." ); + for ( k = 0 ; k < count ; k++ ) + if ( cparts[k].dt <= dt_max ) { + indt[ countdt ] = k; + countdt += 1; + } + } + + /* Loop over the particles in the cell. */ + for ( pid = 0 ; pid < count ; pid++ ) { + + /* Get a pointer to the ith particle. */ + pi = &parts[pid]; + cpi = &cparts[pid]; + + /* Get the particle position and radius. */ + for ( k = 0 ; k < 3 ; k++ ) + pix[k] = cpi->x[k]; + hi = cpi->h; + hi2 = hi * hi; + + /* Is the ith particle not active? */ + if ( cpi->dt > dt_max ) { + + /* Loop over the other particles .*/ + for ( pjd = firstdt ; pjd < countdt ; pjd++ ) { + + /* Get a pointer to the jth particle. */ + cpj = &cparts[ indt[ pjd ] ]; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for ( k = 0 ; k < 3 ; k++ ) { + dx[k] = pix[k] - cpj->x[k]; + r2 += dx[k]*dx[k]; + } + + /* Hit or miss? */ + if ( r2 < hi2 || r2 < cpj->h*cpj->h ) { + + #ifndef VECTORIZE + + IACT( r2 , dx , hi , cpj->h , pi , &parts[indt[pjd]] ); + + #else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3*icount+0] = dx[0]; + dxq[3*icount+1] = dx[1]; + dxq[3*icount+2] = dx[2]; + hiq[icount] = hi; + hjq[icount] = cpj->h; + piq[icount] = pi; + pjq[icount] = &parts[indt[pjd]]; + icount += 1; + + /* Flush? */ + if ( icount == VEC_SIZE ) { + IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq ); + icount = 0; + } + + #endif + + } + + } /* loop over all other particles. */ + + } + + /* Otherwise, interact with all candidates. */ + else { + + /* We caught a live one! */ + firstdt += 1; + + /* Loop over the other particles .*/ + for ( pjd = pid+1 ; pjd < count ; pjd++ ) { + + /* Get a pointer to the jth particle. */ + cpj = &cparts[pjd]; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for ( k = 0 ; k < 3 ; k++ ) { + dx[k] = pix[k] - cpj->x[k]; + r2 += dx[k]*dx[k]; + } + + /* Hit or miss? */ + if ( r2 < hi2 || r2 < cpj->h*cpj->h ) { + + #ifndef VECTORIZE + + IACT( r2 , dx , hi , cpj->h , pi , &parts[pjd] ); + + #else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3*icount+0] = dx[0]; + dxq[3*icount+1] = dx[1]; + dxq[3*icount+2] = dx[2]; + hiq[icount] = hi; + hjq[icount] = cpj->h; + piq[icount] = pi; + pjq[icount] = &parts[pjd]; + icount += 1; + + /* Flush? */ + if ( icount == VEC_SIZE ) { + IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq ); + icount = 0; + } + + #endif + + } + + } /* loop over all other particles. */ + + } + + } /* loop over all particles. */ #ifdef VECTORIZE /* Pick up any leftovers. */ @@ -820,9 +1492,9 @@ void DOSELF ( struct runner *r , struct cell *restrict c ) { for ( k = 0 ; k < icount ; k++ ) IACT( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] ); #endif - + #ifdef TIMER_VERBOSE - printf( "runner_doself[%02i]: %i parts at depth %i took %.3f ms.\n" , r->id , count , c->depth , ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000 ); + printf( "runner_doself2[%02i]: %i parts at depth %i took %.3f ms.\n" , r->id , count , c->depth , ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000 ); #else TIMER_TOC(TIMER_DOSELF); #endif @@ -840,7 +1512,290 @@ void DOSELF ( struct runner *r , struct cell *restrict c ) { * redundant computations to find the sid on-the-fly. */ -void DOSUB ( struct runner *r , struct cell *restrict ci , struct cell *restrict cj , int sid ) { +void DOSUB1 ( struct runner *r , struct cell *restrict ci , struct cell *restrict cj , int sid ) { + + int j, k; + double shift[3]; + float h; + struct space *s = r->e->s; + + TIMER_TIC + + /* Is this a single cell? */ + if ( cj == NULL ) { + + /* Recurse? */ + if ( ci->split ) { + + /* Loop over all progeny. */ + for ( k = 0 ; k < 8 ; k++ ) + if ( ci->progeny[k] != NULL ) { + DOSUB1( r , ci->progeny[k] , NULL , -1 ); + for ( j = k+1 ; j < 8 ; j++ ) + if ( ci->progeny[j] != NULL ) + DOSUB1( r , ci->progeny[k] , ci->progeny[j] , -1 ); + } + + } + + /* Otherwsie, compute self-interaction. */ + else + DOSELF2( r , ci ); + + } /* self-interaction. */ + + /* Otherwise, it's a pair interaction. */ + else { + + /* Get the cell dimensions. */ + h = fmin( ci->h[0] , fmin( ci->h[1] , ci->h[2] ) ); + + /* Get the type of pair if not specified explicitly. */ + if ( sid < 0 ) { + + /* Get the relative distance between the pairs, wrapping. */ + for ( k = 0 ; k < 3 ; k++ ) { + if ( cj->loc[k] - ci->loc[k] < -s->dim[k]/2 ) + shift[k] = s->dim[k]; + else if ( cj->loc[k] - ci->loc[k] > s->dim[k]/2 ) + shift[k] = -s->dim[k]; + else + shift[k] = 0.0; + } + + /* Get the sorting index. */ + for ( sid = 0 , k = 0 ; k < 3 ; k++ ) + sid = 3*sid + ( (cj->loc[k] - ci->loc[k] + shift[k] < 0) ? 0 : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1 ); + + /* Flip? */ + if ( sid < 13 ) { + struct cell *temp = cj; cj = ci; ci = temp; + } + else + sid = 26 - sid; + + } + + /* Recurse? */ + if ( ci->split && cj->split && + ci->h_max*2 < h && cj->h_max*2 < h ) { + + /* Different types of flags. */ + switch ( sid ) { + + /* Regular sub-cell interactions of a single cell. */ + case 0: /* ( 1 , 1 , 1 ) */ + if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) + DOSUB1( r , ci->progeny[7] , cj->progeny[0] , -1 ); + break; + + case 1: /* ( 1 , 1 , 0 ) */ + if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL ) + DOSUB1( r , ci->progeny[6] , cj->progeny[0] , -1 ); + if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) + DOSUB1( r , ci->progeny[6] , cj->progeny[1] , -1 ); + if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) + DOSUB1( r , ci->progeny[7] , cj->progeny[0] , -1 ); + if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL ) + DOSUB1( r , ci->progeny[7] , cj->progeny[1] , -1 ); + break; + + case 2: /* ( 1 , 1 , -1 ) */ + if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) + DOSUB1( r , ci->progeny[6] , cj->progeny[1] , -1 ); + break; + + case 3: /* ( 1 , 0 , 1 ) */ + if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL ) + DOSUB1( r , ci->progeny[5] , cj->progeny[0] , -1 ); + if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) + DOSUB1( r , ci->progeny[5] , cj->progeny[2] , -1 ); + if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) + DOSUB1( r , ci->progeny[7] , cj->progeny[0] , -1 ); + if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL ) + DOSUB1( r , ci->progeny[7] , cj->progeny[2] , -1 ); + break; + + case 4: /* ( 1 , 0 , 0 ) */ + if ( ci->progeny[4] != NULL && cj->progeny[0] != NULL ) + DOSUB1( r , ci->progeny[4] , cj->progeny[0] , -1 ); + if ( ci->progeny[4] != NULL && cj->progeny[1] != NULL ) + DOSUB1( r , ci->progeny[4] , cj->progeny[1] , -1 ); + if ( ci->progeny[4] != NULL && cj->progeny[2] != NULL ) + DOSUB1( r , ci->progeny[4] , cj->progeny[2] , -1 ); + if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL ) + DOSUB1( r , ci->progeny[4] , cj->progeny[3] , -1 ); + if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL ) + DOSUB1( r , ci->progeny[5] , cj->progeny[0] , -1 ); + if ( ci->progeny[5] != NULL && cj->progeny[1] != NULL ) + DOSUB1( r , ci->progeny[5] , cj->progeny[1] , -1 ); + if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) + DOSUB1( r , ci->progeny[5] , cj->progeny[2] , -1 ); + if ( ci->progeny[5] != NULL && cj->progeny[3] != NULL ) + DOSUB1( r , ci->progeny[5] , cj->progeny[3] , -1 ); + if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL ) + DOSUB1( r , ci->progeny[6] , cj->progeny[0] , -1 ); + if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) + DOSUB1( r , ci->progeny[6] , cj->progeny[1] , -1 ); + if ( ci->progeny[6] != NULL && cj->progeny[2] != NULL ) + DOSUB1( r , ci->progeny[6] , cj->progeny[2] , -1 ); + if ( ci->progeny[6] != NULL && cj->progeny[3] != NULL ) + DOSUB1( r , ci->progeny[6] , cj->progeny[3] , -1 ); + if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) + DOSUB1( r , ci->progeny[7] , cj->progeny[0] , -1 ); + if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL ) + DOSUB1( r , ci->progeny[7] , cj->progeny[1] , -1 ); + if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL ) + DOSUB1( r , ci->progeny[7] , cj->progeny[2] , -1 ); + if ( ci->progeny[7] != NULL && cj->progeny[3] != NULL ) + DOSUB1( r , ci->progeny[7] , cj->progeny[3] , -1 ); + break; + + case 5: /* ( 1 , 0 , -1 ) */ + if ( ci->progeny[4] != NULL && cj->progeny[1] != NULL ) + DOSUB1( r , ci->progeny[4] , cj->progeny[1] , -1 ); + if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL ) + DOSUB1( r , ci->progeny[4] , cj->progeny[3] , -1 ); + if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) + DOSUB1( r , ci->progeny[6] , cj->progeny[1] , -1 ); + if ( ci->progeny[6] != NULL && cj->progeny[3] != NULL ) + DOSUB1( r , ci->progeny[6] , cj->progeny[3] , -1 ); + break; + + case 6: /* ( 1 , -1 , 1 ) */ + if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) + DOSUB1( r , ci->progeny[5] , cj->progeny[2] , -1 ); + break; + + case 7: /* ( 1 , -1 , 0 ) */ + if ( ci->progeny[4] != NULL && cj->progeny[2] != NULL ) + DOSUB1( r , ci->progeny[4] , cj->progeny[2] , -1 ); + if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL ) + DOSUB1( r , ci->progeny[4] , cj->progeny[3] , -1 ); + if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) + DOSUB1( r , ci->progeny[5] , cj->progeny[2] , -1 ); + if ( ci->progeny[5] != NULL && cj->progeny[3] != NULL ) + DOSUB1( r , ci->progeny[5] , cj->progeny[3] , -1 ); + break; + + case 8: /* ( 1 , -1 , -1 ) */ + if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL ) + DOSUB1( r , ci->progeny[4] , cj->progeny[3] , -1 ); + break; + + case 9: /* ( 0 , 1 , 1 ) */ + if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL ) + DOSUB1( r , ci->progeny[3] , cj->progeny[0] , -1 ); + if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL ) + DOSUB1( r , ci->progeny[3] , cj->progeny[4] , -1 ); + if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) + DOSUB1( r , ci->progeny[7] , cj->progeny[0] , -1 ); + if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL ) + DOSUB1( r , ci->progeny[7] , cj->progeny[4] , -1 ); + break; + + case 10: /* ( 0 , 1 , 0 ) */ + if ( ci->progeny[2] != NULL && cj->progeny[0] != NULL ) + DOSUB1( r , ci->progeny[2] , cj->progeny[0] , -1 ); + if ( ci->progeny[2] != NULL && cj->progeny[1] != NULL ) + DOSUB1( r , ci->progeny[2] , cj->progeny[1] , -1 ); + if ( ci->progeny[2] != NULL && cj->progeny[4] != NULL ) + DOSUB1( r , ci->progeny[2] , cj->progeny[4] , -1 ); + if ( ci->progeny[2] != NULL && cj->progeny[5] != NULL ) + DOSUB1( r , ci->progeny[2] , cj->progeny[5] , -1 ); + if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL ) + DOSUB1( r , ci->progeny[3] , cj->progeny[0] , -1 ); + if ( ci->progeny[3] != NULL && cj->progeny[1] != NULL ) + DOSUB1( r , ci->progeny[3] , cj->progeny[1] , -1 ); + if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL ) + DOSUB1( r , ci->progeny[3] , cj->progeny[4] , -1 ); + if ( ci->progeny[3] != NULL && cj->progeny[5] != NULL ) + DOSUB1( r , ci->progeny[3] , cj->progeny[5] , -1 ); + if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL ) + DOSUB1( r , ci->progeny[6] , cj->progeny[0] , -1 ); + if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) + DOSUB1( r , ci->progeny[6] , cj->progeny[1] , -1 ); + if ( ci->progeny[6] != NULL && cj->progeny[4] != NULL ) + DOSUB1( r , ci->progeny[6] , cj->progeny[4] , -1 ); + if ( ci->progeny[6] != NULL && cj->progeny[5] != NULL ) + DOSUB1( r , ci->progeny[6] , cj->progeny[5] , -1 ); + if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) + DOSUB1( r , ci->progeny[7] , cj->progeny[0] , -1 ); + if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL ) + DOSUB1( r , ci->progeny[7] , cj->progeny[1] , -1 ); + if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL ) + DOSUB1( r , ci->progeny[7] , cj->progeny[4] , -1 ); + if ( ci->progeny[7] != NULL && cj->progeny[5] != NULL ) + DOSUB1( r , ci->progeny[7] , cj->progeny[5] , -1 ); + break; + + case 11: /* ( 0 , 1 , -1 ) */ + if ( ci->progeny[2] != NULL && cj->progeny[1] != NULL ) + DOSUB1( r , ci->progeny[2] , cj->progeny[1] , -1 ); + if ( ci->progeny[2] != NULL && cj->progeny[5] != NULL ) + DOSUB1( r , ci->progeny[2] , cj->progeny[5] , -1 ); + if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) + DOSUB1( r , ci->progeny[6] , cj->progeny[1] , -1 ); + if ( ci->progeny[6] != NULL && cj->progeny[5] != NULL ) + DOSUB1( r , ci->progeny[6] , cj->progeny[5] , -1 ); + break; + + case 12: /* ( 0 , 0 , 1 ) */ + if ( ci->progeny[1] != NULL && cj->progeny[0] != NULL ) + DOSUB1( r , ci->progeny[1] , cj->progeny[0] , -1 ); + if ( ci->progeny[1] != NULL && cj->progeny[2] != NULL ) + DOSUB1( r , ci->progeny[1] , cj->progeny[2] , -1 ); + if ( ci->progeny[1] != NULL && cj->progeny[4] != NULL ) + DOSUB1( r , ci->progeny[1] , cj->progeny[4] , -1 ); + if ( ci->progeny[1] != NULL && cj->progeny[6] != NULL ) + DOSUB1( r , ci->progeny[1] , cj->progeny[6] , -1 ); + if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL ) + DOSUB1( r , ci->progeny[3] , cj->progeny[0] , -1 ); + if ( ci->progeny[3] != NULL && cj->progeny[2] != NULL ) + DOSUB1( r , ci->progeny[3] , cj->progeny[2] , -1 ); + if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL ) + DOSUB1( r , ci->progeny[3] , cj->progeny[4] , -1 ); + if ( ci->progeny[3] != NULL && cj->progeny[6] != NULL ) + DOSUB1( r , ci->progeny[3] , cj->progeny[6] , -1 ); + if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL ) + DOSUB1( r , ci->progeny[5] , cj->progeny[0] , -1 ); + if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) + DOSUB1( r , ci->progeny[5] , cj->progeny[2] , -1 ); + if ( ci->progeny[5] != NULL && cj->progeny[4] != NULL ) + DOSUB1( r , ci->progeny[5] , cj->progeny[4] , -1 ); + if ( ci->progeny[5] != NULL && cj->progeny[6] != NULL ) + DOSUB1( r , ci->progeny[5] , cj->progeny[6] , -1 ); + if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) + DOSUB1( r , ci->progeny[7] , cj->progeny[0] , -1 ); + if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL ) + DOSUB1( r , ci->progeny[7] , cj->progeny[2] , -1 ); + if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL ) + DOSUB1( r , ci->progeny[7] , cj->progeny[4] , -1 ); + if ( ci->progeny[7] != NULL && cj->progeny[6] != NULL ) + DOSUB1( r , ci->progeny[7] , cj->progeny[6] , -1 ); + break; + + } + + } + + /* Otherwise, compute the pair directly. */ + else + DOPAIR1( r , ci , cj ); + + } /* otherwise, pair interaction. */ + + + #ifdef TIMER_VERBOSE + printf( "runner_DOSUB[%02i]: flags=%i at depth %i took %.3f ms.\n" , r->id , flags , ci->depth , ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000 ); + #else + TIMER_TOC(TIMER_DOSUB); + #endif + + } + + +void DOSUB2 ( struct runner *r , struct cell *restrict ci , struct cell *restrict cj , int sid ) { int j, k; double shift[3]; @@ -858,17 +1813,17 @@ void DOSUB ( struct runner *r , struct cell *restrict ci , struct cell *restrict /* Loop over all progeny. */ for ( k = 0 ; k < 8 ; k++ ) if ( ci->progeny[k] != NULL ) { - DOSUB( r , ci->progeny[k] , NULL , -1 ); + DOSUB2( r , ci->progeny[k] , NULL , -1 ); for ( j = k+1 ; j < 8 ; j++ ) if ( ci->progeny[j] != NULL ) - DOSUB( r , ci->progeny[k] , ci->progeny[j] , -1 ); + DOSUB2( r , ci->progeny[k] , ci->progeny[j] , -1 ); } } /* Otherwsie, compute self-interaction. */ else - DOSELF( r , ci ); + DOSELF2( r , ci ); } /* self-interaction. */ @@ -914,193 +1869,193 @@ void DOSUB ( struct runner *r , struct cell *restrict ci , struct cell *restrict /* Regular sub-cell interactions of a single cell. */ case 0: /* ( 1 , 1 , 1 ) */ if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOSUB( r , ci->progeny[7] , cj->progeny[0] , -1 ); + DOSUB2( r , ci->progeny[7] , cj->progeny[0] , -1 ); break; case 1: /* ( 1 , 1 , 0 ) */ if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL ) - DOSUB( r , ci->progeny[6] , cj->progeny[0] , -1 ); + DOSUB2( r , ci->progeny[6] , cj->progeny[0] , -1 ); if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOSUB( r , ci->progeny[6] , cj->progeny[1] , -1 ); + DOSUB2( r , ci->progeny[6] , cj->progeny[1] , -1 ); if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOSUB( r , ci->progeny[7] , cj->progeny[0] , -1 ); + DOSUB2( r , ci->progeny[7] , cj->progeny[0] , -1 ); if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL ) - DOSUB( r , ci->progeny[7] , cj->progeny[1] , -1 ); + DOSUB2( r , ci->progeny[7] , cj->progeny[1] , -1 ); break; case 2: /* ( 1 , 1 , -1 ) */ if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOSUB( r , ci->progeny[6] , cj->progeny[1] , -1 ); + DOSUB2( r , ci->progeny[6] , cj->progeny[1] , -1 ); break; case 3: /* ( 1 , 0 , 1 ) */ if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL ) - DOSUB( r , ci->progeny[5] , cj->progeny[0] , -1 ); + DOSUB2( r , ci->progeny[5] , cj->progeny[0] , -1 ); if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) - DOSUB( r , ci->progeny[5] , cj->progeny[2] , -1 ); + DOSUB2( r , ci->progeny[5] , cj->progeny[2] , -1 ); if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOSUB( r , ci->progeny[7] , cj->progeny[0] , -1 ); + DOSUB2( r , ci->progeny[7] , cj->progeny[0] , -1 ); if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL ) - DOSUB( r , ci->progeny[7] , cj->progeny[2] , -1 ); + DOSUB2( r , ci->progeny[7] , cj->progeny[2] , -1 ); break; case 4: /* ( 1 , 0 , 0 ) */ if ( ci->progeny[4] != NULL && cj->progeny[0] != NULL ) - DOSUB( r , ci->progeny[4] , cj->progeny[0] , -1 ); + DOSUB2( r , ci->progeny[4] , cj->progeny[0] , -1 ); if ( ci->progeny[4] != NULL && cj->progeny[1] != NULL ) - DOSUB( r , ci->progeny[4] , cj->progeny[1] , -1 ); + DOSUB2( r , ci->progeny[4] , cj->progeny[1] , -1 ); if ( ci->progeny[4] != NULL && cj->progeny[2] != NULL ) - DOSUB( r , ci->progeny[4] , cj->progeny[2] , -1 ); + DOSUB2( r , ci->progeny[4] , cj->progeny[2] , -1 ); if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL ) - DOSUB( r , ci->progeny[4] , cj->progeny[3] , -1 ); + DOSUB2( r , ci->progeny[4] , cj->progeny[3] , -1 ); if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL ) - DOSUB( r , ci->progeny[5] , cj->progeny[0] , -1 ); + DOSUB2( r , ci->progeny[5] , cj->progeny[0] , -1 ); if ( ci->progeny[5] != NULL && cj->progeny[1] != NULL ) - DOSUB( r , ci->progeny[5] , cj->progeny[1] , -1 ); + DOSUB2( r , ci->progeny[5] , cj->progeny[1] , -1 ); if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) - DOSUB( r , ci->progeny[5] , cj->progeny[2] , -1 ); + DOSUB2( r , ci->progeny[5] , cj->progeny[2] , -1 ); if ( ci->progeny[5] != NULL && cj->progeny[3] != NULL ) - DOSUB( r , ci->progeny[5] , cj->progeny[3] , -1 ); + DOSUB2( r , ci->progeny[5] , cj->progeny[3] , -1 ); if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL ) - DOSUB( r , ci->progeny[6] , cj->progeny[0] , -1 ); + DOSUB2( r , ci->progeny[6] , cj->progeny[0] , -1 ); if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOSUB( r , ci->progeny[6] , cj->progeny[1] , -1 ); + DOSUB2( r , ci->progeny[6] , cj->progeny[1] , -1 ); if ( ci->progeny[6] != NULL && cj->progeny[2] != NULL ) - DOSUB( r , ci->progeny[6] , cj->progeny[2] , -1 ); + DOSUB2( r , ci->progeny[6] , cj->progeny[2] , -1 ); if ( ci->progeny[6] != NULL && cj->progeny[3] != NULL ) - DOSUB( r , ci->progeny[6] , cj->progeny[3] , -1 ); + DOSUB2( r , ci->progeny[6] , cj->progeny[3] , -1 ); if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOSUB( r , ci->progeny[7] , cj->progeny[0] , -1 ); + DOSUB2( r , ci->progeny[7] , cj->progeny[0] , -1 ); if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL ) - DOSUB( r , ci->progeny[7] , cj->progeny[1] , -1 ); + DOSUB2( r , ci->progeny[7] , cj->progeny[1] , -1 ); if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL ) - DOSUB( r , ci->progeny[7] , cj->progeny[2] , -1 ); + DOSUB2( r , ci->progeny[7] , cj->progeny[2] , -1 ); if ( ci->progeny[7] != NULL && cj->progeny[3] != NULL ) - DOSUB( r , ci->progeny[7] , cj->progeny[3] , -1 ); + DOSUB2( r , ci->progeny[7] , cj->progeny[3] , -1 ); break; case 5: /* ( 1 , 0 , -1 ) */ if ( ci->progeny[4] != NULL && cj->progeny[1] != NULL ) - DOSUB( r , ci->progeny[4] , cj->progeny[1] , -1 ); + DOSUB2( r , ci->progeny[4] , cj->progeny[1] , -1 ); if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL ) - DOSUB( r , ci->progeny[4] , cj->progeny[3] , -1 ); + DOSUB2( r , ci->progeny[4] , cj->progeny[3] , -1 ); if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOSUB( r , ci->progeny[6] , cj->progeny[1] , -1 ); + DOSUB2( r , ci->progeny[6] , cj->progeny[1] , -1 ); if ( ci->progeny[6] != NULL && cj->progeny[3] != NULL ) - DOSUB( r , ci->progeny[6] , cj->progeny[3] , -1 ); + DOSUB2( r , ci->progeny[6] , cj->progeny[3] , -1 ); break; case 6: /* ( 1 , -1 , 1 ) */ if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) - DOSUB( r , ci->progeny[5] , cj->progeny[2] , -1 ); + DOSUB2( r , ci->progeny[5] , cj->progeny[2] , -1 ); break; case 7: /* ( 1 , -1 , 0 ) */ if ( ci->progeny[4] != NULL && cj->progeny[2] != NULL ) - DOSUB( r , ci->progeny[4] , cj->progeny[2] , -1 ); + DOSUB2( r , ci->progeny[4] , cj->progeny[2] , -1 ); if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL ) - DOSUB( r , ci->progeny[4] , cj->progeny[3] , -1 ); + DOSUB2( r , ci->progeny[4] , cj->progeny[3] , -1 ); if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) - DOSUB( r , ci->progeny[5] , cj->progeny[2] , -1 ); + DOSUB2( r , ci->progeny[5] , cj->progeny[2] , -1 ); if ( ci->progeny[5] != NULL && cj->progeny[3] != NULL ) - DOSUB( r , ci->progeny[5] , cj->progeny[3] , -1 ); + DOSUB2( r , ci->progeny[5] , cj->progeny[3] , -1 ); break; case 8: /* ( 1 , -1 , -1 ) */ if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL ) - DOSUB( r , ci->progeny[4] , cj->progeny[3] , -1 ); + DOSUB2( r , ci->progeny[4] , cj->progeny[3] , -1 ); break; case 9: /* ( 0 , 1 , 1 ) */ if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL ) - DOSUB( r , ci->progeny[3] , cj->progeny[0] , -1 ); + DOSUB2( r , ci->progeny[3] , cj->progeny[0] , -1 ); if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL ) - DOSUB( r , ci->progeny[3] , cj->progeny[4] , -1 ); + DOSUB2( r , ci->progeny[3] , cj->progeny[4] , -1 ); if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOSUB( r , ci->progeny[7] , cj->progeny[0] , -1 ); + DOSUB2( r , ci->progeny[7] , cj->progeny[0] , -1 ); if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL ) - DOSUB( r , ci->progeny[7] , cj->progeny[4] , -1 ); + DOSUB2( r , ci->progeny[7] , cj->progeny[4] , -1 ); break; case 10: /* ( 0 , 1 , 0 ) */ if ( ci->progeny[2] != NULL && cj->progeny[0] != NULL ) - DOSUB( r , ci->progeny[2] , cj->progeny[0] , -1 ); + DOSUB2( r , ci->progeny[2] , cj->progeny[0] , -1 ); if ( ci->progeny[2] != NULL && cj->progeny[1] != NULL ) - DOSUB( r , ci->progeny[2] , cj->progeny[1] , -1 ); + DOSUB2( r , ci->progeny[2] , cj->progeny[1] , -1 ); if ( ci->progeny[2] != NULL && cj->progeny[4] != NULL ) - DOSUB( r , ci->progeny[2] , cj->progeny[4] , -1 ); + DOSUB2( r , ci->progeny[2] , cj->progeny[4] , -1 ); if ( ci->progeny[2] != NULL && cj->progeny[5] != NULL ) - DOSUB( r , ci->progeny[2] , cj->progeny[5] , -1 ); + DOSUB2( r , ci->progeny[2] , cj->progeny[5] , -1 ); if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL ) - DOSUB( r , ci->progeny[3] , cj->progeny[0] , -1 ); + DOSUB2( r , ci->progeny[3] , cj->progeny[0] , -1 ); if ( ci->progeny[3] != NULL && cj->progeny[1] != NULL ) - DOSUB( r , ci->progeny[3] , cj->progeny[1] , -1 ); + DOSUB2( r , ci->progeny[3] , cj->progeny[1] , -1 ); if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL ) - DOSUB( r , ci->progeny[3] , cj->progeny[4] , -1 ); + DOSUB2( r , ci->progeny[3] , cj->progeny[4] , -1 ); if ( ci->progeny[3] != NULL && cj->progeny[5] != NULL ) - DOSUB( r , ci->progeny[3] , cj->progeny[5] , -1 ); + DOSUB2( r , ci->progeny[3] , cj->progeny[5] , -1 ); if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL ) - DOSUB( r , ci->progeny[6] , cj->progeny[0] , -1 ); + DOSUB2( r , ci->progeny[6] , cj->progeny[0] , -1 ); if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOSUB( r , ci->progeny[6] , cj->progeny[1] , -1 ); + DOSUB2( r , ci->progeny[6] , cj->progeny[1] , -1 ); if ( ci->progeny[6] != NULL && cj->progeny[4] != NULL ) - DOSUB( r , ci->progeny[6] , cj->progeny[4] , -1 ); + DOSUB2( r , ci->progeny[6] , cj->progeny[4] , -1 ); if ( ci->progeny[6] != NULL && cj->progeny[5] != NULL ) - DOSUB( r , ci->progeny[6] , cj->progeny[5] , -1 ); + DOSUB2( r , ci->progeny[6] , cj->progeny[5] , -1 ); if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOSUB( r , ci->progeny[7] , cj->progeny[0] , -1 ); + DOSUB2( r , ci->progeny[7] , cj->progeny[0] , -1 ); if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL ) - DOSUB( r , ci->progeny[7] , cj->progeny[1] , -1 ); + DOSUB2( r , ci->progeny[7] , cj->progeny[1] , -1 ); if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL ) - DOSUB( r , ci->progeny[7] , cj->progeny[4] , -1 ); + DOSUB2( r , ci->progeny[7] , cj->progeny[4] , -1 ); if ( ci->progeny[7] != NULL && cj->progeny[5] != NULL ) - DOSUB( r , ci->progeny[7] , cj->progeny[5] , -1 ); + DOSUB2( r , ci->progeny[7] , cj->progeny[5] , -1 ); break; case 11: /* ( 0 , 1 , -1 ) */ if ( ci->progeny[2] != NULL && cj->progeny[1] != NULL ) - DOSUB( r , ci->progeny[2] , cj->progeny[1] , -1 ); + DOSUB2( r , ci->progeny[2] , cj->progeny[1] , -1 ); if ( ci->progeny[2] != NULL && cj->progeny[5] != NULL ) - DOSUB( r , ci->progeny[2] , cj->progeny[5] , -1 ); + DOSUB2( r , ci->progeny[2] , cj->progeny[5] , -1 ); if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOSUB( r , ci->progeny[6] , cj->progeny[1] , -1 ); + DOSUB2( r , ci->progeny[6] , cj->progeny[1] , -1 ); if ( ci->progeny[6] != NULL && cj->progeny[5] != NULL ) - DOSUB( r , ci->progeny[6] , cj->progeny[5] , -1 ); + DOSUB2( r , ci->progeny[6] , cj->progeny[5] , -1 ); break; case 12: /* ( 0 , 0 , 1 ) */ if ( ci->progeny[1] != NULL && cj->progeny[0] != NULL ) - DOSUB( r , ci->progeny[1] , cj->progeny[0] , -1 ); + DOSUB2( r , ci->progeny[1] , cj->progeny[0] , -1 ); if ( ci->progeny[1] != NULL && cj->progeny[2] != NULL ) - DOSUB( r , ci->progeny[1] , cj->progeny[2] , -1 ); + DOSUB2( r , ci->progeny[1] , cj->progeny[2] , -1 ); if ( ci->progeny[1] != NULL && cj->progeny[4] != NULL ) - DOSUB( r , ci->progeny[1] , cj->progeny[4] , -1 ); + DOSUB2( r , ci->progeny[1] , cj->progeny[4] , -1 ); if ( ci->progeny[1] != NULL && cj->progeny[6] != NULL ) - DOSUB( r , ci->progeny[1] , cj->progeny[6] , -1 ); + DOSUB2( r , ci->progeny[1] , cj->progeny[6] , -1 ); if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL ) - DOSUB( r , ci->progeny[3] , cj->progeny[0] , -1 ); + DOSUB2( r , ci->progeny[3] , cj->progeny[0] , -1 ); if ( ci->progeny[3] != NULL && cj->progeny[2] != NULL ) - DOSUB( r , ci->progeny[3] , cj->progeny[2] , -1 ); + DOSUB2( r , ci->progeny[3] , cj->progeny[2] , -1 ); if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL ) - DOSUB( r , ci->progeny[3] , cj->progeny[4] , -1 ); + DOSUB2( r , ci->progeny[3] , cj->progeny[4] , -1 ); if ( ci->progeny[3] != NULL && cj->progeny[6] != NULL ) - DOSUB( r , ci->progeny[3] , cj->progeny[6] , -1 ); + DOSUB2( r , ci->progeny[3] , cj->progeny[6] , -1 ); if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL ) - DOSUB( r , ci->progeny[5] , cj->progeny[0] , -1 ); + DOSUB2( r , ci->progeny[5] , cj->progeny[0] , -1 ); if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) - DOSUB( r , ci->progeny[5] , cj->progeny[2] , -1 ); + DOSUB2( r , ci->progeny[5] , cj->progeny[2] , -1 ); if ( ci->progeny[5] != NULL && cj->progeny[4] != NULL ) - DOSUB( r , ci->progeny[5] , cj->progeny[4] , -1 ); + DOSUB2( r , ci->progeny[5] , cj->progeny[4] , -1 ); if ( ci->progeny[5] != NULL && cj->progeny[6] != NULL ) - DOSUB( r , ci->progeny[5] , cj->progeny[6] , -1 ); + DOSUB2( r , ci->progeny[5] , cj->progeny[6] , -1 ); if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOSUB( r , ci->progeny[7] , cj->progeny[0] , -1 ); + DOSUB2( r , ci->progeny[7] , cj->progeny[0] , -1 ); if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL ) - DOSUB( r , ci->progeny[7] , cj->progeny[2] , -1 ); + DOSUB2( r , ci->progeny[7] , cj->progeny[2] , -1 ); if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL ) - DOSUB( r , ci->progeny[7] , cj->progeny[4] , -1 ); + DOSUB2( r , ci->progeny[7] , cj->progeny[4] , -1 ); if ( ci->progeny[7] != NULL && cj->progeny[6] != NULL ) - DOSUB( r , ci->progeny[7] , cj->progeny[6] , -1 ); + DOSUB2( r , ci->progeny[7] , cj->progeny[6] , -1 ); break; } @@ -1109,7 +2064,7 @@ void DOSUB ( struct runner *r , struct cell *restrict ci , struct cell *restrict /* Otherwise, compute the pair directly. */ else - DOPAIR( r , ci , cj ); + DOPAIR2( r , ci , cj ); } /* otherwise, pair interaction. */ diff --git a/src/runner_iact.h b/src/runner_iact.h index 750f44d02a9f417cdd9b06e90e4109ba774871bc..fff1382fc08bb8070fc1e4fb4545eb56c39eb30c 100644 --- a/src/runner_iact.h +++ b/src/runner_iact.h @@ -89,9 +89,10 @@ __attribute__ ((always_inline)) INLINE static void kernel_deval ( float x , floa } +#ifdef VECTORIZE + __attribute__ ((always_inline)) INLINE static void kernel_deval_vec ( vector *x , vector *w , vector *dw_dx ) { -#ifdef VECTORIZE vector ind, c[kernel_degree+1]; int j, k; @@ -113,15 +114,9 @@ __attribute__ ((always_inline)) INLINE static void kernel_deval_vec ( vector *x w->v = ( x->v * w->v ) + c[k].v; } -#else + } - /* Fake vectorization, i.e. hope the compiler gets it. */ - for ( int k = 0 ; k < VEC_SIZE ; k++ ) - kernel_deval( x.f[k] , &w.f[k] , &dw_dx.f[k] ); - #endif - - } __attribute__ ((always_inline)) INLINE static void kernel_eval ( float x , float *W ) { diff --git a/src/space.c b/src/space.c index bd34b1e0e8c8f5cf8937a0009bddc8e85a0db4ff..51eec54bf80d70041423e317ad6de5d2530e2968 100644 --- a/src/space.c +++ b/src/space.c @@ -104,7 +104,7 @@ void space_rebuild_recycle ( struct space *s , struct cell *c ) { int space_rebuild_recurse ( struct space *s , struct cell *c ) { int k, count, changes = 0, wasmt[8]; - float h, h_limit, h_max = 0.0f; + float h, h_limit, h_max = 0.0f, dt_min = c->parts[0].dt, dt_max = dt_min; struct cell *temp; /* If the cell is already split, check that the split is still ok. */ @@ -124,8 +124,14 @@ int space_rebuild_recurse ( struct space *s , struct cell *c ) { count += 1; if ( h > h_max ) h_max = h; + if ( c->cparts[k].dt < dt_min ) + dt_min = c->cparts[k].dt; + if ( c->cparts[k].dt > dt_max ) + dt_max = c->cparts[k].dt; } c->h_max = h_max; + c->dt_min = dt_min; + c->dt_max = dt_max; /* Un-split? */ if ( count < c->count*space_splitratio || c->count < space_splitsize ) { @@ -209,7 +215,7 @@ int space_rebuild_recurse ( struct space *s , struct cell *c ) { int space_rebuild ( struct space *s , int force , double cell_max ) { - float h_max = s->parts[0].h, h_min = s->parts[0].h; + float h_max = s->cell_min, h_min = s->parts[0].h; int i, j, k, cdim[3]; struct cell *c; struct part *finger; @@ -235,19 +241,22 @@ int space_rebuild ( struct space *s , int force , double cell_max ) { /* Free the old cells, if they were allocated. */ if ( s->cells != NULL ) { - for ( k = 0 ; k < s->nr_cells ; k++ ) + for ( k = 0 ; k < s->nr_cells ; k++ ) { space_rebuild_recycle( s , &s->cells[k] ); + if ( s->cells[k].sort != NULL ) + free( s->cells[k].sort ); + } free( s->cells ); s->maxdepth = 0; } - /* Set the new cell dimensions. */ + /* Set the new cell dimensions only if smaller. */ for ( k = 0 ; k < 3 ; k++ ) { s->cdim[k] = cdim[k]; s->h[k] = s->dim[k] / cdim[k]; s->ih[k] = 1.0 / s->h[k]; } - + /* Allocate the highest level of cells. */ s->tot_cells = s->nr_cells = cdim[0] * cdim[1] * cdim[2]; if ( posix_memalign( (void *)&s->cells , 64 , s->nr_cells * sizeof(struct cell) ) != 0 ) @@ -274,7 +283,8 @@ int space_rebuild ( struct space *s , int force , double cell_max ) { /* Run through the particles and get their cell index. */ - ind = (int *)alloca( sizeof(int) * s->nr_parts ); + if ( ( ind = (int *)malloc( sizeof(int) * s->nr_parts ) ) == NULL ) + error( "Failed to allocate temporary particle indices." ); for ( k = 0 ; k < s->nr_cells ; k++ ) s->cells[ k ].count = 0; for ( k = 0 ; k < s->nr_parts ; k++ ) { @@ -283,7 +293,10 @@ int space_rebuild ( struct space *s , int force , double cell_max ) { } /* Sort the parts according to their cells. */ - parts_sort( s->parts , ind , s->nr_parts , 0 , s->nr_cells ); + parts_sort( s->parts , ind , s->nr_parts , 0 , s->nr_cells ); + + /* We no longer need the indices as of here. */ + free( ind ); /* Update the condensed particle data. */ for ( k = 0 ; k < s->nr_parts ; k++ ) { @@ -295,7 +308,9 @@ int space_rebuild ( struct space *s , int force , double cell_max ) { } /* Hook the cells up to the parts. */ - for ( finger = s->parts , cfinger = s->cparts , k = 0 ; k < s->nr_cells ; k++ ) { + finger = s->parts; + cfinger = s->cparts; + for ( k = 0 ; k < s->nr_cells ; k++ ) { c = &s->cells[ k ]; c->parts = finger; c->cparts = cfinger; @@ -623,7 +638,7 @@ void space_splittasks ( struct space *s ) { continue; /* Make a sub? */ - if ( ci->count < space_subsize ) { + if ( space_dosub && ci->count < space_subsize ) { /* convert to a self-subtask. */ t->type = task_type_sub; @@ -701,22 +716,19 @@ void space_splittasks ( struct space *s ) { sid = 26 - sid; /* Replace by a single sub-task? */ - if ( ci->count < space_subsize && cj->count < space_subsize && + if ( space_dosub && + ci->count < space_subsize && cj->count < space_subsize && sid != 0 && sid != 2 && sid != 6 && sid != 8 ) { /* Make this task a sub task. */ t->type = task_type_sub; t->flags = sid; - /* Make it depend on all the sorts of its sub-cells. */ - for ( j = 0 ; j < 8 ; j++ ) { - if ( ci->progeny[j] != NULL ) - for ( k = 0 ; k < 14 ; k++ ) - task_addunlock( ci->progeny[j]->sorts[k] , t ); - if ( cj->progeny[j] != NULL ) - for ( k = 0 ; k < 14 ; k++ ) - task_addunlock( cj->progeny[j]->sorts[k] , t ); - } + /* Make it depend on all the sorts of its two cells. */ + for ( k = 0 ; k < 14 ; k++ ) + task_addunlock( ci->sorts[k] , t ); + for ( k = 0 ; k < 14 ; k++ ) + task_addunlock( cj->sorts[k] , t ); /* Don't go any further. */ continue; @@ -1070,12 +1082,12 @@ void space_maketasks ( struct space *s , int do_sort ) { /* Allocate the task-list, if needed. */ - if ( s->tasks == NULL || s->tasks_size < s->tot_cells * 43 ) { + if ( s->tasks == NULL || s->tasks_size < s->tot_cells * space_maxtaskspercell ) { if ( s->tasks != NULL ) free( s->tasks ); if ( s->tasks_ind != NULL ) free( s->tasks_ind ); - s->tasks_size = s->tot_cells * 43; + s->tasks_size = s->tot_cells * space_maxtaskspercell; if ( posix_memalign( (void *)&s->tasks , 64 , sizeof(struct task) * s->tasks_size ) != 0 ) error( "Failed to allocate task list." ); if ( ( s->tasks_ind = (int *)malloc( sizeof(int) * s->tasks_size ) ) == NULL ) @@ -1128,6 +1140,34 @@ void space_maketasks ( struct space *s , int do_sort ) { /* Split the tasks. */ space_splittasks( s ); + /* Remove pairs and self-interactions for cells with no parts in dt. */ + for ( k = 0 ; k < s->nr_tasks ; k++ ) { + t = &s->tasks[k]; + if ( t->type == task_type_self ) { + if ( t->ci->dt_min > s->dt_max ) + t->type = task_type_none; + } + else if ( t->type == task_type_pair ) { + if ( t->ci->dt_min > s->dt_max && t->cj->dt_min > s->dt_max ) { + t->type = task_type_none; + for ( j = 0 ; j < 13 ; j++ ) + task_rmunlock_blind( t->ci->sorts[j] , t ); + for ( j = 0 ; j < 13 ; j++ ) + task_rmunlock_blind( t->cj->sorts[j] , t ); + } + } + else if ( t->type == task_type_sub ) { + if ( t->ci->dt_min > s->dt_max && ( t->cj == NULL || t->cj->dt_min > s->dt_max ) ) { + t->type = task_type_none; + for ( j = 0 ; j < 13 ; j++ ) + task_rmunlock_blind( t->ci->sorts[j] , t ); + if ( t->cj != NULL ) + for ( j = 0 ; j < 13 ; j++ ) + task_rmunlock_blind( t->cj->sorts[j] , t ); + } + } + } + /* Remove sort tasks with no dependencies. */ for ( k = 0 ; k < s->nr_tasks ; k++ ) { t = &s->tasks[k]; @@ -1247,7 +1287,7 @@ void space_maketasks ( struct space *s , int do_sort ) { void space_split ( struct space *s , struct cell *c ) { int k, count; - double h, h_limit, h_max = 0.0; + double h, h_limit, h_max = 0.0, dt_min = c->parts[0].dt, dt_max = dt_min; struct cell *temp; /* Check the depth. */ @@ -1264,8 +1304,14 @@ void space_split ( struct space *s , struct cell *c ) { count += 1; if ( h > h_max ) h_max = h; + if ( c->cparts[k].dt < dt_min ) + dt_min = c->cparts[k].dt; + if ( c->cparts[k].dt > dt_max ) + dt_max = c->cparts[k].dt; } c->h_max = h_max; + c->dt_min = dt_min; + c->dt_max = dt_max; /* Split or let it be? */ if ( count > c->count*space_splitratio && c->count > space_splitsize ) { @@ -1420,11 +1466,12 @@ void space_init ( struct space *s , double dim[3] , struct part *parts , int N , s->periodic = periodic; s->nr_parts = N; s->parts = parts; + s->cell_min = h_max; /* Allocate the cparts array. */ if ( posix_memalign( (void *)&s->cparts , 32 , N * sizeof(struct cpart) ) != 0 ) error( "Failed to allocate cparts." ); - + /* Init the space lock. */ if ( lock_init( &s->lock ) != 0 ) error( "Failed to create space spin-lock." ); diff --git a/src/space.h b/src/space.h index c4fb6db77411537942ea5e50314a541dfccf8e2f..0c52849363cae498a2d58930d4906e013bb3dc28 100644 --- a/src/space.h +++ b/src/space.h @@ -26,8 +26,9 @@ #define space_splitratio 0.875 #define space_splitsize_default 400 #define space_subsize_default 1000 -#define space_dosub 1 +#define space_dosub 0 #define space_stretch 1.0 +#define space_maxtaskspercell 43 /* Convert cell location to ID. */ @@ -58,10 +59,10 @@ struct space { double h[3], ih[3]; /* The minimum and maximum cutoff radii. */ - double h_min, h_max; + double h_min, h_max, cell_min; /* Current time step for particles. */ - float dt; + float dt_max; /* Number of cells. */ int nr_cells, tot_cells; @@ -79,9 +80,6 @@ struct space { struct part *parts; struct cpart *cparts; - /* The sortlist data (cells hold pointers to these). */ - struct entry *sortlist; - /* The total number of parts in the space. */ int nr_parts;