diff --git a/src/atomic.h b/src/atomic.h index 882f4288dfc262d7f62d2ad8b25f7a6625ed90b2..5deb82c6d77522ad2d05a24fd937c5e239b3b91f 100644 --- a/src/atomic.h +++ b/src/atomic.h @@ -22,3 +22,4 @@ #define atomic_add(v,i) __sync_fetch_and_add( v , i ) #define atomic_inc(v) atomic_add( v , 1 ) +#define atomic_dec(v) atomic_add( v , -1 ) diff --git a/src/cell.h b/src/cell.h index a560fcb1a54000188b454913b2776308c0fce24b..265c7ef42efa07e8b7c93f97afc143ff70b58747 100644 --- a/src/cell.h +++ b/src/cell.h @@ -88,7 +88,7 @@ struct cell { int nr_density; /* The ghost task to link density to interactions. */ - struct task *ghost; + struct task *ghost, *kick2; /* Number of tasks that are associated with this cell. */ int nr_tasks; @@ -105,6 +105,15 @@ struct cell { /* ID of the previous owner, e.g. runner. */ int owner; + /* Momentum of particles in cell. */ + float mom[3], ang[3]; + + /* Potential and kinetic energy of particles in this cell. */ + float epot, ekin; + + /* Number of particles updated in this cell. */ + int updated; + /* Linking pointer for "memory management". */ struct cell *next; diff --git a/src/engine.c b/src/engine.c index c649d00c7ab591cc5fee84d5c10aa7101c8fc579..43e3c21b4b6c362cf0b7b42168f9640e6fc16759 100644 --- a/src/engine.c +++ b/src/engine.c @@ -33,6 +33,7 @@ /* Local headers. */ #include "cycle.h" +#include "atomic.h" #include "timers.h" #include "const.h" #include "vector.h" @@ -94,7 +95,7 @@ void engine_prepare ( struct engine *e ) { if ( s->tasks[k].skip ) continue; for ( j = 0 ; j < s->tasks[k].nr_unlock_tasks ; j++ ) - __sync_add_and_fetch( &s->tasks[k].unlock_tasks[j]->wait , 1 ); + atomic_inc( &s->tasks[k].unlock_tasks[j]->wait ); } // printf( "engine_prepare: preparing task dependencies took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); @@ -288,6 +289,51 @@ void engine_map_kick_first ( struct cell *c , void *data ) { } +/** + * @brief Mapping function to collect the data from the second kick. + */ + +void engine_collect_kick2 ( struct cell *c ) { + + int k, updated = 0; + float dt_min = FLT_MAX, dt_max = 0.0f, ekin = 0.0f, epot = 0.0f; + float mom[3] = { 0.0f , 0.0f , 0.0f }, ang[3] = { 0.0f , 0.0f , 0.0f }; + struct cell *cp; + + /* If I am a super-cell, return immediately. */ + if ( c->kick2 != NULL ) + return; + + /* If this cell is not split, I'm in trouble. */ + if ( !c->split ) + error( "Cell has no super-cell." ); + + /* Collect the values from the progeny. */ + for ( k = 0 ; k < 8 ; k++ ) + if ( c->progeny[k] != NULL ) { + cp = c->progeny[k]; + engine_collect_kick2( cp ); + dt_min = fminf( dt_min , cp->dt_min ); + dt_max = fmaxf( dt_max , cp->dt_max ); + updated += cp->updated; + ekin += cp->ekin; + epot += cp->epot; + mom[0] += cp->mom[0]; mom[1] += cp->mom[1]; mom[2] += cp->mom[2]; + ang[0] += cp->ang[0]; ang[1] += cp->ang[1]; ang[2] += cp->ang[2]; + } + + /* Store the collected values in the cell. */ + c->dt_min = dt_min; + c->dt_max = dt_max; + c->updated = updated; + c->ekin = ekin; + c->epot = epot; + c->mom[0] = mom[0]; c->mom[1] = mom[1]; c->mom[2] = mom[2]; + c->ang[0] = ang[0]; c->ang[1] = ang[1]; c->ang[2] = ang[2]; + + } + + /** * @brief Compute the force on a single particle brute-force. */ @@ -354,16 +400,13 @@ void engine_single ( double *dim , long long int pid , struct part *__restrict__ void engine_step ( struct engine *e , int sort_queues ) { - int k, nr_parts = e->s->nr_parts; - struct part *restrict parts = e->s->parts, *restrict p; - struct xpart *restrict xp; - float dt = e->dt, hdt = 0.5*dt, dt_step, dt_max, dt_min, ldt_min, ldt_max; - float pdt, h, m, v[3], u; - double x[3]; - double epot = 0.0, ekin = 0.0, lepot, lekin, lmom[3], mom[3] = { 0.0 , 0.0 , 0.0 }; - double lang[3], ang[3] = { 0.0 , 0.0 , 0.0 }; + int k; + float dt = e->dt, dt_step, dt_max = 0.0f, dt_min = FLT_MAX; + float epot = 0.0, ekin = 0.0, mom[3] = { 0.0 , 0.0 , 0.0 }; + float ang[3] = { 0.0 , 0.0 , 0.0 }; // double lent, ent = 0.0; - int threadID, nthreads, count = 0, lcount; + int count = 0; + struct cell *c; TIMER_TIC2 @@ -424,83 +467,19 @@ void engine_step ( struct engine *e , int sort_queues ) { // printParticle( parts , 432626 ); // printParticle( parts , 432628 ); - /* Second kick. */ - TIMER_TIC_ND - dt_min = FLT_MAX; dt_max = 0.0f; - #pragma omp parallel private(p,xp,k,ldt_min,lcount,ldt_max,lmom,lang,lekin,lepot,threadID,nthreads,pdt,v,h,m,x,u) - { - threadID = omp_get_thread_num(); - nthreads = omp_get_num_threads(); - ldt_min = FLT_MAX; ldt_max = 0.0f; - lmom[0] = 0.0; lmom[1] = 0.0; lmom[2] = 0.0; - lang[0] = 0.0; lang[1] = 0.0; lang[2] = 0.0; - lekin = 0.0; lepot = 0.0; /* lent=0.0; */ lcount = 0; - for ( k = nr_parts * threadID / nthreads ; k < nr_parts * (threadID + 1) / nthreads ; k++ ) { - - /* Get a handle on the part. */ - p = &parts[k]; - xp = p->xtras; - - /* Get local copies of particle data. */ - pdt = p->dt; - h = p->h; - m = p->mass; - x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2]; - - /* Scale the derivatives if they're freshly computed. */ - if ( pdt <= dt_step ) { - p->force.h_dt *= h * 0.333333333f; - lcount += 1; - } - - /* Update the particle's time step. */ - p->dt = pdt = const_cfl * h / ( p->force.v_sig ); - - /* Update positions and energies at the half-step. */ - p->v[0] = v[0] = xp->v_old[0] + hdt * p->a[0]; - p->v[1] = v[1] = xp->v_old[1] + hdt * p->a[1]; - p->v[2] = v[2] = xp->v_old[2] + hdt * p->a[2]; - p->u = u = xp->u_old + hdt * p->force.u_dt; - - /* Get the smallest/largest dt. */ - ldt_min = fminf( ldt_min , pdt ); - ldt_max = fmaxf( ldt_max , pdt ); - - /* Collect total energy. */ - lekin += 0.5 * m * ( v[0]*v[0] + v[1]*v[1] + v[2]*v[2] ); - lepot += m * u; - - /* Collect momentum */ - lmom[0] += m * p->v[0]; - lmom[1] += m * p->v[1]; - lmom[2] += m * p->v[2]; - - /* Collect angular momentum */ - lang[0] += m * ( x[1]*v[2] - v[2]*x[1] ); - lang[1] += m * ( x[2]*v[0] - v[0]*x[2] ); - lang[2] += m * ( x[0]*v[1] - v[1]*x[0] ); - - /* Collect entropic function */ - // lent += u * pow( p->rho, 1.f-const_gamma ); - - } - #pragma omp critical - { - dt_min = fminf( dt_min , ldt_min ); - dt_max = fmaxf( dt_max , ldt_max ); - mom[0] += lmom[0]; - mom[1] += lmom[1]; - mom[2] += lmom[2]; - ang[0] += lang[0]; - ang[1] += lang[1]; - ang[2] += lang[2]; - // ent += (const_gamma -1.) * lent; - epot += lepot; - ekin += lekin; - count += lcount; + /* Collect the cell data from the second kick. */ + for ( k = 0 ; k < e->s->nr_cells ; k++ ) { + c = &e->s->cells[k]; + engine_collect_kick2( c ); + dt_min = fminf( dt_min , c->dt_min ); + dt_max = fmaxf( dt_max , c->dt_max ); + ekin += c->ekin; + epot += c->epot; + count += c->updated; + mom[0] += c->mom[0]; mom[1] += c->mom[1]; mom[2] += c->mom[2]; + ang[0] += c->ang[0]; ang[1] += c->ang[1]; ang[2] += c->ang[2]; } - } - TIMER_TOC( timer_kick2 ); + e->dt_min = dt_min; // printParticle( parts , 432626 ); printf( "engine_step: dt_min/dt_max is %e/%e.\n" , dt_min , dt_max ); fflush(stdout); diff --git a/src/queue.c b/src/queue.c index d2c47b4d17833109bed04bfb083d7f5d8db9ea7f..82bf86c16943a7da8368c5fdb83f2409c6272059 100644 --- a/src/queue.c +++ b/src/queue.c @@ -309,8 +309,8 @@ struct task *queue_gettask ( struct queue *q , int rid , int blocking , int keep continue; /* Get the score for this task. */ - if ( res->type == task_type_self || res->type == task_type_ghost || res->type == task_type_sort || ( res->type == task_type_sub && res->cj == NULL ) ) - score = ( res->ci->owner == rid ); + if ( res->cj == NULL ) + score = 2 * ( res->ci->owner == rid ); else score = ( res->ci->owner == rid ) + ( res->cj->owner == rid ); if ( score <= score_best ) @@ -352,8 +352,7 @@ struct task *queue_gettask ( struct queue *q , int rid , int blocking , int keep hits += 1; /* Should we bother looking any farther? */ - if ( ( qtasks[ qtid[ ind_best ] ].cj == NULL && score_best == 1 ) || - score_best == 2 ); + if ( score_best == 2 ); break; } /* loop over the task IDs. */ diff --git a/src/runner.c b/src/runner.c index adbf48ddd2d5de1b094492abfc7e51edbaa25500..deba874d50ab98f8b7b944d0fc61f5560cc467ba 100644 --- a/src/runner.c +++ b/src/runner.c @@ -33,6 +33,7 @@ /* Local headers. */ #include "cycle.h" +#include "atomic.h" #include "timers.h" #include "const.h" #include "lock.h" @@ -331,7 +332,7 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags , int clock ) * @brief Intermediate task between density and force * * @param r The runner thread. - * @param c THe cell. + * @param c The cell. */ void runner_doghost ( struct runner *r , struct cell *c ) { @@ -506,6 +507,96 @@ void runner_doghost ( struct runner *r , struct cell *c ) { #endif } + + +/** + * @brief Compute the second kick of the given cell. + * + * @param r The runner thread. + * @param c The cell. + */ + +void runner_dokick2 ( struct runner *r , struct cell *c ) { + + int k, count = 0, nr_parts = c->count; + float dt_min = FLT_MAX, dt_max = 0.0f, ekin = 0.0f, epot = 0.0f; + float mom[3] = { 0.0f , 0.0f , 0.0f }, ang[3] = { 0.0f , 0.0f , 0.0f }; + float x[3], v[3], u, h, pdt, m; + float dt_step = r->e->dt_step, dt = r->e->dt, hdt = 0.5f*dt; + struct part *p, *parts = c->parts; + struct xpart *xp; + + TIMER_TIC + + /* Loop over the particles and kick them. */ + for ( k = 0 ; k < nr_parts ; k++ ) { + + /* Get a handle on the part. */ + p = &parts[k]; + xp = p->xtras; + + /* Get local copies of particle data. */ + pdt = p->dt; + h = p->h; + m = p->mass; + x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2]; + + /* Scale the derivatives if they're freshly computed. */ + if ( pdt <= dt_step ) { + p->force.h_dt *= h * 0.333333333f; + count += 1; + } + + /* Update the particle's time step. */ + p->dt = pdt = const_cfl * h / ( p->force.v_sig ); + + /* Update positions and energies at the half-step. */ + p->v[0] = v[0] = xp->v_old[0] + hdt * p->a[0]; + p->v[1] = v[1] = xp->v_old[1] + hdt * p->a[1]; + p->v[2] = v[2] = xp->v_old[2] + hdt * p->a[2]; + p->u = u = xp->u_old + hdt * p->force.u_dt; + + /* Get the smallest/largest dt. */ + dt_min = fminf( dt_min , pdt ); + dt_max = fmaxf( dt_max , pdt ); + + /* Collect total energy. */ + ekin += 0.5 * m * ( v[0]*v[0] + v[1]*v[1] + v[2]*v[2] ); + epot += m * u; + + /* Collect momentum */ + mom[0] += m * p->v[0]; + mom[1] += m * p->v[1]; + mom[2] += m * p->v[2]; + + /* Collect angular momentum */ + ang[0] += m * ( x[1]*v[2] - v[2]*x[1] ); + ang[1] += m * ( x[2]*v[0] - v[0]*x[2] ); + ang[2] += m * ( x[0]*v[1] - v[1]*x[0] ); + + /* Collect entropic function */ + // lent += u * pow( p->rho, 1.f-const_gamma ); + + } + + #ifdef TIMER_VERBOSE + printf( "runner_dokick2[%02i]: %i parts at depth %i took %.3f ms.\n" , + r->id , c->count , c->depth , + ((double)TIMER_TOC(timer_kick2)) / CPU_TPS * 1000 ); fflush(stdout); + #else + TIMER_TOC(timer_kick2); + #endif + + /* Store the computed values in the cell. */ + c->dt_min = dt_min; + c->dt_max = dt_max; + c->updated = count; + c->ekin = ekin; + c->epot = epot; + c->mom[0] = mom[0]; c->mom[1] = mom[1]; c->mom[2] = mom[2]; + c->ang[0] = ang[0]; c->ang[1] = ang[1]; c->ang[2] = ang[2]; + + } /** @@ -660,14 +751,17 @@ void *runner_main ( void *data ) { if ( ci->super == ci ) runner_doghost( r , ci ); break; + case task_type_kick2: + runner_dokick2( r , ci ); + break; default: error( "Unknown task type." ); } /* Resolve any dependencies. */ for ( k = 0 ; k < t->nr_unlock_tasks ; k++ ) - if ( __sync_fetch_and_sub( &t->unlock_tasks[k]->wait , 1 ) == 0 ) - abort(); + if ( atomic_dec( &t->unlock_tasks[k]->wait ) == 0 ) + error( "Task negative wait." ); } /* main loop. */ diff --git a/src/space.c b/src/space.c index 08ac0b81e24fb34b0873f0dfe4b7e3a1f6e17d5b..20cd589cb4d0ff9e5997f1b4721dc7069542df5f 100644 --- a/src/space.c +++ b/src/space.c @@ -106,8 +106,8 @@ int space_marktasks ( struct space *s ) { /* Single-cell task? */ else if ( t->type == task_type_self || - t->type == task_type_ghost || - ( t->type == task_type_sub && t->cj == NULL ) ) { + t->type == task_type_ghost || + ( t->type == task_type_sub && t->cj == NULL ) ) { /* Set this task's skip. */ t->skip = ( t->ci->dt_min > dt_step ); @@ -736,6 +736,8 @@ void space_map_clearsort ( struct cell *c , void *data ) { * Looks for the super cell, e.g. the highest-level cell above each * cell for which a pair is defined. All ghosts below this cell will * depend on the ghost of their parents (sounds spooky, but it isn't). + * + * A kick2-task is appended to each super cell. */ void space_map_mkghosts ( struct cell *c , void *data ) { @@ -754,11 +756,15 @@ void space_map_mkghosts ( struct cell *c , void *data ) { if ( c->super != c || c->nr_tasks > 0 ) c->ghost = space_addtask( s , task_type_ghost , task_subtype_none , 0 , 0 , c , NULL , 0 ); + /* Append a kick task if we are the active super cell. */ + if ( c->super == c && c->nr_tasks > 0 ) + c->kick2 = space_addtask( s , task_type_kick2 , task_subtype_none , 0 , 0 , c , NULL , 0 ); + /* If we are not the super cell ourselves, make our ghost depend on our parent cell. */ if ( c->super != c ) task_addunlock( c->parent->ghost , c->ghost ); - + } @@ -1601,7 +1607,9 @@ void space_maketasks ( struct space *s , int do_sort ) { /* Append a ghost task to each cell. */ space_map_cells_pre( s , 1 , &space_map_mkghosts , s ); - /* Run through the tasks and make force tasks for each density task. */ + /* Run through the tasks and make force tasks for each density task. + Each force task depends on the cell ghosts and unlocks the kick2 task + of its super-cell. */ kk = s->nr_tasks; #pragma omp parallel for private(t,t2) for ( k = 0 ; k < kk ; k++ ) { @@ -1618,6 +1626,7 @@ void space_maketasks ( struct space *s , int do_sort ) { task_addunlock( t , t->ci->super->ghost ); t2 = space_addtask( s , task_type_self , task_subtype_force , 0 , 0 , t->ci , NULL , 0 ); task_addunlock( t->ci->ghost , t2 ); + task_addunlock( t2 , t->ci->super->kick2 ); } /* Otherwise, pair interaction? */ @@ -1628,17 +1637,23 @@ void space_maketasks ( struct space *s , int do_sort ) { t2 = space_addtask( s , task_type_pair , task_subtype_force , 0 , 0 , t->ci , t->cj , 0 ); task_addunlock( t->ci->ghost , t2 ); task_addunlock( t->cj->ghost , t2 ); + task_addunlock( t2 , t->ci->super->kick2 ); + if ( t->ci->super != t->cj->super ) + task_addunlock( t2 , t->cj->super->kick2 ); } /* Otherwise, sub interaction? */ else if ( t->type == task_type_sub && t->subtype == task_subtype_density ) { task_addunlock( t , t->ci->super->ghost ); - if ( t->cj != NULL ) + if ( t->cj != NULL && t->ci->super != t->cj->super ) task_addunlock( t , t->cj->super->ghost ); t2 = space_addtask( s , task_type_sub , task_subtype_force , t->flags , 0 , t->ci , t->cj , 0 ); task_addunlock( t->ci->ghost , t2 ); if ( t->cj != NULL ) task_addunlock( t->cj->ghost , t2 ); + task_addunlock( t2 , t->ci->super->kick2 ); + if ( t->cj != NULL && t->ci->super != t->cj->super ) + task_addunlock( t2 , t->cj->super->kick2 ); } } diff --git a/src/space.h b/src/space.h index fed17faec562675b86d82c043914af88b240fe4e..7d886d98e4321eb7195aef37cef056437282f3b1 100644 --- a/src/space.h +++ b/src/space.h @@ -28,7 +28,7 @@ #define space_subsize_default 1000 #define space_dosub 0 #define space_stretch 1.05 -#define space_maxtaskspercell 30 +#define space_maxtaskspercell 31 /* Convert cell location to ID. */ diff --git a/src/task.h b/src/task.h index 696fe889a1f3919c059424106216796d4b8d580a..26068aa4000133505f11ff194357dc6cac898e42 100644 --- a/src/task.h +++ b/src/task.h @@ -31,6 +31,7 @@ enum task_types { task_type_pair, task_type_sub, task_type_ghost, + task_type_kick2, task_type_count };