/******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . * ******************************************************************************/ /* Config parameters. */ #include "../config.h" /* Some standard headers. */ #include #include #include #include #include #include #include #include #include /* Local headers. */ #include "cycle.h" #include "timers.h" #include "const.h" #include "lock.h" #include "task.h" #include "part.h" #include "cell.h" #include "space.h" #include "queue.h" #include "engine.h" #include "runner.h" #include "runner_iact.h" #include "error.h" /* Convert cell location to ID. */ #define cell_getid( cdim , i , j , k ) ( (int)(k) + (cdim)[2]*( (int)(j) + (cdim)[1]*(int)(i) ) ) /* The counters. */ int runner_counter[ runner_counter_count ]; const float runner_shift[13*3] = { 5.773502691896258e-01 , 5.773502691896258e-01 , 5.773502691896258e-01 , 7.071067811865475e-01 , 7.071067811865475e-01 , 0.0 , 5.773502691896258e-01 , 5.773502691896258e-01 , -5.773502691896258e-01 , 7.071067811865475e-01 , 0.0 , 7.071067811865475e-01 , 1.0 , 0.0 , 0.0 , 7.071067811865475e-01 , 0.0 , -7.071067811865475e-01 , 5.773502691896258e-01 , -5.773502691896258e-01 , 5.773502691896258e-01 , 7.071067811865475e-01 , -7.071067811865475e-01 , 0.0 , 5.773502691896258e-01 , -5.773502691896258e-01 , -5.773502691896258e-01 , 0.0 , 7.071067811865475e-01 , 7.071067811865475e-01 , 0.0 , 1.0 , 0.0 , 0.0 , 7.071067811865475e-01 , -7.071067811865475e-01 , 0.0 , 0.0 , 1.0 , }; const char runner_flip[27] = { 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }; /* Import the density functions. */ #define FUNCTION density #include "runner_doiact.h" #undef FUNCTION #define FUNCTION force #include "runner_doiact.h" /** * @brief Sort the entries in ascending order using QuickSort. * * @param sort The entries * @param N The number of entries. */ void runner_dosort_ascending ( struct entry *sort , int N ) { struct { short int lo, hi; } qstack[10]; int qpos, i, j, lo, hi, imin; struct entry temp; float pivot; /* Sort parts in cell_i in decreasing order with quicksort */ qstack[0].lo = 0; qstack[0].hi = N - 1; qpos = 0; while ( qpos >= 0 ) { lo = qstack[qpos].lo; hi = qstack[qpos].hi; qpos -= 1; if ( hi - lo < 15 ) { for ( i = lo ; i < hi ; i++ ) { imin = i; for ( j = i+1 ; j <= hi ; j++ ) if ( sort[j].d < sort[imin].d ) imin = j; if ( imin != i ) { temp = sort[imin]; sort[imin] = sort[i]; sort[i] = temp; } } } else { pivot = sort[ ( lo + hi ) / 2 ].d; i = lo; j = hi; while ( i <= j ) { while ( sort[i].d < pivot ) i++; while ( sort[j].d > pivot ) j--; if ( i <= j ) { if ( i < j ) { temp = sort[i]; sort[i] = sort[j]; sort[j] = temp; } i += 1; j -= 1; } } if ( j > ( lo + hi ) / 2 ) { if ( lo < j ) { qpos += 1; qstack[qpos].lo = lo; qstack[qpos].hi = j; } if ( i < hi ) { qpos += 1; qstack[qpos].lo = i; qstack[qpos].hi = hi; } } else { if ( i < hi ) { qpos += 1; qstack[qpos].lo = i; qstack[qpos].hi = hi; } if ( lo < j ) { qpos += 1; qstack[qpos].lo = lo; qstack[qpos].hi = j; } } } } } /** * @brief Sort the particles in the given cell along all cardinal directions. * * @param r The #runner. * @param c The #cell. * @param flags Cell flag. * @param clock Flag indicating whether to record the timing or not, needed * for recursive calls. */ void runner_dosort ( struct runner *r , struct cell *c , int flags , int clock ) { struct entry *finger; struct entry *fingers[8]; struct cpart *cparts = c->cparts; int j, k, count = c->count; int i, ind, off[8], inds[8], temp_i, missing; // float shift[3]; float buff[8], px[3]; TIMER_TIC /* Clean-up the flags, i.e. filter out what's already been sorted. */ flags &= ~c->sorted; if ( flags == 0 ) return; /* start by allocating the entry arrays. */ if ( lock_lock( &c->lock ) != 0 ) error( "Failed to lock cell." ); if ( c->sort == NULL || c->sortsize < c->count ) { if ( c->sort != NULL ) free( c->sort ); c->sortsize = c->count * 1.1; if ( ( c->sort = (struct entry *)malloc( sizeof(struct entry) * (c->sortsize + 1) * 13 ) ) == NULL ) error( "Failed to allocate sort memory." ); } if ( lock_unlock( &c->lock ) != 0 ) error( "Failed to unlock cell." ); /* Does this cell have any progeny? */ if ( c->split ) { /* Fill in the gaps within the progeny. */ for ( k = 0 ; k < 8 ; k++ ) { if ( c->progeny[k] == NULL ) continue; if ( c->progeny[k]->sorts == NULL ) missing = flags; else missing = ( c->progeny[k]->sorts->flags ^ flags ) & flags; if ( missing ) runner_dosort( r , c->progeny[k] , missing , 0 ); } /* Loop over the 13 different sort arrays. */ for ( j = 0 ; j < 13 ; j++ ) { /* Has this sort array been flagged? */ if ( !( flags & (1 << j) ) ) continue; /* Init the particle index offsets. */ for ( off[0] = 0 , k = 1 ; k < 8 ; k++ ) if ( c->progeny[k-1] != NULL ) off[k] = off[k-1] + c->progeny[k-1]->count; else off[k] = off[k-1]; /* Init the entries and indices. */ for ( k = 0 ; k < 8 ; k++ ) { inds[k] = k; if ( c->progeny[k] != NULL && c->progeny[k]->count > 0 ) { fingers[k] = &c->progeny[k]->sort[ j*(c->progeny[k]->count + 1) ]; buff[k] = fingers[k]->d; off[k] = off[k]; } else buff[k] = FLT_MAX; } /* Sort the buffer. */ for ( i = 0 ; i < 7 ; i++ ) for ( k = i+1 ; k < 8 ; k++ ) if ( buff[ inds[k] ] < buff[ inds[i] ] ) { temp_i = inds[i]; inds[i] = inds[k]; inds[k] = temp_i; } /* For each entry in the new sort list. */ finger = &c->sort[ j*(count + 1) ]; for ( ind = 0 ; ind < count ; ind++ ) { /* Copy the minimum into the new sort array. */ finger[ind].d = buff[inds[0]]; finger[ind].i = fingers[inds[0]]->i + off[inds[0]]; /* Update the buffer. */ fingers[inds[0]] += 1; buff[inds[0]] = fingers[inds[0]]->d; /* Find the smallest entry. */ for ( k = 1 ; k < 8 && buff[inds[k]] < buff[inds[k-1]] ; k++ ) { temp_i = inds[k-1]; inds[k-1] = inds[k]; inds[k] = temp_i; } } /* Merge. */ /* Add a sentinel. */ c->sort[ j*(c->count + 1) + c->count ].d = FLT_MAX; c->sort[ j*(c->count + 1) + c->count ].i = 0; /* Mark as sorted. */ c->sorted |= ( 1 << j ); } /* loop over sort arrays. */ } /* progeny? */ /* Otherwise, just sort. */ else { /* Fill the sort array. */ for ( k = 0 ; k < count ; k++ ) { px[0] = cparts[k].x[0]; px[1] = cparts[k].x[1]; px[2] = cparts[k].x[2]; for ( j = 0 ; j < 13 ; j++ ) if ( flags & (1 << j) ) { c->sort[ j*(count + 1) + k].i = k; c->sort[ j*(count + 1) + k].d = px[0]*runner_shift[ 3*j + 0 ] + px[1]*runner_shift[ 3*j + 1 ] + px[2]*runner_shift[ 3*j + 2 ]; } } /* Add the sentinel and sort. */ for ( j = 0 ; j < 13 ; j++ ) if ( flags & (1 << j) ) { c->sort[ j*(count + 1) + c->count ].d = FLT_MAX; c->sort[ j*(count + 1) + c->count ].i = 0; runner_dosort_ascending( &c->sort[ j*(count + 1) ] , c->count ); c->sorted |= ( 1 << j ); } } /* Verify the sorting. */ /* for ( j = 0 ; j < 13 ; j++ ) { if ( !( flags & (1 << j) ) ) continue; finger = &c->sort[ j*(c->count + 1) ]; for ( k = 1 ; k < c->count ; k++ ) { if ( finger[k].d < finger[k-1].d ) error( "Sorting failed, ascending array." ); if ( finger[k].i >= c->count ) error( "Sorting failed, indices borked." ); } } */ #ifdef TIMER_VERBOSE printf( "runner_dosort[%02i]: %i parts at depth %i (flags = %i%i%i%i%i%i%i%i%i%i%i%i%i) took %.3f ms.\n" , r->id , c->count , c->depth , (flags & 0x1000) >> 12 , (flags & 0x800) >> 11 , (flags & 0x400) >> 10 , (flags & 0x200) >> 9 , (flags & 0x100) >> 8 , (flags & 0x80) >> 7 , (flags & 0x40) >> 6 , (flags & 0x20) >> 5 , (flags & 0x10) >> 4 , (flags & 0x8) >> 3 , (flags & 0x4) >> 2 , (flags & 0x2) >> 1 , (flags & 0x1) >> 0 , ((double)TIMER_TOC(timer_dosort)) / CPU_TPS * 1000 ); fflush(stdout); #else if ( clock ) TIMER_TOC(timer_dosort); #endif } /** * @brief Intermediate task between density and force * * @param r The runner thread. * @param c THe cell. */ void runner_doghost ( struct runner *r , struct cell *c ) { struct part *p; struct cpart *cp; struct cell *finger; int i, k, redo, count = c->count; int *pid; float h, hg, ihg, ihg2, ihg4, h_corr; float normDiv_v, normCurl_v; float dt_step = r->e->dt_step; TIMER_TIC /* Recurse? */ if ( c->split ) { for ( k = 0 ; k < 8 ; k++ ) if ( c->progeny[k] != NULL ) runner_doghost( r , c->progeny[k] ); return; } /* Init the IDs that have to be updated. */ if ( ( pid = (int *)alloca( sizeof(int) * count ) ) == NULL ) error( "Call to alloca failed." ); for ( k = 0 ; k < count ; k++ ) pid[k] = k; /* While there are particles that need to be updated... */ while ( count > 0 ) { /* Reset the redo-count. */ redo = 0; /* Loop over the parts in this cell. */ for ( i = 0 ; i < count ; i++ ) { /* Get a direct pointer on the part. */ p = &c->parts[ pid[i] ]; cp = &c->cparts[ pid[i] ]; /* Is this part within the timestep? */ if ( cp->dt <= dt_step ) { /* Some smoothing length multiples. */ h = p->h; hg = kernel_gamma * h; ihg = 1.0f / hg; ihg2 = ihg * ihg; ihg4 = ihg2 * ihg2; /* Adjust the computed weighted number of neighbours. */ p->density.wcount += kernel_wroot; /* Final operation on the density. */ p->rho = ihg * ihg2 * ( p->rho + p->mass*kernel_root ); p->rho_dh *= ihg4; /* Compute the smoothing length update (Newton step). */ if ( p->density.wcount_dh == 0.0f ) h_corr = p->h; else { h_corr = ( const_nwneigh - p->density.wcount ) / p->density.wcount_dh; /* Truncate to the range [ -p->h/2 , p->h ]. */ h_corr = fminf( h_corr , h ); h_corr = fmaxf( h_corr , -h/2 ); } /* Apply the correction to p->h. */ p->h += h_corr; cp->h = p->h; /* Did we get the right number density? */ if ( p->density.wcount > const_nwneigh + const_delta_nwneigh || p->density.wcount < const_nwneigh - const_delta_nwneigh ) { // printf( "runner_doghost: particle %lli (h=%e,depth=%i) has bad wcount=%.3f.\n" , p->id , p->h , c->depth , p->density.wcount ); fflush(stdout); // p->h += ( p->density.wcount + kernel_root - const_nwneigh ) / p->density.wcount_dh; pid[redo] = pid[i]; redo += 1; p->density.wcount = 0.0; p->density.wcount_dh = 0.0; p->rho = 0.0; p->rho_dh = 0.0; p->density.div_v = 0.0; for ( k=0 ; k < 3 ; k++) p->density.curl_v[k] = 0.0; continue; } /* Pre-compute some stuff for the balsara switch. */ normDiv_v = fabs( p->density.div_v / p->rho * ihg4 ); normCurl_v = sqrtf( p->density.curl_v[0] * p->density.curl_v[0] + p->density.curl_v[1] * p->density.curl_v[1] + p->density.curl_v[2] * p->density.curl_v[2] ) / p->rho * ihg4; /* As of here, particle force variables will be set. Do _NOT_ try to read any particle density variables! */ /* Compute this particle's sound speed. */ p->force.c = sqrtf( const_gamma * ( const_gamma - 1.0f ) * p->u ); /* Compute the P/Omega/rho2. */ p->force.POrho2 = p->u * ( const_gamma - 1.0f ) / ( p->rho + hg * p->rho_dh / 3.0f ); /* Balsara switch */ p->force.balsara = normCurl_v / ( normDiv_v + normCurl_v + 0.0001f * p->force.c * ihg ); /* Reset the acceleration. */ for ( k = 0 ; k < 3 ; k++ ) p->a[k] = 0.0f; /* Reset the time derivatives. */ p->force.u_dt = 0.0f; p->force.h_dt = 0.0f; p->force.v_sig = 0.0f; } } /* Re-set the counter for the next loop (potentially). */ count = redo; if ( count > 0 ) { // error( "Bad smoothing length, fixing this isn't implemented yet." ); /* Climb up the cell hierarchy. */ for ( finger = c ; finger != NULL ; finger = finger->parent ) { /* Run through this cell's density interactions. */ for ( k = 0 ; k < finger->nr_density ; k++ ) { /* Self-interaction? */ if ( finger->density[k]->type == task_type_self ) runner_doself_subset_density( r , finger , c->parts , pid , count ); /* Otherwise, pair interaction? */ else if ( finger->density[k]->type == task_type_pair ) { /* Left or right? */ if ( finger->density[k]->ci == finger ) runner_dopair_subset_density( r , finger , c->parts , pid , count , finger->density[k]->cj ); else runner_dopair_subset_density( r , finger , c->parts , pid , count , finger->density[k]->ci ); } /* Otherwise, sub interaction? */ else if ( finger->density[k]->type == task_type_sub ) { /* Left or right? */ if ( finger->density[k]->ci == finger ) runner_dosub_subset_density( r , finger , c->parts , pid , count , finger->density[k]->cj , -1 ); else runner_dosub_subset_density( r , finger , c->parts , pid , count , finger->density[k]->ci , -1 ); } } } } } #ifdef TIMER_VERBOSE printf( "runner_doghost[%02i]: %i parts at depth %i took %.3f ms.\n" , r->id , c->count , c->depth , ((double)TIMER_TOC(timer_doghost)) / CPU_TPS * 1000 ); fflush(stdout); #else TIMER_TOC(timer_doghost); #endif } /** * @brief The #runner main thread routine. * * @param data A pointer to this thread's data. */ void *runner_main ( void *data ) { struct runner *r = (struct runner *)data; struct engine *e = r->e; int threadID = r->id; int k, qid, naq, keep, tpq; struct queue *queues[ e->nr_queues ], *myq; struct task *t; struct cell *ci, *cj; unsigned int myseed = rand() + r->id; #ifdef TIMER ticks stalled; #endif /* Main loop. */ while ( 1 ) { /* Wait at the barrier. */ engine_barrier( e ); /* Set some convenient local data. */ keep = e->policy & engine_policy_keep; myq = &e->queues[ threadID * e->nr_queues / e->nr_threads ]; tpq = ceil( ((double)e->nr_threads) / e->nr_queues ); #ifdef TIMER stalled = 0; #endif /* Set up the local list of active queues. */ naq = e->nr_queues; for ( k = 0 ; k < naq ; k++ ) queues[k] = &e->queues[k]; /* Set up the local list of active queues. */ naq = e->nr_queues; for ( k = 0 ; k < naq ; k++ ) queues[k] = &e->queues[k]; /* Loop while there are tasks... */ while ( 1 ) { /* Remove any inactive queues. */ for ( k = 0 ; k < naq ; k++ ) if ( queues[k]->next == queues[k]->count ) { naq -= 1; queues[k] = queues[naq]; k -= 1; } if ( naq == 0 ) break; /* Get a task, how and from where depends on the policy. */ TIMER_TIC t = NULL; if ( e->nr_queues == 1 ) { t = queue_gettask_old( &e->queues[0] , 1 , 0 ); } else if ( e->policy & engine_policy_steal ) { if ( ( myq->next == myq->count ) || ( t = queue_gettask( myq , r->id , 0 , 0 ) ) == NULL ) { TIMER_TIC2 qid = rand_r( &myseed ) % naq; keep = ( e->policy & engine_policy_keep ) && ( myq->count <= myq->size-tpq ); if ( myq->next == myq->count ) COUNT(runner_counter_steal_empty); else COUNT(runner_counter_steal_stall); t = queue_gettask( queues[qid] , r->id , 0 , keep ); if ( t != NULL && keep ) queue_insert( myq , t ); TIMER_TOC2(timer_steal); } } else if ( e->policy & engine_policy_rand ) { qid = rand_r( &myseed ) % naq; t = queue_gettask( queues[qid] , r->id , e->policy & engine_policy_block , 0 ); } else { t = queue_gettask( &e->queues[threadID] , r->id , e->policy & engine_policy_block , 0 ); } TIMER_TOC(timer_getpair); /* Did I get anything? */ if ( t == NULL ) { COUNT(runner_counter_stall); #ifdef TIMER if ( !stalled ) stalled = getticks(); #endif continue; } #ifdef TIMER else if ( stalled ) { timers_toc( timer_stalled , stalled ); #ifdef TIMER_VERBOSE printf( "runner_main[%02i]: stalled %.3f ms\n" , r->id , ((double)stalled) / CPU_TPS * 1000 ); fflush(stdout); #endif stalled = 0; } #endif /* Get the cells. */ ci = t->ci; cj = t->cj; /* Different types of tasks... */ switch ( t->type ) { case task_type_self: if ( t->subtype == task_subtype_density ) runner_doself1_density( r , ci ); else if ( t->subtype == task_subtype_force ) runner_doself2_force( r , ci ); else error( "Unknown task subtype." ); cell_unlocktree( ci ); break; case task_type_pair: if ( t->subtype == task_subtype_density ) runner_dopair1_density( r , ci , cj ); else if ( t->subtype == task_subtype_force ) runner_dopair2_force( r , ci , cj ); else error( "Unknown task subtype." ); cell_unlocktree( ci ); cell_unlocktree( cj ); break; case task_type_sort: runner_dosort( r , ci , t->flags , 1 ); break; case task_type_sub: if ( t->subtype == task_subtype_density ) runner_dosub1_density( r , ci , cj , t->flags ); else if ( t->subtype == task_subtype_force ) runner_dosub2_force( r , ci , cj , t->flags ); else error( "Unknown task subtype." ); cell_unlocktree( ci ); if ( cj != NULL ) cell_unlocktree( cj ); break; case task_type_ghost: if ( ci->super == ci ) runner_doghost( r , ci ); break; default: error( "Unknown task type." ); } /* Resolve any dependencies. */ for ( k = 0 ; k < t->nr_unlock_tasks ; k++ ) if ( __sync_fetch_and_sub( &t->unlock_tasks[k]->wait , 1 ) == 0 ) abort(); } /* main loop. */ /* Any leftover stalls? */ #ifdef TIMER if ( stalled ) { timers_toc( timer_stalled , stalled ); #ifdef TIMER_VERBOSE printf( "runner_main[%02i]: stalled %.3f ms\n" , r->id , ((double)stalled) / CPU_TPS * 1000 ); fflush(stdout); #endif stalled = 0; } #endif } /* Be kind, rewind. */ return NULL; }