diff --git a/src/Makefile.am b/src/Makefile.am index 6dc507a5bfb846fc19cd6d879e1c7721ca23fda9..6f1e27f4d59e81d3755ce27ab4d2cb6a90eee9b3 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -26,8 +26,9 @@ AM_LDFLAGS = $(LAPACK_LIBS) $(BLAS_LIBS) -version-info 0:0:0 # Build the libgadgetsmp library lib_LTLIBRARIES = libgadgetsmp.la -libgadgetsmp_la_SOURCES = space.c runner.c queue.c task.c cell.c +libgadgetsmp_la_SOURCES = space.c runner.c queue.c task.c cell.c engine.c # List required headers -include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h gadgetsmp.h +include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \ + engine.h gadgetsmp.h diff --git a/src/engine.c b/src/engine.c new file mode 100644 index 0000000000000000000000000000000000000000..22fd0562c3fd0876288c9a29e4b852f32114c472 --- /dev/null +++ b/src/engine.c @@ -0,0 +1,293 @@ +/******************************************************************************* + * This file is part of GadgetSMP. + * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ + +/* Config parameters. */ +#include "../config.h" + +/* Some standard headers. */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <pthread.h> +#include <math.h> +#include <float.h> +#include <limits.h> +#include <omp.h> +#include <sched.h> + +/* Local headers. */ +#include "cycle.h" +#include "const.h" +#include "lock.h" +#include "task.h" +#include "part.h" +#include "cell.h" +#include "space.h" +#include "queue.h" +#include "engine.h" +#include "runner.h" +#include "runner_iact.h" + +/* Error macro. */ +#define error(s) { printf( "%s:%s:%i: %s\n" , __FILE__ , __FUNCTION__ , __LINE__ , s ); abort(); } + +/* Convert cell location to ID. */ +#define cell_getid( cdim , i , j , k ) ( (int)(k) + (cdim)[2]*( (int)(j) + (cdim)[1]*(int)(i) ) ) + + +/** + * @brief Sort the tasks in topological order over all queues. + * + * @param e The #engine. + */ + +void engine_ranktasks ( struct engine *e ) { + + int i, j = 0, k, temp, left = 0, rank; + struct task *t; + struct space *s = e->s; + int *tid; + + /* Run throught the tasks and get all the waits right. */ + for ( k = 0 ; k < s->nr_tasks ; k++ ) { + for ( j = 0 ; j < s->tasks[k].nr_unlock_tasks ; j++ ) + s->tasks[k].unlock_tasks[j]->wait += 1; + } + + /* Allocate and init the task-ID array. */ + if ( ( tid = (int *)malloc( sizeof(int) * s->nr_tasks ) ) == NULL ) + error( "Failed to allocate temporary tid array." ); + for ( k = 0 ; k < s->nr_tasks ; k++ ) + tid[k] = k; + + /* Main loop. */ + for ( rank = 0 ; left < s->nr_tasks ; rank++ ) { + + /* Load the tids of tasks with no waits. */ + for ( k = left ; k < s->nr_tasks ; k++ ) + if ( s->tasks[ tid[k] ].wait == 0 ) { + temp = tid[j]; tid[j] = tid[k]; tid[k] = temp; + j += 1; + } + + /* Traverse the task tree and add tasks with no weight. */ + for ( i = left ; i < j ; i++ ) { + t = &s->tasks[ tid[i] ]; + t->rank = rank; + s->tasks_ind[i] = t - s->tasks; + /* printf( "engine_ranktasks: task %i of type %s has rank %i.\n" , i , + (t->type == task_type_self) ? "self" : (t->type == task_type_pair) ? "pair" : "sort" , rank ); */ + for ( k = 0 ; k < t->nr_unlock_tasks ; k++ ) + t->unlock_tasks[k]->wait -= 1; + } + + /* The new left (no, not tony). */ + left = j; + + } + + /* Release the temporary array. */ + free(tid); + + } + + +/** + * @brief Implements a barrier for the #runner threads. + * + * @param e The #engine. + */ + +void engine_barrier( struct engine *e ) { + + /* First, get the barrier mutex. */ + if ( pthread_mutex_lock( &e->barrier_mutex ) != 0 ) + error( "Failed to get barrier mutex." ); + + /* Wait for the barrier to close. */ + while ( e->barrier_count < 0 ) + if ( pthread_cond_wait( &e->barrier_cond , &e->barrier_mutex ) != 0 ) + error( "Eror waiting for barrier to close." ); + + /* Once I'm in, increase the barrier count. */ + e->barrier_count += 1; + + /* If all threads are in, send a signal... */ + if ( e->barrier_count == e->nr_threads ) + if ( pthread_cond_broadcast( &e->barrier_cond ) != 0 ) + error( "Failed to broadcast barrier full condition." ); + + /* Wait for barrier to be released. */ + while ( e->barrier_count > 0 ) + if ( pthread_cond_wait( &e->barrier_cond , &e->barrier_mutex ) != 0 ) + error( "Error waiting for barrier to be released." ); + + /* Decrease the counter before leaving... */ + e->barrier_count += 1; + + /* If I'm the last one out, signal the condition again. */ + if ( e->barrier_count == 0 ) + if ( pthread_cond_broadcast( &e->barrier_cond ) != 0 ) + error( "Failed to broadcast empty barrier condition." ); + + /* Last but not least, release the mutex. */ + if ( pthread_mutex_unlock( &e->barrier_mutex ) != 0 ) + error( "Failed to get unlock the barrier mutex." ); + + } + + +/** + * @brief Let the #engine loose to compute the forces. + * + * @param e The #engine. + * @param sort_queues Flag to try to sort the queues topologically. + */ + +void engine_run ( struct engine *e , int sort_queues ) { + + int j, k; + struct space *s = e->s; + + /* Run throught the tasks and get all the waits right. */ + for ( k = 0 ; k < s->nr_tasks ; k++ ) { + s->tasks[k].done = 0; + for ( j = 0 ; j < s->tasks[k].nr_unlock_tasks ; j++ ) + s->tasks[k].unlock_tasks[j]->wait += 1; + for ( j = 0 ; j < s->tasks[k].nr_unlock_cells ; j++ ) + s->tasks[k].unlock_cells[j]->wait += 1; + } + + /* Re-set the queues.*/ + if ( sort_queues ) { + #pragma omp parallel for default(none), shared(e) + for ( k = 0 ; k < e->nr_queues ; k++ ) { + queue_sort( &e->queues[k] ); + e->queues[k].next = 0; + } + } + else + for ( k = 0 ; k < e->nr_queues ; k++ ) + e->queues[k].next = 0; + + /* Cry havoc and let loose the dogs of war. */ + e->barrier_count = -e->barrier_count; + if ( pthread_cond_broadcast( &e->barrier_cond ) != 0 ) + error( "Failed to broadcast barrier open condition." ); + + /* Sit back and wait for the runners to come home. */ + while ( e->barrier_count < e->nr_threads ) + if ( pthread_cond_wait( &e->barrier_cond , &e->barrier_mutex ) != 0 ) + error( "Error while waiting for barrier." ); + + } + + +/** + * @brief init an engine with the given number of threads, queues, and + * the given policy. + * + * @param e The #engine. + * @param s The #space in which this #runner will run. + * @param nr_threads The number of threads to spawn. + * @param nr_queues The number of task queues to create. + * @param policy The queueing policy to use. + */ + +void engine_init ( struct engine *e , struct space *s , int nr_threads , int nr_queues , int policy ) { + + #if defined(HAVE_SETAFFINITY) + cpu_set_t cpuset; + #endif + int k, qid, nrq; + + /* Store the values. */ + e->s = s; + e->nr_threads = nr_threads; + e->nr_queues = nr_queues; + e->policy = policy; + + /* First of all, init the barrier and lock it. */ + if ( pthread_mutex_init( &e->barrier_mutex , NULL ) != 0 ) + error( "Failed to initialize barrier mutex." ); + if ( pthread_cond_init( &e->barrier_cond , NULL ) != 0 ) + error( "Failed to initialize barrier condition variable." ); + if ( pthread_mutex_lock( &e->barrier_mutex ) != 0 ) + error( "Failed to lock barrier mutex." ); + e->barrier_count = 0; + + /* Allocate the queues. */ + if ( posix_memalign( (void *)(&e->queues) , 64 , nr_queues * sizeof(struct queue) ) != 0 ) + error( "Failed to allocate queues." ); + bzero( e->queues , nr_queues * sizeof(struct queue) ); + + /* Init the queues. */ + for ( k = 0 ; k < nr_queues ; k++ ) + queue_init( &e->queues[k] , s->nr_tasks , s->tasks ); + + /* Rank the tasks in topological order. */ + engine_ranktasks( e ); + + /* How many queues to fill initially? */ + for ( nrq = 0 , k = nr_queues ; k > 0 ; k = k / 2 ) + nrq += 1; + + /* Fill the queues (round-robin). */ + for ( k = 0 ; k < s->nr_tasks ; k++ ) { + if ( s->tasks[ s->tasks_ind[k] ].type == task_type_none ) + continue; + // qid = 0; + // qid = k % nrq; + qid = k % nr_queues; + e->queues[qid].tid[ e->queues[qid].count ] = s->tasks_ind[k]; + e->queues[qid].count += 1; + } + + /* Sort the queues topologically. */ + for ( k = 0 ; k < nr_queues ; k++ ) + queue_sort( &e->queues[k] ); + + /* Allocate and init the threads. */ + if ( ( e->runners = (struct runner *)malloc( sizeof(struct runner) * nr_threads ) ) == NULL ) + error( "Failed to allocate threads array." ); + for ( k = 0 ; k < nr_threads ; k++ ) { + e->runners[k].id = k; + e->runners[k].e = e; + if ( pthread_create( &e->runners[k].thread , NULL , &runner_main , &e->runners[k] ) != 0 ) + error( "Failed to create runner thread." ); + #if defined(HAVE_SETAFFINITY) + /* Set the cpu mask to zero | e->id. */ + CPU_ZERO( &cpuset ); + CPU_SET( e->runners[k].id , &cpuset ); + + /* Apply this mask to the runner's pthread. */ + if ( pthread_setaffinity_np( e->runners[k].thread , sizeof(cpu_set_t) , &cpuset ) != 0 ) + error( "Failed to set thread affinity." ); + #endif + } + + /* Wait for the runner threads to be in place. */ + while ( e->barrier_count != e->nr_threads ) + if ( pthread_cond_wait( &e->barrier_cond , &e->barrier_mutex ) != 0 ) + error( "Error while waiting for runner threads to get in place." ); + + } + + + diff --git a/src/engine.h b/src/engine.h new file mode 100644 index 0000000000000000000000000000000000000000..c2ca98abfe3ee5d687accfaaa51f78e4d593b611 --- /dev/null +++ b/src/engine.h @@ -0,0 +1,65 @@ +/******************************************************************************* + * This file is part of GadgetSMP. + * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ + + + +/* Some constants. */ +#define engine_policy_none 0 +#define engine_policy_rand 1 +#define engine_policy_steal 2 +#define engine_policy_keep 4 +#define engine_policy_block 8 + +#define engine_queue_scale 1.2 + + +/* Data structure for the engine. */ +struct engine { + + /* Number of threads on which to run. */ + int nr_threads; + + /* The space with which the runner is associated. */ + struct space *s; + + /* The runner's threads. */ + struct runner *runners; + + /* The running policy. */ + int policy; + + /* The number of queues. */ + int nr_queues; + + /* The queues. */ + struct queue *queues; + + /* Data for the threads' barrier. */ + pthread_mutex_t barrier_mutex; + pthread_cond_t barrier_cond; + int barrier_count; + + }; + + +/* Function prototypes. */ +void engine_barrier( struct engine *e ); +void engine_init ( struct engine *e , struct space *s , int nr_threads , int nr_queues , int policy ); +void engine_ranktasks ( struct engine *e ); +void engine_run ( struct engine *e , int sort_queues ); diff --git a/src/gadgetsmp.h b/src/gadgetsmp.h index 866413474718a9bc9c5bfb1d1c050d1e2f4cb3a9..3193cc8200cab4906a027faa775659e128cf3d62 100644 --- a/src/gadgetsmp.h +++ b/src/gadgetsmp.h @@ -30,5 +30,6 @@ #include "space.h" #include "queue.h" #include "runner_iact.h" +#include "engine.h" #include "runner.h" diff --git a/src/runner.c b/src/runner.c index dd38394b7c57dcfae922bc510e6be2fee15dfb23..5d945986a4a2aedfb1b29a30e2bfdd92dff25667 100644 --- a/src/runner.c +++ b/src/runner.c @@ -40,6 +40,7 @@ #include "cell.h" #include "space.h" #include "queue.h" +#include "engine.h" #include "runner.h" #include "runner_iact.h" @@ -85,63 +86,6 @@ const char runner_flip[27] = { 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 #include "runner_doiact.h" -/** - * @brief Sort the tasks in topological order over all queues. - * - * @param r The #runner. - */ - -void runner_ranktasks ( struct runner *r ) { - - int i, j = 0, k, temp, left = 0, rank; - struct task *t; - struct space *s = r->s; - int *tid; - - /* Run throught the tasks and get all the waits right. */ - for ( k = 0 ; k < s->nr_tasks ; k++ ) { - for ( j = 0 ; j < s->tasks[k].nr_unlock_tasks ; j++ ) - s->tasks[k].unlock_tasks[j]->wait += 1; - } - - /* Allocate and init the task-ID array. */ - if ( ( tid = (int *)malloc( sizeof(int) * s->nr_tasks ) ) == NULL ) - error( "Failed to allocate temporary tid array." ); - for ( k = 0 ; k < s->nr_tasks ; k++ ) - tid[k] = k; - - /* Main loop. */ - for ( rank = 0 ; left < s->nr_tasks ; rank++ ) { - - /* Load the tids of tasks with no waits. */ - for ( k = left ; k < s->nr_tasks ; k++ ) - if ( s->tasks[ tid[k] ].wait == 0 ) { - temp = tid[j]; tid[j] = tid[k]; tid[k] = temp; - j += 1; - } - - /* Traverse the task tree and add tasks with no weight. */ - for ( i = left ; i < j ; i++ ) { - t = &s->tasks[ tid[i] ]; - t->rank = rank; - s->tasks_ind[i] = t - s->tasks; - /* printf( "runner_ranktasks: task %i of type %s has rank %i.\n" , i , - (t->type == task_type_self) ? "self" : (t->type == task_type_pair) ? "pair" : "sort" , rank ); */ - for ( k = 0 ; k < t->nr_unlock_tasks ; k++ ) - t->unlock_tasks[k]->wait -= 1; - } - - /* The new left (no, not tony). */ - left = j; - - } - - /* Release the temporary array. */ - free(tid); - - } - - /** * @brief Sort the entries in ascending order using QuickSort. * @@ -282,7 +226,7 @@ inline void merge_backward ( struct entry *__restrict__ one , int none , struct * @param c The #cell. */ -void runner_dosort ( struct runner_thread *rt , struct cell *c , int flags ) { +void runner_dosort ( struct runner *r , struct cell *c , int flags ) { struct entry *finger; struct entry *fingers[8]; @@ -478,7 +422,7 @@ void runner_dosort ( struct runner_thread *rt , struct cell *c , int flags ) { #ifdef TIMER_VERBOSE printf( "runner_dosort[%02i]: %i parts at depth %i (flags = %i%i%i%i%i%i%i%i%i%i%i%i%i) took %.3f ms.\n" , - rt->id , c->count , c->depth , + r->id , c->count , c->depth , (flags & 0x1000) >> 12 , (flags & 0x800) >> 11 , (flags & 0x400) >> 10 , (flags & 0x200) >> 9 , (flags & 0x100) >> 8 , (flags & 0x80) >> 7 , (flags & 0x40) >> 6 , (flags & 0x20) >> 5 , (flags & 0x10) >> 4 , (flags & 0x8) >> 3 , (flags & 0x4) >> 2 , (flags & 0x2) >> 1 , (flags & 0x1) >> 0 , ((double)TIMER_TOC(runner_timer_dosort)) / CPU_TPS * 1000 ); fflush(stdout); #else @@ -491,20 +435,16 @@ void runner_dosort ( struct runner_thread *rt , struct cell *c , int flags ) { /** * @brief Intermediate task between density and force * - * @param rt The runner thread. + * @param r The runner thread. * @param ci THe cell. */ -void runner_doghost ( struct runner_thread *r , struct cell *c ) { +void runner_doghost ( struct runner *r , struct cell *c ) { struct part *p; int i, k; TIMER_TIC - /* If this cell has progeny, don't bother. */ - if ( c->split ) - return; - /* Loop over the parts in this cell. */ for ( i = 0 ; i < c->count ; i++ ) { @@ -529,7 +469,7 @@ void runner_doghost ( struct runner_thread *r , struct cell *c ) { #ifdef TIMER_VERBOSE printf( "runner_doghost[%02i]: %i parts at depth %i took %.3f ms.\n" , - rt->id , c->count , c->depth , + r->id , c->count , c->depth , ((double)TIMER_TOC(runner_timer_doghost)) / CPU_TPS * 1000 ); fflush(stdout); #else TIMER_TOC(runner_timer_doghost); @@ -538,50 +478,6 @@ void runner_doghost ( struct runner_thread *r , struct cell *c ) { } -/** - * @brief Implements a barrier for the #runner threads. - * - */ - -void runner_barrier( struct runner *r ) { - - /* First, get the barrier mutex. */ - if ( pthread_mutex_lock( &r->barrier_mutex ) != 0 ) - error( "Failed to get barrier mutex." ); - - /* Wait for the barrier to close. */ - while ( r->barrier_count < 0 ) - if ( pthread_cond_wait( &r->barrier_cond , &r->barrier_mutex ) != 0 ) - error( "Eror waiting for barrier to close." ); - - /* Once I'm in, increase the barrier count. */ - r->barrier_count += 1; - - /* If all threads are in, send a signal... */ - if ( r->barrier_count == r->nr_threads ) - if ( pthread_cond_broadcast( &r->barrier_cond ) != 0 ) - error( "Failed to broadcast barrier full condition." ); - - /* Wait for barrier to be released. */ - while ( r->barrier_count > 0 ) - if ( pthread_cond_wait( &r->barrier_cond , &r->barrier_mutex ) != 0 ) - error( "Error waiting for barrier to be released." ); - - /* Decrease the counter before leaving... */ - r->barrier_count += 1; - - /* If I'm the last one out, signal the condition again. */ - if ( r->barrier_count == 0 ) - if ( pthread_cond_broadcast( &r->barrier_cond ) != 0 ) - error( "Failed to broadcast empty barrier condition." ); - - /* Last but not least, release the mutex. */ - if ( pthread_mutex_unlock( &r->barrier_mutex ) != 0 ) - error( "Failed to get unlock the barrier mutex." ); - - } - - /** * @brief The #runner main thread routine. * @@ -590,14 +486,14 @@ void runner_barrier( struct runner *r ) { void *runner_main ( void *data ) { - struct runner_thread *rt = (struct runner_thread *)data; - struct runner *r = rt->r; - int threadID = rt->id; + struct runner *r = (struct runner *)data; + struct engine *e = r->e; + int threadID = r->id; int k, qid, naq, keep, tpq; - struct queue *queues[ r->nr_queues ], *myq; + struct queue *queues[ e->nr_queues ], *myq; struct task *t; struct cell *ci, *cj; - unsigned int myseed = rand() + rt->id; + unsigned int myseed = rand() + r->id; #ifdef TIMER ticks stalled; #endif @@ -606,23 +502,23 @@ void *runner_main ( void *data ) { while ( 1 ) { /* Wait at the barrier. */ - runner_barrier( r ); + engine_barrier( e ); /* Set some convenient local data. */ - keep = r->policy & runner_policy_keep; - myq = &r->queues[ threadID % r->nr_queues ]; - tpq = ceil( ((double)r->nr_threads) / r->nr_queues ); + keep = e->policy & engine_policy_keep; + myq = &e->queues[ threadID % e->nr_queues ]; + tpq = ceil( ((double)e->nr_threads) / e->nr_queues ); stalled = 0; /* Set up the local list of active queues. */ - naq = r->nr_queues; + naq = e->nr_queues; for ( k = 0 ; k < naq ; k++ ) - queues[k] = &r->queues[k]; + queues[k] = &e->queues[k]; /* Set up the local list of active queues. */ - naq = r->nr_queues; + naq = e->nr_queues; for ( k = 0 ; k < naq ; k++ ) - queues[k] = &r->queues[k]; + queues[k] = &e->queues[k]; /* Loop while there are tasks... */ while ( 1 ) { @@ -640,32 +536,32 @@ void *runner_main ( void *data ) { /* Get a task, how and from where depends on the policy. */ TIMER_TIC t = NULL; - if ( r->nr_queues == 1 ) { - t = queue_gettask( &r->queues[0] , 1 , 0 ); + if ( e->nr_queues == 1 ) { + t = queue_gettask( &e->queues[0] , 1 , 0 ); } - else if ( r->policy & runner_policy_steal ) { + else if ( e->policy & engine_policy_steal ) { if ( ( myq->next == myq->count ) || - ( t = queue_gettask_new( myq , rt->id , 0 , 0 ) ) == NULL ) { + ( t = queue_gettask_new( myq , r->id , 0 , 0 ) ) == NULL ) { TIMER_TIC2 qid = rand_r( &myseed ) % naq; - keep = ( r->policy & runner_policy_keep ) && + keep = ( e->policy & engine_policy_keep ) && ( myq->count <= myq->size-tpq ); if ( myq->next == myq->count ) COUNT(runner_counter_steal_empty); else COUNT(runner_counter_steal_stall); - t = queue_gettask_new( queues[qid] , rt->id , 0 , keep ); + t = queue_gettask_new( queues[qid] , r->id , 0 , keep ); if ( t != NULL && keep ) queue_insert( myq , t ); TIMER_TOC2(runner_timer_steal); } } - else if ( r->policy & runner_policy_rand ) { + else if ( e->policy & engine_policy_rand ) { qid = rand_r( &myseed ) % naq; - t = queue_gettask( queues[qid] , r->policy & runner_policy_block , 0 ); + t = queue_gettask( queues[qid] , e->policy & engine_policy_block , 0 ); } else { - t = queue_gettask( &r->queues[threadID] , r->policy & runner_policy_block , 0 ); + t = queue_gettask( &e->queues[threadID] , e->policy & engine_policy_block , 0 ); } TIMER_TOC(runner_timer_getpair); @@ -681,7 +577,7 @@ void *runner_main ( void *data ) { stalled = getticks() - stalled; __sync_add_and_fetch( &runner_timer[runner_timer_stalled] , stalled ); #ifdef TIMER_VERBOSE - printf( "runner_main[%02i]: stalled %.3f ms\n" , rt->id , ((double)stalled) / CPU_TPS * 1000 ); + printf( "runner_main[%02i]: stalled %.3f ms\n" , r->id , ((double)stalled) / CPU_TPS * 1000 ); fflush(stdout); #endif stalled = 0; @@ -696,31 +592,31 @@ void *runner_main ( void *data ) { switch ( t->type ) { case task_type_self: if ( t->subtype == task_subtype_density ) - runner_doself_density( rt , ci ); + runner_doself_density( r , ci ); else if ( t->subtype == task_subtype_force ) - runner_doself_force( rt , ci ); + runner_doself_force( r , ci ); else error( "Unknown task subtype." ); cell_unlocktree( ci ); break; case task_type_pair: if ( t->subtype == task_subtype_density ) - runner_dopair_density( rt , ci , cj ); + runner_dopair_density( r , ci , cj ); else if ( t->subtype == task_subtype_force ) - runner_dopair_force( rt , ci , cj ); + runner_dopair_force( r , ci , cj ); else error( "Unknown task subtype." ); cell_unlocktree( ci ); cell_unlocktree( cj ); break; case task_type_sort: - runner_dosort( rt , ci , t->flags ); + runner_dosort( r , ci , t->flags ); break; case task_type_sub: if ( t->subtype == task_subtype_density ) - runner_dosub_density( rt , ci , cj , t->flags ); + runner_dosub_density( r , ci , cj , t->flags ); else if ( t->subtype == task_subtype_force ) - runner_dosub_force( rt , ci , cj , t->flags ); + runner_dosub_force( r , ci , cj , t->flags ); else error( "Unknown task subtype." ); cell_unlocktree( ci ); @@ -728,7 +624,8 @@ void *runner_main ( void *data ) { cell_unlocktree( cj ); break; case task_type_ghost: - runner_doghost( rt , ci ); + if ( t->flags ) + runner_doghost( r , ci ); break; default: error( "Unknown task type." ); @@ -751,7 +648,7 @@ void *runner_main ( void *data ) { stalled = getticks() - stalled; __sync_add_and_fetch( &runner_timer[runner_timer_stalled] , stalled ); #ifdef TIMER_VERBOSE - printf( "runner_main[%02i]: stalled %.3f ms\n" , rt->id , ((double)stalled) / CPU_TPS * 1000 ); + printf( "runner_main[%02i]: stalled %.3f ms\n" , r->id , ((double)stalled) / CPU_TPS * 1000 ); fflush(stdout); #endif stalled = 0; @@ -766,141 +663,3 @@ void *runner_main ( void *data ) { } -/** - * @brief Let the #runner loose on the given #space. - * - * @param r The #runner. - * @param s The #space. - */ - -void runner_run ( struct runner *r , int sort_queues ) { - - int j, k; - struct space *s = r->s; - - /* Run throught the tasks and get all the waits right. */ - for ( k = 0 ; k < s->nr_tasks ; k++ ) { - s->tasks[k].done = 0; - for ( j = 0 ; j < s->tasks[k].nr_unlock_tasks ; j++ ) - s->tasks[k].unlock_tasks[j]->wait += 1; - for ( j = 0 ; j < s->tasks[k].nr_unlock_cells ; j++ ) - s->tasks[k].unlock_cells[j]->wait += 1; - } - - /* Re-set the queues.*/ - if ( sort_queues ) { - #pragma omp parallel for default(none), shared(r) - for ( k = 0 ; k < r->nr_queues ; k++ ) { - queue_sort( &r->queues[k] ); - r->queues[k].next = 0; - } - } - else - for ( k = 0 ; k < r->nr_queues ; k++ ) - r->queues[k].next = 0; - - /* Cry havoc and let loose the dogs of war. */ - r->barrier_count = -r->barrier_count; - if ( pthread_cond_broadcast( &r->barrier_cond ) != 0 ) - error( "Failed to broadcast barrier open condition." ); - - /* Sit back and wait for the runner_threads to come home. */ - while ( r->barrier_count < r->nr_threads ) - if ( pthread_cond_wait( &r->barrier_cond , &r->barrier_mutex ) != 0 ) - error( "Error while waiting for barrier." ); - - } - - -/** - * @brief init a runner with the given number of threads, queues, and - * the given policy. - * - * @param r The #runner. - * @param s The #space in which this #runner will run. - * @param nr_threads The number of threads to spawn. - * @param nr_queues The number of task queues to create. - * @param policy The queueing policy to use. - */ - -void runner_init ( struct runner *r , struct space *s , int nr_threads , int nr_queues , int policy ) { - - #if defined(HAVE_SETAFFINITY) - cpu_set_t cpuset; - #endif - int k, qid, nrq; - - /* Store the values. */ - r->s = s; - r->nr_threads = nr_threads; - r->nr_queues = nr_queues; - r->policy = policy; - - /* First of all, init the barrier and lock it. */ - if ( pthread_mutex_init( &r->barrier_mutex , NULL ) != 0 ) - error( "Failed to initialize barrier mutex." ); - if ( pthread_cond_init( &r->barrier_cond , NULL ) != 0 ) - error( "Failed to initialize barrier condition variable." ); - if ( pthread_mutex_lock( &r->barrier_mutex ) != 0 ) - error( "Failed to lock barrier mutex." ); - r->barrier_count = 0; - - /* Allocate the queues. */ - if ( posix_memalign( (void *)(&r->queues) , 64 , nr_queues * sizeof(struct queue) ) != 0 ) - error( "Failed to allocate queues." ); - bzero( r->queues , nr_queues * sizeof(struct queue) ); - - /* Init the queues. */ - for ( k = 0 ; k < nr_queues ; k++ ) - queue_init( &r->queues[k] , s->nr_tasks , s->tasks ); - - /* Rank the tasks in topological order. */ - runner_ranktasks( r ); - - /* How many queues to fill initially? */ - for ( nrq = 0 , k = nr_queues ; k > 0 ; k = k / 2 ) - nrq += 1; - - /* Fill the queues (round-robin). */ - for ( k = 0 ; k < s->nr_tasks ; k++ ) { - if ( s->tasks[ s->tasks_ind[k] ].type == task_type_none ) - continue; - // qid = 0; - // qid = k % nrq; - qid = k % nr_queues; - r->queues[qid].tid[ r->queues[qid].count ] = s->tasks_ind[k]; - r->queues[qid].count += 1; - } - - /* Sort the queues topologically. */ - for ( k = 0 ; k < nr_queues ; k++ ) - queue_sort( &r->queues[k] ); - - /* Allocate and init the threads. */ - if ( ( r->threads = (struct runner_thread *)malloc( sizeof(struct runner_thread) * nr_threads ) ) == NULL ) - error( "Failed to allocate threads array." ); - for ( k = 0 ; k < nr_threads ; k++ ) { - r->threads[k].id = k; - r->threads[k].r = r; - if ( pthread_create( &r->threads[k].thread , NULL , &runner_main , &r->threads[k] ) != 0 ) - error( "Failed to create runner thread." ); - #if defined(HAVE_SETAFFINITY) - /* Set the cpu mask to zero | r->id. */ - CPU_ZERO( &cpuset ); - CPU_SET( r->threads[k].id , &cpuset ); - - /* Apply this mask to the runner's pthread. */ - if ( pthread_setaffinity_np( r->threads[k].thread , sizeof(cpu_set_t) , &cpuset ) != 0 ) - error( "Failed to set thread affinity." ); - #endif - } - - /* Wait for the runner threads to be in place. */ - while ( r->barrier_count != r->nr_threads ) - if ( pthread_cond_wait( &r->barrier_cond , &r->barrier_mutex ) != 0 ) - error( "Error while waiting for runner threads to get in place." ); - - } - - - diff --git a/src/runner.h b/src/runner.h index 505e7561a6c7159f9db22b748f67c5f1fd873c7d..4d1a6ec64c6bf77276c5eacf078d54454596bff6 100644 --- a/src/runner.h +++ b/src/runner.h @@ -19,16 +19,6 @@ -/* Some constants. */ -#define runner_policy_none 0 -#define runner_policy_rand 1 -#define runner_policy_steal 2 -#define runner_policy_keep 4 -#define runner_policy_block 8 - -#define runner_queue_scale 1.2 - - /* The timers themselves. */ enum { runner_timer_none = 0, @@ -114,7 +104,7 @@ long long int runner_hist_bins[ runner_hist_N ]; /* A struct representing a runner's thread and its data. */ -struct runner_thread { +struct runner { /* The id of this thread. */ int id; @@ -123,45 +113,15 @@ struct runner_thread { pthread_t thread; /* The underlying runner. */ - struct runner *r; - - }; - - -/* Data structure for the runner. */ -struct runner { - - /* Number of threads on which to run. */ - int nr_threads; - - /* The space with which the runner is associated. */ - struct space *s; - - /* The runner's threads. */ - struct runner_thread *threads; - - /* The running policy. */ - int policy; - - /* The number of queues. */ - int nr_queues; - - /* The queues. */ - struct queue *queues; - - /* Data for the threads' barrier. */ - pthread_mutex_t barrier_mutex; - pthread_cond_t barrier_cond; - int barrier_count; + struct engine *e; }; /* Function prototypes. */ -void runner_run ( struct runner *r , int sort_queues ); -void runner_doghost ( struct runner_thread *rt , struct cell *c ); -void runner_dopair_density ( struct runner_thread *rt , struct cell *ci , struct cell *cj ); -void runner_doself_density ( struct runner_thread *rt , struct cell *c ); -void runner_dosub_density ( struct runner_thread *rt , struct cell *ci , struct cell *cj , int flags ); -void runner_dosort ( struct runner_thread *rt , struct cell *c , int flag ); -void runner_init ( struct runner *r , struct space *s , int nr_threads , int nr_queues , int policy ); +void runner_doghost ( struct runner *r , struct cell *c ); +void runner_dopair_density ( struct runner *r , struct cell *ci , struct cell *cj ); +void runner_doself_density ( struct runner *r , struct cell *c ); +void runner_dosub_density ( struct runner *r , struct cell *ci , struct cell *cj , int flags ); +void runner_dosort ( struct runner *r , struct cell *c , int flag ); +void *runner_main ( void *data ); diff --git a/src/runner_doiact.h b/src/runner_doiact.h index 52039b151a43d7f7d5ca1c8ca5243e280bd98be1..20e5798b3525b1551214e9573627f91143525a20 100644 --- a/src/runner_doiact.h +++ b/src/runner_doiact.h @@ -59,9 +59,9 @@ * @param cj The second #cell. */ -void DOPAIR_NAIVE ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) { +void DOPAIR_NAIVE ( struct runner *r , struct cell *ci , struct cell *cj ) { - struct runner *r = rt->r; + struct engine *e = r->e; int pid, pjd, k, count_i = ci->count, count_j = cj->count; double shift[3] = { 0.0 , 0.0 , 0.0 }; struct part *pi, *pj, *parts_i = ci->parts, *parts_j = cj->parts; @@ -71,10 +71,10 @@ void DOPAIR_NAIVE ( struct runner_thread *rt , struct cell *ci , struct cell *cj /* Get the relative distance between the pairs, wrapping. */ for ( k = 0 ; k < 3 ; k++ ) { - if ( cj->loc[k] - ci->loc[k] < -r->s->dim[k]/2 ) - shift[k] = r->s->dim[k]; - else if ( cj->loc[k] - ci->loc[k] > r->s->dim[k]/2 ) - shift[k] = -r->s->dim[k]; + if ( cj->loc[k] - ci->loc[k] < -e->s->dim[k]/2 ) + shift[k] = e->s->dim[k]; + else if ( cj->loc[k] - ci->loc[k] > e->s->dim[k]/2 ) + shift[k] = -e->s->dim[k]; } /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" , @@ -117,7 +117,7 @@ void DOPAIR_NAIVE ( struct runner_thread *rt , struct cell *ci , struct cell *cj } /* loop over the parts in ci. */ #ifdef TIMER_VERBOSE - printf( "runner_dopair_naive[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , rt->id , count_i , count_j , ci->depth , ci->r_max , cj->r_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 ); + printf( "runner_dopair_naive[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->r_max , cj->r_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 ); #else TIMER_TOC(TIMER_DOPAIR); #endif @@ -134,9 +134,9 @@ void DOPAIR_NAIVE ( struct runner_thread *rt , struct cell *ci , struct cell *cj * @param cj The second #cell. */ -void DOPAIR ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) { +void DOPAIR ( struct runner *r , struct cell *ci , struct cell *cj ) { - struct runner *r = rt->r; + struct engine *e = r->e; int pid, pjd, k, sid; double rshift, shift[3] = { 0.0 , 0.0 , 0.0 }; struct cell *temp; @@ -150,10 +150,10 @@ void DOPAIR ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) { /* Get the relative distance between the pairs, wrapping. */ for ( k = 0 ; k < 3 ; k++ ) { - if ( cj->loc[k] - ci->loc[k] < -r->s->dim[k]/2 ) - shift[k] = r->s->dim[k]; - else if ( cj->loc[k] - ci->loc[k] > r->s->dim[k]/2 ) - shift[k] = -r->s->dim[k]; + if ( cj->loc[k] - ci->loc[k] < -e->s->dim[k]/2 ) + shift[k] = e->s->dim[k]; + else if ( cj->loc[k] - ci->loc[k] > e->s->dim[k]/2 ) + shift[k] = -e->s->dim[k]; } /* Get the sorting index. */ @@ -275,7 +275,7 @@ void DOPAIR ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) { } /* loop over the parts in ci. */ #ifdef TIMER_VERBOSE - printf( "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) took %.3f ms.\n" , rt->id , count_i , count_j , ci->depth , ci->r_max , cj->r_max , fmax(ci->h[0],fmax(ci->h[1],ci->h[2])) , ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000 ); + printf( "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->r_max , cj->r_max , fmax(ci->h[0],fmax(ci->h[1],ci->h[2])) , ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000 ); #else TIMER_TOC(TIMER_DOPAIR); #endif @@ -290,7 +290,7 @@ void DOPAIR ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) { * @param c The #cell. */ -void DOSELF ( struct runner_thread *rt , struct cell *c ) { +void DOSELF ( struct runner *r , struct cell *c ) { int k, pid, pjd, count = c->count; double pix[3]; @@ -338,7 +338,7 @@ void DOSELF ( struct runner_thread *rt , struct cell *c ) { } /* loop over all particles. */ #ifdef TIMER_VERBOSE - printf( "runner_doself[%02i]: %i parts at depth %i took %.3f ms.\n" , rt->id , count , c->depth , ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000 ); + printf( "runner_doself[%02i]: %i parts at depth %i took %.3f ms.\n" , r->id , count , c->depth , ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000 ); #else TIMER_TOC(TIMER_DOSELF); #endif @@ -353,7 +353,7 @@ void DOSELF ( struct runner_thread *rt , struct cell *c ) { * @param c The #cell. */ -void DOSUB ( struct runner_thread *rt , struct cell *ci , struct cell *cj , int flags ) { +void DOSUB ( struct runner *r , struct cell *ci , struct cell *cj , int flags ) { int j, k; @@ -367,185 +367,185 @@ void DOSUB ( struct runner_thread *rt , struct cell *ci , struct cell *cj , int for ( j = 0 ; j < 7 ; j++ ) for ( k = j + 1 ; k < 8 ; k++ ) if ( ci->progeny[j] != NULL && ci->progeny[k] != NULL ) - DOPAIR( rt , ci->progeny[j] , ci->progeny[k] ); + DOPAIR( r , ci->progeny[j] , ci->progeny[k] ); break; case 1: /* ( 1 , 1 , 0 ) */ if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL ) - DOPAIR( rt , ci->progeny[6] , cj->progeny[0] ); + DOPAIR( r , ci->progeny[6] , cj->progeny[0] ); if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOPAIR( rt , ci->progeny[6] , cj->progeny[1] ); + DOPAIR( r , ci->progeny[6] , cj->progeny[1] ); if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOPAIR( rt , ci->progeny[7] , cj->progeny[0] ); + DOPAIR( r , ci->progeny[7] , cj->progeny[0] ); if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL ) - DOPAIR( rt , ci->progeny[7] , cj->progeny[1] ); + DOPAIR( r , ci->progeny[7] , cj->progeny[1] ); break; case 3: /* ( 1 , 0 , 1 ) */ if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL ) - DOPAIR( rt , ci->progeny[5] , cj->progeny[0] ); + DOPAIR( r , ci->progeny[5] , cj->progeny[0] ); if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) - DOPAIR( rt , ci->progeny[5] , cj->progeny[2] ); + DOPAIR( r , ci->progeny[5] , cj->progeny[2] ); if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOPAIR( rt , ci->progeny[7] , cj->progeny[0] ); + DOPAIR( r , ci->progeny[7] , cj->progeny[0] ); if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL ) - DOPAIR( rt , ci->progeny[7] , cj->progeny[2] ); + DOPAIR( r , ci->progeny[7] , cj->progeny[2] ); break; case 4: /* ( 1 , 0 , 0 ) */ if ( ci->progeny[4] != NULL && cj->progeny[0] != NULL ) - DOPAIR( rt , ci->progeny[4] , cj->progeny[0] ); + DOPAIR( r , ci->progeny[4] , cj->progeny[0] ); if ( ci->progeny[4] != NULL && cj->progeny[1] != NULL ) - DOPAIR( rt , ci->progeny[4] , cj->progeny[1] ); + DOPAIR( r , ci->progeny[4] , cj->progeny[1] ); if ( ci->progeny[4] != NULL && cj->progeny[2] != NULL ) - DOPAIR( rt , ci->progeny[4] , cj->progeny[2] ); + DOPAIR( r , ci->progeny[4] , cj->progeny[2] ); if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL ) - DOPAIR( rt , ci->progeny[4] , cj->progeny[3] ); + DOPAIR( r , ci->progeny[4] , cj->progeny[3] ); if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL ) - DOPAIR( rt , ci->progeny[5] , cj->progeny[0] ); + DOPAIR( r , ci->progeny[5] , cj->progeny[0] ); if ( ci->progeny[5] != NULL && cj->progeny[1] != NULL ) - DOPAIR( rt , ci->progeny[5] , cj->progeny[1] ); + DOPAIR( r , ci->progeny[5] , cj->progeny[1] ); if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) - DOPAIR( rt , ci->progeny[5] , cj->progeny[2] ); + DOPAIR( r , ci->progeny[5] , cj->progeny[2] ); if ( ci->progeny[5] != NULL && cj->progeny[3] != NULL ) - DOPAIR( rt , ci->progeny[5] , cj->progeny[3] ); + DOPAIR( r , ci->progeny[5] , cj->progeny[3] ); if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL ) - DOPAIR( rt , ci->progeny[6] , cj->progeny[0] ); + DOPAIR( r , ci->progeny[6] , cj->progeny[0] ); if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOPAIR( rt , ci->progeny[6] , cj->progeny[1] ); + DOPAIR( r , ci->progeny[6] , cj->progeny[1] ); if ( ci->progeny[6] != NULL && cj->progeny[2] != NULL ) - DOPAIR( rt , ci->progeny[6] , cj->progeny[2] ); + DOPAIR( r , ci->progeny[6] , cj->progeny[2] ); if ( ci->progeny[6] != NULL && cj->progeny[3] != NULL ) - DOPAIR( rt , ci->progeny[6] , cj->progeny[3] ); + DOPAIR( r , ci->progeny[6] , cj->progeny[3] ); if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOPAIR( rt , ci->progeny[7] , cj->progeny[0] ); + DOPAIR( r , ci->progeny[7] , cj->progeny[0] ); if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL ) - DOPAIR( rt , ci->progeny[7] , cj->progeny[1] ); + DOPAIR( r , ci->progeny[7] , cj->progeny[1] ); if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL ) - DOPAIR( rt , ci->progeny[7] , cj->progeny[2] ); + DOPAIR( r , ci->progeny[7] , cj->progeny[2] ); if ( ci->progeny[7] != NULL && cj->progeny[3] != NULL ) - DOPAIR( rt , ci->progeny[7] , cj->progeny[3] ); + DOPAIR( r , ci->progeny[7] , cj->progeny[3] ); break; case 5: /* ( 1 , 0 , -1 ) */ if ( ci->progeny[4] != NULL && cj->progeny[1] != NULL ) - DOPAIR( rt , ci->progeny[4] , cj->progeny[1] ); + DOPAIR( r , ci->progeny[4] , cj->progeny[1] ); if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL ) - DOPAIR( rt , ci->progeny[4] , cj->progeny[3] ); + DOPAIR( r , ci->progeny[4] , cj->progeny[3] ); if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOPAIR( rt , ci->progeny[6] , cj->progeny[1] ); + DOPAIR( r , ci->progeny[6] , cj->progeny[1] ); if ( ci->progeny[6] != NULL && cj->progeny[3] != NULL ) - DOPAIR( rt , ci->progeny[6] , cj->progeny[3] ); + DOPAIR( r , ci->progeny[6] , cj->progeny[3] ); break; case 7: /* ( 1 , -1 , 0 ) */ if ( ci->progeny[4] != NULL && cj->progeny[2] != NULL ) - DOPAIR( rt , ci->progeny[4] , cj->progeny[2] ); + DOPAIR( r , ci->progeny[4] , cj->progeny[2] ); if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL ) - DOPAIR( rt , ci->progeny[4] , cj->progeny[3] ); + DOPAIR( r , ci->progeny[4] , cj->progeny[3] ); if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) - DOPAIR( rt , ci->progeny[5] , cj->progeny[2] ); + DOPAIR( r , ci->progeny[5] , cj->progeny[2] ); if ( ci->progeny[5] != NULL && cj->progeny[3] != NULL ) - DOPAIR( rt , ci->progeny[5] , cj->progeny[3] ); + DOPAIR( r , ci->progeny[5] , cj->progeny[3] ); break; case 9: /* ( 0 , 1 , 1 ) */ if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL ) - DOPAIR( rt , ci->progeny[3] , cj->progeny[0] ); + DOPAIR( r , ci->progeny[3] , cj->progeny[0] ); if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL ) - DOPAIR( rt , ci->progeny[3] , cj->progeny[4] ); + DOPAIR( r , ci->progeny[3] , cj->progeny[4] ); if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOPAIR( rt , ci->progeny[7] , cj->progeny[0] ); + DOPAIR( r , ci->progeny[7] , cj->progeny[0] ); if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL ) - DOPAIR( rt , ci->progeny[7] , cj->progeny[4] ); + DOPAIR( r , ci->progeny[7] , cj->progeny[4] ); break; case 10: /* ( 0 , 1 , 0 ) */ if ( ci->progeny[2] != NULL && cj->progeny[0] != NULL ) - DOPAIR( rt , ci->progeny[2] , cj->progeny[0] ); + DOPAIR( r , ci->progeny[2] , cj->progeny[0] ); if ( ci->progeny[2] != NULL && cj->progeny[1] != NULL ) - DOPAIR( rt , ci->progeny[2] , cj->progeny[1] ); + DOPAIR( r , ci->progeny[2] , cj->progeny[1] ); if ( ci->progeny[2] != NULL && cj->progeny[4] != NULL ) - DOPAIR( rt , ci->progeny[2] , cj->progeny[4] ); + DOPAIR( r , ci->progeny[2] , cj->progeny[4] ); if ( ci->progeny[2] != NULL && cj->progeny[5] != NULL ) - DOPAIR( rt , ci->progeny[2] , cj->progeny[5] ); + DOPAIR( r , ci->progeny[2] , cj->progeny[5] ); if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL ) - DOPAIR( rt , ci->progeny[3] , cj->progeny[0] ); + DOPAIR( r , ci->progeny[3] , cj->progeny[0] ); if ( ci->progeny[3] != NULL && cj->progeny[1] != NULL ) - DOPAIR( rt , ci->progeny[3] , cj->progeny[1] ); + DOPAIR( r , ci->progeny[3] , cj->progeny[1] ); if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL ) - DOPAIR( rt , ci->progeny[3] , cj->progeny[4] ); + DOPAIR( r , ci->progeny[3] , cj->progeny[4] ); if ( ci->progeny[3] != NULL && cj->progeny[5] != NULL ) - DOPAIR( rt , ci->progeny[3] , cj->progeny[5] ); + DOPAIR( r , ci->progeny[3] , cj->progeny[5] ); if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL ) - DOPAIR( rt , ci->progeny[6] , cj->progeny[0] ); + DOPAIR( r , ci->progeny[6] , cj->progeny[0] ); if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOPAIR( rt , ci->progeny[6] , cj->progeny[1] ); + DOPAIR( r , ci->progeny[6] , cj->progeny[1] ); if ( ci->progeny[6] != NULL && cj->progeny[4] != NULL ) - DOPAIR( rt , ci->progeny[6] , cj->progeny[4] ); + DOPAIR( r , ci->progeny[6] , cj->progeny[4] ); if ( ci->progeny[6] != NULL && cj->progeny[5] != NULL ) - DOPAIR( rt , ci->progeny[6] , cj->progeny[5] ); + DOPAIR( r , ci->progeny[6] , cj->progeny[5] ); if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOPAIR( rt , ci->progeny[7] , cj->progeny[0] ); + DOPAIR( r , ci->progeny[7] , cj->progeny[0] ); if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL ) - DOPAIR( rt , ci->progeny[7] , cj->progeny[1] ); + DOPAIR( r , ci->progeny[7] , cj->progeny[1] ); if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL ) - DOPAIR( rt , ci->progeny[7] , cj->progeny[4] ); + DOPAIR( r , ci->progeny[7] , cj->progeny[4] ); if ( ci->progeny[7] != NULL && cj->progeny[5] != NULL ) - DOPAIR( rt , ci->progeny[7] , cj->progeny[5] ); + DOPAIR( r , ci->progeny[7] , cj->progeny[5] ); break; case 11: /* ( 0 , 1 , -1 ) */ if ( ci->progeny[2] != NULL && cj->progeny[1] != NULL ) - DOPAIR( rt , ci->progeny[2] , cj->progeny[1] ); + DOPAIR( r , ci->progeny[2] , cj->progeny[1] ); if ( ci->progeny[2] != NULL && cj->progeny[5] != NULL ) - DOPAIR( rt , ci->progeny[2] , cj->progeny[5] ); + DOPAIR( r , ci->progeny[2] , cj->progeny[5] ); if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOPAIR( rt , ci->progeny[6] , cj->progeny[1] ); + DOPAIR( r , ci->progeny[6] , cj->progeny[1] ); if ( ci->progeny[6] != NULL && cj->progeny[5] != NULL ) - DOPAIR( rt , ci->progeny[6] , cj->progeny[5] ); + DOPAIR( r , ci->progeny[6] , cj->progeny[5] ); break; case 12: /* ( 0 , 0 , 1 ) */ if ( ci->progeny[1] != NULL && cj->progeny[0] != NULL ) - DOPAIR( rt , ci->progeny[1] , cj->progeny[0] ); + DOPAIR( r , ci->progeny[1] , cj->progeny[0] ); if ( ci->progeny[1] != NULL && cj->progeny[2] != NULL ) - DOPAIR( rt , ci->progeny[1] , cj->progeny[2] ); + DOPAIR( r , ci->progeny[1] , cj->progeny[2] ); if ( ci->progeny[1] != NULL && cj->progeny[4] != NULL ) - DOPAIR( rt , ci->progeny[1] , cj->progeny[4] ); + DOPAIR( r , ci->progeny[1] , cj->progeny[4] ); if ( ci->progeny[1] != NULL && cj->progeny[6] != NULL ) - DOPAIR( rt , ci->progeny[1] , cj->progeny[6] ); + DOPAIR( r , ci->progeny[1] , cj->progeny[6] ); if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL ) - DOPAIR( rt , ci->progeny[3] , cj->progeny[0] ); + DOPAIR( r , ci->progeny[3] , cj->progeny[0] ); if ( ci->progeny[3] != NULL && cj->progeny[2] != NULL ) - DOPAIR( rt , ci->progeny[3] , cj->progeny[2] ); + DOPAIR( r , ci->progeny[3] , cj->progeny[2] ); if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL ) - DOPAIR( rt , ci->progeny[3] , cj->progeny[4] ); + DOPAIR( r , ci->progeny[3] , cj->progeny[4] ); if ( ci->progeny[3] != NULL && cj->progeny[6] != NULL ) - DOPAIR( rt , ci->progeny[3] , cj->progeny[6] ); + DOPAIR( r , ci->progeny[3] , cj->progeny[6] ); if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL ) - DOPAIR( rt , ci->progeny[5] , cj->progeny[0] ); + DOPAIR( r , ci->progeny[5] , cj->progeny[0] ); if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) - DOPAIR( rt , ci->progeny[5] , cj->progeny[2] ); + DOPAIR( r , ci->progeny[5] , cj->progeny[2] ); if ( ci->progeny[5] != NULL && cj->progeny[4] != NULL ) - DOPAIR( rt , ci->progeny[5] , cj->progeny[4] ); + DOPAIR( r , ci->progeny[5] , cj->progeny[4] ); if ( ci->progeny[5] != NULL && cj->progeny[6] != NULL ) - DOPAIR( rt , ci->progeny[5] , cj->progeny[6] ); + DOPAIR( r , ci->progeny[5] , cj->progeny[6] ); if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOPAIR( rt , ci->progeny[7] , cj->progeny[0] ); + DOPAIR( r , ci->progeny[7] , cj->progeny[0] ); if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL ) - DOPAIR( rt , ci->progeny[7] , cj->progeny[2] ); + DOPAIR( r , ci->progeny[7] , cj->progeny[2] ); if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL ) - DOPAIR( rt , ci->progeny[7] , cj->progeny[4] ); + DOPAIR( r , ci->progeny[7] , cj->progeny[4] ); if ( ci->progeny[7] != NULL && cj->progeny[6] != NULL ) - DOPAIR( rt , ci->progeny[7] , cj->progeny[6] ); + DOPAIR( r , ci->progeny[7] , cj->progeny[6] ); break; } #ifdef TIMER_VERBOSE - printf( "runner_dosub[%02i]: flags=%i at depth %i took %.3f ms.\n" , rt->id , flags , ci->depth , ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000 ); + printf( "runner_dosub[%02i]: flags=%i at depth %i took %.3f ms.\n" , r->id , flags , ci->depth , ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000 ); #else TIMER_TOC(TIMER_DOSUB); #endif diff --git a/src/runner_iact.h b/src/runner_iact.h index 0fb85d8d1e1c39aa4675a0a7916372a2a68240e9..b578faef6a5c938bd7dc898b5b3f60af101953d7 100644 --- a/src/runner_iact.h +++ b/src/runner_iact.h @@ -147,6 +147,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_force ( float r2 /* Compute dv dot r. */ dvdr = ( pi->v[0] - pj->v[0] ) * dx[0] + ( pi->v[1] - pj->v[1] ) * dx[1] + ( pi->v[2] - pj->v[2] ) * dx[2]; + dvdr *= ri; /* Get the time derivative for u. */ pi->u_dt += pj->mass * dvdr * wi_dr; diff --git a/src/space.c b/src/space.c index 6a3203ee0601aac09cb109b91890d9911866367d..a8e42fbefa2467ca520ab39f18cc5ee4ec60f411 100644 --- a/src/space.c +++ b/src/space.c @@ -970,7 +970,7 @@ void space_maketasks ( struct space *s , int do_sort ) { if ( t->cj != NULL ) task_addunlock( t->cj->ghost , t2 ); } - + } /* Did we already create indices? */ @@ -987,13 +987,28 @@ void space_maketasks ( struct space *s , int do_sort ) { for ( k = 0 ; k < s->nr_tasks ; k++ ) { t = &s->tasks[k]; if ( ( t->type == task_type_sort || t->type == task_type_ghost ) && t->nr_unlock_tasks == 0 ) { + if ( t->type == task_type_sort && t->ci->split ) + for ( i = 0 ; i < 13 ; i++ ) + if ( t->flags & ( 1 << i ) ) { + for ( j = 0 ; j < 8 ; j++ ) + if ( t->ci->progeny[j] != NULL ) + task_rmunlock( t->ci->progeny[j]->sorts[i] , t ); + t->ci->sorts[i] = NULL; + } t->type = task_type_none; - if ( t->ci->split ) - for ( j = 0 ; j < 8 ; j++ ) - if ( t->ci->progeny[j] != NULL && t->flags & ( 1 << j ) ) - task_rmunlock( t->ci->progeny[j]->sorts[j] , t ); } } + + /* Make each remaining ghost task unlock the ghosts of its progeny. */ + for ( k = 0 ; k < s->nr_tasks ; k++ ) { + t = &s->tasks[k]; + if ( t->type == task_type_ghost && t->ci->split ) + for ( j = 0 ; j < 8 ; j++ ) + if ( t->ci->progeny[j] != NULL ) + task_addunlock( t->ci->ghost , t->ci->progeny[j]->ghost ); + if ( t->type == task_type_ghost && ( t->ci->parent == NULL || t->ci->parent->ghost->type == task_type_none ) ) + t->flags = 1; + } /* Count the number of each task type. */ for ( k = 0 ; k < task_type_count ; k++ )