diff --git a/src/Makefile.am b/src/Makefile.am index f1376a6258180bd0a72306a60af0beece6e097d8..820ca314772f406ed900260a79824a7c842f7597 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -22,7 +22,7 @@ AUTOMAKE_OPTIONS=gnu # Add the debug flag to the whole thing AM_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \ -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \ - -DTIMER -DCOUNTER -DCPU_TPS=2.40e9 + -DTIMER -DCOUNTER -DCPU_TPS=2.40e9 -mfma4 # AM_CFLAGS = -Wall -Werror $(OPENMP_CFLAGS) \ # -DTIMER -DCOUNTER -DCPU_TPS=2.67e9 diff --git a/src/engine.c b/src/engine.c index 8637ee07f44b8f88aa00d9dab9f83e1ed4318cc6..e1e1e6c6c500d6280ad91dafc701f76dcd36c88c 100644 --- a/src/engine.c +++ b/src/engine.c @@ -326,7 +326,7 @@ void engine_prepare ( struct engine *e ) { /* Run through the tasks and mark as skip or not. */ // tic = getticks(); - rebuild = ( e->step == 0 || engine_marktasks( e ) ); + rebuild = 1 || ( e->step == 0 || engine_marktasks( e ) ); // printf( "space_prepare: space_marktasks took %.3f ms.\n" , (double)(getticks() - tic)/CPU_TPS*1000 ); /* Did this not go through? */ diff --git a/src/scheduler.c b/src/scheduler.c index 66708fb14ca4ac6014502771bc1029659b889b9e..aef13a10bacc5147fb5ed17a5d71b3eb846319ee 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -625,7 +625,7 @@ void scheduler_start ( struct scheduler *s , unsigned int mask ) { void scheduler_enqueue ( struct scheduler *s , struct task *t ) { - int k, qid = -1; + int qid = -1; /* Ignore skipped tasks. */ if ( t->skip ) @@ -652,9 +652,10 @@ void scheduler_enqueue ( struct scheduler *s , struct task *t ) { /* If no previous owner, find the shortest queue. */ if ( qid < 0 ) - for ( qid = 0 , k = 1 ; k < s->nr_queues ; k++ ) + qid = rand() % s->nr_queues; + /* for ( qid = 0 , int k = 1 ; k < s->nr_queues ; k++ ) if ( s->queues[k].count < s->queues[qid].count ) - qid = k; + qid = k; */ /* Increase the waiting counter. */ atomic_inc( &s->waiting ); @@ -674,7 +675,7 @@ void scheduler_enqueue ( struct scheduler *s , struct task *t ) { void scheduler_done ( struct scheduler *s , struct task *t ) { - int k; + int k, res; struct task *t2; /* Release whatever locks this task held. */ @@ -695,15 +696,17 @@ void scheduler_done ( struct scheduler *s , struct task *t ) { they are ready. */ for ( k = 0 ; k < t->nr_unlock_tasks ; k++ ) { t2 = t->unlock_tasks[k]; - if ( atomic_dec( &t2->wait ) == 1 && !t2->skip ) + if ( ( res = atomic_dec( &t2->wait ) ) < 1 ) + error( "Negative wait!" ); + if ( res == 1 && !t2->skip ) scheduler_enqueue( s , t2 ); } /* Task definitely done. */ - pthread_mutex_lock( &s->sleep_mutex ); + // pthread_mutex_lock( &s->sleep_mutex ); atomic_dec( &s->waiting ); - pthread_cond_broadcast( &s->sleep_cond ); - pthread_mutex_unlock( &s->sleep_mutex ); + // pthread_cond_broadcast( &s->sleep_cond ); + // pthread_mutex_unlock( &s->sleep_mutex ); } @@ -751,10 +754,10 @@ struct task *scheduler_gettask ( struct scheduler *s , int qid ) { } /* If we failed, take a short nap. */ - pthread_mutex_lock( &s->sleep_mutex ); + /* pthread_mutex_lock( &s->sleep_mutex ); if ( s->waiting > 0 ) pthread_cond_wait( &s->sleep_cond , &s->sleep_mutex ); - pthread_mutex_unlock( &s->sleep_mutex ); + pthread_mutex_unlock( &s->sleep_mutex ); */ } diff --git a/src/scheduler.h b/src/scheduler.h index b20c7ecaf793bc0dceb6d29ad4f74d0423ba7067..b1d6d8a274a09f63205446027cc23a465e30c09e 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -27,7 +27,7 @@ /* Flags . */ #define scheduler_flag_none 0 #define scheduler_flag_steal 1 -#define scheduler_flag_maxsteal 2 +#define scheduler_flag_maxsteal 1 /* Data of a scheduler. */ diff --git a/src/space.c b/src/space.c index 97e1e45109a5836019c7db11342d8132be206b36..40ed7412efb66180c1749e5c332c774554619aa2 100644 --- a/src/space.c +++ b/src/space.c @@ -161,7 +161,7 @@ void space_rebuild ( struct space *s , double cell_max ) { struct part *restrict finger, *restrict p, *parts = s->parts; int *ind; double ih[3], dim[3]; - // ticks tic; + ticks tic; /* Be verbose about this. */ printf( "space_rebuild: (re)building space...\n" ); fflush(stdout); @@ -252,6 +252,7 @@ void space_rebuild ( struct space *s , double cell_max ) { s->cells[k].dx_max = 0.0f; s->cells[k].sorted = 0; s->cells[k].count = 0; + s->cells[k].kick1 = NULL; s->cells[k].kick2 = NULL; } s->maxdepth = 0; @@ -259,7 +260,7 @@ void space_rebuild ( struct space *s , double cell_max ) { } /* Run through the particles and get their cell index. */ - // tic = getticks(); + tic = getticks(); if ( ( ind = (int *)malloc( sizeof(int) * s->nr_parts ) ) == NULL ) error( "Failed to allocate temporary particle indices." ); ih[0] = s->ih[0]; ih[1] = s->ih[1]; ih[2] = s->ih[2]; @@ -276,12 +277,12 @@ void space_rebuild ( struct space *s , double cell_max ) { ind[k] = cell_getid( cdim , p->x[0]*ih[0] , p->x[1]*ih[1] , p->x[2]*ih[2] ); atomic_inc( &s->cells[ ind[k] ].count ); } - // printf( "space_rebuild: getting particle indices took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); + printf( "space_rebuild: getting particle indices took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); /* Sort the parts according to their cells. */ - // tic = getticks(); + tic = getticks(); parts_sort( parts , ind , s->nr_parts , 0 , s->nr_cells-1 ); - // printf( "space_rebuild: parts_sort took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); + printf( "space_rebuild: parts_sort took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); /* Verify sort. */ /* for ( k = 1 ; k < nr_parts ; k++ ) { @@ -307,21 +308,19 @@ void space_rebuild ( struct space *s , double cell_max ) { /* At this point, we have the upper-level cells, old or new. Now make sure that the parts in each cell are ok. */ - // tic = getticks(); + tic = getticks(); k = 0; - #pragma omp parallel shared(s,k) + #pragma omp parallel num_threads(8) shared(s,k) { while ( 1 ) { - int myk; - #pragma omp critical - myk = k++; + int myk = atomic_inc( &k ); if ( myk < s->nr_cells ) space_split( s , &s->cells[myk] ); else break; } } - // printf( "space_rebuild: space_rebuild_recurse took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); + printf( "space_rebuild: space_rebuild_recurse took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); } @@ -358,7 +357,7 @@ void parts_sort ( struct part *parts , int *ind , int N , int min , int max ) { first = 0; last = 1; waiting = 1; /* Parallel bit. */ - #pragma omp parallel default(shared) private(pivot,i,ii,j,jj,min,max,temp_i,qid,temp_p) + #pragma omp parallel num_threads(8) default(shared) private(pivot,i,ii,j,jj,min,max,temp_i,qid,temp_p) { /* Main loop. */ @@ -565,11 +564,11 @@ void space_map_cells_post ( struct space *s , int full , void (*fun)( struct cel } /* Call the recursive function on all higher-level cells. */ - #pragma omp parallel shared(s,cid) + // #pragma omp parallel shared(s,cid) { int mycid; while ( 1 ) { - #pragma omp critical + // #pragma omp critical mycid = cid++; if ( mycid < s->nr_cells ) rec_map( &s->cells[mycid] ); @@ -602,11 +601,11 @@ void space_map_cells_pre ( struct space *s , int full , void (*fun)( struct cell } /* Call the recursive function on all higher-level cells. */ - #pragma omp parallel shared(s,cid) + // #pragma omp parallel shared(s,cid) { int mycid; while ( 1 ) { - #pragma omp critical + // #pragma omp critical mycid = cid++; if ( mycid < s->nr_cells ) rec_map( &s->cells[mycid] ); @@ -790,15 +789,22 @@ struct cell *space_getcell ( struct space *s ) { s->cells_new = c->next; s->tot_cells += 1; + /* Unlock the space. */ + lock_unlock_blind( &s->lock ); + /* Init some things in the cell. */ - bzero( c , sizeof(struct cell) ); + c->sorts = NULL; + c->nr_tasks = 0; + c->nr_density = 0; + c->dx_max = 0.0f; + c->sorted = 0; + c->count = 0; + c->kick1 = NULL; + c->kick2 = NULL; if ( lock_init( &c->lock ) != 0 ) error( "Failed to initialize cell spinlock." ); c->owner = -1; - /* Unlock the space. */ - lock_unlock_blind( &s->lock ); - return c; } diff --git a/src/space.h b/src/space.h index 88b012a2279c1dcc679f482f46d039e53251af54..40788bd24f70e768e83724447818da9a185c9643 100644 --- a/src/space.h +++ b/src/space.h @@ -26,8 +26,8 @@ #define space_splitratio 0.875f #define space_splitsize_default 400 #define space_subsize_default 5000 -#define space_stretch 1.05f -#define space_maxreldx 0.2f +#define space_stretch 1.10f +#define space_maxreldx 0.25f #define space_qstack 1000