diff --git a/src/engine.c b/src/engine.c index 1f93b1ab9884fd6834cacb60f00fcdce04f5a422..62b9273db671e667eb46e4fb66261d1a09fff491 100644 --- a/src/engine.c +++ b/src/engine.c @@ -60,55 +60,60 @@ void engine_prepare ( struct engine *e , int force ) { - int j, k, qid, changes; + int j, k, qid, changes, count; struct space *s = e->s; + // ticks tic; /* Rebuild the space. */ + // tic = getticks(); changes = space_rebuild( e->s , force , 0 ); - // printf( "engine_prepare: space_rebuild with %i changes.\n" , changes ); + // printf( "engine_prepare: space_rebuild with %i changes took %.3f ms.\n" , changes , (double)(getticks() - tic) / CPU_TPS * 1000 ); /* Has anything changed? */ + // tic = getticks(); if ( changes ) { /* Rank the tasks in topological order. */ engine_ranktasks( e ); - /* Clear the queues. */ - for ( k = 0 ; k < e->nr_queues ; k++ ) - e->queues[k].count = 0; - - /* Re-allocate the queue buffers? */ - for ( k = 0 ; k < e->nr_queues ; k++ ) - queue_init( &e->queues[k] , s->nr_tasks , s->tasks ); - /* Fill the queues (round-robin). */ - for ( k = 0 ; k < s->nr_tasks ; k++ ) { - if ( s->tasks[ s->tasks_ind[k] ].type == task_type_none ) - continue; - qid = k % e->nr_queues; - e->queues[qid].tid[ e->queues[qid].count ] = s->tasks_ind[k]; - e->queues[qid].count += 1; + #pragma omp parallel for schedule(static) private(count,k) + for ( qid = 0 ; qid < e->nr_queues ; qid++ ) { + queue_init( &e->queues[qid] , s->nr_tasks , s->tasks ); + for ( count = 0 , k = qid ; k < s->nr_tasks ; k += e->nr_queues ) { + if ( s->tasks[ s->tasks_ind[k] ].type == task_type_none ) + continue; + e->queues[qid].tid[ count ] = s->tasks_ind[k]; + count += 1; + } + e->queues[qid].count = count; + e->queues[qid].next = 0; } } + // printf( "engine_prepare: re-filling queues took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); /* Re-set the particle data. */ - #pragma omp parallel for + // tic = getticks(); + #pragma omp parallel for schedule(static) for ( k = 0 ; k < s->nr_parts ; k++ ) { s->parts[k].wcount = 0.0f; s->parts[k].wcount_dh = 0.0f; s->parts[k].rho = 0.0f; s->parts[k].rho_dh = 0.0f; } + // printf( "engine_prepare: re-setting particle data took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); /* Run throught the tasks and get all the waits right. */ - #pragma omp parallel for private(j) + // tic = getticks(); + #pragma omp parallel for schedule(static) private(j) for ( k = 0 ; k < s->nr_tasks ; k++ ) { for ( j = 0 ; j < s->tasks[k].nr_unlock_tasks ; j++ ) __sync_add_and_fetch( &s->tasks[k].unlock_tasks[j]->wait , 1 ); for ( j = 0 ; j < s->tasks[k].nr_unlock_cells ; j++ ) __sync_add_and_fetch( &s->tasks[k].unlock_cells[j]->wait , 1 ); } + // printf( "engine_prepare: preparing task dependencies took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); /* Re-set the queues.*/ for ( k = 0 ; k < e->nr_queues ; k++ ) diff --git a/src/space.c b/src/space.c index 858fc7313f4ce15189b6f49390b61949d4e261ac..0b5f5bec01b2a022203efbe9cb2465435d107871 100644 --- a/src/space.c +++ b/src/space.c @@ -269,8 +269,10 @@ int space_rebuild ( struct space *s , int force , double cell_max ) { struct part *finger; struct cpart *cfinger; int *ind, changes = 0; + // ticks tic; /* Run through the parts and get the current h_max. */ + // tic = getticks(); for ( k = 0 ; k < s->nr_parts ; k++ ) { if ( s->parts[k].h > h_max ) h_max = s->parts[k].h; @@ -279,12 +281,14 @@ int space_rebuild ( struct space *s , int force , double cell_max ) { } s->h_min = h_min; s->h_max = h_max; + // printf( "space_rebuild: getting h_min and h_max took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); /* Get the new putative cell dimensions. */ for ( k = 0 ; k < 3 ; k++ ) cdim[k] = floor( s->dim[k] / fmax( h_max*space_stretch , cell_max ) ); /* Do we need to re-build the upper-level cells? */ + // tic = getticks(); if ( force || cdim[0] < s->cdim[0] || cdim[1] < s->cdim[1] || cdim[2] < s->cdim[2] ) { /* Free the old cells, if they were allocated. */ @@ -328,9 +332,11 @@ int space_rebuild ( struct space *s , int force , double cell_max ) { changes = 1; } /* re-build upper-level cells? */ + // printf( "space_rebuild: rebuilding upper-level cells took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); /* Run through the particles and get their cell index. */ + // tic = getticks(); if ( ( ind = (int *)malloc( sizeof(int) * s->nr_parts ) ) == NULL ) error( "Failed to allocate temporary particle indices." ); for ( k = 0 ; k < s->nr_cells ; k++ ) @@ -339,14 +345,19 @@ int space_rebuild ( struct space *s , int force , double cell_max ) { ind[k] = cell_getid( s->cdim , s->parts[k].x[0]*s->ih[0] , s->parts[k].x[1]*s->ih[1] , s->parts[k].x[2]*s->ih[2] ); s->cells[ ind[k] ].count += 1; } + // printf( "space_rebuild: getting particle indices took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); /* Sort the parts according to their cells. */ + // tic = getticks(); parts_sort( s->parts , ind , s->nr_parts , 0 , s->nr_cells ); + // printf( "space_rebuild: parts_sort took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); /* We no longer need the indices as of here. */ free( ind ); /* Update the condensed particle data. */ + // tic = getticks(); + #pragma omp parallel for schedule(static) for ( k = 0 ; k < s->nr_parts ; k++ ) { s->cparts[k].x[0] = s->parts[k].x[0]; s->cparts[k].x[1] = s->parts[k].x[1]; @@ -354,8 +365,10 @@ int space_rebuild ( struct space *s , int force , double cell_max ) { s->cparts[k].h = s->parts[k].h; s->cparts[k].dt = s->parts[k].dt; } + // printf( "space_rebuild: creating condensed parts took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); /* Hook the cells up to the parts. */ + // tic = getticks(); finger = s->parts; cfinger = s->cparts; for ( k = 0 ; k < s->nr_cells ; k++ ) { @@ -365,17 +378,22 @@ int space_rebuild ( struct space *s , int force , double cell_max ) { finger = &finger[ c->count ]; cfinger = &cfinger[ c->count ]; } + // printf( "space_rebuild: hooking up cells took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); /* At this point, we have the upper-level cells, old or new. Now make sure that the parts in each cell are ok. */ - #pragma omp parallel for shared(s) reduction(+:changes) + // tic = getticks(); + #pragma omp parallel for schedule(dynamic) shared(s) reduction(+:changes) for ( k = 0 ; k < s->nr_cells ; k++ ) changes += space_rebuild_recurse( s , &s->cells[k] ); + // printf( "space_rebuild: space_rebuild_recurse took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); /* Now that we have the cell structre, re-build the tasks. */ + // tic = getticks(); if ( changes ) space_maketasks( s , 1 ); + // printf( "space_rebuild: maketasks took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); /* Return the number of changes. */ return changes; @@ -460,16 +478,16 @@ void parts_sort ( struct part *parts , int *ind , int N , int min , int max ) { } else - #pragma omp parallel sections + // #pragma omp parallel sections { /* Recurse on the left? */ - #pragma omp section + // #pragma omp section if ( j > 0 && pivot > min ) parts_sort( parts , ind , j+1 , min , pivot ); /* Recurse on the right? */ - #pragma omp section + // #pragma omp section if ( i < N && pivot+1 < max ) parts_sort( &parts[i], &ind[i], N-i , pivot+1 , max );