Commit c826e1a1 authored by Pedro Gonnet
Browse files

more efficient distribution of work in engine_prepare.


Former-commit-id: 1d78f54c1bc07c1f0d81e0451509058f106b6c08
parent 58d81f85
......@@ -72,7 +72,7 @@ void engine_prepare ( struct engine *e ) {
/* Rebuild the space. */
// tic = getticks();
space_prepare( e->s );
// printf( "engine_prepare: space_prepare with %i changes took %.3f ms.\n" , changes , (double)(getticks() - tic) / CPU_TPS * 1000 );
// printf( "engine_prepare: space_prepare took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
// tic = getticks();
/* Init the queues (round-robin). */
......@@ -92,7 +92,7 @@ void engine_prepare ( struct engine *e ) {
/* Re-set the particle data. */
// tic = getticks();
#pragma omp parallel for schedule(static) private(j)
#pragma omp parallel for schedule(static) private(j)
for ( k = 0 ; k < s->nr_parts ; k++ )
if ( s->parts[k].dt <= dt_step ) {
s->parts[k].wcount = 0.0f;
......@@ -171,42 +171,40 @@ void engine_barrier( struct engine *e ) {
/**
* @brief Let the #engine loose to compute the forces.
*
* @param e The #engine.
* @param sort_queues Flag to try to sort the queues topologically.
* @brief Mapping function to set dt_min and dt_max, do the first
* kick.
*/
void engine_step ( struct engine *e , int sort_queues ) {
void engine_map_kick_first ( struct cell *c , void *data ) {
int k, nr_parts = e->s->nr_parts;
struct part *restrict parts = e->s->parts, *restrict p;
int j, k;
struct engine *e = (struct engine *)data;
float dt_step = e->dt_step, dt = e->dt, hdt = 0.5f*dt;
float dt_min, dt_max, h_max, dx_max;
struct part *restrict p;
struct xpart *restrict xp;
float dt = e->dt, hdt = 0.5*dt, dt_step, dt_max, dt_min, ldt_min, ldt_max;
double epot = 0.0, ekin = 0.0, lepot, lekin, lmom[3], mom[3] = { 0.0 , 0.0 , 0.0 };
double lang[3], ang[3] = { 0.0 , 0.0 , 0.0 };
double lent, ent = 0.0;
int threadID, nthreads, count = 0, lcount;
struct cpart *restrict cp;
/* Get the maximum dt. */
dt_step = 2.0f*dt;
for ( k = 0 ; k < 32 && (e->step & (1 << k)) == 0 ; k++ )
dt_step *= 2;
// dt_step = FLT_MAX;
/* No children? */
if ( !c->split ) {
/* Set the maximum dt. */
e->dt_step = dt_step;
e->s->dt_step = dt_step;
printf( "engine_step: dt_step set to %.3e.\n" , dt_step ); fflush(stdout);
/* Init the min/max counters. */
dt_min = FLT_MAX;
dt_max = 0.0f;
h_max = 0.0f;
dx_max = 0.0f;
/* First kick. */
TIMER_TIC
#pragma omp parallel for schedule(static) private(p,xp)
for ( k = 0 ; k < nr_parts ; k++ ) {
/* Loop over parts. */
for ( k = 0 ; k < c->count ; k++ ) {
/* Get a handle on the part. */
p = &parts[k];
/* Get a handle on the kth particle. */
p = &c->parts[k];
xp = p->xtras;
cp = &c->cparts[k];
/* Store the min/max dt. */
dt_min = fminf( dt_min , p->dt );
dt_max = fmaxf( dt_max , p->dt );
/* Step and store the velocity and internal energy. */
xp->v_old[0] = p->v[0] + hdt * p->a[0];
......@@ -218,6 +216,9 @@ void engine_step ( struct engine *e , int sort_queues ) {
p->x[0] += dt * xp->v_old[0];
p->x[1] += dt * xp->v_old[1];
p->x[2] += dt * xp->v_old[2];
dx_max = fmaxf( dx_max , sqrtf( (p->x[0] - xp->x_old[0])*(p->x[0] - xp->x_old[0]) +
(p->x[1] - xp->x_old[1])*(p->x[1] - xp->x_old[1]) +
(p->x[2] - xp->x_old[2])*(p->x[2] - xp->x_old[2]) )*2 + p->h );
/* Update positions and energies at the half-step. */
p->v[0] += dt * p->a[0];
......@@ -225,6 +226,7 @@ void engine_step ( struct engine *e , int sort_queues ) {
p->v[2] += dt * p->a[2];
p->u *= expf( p->force.u_dt / p->u * dt );
p->h *= expf( p->force.h_dt / p->h * dt );
h_max = fmaxf( h_max , p->h );
/* Integrate other values if this particle will not be updated. */
if ( p->dt > dt_step ) {
......@@ -232,7 +234,95 @@ void engine_step ( struct engine *e , int sort_queues ) {
p->force.POrho2 = p->u * ( const_gamma - 1.0f ) / ( p->rho + p->h * p->rho_dh / 3.0f );
}
/* Fill the cpart. */
cp->x[0] = p->x[0];
cp->x[1] = p->x[1];
cp->x[2] = p->x[2];
cp->h = p->h;
cp->dt = p->dt;
/* Init fields for density calculation. */
if ( p->dt <= dt_step ) {
p->wcount = 0.0f;
p->density.wcount_dh = 0.0f;
p->rho = 0.0f;
p->rho_dh = 0.0f;
p->density.div_v = 0.0f;
for ( j = 0 ; j < 3 ; ++j)
p->density.curl_v[j] = 0.0f;
}
}
}
/* Otherwise, aggregate data from children. */
else {
/* Init with the first non-null child. */
for ( k = 0 ; c->progeny[k] == 0 ; k++ );
dt_min = c->progeny[k]->dt_min;
dt_max = c->progeny[k]->dt_max;
h_max = c->progeny[k]->h_max;
dx_max = c->progeny[k]->dx_max;
/* Loop over the remaining progeny. */
for ( k += 1 ; k < 8 ; k++ )
if ( c->progeny[k] != NULL ) {
dt_min = fminf( dt_min , c->progeny[k]->dt_min );
dt_max = fmaxf( dt_max , c->progeny[k]->dt_max );
h_max = fmaxf( h_max , c->progeny[k]->h_max );
dx_max = fmaxf( dx_max , c->progeny[k]->dx_max );
}
}
/* Store the values. */
c->dt_min = dt_min;
c->dt_max = dt_max;
c->h_max = h_max;
c->dx_max = dx_max;
/* Clean out the task pointers. */
c->sorts[0] = NULL;
c->nr_tasks = 0;
c->nr_density = 0;
}
/**
* @brief Let the #engine loose to compute the forces.
*
* @param e The #engine.
* @param sort_queues Flag to try to sort the queues topologically.
*/
void engine_step ( struct engine *e , int sort_queues ) {
int k, nr_parts = e->s->nr_parts;
struct part *restrict parts = e->s->parts, *restrict p;
struct xpart *restrict xp;
float dt = e->dt, hdt = 0.5*dt, dt_step, dt_max, dt_min, ldt_min, ldt_max;
double epot = 0.0, ekin = 0.0, lepot, lekin, lmom[3], mom[3] = { 0.0 , 0.0 , 0.0 };
double lang[3], ang[3] = { 0.0 , 0.0 , 0.0 };
double lent, ent = 0.0;
int threadID, nthreads, count = 0, lcount;
/* Get the maximum dt. */
dt_step = 2.0f*dt;
for ( k = 0 ; k < 32 && (e->step & (1 << k)) == 0 ; k++ )
dt_step *= 2;
// dt_step = FLT_MAX;
/* Set the maximum dt. */
e->dt_step = dt_step;
e->s->dt_step = dt_step;
printf( "engine_step: dt_step set to %.3e.\n" , dt_step ); fflush(stdout);
/* First kick. */
TIMER_TIC
space_map_cells_post( e->s , 1 , &engine_map_kick_first , e );
TIMER_TOC( timer_kick1 );
// for(k=0; k<10; ++k)
......
......@@ -85,8 +85,9 @@ void space_map_prepare ( struct cell *c , void *data ) {
int k;
float dt_min, dt_max, h_max, dx_max;
struct part *p;
struct xpart *xp;
struct part *restrict p;
struct xpart *restrict xp;
struct cpart *restrict cp;
/* No children? */
if ( !c->split ) {
......@@ -94,23 +95,36 @@ void space_map_prepare ( struct cell *c , void *data ) {
/* Init with first part. */
p = &c->parts[0];
xp = p->xtras;
cp = &c->cparts[0];
dt_min = p->dt;
dt_max = p->dt;
h_max = p->h;
dx_max = sqrtf( (p->x[0] - xp->x_old[0])*(p->x[0] - xp->x_old[0]) +
(p->x[1] - xp->x_old[1])*(p->x[1] - xp->x_old[1]) +
(p->x[2] - xp->x_old[2])*(p->x[2] - xp->x_old[2]) )*2 + p->h;
cp->x[0] = p->x[0];
cp->x[1] = p->x[1];
cp->x[2] = p->x[2];
cp->h = p->h;
cp->dt = p->dt;
/* Loop over parts. */
for ( k = 1 ; k < c->count ; k++ ) {
p = &c->parts[k];
xp = p->xtras;
cp = &c->cparts[0];
dt_min = fminf( dt_min , p->dt );
dt_max = fmaxf( dt_max , p->dt );
h_max = fmaxf( h_max , p->h );
dx_max = fmaxf( dx_max , sqrtf( (p->x[0] - xp->x_old[0])*(p->x[0] - xp->x_old[0]) +
(p->x[1] - xp->x_old[1])*(p->x[1] - xp->x_old[1]) +
(p->x[2] - xp->x_old[2])*(p->x[2] - xp->x_old[2]) )*2 + p->h );
cp->x[0] = p->x[0];
cp->x[1] = p->x[1];
cp->x[2] = p->x[2];
cp->h = p->h;
cp->dt = p->dt;
}
}
......@@ -166,9 +180,12 @@ void space_prepare ( struct space *s ) {
struct task *t;
float dt_step = s->dt_step, dx_max = 0.0f;
int counts[ task_type_count + 1 ];
ticks tic;
/* Traverse the cells and set their dt_min and dx_max. */
space_map_cells_post( s , 1 , &space_map_prepare , NULL );
// tic = getticks();
// space_map_cells_post( s , 1 , &space_map_prepare , NULL );
// printf( "space_prepare: space_map_prepare took %.3f ms.\n" , (double)(getticks() - tic)/CPU_TPS*1000 );
/* Get the maximum displacement in the whole system. */
for ( k = 0 ; k < s->nr_cells ; k++ )
......@@ -176,6 +193,7 @@ void space_prepare ( struct space *s ) {
printf( "space_prepare: dx_max is %e.\n" , dx_max );
/* Run through the tasks and mark as skip or not. */
tic = getticks();
for ( k = 0 ; k < s->nr_tasks ; k++ ) {
t = &s->tasks[k];
if ( t->type == task_type_sort ||
......@@ -190,34 +208,30 @@ void space_prepare ( struct space *s ) {
break;
}
}
printf( "space_prepare: checking tasks took %.3f ms.\n" , (double)(getticks() - tic)/CPU_TPS*1000 );
/* Did this not go through? */
if ( k < s->nr_tasks ) {
/* Re-build the space. */
tic = getticks();
space_rebuild( s , 0.0 );
printf( "space_prepare: space_rebuild took %.3f ms.\n" , (double)(getticks() - tic)/CPU_TPS*1000 );
/* Traverse the cells and set their dt_min and dx_max. */
tic = getticks();
space_map_cells_post( s , 1 , &space_map_prepare , NULL );
printf( "space_prepare: space_map_prepare took %.3f ms.\n" , (double)(getticks() - tic)/CPU_TPS*1000 );
}
/* Store the condensed particle data. */
#pragma omp parallel for schedule(static)
for ( k = 0 ; k < s->nr_parts ; k++ ) {
s->cparts[k].x[0] = s->parts[k].x[0];
s->cparts[k].x[1] = s->parts[k].x[1];
s->cparts[k].x[2] = s->parts[k].x[2];
s->cparts[k].h = s->parts[k].h;
s->cparts[k].dt = s->parts[k].dt;
}
/* Now that we have the cell structure, re-build the tasks. */
// tic = getticks();
tic = getticks();
space_maketasks( s , 1 );
// printf( "space_prepare: maketasks took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
printf( "space_prepare: maketasks took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
/* Count the number of each task type. */
tic = getticks();
for ( k = 0 ; k <= task_type_count ; k++ )
counts[k] = 0;
for ( k = 0 ; k < s->nr_tasks ; k++ )
......@@ -229,6 +243,7 @@ void space_prepare ( struct space *s ) {
for ( k = 1 ; k < task_type_count ; k++ )
printf( " %s=%i" , taskID_names[k] , counts[k] );
printf( " skipped=%i ]\n" , counts[ task_type_count ] ); fflush(stdout);
printf( "space_prepare: task counting took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 );
}
......@@ -762,6 +777,7 @@ void space_map_parts ( struct space *s , void (*fun)( struct part *p , struct ce
}
/* Call the recursive function on all higher-level cells. */
#pragma omp parallel for schedule(dynamic,1)
for ( i = 0 ; i < s->nr_cells ; i++ )
rec_map( &s->cells[i] );
......@@ -798,6 +814,7 @@ void space_map_cells_post ( struct space *s , int full , void (*fun)( struct cel
}
/* Call the recursive function on all higher-level cells. */
#pragma omp parallel for schedule(dynamic,1)
for ( i = 0 ; i < s->nr_cells ; i++ )
rec_map( &s->cells[i] );
......@@ -825,6 +842,7 @@ void space_map_cells_pre ( struct space *s , int full , void (*fun)( struct cell
}
/* Call the recursive function on all higher-level cells. */
#pragma omp parallel for schedule(dynamic,1)
for ( i = 0 ; i < s->nr_cells ; i++ )
rec_map( &s->cells[i] );
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment