From c969365b55de835660b29f371af01080f70b332d Mon Sep 17 00:00:00 2001 From: Pedro Gonnet <pedro.gonnet@durham.ac.uk> Date: Thu, 7 Feb 2013 18:22:15 +0000 Subject: [PATCH] several bug fixes using the new test cases. Former-commit-id: 36567220a6b658f9dbff8091561e53ed45c4bf7a --- examples/PertubedBox/makeIC.py | 2 +- examples/UniformBox/makeIC.py | 2 +- examples/test.c | 54 ++++++++++--- src/Makefile.am | 8 +- src/debug.c | 15 +++- src/debug.h | 2 +- src/engine.c | 135 ++++++++++++++++++++++----------- src/runner.c | 22 +++--- src/runner_iact.h | 10 +-- src/space.c | 37 +++++---- src/vector.h | 18 ++++- 11 files changed, 207 insertions(+), 98 deletions(-) diff --git a/examples/PertubedBox/makeIC.py b/examples/PertubedBox/makeIC.py index 03c97798bd..8301b0f630 100644 --- a/examples/PertubedBox/makeIC.py +++ b/examples/PertubedBox/makeIC.py @@ -32,7 +32,7 @@ L = 50 # Number of particles along one axis rho = 1. # Density P = 1. # Pressure gamma = 5./3. # Gas adiabatic index -pert = 0.1 # Perturbation scale (in units of the interparticle separation) +pert = 0.01 # Perturbation scale (in units of the interparticle separation) fileName = "perturbedBox.hdf5" diff --git a/examples/UniformBox/makeIC.py b/examples/UniformBox/makeIC.py index b68badbe80..e61106d30c 100644 --- a/examples/UniformBox/makeIC.py +++ b/examples/UniformBox/makeIC.py @@ -28,7 +28,7 @@ from numpy import * periodic= 1 # 1 For periodic box boxSize = 1. L = 50 # Number of particles along one axis -rho = 1. # Density +rho = 2. # Density P = 1. # Pressure gamma = 5./3. # Gas adiabatic index fileName = "uniformBox.hdf5" diff --git a/examples/test.c b/examples/test.c index 7a98ab26ec..57669065b0 100644 --- a/examples/test.c +++ b/examples/test.c @@ -28,6 +28,7 @@ #include <string.h> #include <pthread.h> #include <math.h> +#include <fenv.h> #include <omp.h> /* Conditional headers. */ @@ -193,6 +194,24 @@ void map_wcount_max ( struct part *p , struct cell *c , void *data ) { } +void map_h_min ( struct part *p , struct cell *c , void *data ) { + + struct part **p2 = (struct part **)data; + + if ( p->h < (*p2)->h ) + *p2 = p; + + } + +void map_h_max ( struct part *p , struct cell *c , void *data ) { + + struct part **p2 = (struct part **)data; + + if ( p->h > (*p2)->h ) + *p2 = p; + + } + /** * @brief Mapping function for neighbour count. @@ -689,6 +708,9 @@ int main ( int argc , char *argv[] ) { float dt_max = 0.0f; ticks tic; + /* Choke on FP-exceptions. */ + feenableexcept( FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW ); + /* Init the space. */ bzero( &s , sizeof(struct space) ); @@ -862,6 +884,7 @@ int main ( int argc , char *argv[] ) { parts[k].x[2] += shift[2]; } + /* Dump the first few particles. */ for(k=0; k<10; ++k) printParticle(parts, k); @@ -910,12 +933,6 @@ int main ( int argc , char *argv[] ) { /* Dump the particle positions. */ // space_map_parts( &s , &map_dump , shift ); - /* Dump the acceleration of the first particle. */ - for ( k = 0 ; k < 3 ; k++ ) { - printf( "main: parts[%lli].a is [ %.16e %.16e %.16e ].\n" , s.parts[k].id , s.parts[k].a[0] , s.parts[k].a[1] , s.parts[k].a[2] ); - printf( "main: parts[%lli].a has h=%e, rho=%e, wcount=%.3f.\n" , s.parts[k].id , s.parts[k].h , s.parts[k].rho , s.parts[k].wcount + 32.0/3 ); - } - /* Initialize the runner with this space. */ tic = getticks(); engine_init( &e , &s , nr_threads , nr_queues , engine_policy_steal | engine_policy_keep ); @@ -943,6 +960,23 @@ int main ( int argc , char *argv[] ) { /* Take a step. */ engine_step( &e , 0 ); + /* Dump the first few particles. */ + for(k=0; k<10; ++k) + printParticle(parts, k); + printParticle( parts , 113531 ); + + /* Get the particle with the lowest h. */ + p = &s.parts[0]; + space_map_parts( &s , &map_h_min , &p ); + printf( "main: particle %lli/%i at [ %e %e %e ] has minimum h=%.3e (h_dt=%.3e).\n" , + p->id , (int)(p - s.parts) , p->x[0] , p->x[1] , p->x[2] , p->h , p->h_dt ); + + /* Get the particle with the highest h. */ + p = &s.parts[0]; + space_map_parts( &s , &map_h_max , &p ); + printf( "main: particle %lli/%i at [ %e %e %e ] has maximum h=%.3e (h_dt=%.3e).\n" , + p->id , (int)(p - s.parts) , p->x[0] , p->x[1] , p->x[2] , p->h , p->h_dt ); + /* Output. */ #ifdef TIMER printf( "main: runner timers are [ %.3f" , timers[0]/CPU_TPS*1000 ); @@ -1010,11 +1044,9 @@ int main ( int argc , char *argv[] ) { // space_map_parts( &s , &map_icount , &icount ); // printf( "main: average neighbours per particle is %.3f.\n" , (double)icount / s.nr_parts ); - /* Dump the acceleration of the first particle. */ - for ( k = 0 ; k < 3 ; k++ ) { - printf( "main: parts[%lli].a is [ %.16e %.16e %.16e ].\n" , s.parts[k].id , s.parts[k].a[0] , s.parts[k].a[1] , s.parts[k].a[2] ); - printf( "main: parts[%lli].a has h=%e, rho=%e, wcount=%.3f.\n" , s.parts[k].id , s.parts[k].h , s.parts[k].rho , s.parts[k].wcount ); - } + /* Dump the first few particles. */ + for(k=0; k<10; ++k) + printParticle(parts, k); /* Get all the cells of a certain depth. */ // icount = 1; diff --git a/src/Makefile.am b/src/Makefile.am index 53fc189f38..6acf9e47dd 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -20,11 +20,11 @@ AUTOMAKE_OPTIONS=gnu # Add the debug flag to the whole thing -AM_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \ - -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \ - -DTIMER -DCOUNTER -DCPU_TPS=2.67e9 -# AM_CFLAGS = -Wall -Werror $(OPENMP_CFLAGS) \ +# AM_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \ +# -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \ # -DTIMER -DCOUNTER -DCPU_TPS=2.67e9 +AM_CFLAGS = -Wall -Werror $(OPENMP_CFLAGS) \ + -DTIMER -DCOUNTER -DCPU_TPS=2.67e9 # Assign a "safe" version number AM_LDFLAGS = $(LAPACK_LIBS) $(BLAS_LIBS) $(HDF5_LDFLAGS) -version-info 0:0:0 diff --git a/src/debug.c b/src/debug.c index 194666f9a2..4d885521bb 100644 --- a/src/debug.c +++ b/src/debug.c @@ -22,17 +22,26 @@ #include "part.h" -void printParticle(struct part *parts, int i) -{ - printf("## Particle[%d]: id= %lld x=( %f, %f, %f) v=( %f, %f, %f) h= %f m= %f rho= %f u= %f dt= %f\n", +void printParticle ( struct part *parts , long long int id ) { + + int i; + + /* Look for the particle. */ + for ( i = 0 ; parts[i].id != id ; i++ ); + + printf("## Particle[%d]: id=%lld, x=(%f,%f,%f), v=(%f,%f,%f), a=(%f,%f,%f), h=%f, h_dt=%f, wcount=%f, m=%f, rho=%f, u=%f, dudt=%f, dt=%.3e\n", i, parts[i].id, parts[i].x[0], parts[i].x[1], parts[i].x[2], parts[i].v[0], parts[i].v[1], parts[i].v[2], + parts[i].a[0], parts[i].a[1], parts[i].a[2], parts[i].h, + parts[i].h_dt, + parts[i].wcount, parts[i].mass, parts[i].rho, parts[i].u, + parts[i].u_dt, parts[i].dt ); } diff --git a/src/debug.h b/src/debug.h index a75c61a1d5..693c50f6b0 100644 --- a/src/debug.h +++ b/src/debug.h @@ -20,4 +20,4 @@ -void printParticle(struct part *parts, int i); +void printParticle(struct part *parts, long long int i); diff --git a/src/engine.c b/src/engine.c index 70f7ca3f4f..7513914b11 100644 --- a/src/engine.c +++ b/src/engine.c @@ -39,6 +39,7 @@ #include "lock.h" #include "task.h" #include "part.h" +#include "debug.h" #include "cell.h" #include "space.h" #include "queue.h" @@ -65,6 +66,7 @@ void engine_prepare ( struct engine *e ) { int j, k, qid; struct space *s = e->s; struct queue *q; + float dt_max = e->dt_max; TIMER_TIC @@ -92,12 +94,13 @@ void engine_prepare ( struct engine *e ) { /* Re-set the particle data. */ // tic = getticks(); #pragma omp parallel for schedule(static) - for ( k = 0 ; k < s->nr_parts ; k++ ) { - s->parts[k].wcount = 0.0f; - s->parts[k].wcount_dh = 0.0f; - s->parts[k].rho = 0.0f; - s->parts[k].rho_dh = 0.0f; - } + for ( k = 0 ; k < s->nr_parts ; k++ ) + if ( s->parts[k].dt <= dt_max ) { + s->parts[k].wcount = 0.0f; + s->parts[k].wcount_dh = 0.0f; + s->parts[k].rho = 0.0f; + s->parts[k].rho_dh = 0.0f; + } // printf( "engine_prepare: re-setting particle data took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); /* Run throught the tasks and get all the waits right. */ @@ -177,15 +180,18 @@ void engine_step ( struct engine *e , int sort_queues ) { int k, nr_parts = e->s->nr_parts; struct part *restrict parts = e->s->parts, *restrict p; float *restrict v_bar; - float dt = e->dt, hdt = 0.5*dt, dt_max; - #ifdef __SSE2__ - VEC_MACRO(4,float) hdtv = _mm_set1_ps( hdt ); - #endif + float dt = e->dt, hdt = 0.5*dt, dt_max, dt_min, ldt_min, ldt_max; + double etot = 0.0, letot, lmom[3], mom[3] = { 0.0 , 0.0 , 0.0 }; + int threadID, nthreads; + // #ifdef __SSE2__ + // VEC_MACRO(4,float) hdtv = _mm_set1_ps( hdt ); + // #endif /* Get the maximum dt. */ - dt_max = dt; + dt_max = 2.0f*dt; for ( k = 0 ; k < 32 && (e->step & (1 << k)) == 0 ; k++ ) dt_max *= 2; + dt_max = 1; /* Set the maximum dt. */ e->dt_max = dt_max; @@ -198,43 +204,46 @@ void engine_step ( struct engine *e , int sort_queues ) { /* First kick. */ TIMER_TIC - #pragma omp parallel for schedule(static) private(p) + // #pragma omp parallel for schedule(static) private(p) for ( k = 0 ; k < nr_parts ; k++ ) { /* Get a handle on the part. */ p = &parts[k]; /* Step and store the velocity and internal energy. */ - #ifdef __SSE__ - _mm_store_ps( &v_bar[4*k] , _mm_add_ps( _mm_load_ps( &p->v[0] ) , _mm_mul_ps( hdtv , _mm_load_ps( &p->a[0] ) ) ) ); - #else + // #ifdef __SSE__ + // _mm_store_ps( &v_bar[4*k] , _mm_add_ps( _mm_load_ps( &p->v[0] ) , _mm_mul_ps( hdtv , _mm_load_ps( &p->a[0] ) ) ) ); + // #else v_bar[4*k+0] = p->v[0] + hdt * p->a[0]; v_bar[4*k+1] = p->v[1] + hdt * p->a[1]; v_bar[4*k+2] = p->v[2] + hdt * p->a[2]; - #endif + // #endif v_bar[4*k+3] = p->u + hdt * p->u_dt; /* Move the particles with the velocitie at the half-step. */ - // p->x[0] += dt * v_bar[3*k+0]; - // p->x[1] += dt * v_bar[3*k+1]; - // p->x[2] += dt * v_bar[3*k+2]; + p->x[0] += dt * v_bar[4*k+0]; + p->x[1] += dt * v_bar[4*k+1]; + p->x[2] += dt * v_bar[4*k+2]; /* Update positions and energies at the half-step. */ - // p->v[0] += dt * p->a[0]; - // p->v[1] += dt * p->a[1]; - // p->v[2] += dt * p->a[2]; - // p->u *= expf( p->u_dt / p->u * dt ); - // p->h *= expf( -1.0f * p->h_dt / p->h * dt ); + p->v[0] += dt * p->a[0]; + p->v[1] += dt * p->a[1]; + p->v[2] += dt * p->a[2]; + p->u *= expf( p->u_dt / p->u * dt ); + // p->h *= expf( p->h_dt / p->h * dt ); /* Integrate other values if this particle will not be updated. */ - if ( p->dt > dt_max ) { - p->rho *= expf( -3.0f * p->h_dt / p->h * dt ); - p->POrho2 = p->u * ( const_gamma - 1.0f ) / ( p->rho + p->h * p->rho_dh / 3.0f ); - } + // if ( p->dt > dt_max ) { + // p->rho *= expf( -3.0f * p->h_dt / p->h * dt ); + // p->POrho2 = p->u * ( const_gamma - 1.0f ) / ( p->rho + p->h * p->rho_dh / 3.0f ); + // } } TIMER_TOC( timer_kick1 ); + // for(k=0; k<10; ++k) + // printParticle(parts, k); + /* Prepare the space. */ engine_prepare( e ); @@ -263,43 +272,79 @@ void engine_step ( struct engine *e , int sort_queues ) { /* Stop the clock. */ TIMER_TOC(timer_step); + // for(k=0; k<10; ++k) + // printParticle(parts, k); + /* Second kick. */ TIMER_TIC_ND - e->dt_min = FLT_MAX; - #pragma omp parallel private(p,k) + dt_min = FLT_MAX; dt_max = 0.0f; + #pragma omp parallel private(p,k,ldt_min,ldt_max,lmom,letot,threadID,nthreads) { - int threadID = omp_get_thread_num(); - int nthreads = omp_get_num_threads(); - float dt_min = FLT_MAX; + threadID = omp_get_thread_num(); + nthreads = omp_get_num_threads(); + ldt_min = FLT_MAX; ldt_max = 0.0f; + lmom[0] = 0.0; lmom[1] = 0.0; lmom[2] = 0.0; + letot = 0.0; for ( k = nr_parts * threadID / nthreads ; k < nr_parts * (threadID + 1) / nthreads ; k++ ) { /* Get a handle on the part. */ p = &parts[k]; - /* Scale the derivatives. */ - p->u_dt *= p->POrho2; - p->h_dt *= p->h * 0.333333333f; + /* Scale the derivatives if they're freshly computed. */ + if ( p->dt <= dt_max ) { + p->u_dt *= p->POrho2; + p->h_dt *= p->h * 0.333333333f; + } /* Update positions and energies at the half-step. */ - #ifdef __SSE__ - _mm_store_ps( &p->v[0] , _mm_add_ps( _mm_load_ps( &v_bar[4*k] ) , _mm_mul_ps( hdtv , _mm_load_ps( &p->a[0] ) ) ) ); - #else + // #ifdef __SSE__ + // _mm_store_ps( &p->v[0] , _mm_add_ps( _mm_load_ps( &v_bar[4*k] ) , _mm_mul_ps( hdtv , _mm_load_ps( &p->a[0] ) ) ) ); + // #else p->v[0] = v_bar[4*k+0] + hdt * p->a[0]; p->v[1] = v_bar[4*k+1] + hdt * p->a[1]; p->v[2] = v_bar[4*k+2] + hdt * p->a[2]; - #endif - // p->u = v_bar[4*k+3] + hdt * p->u_dt; + // #endif + p->u = v_bar[4*k+3] + hdt * p->u_dt; - /* Get the smallest dt. */ - dt_min = fminf( dt_min , p->dt ); + /* Get the smallest/largest dt. */ + ldt_min = fminf( ldt_min , p->dt ); + ldt_max = fmaxf( ldt_max , p->dt ); + + /* Collect total energy. */ + letot += 0.5 * p->mass * ( p->v[0]*p->v[0] + p->v[1]*p->v[1] + p->v[2]*p->v[2] ) + p->mass * p->u; + + /* Collect momentum */ + lmom[0] += p->mass * p->v[0]; + lmom[1] += p->mass * p->v[1]; + lmom[2] += p->mass * p->v[2]; } #pragma omp critical - e->dt_min = fminf( e->dt_min , dt_min ); + { + dt_min = fminf( dt_min , ldt_min ); + dt_max = fmaxf( dt_max , ldt_max ); + mom[0] += lmom[0]; + mom[1] += lmom[1]; + mom[2] += lmom[2]; + etot += letot; + } } TIMER_TOC( timer_kick2 ); - printf( "engine_step: dt_min is %e.\n" , e->dt_min ); fflush(stdout); + e->dt_min = dt_min; + printf( "engine_step: dt_min/dt_max is %e/%e.\n" , dt_min , dt_max ); fflush(stdout); + printf( "engine_step: etot is %e.\n" , etot ); fflush(stdout); + printf( "engine_step: total momentum is [ %e , %e , %e ].\n" , mom[0] , mom[1] , mom[2] ); fflush(stdout); + /* Does the time step need adjusting? */ + if ( dt_min < e->dt ) { + e->dt *= 0.5; + printf( "engine_step: dt_min dropped below time step, adjusting to dt=%e.\n" , e->dt ); + } + else if ( dt_min > 2*e->dt ) { + e->dt *= 2.0; + printf( "engine_step: dt_min is larger than twice the time step, adjusting to dt=%e.\n" , e->dt ); + } + /* Clean up. */ free( v_bar ); diff --git a/src/runner.c b/src/runner.c index 52e6be949e..de4d5b7db7 100644 --- a/src/runner.c +++ b/src/runner.c @@ -325,6 +325,7 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags ) { void runner_doghost ( struct runner *r , struct cell *c ) { struct part *p; + struct cpart *cp; struct cell *finger; int i, k, redo, count = c->count; int *pid; @@ -357,9 +358,10 @@ void runner_doghost ( struct runner *r , struct cell *c ) { /* Get a direct pointer on the part. */ p = &c->parts[ pid[i] ]; + cp = &c->cparts[ pid[i] ]; /* Is this part within the timestep? */ - if ( p->dt <= dt_max ) { + if ( cp->dt <= dt_max ) { /* Adjust the computed rho. */ ihg = kernel_igamma / p->h; @@ -370,11 +372,12 @@ void runner_doghost ( struct runner *r , struct cell *c ) { /* Update the smoothing length. */ p->h -= ( p->wcount - const_nwneigh ) / p->wcount_dh; + cp->h = p->h; /* Did we get the right number density? */ if ( p->wcount > const_nwneigh + 1 || p->wcount < const_nwneigh - 1 ) { - printf( "runner_doghost: particle %lli (h=%e,depth=%i) has bad wcount=%f.\n" , p->id , p->h , c->depth , p->wcount ); fflush(stdout); + // printf( "runner_doghost: particle %lli (h=%e,h_dt=%e,depth=%i) has bad wcount=%.3f.\n" , p->id , p->h , p->h_dt , c->depth , p->wcount ); fflush(stdout); // p->h += ( p->wcount + kernel_root - const_nwneigh ) / p->wcount_dh; pid[redo] = pid[i]; redo += 1; @@ -387,6 +390,7 @@ void runner_doghost ( struct runner *r , struct cell *c ) { /* Compute this particle's time step. */ p->dt = const_cfl * p->h / sqrtf( const_gamma * ( const_gamma - 1.0f ) * p->u ); + cp->dt = p->dt; /* Compute the pressure. */ // p->P = p->rho * p->u * ( const_gamma - 1.0f ); @@ -394,15 +398,15 @@ void runner_doghost ( struct runner *r , struct cell *c ) { /* Compute the P/Omega/rho2. */ p->POrho2 = p->u * ( const_gamma - 1.0f ) / ( p->rho + p->h * p->rho_dh / 3.0f ); - } + /* Reset the acceleration. */ + for ( k = 0 ; k < 3 ; k++ ) + p->a[k] = 0.0f; - /* Reset the acceleration. */ - for ( k = 0 ; k < 3 ; k++ ) - p->a[k] = 0.0f; + /* Reset the time derivatives. */ + p->u_dt = 0.0f; + p->h_dt = 0.0f; - /* Reset the time derivatives. */ - p->u_dt = 0.0f; - p->h_dt = 0.0f; + } } diff --git a/src/runner_iact.h b/src/runner_iact.h index 0b1c2f93d9..f4b117ffa6 100644 --- a/src/runner_iact.h +++ b/src/runner_iact.h @@ -368,7 +368,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_force ( float r2 pj->u_dt += pi->mass * dvdr * wj_dr; /* Get the time derivative for h. */ - pi->h_dt += pj->mass / pj->rho * dvdr * wi_dr; + pi->h_dt -= pj->mass / pj->rho * dvdr * wi_dr; pj->h_dt -= pi->mass / pi->rho * dvdr * wj_dr; #ifdef HIST @@ -485,8 +485,8 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float for ( k = 0 ; k < VEC_SIZE ; k++ ) { pi[k]->u_dt += piu_dt.f[k]; pj[k]->u_dt += pju_dt.f[k]; - pi[k]->h_dt += pih_dt.f[k]; - pj[k]->h_dt += pjh_dt.f[k]; + pi[k]->h_dt -= pih_dt.f[k]; + pj[k]->h_dt -= pjh_dt.f[k]; for ( j = 0 ; j < 3 ; j++ ) { pi[k]->a[j] -= pia[j].f[k]; pj[k]->a[j] += pja[j].f[k]; @@ -545,7 +545,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_force ( fl pi->u_dt += pj->mass * dvdr * wi_dr; /* Get the time derivative for h. */ - pi->h_dt += pj->mass / pj->rho * dvdr * wi_dr; + pi->h_dt -= pj->mass / pj->rho * dvdr * wi_dr; } @@ -646,7 +646,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force /* Store the forces back on the particles. */ for ( k = 0 ; k < VEC_SIZE ; k++ ) { pi[k]->u_dt += piu_dt.f[k]; - pi[k]->h_dt += pih_dt.f[k]; + pi[k]->h_dt -= pih_dt.f[k]; for ( j = 0 ; j < 3 ; j++ ) pi[k]->a[j] -= pia[j].f[k]; } diff --git a/src/space.c b/src/space.c index cf15bb969f..ba81b3b551 100644 --- a/src/space.c +++ b/src/space.c @@ -161,12 +161,17 @@ void space_prepare ( struct space *s ) { int k; struct task *t; - float dt_max = s->dt_max; + float dt_max = s->dt_max, dx_max = 0.0f; int counts[ task_type_count + 1 ]; /* Traverse the cells and set their dt_min and dt_max. */ space_map_cells_post( s , 1 , &space_map_prepare , NULL ); + /* Get the maximum displacement in the whole system. */ + for ( k = 0 ; k < s->nr_cells ; k++ ) + dx_max = fmaxf( dx_max , s->cells[k].dx_max ); + printf( "space_prepare: dx_max is %e.\n" , dx_max ); + /* Run through the tasks and mark as skip or not. */ for ( k = 0 ; k < s->nr_tasks ; k++ ) { t = &s->tasks[k]; @@ -192,15 +197,6 @@ void space_prepare ( struct space *s ) { /* Traverse the cells and set their dt_min and dt_max. */ space_map_cells_post( s , 1 , &space_map_prepare , NULL ); - /* Run through the tasks and mark as skip or not. */ - for ( k = 0 ; k < s->nr_tasks ; k++ ) { - t = &s->tasks[k]; - if ( t->type == task_type_sort || t->type == task_type_self || t->type == task_type_ghost ) - t->skip = ( t->ci->dt_min > dt_max ); - else if ( t->type == task_type_pair ) - t->skip = ( t->ci->dt_min > dt_max && t->cj->dt_min > dt_max ); - } - } /* Store the condensed particle data. */ @@ -262,6 +258,10 @@ void space_ranktasks ( struct space *s ) { temp = tid[j]; tid[j] = tid[k]; tid[k] = temp; j += 1; } + + /* Did we get anything? */ + if ( j == left ) + error( "Unsatisfiable task dependencies detected." ); /* Traverse the task tree and add tasks with no weight. */ for ( i = left ; i < j ; i++ ) { @@ -488,6 +488,7 @@ void space_rebuild ( struct space *s , double cell_max ) { } s->h_min = h_min; s->h_max = h_max; + printf( "space_rebuild: h_min/h_max is %.3e/%.3e.\n" , h_min , h_max ); // printf( "space_rebuild: getting h_min and h_max took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); /* Get the new putative cell dimensions. */ @@ -537,6 +538,9 @@ void space_rebuild ( struct space *s , double cell_max ) { c->dmin = dmin; c->depth = 0; } + + /* Be verbose about the change. */ + printf( "space_rebuild: set cell dimensions to [ %i %i %i ].\n" , cdim[0] , cdim[1] , cdim[2] ); fflush(stdout); } /* re-build upper-level cells? */ // printf( "space_rebuild: rebuilding upper-level cells took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); @@ -844,7 +848,7 @@ struct task *space_addtask ( struct space *s , int type , int subtype , int flag t->ci = ci; t->cj = cj; t->skip = 0; - t->tight = 0; + t->tight = tight; t->nr_unlock_tasks = 0; t->nr_unlock_cells = 0; @@ -1398,9 +1402,14 @@ void space_maketasks ( struct space *s , int do_sort ) { if ( t->skip ) continue; if ( t->type == task_type_sort && t->ci->split ) - for ( j = 0 ; j < 8 ; j++ ) - if ( t->ci->progeny[j] != NULL && t->ci->progeny[j]->sorts[0] != NULL ) - task_addunlock( t->ci->progeny[j]->sorts[0] , t ); + for ( j = 0 ; j < 8 ; j++ ) { + if ( t->ci->progeny[j] == NULL ) + continue; + if ( t->ci->progeny[j]->sorts[0] == NULL ) + t->ci->progeny[j]->sorts[0] = space_addtask( s , task_type_sort , task_subtype_none , 0 /* t->flags? */ , 0 , t->ci , NULL , 0 ); + t->ci->progeny[j]->sorts[0]->skip = 0; + task_addunlock( t->ci->progeny[j]->sorts[0] , t ); + } } /* Count the number of tasks associated with each cell and diff --git a/src/vector.h b/src/vector.h index 3461134754..96534c6866 100644 --- a/src/vector.h +++ b/src/vector.h @@ -27,8 +27,11 @@ #define VEC_MACRO(elcount, type) __attribute__((vector_size((elcount)*sizeof(type)))) type /* So what will the vector size be? */ - #ifdef __AVX__ + #ifdef NO__AVX__ + #define VECTORIZE #define VEC_SIZE 8 + #define VEC_FLOAT __m256 + #define VEC_INT __m256i #define vec_load(a) _mm256_load_ps(a) #define vec_set1(a) _mm256_set1_ps(a) #define vec_sqrt(a) _mm256_sqrt_ps(a) @@ -36,8 +39,11 @@ #define vec_rsqrt(a) _mm256_rsqrt_ps(a) #define vec_ftoi(a) _mm256_cvttps_epi32(a) #define vec_fmin(a,b) _mm256_min_ps(a,b) - #else + #elif defined( NO__SSE2__ ) + #define VECTORIZE #define VEC_SIZE 4 + #define VEC_FLOAT __m128 + #define VEC_INT __m128i #define vec_load(a) _mm_load_ps(a) #define vec_set1(a) _mm_set1_ps(a) #define vec_sqrt(a) _mm_sqrt_ps(a) @@ -45,6 +51,8 @@ #define vec_rsqrt(a) _mm_rsqrt_ps(a) #define vec_ftoi(a) _mm_cvttps_epi32(a) #define vec_fmin(a,b) _mm_min_ps(a,b) + #else + #define VEC_SIZE 4 #endif // #ifdef __AVX__ // #define VEC_SIZE 8 @@ -61,8 +69,10 @@ /* Define the composite types for element access. */ #ifdef VECTORIZE typedef union { - VEC_MACRO(VEC_SIZE,float) v; - VEC_MACRO(VEC_SIZE,int) m; + // VEC_MACRO(VEC_SIZE,float) v; + // VEC_MACRO(VEC_SIZE,int) m; + VEC_FLOAT v; + VEC_INT m; float f[VEC_SIZE]; int i[VEC_SIZE]; } vector; -- GitLab