diff --git a/src/Makefile.am b/src/Makefile.am index 6acf9e47dd85e34e2d123496879d439c791ed60a..53fc189f38d20616bdc75a8ef8a2abcf49e8c87e 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -20,11 +20,11 @@ AUTOMAKE_OPTIONS=gnu # Add the debug flag to the whole thing -# AM_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \ -# -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \ -# -DTIMER -DCOUNTER -DCPU_TPS=2.67e9 -AM_CFLAGS = -Wall -Werror $(OPENMP_CFLAGS) \ +AM_CFLAGS = -g -O3 -Wall -Werror -ffast-math -fstrict-aliasing -ftree-vectorize \ + -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \ -DTIMER -DCOUNTER -DCPU_TPS=2.67e9 +# AM_CFLAGS = -Wall -Werror $(OPENMP_CFLAGS) \ +# -DTIMER -DCOUNTER -DCPU_TPS=2.67e9 # Assign a "safe" version number AM_LDFLAGS = $(LAPACK_LIBS) $(BLAS_LIBS) $(HDF5_LDFLAGS) -version-info 0:0:0 diff --git a/src/debug.c b/src/debug.c index 4d885521bb814cdeb89c4494036d5ae3b5854a48..c11edffc943fd9f4af1856ae5060489138f23a13 100644 --- a/src/debug.c +++ b/src/debug.c @@ -1,6 +1,7 @@ /******************************************************************************* * This file is part of SWIFT. - * Coypright (c) 2012 Matthieu Schaller (matthieu.schaller@durham.ac.uk). + * Coypright (c) 2013 Matthieu Schaller (matthieu.schaller@durham.ac.uk), + * Pedro Gonnet (pedro.gonnet@durham.ac.uk). * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published @@ -29,7 +30,7 @@ void printParticle ( struct part *parts , long long int id ) { /* Look for the particle. */ for ( i = 0 ; parts[i].id != id ; i++ ); - printf("## Particle[%d]: id=%lld, x=(%f,%f,%f), v=(%f,%f,%f), a=(%f,%f,%f), h=%f, h_dt=%f, wcount=%f, m=%f, rho=%f, u=%f, dudt=%f, dt=%.3e\n", + printf("## Particle[%d]: id=%lld, x=[%f,%f,%f], v=[%f,%f,%f], a=[%f,%f,%f], h=%.3e, h_dt=%.3e, wcount=%f, m=%.3e, rho=%f, u=%f, dudt=%f, dt=%.3e\n", i, parts[i].id, parts[i].x[0], parts[i].x[1], parts[i].x[2], diff --git a/src/engine.c b/src/engine.c index 7513914b11e0ecf2d738764a181dc298ac49e04a..b101a1e34c94cc5a460fb23d457356e55eb6c5b1 100644 --- a/src/engine.c +++ b/src/engine.c @@ -66,7 +66,7 @@ void engine_prepare ( struct engine *e ) { int j, k, qid; struct space *s = e->s; struct queue *q; - float dt_max = e->dt_max; + float dt_step = e->dt_step; TIMER_TIC @@ -95,7 +95,7 @@ void engine_prepare ( struct engine *e ) { // tic = getticks(); #pragma omp parallel for schedule(static) for ( k = 0 ; k < s->nr_parts ; k++ ) - if ( s->parts[k].dt <= dt_max ) { + if ( s->parts[k].dt <= dt_step ) { s->parts[k].wcount = 0.0f; s->parts[k].wcount_dh = 0.0f; s->parts[k].rho = 0.0f; @@ -179,71 +179,70 @@ void engine_step ( struct engine *e , int sort_queues ) { int k, nr_parts = e->s->nr_parts; struct part *restrict parts = e->s->parts, *restrict p; - float *restrict v_bar; - float dt = e->dt, hdt = 0.5*dt, dt_max, dt_min, ldt_min, ldt_max; - double etot = 0.0, letot, lmom[3], mom[3] = { 0.0 , 0.0 , 0.0 }; - int threadID, nthreads; + struct xpart *restrict xp; + float dt = e->dt, hdt = 0.5*dt, dt_step, dt_max, dt_min, ldt_min, ldt_max; + double epot = 0.0, ekin = 0.0, lepot, lekin, lmom[3], mom[3] = { 0.0 , 0.0 , 0.0 }; + int threadID, nthreads, count = 0, lcount; // #ifdef __SSE2__ // VEC_MACRO(4,float) hdtv = _mm_set1_ps( hdt ); // #endif /* Get the maximum dt. */ - dt_max = 2.0f*dt; + dt_step = 2.0f*dt; for ( k = 0 ; k < 32 && (e->step & (1 << k)) == 0 ; k++ ) - dt_max *= 2; - dt_max = 1; + dt_step *= 2; + // dt_step = FLT_MAX; /* Set the maximum dt. */ - e->dt_max = dt_max; - e->s->dt_max = dt_max; - printf( "engine_step: dt_max set to %.3e.\n" , dt_max ); fflush(stdout); + e->dt_step = dt_step; + e->s->dt_step = dt_step; + printf( "engine_step: dt_step set to %.3e.\n" , dt_step ); fflush(stdout); - /* Allocate a buffer for the old velocities. */ - if ( posix_memalign( (void **)&v_bar , 16 , sizeof(float) * nr_parts * 4 ) != 0 ) - error( "Failed to allocate v_old buffer." ); - /* First kick. */ TIMER_TIC - // #pragma omp parallel for schedule(static) private(p) + #pragma omp parallel for schedule(static) private(p,xp) for ( k = 0 ; k < nr_parts ; k++ ) { /* Get a handle on the part. */ p = &parts[k]; + xp = p->xtras; /* Step and store the velocity and internal energy. */ // #ifdef __SSE__ - // _mm_store_ps( &v_bar[4*k] , _mm_add_ps( _mm_load_ps( &p->v[0] ) , _mm_mul_ps( hdtv , _mm_load_ps( &p->a[0] ) ) ) ); + // _mm_store_ps( &v_bar[4*k] , _mm_load_ps( &p->v[0] ) + hdtv * _mm_load_ps( &p->a[0] ) ); // #else - v_bar[4*k+0] = p->v[0] + hdt * p->a[0]; - v_bar[4*k+1] = p->v[1] + hdt * p->a[1]; - v_bar[4*k+2] = p->v[2] + hdt * p->a[2]; + xp->v_old[0] = p->v[0] + hdt * p->a[0]; + xp->v_old[1] = p->v[1] + hdt * p->a[1]; + xp->v_old[2] = p->v[2] + hdt * p->a[2]; // #endif - v_bar[4*k+3] = p->u + hdt * p->u_dt; + // xp->u_old = fmaxf( p->u + hdt * p->u_dt , FLT_EPSILON ); + xp->u_old = p->u + hdt * p->u_dt; /* Move the particles with the velocitie at the half-step. */ - p->x[0] += dt * v_bar[4*k+0]; - p->x[1] += dt * v_bar[4*k+1]; - p->x[2] += dt * v_bar[4*k+2]; + p->x[0] += dt * xp->v_old[0]; + p->x[1] += dt * xp->v_old[1]; + p->x[2] += dt * xp->v_old[2]; /* Update positions and energies at the half-step. */ p->v[0] += dt * p->a[0]; p->v[1] += dt * p->a[1]; p->v[2] += dt * p->a[2]; p->u *= expf( p->u_dt / p->u * dt ); - // p->h *= expf( p->h_dt / p->h * dt ); + p->h *= expf( p->h_dt / p->h * dt ); /* Integrate other values if this particle will not be updated. */ - // if ( p->dt > dt_max ) { - // p->rho *= expf( -3.0f * p->h_dt / p->h * dt ); - // p->POrho2 = p->u * ( const_gamma - 1.0f ) / ( p->rho + p->h * p->rho_dh / 3.0f ); - // } + if ( p->dt > dt_step ) { + p->rho *= expf( -3.0f * p->h_dt / p->h * dt ); + p->POrho2 = p->u * ( const_gamma - 1.0f ) / ( p->rho + p->h * p->rho_dh / 3.0f ); + } } TIMER_TOC( timer_kick1 ); // for(k=0; k<10; ++k) // printParticle(parts, k); - + // printParticle( parts , 494849 ); + /* Prepare the space. */ engine_prepare( e ); @@ -274,44 +273,52 @@ void engine_step ( struct engine *e , int sort_queues ) { // for(k=0; k<10; ++k) // printParticle(parts, k); + // printParticle( parts , 494849 ); /* Second kick. */ TIMER_TIC_ND dt_min = FLT_MAX; dt_max = 0.0f; - #pragma omp parallel private(p,k,ldt_min,ldt_max,lmom,letot,threadID,nthreads) + #pragma omp parallel private(p,xp,k,ldt_min,lcount,ldt_max,lmom,lekin,lepot,threadID,nthreads) { threadID = omp_get_thread_num(); nthreads = omp_get_num_threads(); ldt_min = FLT_MAX; ldt_max = 0.0f; lmom[0] = 0.0; lmom[1] = 0.0; lmom[2] = 0.0; - letot = 0.0; + lekin = 0.0; lepot = 0.0; lcount = 0; for ( k = nr_parts * threadID / nthreads ; k < nr_parts * (threadID + 1) / nthreads ; k++ ) { /* Get a handle on the part. */ p = &parts[k]; + xp = p->xtras; /* Scale the derivatives if they're freshly computed. */ - if ( p->dt <= dt_max ) { + if ( p->dt <= dt_step ) { p->u_dt *= p->POrho2; p->h_dt *= p->h * 0.333333333f; + lcount += 1; } + + /* Update the particle's time step. */ + p->dt = const_cfl * p->h / ( p->c + p->v_sig ); /* Update positions and energies at the half-step. */ // #ifdef __SSE__ - // _mm_store_ps( &p->v[0] , _mm_add_ps( _mm_load_ps( &v_bar[4*k] ) , _mm_mul_ps( hdtv , _mm_load_ps( &p->a[0] ) ) ) ); + // _mm_store_ps( &p->v[0] , _mm_load_ps( &v_bar[4*k] ) + hdtv * _mm_load_ps( &p->a[0] ) ); // #else - p->v[0] = v_bar[4*k+0] + hdt * p->a[0]; - p->v[1] = v_bar[4*k+1] + hdt * p->a[1]; - p->v[2] = v_bar[4*k+2] + hdt * p->a[2]; + p->v[0] = xp->v_old[0] + hdt * p->a[0]; + p->v[1] = xp->v_old[1] + hdt * p->a[1]; + p->v[2] = xp->v_old[2] + hdt * p->a[2]; // #endif - p->u = v_bar[4*k+3] + hdt * p->u_dt; - + // p->u = fmaxf( xp->u_old + hdt * p->u_dt , FLT_EPSILON ); + p->u = xp->u_old + hdt * p->u_dt; + /* Get the smallest/largest dt. */ ldt_min = fminf( ldt_min , p->dt ); ldt_max = fmaxf( ldt_max , p->dt ); /* Collect total energy. */ - letot += 0.5 * p->mass * ( p->v[0]*p->v[0] + p->v[1]*p->v[1] + p->v[2]*p->v[2] ) + p->mass * p->u; + lekin += 0.5 * p->mass * ( p->v[0]*p->v[0] + p->v[1]*p->v[1] + p->v[2]*p->v[2] ); + lepot += p->mass * p->u; /* Collect momentum */ lmom[0] += p->mass * p->v[0]; @@ -326,31 +333,33 @@ void engine_step ( struct engine *e , int sort_queues ) { mom[0] += lmom[0]; mom[1] += lmom[1]; mom[2] += lmom[2]; - etot += letot; + epot += lepot; + ekin += lekin; + count += lcount; } } TIMER_TOC( timer_kick2 ); e->dt_min = dt_min; printf( "engine_step: dt_min/dt_max is %e/%e.\n" , dt_min , dt_max ); fflush(stdout); - printf( "engine_step: etot is %e.\n" , etot ); fflush(stdout); + printf( "engine_step: etot is %e (ekin=%e, epot=%e).\n" , ekin+epot , ekin , epot ); fflush(stdout); printf( "engine_step: total momentum is [ %e , %e , %e ].\n" , mom[0] , mom[1] , mom[2] ); fflush(stdout); + printf( "engine_step: updated %i parts (dt_step=%.3e).\n" , count , dt_step ); fflush(stdout); + /* Increase the step counter. */ + e->step += 1; + /* Does the time step need adjusting? */ - if ( dt_min < e->dt ) { + while ( dt_min < e->dt ) { e->dt *= 0.5; + e->step *= 2; printf( "engine_step: dt_min dropped below time step, adjusting to dt=%e.\n" , e->dt ); } - else if ( dt_min > 2*e->dt ) { + while ( dt_min > 2*e->dt && (e->step & 1) == 0 ) { e->dt *= 2.0; + e->step /= 2; printf( "engine_step: dt_min is larger than twice the time step, adjusting to dt=%e.\n" , e->dt ); } - /* Clean up. */ - free( v_bar ); - - /* Increase the step counter. */ - e->step += 1; - } diff --git a/src/engine.h b/src/engine.h index 74a059addc7a5ac28854d808a80a9df2e29e4a94..f872f3cef703dfc58b4ed6dddfc365cc9268cfe3 100644 --- a/src/engine.h +++ b/src/engine.h @@ -50,8 +50,10 @@ struct engine { /* The queues. */ struct queue *queues; - /* The maximum dt to step. */ - float dt_max; + /* The maximum dt to step (current). */ + float dt_step; + + /* The minimum dt over all particles in the system. */ float dt_min; /* The system time step. */ diff --git a/src/part.h b/src/part.h index d4c2f2ea5f21e9245308836b098db675c4051342..bfa88567b79b83e4656ce66c3c3aa5edb6cc634e 100644 --- a/src/part.h +++ b/src/part.h @@ -38,6 +38,21 @@ struct cpart { } __attribute__((aligned (32))); + +/* Extra particle data not needed during the computation. */ +struct xpart { + + /* Old position, at last tree rebuild. */ + double x_old[3]; + + /* Old velocity. */ + float v_old[3]; + + /* Old entropy. */ + float u_old; + + } __attribute__((aligned (32))); + /* Data of a single particle. */ struct part { @@ -66,6 +81,9 @@ struct part { /* Particle acceleration. */ float a[3] __attribute__((aligned (16))); + /* Maximum neighbouring u. */ + float c, v_sig; + /* Derivative of the density with respect to this particle's smoothing length. */ float rho_dh; @@ -80,8 +98,8 @@ struct part { /* Particle ID. */ unsigned long long id; - /* Old position, at last tree rebuild. */ - double x_old[3]; + /* Pointer to extra particle data. */ + struct xpart *xtras; /* Particle position. */ double x[3]; diff --git a/src/runner.c b/src/runner.c index de4d5b7db7ce9c7d60fdb202f22dd82d3eba5a6c..9b74876d6f492ba277c6e64cfd06d4beb6060571 100644 --- a/src/runner.c +++ b/src/runner.c @@ -329,8 +329,8 @@ void runner_doghost ( struct runner *r , struct cell *c ) { struct cell *finger; int i, k, redo, count = c->count; int *pid; - float ihg, ihg2; - float dt_max = r->e->dt_max; + float ihg, ihg2, h_corr; + float dt_step = r->e->dt_step; TIMER_TIC /* Recurse? */ @@ -361,7 +361,7 @@ void runner_doghost ( struct runner *r , struct cell *c ) { cp = &c->cparts[ pid[i] ]; /* Is this part within the timestep? */ - if ( cp->dt <= dt_max ) { + if ( cp->dt <= dt_step ) { /* Adjust the computed rho. */ ihg = kernel_igamma / p->h; @@ -370,8 +370,15 @@ void runner_doghost ( struct runner *r , struct cell *c ) { p->rho_dh *= ihg2 * ihg2; p->wcount += kernel_wroot; - /* Update the smoothing length. */ - p->h -= ( p->wcount - const_nwneigh ) / p->wcount_dh; + /* Compute the smoothing length update (Newton step). */ + h_corr = ( const_nwneigh - p->wcount ) / p->wcount_dh; + + /* Truncate to the range [ -p->h/2 , p->h ]. */ + h_corr = fminf( h_corr , p->h ); + h_corr = fmaxf( h_corr , -p->h/2 ); + + /* Apply the correction to p->h. */ + p->h += h_corr; cp->h = p->h; /* Did we get the right number density? */ @@ -389,8 +396,7 @@ void runner_doghost ( struct runner *r , struct cell *c ) { } /* Compute this particle's time step. */ - p->dt = const_cfl * p->h / sqrtf( const_gamma * ( const_gamma - 1.0f ) * p->u ); - cp->dt = p->dt; + p->c = sqrtf( const_gamma * ( const_gamma - 1.0f ) * p->u ); /* Compute the pressure. */ // p->P = p->rho * p->u * ( const_gamma - 1.0f ); @@ -405,6 +411,7 @@ void runner_doghost ( struct runner *r , struct cell *c ) { /* Reset the time derivatives. */ p->u_dt = 0.0f; p->h_dt = 0.0f; + p->v_sig = 0.0f; } diff --git a/src/runner_doiact.h b/src/runner_doiact.h index f22f9a6ad5b5c0d0aa8866cbc3acb63f88ebe57e..8436e7c06b24f4b8b6725e4be031d96d1448bb0e 100644 --- a/src/runner_doiact.h +++ b/src/runner_doiact.h @@ -106,7 +106,7 @@ void DOPAIR_NAIVE ( struct runner *r , struct cell *restrict ci , struct cell *r struct cpart *restrict cpi, *restrict cparts_i = ci->cparts; double pix[3]; float dx[3], hi, hi2, r2; - float dt_max = e->dt_max; + float dt_step = e->dt_step; #ifdef VECTORIZE int icount = 0; float r2q[VEC_SIZE] __attribute__ ((aligned (16))); @@ -118,7 +118,7 @@ void DOPAIR_NAIVE ( struct runner *r , struct cell *restrict ci , struct cell *r TIMER_TIC /* Anything to do here? */ - if ( ci->dt_min > dt_max && cj->dt_min > dt_max ) + if ( ci->dt_min > dt_step && cj->dt_min > dt_step ) return; /* Get the relative distance between the pairs, wrapping. */ @@ -215,7 +215,7 @@ void DOSELF_NAIVE ( struct runner *r , struct cell *restrict c ) { struct cpart *restrict cpi, *restrict cpj,*restrict cparts = c->cparts; double pix[3]; float dx[3], hi, hi2, r2; - float dt_max = r->e->dt_max; + float dt_step = r->e->dt_step; #ifdef VECTORIZE int icount = 0; float r2q[VEC_SIZE] __attribute__ ((aligned (16))); @@ -227,7 +227,7 @@ void DOSELF_NAIVE ( struct runner *r , struct cell *restrict c ) { TIMER_TIC /* Anything to do here? */ - if ( c->dt_min > dt_max ) + if ( c->dt_min > dt_step ) return; /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" , @@ -638,7 +638,7 @@ void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) { double hi_max, hj_max; double di_max, dj_min; int count_i, count_j; - float dt_max = e->dt_max; + float dt_step = e->dt_step; #ifdef VECTORIZE int icount = 0; float r2q[VEC_SIZE] __attribute__ ((aligned (16))); @@ -650,7 +650,7 @@ void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) { TIMER_TIC /* Anything to do here? */ - if ( ci->dt_min > dt_max && cj->dt_min > dt_max ) + if ( ci->dt_min > dt_step && cj->dt_min > dt_step ) return; /* Get the sort ID. */ @@ -722,7 +722,7 @@ void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) { /* Get a hold of the ith part in ci. */ pi = &parts_i[ sort_i[ pid ].i ]; cpi = &cparts_i[ sort_i[ pid ].i ]; - if ( cpi->dt > dt_max ) + if ( cpi->dt > dt_step ) continue; hi = cpi->h; di = sort_i[pid].d + hi - rshift; @@ -789,7 +789,7 @@ void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) { /* Get a hold of the jth part in cj. */ pj = &parts_j[ sort_j[ pjd ].i ]; cpj = &cparts_j[ sort_j[ pjd ].i ]; - if ( cpj->dt > dt_max ) + if ( cpj->dt > dt_step ) continue; hj = cpj->h; dj = sort_j[pjd].d - hj - rshift; @@ -879,19 +879,25 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) { double hi_max, hj_max; double di_max, dj_min; int count_i, count_j; - float dt_max = e->dt_max; + float dt_step = e->dt_step; #ifdef VECTORIZE - int icount = 0; - float r2q[VEC_SIZE] __attribute__ ((aligned (16))); - float hiq[VEC_SIZE] __attribute__ ((aligned (16))); - float hjq[VEC_SIZE] __attribute__ ((aligned (16))); - float dxq[3*VEC_SIZE] __attribute__ ((aligned (16))); - struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; + int icount1 = 0; + float r2q1[VEC_SIZE] __attribute__ ((aligned (16))); + float hiq1[VEC_SIZE] __attribute__ ((aligned (16))); + float hjq1[VEC_SIZE] __attribute__ ((aligned (16))); + float dxq1[3*VEC_SIZE] __attribute__ ((aligned (16))); + struct part *piq1[VEC_SIZE], *pjq1[VEC_SIZE]; + int icount2 = 0; + float r2q2[VEC_SIZE] __attribute__ ((aligned (16))); + float hiq2[VEC_SIZE] __attribute__ ((aligned (16))); + float hjq2[VEC_SIZE] __attribute__ ((aligned (16))); + float dxq2[3*VEC_SIZE] __attribute__ ((aligned (16))); + struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE]; #endif TIMER_TIC /* Anything to do here? */ - if ( ci->dt_min > dt_max && cj->dt_min > dt_max ) + if ( ci->dt_min > dt_step && cj->dt_min > dt_step ) return; /* Get the shift ID. */ @@ -931,28 +937,28 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) { dj_min = sort_j[0].d; /* Collect the number of parts left and right below dt. */ - if ( cj->dt_min > dt_max ) { + if ( ci->dt_max <= dt_step ) { sortdt_i = sort_i; countdt_i = count_i; } - else if ( cj->dt_max > dt_max ) { + else if ( ci->dt_min <= dt_step ) { if ( ( sortdt_i = (struct entry *)alloca( sizeof(struct entry) * count_i ) ) == NULL ) error( "Failed to allocate dt sortlists." ); for ( k = 0 ; k < count_i ; k++ ) - if ( cparts_i[ sort_i[ k ].i ].dt <= dt_max ) { + if ( cparts_i[ sort_i[ k ].i ].dt <= dt_step ) { sortdt_i[ countdt_i ] = sort_i[k]; countdt_i += 1; } } - if ( ci->dt_min > dt_max ) { + if ( cj->dt_max <= dt_step ) { sortdt_j = sort_j; countdt_j = count_j; } - else if ( ci->dt_max > dt_max ) { + else if ( cj->dt_min <= dt_step ) { if ( ( sortdt_j = (struct entry *)alloca( sizeof(struct entry) * count_j ) ) == NULL ) error( "Failed to allocate dt sortlists." ); for ( k = 0 ; k < count_j ; k++ ) - if ( cparts_j[ sort_j[ k ].i ].dt <= dt_max ) { + if ( cparts_j[ sort_j[ k ].i ].dt <= dt_step ) { sortdt_j[ countdt_j ] = sort_j[k]; countdt_j += 1; } @@ -977,7 +983,7 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) { pix[k] = cpi->x[k] - shift[k]; /* Look at valid dt parts only? */ - if ( cpi->dt > dt_max ) { + if ( cpi->dt > dt_step ) { /* Loop over the parts in cj within dt. */ for ( pjd = 0 ; pjd < countdt_j && sortdt_j[pjd].d < di ; pjd++ ) { @@ -988,7 +994,7 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) { /* Compute the pairwise distance. */ r2 = 0.0f; for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pix[k] - cpj->x[k]; + dx[k] = cpj->x[k] - pix[k]; r2 += dx[k]*dx[k]; } @@ -997,25 +1003,25 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) { #ifndef VECTORIZE - IACT( r2 , dx , hi , cpj->h , pi , &parts_j[ sortdt_j[pjd].i ] ); + IACT_NONSYM( r2 , dx , cpj->h , hi , &parts_j[ sortdt_j[pjd].i ] , pi ); #else /* Add this interaction to the queue. */ - r2q[icount] = r2; - dxq[3*icount+0] = dx[0]; - dxq[3*icount+1] = dx[1]; - dxq[3*icount+2] = dx[2]; - hiq[icount] = hi; - hjq[icount] = cpj->h; - piq[icount] = pi; - pjq[icount] = &parts_j[ sortdt_j[ pjd ].i ]; - icount += 1; + r2q1[icount1] = r2; + dxq1[3*icount1+0] = dx[0]; + dxq1[3*icount1+1] = dx[1]; + dxq1[3*icount1+2] = dx[2]; + hiq1[icount1] = cpj->h; + hjq1[icount1] = hi; + piq1[icount1] = &parts_j[ sortdt_j[ pjd ].i ]; + pjq1[icount1] = pi; + icount1 += 1; /* Flush? */ - if ( icount == VEC_SIZE ) { - IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq ); - icount = 0; + if ( icount1 == VEC_SIZE ) { + IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 ); + icount1 = 0; } #endif @@ -1047,25 +1053,55 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) { #ifndef VECTORIZE - IACT( r2 , dx , hi , cpj->h , pi , &parts_j[ sort_j[pjd].i ] ); + /* Does pj need to be updated too? */ + if ( cpj->dt <= dt_step ) + IACT( r2 , dx , hi , cpj->h , pi , &parts_j[ sort_j[pjd].i ] ); + else + IACT_NONSYM( r2 , dx , hi , cpj->h , pi , &parts_j[ sort_j[pjd].i ] ); #else - /* Add this interaction to the queue. */ - r2q[icount] = r2; - dxq[3*icount+0] = dx[0]; - dxq[3*icount+1] = dx[1]; - dxq[3*icount+2] = dx[2]; - hiq[icount] = hi; - hjq[icount] = cpj->h; - piq[icount] = pi; - pjq[icount] = &parts_j[ sort_j[ pjd ].i ]; - icount += 1; - - /* Flush? */ - if ( icount == VEC_SIZE ) { - IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq ); - icount = 0; + /* Does pj need to be updated too? */ + if ( cpj->dt <= dt_step ) { + + /* Add this interaction to the symmetric queue. */ + r2q2[icount2] = r2; + dxq2[3*icount2+0] = dx[0]; + dxq2[3*icount2+1] = dx[1]; + dxq2[3*icount2+2] = dx[2]; + hiq2[icount2] = hi; + hjq2[icount2] = cpj->h; + piq2[icount2] = pi; + pjq2[icount2] = &parts_j[ sort_j[ pjd ].i ]; + icount2 += 1; + + /* Flush? */ + if ( icount2 == VEC_SIZE ) { + IACT_VEC( r2q2 , dxq2 , hiq2 , hjq2 , piq2 , pjq2 ); + icount2 = 0; + } + + } + + else { + + /* Add this interaction to the non-symmetric queue. */ + r2q1[icount1] = r2; + dxq1[3*icount1+0] = dx[0]; + dxq1[3*icount1+1] = dx[1]; + dxq1[3*icount1+2] = dx[2]; + hiq1[icount1] = hi; + hjq1[icount1] = cpj->h; + piq1[icount1] = pi; + pjq1[icount1] = &parts_j[ sort_j[ pjd ].i ]; + icount1 += 1; + + /* Flush? */ + if ( icount1 == VEC_SIZE ) { + IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 ); + icount1 = 0; + } + } #endif @@ -1097,7 +1133,7 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) { hj2 = hj * hj; /* Is this particle outside the dt? */ - if ( cpj->dt > dt_max ) { + if ( cpj->dt > dt_step ) { /* Loop over the parts in ci. */ for ( pid = countdt_i-1 ; pid >= 0 && sortdt_i[pid].d > dj ; pid-- ) { @@ -1108,7 +1144,7 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) { /* Compute the pairwise distance. */ r2 = 0.0f; for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = cpi->x[k] - pjx[k]; + dx[k] = pjx[k] - cpi->x[k]; r2 += dx[k]*dx[k]; } @@ -1117,25 +1153,25 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) { #ifndef VECTORIZE - IACT( r2 , dx , hj , cpi->h , pj , &parts_i[ sortdt_i[pid].i ] ); + IACT_NONSYM( r2 , dx , cpi->h , hj , &parts_i[ sortdt_i[pid].i ] , pj ); #else /* Add this interaction to the queue. */ - r2q[icount] = r2; - dxq[3*icount+0] = dx[0]; - dxq[3*icount+1] = dx[1]; - dxq[3*icount+2] = dx[2]; - hiq[icount] = hj; - hjq[icount] = cpi->h; - piq[icount] = pj; - pjq[icount] = &parts_i[ sortdt_i[ pid ].i ]; - icount += 1; + r2q1[icount1] = r2; + dxq1[3*icount1+0] = dx[0]; + dxq1[3*icount1+1] = dx[1]; + dxq1[3*icount1+2] = dx[2]; + hiq1[icount1] = cpi->h; + hjq1[icount1] = hj; + piq1[icount1] = &parts_i[ sortdt_i[ pid ].i ]; + pjq1[icount1] = pj; + icount1 += 1; /* Flush? */ - if ( icount == VEC_SIZE ) { - IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq ); - icount = 0; + if ( icount1 == VEC_SIZE ) { + IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 ); + icount1 = 0; } #endif @@ -1166,25 +1202,55 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) { #ifndef VECTORIZE - IACT( r2 , dx , hj , cpi->h , pj , &parts_i[ sort_i[pid].i ] ); + /* Does pi need to be updated too? */ + if ( cpi->dt <= dt_step ) + IACT( r2 , dx , hj , cpi->h , pj , &parts_i[ sort_i[pid].i ] ); + else + IACT_NONSYM( r2 , dx , hj , cpi->h , pj , &parts_i[ sort_i[pid].i ] ); #else - /* Add this interaction to the queue. */ - r2q[icount] = r2; - dxq[3*icount+0] = dx[0]; - dxq[3*icount+1] = dx[1]; - dxq[3*icount+2] = dx[2]; - hiq[icount] = hj; - hjq[icount] = cpi->h; - piq[icount] = pj; - pjq[icount] = &parts_i[ sort_i[ pid ].i ]; - icount += 1; - - /* Flush? */ - if ( icount == VEC_SIZE ) { - IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq ); - icount = 0; + /* Does pi need to be updated too? */ + if ( cpi->dt <= dt_step ) { + + /* Add this interaction to the symmetric queue. */ + r2q2[icount2] = r2; + dxq2[3*icount2+0] = dx[0]; + dxq2[3*icount2+1] = dx[1]; + dxq2[3*icount2+2] = dx[2]; + hiq2[icount2] = hj; + hjq2[icount2] = cpi->h; + piq2[icount2] = pj; + pjq2[icount2] = &parts_i[ sort_i[ pid ].i ]; + icount2 += 1; + + /* Flush? */ + if ( icount2 == VEC_SIZE ) { + IACT_VEC( r2q2 , dxq2 , hiq2 , hjq2 , piq2 , pjq2 ); + icount2 = 0; + } + + } + + else { + + /* Add this interaction to the non-summetric queue. */ + r2q1[icount1] = r2; + dxq1[3*icount1+0] = dx[0]; + dxq1[3*icount1+1] = dx[1]; + dxq1[3*icount1+2] = dx[2]; + hiq1[icount1] = hj; + hjq1[icount1] = cpi->h; + piq1[icount1] = pj; + pjq1[icount1] = &parts_i[ sort_i[ pid ].i ]; + icount1 += 1; + + /* Flush? */ + if ( icount1 == VEC_SIZE ) { + IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 ); + icount1 = 0; + } + } #endif @@ -1199,9 +1265,12 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) { #ifdef VECTORIZE /* Pick up any leftovers. */ - if ( icount > 0 ) - for ( k = 0 ; k < icount ; k++ ) - IACT( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] ); + if ( icount1 > 0 ) + for ( k = 0 ; k < icount1 ; k++ ) + IACT_NONSYM( r2q1[k] , &dxq1[3*k] , hiq1[k] , hjq1[k] , piq1[k] , pjq1[k] ); + if ( icount2 > 0 ) + for ( k = 0 ; k < icount2 ; k++ ) + IACT( r2q2[k] , &dxq2[3*k] , hiq2[k] , hjq2[k] , piq2[k] , pjq2[k] ); #endif #ifdef TIMER_VERBOSE @@ -1227,7 +1296,7 @@ void DOSELF1 ( struct runner *r , struct cell *restrict c ) { float dx[3], hi, hi2, r2; struct part *restrict parts = c->parts, *restrict pi; struct cpart *restrict cpi, *restrict cpj, *restrict cparts = c->cparts; - float dt_max = r->e->dt_max; + float dt_step = r->e->dt_step; #ifdef VECTORIZE int icount = 0; float r2q[VEC_SIZE] __attribute__ ((aligned (16))); @@ -1239,7 +1308,7 @@ void DOSELF1 ( struct runner *r , struct cell *restrict c ) { TIMER_TIC /* Any work here? */ - if ( c->dt_min > dt_max ) + if ( c->dt_min > dt_step ) return; /* Loop over the particles in the cell. */ @@ -1248,7 +1317,7 @@ void DOSELF1 ( struct runner *r , struct cell *restrict c ) { /* Get a pointer to the ith particle. */ cpi = &cparts[pid]; pi = &parts[pid]; - if ( cpi->dt > dt_max ) + if ( cpi->dt > dt_step ) continue; /* Get the particle position and radius. */ @@ -1331,26 +1400,32 @@ void DOSELF2 ( struct runner *r , struct cell *restrict c ) { float dx[3], hi, hi2, r2; struct part *restrict parts = c->parts, *restrict pi; struct cpart *restrict cpi, *restrict cpj, *restrict cparts = c->cparts; - float dt_max = r->e->dt_max; + float dt_step = r->e->dt_step; int firstdt = 0, countdt = 0, *indt = NULL; #ifdef VECTORIZE - int icount = 0; - float r2q[VEC_SIZE] __attribute__ ((aligned (16))); - float hiq[VEC_SIZE] __attribute__ ((aligned (16))); - float hjq[VEC_SIZE] __attribute__ ((aligned (16))); - float dxq[3*VEC_SIZE] __attribute__ ((aligned (16))); - struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; + int icount1 = 0; + float r2q1[VEC_SIZE] __attribute__ ((aligned (16))); + float hiq1[VEC_SIZE] __attribute__ ((aligned (16))); + float hjq1[VEC_SIZE] __attribute__ ((aligned (16))); + float dxq1[3*VEC_SIZE] __attribute__ ((aligned (16))); + struct part *piq1[VEC_SIZE], *pjq1[VEC_SIZE]; + int icount2 = 0; + float r2q2[VEC_SIZE] __attribute__ ((aligned (16))); + float hiq2[VEC_SIZE] __attribute__ ((aligned (16))); + float hjq2[VEC_SIZE] __attribute__ ((aligned (16))); + float dxq2[3*VEC_SIZE] __attribute__ ((aligned (16))); + struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE]; #endif TIMER_TIC /* Set up indt if needed. */ - if ( c->dt_min > dt_max ) + if ( c->dt_min > dt_step ) return; - else if ( c->dt_max > dt_max ) { + else if ( c->dt_max > dt_step ) { if ( ( indt = (int *)alloca( sizeof(int) * count ) ) == NULL ) error( "Failed to allocate indt." ); for ( k = 0 ; k < count ; k++ ) - if ( cparts[k].dt <= dt_max ) { + if ( cparts[k].dt <= dt_step ) { indt[ countdt ] = k; countdt += 1; } @@ -1370,7 +1445,7 @@ void DOSELF2 ( struct runner *r , struct cell *restrict c ) { hi2 = hi * hi; /* Is the ith particle not active? */ - if ( cpi->dt > dt_max ) { + if ( cpi->dt > dt_step ) { /* Loop over the other particles .*/ for ( pjd = firstdt ; pjd < countdt ; pjd++ ) { @@ -1381,7 +1456,7 @@ void DOSELF2 ( struct runner *r , struct cell *restrict c ) { /* Compute the pairwise distance. */ r2 = 0.0f; for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pix[k] - cpj->x[k]; + dx[k] = cpj->x[k] - pix[k]; r2 += dx[k]*dx[k]; } @@ -1390,25 +1465,25 @@ void DOSELF2 ( struct runner *r , struct cell *restrict c ) { #ifndef VECTORIZE - IACT( r2 , dx , hi , cpj->h , pi , &parts[indt[pjd]] ); + IACT_NONSYM( r2 , dx , cpj->h , hi , &parts[indt[pjd]] , pi ); #else /* Add this interaction to the queue. */ - r2q[icount] = r2; - dxq[3*icount+0] = dx[0]; - dxq[3*icount+1] = dx[1]; - dxq[3*icount+2] = dx[2]; - hiq[icount] = hi; - hjq[icount] = cpj->h; - piq[icount] = pi; - pjq[icount] = &parts[indt[pjd]]; - icount += 1; + r2q1[icount1] = r2; + dxq1[3*icount1+0] = dx[0]; + dxq1[3*icount1+1] = dx[1]; + dxq1[3*icount1+2] = dx[2]; + hiq1[icount1] = cpj->h; + hjq1[icount1] = hi; + piq1[icount1] = &parts[indt[pjd]]; + pjq1[icount1] = pi; + icount1 += 1; /* Flush? */ - if ( icount == VEC_SIZE ) { - IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq ); - icount = 0; + if ( icount1 == VEC_SIZE ) { + IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 ); + icount1 = 0; } #endif @@ -1443,25 +1518,55 @@ void DOSELF2 ( struct runner *r , struct cell *restrict c ) { #ifndef VECTORIZE - IACT( r2 , dx , hi , cpj->h , pi , &parts[pjd] ); + /* Does pj need to be updated too? */ + if ( cpj->dt <= dt_step ) + IACT( r2 , dx , hi , cpj->h , pi , &parts[pjd] ); + else + IACT_NONSYM( r2 , dx , hi , cpj->h , pi , &parts[pjd] ); #else - /* Add this interaction to the queue. */ - r2q[icount] = r2; - dxq[3*icount+0] = dx[0]; - dxq[3*icount+1] = dx[1]; - dxq[3*icount+2] = dx[2]; - hiq[icount] = hi; - hjq[icount] = cpj->h; - piq[icount] = pi; - pjq[icount] = &parts[pjd]; - icount += 1; - - /* Flush? */ - if ( icount == VEC_SIZE ) { - IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq ); - icount = 0; + /* Does pj need to be updated too? */ + if ( cpj->dt <= dt_step ) { + + /* Add this interaction to the symmetric queue. */ + r2q2[icount2] = r2; + dxq2[3*icount2+0] = dx[0]; + dxq2[3*icount2+1] = dx[1]; + dxq2[3*icount2+2] = dx[2]; + hiq2[icount2] = hi; + hjq2[icount2] = cpj->h; + piq2[icount2] = pi; + pjq2[icount2] = &parts[pjd]; + icount2 += 1; + + /* Flush? */ + if ( icount2 == VEC_SIZE ) { + IACT_VEC( r2q2 , dxq2 , hiq2 , hjq2 , piq2 , pjq2 ); + icount2 = 0; + } + + } + + else { + + /* Add this interaction to the non-symmetric queue. */ + r2q1[icount1] = r2; + dxq1[3*icount1+0] = dx[0]; + dxq1[3*icount1+1] = dx[1]; + dxq1[3*icount1+2] = dx[2]; + hiq1[icount1] = hi; + hjq1[icount1] = cpj->h; + piq1[icount1] = pi; + pjq1[icount1] = &parts[pjd]; + icount1 += 1; + + /* Flush? */ + if ( icount1 == VEC_SIZE ) { + IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 ); + icount1 = 0; + } + } #endif @@ -1476,9 +1581,12 @@ void DOSELF2 ( struct runner *r , struct cell *restrict c ) { #ifdef VECTORIZE /* Pick up any leftovers. */ - if ( icount > 0 ) - for ( k = 0 ; k < icount ; k++ ) - IACT( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] ); + if ( icount1 > 0 ) + for ( k = 0 ; k < icount1 ; k++ ) + IACT_NONSYM( r2q1[k] , &dxq1[3*k] , hiq1[k] , hjq1[k] , piq1[k] , pjq1[k] ); + if ( icount2 > 0 ) + for ( k = 0 ; k < icount2 ; k++ ) + IACT( r2q2[k] , &dxq2[3*k] , hiq2[k] , hjq2[k] , piq2[k] , pjq2[k] ); #endif #ifdef TIMER_VERBOSE @@ -1506,7 +1614,7 @@ void DOSUB1 ( struct runner *r , struct cell *ci , struct cell *cj , int sid ) { double shift[3]; float h; struct space *s = r->e->s; - float dt_max = r->e->dt_max; + float dt_step = r->e->dt_step; TIMER_TIC @@ -1514,7 +1622,7 @@ void DOSUB1 ( struct runner *r , struct cell *ci , struct cell *cj , int sid ) { if ( cj == NULL ) { /* Should we even bother? */ - if ( ci->dt_min > dt_max ) + if ( ci->dt_min > dt_step ) return; /* Recurse? */ @@ -1541,7 +1649,7 @@ void DOSUB1 ( struct runner *r , struct cell *ci , struct cell *cj , int sid ) { else { /* Should we even bother? */ - if ( ci->dt_min > dt_max && cj->dt_min > dt_max ) + if ( ci->dt_min > dt_step && cj->dt_min > dt_step ) return; /* Get the cell dimensions. */ @@ -1776,7 +1884,7 @@ void DOSUB2 ( struct runner *r , struct cell *ci , struct cell *cj , int sid ) { double shift[3]; float h; struct space *s = r->e->s; - float dt_max = r->e->dt_max; + float dt_step = r->e->dt_step; TIMER_TIC @@ -1784,7 +1892,7 @@ void DOSUB2 ( struct runner *r , struct cell *ci , struct cell *cj , int sid ) { if ( cj == NULL ) { /* Should we even bother? */ - if ( ci->dt_min > dt_max ) + if ( ci->dt_min > dt_step ) return; /* Recurse? */ @@ -1811,7 +1919,7 @@ void DOSUB2 ( struct runner *r , struct cell *ci , struct cell *cj , int sid ) { else { /* Should we even bother? */ - if ( ci->dt_min > dt_max && cj->dt_min > dt_max ) + if ( ci->dt_min > dt_step && cj->dt_min > dt_step ) return; /* Get the cell dimensions. */ diff --git a/src/runner_iact.h b/src/runner_iact.h index f4b117ffa68fb65ec788c990e3ef626503a4f3b1..70d07a6b658554e1583b5ac89dc5d30b19863f3e 100644 --- a/src/runner_iact.h +++ b/src/runner_iact.h @@ -329,26 +329,26 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_force ( float r2 float r = sqrtf( r2 ), ri = 1.0f / r; float xi, xj; - float hig_inv, hig2_inv; - float hjg_inv, hjg2_inv; + float hi_inv, hi2_inv; + float hj_inv, hj2_inv; float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr; float f; int k; /* Get the kernel for hi. */ - hig_inv = kernel_igamma / hi; - hig2_inv = hig_inv * hig_inv; - xi = r * hig_inv; + hi_inv = 1.0f / hi; + hi2_inv = hi_inv * hi_inv; + xi = r * hi_inv * kernel_igamma; kernel_deval( xi , &wi , &wi_dx ); - wi_dr = hig2_inv * hig2_inv * wi_dx; + wi_dr = hi2_inv * hi2_inv * wi_dx; /* Get the kernel for hj. */ - hjg_inv = kernel_igamma / hj; - hjg2_inv = hjg_inv * hjg_inv; - xj = r * hjg_inv; + hj_inv = 1.0f / hj; + hj2_inv = hj_inv * hj_inv; + xj = r * hj_inv * kernel_igamma; kernel_deval( xj , &wj , &wj_dx ); - wj_dr = hjg2_inv * hjg2_inv * wj_dx; - + wj_dr = hj2_inv * hj2_inv * wj_dx; + /* Get the common factor out. */ w = ri * ( pi->POrho2 * wi_dr + pj->POrho2 * wj_dr ); @@ -370,7 +370,11 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_force ( float r2 /* Get the time derivative for h. */ pi->h_dt -= pj->mass / pj->rho * dvdr * wi_dr; pj->h_dt -= pi->mass / pi->rho * dvdr * wj_dr; - + + /* Update the signal velocity. */ + pi->v_sig = fmaxf( pi->v_sig , pj->c - 3*dvdr ); + pj->v_sig = fmaxf( pj->v_sig , pi->c - 3*dvdr ); + #ifdef HIST if ( hi > hj ) runner_hist_hit( hi / hj ); @@ -388,8 +392,8 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float vector r, r2, ri; vector xi, xj; vector hi, hj, hi_inv, hj_inv; - vector hig_inv, hig2_inv; - vector hjg_inv, hjg2_inv; + vector hig_inv, hi2_inv; + vector hjg_inv, hj2_inv; vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr; vector w; vector piPOrho2, pjPOrho2, pirho, pjrho; @@ -444,20 +448,20 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float hi_inv.v = vec_rcp( hi.v ); hi_inv.v = hi_inv.v - hi_inv.v * ( hi.v * hi_inv.v - vec_set1( 1.0f ) ); hig_inv.v = vec_set1( kernel_igamma ) * hi_inv.v; - hig2_inv.v = hig_inv.v * hig_inv.v; + hi2_inv.v = hi_inv.v * hi_inv.v; xi.v = r.v * hig_inv.v; kernel_deval_vec( &xi , &wi , &wi_dx ); - wi_dr.v = hig2_inv.v * hig2_inv.v * wi_dx.v; + wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v; /* Get the kernel for hj. */ hj.v = vec_load( Hj ); hj_inv.v = vec_rcp( hj.v ); hj_inv.v = hj_inv.v - hj_inv.v * ( hj.v * hj_inv.v - vec_set1( 1.0f ) ); hjg_inv.v = vec_set1( kernel_igamma ) * hj_inv.v; - hjg2_inv.v = hjg_inv.v * hjg_inv.v; + hj2_inv.v = hj_inv.v * hj_inv.v; xj.v = r.v * hjg_inv.v; kernel_deval_vec( &xj , &wj , &wj_dx ); - wj_dr.v = hjg2_inv.v * hjg2_inv.v * wj_dx.v; + wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v; /* Get the common factor out. */ w.v = ri.v * ( piPOrho2.v * wi_dr.v + pjPOrho2.v * wj_dr.v ); @@ -508,25 +512,25 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_force ( fl float r = sqrtf( r2 ), ri = 1.0f / r; float xi, xj; - float hig_inv, hig2_inv; - float hjg_inv, hjg2_inv; + float hi_inv, hi2_inv; + float hj_inv, hj2_inv; float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr; float f; int k; /* Get the kernel for hi. */ - hig_inv = kernel_igamma / hi; - hig2_inv = hig_inv * hig_inv; - xi = r * hig_inv; + hi_inv = 1.0f / hi; + hi2_inv = hi_inv * hi_inv; + xi = r * hi_inv * kernel_igamma; kernel_deval( xi , &wi , &wi_dx ); - wi_dr = hig2_inv * hig2_inv * wi_dx; + wi_dr = hi2_inv * hi2_inv * wi_dx; /* Get the kernel for hj. */ - hjg_inv = kernel_igamma / hj; - hjg2_inv = hjg_inv * hjg_inv; - xj = r * hjg_inv; + hj_inv = 1.0f / hj; + hj2_inv = hj_inv * hj_inv; + xj = r * hj_inv * kernel_igamma; kernel_deval( xj , &wj , &wj_dx ); - wj_dr = hjg2_inv * hjg2_inv * wj_dx; + wj_dr = hj2_inv * hj2_inv * wj_dx; /* Get the common factor out. */ w = ri * ( pi->POrho2 * wi_dr + pj->POrho2 * wj_dr ); @@ -547,6 +551,10 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_force ( fl /* Get the time derivative for h. */ pi->h_dt -= pj->mass / pj->rho * dvdr * wi_dr; + /* Update the signal velocity (this is always symmetrical). */ + pi->v_sig = fmaxf( pi->v_sig , pj->c - 3*dvdr ); + pj->v_sig = fmaxf( pj->v_sig , pi->c - 3*dvdr ); + } @@ -557,8 +565,8 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force vector r, r2, ri; vector xi, xj; vector hi, hj, hi_inv, hj_inv; - vector hig_inv, hig2_inv; - vector hjg_inv, hjg2_inv; + vector hig_inv, hi2_inv; + vector hjg_inv, hj2_inv; vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr; vector w; vector piPOrho2, pjPOrho2, pjrho; @@ -609,20 +617,20 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force hi_inv.v = vec_rcp( hi.v ); hi_inv.v = hi_inv.v - hi_inv.v * ( hi.v * hi_inv.v - vec_set1( 1.0f ) ); hig_inv.v = vec_set1( kernel_igamma ) * hi_inv.v; - hig2_inv.v = hig_inv.v * hig_inv.v; + hi2_inv.v = hi_inv.v * hi_inv.v; xi.v = r.v * hig_inv.v; kernel_deval_vec( &xi , &wi , &wi_dx ); - wi_dr.v = hig2_inv.v * hig2_inv.v * wi_dx.v; + wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v; /* Get the kernel for hj. */ hj.v = vec_load( Hj ); hj_inv.v = vec_rcp( hj.v ); hj_inv.v = hj_inv.v - hj_inv.v * ( hj.v * hj_inv.v - vec_set1( 1.0f ) ); hjg_inv.v = vec_set1( kernel_igamma ) * hj_inv.v; - hjg2_inv.v = hjg_inv.v * hjg_inv.v; + hj2_inv.v = hj_inv.v * hj_inv.v; xj.v = r.v * hjg_inv.v; kernel_deval_vec( &xj , &wj , &wj_dx ); - wj_dr.v = hjg2_inv.v * hjg2_inv.v * wj_dx.v; + wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v; /* Get the common factor out. */ w.v = ri.v * ( piPOrho2.v * wi_dr.v + pjPOrho2.v * wj_dr.v ); diff --git a/src/space.c b/src/space.c index ba81b3b551a71f6126c4b95f59afe617e6caf5f1..ca43394f3923704b65a3af59ec6650b6f847f236 100644 --- a/src/space.c +++ b/src/space.c @@ -86,28 +86,31 @@ void space_map_prepare ( struct cell *c , void *data ) { int k; float dt_min, dt_max, h_max, dx_max; struct part *p; + struct xpart *xp; /* No children? */ if ( !c->split ) { /* Init with first part. */ p = &c->parts[0]; + xp = p->xtras; dt_min = p->dt; dt_max = p->dt; h_max = p->h; - dx_max = sqrtf( (p->x[0] - p->x_old[0])*(p->x[0] - p->x_old[0]) + - (p->x[1] - p->x_old[1])*(p->x[1] - p->x_old[1]) + - (p->x[2] - p->x_old[2])*(p->x[2] - p->x_old[2]) )*2 + p->h; + dx_max = sqrtf( (p->x[0] - xp->x_old[0])*(p->x[0] - xp->x_old[0]) + + (p->x[1] - xp->x_old[1])*(p->x[1] - xp->x_old[1]) + + (p->x[2] - xp->x_old[2])*(p->x[2] - xp->x_old[2]) )*2 + p->h; /* Loop over parts. */ for ( k = 1 ; k < c->count ; k++ ) { p = &c->parts[k]; + xp = p->xtras; dt_min = fminf( dt_min , p->dt ); dt_max = fmaxf( dt_max , p->dt ); h_max = fmaxf( h_max , p->h ); - dx_max = fmaxf( dx_max , sqrtf( (p->x[0] - p->x_old[0])*(p->x[0] - p->x_old[0]) + - (p->x[1] - p->x_old[1])*(p->x[1] - p->x_old[1]) + - (p->x[2] - p->x_old[2])*(p->x[2] - p->x_old[2]) )*2 + p->h ); + dx_max = fmaxf( dx_max , sqrtf( (p->x[0] - xp->x_old[0])*(p->x[0] - xp->x_old[0]) + + (p->x[1] - xp->x_old[1])*(p->x[1] - xp->x_old[1]) + + (p->x[2] - xp->x_old[2])*(p->x[2] - xp->x_old[2]) )*2 + p->h ); } } @@ -161,10 +164,10 @@ void space_prepare ( struct space *s ) { int k; struct task *t; - float dt_max = s->dt_max, dx_max = 0.0f; + float dt_step = s->dt_step, dx_max = 0.0f; int counts[ task_type_count + 1 ]; - /* Traverse the cells and set their dt_min and dt_max. */ + /* Traverse the cells and set their dt_min and dx_max. */ space_map_cells_post( s , 1 , &space_map_prepare , NULL ); /* Get the maximum displacement in the whole system. */ @@ -179,9 +182,9 @@ void space_prepare ( struct space *s ) { t->type == task_type_self || t->type == task_type_ghost || ( t->type == task_type_sub && t->cj == NULL ) ) - t->skip = ( t->ci->dt_min > dt_max ); + t->skip = ( t->ci->dt_min > dt_step ); else if ( t->type == task_type_pair || ( t->type == task_type_sub && t->cj != NULL ) ) { - t->skip = ( t->ci->dt_min > dt_max && t->cj->dt_min > dt_max ); + t->skip = ( t->ci->dt_min > dt_step && t->cj->dt_min > dt_step ); if ( !t->skip && t->tight && ( t->ci->dx_max > t->ci->dmin || t->cj->dx_max > t->cj->dmin ) ) break; @@ -194,7 +197,7 @@ void space_prepare ( struct space *s ) { /* Re-build the space. */ space_rebuild( s , 0.0 ); - /* Traverse the cells and set their dt_min and dt_max. */ + /* Traverse the cells and set their dt_min and dx_max. */ space_map_cells_post( s , 1 , &space_map_prepare , NULL ); } @@ -576,9 +579,9 @@ void space_rebuild ( struct space *s , double cell_max ) { // tic = getticks(); #pragma omp parallel for schedule(static) for ( k = 0 ; k < s->nr_parts ; k++ ) { - s->parts[k].x_old[0] = s->parts[k].x[0]; - s->parts[k].x_old[1] = s->parts[k].x[1]; - s->parts[k].x_old[2] = s->parts[k].x[2]; + s->parts[k].xtras->x_old[0] = s->parts[k].x[0]; + s->parts[k].xtras->x_old[1] = s->parts[k].x[1]; + s->parts[k].xtras->x_old[2] = s->parts[k].x[2]; } // printf( "space_rebuild: storing old positions took %.3f ms.\n" , (double)(getticks() - tic) / CPU_TPS * 1000 ); @@ -877,7 +880,7 @@ void space_splittasks ( struct space *s ) { struct cell *ci, *cj; double hi, hj, shift[3]; struct task *t; - float dt_max = s->dt_max; + float dt_step = s->dt_step; int pts[7][8] = { { -1 , 12 , 10 , 9 , 4 , 3 , 1 , 0 } , { -1 , -1 , 11 , 10 , 5 , 4 , 2 , 1 } , { -1 , -1 , -1 , 12 , 7 , 6 , 4 , 3 } , @@ -899,7 +902,7 @@ void space_splittasks ( struct space *s ) { ci = t->ci; /* Ingore this task? */ - if ( ci->dt_min > dt_max ) { + if ( ci->dt_min > dt_step ) { t->skip = 1; continue; } @@ -953,7 +956,7 @@ void space_splittasks ( struct space *s ) { hj = fmin( cj->h[0] , fmin( cj->h[1] , cj->h[2] ) ); /* Ingore this task? */ - if ( ci->dt_min > dt_max && cj->dt_min > dt_max ) { + if ( ci->dt_min > dt_step && cj->dt_min > dt_step ) { t->skip = 1; continue; } @@ -1338,7 +1341,7 @@ void space_maketasks ( struct space *s , int do_sort ) { int *cdim = s->cdim; struct task *t, *t2; struct cell *ci, *cj; - // float dt_max = s->dt_max; + float dt_step = s->dt_step; /* Allocate the task-list, if needed. */ if ( s->tasks == NULL || s->tasks_size < s->tot_cells * space_maxtaskspercell ) { @@ -1364,7 +1367,7 @@ void space_maketasks ( struct space *s , int do_sort ) { ci = &s->cells[cid]; if ( ci->count == 0 ) continue; - // if ( ci->dt_min <= dt_max ) + if ( ci->dt_min <= dt_step ) space_addtask( s , task_type_self , task_subtype_density , 0 , 0 , ci , NULL , 0 ); for ( ii = -1 ; ii < 2 ; ii++ ) { iii = i + ii; @@ -1383,8 +1386,8 @@ void space_maketasks ( struct space *s , int do_sort ) { kkk = ( kkk + cdim[2] ) % cdim[2]; cjd = cell_getid( cdim , iii , jjj , kkk ); cj = &s->cells[cjd]; - if ( cid >= cjd || cj->count == 0 /* || - ( ci->dt_min > dt_max && cj->dt_min > dt_max ) */ ) + if ( cid >= cjd || cj->count == 0 || + ( ci->dt_min > dt_step && cj->dt_min > dt_step ) ) continue; sid = sortlistID[ (kk+1) + 3*( (jj+1) + 3*(ii+1) ) ]; t = space_addtask( s , task_type_pair , task_subtype_density , sid , 0 , ci , cj , 1 ); @@ -1688,6 +1691,8 @@ struct cell *space_getcell ( struct space *s ) { void space_init ( struct space *s , double dim[3] , struct part *parts , int N , int periodic , double h_max ) { + int k; + /* Store eveything in the space. */ s->dim[0] = dim[0]; s->dim[1] = dim[1]; s->dim[2] = dim[2]; s->periodic = periodic; @@ -1696,9 +1701,15 @@ void space_init ( struct space *s , double dim[3] , struct part *parts , int N , s->cell_min = h_max; /* Allocate the cparts array. */ - if ( posix_memalign( (void *)&s->cparts , 32 , N * sizeof(struct cpart) ) != 0 ) + if ( posix_memalign( (void *)&s->cparts , 32 , N * sizeof(struct cpart) ) != 0 ) error( "Failed to allocate cparts." ); + /* Allocate and link the xtra parts array. */ + if ( posix_memalign( (void *)&s->xparts , 32 , N * sizeof(struct xpart) ) != 0 ) + error( "Failed to allocate xparts." ); + for ( k = 0 ; k < N ; k++ ) + s->parts[k].xtras = &s->xparts[k]; + /* Init the space lock. */ if ( lock_init( &s->lock ) != 0 ) error( "Failed to create space spin-lock." ); diff --git a/src/space.h b/src/space.h index a93ac3a5db939c73ff8379cee81914489cece75e..13aa6c300dce0bf5e521840dfac0f0b8573bdcd0 100644 --- a/src/space.h +++ b/src/space.h @@ -62,7 +62,7 @@ struct space { double h_min, h_max, cell_min; /* Current time step for particles. */ - float dt_max; + float dt_step; /* Number of cells. */ int nr_cells, tot_cells; @@ -79,6 +79,7 @@ struct space { /* The particle data (cells have pointers to this). */ struct part *parts; struct cpart *cparts; + struct xpart *xparts; /* The total number of parts in the space. */ int nr_parts;