diff --git a/src/engine.c b/src/engine.c index 4872df41e0549c76c325edab69756c1d3eb09884..898ac2f1f32bb79b2242cd9cbb97bfc33d1def2f 100644 --- a/src/engine.c +++ b/src/engine.c @@ -176,7 +176,7 @@ void engine_map_kick_first ( struct cell *c , void *data ) { struct engine *e = (struct engine *)data; float pdt, dt_step = e->dt_step, dt = e->dt, hdt = 0.5f*dt; float dt_min, dt_max, h_max, dx, h2dx_max, dx_max; - float a[3], v[3], u, u_dt, h, h_dt, rho, v_old[3]; + float a[3], v[3], u, u_dt, h, h_dt, rho, v_old[3], w; double x[3], x_old[3]; struct part *restrict p; struct xpart *restrict xp; @@ -235,12 +235,16 @@ void engine_map_kick_first ( struct cell *c , void *data ) { p->v[0] = v[0] += dt * a[0]; p->v[1] = v[1] += dt * a[1]; p->v[2] = v[2] += dt * a[2]; - p->u = u += u_dt * dt; - // p->h = h += h_dt * dt; - { - float w = h_dt / h *dt; - p->h = h *= 1.0f + w*( -1.0f + w*( 0.5f + w*(-1.0f/6.0f + 1.0f/24.0*w ) ) ); - } + w = u_dt / u * dt; + if ( w > 0.9f && w < 1.1f ) + p->u = u *= 1.0f + w*( -1.0f + w*( 0.5f - w*1.0f/6.0f ) ); + else + p->u = u *= expf( w ); + w = h_dt / h * dt; + if ( w > 0.9f && w < 1.1f ) + p->h = h *= 1.0f + w*( -1.0f + w*( 0.5f - w*1.0f/6.0f ) ); + else + p->h = h *= expf( w ); h_max = fmaxf( h_max , h ); @@ -421,7 +425,6 @@ void engine_step ( struct engine *e , int sort_queues ) { float dt = e->dt, dt_step, dt_max = 0.0f, dt_min = FLT_MAX; float epot = 0.0, ekin = 0.0, mom[3] = { 0.0 , 0.0 , 0.0 }; float ang[3] = { 0.0 , 0.0 , 0.0 }; - // double lent, ent = 0.0; int count = 0; struct cell *c; @@ -518,6 +521,7 @@ void engine_step ( struct engine *e , int sort_queues ) { } else { if ( e->dt == 0 ) { + // e->nullstep += 1; e->dt = e->dt_orig; while ( dt_min < e->dt ) e->dt *= 0.5; @@ -540,7 +544,7 @@ void engine_step ( struct engine *e , int sort_queues ) { } /* Set the system time. */ - e->time = e->dt * e->step; + e->time = e->dt * (e->step - e->nullstep); TIMER_TOC2(timer_step); @@ -573,6 +577,8 @@ void engine_init ( struct engine *e , struct space *s , float dt , int nr_thread e->nr_queues = nr_queues; e->policy = policy; e->step = 0; + e->nullstep = 0; + e->time = 0.0; /* First of all, init the barrier and lock it. */ if ( pthread_mutex_init( &e->barrier_mutex , NULL ) != 0 ) diff --git a/src/engine.h b/src/engine.h index 5d332ad88adf21f6cecae250a11f8bd221b73694..4e29a7b670790d094fb7616777c39ea9ef7da464 100644 --- a/src/engine.h +++ b/src/engine.h @@ -61,7 +61,7 @@ struct engine { float dt, dt_orig; /* The current step number. */ - int step; + int step, nullstep; /* The current system time. */ float time; diff --git a/src/runner.c b/src/runner.c index 8f639b89e84680aa2e0ff09d0b3f24a0c424491a..51af88c1513190bb18ce20dd3a549678d15a6cc2 100644 --- a/src/runner.c +++ b/src/runner.c @@ -432,7 +432,7 @@ void runner_doghost ( struct runner *r , struct cell *c ) { /* Balsara switch */ p->force.balsara = normCurl_v / ( normDiv_v + normCurl_v + 0.0001f * fc * ihg ); - + /* Reset the acceleration. */ for ( k = 0 ; k < 3 ; k++ ) p->a[k] = 0.0f; diff --git a/src/runner_iact.h b/src/runner_iact.h index 79708a311724959a9202ccd009a1b02f5cfc5659..c1a156edbc23e7ed5b52071dd4d35e4fe4e2c17b 100644 --- a/src/runner_iact.h +++ b/src/runner_iact.h @@ -362,16 +362,16 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_force ( float r2 int k; /* Get the kernel for hi. */ - hi_inv = 1.0f / hi; + hi_inv = kernel_igamma / hi; hi2_inv = hi_inv * hi_inv; - xi = r * hi_inv * kernel_igamma; + xi = r * hi_inv; kernel_deval( xi , &wi , &wi_dx ); wi_dr = hi2_inv * hi2_inv * wi_dx; /* Get the kernel for hj. */ - hj_inv = 1.0f / hj; + hj_inv = kernel_igamma / hj; hj2_inv = hj_inv * hj_inv; - xj = r * hj_inv * kernel_igamma; + xj = r * hj_inv; kernel_deval( xj , &wj , &wj_dx ); wj_dr = hj2_inv * hj2_inv * wj_dx; @@ -389,7 +389,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_force ( float r2 Pi_ij = -const_viscosity_alpha * v_sig * omega_ij / (pi->rho + pj->rho); /* Apply balsara switch */ - Pi_ij *= (pi->force.balsara + pj->force.balsara); + Pi_ij *= ( pi->force.balsara + pj->force.balsara ); /* Get the common factor out. */ w = ri * ( ( pi->force.POrho2 * wi_dr + pj->force.POrho2 * wj_dr ) + 0.25f * Pi_ij * ( wi_dr + wj_dr ) ); @@ -434,8 +434,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float vector r, r2, ri; vector xi, xj; vector hi, hj, hi_inv, hj_inv; - vector hig_inv, hi2_inv; - vector hjg_inv, hj2_inv; + vector hi2_inv, hj2_inv; vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr; vector w; vector piPOrho2, pjPOrho2, pirho, pjrho; @@ -500,22 +499,20 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float r.v = r2.v * ri.v; /* Get the kernel for hi. */ - hi.v = vec_load( Hi ); + hi.v = vec_set1( kernel_gamma ) * vec_load( Hi ); hi_inv.v = vec_rcp( hi.v ); hi_inv.v = hi_inv.v - hi_inv.v * ( hi.v * hi_inv.v - vec_set1( 1.0f ) ); - hig_inv.v = vec_set1( kernel_igamma ) * hi_inv.v; hi2_inv.v = hi_inv.v * hi_inv.v; - xi.v = r.v * hig_inv.v; + xi.v = r.v * hi_inv.v; kernel_deval_vec( &xi , &wi , &wi_dx ); wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v; /* Get the kernel for hj. */ - hj.v = vec_load( Hj ); + hj.v = vec_set1( kernel_gamma ) * vec_load( Hj ); hj_inv.v = vec_rcp( hj.v ); hj_inv.v = hj_inv.v - hj_inv.v * ( hj.v * hj_inv.v - vec_set1( 1.0f ) ); - hjg_inv.v = vec_set1( kernel_igamma ) * hj_inv.v; hj2_inv.v = hj_inv.v * hj_inv.v; - xj.v = r.v * hjg_inv.v; + xj.v = r.v * hj_inv.v; kernel_deval_vec( &xj , &wj , &wj_dx ); wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v; @@ -595,16 +592,16 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_force ( fl int k; /* Get the kernel for hi. */ - hi_inv = 1.0f / hi; + hi_inv = kernel_igamma / hi; hi2_inv = hi_inv * hi_inv; - xi = r * hi_inv * kernel_igamma; + xi = r * hi_inv; kernel_deval( xi , &wi , &wi_dx ); wi_dr = hi2_inv * hi2_inv * wi_dx; /* Get the kernel for hj. */ - hj_inv = 1.0f / hj; + hj_inv = kernel_igamma / hj; hj2_inv = hj_inv * hj_inv; - xj = r * hj_inv * kernel_igamma; + xj = r * hj_inv; kernel_deval( xj , &wj , &wj_dx ); wj_dr = hj2_inv * hj2_inv * wj_dx; @@ -664,8 +661,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force vector r, r2, ri; vector xi, xj; vector hi, hj, hi_inv, hj_inv; - vector hig_inv, hi2_inv; - vector hjg_inv, hj2_inv; + vector hi2_inv, hj2_inv; vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr; vector w; vector piPOrho2, pjPOrho2, pirho, pjrho; @@ -728,22 +724,20 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force r.v = r2.v * ri.v; /* Get the kernel for hi. */ - hi.v = vec_load( Hi ); + hi.v = vec_set1( kernel_gamma ) * vec_load( Hi ); hi_inv.v = vec_rcp( hi.v ); hi_inv.v = hi_inv.v - hi_inv.v * ( hi.v * hi_inv.v - vec_set1( 1.0f ) ); - hig_inv.v = vec_set1( kernel_igamma ) * hi_inv.v; hi2_inv.v = hi_inv.v * hi_inv.v; - xi.v = r.v * hig_inv.v; + xi.v = r.v * hi_inv.v; kernel_deval_vec( &xi , &wi , &wi_dx ); wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v; /* Get the kernel for hj. */ - hj.v = vec_load( Hj ); + hj.v = vec_set1( kernel_gamma ) * vec_load( Hj ); hj_inv.v = vec_rcp( hj.v ); hj_inv.v = hj_inv.v - hj_inv.v * ( hj.v * hj_inv.v - vec_set1( 1.0f ) ); - hjg_inv.v = vec_set1( kernel_igamma ) * hj_inv.v; hj2_inv.v = hj_inv.v * hj_inv.v; - xj.v = r.v * hjg_inv.v; + xj.v = r.v * hj_inv.v; kernel_deval_vec( &xj , &wj , &wj_dx ); wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v; diff --git a/src/vector.h b/src/vector.h index 65f0d30c67601e8d17d8347e508aacdd30fffcc2..39f1385e57b5fec3cfc30911c9ed900d6ba78524 100644 --- a/src/vector.h +++ b/src/vector.h @@ -27,7 +27,7 @@ #define VEC_MACRO(elcount, type) __attribute__((vector_size((elcount)*sizeof(type)))) type /* So what will the vector size be? */ - #ifdef NO__AVX__ + #ifdef __AVX__ #define VECTORIZE #define VEC_SIZE 8 #define VEC_FLOAT __m256 @@ -54,7 +54,7 @@ #define vec_dbl_ftoi(a) _mm256_cvttpd_epi32(a) #define vec_dbl_fmin(a,b) _mm256_min_pd(a,b) #define vec_dbl_fmax(a,b) _mm256_max_pd(a,b) - #elif defined( NO__SSE2__ ) + #elif defined( __SSE2__ ) #define VECTORIZE #define VEC_SIZE 4 #define VEC_FLOAT __m128