Commit 072b0d3f authored by Pedro Gonnet's avatar Pedro Gonnet
Browse files

tighten up code for integrator kick.


Former-commit-id: 52ee3b0259133472f476542be1b3461d1c9a9e12
parent 58a9668e
......@@ -40,27 +40,28 @@ void printParticle ( struct part *parts , long long int id, int N ) {
/* Look for the particle. */
for ( i = 0 ; i < N && parts[i].id != id; i++ );
if(i < N)
printf("## Particle[%d]: id=%lld, x=[%.3e,%.3e,%.3e], v=[%.3e,%.3e,%.3e], a=[%.3e,%.3e,%.3e], h=%.3e, h_dt=%.3e, wcount=%.3e, m=%.3e, rho=%.3e, rho_dh=%.3e, div_v=%.3e, u=%.3e, dudt=%.3e, bals=%.3e, POrho2=%.3e, v_sig=%.3e, dt=%.3e\n",
i,
parts[i].id,
parts[i].x[0], parts[i].x[1], parts[i].x[2],
parts[i].v[0], parts[i].v[1], parts[i].v[2],
parts[i].a[0], parts[i].a[1], parts[i].a[2],
parts[i].h,
parts[i].force.h_dt,
parts[i].wcount,
parts[i].mass,
parts[i].rho, parts[i].rho_dh,
parts[i].density.div_v,
parts[i].u,
parts[i].force.u_dt,
parts[i].force.balsara,
parts[i].force.POrho2,
parts[i].force.v_sig,
parts[i].dt
);
if ( i < N )
printf("## Particle[%d]: id=%lld, x=[%e,%e,%e], v=[%.3e,%.3e,%.3e], a=[%.3e,%.3e,%.3e], h=%.3e, h_dt=%.3e, wcount=%.3e, m=%.3e, rho=%.3e, rho_dh=%.3e, div_v=%.3e, u=%.3e, dudt=%.3e, bals=%.3e, POrho2=%.3e, v_sig=%.3e, dt=%.3e\n",
i,
parts[i].id,
parts[i].x[0], parts[i].x[1], parts[i].x[2],
parts[i].v[0], parts[i].v[1], parts[i].v[2],
parts[i].a[0], parts[i].a[1], parts[i].a[2],
parts[i].h,
parts[i].force.h_dt,
parts[i].wcount,
parts[i].mass,
parts[i].rho, parts[i].rho_dh,
parts[i].density.div_v,
parts[i].u,
parts[i].force.u_dt,
parts[i].force.balsara,
parts[i].force.POrho2,
parts[i].force.v_sig,
parts[i].dt
);
else
printf("## Particles[???] id=%lld not found\n", id);
}
printf("## Particles[???] id=%lld not found\n", id);
}
......@@ -163,8 +163,9 @@ void engine_map_kick_first ( struct cell *c , void *data ) {
int j, k;
struct engine *e = (struct engine *)data;
float dt_step = e->dt_step, dt = e->dt, hdt = 0.5f*dt;
float dt_min, dt_max, h_max, dx_max;
float pdt, dt_step = e->dt_step, dt = e->dt, hdt = 0.5f*dt;
float dt_min, dt_max, h_max, dx_max, a[3], v[3], u, u_dt, h, h_dt, rho, v_old[3];
double x[3], x_old[3];
struct part *restrict p;
struct xpart *restrict xp;
struct cpart *restrict cp;
......@@ -186,47 +187,59 @@ void engine_map_kick_first ( struct cell *c , void *data ) {
xp = p->xtras;
cp = &c->cparts[k];
/* Load the data locally. */
a[0] = p->a[0]; a[1] = p->a[1]; a[2] = p->a[2];
v[0] = p->v[0]; v[1] = p->v[1]; v[2] = p->v[2];
x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2];
x_old[0] = xp->x_old[0]; x_old[1] = xp->x_old[1]; x_old[2] = xp->x_old[2];
h = p->h;
u = p->u;
h_dt = p->force.h_dt;
u_dt = p->force.u_dt;
pdt = p->dt;
/* Store the min/max dt. */
dt_min = fminf( dt_min , p->dt );
dt_max = fmaxf( dt_max , p->dt );
dt_min = fminf( dt_min , pdt );
dt_max = fmaxf( dt_max , pdt );
/* Step and store the velocity and internal energy. */
xp->v_old[0] = p->v[0] + hdt * p->a[0];
xp->v_old[1] = p->v[1] + hdt * p->a[1];
xp->v_old[2] = p->v[2] + hdt * p->a[2];
xp->v_old[0] = v_old[0] = v[0] + hdt * a[0];
xp->v_old[1] = v_old[1] = v[1] + hdt * a[1];
xp->v_old[2] = v_old[2] = v[2] + hdt * a[2];
xp->u_old = p->u + hdt * p->force.u_dt;
/* Move the particles with the velocitie at the half-step. */
p->x[0] += dt * xp->v_old[0];
p->x[1] += dt * xp->v_old[1];
p->x[2] += dt * xp->v_old[2];
dx_max = fmaxf( dx_max , sqrtf( (p->x[0] - xp->x_old[0])*(p->x[0] - xp->x_old[0]) +
(p->x[1] - xp->x_old[1])*(p->x[1] - xp->x_old[1]) +
(p->x[2] - xp->x_old[2])*(p->x[2] - xp->x_old[2]) )*2 + p->h );
p->x[0] = x[0] += dt * v_old[0];
p->x[1] = x[1] += dt * v_old[1];
p->x[2] = x[2] += dt * v_old[2];
dx_max = fmaxf( dx_max , sqrtf( (x[0] - x_old[0])*(x[0] - x_old[0]) +
(x[1] - x_old[1])*(x[1] - x_old[1]) +
(x[2] - x_old[2])*(x[2] - x_old[2]) )*2 + h );
/* Update positions and energies at the half-step. */
p->v[0] += dt * p->a[0];
p->v[1] += dt * p->a[1];
p->v[2] += dt * p->a[2];
p->u *= expf( p->force.u_dt / p->u * dt );
p->h *= expf( p->force.h_dt / p->h * dt );
h_max = fmaxf( h_max , p->h );
p->v[0] = v[0] += dt * a[0];
p->v[1] = v[1] += dt * a[1];
p->v[2] = v[2] += dt * a[2];
p->u = u += u_dt * dt;
p->h = h += h_dt * dt;
h_max = fmaxf( h_max , h );
/* Integrate other values if this particle will not be updated. */
if ( p->dt > dt_step ) {
p->rho *= expf( -3.0f * p->force.h_dt / p->h * dt );
p->force.POrho2 = p->u * ( const_gamma - 1.0f ) / ( p->rho + p->h * p->rho_dh / 3.0f );
}
/* Fill the cpart. */
cp->x[0] = p->x[0];
cp->x[1] = p->x[1];
cp->x[2] = p->x[2];
cp->h = p->h;
cp->dt = p->dt;
cp->x[0] = x[0];
cp->x[1] = x[1];
cp->x[2] = x[2];
cp->h = h;
cp->dt = pdt;
/* Integrate other values if this particle will not be updated. */
/* Init fields for density calculation. */
if ( p->dt <= dt_step ) {
if ( pdt > dt_step ) {
rho = p->rho *= expf( -3.0f * h_dt / h * dt );
// rho = p->rho += cbrt( h_dt ) * dt;
p->force.POrho2 = u * ( const_gamma - 1.0f ) / ( rho + h * p->rho_dh / 3.0f );
}
else {
p->wcount = 0.0f;
p->density.wcount_dh = 0.0f;
p->rho = 0.0f;
......@@ -276,7 +289,11 @@ void engine_map_kick_first ( struct cell *c , void *data ) {
}
void engine_single2 ( double *dim , long long int pid , struct part *__restrict__ parts , int N , int periodic ) {
/**
* @brief Compute the force on a single particle brute-force.
*/
void engine_single ( double *dim , long long int pid , struct part *__restrict__ parts , int N , int periodic ) {
int i, k;
double r2, dx[3];
......@@ -342,9 +359,11 @@ void engine_step ( struct engine *e , int sort_queues ) {
struct part *restrict parts = e->s->parts, *restrict p;
struct xpart *restrict xp;
float dt = e->dt, hdt = 0.5*dt, dt_step, dt_max, dt_min, ldt_min, ldt_max;
float pdt, h, m, v[3], u;
double x[3];
double epot = 0.0, ekin = 0.0, lepot, lekin, lmom[3], mom[3] = { 0.0 , 0.0 , 0.0 };
double lang[3], ang[3] = { 0.0 , 0.0 , 0.0 };
double lent, ent = 0.0;
// double lent, ent = 0.0;
int threadID, nthreads, count = 0, lcount;
/* Get the maximum dt. */
......@@ -400,62 +419,68 @@ void engine_step ( struct engine *e , int sort_queues ) {
// for(k=0; k<10; ++k)
// printParticle(parts, k);
// engine_single2( e->s->dim , 432626 , e->s->parts , e->s->nr_parts , e->s->periodic );
// engine_single( e->s->dim , 432626 , e->s->parts , e->s->nr_parts , e->s->periodic );
// printParticle( parts , 432626 );
// printParticle( parts , 432628 );
/* Second kick. */
TIMER_TIC_ND
dt_min = FLT_MAX; dt_max = 0.0f;
#pragma omp parallel private(p,xp,k,ldt_min,lcount,ldt_max,lmom,lang,lent,lekin,lepot,threadID,nthreads)
#pragma omp parallel private(p,xp,k,ldt_min,lcount,ldt_max,lmom,lang,lekin,lepot,threadID,nthreads,pdt,v,h,m,x,u)
{
threadID = omp_get_thread_num();
nthreads = omp_get_num_threads();
ldt_min = FLT_MAX; ldt_max = 0.0f;
lmom[0] = 0.0; lmom[1] = 0.0; lmom[2] = 0.0;
lang[0] = 0.0; lang[1] = 0.0; lang[2] = 0.0;
lekin = 0.0; lepot = 0.0; lent=0.0; lcount = 0;
lekin = 0.0; lepot = 0.0; /* lent=0.0; */ lcount = 0;
for ( k = nr_parts * threadID / nthreads ; k < nr_parts * (threadID + 1) / nthreads ; k++ ) {
/* Get a handle on the part. */
p = &parts[k];
xp = p->xtras;
/* Get local copies of particle data. */
pdt = p->dt;
h = p->h;
m = p->mass;
x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2];
/* Scale the derivatives if they're freshly computed. */
if ( p->dt <= dt_step ) {
p->force.h_dt *= p->h * 0.333333333f;
if ( pdt <= dt_step ) {
p->force.h_dt *= h * 0.333333333f;
lcount += 1;
}
/* Update the particle's time step. */
p->dt = const_cfl * p->h / ( p->force.v_sig );
p->dt = pdt = const_cfl * h / ( p->force.v_sig );
/* Update positions and energies at the half-step. */
p->v[0] = xp->v_old[0] + hdt * p->a[0];
p->v[1] = xp->v_old[1] + hdt * p->a[1];
p->v[2] = xp->v_old[2] + hdt * p->a[2];
p->u = xp->u_old + hdt * p->force.u_dt;
p->v[0] = v[0] = xp->v_old[0] + hdt * p->a[0];
p->v[1] = v[1] = xp->v_old[1] + hdt * p->a[1];
p->v[2] = v[2] = xp->v_old[2] + hdt * p->a[2];
p->u = u = xp->u_old + hdt * p->force.u_dt;
/* Get the smallest/largest dt. */
ldt_min = fminf( ldt_min , p->dt );
ldt_max = fmaxf( ldt_max , p->dt );
ldt_min = fminf( ldt_min , pdt );
ldt_max = fmaxf( ldt_max , pdt );
/* Collect total energy. */
lekin += 0.5 * p->mass * ( p->v[0]*p->v[0] + p->v[1]*p->v[1] + p->v[2]*p->v[2] );
lepot += p->mass * p->u;
lekin += 0.5 * m * ( v[0]*v[0] + v[1]*v[1] + v[2]*v[2] );
lepot += m * u;
/* Collect momentum */
lmom[0] += p->mass * p->v[0];
lmom[1] += p->mass * p->v[1];
lmom[2] += p->mass * p->v[2];
lmom[0] += m * p->v[0];
lmom[1] += m * p->v[1];
lmom[2] += m * p->v[2];
/* Collect angular momentum */
lang[0] += p->mass * ( p->x[1]*p->v[2] - p->v[2]*p->x[1] );
lang[1] += p->mass * ( p->x[2]*p->v[0] - p->v[0]*p->x[2] );
lang[2] += p->mass * ( p->x[0]*p->v[1] - p->v[1]*p->x[0] );
lang[0] += m * ( x[1]*v[2] - v[2]*x[1] );
lang[1] += m * ( x[2]*v[0] - v[0]*x[2] );
lang[2] += m * ( x[0]*v[1] - v[1]*x[0] );
/* Collect entropic function */
lent += p->u * pow( p->rho, 1.f-const_gamma );
// lent += u * pow( p->rho, 1.f-const_gamma );
}
#pragma omp critical
......@@ -468,7 +493,7 @@ void engine_step ( struct engine *e , int sort_queues ) {
ang[0] += lang[0];
ang[1] += lang[1];
ang[2] += lang[2];
ent += (const_gamma -1.) * lent;
// ent += (const_gamma -1.) * lent;
epot += lepot;
ekin += lekin;
count += lcount;
......@@ -481,7 +506,7 @@ void engine_step ( struct engine *e , int sort_queues ) {
printf( "engine_step: etot is %e (ekin=%e, epot=%e).\n" , ekin+epot , ekin , epot ); fflush(stdout);
printf( "engine_step: total momentum is [ %e , %e , %e ].\n" , mom[0] , mom[1] , mom[2] ); fflush(stdout);
printf( "engine_step: total angular momentum is [ %e , %e , %e ].\n" , ang[0] , ang[1] , ang[2] ); fflush(stdout);
printf( "engine_step: total entropic function is %e .\n", ent ); fflush(stdout);
// printf( "engine_step: total entropic function is %e .\n", ent ); fflush(stdout);
printf( "engine_step: updated %i parts (dt_step=%.3e).\n" , count , dt_step ); fflush(stdout);
/* Increase the step counter. */
......
......@@ -70,7 +70,7 @@ struct part {
float rho_dh;
/* Store density/force specific stuff. */
// union {
union {
struct {
......@@ -107,7 +107,7 @@ struct part {
} force;
// };
};
/* Particle pressure. */
// float P;
......
......@@ -411,9 +411,6 @@ void runner_doghost ( struct runner *r , struct cell *c ) {
continue;
}
if ( p->id == 432626 )
printf( "runner_doghost: updating particle %lli.\n" , p->id );
/* Pre-compute some stuff for the balsara switch. */
normDiv_v = fabs( p->density.div_v / p->rho * ihg4 );
normCurl_v = sqrtf( p->density.curl_v[0] * p->density.curl_v[0] + p->density.curl_v[1] * p->density.curl_v[1] + p->density.curl_v[2] * p->density.curl_v[2] ) / p->rho * ihg4;
......
......@@ -27,32 +27,60 @@
#define VEC_MACRO(elcount, type) __attribute__((vector_size((elcount)*sizeof(type)))) type
/* So what will the vector size be? */
#ifdef NO__AVX__
#ifdef __AVX__
#define VECTORIZE
#define VEC_SIZE 8
#define VEC_FLOAT __m256
#define VEC_DBL __m256d
#define VEC_INT __m256i
#define vec_load(a) _mm256_load_ps(a)
#define vec_set1(a) _mm256_set1_ps(a)
#define vec_set(a,b,c,d,e,f,g,h) _mm256_set_ps(h,g,f,e,d,c,b,a)
#define vec_dbl_set(a,b,c,d) _mm256_set_pd(d,c,b,a)
#define vec_sqrt(a) _mm256_sqrt_ps(a)
#define vec_rcp(a) _mm256_rcp_ps(a)
#define vec_rsqrt(a) _mm256_rsqrt_ps(a)
#define vec_ftoi(a) _mm256_cvttps_epi32(a)
#define vec_fmin(a,b) _mm256_min_ps(a,b)
#define vec_fmax(a,b) _mm256_max_ps(a,b)
#elif defined( NO__SSE2__ )
#define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a,0))
#define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a,1))
#define vec_dbl_tofloat(a,b) _mm256_insertf128( _mm256_castps128_ps256(a) , b , 1 )
#define vec_dbl_load(a) _mm256_load_pd(a)
#define vec_dbl_set1(a) _mm256_set1_pd(a)
#define vec_dbl_sqrt(a) _mm256_sqrt_pd(a)
#define vec_dbl_rcp(a) _mm256_rcp_pd(a)
#define vec_dbl_rsqrt(a) _mm256_rsqrt_pd(a)
#define vec_dbl_ftoi(a) _mm256_cvttpd_epi32(a)
#define vec_dbl_fmin(a,b) _mm256_min_pd(a,b)
#define vec_dbl_fmax(a,b) _mm256_max_pd(a,b)
#elif defined( __SSE2__ )
#define VECTORIZE
#define VEC_SIZE 4
#define VEC_FLOAT __m128
#define VEC_DBL __m128d
#define VEC_INT __m128i
#define vec_load(a) _mm_load_ps(a)
#define vec_set1(a) _mm_set1_ps(a)
#define vec_set(a,b,c,d) _mm_set_ps(d,c,b,a)
#define vec_dbl_set(a,b) _mm_set_pd(b,a)
#define vec_sqrt(a) _mm_sqrt_ps(a)
#define vec_rcp(a) _mm_rcp_ps(a)
#define vec_rsqrt(a) _mm_rsqrt_ps(a)
#define vec_ftoi(a) _mm_cvttps_epi32(a)
#define vec_fmin(a,b) _mm_min_ps(a,b)
#define vec_fmax(a,b) _mm_max_ps(a,b)
#define vec_todbl_lo(a) _mm_cvtps_pd(a)
#define vec_todbl_hi(a) _mm_cvtps_pd(_mm_movehl_ps(a,a))
#define vec_dbl_tofloat(a,b) _mm_movelh_ps( _mm_cvtpd_ps(a) , _mm_cvtpd_ps(b) )
#define vec_dbl_load(a) _mm_load_pd(a)
#define vec_dbl_set1(a) _mm_set1_pd(a)
#define vec_dbl_sqrt(a) _mm_sqrt_pd(a)
#define vec_dbl_rcp(a) _mm_rcp_pd(a)
#define vec_dbl_rsqrt(a) _mm_rsqrt_pd(a)
#define vec_dbl_ftoi(a) _mm_cvttpd_epi32(a)
#define vec_dbl_fmin(a,b) _mm_min_pd(a,b)
#define vec_dbl_fmax(a,b) _mm_max_pd(a,b)
#else
#define VEC_SIZE 4
#endif
......@@ -74,8 +102,10 @@
// VEC_MACRO(VEC_SIZE,float) v;
// VEC_MACRO(VEC_SIZE,int) m;
VEC_FLOAT v;
VEC_DBL vd;
VEC_INT m;
float f[VEC_SIZE];
double d[VEC_SIZE/2];
int i[VEC_SIZE];
} vector;
#endif
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment