diff --git a/src/gadgetsmp.h b/src/gadgetsmp.h index a14888a70ea2cd972b0e152c70ff7f3781fb77d3..866413474718a9bc9c5bfb1d1c050d1e2f4cb3a9 100644 --- a/src/gadgetsmp.h +++ b/src/gadgetsmp.h @@ -22,6 +22,7 @@ /* Local headers. */ #include "cycle.h" +#include "const.h" #include "lock.h" #include "task.h" #include "part.h" diff --git a/src/part.h b/src/part.h index 07e639a1aa59166f7ddf6fd25e84738d38ada638..80a93d628bb07e8d5f8923ec8047bcf6ac6c1078 100644 --- a/src/part.h +++ b/src/part.h @@ -21,6 +21,7 @@ /* Some constants. */ #define part_maxwait 3 #define part_maxunlock 39 +#define part_dtmax 10 /* Data of a single particle. */ @@ -30,14 +31,17 @@ struct part { float h; /* Particle time-step. */ - float dt; + int dt; - /* Particle ID. */ - int id; + /* Particle mass. */ + float mass; /* Particle density. */ float rho; + /* Particle ID. */ + int id; + /* Particle position. */ double x[3]; @@ -50,8 +54,8 @@ struct part { /* Particle pressure. */ float P; - /* Particle mass. */ - float m; + /* Aggregate quantities. */ + float POrho2; /* Particle internal energy. */ float u; @@ -59,11 +63,15 @@ struct part { /* Change in particle energy over time. */ float u_dt; + /* Change in smoothing length over time. */ + float h_dt; + /* Derivative of the density with respect to this particle's smoothing length. */ float rho_dh; /* Particle number density. */ int icount; + float wcount; } __attribute__((aligned (32))); diff --git a/src/queue.c b/src/queue.c index fca97701093dc04a7f99f1e1b5e41e044dc76566..367b9423fb1872cfecbb7ab7f2eb6b3b19202777 100644 --- a/src/queue.c +++ b/src/queue.c @@ -306,7 +306,7 @@ struct task *queue_gettask_new ( struct queue *q , int rid , int blocking , int continue; /* Get the score for this task. */ - if ( res->type == task_type_self || res->type == task_type_sort || ( res->type == task_type_sub && res->cj == NULL ) ) + if ( res->type == task_type_self || res->type == task_type_ghost || res->type == task_type_sort || ( res->type == task_type_sub && res->cj == NULL ) ) score = ( res->ci->owner == rid ); else score = ( res->ci->owner == rid ) + ( res->cj->owner == rid ); diff --git a/src/runner.c b/src/runner.c index b7f10d76e73f853a7e027e4f5cee09b56b033958..dd38394b7c57dcfae922bc510e6be2fee15dfb23 100644 --- a/src/runner.c +++ b/src/runner.c @@ -33,6 +33,7 @@ /* Local headers. */ #include "cycle.h" +#include "const.h" #include "lock.h" #include "task.h" #include "part.h" @@ -485,6 +486,56 @@ void runner_dosort ( struct runner_thread *rt , struct cell *c , int flags ) { #endif } + + +/** + * @brief Intermediate task between density and force + * + * @param rt The runner thread. + * @param ci THe cell. + */ + +void runner_doghost ( struct runner_thread *r , struct cell *c ) { + + struct part *p; + int i, k; + TIMER_TIC + + /* If this cell has progeny, don't bother. */ + if ( c->split ) + return; + + /* Loop over the parts in this cell. */ + for ( i = 0 ; i < c->count ; i++ ) { + + /* Get a direct pointer on the part. */ + p = &c->parts[i]; + + /* Reset the acceleration. */ + for ( k = 0 ; k < 3 ; k++ ) + p->a[k] = 0.0f; + + /* Reset the time derivatives. */ + p->u_dt = 0.0f; + p->h_dt = 0.0f; + + /* Compute the pressure. */ + p->P = p->rho * p->u * ( const_gamma - 1.0f ); + + /* Compute the P/Omega/rho2. */ + p->POrho2 = p->u * ( const_gamma - 1.0f ) / ( p->rho + p->h * p->rho_dh / 3.0f ); + + } + + #ifdef TIMER_VERBOSE + printf( "runner_doghost[%02i]: %i parts at depth %i took %.3f ms.\n" , + rt->id , c->count , c->depth , + ((double)TIMER_TOC(runner_timer_doghost)) / CPU_TPS * 1000 ); fflush(stdout); + #else + TIMER_TOC(runner_timer_doghost); + #endif + + } /** @@ -677,6 +728,7 @@ void *runner_main ( void *data ) { cell_unlocktree( cj ); break; case task_type_ghost: + runner_doghost( rt , ci ); break; default: error( "Unknown task type." ); diff --git a/src/runner.h b/src/runner.h index 535c416b47f7f4140dcba16f4e00e4a4f17af8d5..505e7561a6c7159f9db22b748f67c5f1fd873c7d 100644 --- a/src/runner.h +++ b/src/runner.h @@ -33,9 +33,13 @@ enum { runner_timer_none = 0, runner_timer_dosort, - runner_timer_doself, - runner_timer_dopair, - runner_timer_dosub, + runner_timer_doself_density, + runner_timer_doself_force, + runner_timer_dopair_density, + runner_timer_dopair_force, + runner_timer_dosub_density, + runner_timer_dosub_force, + runner_timer_doghost, runner_timer_getpair, runner_timer_steal, runner_timer_stalled, @@ -155,6 +159,7 @@ struct runner { /* Function prototypes. */ void runner_run ( struct runner *r , int sort_queues ); +void runner_doghost ( struct runner_thread *rt , struct cell *c ); void runner_dopair_density ( struct runner_thread *rt , struct cell *ci , struct cell *cj ); void runner_doself_density ( struct runner_thread *rt , struct cell *c ); void runner_dosub_density ( struct runner_thread *rt , struct cell *ci , struct cell *cj , int flags ); diff --git a/src/runner_doiact.h b/src/runner_doiact.h index a034e9d8641791b26fd2a444bf4204732faf8cd2..52039b151a43d7f7d5ca1c8ca5243e280bd98be1 100644 --- a/src/runner_doiact.h +++ b/src/runner_doiact.h @@ -41,6 +41,15 @@ #define IACT2(f) PASTE(runner_iact,f) #define IACT IACT2(FUNCTION) +#define TIMER_DOSELF2(f) PASTE(runner_timer_doself,f) +#define TIMER_DOSELF TIMER_DOSELF2(FUNCTION) + +#define TIMER_DOPAIR2(f) PASTE(runner_timer_dopair,f) +#define TIMER_DOPAIR TIMER_DOPAIR2(FUNCTION) + +#define TIMER_DOSUB2(f) PASTE(runner_timer_dosub,f) +#define TIMER_DOSUB TIMER_DOSUB2(FUNCTION) + /** * @brief Compute the interactions between a cell pair. @@ -56,7 +65,8 @@ void DOPAIR_NAIVE ( struct runner_thread *rt , struct cell *ci , struct cell *cj int pid, pjd, k, count_i = ci->count, count_j = cj->count; double shift[3] = { 0.0 , 0.0 , 0.0 }; struct part *pi, *pj, *parts_i = ci->parts, *parts_j = cj->parts; - double dx[3], pix[3], hi, hi2, r2; + double pix[3]; + float dx[3], hi, hi2, r2; TIMER_TIC /* Get the relative distance between the pairs, wrapping. */ @@ -98,7 +108,7 @@ void DOPAIR_NAIVE ( struct runner_thread *rt , struct cell *ci , struct cell *cj /* Hit or miss? */ if ( r2 < hi2 || r2 < pj->h*pj->h ) { - IACT( r2 , hi , pj->h , pi , pj ); + IACT( r2 , dx , hi , pj->h , pi , pj ); } @@ -107,9 +117,9 @@ void DOPAIR_NAIVE ( struct runner_thread *rt , struct cell *ci , struct cell *cj } /* loop over the parts in ci. */ #ifdef TIMER_VERBOSE - printf( "runner_dopair_naive[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , rt->id , count_i , count_j , ci->depth , ci->r_max , cj->r_max , ((double)TIMER_TOC(runner_timer_dopair)) / CPU_TPS * 1000 ); + printf( "runner_dopair_naive[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , rt->id , count_i , count_j , ci->depth , ci->r_max , cj->r_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 ); #else - TIMER_TOC(runner_timer_dopair); + TIMER_TOC(TIMER_DOPAIR); #endif @@ -132,7 +142,8 @@ void DOPAIR ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) { struct cell *temp; struct entry *sort_i, *sort_j; struct part *pi, *pj, *parts_i, *parts_j; - double dx[3], pix[3], pjx[3], hi, hi2, hj, hj2, r2, di, dj; + double pix[3], pjx[3], di, dj; + float dx[3], hi, hi2, hj, hj2, r2; double hi_max, hj_max, di_max, dj_min; int count_i, count_j; TIMER_TIC @@ -214,7 +225,7 @@ void DOPAIR ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) { /* Hit or miss? */ if ( r2 < hi2 ) { - IACT( r2 , hi , pj->h , pi , pj ); + IACT( r2 , dx , hi , pj->h , pi , pj ); } @@ -255,7 +266,7 @@ void DOPAIR ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) { /* Hit or miss? */ if ( r2 < hj2 && r2 > pi->h*pi->h ) { - IACT( r2 , pi->h , hj , pi , pj ); + IACT( r2 , dx , pi->h , hj , pi , pj ); } @@ -264,9 +275,9 @@ void DOPAIR ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) { } /* loop over the parts in ci. */ #ifdef TIMER_VERBOSE - printf( "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) took %.3f ms.\n" , rt->id , count_i , count_j , ci->depth , ci->r_max , cj->r_max , fmax(ci->h[0],fmax(ci->h[1],ci->h[2])) , ((double)(TIMER_TOC(runner_timer_dopair))) / CPU_TPS * 1000 ); + printf( "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) took %.3f ms.\n" , rt->id , count_i , count_j , ci->depth , ci->r_max , cj->r_max , fmax(ci->h[0],fmax(ci->h[1],ci->h[2])) , ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000 ); #else - TIMER_TOC(runner_timer_dopair); + TIMER_TOC(TIMER_DOPAIR); #endif } @@ -282,7 +293,8 @@ void DOPAIR ( struct runner_thread *rt , struct cell *ci , struct cell *cj ) { void DOSELF ( struct runner_thread *rt , struct cell *c ) { int k, pid, pjd, count = c->count; - double pix[3], dx[3], hi, hi2, r2; + double pix[3]; + float dx[3], hi, hi2, r2; struct part *pi, *pj, *parts = c->parts; TIMER_TIC @@ -317,7 +329,7 @@ void DOSELF ( struct runner_thread *rt , struct cell *c ) { /* Hit or miss? */ if ( r2 < hi2 || r2 < pj->h*pj->h ) { - IACT( r2 , hi , pj->h , pi , pj ); + IACT( r2 , dx , hi , pj->h , pi , pj ); } @@ -326,9 +338,9 @@ void DOSELF ( struct runner_thread *rt , struct cell *c ) { } /* loop over all particles. */ #ifdef TIMER_VERBOSE - printf( "runner_doself[%02i]: %i parts at depth %i took %.3f ms.\n" , rt->id , count , c->depth , ((double)TIMER_TOC(runner_timer_doself)) / CPU_TPS * 1000 ); + printf( "runner_doself[%02i]: %i parts at depth %i took %.3f ms.\n" , rt->id , count , c->depth , ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000 ); #else - TIMER_TOC(runner_timer_doself); + TIMER_TOC(TIMER_DOSELF); #endif } @@ -533,9 +545,9 @@ void DOSUB ( struct runner_thread *rt , struct cell *ci , struct cell *cj , int #ifdef TIMER_VERBOSE - printf( "runner_dosub[%02i]: flags=%i at depth %i took %.3f ms.\n" , rt->id , flags , ci->depth , ((double)TIMER_TOC(runner_timer_dosub)) / CPU_TPS * 1000 ); + printf( "runner_dosub[%02i]: flags=%i at depth %i took %.3f ms.\n" , rt->id , flags , ci->depth , ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000 ); #else - TIMER_TOC(runner_timer_dosub); + TIMER_TOC(TIMER_DOSUB); #endif } diff --git a/src/runner_iact.h b/src/runner_iact.h index d34296a5d1c79e9a0bea8f0cf3bdb14f320343c5..0fb85d8d1e1c39aa4675a0a7916372a2a68240e9 100644 --- a/src/runner_iact.h +++ b/src/runner_iact.h @@ -16,59 +16,88 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. * ******************************************************************************/ + +/* Coefficients for the kernel. */ + +#define kernel_degree 3 +#define kernel_ivals 2 +#define kernel_gamma 0.5f +#define kernel_igamma 2.0f +#define kernel_igamma3 8.0f +#define kernel_igamma4 16.0f +static float kernel_coeffs[ (kernel_degree + 1) * (kernel_ivals + 1) ] = + { 3.0/4.0*M_1_PI , -3.0/2.0*M_1_PI , 0.0 , M_1_PI , + -0.25*M_1_PI , 3.0/2.0*M_1_PI , -3.0*M_1_PI , M_2_PI , + 0.0 , 0.0 , 0.0 , 0.0 }; +#define kernel_root(h) ( 1.0f/((h)*(h)*(h))*kernel_igamma3*kernel_coeffs[ kernel_degree ] ) + + +/** + * @brief Helper function to evaluate the kernel at a given x. + */ + +__attribute__ ((always_inline)) INLINE static void kernel_deval ( float x , float *W , float *dW_dx ) { + int ind = fmin( x , kernel_ivals ); + float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ]; + float w = coeffs[0]*x + coeffs[1]; + float dw_dx = coeffs[0]; + for ( int k = 2 ; k <= kernel_degree ; k++ ) { + dw_dx = dw_dx*x + w; + w = x*w + coeffs[k]; + } + *W = w; + *dW_dx = dw_dx; + } + + +__attribute__ ((always_inline)) INLINE static void kernel_eval ( float x , float *W ) { + int ind = fmin( x , kernel_ivals ); + float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ]; + float w = coeffs[0]*x + coeffs[1]; + for ( int k = 2 ; k <= kernel_degree ; k++ ) + w = x*w + coeffs[k]; + *W = w; + } -__attribute__ ((always_inline)) INLINE void runner_iact_density ( float r2 , float hi , float hj , struct part *pi , struct part *pj ) { +/** + * @brief Density kernel + */ - #define KERNEL_COEFF_1 2.546479089470f - #define KERNEL_COEFF_2 15.278874536822f - #define KERNEL_COEFF_3 45.836623610466f - #define KERNEL_COEFF_4 30.557749073644f - #define KERNEL_COEFF_5 5.092958178941f - #define KERNEL_COEFF_6 (-15.278874536822f) - #define NORM_COEFF 4.188790204786f +__attribute__ ((always_inline)) INLINE static void runner_iact_density ( float r2 , float *dx , float hi , float hj , struct part *pi , struct part *pj ) { float r = sqrtf( r2 ); - float ui, uj, wi, wj; - float ui_dh, uj_dh, wi_dh, wj_dh; + float xi, xj; + float hg_inv, hg2_inv; + float wi, wj, wi_dx, wj_dx; if ( r2 < hi*hi && pi != NULL ) { - ui = r / hi; - ui_dh = -r / hi / hi; - if ( ui < 0.5f ) { - wi = KERNEL_COEFF_1 + KERNEL_COEFF_2 * (ui - 1.0f) * ui * ui; - wi_dh = KERNEL_COEFF_2 * ui_dh * ui * ui - + 2 * KERNEL_COEFF_2 * (ui - 1.0f) * ui_dh * ui; - } - else { - wi = KERNEL_COEFF_5 * (1.0f - ui) * (1.0f - ui) * (1.0f - ui); - wi_dh = -3 * KERNEL_COEFF_5 * ui_dh * (1.0f - ui) * (1.0f - ui); - } - pi->rho += NORM_COEFF * wi; - pi->rho_dh += NORM_COEFF * wi_dh; + hg_inv = kernel_igamma / hi; + hg2_inv = hg_inv * hg_inv; + xi = r * hg_inv; + kernel_deval( xi , &wi , &wi_dx ); + + pi->rho += pj->mass * hg_inv * hg2_inv * wi; + pi->rho_dh += -pj->mass * hg2_inv * hg2_inv * ( 3.0*wi + xi*wi_dx ); + pi->wcount += wi * 4.0f * M_PI / 3.0f * kernel_igamma3; pi->icount += 1; } if ( r2 < hj*hj && pj != NULL ) { - uj = r / hj; - uj_dh = -r / hj / hj; - if ( uj < 0.5f ) { - wj = KERNEL_COEFF_1 + KERNEL_COEFF_2 * (uj - 1.0f) * uj * uj; - wj_dh = KERNEL_COEFF_2 * uj_dh * uj * uj - + 2 * KERNEL_COEFF_2 * (uj - 1.0f) * uj_dh * uj; - } - else { - wj = KERNEL_COEFF_5 * (1.0f - uj) * (1.0f - uj) * (1.0f - uj); - wj_dh = -3 * KERNEL_COEFF_5 * uj_dh * (1.0f - uj) * (1.0f - uj); - } - pj->rho += NORM_COEFF * wj; - pj->rho_dh += NORM_COEFF * wj_dh; + hg_inv = kernel_igamma / hj; + hg2_inv = hg_inv * hg_inv; + xj = r * hg_inv; + kernel_deval( xj , &wj , &wj_dx ); + + pj->rho += pi->mass * hg_inv * hg2_inv * wj; + pj->rho_dh += -pi->mass * hg2_inv * hg2_inv * ( 3.0*wj + xj*wj_dx ); + pj->wcount += wj * 4.0f * M_PI / 3.0f * kernel_igamma3; pj->icount += 1; - + } #ifdef HIST @@ -82,15 +111,57 @@ __attribute__ ((always_inline)) INLINE void runner_iact_density ( float r2 , flo -__attribute__ ((always_inline)) INLINE void runner_iact_force ( float r2 , float hi , float hj , struct part *pi , struct part *pj ) { +__attribute__ ((always_inline)) INLINE static void runner_iact_force ( float r2 , float *dx , float hi , float hj , struct part *pi , struct part *pj ) { - #define KERNEL_COEFF_1 2.546479089470f - #define KERNEL_COEFF_2 15.278874536822f - #define KERNEL_COEFF_3 45.836623610466f - #define KERNEL_COEFF_4 30.557749073644f - #define KERNEL_COEFF_5 5.092958178941f - #define KERNEL_COEFF_6 (-15.278874536822f) - #define NORM_COEFF 4.188790204786f + float r = sqrtf( r2 ), ri = 1.0f / r; + float xi, xj; + float hig_inv, hig2_inv; + float hjg_inv, hjg2_inv; + float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr; + float f; + int k; + + /* Get the kernel for hi. */ + hig_inv = kernel_igamma / hi; + hig2_inv = hig_inv * hig_inv; + xi = r * hig_inv; + kernel_deval( xi , &wi , &wi_dx ); + wi_dr = hig2_inv * hig2_inv * wi_dx; + + /* Get the kernel for hj. */ + hjg_inv = kernel_igamma / hj; + hjg2_inv = hjg_inv * hjg_inv; + xj = r * hjg_inv; + kernel_deval( xj , &wj , &wj_dx ); + wj_dr = hjg2_inv *hjg2_inv * wj_dx; + + /* Get the common factor out. */ + w = ri * ( pi->POrho2 * wi_dr + pj->POrho2 * wj_dr ); + + /* Use the force, Luke! */ + for ( k = 0 ; k < 3 ; k++ ) { + f = dx[k] * w; + pi->a[k] += pj->mass * f; + pj->a[k] += -pi->mass * f; + } + + /* Compute dv dot r. */ + dvdr = ( pi->v[0] - pj->v[0] ) * dx[0] + ( pi->v[1] - pj->v[1] ) * dx[1] + ( pi->v[2] - pj->v[2] ) * dx[2]; + + /* Get the time derivative for u. */ + pi->u_dt += pj->mass * dvdr * wi_dr; + pj->u_dt += pi->mass * dvdr * wj_dr; + + /* Get the time derivative for h. */ + pi->h_dt += pj->mass / pj->rho * dvdr * wi_dr; + pj->h_dt += pi->mass / pi->rho * dvdr * wj_dr; + + #ifdef HIST + if ( hi > hj ) + runner_hist_hit( hi / hj ); + else + runner_hist_hit( hj / hi ); + #endif } diff --git a/src/space.c b/src/space.c index 54037106a8880dd8f6d7916fd12711df634f6628..6a3203ee0601aac09cb109b91890d9911866367d 100644 --- a/src/space.c +++ b/src/space.c @@ -960,6 +960,17 @@ void space_maketasks ( struct space *s , int do_sort ) { task_addunlock( t->cj->ghost , t2 ); } + /* Otherwise, sub interaction? */ + else if ( t->type == task_type_sub && t->subtype == task_subtype_density ) { + task_addunlock( t , t->ci->ghost ); + if ( t->cj != NULL ) + task_addunlock( t , t->cj->ghost ); + t2 = space_addtask( s , task_type_sub , task_subtype_force , t->flags , 0 , t->ci , t->cj , NULL , 0 , NULL , 0 ); + task_addunlock( t->ci->ghost , t2 ); + if ( t->cj != NULL ) + task_addunlock( t->cj->ghost , t2 ); + } + } /* Did we already create indices? */