diff --git a/src/Makefile.am b/src/Makefile.am index 8ea80c5b54e466e32b7fc5943222644e7fe6450c..2b5137582eb21cacd5888ad2b06886f5498c224d 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -22,12 +22,13 @@ AUTOMAKE_OPTIONS=gnu # Add the debug flag to the whole thing AM_CFLAGS = -g -O3 -std=gnu99 -Wall -Werror -ffast-math -fstrict-aliasing \ -ftree-vectorize -funroll-loops $(SIMD_FLAGS) $(OPENMP_CFLAGS) \ - -DTIMER -DCOUNTER -DCPU_TPS=2.30e9 + -DTIMER -DCOUNTER -DCPU_TPS=2.30e9 \ + # -fsanitize=address -fno-omit-frame-pointer # AM_CFLAGS = -Wall -Werror $(OPENMP_CFLAGS) \ # -DTIMER -DCOUNTER -DCPU_TPS=2.67e9 # Assign a "safe" version number -AM_LDFLAGS = $(LAPACK_LIBS) $(BLAS_LIBS) $(HDF5_LDFLAGS) -version-info 0:0:0 +AM_LDFLAGS = $(LAPACK_LIBS) $(BLAS_LIBS) $(HDF5_LDFLAGS) -version-info 0:0:0 # -fsanitize=address # Build the libswiftsim library lib_LTLIBRARIES = libswiftsim.la @@ -38,11 +39,13 @@ endif # List required headers include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \ - engine.h swift.h serial_io.h timers.h debug.h scheduler.h proxy.h parallel_io.h common_io.h + engine.h swift.h serial_io.h timers.h debug.h scheduler.h proxy.h parallel_io.h \ + common_io.h multipole.h # Common source files AM_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \ - serial_io.c timers.c debug.c scheduler.c proxy.c parallel_io.c units.c common_io.c + serial_io.c timers.c debug.c scheduler.c proxy.c parallel_io.c \ + units.c common_io.c multipole.c # Sources and flags for regular library libswiftsim_la_SOURCES = $(AM_SOURCES) diff --git a/src/cell.c b/src/cell.c index 5e23afed5280534650847be30a6e16d35e04142d..3a0e99fd5e3a489734e05998d1cb32155e620544 100644 --- a/src/cell.c +++ b/src/cell.c @@ -47,6 +47,7 @@ #include "timers.h" #include "part.h" #include "space.h" +#include "multipole.h" #include "cell.h" #include "error.h" #include "inline.h" @@ -265,6 +266,72 @@ int cell_locktree( struct cell *c ) { } +int cell_glocktree( struct cell *c ) { + + struct cell *finger, *finger2; + TIMER_TIC + + /* First of all, try to lock this cell. */ + if ( c->ghold || lock_trylock( &c->glock ) != 0 ) { + TIMER_TOC(timer_locktree); + return 1; + } + + /* Did somebody hold this cell in the meantime? */ + if ( c->ghold ) { + + /* Unlock this cell. */ + if ( lock_unlock( &c->glock ) != 0 ) + error( "Failed to unlock cell." ); + + /* Admit defeat. */ + TIMER_TOC(timer_locktree); + return 1; + + } + + /* Climb up the tree and lock/hold/unlock. */ + for ( finger = c->parent ; finger != NULL ; finger = finger->parent ) { + + /* Lock this cell. */ + if ( lock_trylock( &finger->glock ) != 0 ) + break; + + /* Increment the hold. */ + __sync_fetch_and_add( &finger->ghold , 1 ); + + /* Unlock the cell. */ + if ( lock_unlock( &finger->glock ) != 0 ) + error( "Failed to unlock cell." ); + + } + + /* If we reached the top of the tree, we're done. */ + if ( finger == NULL ) { + TIMER_TOC(timer_locktree); + return 0; + } + + /* Otherwise, we hit a snag. */ + else { + + /* Undo the holds up to finger. */ + for ( finger2 = c->parent ; finger2 != finger ; finger2 = finger2->parent ) + __sync_fetch_and_sub( &finger2->ghold , 1 ); + + /* Unlock this cell. */ + if ( lock_unlock( &c->glock ) != 0 ) + error( "Failed to unlock cell." ); + + /* Admit defeat. */ + TIMER_TOC(timer_locktree); + return 1; + + } + + } + + /** * @brief Unock a cell's parents. * @@ -289,6 +356,24 @@ void cell_unlocktree( struct cell *c ) { } +void cell_gunlocktree( struct cell *c ) { + + struct cell *finger; + TIMER_TIC + + /* First of all, try to unlock this cell. */ + if ( lock_unlock( &c->glock ) != 0 ) + error( "Failed to unlock cell." ); + + /* Climb up the tree and unhold the parents. */ + for ( finger = c->parent ; finger != NULL ; finger = finger->parent ) + __sync_fetch_and_sub( &finger->ghold , 1 ); + + TIMER_TOC(timer_locktree); + + } + + /** * @brief Sort the parts into eight bins along the given pivots. * @@ -297,20 +382,21 @@ void cell_unlocktree( struct cell *c ) { void cell_split ( struct cell *c ) { - int i, j, k; + int i, j, k, count = c->count, gcount = c->gcount; struct part temp, *parts = c->parts; struct xpart xtemp, *xparts = c->xparts; + struct gpart gtemp, *gparts = c->gparts; int left[8], right[8]; double pivot[3]; - /* Init the pivot. */ + /* Init the pivots. */ for ( k = 0 ; k < 3 ; k++ ) pivot[k] = c->loc[k] + c->h[k]/2; /* Split along the x-axis. */ - i = 0; j = c->count - 1; + i = 0; j = count - 1; while ( i <= j ) { - while ( i <= c->count-1 && parts[i].x[0] <= pivot[0] ) + while ( i <= count-1 && parts[i].x[0] <= pivot[0] ) i += 1; while ( j >= 0 && parts[j].x[0] > pivot[0] ) j -= 1; @@ -322,10 +408,10 @@ void cell_split ( struct cell *c ) { /* for ( k = 0 ; k <= j ; k++ ) if ( parts[k].x[0] > pivot[0] ) error( "cell_split: sorting failed." ); - for ( k = i ; k < c->count ; k++ ) + for ( k = i ; k < count ; k++ ) if ( parts[k].x[0] < pivot[0] ) error( "cell_split: sorting failed." ); */ - left[1] = i; right[1] = c->count - 1; + left[1] = i; right[1] = count - 1; left[0] = 0; right[0] = j; /* Split along the y axis, twice. */ @@ -387,13 +473,17 @@ void cell_split ( struct cell *c ) { c->progeny[k]->xparts = &c->xparts[ left[k] ]; } + /* Re-link the gparts. */ + for ( k = 0 ; k < count ; k++ ) + parts[k].gpart->part = &parts[k]; + /* Verify that _all_ the parts have been assigned to a cell. */ /* for ( k = 1 ; k < 8 ; k++ ) if ( &c->progeny[k-1]->parts[ c->progeny[k-1]->count ] != c->progeny[k]->parts ) error( "Particle sorting failed (internal consistency)." ); if ( c->progeny[0]->parts != c->parts ) error( "Particle sorting failed (left edge)." ); - if ( &c->progeny[7]->parts[ c->progeny[7]->count ] != &c->parts[ c->count ] ) + if ( &c->progeny[7]->parts[ c->progeny[7]->count ] != &c->parts[ count ] ) error( "Particle sorting failed (right edge)." ); */ /* Verify a few sub-cells. */ @@ -413,6 +503,65 @@ void cell_split ( struct cell *c ) { c->progeny[2]->parts[k].x[2] > pivot[2] ) error( "Sorting failed (progeny=2)." ); */ + /* Now do the same song and dance for the gparts. */ + + /* Split along the x-axis. */ + i = 0; j = gcount - 1; + while ( i <= j ) { + while ( i <= gcount-1 && gparts[i].x[0] <= pivot[0] ) + i += 1; + while ( j >= 0 && gparts[j].x[0] > pivot[0] ) + j -= 1; + if ( i < j ) { + gtemp = gparts[i]; gparts[i] = gparts[j]; gparts[j] = gtemp; + } + } + left[1] = i; right[1] = gcount - 1; + left[0] = 0; right[0] = j; + + /* Split along the y axis, twice. */ + for ( k = 1 ; k >= 0 ; k-- ) { + i = left[k]; j = right[k]; + while ( i <= j ) { + while ( i <= right[k] && gparts[i].x[1] <= pivot[1] ) + i += 1; + while ( j >= left[k] && gparts[j].x[1] > pivot[1] ) + j -= 1; + if ( i < j ) { + gtemp = gparts[i]; gparts[i] = gparts[j]; gparts[j] = gtemp; + } + } + left[2*k+1] = i; right[2*k+1] = right[k]; + left[2*k] = left[k]; right[2*k] = j; + } + + /* Split along the z axis, four times. */ + for ( k = 3 ; k >= 0 ; k-- ) { + i = left[k]; j = right[k]; + while ( i <= j ) { + while ( i <= right[k] && gparts[i].x[2] <= pivot[2] ) + i += 1; + while ( j >= left[k] && gparts[j].x[2] > pivot[2] ) + j -= 1; + if ( i < j ) { + gtemp = gparts[i]; gparts[i] = gparts[j]; gparts[j] = gtemp; + } + } + left[2*k+1] = i; right[2*k+1] = right[k]; + left[2*k] = left[k]; right[2*k] = j; + } + + /* Store the counts and offsets. */ + for ( k = 0 ; k < 8 ; k++ ) { + c->progeny[k]->gcount = right[k] - left[k] + 1; + c->progeny[k]->gparts = &c->gparts[ left[k] ]; + } + + /* Re-link the parts. */ + for ( k = 0 ; k < gcount ; k++ ) + if ( gparts[k].id > 0 ) + gparts[k].part->gpart = &gparts[k]; + } diff --git a/src/cell.h b/src/cell.h index 530eb5ae5b09a2bccd72736e88110387205deae3..0b4828efbef4e297cb765e98c193bec901e370ce 100644 --- a/src/cell.h +++ b/src/cell.h @@ -67,7 +67,7 @@ struct cell { int depth, split, maxdepth; /* Nr of parts. */ - int count; + int count, gcount; /* Pointers to the particle data. */ struct part *parts; @@ -75,12 +75,12 @@ struct cell { /* Pointers to the extra particle data. */ struct xpart *xparts; - /* Pointers for the sorted indices. */ - struct entry *sort; - unsigned int sorted; + /* Pointers to the gravity particle data. */ + struct gpart *gparts; - /* Number of pairs associated with this cell. */ - // int nr_pairs; + /* Pointers for the sorted indices. */ + struct entry *sort, *gsort; + unsigned int sorted, gsorted; /* Pointers to the next level of cells. */ struct cell *progeny[8]; @@ -91,13 +91,13 @@ struct cell { /* Super cell, i.e. the highest-level supercell that has interactions. */ struct cell *super; - /* The tasks computing this cell's sorts. */ - struct task *sorts; - int sortsize; + /* The task computing this cell's sorts. */ + struct task *sorts, *gsorts; + int sortsize, gsortsize; /* The tasks computing this cell's density. */ - struct link *density, *force; - int nr_density, nr_force; + struct link *density, *force, *grav; + int nr_density, nr_force, nr_grav; /* The ghost task to link density to interactions. */ struct task *ghost, *kick1, *kick2; @@ -105,17 +105,17 @@ struct cell { /* Task receiving data. */ struct task *recv_xv, *recv_rho; + /* Tasks for gravity tree. */ + struct task *grav_up, *grav_down; + /* Number of tasks that are associated with this cell. */ int nr_tasks; - /* Number of tasks this cell is waiting for and whether it is in use. */ - int wait; - /* Is the data of this cell being used in a sub-cell? */ - int hold; + int hold, ghold; /* Spin lock for various uses. */ - lock_type lock; + lock_type lock, glock; /* ID of the previous owner, e.g. runner. */ int owner; @@ -143,6 +143,9 @@ struct cell { int pcell_size; int tag; + /* This cell's multipole. */ + struct multipole multipole; + } __attribute__((aligned (64))); @@ -150,6 +153,8 @@ struct cell { void cell_split ( struct cell *c ); int cell_locktree( struct cell *c ); void cell_unlocktree( struct cell *c ); +int cell_glocktree( struct cell *c ); +void cell_gunlocktree( struct cell *c ); int cell_pack ( struct cell *c , struct pcell *pc ); int cell_unpack ( struct pcell *pc , struct cell *c , struct space *s ); int cell_getsize ( struct cell *c ); diff --git a/src/const.h b/src/const.h index 70271a4775fb2c6fcb86fceeb6c27a2ae0797a51..7d6f5c12e67abc4b5fe3719aea496088e97ddd58 100644 --- a/src/const.h +++ b/src/const.h @@ -41,6 +41,18 @@ #define const_delta_nwneigh 1.f #define CUBIC_SPLINE_KERNEL +/* Gravity stuff. */ +#define const_theta_max 0.57735f /* Opening criteria, which is the ratio of the + cell distance over the cell width. */ +#define const_G 6.67384e-8f /* Gravitational constant. */ +#define const_epsilon 0.0014f /* Gravity blending distance. */ +#define const_iepsilon 714.285714286f /* Inverse gravity blending distance. */ +#define const_iepsilon2 (const_iepsilon*const_iepsilon) +#define const_iepsilon3 (const_iepsilon2*const_iepsilon) +#define const_iepsilon4 (const_iepsilon2*const_iepsilon2) +#define const_iepsilon5 (const_iepsilon3*const_iepsilon2) +#define const_iepsilon6 (const_iepsilon3*const_iepsilon3) + /* SPH variant to use */ #define LEGACY_GADGET2_SPH diff --git a/src/debug.c b/src/debug.c index 4cef5bfcdeba7308c09fe49a8339d0cc4878a865..75c726bfced9f9b5f1b09aa276b92fbc06ae3882 100644 --- a/src/debug.c +++ b/src/debug.c @@ -34,44 +34,74 @@ * * (Should be used for debugging only as it runs in O(N).) */ + void printParticle ( struct part *parts , long long int id, int N ) { - int i; + int i, found = 0; /* Look for the particle. */ - for ( i = 0 ; i < N && parts[i].id != id; i++ ); + for ( i = 0 ; i < N ; i++ ) + if ( parts[i].id == id ) { + printf("## Particle[%d]: id=%lld, x=[%.16e,%.16e,%.16e], v=[%.3e,%.3e,%.3e], a=[%.3e,%.3e,%.3e], h=%.3e, h_dt=%.3e, wcount=%.3e, m=%.3e, rho=%.3e, rho_dh=%.3e, div_v=%.3e, u=%.3e, dudt=%.3e, bals=%.3e, POrho2=%.3e, v_sig=%.3e, dt=%.3e\n", + i, + parts[i].id, + parts[i].x[0], parts[i].x[1], parts[i].x[2], + parts[i].v[0], parts[i].v[1], parts[i].v[2], + parts[i].a[0], parts[i].a[1], parts[i].a[2], + parts[i].h, + parts[i].force.h_dt, + parts[i].density.wcount, + parts[i].mass, + parts[i].rho, parts[i].rho_dh, + parts[i].density.div_v, + parts[i].u, + parts[i].force.u_dt, + parts[i].force.balsara, + parts[i].force.POrho2, + parts[i].force.v_sig, + parts[i].dt + ); + found = 1; + } + + if ( !found ) + printf("## Particles[???] id=%lld not found\n", id); + + } + - if ( i < N ) - printf("## Particle[%d]: id=%lld, x=[%.16e,%.16e,%.16e], v=[%.3e,%.3e,%.3e], a=[%.3e,%.3e,%.3e], h=%.3e, h_dt=%.3e, wcount=%.3e, m=%.3e, rho=%.3e, rho_dh=%.3e, div_v=%.3e, u=%.3e, dudt=%.3e, bals=%.3e, POrho2=%.3e, v_sig=%.3e, dt=%.3e\n", - i, - parts[i].id, - parts[i].x[0], parts[i].x[1], parts[i].x[2], - parts[i].v[0], parts[i].v[1], parts[i].v[2], - parts[i].a[0], parts[i].a[1], parts[i].a[2], - parts[i].h, - parts[i].force.h_dt, - parts[i].density.wcount, - parts[i].mass, - parts[i].rho, parts[i].rho_dh, - parts[i].density.div_v, - parts[i].u, - parts[i].force.u_dt, - parts[i].force.balsara, - parts[i].force.POrho2, - parts[i].force.v_sig, - parts[i].dt - ); - else +void printgParticle ( struct gpart *parts , long long int id, int N ) { + + int i, found = 0; + + /* Look for the particle. */ + for ( i = 0 ; i < N ; i++ ) + if ( parts[i].id == -id || ( parts[i].id > 0 && parts[i].part->id == id ) ) { + printf("## gParticle[%d]: id=%lld, x=[%.16e,%.16e,%.16e], v=[%.3e,%.3e,%.3e], a=[%.3e,%.3e,%.3e], m=%.3e, dt=%.3e\n", + i, + (parts[i].id < 0) ? -parts[i].id : parts[i].part->id , + parts[i].x[0], parts[i].x[1], parts[i].x[2], + parts[i].v[0], parts[i].v[1], parts[i].v[2], + parts[i].a[0], parts[i].a[1], parts[i].a[2], + parts[i].mass, + parts[i].dt + ); + found = 1; + } + + if ( !found ) printf("## Particles[???] id=%lld not found\n", id); } + /** * @brief Prints the details of a given particle to stdout * * @param p The particle to print * */ + void printParticle_single ( struct part *p ) { printf("## Particle: id=%lld, x=[%e,%e,%e], v=[%.3e,%.3e,%.3e], a=[%.3e,%.3e,%.3e], h=%.3e, h_dt=%.3e, wcount=%.3e, m=%.3e, rho=%.3e, rho_dh=%.3e, div_v=%.3e, u=%.3e, dudt=%.3e, bals=%.3e, POrho2=%.3e, v_sig=%.3e, dt=%.3e\n", diff --git a/src/debug.h b/src/debug.h index ffd4699a816d5dce16667ed6ae917a9662caa056..5db731a857ef32792c4f0a377eb97e475ab6b782 100644 --- a/src/debug.h +++ b/src/debug.h @@ -21,4 +21,5 @@ void printParticle(struct part *parts, long long int i, int N); +void printgParticle(struct gpart *parts, long long int i, int N); void printParticle_single ( struct part *p ); diff --git a/src/engine.c b/src/engine.c index 0697fa926996e5c3725113f040ff646abdf5ed6a..3a40b91b968342afac18223761c446855491313b 100644 --- a/src/engine.c +++ b/src/engine.c @@ -54,6 +54,7 @@ #include "part.h" #include "debug.h" #include "space.h" +#include "multipole.h" #include "cell.h" #include "queue.h" #include "scheduler.h" @@ -551,6 +552,30 @@ void engine_repartition ( struct engine *e ) { #endif } + + +/** + * @brief Add up/down gravity tasks to a cell hierarchy. + * + * @param e The #engine. + * @param c The #cell + * @param up The upward gravity #task. + * @param down The downward gravity #task. + */ + +void engine_addtasks_grav ( struct engine *e , struct cell *c , struct task *up , struct task *down ) { + + /* Link the tasks to this cell. */ + c->grav_up = up; + c->grav_down = down; + + /* Recurse? */ + if ( c->split ) + for ( int k = 0 ; k < 8 ; k++ ) + if ( c->progeny[k] != NULL ) + engine_addtasks_grav( e , c->progeny[k] , up , down ); + + } /** @@ -887,6 +912,9 @@ void engine_maketasks ( struct engine *e ) { struct space *s = e->s; struct scheduler *sched = &e->sched; + struct cell *cells = s->cells; + int nr_cells = s->nr_cells; + int nodeID = e->nodeID; int i, j, k, ii, jj, kk, iii, jjj, kkk, cid, cjd, sid; int *cdim = s->cdim; struct task *t, *t2; @@ -900,12 +928,12 @@ void engine_maketasks ( struct engine *e ) { for ( j = 0 ; j < cdim[1] ; j++ ) for ( k = 0 ; k < cdim[2] ; k++ ) { cid = cell_getid( cdim , i , j , k ); - if ( s->cells[cid].count == 0 ) + if ( cells[cid].count == 0 ) continue; - ci = &s->cells[cid]; + ci = &cells[cid]; if ( ci->count == 0 ) continue; - if ( ci->nodeID == e->nodeID ) + if ( ci->nodeID == nodeID ) scheduler_addtask( sched , task_type_self , task_subtype_density , 0 , 0 , ci , NULL , 0 ); for ( ii = -1 ; ii < 2 ; ii++ ) { iii = i + ii; @@ -923,17 +951,24 @@ void engine_maketasks ( struct engine *e ) { continue; kkk = ( kkk + cdim[2] ) % cdim[2]; cjd = cell_getid( cdim , iii , jjj , kkk ); - cj = &s->cells[cjd]; + cj = &cells[cjd]; if ( cid >= cjd || cj->count == 0 || - ( ci->nodeID != e->nodeID && cj->nodeID != e->nodeID ) ) + ( ci->nodeID != nodeID && cj->nodeID != nodeID ) ) continue; sid = sortlistID[ (kk+1) + 3*( (jj+1) + 3*(ii+1) ) ]; - t = scheduler_addtask( sched , task_type_pair , task_subtype_density , sid , 0 , ci , cj , 1 ); + scheduler_addtask( sched , task_type_pair , task_subtype_density , sid , 0 , ci , cj , 1 ); } } } } - + + /* Add the gravity mm tasks. */ + for ( i = 0 ; i < nr_cells ; i++ ) { + scheduler_addtask( sched , task_type_grav_mm , task_subtype_none , -1 , 0 , &cells[i] , NULL , 0 ); + for ( j = i+1 ; j < nr_cells ; j++ ) + scheduler_addtask( sched , task_type_grav_mm , task_subtype_none , -1 , 0 , &cells[i] , &cells[j] , 0 ); + } + /* Split the tasks. */ scheduler_splittasks( sched ); @@ -944,27 +979,44 @@ void engine_maketasks ( struct engine *e ) { error( "Failed to allocate cell-task links." ); e->nr_links = 0; + /* Add the gravity up/down tasks at the top-level cells and push them down. */ + for ( k = 0 ; k < nr_cells ; k++ ) + if ( cells[k].nodeID == nodeID ) { + + /* Create tasks at top level. */ + struct task *up = scheduler_addtask( sched , task_type_grav_up , task_subtype_none , 0 , 0 , &cells[k] , NULL , 0 ); + struct task *down = scheduler_addtask( sched , task_type_grav_down , task_subtype_none , 0 , 0 , &cells[k] , NULL , 0 ); + + /* Push tasks down the cell hierarchy. */ + engine_addtasks_grav( e , &cells[k] , up , down ); + + } + /* Count the number of tasks associated with each cell and store the density tasks in each cell, and make each sort depend on the sorts of its progeny. */ // #pragma omp parallel for private(t,j) for ( k = 0 ; k < sched->nr_tasks ; k++ ) { + + /* Get the current task. */ t = &sched->tasks[k]; if ( t->skip ) continue; + + /* Link sort tasks together. */ if ( t->type == task_type_sort && t->ci->split ) for ( j = 0 ; j < 8 ; j++ ) if ( t->ci->progeny[j] != NULL && t->ci->progeny[j]->sorts != NULL ) { t->ci->progeny[j]->sorts->skip = 0; scheduler_addunlock( sched , t->ci->progeny[j]->sorts , t ); } + + /* Link density tasks to cells. */ if ( t->type == task_type_self ) { atomic_inc( &t->ci->nr_tasks ); if ( t->subtype == task_subtype_density ) { t->ci->density = engine_addlink( e , t->ci->density , t ); atomic_inc( &t->ci->nr_density ); - if ( t->ci->nr_density > 27*8 ) - error( "Density overflow." ); } } else if ( t->type == task_type_pair ) { @@ -975,8 +1027,6 @@ void engine_maketasks ( struct engine *e ) { atomic_inc( &t->ci->nr_density ); t->cj->density = engine_addlink( e , t->cj->density , t ); atomic_inc( &t->cj->nr_density ); - if ( t->ci->nr_density > 8*27 || t->cj->nr_density > 8*27 ) - error( "Density overflow." ); } } else if ( t->type == task_type_sub ) { @@ -989,21 +1039,33 @@ void engine_maketasks ( struct engine *e ) { if ( t->cj != NULL ) { t->cj->density = engine_addlink( e , t->cj->density , t ); atomic_inc( &t->cj->nr_density ); - if ( t->cj->nr_density > 8*27 ) - error( "Density overflow." ); + } + } } + + /* Link gravity multipole tasks to the up/down tasks. */ + if ( t->type == task_type_grav_mm || + ( t->type == task_type_sub && t->subtype == task_subtype_grav ) ) { + atomic_inc( &t->ci->nr_tasks ); + scheduler_addunlock( sched , t->ci->grav_up , t ); + scheduler_addunlock( sched , t , t->ci->grav_down ); + if ( t->cj != NULL && t->ci->grav_up != t->cj->grav_up ) { + scheduler_addunlock( sched , t->cj->grav_up , t ); + scheduler_addunlock( sched , t , t->cj->grav_down ); } } + } - /* Append a ghost task to each cell. */ - for ( k = 0 ; k < s->nr_cells ; k++ ) - engine_mkghosts( e , &s->cells[k] , NULL ); + /* Append a ghost task to each cell, and add kick2 tasks to the + super cells. */ + for ( k = 0 ; k < nr_cells ; k++ ) + engine_mkghosts( e , &cells[k] , NULL ); /* if ( e->policy & engine_policy_fixdt ) space_map_cells_pre( s , 1 , &scheduler_map_mkghosts_nokick1 , sched ); else space_map_cells_pre( s , 1 , &scheduler_map_mkghosts , sched ); */ - + /* Run through the tasks and make force tasks for each density task. Each force task depends on the cell ghosts and unlocks the kick2 task of its super-cell. */ @@ -1031,12 +1093,12 @@ void engine_maketasks ( struct engine *e ) { /* Otherwise, pair interaction? */ else if ( t->type == task_type_pair && t->subtype == task_subtype_density ) { t2 = scheduler_addtask( sched , task_type_pair , task_subtype_force , 0 , 0 , t->ci , t->cj , 0 ); - if ( t->ci->nodeID == e->nodeID ) { + if ( t->ci->nodeID == nodeID ) { scheduler_addunlock( sched , t , t->ci->super->ghost ); scheduler_addunlock( sched , t->ci->super->ghost , t2 ); scheduler_addunlock( sched , t2 , t->ci->super->kick2 ); } - if ( t->cj->nodeID == e->nodeID && t->ci->super != t->cj->super ) { + if ( t->cj->nodeID == nodeID && t->ci->super != t->cj->super ) { scheduler_addunlock( sched , t , t->cj->super->ghost ); scheduler_addunlock( sched , t->cj->super->ghost , t2 ); scheduler_addunlock( sched , t2 , t->cj->super->kick2 ); @@ -1050,12 +1112,12 @@ void engine_maketasks ( struct engine *e ) { /* Otherwise, sub interaction? */ else if ( t->type == task_type_sub && t->subtype == task_subtype_density ) { t2 = scheduler_addtask( sched , task_type_sub , task_subtype_force , t->flags , 0 , t->ci , t->cj , 0 ); - if ( t->ci->nodeID == e->nodeID ) { + if ( t->ci->nodeID == nodeID ) { scheduler_addunlock( sched , t , t->ci->super->ghost ); scheduler_addunlock( sched , t->ci->super->ghost , t2 ); scheduler_addunlock( sched , t2 , t->ci->super->kick2 ); } - if ( t->cj != NULL && t->cj->nodeID == e->nodeID && t->ci->super != t->cj->super ) { + if ( t->cj != NULL && t->cj->nodeID == nodeID && t->ci->super != t->cj->super ) { scheduler_addunlock( sched , t , t->cj->super->ghost ); scheduler_addunlock( sched , t->cj->super->ghost , t2 ); scheduler_addunlock( sched , t2 , t->cj->super->kick2 ); @@ -1068,6 +1130,10 @@ void engine_maketasks ( struct engine *e ) { } } + /* Kick2 tasks should rely on the grav_down tasks of their cell. */ + else if ( t->type == task_type_kick2 ) + scheduler_addunlock( sched , t->ci->grav_down , t ); + } /* Add the communication tasks if MPI is being used. */ @@ -1657,6 +1723,10 @@ void engine_step ( struct engine *e ) { (1 << task_type_kick2) | (1 << task_type_send) | (1 << task_type_recv) | + (1 << task_type_grav_pp) | + (1 << task_type_grav_mm) | + (1 << task_type_grav_up) | + (1 << task_type_grav_down) | (1 << task_type_link) ); TIMER_TOC(timer_runners); diff --git a/src/engine.h b/src/engine.h index 308ba34476fdd408c1af53285538394329b88af6..2685fbd33f6caa529e26204f86732e6bb163ea78 100644 --- a/src/engine.h +++ b/src/engine.h @@ -32,7 +32,7 @@ #define engine_policy_setaffinity 256 #define engine_queue_scale 1.2 -#define engine_maxtaskspercell 32 +#define engine_maxtaskspercell 128 #define engine_maxproxies 64 #define engine_tasksreweight 10 diff --git a/src/kernel.h b/src/kernel.h index f2ff4e329b42095f018c36b0878cb7bee54c47f9..c012739f300aeb5aeedd4b56798b00b2d7ed5cc9 100644 --- a/src/kernel.h +++ b/src/kernel.h @@ -23,11 +23,182 @@ /** * @file kernel.h - * @brief SPH kernel functions. Compute W(x,h) and the gradient of W(x,h). + * @brief SPH kernel functions. Compute W(x,h) and the gradient of W(x,h), + * as well as the blending function used for gravity. */ #include "vector.h" +/* Gravity kernel stuff ----------------------------------------------------------------------------------------------- */ + +/* The gravity kernel is defined as a degree 6 polynomial in the distance + r. The resulting value should be post-multiplied with r^-3, resulting + in a polynomial with terms ranging from r^-3 to r^3, which are + sufficient to model both the direct potential as well as the splines + near the origin. */ + +/* Coefficients for the gravity kernel. */ +#define kernel_grav_degree 6 +#define kernel_grav_ivals 2 +#define kernel_grav_scale (2*const_iepsilon) +static float kernel_grav_coeffs[ (kernel_grav_degree+1) * (kernel_grav_ivals+1) ] = + { 32.0f*const_iepsilon6 , -192.0f/5.0f*const_iepsilon5 , 0.0f , 32.0f/3.0f*const_iepsilon3 , 0.0f , 0.0f , 0.0f , + -32.0f/3.0f*const_iepsilon6 , 192.0f/5.0f*const_iepsilon5 , -48.0f*const_iepsilon4 , 64.0f/3.0f*const_iepsilon3 , 0.0f , 0.0f , -1.0f/15.0f , + 0.0f , 0.0f , 0.0f , 0.0f , 0.0f , 0.0f , 1.0f }; + + +/** + * @brief Computes the gravity cubic spline for a given distance x. + */ + +__attribute__ ((always_inline)) INLINE static void kernel_grav_eval ( float x , float *W ) { + int ind = fmin( x*kernel_grav_scale , kernel_grav_ivals ); + float *coeffs = &kernel_grav_coeffs[ ind*(kernel_grav_degree + 1) ]; + float w = coeffs[0]*x + coeffs[1]; + for ( int k = 2 ; k <= kernel_grav_degree ; k++ ) + w = x*w + coeffs[k]; + *W = w; + } + + +#ifdef VECTORIZE + +/** + * @brief Computes the gravity cubic spline for a given distance x (Vectorized version). + */ + +__attribute__ ((always_inline)) INLINE static void kernel_grav_eval_vec ( vector *x , vector *w ) { + + vector ind, c[kernel_grav_degree+1]; + int j, k; + + /* Load x and get the interval id. */ + ind.m = vec_ftoi( vec_fmin( x->v*vec_set1( kernel_grav_scale ) , vec_set1( (float)kernel_grav_ivals ) ) ); + + /* load the coefficients. */ + for ( k = 0 ; k < VEC_SIZE ; k++ ) + for ( j = 0 ; j < kernel_grav_degree+1 ; j++ ) + c[j].f[k] = kernel_grav_coeffs[ ind.i[k]*(kernel_grav_degree + 1) + j ]; + + /* Init the iteration for Horner's scheme. */ + w->v = ( c[0].v * x->v ) + c[1].v; + + /* And we're off! */ + for ( int k = 2 ; k <= kernel_grav_degree ; k++ ) + w->v = ( x->v * w->v ) + c[k].v; + + } + + +#endif + + +/* Blending function stuff -------------------------------------------------------------------------------------------- */ + +/* Coefficients for the blending function. */ +#define blender_degree 3 +#define blender_ivals 3 +#define blender_scale 4.0f +static float blender_coeffs[ (blender_degree+1) * (blender_ivals+1) ] = + { 0.0f , 0.0f , 0.0f , 1.0f , + -32.0f , 24.0f , -6.0f , 1.5f , + -32.0f , 72.0f , -54.0f , 13.5f , + 0.0f , 0.0f , 0.0f , 0.0f }; + + +/** + * @brief Computes the cubic spline blender for a given distance x. + */ + +__attribute__ ((always_inline)) INLINE static void blender_eval ( float x , float *W ) { + int ind = fmin( x*blender_scale , blender_ivals ); + float *coeffs = &blender_coeffs[ ind*(blender_degree + 1) ]; + float w = coeffs[0]*x + coeffs[1]; + for ( int k = 2 ; k <= blender_degree ; k++ ) + w = x*w + coeffs[k]; + *W = w; + } + + +/** + * @brief Computes the cubic spline blender and its derivative for a given distance x. + */ + +__attribute__ ((always_inline)) INLINE static void blender_deval ( float x , float *W , float *dW_dx ) { + int ind = fminf( x*blender_scale , blender_ivals ); + float *coeffs = &blender_coeffs[ ind*(blender_degree + 1) ]; + float w = coeffs[0]*x + coeffs[1]; + float dw_dx = coeffs[0]; + for ( int k = 2 ; k <= blender_degree ; k++ ) { + dw_dx = dw_dx*x + w; + w = x*w + coeffs[k]; + } + *W = w; + *dW_dx = dw_dx; + } + + +#ifdef VECTORIZE + +/** + * @brief Computes the cubic spline blender and its derivative for a given distance x (Vectorized version). Gives a sensible answer only if x<2. + */ + +__attribute__ ((always_inline)) INLINE static void blender_eval_vec ( vector *x , vector *w ) { + + vector ind, c[blender_degree+1]; + int j, k; + + /* Load x and get the interval id. */ + ind.m = vec_ftoi( vec_fmin( x->v*vec_set1( blender_scale ) , vec_set1( (float)blender_ivals ) ) ); + + /* load the coefficients. */ + for ( k = 0 ; k < VEC_SIZE ; k++ ) + for ( j = 0 ; j < blender_degree+1 ; j++ ) + c[j].f[k] = blender_coeffs[ ind.i[k]*(blender_degree + 1) + j ]; + + /* Init the iteration for Horner's scheme. */ + w->v = ( c[0].v * x->v ) + c[1].v; + + /* And we're off! */ + for ( int k = 2 ; k <= blender_degree ; k++ ) + w->v = ( x->v * w->v ) + c[k].v; + + } + + +/** + * @brief Computes the cubic spline blender and its derivative for a given distance x (Vectorized version). Gives a sensible answer only if x<2. + */ + +__attribute__ ((always_inline)) INLINE static void blender_deval_vec ( vector *x , vector *w , vector *dw_dx ) { + + vector ind, c[blender_degree+1]; + int j, k; + + /* Load x and get the interval id. */ + ind.m = vec_ftoi( vec_fmin( x->v*vec_set1( blender_scale ) , vec_set1( (float)blender_ivals ) ) ); + + /* load the coefficients. */ + for ( k = 0 ; k < VEC_SIZE ; k++ ) + for ( j = 0 ; j < blender_degree+1 ; j++ ) + c[j].f[k] = blender_coeffs[ ind.i[k]*(blender_degree + 1) + j ]; + + /* Init the iteration for Horner's scheme. */ + w->v = ( c[0].v * x->v ) + c[1].v; + dw_dx->v = c[0].v; + + /* And we're off! */ + for ( int k = 2 ; k <= blender_degree ; k++ ) { + dw_dx->v = ( dw_dx->v * x->v ) + w->v; + w->v = ( x->v * w->v ) + c[k].v; + } + + } + +#endif + + /* -------------------------------------------------------------------------------------------------------------------- */ #if defined(CUBIC_SPLINE_KERNEL) @@ -49,14 +220,14 @@ static float kernel_coeffs[ (kernel_degree + 1) * (kernel_ivals + 1) ] __attribu 0.0 , 0.0 , 0.0 , 0.0 }; #define kernel_root ( kernel_coeffs[ kernel_degree ] ) #define kernel_wroot ( 4.0/3.0*M_PI*kernel_coeffs[ kernel_degree ] ) - + /** * @brief Computes the cubic spline kernel and its derivative for a given distance x. Gives a sensible answer only if x<2. */ __attribute__ ((always_inline)) INLINE static void kernel_deval ( float x , float *W , float *dW_dx ) { - int ind = fmin( x , kernel_ivals ); + int ind = fminf( x , kernel_ivals ); float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ]; float w = coeffs[0]*x + coeffs[1]; float dw_dx = coeffs[0]; @@ -102,6 +273,7 @@ __attribute__ ((always_inline)) INLINE static void kernel_deval_vec ( vector *x #endif + /** * @brief Computes the cubic spline kernel for a given distance x. Gives a sensible answer only if x<2. */ @@ -117,9 +289,6 @@ __attribute__ ((always_inline)) INLINE static void kernel_eval ( float x , float - - - /* -------------------------------------------------------------------------------------------------------------------- */ #elif defined(QUARTIC_SPLINE_KERNEL) diff --git a/src/part.h b/src/part.h index df4040c8788dc080b247078e85a780561bb836af..ce87cfcabb3170c97723c8afeedcfd3d5fd8e37b 100644 --- a/src/part.h +++ b/src/part.h @@ -45,6 +45,38 @@ struct xpart { } __attribute__((aligned (32))); + +/* Gravity particle. */ +struct gpart { + + /* Particle position. */ + double x[3]; + + /* Particle velocity. */ + float v[3]; + + /* Particle acceleration. */ + float a[3]; + + /* Particle mass. */ + float mass; + + /* Particle time step. */ + float dt; + + /* Anonymous union for id/part. */ + union { + + /* Particle ID. */ + unsigned long long id; + + /* Pointer to corresponding SPH part. */ + struct part *part; + + }; + + } __attribute__((aligned (part_align))); + /* Data of a single particle. */ struct part { @@ -130,6 +162,9 @@ struct part { /* Particle ID. */ unsigned long long id; + /* Associated gravitas. */ + struct gpart *gpart; + } __attribute__((aligned (part_align))); diff --git a/src/proxy.c b/src/proxy.c index 8aac6cf97f574c28f940b38c24a56604df0788af..6c8f4e40476a755d186dda43461c0dc93b156fd0 100644 --- a/src/proxy.c +++ b/src/proxy.c @@ -46,9 +46,10 @@ #include "vector.h" #include "lock.h" #include "space.h" +#include "part.h" +#include "multipole.h" #include "cell.h" #include "task.h" -#include "part.h" #include "debug.h" #include "proxy.h" #include "error.h" diff --git a/src/queue.c b/src/queue.c index 1519cebac61216ec7529ea299eaad7ffd3a1c7bc..ff161fbdd35c8b3dc2afee0f45184ddedc3b240d 100644 --- a/src/queue.c +++ b/src/queue.c @@ -37,6 +37,8 @@ #include "task.h" #include "timers.h" #include "space.h" +#include "part.h" +#include "multipole.h" #include "cell.h" #include "queue.h" #include "error.h" diff --git a/src/runner.c b/src/runner.c index 32e5b5694d23430cea5eaf7f2fa260b7b27b48a2..eff08f9e15f6db02e7ac92ddf593caccdd01cadb 100644 --- a/src/runner.c +++ b/src/runner.c @@ -45,6 +45,7 @@ #include "task.h" #include "part.h" #include "space.h" +#include "multipole.h" #include "cell.h" #include "queue.h" #include "scheduler.h" @@ -58,6 +59,7 @@ #else #include "runner_iact.h" #endif +#include "runner_iact_grav.h" /* Convert cell location to ID. */ #define cell_getid( cdim , i , j , k ) ( (int)(k) + (cdim)[2]*( (int)(j) + (cdim)[1]*(int)(i) ) ) @@ -86,14 +88,18 @@ const char runner_flip[27] = { 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }; -/* Import the density functions. */ +/* Import the density loop functions. */ #define FUNCTION density #include "runner_doiact.h" +/* Import the force loop functions. */ #undef FUNCTION #define FUNCTION force #include "runner_doiact.h" +/* Import the gravity loop functions. */ +#include "runner_doiact_grav.h" + /** * @brief Send a local cell's particle data to another node. @@ -222,6 +228,7 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags , int clock ) struct entry *finger; struct entry *fingers[8]; struct part *parts = c->parts; + struct entry *sort; int j, k, count = c->count; int i, ind, off[8], inds[8], temp_i, missing; // float shift[3]; @@ -235,13 +242,14 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags , int clock ) return; /* start by allocating the entry arrays. */ - if ( c->sort == NULL || c->sortsize < c->count ) { + if ( c->sort == NULL || c->sortsize < count ) { if ( c->sort != NULL ) free( c->sort ); - c->sortsize = c->count * 1.1; + c->sortsize = count * 1.1; if ( ( c->sort = (struct entry *)malloc( sizeof(struct entry) * (c->sortsize + 1) * 13 ) ) == NULL ) error( "Failed to allocate sort memory." ); } + sort = c->sort; /* Does this cell have any progeny? */ if ( c->split ) { @@ -289,7 +297,7 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags , int clock ) } /* For each entry in the new sort list. */ - finger = &c->sort[ j*(count + 1) ]; + finger = &sort[ j*(count + 1) ]; for ( ind = 0 ; ind < count ; ind++ ) { /* Copy the minimum into the new sort array. */ @@ -308,8 +316,8 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags , int clock ) } /* Merge. */ /* Add a sentinel. */ - c->sort[ j*(c->count + 1) + c->count ].d = FLT_MAX; - c->sort[ j*(c->count + 1) + c->count ].i = 0; + sort[ j*(count + 1) + count ].d = FLT_MAX; + sort[ j*(count + 1) + count ].i = 0; /* Mark as sorted. */ c->sorted |= ( 1 << j ); @@ -328,17 +336,17 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags , int clock ) px[2] = parts[k].x[2]; for ( j = 0 ; j < 13 ; j++ ) if ( flags & (1 << j) ) { - c->sort[ j*(count + 1) + k].i = k; - c->sort[ j*(count + 1) + k].d = px[0]*runner_shift[ 3*j + 0 ] + px[1]*runner_shift[ 3*j + 1 ] + px[2]*runner_shift[ 3*j + 2 ]; + sort[ j*(count + 1) + k].i = k; + sort[ j*(count + 1) + k].d = px[0]*runner_shift[ 3*j + 0 ] + px[1]*runner_shift[ 3*j + 1 ] + px[2]*runner_shift[ 3*j + 2 ]; } } /* Add the sentinel and sort. */ for ( j = 0 ; j < 13 ; j++ ) if ( flags & (1 << j) ) { - c->sort[ j*(count + 1) + c->count ].d = FLT_MAX; - c->sort[ j*(count + 1) + c->count ].i = 0; - runner_dosort_ascending( &c->sort[ j*(count + 1) ] , c->count ); + sort[ j*(count + 1) + count ].d = FLT_MAX; + sort[ j*(count + 1) + count ].i = 0; + runner_dosort_ascending( &sort[ j*(count + 1) ] , count ); c->sorted |= ( 1 << j ); } @@ -348,18 +356,173 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags , int clock ) /* for ( j = 0 ; j < 13 ; j++ ) { if ( !( flags & (1 << j) ) ) continue; - finger = &c->sort[ j*(c->count + 1) ]; - for ( k = 1 ; k < c->count ; k++ ) { + finger = &sort[ j*(count + 1) ]; + for ( k = 1 ; k < count ; k++ ) { if ( finger[k].d < finger[k-1].d ) error( "Sorting failed, ascending array." ); - if ( finger[k].i >= c->count ) + if ( finger[k].i >= count ) error( "Sorting failed, indices borked." ); } } */ #ifdef TIMER_VERBOSE message( "runner %02i: %i parts at depth %i (flags = %i%i%i%i%i%i%i%i%i%i%i%i%i) took %.3f ms." , - r->id , c->count , c->depth , + r->id , count , c->depth , + (flags & 0x1000) >> 12 , (flags & 0x800) >> 11 , (flags & 0x400) >> 10 , (flags & 0x200) >> 9 , (flags & 0x100) >> 8 , (flags & 0x80) >> 7 , (flags & 0x40) >> 6 , (flags & 0x20) >> 5 , (flags & 0x10) >> 4 , (flags & 0x8) >> 3 , (flags & 0x4) >> 2 , (flags & 0x2) >> 1 , (flags & 0x1) >> 0 , + ((double)TIMER_TOC(timer_dosort)) / CPU_TPS * 1000 ); fflush(stdout); + #else + if ( clock ) + TIMER_TOC(timer_dosort); + #endif + + } + + +void runner_dogsort ( struct runner *r , struct cell *c , int flags , int clock ) { + + struct entry *finger; + struct entry *fingers[8]; + struct gpart *gparts = c->gparts; + struct entry *gsort; + int j, k, count = c->gcount; + int i, ind, off[8], inds[8], temp_i, missing; + // float shift[3]; + float buff[8], px[3]; + + TIMER_TIC + + /* Clean-up the flags, i.e. filter out what's already been sorted. */ + flags &= ~c->gsorted; + if ( flags == 0 ) + return; + + /* start by allocating the entry arrays. */ + if ( c->gsort == NULL || c->gsortsize < count ) { + if ( c->gsort != NULL ) + free( c->gsort ); + c->gsortsize = count * 1.1; + if ( ( c->gsort = (struct entry *)malloc( sizeof(struct entry) * (c->gsortsize + 1) * 13 ) ) == NULL ) + error( "Failed to allocate sort memory." ); + } + gsort = c->gsort; + + /* Does this cell have any progeny? */ + if ( c->split ) { + + /* Fill in the gaps within the progeny. */ + for ( k = 0 ; k < 8 ; k++ ) { + if ( c->progeny[k] == NULL ) + continue; + missing = flags & ~c->progeny[k]->gsorted; + if ( missing ) + runner_dogsort( r , c->progeny[k] , missing , 0 ); + } + + /* Loop over the 13 different sort arrays. */ + for ( j = 0 ; j < 13 ; j++ ) { + + /* Has this sort array been flagged? */ + if ( !( flags & (1 << j) ) ) + continue; + + /* Init the particle index offsets. */ + for ( off[0] = 0 , k = 1 ; k < 8 ; k++ ) + if ( c->progeny[k-1] != NULL ) + off[k] = off[k-1] + c->progeny[k-1]->gcount; + else + off[k] = off[k-1]; + + /* Init the entries and indices. */ + for ( k = 0 ; k < 8 ; k++ ) { + inds[k] = k; + if ( c->progeny[k] != NULL && c->progeny[k]->gcount > 0 ) { + fingers[k] = &c->progeny[k]->gsort[ j*(c->progeny[k]->gcount + 1) ]; + buff[k] = fingers[k]->d; + off[k] = off[k]; + } + else + buff[k] = FLT_MAX; + } + + /* Sort the buffer. */ + for ( i = 0 ; i < 7 ; i++ ) + for ( k = i+1 ; k < 8 ; k++ ) + if ( buff[ inds[k] ] < buff[ inds[i] ] ) { + temp_i = inds[i]; inds[i] = inds[k]; inds[k] = temp_i; + } + + /* For each entry in the new sort list. */ + finger = &gsort[ j*(count + 1) ]; + for ( ind = 0 ; ind < count ; ind++ ) { + + /* Copy the minimum into the new sort array. */ + finger[ind].d = buff[inds[0]]; + finger[ind].i = fingers[inds[0]]->i + off[inds[0]]; + + /* Update the buffer. */ + fingers[inds[0]] += 1; + buff[inds[0]] = fingers[inds[0]]->d; + + /* Find the smallest entry. */ + for ( k = 1 ; k < 8 && buff[inds[k]] < buff[inds[k-1]] ; k++ ) { + temp_i = inds[k-1]; inds[k-1] = inds[k]; inds[k] = temp_i; + } + + } /* Merge. */ + + /* Add a sentinel. */ + gsort[ j*(count + 1) + count ].d = FLT_MAX; + gsort[ j*(count + 1) + count ].i = 0; + + /* Mark as sorted. */ + c->gsorted |= ( 1 << j ); + + } /* loop over sort arrays. */ + + } /* progeny? */ + + /* Otherwise, just sort. */ + else { + + /* Fill the sort array. */ + for ( k = 0 ; k < count ; k++ ) { + px[0] = gparts[k].x[0]; + px[1] = gparts[k].x[1]; + px[2] = gparts[k].x[2]; + for ( j = 0 ; j < 13 ; j++ ) + if ( flags & (1 << j) ) { + gsort[ j*(count + 1) + k].i = k; + gsort[ j*(count + 1) + k].d = px[0]*runner_shift[ 3*j + 0 ] + px[1]*runner_shift[ 3*j + 1 ] + px[2]*runner_shift[ 3*j + 2 ]; + } + } + + /* Add the sentinel and sort. */ + for ( j = 0 ; j < 13 ; j++ ) + if ( flags & (1 << j) ) { + gsort[ j*(count + 1) + count ].d = FLT_MAX; + gsort[ j*(count + 1) + count ].i = 0; + runner_dosort_ascending( &gsort[ j*(count + 1) ] , count ); + c->gsorted |= ( 1 << j ); + } + + } + + /* Verify the sorting. */ + /* for ( j = 0 ; j < 13 ; j++ ) { + if ( !( flags & (1 << j) ) ) + continue; + finger = &c->gsort[ j*(count + 1) ]; + for ( k = 1 ; k < count ; k++ ) { + if ( finger[k].d < finger[k-1].d ) + error( "Sorting failed, ascending array." ); + if ( finger[k].i < 0 || finger[k].i >= count ) + error( "Sorting failed, indices borked." ); + } + } */ + + #ifdef TIMER_VERBOSE + message( "runner %02i: %i parts at depth %i (flags = %i%i%i%i%i%i%i%i%i%i%i%i%i) took %.3f ms." , + r->id , count , c->depth , (flags & 0x1000) >> 12 , (flags & 0x800) >> 11 , (flags & 0x400) >> 10 , (flags & 0x200) >> 9 , (flags & 0x100) >> 8 , (flags & 0x80) >> 7 , (flags & 0x40) >> 6 , (flags & 0x20) >> 5 , (flags & 0x10) >> 4 , (flags & 0x8) >> 3 , (flags & 0x4) >> 2 , (flags & 0x2) >> 1 , (flags & 0x1) >> 0 , ((double)TIMER_TOC(timer_dosort)) / CPU_TPS * 1000 ); fflush(stdout); #else @@ -1102,7 +1265,7 @@ void *runner_main ( void *data ) { t->rid = r->cpuid; /* Set super to the first cell that I own. */ - if ( ci->super->owner == r->qid ) + if ( ci->super != NULL && ci->super->owner == r->qid ) super = ci->super; else if ( cj != NULL && cj->super != NULL && cj->super->owner == r->qid ) super = cj->super; @@ -1145,6 +1308,8 @@ void *runner_main ( void *data ) { runner_dosub1_density( r , ci , cj , t->flags , 1 ); else if ( t->subtype == task_subtype_force ) runner_dosub2_force( r , ci , cj , t->flags , 1 ); + else if ( t->subtype == task_subtype_grav ) + runner_dosub_grav( r , ci , cj , 1 ); else error( "Unknown task subtype." ); break; @@ -1169,6 +1334,21 @@ void *runner_main ( void *data ) { parts[k].dt = FLT_MAX; ci->dt_min = ci->dt_max = FLT_MAX; break; + case task_type_grav_pp: + if ( t->cj == NULL ) + runner_doself_grav( r , t->ci ); + else + runner_dopair_grav( r , t->ci , t->cj ); + break; + case task_type_grav_mm: + runner_dograv_mm( r , t->ci , t->cj ); + break; + case task_type_grav_up: + runner_dograv_up( r , t->ci ); + break; + case task_type_grav_down: + runner_dograv_down( r , t->ci ); + break; default: error( "Unknown task type." ); } diff --git a/src/runner.h b/src/runner.h index 741411e5762dc1f8252bacf43bb1c625460c13a2..91ac475d7079a5a49dd194668bea567e9528a74c 100644 --- a/src/runner.h +++ b/src/runner.h @@ -27,7 +27,7 @@ extern const char runner_flip[]; /* Counters. */ -enum { +enum runner_counters { runner_counter_swap = 0, runner_counter_stall, runner_counter_steal_stall, @@ -80,6 +80,7 @@ void runner_dopair_density ( struct runner *r , struct cell *ci , struct cell *c void runner_doself_density ( struct runner *r , struct cell *c ); void runner_dosub_density ( struct runner *r , struct cell *ci , struct cell *cj , int flags ); void runner_dosort ( struct runner *r , struct cell *c , int flag , int clock ); +void runner_dogsort ( struct runner *r , struct cell *c , int flag , int clock ); void runner_dokick ( struct runner *r , struct cell *c , int timer ); void runner_dokick1 ( struct runner *r , struct cell *c ); void runner_dokick2 ( struct runner *r , struct cell *c ); diff --git a/src/runner_doiact.h b/src/runner_doiact.h index e9b3e8e720c4c2198267fc5cf6052e49cd4c5790..1fd0492803ba2053a25fbb73768e1c357b11658d 100644 --- a/src/runner_doiact.h +++ b/src/runner_doiact.h @@ -673,9 +673,6 @@ void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) { dj_min = sort_j[0].d; dx_max = ( ci->dx_max + cj->dx_max ); - /* if ( ci->split && cj->split && sid == 4 ) - printf( "boing!\n" ); */ - /* Loop over the parts in ci. */ for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min ; pid-- ) { diff --git a/src/runner_doiact_grav.h b/src/runner_doiact_grav.h new file mode 100644 index 0000000000000000000000000000000000000000..ba24b6bf4a024d4ae9f6e83f325cdcd75edee145 --- /dev/null +++ b/src/runner_doiact_grav.h @@ -0,0 +1,626 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ + + + +/** + * @brief Compute the sorted gravity interactions between a cell pair. + * + * @param r The #runner. + * @param ci The first #cell. + * @param cj The second #cell. + */ + +void runner_dopair_grav_new ( struct runner *r , struct cell *ci , struct cell *cj ) { + + struct engine *restrict e = r->e; + int pid, pjd, k, sid; + double rshift, shift[3] = { 0.0 , 0.0 , 0.0 }, nshift[3]; + struct entry *restrict sort_i, *restrict sort_j; + struct gpart *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j; + double pix[3]; + float dx[3], r2, h_max, di, dj; + int count_i, count_j, cnj, cnj_new; + float dt_step = e->dt_step; + struct multipole m; + #ifdef VECTORIZE + int icount = 0; + float r2q[VEC_SIZE] __attribute__ ((aligned (16))); + float dxq[3*VEC_SIZE] __attribute__ ((aligned (16))); + struct gpart *piq[VEC_SIZE], *pjq[VEC_SIZE]; + #endif + TIMER_TIC + + /* Anything to do here? */ + if ( ci->dt_min > dt_step && cj->dt_min > dt_step ) + return; + + /* Get the sort ID. */ + sid = space_getsid( e->s , &ci , &cj , shift ); + + /* Make sure the cells are sorted. */ + runner_dogsort( r , ci , (1 << sid) , 0 ); + runner_dogsort( r , cj , (1 << sid) , 0 ); + + /* Have the cells been sorted? */ + if ( !(ci->gsorted & (1 << sid)) || !(cj->gsorted & (1 << sid) ) ) + error( "Trying to interact unsorted cells." ); + + /* Get the cutoff shift. */ + for ( rshift = 0.0 , k = 0 ; k < 3 ; k++ ) + rshift += shift[k]*runner_shift[ 3*sid + k ]; + + /* Pick-out the sorted lists. */ + sort_i = &ci->gsort[ sid*(ci->count + 1) ]; + sort_j = &cj->gsort[ sid*(cj->count + 1) ]; + + /* Get some other useful values. */ + h_max = sqrtf( ci->h[0]*ci->h[0] + ci->h[1]*ci->h[1] + ci->h[2]*ci->h[2] ) * const_theta_max; + count_i = ci->gcount; count_j = cj->gcount; + parts_i = ci->gparts; parts_j = cj->gparts; + cnj = count_j; + multipole_reset( &m ); + nshift[0] = -shift[0]; nshift[1] = -shift[1]; nshift[2] = -shift[2]; + + /* Loop over the parts in ci. */ + for ( pid = count_i-1 ; pid >= 0 ; pid-- ) { + + /* Get a hold of the ith part in ci. */ + pi = &parts_i[ sort_i[ pid ].i ]; + if ( pi->dt > dt_step ) + continue; + di = sort_i[pid].d + h_max - rshift; + + for ( k = 0 ; k < 3 ; k++ ) + pix[k] = pi->x[k] - shift[k]; + + /* Loop over the parts in cj. */ + for ( pjd = 0 ; pjd < cnj && sort_j[pjd].d < di ; pjd++ ) { + + /* Get a pointer to the jth particle. */ + pj = &parts_j[ sort_j[pjd].i ]; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for ( k = 0 ; k < 3 ; k++ ) { + dx[k] = pix[k] - pj->x[k]; + r2 += dx[k]*dx[k]; + } + + #ifndef VECTORIZE + + // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 ) + // message( "interacting particles pi=%lli and pj=%lli with r=%.3e in cells %lli/%lli." , pi->part->id , pj->part->id , sqrtf(r2) , ((long long int)ci) / sizeof(struct cell) , ((long long int)cj) / sizeof(struct cell) ); + + runner_iact_grav( r2 , dx , pi , pj ); + + #else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3*icount+0] = dx[0]; + dxq[3*icount+1] = dx[1]; + dxq[3*icount+2] = dx[2]; + piq[icount] = pi; + pjq[icount] = pj; + icount += 1; + + /* Flush? */ + if ( icount == VEC_SIZE ) { + runner_iact_vec_grav( r2q , dxq , piq , pjq ); + icount = 0; + } + + #endif + + } /* loop over the parts in cj. */ + + /* Set the new limit. */ + cnj_new = pjd; + + /* Add trailing parts to the multipole. */ + for ( pjd = cnj_new ; pjd < cnj ; pjd++ ) { + + /* Add the part to the multipole. */ + multipole_addpart( &m , &parts_j[ sort_j[pjd].i ] ); + + } /* add trailing parts to the multipole. */ + + /* Set the new cnj. */ + cnj = cnj_new; + + /* Interact the ith particle with the multipole. */ + multipole_iact_mp( &m , pi , nshift ); + + } /* loop over the parts in ci. */ + + #ifdef VECTORIZE + /* Pick up any leftovers. */ + if ( icount > 0 ) + for ( k = 0 ; k < icount ; k++ ) + runner_iact_grav( r2q[k] , &dxq[3*k] , piq[k] , pjq[k] ); + #endif + + /* Re-set the multipole. */ + multipole_reset( &m ); + + /* Loop over the parts in cj and interact with the multipole in ci. */ + for ( pid = count_i - 1 , pjd = 0 ; pjd < count_j ; pjd++ ) { + + /* Get the position of pj along the axis. */ + dj = sort_j[pjd].d - h_max + rshift; + + /* Add any left-over parts in cell_i to the multipole. */ + while ( pid >= 0 && sort_i[pid].d < dj ) { + + /* Add this particle to the multipole. */ + multipole_addpart( &m , &parts_i[ sort_i[pid].i ] ); + + /* Decrease pid. */ + pid -= 1; + + } + + /* Interact pj with the multipole. */ + multipole_iact_mp( &m , &parts_j[ sort_j[pjd].i ] , shift ); + + } /* loop over the parts in cj and interact with the multipole. */ + + + #ifdef TIMER_VERBOSE + printf( "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , fmax(ci->h[0],fmax(ci->h[1],ci->h[2])) , ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000 ); + #else + TIMER_TOC(TIMER_DOPAIR); + #endif + + } + + +/** + * @brief Compute the recursive upward sweep, i.e. construct the + * multipoles in a cell hierarchy. + * + * @param r The #runner. + * @param c The top-level #cell. + */ + +void runner_dograv_up ( struct runner *r , struct cell *c ) { + + /* Re-set this cell's multipole. */ + multipole_reset( &c->multipole ); + + /* Split? */ + if ( c->split ) { + + /* Recurse. */ + for ( int k = 0 ; k < 8 ; k++ ) + if ( c->progeny[k] != NULL ) + runner_dograv_up( r , c->progeny[k] ); + + /* Collect the multipoles from the progeny. */ + multipole_reset( &c->multipole ); + for ( int k = 0 ; k < 8 ; k++ ) + if ( c->progeny[k] != NULL ) + multipole_merge( &c->multipole , &c->progeny[k]->multipole ); + + } + + /* No, leaf node. */ + else + + /* Just collect the multipole. */ + multipole_init( &c->multipole , c->gparts , c->gcount ); + + } + + +/** + * @brief Compute the recursive downward sweep, i.e. apply the multipole + * accelleration on all the particles. + * + * @param r The #runner. + * @param c The top-level #cell. + */ + +void runner_dograv_down ( struct runner *r , struct cell *c ) { + + struct multipole *m = &c->multipole; + + /* Split? */ + if ( c->split ) { + + /* Apply this cell's accelleration on the multipoles below. */ + for ( int k = 0 ; k < 8 ; k++ ) + if ( c->progeny[k] != NULL ) { + struct multipole *mp = &c->progeny[k]->multipole; + mp->a[0] += m->a[0]; + mp->a[1] += m->a[1]; + mp->a[2] += m->a[2]; + } + + /* Recurse. */ + for ( int k = 0 ; k < 8 ; k++ ) + if ( c->progeny[k] != NULL ) + runner_dograv_down( r , c->progeny[k] ); + + } + + /* No, leaf node. */ + else { + + /* Apply the multipole accelleration to all gparts. */ + for ( int k = 0 ; k < c->gcount ; k++ ) { + struct gpart *p = &c->gparts[k]; + p->a[0] += m->a[0]; + p->a[1] += m->a[1]; + p->a[2] += m->a[2]; + } + + } + + } + + +/** + * @brief Compute the multipole-multipole interaction between two cells. + * + * @param r The #runner. + * @param ci The first #cell. + * @param cj The second #cell. + */ + +void runner_dograv_mm ( struct runner *r , struct cell *restrict ci , struct cell *restrict cj ) { + + struct engine *e = r->e; + int k; + double shift[3] = { 0.0 , 0.0 , 0.0 }; + float dx[3], theta; + + /* Compute the shift between the cells. */ + for ( k = 0 ; k < 3 ; k++ ) { + dx[k] = cj->loc[k] - ci->loc[k]; + if ( r->e->s->periodic ) { + if ( dx[k] < -e->s->dim[k]/2 ) + shift[k] = e->s->dim[k]; + else if ( dx[k] > e->s->dim[k]/2 ) + shift[k] = -e->s->dim[k]; + dx[k] += shift[k]; + } + } + theta = sqrt( ( dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2] ) / + ( ci->h[0]*ci->h[0] + ci->h[1]*ci->h[1] + ci->h[2]*ci->h[2] ) ); + + /* Do an MM or an MP/PM? */ + if ( theta > const_theta_max*4 ) { + + /* Update the multipoles. */ + multipole_iact_mm( &ci->multipole , &cj->multipole , shift ); + + } + + else { + + /* Interact the multipoles via their parts. */ + for ( k = 0 ; k < ci->gcount ; k++ ) + multipole_iact_mp( &cj->multipole , &ci->gparts[k] , shift ); + for ( k = 0 ; k < cj->gcount ; k++ ) + multipole_iact_mp( &ci->multipole , &cj->gparts[k] , shift ); + + } + + } + + +/** + * @brief Compute the interactions between a cell pair. + * + * @param r The #runner. + * @param ci The first #cell. + * @param cj The second #cell. + */ + +void runner_dopair_grav ( struct runner *r , struct cell *restrict ci , struct cell *restrict cj ) { + + struct engine *e = r->e; + int pid, pjd, k, count_i = ci->gcount, count_j = cj->gcount; + double shift[3] = { 0.0 , 0.0 , 0.0 }; + struct gpart *restrict parts_i = ci->gparts, *restrict parts_j = cj->gparts; + struct gpart *restrict pi, *restrict pj; + double pix[3]; + float dx[3], r2; + float dt_step = e->dt_step; + #ifdef VECTORIZE + int icount = 0; + float r2q[VEC_SIZE] __attribute__ ((aligned (16))); + float dxq[3*VEC_SIZE] __attribute__ ((aligned (16))); + struct gpart *piq[VEC_SIZE], *pjq[VEC_SIZE]; + #endif + TIMER_TIC + + /* Anything to do here? */ + if ( ci->dt_min > dt_step && cj->dt_min > dt_step ) + return; + + /* Get the relative distance between the pairs, wrapping. */ + if ( e->s->periodic ) + for ( k = 0 ; k < 3 ; k++ ) { + if ( cj->loc[k] - ci->loc[k] < -e->s->dim[k]/2 ) + shift[k] = e->s->dim[k]; + else if ( cj->loc[k] - ci->loc[k] > e->s->dim[k]/2 ) + shift[k] = -e->s->dim[k]; + } + + /* Loop over the parts in ci. */ + for ( pid = 0 ; pid < count_i ; pid++ ) { + + /* Get a hold of the ith part in ci. */ + pi = &parts_i[ pid ]; + for ( k = 0 ; k < 3 ; k++ ) + pix[k] = pi->x[k] - shift[k]; + + /* Loop over the parts in cj. */ + for ( pjd = 0 ; pjd < count_j ; pjd++ ) { + + /* Get a pointer to the jth particle. */ + pj = &parts_j[ pjd ]; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for ( k = 0 ; k < 3 ; k++ ) { + dx[k] = pix[k] - pj->x[k]; + r2 += dx[k]*dx[k]; + } + + /* Compute the interaction. */ + #ifndef VECTORIZE + + // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 ) + // message( "interacting particles pi=%lli and pj=%lli with r=%.3e in cells %lli/%lli." , pi->part->id , pj->part->id , sqrtf(r2) , ((long long int)ci) / sizeof(struct cell) , ((long long int)cj) / sizeof(struct cell) ); + + runner_iact_grav( r2 , dx , pi , pj ); + + #else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3*icount+0] = dx[0]; + dxq[3*icount+1] = dx[1]; + dxq[3*icount+2] = dx[2]; + piq[icount] = pi; + pjq[icount] = pj; + icount += 1; + + /* Flush? */ + if ( icount == VEC_SIZE ) { + runner_iact_vec_grav( r2q , dxq , piq , pjq ); + icount = 0; + } + + #endif + + } /* loop over the parts in cj. */ + + } /* loop over the parts in ci. */ + + #ifdef VECTORIZE + /* Pick up any leftovers. */ + if ( icount > 0 ) + for ( k = 0 ; k < icount ; k++ ) + runner_iact_grav( r2q[k] , &dxq[3*k] , piq[k] , pjq[k] ); + #endif + + #ifdef TIMER_VERBOSE + printf( "runner_dopair_naive_grav[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 ); + #else + TIMER_TOC(timer_dopair_grav); + #endif + + + } + + +/** + * @brief Compute the interactions within a cell. + * + * @param r The #runner. + * @param c The #cell. + */ + +void runner_doself_grav ( struct runner *r , struct cell *restrict c ) { + + struct engine *e = r->e; + int pid, pjd, k, count = c->gcount; + struct gpart *restrict parts = c->gparts; + struct gpart *restrict pi, *restrict pj; + double pix[3] = { 0.0 , 0.0 , 0.0 }; + float dx[3], r2; + float dt_step = e->dt_step; + #ifdef VECTORIZE + int icount = 0; + float r2q[VEC_SIZE] __attribute__ ((aligned (16))); + float dxq[3*VEC_SIZE] __attribute__ ((aligned (16))); + struct gpart *piq[VEC_SIZE], *pjq[VEC_SIZE]; + #endif + TIMER_TIC + + /* Anything to do here? */ + if ( c->dt_min > dt_step ) + return; + + /* Loop over every part in c. */ + for ( pid = 0 ; pid < count ; pid++ ) { + + /* Get a hold of the ith part in ci. */ + pi = &parts[ pid ]; + for ( k = 0 ; k < 3 ; k++ ) + pix[k] = pi->x[k]; + + /* Loop over every other part in c. */ + for ( pjd = pid+1 ; pjd < count ; pjd++ ) { + + /* Get a pointer to the jth particle. */ + pj = &parts[ pjd ]; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for ( k = 0 ; k < 3 ; k++ ) { + dx[k] = pix[k] - pj->x[k]; + r2 += dx[k]*dx[k]; + } + + /* Compute the interaction. */ + #ifndef VECTORIZE + + // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 ) + // message( "interacting particles pi=%lli and pj=%lli with r=%.3e." , pi->part->id , pj->part->id , sqrtf(r2) ); + + runner_iact_grav( r2 , dx , pi , pj ); + + #else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3*icount+0] = dx[0]; + dxq[3*icount+1] = dx[1]; + dxq[3*icount+2] = dx[2]; + piq[icount] = pi; + pjq[icount] = pj; + icount += 1; + + /* Flush? */ + if ( icount == VEC_SIZE ) { + runner_iact_vec_grav( r2q , dxq , piq , pjq ); + icount = 0; + } + + #endif + + } /* loop over the remaining parts in c. */ + + } /* loop over the parts in c. */ + + #ifdef VECTORIZE + /* Pick up any leftovers. */ + if ( icount > 0 ) + for ( k = 0 ; k < icount ; k++ ) + runner_iact_grav( r2q[k] , &dxq[3*k] , piq[k] , pjq[k] ); + #endif + + #ifdef TIMER_VERBOSE + printf( "runner_doself_grav[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 ); + #else + TIMER_TOC(timer_doself_grav); + #endif + + + } + + +/** + * @brief Compute a gravity sub-task. + * + * @param r The #runner. + * @param ci The first #cell. + * @param cj The second #cell. + * @param gettimer Flag to record timer or not. + */ + +void runner_dosub_grav ( struct runner *r , struct cell *ci , struct cell *cj , int gettimer ) { + + int j, k, periodic = r->e->s->periodic; + struct space *s = r->e->s; + + TIMER_TIC + + /* Self-interaction? */ + if ( cj == NULL ) { + + /* If the cell is split, recurse. */ + if ( ci->split ) { + + /* Split this task into tasks on its progeny. */ + for ( j = 0 ; j < 8 ; j++ ) + if ( ci->progeny[j] != NULL ) { + runner_dosub_grav( r , ci->progeny[j] , NULL , 0 ); + for ( k = j+1 ; k < 8 ; k++ ) + if ( ci->progeny[k] != NULL ) + runner_dosub_grav( r , ci->progeny[j] , ci->progeny[k] , 0 ); + } + + } + + /* Otherwise, just make a pp task out of it. */ + else + runner_doself_grav( r , ci ); + + } + + /* Nope, pair. */ + else { + + /* Get the opening angle theta. */ + float dx[3], theta; + for ( k = 0 ; k < 3 ; k++ ) { + dx[k] = fabsf( ci->loc[k] - cj->loc[k] ); + if ( periodic && dx[k] > 0.5*s->dim[k] ) + dx[k] = -dx[k] + s->dim[k]; + if ( dx[k] > 0.0f ) + dx[k] -= ci->h[k]; + } + theta = ( dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2] ) / + ( ci->h[0]*ci->h[0] + ci->h[1]*ci->h[1] + ci->h[2]*ci->h[2] ); + + /* Split the interacton? */ + if ( theta < const_theta_max*const_theta_max ) { + + /* Are both ci and cj split? */ + if ( ci->split && cj->split ) { + + /* Split this task into tasks on its progeny. */ + for ( j = 0 ; j < 8 ; j++ ) + if ( ci->progeny[j] != NULL ) { + for ( k = 0 ; k < 8 ; k++ ) + if ( cj->progeny[k] != NULL ) + runner_dosub_grav( r , ci->progeny[j] , cj->progeny[k] , 0 ); + } + + } + + /* Otherwise, make a pp task out of it. */ + else + runner_dopair_grav( r , ci , cj ); + + } + + /* Otherwise, mm interaction is fine. */ + else + runner_dograv_mm( r , ci , cj ); + + } + + if ( gettimer ) + #ifdef TIMER_VERBOSE + printf( "runner_dosub_grav[%02i]: at depth %i took %.3f ms.\n" , r->id , ci->depth , ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000 ); + #else + TIMER_TOC(timer_dosub_grav); + #endif + + } + + diff --git a/src/runner_iact.h b/src/runner_iact.h index 8c14b2d15f54532d63b13c4aa35c158e1408355f..0a6b9c4ce74b41e1eb795b3c13adc1348b63aa23 100644 --- a/src/runner_iact.h +++ b/src/runner_iact.h @@ -103,7 +103,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_density ( flo #ifdef VECTORIZE - vector r, ri, xi, xj, hi, hj, hi_inv, hj_inv, wi, wj, wi_dx, wj_dx; + vector r, ri, r2, xi, xj, hi, hj, hi_inv, hj_inv, wi, wj, wi_dx, wj_dx; vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh; vector mi, mj; vector dx[3], dv[3]; @@ -112,8 +112,6 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_density ( flo vector curlvr[3], curl_vi[3], curl_vj[3]; int k, j; - r.v = vec_sqrt( vec_load( R2 ) ); - ri.v = vec_rcp( r.v ); #if VEC_SIZE==8 mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass , pi[4]->mass , pi[5]->mass , pi[6]->mass , pi[7]->mass ); mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass ); @@ -134,6 +132,12 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_density ( flo dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] ); #endif + /* Get the radius and inverse radius. */ + r2.v = vec_load( R2 ); + ri.v = vec_rsqrt( r2.v ); + ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) ); + r.v = r2.v * ri.v; + hi.v = vec_load( Hi ); hi_inv.v = vec_rcp( hi.v ); hi_inv.v = hi_inv.v - hi_inv.v * ( hi_inv.v * hi.v - vec_set1( 1.0f ) ); @@ -267,7 +271,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_densit #ifdef VECTORIZE - vector r, ri, xi, hi, hi_inv, wi, wi_dx; + vector r, ri, r2, xi, hi, hi_inv, wi, wi_dx; vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi; vector mj; vector dx[3], dv[3]; @@ -276,8 +280,6 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_densit vector curlvr[3], curl_vi[3]; int k, j; - r.v = vec_sqrt( vec_load( R2 ) ); - ri.v = vec_rcp( r.v ); #if VEC_SIZE==8 mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass ); for ( k = 0 ; k < 3 ; k++ ) { @@ -296,6 +298,12 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_densit dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] ); #endif + /* Get the radius and inverse radius. */ + r2.v = vec_load( R2 ); + ri.v = vec_rsqrt( r2.v ); + ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) ); + r.v = r2.v * ri.v; + hi.v = vec_load( Hi ); hi_inv.v = vec_rcp( hi.v ); hi_inv.v = hi_inv.v - hi_inv.v * ( hi_inv.v * hi.v - vec_set1( 1.0f ) ); diff --git a/src/runner_iact_grav.h b/src/runner_iact_grav.h new file mode 100644 index 0000000000000000000000000000000000000000..da1f552ae073aab3575de03255a3919d7a14cf95 --- /dev/null +++ b/src/runner_iact_grav.h @@ -0,0 +1,119 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk) + * Matthieu Schaller (matthieu.schaller@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ + +#include "kernel.h" +#include "vector.h" + +/** + * @file runner_iact_grav.h + * @brief Gravity interaction functions. + * + */ + + +/** + * @brief Gravity potential + */ + +__attribute__ ((always_inline)) INLINE static void runner_iact_grav ( float r2 , float *dx , struct gpart *pi , struct gpart *pj ) { + + float ir, r; + float w, acc; + float mi = pi->mass, mj = pj->mass; + int k; + + /* Get the absolute distance. */ + ir = 1.0f / sqrtf( r2 ); + r = r2 * ir; + + /* Evaluate the gravity kernel. */ + kernel_grav_eval( r , &acc ); + + /* Scale the acceleration. */ + acc *= const_G * ir * ir * ir; + + /* Aggregate the accellerations. */ + for ( k = 0 ; k < 3 ; k++ ) { + w = acc * dx[k]; + pi->a[k] -= w * mj; + pj->a[k] += w * mi; + } + + } + + +/** + * @brief Gravity potential (Vectorized version) + */ +__attribute__ ((always_inline)) INLINE static void runner_iact_vec_grav ( float *R2 , float *Dx , struct gpart **pi , struct gpart **pj ) { + +#ifdef VECTORIZE + + vector ir, r, r2, dx[3]; + vector w, acc, ai, aj; + vector mi, mj; + int j, k; + + #if VEC_SIZE==8 + mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass , pi[4]->mass , pi[5]->mass , pi[6]->mass , pi[7]->mass ); + mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass ); + for ( k = 0 ; k < 3 ; k++ ) + dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] , Dx[12+k] , Dx[15+k] , Dx[18+k] , Dx[21+k] ); + #elif VEC_SIZE==4 + mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass ); + mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass ); + for ( k = 0 ; k < 3 ; k++ ) + dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] ); + #endif + + + /* Get the radius and inverse radius. */ + r2.v = vec_load( R2 ); + ir.v = vec_rsqrt( r2.v ); + ir.v = ir.v - vec_set1( 0.5f ) * ir.v * ( r2.v * ir.v * ir.v - vec_set1( 1.0f ) ); + r.v = r2.v * ir.v; + + /* Evaluate the gravity kernel. */ + blender_eval_vec( &r , &acc ); + + /* Scale the acceleration. */ + acc.v *= vec_set1( const_G ) * ir.v * ir.v * ir.v; + + /* Aggregate the accellerations. */ + for ( k = 0 ; k < 3 ; k++ ) { + w.v = acc.v * dx[k].v; + ai.v = w.v * mj.v; + aj.v = w.v * mi.v; + for ( j = 0 ; j < VEC_SIZE ; j++ ) { + pi[j]->a[k] -= ai.f[j]; + pj[j]->a[k] += aj.f[j]; + } + } + +#else + + for ( int k = 0 ; k < VEC_SIZE ; k++ ) + runner_iact_grav( R2[k] , &Dx[3*k] , pi[k] , pj[k] ); + +#endif + + } + + diff --git a/src/runner_iact_legacy.h b/src/runner_iact_legacy.h index 2042a1652c9fb6b2bb539db2c215fcc0c01cfab5..aa50cc1fe2c09fa558a21eaf8b9079ffc08b6cbb 100644 --- a/src/runner_iact_legacy.h +++ b/src/runner_iact_legacy.h @@ -103,7 +103,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_density ( flo #ifdef VECTORIZE - vector r, ri, xi, xj, hi, hj, hi_inv, hj_inv, wi, wj, wi_dx, wj_dx; + vector r, r2, ri, xi, xj, hi, hj, hi_inv, hj_inv, wi, wj, wi_dx, wj_dx; vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh; vector mi, mj; vector dx[3], dv[3]; @@ -112,8 +112,6 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_density ( flo vector curlvr[3], curl_vi[3], curl_vj[3]; int k, j; - r.v = vec_sqrt( vec_load( R2 ) ); - ri.v = vec_rcp( r.v ); #if VEC_SIZE==8 mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass , pi[4]->mass , pi[5]->mass , pi[6]->mass , pi[7]->mass ); mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass ); @@ -134,6 +132,12 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_density ( flo dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] ); #endif + /* Get the radius and inverse radius. */ + r2.v = vec_load( R2 ); + ri.v = vec_rsqrt( r2.v ); + ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) ); + r.v = r2.v * ri.v; + hi.v = vec_load( Hi ); hi_inv.v = vec_rcp( hi.v ); hi_inv.v = hi_inv.v - hi_inv.v * ( hi_inv.v * hi.v - vec_set1( 1.0f ) ); @@ -267,7 +271,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_densit #ifdef VECTORIZE - vector r, ri, xi, hi, hi_inv, wi, wi_dx; + vector r, r2, ri, xi, hi, hi_inv, wi, wi_dx; vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi; vector mj; vector dx[3], dv[3]; @@ -276,8 +280,6 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_densit vector curlvr[3], curl_vi[3]; int k, j; - r.v = vec_sqrt( vec_load( R2 ) ); - ri.v = vec_rcp( r.v ); #if VEC_SIZE==8 mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass ); for ( k = 0 ; k < 3 ; k++ ) { @@ -296,6 +298,12 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_densit dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] ); #endif + /* Get the radius and inverse radius. */ + r2.v = vec_load( R2 ); + ri.v = vec_rsqrt( r2.v ); + ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) ); + r.v = r2.v * ri.v; + hi.v = vec_load( Hi ); hi_inv.v = vec_rcp( hi.v ); hi_inv.v = hi_inv.v - hi_inv.v * ( hi_inv.v * hi.v - vec_set1( 1.0f ) ); diff --git a/src/scheduler.c b/src/scheduler.c index 49ffd14dff821f7a1e8d7a22feb648b9db8140e3..10fb22171fce6f13c5c1862807938a18a0f293bb 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -46,6 +46,7 @@ #include "part.h" #include "debug.h" #include "space.h" +#include "multipole.h" #include "cell.h" #include "queue.h" #include "kernel.h" @@ -431,6 +432,135 @@ void scheduler_splittasks ( struct scheduler *s ) { } } /* pair interaction? */ + + /* Gravity interaction? */ + else if ( t->type == task_type_grav_mm ) { + + /* Get a handle on the cells involved. */ + ci = t->ci; + cj = t->cj; + + /* Self-interaction? */ + if ( cj == NULL ) { + + /* Ignore this task if the cell has no gparts. */ + if ( ci->gcount == 0 ) + t->type = task_type_none; + + /* If the cell is split, recurse. */ + else if ( ci->split ) { + + /* Make a single sub-task? */ + if ( scheduler_dosub && ci->count < space_subsize/ci->count ) { + + t->type = task_type_sub; + t->subtype = task_subtype_grav; + + } + + /* Otherwise, just split the task. */ + else { + + /* Split this task into tasks on its progeny. */ + t->type = task_type_none; + for ( j = 0 ; j < 8 ; j++ ) + if ( ci->progeny[j] != NULL ) { + if ( t->type == task_type_none ) { + t->type = task_type_grav_mm; + t->ci = ci->progeny[j]; + t->cj = NULL; + } + else + t = scheduler_addtask( s , task_type_grav_mm , task_subtype_none , 0 , 0 , ci->progeny[j] , NULL , 0 ); + for ( k = j+1 ; k < 8 ; k++ ) + if ( ci->progeny[k] != NULL ) { + if ( t->type == task_type_none ) { + t->type = task_type_grav_mm; + t->ci = ci->progeny[j]; + t->cj = ci->progeny[k]; + } + else + t = scheduler_addtask( s , task_type_grav_mm , task_subtype_none , 0 , 0 , ci->progeny[j] , ci->progeny[k] , 0 ); + } + } + redo = ( t->type != task_type_none ); + + } + + } + + /* Otherwise, just make a pp task out of it. */ + else + t->type = task_type_grav_pp; + + } + + /* Nope, pair. */ + else { + + /* Make a sub-task? */ + if ( scheduler_dosub && ci->count < space_subsize/cj->count ) { + + t->type = task_type_sub; + t->subtype = task_subtype_grav; + + } + + /* Otherwise, split the task. */ + else { + + /* Get the opening angle theta. */ + float dx[3], theta; + for ( k = 0 ; k < 3 ; k++ ) { + dx[k] = fabsf( ci->loc[k] - cj->loc[k] ); + if ( s->space->periodic && dx[k] > 0.5*s->space->dim[k] ) + dx[k] = -dx[k] + s->space->dim[k]; + if ( dx[k] > 0.0f ) + dx[k] -= ci->h[k]; + } + theta = ( dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2] ) / + ( ci->h[0]*ci->h[0] + ci->h[1]*ci->h[1] + ci->h[2]*ci->h[2] ); + + /* Ignore this task if the cell has no gparts. */ + if ( ci->gcount == 0 || cj->gcount == 0 ) + t->type = task_type_none; + + /* Split the interacton? */ + else if ( theta < const_theta_max*const_theta_max ) { + + /* Are both ci and cj split? */ + if ( ci->split && cj->split ) { + + /* Split this task into tasks on its progeny. */ + t->type = task_type_none; + for ( j = 0 ; j < 8 ; j++ ) + if ( ci->progeny[j] != NULL ) { + for ( k = 0 ; k < 8 ; k++ ) + if ( cj->progeny[k] != NULL ) { + if ( t->type == task_type_none ) { + t->type = task_type_grav_mm; + t->ci = ci->progeny[j]; + t->cj = cj->progeny[k]; + } + else + t = scheduler_addtask( s , task_type_grav_mm , task_subtype_none , 0 , 0 , ci->progeny[j] , cj->progeny[k] , 0 ); + } + } + redo = ( t->type != task_type_none ); + + } + + /* Otherwise, make a pp task out of it. */ + else + t->type = task_type_grav_pp; + + } + + } + + } /* gravity pair interaction? */ + + } /* gravity interaction? */ } /* loop over all tasks. */ @@ -457,6 +587,12 @@ struct task *scheduler_addtask ( struct scheduler *s , int type , int subtype , /* Get the next free task. */ ind = atomic_inc( &s->tasks_next ); + + /* Overflow? */ + if ( ind >= s->size ) + error( "Task list overflow." ); + + /* Get a pointer to the new task. */ t = &s->tasks[ ind ]; /* Copy the data. */ @@ -637,10 +773,18 @@ void scheduler_reweight ( struct scheduler *s ) { break; case task_type_sub: if ( t->cj != NULL ) { - if ( t->ci->nodeID != nodeID || t->cj->nodeID != nodeID ) - t->weight += 3 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ]; - else - t->weight += 2 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ]; + if ( t->ci->nodeID != nodeID || t->cj->nodeID != nodeID ) { + if ( t->flags < 0 ) + t->weight += 3 * wscale * t->ci->count * t->cj->count; + else + t->weight += 3 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ]; + } + else { + if ( t->flags < 0 ) + t->weight += 2 * wscale * t->ci->count * t->cj->count; + else + t->weight += 2 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ]; + } } else t->weight += 1 * wscale * t->ci->count * t->ci->count; @@ -653,6 +797,8 @@ void scheduler_reweight ( struct scheduler *s ) { case task_type_kick2: t->weight += wscale * t->ci->count; break; + default: + break; } if ( t->type == task_type_send ) t->weight = INT_MAX / 8; diff --git a/src/space.c b/src/space.c index 5b0422fb12d1c85d8b9875fdc167bda3a7feeb77..e0bd071626006b08ea3a650548f3b344e065e0ef 100644 --- a/src/space.c +++ b/src/space.c @@ -44,6 +44,7 @@ #include "kernel.h" #include "part.h" #include "space.h" +#include "multipole.h" #include "cell.h" #include "scheduler.h" #include "engine.h" @@ -101,16 +102,16 @@ const int sortlistID[27] = { int space_getsid ( struct space *s , struct cell **ci , struct cell **cj , double *shift ) { - int k, sid = 0; + int k, sid = 0, periodic = s->periodic; struct cell *temp; double dx[3]; /* Get the relative distance between the pairs, wrapping. */ for ( k = 0 ; k < 3 ; k++ ) { dx[k] = (*cj)->loc[k] - (*ci)->loc[k]; - if ( dx[k] < -s->dim[k]/2 ) + if ( periodic && dx[k] < -s->dim[k]/2 ) shift[k] = s->dim[k]; - else if ( dx[k] > s->dim[k]/2 ) + else if ( periodic && dx[k] > s->dim[k]/2 ) shift[k] = -s->dim[k]; else shift[k] = 0.0; @@ -240,6 +241,7 @@ void space_regrid ( struct space *s , double cell_max ) { c->dmin = dmin; c->depth = 0; c->count = 0; + c->gcount = 0; c->super = c; lock_init( &c->lock ); } @@ -265,6 +267,7 @@ void space_regrid ( struct space *s , double cell_max ) { s->cells[k].dx_max = 0.0f; s->cells[k].sorted = 0; s->cells[k].count = 0; + s->cells[k].gcount = 0; s->cells[k].kick1 = NULL; s->cells[k].kick2 = NULL; s->cells[k].super = &s->cells[k]; @@ -286,11 +289,11 @@ void space_regrid ( struct space *s , double cell_max ) { void space_rebuild ( struct space *s , double cell_max ) { - float h_max = s->cell_min / kernel_gamma / space_stretch, dmin; - int i, j, k, cdim[3], nr_parts = s->nr_parts; + int j, k, cdim[3], nr_parts = s->nr_parts, nr_gparts = s->nr_gparts; struct cell *restrict c, *restrict cells = s->cells; struct part *restrict finger, *restrict p, *parts = s->parts; struct xpart *xfinger, *xparts = s->xparts; + struct gpart *gp, *gparts = s->gparts, *gfinger; int *ind; double ih[3], dim[3]; // ticks tic; @@ -298,110 +301,8 @@ void space_rebuild ( struct space *s , double cell_max ) { /* Be verbose about this. */ // message( "re)building space..." ); fflush(stdout); - /* Run through the parts and get the current h_max. */ - // tic = getticks(); - if ( cells != NULL ) { - for ( k = 0 ; k < s->nr_cells ; k++ ) { - if ( cells[k].h_max > h_max ) - h_max = cells[k].h_max; - } - } - else { - for ( k = 0 ; k < nr_parts ; k++ ) { - if ( s->parts[k].h > h_max ) - h_max = s->parts[k].h; - } - s->h_max = h_max; - } - // message( "h_max is %.3e (cell_max=%.3e)." , h_max , cell_max ); - // message( "getting h_min and h_max took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 ); - - /* Get the new putative cell dimensions. */ - for ( k = 0 ; k < 3 ; k++ ) - cdim[k] = floor( s->dim[k] / fmax( h_max*kernel_gamma*space_stretch , cell_max ) ); - - /* In MPI-Land, we're not allowed to change the top-level cell size. */ - #ifdef WITH_MPI - if ( cdim[0] < s->cdim[0] || cdim[1] < s->cdim[1] || cdim[2] < s->cdim[2] ) - error( "Root-level change of cell size not allowed." ); - #endif - - /* Do we need to re-build the upper-level cells? */ - // tic = getticks(); - if ( cells == NULL || - cdim[0] < s->cdim[0] || cdim[1] < s->cdim[1] || cdim[2] < s->cdim[2] ) { - - /* Free the old cells, if they were allocated. */ - if ( cells != NULL ) { - for ( k = 0 ; k < s->nr_cells ; k++ ) { - space_rebuild_recycle( s , &cells[k] ); - if ( cells[k].sort != NULL ) - free( cells[k].sort ); - } - free( cells ); - s->maxdepth = 0; - } - - /* Set the new cell dimensions only if smaller. */ - for ( k = 0 ; k < 3 ; k++ ) { - s->cdim[k] = cdim[k]; - s->h[k] = s->dim[k] / cdim[k]; - s->ih[k] = 1.0 / s->h[k]; - } - dmin = fminf( s->h[0] , fminf( s->h[1] , s->h[2] ) ); - - /* Allocate the highest level of cells. */ - s->tot_cells = s->nr_cells = cdim[0] * cdim[1] * cdim[2]; - if ( posix_memalign( (void *)&cells , 64 , s->nr_cells * sizeof(struct cell) ) != 0 ) - error( "Failed to allocate cells." ); - s->cells = cells; - bzero( cells , s->nr_cells * sizeof(struct cell) ); - for ( k = 0 ; k < s->nr_cells ; k++ ) - if ( lock_init( &cells[k].lock ) != 0 ) - error( "Failed to init spinlock." ); - - /* Set the cell location and sizes. */ - for ( i = 0 ; i < cdim[0] ; i++ ) - for ( j = 0 ; j < cdim[1] ; j++ ) - for ( k = 0 ; k < cdim[2] ; k++ ) { - c = &cells[ cell_getid( cdim , i , j , k ) ]; - c->loc[0] = i*s->h[0]; c->loc[1] = j*s->h[1]; c->loc[2] = k*s->h[2]; - c->h[0] = s->h[0]; c->h[1] = s->h[1]; c->h[2] = s->h[2]; - c->dmin = dmin; - c->depth = 0; - c->count = 0; - c->super = c; - lock_init( &c->lock ); - } - - /* Be verbose about the change. */ - message( "set cell dimensions to [ %i %i %i ]." , cdim[0] , cdim[1] , cdim[2] ); fflush(stdout); - - } /* re-build upper-level cells? */ - // message( "rebuilding upper-level cells took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 ); - - /* Otherwise, just clean up the cells. */ - else { - - /* Free the old cells, if they were allocated. */ - for ( k = 0 ; k < s->nr_cells ; k++ ) { - space_rebuild_recycle( s , &cells[k] ); - cells[k].sorts = NULL; - cells[k].nr_tasks = 0; - cells[k].nr_density = 0; - cells[k].nr_force = 0; - cells[k].density = NULL; - cells[k].force = NULL; - cells[k].dx_max = 0.0f; - cells[k].sorted = 0; - cells[k].count = 0; - cells[k].super = &cells[k]; - cells[k].kick1 = NULL; - cells[k].kick2 = NULL; - } - s->maxdepth = 0; - - } + /* Re-grid if necessary, or just re-set the cell data. */ + space_regrid( s , cell_max ); /* Run through the particles and get their cell index. */ // tic = getticks(); @@ -458,6 +359,11 @@ void space_rebuild ( struct space *s , double cell_max ) { parts_sort( parts , xparts , ind , nr_parts , 0 , s->nr_cells-1 ); // message( "parts_sort took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 ); + /* Re-link the gparts. */ + for ( k = 0 ; k < nr_parts ; k++ ) + if ( parts[k].gpart != NULL ) + parts[k].gpart->part = &parts[k]; + /* Verify sort. */ /* for ( k = 1 ; k < nr_parts ; k++ ) { if ( ind[k-1] > ind[k] ) { @@ -470,16 +376,55 @@ void space_rebuild ( struct space *s , double cell_max ) { /* We no longer need the indices as of here. */ free( ind ); + + + /* Run through the gravity particles and get their cell index. */ + // tic = getticks(); + if ( ( ind = (int *)malloc( sizeof(int) * s->size_gparts ) ) == NULL ) + error( "Failed to allocate temporary particle indices." ); + #pragma omp parallel for private(gp,j) + for ( k = 0 ; k < nr_gparts ; k++ ) { + gp = &gparts[k]; + for ( j = 0 ; j < 3 ; j++ ) + if ( gp->x[j] < 0.0 ) + gp->x[j] += dim[j]; + else if ( gp->x[j] >= dim[j] ) + gp->x[j] -= dim[j]; + ind[k] = cell_getid( cdim , gp->x[0]*ih[0] , gp->x[1]*ih[1] , gp->x[2]*ih[2] ); + atomic_inc( &cells[ ind[k] ].gcount ); + } + // message( "getting particle indices took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 ); + + /* TODO: Here we should exchange the gparts as well! */ + + /* Sort the parts according to their cells. */ + // tic = getticks(); + gparts_sort( gparts ,ind , nr_parts , 0 , s->nr_cells-1 ); + // message( "gparts_sort took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 ); + + /* Re-link the parts. */ + for ( k = 0 ; k < nr_gparts ; k++ ) + if ( gparts[k].id > 0 ) + gparts[k].part->gpart = &gparts[k]; + + /* We no longer need the indices as of here. */ + free( ind ); + + + /* Hook the cells up to the parts. */ // tic = getticks(); finger = parts; xfinger = xparts; + gfinger = gparts; for ( k = 0 ; k < s->nr_cells ; k++ ) { c = &cells[ k ]; c->parts = finger; c->xparts = xfinger; + c->gparts = gfinger; finger = &finger[ c->count ]; xfinger = &xfinger[ c->count ]; + gfinger = &gfinger[ c->gcount ]; } // message( "hooking up cells took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 ); @@ -674,6 +619,165 @@ void parts_sort ( struct part *parts , struct xpart *xparts , int *ind , int N , } +void gparts_sort ( struct gpart *gparts , int *ind , int N , int min , int max ) { + + struct qstack { + volatile int i, j, min, max; + volatile int ready; + }; + struct qstack *qstack; + int qstack_size = 2*(max-min) + 10; + volatile unsigned int first, last, waiting; + + int pivot; + int i, ii, j, jj, temp_i, qid; + struct gpart temp_p; + + /* for ( int k = 0 ; k < N ; k++ ) + if ( ind[k] > max || ind[k] < min ) + error( "ind[%i]=%i is not in [%i,%i]." , k , ind[k] , min , max ); */ + + /* Allocate the stack. */ + if ( ( qstack = malloc( sizeof(struct qstack) * qstack_size ) ) == NULL ) + error( "Failed to allocate qstack." ); + + /* Init the interval stack. */ + qstack[0].i = 0; + qstack[0].j = N-1; + qstack[0].min = min; + qstack[0].max = max; + qstack[0].ready = 1; + for ( i = 1 ; i < qstack_size ; i++ ) + qstack[i].ready = 0; + first = 0; last = 1; waiting = 1; + + /* Parallel bit. */ + #pragma omp parallel default(none) shared(N,first,last,waiting,qstack,gparts,ind,qstack_size,stderr,engine_rank) private(pivot,i,ii,j,jj,min,max,temp_i,qid,temp_p) + { + + /* Main loop. */ + if ( omp_get_thread_num() < 8 ) + while ( waiting > 0 ) { + + /* Grab an interval off the queue. */ + qid = atomic_inc( &first ) % qstack_size; + + /* Wait for the interval to be ready. */ + while ( waiting > 0 && atomic_cas( &qstack[qid].ready , 1 , 1 ) != 1 ); + + /* Broke loop for all the wrong reasons? */ + if ( waiting == 0 ) + break; + + /* Get the stack entry. */ + i = qstack[qid].i; + j = qstack[qid].j; + min = qstack[qid].min; + max = qstack[qid].max; + qstack[qid].ready = 0; + // message( "thread %i got interval [%i,%i] with values in [%i,%i]." , omp_get_thread_num() , i , j , min , max ); + + /* Loop over sub-intervals. */ + while ( 1 ) { + + /* Bring beer. */ + pivot = (min + max) / 2; + + /* One pass of QuickSort's partitioning. */ + ii = i; jj = j; + while ( ii < jj ) { + while ( ii <= j && ind[ii] <= pivot ) + ii++; + while ( jj >= i && ind[jj] > pivot ) + jj--; + if ( ii < jj ) { + temp_i = ind[ii]; ind[ii] = ind[jj]; ind[jj] = temp_i; + temp_p = gparts[ii]; gparts[ii] = gparts[jj]; gparts[jj] = temp_p; + } + } + + /* Verify sort. */ + /* for ( int k = i ; k <= jj ; k++ ) + if ( ind[k] > pivot ) { + message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, N=%i." , k , ind[k] , pivot , i , j , N ); + error( "Partition failed (<=pivot)." ); + } + for ( int k = jj+1 ; k <= j ; k++ ) + if ( ind[k] <= pivot ) { + message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, N=%i." , k , ind[k] , pivot , i , j , N ); + error( "Partition failed (>pivot)." ); + } */ + + /* Split-off largest interval. */ + if ( jj - i > j - jj+1 ) { + + /* Recurse on the left? */ + if ( jj > i && pivot > min ) { + qid = atomic_inc( &last ) % qstack_size; + while ( atomic_cas( &qstack[qid].ready , 0 , 0 ) != 0 ); + qstack[qid].i = i; + qstack[qid].j = jj; + qstack[qid].min = min; + qstack[qid].max = pivot; + qstack[qid].ready = 1; + if ( atomic_inc( &waiting ) >= qstack_size ) + error( "Qstack overflow." ); + } + + /* Recurse on the right? */ + if ( jj+1 < j && pivot+1 < max ) { + i = jj+1; + min = pivot+1; + } + else + break; + + } + + else { + + /* Recurse on the right? */ + if ( jj+1 < j && pivot+1 < max ) { + qid = atomic_inc( &last ) % qstack_size; + while ( atomic_cas( &qstack[qid].ready , 0 , 0 ) != 0 ); + qstack[qid].i = jj+1; + qstack[qid].j = j; + qstack[qid].min = pivot+1; + qstack[qid].max = max; + qstack[qid].ready = 1; + if ( atomic_inc( &waiting ) >= qstack_size ) + error( "Qstack overflow." ); + } + + /* Recurse on the left? */ + if ( jj > i && pivot > min ) { + j = jj; + max = pivot; + } + else + break; + + } + + } /* loop over sub-intervals. */ + + atomic_dec( &waiting ); + + } /* main loop. */ + + } /* parallel bit. */ + + /* Verify sort. */ + /* for ( i = 1 ; i < N ; i++ ) + if ( ind[i-1] > ind[i] ) + error( "Sorting failed (ind[%i]=%i,ind[%i]=%i)." , i-1 , ind[i-1] , i , ind[i] ); */ + + /* Clean up. */ + free( qstack ); + + } + + /** * @brief Mapping function to free the sorted indices buffers. */ @@ -826,7 +930,7 @@ void space_map_cells_pre ( struct space *s , int full , void (*fun)( struct cell void space_split ( struct space *s , struct cell *c ) { - int k, count = c->count, maxdepth = 0; + int k, count = c->count, gcount = c->gcount, maxdepth = 0; float h, h_max = 0.0f, dt, dt_min = c->parts[0].dt, dt_max = dt_min; struct cell *temp; struct part *p, *parts = c->parts; @@ -837,7 +941,7 @@ void space_split ( struct space *s , struct cell *c ) { s->maxdepth = c->depth; /* Split or let it be? */ - if ( count > space_splitsize ) { + if ( count > space_splitsize || gcount > space_splitsize ) { /* No longer just a leaf. */ c->split = 1; @@ -846,6 +950,7 @@ void space_split ( struct space *s , struct cell *c ) { for ( k = 0 ; k < 8 ; k++ ) { temp = space_getcell( s ); temp->count = 0; + temp->gcount = 0; temp->loc[0] = c->loc[0]; temp->loc[1] = c->loc[1]; temp->loc[2] = c->loc[2]; @@ -873,7 +978,7 @@ void space_split ( struct space *s , struct cell *c ) { /* Remove any progeny with zero parts. */ for ( k = 0 ; k < 8 ; k++ ) - if ( c->progeny[k]->count == 0 ) { + if ( c->progeny[k]->count == 0 && c->progeny[k]->gcount == 0 ) { space_recycle( s , c->progeny[k] ); c->progeny[k] = NULL; } @@ -1000,8 +1105,9 @@ struct cell *space_getcell ( struct space *s ) { /* Init some things in the cell. */ bzero( c , sizeof(struct cell) ); c->nodeID = -1; - if ( lock_init( &c->lock ) != 0 ) - error( "Failed to initialize cell spinlock." ); + if ( lock_init( &c->lock ) != 0 || + lock_init( &c->glock ) != 0 ) + error( "Failed to initialize cell spinlocks." ); return c; @@ -1052,6 +1158,28 @@ void space_init ( struct space *s , double dim[3] , struct part *parts , int N , xp->u_hdt = p->u; } + + /* For now, clone the parts to make gparts. */ + if ( posix_memalign( (void *)&s->gparts , part_align , N * sizeof(struct gpart) ) != 0 ) + error( "Failed to allocate gparts." ); + bzero( s->gparts , N * sizeof(struct gpart) ); + for ( int k = 0 ; k < N ; k++ ) { + s->gparts[k].x[0] = s->parts[k].x[0]; + s->gparts[k].x[1] = s->parts[k].x[1]; + s->gparts[k].x[2] = s->parts[k].x[2]; + s->gparts[k].v[0] = s->parts[k].v[0]; + s->gparts[k].v[1] = s->parts[k].v[1]; + s->gparts[k].v[2] = s->parts[k].v[2]; + s->gparts[k].mass = s->parts[k].mass; + s->gparts[k].dt = s->parts[k].dt; + s->gparts[k].id = s->parts[k].id; + s->gparts[k].part = &s->parts[k]; + s->parts[k].gpart = &s->gparts[k]; + } + s->nr_gparts = s->nr_parts; + s->size_gparts = s->size_parts; + + /* Init the space lock. */ if ( lock_init( &s->lock ) != 0 ) error( "Failed to create space spin-lock." ); diff --git a/src/space.h b/src/space.h index cbdfe8c0ee004e4e02e408eae70b6c171bf0d5e1..9d1f849d3b29b26d80b12a9767d6505040a1c74c 100644 --- a/src/space.h +++ b/src/space.h @@ -83,11 +83,12 @@ struct space { /* The particle data (cells have pointers to this). */ struct part *parts; - struct cpart *cparts; struct xpart *xparts; + struct gpart *gparts; /* The total number of parts in the space. */ int nr_parts, size_parts; + int nr_gparts, size_gparts; /* Is the space periodic? */ int periodic; @@ -110,6 +111,7 @@ struct space { /* function prototypes. */ void parts_sort ( struct part *parts , struct xpart *xparts , int *ind , int N , int min , int max ); +void gparts_sort ( struct gpart *gparts , int *ind , int N , int min , int max ); struct cell *space_getcell ( struct space *s ); int space_getsid ( struct space *s , struct cell **ci , struct cell **cj , double *shift ); void space_init ( struct space *s , double dim[3] , struct part *parts , int N , int periodic , double h_max ); diff --git a/src/swift.h b/src/swift.h index 13b78ba17aa73075369fc7203394a9c3c6d792f5..07b4ccf13211d511e1498b0385f8722e21629f5b 100644 --- a/src/swift.h +++ b/src/swift.h @@ -31,6 +31,7 @@ #include "task.h" #include "scheduler.h" #include "part.h" +#include "multipole.h" #include "cell.h" #include "space.h" #include "queue.h" @@ -47,3 +48,4 @@ #else #include "runner_iact.h" #endif +#include "runner_iact_grav.h" diff --git a/src/task.c b/src/task.c index 7ef068905a2fae92f23e0baa3549fb85ff87a726..ba15bf956f5ee87280da9e48cd4cbe7cb03e2ba4 100644 --- a/src/task.c +++ b/src/task.c @@ -42,6 +42,8 @@ #include "atomic.h" #include "lock.h" #include "space.h" +#include "part.h" +#include "multipole.h" #include "cell.h" #include "task.h" #include "error.h" @@ -49,7 +51,8 @@ /* Task type names. */ const char *taskID_names[task_type_count] = { "none" , "sort" , "self" , "pair" , "sub" , "ghost" , - "kick1" , "kick2" , "send" , "recv" , "link" }; + "kick1" , "kick2" , "send" , "recv" , "link" , "grav_pp" , + "grav_mm" , "grav_up" , "grav_down" }; /** @@ -72,6 +75,15 @@ void task_unlock ( struct task *t ) { if ( t->cj != NULL ) cell_unlocktree( t->cj ); break; + case task_type_grav_pp: + case task_type_grav_mm: + case task_type_grav_down: + cell_gunlocktree( t->ci ); + if ( t->cj != NULL ) + cell_gunlocktree( t->cj ); + break; + default: + break; } } @@ -118,8 +130,9 @@ int task_lock ( struct task *t ) { } /* Otherwise, binary lock. */ - else if ( type == task_type_pair || (type == task_type_sub && cj != NULL) ) { - if ( ci->hold || cj->hold || ci->wait || cj->wait ) + else if ( type == task_type_pair || + ( type == task_type_sub && cj != NULL ) ) { + if ( ci->hold || cj->hold ) return 0; if ( cell_locktree( ci ) != 0 ) return 0; @@ -129,6 +142,20 @@ int task_lock ( struct task *t ) { } } + /* Gravity tasks? */ + else if ( type == task_type_grav_mm || + type == task_type_grav_pp || + type == task_type_grav_down ) { + if ( ci->ghold || ( cj != NULL && cj->ghold ) ) + return 0; + if ( cell_glocktree( ci ) != 0 ) + return 0; + if ( cj != NULL && cell_glocktree( cj ) != 0 ) { + cell_gunlocktree( ci ); + return 0; + } + } + /* If we made it this far, we've got a lock. */ return 1; diff --git a/src/task.h b/src/task.h index 7b3b81a5dbbd3dc6e32d5d410fbd374cd2b414f2..0505815ff2d5dcc186b30011a906458947589bd8 100644 --- a/src/task.h +++ b/src/task.h @@ -36,6 +36,10 @@ enum task_types { task_type_send, task_type_recv, task_type_link, + task_type_grav_pp, + task_type_grav_mm, + task_type_grav_up, + task_type_grav_down, task_type_count }; @@ -47,6 +51,7 @@ enum task_subtypes { task_subtype_none = 0, task_subtype_density, task_subtype_force, + task_subtype_grav, task_subtype_count }; @@ -56,7 +61,9 @@ extern const char *taskID_names[]; /* Data of a task. */ struct task { - char type, subtype, skip, tight, implicit; + enum task_types type; + enum task_subtypes subtype; + char skip, tight, implicit; int flags, wait, rank, weight; lock_type lock; diff --git a/src/timers.h b/src/timers.h index 7f1f831c78c28bfc13787e0660d2b41d12310dd6..58c48ac2444e2cd615f711213474d729b4bbbe70 100644 --- a/src/timers.h +++ b/src/timers.h @@ -28,10 +28,13 @@ enum { timer_dosort, timer_doself_density, timer_doself_force, + timer_doself_grav, timer_dopair_density, timer_dopair_force, + timer_dopair_grav, timer_dosub_density, timer_dosub_force, + timer_dosub_grav, timer_dopair_subset, timer_doghost, timer_gettask, diff --git a/src/vector.h b/src/vector.h index 0f4cb8c95ae061d4f06e85aaad6911d2985621de..81efefe9f6218a17149869f3ec5535a4361f6b5c 100644 --- a/src/vector.h +++ b/src/vector.h @@ -58,7 +58,7 @@ #define vec_dbl_fmax(a,b) _mm512_max_pd(a,b) #define vec_getoffsets(ptrs) _mm512_insertf64x4( _mm512_insertf64x4( _mm512_setzero_pd() , _mm512_cvtepi64_epi32( _mm512_load_epi64(ptrs) - _mm512_set1_epi64(ptrs[0]) ) , 0 ) , _mm512_cvtepi64_epi32( _mm512_load_epi64(&ptrs[4]) - _mm512_set1_epi64(ptrs[0]) ) , 1 ) #define vec_gather(base,offsets) _mm512_i32gather_ps( offsets.m , base , 1 ) - #elif defined(__AVX__) + #elif defined( NO__AVX__ ) #define VECTORIZE #define VEC_SIZE 8 #define VEC_FLOAT __m256 @@ -90,7 +90,7 @@ #define VEC_HAVE_GATHER #define vec_gather(base,offsets) _mm256_i32gather_ps( base , offsets.m , 1 ) #endif - #elif defined( __SSE2__ ) + #elif defined( NO__SSE2__ ) #define VECTORIZE #define VEC_SIZE 4 #define VEC_FLOAT __m128