diff --git a/examples/test.c b/examples/test.c index e798e69cd8c1cabff9cb531e23819f11e5d5277f..188b20c4118bf6aec607566ff9859803892b9d9a 100644 --- a/examples/test.c +++ b/examples/test.c @@ -795,7 +795,7 @@ int main ( int argc , char *argv[] ) { /* Initialize the engine with this space. */ tic = getticks(); message( "nr_nodes is %i." , nr_nodes ); - engine_init( &e , &s , dt_max , nr_threads , nr_queues , nr_nodes , myrank , ENGINE_POLICY | engine_policy_steal ); + engine_init( &e , &s , dt_max , nr_threads , nr_queues , nr_nodes , myrank , ENGINE_POLICY | engine_policy_steal | engine_policy_paranoid ); if ( myrank == 0 ) message( "engine_init took %.3f ms." , ((double)(getticks() - tic)) / CPU_TPS * 1000 ); fflush(stdout); @@ -849,12 +849,12 @@ int main ( int argc , char *argv[] ) { /* Repartition the space amongst the nodes? */ #if defined(WITH_MPI) && defined(HAVE_METIS) - if ( j == 2 ) + if ( j % 100 == 2 ) e.forcerepart = 1; #endif /* Force a rebuild for testing. */ - /* if ( j % 4 == 1 ) + /* if ( j % 4 == 3 ) e.forcerebuild = 1; */ // message( "starting run %i/%i (t=%.3e) with %i threads and %i queues..." , j+1 , runs , e.time , e.nr_threads , e.nr_queues ); fflush(stdout); diff --git a/src/atomic.h b/src/atomic.h index df6e5aaeed4db12653530d6a5dec8ee3042f02f7..16b268c4c799cd1ca8c38a3382df912a9d618614 100644 --- a/src/atomic.h +++ b/src/atomic.h @@ -1,26 +1,30 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
- * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_ATOMIC_H +#define SWIFT_ATOMIC_H - +/* Includes. */ #include "inline.h" - -#define atomic_add(v,i) __sync_fetch_and_add( v , i ) -#define atomic_inc(v) atomic_add( v , 1 ) -#define atomic_dec(v) atomic_add( v , -1 ) -#define atomic_cas(v,o,n) __sync_val_compare_and_swap( v , o , n ) + +#define atomic_add(v, i) __sync_fetch_and_add(v, i) +#define atomic_inc(v) atomic_add(v, 1) +#define atomic_dec(v) atomic_add(v, -1) +#define atomic_cas(v, o, n) __sync_val_compare_and_swap(v, o, n) + +#endif /* SWIFT_ATOMIC_H */ diff --git a/src/cell.c b/src/cell.c index 13e1055649dd8f5b06fdad112102e25821b44850..87b51ac82cade8a7e4302e52b7a4e55e5d612aa2 100644 --- a/src/cell.c +++ b/src/cell.c @@ -1,85 +1,76 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
- * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ /* Config parameters. */ #include "../config.h" /* Some standard headers. */ -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <pthread.h> #include <float.h> #include <limits.h> #include <math.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> /* MPI headers. */ #ifdef WITH_MPI - #include <mpi.h> +#include <mpi.h> #endif /* Switch off timers. */ #ifdef TIMER - #undef TIMER +#undef TIMER #endif +/* This object's header. */ +#include "cell.h" + /* Local headers. */ -#include "const.h" #include "atomic.h" -#include "cycle.h" -#include "lock.h" -#include "task.h" -#include "timers.h" -#include "part.h" -#include "space.h" -#include "multipole.h" -#include "cell.h" #include "error.h" -#include "inline.h" +#include "space.h" +#include "timers.h" /* Global variables. */ int cell_next_tag = 0; - /** * @brief Get the size of the cell subtree. * * @param c The #cell. */ - -int cell_getsize ( struct cell *c ) { - - int k, count = 1; - - /* Sum up the progeny if split. */ - if ( c->split ) - for ( k = 0 ; k < 8 ; k++ ) - if ( c->progeny[k] != NULL ) - count += cell_getsize( c->progeny[k] ); - - /* Return the final count. */ - return count; - } +int cell_getsize(struct cell *c) { + + int k, count = 1; + /* Sum up the progeny if split. */ + if (c->split) + for (k = 0; k < 8; k++) + if (c->progeny[k] != NULL) count += cell_getsize(c->progeny[k]); -/** + /* Return the final count. */ + return count; +} + +/** * @brief Unpack the data of a given cell and its sub-cells. * * @param pc An array of packed #pcell. @@ -88,52 +79,47 @@ int cell_getsize ( struct cell *c ) { * * @return The number of cells created. 
*/ - -int cell_unpack ( struct pcell *pc , struct cell *c , struct space *s ) { - - int k, count = 1; - struct cell *temp; - - /* Unpack the current pcell. */ - c->h_max = pc->h_max; - c->dt_min = FLT_MAX; // pc->dt_min; - c->dt_max = FLT_MAX; // pc->dt_max; - c->count = pc->count; - c->tag = pc->tag; - - /* Fill the progeny recursively, depth-first. */ - for ( k = 0 ; k < 8 ; k++ ) - if ( pc->progeny[k] >= 0 ) { - temp = space_getcell( s ); - temp->count = 0; - temp->loc[0] = c->loc[0]; - temp->loc[1] = c->loc[1]; - temp->loc[2] = c->loc[2]; - temp->h[0] = c->h[0]/2; - temp->h[1] = c->h[1]/2; - temp->h[2] = c->h[2]/2; - temp->dmin = c->dmin/2; - if ( k & 4 ) - temp->loc[0] += temp->h[0]; - if ( k & 2 ) - temp->loc[1] += temp->h[1]; - if ( k & 1 ) - temp->loc[2] += temp->h[2]; - temp->depth = c->depth + 1; - temp->split = 0; - temp->dx_max = 0.0; - temp->nodeID = c->nodeID; - temp->parent = c; - c->progeny[k] = temp; - c->split = 1; - count += cell_unpack( &pc[ pc->progeny[k] ] , temp , s ); - } - - /* Return the total number of unpacked cells. */ - return count; +int cell_unpack(struct pcell *pc, struct cell *c, struct space *s) { + + int k, count = 1; + struct cell *temp; + + /* Unpack the current pcell. */ + c->h_max = pc->h_max; + c->dt_min = FLT_MAX; // pc->dt_min; + c->dt_max = FLT_MAX; // pc->dt_max; + c->count = pc->count; + c->tag = pc->tag; + + /* Fill the progeny recursively, depth-first. 
*/ + for (k = 0; k < 8; k++) + if (pc->progeny[k] >= 0) { + temp = space_getcell(s); + temp->count = 0; + temp->loc[0] = c->loc[0]; + temp->loc[1] = c->loc[1]; + temp->loc[2] = c->loc[2]; + temp->h[0] = c->h[0] / 2; + temp->h[1] = c->h[1] / 2; + temp->h[2] = c->h[2] / 2; + temp->dmin = c->dmin / 2; + if (k & 4) temp->loc[0] += temp->h[0]; + if (k & 2) temp->loc[1] += temp->h[1]; + if (k & 1) temp->loc[2] += temp->h[2]; + temp->depth = c->depth + 1; + temp->split = 0; + temp->dx_max = 0.0; + temp->nodeID = c->nodeID; + temp->parent = c; + c->progeny[k] = temp; + c->split = 1; + count += cell_unpack(&pc[pc->progeny[k]], temp, s); } + /* Return the total number of unpacked cells. */ + return count; +} /** * @brief Link the cells recursively to the given part array. @@ -144,23 +130,20 @@ int cell_unpack ( struct pcell *pc , struct cell *c , struct space *s ) { * @return The number of particles linked. */ -int cell_link ( struct cell *c , struct part *parts ) { - - int k, ind = 0; - - c->parts = parts; - - /* Fill the progeny recursively, depth-first. */ - if ( c->split ) - for ( k = 0 ; k < 8 ; k++ ) - if ( c->progeny[k] != NULL ) - ind += cell_link( c->progeny[k] , &parts[ind] ); - - /* Return the total number of unpacked cells. */ - return c->count; +int cell_link(struct cell *c, struct part *parts) { - } + int k, ind = 0; + + c->parts = parts; + /* Fill the progeny recursively, depth-first. */ + if (c->split) + for (k = 0; k < 8; k++) + if (c->progeny[k] != NULL) ind += cell_link(c->progeny[k], &parts[ind]); + + /* Return the total number of unpacked cells. */ + return c->count; +} /** * @brief Pack the data of the given cell and all it's sub-cells. @@ -171,402 +154,394 @@ int cell_link ( struct cell *c , struct part *parts ) { * * @return The number of packed cells. */ - -int cell_pack ( struct cell *c , struct pcell *pc ) { - - int k, count = 1; - - /* Start by packing the data of the current cell. 
*/ - pc->h_max = c->h_max; - pc->dt_min = c->dt_min; - pc->dt_max = c->dt_max; - pc->count = c->count; - c->tag = pc->tag = cell_next_tag++; - - /* Fill in the progeny, depth-first recursion. */ - for ( k = 0 ; k < 8 ; k++ ) - if ( c->progeny[k] != NULL ) { - pc->progeny[k] = count; - count += cell_pack( c->progeny[k] , &pc[count] ); - } - else - pc->progeny[k] = -1; - - /* Return the number of packed cells used. */ - return count; - } +int cell_pack(struct cell *c, struct pcell *pc) { + + int k, count = 1; + + /* Start by packing the data of the current cell. */ + pc->h_max = c->h_max; + pc->dt_min = c->dt_min; + pc->dt_max = c->dt_max; + pc->count = c->count; + c->tag = pc->tag = atomic_inc(&cell_next_tag) % cell_max_tag; + /* Fill in the progeny, depth-first recursion. */ + for (k = 0; k < 8; k++) + if (c->progeny[k] != NULL) { + pc->progeny[k] = count; + count += cell_pack(c->progeny[k], &pc[count]); + } else + pc->progeny[k] = -1; + + /* Return the number of packed cells used. */ + return count; +} /** * @brief Lock a cell and hold its parents. * * @param c The #cell. */ - -int cell_locktree( struct cell *c ) { - - struct cell *finger, *finger2; - TIMER_TIC - - /* First of all, try to lock this cell. */ - if ( c->hold || lock_trylock( &c->lock ) != 0 ) { - TIMER_TOC(timer_locktree); - return 1; - } - - /* Did somebody hold this cell in the meantime? */ - if ( c->hold ) { - - /* Unlock this cell. */ - if ( lock_unlock( &c->lock ) != 0 ) - error( "Failed to unlock cell." ); - - /* Admit defeat. */ - TIMER_TOC(timer_locktree); - return 1; - - } - - /* Climb up the tree and lock/hold/unlock. */ - for ( finger = c->parent ; finger != NULL ; finger = finger->parent ) { - - /* Lock this cell. */ - if ( lock_trylock( &finger->lock ) != 0 ) - break; - - /* Increment the hold. */ - atomic_inc( &finger->hold ); - - /* Unlock the cell. */ - if ( lock_unlock( &finger->lock ) != 0 ) - error( "Failed to unlock cell." 
); - - } - - /* If we reached the top of the tree, we're done. */ - if ( finger == NULL ) { - TIMER_TOC(timer_locktree); - return 0; - } - - /* Otherwise, we hit a snag. */ - else { - - /* Undo the holds up to finger. */ - for ( finger2 = c->parent ; finger2 != finger ; finger2 = finger2->parent ) - __sync_fetch_and_sub( &finger2->hold , 1 ); - - /* Unlock this cell. */ - if ( lock_unlock( &c->lock ) != 0 ) - error( "Failed to unlock cell." ); - - /* Admit defeat. */ - TIMER_TOC(timer_locktree); - return 1; - - } - } - - -int cell_glocktree( struct cell *c ) { - - struct cell *finger, *finger2; - TIMER_TIC - - /* First of all, try to lock this cell. */ - if ( c->ghold || lock_trylock( &c->glock ) != 0 ) { - TIMER_TOC(timer_locktree); - return 1; - } - - /* Did somebody hold this cell in the meantime? */ - if ( c->ghold ) { - - /* Unlock this cell. */ - if ( lock_unlock( &c->glock ) != 0 ) - error( "Failed to unlock cell." ); - - /* Admit defeat. */ - TIMER_TOC(timer_locktree); - return 1; - - } - - /* Climb up the tree and lock/hold/unlock. */ - for ( finger = c->parent ; finger != NULL ; finger = finger->parent ) { - - /* Lock this cell. */ - if ( lock_trylock( &finger->glock ) != 0 ) - break; - - /* Increment the hold. */ - __sync_fetch_and_add( &finger->ghold , 1 ); - - /* Unlock the cell. */ - if ( lock_unlock( &finger->glock ) != 0 ) - error( "Failed to unlock cell." ); - - } - - /* If we reached the top of the tree, we're done. */ - if ( finger == NULL ) { - TIMER_TOC(timer_locktree); - return 0; - } - - /* Otherwise, we hit a snag. */ - else { - - /* Undo the holds up to finger. */ - for ( finger2 = c->parent ; finger2 != finger ; finger2 = finger2->parent ) - __sync_fetch_and_sub( &finger2->ghold , 1 ); - - /* Unlock this cell. */ - if ( lock_unlock( &c->glock ) != 0 ) - error( "Failed to unlock cell." ); - - /* Admit defeat. 
*/ - TIMER_TOC(timer_locktree); - return 1; - - } +int cell_locktree(struct cell *c) { + + struct cell *finger, *finger2; + TIMER_TIC + + /* First of all, try to lock this cell. */ + if (c->hold || lock_trylock(&c->lock) != 0) { + TIMER_TOC(timer_locktree); + return 1; + } + + /* Did somebody hold this cell in the meantime? */ + if (c->hold) { + + /* Unlock this cell. */ + if (lock_unlock(&c->lock) != 0) error("Failed to unlock cell."); + + /* Admit defeat. */ + TIMER_TOC(timer_locktree); + return 1; + } + + /* Climb up the tree and lock/hold/unlock. */ + for (finger = c->parent; finger != NULL; finger = finger->parent) { + + /* Lock this cell. */ + if (lock_trylock(&finger->lock) != 0) break; + + /* Increment the hold. */ + atomic_inc(&finger->hold); + + /* Unlock the cell. */ + if (lock_unlock(&finger->lock) != 0) error("Failed to unlock cell."); + } + + /* If we reached the top of the tree, we're done. */ + if (finger == NULL) { + TIMER_TOC(timer_locktree); + return 0; + } + + /* Otherwise, we hit a snag. */ + else { + + /* Undo the holds up to finger. */ + for (finger2 = c->parent; finger2 != finger; finger2 = finger2->parent) + __sync_fetch_and_sub(&finger2->hold, 1); + + /* Unlock this cell. */ + if (lock_unlock(&c->lock) != 0) error("Failed to unlock cell."); + + /* Admit defeat. */ + TIMER_TOC(timer_locktree); + return 1; + } +} + +int cell_glocktree(struct cell *c) { + + struct cell *finger, *finger2; + TIMER_TIC + + /* First of all, try to lock this cell. */ + if (c->ghold || lock_trylock(&c->glock) != 0) { + TIMER_TOC(timer_locktree); + return 1; + } + + /* Did somebody hold this cell in the meantime? */ + if (c->ghold) { + + /* Unlock this cell. */ + if (lock_unlock(&c->glock) != 0) error("Failed to unlock cell."); + + /* Admit defeat. */ + TIMER_TOC(timer_locktree); + return 1; + } + + /* Climb up the tree and lock/hold/unlock. */ + for (finger = c->parent; finger != NULL; finger = finger->parent) { + + /* Lock this cell. 
*/ + if (lock_trylock(&finger->glock) != 0) break; + + /* Increment the hold. */ + __sync_fetch_and_add(&finger->ghold, 1); + + /* Unlock the cell. */ + if (lock_unlock(&finger->glock) != 0) error("Failed to unlock cell."); + } + + /* If we reached the top of the tree, we're done. */ + if (finger == NULL) { + TIMER_TOC(timer_locktree); + return 0; + } + + /* Otherwise, we hit a snag. */ + else { + + /* Undo the holds up to finger. */ + for (finger2 = c->parent; finger2 != finger; finger2 = finger2->parent) + __sync_fetch_and_sub(&finger2->ghold, 1); + + /* Unlock this cell. */ + if (lock_unlock(&c->glock) != 0) error("Failed to unlock cell."); + + /* Admit defeat. */ + TIMER_TOC(timer_locktree); + return 1; + } +} - } - - /** * @brief Unock a cell's parents. * * @param c The #cell. */ - -void cell_unlocktree( struct cell *c ) { - - struct cell *finger; - TIMER_TIC - - /* First of all, try to unlock this cell. */ - if ( lock_unlock( &c->lock ) != 0 ) - error( "Failed to unlock cell." ); - - /* Climb up the tree and unhold the parents. */ - for ( finger = c->parent ; finger != NULL ; finger = finger->parent ) - __sync_fetch_and_sub( &finger->hold , 1 ); - - TIMER_TOC(timer_locktree); - - } - - -void cell_gunlocktree( struct cell *c ) { - - struct cell *finger; - TIMER_TIC - - /* First of all, try to unlock this cell. */ - if ( lock_unlock( &c->glock ) != 0 ) - error( "Failed to unlock cell." ); - - /* Climb up the tree and unhold the parents. */ - for ( finger = c->parent ; finger != NULL ; finger = finger->parent ) - __sync_fetch_and_sub( &finger->ghold , 1 ); - - TIMER_TOC(timer_locktree); - - } - - + +void cell_unlocktree(struct cell *c) { + + struct cell *finger; + TIMER_TIC + + /* First of all, try to unlock this cell. */ + if (lock_unlock(&c->lock) != 0) error("Failed to unlock cell."); + + /* Climb up the tree and unhold the parents. 
*/ + for (finger = c->parent; finger != NULL; finger = finger->parent) + __sync_fetch_and_sub(&finger->hold, 1); + + TIMER_TOC(timer_locktree); +} + +void cell_gunlocktree(struct cell *c) { + + struct cell *finger; + TIMER_TIC + + /* First of all, try to unlock this cell. */ + if (lock_unlock(&c->glock) != 0) error("Failed to unlock cell."); + + /* Climb up the tree and unhold the parents. */ + for (finger = c->parent; finger != NULL; finger = finger->parent) + __sync_fetch_and_sub(&finger->ghold, 1); + + TIMER_TOC(timer_locktree); +} + /** * @brief Sort the parts into eight bins along the given pivots. * * @param c The #cell array to be sorted. */ - -void cell_split ( struct cell *c ) { - - int i, j, k, count = c->count, gcount = c->gcount; - struct part temp, *parts = c->parts; - struct xpart xtemp, *xparts = c->xparts; - struct gpart gtemp, *gparts = c->gparts; - int left[8], right[8]; - double pivot[3]; - - /* Init the pivots. */ - for ( k = 0 ; k < 3 ; k++ ) - pivot[k] = c->loc[k] + c->h[k]/2; - - /* Split along the x-axis. */ - i = 0; j = count - 1; - while ( i <= j ) { - while ( i <= count-1 && parts[i].x[0] <= pivot[0] ) - i += 1; - while ( j >= 0 && parts[j].x[0] > pivot[0] ) - j -= 1; - if ( i < j ) { - temp = parts[i]; parts[i] = parts[j]; parts[j] = temp; - xtemp = xparts[i]; xparts[i] = xparts[j]; xparts[j] = xtemp; - } - } - /* for ( k = 0 ; k <= j ; k++ ) - if ( parts[k].x[0] > pivot[0] ) - error( "cell_split: sorting failed." ); - for ( k = i ; k < count ; k++ ) - if ( parts[k].x[0] < pivot[0] ) - error( "cell_split: sorting failed." ); */ - left[1] = i; right[1] = count - 1; - left[0] = 0; right[0] = j; - - /* Split along the y axis, twice. 
*/ - for ( k = 1 ; k >= 0 ; k-- ) { - i = left[k]; j = right[k]; - while ( i <= j ) { - while ( i <= right[k] && parts[i].x[1] <= pivot[1] ) - i += 1; - while ( j >= left[k] && parts[j].x[1] > pivot[1] ) - j -= 1; - if ( i < j ) { - temp = parts[i]; parts[i] = parts[j]; parts[j] = temp; - xtemp = xparts[i]; xparts[i] = xparts[j]; xparts[j] = xtemp; - } - } - /* for ( int kk = left[k] ; kk <= j ; kk++ ) - if ( parts[kk].x[1] > pivot[1] ) { - message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j ); - error( "sorting failed (left)." ); - } - for ( int kk = i ; kk <= right[k] ; kk++ ) - if ( parts[kk].x[1] < pivot[1] ) - error( "sorting failed (right)." ); */ - left[2*k+1] = i; right[2*k+1] = right[k]; - left[2*k] = left[k]; right[2*k] = j; - } - - /* Split along the z axis, four times. */ - for ( k = 3 ; k >= 0 ; k-- ) { - i = left[k]; j = right[k]; - while ( i <= j ) { - while ( i <= right[k] && parts[i].x[2] <= pivot[2] ) - i += 1; - while ( j >= left[k] && parts[j].x[2] > pivot[2] ) - j -= 1; - if ( i < j ) { - temp = parts[i]; parts[i] = parts[j]; parts[j] = temp; - xtemp = xparts[i]; xparts[i] = xparts[j]; xparts[j] = xtemp; - } - } - /* for ( int kk = left[k] ; kk <= j ; kk++ ) - if ( parts[kk].x[2] > pivot[2] ) { - message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j ); - error( "sorting failed (left)." ); - } - for ( int kk = i ; kk <= right[k] ; kk++ ) - if ( parts[kk].x[2] < pivot[2] ) { - message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j ); - error( "sorting failed (right)." ); - } */ - left[2*k+1] = i; right[2*k+1] = right[k]; - left[2*k] = left[k]; right[2*k] = j; - } - - /* Store the counts and offsets. */ - for ( k = 0 ; k < 8 ; k++ ) { - c->progeny[k]->count = right[k] - left[k] + 1; - c->progeny[k]->parts = &c->parts[ left[k] ]; - c->progeny[k]->xparts = &c->xparts[ left[k] ]; - } - - /* Re-link the gparts. 
*/ - for ( k = 0 ; k < count ; k++ ) - if ( parts[k].gpart != NULL ) - parts[k].gpart->part = &parts[k]; - - /* Verify that _all_ the parts have been assigned to a cell. */ - /* for ( k = 1 ; k < 8 ; k++ ) - if ( &c->progeny[k-1]->parts[ c->progeny[k-1]->count ] != c->progeny[k]->parts ) - error( "Particle sorting failed (internal consistency)." ); - if ( c->progeny[0]->parts != c->parts ) - error( "Particle sorting failed (left edge)." ); - if ( &c->progeny[7]->parts[ c->progeny[7]->count ] != &c->parts[ count ] ) - error( "Particle sorting failed (right edge)." ); */ - - /* Verify a few sub-cells. */ - /* for ( k = 0 ; k < c->progeny[0]->count ; k++ ) - if ( c->progeny[0]->parts[k].x[0] > pivot[0] || - c->progeny[0]->parts[k].x[1] > pivot[1] || - c->progeny[0]->parts[k].x[2] > pivot[2] ) - error( "Sorting failed (progeny=0)." ); - for ( k = 0 ; k < c->progeny[1]->count ; k++ ) - if ( c->progeny[1]->parts[k].x[0] > pivot[0] || - c->progeny[1]->parts[k].x[1] > pivot[1] || - c->progeny[1]->parts[k].x[2] <= pivot[2] ) - error( "Sorting failed (progeny=1)." ); - for ( k = 0 ; k < c->progeny[2]->count ; k++ ) - if ( c->progeny[2]->parts[k].x[0] > pivot[0] || - c->progeny[2]->parts[k].x[1] <= pivot[1] || - c->progeny[2]->parts[k].x[2] > pivot[2] ) - error( "Sorting failed (progeny=2)." ); */ - - /* Now do the same song and dance for the gparts. */ - - /* Split along the x-axis. */ - i = 0; j = gcount - 1; - while ( i <= j ) { - while ( i <= gcount-1 && gparts[i].x[0] <= pivot[0] ) - i += 1; - while ( j >= 0 && gparts[j].x[0] > pivot[0] ) - j -= 1; - if ( i < j ) { - gtemp = gparts[i]; gparts[i] = gparts[j]; gparts[j] = gtemp; - } - } - left[1] = i; right[1] = gcount - 1; - left[0] = 0; right[0] = j; - - /* Split along the y axis, twice. 
*/ - for ( k = 1 ; k >= 0 ; k-- ) { - i = left[k]; j = right[k]; - while ( i <= j ) { - while ( i <= right[k] && gparts[i].x[1] <= pivot[1] ) - i += 1; - while ( j >= left[k] && gparts[j].x[1] > pivot[1] ) - j -= 1; - if ( i < j ) { - gtemp = gparts[i]; gparts[i] = gparts[j]; gparts[j] = gtemp; - } + +void cell_split(struct cell *c) { + + int i, j, k, count = c->count, gcount = c->gcount; + struct part temp, *parts = c->parts; + struct xpart xtemp, *xparts = c->xparts; + struct gpart gtemp, *gparts = c->gparts; + int left[8], right[8]; + double pivot[3]; + + /* Init the pivots. */ + for (k = 0; k < 3; k++) pivot[k] = c->loc[k] + c->h[k] / 2; + + /* Split along the x-axis. */ + i = 0; + j = count - 1; + while (i <= j) { + while (i <= count - 1 && parts[i].x[0] <= pivot[0]) i += 1; + while (j >= 0 && parts[j].x[0] > pivot[0]) j -= 1; + if (i < j) { + temp = parts[i]; + parts[i] = parts[j]; + parts[j] = temp; + xtemp = xparts[i]; + xparts[i] = xparts[j]; + xparts[j] = xtemp; + } + } + /* for ( k = 0 ; k <= j ; k++ ) + if ( parts[k].x[0] > pivot[0] ) + error( "cell_split: sorting failed." ); + for ( k = i ; k < count ; k++ ) + if ( parts[k].x[0] < pivot[0] ) + error( "cell_split: sorting failed." ); */ + left[1] = i; + right[1] = count - 1; + left[0] = 0; + right[0] = j; + + /* Split along the y axis, twice. */ + for (k = 1; k >= 0; k--) { + i = left[k]; + j = right[k]; + while (i <= j) { + while (i <= right[k] && parts[i].x[1] <= pivot[1]) i += 1; + while (j >= left[k] && parts[j].x[1] > pivot[1]) j -= 1; + if (i < j) { + temp = parts[i]; + parts[i] = parts[j]; + parts[j] = temp; + xtemp = xparts[i]; + xparts[i] = xparts[j]; + xparts[j] = xtemp; + } + } + /* for ( int kk = left[k] ; kk <= j ; kk++ ) + if ( parts[kk].x[1] > pivot[1] ) { + message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j ); + error( "sorting failed (left)." 
); } - left[2*k+1] = i; right[2*k+1] = right[k]; - left[2*k] = left[k]; right[2*k] = j; - } - - /* Split along the z axis, four times. */ - for ( k = 3 ; k >= 0 ; k-- ) { - i = left[k]; j = right[k]; - while ( i <= j ) { - while ( i <= right[k] && gparts[i].x[2] <= pivot[2] ) - i += 1; - while ( j >= left[k] && gparts[j].x[2] > pivot[2] ) - j -= 1; - if ( i < j ) { - gtemp = gparts[i]; gparts[i] = gparts[j]; gparts[j] = gtemp; - } + for ( int kk = i ; kk <= right[k] ; kk++ ) + if ( parts[kk].x[1] < pivot[1] ) + error( "sorting failed (right)." ); */ + left[2 * k + 1] = i; + right[2 * k + 1] = right[k]; + left[2 * k] = left[k]; + right[2 * k] = j; + } + + /* Split along the z axis, four times. */ + for (k = 3; k >= 0; k--) { + i = left[k]; + j = right[k]; + while (i <= j) { + while (i <= right[k] && parts[i].x[2] <= pivot[2]) i += 1; + while (j >= left[k] && parts[j].x[2] > pivot[2]) j -= 1; + if (i < j) { + temp = parts[i]; + parts[i] = parts[j]; + parts[j] = temp; + xtemp = xparts[i]; + xparts[i] = xparts[j]; + xparts[j] = xtemp; + } + } + /* for ( int kk = left[k] ; kk <= j ; kk++ ) + if ( parts[kk].x[2] > pivot[2] ) { + message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j ); + error( "sorting failed (left)." ); } - left[2*k+1] = i; right[2*k+1] = right[k]; - left[2*k] = left[k]; right[2*k] = j; - } - - /* Store the counts and offsets. */ - for ( k = 0 ; k < 8 ; k++ ) { - c->progeny[k]->gcount = right[k] - left[k] + 1; - c->progeny[k]->gparts = &c->gparts[ left[k] ]; - } - - /* Re-link the parts. */ - for ( k = 0 ; k < gcount ; k++ ) - if ( gparts[k].id > 0 ) - gparts[k].part->gpart = &gparts[k]; - + for ( int kk = i ; kk <= right[k] ; kk++ ) + if ( parts[kk].x[2] < pivot[2] ) { + message( "ival=[%i,%i], i=%i, j=%i." , left[k] , right[k] , i , j ); + error( "sorting failed (right)." ); + } */ + left[2 * k + 1] = i; + right[2 * k + 1] = right[k]; + left[2 * k] = left[k]; + right[2 * k] = j; + } + + /* Store the counts and offsets. 
*/ + for (k = 0; k < 8; k++) { + c->progeny[k]->count = right[k] - left[k] + 1; + c->progeny[k]->parts = &c->parts[left[k]]; + c->progeny[k]->xparts = &c->xparts[left[k]]; + } + + /* Re-link the gparts. */ + for (k = 0; k < count; k++) + if (parts[k].gpart != NULL) parts[k].gpart->part = &parts[k]; + + /* Verify that _all_ the parts have been assigned to a cell. */ + /* for ( k = 1 ; k < 8 ; k++ ) + if ( &c->progeny[k-1]->parts[ c->progeny[k-1]->count ] != + c->progeny[k]->parts ) + error( "Particle sorting failed (internal consistency)." ); + if ( c->progeny[0]->parts != c->parts ) + error( "Particle sorting failed (left edge)." ); + if ( &c->progeny[7]->parts[ c->progeny[7]->count ] != &c->parts[ count ] ) + error( "Particle sorting failed (right edge)." ); */ + + /* Verify a few sub-cells. */ + /* for ( k = 0 ; k < c->progeny[0]->count ; k++ ) + if ( c->progeny[0]->parts[k].x[0] > pivot[0] || + c->progeny[0]->parts[k].x[1] > pivot[1] || + c->progeny[0]->parts[k].x[2] > pivot[2] ) + error( "Sorting failed (progeny=0)." ); + for ( k = 0 ; k < c->progeny[1]->count ; k++ ) + if ( c->progeny[1]->parts[k].x[0] > pivot[0] || + c->progeny[1]->parts[k].x[1] > pivot[1] || + c->progeny[1]->parts[k].x[2] <= pivot[2] ) + error( "Sorting failed (progeny=1)." ); + for ( k = 0 ; k < c->progeny[2]->count ; k++ ) + if ( c->progeny[2]->parts[k].x[0] > pivot[0] || + c->progeny[2]->parts[k].x[1] <= pivot[1] || + c->progeny[2]->parts[k].x[2] > pivot[2] ) + error( "Sorting failed (progeny=2)." ); */ + + /* Now do the same song and dance for the gparts. */ + + /* Split along the x-axis. */ + i = 0; + j = gcount - 1; + while (i <= j) { + while (i <= gcount - 1 && gparts[i].x[0] <= pivot[0]) i += 1; + while (j >= 0 && gparts[j].x[0] > pivot[0]) j -= 1; + if (i < j) { + gtemp = gparts[i]; + gparts[i] = gparts[j]; + gparts[j] = gtemp; } - - + } + left[1] = i; + right[1] = gcount - 1; + left[0] = 0; + right[0] = j; + + /* Split along the y axis, twice. 
*/ + for (k = 1; k >= 0; k--) { + i = left[k]; + j = right[k]; + while (i <= j) { + while (i <= right[k] && gparts[i].x[1] <= pivot[1]) i += 1; + while (j >= left[k] && gparts[j].x[1] > pivot[1]) j -= 1; + if (i < j) { + gtemp = gparts[i]; + gparts[i] = gparts[j]; + gparts[j] = gtemp; + } + } + left[2 * k + 1] = i; + right[2 * k + 1] = right[k]; + left[2 * k] = left[k]; + right[2 * k] = j; + } + + /* Split along the z axis, four times. */ + for (k = 3; k >= 0; k--) { + i = left[k]; + j = right[k]; + while (i <= j) { + while (i <= right[k] && gparts[i].x[2] <= pivot[2]) i += 1; + while (j >= left[k] && gparts[j].x[2] > pivot[2]) j -= 1; + if (i < j) { + gtemp = gparts[i]; + gparts[i] = gparts[j]; + gparts[j] = gtemp; + } + } + left[2 * k + 1] = i; + right[2 * k + 1] = right[k]; + left[2 * k] = left[k]; + right[2 * k] = j; + } + + /* Store the counts and offsets. */ + for (k = 0; k < 8; k++) { + c->progeny[k]->gcount = right[k] - left[k] + 1; + c->progeny[k]->gparts = &c->gparts[left[k]]; + } + + /* Re-link the parts. */ + for (k = 0; k < gcount; k++) + if (gparts[k].id > 0) gparts[k].part->gpart = &gparts[k]; +} diff --git a/src/cell.h b/src/cell.h index 43dedefbb6c079b726ed1cbb5d4cfe0b39e368a2..7a5353bcae8cbb7ab4d50a546665a1a774a46aea 100644 --- a/src/cell.h +++ b/src/cell.h @@ -1,165 +1,173 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_CELL_H +#define SWIFT_CELL_H -/* Some constants. */ -#define cell_sid_dt 13 +/* Includes. */ +#include "lock.h" +#include "multipole.h" +#include "part.h" + +/* Forward declaration of space, needed for cell_unpack. */ +struct space; +/* Some constants. */ +#define cell_sid_dt 13 +#define cell_max_tag (1 << 16) /* Global variables. */ extern int cell_next_tag; - /* Packed cell. */ struct pcell { - /* Stats on this cell's particles. */ - double h_max, dt_min, dt_max; - - /* Number of particles in this cell. */ - int count; - - /* tag used for MPI communication. */ - int tag; + /* Stats on this cell's particles. */ + double h_max, dt_min, dt_max; + + /* Number of particles in this cell. */ + int count; - /* Relative indices of the cell's progeny. */ - int progeny[8]; - - }; + /* tag used for MPI communication. */ + int tag; + /* Relative indices of the cell's progeny. */ + int progeny[8]; +}; /* Structure to store the data of a single cell. */ struct cell { - /* The cell location on the grid. */ - double loc[3]; - - /* The cell dimensions. */ - double h[3]; - - /* Max radii in this cell. */ - double h_max; - - /* Minimum and maximum dt in this cell. */ - double dt_min, dt_max; - - /* Minimum dimension, i.e. smallest edge of this cell. */ - float dmin; - - /* Maximum slack allowed for particle movement. */ - float slack; - - /* Maximum particle movement in this cell. */ - float dx_max; - - /* The depth of this cell in the tree. */ - int depth, split, maxdepth; - - /* Nr of parts. */ - int count, gcount; - - /* Pointers to the particle data. */ - struct part *parts; - - /* Pointers to the extra particle data. 
*/ - struct xpart *xparts; - - /* Pointers to the gravity particle data. */ - struct gpart *gparts; - - /* Pointers for the sorted indices. */ - struct entry *sort, *gsort; - unsigned int sorted, gsorted; - - /* Pointers to the next level of cells. */ - struct cell *progeny[8]; - - /* Parent cell. */ - struct cell *parent; - - /* Super cell, i.e. the highest-level supercell that has interactions. */ - struct cell *super; - - /* The task computing this cell's sorts. */ - struct task *sorts, *gsorts; - int sortsize, gsortsize; - - /* The tasks computing this cell's density. */ - struct link *density, *force, *grav; - int nr_density, nr_force, nr_grav; - - /* The ghost task to link density to interactions. */ - struct task *ghost, *kick1, *kick2; - - /* Task receiving data. */ - struct task *recv_xv, *recv_rho; - - /* Tasks for gravity tree. */ - struct task *grav_up, *grav_down; - - /* Number of tasks that are associated with this cell. */ - int nr_tasks; - - /* Is the data of this cell being used in a sub-cell? */ - int hold, ghold; - - /* Spin lock for various uses. */ - lock_type lock, glock; - - /* ID of the previous owner, e.g. runner. */ - int owner; - - /* Momentum of particles in cell. */ - float mom[3], ang[3]; - - /* Potential and kinetic energy of particles in this cell. */ - double epot, ekin; - - /* Number of particles updated in this cell. */ - int updated; - - /* Linking pointer for "memory management". */ - struct cell *next; - - /* ID of the node this cell lives on. */ - int nodeID; - - /* Bit mask of the proxies this cell is registered with. */ - unsigned long long int sendto; - - /* Pointer to this cell's packed representation. */ - struct pcell *pcell; - int pcell_size; - int tag; - - /* This cell's multipole. */ - struct multipole multipole; - - } __attribute__((aligned (64))); + /* The cell location on the grid. */ + double loc[3]; + + /* The cell dimensions. */ + double h[3]; + + /* Max radii in this cell. 
*/ + double h_max; + + /* Minimum and maximum dt in this cell. */ + double dt_min, dt_max; + + /* Minimum dimension, i.e. smallest edge of this cell. */ + float dmin; + + /* Maximum slack allowed for particle movement. */ + float slack; + + /* Maximum particle movement in this cell. */ + float dx_max; + + /* The depth of this cell in the tree. */ + int depth, split, maxdepth; + + /* Nr of parts. */ + int count, gcount; + + /* Pointers to the particle data. */ + struct part *parts; + + /* Pointers to the extra particle data. */ + struct xpart *xparts; + + /* Pointers to the gravity particle data. */ + struct gpart *gparts; + + /* Pointers for the sorted indices. */ + struct entry *sort, *gsort; + unsigned int sorted, gsorted; + + /* Pointers to the next level of cells. */ + struct cell *progeny[8]; + + /* Parent cell. */ + struct cell *parent; + /* Super cell, i.e. the highest-level supercell that has interactions. */ + struct cell *super; + + /* The task computing this cell's sorts. */ + struct task *sorts, *gsorts; + int sortsize, gsortsize; + + /* The tasks computing this cell's density. */ + struct link *density, *force, *grav; + int nr_density, nr_force, nr_grav; + + /* The ghost task to link density to interactions. */ + struct task *ghost, *kick1, *kick2; + + /* Task receiving data. */ + struct task *recv_xv, *recv_rho; + + /* Tasks for gravity tree. */ + struct task *grav_up, *grav_down; + + /* Number of tasks that are associated with this cell. */ + int nr_tasks; + + /* Is the data of this cell being used in a sub-cell? */ + int hold, ghold; + + /* Spin lock for various uses. */ + lock_type lock, glock; + + /* ID of the previous owner, e.g. runner. */ + int owner; + + /* Momentum of particles in cell. */ + float mom[3], ang[3]; + + /* Potential and kinetic energy of particles in this cell. */ + double epot, ekin; + + /* Number of particles updated in this cell. */ + int updated; + + /* Linking pointer for "memory management". 
*/ + struct cell *next; + + /* ID of the node this cell lives on. */ + int nodeID; + + /* Bit mask of the proxies this cell is registered with. */ + unsigned long long int sendto; + + /* Pointer to this cell's packed representation. */ + struct pcell *pcell; + int pcell_size; + int tag; + + /* This cell's multipole. */ + struct multipole multipole; + +} __attribute__((aligned(64))); /* Function prototypes. */ -void cell_split ( struct cell *c ); -int cell_locktree( struct cell *c ); -void cell_unlocktree( struct cell *c ); -int cell_glocktree( struct cell *c ); -void cell_gunlocktree( struct cell *c ); -int cell_pack ( struct cell *c , struct pcell *pc ); -int cell_unpack ( struct pcell *pc , struct cell *c , struct space *s ); -int cell_getsize ( struct cell *c ); -int cell_link ( struct cell *c , struct part *parts ); +void cell_split(struct cell *c); +int cell_locktree(struct cell *c); +void cell_unlocktree(struct cell *c); +int cell_glocktree(struct cell *c); +void cell_gunlocktree(struct cell *c); +int cell_pack(struct cell *c, struct pcell *pc); +int cell_unpack(struct pcell *pc, struct cell *c, struct space *s); +int cell_getsize(struct cell *c); +int cell_link(struct cell *c, struct part *parts); + +#endif /* SWIFT_CELL_H */ diff --git a/src/common_io.c b/src/common_io.c index e2e29a596701281fb307ab256721d7809d2c1419..64194e5829658a0c3f0f1aa7d9951d73f3eed377 100644 --- a/src/common_io.c +++ b/src/common_io.c @@ -2,98 +2,110 @@ * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk), * Matthieu Schaller (matthieu.schaller@durham.ac.uk). - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
- * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ /* Config parameters. */ #include "../config.h" - #if defined(HAVE_HDF5) /* Some standard headers. */ +#include <hdf5.h> +#include <math.h> +#include <stddef.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <stddef.h> -#include <hdf5.h> -#include <math.h> + +/* MPI headers. */ #ifdef WITH_MPI #include <mpi.h> #endif +/* This object's header. */ +#include "common_io.h" + +/* Local includes. */ #include "const.h" -#include "cycle.h" -#include "lock.h" -#include "task.h" -#include "part.h" -#include "space.h" -#include "scheduler.h" -#include "engine.h" #include "error.h" #include "kernel.h" -#include "common_io.h" - - /** - * @brief Converts a C data type to the HDF5 equivalent. + * @brief Converts a C data type to the HDF5 equivalent. * * This function is a trivial wrapper around the HDF5 types but allows - * to change the exact storage types matching the code types in a transparent way. + * to change the exact storage types matching the code types in a transparent + *way. 
*/ -hid_t hdf5Type(enum DATA_TYPE type) -{ - switch(type) - { - case INT: return H5T_NATIVE_INT; - case UINT: return H5T_NATIVE_UINT; - case LONG: return H5T_NATIVE_LONG; - case ULONG: return H5T_NATIVE_ULONG; - case LONGLONG: return H5T_NATIVE_LLONG; - case ULONGLONG: return H5T_NATIVE_ULLONG; - case FLOAT: return H5T_NATIVE_FLOAT; - case DOUBLE: return H5T_NATIVE_DOUBLE; - case CHAR: return H5T_C_S1; - default: error("Unknown type"); return 0; - } +hid_t hdf5Type(enum DATA_TYPE type) { + switch (type) { + case INT: + return H5T_NATIVE_INT; + case UINT: + return H5T_NATIVE_UINT; + case LONG: + return H5T_NATIVE_LONG; + case ULONG: + return H5T_NATIVE_ULONG; + case LONGLONG: + return H5T_NATIVE_LLONG; + case ULONGLONG: + return H5T_NATIVE_ULLONG; + case FLOAT: + return H5T_NATIVE_FLOAT; + case DOUBLE: + return H5T_NATIVE_DOUBLE; + case CHAR: + return H5T_C_S1; + default: + error("Unknown type"); + return 0; + } } /** * @brief Returns the memory size of the data type */ -size_t sizeOfType(enum DATA_TYPE type) -{ - switch(type) - { - case INT: return sizeof(int); - case UINT: return sizeof(unsigned int); - case LONG: return sizeof(long); - case ULONG: return sizeof(unsigned long); - case LONGLONG: return sizeof(long long); - case ULONGLONG: return sizeof(unsigned long long); - case FLOAT: return sizeof(float); - case DOUBLE: return sizeof(double); - case CHAR: return sizeof(char); - default: error("Unknown type"); return 0; - } +size_t sizeOfType(enum DATA_TYPE type) { + switch (type) { + case INT: + return sizeof(int); + case UINT: + return sizeof(unsigned int); + case LONG: + return sizeof(long); + case ULONG: + return sizeof(unsigned long); + case LONGLONG: + return sizeof(long long); + case ULONGLONG: + return sizeof(unsigned long long); + case FLOAT: + return sizeof(float); + case DOUBLE: + return sizeof(double); + case CHAR: + return sizeof(char); + default: + error("Unknown type"); + return 0; + } } - - /** * @brief Reads an attribute from a given HDF5 group. 
* @@ -104,21 +116,18 @@ size_t sizeOfType(enum DATA_TYPE type) * * Calls #error() if an error occurs. */ -void readAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data) -{ - hid_t h_attr=0, h_err=0; +void readAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data) { + hid_t h_attr = 0, h_err = 0; h_attr = H5Aopen(grp, name, H5P_DEFAULT); - if(h_attr < 0) - { - error( "Error while opening attribute '%s'" , name ); - } + if (h_attr < 0) { + error("Error while opening attribute '%s'", name); + } h_err = H5Aread(h_attr, hdf5Type(type), data); - if(h_err < 0) - { - error( "Error while reading attribute '%s'" , name ); - } + if (h_err < 0) { + error("Error while reading attribute '%s'", name); + } H5Aclose(h_attr); } @@ -134,34 +143,30 @@ void readAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data) * * Calls #error() if an error occurs. */ -void writeAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data, int num) -{ - hid_t h_space=0, h_attr=0, h_err=0; - hsize_t dim[1]={num}; +void writeAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data, + int num) { + hid_t h_space = 0, h_attr = 0, h_err = 0; + hsize_t dim[1] = {num}; h_space = H5Screate(H5S_SIMPLE); - if(h_space < 0) - { - error( "Error while creating dataspace for attribute '%s'." , name ); - } + if (h_space < 0) { + error("Error while creating dataspace for attribute '%s'.", name); + } h_err = H5Sset_extent_simple(h_space, 1, dim, NULL); - if(h_err < 0) - { - error( "Error while changing dataspace shape for attribute '%s'." 
, name ); - } + if (h_err < 0) { + error("Error while changing dataspace shape for attribute '%s'.", name); + } h_attr = H5Acreate1(grp, name, hdf5Type(type), h_space, H5P_DEFAULT); - if(h_attr < 0) - { - error( "Error while creating attribute '%s'.", name ); - } + if (h_attr < 0) { + error("Error while creating attribute '%s'.", name); + } h_err = H5Awrite(h_attr, hdf5Type(type), data); - if(h_err < 0) - { - error( "Error while reading attribute '%s'." , name ); - } + if (h_err < 0) { + error("Error while reading attribute '%s'.", name); + } H5Sclose(h_space); H5Aclose(h_attr); @@ -177,39 +182,33 @@ void writeAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data, int * * Calls #error() if an error occurs. */ -void writeStringAttribute(hid_t grp, char* name, char* str, int length) -{ - hid_t h_space=0, h_attr=0, h_err=0, h_type=0; +void writeStringAttribute(hid_t grp, char* name, char* str, int length) { + hid_t h_space = 0, h_attr = 0, h_err = 0, h_type = 0; h_space = H5Screate(H5S_SCALAR); - if(h_space < 0) - { - error( "Error while creating dataspace for attribute '%s'." , name ); - } + if (h_space < 0) { + error("Error while creating dataspace for attribute '%s'.", name); + } h_type = H5Tcopy(H5T_C_S1); - if(h_type < 0) - { - error( "Error while copying datatype 'H5T_C_S1'." ); - } + if (h_type < 0) { + error("Error while copying datatype 'H5T_C_S1'."); + } h_err = H5Tset_size(h_type, length); - if(h_err < 0) - { - error( "Error while resizing attribute tyep to '%i'." , length ); - } + if (h_err < 0) { + error("Error while resizing attribute tyep to '%i'.", length); + } h_attr = H5Acreate1(grp, name, h_type, h_space, H5P_DEFAULT); - if(h_attr < 0) - { - error( "Error while creating attribute '%s'." , name ); - } + if (h_attr < 0) { + error("Error while creating attribute '%s'.", name); + } - h_err = H5Awrite(h_attr, h_type, str ); - if(h_err < 0) - { - error( "Error while reading attribute '%s'." 
, name ); - } + h_err = H5Awrite(h_attr, h_type, str); + if (h_err < 0) { + error("Error while reading attribute '%s'.", name); + } H5Tclose(h_type); H5Sclose(h_space); @@ -222,8 +221,7 @@ void writeStringAttribute(hid_t grp, char* name, char* str, int length) * @param name The name of the attribute * @param data The value to write */ -void writeAttribute_d(hid_t grp, char* name, double data) -{ +void writeAttribute_d(hid_t grp, char* name, double data) { writeAttribute(grp, name, DOUBLE, &data, 1); } @@ -233,8 +231,7 @@ void writeAttribute_d(hid_t grp, char* name, double data) * @param name The name of the attribute * @param data The value to write */ -void writeAttribute_f(hid_t grp, char* name, float data) -{ +void writeAttribute_f(hid_t grp, char* name, float data) { writeAttribute(grp, name, FLOAT, &data, 1); } @@ -245,8 +242,7 @@ void writeAttribute_f(hid_t grp, char* name, float data) * @param data The value to write */ -void writeAttribute_i(hid_t grp, char* name, int data) -{ +void writeAttribute_i(hid_t grp, char* name, int data) { writeAttribute(grp, name, INT, &data, 1); } @@ -256,8 +252,7 @@ void writeAttribute_i(hid_t grp, char* name, int data) * @param name The name of the attribute * @param data The value to write */ -void writeAttribute_l(hid_t grp, char* name, long data) -{ +void writeAttribute_l(hid_t grp, char* name, long data) { writeAttribute(grp, name, LONG, &data, 1); } @@ -267,26 +262,24 @@ void writeAttribute_l(hid_t grp, char* name, long data) * @param name The name of the attribute * @param str The string to write */ -void writeAttribute_s(hid_t grp, char* name, char* str) -{ +void writeAttribute_s(hid_t grp, char* name, char* str) { writeStringAttribute(grp, name, str, strlen(str)); } - -/* ------------------------------------------------------------------------------------------------ - * This part writes the XMF file descriptor enabling a visualisation through ParaView - * 
------------------------------------------------------------------------------------------------ */ +/* ------------------------------------------------------------------------------------------------ + * This part writes the XMF file descriptor enabling a visualisation through + * ParaView + * ------------------------------------------------------------------------------------------------ + */ /** * @brief Writes the current model of SPH to the file * @param h_file The (opened) HDF5 file in which to write */ -void writeSPHflavour(hid_t h_file) -{ - hid_t h_grpsph=0; +void writeSPHflavour(hid_t h_file) { + hid_t h_grpsph = 0; h_grpsph = H5Gcreate1(h_file, "/SPH", 0); - if(h_grpsph < 0) - error("Error while creating SPH group"); + if (h_grpsph < 0) error("Error while creating SPH group"); writeAttribute_f(h_grpsph, "Kernel eta", const_eta_kernel); writeAttribute_f(h_grpsph, "Weighted N_ngb", kernel_nwneigh); @@ -294,24 +287,33 @@ void writeSPHflavour(hid_t h_file) writeAttribute_f(h_grpsph, "Hydro gamma", const_hydro_gamma); #ifdef LEGACY_GADGET2_SPH - writeAttribute_s(h_grpsph, "Thermal Conductivity Model", "(No treatment) Legacy Gadget-2 as in Springel (2005)"); - writeAttribute_s(h_grpsph, "Viscosity Model", "Legacy Gadget-2 as in Springel (2005)"); - writeAttribute_f(h_grpsph, "Viscosity alpha", const_viscosity_alpha); - writeAttribute_f(h_grpsph, "Viscosity beta", 3.f); + writeAttribute_s(h_grpsph, "Thermal Conductivity Model", + "(No treatment) Legacy Gadget-2 as in Springel (2005)"); + writeAttribute_s(h_grpsph, "Viscosity Model", + "Legacy Gadget-2 as in Springel (2005)"); + writeAttribute_f(h_grpsph, "Viscosity alpha", const_viscosity_alpha); + writeAttribute_f(h_grpsph, "Viscosity beta", 3.f); #else - writeAttribute_s(h_grpsph, "Thermal Conductivity Model", "Price (2008) without switch"); - writeAttribute_f(h_grpsph, "Thermal Conductivity alpha", const_conductivity_alpha); - writeAttribute_s(h_grpsph, "Viscosity Model", "Morris & Monaghan (1997), Rosswog, 
Davies, Thielemann & Piran (2000) with additional Balsara (1995) switch"); - writeAttribute_f(h_grpsph, "Viscosity alpha_min", const_viscosity_alpha_min); - writeAttribute_f(h_grpsph, "Viscosity alpha_max", const_viscosity_alpha_max); - writeAttribute_f(h_grpsph, "Viscosity beta", 2.f); - writeAttribute_f(h_grpsph, "Viscosity decay length", const_viscosity_length); + writeAttribute_s(h_grpsph, "Thermal Conductivity Model", + "Price (2008) without switch"); + writeAttribute_f(h_grpsph, "Thermal Conductivity alpha", + const_conductivity_alpha); + writeAttribute_s(h_grpsph, "Viscosity Model", + "Morris & Monaghan (1997), Rosswog, Davies, Thielemann & " + "Piran (2000) with additional Balsara (1995) switch"); + writeAttribute_f(h_grpsph, "Viscosity alpha_min", const_viscosity_alpha_min); + writeAttribute_f(h_grpsph, "Viscosity alpha_max", const_viscosity_alpha_max); + writeAttribute_f(h_grpsph, "Viscosity beta", 2.f); + writeAttribute_f(h_grpsph, "Viscosity decay length", const_viscosity_length); #endif - writeAttribute_f(h_grpsph, "CFL parameter", const_cfl); - writeAttribute_f(h_grpsph, "Maximal ln(Delta h) change over dt", const_ln_max_h_change); - writeAttribute_f(h_grpsph, "Maximal Delta h change over dt", exp(const_ln_max_h_change)); - writeAttribute_f(h_grpsph, "Maximal Delta u change over dt", const_max_u_change); + writeAttribute_f(h_grpsph, "CFL parameter", const_cfl); + writeAttribute_f(h_grpsph, "Maximal ln(Delta h) change over dt", + const_ln_max_h_change); + writeAttribute_f(h_grpsph, "Maximal Delta h change over dt", + exp(const_ln_max_h_change)); + writeAttribute_f(h_grpsph, "Maximal Delta u change over dt", + const_max_u_change); writeAttribute_s(h_grpsph, "Kernel", kernel_name); H5Gclose(h_grpsph); @@ -322,25 +324,26 @@ void writeSPHflavour(hid_t h_file) * @param h_file The (opened) HDF5 file in which to write * @param us The UnitSystem used in the run */ -void writeUnitSystem(hid_t h_file, struct UnitSystem* us) -{ - hid_t h_grpunit=0; +void 
writeUnitSystem(hid_t h_file, struct UnitSystem* us) { + hid_t h_grpunit = 0; h_grpunit = H5Gcreate1(h_file, "/Units", 0); - if(h_grpunit < 0) - error("Error while creating Unit System group"); - - writeAttribute_d(h_grpunit, "Unit mass in cgs (U_M)", getBaseUnit(us, UNIT_MASS)); - writeAttribute_d(h_grpunit, "Unit length in cgs (U_L)", getBaseUnit(us, UNIT_LENGTH)); - writeAttribute_d(h_grpunit, "Unit time in cgs (U_t)", getBaseUnit(us, UNIT_TIME)); - writeAttribute_d(h_grpunit, "Unit current in cgs (U_I)", getBaseUnit(us, UNIT_CURRENT)); - writeAttribute_d(h_grpunit, "Unit temperature in cgs (U_T)", getBaseUnit(us, UNIT_TEMPERATURE)); + if (h_grpunit < 0) error("Error while creating Unit System group"); + + writeAttribute_d(h_grpunit, "Unit mass in cgs (U_M)", + getBaseUnit(us, UNIT_MASS)); + writeAttribute_d(h_grpunit, "Unit length in cgs (U_L)", + getBaseUnit(us, UNIT_LENGTH)); + writeAttribute_d(h_grpunit, "Unit time in cgs (U_t)", + getBaseUnit(us, UNIT_TIME)); + writeAttribute_d(h_grpunit, "Unit current in cgs (U_I)", + getBaseUnit(us, UNIT_CURRENT)); + writeAttribute_d(h_grpunit, "Unit temperature in cgs (U_T)", + getBaseUnit(us, UNIT_TEMPERATURE)); H5Gclose(h_grpunit); } - - /** * @brief Prepares the XMF file for the new entry * @@ -348,67 +351,63 @@ void writeUnitSystem(hid_t h_file, struct UnitSystem* us) * * @todo Use a proper XML library to avoid stupid copies. 
*/ -FILE* prepareXMFfile() -{ +FILE* prepareXMFfile() { char buffer[1024]; FILE* xmfFile = fopen("output.xmf", "r"); FILE* tempFile = fopen("output_temp.xmf", "w"); - if(xmfFile == NULL) - error("Unable to open current XMF file."); - - if(tempFile == NULL) - error("Unable to open temporary file."); + if (xmfFile == NULL) error("Unable to open current XMF file."); + if (tempFile == NULL) error("Unable to open temporary file."); /* First we make a temporary copy of the XMF file and count the lines */ int counter = 0; - while (fgets(buffer, 1024, xmfFile) != NULL) - { - counter++; - fprintf(tempFile, "%s", buffer); - } + while (fgets(buffer, 1024, xmfFile) != NULL) { + counter++; + fprintf(tempFile, "%s", buffer); + } fclose(tempFile); fclose(xmfFile); - + /* We then copy the XMF file back up to the closing lines */ xmfFile = fopen("output.xmf", "w"); tempFile = fopen("output_temp.xmf", "r"); - if(xmfFile == NULL) - error("Unable to open current XMF file."); + if (xmfFile == NULL) error("Unable to open current XMF file."); - if(tempFile == NULL) - error("Unable to open temporary file."); + if (tempFile == NULL) error("Unable to open temporary file."); int i = 0; - while (fgets(buffer, 1024, tempFile) != NULL && i < counter - 3) - { - i++; - fprintf(xmfFile, "%s", buffer); - } + while (fgets(buffer, 1024, tempFile) != NULL && i < counter - 3) { + i++; + fprintf(xmfFile, "%s", buffer); + } fprintf(xmfFile, "\n"); fclose(tempFile); remove("output_temp.xmf"); - + return xmfFile; } /** * @brief Writes the begin of the XMF file * - * @todo Exploit the XML nature of the XMF format to write a proper XML writer and simplify all the XMF-related stuff. + * @todo Exploit the XML nature of the XMF format to write a proper XML writer + *and simplify all the XMF-related stuff. 
*/ -void createXMFfile() -{ +void createXMFfile() { FILE* xmfFile = fopen("output.xmf", "w"); fprintf(xmfFile, "<?xml version=\"1.0\" ?> \n"); fprintf(xmfFile, "<!DOCTYPE Xdmf SYSTEM \"Xdmf.dtd\" []> \n"); - fprintf(xmfFile, "<Xdmf xmlns:xi=\"http://www.w3.org/2003/XInclude\" Version=\"2.1\">\n"); + fprintf( + xmfFile, + "<Xdmf xmlns:xi=\"http://www.w3.org/2003/XInclude\" Version=\"2.1\">\n"); fprintf(xmfFile, "<Domain>\n"); - fprintf(xmfFile, "<Grid Name=\"TimeSeries\" GridType=\"Collection\" CollectionType=\"Temporal\">\n\n"); + fprintf(xmfFile, + "<Grid Name=\"TimeSeries\" GridType=\"Collection\" " + "CollectionType=\"Temporal\">\n\n"); fprintf(xmfFile, "</Grid>\n"); fprintf(xmfFile, "</Domain>\n"); @@ -417,48 +416,52 @@ void createXMFfile() fclose(xmfFile); } - /** - * @brief Writes the part of the XMF entry presenting the geometry of the snapshot + * @brief Writes the part of the XMF entry presenting the geometry of the + *snapshot * * @param xmfFile The file to write in. * @param Nparts The number of particles. * @param hdfFileName The name of the HDF5 file corresponding to this output. * @param time The current simulation time. 
*/ -void writeXMFheader(FILE* xmfFile, long long Nparts, char* hdfFileName, float time) -{ +void writeXMFheader(FILE* xmfFile, long long Nparts, char* hdfFileName, + float time) { /* Write end of file */ - - fprintf(xmfFile, "<Grid GridType=\"Collection\" CollectionType=\"Spatial\">\n"); + + fprintf(xmfFile, + "<Grid GridType=\"Collection\" CollectionType=\"Spatial\">\n"); fprintf(xmfFile, "<Time Type=\"Single\" Value=\"%f\"/>\n", time); fprintf(xmfFile, "<Grid Name=\"Gas\" GridType=\"Uniform\">\n"); - fprintf(xmfFile, "<Topology TopologyType=\"Polyvertex\" Dimensions=\"%lld\"/>\n", Nparts); + fprintf(xmfFile, + "<Topology TopologyType=\"Polyvertex\" Dimensions=\"%lld\"/>\n", + Nparts); fprintf(xmfFile, "<Geometry GeometryType=\"XYZ\">\n"); - fprintf(xmfFile, "<DataItem Dimensions=\"%lld 3\" NumberType=\"Double\" Precision=\"8\" Format=\"HDF\">%s:/PartType0/Coordinates</DataItem>\n", Nparts, hdfFileName); + fprintf(xmfFile, + "<DataItem Dimensions=\"%lld 3\" NumberType=\"Double\" " + "Precision=\"8\" " + "Format=\"HDF\">%s:/PartType0/Coordinates</DataItem>\n", + Nparts, hdfFileName); fprintf(xmfFile, "</Geometry>"); } - /** * @brief Writes the end of the XMF file (closes all open markups) * * @param xmfFile The file to write in. */ -void writeXMFfooter(FILE* xmfFile) -{ +void writeXMFfooter(FILE* xmfFile) { /* Write end of the section of this time step */ - + fprintf(xmfFile, "\n</Grid>\n"); fprintf(xmfFile, "</Grid>\n"); fprintf(xmfFile, "\n</Grid>\n"); fprintf(xmfFile, "</Domain>\n"); fprintf(xmfFile, "</Xdmf>\n"); - + fclose(xmfFile); } - /** * @brief Writes the lines corresponding to an array of the HDF5 output * @@ -471,15 +474,22 @@ void writeXMFfooter(FILE* xmfFile) * * @todo Treat the types in a better way. */ -void writeXMFline(FILE* xmfFile, char* fileName, char* name, long long N, int dim, enum DATA_TYPE type ) -{ - fprintf(xmfFile, "<Attribute Name=\"%s\" AttributeType=\"%s\" Center=\"Node\">\n", name, dim == 1 ? 
"Scalar": "Vector"); - if(dim == 1) - fprintf(xmfFile, "<DataItem Dimensions=\"%lld\" NumberType=\"Double\" Precision=\"%d\" Format=\"HDF\">%s:/PartType0/%s</DataItem>\n", N, type==FLOAT ? 4:8, fileName, name); +void writeXMFline(FILE* xmfFile, char* fileName, char* name, long long N, + int dim, enum DATA_TYPE type) { + fprintf(xmfFile, + "<Attribute Name=\"%s\" AttributeType=\"%s\" Center=\"Node\">\n", + name, dim == 1 ? "Scalar" : "Vector"); + if (dim == 1) + fprintf(xmfFile, + "<DataItem Dimensions=\"%lld\" NumberType=\"Double\" " + "Precision=\"%d\" Format=\"HDF\">%s:/PartType0/%s</DataItem>\n", + N, type == FLOAT ? 4 : 8, fileName, name); else - fprintf(xmfFile, "<DataItem Dimensions=\"%lld %d\" NumberType=\"Double\" Precision=\"%d\" Format=\"HDF\">%s:/PartType0/%s</DataItem>\n", N, dim, type==FLOAT ? 4:8, fileName, name); + fprintf(xmfFile, + "<DataItem Dimensions=\"%lld %d\" NumberType=\"Double\" " + "Precision=\"%d\" Format=\"HDF\">%s:/PartType0/%s</DataItem>\n", + N, dim, type == FLOAT ? 4 : 8, fileName, name); fprintf(xmfFile, "</Attribute>\n"); } - #endif diff --git a/src/common_io.h b/src/common_io.h index 0c098f597f7acd7f8a084becb1afaadda09c381a..7aacd9ad6e02b5bf2f9b4dd325d52f0af34d2f4f 100644 --- a/src/common_io.h +++ b/src/common_io.h @@ -2,52 +2,66 @@ * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk), * Matthieu Schaller (matthieu.schaller@durham.ac.uk). - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
- * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_COMMON_IO_H +#define SWIFT_COMMON_IO_H /* Config parameters. */ #include "../config.h" +/* Includes. */ #include "units.h" #if defined(HAVE_HDF5) - /** * @brief The different types of data used in the GADGET IC files. * * (This is admittedly a poor substitute to C++ templates...) */ -enum DATA_TYPE{INT, LONG, LONGLONG, UINT, ULONG, ULONGLONG, FLOAT, DOUBLE, CHAR}; +enum DATA_TYPE { + INT, + LONG, + LONGLONG, + UINT, + ULONG, + ULONGLONG, + FLOAT, + DOUBLE, + CHAR +}; /** - * @brief The two sorts of data present in the GADGET IC files: compulsory to start a run or optional. + * @brief The two sorts of data present in the GADGET IC files: compulsory to + *start a run or optional. * */ -enum DATA_IMPORTANCE{COMPULSORY=1, OPTIONAL=0}; - - - +enum DATA_IMPORTANCE { + COMPULSORY = 1, + OPTIONAL = 0 +}; hid_t hdf5Type(enum DATA_TYPE type); size_t sizeOfType(enum DATA_TYPE type); void readAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data); -void writeAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data, int num); +void writeAttribute(hid_t grp, char* name, enum DATA_TYPE type, void* data, + int num); void writeAttribute_d(hid_t grp, char* name, double data); void writeAttribute_f(hid_t grp, char* name, float data); @@ -58,9 +72,9 @@ void writeAttribute_s(hid_t grp, char* name, char* str); void createXMFfile(); FILE* prepareXMFfile(); void writeXMFfooter(FILE* xmfFile); -void writeXMFheader(FILE* xmfFile, long long N, char* hdfFileName, float time); -void writeXMFline(FILE* xmfFile, char* fileName, char* name, long long N, int dim, enum DATA_TYPE type); - +void writeXMFheader(FILE* xmfFile, long long N, char* hdfFileName, float time); +void writeXMFline(FILE* xmfFile, char* fileName, char* name, 
long long N, + int dim, enum DATA_TYPE type); /** * @brief Writes the current model of SPH to the file @@ -74,5 +88,6 @@ void writeSPHflavour(hid_t h_file); */ void writeUnitSystem(hid_t h_file, struct UnitSystem* us); - #endif + +#endif /* SWIFT_COMMON_IO_H */ diff --git a/src/const.h b/src/const.h index e600f7f50b4ce4d2f6943860bef02e702b5022d6..ccccf6fa89884328efb33fcb018e0b17228fceff 100644 --- a/src/const.h +++ b/src/const.h @@ -2,63 +2,74 @@ * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (ptcedro.gonnet@durham.ac.uk) * Matthieu Schaller (matthieu.schaller@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ - +#ifndef SWIFT_CONST_H +#define SWIFT_CONST_H /* Hydrodynamical constants. */ -#define const_hydro_gamma (5.0f/3.0f) +#define const_hydro_gamma (5.0f / 3.0f) /* SPH Viscosity constants. 
*/ -#define const_viscosity_alpha 0.8f /* Used in the legacy gadget-2 SPH mode only */ -#define const_viscosity_alpha_min 0.1f /* Values taken from (Price,2004), not used in legacy gadget mode */ -#define const_viscosity_alpha_max 2.0f /* Values taken from (Price,2004), not used in legacy gadget mode */ -#define const_viscosity_length 0.1f /* Values taken from (Price,2004), not used in legacy gadget mode */ +#define const_viscosity_alpha \ + 0.8f /* Used in the legacy gadget-2 SPH mode only */ +#define const_viscosity_alpha_min \ + 0.1f /* Values taken from (Price,2004), not used in legacy gadget mode */ +#define const_viscosity_alpha_max \ + 2.0f /* Values taken from (Price,2004), not used in legacy gadget mode */ +#define const_viscosity_length \ + 0.1f /* Values taken from (Price,2004), not used in legacy gadget mode */ /* SPH Thermal conductivity constants. */ -#define const_conductivity_alpha 1.f /* Value taken from (Price,2008), not used in legacy gadget mode */ +#define const_conductivity_alpha \ + 1.f /* Value taken from (Price,2008), not used in legacy gadget mode */ /* Time integration constants. */ -#define const_cfl 0.3f -#define const_ln_max_h_change 0.231111721f /* Particle can't change volume by more than a factor of 2=1.26^3 over one time step */ -#define const_max_u_change 0.1f +#define const_cfl 0.3f +#define const_ln_max_h_change \ + 0.231111721f /* Particle can't change volume by more than a factor of \ + 2=1.26^3 over one time step */ +#define const_max_u_change 0.1f /* Neighbour search constants. */ -#define const_eta_kernel 1.2349f /* Corresponds to 48 ngbs with the cubic spline kernel */ -#define const_delta_nwneigh 1.f +#define const_eta_kernel \ + 1.2349f /* Corresponds to 48 ngbs with the cubic spline kernel */ +#define const_delta_nwneigh 1.f #define CUBIC_SPLINE_KERNEL /* Gravity stuff. */ -#define const_theta_max 0.57735f /* Opening criteria, which is the ratio of the - cell distance over the cell width. 
*/ +#define const_theta_max \ + 0.57735f /* Opening criteria, which is the ratio of the \ + cell distance over the cell width. */ // #define const_G 6.67384e-8f /* Gravitational constant. */ -#define const_G 6.672e-8f /* Gravitational constant. */ -#define const_epsilon 0.0014f /* Gravity blending distance. */ -#define const_iepsilon 714.285714286f /* Inverse gravity blending distance. */ -#define const_iepsilon2 (const_iepsilon*const_iepsilon) -#define const_iepsilon3 (const_iepsilon2*const_iepsilon) -#define const_iepsilon4 (const_iepsilon2*const_iepsilon2) -#define const_iepsilon5 (const_iepsilon3*const_iepsilon2) -#define const_iepsilon6 (const_iepsilon3*const_iepsilon3) +#define const_G 6.672e-8f /* Gravitational constant. */ +#define const_epsilon 0.0014f /* Gravity blending distance. */ +#define const_iepsilon 714.285714286f /* Inverse gravity blending distance. */ +#define const_iepsilon2 (const_iepsilon* const_iepsilon) +#define const_iepsilon3 (const_iepsilon2* const_iepsilon) +#define const_iepsilon4 (const_iepsilon2* const_iepsilon2) +#define const_iepsilon5 (const_iepsilon3* const_iepsilon2) +#define const_iepsilon6 (const_iepsilon3* const_iepsilon3) /* SPH variant to use */ #define LEGACY_GADGET2_SPH - /* System of units */ -#define const_unit_length_in_cgs 1 /* 3.08567810e16 /\* 1Mpc *\/ */ -#define const_unit_mass_in_cgs 1 /* 1.9891e33 /\* 1 M_sun *\/ */ -#define const_unit_velocity_in_cgs 1 /* 1e5 /\* km s^-1 *\/ */ +#define const_unit_length_in_cgs 1 /* 3.08567810e16 /\* 1Mpc *\/ */ +#define const_unit_mass_in_cgs 1 /* 1.9891e33 /\* 1 M_sun *\/ */ +#define const_unit_velocity_in_cgs 1 /* 1e5 /\* km s^-1 *\/ */ + +#endif /* SWIFT_CONST_H */ diff --git a/src/cycle.h b/src/cycle.h index 16f57e7e1ef942d2736f4328be9117b2deab6d6e..1278c83e8b43324662bdeb0de75eec08faf4fd82 100644 --- a/src/cycle.h +++ b/src/cycle.h @@ -23,7 +23,6 @@ * */ - /* machine-dependent cycle counters code. Needs to be inlined. 
*/ /***************************************************************************/ @@ -52,25 +51,28 @@ defined according to whether the corresponding function/type/header is available on your system. The necessary macros are most conveniently defined if you are using GNU autoconf, via the tests: - + dnl --------------------------------------------------------------------- AC_C_INLINE AC_HEADER_TIME AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h]) - AC_CHECK_TYPE([hrtime_t],[AC_DEFINE(HAVE_HRTIME_T, 1, [Define to 1 if hrtime_t is defined in <sys/time.h>])],,[#if HAVE_SYS_TIME_H + AC_CHECK_TYPE([hrtime_t],[AC_DEFINE(HAVE_HRTIME_T, 1, [Define to 1 if +hrtime_t is defined in <sys/time.h>])],,[#if HAVE_SYS_TIME_H #include <sys/time.h> #endif]) - AC_CHECK_FUNCS([gethrtime read_real_time time_base_to_time clock_gettime mach_absolute_time]) + AC_CHECK_FUNCS([gethrtime read_real_time time_base_to_time clock_gettime +mach_absolute_time]) dnl Cray UNICOS _rtc() (real-time clock) intrinsic AC_MSG_CHECKING([for _rtc intrinsic]) rtc_ok=yes AC_TRY_LINK([#ifdef HAVE_INTRINSICS_H #include <intrinsics.h> -#endif], [_rtc()], [AC_DEFINE(HAVE__RTC,1,[Define if you have the UNICOS _rtc() intrinsic.])], [rtc_ok=no]) +#endif], [_rtc()], [AC_DEFINE(HAVE__RTC,1,[Define if you have the UNICOS _rtc() +intrinsic.])], [rtc_ok=no]) AC_MSG_RESULT($rtc_ok) dnl --------------------------------------------------------------------- @@ -79,24 +81,25 @@ /***************************************************************************/ #if TIME_WITH_SYS_TIME -# include <sys/time.h> -# include <time.h> +#include <sys/time.h> +#include <time.h> #else -# if HAVE_SYS_TIME_H -# include <sys/time.h> -# else -# include <time.h> -# endif +#if HAVE_SYS_TIME_H +#include <sys/time.h> +#else +#include <time.h> +#endif #endif -#define INLINE_ELAPSED(INL) static INL double elapsed(ticks t1, ticks t0) \ -{ \ - return (double)t1 - (double)t0; \ -} +#define INLINE_ELAPSED(INL) \ + static INL double 
elapsed(ticks t1, ticks t0) { \ + return (double)t1 - (double)t0; \ + } /*----------------------------------------------------------------*/ /* Solaris */ -#if defined(HAVE_GETHRTIME) && defined(HAVE_HRTIME_T) && !defined(HAVE_TICK_COUNTER) +#if defined(HAVE_GETHRTIME) && defined(HAVE_HRTIME_T) && \ + !defined(HAVE_TICK_COUNTER) typedef hrtime_t ticks; #define getticks gethrtime @@ -108,22 +111,22 @@ INLINE_ELAPSED(inline) /*----------------------------------------------------------------*/ /* AIX v. 4+ routines to read the real-time clock or time-base register */ -#if defined(HAVE_READ_REAL_TIME) && defined(HAVE_TIME_BASE_TO_TIME) && !defined(HAVE_TICK_COUNTER) +#if defined(HAVE_READ_REAL_TIME) && defined(HAVE_TIME_BASE_TO_TIME) && \ + !defined(HAVE_TICK_COUNTER) typedef timebasestruct_t ticks; -static __inline ticks getticks(void) -{ - ticks t; - read_real_time(&t, TIMEBASE_SZ); - return t; +static __inline ticks getticks(void) { + ticks t; + read_real_time(&t, TIMEBASE_SZ); + return t; } static __inline double elapsed(ticks t1, ticks t0) /* time in nanoseconds */ { - time_base_to_time(&t1, TIMEBASE_SZ); - time_base_to_time(&t0, TIMEBASE_SZ); - return (((double)t1.tb_high - (double)t0.tb_high) * 1.0e9 + - ((double)t1.tb_low - (double)t0.tb_low)); + time_base_to_time(&t1, TIMEBASE_SZ); + time_base_to_time(&t0, TIMEBASE_SZ); + return (((double)t1.tb_high - (double)t0.tb_high) * 1.0e9 + + ((double)t1.tb_low - (double)t0.tb_low)); } #define HAVE_TICK_COUNTER @@ -133,20 +136,23 @@ static __inline double elapsed(ticks t1, ticks t0) /* time in nanoseconds */ /* * PowerPC ``cycle'' counter using the time base register. 
*/ -#if ((((defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__))) || (defined(__MWERKS__) && defined(macintosh)))) || (defined(__IBM_GCC_ASM) && (defined(__powerpc__) || defined(__ppc__)))) && !defined(HAVE_TICK_COUNTER) +#if ((((defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__))) || \ + (defined(__MWERKS__) && defined(macintosh)))) || \ + (defined(__IBM_GCC_ASM) && \ + (defined(__powerpc__) || defined(__ppc__)))) && \ + !defined(HAVE_TICK_COUNTER) typedef unsigned long long ticks; -static __inline__ ticks getticks(void) -{ - unsigned int tbl, tbu0, tbu1; +static __inline__ ticks getticks(void) { + unsigned int tbl, tbu0, tbu1; - do { - __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0)); - __asm__ __volatile__ ("mftb %0" : "=r"(tbl)); - __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1)); - } while (tbu0 != tbu1); + do { + __asm__ __volatile__("mftbu %0" : "=r"(tbu0)); + __asm__ __volatile__("mftb %0" : "=r"(tbl)); + __asm__ __volatile__("mftbu %0" : "=r"(tbu1)); + } while (tbu0 != tbu1); - return (((unsigned long long)tbu0) << 32) | tbl; + return (((unsigned long long)tbu0) << 32) | tbl; } INLINE_ELAPSED(__inline__) @@ -156,7 +162,8 @@ INLINE_ELAPSED(__inline__) /* MacOS/Mach (Darwin) time-base register interface (unlike UpTime, from Carbon, requires no additional libraries to be linked). 
*/ -#if defined(HAVE_MACH_ABSOLUTE_TIME) && defined(HAVE_MACH_MACH_TIME_H) && !defined(HAVE_TICK_COUNTER) +#if defined(HAVE_MACH_ABSOLUTE_TIME) && defined(HAVE_MACH_MACH_TIME_H) && \ + !defined(HAVE_TICK_COUNTER) #include <mach/mach_time.h> typedef uint64_t ticks; #define getticks mach_absolute_time @@ -166,31 +173,31 @@ INLINE_ELAPSED(__inline__) /*----------------------------------------------------------------*/ /* - * Pentium cycle counter + * Pentium cycle counter */ -#if (defined(__GNUC__) || defined(__ICC)) && defined(__i386__) && !defined(HAVE_TICK_COUNTER) +#if (defined(__GNUC__) || defined(__ICC)) && defined(__i386__) && \ + !defined(HAVE_TICK_COUNTER) typedef unsigned long long ticks; #ifndef INLINE -# if __GNUC__ && !__GNUC_STDC_INLINE__ -# define INLINE extern inline -# else -# define INLINE inline -# endif +#if __GNUC__ && !__GNUC_STDC_INLINE__ +#define INLINE extern inline +#else +#define INLINE inline #endif -INLINE static ticks getticks(void) -{ - ticks ret; +#endif +INLINE static ticks getticks(void) { + ticks ret; - __asm__ __volatile__("rdtsc": "=A" (ret)); - /* no input, nothing else clobbered */ - return ret; + __asm__ __volatile__("rdtsc" : "=A"(ret)); + /* no input, nothing else clobbered */ + return ret; } INLINE_ELAPSED(__inline__) #define HAVE_TICK_COUNTER -#define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */ +#define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */ #endif /* Visual C++ -- thanks to Morten Nissov for his help with this */ @@ -199,46 +206,43 @@ INLINE_ELAPSED(__inline__) typedef LARGE_INTEGER ticks; #define RDTSC __asm __emit 0fh __asm __emit 031h /* hack for VC++ 5.0 */ -static __inline ticks getticks(void) -{ - ticks retval; - - __asm { - RDTSC - mov retval.HighPart, edx - mov retval.LowPart, eax - } - return retval; +static __inline ticks getticks(void) { + ticks retval; + + __asm { + RDTSC + mov retval.HighPart, edx mov retval.LowPart, eax + } + return retval; } -static __inline double 
elapsed(ticks t1, ticks t0) -{ - return (double)t1.QuadPart - (double)t0.QuadPart; -} +static __inline double elapsed(ticks t1, ticks t0) { + return (double)t1.QuadPart - (double)t0.QuadPart; +} #define HAVE_TICK_COUNTER -#define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */ +#define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */ #endif /*----------------------------------------------------------------*/ /* * X86-64 cycle counter */ -#if (defined(__GNUC__) || defined(__ICC) || defined(__SUNPRO_C)) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) +#if (defined(__GNUC__) || defined(__ICC) || defined(__SUNPRO_C)) && \ + defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) typedef unsigned long long ticks; #ifndef INLINE -# if __GNUC__ && !__GNUC_STDC_INLINE__ -# define INLINE extern inline -# else -# define INLINE inline -# endif +#if __GNUC__ && !__GNUC_STDC_INLINE__ +#define INLINE extern inline +#else +#define INLINE inline #endif -INLINE static ticks getticks(void) -{ - unsigned a, d; - asm volatile("rdtsc" : "=a" (a), "=d" (d)); - return ((ticks)a) | (((ticks)d) << 32); +#endif +INLINE static ticks getticks(void) { + unsigned a, d; + asm volatile("rdtsc" : "=a"(a), "=d"(d)); + return ((ticks)a) | (((ticks)d) << 32); } INLINE_ELAPSED(__inline__) @@ -249,18 +253,18 @@ INLINE_ELAPSED(__inline__) /* PGI compiler, courtesy Cristiano Calonaci, Andrea Tarsi, & Roberto Gori. NOTE: this code will fail to link unless you use the -Masmkeyword compiler option (grrr). 
*/ -#if defined(__PGI) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) +#if defined(__PGI) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) typedef unsigned long long ticks; -static ticks getticks(void) -{ - asm(" rdtsc; shl $0x20,%rdx; mov %eax,%eax; or %rdx,%rax; "); +static ticks getticks(void) { + asm(" rdtsc; shl $0x20,%rdx; mov %eax,%eax; or %rdx,%rax; "); } INLINE_ELAPSED(__inline__) #define HAVE_TICK_COUNTER #endif /* Visual C++, courtesy of Dirk Michaelis */ -#if _MSC_VER >= 1400 && (defined(_M_AMD64) || defined(_M_X64)) && !defined(HAVE_TICK_COUNTER) +#if _MSC_VER >= 1400 && (defined(_M_AMD64) || defined(_M_X64)) && \ + !defined(HAVE_TICK_COUNTER) #include <intrin.h> #pragma intrinsic(__rdtsc) @@ -277,17 +281,15 @@ INLINE_ELAPSED(__inline) */ /* intel's icc/ecc compiler */ -#if (defined(__EDG_VERSION) || defined(__ECC)) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER) +#if (defined(__EDG_VERSION) || defined(__ECC)) && defined(__ia64__) && \ + !defined(HAVE_TICK_COUNTER) typedef unsigned long ticks; #include <ia64intrin.h> -static __inline__ ticks getticks(void) -{ - return __getReg(_IA64_REG_AR_ITC); -} - +static __inline__ ticks getticks(void) { return __getReg(_IA64_REG_AR_ITC); } + INLINE_ELAPSED(__inline__) - + #define HAVE_TICK_COUNTER #endif @@ -295,12 +297,11 @@ INLINE_ELAPSED(__inline__) #if defined(__GNUC__) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER) typedef unsigned long ticks; -static __inline__ ticks getticks(void) -{ - ticks ret; +static __inline__ ticks getticks(void) { + ticks ret; - __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(ret)); - return ret; + __asm__ __volatile__("mov %0=ar.itc" : "=r"(ret)); + return ret; } INLINE_ELAPSED(__inline__) @@ -313,12 +314,11 @@ INLINE_ELAPSED(__inline__) #include <machine/sys/inline.h> typedef unsigned long ticks; -static inline ticks getticks(void) -{ - ticks ret; +static inline ticks getticks(void) { + ticks ret; - ret = _Asm_mov_from_ar (_AREG_ITC); - return ret; + ret = 
_Asm_mov_from_ar(_AREG_ITC); + return ret; } INLINE_ELAPSED(inline) @@ -330,17 +330,17 @@ INLINE_ELAPSED(inline) #if defined(_MSC_VER) && defined(_M_IA64) && !defined(HAVE_TICK_COUNTER) typedef unsigned __int64 ticks; -# ifdef __cplusplus +#ifdef __cplusplus extern "C" -# endif -ticks __getReg(int whichReg); +#endif + ticks + __getReg(int whichReg); #pragma intrinsic(__getReg) -static __inline ticks getticks(void) -{ - volatile ticks temp; - temp = __getReg(3116); - return temp; +static __inline ticks getticks(void) { + volatile ticks temp; + temp = __getReg(3116); + return temp; } INLINE_ELAPSED(inline) @@ -350,29 +350,27 @@ INLINE_ELAPSED(inline) /*----------------------------------------------------------------*/ /* - * PA-RISC cycle counter + * PA-RISC cycle counter */ #if defined(__hppa__) || defined(__hppa) && !defined(HAVE_TICK_COUNTER) typedef unsigned long ticks; -# ifdef __GNUC__ -static __inline__ ticks getticks(void) -{ - ticks ret; +#ifdef __GNUC__ +static __inline__ ticks getticks(void) { + ticks ret; - __asm__ __volatile__("mfctl 16, %0": "=r" (ret)); - /* no input, nothing else clobbered */ - return ret; + __asm__ __volatile__("mfctl 16, %0" : "=r"(ret)); + /* no input, nothing else clobbered */ + return ret; } -# else -# include <machine/inline.h> -static inline unsigned long getticks(void) -{ - register ticks ret; - _MFCTL(16, ret); - return ret; +#else +#include <machine/inline.h> +static inline unsigned long getticks(void) { + register ticks ret; + _MFCTL(16, ret); + return ret; } -# endif +#endif INLINE_ELAPSED(inline) @@ -384,11 +382,10 @@ INLINE_ELAPSED(inline) #if defined(__GNUC__) && defined(__s390__) && !defined(HAVE_TICK_COUNTER) typedef unsigned long long ticks; -static __inline__ ticks getticks(void) -{ - ticks cycles; - __asm__("stck 0(%0)" : : "a" (&(cycles)) : "memory", "cc"); - return cycles; +static __inline__ ticks getticks(void) { + ticks cycles; + __asm__("stck 0(%0)" : : "a"(&(cycles)) : "memory", "cc"); + return cycles; } 
INLINE_ELAPSED(__inline__) @@ -398,16 +395,15 @@ INLINE_ELAPSED(__inline__) /*----------------------------------------------------------------*/ #if defined(__GNUC__) && defined(__alpha__) && !defined(HAVE_TICK_COUNTER) /* - * The 32-bit cycle counter on alpha overflows pretty quickly, + * The 32-bit cycle counter on alpha overflows pretty quickly, * unfortunately. A 1GHz machine overflows in 4 seconds. */ typedef unsigned int ticks; -static __inline__ ticks getticks(void) -{ - unsigned long cc; - __asm__ __volatile__ ("rpcc %0" : "=r"(cc)); - return (cc & 0xFFFFFFFF); +static __inline__ ticks getticks(void) { + unsigned long cc; + __asm__ __volatile__("rpcc %0" : "=r"(cc)); + return (cc & 0xFFFFFFFF); } INLINE_ELAPSED(__inline__) @@ -419,11 +415,10 @@ INLINE_ELAPSED(__inline__) #if defined(__GNUC__) && defined(__sparc_v9__) && !defined(HAVE_TICK_COUNTER) typedef unsigned long ticks; -static __inline__ ticks getticks(void) -{ - ticks ret; - __asm__ __volatile__("rd %%tick, %0" : "=r" (ret)); - return ret; +static __inline__ ticks getticks(void) { + ticks ret; + __asm__ __volatile__("rd %%tick, %0" : "=r"(ret)); + return ret; } INLINE_ELAPSED(__inline__) @@ -432,15 +427,15 @@ INLINE_ELAPSED(__inline__) #endif /*----------------------------------------------------------------*/ -#if (defined(__DECC) || defined(__DECCXX)) && defined(__alpha) && defined(HAVE_C_ASM_H) && !defined(HAVE_TICK_COUNTER) -# include <c_asm.h> +#if (defined(__DECC) || defined(__DECCXX)) && defined(__alpha) && \ + defined(HAVE_C_ASM_H) && !defined(HAVE_TICK_COUNTER) +#include <c_asm.h> typedef unsigned int ticks; -static __inline ticks getticks(void) -{ - unsigned long cc; - cc = asm("rpcc %v0"); - return (cc & 0xFFFFFFFF); +static __inline ticks getticks(void) { + unsigned long cc; + cc = asm("rpcc %v0"); + return (cc & 0xFFFFFFFF); } INLINE_ELAPSED(__inline) @@ -449,20 +444,19 @@ INLINE_ELAPSED(__inline) #endif /*----------------------------------------------------------------*/ /* SGI/Irix */ 
-#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) && !defined(HAVE_TICK_COUNTER) +#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) && \ + !defined(HAVE_TICK_COUNTER) typedef struct timespec ticks; -static inline ticks getticks(void) -{ - struct timespec t; - clock_gettime(CLOCK_SGI_CYCLE, &t); - return t; +static inline ticks getticks(void) { + struct timespec t; + clock_gettime(CLOCK_SGI_CYCLE, &t); + return t; } -static inline double elapsed(ticks t1, ticks t0) -{ - return ((double)t1.tv_sec - (double)t0.tv_sec) * 1.0E9 + - ((double)t1.tv_nsec - (double)t0.tv_nsec); +static inline double elapsed(ticks t1, ticks t0) { + return ((double)t1.tv_sec - (double)t0.tv_sec) * 1.0E9 + + ((double)t1.tv_nsec - (double)t0.tv_nsec); } #define HAVE_TICK_COUNTER #endif @@ -471,7 +465,7 @@ static inline double elapsed(ticks t1, ticks t0) /* Cray UNICOS _rtc() intrinsic function */ #if defined(HAVE__RTC) && !defined(HAVE_TICK_COUNTER) #ifdef HAVE_INTRINSICS_H -# include <intrinsics.h> +#include <intrinsics.h> #endif typedef long long ticks; @@ -493,25 +487,23 @@ INLINE_ELAPSED(inline) typedef uint64_t ticks; -static inline ticks getticks(void) -{ +static inline ticks getticks(void) { static uint64_t* addr = 0; - if (addr == 0) - { + if (addr == 0) { uint32_t rq_addr = 0x10030000; int fd; int pgsize; pgsize = getpagesize(); - fd = open ("/dev/mem", O_RDONLY | O_SYNC, 0); + fd = open("/dev/mem", O_RDONLY | O_SYNC, 0); if (fd < 0) { perror("open"); return NULL; } addr = mmap(0, pgsize, PROT_READ, MAP_SHARED, fd, rq_addr); close(fd); - if (addr == (uint64_t *)-1) { + if (addr == (uint64_t*)-1) { perror("mmap"); return NULL; } @@ -525,4 +517,3 @@ INLINE_ELAPSED(inline) #define HAVE_TICK_COUNTER #endif #endif /* HAVE_MIPS_ZBUS_TIMER */ - diff --git a/src/debug.c b/src/debug.c index 75c726bfced9f9b5f1b09aa276b92fbc06ae3882..0ebbd44ae03ffc6ddfabab78e577335f0b9bbe5a 100644 --- a/src/debug.c +++ b/src/debug.c @@ -2,125 +2,111 @@ * This file is part of SWIFT. 
* Coypright (c) 2013 Matthieu Schaller (matthieu.schaller@durham.ac.uk), * Pedro Gonnet (pedro.gonnet@durham.ac.uk). - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ - #include <stdio.h> -#include "const.h" -#include "part.h" +/* This object's header. */ +#include "debug.h" +/** + * @brief Dump the information pertaining to the given cell. + */ + +void print_cell(struct cell *c) { + printf( + "## Cell 0x%0zx: loc=[%.3e,%.3e,%.3e], h=[%.3e,%.3e,%.3e], depth=%i, " + "split=%i, maxdepth=%i.\n", + (size_t)c, c->loc[0], c->loc[1], c->loc[2], c->h[0], c->h[1], c->h[2], + c->depth, c->split, c->maxdepth); +} /** - * @brief Looks for the particle with the given id and prints its information to the standard output. - * + * @brief Looks for the particle with the given id and prints its information to + *the standard output. + * * @param parts The array of particles. * @param id The id too look for. * @param N The size of the array of particles. * * (Should be used for debugging only as it runs in O(N).) */ - -void printParticle ( struct part *parts , long long int id, int N ) { - - int i, found = 0; - - /* Look for the particle. 
*/ - for ( i = 0 ; i < N ; i++ ) - if ( parts[i].id == id ) { - printf("## Particle[%d]: id=%lld, x=[%.16e,%.16e,%.16e], v=[%.3e,%.3e,%.3e], a=[%.3e,%.3e,%.3e], h=%.3e, h_dt=%.3e, wcount=%.3e, m=%.3e, rho=%.3e, rho_dh=%.3e, div_v=%.3e, u=%.3e, dudt=%.3e, bals=%.3e, POrho2=%.3e, v_sig=%.3e, dt=%.3e\n", - i, - parts[i].id, - parts[i].x[0], parts[i].x[1], parts[i].x[2], - parts[i].v[0], parts[i].v[1], parts[i].v[2], - parts[i].a[0], parts[i].a[1], parts[i].a[2], - parts[i].h, - parts[i].force.h_dt, - parts[i].density.wcount, - parts[i].mass, - parts[i].rho, parts[i].rho_dh, - parts[i].density.div_v, - parts[i].u, - parts[i].force.u_dt, - parts[i].force.balsara, - parts[i].force.POrho2, - parts[i].force.v_sig, - parts[i].dt - ); - found = 1; - } - - if ( !found ) - printf("## Particles[???] id=%lld not found\n", id); - + +void printParticle(struct part *parts, long long int id, int N) { + + int i, found = 0; + + /* Look for the particle. */ + for (i = 0; i < N; i++) + if (parts[i].id == id) { + printf( + "## Particle[%d]: id=%lld, x=[%.16e,%.16e,%.16e], " + "v=[%.3e,%.3e,%.3e], a=[%.3e,%.3e,%.3e], h=%.3e, h_dt=%.3e, " + "wcount=%.3e, m=%.3e, rho=%.3e, rho_dh=%.3e, div_v=%.3e, u=%.3e, " + "dudt=%.3e, bals=%.3e, POrho2=%.3e, v_sig=%.3e, dt=%.3e\n", + i, parts[i].id, parts[i].x[0], parts[i].x[1], parts[i].x[2], + parts[i].v[0], parts[i].v[1], parts[i].v[2], parts[i].a[0], + parts[i].a[1], parts[i].a[2], parts[i].h, parts[i].force.h_dt, + parts[i].density.wcount, parts[i].mass, parts[i].rho, parts[i].rho_dh, + parts[i].density.div_v, parts[i].u, parts[i].force.u_dt, + parts[i].force.balsara, parts[i].force.POrho2, parts[i].force.v_sig, + parts[i].dt); + found = 1; } + if (!found) printf("## Particles[???] id=%lld not found\n", id); +} + +void printgParticle(struct gpart *parts, long long int id, int N) { + + int i, found = 0; -void printgParticle ( struct gpart *parts , long long int id, int N ) { - - int i, found = 0; - - /* Look for the particle. 
*/ - for ( i = 0 ; i < N ; i++ ) - if ( parts[i].id == -id || ( parts[i].id > 0 && parts[i].part->id == id ) ) { - printf("## gParticle[%d]: id=%lld, x=[%.16e,%.16e,%.16e], v=[%.3e,%.3e,%.3e], a=[%.3e,%.3e,%.3e], m=%.3e, dt=%.3e\n", - i, - (parts[i].id < 0) ? -parts[i].id : parts[i].part->id , - parts[i].x[0], parts[i].x[1], parts[i].x[2], - parts[i].v[0], parts[i].v[1], parts[i].v[2], - parts[i].a[0], parts[i].a[1], parts[i].a[2], - parts[i].mass, - parts[i].dt - ); - found = 1; - } - - if ( !found ) - printf("## Particles[???] id=%lld not found\n", id); - + /* Look for the particle. */ + for (i = 0; i < N; i++) + if (parts[i].id == -id || (parts[i].id > 0 && parts[i].part->id == id)) { + printf( + "## gParticle[%d]: id=%lld, x=[%.16e,%.16e,%.16e], " + "v=[%.3e,%.3e,%.3e], a=[%.3e,%.3e,%.3e], m=%.3e, dt=%.3e\n", + i, (parts[i].id < 0) ? -parts[i].id : parts[i].part->id, + parts[i].x[0], parts[i].x[1], parts[i].x[2], parts[i].v[0], + parts[i].v[1], parts[i].v[2], parts[i].a[0], parts[i].a[1], + parts[i].a[2], parts[i].mass, parts[i].dt); + found = 1; } + if (!found) printf("## Particles[???] 
id=%lld not found\n", id); +} /** * @brief Prints the details of a given particle to stdout - * + * * @param p The particle to print - * + * */ - -void printParticle_single ( struct part *p ) { - - printf("## Particle: id=%lld, x=[%e,%e,%e], v=[%.3e,%.3e,%.3e], a=[%.3e,%.3e,%.3e], h=%.3e, h_dt=%.3e, wcount=%.3e, m=%.3e, rho=%.3e, rho_dh=%.3e, div_v=%.3e, u=%.3e, dudt=%.3e, bals=%.3e, POrho2=%.3e, v_sig=%.3e, dt=%.3e\n", - p->id, - p->x[0], p->x[1], p->x[2], - p->v[0], p->v[1], p->v[2], - p->a[0], p->a[1], p->a[2], - p->h, - p->force.h_dt, - p->density.wcount, - p->mass, - p->rho, p->rho_dh, - p->density.div_v, - p->u, - p->force.u_dt, - p->force.balsara, - p->force.POrho2, - p->force.v_sig, - p->dt - ); - } +void printParticle_single(struct part *p) { + + printf( + "## Particle: id=%lld, x=[%e,%e,%e], v=[%.3e,%.3e,%.3e], " + "a=[%.3e,%.3e,%.3e], h=%.3e, h_dt=%.3e, wcount=%.3e, m=%.3e, rho=%.3e, " + "rho_dh=%.3e, div_v=%.3e, u=%.3e, dudt=%.3e, bals=%.3e, POrho2=%.3e, " + "v_sig=%.3e, dt=%.3e\n", + p->id, p->x[0], p->x[1], p->x[2], p->v[0], p->v[1], p->v[2], p->a[0], + p->a[1], p->a[2], p->h, p->force.h_dt, p->density.wcount, p->mass, p->rho, + p->rho_dh, p->density.div_v, p->u, p->force.u_dt, p->force.balsara, + p->force.POrho2, p->force.v_sig, p->dt); +} diff --git a/src/debug.h b/src/debug.h index 5db731a857ef32792c4f0a377eb97e475ab6b782..42269fc267e6d5721990b992c92c515a5764f6a9 100644 --- a/src/debug.h +++ b/src/debug.h @@ -1,25 +1,31 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Matthieu Schaller (matthieu.schaller@durham.ac.uk). - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
- * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_DEBUG_H +#define SWIFT_DEBUG_H +/* Includes. */ +#include "cell.h" +#include "part.h" - - +void print_cell(struct cell *c); void printParticle(struct part *parts, long long int i, int N); void printgParticle(struct gpart *parts, long long int i, int N); -void printParticle_single ( struct part *p ); +void printParticle_single(struct part *p); + +#endif /* SWIFT_DEBUG_H */ diff --git a/src/engine.c b/src/engine.c index 36b6215772f2220788aed71eeef7f89c94e0cdb6..88d9547b83a10327a6a7183f13f5bb499da43948 100644 --- a/src/engine.c +++ b/src/engine.c @@ -1,66 +1,53 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- * + * ******************************************************************************/ /* Config parameters. */ #include "../config.h" /* Some standard headers. */ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <string.h> -#include <pthread.h> -#include <math.h> #include <float.h> #include <limits.h> #include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> /* MPI headers. */ #ifdef WITH_MPI - #include <mpi.h> - +#include <mpi.h> /* METIS headers only used when MPI is also available. */ - #ifdef HAVE_METIS - #include <metis.h> - #endif +#ifdef HAVE_METIS +#include <metis.h> +#endif #endif +/* This object's header. */ +#include "engine.h" + /* Local headers. */ -#include "const.h" -#include "cycle.h" #include "atomic.h" -#include "timers.h" -#include "const.h" -#include "vector.h" -#include "lock.h" -#include "task.h" -#include "part.h" -#include "debug.h" -#include "space.h" -#include "multipole.h" #include "cell.h" -#include "queue.h" -#include "scheduler.h" -#include "engine.h" -#include "runner.h" -#include "proxy.h" +#include "cycle.h" +#include "debug.h" #include "error.h" +#include "timers.h" #ifdef LEGACY_GADGET2_SPH #include "runner_iact_legacy.h" @@ -68,14 +55,85 @@ #include "runner_iact.h" #endif - /* Convert cell location to ID. */ -#define cell_getid( cdim , i , j , k ) ( (int)(k) + (cdim)[2]*( (int)(j) + (cdim)[1]*(int)(i) ) ) - +#define cell_getid(cdim, i, j, k) \ + ((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i))) /** The rank of the engine as a global variable (for messages). */ int engine_rank; +/** + * @brief Check if a single particle is OK. + * + * @return Zero if all checks passed, non-zero otherwise. 
+ */
+int engine_check_part(struct part *p) {
+  if (p == NULL || p->mass == 0.0f || p->h == 0.0f) {
+    message("Bad particle data.");
+    if (p != NULL) printParticle_single(p); /* p may be NULL in this branch. */
+    return 1;
+  } else if (p->x[0] == 0.0 && p->x[1] == 0.0 && p->x[2] == 0.0) {
+    message("Bad particle location.");
+    printParticle_single(p);
+    return 1;
+  } else {
+    return 0;
+  }
+}
+
+/**
+ * @brief Check if a cell's data is reasonable, also check if its particles
+ * are OK. Reports failures through error(); nothing is returned.
+ * @param c The #cell to check.
+ * @param data Unused here (engine_check() passes NULL).
+ */
+
+void engine_check_cell(struct cell *c, void *data) {
+  /* Check the cell data. */
+  if (c->count == 0) {
+    print_cell(c);
+    error("Empty cell.");
+  }
+
+  /* Check the particles. */
+  for (int k = 0; k < c->count; k++) {
+    if (engine_check_part(&c->parts[k])) {
+      print_cell(c);
+      error("Bad particle in cell.");
+    }
+  }
+
+  /* Check that the progeny, if any, contain all the particles. */
+  if (c->split) {
+    int count = 0;
+    for (int k = 0; k < 8; k++) {
+      if (c->progeny[k] != NULL) {
+        count += c->progeny[k]->count;
+      }
+    }
+    if (count != c->count) {
+      print_cell(c);
+      error("Progeny cell counts don't add up.");
+    }
+  }
+}
+
+/**
+ * @brief Runs a series of checks to make sure we have no bad particles.
+ */
+
+void engine_check(struct engine *e) {
+  /* Check all particles directly. */
+  struct space *s = e->s;
+  for (int k = 0; k < s->nr_parts; k++) {
+    if (engine_check_part(&s->parts[k])) {
+      error("Bad particle s->parts[%i], aborting.", k);
+    }
+  }
+
+  /* Check each cell in the space. */
+  space_map_cells_post(s, 1, &engine_check_cell, NULL);
+}
 
 /**
  * @brief Link a density/force task to a cell.
@@ -86,16 +144,14 @@ int engine_rank;
  *
  * @return The new #link pointer.
*/ - -struct link *engine_addlink( struct engine *e , struct link *l , struct task *t ) { - - struct link *res = &e->links[ atomic_inc( &e->nr_links ) ]; - res->next = l; - res->t = t; - return res; - } +struct link *engine_addlink(struct engine *e, struct link *l, struct task *t) { + struct link *res = &e->links[atomic_inc(&e->nr_links)]; + res->next = l; + res->t = t; + return res; +} /** * @brief Generate the ghost and kick tasks for a hierarchy of cells. @@ -104,46 +160,44 @@ struct link *engine_addlink( struct engine *e , struct link *l , struct task *t * @param c The #cell. * @param super The super #cell. */ - -void engine_mkghosts ( struct engine *e , struct cell *c , struct cell *super ) { - - int k; - struct scheduler *s = &e->sched; - - /* Am I the super-cell? */ - if ( super == NULL && c->nr_tasks > 0 ) { - - /* Remember me. */ - super = c; - - /* Local tasks only... */ - if ( c->nodeID == e->nodeID ) { - - /* Generate the ghost task. */ - c->ghost = scheduler_addtask( s , task_type_ghost , task_subtype_none , 0 , 0 , c , NULL , 0 ); - - /* Add the kick2 task. */ - c->kick2 = scheduler_addtask( s , task_type_kick2 , task_subtype_none , 0 , 0 , c , NULL , 0 ); - - /* Add the kick1 task if needed. */ - if ( !(e->policy & engine_policy_fixdt) ) - c->kick1 = scheduler_addtask( s , task_type_kick1 , task_subtype_none , 0 , 0 , c , NULL , 0 ); - - } - - } - - /* Set the super-cell. */ - c->super = super; - - /* Recurse. */ - if ( c->split ) - for ( k = 0 ; k < 8 ; k++ ) - if ( c->progeny[k] != NULL ) - engine_mkghosts( e , c->progeny[k] , super ); - + +void engine_mkghosts(struct engine *e, struct cell *c, struct cell *super) { + + int k; + struct scheduler *s = &e->sched; + + /* Am I the super-cell? */ + if (super == NULL && c->nr_tasks > 0) { + + /* Remember me. */ + super = c; + + /* Local tasks only... */ + if (c->nodeID == e->nodeID) { + + /* Generate the ghost task. 
*/ + c->ghost = scheduler_addtask(s, task_type_ghost, task_subtype_none, 0, 0, + c, NULL, 0); + + /* Add the kick2 task. */ + c->kick2 = scheduler_addtask(s, task_type_kick2, task_subtype_none, 0, 0, + c, NULL, 0); + + /* Add the kick1 task if needed. */ + if (!(e->policy & engine_policy_fixdt)) + c->kick1 = scheduler_addtask(s, task_type_kick1, task_subtype_none, 0, + 0, c, NULL, 0); } + } + /* Set the super-cell. */ + c->super = super; + + /* Recurse. */ + if (c->split) + for (k = 0; k < 8; k++) + if (c->progeny[k] != NULL) engine_mkghosts(e, c->progeny[k], super); +} /** * @brief Redistribute the particles amongst the nodes accorind @@ -151,432 +205,465 @@ void engine_mkghosts ( struct engine *e , struct cell *c , struct cell *super ) * * @param e The #engine. */ - -void engine_redistribute ( struct engine *e ) { + +void engine_redistribute(struct engine *e) { #ifdef WITH_MPI - int i, j, k, cid; - int nr_nodes = e->nr_nodes, nodeID = e->nodeID; - struct space *s = e->s; - int my_cells = 0; - int *cdim = s->cdim; - struct cell *cells = s->cells; - int nr_cells = s->nr_cells; - - /* Start by sorting the particles according to their nodes and - getting the counts. */ - int *counts, *dest; - struct part *parts = s->parts; - double ih[3], dim[3]; - ih[0] = s->ih[0]; ih[1] = s->ih[1]; ih[2] = s->ih[2]; - dim[0] = s->dim[0]; dim[1] = s->dim[1]; dim[2] = s->dim[2]; - if ( ( counts = (int *)malloc( sizeof(int) * nr_nodes * nr_nodes ) ) == NULL || - ( dest = (int *)malloc( sizeof(int) * s->nr_parts ) ) == NULL ) - error( "Failed to allocate count and dest buffers." 
); - bzero( counts , sizeof(int) * nr_nodes * nr_nodes ); - for ( k = 0 ; k < s->nr_parts ; k++ ) { - for ( j = 0 ; j < 3 ; j++ ) { - if ( parts[k].x[j] < 0.0 ) parts[k].x[j] += dim[j]; - else if ( parts[k].x[j] >= dim[j] ) parts[k].x[j] -= dim[j]; - } - cid = cell_getid( cdim , parts[k].x[0]*ih[0] , parts[k].x[1]*ih[1] , parts[k].x[2]*ih[2] ); - dest[k] = cells[ cid ].nodeID; - counts[ nodeID*nr_nodes + dest[k] ] += 1; - } - parts_sort( s->parts , s->xparts , dest , s->nr_parts , 0 , nr_nodes-1 ); - - /* Get all the counts from all the nodes. */ - if ( MPI_Allreduce( MPI_IN_PLACE , counts , nr_nodes * nr_nodes , MPI_INT , MPI_SUM , MPI_COMM_WORLD ) != MPI_SUCCESS ) - error( "Failed to allreduce particle transfer counts." ); - - /* Get the new number of parts for this node, be generous in allocating. */ - int nr_parts = 0; - for ( k = 0 ; k < nr_nodes ; k++ ) - nr_parts += counts[ k*nr_nodes + nodeID ]; - struct part *parts_new; - struct xpart *xparts_new, *xparts = s->xparts; - if ( posix_memalign( (void **)&parts_new , part_align , sizeof(struct part) * nr_parts * 1.2 ) != 0 || - posix_memalign( (void **)&xparts_new , part_align , sizeof(struct xpart) * nr_parts * 1.2 ) != 0 ) - error( "Failed to allocate new part data." ); - - /* Emit the sends and recvs for the particle data. */ - MPI_Request *reqs; - if ( ( reqs = (MPI_Request *)malloc( sizeof(MPI_Request) * 4 * nr_nodes ) ) == NULL ) - error( "Failed to allocate MPI request list." 
); - for ( k = 0 ; k < 4*nr_nodes ; k++ ) - reqs[k] = MPI_REQUEST_NULL; - for ( i = 0 , j = 0 , k = 0 ; k < nr_nodes ; k++ ) { - if ( k == nodeID && counts[ nodeID*nr_nodes + k ] > 0 ) { - memcpy( &parts_new[j] , &parts[i] , sizeof(struct part) * counts[ k*nr_nodes + nodeID ] ); - memcpy( &xparts_new[j] , &xparts[i] , sizeof(struct xpart) * counts[ k*nr_nodes + nodeID ] ); - i += counts[ nodeID*nr_nodes + k ]; - j += counts[ k*nr_nodes + nodeID ]; - } - if ( k != nodeID && counts[ nodeID*nr_nodes + k ] > 0 ) { - if ( MPI_Isend( &parts[i] , sizeof(struct part) * counts[ nodeID*nr_nodes + k ] , MPI_BYTE , k , 2*(nodeID*nr_nodes + k) + 0 , MPI_COMM_WORLD , &reqs[4*k] ) != MPI_SUCCESS ) - error( "Failed to isend parts to node %i." , k ); - if ( MPI_Isend( &xparts[i] , sizeof(struct xpart) * counts[ nodeID*nr_nodes + k ] , MPI_BYTE , k , 2*(nodeID*nr_nodes + k) + 1 , MPI_COMM_WORLD , &reqs[4*k+1] ) != MPI_SUCCESS ) - error( "Failed to isend xparts to node %i." , k ); - i += counts[ nodeID*nr_nodes + k ]; - } - if ( k != nodeID && counts[ k*nr_nodes + nodeID ] > 0 ) { - if ( MPI_Irecv( &parts_new[j] , sizeof(struct part) * counts[ k*nr_nodes + nodeID ] , MPI_BYTE , k , 2*(k*nr_nodes + nodeID) + 0 , MPI_COMM_WORLD , &reqs[4*k+2] ) != MPI_SUCCESS ) - error( "Failed to emit irecv of parts from node %i." , k ); - if ( MPI_Irecv( &xparts_new[j] , sizeof(struct xpart) * counts[ k*nr_nodes + nodeID ] , MPI_BYTE , k , 2*(k*nr_nodes + nodeID) + 1 , MPI_COMM_WORLD , &reqs[4*k+3] ) != MPI_SUCCESS ) - error( "Failed to emit irecv of parts from node %i." , k ); - j += counts[ k*nr_nodes + nodeID ]; - } - } - - /* Wait for all the sends and recvs to tumble in. */ - MPI_Status stats[4*nr_nodes]; - int res; - if ( ( res = MPI_Waitall( 4*nr_nodes , reqs , stats ) ) != MPI_SUCCESS ) { - for ( k = 0 ; k < 4*nr_nodes ; k++ ) { - char buff[ MPI_MAX_ERROR_STRING ]; - int res; - MPI_Error_string( stats[k].MPI_ERROR , buff , &res ); - message( "request %i has error '%s'." 
, k , buff ); - } - message( "counts is [ %i %i %i %i ]." , counts[0] , counts[1] , counts[2] , counts[3] ); - error( "Failed during waitall for part data." ); - } + int i, j, k, cid; + int nr_nodes = e->nr_nodes, nodeID = e->nodeID; + struct space *s = e->s; + int my_cells = 0; + int *cdim = s->cdim; + struct cell *cells = s->cells; + int nr_cells = s->nr_cells; + + /* Start by sorting the particles according to their nodes and + getting the counts. */ + int *counts, *dest; + struct part *parts = s->parts; + double ih[3], dim[3]; + ih[0] = s->ih[0]; + ih[1] = s->ih[1]; + ih[2] = s->ih[2]; + dim[0] = s->dim[0]; + dim[1] = s->dim[1]; + dim[2] = s->dim[2]; + if ((counts = (int *)malloc(sizeof(int) *nr_nodes *nr_nodes)) == NULL || + (dest = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL) + error("Failed to allocate count and dest buffers."); + bzero(counts, sizeof(int) * nr_nodes * nr_nodes); + for (k = 0; k < s->nr_parts; k++) { + for (j = 0; j < 3; j++) { + if (parts[k].x[j] < 0.0) + parts[k].x[j] += dim[j]; + else if (parts[k].x[j] >= dim[j]) + parts[k].x[j] -= dim[j]; + } + cid = cell_getid(cdim, parts[k].x[0] * ih[0], parts[k].x[1] * ih[1], + parts[k].x[2] * ih[2]); + dest[k] = cells[cid].nodeID; + counts[nodeID * nr_nodes + dest[k]] += 1; + } + parts_sort(s->parts, s->xparts, dest, s->nr_parts, 0, nr_nodes - 1); + + /* Get all the counts from all the nodes. */ + if (MPI_Allreduce(MPI_IN_PLACE, counts, nr_nodes * nr_nodes, MPI_INT, MPI_SUM, + MPI_COMM_WORLD) != MPI_SUCCESS) + error("Failed to allreduce particle transfer counts."); + + /* Get the new number of parts for this node, be generous in allocating. 
*/ + int nr_parts = 0; + for (k = 0; k < nr_nodes; k++) nr_parts += counts[k * nr_nodes + nodeID]; + struct part *parts_new; + struct xpart *xparts_new, *xparts = s->xparts; + if (posix_memalign((void **)&parts_new, part_align, + sizeof(struct part) * nr_parts * 1.2) != 0 || + posix_memalign((void **)&xparts_new, part_align, + sizeof(struct xpart) * nr_parts * 1.2) != 0) + error("Failed to allocate new part data."); + + /* Emit the sends and recvs for the particle data. */ + MPI_Request *reqs; + if ((reqs = (MPI_Request *)malloc(sizeof(MPI_Request) * 4 * nr_nodes)) == + NULL) + error("Failed to allocate MPI request list."); + for (k = 0; k < 4 * nr_nodes; k++) reqs[k] = MPI_REQUEST_NULL; + for (i = 0, j = 0, k = 0; k < nr_nodes; k++) { + if (k == nodeID && counts[nodeID * nr_nodes + k] > 0) { + memcpy(&parts_new[j], &parts[i], + sizeof(struct part) * counts[k * nr_nodes + nodeID]); + memcpy(&xparts_new[j], &xparts[i], + sizeof(struct xpart) * counts[k * nr_nodes + nodeID]); + i += counts[nodeID * nr_nodes + k]; + j += counts[k * nr_nodes + nodeID]; + } + if (k != nodeID && counts[nodeID * nr_nodes + k] > 0) { + if (MPI_Isend(&parts[i], + sizeof(struct part) * counts[nodeID * nr_nodes + k], + MPI_BYTE, k, 2 * (nodeID * nr_nodes + k) + 0, + MPI_COMM_WORLD, &reqs[4 * k]) != MPI_SUCCESS) + error("Failed to isend parts to node %i.", k); + if (MPI_Isend(&xparts[i], + sizeof(struct xpart) * counts[nodeID * nr_nodes + k], + MPI_BYTE, k, 2 * (nodeID * nr_nodes + k) + 1, + MPI_COMM_WORLD, &reqs[4 * k + 1]) != MPI_SUCCESS) + error("Failed to isend xparts to node %i.", k); + i += counts[nodeID * nr_nodes + k]; + } + if (k != nodeID && counts[k * nr_nodes + nodeID] > 0) { + if (MPI_Irecv(&parts_new[j], + sizeof(struct part) * counts[k * nr_nodes + nodeID], + MPI_BYTE, k, 2 * (k * nr_nodes + nodeID) + 0, + MPI_COMM_WORLD, &reqs[4 * k + 2]) != MPI_SUCCESS) + error("Failed to emit irecv of parts from node %i.", k); + if (MPI_Irecv(&xparts_new[j], + sizeof(struct xpart) * counts[k 
* nr_nodes + nodeID], + MPI_BYTE, k, 2 * (k * nr_nodes + nodeID) + 1, + MPI_COMM_WORLD, &reqs[4 * k + 3]) != MPI_SUCCESS) + error("Failed to emit irecv of parts from node %i.", k); + j += counts[k * nr_nodes + nodeID]; + } + } + + /* Wait for all the sends and recvs to tumble in. */ + MPI_Status stats[4 * nr_nodes]; + int res; + if ((res = MPI_Waitall(4 * nr_nodes, reqs, stats)) != MPI_SUCCESS) { + for (k = 0; k < 4 * nr_nodes; k++) { + char buff[MPI_MAX_ERROR_STRING]; + int res; + MPI_Error_string(stats[k].MPI_ERROR, buff, &res); + message("request %i has error '%s'.", k, buff); + } + error("Failed during waitall for part data."); + } + + /* Verify that all parts are in the right place. */ + /* for ( k = 0 ; k < nr_parts ; k++ ) { + cid = cell_getid( cdim , parts_new[k].x[0]*ih[0] , parts_new[k].x[1]*ih[1] + , parts_new[k].x[2]*ih[2] ); + if ( cells[ cid ].nodeID != nodeID ) + error( "Received particle (%i) that does not belong here (nodeID=%i)." + , k , cells[ cid ].nodeID ); + } */ + + /* Set the new part data, free the old. */ + free(parts); + free(xparts); + s->parts = parts_new; + s->xparts = xparts_new; + s->nr_parts = nr_parts; + s->size_parts = 1.2 * nr_parts; + + /* Be verbose about what just happened. */ + for (k = 0; k < nr_cells; k++) + if (cells[k].nodeID == nodeID) my_cells += 1; + message("node %i now has %i parts in %i cells.", nodeID, nr_parts, my_cells); + + /* Clean up other stuff. */ + free(reqs); + free(counts); + free(dest); - /* Verify that all parts are in the right place. */ - /* for ( k = 0 ; k < nr_parts ; k++ ) { - cid = cell_getid( cdim , parts_new[k].x[0]*ih[0] , parts_new[k].x[1]*ih[1] , parts_new[k].x[2]*ih[2] ); - if ( cells[ cid ].nodeID != nodeID ) - error( "Received particle (%i) that does not belong here (nodeID=%i)." , k , cells[ cid ].nodeID ); - } */ - - /* Set the new part data, free the old. 
*/ - free( parts ); - free( xparts ); - s->parts = parts_new; - s->xparts = xparts_new; - s->nr_parts = nr_parts; - s->size_parts = 1.2*nr_parts; - - /* Be verbose about what just happened. */ - for ( k = 0 ; k < nr_cells ; k++ ) - if ( cells[k].nodeID == nodeID ) - my_cells += 1; - message( "node %i now has %i parts in %i cells." , nodeID , nr_parts , my_cells ); - - /* Clean up other stuff. */ - free( reqs ); - free( counts ); - free( dest ); - #else - error( "SWIFT was not compiled with MPI and METIS support." ); + error("SWIFT was not compiled with MPI and METIS support."); #endif - - } - +} /** * @brief Repartition the cells amongst the nodes. * * @param e The #engine. */ - -void engine_repartition ( struct engine *e ) { + +void engine_repartition(struct engine *e) { #if defined(WITH_MPI) && defined(HAVE_METIS) - int i, j, k, l, cid, cjd, ii, jj, kk, res, w; - idx_t *inds, *nodeIDs; - idx_t *weights_v, *weights_e; - struct space *s = e->s; - int nr_cells = s->nr_cells, my_cells = 0; - struct cell *cells = s->cells; - int ind[3], *cdim = s->cdim; - struct task *t, *tasks = e->sched.tasks; - struct cell *ci, *cj; - int nr_nodes = e->nr_nodes, nodeID = e->nodeID; - float wscale = 1.0, vscale = 1e-3, wscale_buff; - idx_t wtot = 0; - const idx_t wmax = 1e9 / e->nr_nodes; - - /* Clear the repartition flag. */ - e->forcerepart = 0; - - /* Allocate the inds and weights. */ - if ( ( inds = (idx_t *)malloc( sizeof(idx_t) * 26*nr_cells ) ) == NULL || - ( weights_v = (idx_t *)malloc( sizeof(idx_t) * nr_cells ) ) == NULL || - ( weights_e = (idx_t *)malloc( sizeof(idx_t) * 26*nr_cells ) ) == NULL || - ( nodeIDs = (idx_t *)malloc( sizeof(idx_t) * nr_cells ) ) == NULL ) - error( "Failed to allocate inds and weights arrays." ); - - /* Fill the inds array. 
*/ - for ( cid = 0 ; cid < nr_cells ; cid++ ) { - ind[0] = cells[cid].loc[0] / s->cells[cid].h[0] + 0.5; - ind[1] = cells[cid].loc[1] / s->cells[cid].h[1] + 0.5; - ind[2] = cells[cid].loc[2] / s->cells[cid].h[2] + 0.5; - l = 0; - for ( i = -1 ; i <= 1 ; i++ ) { - ii = ind[0] + i; - if ( ii < 0 ) ii += cdim[0]; - else if ( ii >= cdim[0] ) ii -= cdim[0]; - for ( j = -1 ; j <= 1 ; j++ ) { - jj = ind[1] + j; - if ( jj < 0 ) jj += cdim[1]; - else if ( jj >= cdim[1] ) jj -= cdim[1]; - for ( k = -1 ; k <= 1 ; k++ ) { - kk = ind[2] + k; - if ( kk < 0 ) kk += cdim[2]; - else if ( kk >= cdim[2] ) kk -= cdim[2]; - if ( i || j || k ) { - inds[ cid*26 + l ] = cell_getid( cdim , ii , jj , kk ); - l += 1; - } - } - } - } - } - - /* Init the weights arrays. */ - bzero( weights_e , sizeof(idx_t) * 26*nr_cells ); - bzero( weights_v , sizeof(idx_t) * nr_cells ); - - /* Loop over the tasks... */ - for ( j = 0 ; j < e->sched.nr_tasks ; j++ ) { - - /* Get a pointer to the kth task. */ - t = &tasks[j]; - - /* Skip un-interesting tasks. */ - if ( t->type != task_type_self && - t->type != task_type_pair && - t->type != task_type_sub && - t->type != task_type_ghost && - t->type != task_type_kick1 && - t->type != task_type_kick2 ) - continue; - - /* Get the task weight. */ - w = ( t->toc - t->tic ) * wscale; - if ( w < 0 ) - error( "Bad task weight (%i)." , w ); - - /* Do we need to re-scale? 
*/ - wtot += w; - if (wtot > wmax) { - wscale /= 2; - wtot /= 2; - for (k = 0; k < 26 * nr_cells; k++) weights_e[k] *= 0.5; - for (k = 0; k < nr_cells; k++) weights_v[k] *= 0.5; + int i, j, k, l, cid, cjd, ii, jj, kk, res; + idx_t *inds, *nodeIDs; + idx_t *weights_v, *weights_e; + struct space *s = e->s; + int nr_cells = s->nr_cells, my_cells = 0; + struct cell *cells = s->cells; + int ind[3], *cdim = s->cdim; + struct task *t, *tasks = e->sched.tasks; + struct cell *ci, *cj; + int nr_nodes = e->nr_nodes, nodeID = e->nodeID; + float wscale = 1e-3, vscale = 1e-3, wscale_buff; + idx_t wtot = 0; + const idx_t wmax = 1e9 / e->nr_nodes; + + /* Clear the repartition flag. */ + e->forcerepart = 0; + + /* Allocate the inds and weights. */ + if ((inds = (idx_t *)malloc(sizeof(idx_t) * 26 *nr_cells)) == NULL || + (weights_v = (idx_t *)malloc(sizeof(idx_t) *nr_cells)) == NULL || + (weights_e = (idx_t *)malloc(sizeof(idx_t) * 26 *nr_cells)) == NULL || + (nodeIDs = (idx_t *)malloc(sizeof(idx_t) * nr_cells)) == NULL) + error("Failed to allocate inds and weights arrays."); + + /* Fill the inds array. */ + for (cid = 0; cid < nr_cells; cid++) { + ind[0] = cells[cid].loc[0] / s->cells[cid].h[0] + 0.5; + ind[1] = cells[cid].loc[1] / s->cells[cid].h[1] + 0.5; + ind[2] = cells[cid].loc[2] / s->cells[cid].h[2] + 0.5; + l = 0; + for (i = -1; i <= 1; i++) { + ii = ind[0] + i; + if (ii < 0) + ii += cdim[0]; + else if (ii >= cdim[0]) + ii -= cdim[0]; + for (j = -1; j <= 1; j++) { + jj = ind[1] + j; + if (jj < 0) + jj += cdim[1]; + else if (jj >= cdim[1]) + jj -= cdim[1]; + for (k = -1; k <= 1; k++) { + kk = ind[2] + k; + if (kk < 0) + kk += cdim[2]; + else if (kk >= cdim[2]) + kk -= cdim[2]; + if (i || j || k) { + inds[cid * 26 + l] = cell_getid(cdim, ii, jj, kk); + l += 1; + } } - - /* Get the top-level cells involved. 
*/ - for ( ci = t->ci ; ci->parent != NULL ; ci = ci->parent ); - if ( t->cj != NULL ) - for ( cj = t->cj ; cj->parent != NULL ; cj = cj->parent ); - else - cj = NULL; - - /* Get the cell IDs. */ - cid = ci - cells; - - /* Different weights for different tasks. */ - if ( t->type == task_type_ghost || - t->type == task_type_kick1 || - t->type == task_type_kick2 ) { - - /* Particle updates add only to vertex weight. */ - weights_v[cid] += w; - - } - - /* Self interaction? */ - else if ( ( t->type == task_type_self && ci->nodeID == nodeID ) || - ( t->type == task_type_sub && cj == NULL && ci->nodeID == nodeID ) ) { - - /* Self interactions add only to vertex weight. */ - weights_v[cid] += w; - - } - - /* Pair? */ - else if ( t->type == task_type_pair || - ( t->type == task_type_sub && cj != NULL ) ) { - - /* In-cell pair? */ - if ( ci == cj ) { - - /* Add weight to vertex for ci. */ - weights_v[cid] += w; - - } - - /* Distinct cells with local ci? */ - else if ( ci->nodeID == nodeID ) { - - /* Index of the jth cell. */ - cjd = cj - cells; - - /* Add half of weight to each cell. */ - if ( ci->nodeID == nodeID ) - weights_v[cid] += 0.5 * w; - if ( cj->nodeID == nodeID ) - weights_v[cjd] += 0.5 * w; - - /* Add Weight to edge. */ - for ( k = 26*cid ; inds[k] != cjd ; k++ ); - weights_e[ k ] += w; - for ( k = 26*cjd ; inds[k] != cid ; k++ ); - weights_e[ k ] += w; - - } - - } - - } - - /* Get the minimum scaling and re-scale if necessary. */ - if ( ( res = MPI_Allreduce( &wscale , &wscale_buff , 1 , MPI_FLOAT , MPI_MIN , MPI_COMM_WORLD ) ) != MPI_SUCCESS ) { - char buff[ MPI_MAX_ERROR_STRING ]; - MPI_Error_string( res , buff , &i ); - error( "Failed to allreduce the weight scales (%s)." , buff ); + } + } + } + + /* Init the weights arrays. */ + bzero(weights_e, sizeof(idx_t) * 26 * nr_cells); + bzero(weights_v, sizeof(idx_t) * nr_cells); + + /* Loop over the tasks... */ + for (j = 0; j < e->sched.nr_tasks; j++) { + + /* Get a pointer to the kth task. 
*/ + t = &tasks[j]; + + /* Skip un-interesting tasks. */ + if (t->type != task_type_self && t->type != task_type_pair && + t->type != task_type_sub && t->type != task_type_ghost && + t->type != task_type_kick1 && t->type != task_type_kick2) + continue; + + /* Get the task weight. */ + idx_t w = (t->toc - t->tic) * wscale; + if (w < 0) error("Bad task weight (%i).", w); + + /* Do we need to re-scale? */ + wtot += w; + while (wtot > wmax) { + wscale /= 2; + wtot /= 2; + w /= 2; + for (k = 0; k < 26 * nr_cells; k++) weights_e[k] *= 0.5; + for (k = 0; k < nr_cells; k++) weights_v[k] *= 0.5; + } + + /* Get the top-level cells involved. */ + for (ci = t->ci; ci->parent != NULL; ci = ci->parent) + ; + if (t->cj != NULL) + for (cj = t->cj; cj->parent != NULL; cj = cj->parent) + ; + else + cj = NULL; + + /* Get the cell IDs. */ + cid = ci - cells; + + /* Different weights for different tasks. */ + if (t->type == task_type_ghost || t->type == task_type_kick1 || + t->type == task_type_kick2) { + + /* Particle updates add only to vertex weight. */ + weights_v[cid] += w; + } - if (wscale_buff != wscale) { - float scale = wscale_buff / wscale; - for (k = 0; k < 26 * nr_cells; k++) weights_e[k] *= scale; - for (k = 0; k < nr_cells; k++) weights_v[k] *= scale; + + /* Self interaction? */ + else if ((t->type == task_type_self && ci->nodeID == nodeID) || + (t->type == task_type_sub && cj == NULL && ci->nodeID == nodeID)) { + + /* Self interactions add only to vertex weight. */ + weights_v[cid] += w; + + } + + /* Pair? */ + else if (t->type == task_type_pair || + (t->type == task_type_sub && cj != NULL)) { + + /* In-cell pair? */ + if (ci == cj) { + + /* Add weight to vertex for ci. */ + weights_v[cid] += w; + + } + + /* Distinct cells with local ci? */ + else if (ci->nodeID == nodeID) { + + /* Index of the jth cell. */ + cjd = cj - cells; + + /* Add half of weight to each cell. 
*/ + if (ci->nodeID == nodeID) weights_v[cid] += 0.5 * w; + if (cj->nodeID == nodeID) weights_v[cjd] += 0.5 * w; + + /* Add Weight to edge. */ + for (k = 26 * cid; inds[k] != cjd; k++) + ; + weights_e[k] += w; + for (k = 26 * cjd; inds[k] != cid; k++) + ; + weights_e[k] += w; + } } - - /* Merge the weights arrays accross all nodes. */ -#if IDXTYPEWIDTH==32 - if ( ( res = MPI_Reduce( ( nodeID == 0 ) ? MPI_IN_PLACE : weights_v , weights_v , nr_cells , MPI_INT , MPI_SUM , 0 , MPI_COMM_WORLD ) ) != MPI_SUCCESS ) { + } + + /* Get the minimum scaling and re-scale if necessary. */ + if ((res = MPI_Allreduce(&wscale, &wscale_buff, 1, MPI_FLOAT, MPI_MIN, + MPI_COMM_WORLD)) != MPI_SUCCESS) { + char buff[MPI_MAX_ERROR_STRING]; + MPI_Error_string(res, buff, &i); + error("Failed to allreduce the weight scales (%s).", buff); + } + if (wscale_buff != wscale) { + float scale = wscale_buff / wscale; + for (k = 0; k < 26 * nr_cells; k++) weights_e[k] *= scale; + for (k = 0; k < nr_cells; k++) weights_v[k] *= scale; + } + +/* Merge the weights arrays accross all nodes. */ +#if IDXTYPEWIDTH == 32 + if ((res = MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_v, weights_v, + nr_cells, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD)) != + MPI_SUCCESS) { #else - if ( ( res = MPI_Reduce( ( nodeID == 0 ) ? MPI_IN_PLACE : weights_v , weights_v , nr_cells , MPI_LONG_LONG_INT , MPI_SUM , 0 , MPI_COMM_WORLD ) ) != MPI_SUCCESS ) { + if ((res = MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_v, weights_v, + nr_cells, MPI_LONG_LONG_INT, MPI_SUM, 0, + MPI_COMM_WORLD)) != MPI_SUCCESS) { #endif - char buff[ MPI_MAX_ERROR_STRING ]; - MPI_Error_string( res , buff , &i ); - error( "Failed to allreduce vertex weights (%s)." , buff ); - } -#if IDXTYPEWIDTH==32 - if ( MPI_Reduce( ( nodeID == 0 ) ? 
MPI_IN_PLACE : weights_e , weights_e , 26*nr_cells , MPI_INT , MPI_SUM , 0 , MPI_COMM_WORLD ) != MPI_SUCCESS ) + char buff[MPI_MAX_ERROR_STRING]; + MPI_Error_string(res, buff, &i); + error("Failed to allreduce vertex weights (%s).", buff); + } +#if IDXTYPEWIDTH == 32 + if (MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_e, weights_e, + 26 * nr_cells, MPI_INT, MPI_SUM, 0, + MPI_COMM_WORLD) != MPI_SUCCESS) #else - if ( MPI_Reduce( ( nodeID == 0 ) ? MPI_IN_PLACE : weights_e , weights_e , 26*nr_cells , MPI_LONG_LONG_INT , MPI_SUM , 0 , MPI_COMM_WORLD ) != MPI_SUCCESS ) + if (MPI_Reduce((nodeID == 0) ? MPI_IN_PLACE : weights_e, weights_e, + 26 * nr_cells, MPI_LONG_LONG_INT, MPI_SUM, 0, + MPI_COMM_WORLD) != MPI_SUCCESS) #endif - error( "Failed to allreduce edge weights." ); - - /* As of here, only one node needs to compute the partition. */ - if ( nodeID == 0 ) { - - /* Check that the edge weights are fully symmetric. */ - /* for ( cid = 0 ; cid < nr_cells ; cid++ ) - for ( k = 0 ; k < 26 ; k++ ) { - cjd = inds[ cid*26 + k ]; - for ( j = 26*cjd ; inds[j] != cid ; j++ ); - if ( weights_e[ cid*26+k ] != weights_e[ j ] ) - error( "Unsymmetric edge weights detected (%i vs %i)." , weights_e[ cid*26+k ] , weights_e[ j ] ); - } */ - /* int w_min = weights_e[0], w_max = weights_e[0], w_tot = weights_e[0]; - for ( k = 1 ; k < 26*nr_cells ; k++ ) { - w_tot += weights_e[k]; - if ( weights_e[k] < w_min ) - w_min = weights_e[k]; - else if ( weights_e[k] > w_max ) - w_max = weights_e[k]; - } - message( "edge weights in [ %i , %i ], tot=%i." , w_min , w_max , w_tot ); - w_min = weights_e[0], w_max = weights_e[0]; w_tot = weights_v[0]; - for ( k = 1 ; k < nr_cells ; k++ ) { - w_tot += weights_v[k]; - if ( weights_v[k] < w_min ) - w_min = weights_v[k]; - else if ( weights_v[k] > w_max ) - w_max = weights_v[k]; - } - message( "vertex weights in [ %i , %i ], tot=%i." , w_min , w_max , w_tot ); */ - - /* Make sure there are no zero weights. 
*/ - for ( k = 0 ; k < 26*nr_cells ; k++ ) - if ( weights_e[k] == 0 ) - weights_e[k] = 1; - for ( k = 0 ; k < nr_cells ; k++ ) - if ( ( weights_v[k] *= vscale ) == 0 ) - weights_v[k] = 1; - - /* Allocate and fill the connection array. */ - idx_t *offsets; - if ( ( offsets = (idx_t *)malloc( sizeof(idx_t) * (nr_cells + 1) ) ) == NULL ) - error( "Failed to allocate offsets buffer." ); - offsets[0] = 0; - for ( k = 0 ; k < nr_cells ; k++ ) - offsets[k+1] = offsets[k] + 26; - - /* Set the METIS options. */ - idx_t options[METIS_NOPTIONS]; - METIS_SetDefaultOptions( options ); - options[ METIS_OPTION_OBJTYPE ] = METIS_OBJTYPE_CUT; - options[ METIS_OPTION_NUMBERING ] = 0; - options[ METIS_OPTION_CONTIG ] = 1; - options[ METIS_OPTION_NCUTS ] = 10; - options[ METIS_OPTION_NITER ] = 20; - // options[ METIS_OPTION_UFACTOR ] = 1; - - /* Set the initial partition, although this is probably ignored. */ - for ( k = 0 ; k < nr_cells ; k++ ) - nodeIDs[k] = cells[k].nodeID; - - /* Call METIS. */ - idx_t one = 1, idx_nr_cells = nr_cells, idx_nr_nodes = nr_nodes; - idx_t objval; - if ( METIS_PartGraphRecursive( &idx_nr_cells , &one , offsets , inds , weights_v , NULL , weights_e , &idx_nr_nodes , NULL , NULL , options , &objval , nodeIDs ) != METIS_OK ) - error( "Call to METIS_PartGraphKway failed." ); - - /* Dump the 3d array of cell IDs. */ - /* printf( "engine_repartition: nodeIDs = reshape( [" ); - for ( i = 0 ; i < cdim[0]*cdim[1]*cdim[2] ; i++ ) - printf( "%i " , (int)nodeIDs[ i ] ); - printf("] ,%i,%i,%i);\n",cdim[0],cdim[1],cdim[2]); */ - + error("Failed to allreduce edge weights."); + + /* As of here, only one node needs to compute the partition. */ + if (nodeID == 0) { + + /* Check that the edge weights are fully symmetric. 
*/ + /* for ( cid = 0 ; cid < nr_cells ; cid++ ) + for ( k = 0 ; k < 26 ; k++ ) { + cjd = inds[ cid*26 + k ]; + for ( j = 26*cjd ; inds[j] != cid ; j++ ); + if ( weights_e[ cid*26+k ] != weights_e[ j ] ) + error( "Unsymmetric edge weights detected (%i vs %i)." , + weights_e[ cid*26+k ] , weights_e[ j ] ); + } */ + /* int w_min = weights_e[0], w_max = weights_e[0], w_tot = weights_e[0]; + for ( k = 1 ; k < 26*nr_cells ; k++ ) { + w_tot += weights_e[k]; + if ( weights_e[k] < w_min ) + w_min = weights_e[k]; + else if ( weights_e[k] > w_max ) + w_max = weights_e[k]; } - - /* Broadcast the result of the partition. */ - if ( MPI_Bcast( nodeIDs , nr_cells , MPI_INT , 0 , MPI_COMM_WORLD ) != MPI_SUCCESS ) - error( "Failed to bcast the node IDs." ); - - /* Set the cell nodeIDs and clear any non-local parts. */ - for ( k = 0 ; k < nr_cells ; k++ ) { - cells[k].nodeID = nodeIDs[k]; - if ( nodeIDs[k] == nodeID ) - my_cells += 1; + message( "edge weights in [ %i , %i ], tot=%i." , w_min , w_max , w_tot ); + w_min = weights_e[0], w_max = weights_e[0]; w_tot = weights_v[0]; + for ( k = 1 ; k < nr_cells ; k++ ) { + w_tot += weights_v[k]; + if ( weights_v[k] < w_min ) + w_min = weights_v[k]; + else if ( weights_v[k] > w_max ) + w_max = weights_v[k]; } - - /* Clean up. */ - free( inds ); - free( weights_v ); - free( weights_e ); - free( nodeIDs ); - - /* Now comes the tricky part: Exchange particles between all nodes. - This is done in two steps, first allreducing a matrix of - how many particles go from where to where, then re-allocating - the parts array, and emiting the sends and receives. - Finally, the space, tasks, and proxies need to be rebuilt. */ - - /* Redistribute the particles between the nodes. */ - engine_redistribute( e ); - - /* Make the proxies. */ - engine_makeproxies( e ); - - /* Tell the engine it should re-build whenever possible */ - e->forcerebuild = 1; - + message( "vertex weights in [ %i , %i ], tot=%i." 
, w_min , w_max , w_tot ); + */ + + /* Make sure there are no zero weights. */ + for (k = 0; k < 26 * nr_cells; k++) + if (weights_e[k] == 0) weights_e[k] = 1; + for (k = 0; k < nr_cells; k++) + if ((weights_v[k] *= vscale) == 0) weights_v[k] = 1; + + /* Allocate and fill the connection array. */ + idx_t *offsets; + if ((offsets = (idx_t *)malloc(sizeof(idx_t) * (nr_cells + 1))) == NULL) + error("Failed to allocate offsets buffer."); + offsets[0] = 0; + for (k = 0; k < nr_cells; k++) offsets[k + 1] = offsets[k] + 26; + + /* Set the METIS options. +1 to keep the GCC sanitizer happy. */ + idx_t options[METIS_NOPTIONS + 1]; + METIS_SetDefaultOptions(options); + options[METIS_OPTION_OBJTYPE] = METIS_OBJTYPE_CUT; + options[METIS_OPTION_NUMBERING] = 0; + options[METIS_OPTION_CONTIG] = 1; + options[METIS_OPTION_NCUTS] = 10; + options[METIS_OPTION_NITER] = 20; + // options[ METIS_OPTION_UFACTOR ] = 1; + + /* Set the initial partition, although this is probably ignored. */ + for (k = 0; k < nr_cells; k++) nodeIDs[k] = cells[k].nodeID; + + /* Call METIS. */ + idx_t one = 1, idx_nr_cells = nr_cells, idx_nr_nodes = nr_nodes; + idx_t objval; + if (METIS_PartGraphRecursive(&idx_nr_cells, &one, offsets, inds, weights_v, + NULL, weights_e, &idx_nr_nodes, NULL, NULL, + options, &objval, nodeIDs) != METIS_OK) + error("Call to METIS_PartGraphKway failed."); + + /* Dump the 3d array of cell IDs. */ + /* printf( "engine_repartition: nodeIDs = reshape( [" ); + for ( i = 0 ; i < cdim[0]*cdim[1]*cdim[2] ; i++ ) + printf( "%i " , (int)nodeIDs[ i ] ); + printf("] ,%i,%i,%i);\n",cdim[0],cdim[1],cdim[2]); */ + } + +/* Broadcast the result of the partition. */ +#if IDXTYPEWIDTH == 32 + if (MPI_Bcast(nodeIDs, nr_cells, MPI_INT, 0, MPI_COMM_WORLD) != MPI_SUCCESS) + error("Failed to bcast the node IDs."); #else - error( "SWIFT was not compiled with MPI and METIS support." 
); + if (MPI_Bcast(nodeIDs, nr_cells, MPI_LONG_LONG_INT, 0, MPI_COMM_WORLD) != + MPI_SUCCESS) + error("Failed to bcast the node IDs."); #endif - } - - + /* Set the cell nodeIDs and clear any non-local parts. */ + for (k = 0; k < nr_cells; k++) { + cells[k].nodeID = nodeIDs[k]; + if (nodeIDs[k] == nodeID) my_cells += 1; + } + + /* Clean up. */ + free(inds); + free(weights_v); + free(weights_e); + free(nodeIDs); + + /* Now comes the tricky part: Exchange particles between all nodes. + This is done in two steps, first allreducing a matrix of + how many particles go from where to where, then re-allocating + the parts array, and emiting the sends and receives. + Finally, the space, tasks, and proxies need to be rebuilt. */ + + /* Redistribute the particles between the nodes. */ + engine_redistribute(e); + + /* Make the proxies. */ + engine_makeproxies(e); + + /* Tell the engine it should re-build whenever possible */ + e->forcerebuild = 1; + +#else + error("SWIFT was not compiled with MPI and METIS support."); +#endif +} + /** * @brief Add up/down gravity tasks to a cell hierarchy. * @@ -585,21 +672,20 @@ void engine_repartition ( struct engine *e ) { * @param up The upward gravity #task. * @param down The downward gravity #task. */ - -void engine_addtasks_grav ( struct engine *e , struct cell *c , struct task *up , struct task *down ) { - - /* Link the tasks to this cell. */ - c->grav_up = up; - c->grav_down = down; - - /* Recurse? */ - if ( c->split ) - for ( int k = 0 ; k < 8 ; k++ ) - if ( c->progeny[k] != NULL ) - engine_addtasks_grav( e , c->progeny[k] , up , down ); - } +void engine_addtasks_grav(struct engine *e, struct cell *c, struct task *up, + struct task *down) { + /* Link the tasks to this cell. */ + c->grav_up = up; + c->grav_down = down; + + /* Recurse? */ + if (c->split) + for (int k = 0; k < 8; k++) + if (c->progeny[k] != NULL) + engine_addtasks_grav(e, c->progeny[k], up, down); +} /** * @brief Add send tasks to a hierarchy of cells. 
@@ -609,44 +695,45 @@ void engine_addtasks_grav ( struct engine *e , struct cell *c , struct task *up * @param cj The receiving #cell */ -void engine_addtasks_send ( struct engine *e , struct cell *ci , struct cell *cj ) { +void engine_addtasks_send(struct engine *e, struct cell *ci, struct cell *cj) { - int k; - struct link *l = NULL; - struct scheduler *s = &e->sched; + int k; + struct link *l = NULL; + struct scheduler *s = &e->sched; - /* Check if any of the density tasks are for the target node. */ - for ( l = ci->density ; l != NULL ; l = l->next ) - if ( l->t->ci->nodeID == cj->nodeID || - ( l->t->cj != NULL && l->t->cj->nodeID == cj->nodeID ) ) - break; + /* Check if any of the density tasks are for the target node. */ + for (l = ci->density; l != NULL; l = l->next) + if (l->t->ci->nodeID == cj->nodeID || + (l->t->cj != NULL && l->t->cj->nodeID == cj->nodeID)) + break; - /* If so, attach send tasks. */ - if ( l != NULL ) { + /* If so, attach send tasks. */ + if (l != NULL) { - /* Create the tasks. */ - struct task *t_xv = scheduler_addtask( &e->sched , task_type_send , task_subtype_none , 2*ci->tag , 0 , ci , cj , 0 ); - struct task *t_rho = scheduler_addtask( &e->sched , task_type_send , task_subtype_none , 2*ci->tag + 1 , 0 , ci , cj , 0 ); + /* Create the tasks. */ + struct task *t_xv = + scheduler_addtask(&e->sched, task_type_send, task_subtype_none, + 2 * ci->tag, 0, ci, cj, 0); + struct task *t_rho = + scheduler_addtask(&e->sched, task_type_send, task_subtype_none, + 2 * ci->tag + 1, 0, ci, cj, 0); - /* The send_rho task depends on the cell's ghost task. */ - scheduler_addunlock( s , ci->super->ghost , t_rho ); + /* The send_rho task depends on the cell's ghost task. */ + scheduler_addunlock(s, ci->super->ghost, t_rho); - /* The send_rho task should unlock the super-cell's kick2 task. */ - scheduler_addunlock( s , t_rho , ci->super->kick2 ); + /* The send_rho task should unlock the super-cell's kick2 task. 
*/ + scheduler_addunlock(s, t_rho, ci->super->kick2); - /* The send_xv task should unlock the super-cell's ghost task. */ - scheduler_addunlock( s , t_xv , ci->super->ghost ); + /* The send_xv task should unlock the super-cell's ghost task. */ + scheduler_addunlock(s, t_xv, ci->super->ghost); - } - - /* Recurse? */ - else if ( ci->split ) - for ( k = 0 ; k < 8 ; k++ ) - if ( ci->progeny[k] != NULL ) - engine_addtasks_send( e , ci->progeny[k] , cj ); - - } + } + /* Recurse? */ + else if (ci->split) + for (k = 0; k < 8; k++) + if (ci->progeny[k] != NULL) engine_addtasks_send(e, ci->progeny[k], cj); +} /** * @brief Add recv tasks to a hierarchy of cells. @@ -657,796 +744,799 @@ void engine_addtasks_send ( struct engine *e , struct cell *ci , struct cell *cj * @param t_rho The recv_rho #task, if it has already been created. */ -void engine_addtasks_recv ( struct engine *e , struct cell *c , struct task *t_xv , struct task *t_rho ) { - - int k; - struct scheduler *s = &e->sched; - - /* Do we need to construct a recv task? */ - if ( t_xv == NULL && c->nr_density > 0 ) { - - /* Create the tasks. */ - t_xv = c->recv_xv = scheduler_addtask( &e->sched , task_type_recv , task_subtype_none , 2*c->tag , 0 , c , NULL , 0 ); - t_rho = c->recv_rho = scheduler_addtask( &e->sched , task_type_recv , task_subtype_none , 2*c->tag + 1 , 0 , c , NULL , 0 ); - - } - - /* Add dependencies. */ - for ( struct link *l = c->density ; l != NULL ; l = l->next ) { - scheduler_addunlock( s , t_xv , l->t ); - scheduler_addunlock( s , l->t , t_rho ); - } - for ( struct link *l = c->force ; l != NULL ; l = l->next ) - scheduler_addunlock( s , t_rho , l->t ); - if ( c->sorts != NULL ) - scheduler_addunlock( s , t_xv , c->sorts ); - - /* Recurse? 
*/ - if ( c->split ) - for ( k = 0 ; k < 8 ; k++ ) - if ( c->progeny[k] != NULL ) - engine_addtasks_recv( e , c->progeny[k] , t_xv , t_rho ); - - } - +void engine_addtasks_recv(struct engine *e, struct cell *c, struct task *t_xv, + struct task *t_rho) { + + int k; + struct scheduler *s = &e->sched; + + /* Do we need to construct a recv task? */ + if (t_xv == NULL && c->nr_density > 0) { + + /* Create the tasks. */ + t_xv = c->recv_xv = + scheduler_addtask(&e->sched, task_type_recv, task_subtype_none, + 2 * c->tag, 0, c, NULL, 0); + t_rho = c->recv_rho = + scheduler_addtask(&e->sched, task_type_recv, task_subtype_none, + 2 * c->tag + 1, 0, c, NULL, 0); + } + + /* Add dependencies. */ + for (struct link *l = c->density; l != NULL; l = l->next) { + scheduler_addunlock(s, t_xv, l->t); + scheduler_addunlock(s, l->t, t_rho); + } + for (struct link *l = c->force; l != NULL; l = l->next) + scheduler_addunlock(s, t_rho, l->t); + if (c->sorts != NULL) scheduler_addunlock(s, t_xv, c->sorts); + + /* Recurse? */ + if (c->split) + for (k = 0; k < 8; k++) + if (c->progeny[k] != NULL) + engine_addtasks_recv(e, c->progeny[k], t_xv, t_rho); +} /** * @brief Exchange cell structures with other nodes. * * @param e The #engine. */ - -void engine_exchange_cells ( struct engine *e ) { -#ifdef WITH_MPI +void engine_exchange_cells(struct engine *e) { - int j, k, pid, count = 0; - struct pcell *pcells; - struct space *s = e->s; - struct cell *cells = s->cells; - int nr_cells = s->nr_cells; - int nr_proxies = e->nr_proxies; - int offset[ nr_cells ]; - MPI_Request reqs_in[ engine_maxproxies ]; - MPI_Request reqs_out[ engine_maxproxies ]; - MPI_Status status; - struct part *parts = &s->parts[ s->nr_parts ]; - - /* Run through the cells and get the size of the ones that will be sent off. */ - for ( k = 0 ; k < nr_cells ; k++ ) { - offset[k] = count; - if ( cells[k].sendto ) - count += ( cells[k].pcell_size = cell_getsize( &cells[k] ) ); - } - - /* Allocate the pcells. 
*/ - if ( ( pcells = (struct pcell *)malloc( sizeof(struct pcell) * count ) ) == NULL ) - error( "Failed to allocate pcell buffer." ); - - /* Pack the cells. */ - cell_next_tag = 0; - for ( k = 0 ; k < nr_cells ; k++ ) - if ( cells[k].sendto ) { - cell_pack( &cells[k] , &pcells[ offset[k] ] ); - cells[k].pcell = &pcells[ offset[k] ]; - } +#ifdef WITH_MPI - /* Launch the proxies. */ - for ( k = 0 ; k < nr_proxies ; k++ ) { - proxy_cells_exch1( &e->proxies[k] ); - reqs_in[k] = e->proxies[k].req_cells_count_in; - reqs_out[k] = e->proxies[k].req_cells_count_out; - } - - /* Wait for each count to come in and start the recv. */ - for ( k = 0 ; k < nr_proxies ; k++ ) { - if ( MPI_Waitany( nr_proxies , reqs_in , &pid , &status ) != MPI_SUCCESS || - pid == MPI_UNDEFINED ) - error( "MPI_Waitany failed." ); - // message( "request from proxy %i has arrived." , pid ); - proxy_cells_exch2( &e->proxies[pid] ); - } - - /* Wait for all the sends to have finnished too. */ - if ( MPI_Waitall( nr_proxies , reqs_out , MPI_STATUSES_IGNORE ) != MPI_SUCCESS ) - error( "MPI_Waitall on sends failed." ); - - /* Set the requests for the cells. */ - for ( k = 0 ; k < nr_proxies ; k++ ) { - reqs_in[k] = e->proxies[k].req_cells_in; - reqs_out[k] = e->proxies[k].req_cells_out; - } - - /* Wait for each pcell array to come in from the proxies. */ - for ( k = 0 ; k < nr_proxies ; k++ ) { - if ( MPI_Waitany( nr_proxies , reqs_in , &pid , &status ) != MPI_SUCCESS || - pid == MPI_UNDEFINED ) - error( "MPI_Waitany failed." ); - // message( "cell data from proxy %i has arrived." , pid ); - for ( count = 0 , j = 0 ; j < e->proxies[pid].nr_cells_in ; j++ ) - count += cell_unpack( &e->proxies[pid].pcells_in[count] , e->proxies[pid].cells_in[j] , e->s ); - } - - /* Wait for all the sends to have finnished too. */ - if ( MPI_Waitall( nr_proxies , reqs_out , MPI_STATUSES_IGNORE ) != MPI_SUCCESS ) - error( "MPI_Waitall on sends failed." 
); - - /* Count the number of particles we need to import and re-allocate - the buffer if needed. */ - for ( count = 0 , k = 0 ; k < nr_proxies ; k++ ) - for ( j = 0 ; j < e->proxies[k].nr_cells_in ; j++ ) - count += e->proxies[k].cells_in[j]->count; - if ( count > s->size_parts_foreign ) { - if ( s->parts_foreign != NULL ) - free( s->parts_foreign ); - s->size_parts_foreign = 1.1 * count; - if ( posix_memalign( (void **)&s->parts_foreign , part_align , sizeof(struct part) * s->size_parts_foreign ) != 0 ) - error( "Failed to allocate foreign part data." ); - } - - /* Unpack the cells and link to the particle data. */ - parts = s->parts_foreign; - for ( k = 0 ; k < nr_proxies ; k++ ) { - for ( count = 0 , j = 0 ; j < e->proxies[k].nr_cells_in ; j++ ) { - count += cell_link( e->proxies[k].cells_in[j] , parts ); - parts = &parts[ e->proxies[k].cells_in[j]->count ]; - } - } - s->nr_parts_foreign = parts - s->parts_foreign; - - /* Is the parts buffer large enough? */ - if ( s->nr_parts_foreign > s->size_parts_foreign ) - error( "Foreign parts buffer too small." ); - - /* Free the pcell buffer. */ - free( pcells ); - -#else - error( "SWIFT was not compiled with MPI support." ); -#endif + int j, k, pid, count = 0; + struct pcell *pcells; + struct space *s = e->s; + struct cell *cells = s->cells; + int nr_cells = s->nr_cells; + int nr_proxies = e->nr_proxies; + int offset[nr_cells]; + MPI_Request reqs_in[engine_maxproxies]; + MPI_Request reqs_out[engine_maxproxies]; + MPI_Status status; + struct part *parts = &s->parts[s->nr_parts]; + + /* Run through the cells and get the size of the ones that will be sent off. + */ + for (k = 0; k < nr_cells; k++) { + offset[k] = count; + if (cells[k].sendto) + count += (cells[k].pcell_size = cell_getsize(&cells[k])); + } + + /* Allocate the pcells. */ + if ((pcells = (struct pcell *)malloc(sizeof(struct pcell) * count)) == NULL) + error("Failed to allocate pcell buffer."); + + /* Pack the cells. 
*/ + cell_next_tag = 0; + for (k = 0; k < nr_cells; k++) + if (cells[k].sendto) { + cell_pack(&cells[k], &pcells[offset[k]]); + cells[k].pcell = &pcells[offset[k]]; + } + /* Launch the proxies. */ + for (k = 0; k < nr_proxies; k++) { + proxy_cells_exch1(&e->proxies[k]); + reqs_in[k] = e->proxies[k].req_cells_count_in; + reqs_out[k] = e->proxies[k].req_cells_count_out; + } + + /* Wait for each count to come in and start the recv. */ + for (k = 0; k < nr_proxies; k++) { + if (MPI_Waitany(nr_proxies, reqs_in, &pid, &status) != MPI_SUCCESS || + pid == MPI_UNDEFINED) + error("MPI_Waitany failed."); + // message( "request from proxy %i has arrived." , pid ); + proxy_cells_exch2(&e->proxies[pid]); + } + + /* Wait for all the sends to have finnished too. */ + if (MPI_Waitall(nr_proxies, reqs_out, MPI_STATUSES_IGNORE) != MPI_SUCCESS) + error("MPI_Waitall on sends failed."); + + /* Set the requests for the cells. */ + for (k = 0; k < nr_proxies; k++) { + reqs_in[k] = e->proxies[k].req_cells_in; + reqs_out[k] = e->proxies[k].req_cells_out; + } + + /* Wait for each pcell array to come in from the proxies. */ + for (k = 0; k < nr_proxies; k++) { + if (MPI_Waitany(nr_proxies, reqs_in, &pid, &status) != MPI_SUCCESS || + pid == MPI_UNDEFINED) + error("MPI_Waitany failed."); + // message( "cell data from proxy %i has arrived." , pid ); + for (count = 0, j = 0; j < e->proxies[pid].nr_cells_in; j++) + count += cell_unpack(&e->proxies[pid].pcells_in[count], + e->proxies[pid].cells_in[j], e->s); + } + + /* Wait for all the sends to have finnished too. */ + if (MPI_Waitall(nr_proxies, reqs_out, MPI_STATUSES_IGNORE) != MPI_SUCCESS) + error("MPI_Waitall on sends failed."); + + /* Count the number of particles we need to import and re-allocate + the buffer if needed. 
*/ + for (count = 0, k = 0; k < nr_proxies; k++) + for (j = 0; j < e->proxies[k].nr_cells_in; j++) + count += e->proxies[k].cells_in[j]->count; + if (count > s->size_parts_foreign) { + if (s->parts_foreign != NULL) free(s->parts_foreign); + s->size_parts_foreign = 1.1 * count; + if (posix_memalign((void **)&s->parts_foreign, part_align, + sizeof(struct part) * s->size_parts_foreign) != 0) + error("Failed to allocate foreign part data."); + } + + /* Unpack the cells and link to the particle data. */ + parts = s->parts_foreign; + for (k = 0; k < nr_proxies; k++) { + for (count = 0, j = 0; j < e->proxies[k].nr_cells_in; j++) { + count += cell_link(e->proxies[k].cells_in[j], parts); + parts = &parts[e->proxies[k].cells_in[j]->count]; } + } + s->nr_parts_foreign = parts - s->parts_foreign; + + /* Is the parts buffer large enough? */ + if (s->nr_parts_foreign > s->size_parts_foreign) + error("Foreign parts buffer too small."); + /* Free the pcell buffer. */ + free(pcells); + +#else + error("SWIFT was not compiled with MPI support."); +#endif +} /** * @brief Exchange straying parts with other nodes. * * @param e The #engine. - * @param offset The index in the parts array as of which the foreign parts reside. + * @param offset The index in the parts array as of which the foreign parts + *reside. * @param ind The ID of the foreign #cell. * @param N The number of stray parts. * * @return The number of arrived parts copied to parts and xparts. */ - -int engine_exchange_strays ( struct engine *e , int offset , int *ind , int N ) { + +int engine_exchange_strays(struct engine *e, int offset, int *ind, int N) { #ifdef WITH_MPI - int k, pid, count = 0, nr_in = 0, nr_out = 0; - MPI_Request reqs_in[ 2*engine_maxproxies ]; - MPI_Request reqs_out[ 2*engine_maxproxies ]; - MPI_Status status; - struct proxy *p; - struct space *s = e->s; - - /* Re-set the proxies. 
*/ - for ( k = 0 ; k < e->nr_proxies ; k++ ) - e->proxies[k].nr_parts_out = 0; - - /* Put the parts into the corresponding proxies. */ - for ( k = 0 ; k < N ; k++ ) { - pid = e->proxy_ind[ e->s->cells[ ind[k] ].nodeID ]; - if ( pid < 0 ) - error( "Do not have a proxy for the requested nodeID." ); - proxy_parts_load( &e->proxies[pid] , &s->parts[offset + k] , &s->xparts[offset + k] , 1 ); - } - - /* Launch the proxies. */ - for ( k = 0 ; k < e->nr_proxies ; k++ ) { - proxy_parts_exch1( &e->proxies[k] ); - reqs_in[k] = e->proxies[k].req_parts_count_in; - reqs_out[k] = e->proxies[k].req_parts_count_out; - } - - /* Wait for each count to come in and start the recv. */ - for ( k = 0 ; k < e->nr_proxies ; k++ ) { - if ( MPI_Waitany( e->nr_proxies , reqs_in , &pid , &status ) != MPI_SUCCESS || - pid == MPI_UNDEFINED ) - error( "MPI_Waitany failed." ); - // message( "request from proxy %i has arrived." , pid ); - proxy_parts_exch2( &e->proxies[pid] ); - } - - /* Wait for all the sends to have finnished too. */ - if ( MPI_Waitall( e->nr_proxies , reqs_out , MPI_STATUSES_IGNORE ) != MPI_SUCCESS ) - error( "MPI_Waitall on sends failed." ); - - /* Count the total number of incomming particles and make sure we have - enough space to accommodate them. */ - int count_in = 0; - for ( k = 0 ; k < e->nr_proxies ; k++ ) - count_in += e->proxies[k].nr_parts_in; - message("sent out %i particles, got %i back.", N, count_in); - if ( offset + count_in > s->size_parts ) { - s->size_parts = (offset + count_in) * 1.05; - struct part *parts_new; - struct xpart *xparts_new; - if ( posix_memalign( (void **)&parts_new , part_align , sizeof(struct part) * s->size_parts ) != 0 || - posix_memalign( (void **)&xparts_new , part_align , sizeof(struct xpart) * s->size_parts ) != 0 ) - error( "Failed to allocate new part data." 
); - memcpy( parts_new , s->parts , sizeof(struct part) * offset ); - memcpy( xparts_new , s->xparts , sizeof(struct xpart) * offset ); - free( s->parts ); - free( s->xparts ); - s->parts = parts_new; - s->xparts = xparts_new; + int k, pid, count = 0, nr_in = 0, nr_out = 0; + MPI_Request reqs_in[2 * engine_maxproxies]; + MPI_Request reqs_out[2 * engine_maxproxies]; + MPI_Status status; + struct proxy *p; + struct space *s = e->s; + + /* Re-set the proxies. */ + for (k = 0; k < e->nr_proxies; k++) e->proxies[k].nr_parts_out = 0; + + /* Put the parts into the corresponding proxies. */ + for (k = 0; k < N; k++) { + int node_id = e->s->cells[ind[k]].nodeID; + if (node_id < 0 || node_id >= e->nr_nodes) + error("Bad node ID %i.", node_id); + pid = e->proxy_ind[node_id]; + if (pid < 0) + error( + "Do not have a proxy for the requested nodeID %i for part with " + "id=%llu, x=[%e,%e,%e].", + node_id, s->parts[offset + k].id, s->parts[offset + k].x[0], + s->parts[offset + k].x[1], s->parts[offset + k].x[2]); + proxy_parts_load(&e->proxies[pid], &s->parts[offset + k], + &s->xparts[offset + k], 1); + } + + /* Launch the proxies. */ + for (k = 0; k < e->nr_proxies; k++) { + proxy_parts_exch1(&e->proxies[k]); + reqs_in[k] = e->proxies[k].req_parts_count_in; + reqs_out[k] = e->proxies[k].req_parts_count_out; + } + + /* Wait for each count to come in and start the recv. */ + for (k = 0; k < e->nr_proxies; k++) { + if (MPI_Waitany(e->nr_proxies, reqs_in, &pid, &status) != MPI_SUCCESS || + pid == MPI_UNDEFINED) + error("MPI_Waitany failed."); + // message( "request from proxy %i has arrived." , pid ); + proxy_parts_exch2(&e->proxies[pid]); + } + + /* Wait for all the sends to have finnished too. */ + if (MPI_Waitall(e->nr_proxies, reqs_out, MPI_STATUSES_IGNORE) != MPI_SUCCESS) + error("MPI_Waitall on sends failed."); + + /* Count the total number of incomming particles and make sure we have + enough space to accommodate them. 
*/ + int count_in = 0; + for (k = 0; k < e->nr_proxies; k++) count_in += e->proxies[k].nr_parts_in; + message("sent out %i particles, got %i back.", N, count_in); + if (offset + count_in > s->size_parts) { + s->size_parts = (offset + count_in) * 1.05; + struct part *parts_new; + struct xpart *xparts_new; + if (posix_memalign((void **)&parts_new, part_align, + sizeof(struct part) * s->size_parts) != 0 || + posix_memalign((void **)&xparts_new, part_align, + sizeof(struct xpart) * s->size_parts) != 0) + error("Failed to allocate new part data."); + memcpy(parts_new, s->parts, sizeof(struct part) * offset); + memcpy(xparts_new, s->xparts, sizeof(struct xpart) * offset); + free(s->parts); + free(s->xparts); + s->parts = parts_new; + s->xparts = xparts_new; + } + + /* Collect the requests for the particle data from the proxies. */ + for (k = 0; k < e->nr_proxies; k++) { + if (e->proxies[k].nr_parts_in > 0) { + reqs_in[2 * k] = e->proxies[k].req_parts_in; + reqs_in[2 * k + 1] = e->proxies[k].req_xparts_in; + nr_in += 1; + } else + reqs_in[2 * k] = reqs_in[2 * k + 1] = MPI_REQUEST_NULL; + if (e->proxies[k].nr_parts_out > 0) { + reqs_out[2 * k] = e->proxies[k].req_parts_out; + reqs_out[2 * k + 1] = e->proxies[k].req_xparts_out; + nr_out += 1; + } else + reqs_out[2 * k] = reqs_out[2 * k + 1] = MPI_REQUEST_NULL; + } + + /* Wait for each part array to come in and collect the new + parts from the proxies. */ + for (k = 0; k < 2 * (nr_in + nr_out); k++) { + int err; + if ((err = MPI_Waitany(2 * e->nr_proxies, reqs_in, &pid, &status)) != + MPI_SUCCESS) { + char buff[MPI_MAX_ERROR_STRING]; + int res; + MPI_Error_string(err, buff, &res); + error("MPI_Waitany failed (%s).", buff); } - - /* Collect the requests for the particle data from the proxies. 
*/ - for ( k = 0 ; k < e->nr_proxies ; k++ ) { - if ( e->proxies[k].nr_parts_in > 0 ) { - reqs_in[2*k] = e->proxies[k].req_parts_in; - reqs_in[2*k+1] = e->proxies[k].req_xparts_in; - nr_in += 1; - } - else - reqs_in[2*k] = reqs_in[2*k+1] = MPI_REQUEST_NULL; - if ( e->proxies[k].nr_parts_out > 0 ) { - reqs_out[2*k] = e->proxies[k].req_parts_out; - reqs_out[2*k+1] = e->proxies[k].req_xparts_out; - nr_out += 1; - } - else - reqs_out[2*k] = reqs_out[2*k+1] = MPI_REQUEST_NULL; - } - - /* Wait for each part array to come in and collect the new - parts from the proxies. */ - for ( k = 0 ; k < 2*(nr_in + nr_out) ; k++ ) { - int err; - if ( ( err = MPI_Waitany( 2*e->nr_proxies , reqs_in , &pid , &status ) ) != MPI_SUCCESS ) { - char buff[ MPI_MAX_ERROR_STRING ]; - int res; - MPI_Error_string( err , buff , &res ); - error( "MPI_Waitany failed (%s)." , buff ); - } - if ( pid == MPI_UNDEFINED ) - break; - // message( "request from proxy %i has arrived." , pid ); - if ( reqs_in[pid & ~1] == MPI_REQUEST_NULL && - reqs_in[pid | 1 ] == MPI_REQUEST_NULL ) { - p = &e->proxies[pid/2]; - memcpy( &s->parts[offset + count] , p->parts_in , sizeof(struct part) * p->nr_parts_in ); - memcpy( &s->xparts[offset + count] , p->xparts_in , sizeof(struct xpart) * p->nr_parts_in ); - count += p->nr_parts_in; - /* for ( int k = 0 ; k < p->nr_parts_in ; k++ ) - message( "received particle %lli, x=[%.3e %.3e %.3e], h=%.3e, from node %i." , - p->parts_in[k].id , p->parts_in[k].x[0] , p->parts_in[k].x[1] , p->parts_in[k].x[2] , - p->parts_in[k].h , p->nodeID ); */ - } - } - - /* Wait for all the sends to have finnished too. */ - if ( nr_out > 0 ) - if ( MPI_Waitall( 2*e->nr_proxies , reqs_out , MPI_STATUSES_IGNORE ) != MPI_SUCCESS ) - error( "MPI_Waitall on sends failed." ); - - /* Return the number of harvested parts. */ - return count; - -#else - error( "SWIFT was not compiled with MPI support." 
); - return 0; -#endif - + if (pid == MPI_UNDEFINED) break; + // message( "request from proxy %i has arrived." , pid ); + if (reqs_in[pid & ~1] == MPI_REQUEST_NULL && + reqs_in[pid | 1] == MPI_REQUEST_NULL) { + p = &e->proxies[pid >> 1]; + memcpy(&s->parts[offset + count], p->parts_in, + sizeof(struct part) * p->nr_parts_in); + memcpy(&s->xparts[offset + count], p->xparts_in, + sizeof(struct xpart) * p->nr_parts_in); + for (int k = offset; k < offset + count; k++) + message( + "received particle %lli, x=[%.3e %.3e %.3e], h=%.3e, from node %i.", + s->parts[k].id, s->parts[k].x[0], s->parts[k].x[1], + s->parts[k].x[2], s->parts[k].h, p->nodeID); + count += p->nr_parts_in; } + } + /* Wait for all the sends to have finnished too. */ + if (nr_out > 0) + if (MPI_Waitall(2 * e->nr_proxies, reqs_out, MPI_STATUSES_IGNORE) != + MPI_SUCCESS) + error("MPI_Waitall on sends failed."); + + /* Return the number of harvested parts. */ + return count; + +#else + error("SWIFT was not compiled with MPI support."); + return 0; +#endif +} /** * @brief Fill the #space's task list. * * @param e The #engine we are working with. */ - -void engine_maketasks ( struct engine *e ) { - - struct space *s = e->s; - struct scheduler *sched = &e->sched; - struct cell *cells = s->cells; - int nr_cells = s->nr_cells; - int nodeID = e->nodeID; - int i, j, k, ii, jj, kk, iii, jjj, kkk, cid, cjd, sid; - int *cdim = s->cdim; - struct task *t, *t2; - struct cell *ci, *cj; - - /* Re-set the scheduler. */ - scheduler_reset( sched , s->tot_cells * engine_maxtaskspercell ); - - /* Run through the highest level of cells and add pairs. 
*/ - for ( i = 0 ; i < cdim[0] ; i++ ) - for ( j = 0 ; j < cdim[1] ; j++ ) - for ( k = 0 ; k < cdim[2] ; k++ ) { - cid = cell_getid( cdim , i , j , k ); - if ( cells[cid].count == 0 ) - continue; - ci = &cells[cid]; - if ( ci->count == 0 ) - continue; - if ( ci->nodeID == nodeID ) - scheduler_addtask( sched , task_type_self , task_subtype_density , 0 , 0 , ci , NULL , 0 ); - for ( ii = -1 ; ii < 2 ; ii++ ) { - iii = i + ii; - if ( !s->periodic && ( iii < 0 || iii >= cdim[0] ) ) - continue; - iii = ( iii + cdim[0] ) % cdim[0]; - for ( jj = -1 ; jj < 2 ; jj++ ) { - jjj = j + jj; - if ( !s->periodic && ( jjj < 0 || jjj >= cdim[1] ) ) - continue; - jjj = ( jjj + cdim[1] ) % cdim[1]; - for ( kk = -1 ; kk < 2 ; kk++ ) { - kkk = k + kk; - if ( !s->periodic && ( kkk < 0 || kkk >= cdim[2] ) ) - continue; - kkk = ( kkk + cdim[2] ) % cdim[2]; - cjd = cell_getid( cdim , iii , jjj , kkk ); - cj = &cells[cjd]; - if ( cid >= cjd || cj->count == 0 || - ( ci->nodeID != nodeID && cj->nodeID != nodeID ) ) - continue; - sid = sortlistID[ (kk+1) + 3*( (jj+1) + 3*(ii+1) ) ]; - scheduler_addtask( sched , task_type_pair , task_subtype_density , sid , 0 , ci , cj , 1 ); - } - } - } - } - - /* Add the gravity mm tasks. */ - for ( i = 0 ; i < nr_cells ; i++ ) - if ( cells[i].gcount > 0 ) { - scheduler_addtask( sched , task_type_grav_mm , task_subtype_none , -1 , 0 , &cells[i] , NULL , 0 ); - for ( j = i+1 ; j < nr_cells ; j++ ) - if ( cells[j].gcount > 0 ) - scheduler_addtask( sched , task_type_grav_mm , task_subtype_none , -1 , 0 , &cells[i] , &cells[j] , 0 ); - } - - /* Split the tasks. */ - scheduler_splittasks( sched ); - - /* Allocate the list of cell-task links. The maximum number of links - is the number of cells (s->tot_cells) times the number of neighbours (27) - times the number of interaction types (2, density and force). 
*/ - if ( e->links != NULL ) - free( e->links ); - if ( ( e->links = malloc( sizeof(struct link) * s->tot_cells * 27 * 2 ) ) == NULL ) - error( "Failed to allocate cell-task links." ); - e->nr_links = 0; - - /* Add the gravity up/down tasks at the top-level cells and push them down. */ - for ( k = 0 ; k < nr_cells ; k++ ) - if ( cells[k].nodeID == nodeID && cells[k].gcount > 0 ) { - - /* Create tasks at top level. */ - struct task *up = scheduler_addtask( sched , task_type_grav_up , task_subtype_none , 0 , 0 , &cells[k] , NULL , 0 ); - struct task *down = scheduler_addtask( sched , task_type_grav_down , task_subtype_none , 0 , 0 , &cells[k] , NULL , 0 ); - - /* Push tasks down the cell hierarchy. */ - engine_addtasks_grav( e , &cells[k] , up , down ); - - } - - /* Count the number of tasks associated with each cell and - store the density tasks in each cell, and make each sort - depend on the sorts of its progeny. */ - for ( k = 0 ; k < sched->nr_tasks ; k++ ) { - - /* Get the current task. */ - t = &sched->tasks[k]; - if ( t->skip ) - continue; - - /* Link sort tasks together. */ - if ( t->type == task_type_sort && t->ci->split ) - for ( j = 0 ; j < 8 ; j++ ) - if ( t->ci->progeny[j] != NULL && t->ci->progeny[j]->sorts != NULL ) { - t->ci->progeny[j]->sorts->skip = 0; - scheduler_addunlock( sched , t->ci->progeny[j]->sorts , t ); - } - - /* Link density tasks to cells. 
*/ - if ( t->type == task_type_self ) { - atomic_inc( &t->ci->nr_tasks ); - if ( t->subtype == task_subtype_density ) { - t->ci->density = engine_addlink( e , t->ci->density , t ); - atomic_inc( &t->ci->nr_density ); - } - } - else if ( t->type == task_type_pair ) { - atomic_inc( &t->ci->nr_tasks ); - atomic_inc( &t->cj->nr_tasks ); - if ( t->subtype == task_subtype_density ) { - t->ci->density = engine_addlink( e , t->ci->density , t ); - atomic_inc( &t->ci->nr_density ); - t->cj->density = engine_addlink( e , t->cj->density , t ); - atomic_inc( &t->cj->nr_density ); - } - } - else if ( t->type == task_type_sub ) { - atomic_inc( &t->ci->nr_tasks ); - if ( t->cj != NULL ) - atomic_inc( &t->cj->nr_tasks ); - if ( t->subtype == task_subtype_density ) { - t->ci->density = engine_addlink( e , t->ci->density , t ); - atomic_inc( &t->ci->nr_density ); - if ( t->cj != NULL ) { - t->cj->density = engine_addlink( e , t->cj->density , t ); - atomic_inc( &t->cj->nr_density ); - } - } - } - - /* Link gravity multipole tasks to the up/down tasks. */ - if ( t->type == task_type_grav_mm || - ( t->type == task_type_sub && t->subtype == task_subtype_grav ) ) { - atomic_inc( &t->ci->nr_tasks ); - scheduler_addunlock( sched , t->ci->grav_up , t ); - scheduler_addunlock( sched , t , t->ci->grav_down ); - if ( t->cj != NULL && t->ci->grav_up != t->cj->grav_up ) { - scheduler_addunlock( sched , t->cj->grav_up , t ); - scheduler_addunlock( sched , t , t->cj->grav_down ); - } + +void engine_maketasks(struct engine *e) { + + struct space *s = e->s; + struct scheduler *sched = &e->sched; + struct cell *cells = s->cells; + int nr_cells = s->nr_cells; + int nodeID = e->nodeID; + int i, j, k, ii, jj, kk, iii, jjj, kkk, cid, cjd, sid; + int *cdim = s->cdim; + struct task *t, *t2; + struct cell *ci, *cj; + + /* Re-set the scheduler. */ + scheduler_reset(sched, s->tot_cells * engine_maxtaskspercell); + + /* Run through the highest level of cells and add pairs. 
*/ + for (i = 0; i < cdim[0]; i++) + for (j = 0; j < cdim[1]; j++) + for (k = 0; k < cdim[2]; k++) { + cid = cell_getid(cdim, i, j, k); + if (cells[cid].count == 0) continue; + ci = &cells[cid]; + if (ci->count == 0) continue; + if (ci->nodeID == nodeID) + scheduler_addtask(sched, task_type_self, task_subtype_density, 0, 0, + ci, NULL, 0); + for (ii = -1; ii < 2; ii++) { + iii = i + ii; + if (!s->periodic && (iii < 0 || iii >= cdim[0])) continue; + iii = (iii + cdim[0]) % cdim[0]; + for (jj = -1; jj < 2; jj++) { + jjj = j + jj; + if (!s->periodic && (jjj < 0 || jjj >= cdim[1])) continue; + jjj = (jjj + cdim[1]) % cdim[1]; + for (kk = -1; kk < 2; kk++) { + kkk = k + kk; + if (!s->periodic && (kkk < 0 || kkk >= cdim[2])) continue; + kkk = (kkk + cdim[2]) % cdim[2]; + cjd = cell_getid(cdim, iii, jjj, kkk); + cj = &cells[cjd]; + if (cid >= cjd || cj->count == 0 || + (ci->nodeID != nodeID && cj->nodeID != nodeID)) + continue; + sid = sortlistID[(kk + 1) + 3 * ((jj + 1) + 3 * (ii + 1))]; + scheduler_addtask(sched, task_type_pair, task_subtype_density, + sid, 0, ci, cj, 1); } - + } } - - /* Append a ghost task to each cell, and add kick2 tasks to the - super cells. */ - for ( k = 0 ; k < nr_cells ; k++ ) - engine_mkghosts( e , &cells[k] , NULL ); - - /* Run through the tasks and make force tasks for each density task. - Each force task depends on the cell ghosts and unlocks the kick2 task - of its super-cell. */ - kk = sched->nr_tasks; - for ( k = 0 ; k < kk ; k++ ) { - - /* Get a pointer to the task. */ - t = &sched->tasks[k]; - - /* Skip? */ - if ( t->skip ) - continue; - - /* Self-interaction? 
*/ - if ( t->type == task_type_self && t->subtype == task_subtype_density ) { - scheduler_addunlock( sched , t , t->ci->super->ghost ); - t2 = scheduler_addtask( sched , task_type_self , task_subtype_force , 0 , 0 , t->ci , NULL , 0 ); - scheduler_addunlock( sched , t->ci->super->ghost , t2 ); - scheduler_addunlock( sched , t2 , t->ci->super->kick2 ); - t->ci->force = engine_addlink( e , t->ci->force , t2 ); - atomic_inc( &t->ci->nr_force ); - } - - /* Otherwise, pair interaction? */ - else if ( t->type == task_type_pair && t->subtype == task_subtype_density ) { - t2 = scheduler_addtask( sched , task_type_pair , task_subtype_force , 0 , 0 , t->ci , t->cj , 0 ); - if ( t->ci->nodeID == nodeID ) { - scheduler_addunlock( sched , t , t->ci->super->ghost ); - scheduler_addunlock( sched , t->ci->super->ghost , t2 ); - scheduler_addunlock( sched , t2 , t->ci->super->kick2 ); - } - if ( t->cj->nodeID == nodeID && t->ci->super != t->cj->super ) { - scheduler_addunlock( sched , t , t->cj->super->ghost ); - scheduler_addunlock( sched , t->cj->super->ghost , t2 ); - scheduler_addunlock( sched , t2 , t->cj->super->kick2 ); - } - t->ci->force = engine_addlink( e , t->ci->force , t2 ); - atomic_inc( &t->ci->nr_force ); - t->cj->force = engine_addlink( e , t->cj->force , t2 ); - atomic_inc( &t->cj->nr_force ); - } - - /* Otherwise, sub interaction? 
*/ - else if ( t->type == task_type_sub && t->subtype == task_subtype_density ) { - t2 = scheduler_addtask( sched , task_type_sub , task_subtype_force , t->flags , 0 , t->ci , t->cj , 0 ); - if ( t->ci->nodeID == nodeID ) { - scheduler_addunlock( sched , t , t->ci->super->ghost ); - scheduler_addunlock( sched , t->ci->super->ghost , t2 ); - scheduler_addunlock( sched , t2 , t->ci->super->kick2 ); - } - if ( t->cj != NULL && t->cj->nodeID == nodeID && t->ci->super != t->cj->super ) { - scheduler_addunlock( sched , t , t->cj->super->ghost ); - scheduler_addunlock( sched , t->cj->super->ghost , t2 ); - scheduler_addunlock( sched , t2 , t->cj->super->kick2 ); - } - t->ci->force = engine_addlink( e , t->ci->force , t2 ); - atomic_inc( &t->ci->nr_force ); - if ( t->cj != NULL ) { - t->cj->force = engine_addlink( e , t->cj->force , t2 ); - atomic_inc( &t->cj->nr_force ); - } - } - - /* Kick2 tasks should rely on the grav_down tasks of their cell. */ - else if ( t->type == task_type_kick2 && t->ci->grav_down != NULL ) - scheduler_addunlock( sched , t->ci->grav_down , t ); - + } + + /* Add the gravity mm tasks. */ + for (i = 0; i < nr_cells; i++) + if (cells[i].gcount > 0) { + scheduler_addtask(sched, task_type_grav_mm, task_subtype_none, -1, 0, + &cells[i], NULL, 0); + for (j = i + 1; j < nr_cells; j++) + if (cells[j].gcount > 0) + scheduler_addtask(sched, task_type_grav_mm, task_subtype_none, -1, 0, + &cells[i], &cells[j], 0); + } + + /* Split the tasks. */ + scheduler_splittasks(sched); + + /* Allocate the list of cell-task links. The maximum number of links + is the number of cells (s->tot_cells) times the number of neighbours (27) + times the number of interaction types (2, density and force). */ + if (e->links != NULL) free(e->links); + if ((e->links = malloc(sizeof(struct link) * s->tot_cells * 27 * 2)) == NULL) + error("Failed to allocate cell-task links."); + e->nr_links = 0; + + /* Add the gravity up/down tasks at the top-level cells and push them down. 
*/ + for (k = 0; k < nr_cells; k++) + if (cells[k].nodeID == nodeID && cells[k].gcount > 0) { + + /* Create tasks at top level. */ + struct task *up = + scheduler_addtask(sched, task_type_grav_up, task_subtype_none, 0, 0, + &cells[k], NULL, 0); + struct task *down = + scheduler_addtask(sched, task_type_grav_down, task_subtype_none, 0, 0, + &cells[k], NULL, 0); + + /* Push tasks down the cell hierarchy. */ + engine_addtasks_grav(e, &cells[k], up, down); + } + + /* Count the number of tasks associated with each cell and + store the density tasks in each cell, and make each sort + depend on the sorts of its progeny. */ + for (k = 0; k < sched->nr_tasks; k++) { + + /* Get the current task. */ + t = &sched->tasks[k]; + if (t->skip) continue; + + /* Link sort tasks together. */ + if (t->type == task_type_sort && t->ci->split) + for (j = 0; j < 8; j++) + if (t->ci->progeny[j] != NULL && t->ci->progeny[j]->sorts != NULL) { + t->ci->progeny[j]->sorts->skip = 0; + scheduler_addunlock(sched, t->ci->progeny[j]->sorts, t); } - - /* Add the communication tasks if MPI is being used. */ - #ifdef WITH_MPI - - /* Loop over the proxies. */ - for ( int pid = 0 ; pid < e->nr_proxies ; pid++ ) { - - /* Get a handle on the proxy. */ - struct proxy *p = &e->proxies[pid]; - - /* Loop through the proxy's incomming cells and add the - recv tasks. */ - for ( k = 0 ; k < p->nr_cells_in ; k++ ) - engine_addtasks_recv( e , p->cells_in[k] , NULL , NULL ); - - /* Loop through the proxy's outgoing cells and add the - send tasks. */ - for ( k = 0 ; k < p->nr_cells_out ; k++ ) - engine_addtasks_send( e , p->cells_out[k] , p->cells_in[0] ); - - } - - #endif - - /* Rank the tasks. */ - scheduler_ranktasks( sched ); - - /* Weight the tasks. */ - scheduler_reweight( sched ); - - /* Set the tasks age. */ - e->tasks_age = 0; - + + /* Link density tasks to cells. 
*/ + if (t->type == task_type_self) { + atomic_inc(&t->ci->nr_tasks); + if (t->subtype == task_subtype_density) { + t->ci->density = engine_addlink(e, t->ci->density, t); + atomic_inc(&t->ci->nr_density); + } + } else if (t->type == task_type_pair) { + atomic_inc(&t->ci->nr_tasks); + atomic_inc(&t->cj->nr_tasks); + if (t->subtype == task_subtype_density) { + t->ci->density = engine_addlink(e, t->ci->density, t); + atomic_inc(&t->ci->nr_density); + t->cj->density = engine_addlink(e, t->cj->density, t); + atomic_inc(&t->cj->nr_density); + } + } else if (t->type == task_type_sub) { + atomic_inc(&t->ci->nr_tasks); + if (t->cj != NULL) atomic_inc(&t->cj->nr_tasks); + if (t->subtype == task_subtype_density) { + t->ci->density = engine_addlink(e, t->ci->density, t); + atomic_inc(&t->ci->nr_density); + if (t->cj != NULL) { + t->cj->density = engine_addlink(e, t->cj->density, t); + atomic_inc(&t->cj->nr_density); + } + } + } + + /* Link gravity multipole tasks to the up/down tasks. */ + if (t->type == task_type_grav_mm || + (t->type == task_type_sub && t->subtype == task_subtype_grav)) { + atomic_inc(&t->ci->nr_tasks); + scheduler_addunlock(sched, t->ci->grav_up, t); + scheduler_addunlock(sched, t, t->ci->grav_down); + if (t->cj != NULL && t->ci->grav_up != t->cj->grav_up) { + scheduler_addunlock(sched, t->cj->grav_up, t); + scheduler_addunlock(sched, t, t->cj->grav_down); + } + } + } + + /* Append a ghost task to each cell, and add kick2 tasks to the + super cells. */ + for (k = 0; k < nr_cells; k++) engine_mkghosts(e, &cells[k], NULL); + + /* Run through the tasks and make force tasks for each density task. + Each force task depends on the cell ghosts and unlocks the kick2 task + of its super-cell. */ + kk = sched->nr_tasks; + for (k = 0; k < kk; k++) { + + /* Get a pointer to the task. */ + t = &sched->tasks[k]; + + /* Skip? */ + if (t->skip) continue; + + /* Self-interaction? 
*/ + if (t->type == task_type_self && t->subtype == task_subtype_density) { + scheduler_addunlock(sched, t, t->ci->super->ghost); + t2 = scheduler_addtask(sched, task_type_self, task_subtype_force, 0, 0, + t->ci, NULL, 0); + scheduler_addunlock(sched, t->ci->super->ghost, t2); + scheduler_addunlock(sched, t2, t->ci->super->kick2); + t->ci->force = engine_addlink(e, t->ci->force, t2); + atomic_inc(&t->ci->nr_force); } - - + + /* Otherwise, pair interaction? */ + else if (t->type == task_type_pair && t->subtype == task_subtype_density) { + t2 = scheduler_addtask(sched, task_type_pair, task_subtype_force, 0, 0, + t->ci, t->cj, 0); + if (t->ci->nodeID == nodeID) { + scheduler_addunlock(sched, t, t->ci->super->ghost); + scheduler_addunlock(sched, t->ci->super->ghost, t2); + scheduler_addunlock(sched, t2, t->ci->super->kick2); + } + if (t->cj->nodeID == nodeID && t->ci->super != t->cj->super) { + scheduler_addunlock(sched, t, t->cj->super->ghost); + scheduler_addunlock(sched, t->cj->super->ghost, t2); + scheduler_addunlock(sched, t2, t->cj->super->kick2); + } + t->ci->force = engine_addlink(e, t->ci->force, t2); + atomic_inc(&t->ci->nr_force); + t->cj->force = engine_addlink(e, t->cj->force, t2); + atomic_inc(&t->cj->nr_force); + } + + /* Otherwise, sub interaction? 
*/ + else if (t->type == task_type_sub && t->subtype == task_subtype_density) { + t2 = scheduler_addtask(sched, task_type_sub, task_subtype_force, t->flags, + 0, t->ci, t->cj, 0); + if (t->ci->nodeID == nodeID) { + scheduler_addunlock(sched, t, t->ci->super->ghost); + scheduler_addunlock(sched, t->ci->super->ghost, t2); + scheduler_addunlock(sched, t2, t->ci->super->kick2); + } + if (t->cj != NULL && t->cj->nodeID == nodeID && + t->ci->super != t->cj->super) { + scheduler_addunlock(sched, t, t->cj->super->ghost); + scheduler_addunlock(sched, t->cj->super->ghost, t2); + scheduler_addunlock(sched, t2, t->cj->super->kick2); + } + t->ci->force = engine_addlink(e, t->ci->force, t2); + atomic_inc(&t->ci->nr_force); + if (t->cj != NULL) { + t->cj->force = engine_addlink(e, t->cj->force, t2); + atomic_inc(&t->cj->nr_force); + } + } + + /* Kick2 tasks should rely on the grav_down tasks of their cell. */ + else if (t->type == task_type_kick2 && t->ci->grav_down != NULL) + scheduler_addunlock(sched, t->ci->grav_down, t); + } + +/* Add the communication tasks if MPI is being used. */ +#ifdef WITH_MPI + + /* Loop over the proxies. */ + for (int pid = 0; pid < e->nr_proxies; pid++) { + + /* Get a handle on the proxy. */ + struct proxy *p = &e->proxies[pid]; + + /* Loop through the proxy's incomming cells and add the + recv tasks. */ + for (k = 0; k < p->nr_cells_in; k++) + engine_addtasks_recv(e, p->cells_in[k], NULL, NULL); + + /* Loop through the proxy's outgoing cells and add the + send tasks. */ + for (k = 0; k < p->nr_cells_out; k++) + engine_addtasks_send(e, p->cells_out[k], p->cells_in[0]); + } + +#endif + + /* Rank the tasks. */ + scheduler_ranktasks(sched); + + /* Weight the tasks. */ + scheduler_reweight(sched); + + /* Set the tasks age. */ + e->tasks_age = 0; +} /** * @brief Mark tasks to be skipped and set the sort flags accordingly. - * + * * @return 1 if the space has to be rebuilt, 0 otherwise. 
*/ - -int engine_marktasks ( struct engine *e ) { - - struct scheduler *s = &e->sched; - int k, nr_tasks = s->nr_tasks, *ind = s->tasks_ind; - struct task *t, *tasks = s->tasks; - float dt_step = e->dt_step; - struct cell *ci, *cj; - // ticks tic = getticks(); - - /* Muc less to do here if we're on a fixed time-step. */ - if ( !( e->policy & engine_policy_multistep ) ) { - - /* Run through the tasks and mark as skip or not. */ - for ( k = 0 ; k < nr_tasks ; k++ ) { - - /* Get a handle on the kth task. */ - t = &tasks[ ind[k] ]; - - /* Pair? */ - if ( t->type == task_type_pair || ( t->type == task_type_sub && t->cj != NULL ) ) { - - /* Local pointers. */ - ci = t->ci; - cj = t->cj; - - /* Too much particle movement? */ - if ( t->tight && - ( fmaxf( ci->h_max , cj->h_max ) + ci->dx_max + cj->dx_max > cj->dmin || - ci->dx_max > space_maxreldx*ci->h_max || cj->dx_max > space_maxreldx*cj->h_max ) ) - return 1; - } - - /* Sort? */ - else if ( t->type == task_type_sort ) { - - /* If all the sorts have been done, make this task implicit. */ - if ( !( t->flags & (t->flags ^ t->ci->sorted ) ) ) - t->implicit = 1; - - } +int engine_marktasks(struct engine *e) { - } - - } - - else { - - /* Run through the tasks and mark as skip or not. */ - for ( k = 0 ; k < nr_tasks ; k++ ) { + struct scheduler *s = &e->sched; + int k, nr_tasks = s->nr_tasks, *ind = s->tasks_ind; + struct task *t, *tasks = s->tasks; + float dt_step = e->dt_step; + struct cell *ci, *cj; + // ticks tic = getticks(); - /* Get a handle on the kth task. */ - t = &tasks[ ind[k] ]; + /* Muc less to do here if we're on a fixed time-step. */ + if (!(e->policy & engine_policy_multistep)) { - /* Sort-task? Note that due to the task ranking, the sorts - will all come before the pairs. */ - if ( t->type == task_type_sort ) { + /* Run through the tasks and mark as skip or not. */ + for (k = 0; k < nr_tasks; k++) { - /* Re-set the flags. */ - t->flags = 0; - t->skip = 1; + /* Get a handle on the kth task. 
*/ + t = &tasks[ind[k]]; - } + /* Pair? */ + if (t->type == task_type_pair || + (t->type == task_type_sub && t->cj != NULL)) { - /* Single-cell task? */ - else if ( t->type == task_type_self || - t->type == task_type_ghost || - ( t->type == task_type_sub && t->cj == NULL ) ) { + /* Local pointers. */ + ci = t->ci; + cj = t->cj; - /* Set this task's skip. */ - t->skip = ( t->ci->dt_min > dt_step ); + /* Too much particle movement? */ + if (t->tight && + (fmaxf(ci->h_max, cj->h_max) + ci->dx_max + cj->dx_max > cj->dmin || + ci->dx_max > space_maxreldx * ci->h_max || + cj->dx_max > space_maxreldx * cj->h_max)) + return 1; - } + } - /* Pair? */ - else if ( t->type == task_type_pair || ( t->type == task_type_sub && t->cj != NULL ) ) { - - /* Local pointers. */ - ci = t->ci; - cj = t->cj; - - /* Set this task's skip. */ - t->skip = ( ci->dt_min > dt_step && cj->dt_min > dt_step ); - - /* Too much particle movement? */ - if ( t->tight && - ( fmaxf( ci->h_max , cj->h_max ) + ci->dx_max + cj->dx_max > cj->dmin || - ci->dx_max > space_maxreldx*ci->h_max || cj->dx_max > space_maxreldx*cj->h_max ) ) - return 1; - - /* Set the sort flags. */ - if ( !t->skip && t->type == task_type_pair ) { - if ( !( ci->sorted & ( 1 << t->flags ) ) ) { - ci->sorts->flags |= (1 << t->flags); - ci->sorts->skip = 0; - } - if ( !( cj->sorted & ( 1 << t->flags ) ) ) { - cj->sorts->flags |= (1 << t->flags); - cj->sorts->skip = 0; - } - } - - } + /* Sort? */ + else if (t->type == task_type_sort) { - /* Kick2? */ - else if ( t->type == task_type_kick2 ) - t->skip = 0; + /* If all the sorts have been done, make this task implicit. */ + if (!(t->flags & (t->flags ^ t->ci->sorted))) t->implicit = 1; + } + } - /* None? */ - else if ( t->type == task_type_none ) - t->skip = 1; + } else { - } - + /* Run through the tasks and mark as skip or not. */ + for (k = 0; k < nr_tasks; k++) { + + /* Get a handle on the kth task. */ + t = &tasks[ind[k]]; + + /* Sort-task? 
Note that due to the task ranking, the sorts + will all come before the pairs. */ + if (t->type == task_type_sort) { + + /* Re-set the flags. */ + t->flags = 0; + t->skip = 1; + + } + + /* Single-cell task? */ + else if (t->type == task_type_self || t->type == task_type_ghost || + (t->type == task_type_sub && t->cj == NULL)) { + + /* Set this task's skip. */ + t->skip = (t->ci->dt_min > dt_step); + + } + + /* Pair? */ + else if (t->type == task_type_pair || + (t->type == task_type_sub && t->cj != NULL)) { + + /* Local pointers. */ + ci = t->ci; + cj = t->cj; + + /* Set this task's skip. */ + t->skip = (ci->dt_min > dt_step && cj->dt_min > dt_step); + + /* Too much particle movement? */ + if (t->tight && + (fmaxf(ci->h_max, cj->h_max) + ci->dx_max + cj->dx_max > cj->dmin || + ci->dx_max > space_maxreldx * ci->h_max || + cj->dx_max > space_maxreldx * cj->h_max)) + return 1; + + /* Set the sort flags. */ + if (!t->skip && t->type == task_type_pair) { + if (!(ci->sorted & (1 << t->flags))) { + ci->sorts->flags |= (1 << t->flags); + ci->sorts->skip = 0; + } + if (!(cj->sorted & (1 << t->flags))) { + cj->sorts->flags |= (1 << t->flags); + cj->sorts->skip = 0; + } } - - // message( "took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 ); - - /* All is well... */ - return 0; - + + } + + /* Kick2? */ + else if (t->type == task_type_kick2) + t->skip = 0; + + /* None? */ + else if (t->type == task_type_none) + t->skip = 1; } - + } + + // message( "took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 ); + + /* All is well... */ + return 0; +} /** * @brief Rebuild the space and tasks. * * @param e The #engine. */ - -void engine_rebuild ( struct engine *e ) { - int k; - struct scheduler *sched = &e->sched; - - /* Clear the forcerebuild flag, whatever it was. */ - e->forcerebuild = 0; +void engine_rebuild(struct engine *e) { - /* Re-build the space. */ - // tic = getticks(); - space_rebuild( e->s , 0.0 ); - // message( "space_rebuild took %.3f ms." 
, (double)(getticks() - tic)/CPU_TPS*1000 ); + int k; + struct scheduler *sched = &e->sched; - /* If in parallel, exchange the cell structure. */ - #ifdef WITH_MPI - // tic = getticks(); - engine_exchange_cells( e ); - // message( "engine_exchange_cells took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 ); - #endif + /* Clear the forcerebuild flag, whatever it was. */ + e->forcerebuild = 0; - /* Re-build the tasks. */ - // tic = getticks(); - engine_maketasks( e ); - // message( "engine_maketasks took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 ); + /* Re-build the space. */ + // tic = getticks(); + space_rebuild(e->s, 0.0); +// message( "space_rebuild took %.3f ms." , (double)(getticks() - +// tic)/CPU_TPS*1000 ); - /* Run through the tasks and mark as skip or not. */ - // tic = getticks(); - if ( engine_marktasks( e ) ) - error( "engine_marktasks failed after space_rebuild." ); - // message( "engine_marktasks took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 ); - - /* Count and print the number of each task type. */ - int counts[ task_type_count+1 ]; - for ( k = 0 ; k <= task_type_count ; k++ ) - counts[k] = 0; - for ( k = 0 ; k < sched->nr_tasks ; k++ ) - if ( !sched->tasks[k].skip ) - counts[ (int)sched->tasks[k].type ] += 1; - else - counts[ task_type_count ] += 1; - #ifdef WITH_MPI - printf( "[%03i] engine_rebuild: task counts are [ %s=%i" , e->nodeID , taskID_names[0] , counts[0] ); - #else - printf( "engine_rebuild: task counts are [ %s=%i" , taskID_names[0] , counts[0] ); - #endif - for ( k = 1 ; k < task_type_count ; k++ ) - printf( " %s=%i" , taskID_names[k] , counts[k] ); - printf( " skipped=%i ]\n" , counts[ task_type_count ] ); fflush(stdout); - message( "nr_parts = %i." , e->s->nr_parts ); - - } +/* If in parallel, exchange the cell structure. */ +#ifdef WITH_MPI + // tic = getticks(); + engine_exchange_cells(e); +// message( "engine_exchange_cells took %.3f ms." 
, (double)(getticks() - +// tic)/CPU_TPS*1000 ); +#endif + /* Re-build the tasks. */ + // tic = getticks(); + engine_maketasks(e); + // message( "engine_maketasks took %.3f ms." , (double)(getticks() - + // tic)/CPU_TPS*1000 ); + + /* Run through the tasks and mark as skip or not. */ + // tic = getticks(); + if (engine_marktasks(e)) + error("engine_marktasks failed after space_rebuild."); + // message( "engine_marktasks took %.3f ms." , (double)(getticks() - + // tic)/CPU_TPS*1000 ); + + /* Count and print the number of each task type. */ + int counts[task_type_count + 1]; + for (k = 0; k <= task_type_count; k++) counts[k] = 0; + for (k = 0; k < sched->nr_tasks; k++) + if (!sched->tasks[k].skip) + counts[(int)sched->tasks[k].type] += 1; + else + counts[task_type_count] += 1; +#ifdef WITH_MPI + printf("[%03i] engine_rebuild: task counts are [ %s=%i", e->nodeID, + taskID_names[0], counts[0]); +#else + printf("engine_rebuild: task counts are [ %s=%i", taskID_names[0], counts[0]); +#endif + for (k = 1; k < task_type_count; k++) + printf(" %s=%i", taskID_names[k], counts[k]); + printf(" skipped=%i ]\n", counts[task_type_count]); + fflush(stdout); + message("nr_parts = %i.", e->s->nr_parts); +} /** * @brief Prepare the #engine by re-building the cells and tasks. * * @param e The #engine to prepare. */ - -void engine_prepare ( struct engine *e ) { - - int rebuild; - - TIMER_TIC - /* Run through the tasks and mark as skip or not. */ +void engine_prepare(struct engine *e) { + + int rebuild; + + TIMER_TIC + + /* Run through the tasks and mark as skip or not. */ + // tic = getticks(); + rebuild = (e->forcerebuild || engine_marktasks(e)); +// message( "space_marktasks took %.3f ms." , (double)(getticks() - +// tic)/CPU_TPS*1000 ); + +/* Collect the values of rebuild from all nodes. 
*/ +#ifdef WITH_MPI + // tic = getticks(); + int buff; + if (MPI_Allreduce(&rebuild, &buff, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD) != + MPI_SUCCESS) + error("Failed to aggreggate the rebuild flag accross nodes."); + rebuild = buff; +// message( "rebuild allreduce took %.3f ms." , (double)(getticks() - +// tic)/CPU_TPS*1000 ); +#endif + e->tic_step = getticks(); + + /* Did this not go through? */ + if (rebuild) { // tic = getticks(); - rebuild = ( e->forcerebuild || engine_marktasks( e ) ); - // message( "space_marktasks took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 ); - - /* Collect the values of rebuild from all nodes. */ - #ifdef WITH_MPI - // tic = getticks(); - int buff; - if ( MPI_Allreduce( &rebuild , &buff , 1 , MPI_INT , MPI_MAX , MPI_COMM_WORLD ) != MPI_SUCCESS ) - error( "Failed to aggreggate the rebuild flag accross nodes." ); - rebuild = buff; - // message( "rebuild allreduce took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 ); - #endif - e->tic_step = getticks(); - - /* Did this not go through? */ - if ( rebuild ) { - // tic = getticks(); - engine_rebuild( e ); - // message( "engine_rebuild took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 ); - } - - /* Re-rank the tasks every now and then. */ - if ( e->tasks_age % engine_tasksreweight == 1 ) { - // tic = getticks(); - scheduler_reweight( &e->sched ); - // message( "scheduler_reweight took %.3f ms." , (double)(getticks() - tic)/CPU_TPS*1000 ); - } - e->tasks_age += 1; + engine_rebuild(e); + // message( "engine_rebuild took %.3f ms." , (double)(getticks() - + // tic)/CPU_TPS*1000 ); + } - TIMER_TOC( timer_prepare ); - - } + /* Re-rank the tasks every now and then. */ + if (e->tasks_age % engine_tasksreweight == 1) { + // tic = getticks(); + scheduler_reweight(&e->sched); + // message( "scheduler_reweight took %.3f ms." , (double)(getticks() - + // tic)/CPU_TPS*1000 ); + } + e->tasks_age += 1; + TIMER_TOC(timer_prepare); +} /** * @brief Implements a barrier for the #runner threads. 
@@ -1454,104 +1544,107 @@ void engine_prepare ( struct engine *e ) { * @param e The #engine. * @param tid The thread ID */ - -void engine_barrier ( struct engine *e , int tid ) { - - /* First, get the barrier mutex. */ - if ( pthread_mutex_lock( &e->barrier_mutex ) != 0 ) - error( "Failed to get barrier mutex." ); - - /* This thread is no longer running. */ - e->barrier_running -= 1; - - /* If all threads are in, send a signal... */ - if ( e->barrier_running == 0 ) - if ( pthread_cond_broadcast( &e->barrier_cond ) != 0 ) - error( "Failed to broadcast barrier full condition." ); - - /* Wait for the barrier to open. */ - while ( e->barrier_launch == 0 || tid >= e->barrier_launchcount ) - if ( pthread_cond_wait( &e->barrier_cond , &e->barrier_mutex ) != 0 ) - error( "Eror waiting for barrier to close." ); - - /* This thread has been launched. */ - e->barrier_running += 1; - e->barrier_launch -= 1; - - /* If I'm the last one out, signal the condition again. */ - if ( e->barrier_launch == 0 ) - if ( pthread_cond_broadcast( &e->barrier_cond ) != 0 ) - error( "Failed to broadcast empty barrier condition." ); - - /* Last but not least, release the mutex. */ - if ( pthread_mutex_unlock( &e->barrier_mutex ) != 0 ) - error( "Failed to get unlock the barrier mutex." ); - } - - +void engine_barrier(struct engine *e, int tid) { + + /* First, get the barrier mutex. */ + if (pthread_mutex_lock(&e->barrier_mutex) != 0) + error("Failed to get barrier mutex."); + + /* This thread is no longer running. */ + e->barrier_running -= 1; + + /* If all threads are in, send a signal... */ + if (e->barrier_running == 0) + if (pthread_cond_broadcast(&e->barrier_cond) != 0) + error("Failed to broadcast barrier full condition."); + + /* Wait for the barrier to open. */ + while (e->barrier_launch == 0 || tid >= e->barrier_launchcount) + if (pthread_cond_wait(&e->barrier_cond, &e->barrier_mutex) != 0) + error("Eror waiting for barrier to close."); + + /* This thread has been launched. 
*/ + e->barrier_running += 1; + e->barrier_launch -= 1; + + /* If I'm the last one out, signal the condition again. */ + if (e->barrier_launch == 0) + if (pthread_cond_broadcast(&e->barrier_cond) != 0) + error("Failed to broadcast empty barrier condition."); + + /* Last but not least, release the mutex. */ + if (pthread_mutex_unlock(&e->barrier_mutex) != 0) + error("Failed to get unlock the barrier mutex."); +} + /** * @brief Mapping function to collect the data from the second kick. */ -void engine_collect_kick2 ( struct cell *c ) { - - int k, updated = 0; - float dt_min = FLT_MAX, dt_max = 0.0f; - double ekin = 0.0, epot = 0.0; - float mom[3] = { 0.0f , 0.0f , 0.0f }, ang[3] = { 0.0f , 0.0f , 0.0f }; - struct cell *cp; - - /* If I am a super-cell, return immediately. */ - if ( c->kick2 != NULL || c->count == 0 ) - return; - - /* If this cell is not split, I'm in trouble. */ - if ( !c->split ) - error( "Cell has no super-cell." ); - - /* Collect the values from the progeny. */ - for ( k = 0 ; k < 8 ; k++ ) - if ( ( cp = c->progeny[k] ) != NULL ) { - engine_collect_kick2( cp ); - dt_min = fminf( dt_min , cp->dt_min ); - dt_max = fmaxf( dt_max , cp->dt_max ); - updated += cp->updated; - ekin += cp->ekin; - epot += cp->epot; - mom[0] += cp->mom[0]; mom[1] += cp->mom[1]; mom[2] += cp->mom[2]; - ang[0] += cp->ang[0]; ang[1] += cp->ang[1]; ang[2] += cp->ang[2]; - } - - /* Store the collected values in the cell. */ - c->dt_min = dt_min; - c->dt_max = dt_max; - c->updated = updated; - c->ekin = ekin; - c->epot = epot; - c->mom[0] = mom[0]; c->mom[1] = mom[1]; c->mom[2] = mom[2]; - c->ang[0] = ang[0]; c->ang[1] = ang[1]; c->ang[2] = ang[2]; - +void engine_collect_kick2(struct cell *c) { + + int k, updated = 0; + float dt_min = FLT_MAX, dt_max = 0.0f; + double ekin = 0.0, epot = 0.0; + float mom[3] = {0.0f, 0.0f, 0.0f}, ang[3] = {0.0f, 0.0f, 0.0f}; + struct cell *cp; + + /* If I am a super-cell, return immediately. 
*/ + if (c->kick2 != NULL || c->count == 0) return; + + /* If this cell is not split, I'm in trouble. */ + if (!c->split) error("Cell has no super-cell."); + + /* Collect the values from the progeny. */ + for (k = 0; k < 8; k++) + if ((cp = c->progeny[k]) != NULL) { + engine_collect_kick2(cp); + dt_min = fminf(dt_min, cp->dt_min); + dt_max = fmaxf(dt_max, cp->dt_max); + updated += cp->updated; + ekin += cp->ekin; + epot += cp->epot; + mom[0] += cp->mom[0]; + mom[1] += cp->mom[1]; + mom[2] += cp->mom[2]; + ang[0] += cp->ang[0]; + ang[1] += cp->ang[1]; + ang[2] += cp->ang[2]; } + /* Store the collected values in the cell. */ + c->dt_min = dt_min; + c->dt_max = dt_max; + c->updated = updated; + c->ekin = ekin; + c->epot = epot; + c->mom[0] = mom[0]; + c->mom[1] = mom[1]; + c->mom[2] = mom[2]; + c->ang[0] = ang[0]; + c->ang[1] = ang[1]; + c->ang[2] = ang[2]; +} /** * @brief Compute the force on a single particle brute-force. */ -// void engine_single_density ( double *dim , long long int pid , struct part *__restrict__ parts , int N , int periodic ) { -// +// void engine_single_density ( double *dim , long long int pid , struct part +// *__restrict__ parts , int N , int periodic ) { +// // int i, k; // double r2, dx[3]; // float fdx[3], ih; // struct part p; -// +// // /* Find "our" part. */ // for ( k = 0 ; k < N && parts[k].id != pid ; k++ ); // if ( k == N ) // error( "Part not found." ); // p = parts[k]; -// +// // /* Clear accumulators. */ // ih = 1.0f / p.h; // p.rho = 0.0f; p.rho_dh = 0.0f; @@ -1559,7 +1652,7 @@ void engine_collect_kick2 ( struct cell *c ) { // p.density.div_v = 0.0; // for ( k=0 ; k < 3 ; k++) // p.density.curl_v[k] = 0.0; -// +// // /* Loop over all particle pairs (force). 
*/ // for ( k = 0 ; k < N ; k++ ) { // if ( parts[k].id == p.id ) @@ -1576,37 +1669,40 @@ void engine_collect_kick2 ( struct cell *c ) { // } // r2 = fdx[0]*fdx[0] + fdx[1]*fdx[1] + fdx[2]*fdx[2]; // if ( r2 < p.h*p.h*kernel_gamma2 ) { -// runner_iact_nonsym_density( r2 , fdx , p.h , parts[k].h , &p , &parts[k] ); +// runner_iact_nonsym_density( r2 , fdx , p.h , parts[k].h , &p , +// &parts[k] ); // } // } -// +// // /* Dump the result. */ // p.rho = ih * ih * ih * ( p.rho + p.mass*kernel_root ); // p.rho_dh = p.rho_dh * ih * ih * ih * ih; -// p.density.wcount = ( p.density.wcount + kernel_root ) * ( 4.0f / 3.0 * M_PI * kernel_gamma3 ); -// message( "part %lli (h=%e) has wcount=%e, rho=%e, rho_dh=%e." , p.id , p.h , p.density.wcount , p.rho , p.rho_dh ); +// p.density.wcount = ( p.density.wcount + kernel_root ) * ( 4.0f / 3.0 * +// M_PI * kernel_gamma3 ); +// message( "part %lli (h=%e) has wcount=%e, rho=%e, rho_dh=%e." , p.id , +// p.h , p.density.wcount , p.rho , p.rho_dh ); // fflush(stdout); -// +// // } - -// void engine_single_force ( double *dim , long long int pid , struct part *__restrict__ parts , int N , int periodic ) { -// +// void engine_single_force ( double *dim , long long int pid , struct part +// *__restrict__ parts , int N , int periodic ) { +// // int i, k; // double r2, dx[3]; // float fdx[3]; // struct part p; -// +// // /* Find "our" part. */ // for ( k = 0 ; k < N && parts[k].id != pid ; k++ ); // if ( k == N ) // error( "Part not found." ); // p = parts[k]; -// +// // /* Clear accumulators. */ // p.a[0] = 0.0f; p.a[1] = 0.0f; p.a[2] = 0.0f; // p.force.u_dt = 0.0f; p.force.h_dt = 0.0f; p.force.v_sig = 0.0f; -// +// // /* Loop over all particle pairs (force). 
*/ // for ( k = 0 ; k < N ; k++ ) { // // for ( k = N-1 ; k >= 0 ; k-- ) { @@ -1623,23 +1719,28 @@ void engine_collect_kick2 ( struct cell *c ) { // fdx[i] = dx[i]; // } // r2 = fdx[0]*fdx[0] + fdx[1]*fdx[1] + fdx[2]*fdx[2]; -// if ( r2 < p.h*p.h*kernel_gamma2 || r2 < parts[k].h*parts[k].h*kernel_gamma2 ) { +// if ( r2 < p.h*p.h*kernel_gamma2 || r2 < +// parts[k].h*parts[k].h*kernel_gamma2 ) { // p.a[0] = 0.0f; p.a[1] = 0.0f; p.a[2] = 0.0f; // p.force.u_dt = 0.0f; p.force.h_dt = 0.0f; p.force.v_sig = 0.0f; -// runner_iact_nonsym_force( r2 , fdx , p.h , parts[k].h , &p , &parts[k] ); -// double dvdr = ( (p.v[0]-parts[k].v[0])*fdx[0] + (p.v[1]-parts[k].v[1])*fdx[1] + (p.v[2]-parts[k].v[2])*fdx[2] ) / sqrt(r2); -// message( "part %lli and %lli interact (r=%.3e,dvdr=%.3e) with a=[%.3e,%.3e,%.3e], dudt=%.3e." , -// p.id , parts[k].id , sqrt(r2) , dvdr , p.a[0] , p.a[1], p.a[2] , p.force.u_dt ); +// runner_iact_nonsym_force( r2 , fdx , p.h , parts[k].h , &p , +// &parts[k] ); +// double dvdr = ( (p.v[0]-parts[k].v[0])*fdx[0] + +// (p.v[1]-parts[k].v[1])*fdx[1] + (p.v[2]-parts[k].v[2])*fdx[2] ) / sqrt(r2); +// message( "part %lli and %lli interact (r=%.3e,dvdr=%.3e) with +// a=[%.3e,%.3e,%.3e], dudt=%.3e." , +// p.id , parts[k].id , sqrt(r2) , dvdr , p.a[0] , p.a[1], +// p.a[2] , p.force.u_dt ); // } // } -// +// // /* Dump the result. */ -// // message( "part %lli (h=%e) has a=[%.3e,%.3e,%.3e], udt=%e." , p.id , p.h , p.a[0] , p.a[1] , p.a[2] , p.force.u_dt ); +// // message( "part %lli (h=%e) has a=[%.3e,%.3e,%.3e], udt=%e." , p.id , +// p.h , p.a[0] , p.a[1] , p.a[2] , p.force.u_dt ); // fflush(stdout); -// +// // } - - + /** * @brief Launch the runners. * @@ -1647,384 +1748,409 @@ void engine_collect_kick2 ( struct cell *c ) { * @param nr_runners The number of #runner to let loose. * @param mask The task mask to launch. */ - -void engine_launch ( struct engine *e , int nr_runners , unsigned int mask ) { - - /* Prepare the scheduler. 
*/ - atomic_inc( &e->sched.waiting ); - - /* Cry havoc and let loose the dogs of war. */ - e->barrier_launch = nr_runners; - e->barrier_launchcount = nr_runners; - if ( pthread_cond_broadcast( &e->barrier_cond ) != 0 ) - error( "Failed to broadcast barrier open condition." ); - - /* Load the tasks. */ - pthread_mutex_unlock( &e->barrier_mutex ); - scheduler_start( &e->sched , mask ); - pthread_mutex_lock( &e->barrier_mutex ); - - /* Remove the safeguard. */ - pthread_mutex_lock( &e->sched.sleep_mutex ); - atomic_dec( &e->sched.waiting ); - pthread_cond_broadcast( &e->sched.sleep_cond ); - pthread_mutex_unlock( &e->sched.sleep_mutex ); - - /* Sit back and wait for the runners to come home. */ - while ( e->barrier_launch || e->barrier_running ) - if ( pthread_cond_wait( &e->barrier_cond , &e->barrier_mutex ) != 0 ) - error( "Error while waiting for barrier." ); - - } - - -void hassorted ( struct cell *c ) { - - if ( c->sorted ) - error( "Suprious sorted flags." ); - - if ( c->split ) - for ( int k = 0 ; k < 8 ; k++ ) - if ( c->progeny[k] != NULL ) - hassorted( c->progeny[k] ); - - } +void engine_launch(struct engine *e, int nr_runners, unsigned int mask) { + + /* Prepare the scheduler. */ + atomic_inc(&e->sched.waiting); + + /* Cry havoc and let loose the dogs of war. */ + e->barrier_launch = nr_runners; + e->barrier_launchcount = nr_runners; + if (pthread_cond_broadcast(&e->barrier_cond) != 0) + error("Failed to broadcast barrier open condition."); + + /* Load the tasks. */ + pthread_mutex_unlock(&e->barrier_mutex); + scheduler_start(&e->sched, mask); + pthread_mutex_lock(&e->barrier_mutex); + + /* Remove the safeguard. */ + pthread_mutex_lock(&e->sched.sleep_mutex); + atomic_dec(&e->sched.waiting); + pthread_cond_broadcast(&e->sched.sleep_cond); + pthread_mutex_unlock(&e->sched.sleep_mutex); + + /* Sit back and wait for the runners to come home. 
*/ + while (e->barrier_launch || e->barrier_running) + if (pthread_cond_wait(&e->barrier_cond, &e->barrier_mutex) != 0) + error("Error while waiting for barrier."); +} + +void hassorted(struct cell *c) { + + if (c->sorted) error("Suprious sorted flags."); + + if (c->split) + for (int k = 0; k < 8; k++) + if (c->progeny[k] != NULL) hassorted(c->progeny[k]); +} /** * @brief Let the #engine loose to compute the forces. * * @param e The #engine. */ - -void engine_step ( struct engine *e ) { - - int k; - float dt = e->dt, dt_step, dt_max = 0.0f, dt_min = FLT_MAX; - double epot = 0.0, ekin = 0.0; - float mom[3] = { 0.0 , 0.0 , 0.0 }; - float ang[3] = { 0.0 , 0.0 , 0.0 }; - int count = 0; - struct cell *c; - struct space *s = e->s; - - TIMER_TIC2 - - /* Get the maximum dt. */ - if ( e->policy & engine_policy_multistep ) { - dt_step = 2.0f*dt; - for ( k = 0 ; k < 32 && (e->step & (1 << k)) == 0 ; k++ ) - dt_step *= 2; - } - else - dt_step = FLT_MAX; - - /* Set the maximum dt. */ - e->dt_step = dt_step; - e->s->dt_step = dt_step; - // message( "dt_step set to %.3e (dt=%.3e)." , dt_step , e->dt ); fflush(stdout); - - // printParticle( parts , 432626 ); - - /* First kick. */ - if ( e->step == 0 || !( e->policy & engine_policy_fixdt ) ) { - TIMER_TIC - engine_launch( e , ( e->nr_threads > 8 ) ? 8 : e->nr_threads , (1 << task_type_kick1) | (1 << task_type_link) ); - TIMER_TOC( timer_kick1 ); - } - - /* Check if all the kick1 threads have executed. */ - /* for ( k = 0 ; k < e->sched.nr_tasks ; k++ ) - if ( e->sched.tasks[k].type == task_type_kick1 && - e->sched.tasks[k].toc == 0 ) - error( "Not all kick1 tasks completed." ); */ - - // for(k=0; k<10; ++k) - // printParticle(parts, k); - // printParticle( e->s->parts , 3392063069037 , e->s->nr_parts ); - - /* Re-distribute the particles amongst the nodes? */ - if ( e->forcerepart ) - engine_repartition( e ); - - /* Prepare the space. 
*/ - engine_prepare( e ); - - // engine_single_density( e->s->dim , 3392063069037 , e->s->parts , e->s->nr_parts , e->s->periodic ); - - /* Send off the runners. */ + +void engine_step(struct engine *e) { + + int k; + float dt = e->dt, dt_step, dt_max = 0.0f, dt_min = FLT_MAX; + double epot = 0.0, ekin = 0.0; + float mom[3] = {0.0, 0.0, 0.0}; + float ang[3] = {0.0, 0.0, 0.0}; + int count = 0; + struct cell *c; + struct space *s = e->s; + + TIMER_TIC2 + + if (e->policy & engine_policy_paranoid) { + message("Checking system sanity..."); + engine_check(e); + } + + /* Get the maximum dt. */ + if (e->policy & engine_policy_multistep) { + dt_step = 2.0f * dt; + for (k = 0; k < 32 && (e->step & (1 << k)) == 0; k++) dt_step *= 2; + } else + dt_step = FLT_MAX; + + /* Set the maximum dt. */ + e->dt_step = dt_step; + e->s->dt_step = dt_step; + // message( "dt_step set to %.3e (dt=%.3e)." , dt_step , e->dt ); + // fflush(stdout); + + // printParticle( parts , 432626 ); + + /* First kick. */ + if (e->step == 0 || !(e->policy & engine_policy_fixdt)) { TIMER_TIC - engine_launch( e , e->nr_threads , (1 << task_type_sort) | - (1 << task_type_self) | - (1 << task_type_pair) | - (1 << task_type_sub) | - (1 << task_type_ghost) | - (1 << task_type_kick2) | - (1 << task_type_send) | - (1 << task_type_recv) | - (1 << task_type_grav_pp) | - (1 << task_type_grav_mm) | - (1 << task_type_grav_up) | - (1 << task_type_grav_down) | - (1 << task_type_link) ); - TIMER_TOC(timer_runners); - - // engine_single_force( e->s->dim , 8328423931905 , e->s->parts , e->s->nr_parts , e->s->periodic ); - - // for(k=0; k<10; ++k) - // printParticle(parts, k); - // printParticle( parts , 432626 ); - // printParticle( e->s->parts , 3392063069037 , e->s->nr_parts ); - // printParticle( e->s->parts , 8328423931905 , e->s->nr_parts ); - - /* Collect the cell data from the second kick. 
*/ - for ( k = 0 ; k < s->nr_cells ; k++ ) - if ( s->cells[k].nodeID == e->nodeID ) { - c = &s->cells[k]; - engine_collect_kick2( c ); - dt_min = fminf( dt_min , c->dt_min ); - dt_max = fmaxf( dt_max , c->dt_max ); - ekin += c->ekin; - epot += c->epot; - count += c->updated; - mom[0] += c->mom[0]; mom[1] += c->mom[1]; mom[2] += c->mom[2]; - ang[0] += c->ang[0]; ang[1] += c->ang[1]; ang[2] += c->ang[2]; - } - - /* Aggregate the data from the different nodes. */ - #ifdef WITH_MPI - double in[3], out[3]; - out[0] = dt_min; - if ( MPI_Allreduce( out , in , 1 , MPI_DOUBLE , MPI_MIN , MPI_COMM_WORLD ) != MPI_SUCCESS ) - error( "Failed to aggregate dt_min." ); - dt_min = in[0]; - out[0] = dt_max; - if ( MPI_Allreduce( out , in , 1 , MPI_DOUBLE , MPI_MAX , MPI_COMM_WORLD ) != MPI_SUCCESS ) - error( "Failed to aggregate dt_max." ); - dt_max = in[0]; - out[0] = count; out[1] = ekin; out[2] = epot; - if ( MPI_Allreduce( out , in , 3 , MPI_DOUBLE , MPI_SUM , MPI_COMM_WORLD ) != MPI_SUCCESS ) - error( "Failed to aggregate energies." ); - count = in[0]; ekin = in[1]; epot = in[2]; - /* int nr_parts; - if ( MPI_Allreduce( &s->nr_parts , &nr_parts , 1 , MPI_INT , MPI_SUM , MPI_COMM_WORLD ) != MPI_SUCCESS ) - error( "Failed to aggregate particle count." ); - if ( e->nodeID == 0 ) - message( "nr_parts=%i." , nr_parts ); */ - #endif - - e->dt_min = dt_min; - e->dt_max = dt_max; - e->count_step = count; - e->ekin = ekin; - e->epot = epot; - // printParticle( e->s->parts , 382557 , e->s->nr_parts ); - // message( "dt_min/dt_max is %e/%e." , dt_min , dt_max ); fflush(stdout); - // message( "etot is %e (ekin=%e, epot=%e)." , ekin+epot , ekin , epot ); fflush(stdout); - // message( "total momentum is [ %e , %e , %e ]." , mom[0] , mom[1] , mom[2] ); fflush(stdout); - // message( "total angular momentum is [ %e , %e , %e ]." , ang[0] , ang[1] , ang[2] ); fflush(stdout); - // message( "updated %i parts (dt_step=%.3e)." , count , dt_step ); fflush(stdout); - - /* Increase the step. 
*/ - e->step += 1; - - /* Does the time step need adjusting? */ - if ( e->policy & engine_policy_fixdt ) { + engine_launch(e, (e->nr_threads > 8) ? 8 : e->nr_threads, + (1 << task_type_kick1) | (1 << task_type_link)); + TIMER_TOC(timer_kick1); + } + + /* Check if all the kick1 threads have executed. */ + /* for ( k = 0 ; k < e->sched.nr_tasks ; k++ ) + if ( e->sched.tasks[k].type == task_type_kick1 && + e->sched.tasks[k].toc == 0 ) + error( "Not all kick1 tasks completed." ); */ + + // for(k=0; k<10; ++k) + // printParticle(parts, k); + // printParticle( e->s->parts , 3392063069037 , e->s->nr_parts ); + + if (e->policy & engine_policy_paranoid) { + message("Checking system sanity..."); + engine_check(e); + } + + /* Re-distribute the particles amongst the nodes? */ + if (e->forcerepart) engine_repartition(e); + + if (e->policy & engine_policy_paranoid) { + message("Checking system sanity..."); + engine_check(e); + } + + /* Prepare the space. */ + engine_prepare(e); + + // engine_single_density( e->s->dim , 3392063069037 , e->s->parts , + // e->s->nr_parts , e->s->periodic ); + + /* Send off the runners. 
*/ + TIMER_TIC + engine_launch(e, e->nr_threads, + (1 << task_type_sort) | (1 << task_type_self) | + (1 << task_type_pair) | (1 << task_type_sub) | + (1 << task_type_ghost) | (1 << task_type_kick2) | + (1 << task_type_send) | (1 << task_type_recv) | + (1 << task_type_grav_pp) | (1 << task_type_grav_mm) | + (1 << task_type_grav_up) | (1 << task_type_grav_down) | + (1 << task_type_link)); + + if (e->policy & engine_policy_paranoid) { + message("Checking system sanity..."); + engine_check(e); + } + + TIMER_TOC(timer_runners); + + // engine_single_force( e->s->dim , 8328423931905 , e->s->parts , + // e->s->nr_parts , e->s->periodic ); + + // for(k=0; k<10; ++k) + // printParticle(parts, k); + // printParticle( parts , 432626 ); + // printParticle( e->s->parts , 3392063069037 , e->s->nr_parts ); + // printParticle( e->s->parts , 8328423931905 , e->s->nr_parts ); + + /* Collect the cell data from the second kick. */ + for (k = 0; k < s->nr_cells; k++) + if (s->cells[k].nodeID == e->nodeID) { + c = &s->cells[k]; + engine_collect_kick2(c); + dt_min = fminf(dt_min, c->dt_min); + dt_max = fmaxf(dt_max, c->dt_max); + ekin += c->ekin; + epot += c->epot; + count += c->updated; + mom[0] += c->mom[0]; + mom[1] += c->mom[1]; + mom[2] += c->mom[2]; + ang[0] += c->ang[0]; + ang[1] += c->ang[1]; + ang[2] += c->ang[2]; + } + +/* Aggregate the data from the different nodes. 
*/ +#ifdef WITH_MPI + double in[3], out[3]; + out[0] = dt_min; + if (MPI_Allreduce(out, in, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD) != + MPI_SUCCESS) + error("Failed to aggregate dt_min."); + dt_min = in[0]; + out[0] = dt_max; + if (MPI_Allreduce(out, in, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD) != + MPI_SUCCESS) + error("Failed to aggregate dt_max."); + dt_max = in[0]; + out[0] = count; + out[1] = ekin; + out[2] = epot; + if (MPI_Allreduce(out, in, 3, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD) != + MPI_SUCCESS) + error("Failed to aggregate energies."); + count = in[0]; + ekin = in[1]; + epot = in[2]; +/* int nr_parts; +if ( MPI_Allreduce( &s->nr_parts , &nr_parts , 1 , MPI_INT , MPI_SUM , +MPI_COMM_WORLD ) != MPI_SUCCESS ) + error( "Failed to aggregate particle count." ); +if ( e->nodeID == 0 ) + message( "nr_parts=%i." , nr_parts ); */ +#endif + + e->dt_min = dt_min; + e->dt_max = dt_max; + e->count_step = count; + e->ekin = ekin; + e->epot = epot; + // printParticle( e->s->parts , 382557 , e->s->nr_parts ); + // message( "dt_min/dt_max is %e/%e." , dt_min , dt_max ); fflush(stdout); + // message( "etot is %e (ekin=%e, epot=%e)." , ekin+epot , ekin , epot ); + // fflush(stdout); + // message( "total momentum is [ %e , %e , %e ]." , mom[0] , mom[1] , mom[2] + // ); fflush(stdout); + // message( "total angular momentum is [ %e , %e , %e ]." , ang[0] , ang[1] , + // ang[2] ); fflush(stdout); + // message( "updated %i parts (dt_step=%.3e)." , count , dt_step ); + // fflush(stdout); + + /* Increase the step. */ + e->step += 1; + + /* Does the time step need adjusting? 
*/ + if (e->policy & engine_policy_fixdt) { + dt = e->dt_orig; + } else { + if (dt == 0) { + e->nullstep += 1; + if (e->dt_orig > 0.0) { dt = e->dt_orig; - } - else { - if ( dt == 0 ) { - e->nullstep += 1; - if ( e->dt_orig > 0.0 ) { - dt = e->dt_orig; - while ( dt_min < dt ) - dt *= 0.5; - while ( dt_min > 2*dt ) - dt *= 2.0; - } - else - dt = dt_min; - for ( k = 0 ; k < s->nr_parts ; k++ ) { - /* struct part *p = &s->parts[k]; - struct xpart *xp = &s->xparts[k]; - float dt_curr = dt; - for ( int j = (int)( p->dt / dt ) ; j > 1 ; j >>= 1 ) - dt_curr *= 2.0f; - xp->dt_curr = dt_curr; */ - s->parts[k].dt = dt; - s->xparts[k].dt_curr = dt; - } - // message( "dt_min=%.3e, adjusting time step to dt=%e." , dt_min , e->dt ); - } - else { - while ( dt_min < dt ) { - dt *= 0.5; - e->step *= 2; - e->nullstep *= 2; - // message( "dt_min dropped below time step, adjusting to dt=%e." , e->dt ); - } - while ( dt_min > 2*dt && (e->step & 1) == 0 ) { - dt *= 2.0; - e->step /= 2; - e->nullstep /= 2; - // message( "dt_min is larger than twice the time step, adjusting to dt=%e." , e->dt ); - } - } - } - e->dt = dt; - - /* Set the system time. */ - e->time = dt * (e->step - e->nullstep); - - TIMER_TOC2(timer_step); - + while (dt_min < dt) dt *= 0.5; + while (dt_min > 2 * dt) dt *= 2.0; + } else + dt = dt_min; + for (k = 0; k < s->nr_parts; k++) { + /* struct part *p = &s->parts[k]; + struct xpart *xp = &s->xparts[k]; + float dt_curr = dt; + for ( int j = (int)( p->dt / dt ) ; j > 1 ; j >>= 1 ) + dt_curr *= 2.0f; + xp->dt_curr = dt_curr; */ + s->parts[k].dt = dt; + s->xparts[k].dt_curr = dt; + } + // message( "dt_min=%.3e, adjusting time step to dt=%e." , dt_min , e->dt + // ); + } else { + while (dt_min < dt) { + dt *= 0.5; + e->step *= 2; + e->nullstep *= 2; + // message( "dt_min dropped below time step, adjusting to dt=%e." 
, + // e->dt ); + } + while (dt_min > 2 * dt && (e->step & 1) == 0) { + dt *= 2.0; + e->step /= 2; + e->nullstep /= 2; + // message( "dt_min is larger than twice the time step, adjusting to + // dt=%e." , e->dt ); + } } - - + } + e->dt = dt; + + /* Set the system time. */ + e->time = dt * (e->step - e->nullstep); + + TIMER_TOC2(timer_step); +} + /** * @brief Create and fill the proxies. * * @param e The #engine. */ - -void engine_makeproxies ( struct engine *e ) { - - int i, j, k, ii, jj, kk; - int cid, cjd, pid, ind[3], *cdim = e->s->cdim; - struct space *s = e->s; - struct cell *cells = s->cells; - struct proxy *proxies = e->proxies; - - /* Prepare the proxies and the proxy index. */ - if ( e->proxy_ind == NULL ) - if ( ( e->proxy_ind = (int *)malloc( sizeof(int) * e->nr_nodes ) ) == NULL ) - error( "Failed to allocate proxy index." ); - for ( k = 0 ; k < e->nr_nodes ; k++ ) - e->proxy_ind[k] = -1; - e->nr_proxies = 0; - - /* The following loop is super-clunky, but it's necessary - to ensure that the order of the send and recv cells in - the proxies is identical for all nodes! */ - - /* Loop over each cell in the space. */ - for ( ind[0] = 0 ; ind[0] < cdim[0] ; ind[0]++ ) - for ( ind[1] = 0 ; ind[1] < cdim[1] ; ind[1]++ ) - for ( ind[2] = 0 ; ind[2] < cdim[2] ; ind[2]++ ) { - - /* Get the cell ID. */ - cid = cell_getid( cdim , ind[0] , ind[1] , ind[2] ); - - /* Loop over all its neighbours (periodic). */ - for ( i = -1 ; i <= 1 ; i++ ) { - ii = ind[0] + i; - if ( ii >= cdim[0] ) - ii -= cdim[0]; - else if ( ii < 0 ) - ii += cdim[0]; - for ( j = -1 ; j <= 1 ; j++ ) { - jj = ind[1] + j; - if ( jj >= cdim[1] ) - jj -= cdim[1]; - else if ( jj < 0 ) - jj += cdim[1]; - for ( k = -1 ; k <= 1 ; k++ ) { - kk = ind[2] + k; - if ( kk >= cdim[2] ) - kk -= cdim[2]; - else if ( kk < 0 ) - kk += cdim[2]; - - /* Get the cell ID. */ - cjd = cell_getid( cdim , ii , jj , kk ); - - /* Add to proxies? 
*/ - if ( cells[cid].nodeID == e->nodeID && cells[cjd].nodeID != e->nodeID ) { - pid = e->proxy_ind[ cells[cjd].nodeID ]; - if ( pid < 0 ) { - if ( e->nr_proxies == engine_maxproxies ) - error( "Maximum number of proxies exceeded." ); - proxy_init( &proxies[ e->nr_proxies ] , e->nodeID , cells[cjd].nodeID ); - e->proxy_ind[ cells[cjd].nodeID ] = e->nr_proxies; - pid = e->nr_proxies; - e->nr_proxies += 1; - } - proxy_addcell_in( &proxies[pid] , &cells[cjd] ); - proxy_addcell_out( &proxies[pid] , &cells[cid] ); - cells[cid].sendto |= ( 1ULL << pid ); - } - - if ( cells[cjd].nodeID == e->nodeID && cells[cid].nodeID != e->nodeID ) { - pid = e->proxy_ind[ cells[cid].nodeID ]; - if ( pid < 0 ) { - if ( e->nr_proxies == engine_maxproxies ) - error( "Maximum number of proxies exceeded." ); - proxy_init( &proxies[ e->nr_proxies ] , e->nodeID , cells[cid].nodeID ); - e->proxy_ind[ cells[cid].nodeID ] = e->nr_proxies; - pid = e->nr_proxies; - e->nr_proxies += 1; - } - proxy_addcell_in( &proxies[pid] , &cells[cid] ); - proxy_addcell_out( &proxies[pid] , &cells[cjd] ); - cells[cjd].sendto |= ( 1ULL << pid ); - } - } - } - } + +void engine_makeproxies(struct engine *e) { + + int i, j, k, ii, jj, kk; + int cid, cjd, pid, ind[3], *cdim = e->s->cdim; + struct space *s = e->s; + struct cell *cells = s->cells; + struct proxy *proxies = e->proxies; + + /* Prepare the proxies and the proxy index. */ + if (e->proxy_ind == NULL) + if ((e->proxy_ind = (int *)malloc(sizeof(int) * e->nr_nodes)) == NULL) + error("Failed to allocate proxy index."); + for (k = 0; k < e->nr_nodes; k++) e->proxy_ind[k] = -1; + e->nr_proxies = 0; + + /* The following loop is super-clunky, but it's necessary + to ensure that the order of the send and recv cells in + the proxies is identical for all nodes! */ + + /* Loop over each cell in the space. 
*/ + for (ind[0] = 0; ind[0] < cdim[0]; ind[0]++) + for (ind[1] = 0; ind[1] < cdim[1]; ind[1]++) + for (ind[2] = 0; ind[2] < cdim[2]; ind[2]++) { + + /* Get the cell ID. */ + cid = cell_getid(cdim, ind[0], ind[1], ind[2]); + + /* Loop over all its neighbours (periodic). */ + for (i = -1; i <= 1; i++) { + ii = ind[0] + i; + if (ii >= cdim[0]) + ii -= cdim[0]; + else if (ii < 0) + ii += cdim[0]; + for (j = -1; j <= 1; j++) { + jj = ind[1] + j; + if (jj >= cdim[1]) + jj -= cdim[1]; + else if (jj < 0) + jj += cdim[1]; + for (k = -1; k <= 1; k++) { + kk = ind[2] + k; + if (kk >= cdim[2]) + kk -= cdim[2]; + else if (kk < 0) + kk += cdim[2]; + + /* Get the cell ID. */ + cjd = cell_getid(cdim, ii, jj, kk); + + /* Add to proxies? */ + if (cells[cid].nodeID == e->nodeID && + cells[cjd].nodeID != e->nodeID) { + pid = e->proxy_ind[cells[cjd].nodeID]; + if (pid < 0) { + if (e->nr_proxies == engine_maxproxies) + error("Maximum number of proxies exceeded."); + proxy_init(&proxies[e->nr_proxies], e->nodeID, + cells[cjd].nodeID); + e->proxy_ind[cells[cjd].nodeID] = e->nr_proxies; + pid = e->nr_proxies; + e->nr_proxies += 1; } - - } - - -/** + proxy_addcell_in(&proxies[pid], &cells[cjd]); + proxy_addcell_out(&proxies[pid], &cells[cid]); + cells[cid].sendto |= (1ULL << pid); + } + + if (cells[cjd].nodeID == e->nodeID && + cells[cid].nodeID != e->nodeID) { + pid = e->proxy_ind[cells[cid].nodeID]; + if (pid < 0) { + if (e->nr_proxies == engine_maxproxies) + error("Maximum number of proxies exceeded."); + proxy_init(&proxies[e->nr_proxies], e->nodeID, + cells[cid].nodeID); + e->proxy_ind[cells[cid].nodeID] = e->nr_proxies; + pid = e->nr_proxies; + e->nr_proxies += 1; + } + proxy_addcell_in(&proxies[pid], &cells[cid]); + proxy_addcell_out(&proxies[pid], &cells[cjd]); + cells[cjd].sendto |= (1ULL << pid); + } + } + } + } + } +} + +/** * @brief Split the underlying space according to the given grid. * * @param e The #engine. * @param grid The grid. 
*/ - -void engine_split ( struct engine *e , int *grid ) { - - int j, k; - int ind[3]; - struct space *s = e->s; - struct cell *c; - - /* If we've got the wrong number of nodes, fail. */ - if ( e->nr_nodes != grid[0]*grid[1]*grid[2] ) - error( "Grid size does not match number of nodes." ); - - /* Run through the cells and set their nodeID. */ - // message("s->dim = [%e,%e,%e]", s->dim[0], s->dim[1], s->dim[2]); - for ( k = 0 ; k < s->nr_cells ; k++ ) { - c = &s->cells[k]; - for ( j = 0 ; j < 3 ; j++ ) - ind[j] = c->loc[j] / s->dim[j] * grid[j]; - c->nodeID = ind[0] + grid[0]*( ind[1] + grid[1]*ind[2] ); - // message("cell at [%e,%e,%e]: ind = [%i,%i,%i], nodeID = %i", c->loc[0], c->loc[1], c->loc[2], ind[0], ind[1], ind[2], c->nodeID); - } - - /* Make the proxies. */ - engine_makeproxies( e ); - - /* Re-allocate the local parts. */ - message("Re-allocating parts array from %i to %i.", s->size_parts, (int)(s->nr_parts * 1.2)); - s->size_parts = s->nr_parts * 1.2; - struct part *parts_new; - struct xpart *xparts_new; - if ( posix_memalign( (void **)&parts_new , part_align , sizeof(struct part) * s->size_parts ) != 0 || - posix_memalign( (void **)&xparts_new , part_align , sizeof(struct xpart) * s->size_parts ) != 0 ) - error( "Failed to allocate new part data." ); - memcpy( parts_new , s->parts , sizeof(struct part) * s->nr_parts ); - memcpy( xparts_new , s->xparts , sizeof(struct xpart) * s->nr_parts ); - free( s->parts ); - free( s->xparts ); - s->parts = parts_new; - s->xparts = xparts_new; - } - - +void engine_split(struct engine *e, int *grid) { + + int j, k; + int ind[3]; + struct space *s = e->s; + struct cell *c; + + /* If we've got the wrong number of nodes, fail. */ + if (e->nr_nodes != grid[0] * grid[1] * grid[2]) + error("Grid size does not match number of nodes."); + + /* Run through the cells and set their nodeID. 
*/ + // message("s->dim = [%e,%e,%e]", s->dim[0], s->dim[1], s->dim[2]); + for (k = 0; k < s->nr_cells; k++) { + c = &s->cells[k]; + for (j = 0; j < 3; j++) ind[j] = c->loc[j] / s->dim[j] * grid[j]; + c->nodeID = ind[0] + grid[0] * (ind[1] + grid[1] * ind[2]); + // message("cell at [%e,%e,%e]: ind = [%i,%i,%i], nodeID = %i", c->loc[0], + // c->loc[1], c->loc[2], ind[0], ind[1], ind[2], c->nodeID); + } + + /* Make the proxies. */ + engine_makeproxies(e); + + /* Re-allocate the local parts. */ + message("Re-allocating parts array from %i to %i.", s->size_parts, + (int)(s->nr_parts * 1.2)); + s->size_parts = s->nr_parts * 1.2; + struct part *parts_new; + struct xpart *xparts_new; + if (posix_memalign((void **)&parts_new, part_align, + sizeof(struct part) * s->size_parts) != 0 || + posix_memalign((void **)&xparts_new, part_align, + sizeof(struct xpart) * s->size_parts) != 0) + error("Failed to allocate new part data."); + memcpy(parts_new, s->parts, sizeof(struct part) * s->nr_parts); + memcpy(xparts_new, s->xparts, sizeof(struct xpart) * s->nr_parts); + free(s->parts); + free(s->xparts); + s->parts = parts_new; + s->xparts = xparts_new; +} + /** * @brief init an engine with the given number of threads, queues, and * the given policy. @@ -2038,152 +2164,148 @@ void engine_split ( struct engine *e , int *grid ) { * @param nodeID The MPI rank of this node * @param policy The queueing policy to use. */ - -void engine_init ( struct engine *e , struct space *s , float dt , int nr_threads , int nr_queues , int nr_nodes , int nodeID , int policy ) { - - int k; - float dt_min = dt; - #if defined(HAVE_SETAFFINITY) - int nr_cores = sysconf( _SC_NPROCESSORS_ONLN ); - int i, j, cpuid[ nr_cores ]; - cpu_set_t cpuset; - if ( policy & engine_policy_cputight ) { - for ( k = 0 ; k < nr_cores ; k++ ) - cpuid[k] = k; - } - else { - /* Get next highest power of 2. 
*/ - int maxint = 1; - while ( maxint < nr_cores ) - maxint *= 2; - - cpuid[0] = 0; - k = 1; - for ( i = 1 ; i < maxint ; i *= 2 ) - for ( j = maxint / i / 2 ; j < maxint ; j += maxint / i ) - if ( j < nr_cores && j != 0 ) - cpuid[k++] = j; - #ifdef WITHMPI - printf( "engine_init: cpu map is [ " ); - #else - printf( "[%03i] engine_init: cpu map is [ " , nodeID ); - #endif - for ( i = 0 ; i < nr_cores ; i++ ) - printf( "%i " , cpuid[i] ); - printf( "].\n" ); - } - #endif - - /* Store the values. */ - e->s = s; - e->nr_threads = nr_threads; - e->policy = policy; - e->step = 0; - e->nullstep = 0; - e->time = 0.0; - e->nr_nodes = nr_nodes; - e->nodeID = nodeID; - e->proxy_ind = NULL; + +void engine_init(struct engine *e, struct space *s, float dt, int nr_threads, + int nr_queues, int nr_nodes, int nodeID, int policy) { + + int k; + float dt_min = dt; +#if defined(HAVE_SETAFFINITY) + int nr_cores = sysconf(_SC_NPROCESSORS_ONLN); + int i, j, cpuid[nr_cores]; + cpu_set_t cpuset; + if (policy & engine_policy_cputight) { + for (k = 0; k < nr_cores; k++) cpuid[k] = k; + } else { + /* Get next highest power of 2. */ + int maxint = 1; + while (maxint < nr_cores) maxint *= 2; + + cpuid[0] = 0; + k = 1; + for (i = 1; i < maxint; i *= 2) + for (j = maxint / i / 2; j < maxint; j += maxint / i) + if (j < nr_cores && j != 0) cpuid[k++] = j; +#ifdef WITHMPI + printf("engine_init: cpu map is [ "); +#else + printf("[%03i] engine_init: cpu map is [ ", nodeID); +#endif + for (i = 0; i < nr_cores; i++) printf("%i ", cpuid[i]); + printf("].\n"); + } +#endif + + /* Store the values. */ + e->s = s; + e->nr_threads = nr_threads; + e->policy = policy; + e->step = 0; + e->nullstep = 0; + e->time = 0.0; + e->nr_nodes = nr_nodes; + e->nodeID = nodeID; + e->proxy_ind = NULL; + e->nr_proxies = 0; + e->forcerebuild = 1; + e->forcerepart = 0; + e->links = NULL; + e->nr_links = 0; + engine_rank = nodeID; + + /* Make the space link back to the engine. 
*/ + s->e = e; + + /* Are we doing stuff in parallel? */ + if (nr_nodes > 1) { +#ifndef HAVE_MPI + error("SWIFT was not compiled with MPI support."); +#else + e->policy |= engine_policy_mpi; + if ((e->proxies = (struct proxy *)malloc(sizeof(struct proxy) * + engine_maxproxies)) == NULL) + error("Failed to allocate memory for proxies."); + bzero(e->proxies, sizeof(struct proxy) * engine_maxproxies); e->nr_proxies = 0; - e->forcerebuild = 1; - e->forcerepart = 0; - e->links = NULL; - e->nr_links = 0; - engine_rank = nodeID; - - /* Make the space link back to the engine. */ - s->e = e; - - /* Are we doing stuff in parallel? */ - if ( nr_nodes > 1 ) { - #ifndef HAVE_MPI - error( "SWIFT was not compiled with MPI support." ); - #else - e->policy |= engine_policy_mpi; - if ( ( e->proxies = (struct proxy *)malloc( sizeof(struct proxy) * engine_maxproxies ) ) == NULL ) - error( "Failed to allocate memory for proxies." ); - bzero( e->proxies , sizeof(struct proxy) * engine_maxproxies ); - e->nr_proxies = 0; - #endif - } - - /* First of all, init the barrier and lock it. */ - if ( pthread_mutex_init( &e->barrier_mutex , NULL ) != 0 ) - error( "Failed to initialize barrier mutex." ); - if ( pthread_cond_init( &e->barrier_cond , NULL ) != 0 ) - error( "Failed to initialize barrier condition variable." ); - if ( pthread_mutex_lock( &e->barrier_mutex ) != 0 ) - error( "Failed to lock barrier mutex." ); - e->barrier_running = 0; - e->barrier_launch = 0; - e->barrier_launchcount = 0; - - /* Run through the parts and get the minimum time step. */ - e->dt_orig = dt; - for ( k = 0 ; k < s->nr_parts ; k++ ) - if ( s->parts[k].dt < dt_min ) - dt_min = s->parts[k].dt; - if ( dt_min == 0.0f ) - dt = 0.0f; - else - while ( dt > dt_min ) - dt *= 0.5f; - e->dt = dt; - - /* Init the scheduler. */ - scheduler_init( &e->sched , e->s , nr_queues , scheduler_flag_steal , e->nodeID ); - s->nr_queues = nr_queues; - - /* Append a kick1 task to each cell. 
*/ - scheduler_reset( &e->sched , s->tot_cells ); - for ( k = 0 ; k < s->nr_cells ; k++ ) - s->cells[k].kick1 = scheduler_addtask( &e->sched , task_type_kick1 , task_subtype_none , 0 , 0 , &s->cells[k] , NULL , 0 ); - scheduler_ranktasks( &e->sched ); - - /* Allocate and init the threads. */ - if ( ( e->runners = (struct runner *)malloc( sizeof(struct runner) * nr_threads ) ) == NULL ) - error( "Failed to allocate threads array." ); - for ( k = 0 ; k < nr_threads ; k++ ) { - e->runners[k].id = k; - e->runners[k].e = e; - e->barrier_running += 1; - if ( pthread_create( &e->runners[k].thread , NULL , &runner_main , &e->runners[k] ) != 0 ) - error( "Failed to create runner thread." ); - if ( e->policy & engine_policy_setaffinity ) { - #if defined(HAVE_SETAFFINITY) - - /* Set a reasonable queue ID. */ - e->runners[k].cpuid = cpuid[ k % nr_cores ]; - if ( nr_queues < nr_threads ) - e->runners[k].qid = cpuid[ k % nr_cores ] * nr_queues / nr_cores; - else - e->runners[k].qid = k; - - /* Set the cpu mask to zero | e->id. */ - CPU_ZERO( &cpuset ); - CPU_SET( cpuid[ k % nr_cores ] , &cpuset ); - - /* Apply this mask to the runner's pthread. */ - if ( pthread_setaffinity_np( e->runners[k].thread , sizeof(cpu_set_t) , &cpuset ) != 0 ) - error( "Failed to set thread affinity." ); - - #else - error( "SWIFT was not compiled with affinity enabled." ); - #endif - } - else { - e->runners[k].cpuid = k; - e->runners[k].qid = k * nr_queues / nr_threads; - } - // message( "runner %i on cpuid=%i with qid=%i." , e->runners[k].id , e->runners[k].cpuid , e->runners[k].qid ); - } - - /* Wait for the runner threads to be in place. */ - while ( e->barrier_running || e->barrier_launch ) - if ( pthread_cond_wait( &e->barrier_cond , &e->barrier_mutex ) != 0 ) - error( "Error while waiting for runner threads to get in place." ); - +#endif + } + + /* First of all, init the barrier and lock it. 
*/ + if (pthread_mutex_init(&e->barrier_mutex, NULL) != 0) + error("Failed to initialize barrier mutex."); + if (pthread_cond_init(&e->barrier_cond, NULL) != 0) + error("Failed to initialize barrier condition variable."); + if (pthread_mutex_lock(&e->barrier_mutex) != 0) + error("Failed to lock barrier mutex."); + e->barrier_running = 0; + e->barrier_launch = 0; + e->barrier_launchcount = 0; + + /* Run through the parts and get the minimum time step. */ + e->dt_orig = dt; + for (k = 0; k < s->nr_parts; k++) + if (s->parts[k].dt < dt_min) dt_min = s->parts[k].dt; + if (dt_min == 0.0f) + dt = 0.0f; + else + while (dt > dt_min) dt *= 0.5f; + e->dt = dt; + + /* Init the scheduler. */ + scheduler_init(&e->sched, e->s, nr_queues, scheduler_flag_steal, e->nodeID); + s->nr_queues = nr_queues; + + /* Append a kick1 task to each cell. */ + scheduler_reset(&e->sched, s->tot_cells); + for (k = 0; k < s->nr_cells; k++) + s->cells[k].kick1 = + scheduler_addtask(&e->sched, task_type_kick1, task_subtype_none, 0, 0, + &s->cells[k], NULL, 0); + scheduler_ranktasks(&e->sched); + + /* Allocate and init the threads. */ + if ((e->runners = + (struct runner *)malloc(sizeof(struct runner) * nr_threads)) == NULL) + error("Failed to allocate threads array."); + for (k = 0; k < nr_threads; k++) { + e->runners[k].id = k; + e->runners[k].e = e; + e->barrier_running += 1; + if (pthread_create(&e->runners[k].thread, NULL, &runner_main, + &e->runners[k]) != 0) + error("Failed to create runner thread."); + if (e->policy & engine_policy_setaffinity) { +#if defined(HAVE_SETAFFINITY) + + /* Set a reasonable queue ID. */ + e->runners[k].cpuid = cpuid[k % nr_cores]; + if (nr_queues < nr_threads) + e->runners[k].qid = cpuid[k % nr_cores] * nr_queues / nr_cores; + else + e->runners[k].qid = k; + + /* Set the cpu mask to zero | e->id. */ + CPU_ZERO(&cpuset); + CPU_SET(cpuid[k % nr_cores], &cpuset); + + /* Apply this mask to the runner's pthread. 
*/ + if (pthread_setaffinity_np(e->runners[k].thread, sizeof(cpu_set_t), + &cpuset) != 0) + error("Failed to set thread affinity."); + +#else + error("SWIFT was not compiled with affinity enabled."); +#endif + } else { + e->runners[k].cpuid = k; + e->runners[k].qid = k * nr_queues / nr_threads; } - - - + // message( "runner %i on cpuid=%i with qid=%i." , e->runners[k].id , + // e->runners[k].cpuid , e->runners[k].qid ); + } + + /* Wait for the runner threads to be in place. */ + while (e->barrier_running || e->barrier_launch) + if (pthread_cond_wait(&e->barrier_cond, &e->barrier_mutex) != 0) + error("Error while waiting for runner threads to get in place."); +} diff --git a/src/engine.h b/src/engine.h index ba525ccc3ec4c20ef3d86361587be93be119f39b..caa286e7d3c518c0aba84fd9da1b6ff9ef6a78f4 100644 --- a/src/engine.h +++ b/src/engine.h @@ -1,135 +1,144 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_ENGINE_H +#define SWIFT_ENGINE_H +/* Some standard headers. */ +#include <pthread.h> +/* Includes. 
*/ +#include "lock.h" +#include "proxy.h" +#include "runner.h" +#include "scheduler.h" +#include "space.h" +#include "task.h" /* Some constants. */ -#define engine_policy_none 0 -#define engine_policy_rand 1 -#define engine_policy_steal 2 -#define engine_policy_keep 4 -#define engine_policy_block 8 -#define engine_policy_fixdt 16 -#define engine_policy_multistep 32 -#define engine_policy_cputight 64 -#define engine_policy_mpi 128 -#define engine_policy_setaffinity 256 - -#define engine_queue_scale 1.2 -#define engine_maxtaskspercell 128 -#define engine_maxproxies 64 -#define engine_tasksreweight 10 +#define engine_policy_none 0 +#define engine_policy_rand 1 +#define engine_policy_steal 2 +#define engine_policy_keep 4 +#define engine_policy_block 8 +#define engine_policy_fixdt 16 +#define engine_policy_multistep 32 +#define engine_policy_cputight 64 +#define engine_policy_mpi 128 +#define engine_policy_setaffinity 256 +#define engine_policy_paranoid 512 +#define engine_queue_scale 1.2 +#define engine_maxtaskspercell 128 +#define engine_maxproxies 64 +#define engine_tasksreweight 10 /* The rank of the engine as a global variable (for messages). */ extern int engine_rank; - /* Mini struct to link cells to density/force tasks. */ struct link { - /* The task pointer. */ - struct task *t; - - /* The next pointer. */ - struct link *next; - - }; + /* The task pointer. */ + struct task *t; + /* The next pointer. */ + struct link *next; +}; /* Data structure for the engine. */ struct engine { - /* Number of threads on which to run. */ - int nr_threads; - - /* The space with which the runner is associated. */ - struct space *s; - - /* The runner's threads. */ - struct runner *runners; - - /* The running policy. */ - int policy; - - /* The task scheduler. */ - struct scheduler sched; - - /* The maximum dt to step (current). */ - float dt_step; - - /* The minimum dt over all particles in the system. */ - float dt_min, dt_max; - - /* The system time step. 
*/ - float dt, dt_orig; - - /* The system energies from the previous step. */ - double ekin, epot; - - /* The current step number. */ - int step, nullstep; - - /* The number of particles updated in the previous step. */ - int count_step; - - /* The current system time. */ - float time; - - /* Data for the threads' barrier. */ - pthread_mutex_t barrier_mutex; - pthread_cond_t barrier_cond; - volatile int barrier_running, barrier_launch, barrier_launchcount; - - /* ID of the node this engine lives on. */ - int nr_nodes, nodeID; - - /* Proxies for the other nodes in this simulation. */ - struct proxy *proxies; - int nr_proxies, *proxy_ind; - - /* Tic at the start of a step. */ - ticks tic_step; - - /* Force the engine to rebuild? */ - int forcerebuild, forcerepart; - - /* How many steps have we done with the same set of tasks? */ - int tasks_age; - - /* Linked list for cell-task association. */ - struct link *links; - int nr_links; - - }; + /* Number of threads on which to run. */ + int nr_threads; + + /* The space with which the runner is associated. */ + struct space *s; + + /* The runner's threads. */ + struct runner *runners; + + /* The running policy. */ + int policy; + + /* The task scheduler. */ + struct scheduler sched; + + /* The maximum dt to step (current). */ + float dt_step; + + /* The minimum dt over all particles in the system. */ + float dt_min, dt_max; + + /* The system time step. */ + float dt, dt_orig; + + /* The system energies from the previous step. */ + double ekin, epot; + + /* The current step number. */ + int step, nullstep; + /* The number of particles updated in the previous step. */ + int count_step; + + /* The current system time. */ + float time; + + /* Data for the threads' barrier. */ + pthread_mutex_t barrier_mutex; + pthread_cond_t barrier_cond; + volatile int barrier_running, barrier_launch, barrier_launchcount; + + /* ID of the node this engine lives on. 
*/ + int nr_nodes, nodeID; + + /* Proxies for the other nodes in this simulation. */ + struct proxy *proxies; + int nr_proxies, *proxy_ind; + + /* Tic at the start of a step. */ + ticks tic_step; + + /* Force the engine to rebuild? */ + int forcerebuild, forcerepart; + + /* How many steps have we done with the same set of tasks? */ + int tasks_age; + + /* Linked list for cell-task association. */ + struct link *links; + int nr_links; +}; /* Function prototypes. */ -void engine_barrier( struct engine *e , int tid ); -void engine_init ( struct engine *e , struct space *s , float dt , int nr_threads , int nr_queues , int nr_nodes , int nodeID , int policy ); -void engine_prepare ( struct engine *e ); -void engine_step ( struct engine *e ); -void engine_maketasks ( struct engine *e ); -void engine_split ( struct engine *e , int *grid ); -int engine_exchange_strays ( struct engine *e , int offset , int *ind , int N ); -void engine_rebuild ( struct engine *e ); -void engine_repartition ( struct engine *e ); -void engine_makeproxies ( struct engine *e ); -void engine_redistribute ( struct engine *e ); -struct link *engine_addlink( struct engine *e , struct link *l , struct task *t ); +void engine_barrier(struct engine *e, int tid); +void engine_init(struct engine *e, struct space *s, float dt, int nr_threads, + int nr_queues, int nr_nodes, int nodeID, int policy); +void engine_prepare(struct engine *e); +void engine_step(struct engine *e); +void engine_maketasks(struct engine *e); +void engine_split(struct engine *e, int *grid); +int engine_exchange_strays(struct engine *e, int offset, int *ind, int N); +void engine_rebuild(struct engine *e); +void engine_repartition(struct engine *e); +void engine_makeproxies(struct engine *e); +void engine_redistribute(struct engine *e); +struct link *engine_addlink(struct engine *e, struct link *l, struct task *t); + +#endif /* SWIFT_ENGINE_H */ diff --git a/src/error.h b/src/error.h index 
b41f9a38e237509d9bcecd7c9cdf093487f804cd..e581dcf86ecea9abbc0a116fb041175fd872758c 100644 --- a/src/error.h +++ b/src/error.h @@ -2,44 +2,63 @@ * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk), * Matthieu Schaller (matthieu.schaller@durham.ac.uk). - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_ERROR_H +#define SWIFT_ERROR_H +/* Some standard headers. */ #include <stdio.h> +/* MPI headers. */ +#ifdef WITH_MPI +#include <mpi.h> +#endif /** * @brief Error macro. Prints the message given in argument and aborts. * */ #ifdef WITH_MPI - extern int engine_rank; - #define error(s, ...) { fprintf( stderr , "[%03i] %s:%s():%i: " s "\n" , engine_rank , __FILE__ , __FUNCTION__ , __LINE__ , ##__VA_ARGS__ ); MPI_Abort(MPI_COMM_WORLD, -1); } +extern int engine_rank; +#define error(s, ...) \ + { \ + fprintf(stderr, "[%03i] %s:%s():%i: " s "\n", engine_rank, __FILE__, \ + __FUNCTION__, __LINE__, ##__VA_ARGS__); \ + MPI_Abort(MPI_COMM_WORLD, -1); \ + } #else - #define error(s, ...) { fprintf( stderr , "%s:%s():%i: " s "\n" , __FILE__ , __FUNCTION__ , __LINE__ , ##__VA_ARGS__ ); abort(); } +#define error(s, ...) 
\ + { \ + fprintf(stderr, "%s:%s():%i: " s "\n", __FILE__, __FUNCTION__, __LINE__, \ + ##__VA_ARGS__); \ + abort(); \ + } #endif - /** * @brief Macro to print a localized message with variable arguments. * */ #ifdef WITH_MPI - extern int engine_rank; - #define message(s, ...) printf( "[%03i] %s: " s "\n" , engine_rank , __FUNCTION__ , ##__VA_ARGS__ ) +extern int engine_rank; +#define message(s, ...) \ + printf("[%03i] %s: " s "\n", engine_rank, __FUNCTION__, ##__VA_ARGS__) #else - #define message(s, ...) printf( "%s: " s "\n" , __FUNCTION__ , ##__VA_ARGS__ ) +#define message(s, ...) printf("%s: " s "\n", __FUNCTION__, ##__VA_ARGS__) #endif + +#endif /* SWIFT_ERROR_H */ diff --git a/src/inline.h b/src/inline.h index a9b3059fe7570a9b7bb67cc0d4b9f93181c19ccf..06728cb87f5e342b22d4a4a861cbd83ea6af31d9 100644 --- a/src/inline.h +++ b/src/inline.h @@ -2,29 +2,33 @@ * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk), * Matthieu Schaller (matthieu.schaller@durham.ac.uk). - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- * + * ******************************************************************************/ +#ifndef SWIFT_INLINE_H +#define SWIFT_INLINE_H /** - * @brief Defines inline + * @brief Defines inline */ #ifndef INLINE -# if __GNUC__ && !__GNUC_STDC_INLINE__ -# define INLINE extern inline -# else -# define INLINE inline -# endif +#if __GNUC__ && !__GNUC_STDC_INLINE__ +#define INLINE extern inline +#else +#define INLINE inline #endif +#endif + +#endif /* SWIFT_INLINE_H */ diff --git a/src/kernel.h b/src/kernel.h index c012739f300aeb5aeedd4b56798b00b2d7ed5cc9..0fc232597e1e9917d17f068407acc85b37659d42 100644 --- a/src/kernel.h +++ b/src/kernel.h @@ -2,24 +2,28 @@ * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) * Matthieu Schaller (matthieu.schaller@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ -#ifndef KERNEL_H -#define KERNEL_H +#ifndef SWIFT_KERNEL_H +#define SWIFT_KERNEL_H +/* Includes. */ +#include "const.h" +#include "inline.h" +#include "vector.h" /** * @file kernel.h @@ -27,185 +31,194 @@ * as well as the blending function used for gravity. 
*/ -#include "vector.h" - -/* Gravity kernel stuff ----------------------------------------------------------------------------------------------- */ +/* Gravity kernel stuff + * ----------------------------------------------------------------------------------------------- + */ /* The gravity kernel is defined as a degree 6 polynomial in the distance r. The resulting value should be post-multiplied with r^-3, resulting in a polynomial with terms ranging from r^-3 to r^3, which are sufficient to model both the direct potential as well as the splines near the origin. */ - + /* Coefficients for the gravity kernel. */ #define kernel_grav_degree 6 #define kernel_grav_ivals 2 -#define kernel_grav_scale (2*const_iepsilon) -static float kernel_grav_coeffs[ (kernel_grav_degree+1) * (kernel_grav_ivals+1) ] = - { 32.0f*const_iepsilon6 , -192.0f/5.0f*const_iepsilon5 , 0.0f , 32.0f/3.0f*const_iepsilon3 , 0.0f , 0.0f , 0.0f , - -32.0f/3.0f*const_iepsilon6 , 192.0f/5.0f*const_iepsilon5 , -48.0f*const_iepsilon4 , 64.0f/3.0f*const_iepsilon3 , 0.0f , 0.0f , -1.0f/15.0f , - 0.0f , 0.0f , 0.0f , 0.0f , 0.0f , 0.0f , 1.0f }; - +#define kernel_grav_scale (2 * const_iepsilon) +static float kernel_grav_coeffs + [(kernel_grav_degree + 1) * (kernel_grav_ivals + 1)] = { + 32.0f * const_iepsilon6, -192.0f / 5.0f * const_iepsilon5, + 0.0f, 32.0f / 3.0f * const_iepsilon3, + 0.0f, 0.0f, + 0.0f, -32.0f / 3.0f * const_iepsilon6, + 192.0f / 5.0f * const_iepsilon5, -48.0f * const_iepsilon4, + 64.0f / 3.0f * const_iepsilon3, 0.0f, + 0.0f, -1.0f / 15.0f, + 0.0f, 0.0f, + 0.0f, 0.0f, + 0.0f, 0.0f, + 1.0f}; /** * @brief Computes the gravity cubic spline for a given distance x. 
*/ -__attribute__ ((always_inline)) INLINE static void kernel_grav_eval ( float x , float *W ) { - int ind = fmin( x*kernel_grav_scale , kernel_grav_ivals ); - float *coeffs = &kernel_grav_coeffs[ ind*(kernel_grav_degree + 1) ]; - float w = coeffs[0]*x + coeffs[1]; - for ( int k = 2 ; k <= kernel_grav_degree ; k++ ) - w = x*w + coeffs[k]; - *W = w; - } - +__attribute__((always_inline)) INLINE static void kernel_grav_eval(float x, + float *W) { + int ind = fmin(x * kernel_grav_scale, kernel_grav_ivals); + float *coeffs = &kernel_grav_coeffs[ind * (kernel_grav_degree + 1)]; + float w = coeffs[0] * x + coeffs[1]; + for (int k = 2; k <= kernel_grav_degree; k++) w = x * w + coeffs[k]; + *W = w; +} #ifdef VECTORIZE /** - * @brief Computes the gravity cubic spline for a given distance x (Vectorized version). + * @brief Computes the gravity cubic spline for a given distance x (Vectorized + * version). */ -__attribute__ ((always_inline)) INLINE static void kernel_grav_eval_vec ( vector *x , vector *w ) { - - vector ind, c[kernel_grav_degree+1]; - int j, k; - - /* Load x and get the interval id. */ - ind.m = vec_ftoi( vec_fmin( x->v*vec_set1( kernel_grav_scale ) , vec_set1( (float)kernel_grav_ivals ) ) ); - - /* load the coefficients. */ - for ( k = 0 ; k < VEC_SIZE ; k++ ) - for ( j = 0 ; j < kernel_grav_degree+1 ; j++ ) - c[j].f[k] = kernel_grav_coeffs[ ind.i[k]*(kernel_grav_degree + 1) + j ]; - - /* Init the iteration for Horner's scheme. */ - w->v = ( c[0].v * x->v ) + c[1].v; - - /* And we're off! */ - for ( int k = 2 ; k <= kernel_grav_degree ; k++ ) - w->v = ( x->v * w->v ) + c[k].v; - - } - - -#endif +__attribute__((always_inline)) + INLINE static void kernel_grav_eval_vec(vector *x, vector *w) { + + vector ind, c[kernel_grav_degree + 1]; + int j, k; + + /* Load x and get the interval id. */ + ind.m = vec_ftoi(vec_fmin(x->v * vec_set1(kernel_grav_scale), + vec_set1((float)kernel_grav_ivals))); + /* load the coefficients. 
*/ + for (k = 0; k < VEC_SIZE; k++) + for (j = 0; j < kernel_grav_degree + 1; j++) + c[j].f[k] = kernel_grav_coeffs[ind.i[k] * (kernel_grav_degree + 1) + j]; -/* Blending function stuff -------------------------------------------------------------------------------------------- */ + /* Init the iteration for Horner's scheme. */ + w->v = (c[0].v * x->v) + c[1].v; + + /* And we're off! */ + for (int k = 2; k <= kernel_grav_degree; k++) w->v = (x->v * w->v) + c[k].v; +} + +#endif + +/* Blending function stuff + * -------------------------------------------------------------------------------------------- + */ /* Coefficients for the blending function. */ #define blender_degree 3 #define blender_ivals 3 #define blender_scale 4.0f -static float blender_coeffs[ (blender_degree+1) * (blender_ivals+1) ] = - { 0.0f , 0.0f , 0.0f , 1.0f , - -32.0f , 24.0f , -6.0f , 1.5f , - -32.0f , 72.0f , -54.0f , 13.5f , - 0.0f , 0.0f , 0.0f , 0.0f }; - - +static float blender_coeffs[(blender_degree + 1) * (blender_ivals + 1)] = { + 0.0f, 0.0f, 0.0f, 1.0f, -32.0f, 24.0f, -6.0f, 1.5f, + -32.0f, 72.0f, -54.0f, 13.5f, 0.0f, 0.0f, 0.0f, 0.0f}; + /** * @brief Computes the cubic spline blender for a given distance x. */ -__attribute__ ((always_inline)) INLINE static void blender_eval ( float x , float *W ) { - int ind = fmin( x*blender_scale , blender_ivals ); - float *coeffs = &blender_coeffs[ ind*(blender_degree + 1) ]; - float w = coeffs[0]*x + coeffs[1]; - for ( int k = 2 ; k <= blender_degree ; k++ ) - w = x*w + coeffs[k]; - *W = w; - } - +__attribute__((always_inline)) INLINE static void blender_eval(float x, + float *W) { + int ind = fmin(x * blender_scale, blender_ivals); + float *coeffs = &blender_coeffs[ind * (blender_degree + 1)]; + float w = coeffs[0] * x + coeffs[1]; + for (int k = 2; k <= blender_degree; k++) w = x * w + coeffs[k]; + *W = w; +} /** - * @brief Computes the cubic spline blender and its derivative for a given distance x. 
+ * @brief Computes the cubic spline blender and its derivative for a given + * distance x. */ -__attribute__ ((always_inline)) INLINE static void blender_deval ( float x , float *W , float *dW_dx ) { - int ind = fminf( x*blender_scale , blender_ivals ); - float *coeffs = &blender_coeffs[ ind*(blender_degree + 1) ]; - float w = coeffs[0]*x + coeffs[1]; - float dw_dx = coeffs[0]; - for ( int k = 2 ; k <= blender_degree ; k++ ) { - dw_dx = dw_dx*x + w; - w = x*w + coeffs[k]; - } - *W = w; - *dW_dx = dw_dx; - } - +__attribute__((always_inline)) INLINE static void blender_deval(float x, + float *W, + float *dW_dx) { + int ind = fminf(x * blender_scale, blender_ivals); + float *coeffs = &blender_coeffs[ind * (blender_degree + 1)]; + float w = coeffs[0] * x + coeffs[1]; + float dw_dx = coeffs[0]; + for (int k = 2; k <= blender_degree; k++) { + dw_dx = dw_dx * x + w; + w = x * w + coeffs[k]; + } + *W = w; + *dW_dx = dw_dx; +} #ifdef VECTORIZE /** - * @brief Computes the cubic spline blender and its derivative for a given distance x (Vectorized version). Gives a sensible answer only if x<2. + * @brief Computes the cubic spline blender and its derivative for a given + * distance x (Vectorized version). Gives a sensible answer only if x<2. */ -__attribute__ ((always_inline)) INLINE static void blender_eval_vec ( vector *x , vector *w ) { - - vector ind, c[blender_degree+1]; - int j, k; - - /* Load x and get the interval id. */ - ind.m = vec_ftoi( vec_fmin( x->v*vec_set1( blender_scale ) , vec_set1( (float)blender_ivals ) ) ); - - /* load the coefficients. */ - for ( k = 0 ; k < VEC_SIZE ; k++ ) - for ( j = 0 ; j < blender_degree+1 ; j++ ) - c[j].f[k] = blender_coeffs[ ind.i[k]*(blender_degree + 1) + j ]; - - /* Init the iteration for Horner's scheme. */ - w->v = ( c[0].v * x->v ) + c[1].v; - - /* And we're off! 
*/ - for ( int k = 2 ; k <= blender_degree ; k++ ) - w->v = ( x->v * w->v ) + c[k].v; - - } - - +__attribute__((always_inline)) INLINE static void blender_eval_vec(vector *x, + vector *w) { + + vector ind, c[blender_degree + 1]; + int j, k; + + /* Load x and get the interval id. */ + ind.m = vec_ftoi( + vec_fmin(x->v * vec_set1(blender_scale), vec_set1((float)blender_ivals))); + + /* load the coefficients. */ + for (k = 0; k < VEC_SIZE; k++) + for (j = 0; j < blender_degree + 1; j++) + c[j].f[k] = blender_coeffs[ind.i[k] * (blender_degree + 1) + j]; + + /* Init the iteration for Horner's scheme. */ + w->v = (c[0].v * x->v) + c[1].v; + + /* And we're off! */ + for (int k = 2; k <= blender_degree; k++) w->v = (x->v * w->v) + c[k].v; +} + /** - * @brief Computes the cubic spline blender and its derivative for a given distance x (Vectorized version). Gives a sensible answer only if x<2. + * @brief Computes the cubic spline blender and its derivative for a given + * distance x (Vectorized version). Gives a sensible answer only if x<2. */ -__attribute__ ((always_inline)) INLINE static void blender_deval_vec ( vector *x , vector *w , vector *dw_dx ) { - - vector ind, c[blender_degree+1]; - int j, k; - - /* Load x and get the interval id. */ - ind.m = vec_ftoi( vec_fmin( x->v*vec_set1( blender_scale ) , vec_set1( (float)blender_ivals ) ) ); - - /* load the coefficients. */ - for ( k = 0 ; k < VEC_SIZE ; k++ ) - for ( j = 0 ; j < blender_degree+1 ; j++ ) - c[j].f[k] = blender_coeffs[ ind.i[k]*(blender_degree + 1) + j ]; - - /* Init the iteration for Horner's scheme. */ - w->v = ( c[0].v * x->v ) + c[1].v; - dw_dx->v = c[0].v; - - /* And we're off! 
*/ - for ( int k = 2 ; k <= blender_degree ; k++ ) { - dw_dx->v = ( dw_dx->v * x->v ) + w->v; - w->v = ( x->v * w->v ) + c[k].v; - } - - } - -#endif +__attribute__((always_inline)) + INLINE static void blender_deval_vec(vector *x, vector *w, vector *dw_dx) { + vector ind, c[blender_degree + 1]; + int j, k; -/* -------------------------------------------------------------------------------------------------------------------- */ + /* Load x and get the interval id. */ + ind.m = vec_ftoi( + vec_fmin(x->v * vec_set1(blender_scale), vec_set1((float)blender_ivals))); + + /* load the coefficients. */ + for (k = 0; k < VEC_SIZE; k++) + for (j = 0; j < blender_degree + 1; j++) + c[j].f[k] = blender_coeffs[ind.i[k] * (blender_degree + 1) + j]; + + /* Init the iteration for Horner's scheme. */ + w->v = (c[0].v * x->v) + c[1].v; + dw_dx->v = c[0].v; + + /* And we're off! */ + for (int k = 2; k <= blender_degree; k++) { + dw_dx->v = (dw_dx->v * x->v) + w->v; + w->v = (x->v * w->v) + c[k].v; + } +} + +#endif + +/* -------------------------------------------------------------------------------------------------------------------- + */ #if defined(CUBIC_SPLINE_KERNEL) -/* -------------------------------------------------------------------------------------------------------------------- */ +/* -------------------------------------------------------------------------------------------------------------------- + */ -/* Coefficients for the kernel. */ +/* Coefficients for the kernel. 
*/ #define kernel_name "Cubic spline" #define kernel_degree 3 #define kernel_ivals 2 @@ -213,89 +226,94 @@ __attribute__ ((always_inline)) INLINE static void blender_deval_vec ( vector *x #define kernel_gamma2 4.0f #define kernel_gamma3 8.0f #define kernel_igamma 0.5f -#define kernel_nwneigh ( 4.0/3.0*M_PI*const_eta_kernel*const_eta_kernel*const_eta_kernel*6.0858f ) -static float kernel_coeffs[ (kernel_degree + 1) * (kernel_ivals + 1) ] __attribute__ ((aligned (16))) = - { 3.0/4.0*M_1_PI , -3.0/2.0*M_1_PI , 0.0 , M_1_PI , - -0.25*M_1_PI , 3.0/2.0*M_1_PI , -3.0*M_1_PI , M_2_PI , - 0.0 , 0.0 , 0.0 , 0.0 }; -#define kernel_root ( kernel_coeffs[ kernel_degree ] ) -#define kernel_wroot ( 4.0/3.0*M_PI*kernel_coeffs[ kernel_degree ] ) - - +#define kernel_nwneigh \ + (4.0 / 3.0 * M_PI *const_eta_kernel *const_eta_kernel *const_eta_kernel * \ + 6.0858f) +static float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)] + __attribute__((aligned(16))) = { + 3.0 / 4.0 * M_1_PI, -3.0 / 2.0 * M_1_PI, 0.0, M_1_PI, + -0.25 * M_1_PI, 3.0 / 2.0 * M_1_PI, -3.0 * M_1_PI, M_2_PI, + 0.0, 0.0, 0.0, 0.0}; +#define kernel_root (kernel_coeffs[kernel_degree]) +#define kernel_wroot (4.0 / 3.0 * M_PI *kernel_coeffs[kernel_degree]) + /** - * @brief Computes the cubic spline kernel and its derivative for a given distance x. Gives a sensible answer only if x<2. + * @brief Computes the cubic spline kernel and its derivative for a given + * distance x. Gives a sensible answer only if x<2. 
*/ -__attribute__ ((always_inline)) INLINE static void kernel_deval ( float x , float *W , float *dW_dx ) { - int ind = fminf( x , kernel_ivals ); - float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ]; - float w = coeffs[0]*x + coeffs[1]; - float dw_dx = coeffs[0]; - for ( int k = 2 ; k <= kernel_degree ; k++ ) { - dw_dx = dw_dx*x + w; - w = x*w + coeffs[k]; - } - *W = w; - *dW_dx = dw_dx; - } - +__attribute__((always_inline)) INLINE static void kernel_deval(float x, + float *W, + float *dW_dx) { + int ind = fminf(x, kernel_ivals); + float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)]; + float w = coeffs[0] * x + coeffs[1]; + float dw_dx = coeffs[0]; + for (int k = 2; k <= kernel_degree; k++) { + dw_dx = dw_dx * x + w; + w = x * w + coeffs[k]; + } + *W = w; + *dW_dx = dw_dx; +} #ifdef VECTORIZE /** - * @brief Computes the cubic spline kernel and its derivative for a given distance x (Vectorized version). Gives a sensible answer only if x<2. + * @brief Computes the cubic spline kernel and its derivative for a given + * distance x (Vectorized version). Gives a sensible answer only if x<2. */ -__attribute__ ((always_inline)) INLINE static void kernel_deval_vec ( vector *x , vector *w , vector *dw_dx ) { - - vector ind, c[kernel_degree+1]; - int j, k; - - /* Load x and get the interval id. */ - ind.m = vec_ftoi( vec_fmin( x->v , vec_set1( (float)kernel_ivals ) ) ); - - /* load the coefficients. */ - for ( k = 0 ; k < VEC_SIZE ; k++ ) - for ( j = 0 ; j < kernel_degree+1 ; j++ ) - c[j].f[k] = kernel_coeffs[ ind.i[k]*(kernel_degree + 1) + j ]; - - /* Init the iteration for Horner's scheme. */ - w->v = ( c[0].v * x->v ) + c[1].v; - dw_dx->v = c[0].v; - - /* And we're off! 
*/ - for ( int k = 2 ; k <= kernel_degree ; k++ ) { - dw_dx->v = ( dw_dx->v * x->v ) + w->v; - w->v = ( x->v * w->v ) + c[k].v; - } - - } - -#endif +__attribute__((always_inline)) + INLINE static void kernel_deval_vec(vector *x, vector *w, vector *dw_dx) { + vector ind, c[kernel_degree + 1]; + int j, k; -/** - * @brief Computes the cubic spline kernel for a given distance x. Gives a sensible answer only if x<2. - */ + /* Load x and get the interval id. */ + ind.m = vec_ftoi(vec_fmin(x->v, vec_set1((float)kernel_ivals))); -__attribute__ ((always_inline)) INLINE static void kernel_eval ( float x , float *W ) { - int ind = fmin( x , kernel_ivals ); - float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ]; - float w = coeffs[0]*x + coeffs[1]; - for ( int k = 2 ; k <= kernel_degree ; k++ ) - w = x*w + coeffs[k]; - *W = w; - } + /* load the coefficients. */ + for (k = 0; k < VEC_SIZE; k++) + for (j = 0; j < kernel_degree + 1; j++) + c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j]; + /* Init the iteration for Horner's scheme. */ + w->v = (c[0].v * x->v) + c[1].v; + dw_dx->v = c[0].v; + /* And we're off! */ + for (int k = 2; k <= kernel_degree; k++) { + dw_dx->v = (dw_dx->v * x->v) + w->v; + w->v = (x->v * w->v) + c[k].v; + } +} -/* -------------------------------------------------------------------------------------------------------------------- */ +#endif + +/** + * @brief Computes the cubic spline kernel for a given distance x. Gives a + * sensible answer only if x<2. 
+ */ + +__attribute__((always_inline)) INLINE static void kernel_eval(float x, + float *W) { + int ind = fmin(x, kernel_ivals); + float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)]; + float w = coeffs[0] * x + coeffs[1]; + for (int k = 2; k <= kernel_degree; k++) w = x * w + coeffs[k]; + *W = w; +} + +/* -------------------------------------------------------------------------------------------------------------------- + */ #elif defined(QUARTIC_SPLINE_KERNEL) -/* -------------------------------------------------------------------------------------------------------------------- */ +/* -------------------------------------------------------------------------------------------------------------------- + */ -/* Coefficients for the kernel. */ +/* Coefficients for the kernel. */ #define kernel_name "Quartic spline" #define kernel_degree 4 #define kernel_ivals 3 @@ -303,186 +321,198 @@ __attribute__ ((always_inline)) INLINE static void kernel_eval ( float x , float #define kernel_gamma2 6.25f #define kernel_gamma3 15.625f #define kernel_igamma 0.4f -#define kernel_nwneigh ( 4.0/3.0*M_PI*const_eta_kernel*const_eta_kernel*const_eta_kernel*8.2293f ) -static float kernel_coeffs[ (kernel_degree + 1) * (kernel_ivals + 1) ] __attribute__ ((aligned (16))) = - { 3.0/10.0*M_1_PI , 0.0 , -3.0/4.0*M_1_PI , 0.0 , 23.0/32.0*M_1_PI , - -1.0/5.0*M_1_PI , M_1_PI , -3.0/2.0*M_1_PI , 0.25*M_1_PI , 11.0/16.0*M_1_PI , - 1.0/20.0*M_1_PI , -0.5*M_1_PI , 15.0/8.0*M_1_PI , -25.0/8.0*M_1_PI , 125.0/64.0*M_1_PI , - 0.0 , 0.0 , 0.0 , 0.0 , 0.0 }; -#define kernel_root ( kernel_coeffs[ kernel_degree ] ) -#define kernel_wroot ( 4.0/3.0*M_PI*kernel_coeffs[ kernel_degree ] ) - - +#define kernel_nwneigh \ + (4.0 / 3.0 * M_PI *const_eta_kernel *const_eta_kernel *const_eta_kernel * \ + 8.2293f) +static float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)] + __attribute__((aligned(16))) = { + 3.0 / 10.0 * M_1_PI, 0.0, -3.0 / 4.0 * M_1_PI, + 0.0, 23.0 / 32.0 * M_1_PI, -1.0 / 5.0 * M_1_PI, + 
M_1_PI, -3.0 / 2.0 * M_1_PI, 0.25 * M_1_PI, + 11.0 / 16.0 * M_1_PI, 1.0 / 20.0 * M_1_PI, -0.5 * M_1_PI, + 15.0 / 8.0 * M_1_PI, -25.0 / 8.0 * M_1_PI, 125.0 / 64.0 * M_1_PI, + 0.0, 0.0, 0.0, + 0.0, 0.0}; +#define kernel_root (kernel_coeffs[kernel_degree]) +#define kernel_wroot (4.0 / 3.0 * M_PI *kernel_coeffs[kernel_degree]) + /** - * @brief Computes the quartic spline kernel and its derivative for a given distance x. Gives a sensible answer only if x<2.5 + * @brief Computes the quartic spline kernel and its derivative for a given + * distance x. Gives a sensible answer only if x<2.5 */ -__attribute__ ((always_inline)) INLINE static void kernel_deval ( float x , float *W , float *dW_dx ) { - int ind = fminf( x + 0.5, kernel_ivals); - float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ]; - float w = coeffs[0]*x + coeffs[1]; - float dw_dx = coeffs[0]; - for ( int k = 2 ; k <= kernel_degree ; k++ ) { - dw_dx = dw_dx*x + w; - w = x*w + coeffs[k]; - } - *W = w; - *dW_dx = dw_dx; - } - +__attribute__((always_inline)) INLINE static void kernel_deval(float x, + float *W, + float *dW_dx) { + int ind = fminf(x + 0.5, kernel_ivals); + float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)]; + float w = coeffs[0] * x + coeffs[1]; + float dw_dx = coeffs[0]; + for (int k = 2; k <= kernel_degree; k++) { + dw_dx = dw_dx * x + w; + w = x * w + coeffs[k]; + } + *W = w; + *dW_dx = dw_dx; +} #ifdef VECTORIZE /** - * @brief Computes the quartic spline kernel and its derivative for a given distance x (Vectorized version). Gives a sensible answer only if x<2.5 + * @brief Computes the quartic spline kernel and its derivative for a given + * distance x (Vectorized version). Gives a sensible answer only if x<2.5 */ -__attribute__ ((always_inline)) INLINE static void kernel_deval_vec ( vector *x , vector *w , vector *dw_dx ) { - - vector ind, c[kernel_degree+1]; - int j, k; - - /* Load x and get the interval id. 
*/ - ind.m = vec_ftoi( vec_fmin( x->v + 0.5f, vec_set1( (float)kernel_ivals ) ) ); - - /* load the coefficients. */ - for ( k = 0 ; k < VEC_SIZE ; k++ ) - for ( j = 0 ; j < kernel_degree+1 ; j++ ) - c[j].f[k] = kernel_coeffs[ ind.i[k]*(kernel_degree + 1) + j ]; - - /* Init the iteration for Horner's scheme. */ - w->v = ( c[0].v * x->v ) + c[1].v; - dw_dx->v = c[0].v; - - /* And we're off! */ - for ( int k = 2 ; k <= kernel_degree ; k++ ) { - dw_dx->v = ( dw_dx->v * x->v ) + w->v; - w->v = ( x->v * w->v ) + c[k].v; - } - - } - -#endif +__attribute__((always_inline)) + INLINE static void kernel_deval_vec(vector *x, vector *w, vector *dw_dx) { -/** - * @brief Computes the quartic spline kernel for a given distance x. Gives a sensible answer only if x<2.5 - */ + vector ind, c[kernel_degree + 1]; + int j, k; -__attribute__ ((always_inline)) INLINE static void kernel_eval ( float x , float *W ) { - int ind = fmin( x + 0.5f, kernel_ivals ); - float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ]; - float w = coeffs[0]*x + coeffs[1]; - for ( int k = 2 ; k <= kernel_degree ; k++ ) - w = x*w + coeffs[k]; - *W = w; - } + /* Load x and get the interval id. */ + ind.m = vec_ftoi(vec_fmin(x->v + 0.5f, vec_set1((float)kernel_ivals))); + /* load the coefficients. */ + for (k = 0; k < VEC_SIZE; k++) + for (j = 0; j < kernel_degree + 1; j++) + c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j]; + /* Init the iteration for Horner's scheme. */ + w->v = (c[0].v * x->v) + c[1].v; + dw_dx->v = c[0].v; + /* And we're off! */ + for (int k = 2; k <= kernel_degree; k++) { + dw_dx->v = (dw_dx->v * x->v) + w->v; + w->v = (x->v * w->v) + c[k].v; + } +} +#endif +/** + * @brief Computes the quartic spline kernel for a given distance x. 
Gives a + * sensible answer only if x<2.5 + */ +__attribute__((always_inline)) INLINE static void kernel_eval(float x, + float *W) { + int ind = fmin(x + 0.5f, kernel_ivals); + float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)]; + float w = coeffs[0] * x + coeffs[1]; + for (int k = 2; k <= kernel_degree; k++) w = x * w + coeffs[k]; + *W = w; +} -/* -------------------------------------------------------------------------------------------------------------------- */ +/* -------------------------------------------------------------------------------------------------------------------- + */ #elif defined(QUINTIC_SPLINE_KERNEL) -/* -------------------------------------------------------------------------------------------------------------------- */ +/* -------------------------------------------------------------------------------------------------------------------- + */ -/* Coefficients for the kernel. */ +/* Coefficients for the kernel. */ #define kernel_name "Quintic spline" #define kernel_degree 5 #define kernel_ivals 3 #define kernel_gamma 3.f #define kernel_gamma2 9.f #define kernel_gamma3 27.f -#define kernel_igamma 1.0f/3.0f -#define kernel_nwneigh ( 4.0/3.0*M_PI*const_eta_kernel*const_eta_kernel*const_eta_kernel*10.5868f ) -static float kernel_coeffs[ (kernel_degree + 1) * (kernel_ivals + 1) ] __attribute__ ((aligned (16))) = -{ -1.0/12.0*M_1_PI , 1.0/4.0*M_1_PI , 0.0 , -1.0/2.0*M_1_PI , 0.0 , 11.0/20.0*M_1_PI, - 1.0/24.0*M_1_PI , -3.0/8.0*M_1_PI , 5.0/4.0*M_1_PI , -7.0/4.0*M_1_PI , 5.0/8.0*M_1_PI , 17.0/40.0*M_1_PI , - -1.0/120.0*M_1_PI , 1.0/8.0*M_1_PI , -3.0/4.0*M_1_PI , 9.0/4.0*M_1_PI , -27.0/8.0*M_1_PI , 81.0/40.0*M_1_PI, - 0.0 , 0.0 , 0.0 , 0.0 , 0.0 , 0.0}; -#define kernel_root ( kernel_coeffs[ kernel_degree ] ) -#define kernel_wroot ( 4.0/3.0*M_PI*kernel_coeffs[ kernel_degree ] ) - - +#define kernel_igamma 1.0f / 3.0f +#define kernel_nwneigh \ + (4.0 / 3.0 * M_PI *const_eta_kernel *const_eta_kernel *const_eta_kernel * \ + 10.5868f) +static 
float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)] + __attribute__((aligned(16))) = { + -1.0 / 12.0 * M_1_PI, 1.0 / 4.0 * M_1_PI, 0.0, + -1.0 / 2.0 * M_1_PI, 0.0, 11.0 / 20.0 * M_1_PI, + 1.0 / 24.0 * M_1_PI, -3.0 / 8.0 * M_1_PI, 5.0 / 4.0 * M_1_PI, + -7.0 / 4.0 * M_1_PI, 5.0 / 8.0 * M_1_PI, 17.0 / 40.0 * M_1_PI, + -1.0 / 120.0 * M_1_PI, 1.0 / 8.0 * M_1_PI, -3.0 / 4.0 * M_1_PI, + 9.0 / 4.0 * M_1_PI, -27.0 / 8.0 * M_1_PI, 81.0 / 40.0 * M_1_PI, + 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0}; +#define kernel_root (kernel_coeffs[kernel_degree]) +#define kernel_wroot (4.0 / 3.0 * M_PI *kernel_coeffs[kernel_degree]) + /** - * @brief Computes the quintic spline kernel and its derivative for a given distance x. Gives a sensible answer only if x<3. + * @brief Computes the quintic spline kernel and its derivative for a given + * distance x. Gives a sensible answer only if x<3. */ -__attribute__ ((always_inline)) INLINE static void kernel_deval ( float x , float *W , float *dW_dx ) { - int ind = fminf( x, kernel_ivals); - float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ]; - float w = coeffs[0]*x + coeffs[1]; - float dw_dx = coeffs[0]; - for ( int k = 2 ; k <= kernel_degree ; k++ ) { - dw_dx = dw_dx*x + w; - w = x*w + coeffs[k]; - } - *W = w; - *dW_dx = dw_dx; - } - +__attribute__((always_inline)) INLINE static void kernel_deval(float x, + float *W, + float *dW_dx) { + int ind = fminf(x, kernel_ivals); + float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)]; + float w = coeffs[0] * x + coeffs[1]; + float dw_dx = coeffs[0]; + for (int k = 2; k <= kernel_degree; k++) { + dw_dx = dw_dx * x + w; + w = x * w + coeffs[k]; + } + *W = w; + *dW_dx = dw_dx; +} #ifdef VECTORIZE /** - * @brief Computes the quintic spline kernel and its derivative for a given distance x (Vectorized version). Gives a sensible answer only if x<3. + * @brief Computes the quintic spline kernel and its derivative for a given + * distance x (Vectorized version). Gives a sensible answer only if x<3. 
*/ -__attribute__ ((always_inline)) INLINE static void kernel_deval_vec ( vector *x , vector *w , vector *dw_dx ) { - - vector ind, c[kernel_degree+1]; - int j, k; - - /* Load x and get the interval id. */ - ind.m = vec_ftoi( vec_fmin( x->v, vec_set1( (float)kernel_ivals ) ) ); - - /* load the coefficients. */ - for ( k = 0 ; k < VEC_SIZE ; k++ ) - for ( j = 0 ; j < kernel_degree+1 ; j++ ) - c[j].f[k] = kernel_coeffs[ ind.i[k]*(kernel_degree + 1) + j ]; - - /* Init the iteration for Horner's scheme. */ - w->v = ( c[0].v * x->v ) + c[1].v; - dw_dx->v = c[0].v; - - /* And we're off! */ - for ( int k = 2 ; k <= kernel_degree ; k++ ) { - dw_dx->v = ( dw_dx->v * x->v ) + w->v; - w->v = ( x->v * w->v ) + c[k].v; - } - - } - -#endif +__attribute__((always_inline)) + INLINE static void kernel_deval_vec(vector *x, vector *w, vector *dw_dx) { -/** - * @brief Computes the quintic spline kernel for a given distance x. Gives a sensible answer only if x<3. - */ + vector ind, c[kernel_degree + 1]; + int j, k; -__attribute__ ((always_inline)) INLINE static void kernel_eval ( float x , float *W ) { - int ind = fmin( x, kernel_ivals ); - float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ]; - float w = coeffs[0]*x + coeffs[1]; - for ( int k = 2 ; k <= kernel_degree ; k++ ) - w = x*w + coeffs[k]; - *W = w; - } + /* Load x and get the interval id. */ + ind.m = vec_ftoi(vec_fmin(x->v, vec_set1((float)kernel_ivals))); + /* load the coefficients. */ + for (k = 0; k < VEC_SIZE; k++) + for (j = 0; j < kernel_degree + 1; j++) + c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j]; + /* Init the iteration for Horner's scheme. */ + w->v = (c[0].v * x->v) + c[1].v; + dw_dx->v = c[0].v; + /* And we're off! */ + for (int k = 2; k <= kernel_degree; k++) { + dw_dx->v = (dw_dx->v * x->v) + w->v; + w->v = (x->v * w->v) + c[k].v; + } +} + +#endif +/** + * @brief Computes the quintic spline kernel for a given distance x. Gives a + * sensible answer only if x<3. 
+ */ +__attribute__((always_inline)) INLINE static void kernel_eval(float x, + float *W) { + int ind = fmin(x, kernel_ivals); + float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)]; + float w = coeffs[0] * x + coeffs[1]; + for (int k = 2; k <= kernel_degree; k++) w = x * w + coeffs[k]; + *W = w; +} -/* -------------------------------------------------------------------------------------------------------------------- */ +/* -------------------------------------------------------------------------------------------------------------------- + */ #elif defined(WENDLAND_C2_KERNEL) -/* -------------------------------------------------------------------------------------------------------------------- */ +/* -------------------------------------------------------------------------------------------------------------------- + */ -/* Coefficients for the kernel. */ +/* Coefficients for the kernel. */ #define kernel_name "Wendland C2" #define kernel_degree 5 #define kernel_ivals 1 @@ -490,92 +520,93 @@ __attribute__ ((always_inline)) INLINE static void kernel_eval ( float x , float #define kernel_gamma2 1.f #define kernel_gamma3 1.f #define kernel_igamma 1.f -#define kernel_nwneigh ( 4.0/3.0*M_PI*const_eta_kernel*const_eta_kernel*const_eta_kernel*7.261825f ) -static float kernel_coeffs[ (kernel_degree + 1) * (kernel_ivals + 1) ] __attribute__ ((aligned (16))) = -{ 4.0f , -15.0f , 20.0f , -10.0f , 0.0f , 1.0f, - 0.0f , 0.0f , 0.0f , 0.0f , 0.0f , 0.0f}; -#define kernel_root ( kernel_coeffs[ kernel_degree ] ) -#define kernel_wroot ( 4.0/3.0*M_PI*kernel_coeffs[ kernel_degree ] ) - - +#define kernel_nwneigh \ + (4.0 / 3.0 * M_PI *const_eta_kernel *const_eta_kernel *const_eta_kernel * \ + 7.261825f) +static float kernel_coeffs[(kernel_degree + 1) * (kernel_ivals + 1)] + __attribute__((aligned(16))) = {4.0f, -15.0f, 20.0f, -10.0f, 0.0f, 1.0f, + 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; +#define kernel_root (kernel_coeffs[kernel_degree]) +#define kernel_wroot (4.0 / 3.0 * M_PI 
*kernel_coeffs[kernel_degree]) + /** - * @brief Computes the quintic spline kernel and its derivative for a given distance x. Gives a sensible answer only if x<1. + * @brief Computes the quintic spline kernel and its derivative for a given + * distance x. Gives a sensible answer only if x<1. */ -__attribute__ ((always_inline)) INLINE static void kernel_deval ( float x , float *W , float *dW_dx ) { - int ind = fminf( x, kernel_ivals); - float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ]; - float w = coeffs[0]*x + coeffs[1]; - float dw_dx = coeffs[0]; - for ( int k = 2 ; k <= kernel_degree ; k++ ) { - dw_dx = dw_dx*x + w; - w = x*w + coeffs[k]; - } - *W = w; - *dW_dx = dw_dx; - } - +__attribute__((always_inline)) INLINE static void kernel_deval(float x, + float *W, + float *dW_dx) { + int ind = fminf(x, kernel_ivals); + float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)]; + float w = coeffs[0] * x + coeffs[1]; + float dw_dx = coeffs[0]; + for (int k = 2; k <= kernel_degree; k++) { + dw_dx = dw_dx * x + w; + w = x * w + coeffs[k]; + } + *W = w; + *dW_dx = dw_dx; +} #ifdef VECTORIZE /** - * @brief Computes the Wendland C2 kernel and its derivative for a given distance x (Vectorized version). Gives a sensible answer only if x<1. + * @brief Computes the Wendland C2 kernel and its derivative for a given + * distance x (Vectorized version). Gives a sensible answer only if x<1. */ -__attribute__ ((always_inline)) INLINE static void kernel_deval_vec ( vector *x , vector *w , vector *dw_dx ) { - - vector ind, c[kernel_degree+1]; - int j, k; - - /* Load x and get the interval id. */ - ind.m = vec_ftoi( vec_fmin( x->v, vec_set1( (float)kernel_ivals ) ) ); - - /* load the coefficients. */ - for ( k = 0 ; k < VEC_SIZE ; k++ ) - for ( j = 0 ; j < kernel_degree+1 ; j++ ) - c[j].f[k] = kernel_coeffs[ ind.i[k]*(kernel_degree + 1) + j ]; - - /* Init the iteration for Horner's scheme. */ - w->v = ( c[0].v * x->v ) + c[1].v; - dw_dx->v = c[0].v; - - /* And we're off! 
*/ - for ( int k = 2 ; k <= kernel_degree ; k++ ) { - dw_dx->v = ( dw_dx->v * x->v ) + w->v; - w->v = ( x->v * w->v ) + c[k].v; - } - - } - -#endif +__attribute__((always_inline)) + INLINE static void kernel_deval_vec(vector *x, vector *w, vector *dw_dx) { -/** - * @brief Computes the Wendland C2 kernel for a given distance x. Gives a sensible answer only if x<1. - */ + vector ind, c[kernel_degree + 1]; + int j, k; -__attribute__ ((always_inline)) INLINE static void kernel_eval ( float x , float *W ) { - int ind = fmin( x, kernel_ivals ); - float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ]; - float w = coeffs[0]*x + coeffs[1]; - for ( int k = 2 ; k <= kernel_degree ; k++ ) - w = x*w + coeffs[k]; - *W = w; - } + /* Load x and get the interval id. */ + ind.m = vec_ftoi(vec_fmin(x->v, vec_set1((float)kernel_ivals))); + /* load the coefficients. */ + for (k = 0; k < VEC_SIZE; k++) + for (j = 0; j < kernel_degree + 1; j++) + c[j].f[k] = kernel_coeffs[ind.i[k] * (kernel_degree + 1) + j]; + /* Init the iteration for Horner's scheme. */ + w->v = (c[0].v * x->v) + c[1].v; + dw_dx->v = c[0].v; + /* And we're off! */ + for (int k = 2; k <= kernel_degree; k++) { + dw_dx->v = (dw_dx->v * x->v) + w->v; + w->v = (x->v * w->v) + c[k].v; + } +} +#endif +/** + * @brief Computes the Wendland C2 kernel for a given distance x. Gives a + * sensible answer only if x<1. 
+ */ +__attribute__((always_inline)) INLINE static void kernel_eval(float x, + float *W) { + int ind = fmin(x, kernel_ivals); + float *coeffs = &kernel_coeffs[ind * (kernel_degree + 1)]; + float w = coeffs[0] * x + coeffs[1]; + for (int k = 2; k <= kernel_degree; k++) w = x * w + coeffs[k]; + *W = w; +} -/* -------------------------------------------------------------------------------------------------------------------- */ +/* -------------------------------------------------------------------------------------------------------------------- + */ #else -/* -------------------------------------------------------------------------------------------------------------------- */ +/* -------------------------------------------------------------------------------------------------------------------- + */ #error "A kernel function must be chosen in const.h !!" -#endif // Kernel choice +#endif // Kernel choice -#endif //KERNEL_H +#endif // SWIFT_KERNEL_H diff --git a/src/lock.h b/src/lock.h index 3e3affb1b08e320770687217b4a572631f7413c1..19a4e74bf82d3b6bb8e305388ca42929cc9d719e 100644 --- a/src/lock.h +++ b/src/lock.h @@ -1,54 +1,61 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- * + * ******************************************************************************/ +#ifndef SWIFT_LOCK_H +#define SWIFT_LOCK_H +/* Some standard headers. */ +#include <pthread.h> - +/* Includes. */ #include "inline.h" - + #ifdef PTHREAD_SPINLOCK - #include <pthread.h> - #define lock_type pthread_spinlock_t - #define lock_init( l ) ( pthread_spin_init( l , PTHREAD_PROCESS_PRIVATE ) != 0 ) - #define lock_destroy( l ) ( pthread_spin_destroy( l ) != 0 ) - #define lock_lock( l ) ( pthread_spin_lock( l ) != 0 ) - #define lock_trylock( l ) ( pthread_spin_lock( l ) != 0 ) - #define lock_unlock( l ) ( pthread_spin_unlock( l ) != 0 ) - #define lock_unlock_blind( l ) pthread_spin_unlock( l ) +#include <pthread.h> +#define lock_type pthread_spinlock_t +#define lock_init(l) (pthread_spin_init(l, PTHREAD_PROCESS_PRIVATE) != 0) +#define lock_destroy(l) (pthread_spin_destroy(l) != 0) +#define lock_lock(l) (pthread_spin_lock(l) != 0) +#define lock_trylock(l) (pthread_spin_lock(l) != 0) +#define lock_unlock(l) (pthread_spin_unlock(l) != 0) +#define lock_unlock_blind(l) pthread_spin_unlock(l) #elif defined(PTHREAD_LOCK) - #include <pthread.h> - #define lock_type pthread_mutex_t - #define lock_init( l ) ( pthread_mutex_init( l , NULL ) != 0 ) - #define lock_destroy( l ) ( pthread_mutex_destroy( l ) != 0 ) - #define lock_lock( l ) ( pthread_mutex_lock( l ) != 0 ) - #define lock_trylock( l ) ( pthread_mutex_trylock( l ) != 0 ) - #define lock_unlock( l ) ( pthread_mutex_unlock( l ) != 0 ) - #define lock_unlock_blind( l ) pthread_mutex_unlock( l ) +#include <pthread.h> +#define lock_type pthread_mutex_t +#define lock_init(l) (pthread_mutex_init(l, NULL) != 0) +#define lock_destroy(l) (pthread_mutex_destroy(l) != 0) +#define lock_lock(l) (pthread_mutex_lock(l) != 0) +#define lock_trylock(l) (pthread_mutex_trylock(l) != 0) +#define lock_unlock(l) (pthread_mutex_unlock(l) != 0) +#define lock_unlock_blind(l) pthread_mutex_unlock(l) #else - #define lock_type volatile int - #define 
lock_init( l ) ( *(l) = 0 ) - #define lock_destroy( l ) 0 - INLINE static int lock_lock ( volatile int *l ) { - while ( __sync_val_compare_and_swap( l , 0 , 1 ) != 0 ); - // while( *l ); - return 0; - } - #define lock_trylock( l ) ( ( *(l) ) ? 1 : __sync_val_compare_and_swap( l , 0 , 1 ) ) - #define lock_unlock( l ) ( __sync_val_compare_and_swap( l , 1 , 0 ) != 1 ) - #define lock_unlock_blind( l ) __sync_val_compare_and_swap( l , 1 , 0 ) +#define lock_type volatile int +#define lock_init(l) (*(l) = 0) +#define lock_destroy(l) 0 +INLINE static int lock_lock(volatile int *l) { + while (__sync_val_compare_and_swap(l, 0, 1) != 0) + ; + // while( *l ); + return 0; +} +#define lock_trylock(l) ((*(l)) ? 1 : __sync_val_compare_and_swap(l, 0, 1)) +#define lock_unlock(l) (__sync_val_compare_and_swap(l, 1, 0) != 1) +#define lock_unlock_blind(l) __sync_val_compare_and_swap(l, 1, 0) #endif + +#endif /* SWIFT_LOCK_H */ diff --git a/src/multipole.c b/src/multipole.c index 38337230404338eae3faa6bd14a5d3f3a313e971..439e9cd5f0218bddf28d228de6eb3bb14a2d6735 100644 --- a/src/multipole.c +++ b/src/multipole.c @@ -1,49 +1,40 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- * + * ******************************************************************************/ /* Config parameters. */ #include "../config.h" /* Some standard headers. */ +#include <float.h> +#include <limits.h> +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include <float.h> -#include <limits.h> /* MPI headers. */ #ifdef WITH_MPI - #include <mpi.h> +#include <mpi.h> #endif -/* Local headers. */ -#include "error.h" -#include "const.h" -#include "cycle.h" -#include "atomic.h" -#include "lock.h" -#include "space.h" -#include "part.h" +/* This object's header. */ #include "multipole.h" -#include "cell.h" - /** * @brief Merge two multipoles. @@ -51,26 +42,23 @@ * @param ma The #multipole which will contain the merged result. * @param mb The other #multipole. */ - -void multipole_merge ( struct multipole *ma , struct multipole *mb ) { - - #if multipole_order == 1 - - /* Correct the position. */ - float mma = ma->coeffs[0], mmb = mb->coeffs[0]; - float w = 1.0f / ( mma + mmb ); - for ( int k = 0 ; k < 3 ; k++ ) - ma->x[k] = ( ma->x[k]*mma + mb->x[k]*mmb ) * w; - - /* Add the particle to the moments. */ - ma->coeffs[0] = mma + mmb; - - #else - #error( "Multipoles of order %i not yet implemented." , multipole_order ) - #endif - - } +void multipole_merge(struct multipole *ma, struct multipole *mb) { + +#if multipole_order == 1 + + /* Correct the position. */ + float mma = ma->coeffs[0], mmb = mb->coeffs[0]; + float w = 1.0f / (mma + mmb); + for (int k = 0; k < 3; k++) ma->x[k] = (ma->x[k] * mma + mb->x[k] * mmb) * w; + + /* Add the particle to the moments. */ + ma->coeffs[0] = mma + mmb; + +#else +#error( "Multipoles of order %i not yet implemented." , multipole_order ) +#endif +} /** * @brief Add a particle to the given multipole. @@ -78,26 +66,23 @@ void multipole_merge ( struct multipole *ma , struct multipole *mb ) { * @param m The #multipole. * @param p The #gpart. 
*/ - -void multipole_addpart ( struct multipole *m , struct gpart *p ) { - - #if multipole_order == 1 - - /* Correct the position. */ - float mm = m->coeffs[0], mp = p->mass; - float w = 1.0f / ( mm + mp ); - for ( int k = 0 ; k < 3 ; k++ ) - m->x[k] = ( m->x[k]*mm + p->x[k]*mp ) * w; - - /* Add the particle to the moments. */ - m->coeffs[0] = mm + mp; - - #else - #error( "Multipoles of order %i not yet implemented." , multipole_order ) - #endif - - } +void multipole_addpart(struct multipole *m, struct gpart *p) { + +#if multipole_order == 1 + + /* Correct the position. */ + float mm = m->coeffs[0], mp = p->mass; + float w = 1.0f / (mm + mp); + for (int k = 0; k < 3; k++) m->x[k] = (m->x[k] * mm + p->x[k] * mp) * w; + + /* Add the particle to the moments. */ + m->coeffs[0] = mm + mp; + +#else +#error( "Multipoles of order %i not yet implemented." , multipole_order ) +#endif +} /** * @brief Add a group of particles to the given multipole. @@ -106,37 +91,34 @@ void multipole_addpart ( struct multipole *m , struct gpart *p ) { * @param p The #gpart array. * @param N Number of parts to add. */ - -void multipole_addparts ( struct multipole *m , struct gpart *p , int N ) { - - #if multipole_order == 1 - - /* Get the combined mass and positions. */ - double xp[3] = { 0.0 , 0.0 , 0.0 }; - float mp = 0.0f, w; - for ( int k = 0 ; k < N ; k++ ) { - w = p[k].mass; - mp += w; - xp[0] += p[k].x[0] * w; - xp[1] += p[k].x[1] * w; - xp[2] += p[k].x[2] * w; - } - - /* Correct the position. */ - float mm = m->coeffs[0]; - w = 1.0f / ( mm + mp ); - for ( int k = 0 ; k < 3 ; k++ ) - m->x[k] = ( m->x[k]*mm + xp[k] ) * w; - - /* Add the particle to the moments. */ - m->coeffs[0] = mm + mp; - - #else - #error( "Multipoles of order %i not yet implemented." , multipole_order ) - #endif - - } +void multipole_addparts(struct multipole *m, struct gpart *p, int N) { + +#if multipole_order == 1 + + /* Get the combined mass and positions. 
*/ + double xp[3] = {0.0, 0.0, 0.0}; + float mp = 0.0f, w; + for (int k = 0; k < N; k++) { + w = p[k].mass; + mp += w; + xp[0] += p[k].x[0] * w; + xp[1] += p[k].x[1] * w; + xp[2] += p[k].x[2] * w; + } + + /* Correct the position. */ + float mm = m->coeffs[0]; + w = 1.0f / (mm + mp); + for (int k = 0; k < 3; k++) m->x[k] = (m->x[k] * mm + xp[k]) * w; + + /* Add the particle to the moments. */ + m->coeffs[0] = mm + mp; + +#else +#error( "Multipoles of order %i not yet implemented." , multipole_order ) +#endif +} /** * @brief Init a multipole from a set of particles. @@ -145,46 +127,43 @@ void multipole_addparts ( struct multipole *m , struct gpart *p , int N ) { * @param parts The #gpart. * @param N The number of particles. */ - -void multipole_init ( struct multipole *m , struct gpart *parts , int N ) { - - #if multipole_order == 1 - - float mass = 0.0f, w; - double x[3] = { 0.0 , 0.0 , 0.0 }; - int k; - - /* Collect the particle data. */ - for ( k = 0 ; k < N ; k++ ) { - w = parts[k].mass; - mass += w; - x[0] += parts[k].x[0] * w; - x[1] += parts[k].x[1] * w; - x[2] += parts[k].x[2] * w; - } - - /* Store the data on the multipole. */ - m->coeffs[0] = mass; - m->x[0] = x[0] / mass; - m->x[1] = x[1] / mass; - m->x[2] = x[2] / mass; - - #else - #error( "Multipoles of order %i not yet implemented." , multipole_order ) - #endif - - } +void multipole_init(struct multipole *m, struct gpart *parts, int N) { + +#if multipole_order == 1 + + float mass = 0.0f, w; + double x[3] = {0.0, 0.0, 0.0}; + int k; + + /* Collect the particle data. */ + for (k = 0; k < N; k++) { + w = parts[k].mass; + mass += w; + x[0] += parts[k].x[0] * w; + x[1] += parts[k].x[1] * w; + x[2] += parts[k].x[2] * w; + } + + /* Store the data on the multipole. */ + m->coeffs[0] = mass; + m->x[0] = x[0] / mass; + m->x[1] = x[1] / mass; + m->x[2] = x[2] / mass; + +#else +#error( "Multipoles of order %i not yet implemented." , multipole_order ) +#endif +} /** * @brief Reset the data of a #multipole. 
* * @param m The #multipole. */ - -void multipole_reset ( struct multipole *m ) { - /* Just bzero the struct. */ - bzero( m , sizeof(struct multipole) ); - - } +void multipole_reset(struct multipole *m) { + + /* Just bzero the struct. */ + bzero(m, sizeof(struct multipole)); +} diff --git a/src/multipole.h b/src/multipole.h index e3372e245637dbaec6e078ac7294902f6a22a9fe..ffa5d713f507b85f6ae9216cfe81d4fc49316345 100644 --- a/src/multipole.h +++ b/src/multipole.h @@ -1,53 +1,58 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_MULTIPOLE_H +#define SWIFT_MULTIPOLE_H -/* Some constants. */ -#define multipole_order 1 +/* Some standard headers. */ +#include <math.h> + +/* Includes. */ +#include "const.h" +#include "inline.h" +#include "kernel.h" +#include "part.h" +/* Some constants. */ +#define multipole_order 1 /* Multipole struct. */ struct multipole { - /* Multipole location. */ - double x[3]; - - /* Acceleration on this multipole. */ - float a[3]; - - /* Multipole coefficients. */ - float coeffs[ multipole_order*multipole_order ]; - - }; - - -/* Multipole function prototypes. 
*/ -static void multipole_iact_mm ( struct multipole *ma , struct multipole *mb , double *shift ); -void multipole_merge ( struct multipole *ma , struct multipole *mb ); -void multipole_addpart ( struct multipole *m , struct gpart *p ); -void multipole_addparts ( struct multipole *m , struct gpart *p , int N ); -void multipole_init ( struct multipole *m , struct gpart *parts , int N ); -void multipole_reset ( struct multipole *m ); + /* Multipole location. */ + double x[3]; + /* Acceleration on this multipole. */ + float a[3]; + + /* Multipole coefficients. */ + float coeffs[multipole_order * multipole_order]; +}; + +/* Multipole function prototypes. */ +static void multipole_iact_mm(struct multipole *ma, struct multipole *mb, + double *shift); +void multipole_merge(struct multipole *ma, struct multipole *mb); +void multipole_addpart(struct multipole *m, struct gpart *p); +void multipole_addparts(struct multipole *m, struct gpart *p, int N); +void multipole_init(struct multipole *m, struct gpart *parts, int N); +void multipole_reset(struct multipole *m); -#include <math.h> -#include "kernel.h" - /** * @brief Compute the pairwise interaction between two multipoles. * @@ -55,41 +60,40 @@ void multipole_reset ( struct multipole *m ); * @param mb The second #multipole. * @param shift The periodicity correction. */ - -__attribute__ ((always_inline)) INLINE static void multipole_iact_mm ( struct multipole *ma , struct multipole *mb , double *shift ) { - - float dx[3], ir, r, r2 = 0.0f, acc; - int k; - - /* Compute the multipole distance. */ - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = ma->x[k] - mb->x[k] - shift[k]; - r2 += dx[k]*dx[k]; - } - - /* Compute the normalized distance vector. */ - ir = 1.0f / sqrtf( r2 ); - r = r2 * ir; - - /* Evaluate the gravity kernel. */ - kernel_grav_eval( r , &acc ); - - /* Scale the acceleration. */ - acc *= const_G * ir * ir * ir; - - /* Compute the forces on both multipoles. 
*/ - #if multipole_order == 1 - float mma = ma->coeffs[0], mmb = mb->coeffs[0]; - for ( k = 0 ; k < 3 ; k++ ) { - ma->a[k] -= dx[k] * acc * mmb; - mb->a[k] += dx[k] * acc * mma; - } - #else - #error( "Multipoles of order %i not yet implemented." , multipole_order ) - #endif - - } +__attribute__((always_inline)) INLINE static void multipole_iact_mm( + struct multipole *ma, struct multipole *mb, double *shift) { + + float dx[3], ir, r, r2 = 0.0f, acc; + int k; + + /* Compute the multipole distance. */ + for (k = 0; k < 3; k++) { + dx[k] = ma->x[k] - mb->x[k] - shift[k]; + r2 += dx[k] * dx[k]; + } + + /* Compute the normalized distance vector. */ + ir = 1.0f / sqrtf(r2); + r = r2 * ir; + + /* Evaluate the gravity kernel. */ + kernel_grav_eval(r, &acc); + + /* Scale the acceleration. */ + acc *= const_G * ir * ir * ir; + +/* Compute the forces on both multipoles. */ +#if multipole_order == 1 + float mma = ma->coeffs[0], mmb = mb->coeffs[0]; + for (k = 0; k < 3; k++) { + ma->a[k] -= dx[k] * acc * mmb; + mb->a[k] += dx[k] * acc * mma; + } +#else +#error( "Multipoles of order %i not yet implemented." , multipole_order ) +#endif +} /** * @brief Compute the interaction of a multipole on a particle. @@ -98,36 +102,35 @@ __attribute__ ((always_inline)) INLINE static void multipole_iact_mm ( struct mu * @param p The #gpart. * @param shift The periodicity correction. */ - -__attribute__ ((always_inline)) INLINE static void multipole_iact_mp ( struct multipole *m , struct gpart *p , double *shift ) { - - float dx[3], ir, r, r2 = 0.0f, acc; - int k; - - /* Compute the multipole distance. */ - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = m->x[k] - p->x[k] - shift[k]; - r2 += dx[k]*dx[k]; - } - - /* Compute the normalized distance vector. */ - ir = 1.0f / sqrtf( r2 ); - r = r2 * ir; - - /* Evaluate the gravity kernel. */ - kernel_grav_eval( r , &acc ); - - /* Scale the acceleration. */ - acc *= const_G * ir * ir * ir * m->coeffs[0]; - - /* Compute the forces on both multipoles. 
*/ - #if multipole_order == 1 - for ( k = 0 ; k < 3 ; k++ ) - p->a[k] += dx[k] * acc; - #else - #error( "Multipoles of order %i not yet implemented." , multipole_order ) - #endif - - } +__attribute__((always_inline)) INLINE static void multipole_iact_mp( + struct multipole *m, struct gpart *p, double *shift) { + + float dx[3], ir, r, r2 = 0.0f, acc; + int k; + + /* Compute the multipole distance. */ + for (k = 0; k < 3; k++) { + dx[k] = m->x[k] - p->x[k] - shift[k]; + r2 += dx[k] * dx[k]; + } + + /* Compute the normalized distance vector. */ + ir = 1.0f / sqrtf(r2); + r = r2 * ir; + + /* Evaluate the gravity kernel. */ + kernel_grav_eval(r, &acc); + + /* Scale the acceleration. */ + acc *= const_G * ir * ir * ir * m->coeffs[0]; + +/* Compute the forces on both multipoles. */ +#if multipole_order == 1 + for (k = 0; k < 3; k++) p->a[k] += dx[k] * acc; +#else +#error( "Multipoles of order %i not yet implemented." , multipole_order ) +#endif +} +#endif /* SWIFT_MULTIPOLE_H */ diff --git a/src/parallel_io.c b/src/parallel_io.c index 7e2f5e244aa3b280872dc25e1fd14ce0348bbd3b..ccd35fe06ef4bed6bd183c6a28648d19c184b4f4 100644 --- a/src/parallel_io.c +++ b/src/parallel_io.c @@ -2,20 +2,20 @@ * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk), * Matthieu Schaller (matthieu.schaller@durham.ac.uk). - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. 
If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ /* Config parameters. */ @@ -26,27 +26,21 @@ /* Tell hdf5 that we intend to use shared-memory parallel stuff. */ #define H5_HAVE_PARALLEL - /* Some standard headers. */ -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <stddef.h> #include <hdf5.h> #include <math.h> #include <mpi.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> -#include "const.h" -#include "cycle.h" -#include "lock.h" -#include "task.h" -#include "part.h" -#include "space.h" -#include "scheduler.h" -#include "engine.h" -#include "error.h" -#include "kernel.h" +/* This object's header. */ +#include "parallel_io.h" + +/* Local includes. */ #include "common_io.h" +#include "error.h" /** * @brief Reads a data array from a given HDF5 group. @@ -56,83 +50,79 @@ * @param type The #DATA_TYPE of the attribute. * @param N The number of particles. * @param dim The dimension of the data (1 for scalar, 3 for vector) - * @param part_c A (char*) pointer on the first occurence of the field of interest in the parts array - * @param importance If COMPULSORY, the data must be present in the IC file. If OPTIONAL, the array will be zeroed when the data is not present. + * @param part_c A (char*) pointer on the first occurence of the field of + *interest in the parts array + * @param importance If COMPULSORY, the data must be present in the IC file. If + *OPTIONAL, the array will be zeroed when the data is not present. * - * @todo A better version using HDF5 hyperslabs to read the file directly into the part array + * @todo A better version using HDF5 hyperslabs to read the file directly into + *the part array * will be written once the strucutres have been stabilized. - * + * * Calls #error() if an error occurs. 
*/ -void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim, long long N_total, long long offset, char* part_c, enum DATA_IMPORTANCE importance) -{ - hid_t h_data=0, h_err=0, h_type=0, h_memspace=0, h_filespace=0, h_plist_id=0; +void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, + int dim, long long N_total, long long offset, + char* part_c, enum DATA_IMPORTANCE importance) { + hid_t h_data = 0, h_err = 0, h_type = 0, h_memspace = 0, h_filespace = 0, + h_plist_id = 0; hsize_t shape[2], offsets[2]; - htri_t exist=0; + htri_t exist = 0; void* temp; - int i=0, rank=0; + int i = 0, rank = 0; const size_t typeSize = sizeOfType(type); const size_t copySize = typeSize * dim; const size_t partSize = sizeof(struct part); char* temp_c = 0; - /* Check whether the dataspace exists or not */ exist = H5Lexists(grp, name, 0); - if(exist < 0) - { - error( "Error while checking the existence of data set '%s'." , name ); - } - else if(exist == 0) - { - if(importance == COMPULSORY) - { - error( "Compulsory data set '%s' not present in the file." , name ); - } - else - { - for(i=0; i<N; ++i) - memset(part_c+i*partSize, 0, copySize); - return; - } + if (exist < 0) { + error("Error while checking the existence of data set '%s'.", name); + } else if (exist == 0) { + if (importance == COMPULSORY) { + error("Compulsory data set '%s' not present in the file.", name); + } else { + for (i = 0; i < N; ++i) memset(part_c + i * partSize, 0, copySize); + return; } + } - /* message( "Reading %s '%s' array...", importance == COMPULSORY ? "compulsory": "optional ", name); */ + /* message( "Reading %s '%s' array...", importance == COMPULSORY ? + * "compulsory": "optional ", name); */ /* Open data space in file */ h_data = H5Dopen2(grp, name, H5P_DEFAULT); - if(h_data < 0) - error( "Error while opening data space '%s'." 
, name ); + if (h_data < 0) error("Error while opening data space '%s'.", name); /* Check data type */ h_type = H5Dget_type(h_data); - if(h_type < 0) - error("Unable to retrieve data type from the file"); - if(!H5Tequal(h_type, hdf5Type(type))) + if (h_type < 0) error("Unable to retrieve data type from the file"); + if (!H5Tequal(h_type, hdf5Type(type))) error("Non-matching types between the code and the file"); - + /* Allocate temporary buffer */ temp = malloc(N * dim * sizeOfType(type)); - if(temp == NULL) - error("Unable to allocate memory for temporary buffer"); + if (temp == NULL) error("Unable to allocate memory for temporary buffer"); /* Prepare information for hyperslab */ - if(dim > 1) - { - rank = 2; - shape[0] = N; shape[1] = dim; - offsets[0] = offset; offsets[1] = 0; - } - else - { - rank = 1; - shape[0] = N; shape[1] = 0; - offsets[0] = offset; offsets[1] = 0; - } + if (dim > 1) { + rank = 2; + shape[0] = N; + shape[1] = dim; + offsets[0] = offset; + offsets[1] = 0; + } else { + rank = 1; + shape[0] = N; + shape[1] = 0; + offsets[0] = offset; + offsets[1] = 0; + } /* Create data space in memory */ h_memspace = H5Screate_simple(rank, shape, NULL); - + /* Select hyperslab in file */ h_filespace = H5Dget_space(h_data); H5Sselect_hyperslab(h_filespace, H5S_SELECT_SET, offsets, NULL, shape, NULL); @@ -144,17 +134,17 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim /* Read HDF5 dataspace in temporary buffer */ /* Dirty version that happens to work for vectors but should be improved */ /* Using HDF5 dataspaces would be better */ - h_err = H5Dread(h_data, hdf5Type(type), h_memspace, h_filespace, h_plist_id, temp); - if(h_err < 0) - { - error( "Error while reading data array '%s'." 
, name ); - } + h_err = H5Dread(h_data, hdf5Type(type), h_memspace, h_filespace, h_plist_id, + temp); + if (h_err < 0) { + error("Error while reading data array '%s'.", name); + } /* Copy temporary buffer to particle data */ temp_c = temp; - for(i=0; i<N; ++i) - memcpy(part_c+i*partSize, &temp_c[i*copySize], copySize); - + for (i = 0; i < N; ++i) + memcpy(part_c + i * partSize, &temp_c[i * copySize], copySize); + /* Free and close everything */ free(temp); H5Pclose(h_plist_id); @@ -179,7 +169,10 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim * @param importance Is the data compulsory or not * */ -#define readArray(grp, name, type, N, dim, part, N_total, offset, field, importance) readArrayBackEnd(grp, name, type, N, dim, N_total, offset, (char*)(&(part[0]).field), importance) +#define readArray(grp, name, type, N, dim, part, N_total, offset, field, \ + importance) \ + readArrayBackEnd(grp, name, type, N, dim, N_total, offset, \ + (char*)(&(part[0]).field), importance) /** * @brief Reads an HDF5 initial condition file (GADGET-3 type) in parallel @@ -200,12 +193,15 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim * Calls #error() if an error occurs. * */ -void read_ic_parallel ( char* fileName, double dim[3], struct part **parts, int* N, int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm, MPI_Info info) -{ - hid_t h_file=0, h_grp=0; - double boxSize[3]={0.0,-1.0,-1.0}; /* GADGET has only cubic boxes (in cosmological mode) */ - int numParticles[6]={0}; /* GADGET has 6 particle types. 
We only keep the type 0*/ - int numParticles_highWord[6]={0}; +void read_ic_parallel(char* fileName, double dim[3], struct part** parts, + int* N, int* periodic, int mpi_rank, int mpi_size, + MPI_Comm comm, MPI_Info info) { + hid_t h_file = 0, h_grp = 0; + double boxSize[3] = { + 0.0, -1.0, -1.0}; /* GADGET has only cubic boxes (in cosmological mode) */ + int numParticles[6] = { + 0}; /* GADGET has 6 particle types. We only keep the type 0*/ + int numParticles_highWord[6] = {0}; long long offset = 0; long long N_total = 0; @@ -214,38 +210,36 @@ void read_ic_parallel ( char* fileName, double dim[3], struct part **parts, int hid_t h_plist_id = H5Pcreate(H5P_FILE_ACCESS); H5Pset_fapl_mpio(h_plist_id, comm, info); h_file = H5Fopen(fileName, H5F_ACC_RDONLY, h_plist_id); - if(h_file < 0) - { - error( "Error while opening file '%s'." , fileName ); - } + if (h_file < 0) { + error("Error while opening file '%s'.", fileName); + } /* Open header to read simulation properties */ /* message("Reading runtime parameters..."); */ h_grp = H5Gopen1(h_file, "/RuntimePars"); - if(h_grp < 0) - error("Error while opening runtime parameters\n"); + if (h_grp < 0) error("Error while opening runtime parameters\n"); /* Read the relevant information */ readAttribute(h_grp, "PeriodicBoundariesOn", INT, periodic); /* Close runtime parameters */ H5Gclose(h_grp); - + /* Open header to read simulation properties */ /* message("Reading file header..."); */ h_grp = H5Gopen1(h_file, "/Header"); - if(h_grp < 0) - error("Error while opening file header\n"); - + if (h_grp < 0) error("Error while opening file header\n"); + /* Read the relevant information and print status */ readAttribute(h_grp, "BoxSize", DOUBLE, boxSize); readAttribute(h_grp, "NumPart_Total", UINT, numParticles); readAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticles_highWord); - N_total = ((long long) numParticles[0]) + ((long long) numParticles_highWord[0] << 32); + N_total = ((long long)numParticles[0]) + + ((long 
long)numParticles_highWord[0] << 32); dim[0] = boxSize[0]; - dim[1] = ( boxSize[1] < 0 ) ? boxSize[0] : boxSize[1]; - dim[2] = ( boxSize[2] < 0 ) ? boxSize[0] : boxSize[2]; + dim[1] = (boxSize[1] < 0) ? boxSize[0] : boxSize[1]; + dim[2] = (boxSize[2] < 0) ? boxSize[0] : boxSize[2]; /* Divide the particles among the tasks. */ offset = mpi_rank * N_total / mpi_size; @@ -258,28 +252,37 @@ void read_ic_parallel ( char* fileName, double dim[3], struct part **parts, int H5Gclose(h_grp); /* Allocate memory to store particles */ - if(posix_memalign( (void*)parts , part_align , *N * sizeof(struct part)) != 0) + if (posix_memalign((void*)parts, part_align, *N * sizeof(struct part)) != 0) error("Error while allocating memory for particles"); - bzero( *parts , *N * sizeof(struct part) ); + bzero(*parts, *N * sizeof(struct part)); + + /* message("Allocated %8.2f MB for particles.", *N * sizeof(struct part) / + * (1024.*1024.)); */ - /* message("Allocated %8.2f MB for particles.", *N * sizeof(struct part) / (1024.*1024.)); */ - /* Open SPH particles group */ /* message("Reading particle arrays..."); */ h_grp = H5Gopen1(h_file, "/PartType0"); - if(h_grp < 0) - error( "Error while opening particle group.\n"); + if (h_grp < 0) error("Error while opening particle group.\n"); /* Read arrays */ - readArray(h_grp, "Coordinates", DOUBLE, *N, 3, *parts, N_total, offset, x, COMPULSORY); - readArray(h_grp, "Velocities", FLOAT, *N, 3, *parts, N_total, offset, v, COMPULSORY); - readArray(h_grp, "Masses", FLOAT, *N, 1, *parts, N_total, offset, mass, COMPULSORY); - readArray(h_grp, "SmoothingLength", FLOAT, *N, 1, *parts, N_total, offset, h, COMPULSORY); - readArray(h_grp, "InternalEnergy", FLOAT, *N, 1, *parts, N_total, offset, u, COMPULSORY); - readArray(h_grp, "ParticleIDs", ULONGLONG, *N, 1, *parts, N_total, offset, id, COMPULSORY); - readArray(h_grp, "TimeStep", FLOAT, *N, 1, *parts, N_total, offset, dt, OPTIONAL); - readArray(h_grp, "Acceleration", FLOAT, *N, 3, *parts, N_total, offset, 
a, OPTIONAL); - readArray(h_grp, "Density", FLOAT, *N, 1, *parts, N_total, offset, rho, OPTIONAL ); + readArray(h_grp, "Coordinates", DOUBLE, *N, 3, *parts, N_total, offset, x, + COMPULSORY); + readArray(h_grp, "Velocities", FLOAT, *N, 3, *parts, N_total, offset, v, + COMPULSORY); + readArray(h_grp, "Masses", FLOAT, *N, 1, *parts, N_total, offset, mass, + COMPULSORY); + readArray(h_grp, "SmoothingLength", FLOAT, *N, 1, *parts, N_total, offset, h, + COMPULSORY); + readArray(h_grp, "InternalEnergy", FLOAT, *N, 1, *parts, N_total, offset, u, + COMPULSORY); + readArray(h_grp, "ParticleIDs", ULONGLONG, *N, 1, *parts, N_total, offset, id, + COMPULSORY); + readArray(h_grp, "TimeStep", FLOAT, *N, 1, *parts, N_total, offset, dt, + OPTIONAL); + readArray(h_grp, "Acceleration", FLOAT, *N, 3, *parts, N_total, offset, a, + OPTIONAL); + readArray(h_grp, "Density", FLOAT, *N, 1, *parts, N_total, offset, rho, + OPTIONAL); /* Close particle group */ H5Gclose(h_grp); @@ -293,12 +296,10 @@ void read_ic_parallel ( char* fileName, double dim[3], struct part **parts, int /* message("Done Reading particles..."); */ } - /*----------------------------------------------------------------------------- * Routines writing an output file *-----------------------------------------------------------------------------*/ - /** * @brief Writes a data array in given HDF5 group. 
* @@ -311,89 +312,91 @@ void read_ic_parallel ( char* fileName, double dim[3], struct part **parts, int * @param dim The dimension of the data (1 for scalar, 3 for vector) * @param N_total Total number of particles across all cores * @param offset Offset in the array where this mpi task starts writing - * @param part_c A (char*) pointer on the first occurence of the field of interest in the parts array + * @param part_c A (char*) pointer on the first occurence of the field of + *interest in the parts array * @param us The UnitSystem currently in use * @param convFactor The UnitConversionFactor for this array * - * @todo A better version using HDF5 hyperslabs to write the file directly from the part array + * @todo A better version using HDF5 hyperslabs to write the file directly from + *the part array * will be written once the strucutres have been stabilized. * * Calls #error() if an error occurs. */ -void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name, enum DATA_TYPE type, int N, int dim, long long N_total, int mpi_rank, long long offset, char* part_c, struct UnitSystem* us, enum UnitConversionFactor convFactor) -{ - hid_t h_data=0, h_err=0, h_memspace=0, h_filespace=0, h_plist_id=0; +void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name, + enum DATA_TYPE type, int N, int dim, long long N_total, + int mpi_rank, long long offset, char* part_c, + struct UnitSystem* us, + enum UnitConversionFactor convFactor) { + hid_t h_data = 0, h_err = 0, h_memspace = 0, h_filespace = 0, h_plist_id = 0; hsize_t shape[2], shape_total[2], offsets[2]; void* temp = 0; - int i=0, rank=0; + int i = 0, rank = 0; const size_t typeSize = sizeOfType(type); const size_t copySize = typeSize * dim; const size_t partSize = sizeof(struct part); char* temp_c = 0; char buffer[150]; - /* message("Writing '%s' array...", name); */ /* Allocate temporary buffer */ temp = malloc(N * dim * sizeOfType(type)); - if(temp == NULL) - error("Unable to allocate 
memory for temporary buffer"); + if (temp == NULL) error("Unable to allocate memory for temporary buffer"); /* Copy particle data to temporary buffer */ temp_c = temp; - for(i=0; i<N; ++i) - memcpy(&temp_c[i*copySize], part_c+i*partSize, copySize); + for (i = 0; i < N; ++i) + memcpy(&temp_c[i * copySize], part_c + i * partSize, copySize); /* Create data space */ h_memspace = H5Screate(H5S_SIMPLE); - if(h_memspace < 0) - { - error( "Error while creating data space (memory) for field '%s'." , name ); - } + if (h_memspace < 0) { + error("Error while creating data space (memory) for field '%s'.", name); + } h_filespace = H5Screate(H5S_SIMPLE); - if(h_filespace < 0) - { - error( "Error while creating data space (file) for field '%s'." , name ); - } - - if(dim > 1) - { - rank = 2; - shape[0] = N; shape[1] = dim; - shape_total[0] = N_total; shape_total[1] = dim; - offsets[0] = offset; offsets[1] = 0; - } - else - { - rank = 1; - shape[0] = N; shape[1] = 0; - shape_total[0] = N_total; shape_total[1] = 0; - offsets[0] = offset; offsets[1] = 0; - } - + if (h_filespace < 0) { + error("Error while creating data space (file) for field '%s'.", name); + } + + if (dim > 1) { + rank = 2; + shape[0] = N; + shape[1] = dim; + shape_total[0] = N_total; + shape_total[1] = dim; + offsets[0] = offset; + offsets[1] = 0; + } else { + rank = 1; + shape[0] = N; + shape[1] = 0; + shape_total[0] = N_total; + shape_total[1] = 0; + offsets[0] = offset; + offsets[1] = 0; + } + /* Change shape of memory data space */ h_err = H5Sset_extent_simple(h_memspace, rank, shape, NULL); - if(h_err < 0) - { - error( "Error while changing data space (memory) shape for field '%s'." , name ); - } + if (h_err < 0) { + error("Error while changing data space (memory) shape for field '%s'.", + name); + } /* Change shape of file data space */ h_err = H5Sset_extent_simple(h_filespace, rank, shape_total, NULL); - if(h_err < 0) - { - error( "Error while changing data space (file) shape for field '%s'." 
, name ); - } - + if (h_err < 0) { + error("Error while changing data space (file) shape for field '%s'.", name); + } + /* Create dataset */ h_data = H5Dcreate1(grp, name, hdf5Type(type), h_filespace, H5P_DEFAULT); - if(h_data < 0) - { - error( "Error while creating dataset '%s'." , name ); - } - + if (h_data < 0) { + error("Error while creating dataset '%s'.", name); + } + H5Sclose(h_filespace); h_filespace = H5Dget_space(h_data); H5Sselect_hyperslab(h_filespace, H5S_SELECT_SET, offsets, NULL, shape, NULL); @@ -403,23 +406,23 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name, enu H5Pset_dxpl_mpio(h_plist_id, H5FD_MPIO_COLLECTIVE); /* Write temporary buffer to HDF5 dataspace */ - h_err = H5Dwrite(h_data, hdf5Type(type), h_memspace, h_filespace, h_plist_id, temp); - if(h_err < 0) - { - error( "Error while writing data array '%s'." , name ); - } + h_err = H5Dwrite(h_data, hdf5Type(type), h_memspace, h_filespace, h_plist_id, + temp); + if (h_err < 0) { + error("Error while writing data array '%s'.", name); + } /* Write XMF description for this data set */ - if(mpi_rank == 0) - writeXMFline(xmfFile, fileName, name, N_total, dim, type); + if (mpi_rank == 0) writeXMFline(xmfFile, fileName, name, N_total, dim, type); /* Write unit conversion factors for this data set */ - conversionString( buffer, us, convFactor ); - writeAttribute_d( h_data, "CGS conversion factor", conversionFactor( us, convFactor ) ); - writeAttribute_f( h_data, "h-scale exponant", hFactor( us, convFactor ) ); - writeAttribute_f( h_data, "a-scale exponant", aFactor( us, convFactor ) ); - writeAttribute_s( h_data, "Conversion factor", buffer ); - + conversionString(buffer, us, convFactor); + writeAttribute_d(h_data, "CGS conversion factor", + conversionFactor(us, convFactor)); + writeAttribute_f(h_data, "h-scale exponant", hFactor(us, convFactor)); + writeAttribute_f(h_data, "a-scale exponant", aFactor(us, convFactor)); + writeAttribute_s(h_data, "Conversion factor", 
buffer); + /* Free and close everything */ free(temp); H5Dclose(h_data); @@ -441,13 +444,18 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name, enu * @param N_total Total number of particles across all cores * @param mpi_rank The MPI task rank calling the function * @param offset Offset in the array where this mpi task starts writing - * @param part A (char*) pointer on the first occurence of the field of interest in the parts array + * @param part A (char*) pointer on the first occurence of the field of interest + *in the parts array * @param field The name (code name) of the field to read from. * @param us The UnitSystem currently in use * @param convFactor The UnitConversionFactor for this array * */ -#define writeArray(grp, fileName, xmfFile, name, type, N, dim, part, N_total, mpi_rank, offset, field, us, convFactor) writeArrayBackEnd(grp, fileName, xmfFile, name, type, N, dim, N_total, mpi_rank, offset, (char*)(&(part[0]).field), us, convFactor) +#define writeArray(grp, fileName, xmfFile, name, type, N, dim, part, N_total, \ + mpi_rank, offset, field, us, convFactor) \ + writeArrayBackEnd(grp, fileName, xmfFile, name, type, N, dim, N_total, \ + mpi_rank, offset, (char*)(&(part[0]).field), us, \ + convFactor) /** * @brief Writes an HDF5 output file (GADGET-3 type) with its XMF descriptor @@ -457,92 +465,90 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name, enu * * Creates an HDF5 output file and writes the particles contained * in the engine. If such a file already exists, it is erased and replaced - * by the new one. + * by the new one. * The companion XMF file is also updated accordingly. * * Calls #error() if an error occurs. 
* */ -void write_output_parallel (struct engine *e, struct UnitSystem* us, int mpi_rank, int mpi_size, MPI_Comm comm, MPI_Info info) -{ - - hid_t h_file=0, h_grp=0; +void write_output_parallel(struct engine* e, struct UnitSystem* us, + int mpi_rank, int mpi_size, MPI_Comm comm, + MPI_Info info) { + + hid_t h_file = 0, h_grp = 0; int N = e->s->nr_parts; int periodic = e->s->periodic; - unsigned int numParticles[6]={N,0}; - unsigned int numParticlesHighWord[6]={0}; - unsigned int flagEntropy[6]={0}; + unsigned int numParticles[6] = {N, 0}; + unsigned int numParticlesHighWord[6] = {0}; + unsigned int flagEntropy[6] = {0}; long long N_total = 0, offset = 0; double offset_d = 0., N_d = 0., N_total_d = 0.; int numFiles = 1; struct part* parts = e->s->parts; FILE* xmfFile = 0; static int outputCount = 0; - + /* File name */ char fileName[200]; sprintf(fileName, "output_%03i.hdf5", outputCount); /* First time, we need to create the XMF file */ - if(outputCount == 0 && mpi_rank == 0) - createXMFfile(); - + if (outputCount == 0 && mpi_rank == 0) createXMFfile(); + /* Prepare the XMF file for the new entry */ - if(mpi_rank == 0) - xmfFile = prepareXMFfile(); + if (mpi_rank == 0) xmfFile = prepareXMFfile(); /* Open HDF5 file */ hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS); H5Pset_fapl_mpio(plist_id, comm, info); h_file = H5Fcreate(fileName, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id); - if(h_file < 0) - { - error( "Error while opening file '%s'." 
, fileName ); - } + if (h_file < 0) { + error("Error while opening file '%s'.", fileName); + } /* Compute offset in the file and total number of particles */ /* Done using double to allow for up to 2^50=10^15 particles */ N_d = (double)N; MPI_Exscan(&N_d, &offset_d, 1, MPI_DOUBLE, MPI_SUM, comm); N_total_d = offset_d + N_d; - MPI_Bcast(&N_total_d, 1, MPI_DOUBLE, mpi_size-1, comm); - if(N_total_d > 1.e15) - error("Error while computing the offest for parallel output: Simulation has more than 10^15 particles.\n"); - N_total = (long long) N_total_d; - offset = (long long) offset_d; + MPI_Bcast(&N_total_d, 1, MPI_DOUBLE, mpi_size - 1, comm); + if (N_total_d > 1.e15) + error( + "Error while computing the offest for parallel output: Simulation has " + "more than 10^15 particles.\n"); + N_total = (long long)N_total_d; + offset = (long long)offset_d; /* Write the part of the XMF file corresponding to this specific output */ - if(mpi_rank == 0) - writeXMFheader(xmfFile, N_total, fileName, e->time); + if (mpi_rank == 0) writeXMFheader(xmfFile, N_total, fileName, e->time); /* Open header to write simulation properties */ /* message("Writing runtime parameters..."); */ h_grp = H5Gcreate1(h_file, "/RuntimePars", 0); - if(h_grp < 0) - error("Error while creating runtime parameters group\n"); + if (h_grp < 0) error("Error while creating runtime parameters group\n"); /* Write the relevant information */ writeAttribute(h_grp, "PeriodicBoundariesOn", INT, &periodic, 1); /* Close runtime parameters */ H5Gclose(h_grp); - + /* Open header to write simulation properties */ /* message("Writing file header..."); */ h_grp = H5Gcreate1(h_file, "/Header", 0); - if(h_grp < 0) - error("Error while creating file header\n"); - + if (h_grp < 0) error("Error while creating file header\n"); + /* Print the relevant information and print status */ writeAttribute(h_grp, "BoxSize", DOUBLE, e->s->dim, 3); writeAttribute(h_grp, "NumPart_ThisFile", UINT, numParticles, 6); writeAttribute(h_grp, "Time", 
DOUBLE, &e->time, 1); /* GADGET-2 legacy values */ - numParticles[0] = (unsigned int) N_total ; + numParticles[0] = (unsigned int)N_total; writeAttribute(h_grp, "NumPart_Total", UINT, numParticles, 6); - numParticlesHighWord[0] = (unsigned int) (N_total >> 32); - writeAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticlesHighWord, 6); + numParticlesHighWord[0] = (unsigned int)(N_total >> 32); + writeAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticlesHighWord, + 6); double MassTable[6] = {0., 0., 0., 0., 0., 0.}; writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, 6); writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy, 6); @@ -556,30 +562,37 @@ void write_output_parallel (struct engine *e, struct UnitSystem* us, int mpi_ra /* Print the system of Units */ writeUnitSystem(h_file, us); - + /* Create SPH particles group */ /* message("Writing particle arrays..."); */ h_grp = H5Gcreate1(h_file, "/PartType0", 0); - if(h_grp < 0) - error( "Error while creating particle group.\n"); + if (h_grp < 0) error("Error while creating particle group.\n"); /* Write arrays */ - writeArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, N, 3, parts, N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH); - writeArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, N, 3, parts, N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED); - writeArray(h_grp, fileName, xmfFile, "Masses", FLOAT, N, 1, parts, N_total, mpi_rank, offset, mass, us, UNIT_CONV_MASS); - writeArray(h_grp, fileName, xmfFile, "SmoothingLength", FLOAT, N, 1, parts, N_total, mpi_rank, offset, h, us, UNIT_CONV_LENGTH); - writeArray(h_grp, fileName, xmfFile, "InternalEnergy", FLOAT, N, 1, parts, N_total, mpi_rank, offset, u, us, UNIT_CONV_ENERGY_PER_UNIT_MASS); - writeArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, N, 1, parts, N_total, mpi_rank, offset, id, us, UNIT_CONV_NO_UNITS); - writeArray(h_grp, fileName, xmfFile, "TimeStep", FLOAT, N, 1, parts, N_total, mpi_rank, offset, dt, us, 
UNIT_CONV_TIME); - writeArray(h_grp, fileName, xmfFile, "Acceleration", FLOAT, N, 3, parts, N_total, mpi_rank, offset, a, us, UNIT_CONV_ACCELERATION); - writeArray(h_grp, fileName, xmfFile, "Density", FLOAT, N, 1, parts, N_total, mpi_rank, offset, rho, us, UNIT_CONV_DENSITY); + writeArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, N, 3, parts, + N_total, mpi_rank, offset, x, us, UNIT_CONV_LENGTH); + writeArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, N, 3, parts, + N_total, mpi_rank, offset, v, us, UNIT_CONV_SPEED); + writeArray(h_grp, fileName, xmfFile, "Masses", FLOAT, N, 1, parts, N_total, + mpi_rank, offset, mass, us, UNIT_CONV_MASS); + writeArray(h_grp, fileName, xmfFile, "SmoothingLength", FLOAT, N, 1, parts, + N_total, mpi_rank, offset, h, us, UNIT_CONV_LENGTH); + writeArray(h_grp, fileName, xmfFile, "InternalEnergy", FLOAT, N, 1, parts, + N_total, mpi_rank, offset, u, us, UNIT_CONV_ENERGY_PER_UNIT_MASS); + writeArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, N, 1, parts, + N_total, mpi_rank, offset, id, us, UNIT_CONV_NO_UNITS); + writeArray(h_grp, fileName, xmfFile, "TimeStep", FLOAT, N, 1, parts, N_total, + mpi_rank, offset, dt, us, UNIT_CONV_TIME); + writeArray(h_grp, fileName, xmfFile, "Acceleration", FLOAT, N, 3, parts, + N_total, mpi_rank, offset, a, us, UNIT_CONV_ACCELERATION); + writeArray(h_grp, fileName, xmfFile, "Density", FLOAT, N, 1, parts, N_total, + mpi_rank, offset, rho, us, UNIT_CONV_DENSITY); /* Close particle group */ H5Gclose(h_grp); /* Write LXMF file descriptor */ - if(mpi_rank == 0) - writeXMFfooter(xmfFile); + if (mpi_rank == 0) writeXMFfooter(xmfFile); /* message("Done writing particles..."); */ @@ -592,7 +605,4 @@ void write_output_parallel (struct engine *e, struct UnitSystem* us, int mpi_ra ++outputCount; } - -#endif /* HAVE_HDF5 */ - - +#endif /* HAVE_HDF5 */ diff --git a/src/parallel_io.h b/src/parallel_io.h index 78699a5938894ef4577e9a14938e4a16dae2b612..fa46a230ab73e52a3b471dc0b157f5cf0f99ef73 100644 --- 
a/src/parallel_io.h +++ b/src/parallel_io.h @@ -1,28 +1,44 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Matthieu Schaller (matthieu.schaller@durham.ac.uk). - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_PARALLEL_IO_H +#define SWIFT_PARALLEL_IO_H +/* MPI headers. */ +#ifdef WITH_MPI +#include <mpi.h> +#endif + +/* Includes. 
*/ +#include "engine.h" +#include "part.h" +#include "units.h" #if defined(HAVE_HDF5) && defined(WITH_MPI) && defined(HAVE_PARALLEL_HDF5) -void read_ic_parallel ( char* fileName, double dim[3], struct part **parts, int* N, int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm, MPI_Info info); +void read_ic_parallel(char* fileName, double dim[3], struct part** parts, + int* N, int* periodic, int mpi_rank, int mpi_size, + MPI_Comm comm, MPI_Info info); -void write_output_parallel ( struct engine* e, struct UnitSystem* us, int mpi_rank, int mpi_size, MPI_Comm comm, MPI_Info info); +void write_output_parallel(struct engine* e, struct UnitSystem* us, + int mpi_rank, int mpi_size, MPI_Comm comm, + MPI_Info info); #endif +#endif /* SWIFT_PARALLEL_IO_H */ diff --git a/src/part.h b/src/part.h index 3e8f5891b15677faeca01b20a2edacd6e97481ab..380c2dedb2d7847c0d0efe937d0b24feb0a736f0 100644 --- a/src/part.h +++ b/src/part.h @@ -1,170 +1,167 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ - +#ifndef SWIFT_PART_H +#define SWIFT_PART_H /* Some constants. 
*/ -#define part_maxwait 3 -#define part_maxunlock 39 -#define part_dtmax 10 -#define part_align 64 - +#define part_maxwait 3 +#define part_maxunlock 39 +#define part_dtmax 10 +#define part_align 64 /* Extra particle data not needed during the computation. */ struct xpart { - /* Old position, at last tree rebuild. */ - double x_old[3]; - - /* Velocity at the half-step. */ - float v_hdt[3]; - - /* Entropy at the half-step. */ - float u_hdt; - - /* Old density. */ - float omega; - - /* particle's current time-step. */ - float dt_curr; - - } __attribute__((aligned (32))); - - + /* Old position, at last tree rebuild. */ + double x_old[3]; + + /* Velocity at the half-step. */ + float v_hdt[3]; + + /* Entropy at the half-step. */ + float u_hdt; + + /* Old density. */ + float omega; + + /* particle's current time-step. */ + float dt_curr; + +} __attribute__((aligned(32))); + /* Gravity particle. */ struct gpart { - /* Particle position. */ - double x[3]; - - /* Particle velocity. */ - float v[3]; - - /* Particle acceleration. */ - float a[3]; - - /* Particle mass. */ - float mass; - - /* Particle time step. */ - float dt; - - /* Anonymous union for id/part. */ - union { - - /* Particle ID. */ - size_t id; - - /* Pointer to corresponding SPH part. */ - struct part *part; - - }; - - } __attribute__((aligned (part_align))); - + /* Particle position. */ + double x[3]; + + /* Particle velocity. */ + float v[3]; + + /* Particle acceleration. */ + float a[3]; + + /* Particle mass. */ + float mass; + + /* Particle time step. */ + float dt; + + /* Anonymous union for id/part. */ + union { + + /* Particle ID. */ + size_t id; + + /* Pointer to corresponding SPH part. */ + struct part *part; + }; + +} __attribute__((aligned(part_align))); /* Data of a single particle. */ struct part { - /* Particle position. */ - double x[3]; - - /* Particle velocity. */ - float v[3]; - - /* Particle acceleration. */ - float a[3]; - - /* Particle cutoff radius. 
*/ - float h; - - /* Particle time-step. */ - float dt; - - /* Particle internal energy. */ - float u; - - /* Particle density. */ - float rho; - - /* Derivative of the density with respect to this particle's smoothing length. */ - float rho_dh; + /* Particle position. */ + double x[3]; + + /* Particle velocity. */ + float v[3]; + + /* Particle acceleration. */ + float a[3]; + + /* Particle cutoff radius. */ + float h; + + /* Particle time-step. */ + float dt; + + /* Particle internal energy. */ + float u; + + /* Particle density. */ + float rho; + + /* Derivative of the density with respect to this particle's smoothing length. + */ + float rho_dh; #ifndef LEGACY_GADGET2_SPH - /* Particle viscosity parameter */ - float alpha; -#endif - - /* Store density/force specific stuff. */ - union { - - struct { - - /* Particle velocity divergence. */ - float div_v; - - /* Derivative of particle number density. */ - float wcount_dh; - - /* Particle velocity curl. */ - float curl_v[3]; - - /* Particle number density. */ - float wcount; - - } density; - - struct { - - /* Balsara switch */ - float balsara; - - /* Aggregate quantities. */ - float POrho2; - - /* Change in particle energy over time. */ - float u_dt; - - /* Change in smoothing length over time. */ - float h_dt; - - /* Signal velocity */ - float v_sig; - - /* Sound speed */ - float c; - - } force; - - }; - - /* Particle pressure. */ - // float P; - - /* Particle mass. */ - float mass; - - /* Particle ID. */ - unsigned long long id; - - /* Associated gravitas. */ - struct gpart *gpart; - - } __attribute__((aligned (part_align))); - + /* Particle viscosity parameter */ + float alpha; +#endif + + /* Store density/force specific stuff. */ + union { + + struct { + + /* Particle velocity divergence. */ + float div_v; + + /* Derivative of particle number density. */ + float wcount_dh; + + /* Particle velocity curl. */ + float curl_v[3]; + + /* Particle number density. 
*/ + float wcount; + + } density; + + struct { + + /* Balsara switch */ + float balsara; + + /* Aggregate quantities. */ + float POrho2; + + /* Change in particle energy over time. */ + float u_dt; + + /* Change in smoothing length over time. */ + float h_dt; + + /* Signal velocity */ + float v_sig; + + /* Sound speed */ + float c; + + } force; + }; + + /* Particle pressure. */ + // float P; + + /* Particle mass. */ + float mass; + + /* Particle ID. */ + unsigned long long id; + + /* Associated gravitas. */ + struct gpart *gpart; + +} __attribute__((aligned(part_align))); +#endif /* SWIFT_PART_H */ diff --git a/src/proxy.c b/src/proxy.c index a90ebc68e29566c00136e3189cd9fc024a37cb5e..bafa185cdcaf2100992398657b650a954daceb91 100644 --- a/src/proxy.c +++ b/src/proxy.c @@ -1,129 +1,126 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ /* Config parameters. */ #include "../config.h" /* Some standard headers. 
*/ -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <string.h> -#include <pthread.h> -#include <math.h> #include <float.h> #include <limits.h> #include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> /* MPI headers. */ #ifdef WITH_MPI - #include <mpi.h> +#include <mpi.h> #endif -/* Local headers. */ -#include "const.h" -#include "cycle.h" -#include "atomic.h" -#include "timers.h" -#include "const.h" -#include "vector.h" -#include "lock.h" -#include "space.h" -#include "part.h" -#include "multipole.h" -#include "cell.h" -#include "task.h" -#include "debug.h" +/* This object's header. */ #include "proxy.h" -#include "error.h" +/* Local headers. */ +#include "error.h" /** * @brief Exchange cells with a remote node. * * @param p The #proxy. */ - -void proxy_cells_exch1 ( struct proxy *p ) { + +void proxy_cells_exch1(struct proxy *p) { #ifdef WITH_MPI - int k, ind; - - /* Get the number of pcells we will need to send. */ - p->size_pcells_out = 0; - for ( k = 0 ; k < p->nr_cells_out ; k++ ) - p->size_pcells_out += p->cells_out[k]->pcell_size; - - /* Send the number of pcells. */ - if ( MPI_Isend( &p->size_pcells_out , 1 , MPI_INT , p->nodeID , p->mynodeID*proxy_tag_shift + proxy_tag_count , MPI_COMM_WORLD , &p->req_cells_count_out ) != MPI_SUCCESS ) - error( "Failed to isend nr of pcells." ); - // message( "isent pcell count (%i) from node %i to node %i." , p->size_pcells_out , p->mynodeID , p->nodeID ); fflush(stdout); - - /* Allocate and fill the pcell buffer. */ - if ( p->pcells_out != NULL ) - free( p->pcells_out ); - if ( ( p->pcells_out = malloc( sizeof(struct pcell) * p->size_pcells_out ) ) == NULL ) - error( "Failed to allocate pcell_out buffer." ); - for ( ind = 0 , k = 0 ; k < p->nr_cells_out ; k++ ) { - memcpy( &p->pcells_out[ind] , p->cells_out[k]->pcell , sizeof(struct pcell) * p->cells_out[k]->pcell_size ); - ind += p->cells_out[k]->pcell_size; - } - - /* Send the pcell buffer. 
*/ - if ( MPI_Isend( p->pcells_out , sizeof(struct pcell)*p->size_pcells_out , MPI_BYTE , p->nodeID , p->mynodeID*proxy_tag_shift + proxy_tag_cells , MPI_COMM_WORLD , &p->req_cells_out ) != MPI_SUCCESS ) - error( "Failed to pcell_out buffer." ); - // message( "isent pcells (%i) from node %i to node %i." , p->size_pcells_out , p->mynodeID , p->nodeID ); fflush(stdout); - - /* Receive the number of pcells. */ - if ( MPI_Irecv( &p->size_pcells_in , 1 , MPI_INT , p->nodeID , p->nodeID*proxy_tag_shift + proxy_tag_count , MPI_COMM_WORLD , &p->req_cells_count_in ) != MPI_SUCCESS ) - error( "Failed to irecv nr of pcells." ); - // message( "irecv pcells count on node %i from node %i." , p->mynodeID , p->nodeID ); fflush(stdout); - + int k, ind; + + /* Get the number of pcells we will need to send. */ + p->size_pcells_out = 0; + for (k = 0; k < p->nr_cells_out; k++) + p->size_pcells_out += p->cells_out[k]->pcell_size; + + /* Send the number of pcells. */ + if (MPI_Isend(&p->size_pcells_out, 1, MPI_INT, p->nodeID, + p->mynodeID * proxy_tag_shift + proxy_tag_count, MPI_COMM_WORLD, + &p->req_cells_count_out) != MPI_SUCCESS) + error("Failed to isend nr of pcells."); + // message( "isent pcell count (%i) from node %i to node %i." , + // p->size_pcells_out , p->mynodeID , p->nodeID ); fflush(stdout); + + /* Allocate and fill the pcell buffer. */ + if (p->pcells_out != NULL) free(p->pcells_out); + if ((p->pcells_out = malloc(sizeof(struct pcell) * p->size_pcells_out)) == + NULL) + error("Failed to allocate pcell_out buffer."); + for (ind = 0, k = 0; k < p->nr_cells_out; k++) { + memcpy(&p->pcells_out[ind], p->cells_out[k]->pcell, + sizeof(struct pcell) * p->cells_out[k]->pcell_size); + ind += p->cells_out[k]->pcell_size; + } + + /* Send the pcell buffer. 
*/ + if (MPI_Isend(p->pcells_out, sizeof(struct pcell) * p->size_pcells_out, + MPI_BYTE, p->nodeID, + p->mynodeID * proxy_tag_shift + proxy_tag_cells, MPI_COMM_WORLD, + &p->req_cells_out) != MPI_SUCCESS) + error("Failed to pcell_out buffer."); + // message( "isent pcells (%i) from node %i to node %i." , p->size_pcells_out + // , p->mynodeID , p->nodeID ); fflush(stdout); + + /* Receive the number of pcells. */ + if (MPI_Irecv(&p->size_pcells_in, 1, MPI_INT, p->nodeID, + p->nodeID * proxy_tag_shift + proxy_tag_count, MPI_COMM_WORLD, + &p->req_cells_count_in) != MPI_SUCCESS) + error("Failed to irecv nr of pcells."); +// message( "irecv pcells count on node %i from node %i." , p->mynodeID , +// p->nodeID ); fflush(stdout); + #else - error( "SWIFT was not compiled with MPI support." ); + error("SWIFT was not compiled with MPI support."); #endif +} - } - - -void proxy_cells_exch2 ( struct proxy *p ) { +void proxy_cells_exch2(struct proxy *p) { #ifdef WITH_MPI - /* Re-allocate the pcell_in buffer. */ - if ( p->pcells_in != NULL ) - free( p->pcells_in ); - if ( ( p->pcells_in = (struct pcell *)malloc( sizeof(struct pcell) * p->size_pcells_in ) ) == NULL ) - error( "Failed to allocate pcell_in buffer." ); - - /* Receive the particle buffers. */ - if ( MPI_Irecv( p->pcells_in , sizeof(struct pcell)*p->size_pcells_in , MPI_BYTE , p->nodeID , p->nodeID*proxy_tag_shift + proxy_tag_cells , MPI_COMM_WORLD , &p->req_cells_in ) != MPI_SUCCESS ) - error( "Failed to irecv part data." ); - // message( "irecv pcells (%i) on node %i from node %i." , p->size_pcells_in , p->mynodeID , p->nodeID ); fflush(stdout); + /* Re-allocate the pcell_in buffer. */ + if (p->pcells_in != NULL) free(p->pcells_in); + if ((p->pcells_in = (struct pcell *)malloc(sizeof(struct pcell) * + p->size_pcells_in)) == NULL) + error("Failed to allocate pcell_in buffer."); + + /* Receive the particle buffers. 
*/ + if (MPI_Irecv(p->pcells_in, sizeof(struct pcell) * p->size_pcells_in, + MPI_BYTE, p->nodeID, + p->nodeID * proxy_tag_shift + proxy_tag_cells, MPI_COMM_WORLD, + &p->req_cells_in) != MPI_SUCCESS) + error("Failed to irecv part data."); +// message( "irecv pcells (%i) on node %i from node %i." , p->size_pcells_in , +// p->mynodeID , p->nodeID ); fflush(stdout); #else - error( "SWIFT was not compiled with MPI support." ); + error("SWIFT was not compiled with MPI support."); #endif - - } - +} /** * @brief Add a cell to the given proxy's input list. @@ -132,32 +129,29 @@ void proxy_cells_exch2 ( struct proxy *p ) { * @param c The #cell. */ -void proxy_addcell_in ( struct proxy *p , struct cell *c ) { - - int k; - struct cell **temp; - - /* Check if the cell is already registered with the proxy. */ - for ( k = 0 ; k < p->nr_cells_in ; k++ ) - if ( p->cells_in[k] == c ) - return; - - /* Do we need to grow the number of in cells? */ - if ( p->nr_cells_in == p->size_cells_in ) { - p->size_cells_in *= proxy_buffgrow; - if ( ( temp = malloc( sizeof(struct cell *) * p->size_cells_in ) ) == NULL ) - error( "Failed to allocate ingoing cell list." ); - memcpy( temp , p->cells_in , sizeof(struct cell *) * p->nr_cells_in ); - free( p->cells_in ); - p->cells_in = temp; - } - - /* Add the cell. */ - p->cells_in[ p->nr_cells_in ] = c; - p->nr_cells_in += 1; - - } +void proxy_addcell_in(struct proxy *p, struct cell *c) { + + int k; + struct cell **temp; + /* Check if the cell is already registered with the proxy. */ + for (k = 0; k < p->nr_cells_in; k++) + if (p->cells_in[k] == c) return; + + /* Do we need to grow the number of in cells? */ + if (p->nr_cells_in == p->size_cells_in) { + p->size_cells_in *= proxy_buffgrow; + if ((temp = malloc(sizeof(struct cell *) * p->size_cells_in)) == NULL) + error("Failed to allocate ingoing cell list."); + memcpy(temp, p->cells_in, sizeof(struct cell *) * p->nr_cells_in); + free(p->cells_in); + p->cells_in = temp; + } + + /* Add the cell. 
*/ + p->cells_in[p->nr_cells_in] = c; + p->nr_cells_in += 1; +} /** * @brief Add a cell to the given proxy's output list. @@ -166,101 +160,114 @@ void proxy_addcell_in ( struct proxy *p , struct cell *c ) { * @param c The #cell. */ -void proxy_addcell_out ( struct proxy *p , struct cell *c ) { - - int k; - struct cell **temp; - - /* Check if the cell is already registered with the proxy. */ - for ( k = 0 ; k < p->nr_cells_out ; k++ ) - if ( p->cells_out[k] == c ) - return; - - /* Do we need to grow the number of out cells? */ - if ( p->nr_cells_out == p->size_cells_out ) { - p->size_cells_out *= proxy_buffgrow; - if ( ( temp = malloc( sizeof(struct cell *) * p->size_cells_out ) ) == NULL ) - error( "Failed to allocate outgoing cell list." ); - memcpy( temp , p->cells_out , sizeof(struct cell *) * p->nr_cells_out ); - free( p->cells_out ); - p->cells_out = temp; - } - - /* Add the cell. */ - p->cells_out[ p->nr_cells_out ] = c; - p->nr_cells_out += 1; - - } +void proxy_addcell_out(struct proxy *p, struct cell *c) { + + int k; + struct cell **temp; + + /* Check if the cell is already registered with the proxy. */ + for (k = 0; k < p->nr_cells_out; k++) + if (p->cells_out[k] == c) return; + /* Do we need to grow the number of out cells? */ + if (p->nr_cells_out == p->size_cells_out) { + p->size_cells_out *= proxy_buffgrow; + if ((temp = malloc(sizeof(struct cell *) * p->size_cells_out)) == NULL) + error("Failed to allocate outgoing cell list."); + memcpy(temp, p->cells_out, sizeof(struct cell *) * p->nr_cells_out); + free(p->cells_out); + p->cells_out = temp; + } + + /* Add the cell. */ + p->cells_out[p->nr_cells_out] = c; + p->nr_cells_out += 1; +} /** * @brief Exchange particles with a remote node. * * @param p The #proxy. */ - -void proxy_parts_exch1 ( struct proxy *p ) { + +void proxy_parts_exch1(struct proxy *p) { #ifdef WITH_MPI - /* Send the number of particles. 
*/ - if ( MPI_Isend( &p->nr_parts_out , 1 , MPI_INT , p->nodeID , p->mynodeID*proxy_tag_shift + proxy_tag_count , MPI_COMM_WORLD , &p->req_parts_count_out ) != MPI_SUCCESS ) - error( "Failed to isend nr of parts." ); - // message( "isent particle count (%i) from node %i to node %i." , p->nr_parts_out , p->mynodeID , p->nodeID ); fflush(stdout); - - /* Send the particle buffers. */ - if ( p->nr_parts_out > 0 ) { - if ( MPI_Isend( p->parts_out , sizeof(struct part)*p->nr_parts_out , MPI_BYTE , p->nodeID , p->mynodeID*proxy_tag_shift + proxy_tag_parts , MPI_COMM_WORLD , &p->req_parts_out ) != MPI_SUCCESS || - MPI_Isend( p->xparts_out , sizeof(struct xpart)*p->nr_parts_out , MPI_BYTE , p->nodeID , p->mynodeID*proxy_tag_shift + proxy_tag_xparts , MPI_COMM_WORLD , &p->req_xparts_out ) != MPI_SUCCESS ) - error( "Failed to isend part data." ); - // message( "isent particle data (%i) to node %i." , p->nr_parts_out , p->nodeID ); fflush(stdout); - /* for ( int k = 0 ; k < p->nr_parts_out ; k++ ) - message( "sending particle %lli, x=[%.3e %.3e %.3e], h=%.3e, to node %i." , - p->parts_out[k].id , p->parts_out[k].x[0] , p->parts_out[k].x[1] , p->parts_out[k].x[2] , - p->parts_out[k].h , p->nodeID ); */ - } - - /* Receive the number of particles. */ - if ( MPI_Irecv( &p->nr_parts_in , 1 , MPI_INT , p->nodeID , p->nodeID*proxy_tag_shift + proxy_tag_count , MPI_COMM_WORLD , &p->req_parts_count_in ) != MPI_SUCCESS ) - error( "Failed to irecv nr of parts." ); - // message( "irecv particle count on node %i from node %i." , p->mynodeID , p->nodeID ); fflush(stdout); - + /* Send the number of particles. */ + if (MPI_Isend(&p->nr_parts_out, 1, MPI_INT, p->nodeID, + p->mynodeID * proxy_tag_shift + proxy_tag_count, MPI_COMM_WORLD, + &p->req_parts_count_out) != MPI_SUCCESS) + error("Failed to isend nr of parts."); + // message( "isent particle count (%i) from node %i to node %i." , + // p->nr_parts_out , p->mynodeID , p->nodeID ); fflush(stdout); + + /* Send the particle buffers. 
*/ + if (p->nr_parts_out > 0) { + if (MPI_Isend(p->parts_out, sizeof(struct part) * p->nr_parts_out, MPI_BYTE, + p->nodeID, p->mynodeID * proxy_tag_shift + proxy_tag_parts, + MPI_COMM_WORLD, &p->req_parts_out) != MPI_SUCCESS || + MPI_Isend(p->xparts_out, sizeof(struct xpart) * p->nr_parts_out, + MPI_BYTE, p->nodeID, + p->mynodeID * proxy_tag_shift + proxy_tag_xparts, + MPI_COMM_WORLD, &p->req_xparts_out) != MPI_SUCCESS) + error("Failed to isend part data."); + // message( "isent particle data (%i) to node %i." , p->nr_parts_out , + // p->nodeID ); fflush(stdout); + for (int k = 0; k < p->nr_parts_out; k++) + message("sending particle %lli, x=[%.3e %.3e %.3e], h=%.3e, to node %i.", + p->parts_out[k].id, p->parts_out[k].x[0], p->parts_out[k].x[1], + p->parts_out[k].x[2], p->parts_out[k].h, p->nodeID); + } + + /* Receive the number of particles. */ + if (MPI_Irecv(&p->nr_parts_in, 1, MPI_INT, p->nodeID, + p->nodeID * proxy_tag_shift + proxy_tag_count, MPI_COMM_WORLD, + &p->req_parts_count_in) != MPI_SUCCESS) + error("Failed to irecv nr of parts."); +// message( "irecv particle count on node %i from node %i." , p->mynodeID , +// p->nodeID ); fflush(stdout); + #else - error( "SWIFT was not compiled with MPI support." ); + error("SWIFT was not compiled with MPI support."); #endif +} - } - - -void proxy_parts_exch2 ( struct proxy *p ) { +void proxy_parts_exch2(struct proxy *p) { #ifdef WITH_MPI - /* Is there enough space in the buffer? */ - if ( p->nr_parts_in > p->size_parts_in ) { - do { - p->size_parts_in *= proxy_buffgrow; - } while ( p->nr_parts_in > p->size_parts_in ); - free( p->parts_in ); free( p->xparts_in ); - if ( ( p->parts_in = (struct part *)malloc( sizeof(struct part) * p->size_parts_in ) ) == NULL || - ( p->xparts_in = (struct xpart *)malloc( sizeof(struct xpart) * p->size_parts_in ) ) == NULL ) - error( "Failed to re-allocate parts_in buffers." ); - } - - /* Receive the particle buffers. 
*/ - if ( p->nr_parts_in > 0 ) { - if ( MPI_Irecv( p->parts_in , sizeof(struct part)*p->nr_parts_in , MPI_BYTE , p->nodeID , p->nodeID*proxy_tag_shift + proxy_tag_parts , MPI_COMM_WORLD , &p->req_parts_in ) != MPI_SUCCESS || - MPI_Irecv( p->xparts_in , sizeof(struct xpart)*p->nr_parts_in , MPI_BYTE , p->nodeID , p->nodeID*proxy_tag_shift + proxy_tag_xparts , MPI_COMM_WORLD , &p->req_xparts_in ) != MPI_SUCCESS ) - error( "Failed to irecv part data." ); - // message( "irecv particle data (%i) from node %i." , p->nr_parts_in , p->nodeID ); fflush(stdout); - } + /* Is there enough space in the buffer? */ + if (p->nr_parts_in > p->size_parts_in) { + do { + p->size_parts_in *= proxy_buffgrow; + } while (p->nr_parts_in > p->size_parts_in); + free(p->parts_in); + free(p->xparts_in); + if ((p->parts_in = (struct part *)malloc( + sizeof(struct part) *p->size_parts_in)) == NULL || + (p->xparts_in = (struct xpart *)malloc(sizeof(struct xpart) * + p->size_parts_in)) == NULL) + error("Failed to re-allocate parts_in buffers."); + } + + /* Receive the particle buffers. */ + if (p->nr_parts_in > 0) { + if (MPI_Irecv(p->parts_in, sizeof(struct part) * p->nr_parts_in, MPI_BYTE, + p->nodeID, p->nodeID * proxy_tag_shift + proxy_tag_parts, + MPI_COMM_WORLD, &p->req_parts_in) != MPI_SUCCESS || + MPI_Irecv(p->xparts_in, sizeof(struct xpart) * p->nr_parts_in, MPI_BYTE, + p->nodeID, p->nodeID * proxy_tag_shift + proxy_tag_xparts, + MPI_COMM_WORLD, &p->req_xparts_in) != MPI_SUCCESS) + error("Failed to irecv part data."); + // message( "irecv particle data (%i) from node %i." , p->nr_parts_in , + // p->nodeID ); fflush(stdout); + } #else - error( "SWIFT was not compiled with MPI support." ); + error("SWIFT was not compiled with MPI support."); #endif - - } - +} /** * @brief Load parts onto a proxy for exchange. @@ -270,34 +277,37 @@ void proxy_parts_exch2 ( struct proxy *p ) { * @param xparts Pointer to an array of #xpart to send. * @param N The number of parts. 
*/ - -void proxy_parts_load ( struct proxy *p , struct part *parts , struct xpart *xparts , int N ) { - - /* Is there enough space in the buffer? */ - if ( p->nr_parts_out + N > p->size_parts_out ) { - do { - p->size_parts_out *= proxy_buffgrow; - } while ( p->nr_parts_out + N > p->size_parts_out ); - struct part *tp; - struct xpart *txp; - if ( ( tp = (struct part *)malloc( sizeof(struct part) * p->size_parts_out ) ) == NULL || - ( txp = (struct xpart *)malloc( sizeof(struct xpart) * p->size_parts_out ) ) == NULL ) - error( "Failed to re-allocate parts_out buffers." ); - memcpy( tp , p->parts_out , sizeof(struct part) * p->nr_parts_out ); - memcpy( txp , p->xparts_out , sizeof(struct xpart) * p->nr_parts_out ); - free( p->parts_out ); free( p->xparts_out ); - p->parts_out = tp; p->xparts_out = txp; - } - - /* Copy the parts and xparts data to the buffer. */ - memcpy( &p->parts_out[ p->nr_parts_out ] , parts , sizeof(struct part) * N ); - memcpy( &p->xparts_out[ p->nr_parts_out ] , xparts , sizeof(struct xpart) * N ); - - /* Increase the counters. */ - p->nr_parts_out += N; - - } +void proxy_parts_load(struct proxy *p, struct part *parts, struct xpart *xparts, + int N) { + + /* Is there enough space in the buffer? */ + if (p->nr_parts_out + N > p->size_parts_out) { + do { + p->size_parts_out *= proxy_buffgrow; + } while (p->nr_parts_out + N > p->size_parts_out); + struct part *tp; + struct xpart *txp; + if ((tp = (struct part *)malloc(sizeof(struct part) *p->size_parts_out)) == + NULL || + (txp = (struct xpart *)malloc(sizeof(struct xpart) * + p->size_parts_out)) == NULL) + error("Failed to re-allocate parts_out buffers."); + memcpy(tp, p->parts_out, sizeof(struct part) * p->nr_parts_out); + memcpy(txp, p->xparts_out, sizeof(struct xpart) * p->nr_parts_out); + free(p->parts_out); + free(p->xparts_out); + p->parts_out = tp; + p->xparts_out = txp; + } + + /* Copy the parts and xparts data to the buffer. 
*/ + memcpy(&p->parts_out[p->nr_parts_out], parts, sizeof(struct part) * N); + memcpy(&p->xparts_out[p->nr_parts_out], xparts, sizeof(struct xpart) * N); + + /* Increase the counters. */ + p->nr_parts_out += N; +} /** * @brief Initialize the given proxy. @@ -306,41 +316,46 @@ void proxy_parts_load ( struct proxy *p , struct part *parts , struct xpart *xpa * @param mynodeID The node this proxy is running on. * @param nodeID The node with which this proxy will communicate. */ - -void proxy_init ( struct proxy *p , int mynodeID , int nodeID ) { - - /* Set the nodeID. */ - p->mynodeID = mynodeID; - p->nodeID = nodeID; - - /* Allocate the cell send and receive buffers, if needed. */ - if ( p->cells_in == NULL ) { - p->size_cells_in = proxy_buffinit; - if ( ( p->cells_in = (struct cell **)malloc( sizeof(void *) * p->size_cells_in ) ) == NULL ) - error( "Failed to allocate cells_in buffer." ); - } - p->nr_cells_in = 0; - if ( p->cells_out == NULL ) { - p->size_cells_out = proxy_buffinit; - if ( ( p->cells_out = (struct cell **)malloc( sizeof(void *) * p->size_cells_out ) ) == NULL ) - error( "Failed to allocate cells_out buffer." ); - } - p->nr_cells_out = 0; - - /* Allocate the part send and receive buffers, if needed. */ - if ( p->parts_in == NULL ) { - p->size_parts_in = proxy_buffinit; - if ( ( p->parts_in = (struct part *)malloc( sizeof(struct part) * p->size_parts_in ) ) == NULL || - ( p->xparts_in = (struct xpart *)malloc( sizeof(struct xpart) * p->size_parts_in ) ) == NULL ) - error( "Failed to allocate parts_in buffers." ); - } - p->nr_parts_in = 0; - if ( p->parts_out == NULL ) { - p->size_parts_out = proxy_buffinit; - if ( ( p->parts_out = (struct part *)malloc( sizeof(struct part) * p->size_parts_out ) ) == NULL || - ( p->xparts_out = (struct xpart *)malloc( sizeof(struct xpart) * p->size_parts_out ) ) == NULL ) - error( "Failed to allocate parts_out buffers." 
); - } - p->nr_parts_out = 0; - - } + +void proxy_init(struct proxy *p, int mynodeID, int nodeID) { + + /* Set the nodeID. */ + p->mynodeID = mynodeID; + p->nodeID = nodeID; + + /* Allocate the cell send and receive buffers, if needed. */ + if (p->cells_in == NULL) { + p->size_cells_in = proxy_buffinit; + if ((p->cells_in = + (struct cell **)malloc(sizeof(void *) * p->size_cells_in)) == NULL) + error("Failed to allocate cells_in buffer."); + } + p->nr_cells_in = 0; + if (p->cells_out == NULL) { + p->size_cells_out = proxy_buffinit; + if ((p->cells_out = (struct cell **)malloc(sizeof(void *) * + p->size_cells_out)) == NULL) + error("Failed to allocate cells_out buffer."); + } + p->nr_cells_out = 0; + + /* Allocate the part send and receive buffers, if needed. */ + if (p->parts_in == NULL) { + p->size_parts_in = proxy_buffinit; + if ((p->parts_in = (struct part *)malloc( + sizeof(struct part) *p->size_parts_in)) == NULL || + (p->xparts_in = (struct xpart *)malloc(sizeof(struct xpart) * + p->size_parts_in)) == NULL) + error("Failed to allocate parts_in buffers."); + } + p->nr_parts_in = 0; + if (p->parts_out == NULL) { + p->size_parts_out = proxy_buffinit; + if ((p->parts_out = (struct part *)malloc( + sizeof(struct part) *p->size_parts_out)) == NULL || + (p->xparts_out = (struct xpart *)malloc(sizeof(struct xpart) * + p->size_parts_out)) == NULL) + error("Failed to allocate parts_out buffers."); + } + p->nr_parts_out = 0; +} diff --git a/src/proxy.h b/src/proxy.h index 4710f5a8909bbab3b85fb71e15ae762a602a9dad..8cb08d0a66095597227b52b317f3808190cdc45f 100644 --- a/src/proxy.h +++ b/src/proxy.h @@ -1,76 +1,80 @@ /******************************************************************************* * This file is part of SWIFT. 
* Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_PROXY_H +#define SWIFT_PROXY_H - +/* Includes. */ +#include "cell.h" +#include "part.h" /* Some constants. */ -#define proxy_buffgrow 1.5 -#define proxy_buffinit 100 +#define proxy_buffgrow 1.5 +#define proxy_buffinit 100 /* Proxy tag arithmetic. */ -#define proxy_tag_shift 8 -#define proxy_tag_count 0 -#define proxy_tag_parts 1 -#define proxy_tag_xparts 2 -#define proxy_tag_cells 3 - +#define proxy_tag_shift 8 +#define proxy_tag_count 0 +#define proxy_tag_parts 1 +#define proxy_tag_xparts 2 +#define proxy_tag_cells 3 /* Data structure for the proxy. */ struct proxy { - /* ID of the node this proxy represents. */ - int mynodeID, nodeID; - - /* Incomming cells. */ - struct cell **cells_in; - struct pcell *pcells_in; - int nr_cells_in, size_cells_in, size_pcells_in; - - /* Outgoing cells. */ - struct cell **cells_out; - struct pcell *pcells_out; - int nr_cells_out, size_cells_out, size_pcells_out; - - /* The parts and xparts buffers for input and output. */ - struct part *parts_in, *parts_out; - struct xpart *xparts_in, *xparts_out; - int size_parts_in, size_parts_out; - int nr_parts_in, nr_parts_out; - - /* MPI request handles. 
*/ - #ifdef WITH_MPI - MPI_Request req_parts_count_out, req_parts_count_in; - MPI_Request req_parts_out, req_parts_in; - MPI_Request req_xparts_out, req_xparts_in; - MPI_Request req_cells_count_out, req_cells_count_in; - MPI_Request req_cells_out, req_cells_in; - #endif - - }; + /* ID of the node this proxy represents. */ + int mynodeID, nodeID; + + /* Incomming cells. */ + struct cell **cells_in; + struct pcell *pcells_in; + int nr_cells_in, size_cells_in, size_pcells_in; + /* Outgoing cells. */ + struct cell **cells_out; + struct pcell *pcells_out; + int nr_cells_out, size_cells_out, size_pcells_out; + + /* The parts and xparts buffers for input and output. */ + struct part *parts_in, *parts_out; + struct xpart *xparts_in, *xparts_out; + int size_parts_in, size_parts_out; + int nr_parts_in, nr_parts_out; + +/* MPI request handles. */ +#ifdef WITH_MPI + MPI_Request req_parts_count_out, req_parts_count_in; + MPI_Request req_parts_out, req_parts_in; + MPI_Request req_xparts_out, req_xparts_in; + MPI_Request req_cells_count_out, req_cells_count_in; + MPI_Request req_cells_out, req_cells_in; +#endif +}; /* Function prototypes. 
*/ -void proxy_init ( struct proxy *p , int mynodeID , int nodeID ); -void proxy_parts_load ( struct proxy *p , struct part *parts , struct xpart *xparts , int N ); -void proxy_parts_exch1 ( struct proxy *p ); -void proxy_parts_exch2 ( struct proxy *p ); -void proxy_addcell_in ( struct proxy *p , struct cell *c ); -void proxy_addcell_out ( struct proxy *p , struct cell *c ); -void proxy_cells_exch1 ( struct proxy *p ); -void proxy_cells_exch2 ( struct proxy *p ); +void proxy_init(struct proxy *p, int mynodeID, int nodeID); +void proxy_parts_load(struct proxy *p, struct part *parts, struct xpart *xparts, + int N); +void proxy_parts_exch1(struct proxy *p); +void proxy_parts_exch2(struct proxy *p); +void proxy_addcell_in(struct proxy *p, struct cell *c); +void proxy_addcell_out(struct proxy *p, struct cell *c); +void proxy_cells_exch1(struct proxy *p); +void proxy_cells_exch2(struct proxy *p); + +#endif /* SWIFT_PROXY_H */ diff --git a/src/queue.c b/src/queue.c index 497a23d2024cd6c8f29890802d8c00b4ab639621..3fa0096bf0fab8ecc6ec2508d5a7c2529451e54d 100644 --- a/src/queue.c +++ b/src/queue.c @@ -1,20 +1,20 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- * + * ******************************************************************************/ /* Config parameters. */ @@ -27,37 +27,25 @@ /* MPI headers. */ #ifdef WITH_MPI - #include <mpi.h> +#include <mpi.h> #endif +/* This object's header. */ +#include "queue.h" + /* Local headers. */ #include "const.h" -#include "cycle.h" -#include "lock.h" -#include "task.h" -#include "timers.h" -#include "space.h" -#include "part.h" -#include "multipole.h" -#include "cell.h" -#include "queue.h" #include "error.h" -#include "inline.h" - - /* Counter macros. */ #ifdef COUNTER - #define COUNT(c) ( __sync_add_and_fetch( &queue_counter[ c ] , 1 ) ) +#define COUNT(c) (__sync_add_and_fetch(&queue_counter[c], 1)) #else - #define COUNT(c) +#define COUNT(c) #endif - /* The counters. */ -int queue_counter[ queue_counter_count ]; - - +int queue_counter[queue_counter_count]; /** * @brief Insert a used tasks into the given queue. @@ -65,82 +53,75 @@ int queue_counter[ queue_counter_count ]; * @param q The #queue. * @param t The #task. */ - -void queue_insert ( struct queue *q , struct task *t ) { - - int k, *tid; - struct task *tasks; - - /* Lock the queue. */ - if ( lock_lock( &q->lock ) != 0 ) - error( "Failed to get queue lock." ); - - tid = q->tid; - tasks = q->tasks; - - /* Does the queue need to be grown? */ - if ( q->count == q->size ) { - int *temp; - q->size *= queue_sizegrow; - if ( ( temp = (int *)malloc( sizeof(int) * q->size ) ) == NULL ) - error( "Failed to allocate new indices." ); - memcpy( temp , tid , sizeof(int) * q->count ); - free( tid ); - q->tid = tid = temp; - } - - /* Drop the task at the end of the queue. */ - tid[ q->count ] = ( t - tasks ); - q->count += 1; - - /* Shuffle up. */ - for ( k = q->count - 1 ; k > 0 ; k = (k-1)/2 ) - if ( tasks[ tid[k] ].weight > tasks[ tid[(k-1)/2] ].weight ) { - int temp = tid[k]; - tid[k] = tid[(k-1)/2]; - tid[(k-1)/2] = temp; - } - else - break; - - /* Check the queue's consistency. 
*/ - /* for ( k = 1 ; k < q->count ; k++ ) - if ( tasks[ tid[(k-1)/2] ].weight < tasks[ tid[k] ].weight ) - error( "Queue heap is disordered." ); */ - - /* Unlock the queue. */ - if ( lock_unlock( &q->lock ) != 0 ) - error( "Failed to unlock queue." ); - } +void queue_insert(struct queue *q, struct task *t) { + + int k, *tid; + struct task *tasks; + + /* Lock the queue. */ + if (lock_lock(&q->lock) != 0) error("Failed to get queue lock."); + + tid = q->tid; + tasks = q->tasks; + + /* Does the queue need to be grown? */ + if (q->count == q->size) { + int *temp; + q->size *= queue_sizegrow; + if ((temp = (int *)malloc(sizeof(int) * q->size)) == NULL) + error("Failed to allocate new indices."); + memcpy(temp, tid, sizeof(int) * q->count); + free(tid); + q->tid = tid = temp; + } + + /* Drop the task at the end of the queue. */ + tid[q->count] = (t - tasks); + q->count += 1; + + /* Shuffle up. */ + for (k = q->count - 1; k > 0; k = (k - 1) / 2) + if (tasks[tid[k]].weight > tasks[tid[(k - 1) / 2]].weight) { + int temp = tid[k]; + tid[k] = tid[(k - 1) / 2]; + tid[(k - 1) / 2] = temp; + } else + break; + + /* Check the queue's consistency. */ + /* for ( k = 1 ; k < q->count ; k++ ) + if ( tasks[ tid[(k-1)/2] ].weight < tasks[ tid[k] ].weight ) + error( "Queue heap is disordered." ); */ + + /* Unlock the queue. */ + if (lock_unlock(&q->lock) != 0) error("Failed to unlock queue."); +} -/** +/** * @brief Initialize the given queue. * * @param q The #queue. * @param tasks List of tasks to which the queue indices refer to. */ - -void queue_init ( struct queue *q , struct task *tasks ) { - - /* Allocate the task list if needed. */ - q->size = queue_sizeinit; - if ( ( q->tid = (int *)malloc( sizeof(int) * q->size ) ) == NULL ) - error( "Failed to allocate queue tids." ); - - /* Set the tasks pointer. */ - q->tasks = tasks; - - /* Init counters. */ - q->count = 0; - - /* Init the queue lock. */ - if ( lock_init( &q->lock ) != 0 ) - error( "Failed to init queue lock." 
); - } - - +void queue_init(struct queue *q, struct task *tasks) { + + /* Allocate the task list if needed. */ + q->size = queue_sizeinit; + if ((q->tid = (int *)malloc(sizeof(int) * q->size)) == NULL) + error("Failed to allocate queue tids."); + + /* Set the tasks pointer. */ + q->tasks = tasks; + + /* Init counters. */ + q->count = 0; + + /* Init the queue lock. */ + if (lock_init(&q->lock) != 0) error("Failed to init queue lock."); +} + /** * @brief Get a task free of dependencies and conflicts. * @@ -148,111 +129,102 @@ void queue_init ( struct queue *q , struct task *tasks ) { * @param super The super-cell tat might conflict with the #queue * @param blocking Block until access to the queue is granted. */ - -struct task *queue_gettask ( struct queue *q , struct cell *super , int blocking ) { - - int k, qcount, *qtid, gotcha; - lock_type *qlock = &q->lock; - struct task *qtasks, *res = NULL; - - /* If there are no tasks, leave immediately. */ - if ( q->count == 0 ) - return NULL; - - /* Grab the task lock. */ - if ( blocking ) { - if ( lock_lock( qlock ) != 0 ) - error( "Locking the qlock failed.\n" ); - } - else { - if ( lock_trylock( qlock ) != 0 ) - return NULL; - } - - /* Set some pointers we will use often. */ - qtid = q->tid; - qtasks = q->tasks; - qcount = q->count; - gotcha = 0; - - /* Loop over the task IDs looking for tasks with the same super-cell. */ - if ( super != NULL ) { - for ( k = 0 ; k < qcount && k < queue_maxsuper ; k++ ) { - - /* Put a finger on the task. */ - res = &qtasks[ qtid[k] ]; - - /* Try to lock the task and exit if successful. */ - if ( ( res->ci->super == super || ( res->cj != NULL && res->cj->super == super ) ) && - task_lock( res ) ) { - gotcha = 1; - break; - } - - } /* loop over the task IDs. */ - } - - /* Loop over the task IDs again if nothing was found, take anything. */ - if ( !gotcha ) { - for ( k = 0 ; k < qcount ; k++ ) { - - /* Put a finger on the task. 
*/ - res = &qtasks[ qtid[k] ]; - - /* Try to lock the task and exit if successful. */ - if ( task_lock( res ) ) - break; - - } /* loop over the task IDs. */ - } - - /* Did we get a task? */ - if ( k < qcount ) { - - /* Another one bites the dust. */ - qcount = q->count -= 1; - - /* Swap this task with the last task and re-heap. */ - if ( k < qcount ) { - qtid[ k ] = qtid[ qcount ]; - int w = qtasks[ qtid[k] ].weight; - while ( k > 0 && w > qtasks[ qtid[(k-1)/2] ].weight ) { - int temp = q->tid[k]; - q->tid[k] = q->tid[(k-1)/2]; - q->tid[(k-1)/2] = temp; - k = (k-1)/2; - } - int i; - while ( ( i = 2*k+1 ) < qcount ) { - if ( i+1 < qcount && qtasks[ qtid[i+1] ].weight > qtasks[ qtid[i] ].weight ) - i += 1; - if ( qtasks[ qtid[i] ].weight > w ) { - int temp = qtid[i]; - qtid[i] = qtid[k]; - qtid[k] = temp; - k = i; - } - else - break; - } - } - - } - else - res = NULL; - - /* Check the queue's consistency. */ - /* for ( k = 1 ; k < q->count ; k++ ) - if ( qtasks[ qtid[(k-1)/2] ].weight < qtasks[ qtid[k] ].weight ) - error( "Queue heap is disordered." ); */ - - /* Release the task lock. */ - if ( lock_unlock( qlock ) != 0 ) - error( "Unlocking the qlock failed.\n" ); - - /* Take the money and run. */ - return res; +struct task *queue_gettask(struct queue *q, struct cell *super, int blocking) { + + int k, qcount, *qtid, gotcha; + lock_type *qlock = &q->lock; + struct task *qtasks, *res = NULL; + + /* If there are no tasks, leave immediately. */ + if (q->count == 0) return NULL; + + /* Grab the task lock. */ + if (blocking) { + if (lock_lock(qlock) != 0) error("Locking the qlock failed.\n"); + } else { + if (lock_trylock(qlock) != 0) return NULL; + } + + /* Set some pointers we will use often. */ + qtid = q->tid; + qtasks = q->tasks; + qcount = q->count; + gotcha = 0; + + /* Loop over the task IDs looking for tasks with the same super-cell. */ + if (super != NULL) { + for (k = 0; k < qcount && k < queue_maxsuper; k++) { + + /* Put a finger on the task. 
*/ + res = &qtasks[qtid[k]]; + + /* Try to lock the task and exit if successful. */ + if ((res->ci->super == super || + (res->cj != NULL && res->cj->super == super)) && + task_lock(res)) { + gotcha = 1; + break; + } + + } /* loop over the task IDs. */ + } + + /* Loop over the task IDs again if nothing was found, take anything. */ + if (!gotcha) { + for (k = 0; k < qcount; k++) { + + /* Put a finger on the task. */ + res = &qtasks[qtid[k]]; + + /* Try to lock the task and exit if successful. */ + if (task_lock(res)) break; + + } /* loop over the task IDs. */ + } + + /* Did we get a task? */ + if (k < qcount) { + + /* Another one bites the dust. */ + qcount = q->count -= 1; + + /* Swap this task with the last task and re-heap. */ + if (k < qcount) { + qtid[k] = qtid[qcount]; + int w = qtasks[qtid[k]].weight; + while (k > 0 && w > qtasks[qtid[(k - 1) / 2]].weight) { + int temp = q->tid[k]; + q->tid[k] = q->tid[(k - 1) / 2]; + q->tid[(k - 1) / 2] = temp; + k = (k - 1) / 2; + } + int i; + while ((i = 2 * k + 1) < qcount) { + if (i + 1 < qcount && + qtasks[qtid[i + 1]].weight > qtasks[qtid[i]].weight) + i += 1; + if (qtasks[qtid[i]].weight > w) { + int temp = qtid[i]; + qtid[i] = qtid[k]; + qtid[k] = temp; + k = i; + } else + break; + } } + } else + res = NULL; + + /* Check the queue's consistency. */ + /* for ( k = 1 ; k < q->count ; k++ ) + if ( qtasks[ qtid[(k-1)/2] ].weight < qtasks[ qtid[k] ].weight ) + error( "Queue heap is disordered." ); */ + + /* Release the task lock. */ + if (lock_unlock(qlock) != 0) error("Unlocking the qlock failed.\n"); + /* Take the money and run. */ + return res; +} diff --git a/src/queue.h b/src/queue.h index 76ae9b6971e456af17f338561eaecb5670172d21..533007684fa41a4a25a10a14c504358926d0fe06 100644 --- a/src/queue.h +++ b/src/queue.h @@ -1,56 +1,61 @@ /******************************************************************************* * This file is part of SWIFT. 
* Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_QUEUE_H +#define SWIFT_QUEUE_H +/* Includes. */ +#include "cell.h" +#include "lock.h" +#include "task.h" /* Some constants. */ -#define queue_maxsuper 50 -#define queue_sizeinit 100 -#define queue_sizegrow 2 - +#define queue_maxsuper 50 +#define queue_sizeinit 100 +#define queue_sizegrow 2 /* Counters. */ enum { - queue_counter_swap = 0, - queue_counter_count, - }; -extern int queue_counter[ queue_counter_count ]; - + queue_counter_swap = 0, + queue_counter_count, +}; +extern int queue_counter[queue_counter_count]; /** The queue struct. */ struct queue { - /* The lock to access this queue. */ - lock_type lock; + /* The lock to access this queue. */ + lock_type lock; + + /* Size, count and next element. */ + int size, count; - /* Size, count and next element. */ - int size, count; - - /* The actual tasks to which the indices refer. */ - struct task *tasks; - - /* The task indices. */ - int *tid; + /* The actual tasks to which the indices refer. */ + struct task *tasks; - } __attribute__((aligned (64))); - + /* The task indices. */ + int *tid; + +} __attribute__((aligned(64))); /* Function prototypes. 
*/ -struct task *queue_gettask ( struct queue *q , struct cell *super , int blocking ); -void queue_init ( struct queue *q , struct task *tasks ); -void queue_insert ( struct queue *q , struct task *t ); +struct task *queue_gettask(struct queue *q, struct cell *super, int blocking); +void queue_init(struct queue *q, struct task *tasks); +void queue_insert(struct queue *q, struct task *t); + +#endif /* SWIFT_QUEUE_H */ diff --git a/src/runner.c b/src/runner.c index 71f425196622a99636118e62f6e4e1fef4f117e3..a3056f414bd352c2a9b857b980e7989dfc28130a 100644 --- a/src/runner.c +++ b/src/runner.c @@ -1,56 +1,45 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ /* Config parameters. */ #include "../config.h" /* Some standard headers. */ -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <pthread.h> -#include <math.h> #include <float.h> #include <limits.h> /* MPI headers. */ #ifdef WITH_MPI - #include <mpi.h> +#include <mpi.h> #endif +/* This object's header. */ +#include "runner.h" + /* Local headers. 
*/ #include "const.h" -#include "cycle.h" -#include "atomic.h" -#include "timers.h" -#include "const.h" -#include "lock.h" -#include "task.h" -#include "part.h" -#include "space.h" -#include "multipole.h" -#include "cell.h" -#include "queue.h" -#include "scheduler.h" #include "engine.h" -#include "runner.h" #include "error.h" +#include "scheduler.h" +#include "space.h" +#include "task.h" +#include "timers.h" /* Include the right variant of the SPH interactions */ #ifdef LEGACY_GADGET2_SPH @@ -61,31 +50,28 @@ #include "runner_iact_grav.h" /* Convert cell location to ID. */ -#define cell_getid( cdim , i , j , k ) ( (int)(k) + (cdim)[2]*( (int)(j) + (cdim)[1]*(int)(i) ) ) +#define cell_getid(cdim, i, j, k) \ + ((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i))) /* The counters. */ -int runner_counter[ runner_counter_count ]; - - - -const float runner_shift[13*3] = { - 5.773502691896258e-01 , 5.773502691896258e-01 , 5.773502691896258e-01 , - 7.071067811865475e-01 , 7.071067811865475e-01 , 0.0 , - 5.773502691896258e-01 , 5.773502691896258e-01 , -5.773502691896258e-01 , - 7.071067811865475e-01 , 0.0 , 7.071067811865475e-01 , - 1.0 , 0.0 , 0.0 , - 7.071067811865475e-01 , 0.0 , -7.071067811865475e-01 , - 5.773502691896258e-01 , -5.773502691896258e-01 , 5.773502691896258e-01 , - 7.071067811865475e-01 , -7.071067811865475e-01 , 0.0 , - 5.773502691896258e-01 , -5.773502691896258e-01 , -5.773502691896258e-01 , - 0.0 , 7.071067811865475e-01 , 7.071067811865475e-01 , - 0.0 , 1.0 , 0.0 , - 0.0 , 7.071067811865475e-01 , -7.071067811865475e-01 , - 0.0 , 0.0 , 1.0 , - }; -const char runner_flip[27] = { 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 0 , - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 }; - +int runner_counter[runner_counter_count]; + +const float runner_shift[13 * 3] = { + 5.773502691896258e-01, 5.773502691896258e-01, 5.773502691896258e-01, + 7.071067811865475e-01, 7.071067811865475e-01, 0.0, + 5.773502691896258e-01, 5.773502691896258e-01, 
-5.773502691896258e-01, + 7.071067811865475e-01, 0.0, 7.071067811865475e-01, + 1.0, 0.0, 0.0, + 7.071067811865475e-01, 0.0, -7.071067811865475e-01, + 5.773502691896258e-01, -5.773502691896258e-01, 5.773502691896258e-01, + 7.071067811865475e-01, -7.071067811865475e-01, 0.0, + 5.773502691896258e-01, -5.773502691896258e-01, -5.773502691896258e-01, + 0.0, 7.071067811865475e-01, 7.071067811865475e-01, + 0.0, 1.0, 0.0, + 0.0, 7.071067811865475e-01, -7.071067811865475e-01, + 0.0, 0.0, 1.0, }; +const char runner_flip[27] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; /* Import the density loop functions. */ #define FUNCTION density @@ -99,119 +85,85 @@ const char runner_flip[27] = { 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 , 1 /* Import the gravity loop functions. */ #include "runner_doiact_grav.h" - -/** - * @brief Send a local cell's particle data to another node. - * - * @param r The #runner. - * @param c The #cell. - * @param nodeID The destination node's ID. - * @param tag bit to distinguish between xv and rho sends. - */ - -void runner_dosend ( struct runner *r , struct cell *c , int nodeID , int tag ) { - -#ifdef WITH_MPI - - MPI_Request req; - - /* First check if all the density tasks have been run. */ - if ( tag & 1 ) - if ( c->parts[0].rho == 0.0 ) - error( "Attempting to send rhos before ghost task completed." ); - - /* Emit the isend. */ - if ( MPI_Isend( c->parts , sizeof(struct part) * c->count , MPI_BYTE , nodeID , tag , MPI_COMM_WORLD , &req ) != MPI_SUCCESS ) - error( "Failed to isend particle data." ); - - message( "sending %i parts with tag=%i from %i to %i." , - c->count , tag , r->e->nodeID , nodeID ); fflush(stdout); - - /* Free the request handler as we don't care what happens next. */ - MPI_Request_free( &req ); - -#else - error( "SWIFT was not compiled with MPI support." ); -#endif - - } - - /** * @brief Sort the entries in ascending order using QuickSort. 
* * @param sort The entries * @param N The number of entries. */ - -void runner_dosort_ascending ( struct entry *sort , int N ) { - - struct { - short int lo, hi; - } qstack[10]; - int qpos, i, j, lo, hi, imin; - struct entry temp; - float pivot; - - /* Sort parts in cell_i in decreasing order with quicksort */ - qstack[0].lo = 0; qstack[0].hi = N - 1; qpos = 0; - while ( qpos >= 0 ) { - lo = qstack[qpos].lo; hi = qstack[qpos].hi; - qpos -= 1; - if ( hi - lo < 15 ) { - for ( i = lo ; i < hi ; i++ ) { - imin = i; - for ( j = i+1 ; j <= hi ; j++ ) - if ( sort[j].d < sort[imin].d ) - imin = j; - if ( imin != i ) { - temp = sort[imin]; sort[imin] = sort[i]; sort[i] = temp; - } - } - } - else { - pivot = sort[ ( lo + hi ) / 2 ].d; - i = lo; j = hi; - while ( i <= j ) { - while ( sort[i].d < pivot ) i++; - while ( sort[j].d > pivot ) j--; - if ( i <= j ) { - if ( i < j ) { - temp = sort[i]; sort[i] = sort[j]; sort[j] = temp; - } - i += 1; j -= 1; - } - } - if ( j > ( lo + hi ) / 2 ) { - if ( lo < j ) { - qpos += 1; - qstack[qpos].lo = lo; - qstack[qpos].hi = j; - } - if ( i < hi ) { - qpos += 1; - qstack[qpos].lo = i; - qstack[qpos].hi = hi; - } - } - else { - if ( i < hi ) { - qpos += 1; - qstack[qpos].lo = i; - qstack[qpos].hi = hi; - } - if ( lo < j ) { - qpos += 1; - qstack[qpos].lo = lo; - qstack[qpos].hi = j; - } - } - } + +void runner_dosort_ascending(struct entry *sort, int N) { + + struct { + short int lo, hi; + } qstack[10]; + int qpos, i, j, lo, hi, imin; + struct entry temp; + float pivot; + + /* Sort parts in cell_i in decreasing order with quicksort */ + qstack[0].lo = 0; + qstack[0].hi = N - 1; + qpos = 0; + while (qpos >= 0) { + lo = qstack[qpos].lo; + hi = qstack[qpos].hi; + qpos -= 1; + if (hi - lo < 15) { + for (i = lo; i < hi; i++) { + imin = i; + for (j = i + 1; j <= hi; j++) + if (sort[j].d < sort[imin].d) imin = j; + if (imin != i) { + temp = sort[imin]; + sort[imin] = sort[i]; + sort[i] = temp; + } + } + } else { + pivot = sort[(lo + hi) / 2].d; + 
i = lo; + j = hi; + while (i <= j) { + while (sort[i].d < pivot) i++; + while (sort[j].d > pivot) j--; + if (i <= j) { + if (i < j) { + temp = sort[i]; + sort[i] = sort[j]; + sort[j] = temp; + } + i += 1; + j -= 1; + } + } + if (j > (lo + hi) / 2) { + if (lo < j) { + qpos += 1; + qstack[qpos].lo = lo; + qstack[qpos].hi = j; + } + if (i < hi) { + qpos += 1; + qstack[qpos].lo = i; + qstack[qpos].hi = hi; + } + } else { + if (i < hi) { + qpos += 1; + qstack[qpos].lo = i; + qstack[qpos].hi = hi; } - + if (lo < j) { + qpos += 1; + qstack[qpos].lo = lo; + qstack[qpos].hi = j; + } + } } - - + } +} + /** * @brief Sort the particles in the given cell along all cardinal directions. * @@ -221,1147 +173,1183 @@ void runner_dosort_ascending ( struct entry *sort , int N ) { * @param clock Flag indicating whether to record the timing or not, needed * for recursive calls. */ - -void runner_dosort ( struct runner *r , struct cell *c , int flags , int clock ) { - - struct entry *finger; - struct entry *fingers[8]; - struct part *parts = c->parts; - struct entry *sort; - int j, k, count = c->count; - int i, ind, off[8], inds[8], temp_i, missing; - // float shift[3]; - float buff[8], px[3]; - - TIMER_TIC - - /* Clean-up the flags, i.e. filter out what's already been sorted. */ - flags &= ~c->sorted; - if ( flags == 0 ) - return; - - /* start by allocating the entry arrays. */ - if ( c->sort == NULL || c->sortsize < count ) { - if ( c->sort != NULL ) - free( c->sort ); - c->sortsize = count * 1.1; - if ( ( c->sort = (struct entry *)malloc( sizeof(struct entry) * (c->sortsize + 1) * 13 ) ) == NULL ) - error( "Failed to allocate sort memory." ); - } - sort = c->sort; - - /* Does this cell have any progeny? */ - if ( c->split ) { - - /* Fill in the gaps within the progeny. 
*/ - for ( k = 0 ; k < 8 ; k++ ) { - if ( c->progeny[k] == NULL ) - continue; - missing = flags & ~c->progeny[k]->sorted; - if ( missing ) - runner_dosort( r , c->progeny[k] , missing , 0 ); - } - - /* Loop over the 13 different sort arrays. */ - for ( j = 0 ; j < 13 ; j++ ) { - - /* Has this sort array been flagged? */ - if ( !( flags & (1 << j) ) ) - continue; - - /* Init the particle index offsets. */ - for ( off[0] = 0 , k = 1 ; k < 8 ; k++ ) - if ( c->progeny[k-1] != NULL ) - off[k] = off[k-1] + c->progeny[k-1]->count; - else - off[k] = off[k-1]; - - /* Init the entries and indices. */ - for ( k = 0 ; k < 8 ; k++ ) { - inds[k] = k; - if ( c->progeny[k] != NULL && c->progeny[k]->count > 0 ) { - fingers[k] = &c->progeny[k]->sort[ j*(c->progeny[k]->count + 1) ]; - buff[k] = fingers[k]->d; - off[k] = off[k]; - } - else - buff[k] = FLT_MAX; - } - - /* Sort the buffer. */ - for ( i = 0 ; i < 7 ; i++ ) - for ( k = i+1 ; k < 8 ; k++ ) - if ( buff[ inds[k] ] < buff[ inds[i] ] ) { - temp_i = inds[i]; inds[i] = inds[k]; inds[k] = temp_i; - } - - /* For each entry in the new sort list. */ - finger = &sort[ j*(count + 1) ]; - for ( ind = 0 ; ind < count ; ind++ ) { - - /* Copy the minimum into the new sort array. */ - finger[ind].d = buff[inds[0]]; - finger[ind].i = fingers[inds[0]]->i + off[inds[0]]; - - /* Update the buffer. */ - fingers[inds[0]] += 1; - buff[inds[0]] = fingers[inds[0]]->d; - - /* Find the smallest entry. */ - for ( k = 1 ; k < 8 && buff[inds[k]] < buff[inds[k-1]] ; k++ ) { - temp_i = inds[k-1]; inds[k-1] = inds[k]; inds[k] = temp_i; - } - - } /* Merge. */ - - /* Add a sentinel. */ - sort[ j*(count + 1) + count ].d = FLT_MAX; - sort[ j*(count + 1) + count ].i = 0; - - /* Mark as sorted. */ - c->sorted |= ( 1 << j ); - - } /* loop over sort arrays. */ - - } /* progeny? */ - - /* Otherwise, just sort. */ - else { - - /* Fill the sort array. 
*/ - for ( k = 0 ; k < count ; k++ ) { - px[0] = parts[k].x[0]; - px[1] = parts[k].x[1]; - px[2] = parts[k].x[2]; - for ( j = 0 ; j < 13 ; j++ ) - if ( flags & (1 << j) ) { - sort[ j*(count + 1) + k].i = k; - sort[ j*(count + 1) + k].d = px[0]*runner_shift[ 3*j + 0 ] + px[1]*runner_shift[ 3*j + 1 ] + px[2]*runner_shift[ 3*j + 2 ]; - } - } - - /* Add the sentinel and sort. */ - for ( j = 0 ; j < 13 ; j++ ) - if ( flags & (1 << j) ) { - sort[ j*(count + 1) + count ].d = FLT_MAX; - sort[ j*(count + 1) + count ].i = 0; - runner_dosort_ascending( &sort[ j*(count + 1) ] , count ); - c->sorted |= ( 1 << j ); - } - + +void runner_dosort(struct runner *r, struct cell *c, int flags, int clock) { + + struct entry *finger; + struct entry *fingers[8]; + struct part *parts = c->parts; + struct entry *sort; + int j, k, count = c->count; + int i, ind, off[8], inds[8], temp_i, missing; + // float shift[3]; + float buff[8], px[3]; + + TIMER_TIC + + /* Clean-up the flags, i.e. filter out what's already been sorted. */ + flags &= ~c->sorted; + if (flags == 0) return; + + /* start by allocating the entry arrays. */ + if (c->sort == NULL || c->sortsize < count) { + if (c->sort != NULL) free(c->sort); + c->sortsize = count * 1.1; + if ((c->sort = (struct entry *)malloc(sizeof(struct entry) * + (c->sortsize + 1) * 13)) == NULL) + error("Failed to allocate sort memory."); + } + sort = c->sort; + + /* Does this cell have any progeny? */ + if (c->split) { + + /* Fill in the gaps within the progeny. */ + for (k = 0; k < 8; k++) { + if (c->progeny[k] == NULL) continue; + missing = flags & ~c->progeny[k]->sorted; + if (missing) runner_dosort(r, c->progeny[k], missing, 0); + } + + /* Loop over the 13 different sort arrays. */ + for (j = 0; j < 13; j++) { + + /* Has this sort array been flagged? */ + if (!(flags & (1 << j))) continue; + + /* Init the particle index offsets. 
*/ + for (off[0] = 0, k = 1; k < 8; k++) + if (c->progeny[k - 1] != NULL) + off[k] = off[k - 1] + c->progeny[k - 1]->count; + else + off[k] = off[k - 1]; + + /* Init the entries and indices. */ + for (k = 0; k < 8; k++) { + inds[k] = k; + if (c->progeny[k] != NULL && c->progeny[k]->count > 0) { + fingers[k] = &c->progeny[k]->sort[j * (c->progeny[k]->count + 1)]; + buff[k] = fingers[k]->d; + off[k] = off[k]; + } else + buff[k] = FLT_MAX; + } + + /* Sort the buffer. */ + for (i = 0; i < 7; i++) + for (k = i + 1; k < 8; k++) + if (buff[inds[k]] < buff[inds[i]]) { + temp_i = inds[i]; + inds[i] = inds[k]; + inds[k] = temp_i; + } + + /* For each entry in the new sort list. */ + finger = &sort[j * (count + 1)]; + for (ind = 0; ind < count; ind++) { + + /* Copy the minimum into the new sort array. */ + finger[ind].d = buff[inds[0]]; + finger[ind].i = fingers[inds[0]]->i + off[inds[0]]; + + /* Update the buffer. */ + fingers[inds[0]] += 1; + buff[inds[0]] = fingers[inds[0]]->d; + + /* Find the smallest entry. */ + for (k = 1; k < 8 && buff[inds[k]] < buff[inds[k - 1]]; k++) { + temp_i = inds[k - 1]; + inds[k - 1] = inds[k]; + inds[k] = temp_i; } - - /* Verify the sorting. */ - /* for ( j = 0 ; j < 13 ; j++ ) { - if ( !( flags & (1 << j) ) ) - continue; - finger = &sort[ j*(count + 1) ]; - for ( k = 1 ; k < count ; k++ ) { - if ( finger[k].d < finger[k-1].d ) - error( "Sorting failed, ascending array." ); - if ( finger[k].i >= count ) - error( "Sorting failed, indices borked." ); - } - } */ - - #ifdef TIMER_VERBOSE - message( "runner %02i: %i parts at depth %i (flags = %i%i%i%i%i%i%i%i%i%i%i%i%i) took %.3f ms." 
, - r->id , count , c->depth , - (flags & 0x1000) >> 12 , (flags & 0x800) >> 11 , (flags & 0x400) >> 10 , (flags & 0x200) >> 9 , (flags & 0x100) >> 8 , (flags & 0x80) >> 7 , (flags & 0x40) >> 6 , (flags & 0x20) >> 5 , (flags & 0x10) >> 4 , (flags & 0x8) >> 3 , (flags & 0x4) >> 2 , (flags & 0x2) >> 1 , (flags & 0x1) >> 0 , - ((double)TIMER_TOC(timer_dosort)) / CPU_TPS * 1000 ); fflush(stdout); - #else - if ( clock ) - TIMER_TOC(timer_dosort); - #endif + } /* Merge. */ + + /* Add a sentinel. */ + sort[j * (count + 1) + count].d = FLT_MAX; + sort[j * (count + 1) + count].i = 0; + + /* Mark as sorted. */ + c->sorted |= (1 << j); + + } /* loop over sort arrays. */ + + } /* progeny? */ + + /* Otherwise, just sort. */ + else { + + /* Fill the sort array. */ + for (k = 0; k < count; k++) { + px[0] = parts[k].x[0]; + px[1] = parts[k].x[1]; + px[2] = parts[k].x[2]; + for (j = 0; j < 13; j++) + if (flags & (1 << j)) { + sort[j * (count + 1) + k].i = k; + sort[j * (count + 1) + k].d = px[0] * runner_shift[3 * j + 0] + + px[1] * runner_shift[3 * j + 1] + + px[2] * runner_shift[3 * j + 2]; + } } - - -void runner_dogsort ( struct runner *r , struct cell *c , int flags , int clock ) { - - struct entry *finger; - struct entry *fingers[8]; - struct gpart *gparts = c->gparts; - struct entry *gsort; - int j, k, count = c->gcount; - int i, ind, off[8], inds[8], temp_i, missing; - // float shift[3]; - float buff[8], px[3]; - - TIMER_TIC - - /* Clean-up the flags, i.e. filter out what's already been sorted. */ - flags &= ~c->gsorted; - if ( flags == 0 ) - return; - - /* start by allocating the entry arrays. */ - if ( c->gsort == NULL || c->gsortsize < count ) { - if ( c->gsort != NULL ) - free( c->gsort ); - c->gsortsize = count * 1.1; - if ( ( c->gsort = (struct entry *)malloc( sizeof(struct entry) * (c->gsortsize + 1) * 13 ) ) == NULL ) - error( "Failed to allocate sort memory." ); + + /* Add the sentinel and sort. 
*/ + for (j = 0; j < 13; j++) + if (flags & (1 << j)) { + sort[j * (count + 1) + count].d = FLT_MAX; + sort[j * (count + 1) + count].i = 0; + runner_dosort_ascending(&sort[j * (count + 1)], count); + c->sorted |= (1 << j); + } + } + +/* Verify the sorting. */ +/* for ( j = 0 ; j < 13 ; j++ ) { + if ( !( flags & (1 << j) ) ) + continue; + finger = &sort[ j*(count + 1) ]; + for ( k = 1 ; k < count ; k++ ) { + if ( finger[k].d < finger[k-1].d ) + error( "Sorting failed, ascending array." ); + if ( finger[k].i >= count ) + error( "Sorting failed, indices borked." ); } - gsort = c->gsort; - - /* Does this cell have any progeny? */ - if ( c->split ) { - - /* Fill in the gaps within the progeny. */ - for ( k = 0 ; k < 8 ; k++ ) { - if ( c->progeny[k] == NULL ) - continue; - missing = flags & ~c->progeny[k]->gsorted; - if ( missing ) - runner_dogsort( r , c->progeny[k] , missing , 0 ); - } - - /* Loop over the 13 different sort arrays. */ - for ( j = 0 ; j < 13 ; j++ ) { - - /* Has this sort array been flagged? */ - if ( !( flags & (1 << j) ) ) - continue; - - /* Init the particle index offsets. */ - for ( off[0] = 0 , k = 1 ; k < 8 ; k++ ) - if ( c->progeny[k-1] != NULL ) - off[k] = off[k-1] + c->progeny[k-1]->gcount; - else - off[k] = off[k-1]; - - /* Init the entries and indices. */ - for ( k = 0 ; k < 8 ; k++ ) { - inds[k] = k; - if ( c->progeny[k] != NULL && c->progeny[k]->gcount > 0 ) { - fingers[k] = &c->progeny[k]->gsort[ j*(c->progeny[k]->gcount + 1) ]; - buff[k] = fingers[k]->d; - off[k] = off[k]; - } - else - buff[k] = FLT_MAX; - } - - /* Sort the buffer. */ - for ( i = 0 ; i < 7 ; i++ ) - for ( k = i+1 ; k < 8 ; k++ ) - if ( buff[ inds[k] ] < buff[ inds[i] ] ) { - temp_i = inds[i]; inds[i] = inds[k]; inds[k] = temp_i; - } - - /* For each entry in the new sort list. */ - finger = &gsort[ j*(count + 1) ]; - for ( ind = 0 ; ind < count ; ind++ ) { - - /* Copy the minimum into the new sort array. 
*/ - finger[ind].d = buff[inds[0]]; - finger[ind].i = fingers[inds[0]]->i + off[inds[0]]; - - /* Update the buffer. */ - fingers[inds[0]] += 1; - buff[inds[0]] = fingers[inds[0]]->d; - - /* Find the smallest entry. */ - for ( k = 1 ; k < 8 && buff[inds[k]] < buff[inds[k-1]] ; k++ ) { - temp_i = inds[k-1]; inds[k-1] = inds[k]; inds[k] = temp_i; - } - - } /* Merge. */ - - /* Add a sentinel. */ - gsort[ j*(count + 1) + count ].d = FLT_MAX; - gsort[ j*(count + 1) + count ].i = 0; - - /* Mark as sorted. */ - c->gsorted |= ( 1 << j ); - - } /* loop over sort arrays. */ - - } /* progeny? */ - - /* Otherwise, just sort. */ - else { - - /* Fill the sort array. */ - for ( k = 0 ; k < count ; k++ ) { - px[0] = gparts[k].x[0]; - px[1] = gparts[k].x[1]; - px[2] = gparts[k].x[2]; - for ( j = 0 ; j < 13 ; j++ ) - if ( flags & (1 << j) ) { - gsort[ j*(count + 1) + k].i = k; - gsort[ j*(count + 1) + k].d = px[0]*runner_shift[ 3*j + 0 ] + px[1]*runner_shift[ 3*j + 1 ] + px[2]*runner_shift[ 3*j + 2 ]; - } - } - - /* Add the sentinel and sort. 
*/ - for ( j = 0 ; j < 13 ; j++ ) - if ( flags & (1 << j) ) { - gsort[ j*(count + 1) + count ].d = FLT_MAX; - gsort[ j*(count + 1) + count ].i = 0; - runner_dosort_ascending( &gsort[ j*(count + 1) ] , count ); - c->gsorted |= ( 1 << j ); - } - + } */ + +#ifdef TIMER_VERBOSE + message( + "runner %02i: %i parts at depth %i (flags = %i%i%i%i%i%i%i%i%i%i%i%i%i) " + "took %.3f ms.", + r->id, count, c->depth, (flags & 0x1000) >> 12, (flags & 0x800) >> 11, + (flags & 0x400) >> 10, (flags & 0x200) >> 9, (flags & 0x100) >> 8, + (flags & 0x80) >> 7, (flags & 0x40) >> 6, (flags & 0x20) >> 5, + (flags & 0x10) >> 4, (flags & 0x8) >> 3, (flags & 0x4) >> 2, + (flags & 0x2) >> 1, (flags & 0x1) >> 0, + ((double)TIMER_TOC(timer_dosort)) / CPU_TPS * 1000); + fflush(stdout); +#else + if (clock) TIMER_TOC(timer_dosort); +#endif +} + +void runner_dogsort(struct runner *r, struct cell *c, int flags, int clock) { + + struct entry *finger; + struct entry *fingers[8]; + struct gpart *gparts = c->gparts; + struct entry *gsort; + int j, k, count = c->gcount; + int i, ind, off[8], inds[8], temp_i, missing; + // float shift[3]; + float buff[8], px[3]; + + TIMER_TIC + + /* Clean-up the flags, i.e. filter out what's already been sorted. */ + flags &= ~c->gsorted; + if (flags == 0) return; + + /* start by allocating the entry arrays. */ + if (c->gsort == NULL || c->gsortsize < count) { + if (c->gsort != NULL) free(c->gsort); + c->gsortsize = count * 1.1; + if ((c->gsort = (struct entry *)malloc(sizeof(struct entry) * + (c->gsortsize + 1) * 13)) == NULL) + error("Failed to allocate sort memory."); + } + gsort = c->gsort; + + /* Does this cell have any progeny? */ + if (c->split) { + + /* Fill in the gaps within the progeny. */ + for (k = 0; k < 8; k++) { + if (c->progeny[k] == NULL) continue; + missing = flags & ~c->progeny[k]->gsorted; + if (missing) runner_dogsort(r, c->progeny[k], missing, 0); + } + + /* Loop over the 13 different sort arrays. 
*/ + for (j = 0; j < 13; j++) { + + /* Has this sort array been flagged? */ + if (!(flags & (1 << j))) continue; + + /* Init the particle index offsets. */ + for (off[0] = 0, k = 1; k < 8; k++) + if (c->progeny[k - 1] != NULL) + off[k] = off[k - 1] + c->progeny[k - 1]->gcount; + else + off[k] = off[k - 1]; + + /* Init the entries and indices. */ + for (k = 0; k < 8; k++) { + inds[k] = k; + if (c->progeny[k] != NULL && c->progeny[k]->gcount > 0) { + fingers[k] = &c->progeny[k]->gsort[j * (c->progeny[k]->gcount + 1)]; + buff[k] = fingers[k]->d; + off[k] = off[k]; + } else + buff[k] = FLT_MAX; + } + + /* Sort the buffer. */ + for (i = 0; i < 7; i++) + for (k = i + 1; k < 8; k++) + if (buff[inds[k]] < buff[inds[i]]) { + temp_i = inds[i]; + inds[i] = inds[k]; + inds[k] = temp_i; + } + + /* For each entry in the new sort list. */ + finger = &gsort[j * (count + 1)]; + for (ind = 0; ind < count; ind++) { + + /* Copy the minimum into the new sort array. */ + finger[ind].d = buff[inds[0]]; + finger[ind].i = fingers[inds[0]]->i + off[inds[0]]; + + /* Update the buffer. */ + fingers[inds[0]] += 1; + buff[inds[0]] = fingers[inds[0]]->d; + + /* Find the smallest entry. */ + for (k = 1; k < 8 && buff[inds[k]] < buff[inds[k - 1]]; k++) { + temp_i = inds[k - 1]; + inds[k - 1] = inds[k]; + inds[k] = temp_i; } - - /* Verify the sorting. */ - /* for ( j = 0 ; j < 13 ; j++ ) { - if ( !( flags & (1 << j) ) ) - continue; - finger = &c->gsort[ j*(count + 1) ]; - for ( k = 1 ; k < count ; k++ ) { - if ( finger[k].d < finger[k-1].d ) - error( "Sorting failed, ascending array." ); - if ( finger[k].i < 0 || finger[k].i >= count ) - error( "Sorting failed, indices borked." ); - } - } */ - - #ifdef TIMER_VERBOSE - message( "runner %02i: %i parts at depth %i (flags = %i%i%i%i%i%i%i%i%i%i%i%i%i) took %.3f ms." 
, - r->id , count , c->depth , - (flags & 0x1000) >> 12 , (flags & 0x800) >> 11 , (flags & 0x400) >> 10 , (flags & 0x200) >> 9 , (flags & 0x100) >> 8 , (flags & 0x80) >> 7 , (flags & 0x40) >> 6 , (flags & 0x20) >> 5 , (flags & 0x10) >> 4 , (flags & 0x8) >> 3 , (flags & 0x4) >> 2 , (flags & 0x2) >> 1 , (flags & 0x1) >> 0 , - ((double)TIMER_TOC(timer_dosort)) / CPU_TPS * 1000 ); fflush(stdout); - #else - if ( clock ) - TIMER_TOC(timer_dosort); - #endif + } /* Merge. */ + + /* Add a sentinel. */ + gsort[j * (count + 1) + count].d = FLT_MAX; + gsort[j * (count + 1) + count].i = 0; + + /* Mark as sorted. */ + c->gsorted |= (1 << j); + + } /* loop over sort arrays. */ + + } /* progeny? */ + + /* Otherwise, just sort. */ + else { + + /* Fill the sort array. */ + for (k = 0; k < count; k++) { + px[0] = gparts[k].x[0]; + px[1] = gparts[k].x[1]; + px[2] = gparts[k].x[2]; + for (j = 0; j < 13; j++) + if (flags & (1 << j)) { + gsort[j * (count + 1) + k].i = k; + gsort[j * (count + 1) + k].d = px[0] * runner_shift[3 * j + 0] + + px[1] * runner_shift[3 * j + 1] + + px[2] * runner_shift[3 * j + 2]; + } } - - + + /* Add the sentinel and sort. */ + for (j = 0; j < 13; j++) + if (flags & (1 << j)) { + gsort[j * (count + 1) + count].d = FLT_MAX; + gsort[j * (count + 1) + count].i = 0; + runner_dosort_ascending(&gsort[j * (count + 1)], count); + c->gsorted |= (1 << j); + } + } + +/* Verify the sorting. */ +/* for ( j = 0 ; j < 13 ; j++ ) { + if ( !( flags & (1 << j) ) ) + continue; + finger = &c->gsort[ j*(count + 1) ]; + for ( k = 1 ; k < count ; k++ ) { + if ( finger[k].d < finger[k-1].d ) + error( "Sorting failed, ascending array." ); + if ( finger[k].i < 0 || finger[k].i >= count ) + error( "Sorting failed, indices borked." 
); + } + } */ + +#ifdef TIMER_VERBOSE + message( + "runner %02i: %i parts at depth %i (flags = %i%i%i%i%i%i%i%i%i%i%i%i%i) " + "took %.3f ms.", + r->id, count, c->depth, (flags & 0x1000) >> 12, (flags & 0x800) >> 11, + (flags & 0x400) >> 10, (flags & 0x200) >> 9, (flags & 0x100) >> 8, + (flags & 0x80) >> 7, (flags & 0x40) >> 6, (flags & 0x20) >> 5, + (flags & 0x10) >> 4, (flags & 0x8) >> 3, (flags & 0x4) >> 2, + (flags & 0x2) >> 1, (flags & 0x1) >> 0, + ((double)TIMER_TOC(timer_dosort)) / CPU_TPS * 1000); + fflush(stdout); +#else + if (clock) TIMER_TOC(timer_dosort); +#endif +} + /** * @brief Intermediate task between density and force * * @param r The runner thread. * @param c The cell. */ - -void runner_doghost ( struct runner *r , struct cell *c ) { - - struct part *p, *parts = c->parts; - struct cell *finger; - int i, k, redo, count = c->count; - int *pid; - float h, ih, ih2, ih4, h_corr, rho, wcount, rho_dh, wcount_dh, u, fc; - float normDiv_v, normCurl_v; + +void runner_doghost(struct runner *r, struct cell *c) { + + struct part *p, *parts = c->parts; + struct cell *finger; + int i, k, redo, count = c->count; + int *pid; + float h, ih, ih2, ih4, h_corr, rho, wcount, rho_dh, wcount_dh, u, fc; + float normDiv_v, normCurl_v; #ifndef LEGACY_GADGET2_SPH - float alpha_dot, tau, S; + float alpha_dot, tau, S; #endif - float dt_step = r->e->dt_step; - TIMER_TIC - - /* Recurse? */ - if ( c->split ) { - for ( k = 0 ; k < 8 ; k++ ) - if ( c->progeny[k] != NULL ) - runner_doghost( r , c->progeny[k] ); - return; + float dt_step = r->e->dt_step; + TIMER_TIC + + /* Recurse? */ + if (c->split) { + for (k = 0; k < 8; k++) + if (c->progeny[k] != NULL) runner_doghost(r, c->progeny[k]); + return; + } + + /* Init the IDs that have to be updated. */ + if ((pid = (int *)alloca(sizeof(int) * count)) == NULL) + error("Call to alloca failed."); + for (k = 0; k < count; k++) pid[k] = k; + + /* While there are particles that need to be updated... 
*/ + while (count > 0) { + + /* Reset the redo-count. */ + redo = 0; + + /* Loop over the parts in this cell. */ + __builtin_prefetch(&parts[pid[0]], 0, 1); + __builtin_prefetch(&parts[pid[0]].rho_dh, 0, 1); + __builtin_prefetch(&parts[pid[1]], 0, 1); + __builtin_prefetch(&parts[pid[1]].rho_dh, 0, 1); + __builtin_prefetch(&parts[pid[2]], 0, 1); + __builtin_prefetch(&parts[pid[2]].rho_dh, 0, 1); + for (i = 0; i < count; i++) { + + /* Get a direct pointer on the part. */ + __builtin_prefetch(&parts[pid[i + 3]], 0, 1); + __builtin_prefetch(&parts[pid[i + 3]].rho_dh, 0, 1); + p = &parts[pid[i]]; + + /* Is this part within the timestep? */ + if (p->dt <= dt_step) { + + /* Some smoothing length multiples. */ + h = p->h; + ih = 1.0f / h; + ih2 = ih * ih; + ih4 = ih2 * ih2; + + /* Final operation on the density. */ + p->rho = rho = ih * ih2 * (p->rho + p->mass * kernel_root); + p->rho_dh = rho_dh = (p->rho_dh - 3.0f * p->mass * kernel_root) * ih4; + wcount = (p->density.wcount + kernel_root) * + (4.0f / 3.0 * M_PI * kernel_gamma3); + wcount_dh = + p->density.wcount_dh * ih * (4.0f / 3.0 * M_PI * kernel_gamma3); + + /* If no derivative, double the smoothing length. */ + if (wcount_dh == 0.0f) h_corr = p->h; + + /* Otherwise, compute the smoothing length update (Newton step). */ + else { + h_corr = (kernel_nwneigh - wcount) / wcount_dh; + + /* Truncate to the range [ -p->h/2 , p->h ]. */ + h_corr = fminf(h_corr, h); + h_corr = fmaxf(h_corr, -h / 2.f); } - - /* Init the IDs that have to be updated. */ - if ( ( pid = (int *)alloca( sizeof(int) * count ) ) == NULL ) - error( "Call to alloca failed." ); - for ( k = 0 ; k < count ; k++ ) - pid[k] = k; - - /* While there are particles that need to be updated... */ - while ( count > 0 ) { - - /* Reset the redo-count. */ - redo = 0; - - /* Loop over the parts in this cell. 
*/ - __builtin_prefetch( &parts[ pid[0] ] , 0 , 1 ); - __builtin_prefetch( &parts[ pid[0] ].rho_dh , 0 , 1 ); - __builtin_prefetch( &parts[ pid[1] ] , 0 , 1 ); - __builtin_prefetch( &parts[ pid[1] ].rho_dh , 0 , 1 ); - __builtin_prefetch( &parts[ pid[2] ] , 0 , 1 ); - __builtin_prefetch( &parts[ pid[2] ].rho_dh , 0 , 1 ); - for ( i = 0 ; i < count ; i++ ) { - - /* Get a direct pointer on the part. */ - __builtin_prefetch( &parts[ pid[i+3] ] , 0 , 1 ); - __builtin_prefetch( &parts[ pid[i+3] ].rho_dh , 0 , 1 ); - p = &parts[ pid[i] ]; - - /* Is this part within the timestep? */ - if ( p->dt <= dt_step ) { - - /* Some smoothing length multiples. */ - h = p->h; - ih = 1.0f / h; - ih2 = ih * ih; - ih4 = ih2 * ih2; - - /* Final operation on the density. */ - p->rho = rho = ih * ih2 * ( p->rho + p->mass*kernel_root ); - p->rho_dh = rho_dh = ( p->rho_dh - 3.0f*p->mass*kernel_root ) * ih4; - wcount = ( p->density.wcount + kernel_root ) * ( 4.0f / 3.0 * M_PI * kernel_gamma3 ); - wcount_dh = p->density.wcount_dh * ih * ( 4.0f / 3.0 * M_PI * kernel_gamma3 ); - - /* If no derivative, double the smoothing length. */ - if ( wcount_dh == 0.0f ) - h_corr = p->h; - - /* Otherwise, compute the smoothing length update (Newton step). */ - else { - h_corr = ( kernel_nwneigh - wcount ) / wcount_dh; - - /* Truncate to the range [ -p->h/2 , p->h ]. */ - h_corr = fminf( h_corr , h ); - h_corr = fmaxf( h_corr , -h/2.f ); - - } - - /* Apply the correction to p->h and to the compact part. */ - p->h += h_corr; - - /* Did we get the right number density? */ - if ( wcount > kernel_nwneigh + const_delta_nwneigh || - wcount < kernel_nwneigh - const_delta_nwneigh ) { - // message( "particle %lli (h=%e,depth=%i) has bad wcount=%.3f." 
, p->id , p->h , c->depth , wcount ); fflush(stdout); - // p->h += ( p->density.wcount + kernel_root - kernel_nwneigh ) / p->density.wcount_dh; - pid[redo] = pid[i]; - redo += 1; - p->density.wcount = 0.0; - p->density.wcount_dh = 0.0; - p->rho = 0.0; - p->rho_dh = 0.0; - p->density.div_v = 0.0; - for ( k=0 ; k < 3 ; k++) - p->density.curl_v[k] = 0.0; - continue; - } - - /* Pre-compute some stuff for the balsara switch. */ - normDiv_v = fabs( p->density.div_v / rho * ih4 ); - normCurl_v = sqrtf( p->density.curl_v[0] * p->density.curl_v[0] + p->density.curl_v[1] * p->density.curl_v[1] + p->density.curl_v[2] * p->density.curl_v[2] ) / rho * ih4; - - /* As of here, particle force variables will be set. Do _NOT_ - try to read any particle density variables! */ - - /* Compute this particle's sound speed. */ - u = p->u; - p->force.c = fc = sqrtf( const_hydro_gamma * ( const_hydro_gamma - 1.0f ) * u ); - - /* Compute the P/Omega/rho2. */ - p->force.POrho2 = u * ( const_hydro_gamma - 1.0f ) / ( rho + h * rho_dh / 3.0f ); - - /* Balsara switch */ - p->force.balsara = normDiv_v / ( normDiv_v + normCurl_v + 0.0001f * fc * ih ); - - #ifndef LEGACY_GADGET2_SPH - /* Viscosity parameter decay time */ - tau = h / ( 2.f * const_viscosity_length * p->force.c ); - - /* Viscosity source term */ - S = fmaxf( -normDiv_v, 0.f ); - - /* Compute the particle's viscosity parameter time derivative */ - alpha_dot = ( const_viscosity_alpha_min - p->alpha ) / tau + ( const_viscosity_alpha_max - p->alpha ) * S; - - /* Update particle's viscosity paramter */ - p->alpha += alpha_dot * p->dt; - #endif - - /* Reset the acceleration. */ - for ( k = 0 ; k < 3 ; k++ ) - p->a[k] = 0.0f; - - /* Reset the time derivatives. */ - p->force.u_dt = 0.0f; - p->force.h_dt = 0.0f; - p->force.v_sig = 0.0f; - - } - - } - - /* Re-set the counter for the next loop (potentially). */ - count = redo; - if ( count > 0 ) { - - // error( "Bad smoothing length, fixing this isn't implemented yet." 
); - - /* Climb up the cell hierarchy. */ - for ( finger = c ; finger != NULL ; finger = finger->parent ) { - - /* Run through this cell's density interactions. */ - for ( struct link *l = finger->density ; l != NULL ; l = l->next ) { - - /* Self-interaction? */ - if ( l->t->type == task_type_self ) - runner_doself_subset_density( r , finger , parts , pid , count ); - - /* Otherwise, pair interaction? */ - else if ( l->t->type == task_type_pair ) { - - /* Left or right? */ - if ( l->t->ci == finger ) - runner_dopair_subset_density( r , finger , parts , pid , count , l->t->cj ); - else - runner_dopair_subset_density( r , finger , parts , pid , count , l->t->ci ); - - } - - /* Otherwise, sub interaction? */ - else if ( l->t->type == task_type_sub ) { - - /* Left or right? */ - if ( l->t->ci == finger ) - runner_dosub_subset_density( r , finger , parts , pid , count , l->t->cj , -1 , 1 ); - else - runner_dosub_subset_density( r , finger , parts , pid , count , l->t->ci , -1 , 1 ); - - } - - } - - } - - } - + + /* Apply the correction to p->h and to the compact part. */ + p->h += h_corr; + + /* Did we get the right number density? */ + if (wcount > kernel_nwneigh + const_delta_nwneigh || + wcount < kernel_nwneigh - const_delta_nwneigh) { + // message( "particle %lli (h=%e,depth=%i) has bad wcount=%.3f." , + // p->id , p->h , c->depth , wcount ); fflush(stdout); + // p->h += ( p->density.wcount + kernel_root - kernel_nwneigh ) / + // p->density.wcount_dh; + pid[redo] = pid[i]; + redo += 1; + p->density.wcount = 0.0; + p->density.wcount_dh = 0.0; + p->rho = 0.0; + p->rho_dh = 0.0; + p->density.div_v = 0.0; + for (k = 0; k < 3; k++) p->density.curl_v[k] = 0.0; + continue; } - #ifdef TIMER_VERBOSE - message( "runner %02i: %i parts at depth %i took %.3f ms." , - r->id , c->count , c->depth , - ((double)TIMER_TOC(timer_doghost)) / CPU_TPS * 1000 ); fflush(stdout); - #else - TIMER_TOC(timer_doghost); - #endif - + /* Pre-compute some stuff for the balsara switch. 
*/ + normDiv_v = fabs(p->density.div_v / rho * ih4); + normCurl_v = sqrtf(p->density.curl_v[0] * p->density.curl_v[0] + + p->density.curl_v[1] * p->density.curl_v[1] + + p->density.curl_v[2] * p->density.curl_v[2]) / + rho * ih4; + + /* As of here, particle force variables will be set. Do _NOT_ + try to read any particle density variables! */ + + /* Compute this particle's sound speed. */ + u = p->u; + p->force.c = fc = + sqrtf(const_hydro_gamma * (const_hydro_gamma - 1.0f) * u); + + /* Compute the P/Omega/rho2. */ + p->force.POrho2 = + u * (const_hydro_gamma - 1.0f) / (rho + h * rho_dh / 3.0f); + + /* Balsara switch */ + p->force.balsara = + normDiv_v / (normDiv_v + normCurl_v + 0.0001f * fc * ih); + +#ifndef LEGACY_GADGET2_SPH + /* Viscosity parameter decay time */ + tau = h / (2.f * const_viscosity_length * p->force.c); + + /* Viscosity source term */ + S = fmaxf(-normDiv_v, 0.f); + + /* Compute the particle's viscosity parameter time derivative */ + alpha_dot = (const_viscosity_alpha_min - p->alpha) / tau + + (const_viscosity_alpha_max - p->alpha) * S; + + /* Update particle's viscosity paramter */ + p->alpha += alpha_dot * p->dt; +#endif + + /* Reset the acceleration. */ + for (k = 0; k < 3; k++) p->a[k] = 0.0f; + + /* Reset the time derivatives. */ + p->force.u_dt = 0.0f; + p->force.h_dt = 0.0f; + p->force.v_sig = 0.0f; + } + } + + /* Re-set the counter for the next loop (potentially). */ + count = redo; + if (count > 0) { + + // error( "Bad smoothing length, fixing this isn't implemented yet." ); + + /* Climb up the cell hierarchy. */ + for (finger = c; finger != NULL; finger = finger->parent) { + + /* Run through this cell's density interactions. */ + for (struct link *l = finger->density; l != NULL; l = l->next) { + + /* Self-interaction? */ + if (l->t->type == task_type_self) + runner_doself_subset_density(r, finger, parts, pid, count); + + /* Otherwise, pair interaction? */ + else if (l->t->type == task_type_pair) { + + /* Left or right? 
*/ + if (l->t->ci == finger) + runner_dopair_subset_density(r, finger, parts, pid, count, + l->t->cj); + else + runner_dopair_subset_density(r, finger, parts, pid, count, + l->t->ci); + + } + + /* Otherwise, sub interaction? */ + else if (l->t->type == task_type_sub) { + + /* Left or right? */ + if (l->t->ci == finger) + runner_dosub_subset_density(r, finger, parts, pid, count, + l->t->cj, -1, 1); + else + runner_dosub_subset_density(r, finger, parts, pid, count, + l->t->ci, -1, 1); + } + } + } } - - + } + +#ifdef TIMER_VERBOSE + message("runner %02i: %i parts at depth %i took %.3f ms.", r->id, c->count, + c->depth, ((double)TIMER_TOC(timer_doghost)) / CPU_TPS * 1000); + fflush(stdout); +#else + TIMER_TOC(timer_doghost); +#endif +} + /** * @brief Compute the second kick of the given cell. * * @param r The runner thread. * @param c The cell. */ - -void runner_dokick2 ( struct runner *r , struct cell *c ) { - - int j, k, count = 0, nr_parts = c->count; - float dt_min = FLT_MAX, dt_max = 0.0f; - double ekin = 0.0, epot = 0.0; - float mom[3] = { 0.0f , 0.0f , 0.0f }, ang[3] = { 0.0f , 0.0f , 0.0f }; - float x[3], v_hdt[3], u_hdt, h, pdt, m; - float dt_step = r->e->dt_step, dt = r->e->dt, hdt, idt; - float dt_cfl, dt_h_change, dt_u_change, dt_new; - float h_dt, u_dt; - struct part *restrict p, *restrict parts = c->parts; - struct xpart *restrict xp, *restrict xparts = c->xparts; - - TIMER_TIC - - /* Init idt to avoid compiler stupidity. */ - idt = ( dt > 0 ) ? 1.0f / dt : 0.0f; - hdt = dt / 2; - - /* Loop over the particles and kick them. 
*/ - __builtin_prefetch( &parts[0] , 0 , 1 ); - __builtin_prefetch( &parts[0].rho_dh , 0 , 1 ); - __builtin_prefetch( &xparts[0] , 0 , 1 ); - __builtin_prefetch( &parts[1] , 0 , 1 ); - __builtin_prefetch( &parts[1].rho_dh , 0 , 1 ); - __builtin_prefetch( &xparts[1] , 0 , 1 ); - __builtin_prefetch( &parts[2] , 0 , 1 ); - __builtin_prefetch( &parts[2].rho_dh , 0 , 1 ); - __builtin_prefetch( &xparts[2] , 0 , 1 ); - for ( k = 0 ; k < nr_parts ; k++ ) { - - /* Get a handle on the part. */ - __builtin_prefetch( &parts[k+3] , 0 , 1 ); - __builtin_prefetch( &parts[k+3].rho_dh , 0 , 1 ); - __builtin_prefetch( &xparts[k+3] , 0 , 1 ); - p = &parts[k]; - xp = &xparts[k]; - - /* Get local copies of particle data. */ - pdt = p->dt; - m = p->mass; - x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2]; - v_hdt[0] = xp->v_hdt[0]; v_hdt[1] = xp->v_hdt[1]; v_hdt[2] = xp->v_hdt[2]; - u_hdt = xp->u_hdt; - - /* Update the particle's data (if active). */ - if ( pdt <= dt_step ) { - - /* Increase the number of particles updated. */ - count += 1; - - /* Scale the derivatives as they're freshly computed. */ - h = p->h; - h_dt = p->force.h_dt *= h * 0.333333333f; - xp->omega = 1.0f + h * p->rho_dh / p->rho * 0.3333333333f; - - /* Compute the new time step. */ - u_dt = p->force.u_dt; - dt_cfl = const_cfl * h / p->force.v_sig; - dt_h_change = ( h_dt != 0.0f ) ? fabsf( const_ln_max_h_change * h / h_dt ) : FLT_MAX; - dt_u_change = ( u_dt != 0.0f ) ? fabsf( const_max_u_change * p->u / u_dt ) : FLT_MAX; - dt_new = fminf( dt_cfl , fminf( dt_h_change , dt_u_change ) ); - if ( pdt == 0.0f ) - p->dt = pdt = dt_new; - else - p->dt = pdt = fminf( dt_new , 2.0f*pdt ); - - /* Update positions and energies at the full step. */ - p->v[0] = v_hdt[0] + hdt * p->a[0]; - p->v[1] = v_hdt[1] + hdt * p->a[1]; - p->v[2] = v_hdt[2] + hdt * p->a[2]; - p->u = u_hdt + hdt * u_dt; - - /* Set the new particle-specific time step. 
*/ - if ( dt > 0.0f ) { - float dt_curr = dt; - j = (int)( pdt * idt ); - while ( j > 1 ) { - dt_curr *= 2.0f; - j >>= 1; - } - xp->dt_curr = dt_curr; - } - - } - - /* Get the smallest/largest dt. */ - dt_min = fminf( dt_min , pdt ); - dt_max = fmaxf( dt_max , pdt ); - - /* Collect total energy. */ - ekin += 0.5 * m * ( v_hdt[0]*v_hdt[0] + v_hdt[1]*v_hdt[1] + v_hdt[2]*v_hdt[2] ); - epot += m * u_hdt; - - /* Collect momentum */ - mom[0] += m * v_hdt[0]; - mom[1] += m * v_hdt[1]; - mom[2] += m * v_hdt[2]; - - /* Collect angular momentum */ - ang[0] += m * ( x[1]*v_hdt[2] - x[2]*v_hdt[1] ); - ang[1] += m * ( x[2]*v_hdt[0] - x[0]*v_hdt[2] ); - ang[2] += m * ( x[0]*v_hdt[1] - x[1]*v_hdt[0] ); - - /* Collect entropic function */ - // lent += u * pow( p->rho, 1.f-const_gamma ); +void runner_dokick2(struct runner *r, struct cell *c) { + + int j, k, count = 0, nr_parts = c->count; + float dt_min = FLT_MAX, dt_max = 0.0f; + double ekin = 0.0, epot = 0.0; + float mom[3] = {0.0f, 0.0f, 0.0f}, ang[3] = {0.0f, 0.0f, 0.0f}; + float x[3], v_hdt[3], u_hdt, h, pdt, m; + float dt_step = r->e->dt_step, dt = r->e->dt, hdt, idt; + float dt_cfl, dt_h_change, dt_u_change, dt_new; + float h_dt, u_dt; + struct part *restrict p, *restrict parts = c->parts; + struct xpart *restrict xp, *restrict xparts = c->xparts; + + TIMER_TIC + + /* Init idt to avoid compiler stupidity. */ + idt = (dt > 0) ? 1.0f / dt : 0.0f; + hdt = dt / 2; + + /* Loop over the particles and kick them. */ + __builtin_prefetch(&parts[0], 0, 1); + __builtin_prefetch(&parts[0].rho_dh, 0, 1); + __builtin_prefetch(&xparts[0], 0, 1); + __builtin_prefetch(&parts[1], 0, 1); + __builtin_prefetch(&parts[1].rho_dh, 0, 1); + __builtin_prefetch(&xparts[1], 0, 1); + __builtin_prefetch(&parts[2], 0, 1); + __builtin_prefetch(&parts[2].rho_dh, 0, 1); + __builtin_prefetch(&xparts[2], 0, 1); + for (k = 0; k < nr_parts; k++) { + + /* Get a handle on the part. 
*/ + __builtin_prefetch(&parts[k + 3], 0, 1); + __builtin_prefetch(&parts[k + 3].rho_dh, 0, 1); + __builtin_prefetch(&xparts[k + 3], 0, 1); + p = &parts[k]; + xp = &xparts[k]; + + /* Get local copies of particle data. */ + pdt = p->dt; + m = p->mass; + x[0] = p->x[0]; + x[1] = p->x[1]; + x[2] = p->x[2]; + v_hdt[0] = xp->v_hdt[0]; + v_hdt[1] = xp->v_hdt[1]; + v_hdt[2] = xp->v_hdt[2]; + u_hdt = xp->u_hdt; + + /* Update the particle's data (if active). */ + if (pdt <= dt_step) { + + /* Increase the number of particles updated. */ + count += 1; + + /* Scale the derivatives as they're freshly computed. */ + h = p->h; + h_dt = p->force.h_dt *= h * 0.333333333f; + xp->omega = 1.0f + h * p->rho_dh / p->rho * 0.3333333333f; + + /* Compute the new time step. */ + u_dt = p->force.u_dt; + dt_cfl = const_cfl * h / p->force.v_sig; + dt_h_change = + (h_dt != 0.0f) ? fabsf(const_ln_max_h_change * h / h_dt) : FLT_MAX; + dt_u_change = + (u_dt != 0.0f) ? fabsf(const_max_u_change * p->u / u_dt) : FLT_MAX; + dt_new = fminf(dt_cfl, fminf(dt_h_change, dt_u_change)); + if (pdt == 0.0f) + p->dt = pdt = dt_new; + else + p->dt = pdt = fminf(dt_new, 2.0f * pdt); + + /* Update positions and energies at the full step. */ + p->v[0] = v_hdt[0] + hdt * p->a[0]; + p->v[1] = v_hdt[1] + hdt * p->a[1]; + p->v[2] = v_hdt[2] + hdt * p->a[2]; + p->u = u_hdt + hdt * u_dt; + + /* Set the new particle-specific time step. */ + if (dt > 0.0f) { + float dt_curr = dt; + j = (int)(pdt * idt); + while (j > 1) { + dt_curr *= 2.0f; + j >>= 1; } - - #ifdef TIMER_VERBOSE - message( "runner %02i: %i parts at depth %i took %.3f ms." , - r->id , c->count , c->depth , - ((double)TIMER_TOC(timer_kick2)) / CPU_TPS * 1000 ); fflush(stdout); - #else - TIMER_TOC(timer_kick2); - #endif - - /* Store the computed values in the cell. 
*/ - c->dt_min = dt_min; - c->dt_max = dt_max; - c->updated = count; - c->ekin = ekin; - c->epot = epot; - c->mom[0] = mom[0]; c->mom[1] = mom[1]; c->mom[2] = mom[2]; - c->ang[0] = ang[0]; c->ang[1] = ang[1]; c->ang[2] = ang[2]; - + xp->dt_curr = dt_curr; + } } + /* Get the smallest/largest dt. */ + dt_min = fminf(dt_min, pdt); + dt_max = fmaxf(dt_max, pdt); + + /* Collect total energy. */ + ekin += 0.5 * m * + (v_hdt[0] * v_hdt[0] + v_hdt[1] * v_hdt[1] + v_hdt[2] * v_hdt[2]); + epot += m * u_hdt; + + /* Collect momentum */ + mom[0] += m * v_hdt[0]; + mom[1] += m * v_hdt[1]; + mom[2] += m * v_hdt[2]; + + /* Collect angular momentum */ + ang[0] += m * (x[1] * v_hdt[2] - x[2] * v_hdt[1]); + ang[1] += m * (x[2] * v_hdt[0] - x[0] * v_hdt[2]); + ang[2] += m * (x[0] * v_hdt[1] - x[1] * v_hdt[0]); + + /* Collect entropic function */ + // lent += u * pow( p->rho, 1.f-const_gamma ); + } + +#ifdef TIMER_VERBOSE + message("runner %02i: %i parts at depth %i took %.3f ms.", r->id, c->count, + c->depth, ((double)TIMER_TOC(timer_kick2)) / CPU_TPS * 1000); + fflush(stdout); +#else + TIMER_TOC(timer_kick2); +#endif + + /* Store the computed values in the cell. */ + c->dt_min = dt_min; + c->dt_max = dt_max; + c->updated = count; + c->ekin = ekin; + c->epot = epot; + c->mom[0] = mom[0]; + c->mom[1] = mom[1]; + c->mom[2] = mom[2]; + c->ang[0] = ang[0]; + c->ang[1] = ang[1]; + c->ang[2] = ang[2]; +} /** * @brief Mapping function to set dt_min and dt_max, do the first * kick. */ -void runner_dokick1 ( struct runner *r , struct cell *c ) { - - int j, k; - struct engine *e = r->e; - float pdt, dt_step = e->dt_step, dt = e->dt, hdt = dt/2; - float dt_min, dt_max, h_max, dx, dx_max; - float a[3], v[3], u, u_dt, h, h_dt, w, rho; - double x[3], x_old[3]; - struct part *restrict p, *restrict parts = c->parts; - struct xpart *restrict xp, *restrict xparts = c->xparts; - - /* No children? */ - if ( !c->split ) { - - /* Init the min/max counters. 
*/ - dt_min = FLT_MAX; - dt_max = 0.0f; - h_max = 0.0f; - dx_max = 0.0f; - - /* Loop over parts. */ - __builtin_prefetch( &parts[0] , 0 , 1 ); - __builtin_prefetch( &parts[0].rho_dh , 0 , 1 ); - __builtin_prefetch( &xparts[0] , 0 , 1 ); - __builtin_prefetch( &parts[1] , 0 , 1 ); - __builtin_prefetch( &parts[1].rho_dh , 0 , 1 ); - __builtin_prefetch( &xparts[1] , 0 , 1 ); - __builtin_prefetch( &parts[2] , 0 , 1 ); - __builtin_prefetch( &parts[2].rho_dh , 0 , 1 ); - __builtin_prefetch( &xparts[2] , 0 , 1 ); - for ( k = 0 ; k < c->count ; k++ ) { - - /* Get a handle on the kth particle. */ - __builtin_prefetch( &parts[k+3] , 0 , 1 ); - __builtin_prefetch( &parts[k+3].rho_dh , 0 , 1 ); - __builtin_prefetch( &xparts[k+3] , 0 , 1 ); - p = &parts[k]; - xp = &xparts[k]; - - /* Load the data locally. */ - a[0] = p->a[0]; a[1] = p->a[1]; a[2] = p->a[2]; - v[0] = p->v[0]; v[1] = p->v[1]; v[2] = p->v[2]; - x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2]; - x_old[0] = xp->x_old[0]; x_old[1] = xp->x_old[1]; x_old[2] = xp->x_old[2]; - h = p->h; - u = p->u; - h_dt = p->force.h_dt; - u_dt = p->force.u_dt; - pdt = p->dt; - - /* Store the min/max dt. */ - dt_min = fminf( dt_min , pdt ); - dt_max = fmaxf( dt_max , pdt ); - - /* Update the half-step velocities from the current velocities. */ - xp->v_hdt[0] = v[0] + hdt * a[0]; - xp->v_hdt[1] = v[1] + hdt * a[1]; - xp->v_hdt[2] = v[2] + hdt * a[2]; - xp->u_hdt = u + hdt * u_dt; - - /* Move the particles with the velocities at the half-step. */ - p->x[0] = x[0] += dt * xp->v_hdt[0]; - p->x[1] = x[1] += dt * xp->v_hdt[1]; - p->x[2] = x[2] += dt * xp->v_hdt[2]; - dx = sqrtf( (x[0] - x_old[0])*(x[0] - x_old[0]) + - (x[1] - x_old[1])*(x[1] - x_old[1]) + - (x[2] - x_old[2])*(x[2] - x_old[2]) ); - dx_max = fmaxf( dx_max , dx ); - - /* Update positions and energies at the half-step. 
*/ - p->v[0] = v[0] + dt * a[0]; - p->v[1] = v[1] + dt * a[1]; - p->v[2] = v[2] + dt * a[2]; - w = u_dt / u * dt; - if ( fabsf( w ) < 0.01f ) - p->u = u *= 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) ); - else - p->u = u *= expf( w ); - w = h_dt / h * dt; - if ( fabsf( w ) < 0.01f ) - p->h = h *= 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) ); - else - p->h = h *= expf( w ); - h_max = fmaxf( h_max , h ); - - - /* Integrate other values if this particle will not be updated. */ - /* Init fields for density calculation. */ - if ( pdt > dt_step ) { - float w = -3.0f * h_dt / h * dt; - if ( fabsf( w ) < 0.1f ) - rho = p->rho *= 1.0f + w*( 1.0f + w*( 0.5f + w*(1.0f/6.0f + 1.0f/24.0f*w ) ) ); - else - rho = p->rho *= expf( w ); - p->force.POrho2 = u * ( const_hydro_gamma - 1.0f ) / ( rho * xp->omega ); - } - else { - p->density.wcount = 0.0f; - p->density.wcount_dh = 0.0f; - p->rho = 0.0f; - p->rho_dh = 0.0f; - p->density.div_v = 0.0f; - for ( j = 0 ; j < 3 ; ++j) - p->density.curl_v[j] = 0.0f; - } - - } - - } - - /* Otherwise, agregate data from children. */ - else { - - /* Init with the first non-null child. */ - dt_min = FLT_MAX; - dt_max = 0.0f; - h_max = 0.0f; - dx_max = 0.0f; - - /* Loop over the progeny. */ - for ( k = 0 ; k < 8 ; k++ ) - if ( c->progeny[k] != NULL ) { - if ( c->count < space_subsize ) - runner_dokick1( r , c->progeny[k] ); - dt_min = fminf( dt_min , c->progeny[k]->dt_min ); - dt_max = fmaxf( dt_max , c->progeny[k]->dt_max ); - h_max = fmaxf( h_max , c->progeny[k]->h_max ); - dx_max = fmaxf( dx_max , c->progeny[k]->dx_max ); - } - - } - - /* Store the values. 
*/ - c->dt_min = dt_min; - c->dt_max = dt_max; - c->h_max = h_max; - c->dx_max = dx_max; - +void runner_dokick1(struct runner *r, struct cell *c) { + + int j, k; + struct engine *e = r->e; + float pdt, dt_step = e->dt_step, dt = e->dt, hdt = dt / 2; + float dt_min, dt_max, h_max, dx, dx_max; + float a[3], v[3], u, u_dt, h, h_dt, w, rho; + double x[3], x_old[3]; + struct part *restrict p, *restrict parts = c->parts; + struct xpart *restrict xp, *restrict xparts = c->xparts; + + /* No children? */ + if (!c->split) { + + /* Init the min/max counters. */ + dt_min = FLT_MAX; + dt_max = 0.0f; + h_max = 0.0f; + dx_max = 0.0f; + + /* Loop over parts. */ + __builtin_prefetch(&parts[0], 0, 1); + __builtin_prefetch(&parts[0].rho_dh, 0, 1); + __builtin_prefetch(&xparts[0], 0, 1); + __builtin_prefetch(&parts[1], 0, 1); + __builtin_prefetch(&parts[1].rho_dh, 0, 1); + __builtin_prefetch(&xparts[1], 0, 1); + __builtin_prefetch(&parts[2], 0, 1); + __builtin_prefetch(&parts[2].rho_dh, 0, 1); + __builtin_prefetch(&xparts[2], 0, 1); + for (k = 0; k < c->count; k++) { + + /* Get a handle on the kth particle. */ + __builtin_prefetch(&parts[k + 3], 0, 1); + __builtin_prefetch(&parts[k + 3].rho_dh, 0, 1); + __builtin_prefetch(&xparts[k + 3], 0, 1); + p = &parts[k]; + xp = &xparts[k]; + + /* Load the data locally. */ + a[0] = p->a[0]; + a[1] = p->a[1]; + a[2] = p->a[2]; + v[0] = p->v[0]; + v[1] = p->v[1]; + v[2] = p->v[2]; + x[0] = p->x[0]; + x[1] = p->x[1]; + x[2] = p->x[2]; + x_old[0] = xp->x_old[0]; + x_old[1] = xp->x_old[1]; + x_old[2] = xp->x_old[2]; + h = p->h; + u = p->u; + h_dt = p->force.h_dt; + u_dt = p->force.u_dt; + pdt = p->dt; + + /* Store the min/max dt. */ + dt_min = fminf(dt_min, pdt); + dt_max = fmaxf(dt_max, pdt); + + /* Update the half-step velocities from the current velocities. 
*/ + xp->v_hdt[0] = v[0] + hdt * a[0]; + xp->v_hdt[1] = v[1] + hdt * a[1]; + xp->v_hdt[2] = v[2] + hdt * a[2]; + xp->u_hdt = u + hdt * u_dt; + + /* Move the particles with the velocities at the half-step. */ + p->x[0] = x[0] += dt * xp->v_hdt[0]; + p->x[1] = x[1] += dt * xp->v_hdt[1]; + p->x[2] = x[2] += dt * xp->v_hdt[2]; + dx = sqrtf((x[0] - x_old[0]) * (x[0] - x_old[0]) + + (x[1] - x_old[1]) * (x[1] - x_old[1]) + + (x[2] - x_old[2]) * (x[2] - x_old[2])); + dx_max = fmaxf(dx_max, dx); + + /* Update positions and energies at the half-step. */ + p->v[0] = v[0] + dt * a[0]; + p->v[1] = v[1] + dt * a[1]; + p->v[2] = v[2] + dt * a[2]; + w = u_dt / u * dt; + if (fabsf(w) < 0.01f) + p->u = u *= + 1.0f + + w * (1.0f + w * (0.5f + w * (1.0f / 6.0f + 1.0f / 24.0f * w))); + else + p->u = u *= expf(w); + w = h_dt / h * dt; + if (fabsf(w) < 0.01f) + p->h = h *= + 1.0f + + w * (1.0f + w * (0.5f + w * (1.0f / 6.0f + 1.0f / 24.0f * w))); + else + p->h = h *= expf(w); + h_max = fmaxf(h_max, h); + + /* Integrate other values if this particle will not be updated. */ + /* Init fields for density calculation. */ + if (pdt > dt_step) { + float w = -3.0f * h_dt / h * dt; + if (fabsf(w) < 0.1f) + rho = p->rho *= + 1.0f + + w * (1.0f + w * (0.5f + w * (1.0f / 6.0f + 1.0f / 24.0f * w))); + else + rho = p->rho *= expf(w); + p->force.POrho2 = u * (const_hydro_gamma - 1.0f) / (rho * xp->omega); + } else { + p->density.wcount = 0.0f; + p->density.wcount_dh = 0.0f; + p->rho = 0.0f; + p->rho_dh = 0.0f; + p->density.div_v = 0.0f; + for (j = 0; j < 3; ++j) p->density.curl_v[j] = 0.0f; + } } + } + + /* Otherwise, agregate data from children. */ + else { + + /* Init with the first non-null child. */ + dt_min = FLT_MAX; + dt_max = 0.0f; + h_max = 0.0f; + dx_max = 0.0f; + + /* Loop over the progeny. 
*/ + for (k = 0; k < 8; k++) + if (c->progeny[k] != NULL) { + if (c->count < space_subsize) runner_dokick1(r, c->progeny[k]); + dt_min = fminf(dt_min, c->progeny[k]->dt_min); + dt_max = fmaxf(dt_max, c->progeny[k]->dt_max); + h_max = fmaxf(h_max, c->progeny[k]->h_max); + dx_max = fmaxf(dx_max, c->progeny[k]->dx_max); + } + } + + /* Store the values. */ + c->dt_min = dt_min; + c->dt_max = dt_max; + c->h_max = h_max; + c->dx_max = dx_max; +} /** * @brief Combined second and first kick for fixed dt. * * @param r The runner thread. * @param c The cell. - * @param timer The timer + * @param timer The timer */ - -void runner_dokick ( struct runner *r , struct cell *c , int timer ) { - - int k, count = 0, nr_parts = c->count, updated; - float dt_min = FLT_MAX, dt_max = 0.0f; - float h_max, dx, dx_max; - double ekin = 0.0, epot = 0.0; - float mom[3] = { 0.0f , 0.0f , 0.0f }, ang[3] = { 0.0f , 0.0f , 0.0f }; - float x[3], x_old[3], v_hdt[3], a[3], u, u_hdt, h, pdt, m, w; - float dt = r->e->dt, hdt = 0.5f*dt; - float dt_cfl, dt_h_change, dt_u_change, dt_new; - float h_dt, u_dt; - struct part *restrict p, *restrict parts = c->parts; - struct xpart *restrict xp, *restrict xparts = c->xparts; - - TIMER_TIC - - /* No children? */ - if ( !c->split ) { - - /* Init the min/max counters. */ - dt_min = FLT_MAX; - dt_max = 0.0f; - h_max = 0.0f; - dx_max = 0.0f; - - /* Loop over the particles and kick them. */ - __builtin_prefetch( &parts[0] , 0 , 1 ); - __builtin_prefetch( &parts[0].rho_dh , 0 , 1 ); - __builtin_prefetch( &xparts[0] , 0 , 1 ); - __builtin_prefetch( &parts[1] , 0 , 1 ); - __builtin_prefetch( &parts[1].rho_dh , 0 , 1 ); - __builtin_prefetch( &xparts[1] , 0 , 1 ); - __builtin_prefetch( &parts[2] , 0 , 1 ); - __builtin_prefetch( &parts[2].rho_dh , 0 , 1 ); - __builtin_prefetch( &xparts[2] , 0 , 1 ); - for ( k = 0 ; k < nr_parts ; k++ ) { - - /* Get a handle on the part. 
*/ - __builtin_prefetch( &parts[k+3] , 0 , 1 ); - __builtin_prefetch( &parts[k+3].rho_dh , 0 , 1 ); - __builtin_prefetch( &xparts[k+3] , 0 , 1 ); - p = &parts[k]; - xp = &xparts[k]; - - /* Get local copies of particle data. */ - pdt = p->dt; - u_dt = p->force.u_dt; - h = p->h; - m = p->mass; - x[0] = p->x[0]; x[1] = p->x[1]; x[2] = p->x[2]; - a[0] = p->a[0]; a[1] = p->a[1]; a[2] = p->a[2]; - x_old[0] = xp->x_old[0]; x_old[1] = xp->x_old[1]; x_old[2] = xp->x_old[2]; - v_hdt[0] = xp->v_hdt[0]; v_hdt[1] = xp->v_hdt[1]; v_hdt[2] = xp->v_hdt[2]; - u_hdt = xp->u_hdt; - - /* Scale the derivatives if they're freshly computed. */ - h_dt = p->force.h_dt *= h * 0.333333333f; - count += 1; - xp->omega = 1.0f + h * p->rho_dh / p->rho * 0.3333333333f; - - /* Update the particle's time step. */ - dt_cfl = const_cfl * h / p->force.v_sig; - dt_h_change = ( h_dt != 0.0f ) ? fabsf( const_ln_max_h_change * h / h_dt ) : FLT_MAX; - dt_u_change = ( u_dt != 0.0f ) ? fabsf( const_max_u_change * p->u / u_dt ) : FLT_MAX; - dt_new = fminf( dt_cfl , fminf( dt_h_change , dt_u_change ) ); - if ( pdt == 0.0f ) - p->dt = pdt = dt_new; - else - p->dt = pdt = fminf( dt_new , 2.0f*pdt ); - - /* Get the smallest/largest dt. */ - dt_min = fminf( dt_min , pdt ); - dt_max = fmaxf( dt_max , pdt ); - - /* Step and store the velocity and internal energy. */ - xp->v_hdt[0] = ( v_hdt[0] += dt * a[0] ); - xp->v_hdt[1] = ( v_hdt[1] += dt * a[1] ); - xp->v_hdt[2] = ( v_hdt[2] += dt * a[2] ); - xp->u_hdt = ( u_hdt += dt * u_dt ); - - /* Move the particles with the velocitie at the half-step. */ - p->x[0] = x[0] += dt * v_hdt[0]; - p->x[1] = x[1] += dt * v_hdt[1]; - p->x[2] = x[2] += dt * v_hdt[2]; - dx = sqrtf( (x[0] - x_old[0])*(x[0] - x_old[0]) + - (x[1] - x_old[1])*(x[1] - x_old[1]) + - (x[2] - x_old[2])*(x[2] - x_old[2]) ); - dx_max = fmaxf( dx_max , dx ); - - /* Update positions and energies at the next full step. 
*/ - p->v[0] = v_hdt[0] + hdt * a[0]; - p->v[1] = v_hdt[1] + hdt * a[1]; - p->v[2] = v_hdt[2] + hdt * a[2]; - w = u_dt / u_hdt * hdt; - if ( fabsf( w ) < 0.01f ) - p->u = u = u_hdt * ( 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) ) ); - else - p->u = u = u_hdt * expf( w ); - w = h_dt / h * dt; - if ( fabsf( w ) < 0.01f ) - p->h = h *= ( 1.0f + w*( 1.0f + w*( 0.5f + w*( 1.0f/6.0f + 1.0f/24.0f*w ) ) ) ); - else - p->h = h *= expf( w ); - h_max = fmaxf( h_max , h ); - - /* Collect momentum */ - mom[0] += m * v_hdt[0]; - mom[1] += m * v_hdt[1]; - mom[2] += m * v_hdt[2]; - - /* Collect angular momentum */ - ang[0] += m * ( x[1]*v_hdt[2] - x[2]*v_hdt[1] ); - ang[1] += m * ( x[2]*v_hdt[0] - x[0]*v_hdt[2] ); - ang[2] += m * ( x[0]*v_hdt[1] - x[1]*v_hdt[0] ); - - /* Collect total energy. */ - ekin += 0.5 * m * ( v_hdt[0]*v_hdt[0] + v_hdt[1]*v_hdt[1] + v_hdt[2]*v_hdt[2] ); - epot += m * u_hdt; - - /* Init fields for density calculation. */ - p->density.wcount = 0.0f; - p->density.wcount_dh = 0.0f; - p->rho = 0.0f; - p->rho_dh = 0.0f; - p->density.div_v = 0.0f; - p->density.curl_v[0] = 0.0f; - p->density.curl_v[1] = 0.0f; - p->density.curl_v[2] = 0.0f; - - } - - } - - /* Otherwise, agregate data from children. */ - else { - - /* Init with the first non-null child. */ - dt_min = FLT_MAX; - dt_max = 0.0f; - h_max = 0.0f; - dx_max = 0.0f; - updated = 0; - ekin = 0.0; - epot = 0.0; - mom[0] = 0.0f; mom[1] = 0.0f; mom[2] = 0.0f; - ang[0] = 0.0f; ang[1] = 0.0f; ang[2] = 0.0f; - - /* Loop over the progeny. 
*/ - for ( k = 0 ; k < 8 ; k++ ) - if ( c->progeny[k] != NULL ) { - struct cell *cp = c->progeny[k]; - runner_dokick( r , cp , 0 ); - dt_min = fminf( dt_min , cp->dt_min ); - dt_max = fmaxf( dt_max , cp->dt_max ); - h_max = fmaxf( h_max , cp->h_max ); - dx_max = fmaxf( dx_max , cp->dx_max ); - updated += cp->count; - ekin += cp->ekin; - epot += cp->epot; - mom[0] += cp->mom[0]; mom[1] += cp->mom[1]; mom[2] += cp->mom[2]; - ang[0] += cp->ang[0]; ang[1] += cp->ang[1]; ang[2] += cp->ang[2]; - } - - } - /* Store the values. */ - c->dt_min = dt_min; - c->dt_max = dt_max; - c->h_max = h_max; - c->dx_max = dx_max; - c->updated = count; - c->ekin = ekin; - c->epot = epot; - c->mom[0] = mom[0]; c->mom[1] = mom[1]; c->mom[2] = mom[2]; - c->ang[0] = ang[0]; c->ang[1] = ang[1]; c->ang[2] = ang[2]; - - if ( timer ) { - #ifdef TIMER_VERBOSE - message( "runner %02i: %i parts at depth %i took %.3f ms." , - r->id , c->count , c->depth , - ((double)TIMER_TOC(timer_kick2)) / CPU_TPS * 1000 ); fflush(stdout); - #else - TIMER_TOC(timer_kick2); - #endif - } - +void runner_dokick(struct runner *r, struct cell *c, int timer) { + + int k, count = 0, nr_parts = c->count, updated; + float dt_min = FLT_MAX, dt_max = 0.0f; + float h_max, dx, dx_max; + double ekin = 0.0, epot = 0.0; + float mom[3] = {0.0f, 0.0f, 0.0f}, ang[3] = {0.0f, 0.0f, 0.0f}; + float x[3], x_old[3], v_hdt[3], a[3], u, u_hdt, h, pdt, m, w; + float dt = r->e->dt, hdt = 0.5f * dt; + float dt_cfl, dt_h_change, dt_u_change, dt_new; + float h_dt, u_dt; + struct part *restrict p, *restrict parts = c->parts; + struct xpart *restrict xp, *restrict xparts = c->xparts; + + TIMER_TIC + + /* No children? */ + if (!c->split) { + + /* Init the min/max counters. */ + dt_min = FLT_MAX; + dt_max = 0.0f; + h_max = 0.0f; + dx_max = 0.0f; + + /* Loop over the particles and kick them. 
*/ + __builtin_prefetch(&parts[0], 0, 1); + __builtin_prefetch(&parts[0].rho_dh, 0, 1); + __builtin_prefetch(&xparts[0], 0, 1); + __builtin_prefetch(&parts[1], 0, 1); + __builtin_prefetch(&parts[1].rho_dh, 0, 1); + __builtin_prefetch(&xparts[1], 0, 1); + __builtin_prefetch(&parts[2], 0, 1); + __builtin_prefetch(&parts[2].rho_dh, 0, 1); + __builtin_prefetch(&xparts[2], 0, 1); + for (k = 0; k < nr_parts; k++) { + + /* Get a handle on the part. */ + __builtin_prefetch(&parts[k + 3], 0, 1); + __builtin_prefetch(&parts[k + 3].rho_dh, 0, 1); + __builtin_prefetch(&xparts[k + 3], 0, 1); + p = &parts[k]; + xp = &xparts[k]; + + /* Get local copies of particle data. */ + pdt = p->dt; + u_dt = p->force.u_dt; + h = p->h; + m = p->mass; + x[0] = p->x[0]; + x[1] = p->x[1]; + x[2] = p->x[2]; + a[0] = p->a[0]; + a[1] = p->a[1]; + a[2] = p->a[2]; + x_old[0] = xp->x_old[0]; + x_old[1] = xp->x_old[1]; + x_old[2] = xp->x_old[2]; + v_hdt[0] = xp->v_hdt[0]; + v_hdt[1] = xp->v_hdt[1]; + v_hdt[2] = xp->v_hdt[2]; + u_hdt = xp->u_hdt; + + /* Scale the derivatives if they're freshly computed. */ + h_dt = p->force.h_dt *= h * 0.333333333f; + count += 1; + xp->omega = 1.0f + h * p->rho_dh / p->rho * 0.3333333333f; + + /* Update the particle's time step. */ + dt_cfl = const_cfl * h / p->force.v_sig; + dt_h_change = + (h_dt != 0.0f) ? fabsf(const_ln_max_h_change * h / h_dt) : FLT_MAX; + dt_u_change = + (u_dt != 0.0f) ? fabsf(const_max_u_change * p->u / u_dt) : FLT_MAX; + dt_new = fminf(dt_cfl, fminf(dt_h_change, dt_u_change)); + if (pdt == 0.0f) + p->dt = pdt = dt_new; + else + p->dt = pdt = fminf(dt_new, 2.0f * pdt); + + /* Get the smallest/largest dt. */ + dt_min = fminf(dt_min, pdt); + dt_max = fmaxf(dt_max, pdt); + + /* Step and store the velocity and internal energy. */ + xp->v_hdt[0] = (v_hdt[0] += dt * a[0]); + xp->v_hdt[1] = (v_hdt[1] += dt * a[1]); + xp->v_hdt[2] = (v_hdt[2] += dt * a[2]); + xp->u_hdt = (u_hdt += dt * u_dt); + + /* Move the particles with the velocitie at the half-step. 
*/ + p->x[0] = x[0] += dt * v_hdt[0]; + p->x[1] = x[1] += dt * v_hdt[1]; + p->x[2] = x[2] += dt * v_hdt[2]; + dx = sqrtf((x[0] - x_old[0]) * (x[0] - x_old[0]) + + (x[1] - x_old[1]) * (x[1] - x_old[1]) + + (x[2] - x_old[2]) * (x[2] - x_old[2])); + dx_max = fmaxf(dx_max, dx); + + /* Update positions and energies at the next full step. */ + p->v[0] = v_hdt[0] + hdt * a[0]; + p->v[1] = v_hdt[1] + hdt * a[1]; + p->v[2] = v_hdt[2] + hdt * a[2]; + w = u_dt / u_hdt * hdt; + if (fabsf(w) < 0.01f) + p->u = u = + u_hdt * + (1.0f + + w * (1.0f + w * (0.5f + w * (1.0f / 6.0f + 1.0f / 24.0f * w)))); + else + p->u = u = u_hdt * expf(w); + w = h_dt / h * dt; + if (fabsf(w) < 0.01f) + p->h = h *= + (1.0f + + w * (1.0f + w * (0.5f + w * (1.0f / 6.0f + 1.0f / 24.0f * w)))); + else + p->h = h *= expf(w); + h_max = fmaxf(h_max, h); + + /* Collect momentum */ + mom[0] += m * v_hdt[0]; + mom[1] += m * v_hdt[1]; + mom[2] += m * v_hdt[2]; + + /* Collect angular momentum */ + ang[0] += m * (x[1] * v_hdt[2] - x[2] * v_hdt[1]); + ang[1] += m * (x[2] * v_hdt[0] - x[0] * v_hdt[2]); + ang[2] += m * (x[0] * v_hdt[1] - x[1] * v_hdt[0]); + + /* Collect total energy. */ + ekin += 0.5 * m * + (v_hdt[0] * v_hdt[0] + v_hdt[1] * v_hdt[1] + v_hdt[2] * v_hdt[2]); + epot += m * u_hdt; + + /* Init fields for density calculation. */ + p->density.wcount = 0.0f; + p->density.wcount_dh = 0.0f; + p->rho = 0.0f; + p->rho_dh = 0.0f; + p->density.div_v = 0.0f; + p->density.curl_v[0] = 0.0f; + p->density.curl_v[1] = 0.0f; + p->density.curl_v[2] = 0.0f; } + } + + /* Otherwise, agregate data from children. */ + else { + + /* Init with the first non-null child. */ + dt_min = FLT_MAX; + dt_max = 0.0f; + h_max = 0.0f; + dx_max = 0.0f; + updated = 0; + ekin = 0.0; + epot = 0.0; + mom[0] = 0.0f; + mom[1] = 0.0f; + mom[2] = 0.0f; + ang[0] = 0.0f; + ang[1] = 0.0f; + ang[2] = 0.0f; + + /* Loop over the progeny. 
*/ + for (k = 0; k < 8; k++) + if (c->progeny[k] != NULL) { + struct cell *cp = c->progeny[k]; + runner_dokick(r, cp, 0); + dt_min = fminf(dt_min, cp->dt_min); + dt_max = fmaxf(dt_max, cp->dt_max); + h_max = fmaxf(h_max, cp->h_max); + dx_max = fmaxf(dx_max, cp->dx_max); + updated += cp->count; + ekin += cp->ekin; + epot += cp->epot; + mom[0] += cp->mom[0]; + mom[1] += cp->mom[1]; + mom[2] += cp->mom[2]; + ang[0] += cp->ang[0]; + ang[1] += cp->ang[1]; + ang[2] += cp->ang[2]; + } + } + + /* Store the values. */ + c->dt_min = dt_min; + c->dt_max = dt_max; + c->h_max = h_max; + c->dx_max = dx_max; + c->updated = count; + c->ekin = ekin; + c->epot = epot; + c->mom[0] = mom[0]; + c->mom[1] = mom[1]; + c->mom[2] = mom[2]; + c->ang[0] = ang[0]; + c->ang[1] = ang[1]; + c->ang[2] = ang[2]; + + if (timer) { +#ifdef TIMER_VERBOSE + message("runner %02i: %i parts at depth %i took %.3f ms.", r->id, c->count, + c->depth, ((double)TIMER_TOC(timer_kick2)) / CPU_TPS * 1000); + fflush(stdout); +#else + TIMER_TOC(timer_kick2); +#endif + } +} /** * @brief The #runner main thread routine. * * @param data A pointer to this thread's data. */ - -void *runner_main ( void *data ) { - - struct runner *r = (struct runner *)data; - struct engine *e = r->e; - struct scheduler *sched = &e->sched; - struct task *t = NULL; - struct cell *ci, *cj, *super; - struct part *parts; - int k, nr_parts; - - /* Main loop. */ - while ( 1 ) { - - /* Wait at the barrier. */ - engine_barrier( e , r->id ); - - /* Re-set the pointer to the previous super cell. */ - super = NULL; - - /* Loop while there are tasks... */ - while ( 1 ) { - - /* If there's no old task, try to get a new one. */ - if ( t == NULL ) { - - /* Get the task. */ - TIMER_TIC - t = scheduler_gettask( sched , r->qid , super ); - TIMER_TOC(timer_gettask); - - /* Did I get anything? */ - if ( t == NULL ) - break; - - } - - /* Get the cells. */ - ci = t->ci; - cj = t->cj; - t->rid = r->cpuid; - - /* Set super to the first cell that I own. 
*/ - if ( ci->super != NULL && ci->super->owner == r->qid ) - super = ci->super; - else if ( cj != NULL && cj->super != NULL && cj->super->owner == r->qid ) - super = cj->super; - /* else - super = NULL; */ - - /* Prefetch? */ - if ( runner_prefetch && - t->type != task_type_kick1 && t->type != task_type_kick2 && t->type != task_type_ghost ) { - for ( int k = 0 ; k < ci->count ; k++ ) - __builtin_prefetch( &ci->parts[k] , 1 , 3 ); - if ( cj != NULL ) - for ( int k = 0 ; k < cj->count ; k++ ) - __builtin_prefetch( &cj->parts[k] , 1 , 3 ); - } - - /* Different types of tasks... */ - switch ( t->type ) { - case task_type_self: - if ( t->subtype == task_subtype_density ) - runner_doself1_density( r , ci ); - else if ( t->subtype == task_subtype_force ) - runner_doself2_force( r , ci ); - else - error( "Unknown task subtype." ); - break; - case task_type_pair: - if ( t->subtype == task_subtype_density ) - runner_dopair1_density( r , ci , cj ); - else if ( t->subtype == task_subtype_force ) - runner_dopair2_force( r , ci , cj ); - else - error( "Unknown task subtype." ); - break; - case task_type_sort: - runner_dosort( r , ci , t->flags , 1 ); - break; - case task_type_sub: - if ( t->subtype == task_subtype_density ) - runner_dosub1_density( r , ci , cj , t->flags , 1 ); - else if ( t->subtype == task_subtype_force ) - runner_dosub2_force( r , ci , cj , t->flags , 1 ); - else if ( t->subtype == task_subtype_grav ) - runner_dosub_grav( r , ci , cj , 1 ); - else - error( "Unknown task subtype." 
); - break; - case task_type_ghost: - runner_doghost( r , ci ); - break; - case task_type_kick1: - runner_dokick1( r , ci ); - break; - case task_type_kick2: - if ( e->policy & engine_policy_fixdt ) - runner_dokick( r , ci , 1 ); - else - runner_dokick2( r , ci ); - break; - case task_type_send: - break; - case task_type_recv: - parts = ci->parts; - nr_parts = ci->count; - for ( k = 0 ; k < nr_parts ; k++ ) - parts[k].dt = FLT_MAX; - ci->dt_min = ci->dt_max = FLT_MAX; - break; - case task_type_grav_pp: - if ( t->cj == NULL ) - runner_doself_grav( r , t->ci ); - else - runner_dopair_grav( r , t->ci , t->cj ); - break; - case task_type_grav_mm: - runner_dograv_mm( r , t->ci , t->cj ); - break; - case task_type_grav_up: - runner_dograv_up( r , t->ci ); - break; - case task_type_grav_down: - runner_dograv_down( r , t->ci ); - break; - default: - error( "Unknown task type." ); - } - - /* We're done with this task, see if we get a next one. */ - t = scheduler_done( sched , t ); - - } /* main loop. */ - - } - - /* Be kind, rewind. */ - return NULL; - - } - +void *runner_main(void *data) { + + struct runner *r = (struct runner *)data; + struct engine *e = r->e; + struct scheduler *sched = &e->sched; + struct task *t = NULL; + struct cell *ci, *cj, *super; + struct part *parts; + int k, nr_parts; + + /* Main loop. */ + while (1) { + + /* Wait at the barrier. */ + engine_barrier(e, r->id); + + /* Re-set the pointer to the previous super cell. */ + super = NULL; + + /* Loop while there are tasks... */ + while (1) { + + /* If there's no old task, try to get a new one. */ + if (t == NULL) { + + /* Get the task. */ + TIMER_TIC + t = scheduler_gettask(sched, r->qid, super); + TIMER_TOC(timer_gettask); + + /* Did I get anything? */ + if (t == NULL) break; + } + + /* Get the cells. */ + ci = t->ci; + cj = t->cj; + t->rid = r->cpuid; + + /* Set super to the first cell that I own. 
*/ + if (ci->super != NULL && ci->super->owner == r->qid) + super = ci->super; + else if (cj != NULL && cj->super != NULL && cj->super->owner == r->qid) + super = cj->super; + /* else + super = NULL; */ + + /* Prefetch? */ + if (runner_prefetch && t->type != task_type_kick1 && + t->type != task_type_kick2 && t->type != task_type_ghost) { + for (int k = 0; k < ci->count; k++) + __builtin_prefetch(&ci->parts[k], 1, 3); + if (cj != NULL) + for (int k = 0; k < cj->count; k++) + __builtin_prefetch(&cj->parts[k], 1, 3); + } + + /* Different types of tasks... */ + switch (t->type) { + case task_type_self: + if (t->subtype == task_subtype_density) + runner_doself1_density(r, ci); + else if (t->subtype == task_subtype_force) + runner_doself2_force(r, ci); + else + error("Unknown task subtype."); + break; + case task_type_pair: + if (t->subtype == task_subtype_density) + runner_dopair1_density(r, ci, cj); + else if (t->subtype == task_subtype_force) + runner_dopair2_force(r, ci, cj); + else + error("Unknown task subtype."); + break; + case task_type_sort: + runner_dosort(r, ci, t->flags, 1); + break; + case task_type_sub: + if (t->subtype == task_subtype_density) + runner_dosub1_density(r, ci, cj, t->flags, 1); + else if (t->subtype == task_subtype_force) + runner_dosub2_force(r, ci, cj, t->flags, 1); + else if (t->subtype == task_subtype_grav) + runner_dosub_grav(r, ci, cj, 1); + else + error("Unknown task subtype."); + break; + case task_type_ghost: + runner_doghost(r, ci); + break; + case task_type_kick1: + runner_dokick1(r, ci); + break; + case task_type_kick2: + if (e->policy & engine_policy_fixdt) + runner_dokick(r, ci, 1); + else + runner_dokick2(r, ci); + break; + case task_type_send: + break; + case task_type_recv: + parts = ci->parts; + nr_parts = ci->count; + for (k = 0; k < nr_parts; k++) parts[k].dt = FLT_MAX; + ci->dt_min = ci->dt_max = FLT_MAX; + break; + case task_type_grav_pp: + if (t->cj == NULL) + runner_doself_grav(r, t->ci); + else + 
runner_dopair_grav(r, t->ci, t->cj); + break; + case task_type_grav_mm: + runner_dograv_mm(r, t->ci, t->cj); + break; + case task_type_grav_up: + runner_dograv_up(r, t->ci); + break; + case task_type_grav_down: + runner_dograv_down(r, t->ci); + break; + default: + error("Unknown task type."); + } + + /* We're done with this task, see if we get a next one. */ + t = scheduler_done(sched, t); + + } /* main loop. */ + } + + /* Be kind, rewind. */ + return NULL; +} diff --git a/src/runner.h b/src/runner.h index 91ac475d7079a5a49dd194668bea567e9528a74c..30e75bd6ad21d45baf328adef23d2b500015ce9b 100644 --- a/src/runner.h +++ b/src/runner.h @@ -1,87 +1,99 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_RUNNER_H +#define SWIFT_RUNNER_H +/* Some standard headers. */ +#include <pthread.h> + +/* Includes. */ +#include "cell.h" #include "inline.h" +/* Forward-declare the engine type to avoid cyclic header dependencies. */ +struct engine; + /* Some constants/flags. */ -#define runner_prefetch 0 +#define runner_prefetch 0 /* SID stuff. */ extern const char runner_flip[]; - /* Counters. 
/* Counters: each enumerator is an index into runner_counter[];
   runner_counter_count comes last and gives the array its length. */
enum runner_counters {
  runner_counter_swap = 0,
  runner_counter_stall = 1,
  runner_counter_steal_stall = 2,
  runner_counter_steal_empty = 3,
  runner_counter_keep = 4,
  runner_counter_iact = 5,
  runner_counter_count = 6,
};
extern int runner_counter[runner_counter_count];

/* Counter macro: atomically bumps runner_counter[c] when COUNTER is
   defined and expands to nothing otherwise. */
#if defined(COUNTER)
#define COUNT(c) (__sync_add_and_fetch(&runner_counter[c], 1))
#else
#define COUNT(c)
#endif

/* Histogram: runner_hist_N bins spread linearly over
   [runner_hist_a, runner_hist_b]. */
#define runner_hist_a 1.0
#define runner_hist_b 100.0
#define runner_hist_N 99

/* NOTE(review): this is a definition (no extern) inside a header. If the
   header is included from more than one .c file, toolchains that build
   with -fno-common will report a duplicate symbol -- consider an extern
   declaration here plus one definition in a source file. */
long long int runner_hist_bins[runner_hist_N];

/* Bin index for the value x, clamped to [0, runner_hist_N - 1].
   Uses fmax/fmin, so <math.h> must be in scope where the macro is used. */
#define runner_hist_index(x)                                          \
  ((int)fmax(0.0, fmin(runner_hist_N - 1,                             \
                       ((x) - runner_hist_a) /                        \
                           (runner_hist_b - runner_hist_a) *          \
                           runner_hist_N)))

/* Atomically add one to the bin x falls into; the expression evaluates
   to the new bin content. */
#define runner_hist_hit(x) \
  __sync_add_and_fetch(&runner_hist_bins[runner_hist_index(x)], 1)

/* A struct representing a runner's thread and its data. */
struct runner {

  /* The id of this runner (runner_main passes it to engine_barrier). */
  int id;

  /* The thread which it is running. */
  pthread_t thread;

  /* Id copied into task->rid by runner_main. */
  int cpuid;

  /* Index of the queue this runner asks for tasks; runner_main also
     compares it against the owner of a task's super cell. */
  int qid;

  /* The engine this runner belongs to. */
  struct engine *e;
};
*/ -void runner_doghost ( struct runner *r , struct cell *c ); -void runner_dopair_density ( struct runner *r , struct cell *ci , struct cell *cj ); -void runner_doself_density ( struct runner *r , struct cell *c ); -void runner_dosub_density ( struct runner *r , struct cell *ci , struct cell *cj , int flags ); -void runner_dosort ( struct runner *r , struct cell *c , int flag , int clock ); -void runner_dogsort ( struct runner *r , struct cell *c , int flag , int clock ); -void runner_dokick ( struct runner *r , struct cell *c , int timer ); -void runner_dokick1 ( struct runner *r , struct cell *c ); -void runner_dokick2 ( struct runner *r , struct cell *c ); -void *runner_main ( void *data ); +void runner_doghost(struct runner *r, struct cell *c); +void runner_dopair_density(struct runner *r, struct cell *ci, struct cell *cj); +void runner_doself_density(struct runner *r, struct cell *c); +void runner_dosub_density(struct runner *r, struct cell *ci, struct cell *cj, + int flags); +void runner_dosort(struct runner *r, struct cell *c, int flag, int clock); +void runner_dogsort(struct runner *r, struct cell *c, int flag, int clock); +void runner_dokick(struct runner *r, struct cell *c, int timer); +void runner_dokick1(struct runner *r, struct cell *c); +void runner_dokick2(struct runner *r, struct cell *c); +void *runner_main(void *data); + +#endif /* SWIFT_RUNNER_H */ diff --git a/src/runner_doiact.h b/src/runner_doiact.h index 1c28b81f72572ea305814aac9ecabd65f41cbfae..017529cc94021ee9ea38ce543ac8a3c4dea2e1db 100644 --- a/src/runner_doiact.h +++ b/src/runner_doiact.h @@ -1,22 +1,25 @@ /******************************************************************************* * This file is part of SWIFT. 
* Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +/* Includes. */ +#include "cell.h" +#include "part.h" /* Before including this file, define FUNCTION, which is the name of the interaction function. This creates the interaction functions @@ -24,73 +27,71 @@ and runner_dosub_FUNCTION calling the pairwise interaction function runner_iact_FUNCTION. 
/* Glue two tokens together with an underscore: PASTE(a, b) -> a_b. */
#define PASTE(x, y) x##_##y

/*
 * Names derived from FUNCTION.
 *
 * Every name comes as a pair: the parameterised helper _NAME(f), which does
 * the pasting, and NAME, which feeds FUNCTION to it. The extra level is
 * needed so that FUNCTION is macro-expanded before it reaches the ##
 * operator in PASTE.
 */

/* Pair interactions. */
#define _DOPAIR1(f)              PASTE(runner_dopair1, f)
#define DOPAIR1                  _DOPAIR1(FUNCTION)
#define _DOPAIR2(f)              PASTE(runner_dopair2, f)
#define DOPAIR2                  _DOPAIR2(FUNCTION)
#define _DOPAIR_NAIVE(f)         PASTE(runner_dopair_naive, f)
#define DOPAIR_NAIVE             _DOPAIR_NAIVE(FUNCTION)
#define _DOPAIR_SUBSET(f)        PASTE(runner_dopair_subset, f)
#define DOPAIR_SUBSET            _DOPAIR_SUBSET(FUNCTION)
#define _DOPAIR_SUBSET_NAIVE(f)  PASTE(runner_dopair_subset_naive, f)
#define DOPAIR_SUBSET_NAIVE      _DOPAIR_SUBSET_NAIVE(FUNCTION)

/* Self interactions. */
#define _DOSELF1(f)              PASTE(runner_doself1, f)
#define DOSELF1                  _DOSELF1(FUNCTION)
#define _DOSELF2(f)              PASTE(runner_doself2, f)
#define DOSELF2                  _DOSELF2(FUNCTION)
#define _DOSELF_NAIVE(f)         PASTE(runner_doself_naive, f)
#define DOSELF_NAIVE             _DOSELF_NAIVE(FUNCTION)
#define _DOSELF_SUBSET(f)        PASTE(runner_doself_subset, f)
#define DOSELF_SUBSET            _DOSELF_SUBSET(FUNCTION)

/* Sub-cell recursion. */
#define _DOSUB1(f)               PASTE(runner_dosub1, f)
#define DOSUB1                   _DOSUB1(FUNCTION)
#define _DOSUB2(f)               PASTE(runner_dosub2, f)
#define DOSUB2                   _DOSUB2(FUNCTION)
#define _DOSUB_SUBSET(f)         PASTE(runner_dosub_subset, f)
#define DOSUB_SUBSET             _DOSUB_SUBSET(FUNCTION)

/* Particle-particle interaction kernels, scalar and vectorised. */
#define _IACT(f)                 PASTE(runner_iact, f)
#define IACT                     _IACT(FUNCTION)
#define _IACT_NONSYM(f)          PASTE(runner_iact_nonsym, f)
#define IACT_NONSYM              _IACT_NONSYM(FUNCTION)
#define _IACT_VEC(f)             PASTE(runner_iact_vec, f)
#define IACT_VEC                 _IACT_VEC(FUNCTION)
#define _IACT_NONSYM_VEC(f)      PASTE(runner_iact_nonsym_vec, f)
#define IACT_NONSYM_VEC          _IACT_NONSYM_VEC(FUNCTION)

/* Timer ids. */
#define _TIMER_DOSELF(f)         PASTE(timer_doself, f)
#define TIMER_DOSELF             _TIMER_DOSELF(FUNCTION)
#define _TIMER_DOPAIR(f)         PASTE(timer_dopair, f)
#define TIMER_DOPAIR             _TIMER_DOPAIR(FUNCTION)
#define _TIMER_DOSUB(f)          PASTE(timer_dosub, f)
#define TIMER_DOSUB              _TIMER_DOSUB(FUNCTION)
#define _TIMER_DOSELF_SUBSET(f)  PASTE(timer_doself_subset, f)
#define TIMER_DOSELF_SUBSET      _TIMER_DOSELF_SUBSET(FUNCTION)
#define _TIMER_DOPAIR_SUBSET(f)  PASTE(timer_dopair_subset, f)
#define TIMER_DOPAIR_SUBSET      _TIMER_DOPAIR_SUBSET(FUNCTION)
*/ - -void DOPAIR_NAIVE ( struct runner *r , struct cell *restrict ci , struct cell *restrict cj ) { - - struct engine *e = r->e; - int pid, pjd, k, count_i = ci->count, count_j = cj->count; - double shift[3] = { 0.0 , 0.0 , 0.0 }; - struct part *restrict parts_i = ci->parts, *restrict parts_j = cj->parts; - struct part *restrict pi, *restrict pj; - double pix[3]; - float dx[3], hi, hig2, r2; - float dt_step = e->dt_step; - #ifdef VECTORIZE - int icount = 0; - float r2q[VEC_SIZE] __attribute__ ((aligned (16))); - float hiq[VEC_SIZE] __attribute__ ((aligned (16))); - float hjq[VEC_SIZE] __attribute__ ((aligned (16))); - float dxq[3*VEC_SIZE] __attribute__ ((aligned (16))); - struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; - #endif - TIMER_TIC - - /* Anything to do here? */ - if ( ci->dt_min > dt_step && cj->dt_min > dt_step ) - return; - - /* Get the relative distance between the pairs, wrapping. */ - for ( k = 0 ; k < 3 ; k++ ) { - if ( cj->loc[k] - ci->loc[k] < -e->s->dim[k]/2 ) - shift[k] = e->s->dim[k]; - else if ( cj->loc[k] - ci->loc[k] > e->s->dim[k]/2 ) - shift[k] = -e->s->dim[k]; + +void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci, + struct cell *restrict cj) { + + struct engine *e = r->e; + int pid, pjd, k, count_i = ci->count, count_j = cj->count; + double shift[3] = {0.0, 0.0, 0.0}; + struct part *restrict parts_i = ci->parts, *restrict parts_j = cj->parts; + struct part *restrict pi, *restrict pj; + double pix[3]; + float dx[3], hi, hig2, r2; + float dt_step = e->dt_step; +#ifdef VECTORIZE + int icount = 0; + float r2q[VEC_SIZE] __attribute__((aligned(16))); + float hiq[VEC_SIZE] __attribute__((aligned(16))); + float hjq[VEC_SIZE] __attribute__((aligned(16))); + float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); + struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; +#endif + TIMER_TIC + + /* Anything to do here? */ + if (ci->dt_min > dt_step && cj->dt_min > dt_step) return; + + /* Get the relative distance between the pairs, wrapping. 
*/ + for (k = 0; k < 3; k++) { + if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2) + shift[k] = e->s->dim[k]; + else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2) + shift[k] = -e->s->dim[k]; + } + + /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with + %i/%i parts and shift = [ %g %g %g ].\n" , + ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , + cj->loc[2] , + ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); + tic = getticks(); */ + + /* Loop over the parts in ci. */ + for (pid = 0; pid < count_i; pid++) { + + /* Get a hold of the ith part in ci. */ + pi = &parts_i[pid]; + for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + hi = pi->h; + hig2 = hi * hi * kernel_gamma2; + + /* Loop over the parts in cj. */ + for (pjd = 0; pjd < count_j; pjd++) { + + /* Get a pointer to the jth particle. */ + pj = &parts_j[pjd]; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for (k = 0; k < 3; k++) { + dx[k] = pix[k] - pj->x[k]; + r2 += dx[k] * dx[k]; + } + + /* Hit or miss? */ + if (r2 < hig2 || r2 < pj->h * pj->h * kernel_gamma2) { + +#ifndef VECTORIZE + + IACT(r2, dx, hi, pj->h, pi, pj); + +#else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3 * icount + 0] = dx[0]; + dxq[3 * icount + 1] = dx[1]; + dxq[3 * icount + 2] = dx[2]; + hiq[icount] = hi; + hjq[icount] = pj->h; + piq[icount] = pi; + pjq[icount] = pj; + icount += 1; + + /* Flush? */ + if (icount == VEC_SIZE) { + IACT_VEC(r2q, dxq, hiq, hjq, piq, pjq); + icount = 0; } - - /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" , - ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , cj->loc[2] , - ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); - tic = getticks(); */ - - /* Loop over the parts in ci. */ - for ( pid = 0 ; pid < count_i ; pid++ ) { - - /* Get a hold of the ith part in ci. 
*/ - pi = &parts_i[ pid ]; - for ( k = 0 ; k < 3 ; k++ ) - pix[k] = pi->x[k] - shift[k]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; - - /* Loop over the parts in cj. */ - for ( pjd = 0 ; pjd < count_j ; pjd++ ) { - - /* Get a pointer to the jth particle. */ - pj = &parts_j[ pjd ]; - - /* Compute the pairwise distance. */ - r2 = 0.0f; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pix[k] - pj->x[k]; - r2 += dx[k]*dx[k]; - } - - /* Hit or miss? */ - if ( r2 < hig2 || r2 < pj->h*pj->h*kernel_gamma2 ) { - - #ifndef VECTORIZE - - IACT( r2 , dx , hi , pj->h , pi , pj ); - - #else - - /* Add this interaction to the queue. */ - r2q[icount] = r2; - dxq[3*icount+0] = dx[0]; - dxq[3*icount+1] = dx[1]; - dxq[3*icount+2] = dx[2]; - hiq[icount] = hi; - hjq[icount] = pj->h; - piq[icount] = pi; - pjq[icount] = pj; - icount += 1; - - /* Flush? */ - if ( icount == VEC_SIZE ) { - IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq ); - icount = 0; - } - - #endif - - } - - } /* loop over the parts in cj. */ - - } /* loop over the parts in ci. */ - - #ifdef VECTORIZE - /* Pick up any leftovers. */ - if ( icount > 0 ) - for ( k = 0 ; k < icount ; k++ ) - IACT( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] ); - #endif - - #ifdef TIMER_VERBOSE - printf( "runner_dopair_naive[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 ); - #else - TIMER_TOC(TIMER_DOPAIR); - #endif +#endif + } + + } /* loop over the parts in cj. */ + + } /* loop over the parts in ci. */ + +#ifdef VECTORIZE + /* Pick up any leftovers. 
*/ + if (icount > 0) + for (k = 0; k < icount; k++) + IACT(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]); +#endif + +#ifdef TIMER_VERBOSE + printf( + "runner_dopair_naive[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) " + "took %.3f ms.\n", + r->id, count_i, count_j, ci->depth, ci->h_max, cj->h_max, + ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000); +#else + TIMER_TOC(TIMER_DOPAIR); +#endif +} + +void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) { + + int pid, pjd, k, count = c->count; + struct part *restrict parts = c->parts; + struct part *restrict pi, *restrict pj; + double pix[3] = {0.0, 0.0, 0.0}; + float dx[3], hi, hig2, r2; + float dt_step = r->e->dt_step; +#ifdef VECTORIZE + int icount = 0; + float r2q[VEC_SIZE] __attribute__((aligned(16))); + float hiq[VEC_SIZE] __attribute__((aligned(16))); + float hjq[VEC_SIZE] __attribute__((aligned(16))); + float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); + struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; +#endif + TIMER_TIC + + /* Anything to do here? */ + if (c->dt_min > dt_step) return; + + /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with + %i/%i parts and shift = [ %g %g %g ].\n" , + ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , + cj->loc[2] , + ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); + tic = getticks(); */ + + /* Loop over the parts in ci. */ + for (pid = 0; pid < count; pid++) { + + /* Get a hold of the ith part in ci. */ + pi = &parts[pid]; + pix[0] = pi->x[0]; + pix[1] = pi->x[1]; + pix[2] = pi->x[2]; + hi = pi->h; + hig2 = hi * hi * kernel_gamma2; - } + /* Loop over the parts in cj. */ + for (pjd = pid + 1; pjd < count; pjd++) { + + /* Get a pointer to the jth particle. */ + pj = &parts[pjd]; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for (k = 0; k < 3; k++) { + dx[k] = pix[k] - pj->x[k]; + r2 += dx[k] * dx[k]; + } + + /* Hit or miss? 
*/ + if (r2 < hig2 || r2 < pj->h * pj->h * kernel_gamma2) { + +#ifndef VECTORIZE + + IACT(r2, dx, hi, pj->h, pi, pj); + +#else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3 * icount + 0] = dx[0]; + dxq[3 * icount + 1] = dx[1]; + dxq[3 * icount + 2] = dx[2]; + hiq[icount] = hi; + hjq[icount] = pj->h; + piq[icount] = pi; + pjq[icount] = pj; + icount += 1; + + /* Flush? */ + if (icount == VEC_SIZE) { + IACT_VEC(r2q, dxq, hiq, hjq, piq, pjq); + icount = 0; + } +#endif + } -void DOSELF_NAIVE ( struct runner *r , struct cell *restrict c ) { - - int pid, pjd, k, count = c->count; - struct part *restrict parts = c->parts; - struct part *restrict pi, *restrict pj; - double pix[3] = {0.0,0.0,0.0}; - float dx[3], hi, hig2, r2; - float dt_step = r->e->dt_step; - #ifdef VECTORIZE - int icount = 0; - float r2q[VEC_SIZE] __attribute__ ((aligned (16))); - float hiq[VEC_SIZE] __attribute__ ((aligned (16))); - float hjq[VEC_SIZE] __attribute__ ((aligned (16))); - float dxq[3*VEC_SIZE] __attribute__ ((aligned (16))); - struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; - #endif - TIMER_TIC - - /* Anything to do here? */ - if ( c->dt_min > dt_step ) - return; - - /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" , - ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , cj->loc[2] , - ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); - tic = getticks(); */ - - /* Loop over the parts in ci. */ - for ( pid = 0 ; pid < count ; pid++ ) { - - /* Get a hold of the ith part in ci. */ - pi = &parts[ pid ]; - pix[0] = pi->x[0]; - pix[1] = pi->x[1]; - pix[2] = pi->x[2]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; - - /* Loop over the parts in cj. */ - for ( pjd = pid+1 ; pjd < count ; pjd++ ) { - - /* Get a pointer to the jth particle. */ - pj = &parts[ pjd ]; - - /* Compute the pairwise distance. 
*/ - r2 = 0.0f; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pix[k] - pj->x[k]; - r2 += dx[k]*dx[k]; - } - - /* Hit or miss? */ - if ( r2 < hig2 || r2 < pj->h*pj->h*kernel_gamma2 ) { - - #ifndef VECTORIZE - - IACT( r2 , dx , hi , pj->h , pi , pj ); - - #else - - /* Add this interaction to the queue. */ - r2q[icount] = r2; - dxq[3*icount+0] = dx[0]; - dxq[3*icount+1] = dx[1]; - dxq[3*icount+2] = dx[2]; - hiq[icount] = hi; - hjq[icount] = pj->h; - piq[icount] = pi; - pjq[icount] = pj; - icount += 1; - - /* Flush? */ - if ( icount == VEC_SIZE ) { - IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq ); - icount = 0; - } - - #endif - - } - - } /* loop over the parts in cj. */ - - } /* loop over the parts in ci. */ - - #ifdef VECTORIZE - /* Pick up any leftovers. */ - if ( icount > 0 ) - for ( k = 0 ; k < icount ; k++ ) - IACT( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] ); - #endif - - #ifdef TIMER_VERBOSE - printf( "runner_doself[%02i]: %i parts at depth %i took %.3f ms.\n" , r->id , count , c->depth , ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000 ); - #else - TIMER_TOC(TIMER_DOSELF); - #endif + } /* loop over the parts in cj. */ - } + } /* loop over the parts in ci. */ +#ifdef VECTORIZE + /* Pick up any leftovers. */ + if (icount > 0) + for (k = 0; k < icount; k++) + IACT(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]); +#endif + +#ifdef TIMER_VERBOSE + printf("runner_doself[%02i]: %i parts at depth %i took %.3f ms.\n", r->id, + count, c->depth, ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000); +#else + TIMER_TOC(TIMER_DOSELF); +#endif +} /** * @brief Compute the interactions between a cell pair, but only for the @@ -322,197 +323,202 @@ void DOSELF_NAIVE ( struct runner *r , struct cell *restrict c ) { * @param count The number of particles in @c ind. * @param cj The second #cell. 
*/ - -void DOPAIR_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *restrict parts_i , int *restrict ind , int count , struct cell *restrict cj ) { - - struct engine *e = r->e; - int pid, pjd, sid, k, count_j = cj->count, flipped; - double shift[3] = { 0.0 , 0.0 , 0.0 }; - struct part *restrict pi, *restrict pj, *restrict parts_j = cj->parts; - double pix[3]; - float dx[3], hi, hig2, r2, di, dxj; - struct entry *sort_j; - #ifdef VECTORIZE - int icount = 0; - float r2q[VEC_SIZE] __attribute__ ((aligned (16))); - float hiq[VEC_SIZE] __attribute__ ((aligned (16))); - float hjq[VEC_SIZE] __attribute__ ((aligned (16))); - float dxq[3*VEC_SIZE] __attribute__ ((aligned (16))); - struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; - #endif - TIMER_TIC - - /* Get the relative distance between the pairs, wrapping. */ - for ( k = 0 ; k < 3 ; k++ ) { - if ( cj->loc[k] - ci->loc[k] < -e->s->dim[k]/2 ) - shift[k] = e->s->dim[k]; - else if ( cj->loc[k] - ci->loc[k] > e->s->dim[k]/2 ) - shift[k] = -e->s->dim[k]; + +void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, + struct part *restrict parts_i, int *restrict ind, int count, + struct cell *restrict cj) { + + struct engine *e = r->e; + int pid, pjd, sid, k, count_j = cj->count, flipped; + double shift[3] = {0.0, 0.0, 0.0}; + struct part *restrict pi, *restrict pj, *restrict parts_j = cj->parts; + double pix[3]; + float dx[3], hi, hig2, r2, di, dxj; + struct entry *sort_j; +#ifdef VECTORIZE + int icount = 0; + float r2q[VEC_SIZE] __attribute__((aligned(16))); + float hiq[VEC_SIZE] __attribute__((aligned(16))); + float hjq[VEC_SIZE] __attribute__((aligned(16))); + float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); + struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; +#endif + TIMER_TIC + + /* Get the relative distance between the pairs, wrapping. 
*/ + for (k = 0; k < 3; k++) { + if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2) + shift[k] = e->s->dim[k]; + else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2) + shift[k] = -e->s->dim[k]; + } + + /* Get the sorting index. */ + for (sid = 0, k = 0; k < 3; k++) + sid = 3 * sid + ((cj->loc[k] - ci->loc[k] + shift[k] < 0) + ? 0 + : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1); + + /* Switch the cells around? */ + flipped = runner_flip[sid]; + sid = sortlistID[sid]; + + /* Have the cells been sorted? */ + if (!(cj->sorted & (1 << sid))) error("Trying to interact unsorted cells."); + + /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with + %i/%i parts and shift = [ %g %g %g ].\n" , + ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , + cj->loc[2] , + ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); + tic = getticks(); */ + + /* Pick-out the sorted lists. */ + sort_j = &cj->sort[sid * (cj->count + 1)]; + dxj = cj->dx_max; + + /* Parts are on the left? */ + if (!flipped) { + + /* Loop over the parts_i. */ + for (pid = 0; pid < count; pid++) { + + /* Get a hold of the ith part in ci. */ + pi = &parts_i[ind[pid]]; + for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + hi = pi->h; + hig2 = hi * hi * kernel_gamma2; + di = hi * kernel_gamma + dxj + pix[0] * runner_shift[3 * sid + 0] + + pix[1] * runner_shift[3 * sid + 1] + + pix[2] * runner_shift[3 * sid + 2]; + + /* Loop over the parts in cj. */ + for (pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) { + + /* Get a pointer to the jth particle. */ + pj = &parts_j[sort_j[pjd].i]; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for (k = 0; k < 3; k++) { + dx[k] = pix[k] - pj->x[k]; + r2 += dx[k] * dx[k]; } - - /* Get the sorting index. */ - for ( sid = 0 , k = 0 ; k < 3 ; k++ ) - sid = 3*sid + ( (cj->loc[k] - ci->loc[k] + shift[k] < 0) ? 0 : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1 ); - - /* Switch the cells around? 
*/ - flipped = runner_flip[sid]; - sid = sortlistID[sid]; - - /* Have the cells been sorted? */ - if ( !(cj->sorted & (1 << sid) ) ) - error( "Trying to interact unsorted cells." ); - - /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" , - ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , cj->loc[2] , - ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); - tic = getticks(); */ - - /* Pick-out the sorted lists. */ - sort_j = &cj->sort[ sid*(cj->count + 1) ]; - dxj = cj->dx_max; - - /* Parts are on the left? */ - if ( !flipped ) { - - /* Loop over the parts_i. */ - for ( pid = 0 ; pid < count ; pid++ ) { - - /* Get a hold of the ith part in ci. */ - pi = &parts_i[ ind[ pid ] ]; - for ( k = 0 ; k < 3 ; k++ ) - pix[k] = pi->x[k] - shift[k]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; - di = hi*kernel_gamma + dxj + pix[0]*runner_shift[ 3*sid + 0 ] + pix[1]*runner_shift[ 3*sid + 1 ] + pix[2]*runner_shift[ 3*sid + 2 ]; - - /* Loop over the parts in cj. */ - for ( pjd = 0 ; pjd < count_j && sort_j[ pjd ].d < di ; pjd++ ) { - - /* Get a pointer to the jth particle. */ - pj = &parts_j[ sort_j[ pjd ].i ]; - - /* Compute the pairwise distance. */ - r2 = 0.0f; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pix[k] - pj->x[k]; - r2 += dx[k]*dx[k]; - } - - /* Hit or miss? */ - if ( r2 < hig2 ) { - - #ifndef VECTORIZE - - IACT_NONSYM( r2 , dx , hi , pj->h , pi , pj ); - - #else - - /* Add this interaction to the queue. */ - r2q[icount] = r2; - dxq[3*icount+0] = dx[0]; - dxq[3*icount+1] = dx[1]; - dxq[3*icount+2] = dx[2]; - hiq[icount] = hi; - hjq[icount] = pj->h; - piq[icount] = pi; - pjq[icount] = pj; - icount += 1; - - /* Flush? */ - if ( icount == VEC_SIZE ) { - IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq ); - icount = 0; - } - - #endif - - } - - } /* loop over the parts in cj. */ - - } /* loop over the parts in ci. */ - + + /* Hit or miss? 
*/ + if (r2 < hig2) { + +#ifndef VECTORIZE + + IACT_NONSYM(r2, dx, hi, pj->h, pi, pj); + +#else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3 * icount + 0] = dx[0]; + dxq[3 * icount + 1] = dx[1]; + dxq[3 * icount + 2] = dx[2]; + hiq[icount] = hi; + hjq[icount] = pj->h; + piq[icount] = pi; + pjq[icount] = pj; + icount += 1; + + /* Flush? */ + if (icount == VEC_SIZE) { + IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq); + icount = 0; + } + +#endif } - - /* Parts are on the right. */ - else { - - /* Loop over the parts_i. */ - for ( pid = 0 ; pid < count ; pid++ ) { - - /* Get a hold of the ith part in ci. */ - pi = &parts_i[ ind[ pid ] ]; - for ( k = 0 ; k < 3 ; k++ ) - pix[k] = pi->x[k] - shift[k]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; - di = -hi*kernel_gamma - dxj + pix[0]*runner_shift[ 3*sid + 0 ] + pix[1]*runner_shift[ 3*sid + 1 ] + pix[2]*runner_shift[ 3*sid + 2 ]; - - /* Loop over the parts in cj. */ - for ( pjd = count_j-1 ; pjd >= 0 && di < sort_j[ pjd ].d ; pjd-- ) { - - /* Get a pointer to the jth particle. */ - pj = &parts_j[ sort_j[ pjd ].i ]; - - /* Compute the pairwise distance. */ - r2 = 0.0f; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pix[k] - pj->x[k]; - r2 += dx[k]*dx[k]; - } - - /* Hit or miss? */ - if ( r2 < hig2 ) { - - #ifndef VECTORIZE - - IACT_NONSYM( r2 , dx , hi , pj->h , pi , pj ); - - #else - - /* Add this interaction to the queue. */ - r2q[icount] = r2; - dxq[3*icount+0] = dx[0]; - dxq[3*icount+1] = dx[1]; - dxq[3*icount+2] = dx[2]; - hiq[icount] = hi; - hjq[icount] = pj->h; - piq[icount] = pi; - pjq[icount] = pj; - icount += 1; - - /* Flush? */ - if ( icount == VEC_SIZE ) { - IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq ); - icount = 0; - } - - #endif - - } - - } /* loop over the parts in cj. */ - - } /* loop over the parts in ci. */ - + + } /* loop over the parts in cj. */ + + } /* loop over the parts in ci. */ + + } + + /* Parts are on the right. */ + else { + + /* Loop over the parts_i. 
*/ + for (pid = 0; pid < count; pid++) { + + /* Get a hold of the ith part in ci. */ + pi = &parts_i[ind[pid]]; + for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + hi = pi->h; + hig2 = hi * hi * kernel_gamma2; + di = -hi * kernel_gamma - dxj + pix[0] * runner_shift[3 * sid + 0] + + pix[1] * runner_shift[3 * sid + 1] + + pix[2] * runner_shift[3 * sid + 2]; + + /* Loop over the parts in cj. */ + for (pjd = count_j - 1; pjd >= 0 && di < sort_j[pjd].d; pjd--) { + + /* Get a pointer to the jth particle. */ + pj = &parts_j[sort_j[pjd].i]; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for (k = 0; k < 3; k++) { + dx[k] = pix[k] - pj->x[k]; + r2 += dx[k] * dx[k]; } - - #ifdef VECTORIZE - /* Pick up any leftovers. */ - if ( icount > 0 ) - for ( k = 0 ; k < icount ; k++ ) - IACT_NONSYM( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] ); - #endif - - #ifdef TIMER_VERBOSE - printf( "runner_dopair_subset[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count , count_j , ci->depth , ci->h_max , cj->h_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 ); - #else - TIMER_TOC(timer_dopair_subset); - #endif + /* Hit or miss? */ + if (r2 < hig2) { - } +#ifndef VECTORIZE + + IACT_NONSYM(r2, dx, hi, pj->h, pi, pj); + +#else + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3 * icount + 0] = dx[0]; + dxq[3 * icount + 1] = dx[1]; + dxq[3 * icount + 2] = dx[2]; + hiq[icount] = hi; + hjq[icount] = pj->h; + piq[icount] = pi; + pjq[icount] = pj; + icount += 1; + + /* Flush? */ + if (icount == VEC_SIZE) { + IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq); + icount = 0; + } + +#endif + } + + } /* loop over the parts in cj. */ + + } /* loop over the parts in ci. */ + } + +#ifdef VECTORIZE + /* Pick up any leftovers. 
*/ + if (icount > 0) + for (k = 0; k < icount; k++) + IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]); +#endif + +#ifdef TIMER_VERBOSE + printf( + "runner_dopair_subset[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) " + "took %.3f ms.\n", + r->id, count, count_j, ci->depth, ci->h_max, cj->h_max, + ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000); +#else + TIMER_TOC(timer_dopair_subset); +#endif +} /** * @brief Compute the interactions between a cell pair, but only for the @@ -525,111 +531,114 @@ void DOPAIR_SUBSET ( struct runner *r , struct cell *restrict ci , struct part * * @param count The number of particles in @c ind. * @param cj The second #cell. */ - -void DOPAIR_SUBSET_NAIVE ( struct runner *r , struct cell *restrict ci , struct part *restrict parts_i , int *restrict ind , int count , struct cell *restrict cj ) { - - struct engine *e = r->e; - int pid, pjd, k, count_j = cj->count; - double shift[3] = { 0.0 , 0.0 , 0.0 }; - struct part *restrict pi, *restrict pj, *restrict parts_j = cj->parts; - double pix[3]; - float dx[3], hi, hig2, r2; - #ifdef VECTORIZE - int icount = 0; - float r2q[VEC_SIZE] __attribute__ ((aligned (16))); - float hiq[VEC_SIZE] __attribute__ ((aligned (16))); - float hjq[VEC_SIZE] __attribute__ ((aligned (16))); - float dxq[3*VEC_SIZE] __attribute__ ((aligned (16))); - struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; - #endif - TIMER_TIC - - /* Get the relative distance between the pairs, wrapping. */ - for ( k = 0 ; k < 3 ; k++ ) { - if ( cj->loc[k] - ci->loc[k] < -e->s->dim[k]/2 ) - shift[k] = e->s->dim[k]; - else if ( cj->loc[k] - ci->loc[k] > e->s->dim[k]/2 ) - shift[k] = -e->s->dim[k]; - } - - /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" , - ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , cj->loc[2] , - ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); - tic = getticks(); */ - - /* Loop over the parts_i. 
*/ - for ( pid = 0 ; pid < count ; pid++ ) { - /* Get a hold of the ith part in ci. */ - pi = &parts_i[ ind[ pid ] ]; - for ( k = 0 ; k < 3 ; k++ ) - pix[k] = pi->x[k] - shift[k]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; +void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci, + struct part *restrict parts_i, int *restrict ind, + int count, struct cell *restrict cj) { + + struct engine *e = r->e; + int pid, pjd, k, count_j = cj->count; + double shift[3] = {0.0, 0.0, 0.0}; + struct part *restrict pi, *restrict pj, *restrict parts_j = cj->parts; + double pix[3]; + float dx[3], hi, hig2, r2; +#ifdef VECTORIZE + int icount = 0; + float r2q[VEC_SIZE] __attribute__((aligned(16))); + float hiq[VEC_SIZE] __attribute__((aligned(16))); + float hjq[VEC_SIZE] __attribute__((aligned(16))); + float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); + struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; +#endif + TIMER_TIC + + /* Get the relative distance between the pairs, wrapping. */ + for (k = 0; k < 3; k++) { + if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2) + shift[k] = e->s->dim[k]; + else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2) + shift[k] = -e->s->dim[k]; + } + + /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with + %i/%i parts and shift = [ %g %g %g ].\n" , + ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , + cj->loc[2] , + ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); + tic = getticks(); */ + + /* Loop over the parts_i. */ + for (pid = 0; pid < count; pid++) { + + /* Get a hold of the ith part in ci. */ + pi = &parts_i[ind[pid]]; + for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + hi = pi->h; + hig2 = hi * hi * kernel_gamma2; - /* Loop over the parts in cj. */ - for ( pjd = 0 ; pjd < count_j ; pjd++ ) { - - /* Get a pointer to the jth particle. */ - pj = &parts_j[ pjd ]; - - /* Compute the pairwise distance. 
*/ - r2 = 0.0f; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pix[k] - pj->x[k]; - r2 += dx[k]*dx[k]; - } - - /* Hit or miss? */ - if ( r2 < hig2 ) { - - #ifndef VECTORIZE - - IACT_NONSYM( r2 , dx , hi , pj->h , pi , pj ); - - #else - - /* Add this interaction to the queue. */ - r2q[icount] = r2; - dxq[3*icount+0] = dx[0]; - dxq[3*icount+1] = dx[1]; - dxq[3*icount+2] = dx[2]; - hiq[icount] = hi; - hjq[icount] = pj->h; - piq[icount] = pi; - pjq[icount] = pj; - icount += 1; - - /* Flush? */ - if ( icount == VEC_SIZE ) { - IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq ); - icount = 0; - } - - #endif - - } - - } /* loop over the parts in cj. */ + /* Loop over the parts in cj. */ + for (pjd = 0; pjd < count_j; pjd++) { + + /* Get a pointer to the jth particle. */ + pj = &parts_j[pjd]; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for (k = 0; k < 3; k++) { + dx[k] = pix[k] - pj->x[k]; + r2 += dx[k] * dx[k]; + } + + /* Hit or miss? */ + if (r2 < hig2) { + +#ifndef VECTORIZE + + IACT_NONSYM(r2, dx, hi, pj->h, pi, pj); + +#else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3 * icount + 0] = dx[0]; + dxq[3 * icount + 1] = dx[1]; + dxq[3 * icount + 2] = dx[2]; + hiq[icount] = hi; + hjq[icount] = pj->h; + piq[icount] = pi; + pjq[icount] = pj; + icount += 1; + + /* Flush? */ + if (icount == VEC_SIZE) { + IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq); + icount = 0; + } - } /* loop over the parts in ci. */ +#endif + } - #ifdef VECTORIZE - /* Pick up any leftovers. */ - if ( icount > 0 ) - for ( k = 0 ; k < icount ; k++ ) - IACT_NONSYM( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] ); - #endif - - #ifdef TIMER_VERBOSE - printf( "runner_dopair_subset[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count , count_j , ci->depth , ci->h_max , cj->h_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 ); - #else - TIMER_TOC(timer_dopair_subset); - #endif + } /* loop over the parts in cj. 
*/ + } /* loop over the parts in ci. */ - } +#ifdef VECTORIZE + /* Pick up any leftovers. */ + if (icount > 0) + for (k = 0; k < icount; k++) + IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]); +#endif +#ifdef TIMER_VERBOSE + printf( + "runner_dopair_subset[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) " + "took %.3f ms.\n", + r->id, count, count_j, ci->depth, ci->h_max, cj->h_max, + ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000); +#else + TIMER_TOC(timer_dopair_subset); +#endif +} /** * @brief Compute the interactions between a cell pair, but only for the @@ -641,103 +650,104 @@ void DOPAIR_SUBSET_NAIVE ( struct runner *r , struct cell *restrict ci , struct * @param ind The list of indices of particles in @c ci to interact with. * @param count The number of particles in @c ind. */ - -void DOSELF_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *restrict parts , int *restrict ind , int count ) { - - int pid, pjd, k, count_i = ci->count; - struct part *restrict parts_j = ci->parts; - struct part *restrict pi, *restrict pj; - double pix[3] = {0.0,0.0,0.0}; - float dx[3], hi, hig2, r2; - #ifdef VECTORIZE - int icount = 0; - float r2q[VEC_SIZE] __attribute__ ((aligned (16))); - float hiq[VEC_SIZE] __attribute__ ((aligned (16))); - float hjq[VEC_SIZE] __attribute__ ((aligned (16))); - float dxq[3*VEC_SIZE] __attribute__ ((aligned (16))); - struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; - #endif - TIMER_TIC - - /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" , - ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , cj->loc[2] , - ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); - tic = getticks(); */ - - /* Loop over the parts in ci. */ - for ( pid = 0 ; pid < count ; pid++ ) { - - /* Get a hold of the ith part in ci. 
*/ - pi = &parts[ ind[ pid ] ]; - pix[0] = pi->x[0]; - pix[1] = pi->x[1]; - pix[2] = pi->x[2]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; - - /* Loop over the parts in cj. */ - for ( pjd = 0 ; pjd < count_i ; pjd++ ) { - - /* Get a pointer to the jth particle. */ - pj = &parts_j[ pjd ]; - - /* Compute the pairwise distance. */ - r2 = 0.0f; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pix[k] - pj->x[k]; - r2 += dx[k]*dx[k]; - } - - /* Hit or miss? */ - if ( r2 > 0.0f && r2 < hig2 ) { - - #ifndef VECTORIZE - - IACT_NONSYM( r2 , dx , hi , pj->h , pi , pj ); - - #else - - /* Add this interaction to the queue. */ - r2q[icount] = r2; - dxq[3*icount+0] = dx[0]; - dxq[3*icount+1] = dx[1]; - dxq[3*icount+2] = dx[2]; - hiq[icount] = hi; - hjq[icount] = pj->h; - piq[icount] = pi; - pjq[icount] = pj; - icount += 1; - - /* Flush? */ - if ( icount == VEC_SIZE ) { - IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq ); - icount = 0; - } - - #endif - - } - - } /* loop over the parts in cj. */ - - } /* loop over the parts in ci. */ - - #ifdef VECTORIZE - /* Pick up any leftovers. 
*/ - if ( icount > 0 ) - for ( k = 0 ; k < icount ; k++ ) - IACT_NONSYM( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] ); - #endif - - #ifdef TIMER_VERBOSE - printf( "runner_doself_subset[%02i]: %i/%i parts at depth %i took %.3f ms.\n" , r->id , count , ci->count , ci->depth , ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000 ); - #else - TIMER_TOC(timer_dopair_subset); - #endif +void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci, + struct part *restrict parts, int *restrict ind, int count) { + + int pid, pjd, k, count_i = ci->count; + struct part *restrict parts_j = ci->parts; + struct part *restrict pi, *restrict pj; + double pix[3] = {0.0, 0.0, 0.0}; + float dx[3], hi, hig2, r2; +#ifdef VECTORIZE + int icount = 0; + float r2q[VEC_SIZE] __attribute__((aligned(16))); + float hiq[VEC_SIZE] __attribute__((aligned(16))); + float hjq[VEC_SIZE] __attribute__((aligned(16))); + float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); + struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; +#endif + TIMER_TIC + + /* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with + %i/%i parts and shift = [ %g %g %g ].\n" , + ci->loc[0] , ci->loc[1] , ci->loc[2] , cj->loc[0] , cj->loc[1] , + cj->loc[2] , + ci->count , cj->count , shift[0] , shift[1] , shift[2] ); fflush(stdout); + tic = getticks(); */ + + /* Loop over the parts in ci. */ + for (pid = 0; pid < count; pid++) { + + /* Get a hold of the ith part in ci. */ + pi = &parts[ind[pid]]; + pix[0] = pi->x[0]; + pix[1] = pi->x[1]; + pix[2] = pi->x[2]; + hi = pi->h; + hig2 = hi * hi * kernel_gamma2; - } + /* Loop over the parts in cj. */ + for (pjd = 0; pjd < count_i; pjd++) { + + /* Get a pointer to the jth particle. */ + pj = &parts_j[pjd]; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for (k = 0; k < 3; k++) { + dx[k] = pix[k] - pj->x[k]; + r2 += dx[k] * dx[k]; + } + + /* Hit or miss? 
*/ + if (r2 > 0.0f && r2 < hig2) { + +#ifndef VECTORIZE + + IACT_NONSYM(r2, dx, hi, pj->h, pi, pj); + +#else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3 * icount + 0] = dx[0]; + dxq[3 * icount + 1] = dx[1]; + dxq[3 * icount + 2] = dx[2]; + hiq[icount] = hi; + hjq[icount] = pj->h; + piq[icount] = pi; + pjq[icount] = pj; + icount += 1; + + /* Flush? */ + if (icount == VEC_SIZE) { + IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq); + icount = 0; + } + +#endif + } + + } /* loop over the parts in cj. */ + + } /* loop over the parts in ci. */ +#ifdef VECTORIZE + /* Pick up any leftovers. */ + if (icount > 0) + for (k = 0; k < icount; k++) + IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]); +#endif + +#ifdef TIMER_VERBOSE + printf("runner_doself_subset[%02i]: %i/%i parts at depth %i took %.3f ms.\n", + r->id, count, ci->count, ci->depth, + ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000); +#else + TIMER_TOC(timer_dopair_subset); +#endif +} /** * @brief Compute the interactions between a cell pair. @@ -746,605 +756,596 @@ void DOSELF_SUBSET ( struct runner *r , struct cell *restrict ci , struct part * * @param ci The first #cell. * @param cj The second #cell. 
*/ - -void DOPAIR1 ( struct runner *r , struct cell *ci , struct cell *cj ) { - - struct engine *restrict e = r->e; - int pid, pjd, k, sid; - double rshift, shift[3] = { 0.0 , 0.0 , 0.0 }; - struct entry *restrict sort_i, *restrict sort_j; - struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j; - double pix[3], pjx[3], di, dj; - float dx[3], hi, hig2, hj, hjg2, r2, dx_max; - double hi_max, hj_max; - double di_max, dj_min; - int count_i, count_j; - float dt_step = e->dt_step; - #ifdef VECTORIZE - int icount = 0; - float r2q[VEC_SIZE] __attribute__ ((aligned (16))); - float hiq[VEC_SIZE] __attribute__ ((aligned (16))); - float hjq[VEC_SIZE] __attribute__ ((aligned (16))); - float dxq[3*VEC_SIZE] __attribute__ ((aligned (16))); - struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; - #endif - TIMER_TIC - - /* Anything to do here? */ - if ( ci->dt_min > dt_step && cj->dt_min > dt_step ) - return; - - /* Get the sort ID. */ - sid = space_getsid( e->s , &ci , &cj , shift ); - - /* Have the cells been sorted? */ - if ( !(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid) ) ) - error( "Trying to interact unsorted cells." ); - - /* Get the cutoff shift. */ - for ( rshift = 0.0 , k = 0 ; k < 3 ; k++ ) - rshift += shift[k]*runner_shift[ 3*sid + k ]; - - /* Pick-out the sorted lists. */ - sort_i = &ci->sort[ sid*(ci->count + 1) ]; - sort_j = &cj->sort[ sid*(cj->count + 1) ]; - - /* Get some other useful values. */ - hi_max = ci->h_max*kernel_gamma - rshift; hj_max = cj->h_max*kernel_gamma; - count_i = ci->count; count_j = cj->count; - parts_i = ci->parts; parts_j = cj->parts; - di_max = sort_i[count_i-1].d - rshift; - dj_min = sort_j[0].d; - dx_max = ( ci->dx_max + cj->dx_max ); - - /* Loop over the parts in ci. */ - for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min ; pid-- ) { - - /* Get a hold of the ith part in ci. 
*/ - pi = &parts_i[ sort_i[ pid ].i ]; - if ( pi->dt > dt_step ) - continue; - hi = pi->h; - di = sort_i[pid].d + hi*kernel_gamma + dx_max - rshift; - if ( di < dj_min ) - continue; - - hig2 = hi * hi * kernel_gamma2; - for ( k = 0 ; k < 3 ; k++ ) - pix[k] = pi->x[k] - shift[k]; - - /* Loop over the parts in cj. */ - for ( pjd = 0 ; pjd < count_j && sort_j[pjd].d < di ; pjd++ ) { - - /* Get a pointer to the jth particle. */ - pj = &parts_j[ sort_j[pjd].i ]; - - /* Compute the pairwise distance. */ - r2 = 0.0f; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pix[k] - pj->x[k]; - r2 += dx[k]*dx[k]; - } - - /* Hit or miss? */ - if ( r2 < hig2 ) { - - #ifndef VECTORIZE - - IACT_NONSYM( r2 , dx , hi , pj->h , pi , pj ); - - #else - - /* Add this interaction to the queue. */ - r2q[icount] = r2; - dxq[3*icount+0] = dx[0]; - dxq[3*icount+1] = dx[1]; - dxq[3*icount+2] = dx[2]; - hiq[icount] = hi; - hjq[icount] = pj->h; - piq[icount] = pi; - pjq[icount] = pj; - icount += 1; - - /* Flush? */ - if ( icount == VEC_SIZE ) { - IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq ); - icount = 0; - } - - #endif - - } - - } /* loop over the parts in cj. */ - - } /* loop over the parts in ci. 
*/ - - /* printf( "runner_dopair: first half took %.3f ms...\n" , ((double)(getticks() - tic)) / CPU_TPS * 1000 ); - tic = getticks(); */ +void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) { + + struct engine *restrict e = r->e; + int pid, pjd, k, sid; + double rshift, shift[3] = {0.0, 0.0, 0.0}; + struct entry *restrict sort_i, *restrict sort_j; + struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j; + double pix[3], pjx[3], di, dj; + float dx[3], hi, hig2, hj, hjg2, r2, dx_max; + double hi_max, hj_max; + double di_max, dj_min; + int count_i, count_j; + float dt_step = e->dt_step; +#ifdef VECTORIZE + int icount = 0; + float r2q[VEC_SIZE] __attribute__((aligned(16))); + float hiq[VEC_SIZE] __attribute__((aligned(16))); + float hjq[VEC_SIZE] __attribute__((aligned(16))); + float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); + struct part *piq[VEC_SIZE], *pjq[VEC_SIZE]; +#endif + TIMER_TIC + + /* Anything to do here? */ + if (ci->dt_min > dt_step && cj->dt_min > dt_step) return; + + /* Get the sort ID. */ + sid = space_getsid(e->s, &ci, &cj, shift); + + /* Have the cells been sorted? */ + if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid))) + error("Trying to interact unsorted cells."); + + /* Get the cutoff shift. */ + for (rshift = 0.0, k = 0; k < 3; k++) + rshift += shift[k] * runner_shift[3 * sid + k]; + + /* Pick-out the sorted lists. */ + sort_i = &ci->sort[sid * (ci->count + 1)]; + sort_j = &cj->sort[sid * (cj->count + 1)]; + + /* Get some other useful values. */ + hi_max = ci->h_max * kernel_gamma - rshift; + hj_max = cj->h_max * kernel_gamma; + count_i = ci->count; + count_j = cj->count; + parts_i = ci->parts; + parts_j = cj->parts; + di_max = sort_i[count_i - 1].d - rshift; + dj_min = sort_j[0].d; + dx_max = (ci->dx_max + cj->dx_max); + + /* Loop over the parts in ci. */ + for (pid = count_i - 1; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; + pid--) { + + /* Get a hold of the ith part in ci. 
*/ + pi = &parts_i[sort_i[pid].i]; + if (pi->dt > dt_step) continue; + hi = pi->h; + di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift; + if (di < dj_min) continue; + + hig2 = hi * hi * kernel_gamma2; + for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; /* Loop over the parts in cj. */ - for ( pjd = 0 ; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max ; pjd++ ) { - - /* Get a hold of the jth part in cj. */ - pj = &parts_j[ sort_j[ pjd ].i ]; - if ( pj->dt > dt_step ) - continue; + for (pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) { + + /* Get a pointer to the jth particle. */ + pj = &parts_j[sort_j[pjd].i]; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for (k = 0; k < 3; k++) { + dx[k] = pix[k] - pj->x[k]; + r2 += dx[k] * dx[k]; + } + + /* Hit or miss? */ + if (r2 < hig2) { + +#ifndef VECTORIZE + + IACT_NONSYM(r2, dx, hi, pj->h, pi, pj); + +#else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3 * icount + 0] = dx[0]; + dxq[3 * icount + 1] = dx[1]; + dxq[3 * icount + 2] = dx[2]; + hiq[icount] = hi; + hjq[icount] = pj->h; + piq[icount] = pi; + pjq[icount] = pj; + icount += 1; + + /* Flush? */ + if (icount == VEC_SIZE) { + IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq); + icount = 0; + } + +#endif + } + + } /* loop over the parts in cj. */ + + } /* loop over the parts in ci. */ + + /* printf( "runner_dopair: first half took %.3f ms...\n" , + ((double)(getticks() - tic)) / CPU_TPS * 1000 ); + tic = getticks(); */ + + /* Loop over the parts in cj. */ + for (pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max; + pjd++) { + + /* Get a hold of the jth part in cj. */ + pj = &parts_j[sort_j[pjd].i]; + if (pj->dt > dt_step) continue; + hj = pj->h; + dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; + if (dj > di_max) continue; + + for (k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k]; + hjg2 = hj * hj * kernel_gamma2; + + /* Loop over the parts in ci. 
*/ + for (pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) { + + /* Get a pointer to the jth particle. */ + pi = &parts_i[sort_i[pid].i]; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for (k = 0; k < 3; k++) { + dx[k] = pjx[k] - pi->x[k]; + r2 += dx[k] * dx[k]; + } + + /* Hit or miss? */ + if (r2 < hjg2) { + +#ifndef VECTORIZE + + IACT_NONSYM(r2, dx, hj, pi->h, pj, pi); + +#else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3 * icount + 0] = dx[0]; + dxq[3 * icount + 1] = dx[1]; + dxq[3 * icount + 2] = dx[2]; + hiq[icount] = hj; + hjq[icount] = pi->h; + piq[icount] = pj; + pjq[icount] = pi; + icount += 1; + + /* Flush? */ + if (icount == VEC_SIZE) { + IACT_NONSYM_VEC(r2q, dxq, hiq, hjq, piq, pjq); + icount = 0; + } + +#endif + } + + } /* loop over the parts in cj. */ + + } /* loop over the parts in ci. */ + +#ifdef VECTORIZE + /* Pick up any leftovers. */ + if (icount > 0) + for (k = 0; k < icount; k++) + IACT_NONSYM(r2q[k], &dxq[3 * k], hiq[k], hjq[k], piq[k], pjq[k]); +#endif + +#ifdef TIMER_VERBOSE + printf( + "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) " + "took %.3f ms.\n", + r->id, count_i, count_j, ci->depth, ci->h_max, cj->h_max, + fmax(ci->h[0], fmax(ci->h[1], ci->h[2])), + ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000); +#else + TIMER_TOC(TIMER_DOPAIR); +#endif +} + +void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { + + struct engine *restrict e = r->e; + int pid, pjd, k, sid; + double rshift, shift[3] = {0.0, 0.0, 0.0}; + struct entry *restrict sort_i, *restrict sort_j; + struct entry *restrict sortdt_i = NULL, *restrict sortdt_j = NULL; + int countdt_i = 0, countdt_j = 0; + struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j; + double pix[3], pjx[3], di, dj; + float dx[3], hi, hig2, hj, hjg2, r2, dx_max; + double hi_max, hj_max; + double di_max, dj_min; + int count_i, count_j; + float dt_step = e->dt_step; +#ifdef VECTORIZE + int 
icount1 = 0; + float r2q1[VEC_SIZE] __attribute__((aligned(16))); + float hiq1[VEC_SIZE] __attribute__((aligned(16))); + float hjq1[VEC_SIZE] __attribute__((aligned(16))); + float dxq1[3 * VEC_SIZE] __attribute__((aligned(16))); + struct part *piq1[VEC_SIZE], *pjq1[VEC_SIZE]; + int icount2 = 0; + float r2q2[VEC_SIZE] __attribute__((aligned(16))); + float hiq2[VEC_SIZE] __attribute__((aligned(16))); + float hjq2[VEC_SIZE] __attribute__((aligned(16))); + float dxq2[3 * VEC_SIZE] __attribute__((aligned(16))); + struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE]; +#endif + TIMER_TIC + + /* Anything to do here? */ + if (ci->dt_min > dt_step && cj->dt_min > dt_step) return; + + /* Get the shift ID. */ + sid = space_getsid(e->s, &ci, &cj, shift); + + /* Have the cells been sorted? */ + if (!(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid))) + error("Trying to interact unsorted cells."); + + /* Get the cutoff shift. */ + for (rshift = 0.0, k = 0; k < 3; k++) + rshift += shift[k] * runner_shift[3 * sid + k]; + + /* Pick-out the sorted lists. */ + sort_i = &ci->sort[sid * (ci->count + 1)]; + sort_j = &cj->sort[sid * (cj->count + 1)]; + + /* Get some other useful values. */ + hi_max = ci->h_max * kernel_gamma - rshift; + hj_max = cj->h_max * kernel_gamma; + count_i = ci->count; + count_j = cj->count; + parts_i = ci->parts; + parts_j = cj->parts; + di_max = sort_i[count_i - 1].d - rshift; + dj_min = sort_j[0].d; + dx_max = (ci->dx_max + cj->dx_max); + + /* Collect the number of parts left and right below dt. 
*/ + if (ci->dt_max <= dt_step) { + sortdt_i = sort_i; + countdt_i = count_i; + } else if (ci->dt_min <= dt_step) { + if ((sortdt_i = (struct entry *)alloca(sizeof(struct entry) * count_i)) == + NULL) + error("Failed to allocate dt sortlists."); + for (k = 0; k < count_i; k++) + if (parts_i[sort_i[k].i].dt <= dt_step) { + sortdt_i[countdt_i] = sort_i[k]; + countdt_i += 1; + } + } + if (cj->dt_max <= dt_step) { + sortdt_j = sort_j; + countdt_j = count_j; + } else if (cj->dt_min <= dt_step) { + if ((sortdt_j = (struct entry *)alloca(sizeof(struct entry) * count_j)) == + NULL) + error("Failed to allocate dt sortlists."); + for (k = 0; k < count_j; k++) + if (parts_j[sort_j[k].i].dt <= dt_step) { + sortdt_j[countdt_j] = sort_j[k]; + countdt_j += 1; + } + } + + /* Loop over the parts in ci. */ + for (pid = count_i - 1; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; + pid--) { + + /* Get a hold of the ith part in ci. */ + pi = &parts_i[sort_i[pid].i]; + hi = pi->h; + di = sort_i[pid].d + hi * kernel_gamma + dx_max - rshift; + if (di < dj_min) continue; + + hig2 = hi * hi * kernel_gamma2; + for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + + /* Look at valid dt parts only? */ + if (pi->dt > dt_step) { + + /* Loop over the parts in cj within dt. */ + for (pjd = 0; pjd < countdt_j && sortdt_j[pjd].d < di; pjd++) { + + /* Get a pointer to the jth particle. */ + pj = &parts_j[sortdt_j[pjd].i]; hj = pj->h; - dj = sort_j[pjd].d - hj*kernel_gamma - dx_max - rshift; - if ( dj > di_max ) - continue; - - for ( k = 0 ; k < 3 ; k++ ) - pjx[k] = pj->x[k] + shift[k]; - hjg2 = hj * hj * kernel_gamma2; - - /* Loop over the parts in ci. */ - for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d > dj ; pid-- ) { - - /* Get a pointer to the jth particle. */ - pi = &parts_i[ sort_i[pid].i ]; - - /* Compute the pairwise distance. */ - r2 = 0.0f; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pjx[k] - pi->x[k]; - r2 += dx[k]*dx[k]; - } - - /* Hit or miss? 
*/ - if ( r2 < hjg2 ) { - - #ifndef VECTORIZE - - IACT_NONSYM( r2 , dx , hj , pi->h , pj , pi ); - - #else - - /* Add this interaction to the queue. */ - r2q[icount] = r2; - dxq[3*icount+0] = dx[0]; - dxq[3*icount+1] = dx[1]; - dxq[3*icount+2] = dx[2]; - hiq[icount] = hj; - hjq[icount] = pi->h; - piq[icount] = pj; - pjq[icount] = pi; - icount += 1; - - /* Flush? */ - if ( icount == VEC_SIZE ) { - IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq ); - icount = 0; - } - - #endif - - } - - } /* loop over the parts in cj. */ - - } /* loop over the parts in ci. */ - - #ifdef VECTORIZE - /* Pick up any leftovers. */ - if ( icount > 0 ) - for ( k = 0 ; k < icount ; k++ ) - IACT_NONSYM( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] ); - #endif - - #ifdef TIMER_VERBOSE - printf( "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , fmax(ci->h[0],fmax(ci->h[1],ci->h[2])) , ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000 ); - #else - TIMER_TOC(TIMER_DOPAIR); - #endif + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for (k = 0; k < 3; k++) { + dx[k] = pj->x[k] - pix[k]; + r2 += dx[k] * dx[k]; + } + + /* Hit or miss? */ + if (r2 < hig2) { + +#ifndef VECTORIZE + + IACT_NONSYM(r2, dx, hj, hi, pj, pi); + +#else + + /* Add this interaction to the queue. */ + r2q1[icount1] = r2; + dxq1[3 * icount1 + 0] = dx[0]; + dxq1[3 * icount1 + 1] = dx[1]; + dxq1[3 * icount1 + 2] = dx[2]; + hiq1[icount1] = hj; + hjq1[icount1] = hi; + piq1[icount1] = pj; + pjq1[icount1] = pi; + icount1 += 1; + + /* Flush? */ + if (icount1 == VEC_SIZE) { + IACT_NONSYM_VEC(r2q1, dxq1, hiq1, hjq1, piq1, pjq1); + icount1 = 0; + } + +#endif + } + + } /* loop over the parts in cj. */ } + /* Otherwise, look at all parts. */ + else { + + /* Loop over the parts in cj. */ + for (pjd = 0; pjd < count_j && sort_j[pjd].d < di; pjd++) { + + /* Get a pointer to the jth particle. 
*/ + pj = &parts_j[sort_j[pjd].i]; + hj = pj->h; -void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) { - - struct engine *restrict e = r->e; - int pid, pjd, k, sid; - double rshift, shift[3] = { 0.0 , 0.0 , 0.0 }; - struct entry *restrict sort_i, *restrict sort_j; - struct entry *restrict sortdt_i = NULL, *restrict sortdt_j = NULL; - int countdt_i = 0, countdt_j = 0; - struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j; - double pix[3], pjx[3], di, dj; - float dx[3], hi, hig2, hj, hjg2, r2, dx_max; - double hi_max, hj_max; - double di_max, dj_min; - int count_i, count_j; - float dt_step = e->dt_step; - #ifdef VECTORIZE - int icount1 = 0; - float r2q1[VEC_SIZE] __attribute__ ((aligned (16))); - float hiq1[VEC_SIZE] __attribute__ ((aligned (16))); - float hjq1[VEC_SIZE] __attribute__ ((aligned (16))); - float dxq1[3*VEC_SIZE] __attribute__ ((aligned (16))); - struct part *piq1[VEC_SIZE], *pjq1[VEC_SIZE]; - int icount2 = 0; - float r2q2[VEC_SIZE] __attribute__ ((aligned (16))); - float hiq2[VEC_SIZE] __attribute__ ((aligned (16))); - float hjq2[VEC_SIZE] __attribute__ ((aligned (16))); - float dxq2[3*VEC_SIZE] __attribute__ ((aligned (16))); - struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE]; - #endif - TIMER_TIC - - /* Anything to do here? */ - if ( ci->dt_min > dt_step && cj->dt_min > dt_step ) - return; - - /* Get the shift ID. */ - sid = space_getsid( e->s , &ci , &cj , shift ); - - /* Have the cells been sorted? */ - if ( !(ci->sorted & (1 << sid)) || !(cj->sorted & (1 << sid) ) ) - error( "Trying to interact unsorted cells." ); - - /* Get the cutoff shift. */ - for ( rshift = 0.0 , k = 0 ; k < 3 ; k++ ) - rshift += shift[k]*runner_shift[ 3*sid + k ]; - - /* Pick-out the sorted lists. */ - sort_i = &ci->sort[ sid*(ci->count + 1) ]; - sort_j = &cj->sort[ sid*(cj->count + 1) ]; - - /* Get some other useful values. 
*/ - hi_max = ci->h_max*kernel_gamma - rshift; hj_max = cj->h_max*kernel_gamma; - count_i = ci->count; count_j = cj->count; - parts_i = ci->parts; parts_j = cj->parts; - di_max = sort_i[count_i-1].d - rshift; - dj_min = sort_j[0].d; - dx_max = ( ci->dx_max + cj->dx_max ); - - /* Collect the number of parts left and right below dt. */ - if ( ci->dt_max <= dt_step ) { - sortdt_i = sort_i; - countdt_i = count_i; + /* Compute the pairwise distance. */ + r2 = 0.0f; + for (k = 0; k < 3; k++) { + dx[k] = pix[k] - pj->x[k]; + r2 += dx[k] * dx[k]; } - else if ( ci->dt_min <= dt_step ) { - if ( ( sortdt_i = (struct entry *)alloca( sizeof(struct entry) * count_i ) ) == NULL ) - error( "Failed to allocate dt sortlists." ); - for ( k = 0 ; k < count_i ; k++ ) - if ( parts_i[ sort_i[ k ].i ].dt <= dt_step ) { - sortdt_i[ countdt_i ] = sort_i[k]; - countdt_i += 1; - } + + /* Hit or miss? */ + if (r2 < hig2) { + +#ifndef VECTORIZE + + /* Does pj need to be updated too? */ + if (pj->dt <= dt_step) + IACT(r2, dx, hi, hj, pi, pj); + else + IACT_NONSYM(r2, dx, hi, hj, pi, pj); + +#else + + /* Does pj need to be updated too? */ + if (pj->dt <= dt_step) { + + /* Add this interaction to the symmetric queue. */ + r2q2[icount2] = r2; + dxq2[3 * icount2 + 0] = dx[0]; + dxq2[3 * icount2 + 1] = dx[1]; + dxq2[3 * icount2 + 2] = dx[2]; + hiq2[icount2] = hi; + hjq2[icount2] = hj; + piq2[icount2] = pi; + pjq2[icount2] = pj; + icount2 += 1; + + /* Flush? */ + if (icount2 == VEC_SIZE) { + IACT_VEC(r2q2, dxq2, hiq2, hjq2, piq2, pjq2); + icount2 = 0; + } + + } else { + + /* Add this interaction to the non-symmetric queue. */ + r2q1[icount1] = r2; + dxq1[3 * icount1 + 0] = dx[0]; + dxq1[3 * icount1 + 1] = dx[1]; + dxq1[3 * icount1 + 2] = dx[2]; + hiq1[icount1] = hi; + hjq1[icount1] = hj; + piq1[icount1] = pi; + pjq1[icount1] = pj; + icount1 += 1; + + /* Flush? 
*/ + if (icount1 == VEC_SIZE) { + IACT_NONSYM_VEC(r2q1, dxq1, hiq1, hjq1, piq1, pjq1); + icount1 = 0; + } + } + +#endif } - if ( cj->dt_max <= dt_step ) { - sortdt_j = sort_j; - countdt_j = count_j; + + } /* loop over the parts in cj. */ + } + + } /* loop over the parts in ci. */ + + /* printf( "runner_dopair: first half took %.3f ms...\n" , + ((double)(getticks() - tic)) / CPU_TPS * 1000 ); + tic = getticks(); */ + + /* Loop over the parts in cj. */ + for (pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max; + pjd++) { + + /* Get a hold of the jth part in cj. */ + pj = &parts_j[sort_j[pjd].i]; + hj = pj->h; + dj = sort_j[pjd].d - hj * kernel_gamma - dx_max - rshift; + if (dj > di_max) continue; + + for (k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k]; + hjg2 = hj * hj * kernel_gamma2; + + /* Is this particle outside the dt? */ + if (pj->dt > dt_step) { + + /* Loop over the parts in ci. */ + for (pid = countdt_i - 1; pid >= 0 && sortdt_i[pid].d > dj; pid--) { + + /* Get a pointer to the jth particle. */ + pi = &parts_i[sortdt_i[pid].i]; + hi = pi->h; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for (k = 0; k < 3; k++) { + dx[k] = pi->x[k] - pjx[k]; + r2 += dx[k] * dx[k]; } - else if ( cj->dt_min <= dt_step ) { - if ( ( sortdt_j = (struct entry *)alloca( sizeof(struct entry) * count_j ) ) == NULL ) - error( "Failed to allocate dt sortlists." ); - for ( k = 0 ; k < count_j ; k++ ) - if ( parts_j[ sort_j[ k ].i ].dt <= dt_step ) { - sortdt_j[ countdt_j ] = sort_j[k]; - countdt_j += 1; - } + + /* Hit or miss? */ + if (r2 < hjg2 && r2 > hi * hi * kernel_gamma2) { + +#ifndef VECTORIZE + + IACT_NONSYM(r2, dx, hi, hj, pi, pj); + +#else + + /* Add this interaction to the queue. */ + r2q1[icount1] = r2; + dxq1[3 * icount1 + 0] = dx[0]; + dxq1[3 * icount1 + 1] = dx[1]; + dxq1[3 * icount1 + 2] = dx[2]; + hiq1[icount1] = hi; + hjq1[icount1] = hj; + piq1[icount1] = pi; + pjq1[icount1] = pj; + icount1 += 1; + + /* Flush? 
*/ + if (icount1 == VEC_SIZE) { + IACT_NONSYM_VEC(r2q1, dxq1, hiq1, hjq1, piq1, pjq1); + icount1 = 0; + } + +#endif } - - /* Loop over the parts in ci. */ - for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min ; pid-- ) { - - /* Get a hold of the ith part in ci. */ - pi = &parts_i[ sort_i[ pid ].i ]; + + } /* loop over the parts in cj. */ + } + + /* Otherwise, interact with all particles in cj. */ + else { + + /* Loop over the parts in ci. */ + for (pid = count_i - 1; pid >= 0 && sort_i[pid].d > dj; pid--) { + + /* Get a pointer to the jth particle. */ + pi = &parts_i[sort_i[pid].i]; hi = pi->h; - di = sort_i[pid].d + hi*kernel_gamma + dx_max - rshift; - if ( di < dj_min ) - continue; - - hig2 = hi * hi * kernel_gamma2; - for ( k = 0 ; k < 3 ; k++ ) - pix[k] = pi->x[k] - shift[k]; - - /* Look at valid dt parts only? */ - if ( pi->dt > dt_step ) { - - /* Loop over the parts in cj within dt. */ - for ( pjd = 0 ; pjd < countdt_j && sortdt_j[pjd].d < di ; pjd++ ) { - - /* Get a pointer to the jth particle. */ - pj = &parts_j[ sortdt_j[pjd].i ]; - hj = pj->h; - - /* Compute the pairwise distance. */ - r2 = 0.0f; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pj->x[k] - pix[k]; - r2 += dx[k]*dx[k]; - } - - /* Hit or miss? */ - if ( r2 < hig2 ) { - - #ifndef VECTORIZE - - IACT_NONSYM( r2 , dx , hj , hi , pj , pi ); - - #else - - /* Add this interaction to the queue. */ - r2q1[icount1] = r2; - dxq1[3*icount1+0] = dx[0]; - dxq1[3*icount1+1] = dx[1]; - dxq1[3*icount1+2] = dx[2]; - hiq1[icount1] = hj; - hjq1[icount1] = hi; - piq1[icount1] = pj; - pjq1[icount1] = pi; - icount1 += 1; - - /* Flush? */ - if ( icount1 == VEC_SIZE ) { - IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 ); - icount1 = 0; - } - - #endif - - } - - } /* loop over the parts in cj. */ - - } - - /* Otherwise, look at all parts. */ - else { - - /* Loop over the parts in cj. 
*/ - for ( pjd = 0 ; pjd < count_j && sort_j[pjd].d < di ; pjd++ ) { - - /* Get a pointer to the jth particle. */ - pj = &parts_j[ sort_j[pjd].i ]; - hj = pj->h; - - /* Compute the pairwise distance. */ - r2 = 0.0f; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pix[k] - pj->x[k]; - r2 += dx[k]*dx[k]; - } - - /* Hit or miss? */ - if ( r2 < hig2 ) { - - #ifndef VECTORIZE - - /* Does pj need to be updated too? */ - if ( pj->dt <= dt_step ) - IACT( r2 , dx , hi , hj , pi , pj ); - else - IACT_NONSYM( r2 , dx , hi , hj , pi , pj ); - - #else - - /* Does pj need to be updated too? */ - if ( pj->dt <= dt_step ) { - - /* Add this interaction to the symmetric queue. */ - r2q2[icount2] = r2; - dxq2[3*icount2+0] = dx[0]; - dxq2[3*icount2+1] = dx[1]; - dxq2[3*icount2+2] = dx[2]; - hiq2[icount2] = hi; - hjq2[icount2] = hj; - piq2[icount2] = pi; - pjq2[icount2] = pj; - icount2 += 1; - - /* Flush? */ - if ( icount2 == VEC_SIZE ) { - IACT_VEC( r2q2 , dxq2 , hiq2 , hjq2 , piq2 , pjq2 ); - icount2 = 0; - } - - } - - else { - - /* Add this interaction to the non-symmetric queue. */ - r2q1[icount1] = r2; - dxq1[3*icount1+0] = dx[0]; - dxq1[3*icount1+1] = dx[1]; - dxq1[3*icount1+2] = dx[2]; - hiq1[icount1] = hi; - hjq1[icount1] = hj; - piq1[icount1] = pi; - pjq1[icount1] = pj; - icount1 += 1; - - /* Flush? */ - if ( icount1 == VEC_SIZE ) { - IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 ); - icount1 = 0; - } - - } - - #endif - - } - - } /* loop over the parts in cj. */ - - } - - } /* loop over the parts in ci. */ - - /* printf( "runner_dopair: first half took %.3f ms...\n" , ((double)(getticks() - tic)) / CPU_TPS * 1000 ); - tic = getticks(); */ - /* Loop over the parts in cj. */ - for ( pjd = 0 ; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max ; pjd++ ) { - - /* Get a hold of the jth part in cj. 
*/ - pj = &parts_j[ sort_j[ pjd ].i ]; - hj = pj->h; - dj = sort_j[pjd].d - hj*kernel_gamma - dx_max - rshift; - if ( dj > di_max ) - continue; - - for ( k = 0 ; k < 3 ; k++ ) - pjx[k] = pj->x[k] + shift[k]; - hjg2 = hj * hj * kernel_gamma2; - - /* Is this particle outside the dt? */ - if ( pj->dt > dt_step ) { - - /* Loop over the parts in ci. */ - for ( pid = countdt_i-1 ; pid >= 0 && sortdt_i[pid].d > dj ; pid-- ) { - - /* Get a pointer to the jth particle. */ - pi = &parts_i[ sortdt_i[pid].i ]; - hi = pi->h; - - /* Compute the pairwise distance. */ - r2 = 0.0f; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pi->x[k] - pjx[k]; - r2 += dx[k]*dx[k]; - } - - /* Hit or miss? */ - if ( r2 < hjg2 && r2 > hi*hi*kernel_gamma2 ) { - - #ifndef VECTORIZE - - IACT_NONSYM( r2 , dx , hi , hj , pi , pj ); - - #else - - /* Add this interaction to the queue. */ - r2q1[icount1] = r2; - dxq1[3*icount1+0] = dx[0]; - dxq1[3*icount1+1] = dx[1]; - dxq1[3*icount1+2] = dx[2]; - hiq1[icount1] = hi; - hjq1[icount1] = hj; - piq1[icount1] = pi; - pjq1[icount1] = pj; - icount1 += 1; - - /* Flush? */ - if ( icount1 == VEC_SIZE ) { - IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 ); - icount1 = 0; - } - - #endif - - } - - } /* loop over the parts in cj. */ + /* Compute the pairwise distance. */ + r2 = 0.0f; + for (k = 0; k < 3; k++) { + dx[k] = pjx[k] - pi->x[k]; + r2 += dx[k] * dx[k]; + } + + /* Hit or miss? */ + if (r2 < hjg2 && r2 > hi * hi * kernel_gamma2) { + +#ifndef VECTORIZE + + /* Does pi need to be updated too? */ + if (pi->dt <= dt_step) + IACT(r2, dx, hj, hi, pj, pi); + else + IACT_NONSYM(r2, dx, hj, hi, pj, pi); + +#else + + /* Does pi need to be updated too? */ + if (pi->dt <= dt_step) { + + /* Add this interaction to the symmetric queue. 
*/ + r2q2[icount2] = r2; + dxq2[3 * icount2 + 0] = dx[0]; + dxq2[3 * icount2 + 1] = dx[1]; + dxq2[3 * icount2 + 2] = dx[2]; + hiq2[icount2] = hj; + hjq2[icount2] = hi; + piq2[icount2] = pj; + pjq2[icount2] = pi; + icount2 += 1; + + /* Flush? */ + if (icount2 == VEC_SIZE) { + IACT_VEC(r2q2, dxq2, hiq2, hjq2, piq2, pjq2); + icount2 = 0; } - - /* Otherwise, interact with all particles in cj. */ - else { - - /* Loop over the parts in ci. */ - for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d > dj ; pid-- ) { - - /* Get a pointer to the jth particle. */ - pi = &parts_i[ sort_i[pid].i ]; - hi = pi->h; - - /* Compute the pairwise distance. */ - r2 = 0.0f; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pjx[k] - pi->x[k]; - r2 += dx[k]*dx[k]; - } - - /* Hit or miss? */ - if ( r2 < hjg2 && r2 > hi*hi*kernel_gamma2 ) { - - #ifndef VECTORIZE - - /* Does pi need to be updated too? */ - if ( pi->dt <= dt_step ) - IACT( r2 , dx , hj , hi , pj , pi ); - else - IACT_NONSYM( r2 , dx , hj , hi , pj , pi ); - - #else - - /* Does pi need to be updated too? */ - if ( pi->dt <= dt_step ) { - - /* Add this interaction to the symmetric queue. */ - r2q2[icount2] = r2; - dxq2[3*icount2+0] = dx[0]; - dxq2[3*icount2+1] = dx[1]; - dxq2[3*icount2+2] = dx[2]; - hiq2[icount2] = hj; - hjq2[icount2] = hi; - piq2[icount2] = pj; - pjq2[icount2] = pi; - icount2 += 1; - - /* Flush? */ - if ( icount2 == VEC_SIZE ) { - IACT_VEC( r2q2 , dxq2 , hiq2 , hjq2 , piq2 , pjq2 ); - icount2 = 0; - } - - } - - else { - - /* Add this interaction to the non-summetric queue. */ - r2q1[icount1] = r2; - dxq1[3*icount1+0] = dx[0]; - dxq1[3*icount1+1] = dx[1]; - dxq1[3*icount1+2] = dx[2]; - hiq1[icount1] = hj; - hjq1[icount1] = hi; - piq1[icount1] = pj; - pjq1[icount1] = pi; - icount1 += 1; - - /* Flush? */ - if ( icount1 == VEC_SIZE ) { - IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 ); - icount1 = 0; - } - - } - - #endif - - } - - } /* loop over the parts in cj. 
*/ - + + } else { + + /* Add this interaction to the non-summetric queue. */ + r2q1[icount1] = r2; + dxq1[3 * icount1 + 0] = dx[0]; + dxq1[3 * icount1 + 1] = dx[1]; + dxq1[3 * icount1 + 2] = dx[2]; + hiq1[icount1] = hj; + hjq1[icount1] = hi; + piq1[icount1] = pj; + pjq1[icount1] = pi; + icount1 += 1; + + /* Flush? */ + if (icount1 == VEC_SIZE) { + IACT_NONSYM_VEC(r2q1, dxq1, hiq1, hjq1, piq1, pjq1); + icount1 = 0; } - - } /* loop over the parts in ci. */ - - #ifdef VECTORIZE - /* Pick up any leftovers. */ - if ( icount1 > 0 ) - for ( k = 0 ; k < icount1 ; k++ ) - IACT_NONSYM( r2q1[k] , &dxq1[3*k] , hiq1[k] , hjq1[k] , piq1[k] , pjq1[k] ); - if ( icount2 > 0 ) - for ( k = 0 ; k < icount2 ; k++ ) - IACT( r2q2[k] , &dxq2[3*k] , hiq2[k] , hjq2[k] , piq2[k] , pjq2[k] ); - #endif - - #ifdef TIMER_VERBOSE - printf( "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , fmax(ci->h[0],fmax(ci->h[1],ci->h[2])) , ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000 ); - #else - TIMER_TOC(TIMER_DOPAIR); - #endif + } + +#endif + } + } /* loop over the parts in cj. */ } + } /* loop over the parts in ci. */ + +#ifdef VECTORIZE + /* Pick up any leftovers. */ + if (icount1 > 0) + for (k = 0; k < icount1; k++) + IACT_NONSYM(r2q1[k], &dxq1[3 * k], hiq1[k], hjq1[k], piq1[k], pjq1[k]); + if (icount2 > 0) + for (k = 0; k < icount2; k++) + IACT(r2q2[k], &dxq2[3 * k], hiq2[k], hjq2[k], piq2[k], pjq2[k]); +#endif + +#ifdef TIMER_VERBOSE + printf( + "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) " + "took %.3f ms.\n", + r->id, count_i, count_j, ci->depth, ci->h_max, cj->h_max, + fmax(ci->h[0], fmax(ci->h[1], ci->h[2])), + ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000); +#else + TIMER_TOC(TIMER_DOPAIR); +#endif +} /** * @brief Compute the cell self-interaction. 
@@ -1353,441 +1354,425 @@ void DOPAIR2 ( struct runner *r , struct cell *ci , struct cell *cj ) { * @param c The #cell. */ -void DOSELF1 ( struct runner *r , struct cell *restrict c ) { - - int k, pid, pjd, count = c->count; - double pix[3]; - float dx[3], hi, hj, hig2, r2; - struct part *restrict parts = c->parts, *restrict pi, *restrict pj; - float dt_step = r->e->dt_step; - int firstdt = 0, countdt = 0, *indt = NULL, doj; - #ifdef VECTORIZE - int icount1 = 0; - float r2q1[VEC_SIZE] __attribute__ ((aligned (16))); - float hiq1[VEC_SIZE] __attribute__ ((aligned (16))); - float hjq1[VEC_SIZE] __attribute__ ((aligned (16))); - float dxq1[3*VEC_SIZE] __attribute__ ((aligned (16))); - struct part *piq1[VEC_SIZE], *pjq1[VEC_SIZE]; - int icount2 = 0; - float r2q2[VEC_SIZE] __attribute__ ((aligned (16))); - float hiq2[VEC_SIZE] __attribute__ ((aligned (16))); - float hjq2[VEC_SIZE] __attribute__ ((aligned (16))); - float dxq2[3*VEC_SIZE] __attribute__ ((aligned (16))); - struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE]; - #endif - TIMER_TIC - - /* Set up indt if needed. */ - if ( c->dt_min > dt_step ) - return; - else if ( c->dt_max > dt_step ) { - if ( ( indt = (int *)alloca( sizeof(int) * count ) ) == NULL ) - error( "Failed to allocate indt." 
); - for ( k = 0 ; k < count ; k++ ) - if ( parts[k].dt <= dt_step ) { - indt[ countdt ] = k; - countdt += 1; - } +void DOSELF1(struct runner *r, struct cell *restrict c) { + + int k, pid, pjd, count = c->count; + double pix[3]; + float dx[3], hi, hj, hig2, r2; + struct part *restrict parts = c->parts, *restrict pi, *restrict pj; + float dt_step = r->e->dt_step; + int firstdt = 0, countdt = 0, *indt = NULL, doj; +#ifdef VECTORIZE + int icount1 = 0; + float r2q1[VEC_SIZE] __attribute__((aligned(16))); + float hiq1[VEC_SIZE] __attribute__((aligned(16))); + float hjq1[VEC_SIZE] __attribute__((aligned(16))); + float dxq1[3 * VEC_SIZE] __attribute__((aligned(16))); + struct part *piq1[VEC_SIZE], *pjq1[VEC_SIZE]; + int icount2 = 0; + float r2q2[VEC_SIZE] __attribute__((aligned(16))); + float hiq2[VEC_SIZE] __attribute__((aligned(16))); + float hjq2[VEC_SIZE] __attribute__((aligned(16))); + float dxq2[3 * VEC_SIZE] __attribute__((aligned(16))); + struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE]; +#endif + TIMER_TIC + + /* Set up indt if needed. */ + if (c->dt_min > dt_step) + return; + else if (c->dt_max > dt_step) { + if ((indt = (int *)alloca(sizeof(int) * count)) == NULL) + error("Failed to allocate indt."); + for (k = 0; k < count; k++) + if (parts[k].dt <= dt_step) { + indt[countdt] = k; + countdt += 1; + } + } + + /* Loop over the particles in the cell. */ + for (pid = 0; pid < count; pid++) { + + /* Get a pointer to the ith particle. */ + pi = &parts[pid]; + + /* Get the particle position and radius. */ + for (k = 0; k < 3; k++) pix[k] = pi->x[k]; + hi = pi->h; + hig2 = hi * hi * kernel_gamma2; + + /* Is the ith particle inactive? */ + if (pi->dt > dt_step) { + + /* Loop over the other particles .*/ + for (pjd = firstdt; pjd < countdt; pjd++) { + + /* Get a pointer to the jth particle. */ + pj = &parts[indt[pjd]]; + hj = pj->h; + + /* Compute the pairwise distance. 
*/ + r2 = 0.0f; + for (k = 0; k < 3; k++) { + dx[k] = pj->x[k] - pix[k]; + r2 += dx[k] * dx[k]; } - - /* Loop over the particles in the cell. */ - for ( pid = 0 ; pid < count ; pid++ ) { - - /* Get a pointer to the ith particle. */ - pi = &parts[pid]; - - /* Get the particle position and radius. */ - for ( k = 0 ; k < 3 ; k++ ) - pix[k] = pi->x[k]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; - - /* Is the ith particle inactive? */ - if ( pi->dt > dt_step ) { - - /* Loop over the other particles .*/ - for ( pjd = firstdt ; pjd < countdt ; pjd++ ) { - - /* Get a pointer to the jth particle. */ - pj = &parts[ indt[ pjd ] ]; - hj = pj->h; - - /* Compute the pairwise distance. */ - r2 = 0.0f; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pj->x[k] - pix[k]; - r2 += dx[k]*dx[k]; - } - - /* Hit or miss? */ - if ( r2 < hj*hj*kernel_gamma2 ) { - - #ifndef VECTORIZE - - IACT_NONSYM( r2 , dx , hj , hi , pj , pi ); - - #else - - /* Add this interaction to the queue. */ - r2q1[icount1] = r2; - dxq1[3*icount1+0] = dx[0]; - dxq1[3*icount1+1] = dx[1]; - dxq1[3*icount1+2] = dx[2]; - hiq1[icount1] = hj; - hjq1[icount1] = hi; - piq1[icount1] = pj; - pjq1[icount1] = pi; - icount1 += 1; - - /* Flush? */ - if ( icount1 == VEC_SIZE ) { - IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 ); - icount1 = 0; - } - - #endif - - } - - } /* loop over all other particles. */ - + + /* Hit or miss? */ + if (r2 < hj * hj * kernel_gamma2) { + +#ifndef VECTORIZE + + IACT_NONSYM(r2, dx, hj, hi, pj, pi); + +#else + + /* Add this interaction to the queue. */ + r2q1[icount1] = r2; + dxq1[3 * icount1 + 0] = dx[0]; + dxq1[3 * icount1 + 1] = dx[1]; + dxq1[3 * icount1 + 2] = dx[2]; + hiq1[icount1] = hj; + hjq1[icount1] = hi; + piq1[icount1] = pj; + pjq1[icount1] = pi; + icount1 += 1; + + /* Flush? */ + if (icount1 == VEC_SIZE) { + IACT_NONSYM_VEC(r2q1, dxq1, hiq1, hjq1, piq1, pjq1); + icount1 = 0; + } + +#endif + } + + } /* loop over all other particles. 
*/ + + } + + /* Otherwise, interact with all candidates. */ + else { + + /* We caught a live one! */ + firstdt += 1; + + /* Loop over the other particles .*/ + for (pjd = pid + 1; pjd < count; pjd++) { + + /* Get a pointer to the jth particle. */ + pj = &parts[pjd]; + hj = pj->h; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for (k = 0; k < 3; k++) { + dx[k] = pix[k] - pj->x[k]; + r2 += dx[k] * dx[k]; + } + doj = (pj->dt <= dt_step) && (r2 < hj * hj * kernel_gamma2); + + /* Hit or miss? */ + if (r2 < hig2 || doj) { + +#ifndef VECTORIZE + + /* Which parts need to be updated? */ + if (r2 < hig2 && doj) + IACT(r2, dx, hi, hj, pi, pj); + else if (!doj) + IACT_NONSYM(r2, dx, hi, hj, pi, pj); + else { + dx[0] = -dx[0]; + dx[1] = -dx[1]; + dx[2] = -dx[2]; + IACT_NONSYM(r2, dx, hj, hi, pj, pi); + } + +#else + + /* Does pj need to be updated too? */ + if (r2 < hig2 && doj) { + + /* Add this interaction to the symmetric queue. */ + r2q2[icount2] = r2; + dxq2[3 * icount2 + 0] = dx[0]; + dxq2[3 * icount2 + 1] = dx[1]; + dxq2[3 * icount2 + 2] = dx[2]; + hiq2[icount2] = hi; + hjq2[icount2] = hj; + piq2[icount2] = pi; + pjq2[icount2] = pj; + icount2 += 1; + + /* Flush? */ + if (icount2 == VEC_SIZE) { + IACT_VEC(r2q2, dxq2, hiq2, hjq2, piq2, pjq2); + icount2 = 0; } - - /* Otherwise, interact with all candidates. */ - else { - - /* We caught a live one! */ - firstdt += 1; - - /* Loop over the other particles .*/ - for ( pjd = pid+1 ; pjd < count ; pjd++ ) { - - /* Get a pointer to the jth particle. */ - pj = &parts[pjd]; - hj = pj->h; - - /* Compute the pairwise distance. */ - r2 = 0.0f; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pix[k] - pj->x[k]; - r2 += dx[k]*dx[k]; - } - doj = ( pj->dt <= dt_step ) && ( r2 < hj*hj*kernel_gamma2 ); - - /* Hit or miss? */ - if ( r2 < hig2 || doj ) { - - #ifndef VECTORIZE - - /* Which parts need to be updated? 
*/ - if ( r2 < hig2 && doj ) - IACT( r2 , dx , hi , hj , pi , pj ); - else if ( !doj ) - IACT_NONSYM( r2 , dx , hi , hj , pi , pj ); - else { - dx[0] = -dx[0]; dx[1] = -dx[1]; dx[2] = -dx[2]; - IACT_NONSYM( r2 , dx , hj , hi , pj , pi ); - } - - #else - - /* Does pj need to be updated too? */ - if ( r2 < hig2 && doj ) { - - /* Add this interaction to the symmetric queue. */ - r2q2[icount2] = r2; - dxq2[3*icount2+0] = dx[0]; - dxq2[3*icount2+1] = dx[1]; - dxq2[3*icount2+2] = dx[2]; - hiq2[icount2] = hi; - hjq2[icount2] = hj; - piq2[icount2] = pi; - pjq2[icount2] = pj; - icount2 += 1; - - /* Flush? */ - if ( icount2 == VEC_SIZE ) { - IACT_VEC( r2q2 , dxq2 , hiq2 , hjq2 , piq2 , pjq2 ); - icount2 = 0; - } - - } - - else if ( !doj ) { - - /* Add this interaction to the non-symmetric queue. */ - r2q1[icount1] = r2; - dxq1[3*icount1+0] = dx[0]; - dxq1[3*icount1+1] = dx[1]; - dxq1[3*icount1+2] = dx[2]; - hiq1[icount1] = hi; - hjq1[icount1] = hj; - piq1[icount1] = pi; - pjq1[icount1] = pj; - icount1 += 1; - - /* Flush? */ - if ( icount1 == VEC_SIZE ) { - IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 ); - icount1 = 0; - } - - } - - else { - - /* Add this interaction to the non-symmetric queue. */ - r2q1[icount1] = r2; - dxq1[3*icount1+0] = -dx[0]; - dxq1[3*icount1+1] = -dx[1]; - dxq1[3*icount1+2] = -dx[2]; - hiq1[icount1] = hj; - hjq1[icount1] = hi; - piq1[icount1] = pj; - pjq1[icount1] = pi; - icount1 += 1; - - /* Flush? */ - if ( icount1 == VEC_SIZE ) { - IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 ); - icount1 = 0; - } - - } - - #endif - - } - - } /* loop over all other particles. */ - + + } else if (!doj) { + + /* Add this interaction to the non-symmetric queue. */ + r2q1[icount1] = r2; + dxq1[3 * icount1 + 0] = dx[0]; + dxq1[3 * icount1 + 1] = dx[1]; + dxq1[3 * icount1 + 2] = dx[2]; + hiq1[icount1] = hi; + hjq1[icount1] = hj; + piq1[icount1] = pi; + pjq1[icount1] = pj; + icount1 += 1; + + /* Flush? 
*/ + if (icount1 == VEC_SIZE) { + IACT_NONSYM_VEC(r2q1, dxq1, hiq1, hjq1, piq1, pjq1); + icount1 = 0; } - - } /* loop over all particles. */ - - #ifdef VECTORIZE - /* Pick up any leftovers. */ - if ( icount1 > 0 ) - for ( k = 0 ; k < icount1 ; k++ ) - IACT_NONSYM( r2q1[k] , &dxq1[3*k] , hiq1[k] , hjq1[k] , piq1[k] , pjq1[k] ); - if ( icount2 > 0 ) - for ( k = 0 ; k < icount2 ; k++ ) - IACT( r2q2[k] , &dxq2[3*k] , hiq2[k] , hjq2[k] , piq2[k] , pjq2[k] ); - #endif - - #ifdef TIMER_VERBOSE - printf( "runner_doself1[%02i]: %i parts at depth %i took %.3f ms.\n" , r->id , count , c->depth , ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000 ); - #else - TIMER_TOC(TIMER_DOSELF); - #endif + } else { + + /* Add this interaction to the non-symmetric queue. */ + r2q1[icount1] = r2; + dxq1[3 * icount1 + 0] = -dx[0]; + dxq1[3 * icount1 + 1] = -dx[1]; + dxq1[3 * icount1 + 2] = -dx[2]; + hiq1[icount1] = hj; + hjq1[icount1] = hi; + piq1[icount1] = pj; + pjq1[icount1] = pi; + icount1 += 1; + + /* Flush? */ + if (icount1 == VEC_SIZE) { + IACT_NONSYM_VEC(r2q1, dxq1, hiq1, hjq1, piq1, pjq1); + icount1 = 0; + } + } + +#endif + } + + } /* loop over all other particles. */ } + } /* loop over all particles. */ + +#ifdef VECTORIZE + /* Pick up any leftovers. 
*/ + if (icount1 > 0) + for (k = 0; k < icount1; k++) + IACT_NONSYM(r2q1[k], &dxq1[3 * k], hiq1[k], hjq1[k], piq1[k], pjq1[k]); + if (icount2 > 0) + for (k = 0; k < icount2; k++) + IACT(r2q2[k], &dxq2[3 * k], hiq2[k], hjq2[k], piq2[k], pjq2[k]); +#endif + +#ifdef TIMER_VERBOSE + printf("runner_doself1[%02i]: %i parts at depth %i took %.3f ms.\n", r->id, + count, c->depth, ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000); +#else + TIMER_TOC(TIMER_DOSELF); +#endif +} + +void DOSELF2(struct runner *r, struct cell *restrict c) { + + int k, pid, pjd, count = c->count; + double pix[3]; + float dx[3], hi, hj, hig2, r2; + struct part *restrict parts = c->parts, *restrict pi, *restrict pj; + float dt_step = r->e->dt_step; + int firstdt = 0, countdt = 0, *indt = NULL; +#ifdef VECTORIZE + int icount1 = 0; + float r2q1[VEC_SIZE] __attribute__((aligned(16))); + float hiq1[VEC_SIZE] __attribute__((aligned(16))); + float hjq1[VEC_SIZE] __attribute__((aligned(16))); + float dxq1[3 * VEC_SIZE] __attribute__((aligned(16))); + struct part *piq1[VEC_SIZE], *pjq1[VEC_SIZE]; + int icount2 = 0; + float r2q2[VEC_SIZE] __attribute__((aligned(16))); + float hiq2[VEC_SIZE] __attribute__((aligned(16))); + float hjq2[VEC_SIZE] __attribute__((aligned(16))); + float dxq2[3 * VEC_SIZE] __attribute__((aligned(16))); + struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE]; +#endif + TIMER_TIC + + /* Set up indt if needed. */ + if (c->dt_min > dt_step) + return; + else if (c->dt_max > dt_step) { + if ((indt = (int *)alloca(sizeof(int) * count)) == NULL) + error("Failed to allocate indt."); + for (k = 0; k < count; k++) + if (parts[k].dt <= dt_step) { + indt[countdt] = k; + countdt += 1; + } + } + + /* Loop over the particles in the cell. */ + for (pid = 0; pid < count; pid++) { + + /* Get a pointer to the ith particle. */ + pi = &parts[pid]; + + /* Get the particle position and radius. 
*/ + for (k = 0; k < 3; k++) pix[k] = pi->x[k]; + hi = pi->h; + hig2 = hi * hi * kernel_gamma2; + + /* Is the ith particle not active? */ + if (pi->dt > dt_step) { + + /* Loop over the other particles .*/ + for (pjd = firstdt; pjd < countdt; pjd++) { + + /* Get a pointer to the jth particle. */ + pj = &parts[indt[pjd]]; + hj = pj->h; -void DOSELF2 ( struct runner *r , struct cell *restrict c ) { - - int k, pid, pjd, count = c->count; - double pix[3]; - float dx[3], hi, hj, hig2, r2; - struct part *restrict parts = c->parts, *restrict pi, *restrict pj; - float dt_step = r->e->dt_step; - int firstdt = 0, countdt = 0, *indt = NULL; - #ifdef VECTORIZE - int icount1 = 0; - float r2q1[VEC_SIZE] __attribute__ ((aligned (16))); - float hiq1[VEC_SIZE] __attribute__ ((aligned (16))); - float hjq1[VEC_SIZE] __attribute__ ((aligned (16))); - float dxq1[3*VEC_SIZE] __attribute__ ((aligned (16))); - struct part *piq1[VEC_SIZE], *pjq1[VEC_SIZE]; - int icount2 = 0; - float r2q2[VEC_SIZE] __attribute__ ((aligned (16))); - float hiq2[VEC_SIZE] __attribute__ ((aligned (16))); - float hjq2[VEC_SIZE] __attribute__ ((aligned (16))); - float dxq2[3*VEC_SIZE] __attribute__ ((aligned (16))); - struct part *piq2[VEC_SIZE], *pjq2[VEC_SIZE]; - #endif - TIMER_TIC - - /* Set up indt if needed. */ - if ( c->dt_min > dt_step ) - return; - else if ( c->dt_max > dt_step ) { - if ( ( indt = (int *)alloca( sizeof(int) * count ) ) == NULL ) - error( "Failed to allocate indt." ); - for ( k = 0 ; k < count ; k++ ) - if ( parts[k].dt <= dt_step ) { - indt[ countdt ] = k; - countdt += 1; - } + /* Compute the pairwise distance. */ + r2 = 0.0f; + for (k = 0; k < 3; k++) { + dx[k] = pj->x[k] - pix[k]; + r2 += dx[k] * dx[k]; } - - /* Loop over the particles in the cell. */ - for ( pid = 0 ; pid < count ; pid++ ) { - - /* Get a pointer to the ith particle. */ - pi = &parts[pid]; - - /* Get the particle position and radius. 
*/ - for ( k = 0 ; k < 3 ; k++ ) - pix[k] = pi->x[k]; - hi = pi->h; - hig2 = hi * hi * kernel_gamma2; - - /* Is the ith particle not active? */ - if ( pi->dt > dt_step ) { - - /* Loop over the other particles .*/ - for ( pjd = firstdt ; pjd < countdt ; pjd++ ) { - - /* Get a pointer to the jth particle. */ - pj = &parts[ indt[ pjd ] ]; - hj = pj->h; - - /* Compute the pairwise distance. */ - r2 = 0.0f; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pj->x[k] - pix[k]; - r2 += dx[k]*dx[k]; - } - - /* Hit or miss? */ - if ( r2 < hig2 || r2 < hj*hj*kernel_gamma2 ) { - - #ifndef VECTORIZE - - IACT_NONSYM( r2 , dx , hj , hi , pj , pi ); - - #else - - /* Add this interaction to the queue. */ - r2q1[icount1] = r2; - dxq1[3*icount1+0] = dx[0]; - dxq1[3*icount1+1] = dx[1]; - dxq1[3*icount1+2] = dx[2]; - hiq1[icount1] = hj; - hjq1[icount1] = hi; - piq1[icount1] = pj; - pjq1[icount1] = pi; - icount1 += 1; - - /* Flush? */ - if ( icount1 == VEC_SIZE ) { - IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 ); - icount1 = 0; - } - - #endif - - } - - } /* loop over all other particles. */ - + + /* Hit or miss? */ + if (r2 < hig2 || r2 < hj * hj * kernel_gamma2) { + +#ifndef VECTORIZE + + IACT_NONSYM(r2, dx, hj, hi, pj, pi); + +#else + + /* Add this interaction to the queue. */ + r2q1[icount1] = r2; + dxq1[3 * icount1 + 0] = dx[0]; + dxq1[3 * icount1 + 1] = dx[1]; + dxq1[3 * icount1 + 2] = dx[2]; + hiq1[icount1] = hj; + hjq1[icount1] = hi; + piq1[icount1] = pj; + pjq1[icount1] = pi; + icount1 += 1; + + /* Flush? */ + if (icount1 == VEC_SIZE) { + IACT_NONSYM_VEC(r2q1, dxq1, hiq1, hjq1, piq1, pjq1); + icount1 = 0; + } + +#endif + } + + } /* loop over all other particles. */ + + } + + /* Otherwise, interact with all candidates. */ + else { + + /* We caught a live one! */ + firstdt += 1; + + /* Loop over the other particles .*/ + for (pjd = pid + 1; pjd < count; pjd++) { + + /* Get a pointer to the jth particle. 
*/ + pj = &parts[pjd]; + hj = pj->h; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for (k = 0; k < 3; k++) { + dx[k] = pix[k] - pj->x[k]; + r2 += dx[k] * dx[k]; + } + + /* Hit or miss? */ + if (r2 < hig2 || r2 < hj * hj * kernel_gamma2) { + +#ifndef VECTORIZE + + /* Does pj need to be updated too? */ + if (pj->dt <= dt_step) + IACT(r2, dx, hi, hj, pi, pj); + else + IACT_NONSYM(r2, dx, hi, hj, pi, pj); + +#else + + /* Does pj need to be updated too? */ + if (pj->dt <= dt_step) { + + /* Add this interaction to the symmetric queue. */ + r2q2[icount2] = r2; + dxq2[3 * icount2 + 0] = dx[0]; + dxq2[3 * icount2 + 1] = dx[1]; + dxq2[3 * icount2 + 2] = dx[2]; + hiq2[icount2] = hi; + hjq2[icount2] = hj; + piq2[icount2] = pi; + pjq2[icount2] = pj; + icount2 += 1; + + /* Flush? */ + if (icount2 == VEC_SIZE) { + IACT_VEC(r2q2, dxq2, hiq2, hjq2, piq2, pjq2); + icount2 = 0; } - - /* Otherwise, interact with all candidates. */ - else { - - /* We caught a live one! */ - firstdt += 1; - - /* Loop over the other particles .*/ - for ( pjd = pid+1 ; pjd < count ; pjd++ ) { - - /* Get a pointer to the jth particle. */ - pj = &parts[pjd]; - hj = pj->h; - - /* Compute the pairwise distance. */ - r2 = 0.0f; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pix[k] - pj->x[k]; - r2 += dx[k]*dx[k]; - } - - /* Hit or miss? */ - if ( r2 < hig2 || r2 < hj*hj*kernel_gamma2 ) { - - #ifndef VECTORIZE - - /* Does pj need to be updated too? */ - if ( pj->dt <= dt_step ) - IACT( r2 , dx , hi , hj , pi , pj ); - else - IACT_NONSYM( r2 , dx , hi , hj , pi , pj ); - - #else - - /* Does pj need to be updated too? */ - if ( pj->dt <= dt_step ) { - - /* Add this interaction to the symmetric queue. */ - r2q2[icount2] = r2; - dxq2[3*icount2+0] = dx[0]; - dxq2[3*icount2+1] = dx[1]; - dxq2[3*icount2+2] = dx[2]; - hiq2[icount2] = hi; - hjq2[icount2] = hj; - piq2[icount2] = pi; - pjq2[icount2] = pj; - icount2 += 1; - - /* Flush? 
*/ - if ( icount2 == VEC_SIZE ) { - IACT_VEC( r2q2 , dxq2 , hiq2 , hjq2 , piq2 , pjq2 ); - icount2 = 0; - } - - } - - else { - - /* Add this interaction to the non-symmetric queue. */ - r2q1[icount1] = r2; - dxq1[3*icount1+0] = dx[0]; - dxq1[3*icount1+1] = dx[1]; - dxq1[3*icount1+2] = dx[2]; - hiq1[icount1] = hi; - hjq1[icount1] = hj; - piq1[icount1] = pi; - pjq1[icount1] = pj; - icount1 += 1; - - /* Flush? */ - if ( icount1 == VEC_SIZE ) { - IACT_NONSYM_VEC( r2q1 , dxq1 , hiq1 , hjq1 , piq1 , pjq1 ); - icount1 = 0; - } - - } - - #endif - - } - - } /* loop over all other particles. */ - + + } else { + + /* Add this interaction to the non-symmetric queue. */ + r2q1[icount1] = r2; + dxq1[3 * icount1 + 0] = dx[0]; + dxq1[3 * icount1 + 1] = dx[1]; + dxq1[3 * icount1 + 2] = dx[2]; + hiq1[icount1] = hi; + hjq1[icount1] = hj; + piq1[icount1] = pi; + pjq1[icount1] = pj; + icount1 += 1; + + /* Flush? */ + if (icount1 == VEC_SIZE) { + IACT_NONSYM_VEC(r2q1, dxq1, hiq1, hjq1, piq1, pjq1); + icount1 = 0; } - - } /* loop over all particles. */ - - #ifdef VECTORIZE - /* Pick up any leftovers. */ - if ( icount1 > 0 ) - for ( k = 0 ; k < icount1 ; k++ ) - IACT_NONSYM( r2q1[k] , &dxq1[3*k] , hiq1[k] , hjq1[k] , piq1[k] , pjq1[k] ); - if ( icount2 > 0 ) - for ( k = 0 ; k < icount2 ; k++ ) - IACT( r2q2[k] , &dxq2[3*k] , hiq2[k] , hjq2[k] , piq2[k] , pjq2[k] ); - #endif - - #ifdef TIMER_VERBOSE - printf( "runner_doself2[%02i]: %i parts at depth %i took %.3f ms.\n" , r->id , count , c->depth , ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000 ); - #else - TIMER_TOC(TIMER_DOSELF); - #endif + } + +#endif + } + } /* loop over all other particles. */ } + } /* loop over all particles. */ + +#ifdef VECTORIZE + /* Pick up any leftovers. 
*/ + if (icount1 > 0) + for (k = 0; k < icount1; k++) + IACT_NONSYM(r2q1[k], &dxq1[3 * k], hiq1[k], hjq1[k], piq1[k], pjq1[k]); + if (icount2 > 0) + for (k = 0; k < icount2; k++) + IACT(r2q2[k], &dxq2[3 * k], hiq2[k], hjq2[k], piq2[k], pjq2[k]); +#endif + +#ifdef TIMER_VERBOSE + printf("runner_doself2[%02i]: %i parts at depth %i took %.3f ms.\n", r->id, + count, c->depth, ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000); +#else + TIMER_TOC(TIMER_DOSELF); +#endif +} /** * @brief Compute grouped sub-cell interactions @@ -1802,1015 +1787,1156 @@ void DOSELF2 ( struct runner *r , struct cell *restrict c ) { * redundant computations to find the sid on-the-fly. */ -void DOSUB1 ( struct runner *r , struct cell *ci , struct cell *cj , int sid , int gettimer ) { - - int j = 0, k; - double shift[3]; - float h; - struct space *s = r->e->s; - float dt_step = r->e->dt_step; - - TIMER_TIC - - /* Is this a single cell? */ - if ( cj == NULL ) { - - /* Should we even bother? */ - if ( ci->dt_min > dt_step ) - return; - - /* Recurse? */ - if ( ci->split ) { - - /* Loop over all progeny. */ - for ( k = 0 ; k < 8 ; k++ ) - if ( ci->progeny[k] != NULL ) { - DOSUB1( r , ci->progeny[k] , NULL , -1 , 0 ); - for ( j = k+1 ; j < 8 ; j++ ) - if ( ci->progeny[j] != NULL ) - DOSUB1( r , ci->progeny[k] , ci->progeny[j] , -1 , 0 ); - } - - } - - /* Otherwsie, compute self-interaction. */ - else - DOSELF1( r , ci ); - - } /* self-interaction. */ - - /* Otherwise, it's a pair interaction. */ - else { - - /* Should we even bother? */ - if ( ci->dt_min > dt_step && cj->dt_min > dt_step ) - return; - - /* Get the cell dimensions. */ - h = fmin( ci->h[0] , fmin( ci->h[1] , ci->h[2] ) ); - - /* Get the type of pair if not specified explicitly. */ - // if ( sid < 0 ) - sid = space_getsid( s , &ci , &cj , shift ); - - /* Recurse? */ - if ( ci->split && cj->split && - fmaxf( ci->h_max , cj->h_max )*kernel_gamma + ci->dx_max + cj->dx_max < h/2 ) { - - /* Different types of flags. 
*/ - switch ( sid ) { - - /* Regular sub-cell interactions of a single cell. */ - case 0: /* ( 1 , 1 , 1 ) */ - if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOSUB1( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 ); - break; - - case 1: /* ( 1 , 1 , 0 ) */ - if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL ) - DOSUB1( r , ci->progeny[6] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOSUB1( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOSUB1( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL ) - DOSUB1( r , ci->progeny[7] , cj->progeny[1] , -1 , 0 ); - break; - - case 2: /* ( 1 , 1 , -1 ) */ - if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOSUB1( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 ); - break; - - case 3: /* ( 1 , 0 , 1 ) */ - if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL ) - DOSUB1( r , ci->progeny[5] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) - DOSUB1( r , ci->progeny[5] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOSUB1( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL ) - DOSUB1( r , ci->progeny[7] , cj->progeny[2] , -1 , 0 ); - break; - - case 4: /* ( 1 , 0 , 0 ) */ - if ( ci->progeny[4] != NULL && cj->progeny[0] != NULL ) - DOSUB1( r , ci->progeny[4] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[4] != NULL && cj->progeny[1] != NULL ) - DOSUB1( r , ci->progeny[4] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[4] != NULL && cj->progeny[2] != NULL ) - DOSUB1( r , ci->progeny[4] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL ) - DOSUB1( r , ci->progeny[4] , cj->progeny[3] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL ) - DOSUB1( r , ci->progeny[5] , 
cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[1] != NULL ) - DOSUB1( r , ci->progeny[5] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) - DOSUB1( r , ci->progeny[5] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[3] != NULL ) - DOSUB1( r , ci->progeny[5] , cj->progeny[3] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL ) - DOSUB1( r , ci->progeny[6] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOSUB1( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[2] != NULL ) - DOSUB1( r , ci->progeny[6] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[3] != NULL ) - DOSUB1( r , ci->progeny[6] , cj->progeny[3] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOSUB1( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL ) - DOSUB1( r , ci->progeny[7] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL ) - DOSUB1( r , ci->progeny[7] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[3] != NULL ) - DOSUB1( r , ci->progeny[7] , cj->progeny[3] , -1 , 0 ); - break; - - case 5: /* ( 1 , 0 , -1 ) */ - if ( ci->progeny[4] != NULL && cj->progeny[1] != NULL ) - DOSUB1( r , ci->progeny[4] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL ) - DOSUB1( r , ci->progeny[4] , cj->progeny[3] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOSUB1( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[3] != NULL ) - DOSUB1( r , ci->progeny[6] , cj->progeny[3] , -1 , 0 ); - break; - - case 6: /* ( 1 , -1 , 1 ) */ - if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) - DOSUB1( r , ci->progeny[5] , cj->progeny[2] , -1 , 0 ); - break; - - case 7: /* ( 1 , -1 
, 0 ) */ - if ( ci->progeny[4] != NULL && cj->progeny[2] != NULL ) - DOSUB1( r , ci->progeny[4] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL ) - DOSUB1( r , ci->progeny[4] , cj->progeny[3] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) - DOSUB1( r , ci->progeny[5] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[3] != NULL ) - DOSUB1( r , ci->progeny[5] , cj->progeny[3] , -1 , 0 ); - break; - - case 8: /* ( 1 , -1 , -1 ) */ - if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL ) - DOSUB1( r , ci->progeny[4] , cj->progeny[3] , -1 , 0 ); - break; - - case 9: /* ( 0 , 1 , 1 ) */ - if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL ) - DOSUB1( r , ci->progeny[3] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL ) - DOSUB1( r , ci->progeny[3] , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOSUB1( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL ) - DOSUB1( r , ci->progeny[7] , cj->progeny[4] , -1 , 0 ); - break; - - case 10: /* ( 0 , 1 , 0 ) */ - if ( ci->progeny[2] != NULL && cj->progeny[0] != NULL ) - DOSUB1( r , ci->progeny[2] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[2] != NULL && cj->progeny[1] != NULL ) - DOSUB1( r , ci->progeny[2] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[2] != NULL && cj->progeny[4] != NULL ) - DOSUB1( r , ci->progeny[2] , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[2] != NULL && cj->progeny[5] != NULL ) - DOSUB1( r , ci->progeny[2] , cj->progeny[5] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL ) - DOSUB1( r , ci->progeny[3] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[1] != NULL ) - DOSUB1( r , ci->progeny[3] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL ) - DOSUB1( r , ci->progeny[3] , cj->progeny[4] , -1 , 0 ); - if ( 
ci->progeny[3] != NULL && cj->progeny[5] != NULL ) - DOSUB1( r , ci->progeny[3] , cj->progeny[5] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL ) - DOSUB1( r , ci->progeny[6] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOSUB1( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[4] != NULL ) - DOSUB1( r , ci->progeny[6] , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[5] != NULL ) - DOSUB1( r , ci->progeny[6] , cj->progeny[5] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOSUB1( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL ) - DOSUB1( r , ci->progeny[7] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL ) - DOSUB1( r , ci->progeny[7] , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[5] != NULL ) - DOSUB1( r , ci->progeny[7] , cj->progeny[5] , -1 , 0 ); - break; - - case 11: /* ( 0 , 1 , -1 ) */ - if ( ci->progeny[2] != NULL && cj->progeny[1] != NULL ) - DOSUB1( r , ci->progeny[2] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[2] != NULL && cj->progeny[5] != NULL ) - DOSUB1( r , ci->progeny[2] , cj->progeny[5] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOSUB1( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[5] != NULL ) - DOSUB1( r , ci->progeny[6] , cj->progeny[5] , -1 , 0 ); - break; - - case 12: /* ( 0 , 0 , 1 ) */ - if ( ci->progeny[1] != NULL && cj->progeny[0] != NULL ) - DOSUB1( r , ci->progeny[1] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[1] != NULL && cj->progeny[2] != NULL ) - DOSUB1( r , ci->progeny[1] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[1] != NULL && cj->progeny[4] != NULL ) - DOSUB1( r , ci->progeny[1] , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[1] != NULL && cj->progeny[6] != NULL ) - 
DOSUB1( r , ci->progeny[1] , cj->progeny[6] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL ) - DOSUB1( r , ci->progeny[3] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[2] != NULL ) - DOSUB1( r , ci->progeny[3] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL ) - DOSUB1( r , ci->progeny[3] , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[6] != NULL ) - DOSUB1( r , ci->progeny[3] , cj->progeny[6] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL ) - DOSUB1( r , ci->progeny[5] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) - DOSUB1( r , ci->progeny[5] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[4] != NULL ) - DOSUB1( r , ci->progeny[5] , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[6] != NULL ) - DOSUB1( r , ci->progeny[5] , cj->progeny[6] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOSUB1( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL ) - DOSUB1( r , ci->progeny[7] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL ) - DOSUB1( r , ci->progeny[7] , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[6] != NULL ) - DOSUB1( r , ci->progeny[7] , cj->progeny[6] , -1 , 0 ); - break; - - } - - } - - /* Otherwise, compute the pair directly. */ - else if ( ci->dt_min <= dt_step || cj->dt_min <= dt_step ) { - - /* Do any of the cells need to be sorted first? */ - if ( !(ci->sorted & (1 << sid) ) ) - runner_dosort( r , ci , (1 << sid) , 1 ); - if ( !(cj->sorted & (1 << sid) ) ) - runner_dosort( r , cj , (1 << sid) , 1 ); - - /* Compute the interactions. */ - DOPAIR1( r , ci , cj ); - - } - - } /* otherwise, pair interaction. 
*/ - +void DOSUB1(struct runner *r, struct cell *ci, struct cell *cj, int sid, + int gettimer) { + + int j = 0, k; + double shift[3]; + float h; + struct space *s = r->e->s; + float dt_step = r->e->dt_step; + + TIMER_TIC + + /* Is this a single cell? */ + if (cj == NULL) { - if ( gettimer ) - #ifdef TIMER_VERBOSE - printf( "runner_dosub1[%02i]: flags=%i at depth %i took %.3f ms.\n" , r->id , sid , ci->depth , ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000 ); - #else - TIMER_TOC(TIMER_DOSUB); - #endif + /* Should we even bother? */ + if (ci->dt_min > dt_step) return; + + /* Recurse? */ + if (ci->split) { + + /* Loop over all progeny. */ + for (k = 0; k < 8; k++) + if (ci->progeny[k] != NULL) { + DOSUB1(r, ci->progeny[k], NULL, -1, 0); + for (j = k + 1; j < 8; j++) + if (ci->progeny[j] != NULL) + DOSUB1(r, ci->progeny[k], ci->progeny[j], -1, 0); + } } + /* Otherwsie, compute self-interaction. */ + else + DOSELF1(r, ci); + + } /* self-interaction. */ + + /* Otherwise, it's a pair interaction. */ + else { + + /* Should we even bother? */ + if (ci->dt_min > dt_step && cj->dt_min > dt_step) return; + + /* Get the cell dimensions. */ + h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2])); + + /* Get the type of pair if not specified explicitly. */ + // if ( sid < 0 ) + sid = space_getsid(s, &ci, &cj, shift); + + /* Recurse? */ + if (ci->split && cj->split && + fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max < + h / 2) { + + /* Different types of flags. */ + switch (sid) { + + /* Regular sub-cell interactions of a single cell. 
*/ + case 0: /* ( 1 , 1 , 1 ) */ + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0); + break; + + case 1: /* ( 1 , 1 , 0 ) */ + if (ci->progeny[6] != NULL && cj->progeny[0] != NULL) + DOSUB1(r, ci->progeny[6], cj->progeny[0], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[1] != NULL) + DOSUB1(r, ci->progeny[7], cj->progeny[1], -1, 0); + break; + + case 2: /* ( 1 , 1 , -1 ) */ + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0); + break; + + case 3: /* ( 1 , 0 , 1 ) */ + if (ci->progeny[5] != NULL && cj->progeny[0] != NULL) + DOSUB1(r, ci->progeny[5], cj->progeny[0], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[2] != NULL) + DOSUB1(r, ci->progeny[7], cj->progeny[2], -1, 0); + break; + + case 4: /* ( 1 , 0 , 0 ) */ + if (ci->progeny[4] != NULL && cj->progeny[0] != NULL) + DOSUB1(r, ci->progeny[4], cj->progeny[0], -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[1] != NULL) + DOSUB1(r, ci->progeny[4], cj->progeny[1], -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[2] != NULL) + DOSUB1(r, ci->progeny[4], cj->progeny[2], -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) + DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[0] != NULL) + DOSUB1(r, ci->progeny[5], cj->progeny[0], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[1] != NULL) + DOSUB1(r, ci->progeny[5], cj->progeny[1], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + DOSUB1(r, 
ci->progeny[5], cj->progeny[2], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[3] != NULL) + DOSUB1(r, ci->progeny[5], cj->progeny[3], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[0] != NULL) + DOSUB1(r, ci->progeny[6], cj->progeny[0], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[2] != NULL) + DOSUB1(r, ci->progeny[6], cj->progeny[2], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[3] != NULL) + DOSUB1(r, ci->progeny[6], cj->progeny[3], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[1] != NULL) + DOSUB1(r, ci->progeny[7], cj->progeny[1], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[2] != NULL) + DOSUB1(r, ci->progeny[7], cj->progeny[2], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[3] != NULL) + DOSUB1(r, ci->progeny[7], cj->progeny[3], -1, 0); + break; + + case 5: /* ( 1 , 0 , -1 ) */ + if (ci->progeny[4] != NULL && cj->progeny[1] != NULL) + DOSUB1(r, ci->progeny[4], cj->progeny[1], -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) + DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[3] != NULL) + DOSUB1(r, ci->progeny[6], cj->progeny[3], -1, 0); + break; + + case 6: /* ( 1 , -1 , 1 ) */ + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0); + break; + + case 7: /* ( 1 , -1 , 0 ) */ + if (ci->progeny[4] != NULL && cj->progeny[2] != NULL) + DOSUB1(r, ci->progeny[4], cj->progeny[2], -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) + DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 
0); + if (ci->progeny[5] != NULL && cj->progeny[3] != NULL) + DOSUB1(r, ci->progeny[5], cj->progeny[3], -1, 0); + break; + + case 8: /* ( 1 , -1 , -1 ) */ + if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) + DOSUB1(r, ci->progeny[4], cj->progeny[3], -1, 0); + break; + + case 9: /* ( 0 , 1 , 1 ) */ + if (ci->progeny[3] != NULL && cj->progeny[0] != NULL) + DOSUB1(r, ci->progeny[3], cj->progeny[0], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[4] != NULL) + DOSUB1(r, ci->progeny[3], cj->progeny[4], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[4] != NULL) + DOSUB1(r, ci->progeny[7], cj->progeny[4], -1, 0); + break; + + case 10: /* ( 0 , 1 , 0 ) */ + if (ci->progeny[2] != NULL && cj->progeny[0] != NULL) + DOSUB1(r, ci->progeny[2], cj->progeny[0], -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[1] != NULL) + DOSUB1(r, ci->progeny[2], cj->progeny[1], -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[4] != NULL) + DOSUB1(r, ci->progeny[2], cj->progeny[4], -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[5] != NULL) + DOSUB1(r, ci->progeny[2], cj->progeny[5], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[0] != NULL) + DOSUB1(r, ci->progeny[3], cj->progeny[0], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[1] != NULL) + DOSUB1(r, ci->progeny[3], cj->progeny[1], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[4] != NULL) + DOSUB1(r, ci->progeny[3], cj->progeny[4], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[5] != NULL) + DOSUB1(r, ci->progeny[3], cj->progeny[5], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[0] != NULL) + DOSUB1(r, ci->progeny[6], cj->progeny[0], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[4] != NULL) + DOSUB1(r, ci->progeny[6], cj->progeny[4], -1, 0); + if (ci->progeny[6] != NULL && 
cj->progeny[5] != NULL) + DOSUB1(r, ci->progeny[6], cj->progeny[5], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[1] != NULL) + DOSUB1(r, ci->progeny[7], cj->progeny[1], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[4] != NULL) + DOSUB1(r, ci->progeny[7], cj->progeny[4], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[5] != NULL) + DOSUB1(r, ci->progeny[7], cj->progeny[5], -1, 0); + break; + + case 11: /* ( 0 , 1 , -1 ) */ + if (ci->progeny[2] != NULL && cj->progeny[1] != NULL) + DOSUB1(r, ci->progeny[2], cj->progeny[1], -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[5] != NULL) + DOSUB1(r, ci->progeny[2], cj->progeny[5], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB1(r, ci->progeny[6], cj->progeny[1], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[5] != NULL) + DOSUB1(r, ci->progeny[6], cj->progeny[5], -1, 0); + break; + + case 12: /* ( 0 , 0 , 1 ) */ + if (ci->progeny[1] != NULL && cj->progeny[0] != NULL) + DOSUB1(r, ci->progeny[1], cj->progeny[0], -1, 0); + if (ci->progeny[1] != NULL && cj->progeny[2] != NULL) + DOSUB1(r, ci->progeny[1], cj->progeny[2], -1, 0); + if (ci->progeny[1] != NULL && cj->progeny[4] != NULL) + DOSUB1(r, ci->progeny[1], cj->progeny[4], -1, 0); + if (ci->progeny[1] != NULL && cj->progeny[6] != NULL) + DOSUB1(r, ci->progeny[1], cj->progeny[6], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[0] != NULL) + DOSUB1(r, ci->progeny[3], cj->progeny[0], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[2] != NULL) + DOSUB1(r, ci->progeny[3], cj->progeny[2], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[4] != NULL) + DOSUB1(r, ci->progeny[3], cj->progeny[4], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[6] != NULL) + DOSUB1(r, ci->progeny[3], cj->progeny[6], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[0] != NULL) + DOSUB1(r, ci->progeny[5], cj->progeny[0], -1, 0); + 
if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + DOSUB1(r, ci->progeny[5], cj->progeny[2], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[4] != NULL) + DOSUB1(r, ci->progeny[5], cj->progeny[4], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[6] != NULL) + DOSUB1(r, ci->progeny[5], cj->progeny[6], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB1(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[2] != NULL) + DOSUB1(r, ci->progeny[7], cj->progeny[2], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[4] != NULL) + DOSUB1(r, ci->progeny[7], cj->progeny[4], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[6] != NULL) + DOSUB1(r, ci->progeny[7], cj->progeny[6], -1, 0); + break; + } -void DOSUB2 ( struct runner *r , struct cell *ci , struct cell *cj , int sid , int gettimer ) { - - int j, k; - double shift[3]; - float h; - struct space *s = r->e->s; - float dt_step = r->e->dt_step; - - TIMER_TIC - - /* Is this a single cell? */ - if ( cj == NULL ) { - - /* Should we even bother? */ - if ( ci->dt_min > dt_step ) - return; - - /* Recurse? */ - if ( ci->split ) { - - /* Loop over all progeny. */ - for ( k = 0 ; k < 8 ; k++ ) - if ( ci->progeny[k] != NULL ) { - DOSUB2( r , ci->progeny[k] , NULL , -1 , 0 ); - for ( j = k+1 ; j < 8 ; j++ ) - if ( ci->progeny[j] != NULL ) - DOSUB2( r , ci->progeny[k] , ci->progeny[j] , -1 , 0 ); - } - - } - - /* Otherwsie, compute self-interaction. */ - else - DOSELF2( r , ci ); - - } /* self-interaction. */ - - /* Otherwise, it's a pair interaction. */ - else { - - /* Should we even bother? */ - if ( ci->dt_min > dt_step && cj->dt_min > dt_step ) - return; - - /* Get the cell dimensions. */ - h = fmin( ci->h[0] , fmin( ci->h[1] , ci->h[2] ) ); - - /* Get the type of pair if not specified explicitly. */ - // if ( sid < 0 ) - sid = space_getsid( s , &ci , &cj , shift ); - - /* Recurse? 
*/ - if ( ci->split && cj->split && - fmaxf( ci->h_max , cj->h_max )*kernel_gamma + ci->dx_max + cj->dx_max < h/2 ) { - - /* Different types of flags. */ - switch ( sid ) { - - /* Regular sub-cell interactions of a single cell. */ - case 0: /* ( 1 , 1 , 1 ) */ - if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOSUB2( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 ); - break; - - case 1: /* ( 1 , 1 , 0 ) */ - if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL ) - DOSUB2( r , ci->progeny[6] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOSUB2( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOSUB2( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL ) - DOSUB2( r , ci->progeny[7] , cj->progeny[1] , -1 , 0 ); - break; - - case 2: /* ( 1 , 1 , -1 ) */ - if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOSUB2( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 ); - break; - - case 3: /* ( 1 , 0 , 1 ) */ - if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL ) - DOSUB2( r , ci->progeny[5] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) - DOSUB2( r , ci->progeny[5] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOSUB2( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL ) - DOSUB2( r , ci->progeny[7] , cj->progeny[2] , -1 , 0 ); - break; - - case 4: /* ( 1 , 0 , 0 ) */ - if ( ci->progeny[4] != NULL && cj->progeny[0] != NULL ) - DOSUB2( r , ci->progeny[4] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[4] != NULL && cj->progeny[1] != NULL ) - DOSUB2( r , ci->progeny[4] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[4] != NULL && cj->progeny[2] != NULL ) - DOSUB2( r , ci->progeny[4] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL ) - 
DOSUB2( r , ci->progeny[4] , cj->progeny[3] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL ) - DOSUB2( r , ci->progeny[5] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[1] != NULL ) - DOSUB2( r , ci->progeny[5] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) - DOSUB2( r , ci->progeny[5] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[3] != NULL ) - DOSUB2( r , ci->progeny[5] , cj->progeny[3] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL ) - DOSUB2( r , ci->progeny[6] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOSUB2( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[2] != NULL ) - DOSUB2( r , ci->progeny[6] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[3] != NULL ) - DOSUB2( r , ci->progeny[6] , cj->progeny[3] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOSUB2( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL ) - DOSUB2( r , ci->progeny[7] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL ) - DOSUB2( r , ci->progeny[7] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[3] != NULL ) - DOSUB2( r , ci->progeny[7] , cj->progeny[3] , -1 , 0 ); - break; - - case 5: /* ( 1 , 0 , -1 ) */ - if ( ci->progeny[4] != NULL && cj->progeny[1] != NULL ) - DOSUB2( r , ci->progeny[4] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL ) - DOSUB2( r , ci->progeny[4] , cj->progeny[3] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOSUB2( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[3] != NULL ) - DOSUB2( r , ci->progeny[6] , cj->progeny[3] , -1 , 0 ); - break; - - case 6: /* ( 1 , -1 , 1 ) */ - 
if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) - DOSUB2( r , ci->progeny[5] , cj->progeny[2] , -1 , 0 ); - break; - - case 7: /* ( 1 , -1 , 0 ) */ - if ( ci->progeny[4] != NULL && cj->progeny[2] != NULL ) - DOSUB2( r , ci->progeny[4] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL ) - DOSUB2( r , ci->progeny[4] , cj->progeny[3] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) - DOSUB2( r , ci->progeny[5] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[3] != NULL ) - DOSUB2( r , ci->progeny[5] , cj->progeny[3] , -1 , 0 ); - break; - - case 8: /* ( 1 , -1 , -1 ) */ - if ( ci->progeny[4] != NULL && cj->progeny[3] != NULL ) - DOSUB2( r , ci->progeny[4] , cj->progeny[3] , -1 , 0 ); - break; - - case 9: /* ( 0 , 1 , 1 ) */ - if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL ) - DOSUB2( r , ci->progeny[3] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL ) - DOSUB2( r , ci->progeny[3] , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOSUB2( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL ) - DOSUB2( r , ci->progeny[7] , cj->progeny[4] , -1 , 0 ); - break; - - case 10: /* ( 0 , 1 , 0 ) */ - if ( ci->progeny[2] != NULL && cj->progeny[0] != NULL ) - DOSUB2( r , ci->progeny[2] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[2] != NULL && cj->progeny[1] != NULL ) - DOSUB2( r , ci->progeny[2] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[2] != NULL && cj->progeny[4] != NULL ) - DOSUB2( r , ci->progeny[2] , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[2] != NULL && cj->progeny[5] != NULL ) - DOSUB2( r , ci->progeny[2] , cj->progeny[5] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL ) - DOSUB2( r , ci->progeny[3] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[1] != NULL ) - DOSUB2( r , ci->progeny[3] , 
cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL ) - DOSUB2( r , ci->progeny[3] , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[5] != NULL ) - DOSUB2( r , ci->progeny[3] , cj->progeny[5] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[0] != NULL ) - DOSUB2( r , ci->progeny[6] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOSUB2( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[4] != NULL ) - DOSUB2( r , ci->progeny[6] , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[5] != NULL ) - DOSUB2( r , ci->progeny[6] , cj->progeny[5] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOSUB2( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[1] != NULL ) - DOSUB2( r , ci->progeny[7] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL ) - DOSUB2( r , ci->progeny[7] , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[5] != NULL ) - DOSUB2( r , ci->progeny[7] , cj->progeny[5] , -1 , 0 ); - break; - - case 11: /* ( 0 , 1 , -1 ) */ - if ( ci->progeny[2] != NULL && cj->progeny[1] != NULL ) - DOSUB2( r , ci->progeny[2] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[2] != NULL && cj->progeny[5] != NULL ) - DOSUB2( r , ci->progeny[2] , cj->progeny[5] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[1] != NULL ) - DOSUB2( r , ci->progeny[6] , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[5] != NULL ) - DOSUB2( r , ci->progeny[6] , cj->progeny[5] , -1 , 0 ); - break; - - case 12: /* ( 0 , 0 , 1 ) */ - if ( ci->progeny[1] != NULL && cj->progeny[0] != NULL ) - DOSUB2( r , ci->progeny[1] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[1] != NULL && cj->progeny[2] != NULL ) - DOSUB2( r , ci->progeny[1] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[1] != NULL 
&& cj->progeny[4] != NULL ) - DOSUB2( r , ci->progeny[1] , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[1] != NULL && cj->progeny[6] != NULL ) - DOSUB2( r , ci->progeny[1] , cj->progeny[6] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[0] != NULL ) - DOSUB2( r , ci->progeny[3] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[2] != NULL ) - DOSUB2( r , ci->progeny[3] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[4] != NULL ) - DOSUB2( r , ci->progeny[3] , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[6] != NULL ) - DOSUB2( r , ci->progeny[3] , cj->progeny[6] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[0] != NULL ) - DOSUB2( r , ci->progeny[5] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[2] != NULL ) - DOSUB2( r , ci->progeny[5] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[4] != NULL ) - DOSUB2( r , ci->progeny[5] , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[6] != NULL ) - DOSUB2( r , ci->progeny[5] , cj->progeny[6] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[0] != NULL ) - DOSUB2( r , ci->progeny[7] , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[2] != NULL ) - DOSUB2( r , ci->progeny[7] , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[4] != NULL ) - DOSUB2( r , ci->progeny[7] , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[6] != NULL ) - DOSUB2( r , ci->progeny[7] , cj->progeny[6] , -1 , 0 ); - break; - - } - - } - - /* Otherwise, compute the pair directly. */ - else if ( ci->dt_min <= dt_step || cj->dt_min <= dt_step ) { - - /* Do any of the cells need to be sorted first? */ - if ( !(ci->sorted & (1 << sid) ) ) - runner_dosort( r , ci , (1 << sid) , 1 ); - if ( !(cj->sorted & (1 << sid) ) ) - runner_dosort( r , cj , (1 << sid) , 1 ); - - /* Compute the interactions. 
*/ - DOPAIR2( r , ci , cj ); - - } - - } /* otherwise, pair interaction. */ - + } - if ( gettimer ) - #ifdef TIMER_VERBOSE - printf( "runner_dosub2[%02i]: flags=%i at depth %i took %.3f ms.\n" , r->id , sid , ci->depth , ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000 ); - #else - TIMER_TOC(TIMER_DOSUB); - #endif + /* Otherwise, compute the pair directly. */ + else if (ci->dt_min <= dt_step || cj->dt_min <= dt_step) { + /* Do any of the cells need to be sorted first? */ + if (!(ci->sorted & (1 << sid))) runner_dosort(r, ci, (1 << sid), 1); + if (!(cj->sorted & (1 << sid))) runner_dosort(r, cj, (1 << sid), 1); + + /* Compute the interactions. */ + DOPAIR1(r, ci, cj); } + } /* otherwise, pair interaction. */ -void DOSUB_SUBSET ( struct runner *r , struct cell *ci , struct part *parts , int *ind , int count , struct cell *cj , int sid , int gettimer ) { - - int j, k; - double shift[3]; - float h; - struct space *s = r->e->s; - struct cell *sub = NULL; - float dt_step = r->e->dt_step; - - TIMER_TIC - - /* Find out in which sub-cell of ci the parts are. */ - for ( k = 0 ; k < 8 ; k++ ) - if ( ci->progeny[k] != NULL ) { - // if ( parts[ ind[ 0 ] ].x[0] >= ci->progeny[k]->loc[0] && - // parts[ ind[ 0 ] ].x[0] <= ci->progeny[k]->loc[0] + ci->progeny[k]->h[0] && - // parts[ ind[ 0 ] ].x[1] >= ci->progeny[k]->loc[1] && - // parts[ ind[ 0 ] ].x[1] <= ci->progeny[k]->loc[1] + ci->progeny[k]->h[1] && - // parts[ ind[ 0 ] ].x[2] >= ci->progeny[k]->loc[2] && - // parts[ ind[ 0 ] ].x[2] <= ci->progeny[k]->loc[2] + ci->progeny[k]->h[2] ) { - if ( &parts[ ind[0] ] >= &ci->progeny[k]->parts[0] && - &parts[ ind[0] ] < &ci->progeny[k]->parts[ci->progeny[k]->count] ) { - sub = ci->progeny[k]; - break; - } - } - - - /* Is this a single cell? */ - if ( cj == NULL ) { - - /* Recurse? */ - if ( ci->split ) { - - /* Loop over all progeny. 
*/ - DOSUB_SUBSET( r , sub , parts , ind , count , NULL , -1 , 0 ); - for ( j = 0 ; j < 8 ; j++ ) - if ( ci->progeny[j] != sub && ci->progeny[j] != NULL ) - DOSUB_SUBSET( r , sub , parts , ind , count , ci->progeny[j] , -1 , 0 ); - - } - - /* Otherwsie, compute self-interaction. */ - else - DOSELF_SUBSET( r , ci , parts , ind , count ); - - } /* self-interaction. */ - - /* Otherwise, it's a pair interaction. */ - else { - - /* Get the cell dimensions. */ - h = fmin( ci->h[0] , fmin( ci->h[1] , ci->h[2] ) ); - - /* Recurse? */ - if ( ci->split && cj->split && - fmaxf( ci->h_max , cj->h_max )*kernel_gamma + ci->dx_max + cj->dx_max < h/2 ) { - - /* Get the type of pair if not specified explicitly. */ - sid = space_getsid( s , &ci , &cj , shift ); - - /* Different types of flags. */ - switch ( sid ) { - - /* Regular sub-cell interactions of a single cell. */ - case 0: /* ( 1 , 1 , 1 ) */ - if ( ci->progeny[7] == sub && cj->progeny[0] != NULL ) - DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[0] == sub ) - DOSUB_SUBSET( r , ci->progeny[0] , parts , ind , count , cj->progeny[7] , -1 , 0 ); - break; - - case 1: /* ( 1 , 1 , 0 ) */ - if ( ci->progeny[6] == sub && cj->progeny[0] != NULL ) - DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[0] == sub ) - DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[6] , -1 , 0 ); - if ( ci->progeny[6] == sub && cj->progeny[1] != NULL ) - DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[1] == sub ) - DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[6] , -1 , 0 ); - if ( ci->progeny[7] == sub && cj->progeny[0] != NULL ) - DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[0] == sub ) - 
DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[7] , -1 , 0 ); - if ( ci->progeny[7] == sub && cj->progeny[1] != NULL ) - DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[1] == sub ) - DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[7] , -1 , 0 ); - break; - - case 2: /* ( 1 , 1 , -1 ) */ - if ( ci->progeny[6] == sub && cj->progeny[1] != NULL ) - DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[1] == sub ) - DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[6] , -1 , 0 ); - break; - - case 3: /* ( 1 , 0 , 1 ) */ - if ( ci->progeny[5] == sub && cj->progeny[0] != NULL ) - DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[0] == sub ) - DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[5] , -1 , 0 ); - if ( ci->progeny[5] == sub && cj->progeny[2] != NULL ) - DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[2] == sub ) - DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[5] , -1 , 0 ); - if ( ci->progeny[7] == sub && cj->progeny[0] != NULL ) - DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[0] == sub ) - DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[7] , -1 , 0 ); - if ( ci->progeny[7] == sub && cj->progeny[2] != NULL ) - DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[2] == sub ) - DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[7] , -1 , 0 ); - break; - - case 4: /* ( 1 , 0 , 0 ) */ - if ( ci->progeny[4] == sub && cj->progeny[0] != NULL ) - DOSUB_SUBSET( r 
, ci->progeny[4] , parts , ind , count , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[4] != NULL && cj->progeny[0] == sub ) - DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[4] , -1 , 0 ); - if ( ci->progeny[4] == sub && cj->progeny[1] != NULL ) - DOSUB_SUBSET( r , ci->progeny[4] , parts , ind , count , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[4] != NULL && cj->progeny[1] == sub ) - DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[4] , -1 , 0 ); - if ( ci->progeny[4] == sub && cj->progeny[2] != NULL ) - DOSUB_SUBSET( r , ci->progeny[4] , parts , ind , count , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[4] != NULL && cj->progeny[2] == sub ) - DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[4] , -1 , 0 ); - if ( ci->progeny[4] == sub && cj->progeny[3] != NULL ) - DOSUB_SUBSET( r , ci->progeny[4] , parts , ind , count , cj->progeny[3] , -1 , 0 ); - if ( ci->progeny[4] != NULL && cj->progeny[3] == sub ) - DOSUB_SUBSET( r , cj->progeny[3] , parts , ind , count , ci->progeny[4] , -1 , 0 ); - if ( ci->progeny[5] == sub && cj->progeny[0] != NULL ) - DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[0] == sub ) - DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[5] , -1 , 0 ); - if ( ci->progeny[5] == sub && cj->progeny[1] != NULL ) - DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[1] == sub ) - DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[5] , -1 , 0 ); - if ( ci->progeny[5] == sub && cj->progeny[2] != NULL ) - DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[2] == sub ) - DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[5] , -1 , 0 ); - if ( ci->progeny[5] == sub && cj->progeny[3] != NULL ) - DOSUB_SUBSET( 
r , ci->progeny[5] , parts , ind , count , cj->progeny[3] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[3] == sub ) - DOSUB_SUBSET( r , cj->progeny[3] , parts , ind , count , ci->progeny[5] , -1 , 0 ); - if ( ci->progeny[6] == sub && cj->progeny[0] != NULL ) - DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[0] == sub ) - DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[6] , -1 , 0 ); - if ( ci->progeny[6] == sub && cj->progeny[1] != NULL ) - DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[1] == sub ) - DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[6] , -1 , 0 ); - if ( ci->progeny[6] == sub && cj->progeny[2] != NULL ) - DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[2] == sub ) - DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[6] , -1 , 0 ); - if ( ci->progeny[6] == sub && cj->progeny[3] != NULL ) - DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[3] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[3] == sub ) - DOSUB_SUBSET( r , cj->progeny[3] , parts , ind , count , ci->progeny[6] , -1 , 0 ); - if ( ci->progeny[7] == sub && cj->progeny[0] != NULL ) - DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[0] == sub ) - DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[7] , -1 , 0 ); - if ( ci->progeny[7] == sub && cj->progeny[1] != NULL ) - DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[1] == sub ) - DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[7] , -1 , 0 ); - if ( ci->progeny[7] == sub && cj->progeny[2] != NULL ) - 
DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[2] == sub ) - DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[7] , -1 , 0 ); - if ( ci->progeny[7] == sub && cj->progeny[3] != NULL ) - DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[3] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[3] == sub ) - DOSUB_SUBSET( r , cj->progeny[3] , parts , ind , count , ci->progeny[7] , -1 , 0 ); - break; - - case 5: /* ( 1 , 0 , -1 ) */ - if ( ci->progeny[4] == sub && cj->progeny[1] != NULL ) - DOSUB_SUBSET( r , ci->progeny[4] , parts , ind , count , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[4] != NULL && cj->progeny[1] == sub ) - DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[4] , -1 , 0 ); - if ( ci->progeny[4] == sub && cj->progeny[3] != NULL ) - DOSUB_SUBSET( r , ci->progeny[4] , parts , ind , count , cj->progeny[3] , -1 , 0 ); - if ( ci->progeny[4] != NULL && cj->progeny[3] == sub ) - DOSUB_SUBSET( r , cj->progeny[3] , parts , ind , count , ci->progeny[4] , -1 , 0 ); - if ( ci->progeny[6] == sub && cj->progeny[1] != NULL ) - DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[1] == sub ) - DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[6] , -1 , 0 ); - if ( ci->progeny[6] == sub && cj->progeny[3] != NULL ) - DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[3] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[3] == sub ) - DOSUB_SUBSET( r , cj->progeny[3] , parts , ind , count , ci->progeny[6] , -1 , 0 ); - break; - - case 6: /* ( 1 , -1 , 1 ) */ - if ( ci->progeny[5] == sub && cj->progeny[2] != NULL ) - DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[2] == sub ) - DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , 
ci->progeny[5] , -1 , 0 ); - break; - - case 7: /* ( 1 , -1 , 0 ) */ - if ( ci->progeny[4] == sub && cj->progeny[2] != NULL ) - DOSUB_SUBSET( r , ci->progeny[4] , parts , ind , count , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[4] != NULL && cj->progeny[2] == sub ) - DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[4] , -1 , 0 ); - if ( ci->progeny[4] == sub && cj->progeny[3] != NULL ) - DOSUB_SUBSET( r , ci->progeny[4] , parts , ind , count , cj->progeny[3] , -1 , 0 ); - if ( ci->progeny[4] != NULL && cj->progeny[3] == sub ) - DOSUB_SUBSET( r , cj->progeny[3] , parts , ind , count , ci->progeny[4] , -1 , 0 ); - if ( ci->progeny[5] == sub && cj->progeny[2] != NULL ) - DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[2] == sub ) - DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[5] , -1 , 0 ); - if ( ci->progeny[5] == sub && cj->progeny[3] != NULL ) - DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[3] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[3] == sub ) - DOSUB_SUBSET( r , cj->progeny[3] , parts , ind , count , ci->progeny[5] , -1 , 0 ); - break; - - case 8: /* ( 1 , -1 , -1 ) */ - if ( ci->progeny[4] == sub && cj->progeny[3] != NULL ) - DOSUB_SUBSET( r , ci->progeny[4] , parts , ind , count , cj->progeny[3] , -1 , 0 ); - if ( ci->progeny[4] != NULL && cj->progeny[3] == sub ) - DOSUB_SUBSET( r , cj->progeny[3] , parts , ind , count , ci->progeny[4] , -1 , 0 ); - break; - - case 9: /* ( 0 , 1 , 1 ) */ - if ( ci->progeny[3] == sub && cj->progeny[0] != NULL ) - DOSUB_SUBSET( r , ci->progeny[3] , parts , ind , count , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[0] == sub ) - DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[3] , -1 , 0 ); - if ( ci->progeny[3] == sub && cj->progeny[4] != NULL ) - DOSUB_SUBSET( r , ci->progeny[3] , parts , ind , count , cj->progeny[4] 
, -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[4] == sub ) - DOSUB_SUBSET( r , cj->progeny[4] , parts , ind , count , ci->progeny[3] , -1 , 0 ); - if ( ci->progeny[7] == sub && cj->progeny[0] != NULL ) - DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[0] == sub ) - DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[7] , -1 , 0 ); - if ( ci->progeny[7] == sub && cj->progeny[4] != NULL ) - DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[4] == sub ) - DOSUB_SUBSET( r , cj->progeny[4] , parts , ind , count , ci->progeny[7] , -1 , 0 ); - break; - - case 10: /* ( 0 , 1 , 0 ) */ - if ( ci->progeny[2] == sub && cj->progeny[0] != NULL ) - DOSUB_SUBSET( r , ci->progeny[2] , parts , ind , count , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[2] != NULL && cj->progeny[0] == sub ) - DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[2] , -1 , 0 ); - if ( ci->progeny[2] == sub && cj->progeny[1] != NULL ) - DOSUB_SUBSET( r , ci->progeny[2] , parts , ind , count , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[2] != NULL && cj->progeny[1] == sub ) - DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[2] , -1 , 0 ); - if ( ci->progeny[2] == sub && cj->progeny[4] != NULL ) - DOSUB_SUBSET( r , ci->progeny[2] , parts , ind , count , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[2] != NULL && cj->progeny[4] == sub ) - DOSUB_SUBSET( r , cj->progeny[4] , parts , ind , count , ci->progeny[2] , -1 , 0 ); - if ( ci->progeny[2] == sub && cj->progeny[5] != NULL ) - DOSUB_SUBSET( r , ci->progeny[2] , parts , ind , count , cj->progeny[5] , -1 , 0 ); - if ( ci->progeny[2] != NULL && cj->progeny[5] == sub ) - DOSUB_SUBSET( r , cj->progeny[5] , parts , ind , count , ci->progeny[2] , -1 , 0 ); - if ( ci->progeny[3] == sub && cj->progeny[0] != NULL ) - DOSUB_SUBSET( r , 
ci->progeny[3] , parts , ind , count , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[0] == sub ) - DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[3] , -1 , 0 ); - if ( ci->progeny[3] == sub && cj->progeny[1] != NULL ) - DOSUB_SUBSET( r , ci->progeny[3] , parts , ind , count , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[1] == sub ) - DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[3] , -1 , 0 ); - if ( ci->progeny[3] == sub && cj->progeny[4] != NULL ) - DOSUB_SUBSET( r , ci->progeny[3] , parts , ind , count , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[4] == sub ) - DOSUB_SUBSET( r , cj->progeny[4] , parts , ind , count , ci->progeny[3] , -1 , 0 ); - if ( ci->progeny[3] == sub && cj->progeny[5] != NULL ) - DOSUB_SUBSET( r , ci->progeny[3] , parts , ind , count , cj->progeny[5] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[5] == sub ) - DOSUB_SUBSET( r , cj->progeny[5] , parts , ind , count , ci->progeny[3] , -1 , 0 ); - if ( ci->progeny[6] == sub && cj->progeny[0] != NULL ) - DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[0] == sub ) - DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[6] , -1 , 0 ); - if ( ci->progeny[6] == sub && cj->progeny[1] != NULL ) - DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[1] == sub ) - DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[6] , -1 , 0 ); - if ( ci->progeny[6] == sub && cj->progeny[4] != NULL ) - DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[4] == sub ) - DOSUB_SUBSET( r , cj->progeny[4] , parts , ind , count , ci->progeny[6] , -1 , 0 ); - if ( ci->progeny[6] == sub && cj->progeny[5] != NULL ) - DOSUB_SUBSET( r 
, ci->progeny[6] , parts , ind , count , cj->progeny[5] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[5] == sub ) - DOSUB_SUBSET( r , cj->progeny[5] , parts , ind , count , ci->progeny[6] , -1 , 0 ); - if ( ci->progeny[7] == sub && cj->progeny[0] != NULL ) - DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[0] == sub ) - DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[7] , -1 , 0 ); - if ( ci->progeny[7] == sub && cj->progeny[1] != NULL ) - DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[1] == sub ) - DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[7] , -1 , 0 ); - if ( ci->progeny[7] == sub && cj->progeny[4] != NULL ) - DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[4] == sub ) - DOSUB_SUBSET( r , cj->progeny[4] , parts , ind , count , ci->progeny[7] , -1 , 0 ); - if ( ci->progeny[7] == sub && cj->progeny[5] != NULL ) - DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[5] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[5] == sub ) - DOSUB_SUBSET( r , cj->progeny[5] , parts , ind , count , ci->progeny[7] , -1 , 0 ); - break; - - case 11: /* ( 0 , 1 , -1 ) */ - if ( ci->progeny[2] == sub && cj->progeny[1] != NULL ) - DOSUB_SUBSET( r , ci->progeny[2] , parts , ind , count , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[2] != NULL && cj->progeny[1] == sub ) - DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[2] , -1 , 0 ); - if ( ci->progeny[2] == sub && cj->progeny[5] != NULL ) - DOSUB_SUBSET( r , ci->progeny[2] , parts , ind , count , cj->progeny[5] , -1 , 0 ); - if ( ci->progeny[2] != NULL && cj->progeny[5] == sub ) - DOSUB_SUBSET( r , cj->progeny[5] , parts , ind , count , ci->progeny[2] , -1 , 0 ); - if ( ci->progeny[6] == sub 
&& cj->progeny[1] != NULL ) - DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[1] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[1] == sub ) - DOSUB_SUBSET( r , cj->progeny[1] , parts , ind , count , ci->progeny[6] , -1 , 0 ); - if ( ci->progeny[6] == sub && cj->progeny[5] != NULL ) - DOSUB_SUBSET( r , ci->progeny[6] , parts , ind , count , cj->progeny[5] , -1 , 0 ); - if ( ci->progeny[6] != NULL && cj->progeny[5] == sub ) - DOSUB_SUBSET( r , cj->progeny[5] , parts , ind , count , ci->progeny[6] , -1 , 0 ); - break; - - case 12: /* ( 0 , 0 , 1 ) */ - if ( ci->progeny[1] == sub && cj->progeny[0] != NULL ) - DOSUB_SUBSET( r , ci->progeny[1] , parts , ind , count , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[1] != NULL && cj->progeny[0] == sub ) - DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[1] , -1 , 0 ); - if ( ci->progeny[1] == sub && cj->progeny[2] != NULL ) - DOSUB_SUBSET( r , ci->progeny[1] , parts , ind , count , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[1] != NULL && cj->progeny[2] == sub ) - DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[1] , -1 , 0 ); - if ( ci->progeny[1] == sub && cj->progeny[4] != NULL ) - DOSUB_SUBSET( r , ci->progeny[1] , parts , ind , count , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[1] != NULL && cj->progeny[4] == sub ) - DOSUB_SUBSET( r , cj->progeny[4] , parts , ind , count , ci->progeny[1] , -1 , 0 ); - if ( ci->progeny[1] == sub && cj->progeny[6] != NULL ) - DOSUB_SUBSET( r , ci->progeny[1] , parts , ind , count , cj->progeny[6] , -1 , 0 ); - if ( ci->progeny[1] != NULL && cj->progeny[6] == sub ) - DOSUB_SUBSET( r , cj->progeny[6] , parts , ind , count , ci->progeny[1] , -1 , 0 ); - if ( ci->progeny[3] == sub && cj->progeny[0] != NULL ) - DOSUB_SUBSET( r , ci->progeny[3] , parts , ind , count , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[0] == sub ) - DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , 
ci->progeny[3] , -1 , 0 ); - if ( ci->progeny[3] == sub && cj->progeny[2] != NULL ) - DOSUB_SUBSET( r , ci->progeny[3] , parts , ind , count , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[2] == sub ) - DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[3] , -1 , 0 ); - if ( ci->progeny[3] == sub && cj->progeny[4] != NULL ) - DOSUB_SUBSET( r , ci->progeny[3] , parts , ind , count , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[4] == sub ) - DOSUB_SUBSET( r , cj->progeny[4] , parts , ind , count , ci->progeny[3] , -1 , 0 ); - if ( ci->progeny[3] == sub && cj->progeny[6] != NULL ) - DOSUB_SUBSET( r , ci->progeny[3] , parts , ind , count , cj->progeny[6] , -1 , 0 ); - if ( ci->progeny[3] != NULL && cj->progeny[6] == sub ) - DOSUB_SUBSET( r , cj->progeny[6] , parts , ind , count , ci->progeny[3] , -1 , 0 ); - if ( ci->progeny[5] == sub && cj->progeny[0] != NULL ) - DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[0] == sub ) - DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[5] , -1 , 0 ); - if ( ci->progeny[5] == sub && cj->progeny[2] != NULL ) - DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[2] == sub ) - DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[5] , -1 , 0 ); - if ( ci->progeny[5] == sub && cj->progeny[4] != NULL ) - DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[4] == sub ) - DOSUB_SUBSET( r , cj->progeny[4] , parts , ind , count , ci->progeny[5] , -1 , 0 ); - if ( ci->progeny[5] == sub && cj->progeny[6] != NULL ) - DOSUB_SUBSET( r , ci->progeny[5] , parts , ind , count , cj->progeny[6] , -1 , 0 ); - if ( ci->progeny[5] != NULL && cj->progeny[6] == sub ) - DOSUB_SUBSET( r , cj->progeny[6] , parts , ind , count 
, ci->progeny[5] , -1 , 0 ); - if ( ci->progeny[7] == sub && cj->progeny[0] != NULL ) - DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[0] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[0] == sub ) - DOSUB_SUBSET( r , cj->progeny[0] , parts , ind , count , ci->progeny[7] , -1 , 0 ); - if ( ci->progeny[7] == sub && cj->progeny[2] != NULL ) - DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[2] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[2] == sub ) - DOSUB_SUBSET( r , cj->progeny[2] , parts , ind , count , ci->progeny[7] , -1 , 0 ); - if ( ci->progeny[7] == sub && cj->progeny[4] != NULL ) - DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[4] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[4] == sub ) - DOSUB_SUBSET( r , cj->progeny[4] , parts , ind , count , ci->progeny[7] , -1 , 0 ); - if ( ci->progeny[7] == sub && cj->progeny[6] != NULL ) - DOSUB_SUBSET( r , ci->progeny[7] , parts , ind , count , cj->progeny[6] , -1 , 0 ); - if ( ci->progeny[7] != NULL && cj->progeny[6] == sub ) - DOSUB_SUBSET( r , cj->progeny[6] , parts , ind , count , ci->progeny[7] , -1 , 0 ); - break; - - } - - } - - /* Otherwise, compute the pair directly. */ - else if ( ci->dt_min <= dt_step || cj->dt_min <= dt_step ) { - - /* Get the relative distance between the pairs, wrapping. */ - for ( k = 0 ; k < 3 ; k++ ) { - if ( cj->loc[k] - ci->loc[k] < -s->dim[k]/2 ) - shift[k] = s->dim[k]; - else if ( cj->loc[k] - ci->loc[k] > s->dim[k]/2 ) - shift[k] = -s->dim[k]; - } - - /* Get the sorting index. */ - for ( sid = 0 , k = 0 ; k < 3 ; k++ ) - sid = 3*sid + ( (cj->loc[k] - ci->loc[k] + shift[k] < 0) ? 0 : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1 ); - sid = sortlistID[sid]; - - /* Do any of the cells need to be sorted first? */ - if ( !(cj->sorted & (1 << sid) ) ) - runner_dosort( r , cj , (1 << sid) , 1 ); - - /* Compute the interactions. 
*/ - DOPAIR_SUBSET( r , ci , parts , ind , count , cj ); - - } - - } /* otherwise, pair interaction. */ - + if (gettimer) +#ifdef TIMER_VERBOSE + printf("runner_dosub1[%02i]: flags=%i at depth %i took %.3f ms.\n", r->id, + sid, ci->depth, ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000); +#else + TIMER_TOC(TIMER_DOSUB); +#endif +} + +void DOSUB2(struct runner *r, struct cell *ci, struct cell *cj, int sid, + int gettimer) { + + int j, k; + double shift[3]; + float h; + struct space *s = r->e->s; + float dt_step = r->e->dt_step; + + TIMER_TIC + + /* Is this a single cell? */ + if (cj == NULL) { + + /* Should we even bother? */ + if (ci->dt_min > dt_step) return; - if ( gettimer ) - #ifdef TIMER_VERBOSE - printf( "runner_dosub[%02i]: flags=%i at depth %i took %.3f ms.\n" , r->id , sid , ci->depth , ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000 ); - #else - TIMER_TOC(TIMER_DOSUB); - #endif + /* Recurse? */ + if (ci->split) { + + /* Loop over all progeny. */ + for (k = 0; k < 8; k++) + if (ci->progeny[k] != NULL) { + DOSUB2(r, ci->progeny[k], NULL, -1, 0); + for (j = k + 1; j < 8; j++) + if (ci->progeny[j] != NULL) + DOSUB2(r, ci->progeny[k], ci->progeny[j], -1, 0); + } + + } + + /* Otherwsie, compute self-interaction. */ + else + DOSELF2(r, ci); + + } /* self-interaction. */ + + /* Otherwise, it's a pair interaction. */ + else { + + /* Should we even bother? */ + if (ci->dt_min > dt_step && cj->dt_min > dt_step) return; + + /* Get the cell dimensions. */ + h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2])); + + /* Get the type of pair if not specified explicitly. */ + // if ( sid < 0 ) + sid = space_getsid(s, &ci, &cj, shift); + + /* Recurse? */ + if (ci->split && cj->split && + fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max < + h / 2) { + + /* Different types of flags. */ + switch (sid) { + + /* Regular sub-cell interactions of a single cell. 
*/ + case 0: /* ( 1 , 1 , 1 ) */ + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0); + break; + + case 1: /* ( 1 , 1 , 0 ) */ + if (ci->progeny[6] != NULL && cj->progeny[0] != NULL) + DOSUB2(r, ci->progeny[6], cj->progeny[0], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[1] != NULL) + DOSUB2(r, ci->progeny[7], cj->progeny[1], -1, 0); + break; + + case 2: /* ( 1 , 1 , -1 ) */ + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0); + break; + + case 3: /* ( 1 , 0 , 1 ) */ + if (ci->progeny[5] != NULL && cj->progeny[0] != NULL) + DOSUB2(r, ci->progeny[5], cj->progeny[0], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[2] != NULL) + DOSUB2(r, ci->progeny[7], cj->progeny[2], -1, 0); + break; + + case 4: /* ( 1 , 0 , 0 ) */ + if (ci->progeny[4] != NULL && cj->progeny[0] != NULL) + DOSUB2(r, ci->progeny[4], cj->progeny[0], -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[1] != NULL) + DOSUB2(r, ci->progeny[4], cj->progeny[1], -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[2] != NULL) + DOSUB2(r, ci->progeny[4], cj->progeny[2], -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) + DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[0] != NULL) + DOSUB2(r, ci->progeny[5], cj->progeny[0], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[1] != NULL) + DOSUB2(r, ci->progeny[5], cj->progeny[1], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + DOSUB2(r, 
ci->progeny[5], cj->progeny[2], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[3] != NULL) + DOSUB2(r, ci->progeny[5], cj->progeny[3], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[0] != NULL) + DOSUB2(r, ci->progeny[6], cj->progeny[0], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[2] != NULL) + DOSUB2(r, ci->progeny[6], cj->progeny[2], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[3] != NULL) + DOSUB2(r, ci->progeny[6], cj->progeny[3], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[1] != NULL) + DOSUB2(r, ci->progeny[7], cj->progeny[1], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[2] != NULL) + DOSUB2(r, ci->progeny[7], cj->progeny[2], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[3] != NULL) + DOSUB2(r, ci->progeny[7], cj->progeny[3], -1, 0); + break; + + case 5: /* ( 1 , 0 , -1 ) */ + if (ci->progeny[4] != NULL && cj->progeny[1] != NULL) + DOSUB2(r, ci->progeny[4], cj->progeny[1], -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) + DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[3] != NULL) + DOSUB2(r, ci->progeny[6], cj->progeny[3], -1, 0); + break; + + case 6: /* ( 1 , -1 , 1 ) */ + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0); + break; + + case 7: /* ( 1 , -1 , 0 ) */ + if (ci->progeny[4] != NULL && cj->progeny[2] != NULL) + DOSUB2(r, ci->progeny[4], cj->progeny[2], -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) + DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 
0); + if (ci->progeny[5] != NULL && cj->progeny[3] != NULL) + DOSUB2(r, ci->progeny[5], cj->progeny[3], -1, 0); + break; + + case 8: /* ( 1 , -1 , -1 ) */ + if (ci->progeny[4] != NULL && cj->progeny[3] != NULL) + DOSUB2(r, ci->progeny[4], cj->progeny[3], -1, 0); + break; + + case 9: /* ( 0 , 1 , 1 ) */ + if (ci->progeny[3] != NULL && cj->progeny[0] != NULL) + DOSUB2(r, ci->progeny[3], cj->progeny[0], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[4] != NULL) + DOSUB2(r, ci->progeny[3], cj->progeny[4], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[4] != NULL) + DOSUB2(r, ci->progeny[7], cj->progeny[4], -1, 0); + break; + + case 10: /* ( 0 , 1 , 0 ) */ + if (ci->progeny[2] != NULL && cj->progeny[0] != NULL) + DOSUB2(r, ci->progeny[2], cj->progeny[0], -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[1] != NULL) + DOSUB2(r, ci->progeny[2], cj->progeny[1], -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[4] != NULL) + DOSUB2(r, ci->progeny[2], cj->progeny[4], -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[5] != NULL) + DOSUB2(r, ci->progeny[2], cj->progeny[5], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[0] != NULL) + DOSUB2(r, ci->progeny[3], cj->progeny[0], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[1] != NULL) + DOSUB2(r, ci->progeny[3], cj->progeny[1], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[4] != NULL) + DOSUB2(r, ci->progeny[3], cj->progeny[4], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[5] != NULL) + DOSUB2(r, ci->progeny[3], cj->progeny[5], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[0] != NULL) + DOSUB2(r, ci->progeny[6], cj->progeny[0], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[4] != NULL) + DOSUB2(r, ci->progeny[6], cj->progeny[4], -1, 0); + if (ci->progeny[6] != NULL && 
cj->progeny[5] != NULL) + DOSUB2(r, ci->progeny[6], cj->progeny[5], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[1] != NULL) + DOSUB2(r, ci->progeny[7], cj->progeny[1], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[4] != NULL) + DOSUB2(r, ci->progeny[7], cj->progeny[4], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[5] != NULL) + DOSUB2(r, ci->progeny[7], cj->progeny[5], -1, 0); + break; + + case 11: /* ( 0 , 1 , -1 ) */ + if (ci->progeny[2] != NULL && cj->progeny[1] != NULL) + DOSUB2(r, ci->progeny[2], cj->progeny[1], -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[5] != NULL) + DOSUB2(r, ci->progeny[2], cj->progeny[5], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] != NULL) + DOSUB2(r, ci->progeny[6], cj->progeny[1], -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[5] != NULL) + DOSUB2(r, ci->progeny[6], cj->progeny[5], -1, 0); + break; + + case 12: /* ( 0 , 0 , 1 ) */ + if (ci->progeny[1] != NULL && cj->progeny[0] != NULL) + DOSUB2(r, ci->progeny[1], cj->progeny[0], -1, 0); + if (ci->progeny[1] != NULL && cj->progeny[2] != NULL) + DOSUB2(r, ci->progeny[1], cj->progeny[2], -1, 0); + if (ci->progeny[1] != NULL && cj->progeny[4] != NULL) + DOSUB2(r, ci->progeny[1], cj->progeny[4], -1, 0); + if (ci->progeny[1] != NULL && cj->progeny[6] != NULL) + DOSUB2(r, ci->progeny[1], cj->progeny[6], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[0] != NULL) + DOSUB2(r, ci->progeny[3], cj->progeny[0], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[2] != NULL) + DOSUB2(r, ci->progeny[3], cj->progeny[2], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[4] != NULL) + DOSUB2(r, ci->progeny[3], cj->progeny[4], -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[6] != NULL) + DOSUB2(r, ci->progeny[3], cj->progeny[6], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[0] != NULL) + DOSUB2(r, ci->progeny[5], cj->progeny[0], -1, 0); + 
if (ci->progeny[5] != NULL && cj->progeny[2] != NULL) + DOSUB2(r, ci->progeny[5], cj->progeny[2], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[4] != NULL) + DOSUB2(r, ci->progeny[5], cj->progeny[4], -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[6] != NULL) + DOSUB2(r, ci->progeny[5], cj->progeny[6], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] != NULL) + DOSUB2(r, ci->progeny[7], cj->progeny[0], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[2] != NULL) + DOSUB2(r, ci->progeny[7], cj->progeny[2], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[4] != NULL) + DOSUB2(r, ci->progeny[7], cj->progeny[4], -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[6] != NULL) + DOSUB2(r, ci->progeny[7], cj->progeny[6], -1, 0); + break; + } + + } + + /* Otherwise, compute the pair directly. */ + else if (ci->dt_min <= dt_step || cj->dt_min <= dt_step) { + + /* Do any of the cells need to be sorted first? */ + if (!(ci->sorted & (1 << sid))) runner_dosort(r, ci, (1 << sid), 1); + if (!(cj->sorted & (1 << sid))) runner_dosort(r, cj, (1 << sid), 1); + + /* Compute the interactions. */ + DOPAIR2(r, ci, cj); + } + + } /* otherwise, pair interaction. */ + + if (gettimer) +#ifdef TIMER_VERBOSE + printf("runner_dosub2[%02i]: flags=%i at depth %i took %.3f ms.\n", r->id, + sid, ci->depth, ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000); +#else + TIMER_TOC(TIMER_DOSUB); +#endif +} + +void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts, + int *ind, int count, struct cell *cj, int sid, int gettimer) { + + int j, k; + double shift[3]; + float h; + struct space *s = r->e->s; + struct cell *sub = NULL; + float dt_step = r->e->dt_step; + + TIMER_TIC + + /* Find out in which sub-cell of ci the parts are. 
*/ + for (k = 0; k < 8; k++) + if (ci->progeny[k] != NULL) { + // if ( parts[ ind[ 0 ] ].x[0] >= ci->progeny[k]->loc[0] && + // parts[ ind[ 0 ] ].x[0] <= ci->progeny[k]->loc[0] + + // ci->progeny[k]->h[0] && + // parts[ ind[ 0 ] ].x[1] >= ci->progeny[k]->loc[1] && + // parts[ ind[ 0 ] ].x[1] <= ci->progeny[k]->loc[1] + + // ci->progeny[k]->h[1] && + // parts[ ind[ 0 ] ].x[2] >= ci->progeny[k]->loc[2] && + // parts[ ind[ 0 ] ].x[2] <= ci->progeny[k]->loc[2] + + // ci->progeny[k]->h[2] ) { + if (&parts[ind[0]] >= &ci->progeny[k]->parts[0] && + &parts[ind[0]] < &ci->progeny[k]->parts[ci->progeny[k]->count]) { + sub = ci->progeny[k]; + break; + } + } + + /* Is this a single cell? */ + if (cj == NULL) { + + /* Recurse? */ + if (ci->split) { + + /* Loop over all progeny. */ + DOSUB_SUBSET(r, sub, parts, ind, count, NULL, -1, 0); + for (j = 0; j < 8; j++) + if (ci->progeny[j] != sub && ci->progeny[j] != NULL) + DOSUB_SUBSET(r, sub, parts, ind, count, ci->progeny[j], -1, 0); + + } + + /* Otherwsie, compute self-interaction. */ + else + DOSELF_SUBSET(r, ci, parts, ind, count); + + } /* self-interaction. */ + + /* Otherwise, it's a pair interaction. */ + else { + + /* Get the cell dimensions. */ + h = fmin(ci->h[0], fmin(ci->h[1], ci->h[2])); + + /* Recurse? */ + if (ci->split && cj->split && + fmaxf(ci->h_max, cj->h_max) * kernel_gamma + ci->dx_max + cj->dx_max < + h / 2) { + + /* Get the type of pair if not specified explicitly. */ + sid = space_getsid(s, &ci, &cj, shift); + + /* Different types of flags. */ + switch (sid) { + + /* Regular sub-cell interactions of a single cell. 
*/ + case 0: /* ( 1 , 1 , 1 ) */ + if (ci->progeny[7] == sub && cj->progeny[0] != NULL) + DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[0], + -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] == sub) + DOSUB_SUBSET(r, ci->progeny[0], parts, ind, count, cj->progeny[7], + -1, 0); + break; + + case 1: /* ( 1 , 1 , 0 ) */ + if (ci->progeny[6] == sub && cj->progeny[0] != NULL) + DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[0], + -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[0] == sub) + DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[6], + -1, 0); + if (ci->progeny[6] == sub && cj->progeny[1] != NULL) + DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[1], + -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] == sub) + DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[6], + -1, 0); + if (ci->progeny[7] == sub && cj->progeny[0] != NULL) + DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[0], + -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] == sub) + DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[7], + -1, 0); + if (ci->progeny[7] == sub && cj->progeny[1] != NULL) + DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[1], + -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[1] == sub) + DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[7], + -1, 0); + break; + + case 2: /* ( 1 , 1 , -1 ) */ + if (ci->progeny[6] == sub && cj->progeny[1] != NULL) + DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[1], + -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] == sub) + DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[6], + -1, 0); + break; + + case 3: /* ( 1 , 0 , 1 ) */ + if (ci->progeny[5] == sub && cj->progeny[0] != NULL) + DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[0], + -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[0] == sub) + DOSUB_SUBSET(r, cj->progeny[0], parts, ind, 
count, ci->progeny[5], + -1, 0); + if (ci->progeny[5] == sub && cj->progeny[2] != NULL) + DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[2], + -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[2] == sub) + DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[5], + -1, 0); + if (ci->progeny[7] == sub && cj->progeny[0] != NULL) + DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[0], + -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] == sub) + DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[7], + -1, 0); + if (ci->progeny[7] == sub && cj->progeny[2] != NULL) + DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[2], + -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[2] == sub) + DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[7], + -1, 0); + break; + + case 4: /* ( 1 , 0 , 0 ) */ + if (ci->progeny[4] == sub && cj->progeny[0] != NULL) + DOSUB_SUBSET(r, ci->progeny[4], parts, ind, count, cj->progeny[0], + -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[0] == sub) + DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[4], + -1, 0); + if (ci->progeny[4] == sub && cj->progeny[1] != NULL) + DOSUB_SUBSET(r, ci->progeny[4], parts, ind, count, cj->progeny[1], + -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[1] == sub) + DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[4], + -1, 0); + if (ci->progeny[4] == sub && cj->progeny[2] != NULL) + DOSUB_SUBSET(r, ci->progeny[4], parts, ind, count, cj->progeny[2], + -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[2] == sub) + DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[4], + -1, 0); + if (ci->progeny[4] == sub && cj->progeny[3] != NULL) + DOSUB_SUBSET(r, ci->progeny[4], parts, ind, count, cj->progeny[3], + -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[3] == sub) + DOSUB_SUBSET(r, cj->progeny[3], parts, ind, count, ci->progeny[4], + -1, 0); + if (ci->progeny[5] == sub && cj->progeny[0] != 
NULL) + DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[0], + -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[0] == sub) + DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[5], + -1, 0); + if (ci->progeny[5] == sub && cj->progeny[1] != NULL) + DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[1], + -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[1] == sub) + DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[5], + -1, 0); + if (ci->progeny[5] == sub && cj->progeny[2] != NULL) + DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[2], + -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[2] == sub) + DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[5], + -1, 0); + if (ci->progeny[5] == sub && cj->progeny[3] != NULL) + DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[3], + -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[3] == sub) + DOSUB_SUBSET(r, cj->progeny[3], parts, ind, count, ci->progeny[5], + -1, 0); + if (ci->progeny[6] == sub && cj->progeny[0] != NULL) + DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[0], + -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[0] == sub) + DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[6], + -1, 0); + if (ci->progeny[6] == sub && cj->progeny[1] != NULL) + DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[1], + -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] == sub) + DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[6], + -1, 0); + if (ci->progeny[6] == sub && cj->progeny[2] != NULL) + DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[2], + -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[2] == sub) + DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[6], + -1, 0); + if (ci->progeny[6] == sub && cj->progeny[3] != NULL) + DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[3], + -1, 0); + if (ci->progeny[6] != NULL && 
cj->progeny[3] == sub) + DOSUB_SUBSET(r, cj->progeny[3], parts, ind, count, ci->progeny[6], + -1, 0); + if (ci->progeny[7] == sub && cj->progeny[0] != NULL) + DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[0], + -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] == sub) + DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[7], + -1, 0); + if (ci->progeny[7] == sub && cj->progeny[1] != NULL) + DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[1], + -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[1] == sub) + DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[7], + -1, 0); + if (ci->progeny[7] == sub && cj->progeny[2] != NULL) + DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[2], + -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[2] == sub) + DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[7], + -1, 0); + if (ci->progeny[7] == sub && cj->progeny[3] != NULL) + DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[3], + -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[3] == sub) + DOSUB_SUBSET(r, cj->progeny[3], parts, ind, count, ci->progeny[7], + -1, 0); + break; + + case 5: /* ( 1 , 0 , -1 ) */ + if (ci->progeny[4] == sub && cj->progeny[1] != NULL) + DOSUB_SUBSET(r, ci->progeny[4], parts, ind, count, cj->progeny[1], + -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[1] == sub) + DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[4], + -1, 0); + if (ci->progeny[4] == sub && cj->progeny[3] != NULL) + DOSUB_SUBSET(r, ci->progeny[4], parts, ind, count, cj->progeny[3], + -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[3] == sub) + DOSUB_SUBSET(r, cj->progeny[3], parts, ind, count, ci->progeny[4], + -1, 0); + if (ci->progeny[6] == sub && cj->progeny[1] != NULL) + DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[1], + -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] == sub) + DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, 
ci->progeny[6], + -1, 0); + if (ci->progeny[6] == sub && cj->progeny[3] != NULL) + DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[3], + -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[3] == sub) + DOSUB_SUBSET(r, cj->progeny[3], parts, ind, count, ci->progeny[6], + -1, 0); + break; + + case 6: /* ( 1 , -1 , 1 ) */ + if (ci->progeny[5] == sub && cj->progeny[2] != NULL) + DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[2], + -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[2] == sub) + DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[5], + -1, 0); + break; + + case 7: /* ( 1 , -1 , 0 ) */ + if (ci->progeny[4] == sub && cj->progeny[2] != NULL) + DOSUB_SUBSET(r, ci->progeny[4], parts, ind, count, cj->progeny[2], + -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[2] == sub) + DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[4], + -1, 0); + if (ci->progeny[4] == sub && cj->progeny[3] != NULL) + DOSUB_SUBSET(r, ci->progeny[4], parts, ind, count, cj->progeny[3], + -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[3] == sub) + DOSUB_SUBSET(r, cj->progeny[3], parts, ind, count, ci->progeny[4], + -1, 0); + if (ci->progeny[5] == sub && cj->progeny[2] != NULL) + DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[2], + -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[2] == sub) + DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[5], + -1, 0); + if (ci->progeny[5] == sub && cj->progeny[3] != NULL) + DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[3], + -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[3] == sub) + DOSUB_SUBSET(r, cj->progeny[3], parts, ind, count, ci->progeny[5], + -1, 0); + break; + + case 8: /* ( 1 , -1 , -1 ) */ + if (ci->progeny[4] == sub && cj->progeny[3] != NULL) + DOSUB_SUBSET(r, ci->progeny[4], parts, ind, count, cj->progeny[3], + -1, 0); + if (ci->progeny[4] != NULL && cj->progeny[3] == sub) + DOSUB_SUBSET(r, cj->progeny[3], parts, ind, 
count, ci->progeny[4], + -1, 0); + break; + + case 9: /* ( 0 , 1 , 1 ) */ + if (ci->progeny[3] == sub && cj->progeny[0] != NULL) + DOSUB_SUBSET(r, ci->progeny[3], parts, ind, count, cj->progeny[0], + -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[0] == sub) + DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[3], + -1, 0); + if (ci->progeny[3] == sub && cj->progeny[4] != NULL) + DOSUB_SUBSET(r, ci->progeny[3], parts, ind, count, cj->progeny[4], + -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[4] == sub) + DOSUB_SUBSET(r, cj->progeny[4], parts, ind, count, ci->progeny[3], + -1, 0); + if (ci->progeny[7] == sub && cj->progeny[0] != NULL) + DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[0], + -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] == sub) + DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[7], + -1, 0); + if (ci->progeny[7] == sub && cj->progeny[4] != NULL) + DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[4], + -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[4] == sub) + DOSUB_SUBSET(r, cj->progeny[4], parts, ind, count, ci->progeny[7], + -1, 0); + break; + + case 10: /* ( 0 , 1 , 0 ) */ + if (ci->progeny[2] == sub && cj->progeny[0] != NULL) + DOSUB_SUBSET(r, ci->progeny[2], parts, ind, count, cj->progeny[0], + -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[0] == sub) + DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[2], + -1, 0); + if (ci->progeny[2] == sub && cj->progeny[1] != NULL) + DOSUB_SUBSET(r, ci->progeny[2], parts, ind, count, cj->progeny[1], + -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[1] == sub) + DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[2], + -1, 0); + if (ci->progeny[2] == sub && cj->progeny[4] != NULL) + DOSUB_SUBSET(r, ci->progeny[2], parts, ind, count, cj->progeny[4], + -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[4] == sub) + DOSUB_SUBSET(r, cj->progeny[4], parts, ind, count, ci->progeny[2], + -1, 0); + if 
(ci->progeny[2] == sub && cj->progeny[5] != NULL) + DOSUB_SUBSET(r, ci->progeny[2], parts, ind, count, cj->progeny[5], + -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[5] == sub) + DOSUB_SUBSET(r, cj->progeny[5], parts, ind, count, ci->progeny[2], + -1, 0); + if (ci->progeny[3] == sub && cj->progeny[0] != NULL) + DOSUB_SUBSET(r, ci->progeny[3], parts, ind, count, cj->progeny[0], + -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[0] == sub) + DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[3], + -1, 0); + if (ci->progeny[3] == sub && cj->progeny[1] != NULL) + DOSUB_SUBSET(r, ci->progeny[3], parts, ind, count, cj->progeny[1], + -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[1] == sub) + DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[3], + -1, 0); + if (ci->progeny[3] == sub && cj->progeny[4] != NULL) + DOSUB_SUBSET(r, ci->progeny[3], parts, ind, count, cj->progeny[4], + -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[4] == sub) + DOSUB_SUBSET(r, cj->progeny[4], parts, ind, count, ci->progeny[3], + -1, 0); + if (ci->progeny[3] == sub && cj->progeny[5] != NULL) + DOSUB_SUBSET(r, ci->progeny[3], parts, ind, count, cj->progeny[5], + -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[5] == sub) + DOSUB_SUBSET(r, cj->progeny[5], parts, ind, count, ci->progeny[3], + -1, 0); + if (ci->progeny[6] == sub && cj->progeny[0] != NULL) + DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[0], + -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[0] == sub) + DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[6], + -1, 0); + if (ci->progeny[6] == sub && cj->progeny[1] != NULL) + DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[1], + -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] == sub) + DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[6], + -1, 0); + if (ci->progeny[6] == sub && cj->progeny[4] != NULL) + DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[4], + 
-1, 0); + if (ci->progeny[6] != NULL && cj->progeny[4] == sub) + DOSUB_SUBSET(r, cj->progeny[4], parts, ind, count, ci->progeny[6], + -1, 0); + if (ci->progeny[6] == sub && cj->progeny[5] != NULL) + DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[5], + -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[5] == sub) + DOSUB_SUBSET(r, cj->progeny[5], parts, ind, count, ci->progeny[6], + -1, 0); + if (ci->progeny[7] == sub && cj->progeny[0] != NULL) + DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[0], + -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[0] == sub) + DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[7], + -1, 0); + if (ci->progeny[7] == sub && cj->progeny[1] != NULL) + DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[1], + -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[1] == sub) + DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[7], + -1, 0); + if (ci->progeny[7] == sub && cj->progeny[4] != NULL) + DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[4], + -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[4] == sub) + DOSUB_SUBSET(r, cj->progeny[4], parts, ind, count, ci->progeny[7], + -1, 0); + if (ci->progeny[7] == sub && cj->progeny[5] != NULL) + DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[5], + -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[5] == sub) + DOSUB_SUBSET(r, cj->progeny[5], parts, ind, count, ci->progeny[7], + -1, 0); + break; + + case 11: /* ( 0 , 1 , -1 ) */ + if (ci->progeny[2] == sub && cj->progeny[1] != NULL) + DOSUB_SUBSET(r, ci->progeny[2], parts, ind, count, cj->progeny[1], + -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[1] == sub) + DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[2], + -1, 0); + if (ci->progeny[2] == sub && cj->progeny[5] != NULL) + DOSUB_SUBSET(r, ci->progeny[2], parts, ind, count, cj->progeny[5], + -1, 0); + if (ci->progeny[2] != NULL && cj->progeny[5] == sub) + DOSUB_SUBSET(r, 
cj->progeny[5], parts, ind, count, ci->progeny[2], + -1, 0); + if (ci->progeny[6] == sub && cj->progeny[1] != NULL) + DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[1], + -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[1] == sub) + DOSUB_SUBSET(r, cj->progeny[1], parts, ind, count, ci->progeny[6], + -1, 0); + if (ci->progeny[6] == sub && cj->progeny[5] != NULL) + DOSUB_SUBSET(r, ci->progeny[6], parts, ind, count, cj->progeny[5], + -1, 0); + if (ci->progeny[6] != NULL && cj->progeny[5] == sub) + DOSUB_SUBSET(r, cj->progeny[5], parts, ind, count, ci->progeny[6], + -1, 0); + break; + + case 12: /* ( 0 , 0 , 1 ) */ + if (ci->progeny[1] == sub && cj->progeny[0] != NULL) + DOSUB_SUBSET(r, ci->progeny[1], parts, ind, count, cj->progeny[0], + -1, 0); + if (ci->progeny[1] != NULL && cj->progeny[0] == sub) + DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[1], + -1, 0); + if (ci->progeny[1] == sub && cj->progeny[2] != NULL) + DOSUB_SUBSET(r, ci->progeny[1], parts, ind, count, cj->progeny[2], + -1, 0); + if (ci->progeny[1] != NULL && cj->progeny[2] == sub) + DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[1], + -1, 0); + if (ci->progeny[1] == sub && cj->progeny[4] != NULL) + DOSUB_SUBSET(r, ci->progeny[1], parts, ind, count, cj->progeny[4], + -1, 0); + if (ci->progeny[1] != NULL && cj->progeny[4] == sub) + DOSUB_SUBSET(r, cj->progeny[4], parts, ind, count, ci->progeny[1], + -1, 0); + if (ci->progeny[1] == sub && cj->progeny[6] != NULL) + DOSUB_SUBSET(r, ci->progeny[1], parts, ind, count, cj->progeny[6], + -1, 0); + if (ci->progeny[1] != NULL && cj->progeny[6] == sub) + DOSUB_SUBSET(r, cj->progeny[6], parts, ind, count, ci->progeny[1], + -1, 0); + if (ci->progeny[3] == sub && cj->progeny[0] != NULL) + DOSUB_SUBSET(r, ci->progeny[3], parts, ind, count, cj->progeny[0], + -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[0] == sub) + DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[3], + -1, 0); + if (ci->progeny[3] 
== sub && cj->progeny[2] != NULL) + DOSUB_SUBSET(r, ci->progeny[3], parts, ind, count, cj->progeny[2], + -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[2] == sub) + DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[3], + -1, 0); + if (ci->progeny[3] == sub && cj->progeny[4] != NULL) + DOSUB_SUBSET(r, ci->progeny[3], parts, ind, count, cj->progeny[4], + -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[4] == sub) + DOSUB_SUBSET(r, cj->progeny[4], parts, ind, count, ci->progeny[3], + -1, 0); + if (ci->progeny[3] == sub && cj->progeny[6] != NULL) + DOSUB_SUBSET(r, ci->progeny[3], parts, ind, count, cj->progeny[6], + -1, 0); + if (ci->progeny[3] != NULL && cj->progeny[6] == sub) + DOSUB_SUBSET(r, cj->progeny[6], parts, ind, count, ci->progeny[3], + -1, 0); + if (ci->progeny[5] == sub && cj->progeny[0] != NULL) + DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[0], + -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[0] == sub) + DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[5], + -1, 0); + if (ci->progeny[5] == sub && cj->progeny[2] != NULL) + DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[2], + -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[2] == sub) + DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[5], + -1, 0); + if (ci->progeny[5] == sub && cj->progeny[4] != NULL) + DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[4], + -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[4] == sub) + DOSUB_SUBSET(r, cj->progeny[4], parts, ind, count, ci->progeny[5], + -1, 0); + if (ci->progeny[5] == sub && cj->progeny[6] != NULL) + DOSUB_SUBSET(r, ci->progeny[5], parts, ind, count, cj->progeny[6], + -1, 0); + if (ci->progeny[5] != NULL && cj->progeny[6] == sub) + DOSUB_SUBSET(r, cj->progeny[6], parts, ind, count, ci->progeny[5], + -1, 0); + if (ci->progeny[7] == sub && cj->progeny[0] != NULL) + DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[0], + -1, 0); + if 
(ci->progeny[7] != NULL && cj->progeny[0] == sub) + DOSUB_SUBSET(r, cj->progeny[0], parts, ind, count, ci->progeny[7], + -1, 0); + if (ci->progeny[7] == sub && cj->progeny[2] != NULL) + DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[2], + -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[2] == sub) + DOSUB_SUBSET(r, cj->progeny[2], parts, ind, count, ci->progeny[7], + -1, 0); + if (ci->progeny[7] == sub && cj->progeny[4] != NULL) + DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[4], + -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[4] == sub) + DOSUB_SUBSET(r, cj->progeny[4], parts, ind, count, ci->progeny[7], + -1, 0); + if (ci->progeny[7] == sub && cj->progeny[6] != NULL) + DOSUB_SUBSET(r, ci->progeny[7], parts, ind, count, cj->progeny[6], + -1, 0); + if (ci->progeny[7] != NULL && cj->progeny[6] == sub) + DOSUB_SUBSET(r, cj->progeny[6], parts, ind, count, ci->progeny[7], + -1, 0); + break; + } + + } + /* Otherwise, compute the pair directly. */ + else if (ci->dt_min <= dt_step || cj->dt_min <= dt_step) { + + /* Get the relative distance between the pairs, wrapping. */ + for (k = 0; k < 3; k++) { + if (cj->loc[k] - ci->loc[k] < -s->dim[k] / 2) + shift[k] = s->dim[k]; + else if (cj->loc[k] - ci->loc[k] > s->dim[k] / 2) + shift[k] = -s->dim[k]; + } + + /* Get the sorting index. */ + for (sid = 0, k = 0; k < 3; k++) + sid = + 3 * sid + ((cj->loc[k] - ci->loc[k] + shift[k] < 0) + ? 0 + : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1); + sid = sortlistID[sid]; + + /* Do any of the cells need to be sorted first? */ + if (!(cj->sorted & (1 << sid))) runner_dosort(r, cj, (1 << sid), 1); + + /* Compute the interactions. */ + DOPAIR_SUBSET(r, ci, parts, ind, count, cj); } + } /* otherwise, pair interaction. 
*/ + if (gettimer) +#ifdef TIMER_VERBOSE + printf("runner_dosub[%02i]: flags=%i at depth %i took %.3f ms.\n", r->id, + sid, ci->depth, ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000); +#else + TIMER_TOC(TIMER_DOSUB); +#endif +} diff --git a/src/runner_doiact_grav.h b/src/runner_doiact_grav.h index ba24b6bf4a024d4ae9f6e83f325cdcd75edee145..98fd23585768b4594e84099177a5d291912230cb 100644 --- a/src/runner_doiact_grav.h +++ b/src/runner_doiact_grav.h @@ -1,23 +1,27 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_RUNNER_DOIACT_GRAV_H +#define SWIFT_RUNNER_DOIACT_GRAV_H - +/* Includes. */ +#include "cell.h" +#include "part.h" /** * @brief Compute the sorted gravity interactions between a cell pair. @@ -26,171 +30,179 @@ * @param ci The first #cell. * @param cj The second #cell. 
*/ - -void runner_dopair_grav_new ( struct runner *r , struct cell *ci , struct cell *cj ) { - - struct engine *restrict e = r->e; - int pid, pjd, k, sid; - double rshift, shift[3] = { 0.0 , 0.0 , 0.0 }, nshift[3]; - struct entry *restrict sort_i, *restrict sort_j; - struct gpart *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j; - double pix[3]; - float dx[3], r2, h_max, di, dj; - int count_i, count_j, cnj, cnj_new; - float dt_step = e->dt_step; - struct multipole m; - #ifdef VECTORIZE - int icount = 0; - float r2q[VEC_SIZE] __attribute__ ((aligned (16))); - float dxq[3*VEC_SIZE] __attribute__ ((aligned (16))); - struct gpart *piq[VEC_SIZE], *pjq[VEC_SIZE]; - #endif - TIMER_TIC - - /* Anything to do here? */ - if ( ci->dt_min > dt_step && cj->dt_min > dt_step ) - return; - - /* Get the sort ID. */ - sid = space_getsid( e->s , &ci , &cj , shift ); - - /* Make sure the cells are sorted. */ - runner_dogsort( r , ci , (1 << sid) , 0 ); - runner_dogsort( r , cj , (1 << sid) , 0 ); - - /* Have the cells been sorted? */ - if ( !(ci->gsorted & (1 << sid)) || !(cj->gsorted & (1 << sid) ) ) - error( "Trying to interact unsorted cells." ); - - /* Get the cutoff shift. */ - for ( rshift = 0.0 , k = 0 ; k < 3 ; k++ ) - rshift += shift[k]*runner_shift[ 3*sid + k ]; - - /* Pick-out the sorted lists. */ - sort_i = &ci->gsort[ sid*(ci->count + 1) ]; - sort_j = &cj->gsort[ sid*(cj->count + 1) ]; - - /* Get some other useful values. */ - h_max = sqrtf( ci->h[0]*ci->h[0] + ci->h[1]*ci->h[1] + ci->h[2]*ci->h[2] ) * const_theta_max; - count_i = ci->gcount; count_j = cj->gcount; - parts_i = ci->gparts; parts_j = cj->gparts; - cnj = count_j; - multipole_reset( &m ); - nshift[0] = -shift[0]; nshift[1] = -shift[1]; nshift[2] = -shift[2]; - - /* Loop over the parts in ci. */ - for ( pid = count_i-1 ; pid >= 0 ; pid-- ) { - - /* Get a hold of the ith part in ci. 
*/ - pi = &parts_i[ sort_i[ pid ].i ]; - if ( pi->dt > dt_step ) - continue; - di = sort_i[pid].d + h_max - rshift; - - for ( k = 0 ; k < 3 ; k++ ) - pix[k] = pi->x[k] - shift[k]; - - /* Loop over the parts in cj. */ - for ( pjd = 0 ; pjd < cnj && sort_j[pjd].d < di ; pjd++ ) { - - /* Get a pointer to the jth particle. */ - pj = &parts_j[ sort_j[pjd].i ]; - - /* Compute the pairwise distance. */ - r2 = 0.0f; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pix[k] - pj->x[k]; - r2 += dx[k]*dx[k]; - } - - #ifndef VECTORIZE - - // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 ) - // message( "interacting particles pi=%lli and pj=%lli with r=%.3e in cells %lli/%lli." , pi->part->id , pj->part->id , sqrtf(r2) , ((long long int)ci) / sizeof(struct cell) , ((long long int)cj) / sizeof(struct cell) ); - - runner_iact_grav( r2 , dx , pi , pj ); - - #else - - /* Add this interaction to the queue. */ - r2q[icount] = r2; - dxq[3*icount+0] = dx[0]; - dxq[3*icount+1] = dx[1]; - dxq[3*icount+2] = dx[2]; - piq[icount] = pi; - pjq[icount] = pj; - icount += 1; - - /* Flush? */ - if ( icount == VEC_SIZE ) { - runner_iact_vec_grav( r2q , dxq , piq , pjq ); - icount = 0; - } - - #endif - - } /* loop over the parts in cj. */ - - /* Set the new limit. */ - cnj_new = pjd; - - /* Add trailing parts to the multipole. */ - for ( pjd = cnj_new ; pjd < cnj ; pjd++ ) { - - /* Add the part to the multipole. */ - multipole_addpart( &m , &parts_j[ sort_j[pjd].i ] ); - - } /* add trailing parts to the multipole. */ - - /* Set the new cnj. */ - cnj = cnj_new; - - /* Interact the ith particle with the multipole. */ - multipole_iact_mp( &m , pi , nshift ); - - } /* loop over the parts in ci. */ - - #ifdef VECTORIZE - /* Pick up any leftovers. */ - if ( icount > 0 ) - for ( k = 0 ; k < icount ; k++ ) - runner_iact_grav( r2q[k] , &dxq[3*k] , piq[k] , pjq[k] ); - #endif - - /* Re-set the multipole. 
*/ - multipole_reset( &m ); - - /* Loop over the parts in cj and interact with the multipole in ci. */ - for ( pid = count_i - 1 , pjd = 0 ; pjd < count_j ; pjd++ ) { - - /* Get the position of pj along the axis. */ - dj = sort_j[pjd].d - h_max + rshift; - - /* Add any left-over parts in cell_i to the multipole. */ - while ( pid >= 0 && sort_i[pid].d < dj ) { - - /* Add this particle to the multipole. */ - multipole_addpart( &m , &parts_i[ sort_i[pid].i ] ); - - /* Decrease pid. */ - pid -= 1; - - } - - /* Interact pj with the multipole. */ - multipole_iact_mp( &m , &parts_j[ sort_j[pjd].i ] , shift ); - - } /* loop over the parts in cj and interact with the multipole. */ - - - #ifdef TIMER_VERBOSE - printf( "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , fmax(ci->h[0],fmax(ci->h[1],ci->h[2])) , ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000 ); - #else - TIMER_TOC(TIMER_DOPAIR); - #endif +void runner_dopair_grav_new(struct runner *r, struct cell *ci, + struct cell *cj) { + + struct engine *restrict e = r->e; + int pid, pjd, k, sid; + double rshift, shift[3] = {0.0, 0.0, 0.0}, nshift[3]; + struct entry *restrict sort_i, *restrict sort_j; + struct gpart *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j; + double pix[3]; + float dx[3], r2, h_max, di, dj; + int count_i, count_j, cnj, cnj_new; + float dt_step = e->dt_step; + struct multipole m; +#ifdef VECTORIZE + int icount = 0; + float r2q[VEC_SIZE] __attribute__((aligned(16))); + float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); + struct gpart *piq[VEC_SIZE], *pjq[VEC_SIZE]; +#endif + TIMER_TIC + + /* Anything to do here? */ + if (ci->dt_min > dt_step && cj->dt_min > dt_step) return; + + /* Get the sort ID. */ + sid = space_getsid(e->s, &ci, &cj, shift); + + /* Make sure the cells are sorted. 
*/ + runner_dogsort(r, ci, (1 << sid), 0); + runner_dogsort(r, cj, (1 << sid), 0); + + /* Have the cells been sorted? */ + if (!(ci->gsorted & (1 << sid)) || !(cj->gsorted & (1 << sid))) + error("Trying to interact unsorted cells."); + + /* Get the cutoff shift. */ + for (rshift = 0.0, k = 0; k < 3; k++) + rshift += shift[k] * runner_shift[3 * sid + k]; + + /* Pick-out the sorted lists. */ + sort_i = &ci->gsort[sid * (ci->count + 1)]; + sort_j = &cj->gsort[sid * (cj->count + 1)]; + + /* Get some other useful values. */ + h_max = + sqrtf(ci->h[0] * ci->h[0] + ci->h[1] * ci->h[1] + ci->h[2] * ci->h[2]) * + const_theta_max; + count_i = ci->gcount; + count_j = cj->gcount; + parts_i = ci->gparts; + parts_j = cj->gparts; + cnj = count_j; + multipole_reset(&m); + nshift[0] = -shift[0]; + nshift[1] = -shift[1]; + nshift[2] = -shift[2]; + + /* Loop over the parts in ci. */ + for (pid = count_i - 1; pid >= 0; pid--) { + + /* Get a hold of the ith part in ci. */ + pi = &parts_i[sort_i[pid].i]; + if (pi->dt > dt_step) continue; + di = sort_i[pid].d + h_max - rshift; + + for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + + /* Loop over the parts in cj. */ + for (pjd = 0; pjd < cnj && sort_j[pjd].d < di; pjd++) { + + /* Get a pointer to the jth particle. */ + pj = &parts_j[sort_j[pjd].i]; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for (k = 0; k < 3; k++) { + dx[k] = pix[k] - pj->x[k]; + r2 += dx[k] * dx[k]; + } + +#ifndef VECTORIZE + + // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 ) + // message( "interacting particles pi=%lli and pj=%lli with r=%.3e in + // cells %lli/%lli." , pi->part->id , pj->part->id , sqrtf(r2) , ((long + // long int)ci) / sizeof(struct cell) , ((long long int)cj) / + // sizeof(struct cell) ); + + runner_iact_grav(r2, dx, pi, pj); + +#else + + /* Add this interaction to the queue. 
*/ + r2q[icount] = r2; + dxq[3 * icount + 0] = dx[0]; + dxq[3 * icount + 1] = dx[1]; + dxq[3 * icount + 2] = dx[2]; + piq[icount] = pi; + pjq[icount] = pj; + icount += 1; + + /* Flush? */ + if (icount == VEC_SIZE) { + runner_iact_vec_grav(r2q, dxq, piq, pjq); + icount = 0; + } + +#endif + + } /* loop over the parts in cj. */ + + /* Set the new limit. */ + cnj_new = pjd; + + /* Add trailing parts to the multipole. */ + for (pjd = cnj_new; pjd < cnj; pjd++) { + + /* Add the part to the multipole. */ + multipole_addpart(&m, &parts_j[sort_j[pjd].i]); + + } /* add trailing parts to the multipole. */ + + /* Set the new cnj. */ + cnj = cnj_new; + + /* Interact the ith particle with the multipole. */ + multipole_iact_mp(&m, pi, nshift); + + } /* loop over the parts in ci. */ + +#ifdef VECTORIZE + /* Pick up any leftovers. */ + if (icount > 0) + for (k = 0; k < icount; k++) + runner_iact_grav(r2q[k], &dxq[3 * k], piq[k], pjq[k]); +#endif + + /* Re-set the multipole. */ + multipole_reset(&m); + + /* Loop over the parts in cj and interact with the multipole in ci. */ + for (pid = count_i - 1, pjd = 0; pjd < count_j; pjd++) { + + /* Get the position of pj along the axis. */ + dj = sort_j[pjd].d - h_max + rshift; + + /* Add any left-over parts in cell_i to the multipole. */ + while (pid >= 0 && sort_i[pid].d < dj) { + + /* Add this particle to the multipole. */ + multipole_addpart(&m, &parts_i[sort_i[pid].i]); + + /* Decrease pid. */ + pid -= 1; } + /* Interact pj with the multipole. */ + multipole_iact_mp(&m, &parts_j[sort_j[pjd].i], shift); + + } /* loop over the parts in cj and interact with the multipole. 
*/ + +#ifdef TIMER_VERBOSE + printf( + "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) " + "took %.3f ms.\n", + r->id, count_i, count_j, ci->depth, ci->h_max, cj->h_max, + fmax(ci->h[0], fmax(ci->h[1], ci->h[2])), + ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000); +#else + TIMER_TOC(TIMER_DOPAIR); +#endif +} /** * @brief Compute the recursive upward sweep, i.e. construct the @@ -199,36 +211,33 @@ void runner_dopair_grav_new ( struct runner *r , struct cell *ci , struct cell * * @param r The #runner. * @param c The top-level #cell. */ - -void runner_dograv_up ( struct runner *r , struct cell *c ) { - - /* Re-set this cell's multipole. */ - multipole_reset( &c->multipole ); - - /* Split? */ - if ( c->split ) { - - /* Recurse. */ - for ( int k = 0 ; k < 8 ; k++ ) - if ( c->progeny[k] != NULL ) - runner_dograv_up( r , c->progeny[k] ); - - /* Collect the multipoles from the progeny. */ - multipole_reset( &c->multipole ); - for ( int k = 0 ; k < 8 ; k++ ) - if ( c->progeny[k] != NULL ) - multipole_merge( &c->multipole , &c->progeny[k]->multipole ); - - } - - /* No, leaf node. */ - else - - /* Just collect the multipole. */ - multipole_init( &c->multipole , c->gparts , c->gcount ); - } +void runner_dograv_up(struct runner *r, struct cell *c) { + + /* Re-set this cell's multipole. */ + multipole_reset(&c->multipole); + /* Split? */ + if (c->split) { + + /* Recurse. */ + for (int k = 0; k < 8; k++) + if (c->progeny[k] != NULL) runner_dograv_up(r, c->progeny[k]); + + /* Collect the multipoles from the progeny. */ + multipole_reset(&c->multipole); + for (int k = 0; k < 8; k++) + if (c->progeny[k] != NULL) + multipole_merge(&c->multipole, &c->progeny[k]->multipole); + + } + + /* No, leaf node. */ + else + + /* Just collect the multipole. */ + multipole_init(&c->multipole, c->gparts, c->gcount); +} /** * @brief Compute the recursive downward sweep, i.e. 
apply the multipole @@ -237,45 +246,41 @@ void runner_dograv_up ( struct runner *r , struct cell *c ) { * @param r The #runner. * @param c The top-level #cell. */ - -void runner_dograv_down ( struct runner *r , struct cell *c ) { - - struct multipole *m = &c->multipole; - - /* Split? */ - if ( c->split ) { - - /* Apply this cell's accelleration on the multipoles below. */ - for ( int k = 0 ; k < 8 ; k++ ) - if ( c->progeny[k] != NULL ) { - struct multipole *mp = &c->progeny[k]->multipole; - mp->a[0] += m->a[0]; - mp->a[1] += m->a[1]; - mp->a[2] += m->a[2]; - } - - /* Recurse. */ - for ( int k = 0 ; k < 8 ; k++ ) - if ( c->progeny[k] != NULL ) - runner_dograv_down( r , c->progeny[k] ); - - } - - /* No, leaf node. */ - else { - - /* Apply the multipole accelleration to all gparts. */ - for ( int k = 0 ; k < c->gcount ; k++ ) { - struct gpart *p = &c->gparts[k]; - p->a[0] += m->a[0]; - p->a[1] += m->a[1]; - p->a[2] += m->a[2]; - } - - } - } +void runner_dograv_down(struct runner *r, struct cell *c) { + struct multipole *m = &c->multipole; + + /* Split? */ + if (c->split) { + + /* Apply this cell's accelleration on the multipoles below. */ + for (int k = 0; k < 8; k++) + if (c->progeny[k] != NULL) { + struct multipole *mp = &c->progeny[k]->multipole; + mp->a[0] += m->a[0]; + mp->a[1] += m->a[1]; + mp->a[2] += m->a[2]; + } + + /* Recurse. */ + for (int k = 0; k < 8; k++) + if (c->progeny[k] != NULL) runner_dograv_down(r, c->progeny[k]); + + } + + /* No, leaf node. */ + else { + + /* Apply the multipole accelleration to all gparts. */ + for (int k = 0; k < c->gcount; k++) { + struct gpart *p = &c->gparts[k]; + p->a[0] += m->a[0]; + p->a[1] += m->a[1]; + p->a[2] += m->a[2]; + } + } +} /** * @brief Compute the multipole-multipole interaction between two cells. @@ -284,48 +289,45 @@ void runner_dograv_down ( struct runner *r , struct cell *c ) { * @param ci The first #cell. * @param cj The second #cell. 
*/ - -void runner_dograv_mm ( struct runner *r , struct cell *restrict ci , struct cell *restrict cj ) { - struct engine *e = r->e; - int k; - double shift[3] = { 0.0 , 0.0 , 0.0 }; - float dx[3], theta; +void runner_dograv_mm(struct runner *r, struct cell *restrict ci, + struct cell *restrict cj) { + + struct engine *e = r->e; + int k; + double shift[3] = {0.0, 0.0, 0.0}; + float dx[3], theta; + + /* Compute the shift between the cells. */ + for (k = 0; k < 3; k++) { + dx[k] = cj->loc[k] - ci->loc[k]; + if (r->e->s->periodic) { + if (dx[k] < -e->s->dim[k] / 2) + shift[k] = e->s->dim[k]; + else if (dx[k] > e->s->dim[k] / 2) + shift[k] = -e->s->dim[k]; + dx[k] += shift[k]; + } + } + theta = + sqrt((dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]) / + (ci->h[0] * ci->h[0] + ci->h[1] * ci->h[1] + ci->h[2] * ci->h[2])); - /* Compute the shift between the cells. */ - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = cj->loc[k] - ci->loc[k]; - if ( r->e->s->periodic ) { - if ( dx[k] < -e->s->dim[k]/2 ) - shift[k] = e->s->dim[k]; - else if ( dx[k] > e->s->dim[k]/2 ) - shift[k] = -e->s->dim[k]; - dx[k] += shift[k]; - } - } - theta = sqrt( ( dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2] ) / - ( ci->h[0]*ci->h[0] + ci->h[1]*ci->h[1] + ci->h[2]*ci->h[2] ) ); - - /* Do an MM or an MP/PM? */ - if ( theta > const_theta_max*4 ) { - - /* Update the multipoles. */ - multipole_iact_mm( &ci->multipole , &cj->multipole , shift ); - - } - - else { - - /* Interact the multipoles via their parts. */ - for ( k = 0 ; k < ci->gcount ; k++ ) - multipole_iact_mp( &cj->multipole , &ci->gparts[k] , shift ); - for ( k = 0 ; k < cj->gcount ; k++ ) - multipole_iact_mp( &ci->multipole , &cj->gparts[k] , shift ); - - } + /* Do an MM or an MP/PM? */ + if (theta > const_theta_max * 4) { - } + /* Update the multipoles. */ + multipole_iact_mm(&ci->multipole, &cj->multipole, shift); + + } else { + /* Interact the multipoles via their parts. 
*/ + for (k = 0; k < ci->gcount; k++) + multipole_iact_mp(&cj->multipole, &ci->gparts[k], shift); + for (k = 0; k < cj->gcount; k++) + multipole_iact_mp(&ci->multipole, &cj->gparts[k], shift); + } +} /** * @brief Compute the interactions between a cell pair. @@ -334,106 +336,109 @@ void runner_dograv_mm ( struct runner *r , struct cell *restrict ci , struct cel * @param ci The first #cell. * @param cj The second #cell. */ - -void runner_dopair_grav ( struct runner *r , struct cell *restrict ci , struct cell *restrict cj ) { - - struct engine *e = r->e; - int pid, pjd, k, count_i = ci->gcount, count_j = cj->gcount; - double shift[3] = { 0.0 , 0.0 , 0.0 }; - struct gpart *restrict parts_i = ci->gparts, *restrict parts_j = cj->gparts; - struct gpart *restrict pi, *restrict pj; - double pix[3]; - float dx[3], r2; - float dt_step = e->dt_step; - #ifdef VECTORIZE - int icount = 0; - float r2q[VEC_SIZE] __attribute__ ((aligned (16))); - float dxq[3*VEC_SIZE] __attribute__ ((aligned (16))); - struct gpart *piq[VEC_SIZE], *pjq[VEC_SIZE]; - #endif - TIMER_TIC - - /* Anything to do here? */ - if ( ci->dt_min > dt_step && cj->dt_min > dt_step ) - return; - - /* Get the relative distance between the pairs, wrapping. */ - if ( e->s->periodic ) - for ( k = 0 ; k < 3 ; k++ ) { - if ( cj->loc[k] - ci->loc[k] < -e->s->dim[k]/2 ) - shift[k] = e->s->dim[k]; - else if ( cj->loc[k] - ci->loc[k] > e->s->dim[k]/2 ) - shift[k] = -e->s->dim[k]; - } - - /* Loop over the parts in ci. */ - for ( pid = 0 ; pid < count_i ; pid++ ) { - - /* Get a hold of the ith part in ci. */ - pi = &parts_i[ pid ]; - for ( k = 0 ; k < 3 ; k++ ) - pix[k] = pi->x[k] - shift[k]; - - /* Loop over the parts in cj. */ - for ( pjd = 0 ; pjd < count_j ; pjd++ ) { - - /* Get a pointer to the jth particle. */ - pj = &parts_j[ pjd ]; - - /* Compute the pairwise distance. */ - r2 = 0.0f; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pix[k] - pj->x[k]; - r2 += dx[k]*dx[k]; - } - - /* Compute the interaction. 
*/ - #ifndef VECTORIZE - - // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 ) - // message( "interacting particles pi=%lli and pj=%lli with r=%.3e in cells %lli/%lli." , pi->part->id , pj->part->id , sqrtf(r2) , ((long long int)ci) / sizeof(struct cell) , ((long long int)cj) / sizeof(struct cell) ); - - runner_iact_grav( r2 , dx , pi , pj ); - - #else - - /* Add this interaction to the queue. */ - r2q[icount] = r2; - dxq[3*icount+0] = dx[0]; - dxq[3*icount+1] = dx[1]; - dxq[3*icount+2] = dx[2]; - piq[icount] = pi; - pjq[icount] = pj; - icount += 1; - - /* Flush? */ - if ( icount == VEC_SIZE ) { - runner_iact_vec_grav( r2q , dxq , piq , pjq ); - icount = 0; - } - - #endif - - } /* loop over the parts in cj. */ - - } /* loop over the parts in ci. */ - - #ifdef VECTORIZE - /* Pick up any leftovers. */ - if ( icount > 0 ) - for ( k = 0 ; k < icount ; k++ ) - runner_iact_grav( r2q[k] , &dxq[3*k] , piq[k] , pjq[k] ); - #endif - - #ifdef TIMER_VERBOSE - printf( "runner_dopair_naive_grav[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 ); - #else - TIMER_TOC(timer_dopair_grav); - #endif - +void runner_dopair_grav(struct runner *r, struct cell *restrict ci, + struct cell *restrict cj) { + + struct engine *e = r->e; + int pid, pjd, k, count_i = ci->gcount, count_j = cj->gcount; + double shift[3] = {0.0, 0.0, 0.0}; + struct gpart *restrict parts_i = ci->gparts, *restrict parts_j = cj->gparts; + struct gpart *restrict pi, *restrict pj; + double pix[3]; + float dx[3], r2; + float dt_step = e->dt_step; +#ifdef VECTORIZE + int icount = 0; + float r2q[VEC_SIZE] __attribute__((aligned(16))); + float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); + struct gpart *piq[VEC_SIZE], *pjq[VEC_SIZE]; +#endif + TIMER_TIC + + /* Anything to do here? 
*/ + if (ci->dt_min > dt_step && cj->dt_min > dt_step) return; + + /* Get the relative distance between the pairs, wrapping. */ + if (e->s->periodic) + for (k = 0; k < 3; k++) { + if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2) + shift[k] = e->s->dim[k]; + else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2) + shift[k] = -e->s->dim[k]; } + /* Loop over the parts in ci. */ + for (pid = 0; pid < count_i; pid++) { + + /* Get a hold of the ith part in ci. */ + pi = &parts_i[pid]; + for (k = 0; k < 3; k++) pix[k] = pi->x[k] - shift[k]; + + /* Loop over the parts in cj. */ + for (pjd = 0; pjd < count_j; pjd++) { + + /* Get a pointer to the jth particle. */ + pj = &parts_j[pjd]; + + /* Compute the pairwise distance. */ + r2 = 0.0f; + for (k = 0; k < 3; k++) { + dx[k] = pix[k] - pj->x[k]; + r2 += dx[k] * dx[k]; + } + +/* Compute the interaction. */ +#ifndef VECTORIZE + + // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 ) + // message( "interacting particles pi=%lli and pj=%lli with r=%.3e in + // cells %lli/%lli." , pi->part->id , pj->part->id , sqrtf(r2) , ((long + // long int)ci) / sizeof(struct cell) , ((long long int)cj) / + // sizeof(struct cell) ); + + runner_iact_grav(r2, dx, pi, pj); + +#else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3 * icount + 0] = dx[0]; + dxq[3 * icount + 1] = dx[1]; + dxq[3 * icount + 2] = dx[2]; + piq[icount] = pi; + pjq[icount] = pj; + icount += 1; + + /* Flush? */ + if (icount == VEC_SIZE) { + runner_iact_vec_grav(r2q, dxq, piq, pjq); + icount = 0; + } + +#endif + + } /* loop over the parts in cj. */ + + } /* loop over the parts in ci. */ + +#ifdef VECTORIZE + /* Pick up any leftovers. 
*/ + if (icount > 0) + for (k = 0; k < icount; k++) + runner_iact_grav(r2q[k], &dxq[3 * k], piq[k], pjq[k]); +#endif + +#ifdef TIMER_VERBOSE + printf( + "runner_dopair_naive_grav[%02i]: %i/%i parts at depth %i " + "(r_max=%.3f/%.3f) took %.3f ms.\n", + r->id, count_i, count_j, ci->depth, ci->h_max, cj->h_max, + ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000); +#else + TIMER_TOC(timer_dopair_grav); +#endif +} /** * @brief Compute the interactions within a cell. @@ -441,186 +446,184 @@ void runner_dopair_grav ( struct runner *r , struct cell *restrict ci , struct c * @param r The #runner. * @param c The #cell. */ - -void runner_doself_grav ( struct runner *r , struct cell *restrict c ) { - - struct engine *e = r->e; - int pid, pjd, k, count = c->gcount; - struct gpart *restrict parts = c->gparts; - struct gpart *restrict pi, *restrict pj; - double pix[3] = { 0.0 , 0.0 , 0.0 }; - float dx[3], r2; - float dt_step = e->dt_step; - #ifdef VECTORIZE - int icount = 0; - float r2q[VEC_SIZE] __attribute__ ((aligned (16))); - float dxq[3*VEC_SIZE] __attribute__ ((aligned (16))); - struct gpart *piq[VEC_SIZE], *pjq[VEC_SIZE]; - #endif - TIMER_TIC - - /* Anything to do here? */ - if ( c->dt_min > dt_step ) - return; - - /* Loop over every part in c. */ - for ( pid = 0 ; pid < count ; pid++ ) { - - /* Get a hold of the ith part in ci. */ - pi = &parts[ pid ]; - for ( k = 0 ; k < 3 ; k++ ) - pix[k] = pi->x[k]; - - /* Loop over every other part in c. */ - for ( pjd = pid+1 ; pjd < count ; pjd++ ) { - - /* Get a pointer to the jth particle. */ - pj = &parts[ pjd ]; - - /* Compute the pairwise distance. */ - r2 = 0.0f; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = pix[k] - pj->x[k]; - r2 += dx[k]*dx[k]; - } - - /* Compute the interaction. */ - #ifndef VECTORIZE - - // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 ) - // message( "interacting particles pi=%lli and pj=%lli with r=%.3e." 
, pi->part->id , pj->part->id , sqrtf(r2) ); - - runner_iact_grav( r2 , dx , pi , pj ); - - #else - - /* Add this interaction to the queue. */ - r2q[icount] = r2; - dxq[3*icount+0] = dx[0]; - dxq[3*icount+1] = dx[1]; - dxq[3*icount+2] = dx[2]; - piq[icount] = pi; - pjq[icount] = pj; - icount += 1; - - /* Flush? */ - if ( icount == VEC_SIZE ) { - runner_iact_vec_grav( r2q , dxq , piq , pjq ); - icount = 0; - } - - #endif - - } /* loop over the remaining parts in c. */ - - } /* loop over the parts in c. */ - - #ifdef VECTORIZE - /* Pick up any leftovers. */ - if ( icount > 0 ) - for ( k = 0 ; k < icount ; k++ ) - runner_iact_grav( r2q[k] , &dxq[3*k] , piq[k] , pjq[k] ); - #endif - - #ifdef TIMER_VERBOSE - printf( "runner_doself_grav[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 ); - #else - TIMER_TOC(timer_doself_grav); - #endif - - - } +void runner_doself_grav(struct runner *r, struct cell *restrict c) { + + struct engine *e = r->e; + int pid, pjd, k, count = c->gcount; + struct gpart *restrict parts = c->gparts; + struct gpart *restrict pi, *restrict pj; + double pix[3] = {0.0, 0.0, 0.0}; + float dx[3], r2; + float dt_step = e->dt_step; +#ifdef VECTORIZE + int icount = 0; + float r2q[VEC_SIZE] __attribute__((aligned(16))); + float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); + struct gpart *piq[VEC_SIZE], *pjq[VEC_SIZE]; +#endif + TIMER_TIC + + /* Anything to do here? */ + if (c->dt_min > dt_step) return; + + /* Loop over every part in c. */ + for (pid = 0; pid < count; pid++) { + + /* Get a hold of the ith part in ci. */ + pi = &parts[pid]; + for (k = 0; k < 3; k++) pix[k] = pi->x[k]; + + /* Loop over every other part in c. */ + for (pjd = pid + 1; pjd < count; pjd++) { + + /* Get a pointer to the jth particle. */ + pj = &parts[pjd]; + + /* Compute the pairwise distance. 
*/ + r2 = 0.0f; + for (k = 0; k < 3; k++) { + dx[k] = pix[k] - pj->x[k]; + r2 += dx[k] * dx[k]; + } + +/* Compute the interaction. */ +#ifndef VECTORIZE + + // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 ) + // message( "interacting particles pi=%lli and pj=%lli with r=%.3e." , + // pi->part->id , pj->part->id , sqrtf(r2) ); + + runner_iact_grav(r2, dx, pi, pj); + +#else + + /* Add this interaction to the queue. */ + r2q[icount] = r2; + dxq[3 * icount + 0] = dx[0]; + dxq[3 * icount + 1] = dx[1]; + dxq[3 * icount + 2] = dx[2]; + piq[icount] = pi; + pjq[icount] = pj; + icount += 1; + + /* Flush? */ + if (icount == VEC_SIZE) { + runner_iact_vec_grav(r2q, dxq, piq, pjq); + icount = 0; + } + +#endif + + } /* loop over the remaining parts in c. */ + + } /* loop over the parts in c. */ + +#ifdef VECTORIZE + /* Pick up any leftovers. */ + if (icount > 0) + for (k = 0; k < icount; k++) + runner_iact_grav(r2q[k], &dxq[3 * k], piq[k], pjq[k]); +#endif + +#ifdef TIMER_VERBOSE + printf( + "runner_doself_grav[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) " + "took %.3f ms.\n", + r->id, count_i, count_j, ci->depth, ci->h_max, cj->h_max, + ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000); +#else + TIMER_TOC(timer_doself_grav); +#endif +} /** * @brief Compute a gravity sub-task. - * + * * @param r The #runner. * @param ci The first #cell. * @param cj The second #cell. * @param gettimer Flag to record timer or not. */ - -void runner_dosub_grav ( struct runner *r , struct cell *ci , struct cell *cj , int gettimer ) { - int j, k, periodic = r->e->s->periodic; - struct space *s = r->e->s; +void runner_dosub_grav(struct runner *r, struct cell *ci, struct cell *cj, + int gettimer) { - TIMER_TIC + int j, k, periodic = r->e->s->periodic; + struct space *s = r->e->s; - /* Self-interaction? */ - if ( cj == NULL ) { + TIMER_TIC - /* If the cell is split, recurse. */ - if ( ci->split ) { + /* Self-interaction? 
*/ + if (cj == NULL) { - /* Split this task into tasks on its progeny. */ - for ( j = 0 ; j < 8 ; j++ ) - if ( ci->progeny[j] != NULL ) { - runner_dosub_grav( r , ci->progeny[j] , NULL , 0 ); - for ( k = j+1 ; k < 8 ; k++ ) - if ( ci->progeny[k] != NULL ) - runner_dosub_grav( r , ci->progeny[j] , ci->progeny[k] , 0 ); - } + /* If the cell is split, recurse. */ + if (ci->split) { - } + /* Split this task into tasks on its progeny. */ + for (j = 0; j < 8; j++) + if (ci->progeny[j] != NULL) { + runner_dosub_grav(r, ci->progeny[j], NULL, 0); + for (k = j + 1; k < 8; k++) + if (ci->progeny[k] != NULL) + runner_dosub_grav(r, ci->progeny[j], ci->progeny[k], 0); + } - /* Otherwise, just make a pp task out of it. */ - else - runner_doself_grav( r , ci ); + } - } + /* Otherwise, just make a pp task out of it. */ + else + runner_doself_grav(r, ci); - /* Nope, pair. */ - else { - - /* Get the opening angle theta. */ - float dx[3], theta; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = fabsf( ci->loc[k] - cj->loc[k] ); - if ( periodic && dx[k] > 0.5*s->dim[k] ) - dx[k] = -dx[k] + s->dim[k]; - if ( dx[k] > 0.0f ) - dx[k] -= ci->h[k]; - } - theta = ( dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2] ) / - ( ci->h[0]*ci->h[0] + ci->h[1]*ci->h[1] + ci->h[2]*ci->h[2] ); - - /* Split the interacton? */ - if ( theta < const_theta_max*const_theta_max ) { - - /* Are both ci and cj split? */ - if ( ci->split && cj->split ) { - - /* Split this task into tasks on its progeny. */ - for ( j = 0 ; j < 8 ; j++ ) - if ( ci->progeny[j] != NULL ) { - for ( k = 0 ; k < 8 ; k++ ) - if ( cj->progeny[k] != NULL ) - runner_dosub_grav( r , ci->progeny[j] , cj->progeny[k] , 0 ); - } - - } - - /* Otherwise, make a pp task out of it. */ - else - runner_dopair_grav( r , ci , cj ); - - } - - /* Otherwise, mm interaction is fine. 
*/ - else - runner_dograv_mm( r , ci , cj ); + } - } - - if ( gettimer ) - #ifdef TIMER_VERBOSE - printf( "runner_dosub_grav[%02i]: at depth %i took %.3f ms.\n" , r->id , ci->depth , ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000 ); - #else - TIMER_TOC(timer_dosub_grav); - #endif + /* Nope, pair. */ + else { + /* Get the opening angle theta. */ + float dx[3], theta; + for (k = 0; k < 3; k++) { + dx[k] = fabsf(ci->loc[k] - cj->loc[k]); + if (periodic && dx[k] > 0.5 * s->dim[k]) dx[k] = -dx[k] + s->dim[k]; + if (dx[k] > 0.0f) dx[k] -= ci->h[k]; } + theta = (dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]) / + (ci->h[0] * ci->h[0] + ci->h[1] * ci->h[1] + ci->h[2] * ci->h[2]); + + /* Split the interacton? */ + if (theta < const_theta_max * const_theta_max) { + + /* Are both ci and cj split? */ + if (ci->split && cj->split) { + + /* Split this task into tasks on its progeny. */ + for (j = 0; j < 8; j++) + if (ci->progeny[j] != NULL) { + for (k = 0; k < 8; k++) + if (cj->progeny[k] != NULL) + runner_dosub_grav(r, ci->progeny[j], cj->progeny[k], 0); + } + } + /* Otherwise, make a pp task out of it. */ + else + runner_dopair_grav(r, ci, cj); + + } + + /* Otherwise, mm interaction is fine. */ + else + runner_dograv_mm(r, ci, cj); + } + + if (gettimer) +#ifdef TIMER_VERBOSE + printf("runner_dosub_grav[%02i]: at depth %i took %.3f ms.\n", r->id, + ci->depth, ((double)TIMER_TOC(TIMER_DOSUB)) / CPU_TPS * 1000); +#else + TIMER_TOC(timer_dosub_grav); +#endif +} + +#endif /* SWIFT_RUNNER_DOIACT_GRAV_H */ diff --git a/src/runner_iact.h b/src/runner_iact.h index 0a6b9c4ce74b41e1eb795b3c13adc1348b63aa23..e1561132af3fff847989af34c268dfb1069ed40d 100644 --- a/src/runner_iact.h +++ b/src/runner_iact.h @@ -2,873 +2,976 @@ * This file is part of SWIFT. 
* Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) * Matthieu Schaller (matthieu.schaller@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_RUNNER_IACT_H +#define SWIFT_RUNNER_IACT_H +/* Includes. */ +#include "const.h" #include "kernel.h" +#include "part.h" #include "vector.h" /** * @file runner_iact.h * @brief SPH interaction functions following the Gadget-2 version of SPH. * - * The interactions computed here are the ones presented in the Gadget-2 paper and use the same - * numerical coefficients as the Gadget-2 code. When used with the Spline-3 kernel, the results - * should be equivalent to the ones obtained with Gadget-2 up to the rounding errors and interactions + * The interactions computed here are the ones presented in the Gadget-2 paper + *and use the same + * numerical coefficients as the Gadget-2 code. When used with the Spline-3 + *kernel, the results + * should be equivalent to the ones obtained with Gadget-2 up to the rounding + *errors and interactions * missed by the Gadget-2 tree-code neighbours search. * - * The code uses internal energy instead of entropy as a thermodynamical variable. + * The code uses internal energy instead of entropy as a thermodynamical + *variable. 
*/ - /** * @brief Density loop */ -__attribute__ ((always_inline)) INLINE static void runner_iact_density ( float r2 , float *dx , float hi , float hj , struct part *pi , struct part *pj ) { - - float r = sqrtf( r2 ), ri = 1.0f / r; - float xi, xj; - float h_inv; - float wi, wj, wi_dx, wj_dx; - float mi, mj; - float dvdr; - float dv[3], curlvr[3]; - int k; - - /* Get the masses. */ - mi = pi->mass; mj = pj->mass; - - /* Compute dv dot r */ - dv[0] = pi->v[0] - pj->v[0]; - dv[1] = pi->v[1] - pj->v[1]; - dv[2] = pi->v[2] - pj->v[2]; - dvdr = dv[0]*dx[0] + dv[1]*dx[1] + dv[2]*dx[2]; - dvdr *= ri; - - /* Compute dv cross r */ - curlvr[0] = dv[1]*dx[2] - dv[2]*dx[1]; - curlvr[1] = dv[2]*dx[0] - dv[0]*dx[2]; - curlvr[2] = dv[0]*dx[1] - dv[1]*dx[0]; - for ( k = 0 ; k < 3 ; k++ ) - curlvr[k] *= ri; - - /* Compute density of pi. */ - h_inv = 1.0 / hi; - xi = r * h_inv; - kernel_deval( xi , &wi , &wi_dx ); - - pi->rho += mj * wi; - pi->rho_dh -= mj * ( 3.0*wi + xi*wi_dx ); - pi->density.wcount += wi; - pi->density.wcount_dh -= xi * wi_dx; - - pi->density.div_v += mj * dvdr * wi_dx; - for ( k = 0 ; k < 3 ; k++ ) - pi->density.curl_v[k] += mj * curlvr[k] * wi_dx; - - /* Compute density of pj. */ - h_inv = 1.0 / hj; - xj = r * h_inv; - kernel_deval( xj , &wj , &wj_dx ); - - pj->rho += mi * wj; - pj->rho_dh -= mi * ( 3.0*wj + xj*wj_dx ); - pj->density.wcount += wj; - pj->density.wcount_dh -= xj * wj_dx; - - pj->density.div_v += mi * dvdr * wj_dx; - for ( k = 0 ; k < 3 ; k++ ) - pj->density.curl_v[k] += mi * curlvr[k] * wj_dx; - - } - +__attribute__((always_inline)) INLINE static void runner_iact_density( + float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { + + float r = sqrtf(r2), ri = 1.0f / r; + float xi, xj; + float h_inv; + float wi, wj, wi_dx, wj_dx; + float mi, mj; + float dvdr; + float dv[3], curlvr[3]; + int k; + + /* Get the masses. 
*/ + mi = pi->mass; + mj = pj->mass; + + /* Compute dv dot r */ + dv[0] = pi->v[0] - pj->v[0]; + dv[1] = pi->v[1] - pj->v[1]; + dv[2] = pi->v[2] - pj->v[2]; + dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2]; + dvdr *= ri; + + /* Compute dv cross r */ + curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1]; + curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2]; + curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0]; + for (k = 0; k < 3; k++) curlvr[k] *= ri; + + /* Compute density of pi. */ + h_inv = 1.0 / hi; + xi = r * h_inv; + kernel_deval(xi, &wi, &wi_dx); + + pi->rho += mj * wi; + pi->rho_dh -= mj * (3.0 * wi + xi * wi_dx); + pi->density.wcount += wi; + pi->density.wcount_dh -= xi * wi_dx; + + pi->density.div_v += mj * dvdr * wi_dx; + for (k = 0; k < 3; k++) pi->density.curl_v[k] += mj * curlvr[k] * wi_dx; + + /* Compute density of pj. */ + h_inv = 1.0 / hj; + xj = r * h_inv; + kernel_deval(xj, &wj, &wj_dx); + + pj->rho += mi * wj; + pj->rho_dh -= mi * (3.0 * wj + xj * wj_dx); + pj->density.wcount += wj; + pj->density.wcount_dh -= xj * wj_dx; + + pj->density.div_v += mi * dvdr * wj_dx; + for (k = 0; k < 3; k++) pj->density.curl_v[k] += mi * curlvr[k] * wj_dx; +} + /** * @brief Density loop (Vectorized version) */ -__attribute__ ((always_inline)) INLINE static void runner_iact_vec_density ( float *R2 , float *Dx , float *Hi , float *Hj , struct part **pi , struct part **pj ) { +__attribute__((always_inline)) INLINE static void runner_iact_vec_density( + float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, + struct part **pj) { #ifdef VECTORIZE - vector r, ri, r2, xi, xj, hi, hj, hi_inv, hj_inv, wi, wj, wi_dx, wj_dx; - vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh; - vector mi, mj; - vector dx[3], dv[3]; - vector vi[3], vj[3]; - vector dvdr, div_vi, div_vj; - vector curlvr[3], curl_vi[3], curl_vj[3]; - int k, j; - - #if VEC_SIZE==8 - mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass , pi[4]->mass , pi[5]->mass , pi[6]->mass , 
pi[7]->mass ); - mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass ); - for ( k = 0 ; k < 3 ; k++ ) { - vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] , pi[4]->v[k] , pi[5]->v[k] , pi[6]->v[k] , pi[7]->v[k] ); - vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] , pj[4]->v[k] , pj[5]->v[k] , pj[6]->v[k] , pj[7]->v[k] ); - } - for ( k = 0 ; k < 3 ; k++ ) - dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] , Dx[12+k] , Dx[15+k] , Dx[18+k] , Dx[21+k] ); - #elif VEC_SIZE==4 - mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass ); - mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass ); - for ( k = 0 ; k < 3 ; k++ ) { - vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] ); - vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] ); - } - for ( k = 0 ; k < 3 ; k++ ) - dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] ); - #endif - - /* Get the radius and inverse radius. */ - r2.v = vec_load( R2 ); - ri.v = vec_rsqrt( r2.v ); - ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) ); - r.v = r2.v * ri.v; - - hi.v = vec_load( Hi ); - hi_inv.v = vec_rcp( hi.v ); - hi_inv.v = hi_inv.v - hi_inv.v * ( hi_inv.v * hi.v - vec_set1( 1.0f ) ); - xi.v = r.v * hi_inv.v; - - hj.v = vec_load( Hj ); - hj_inv.v = vec_rcp( hj.v ); - hj_inv.v = hj_inv.v - hj_inv.v * ( hj_inv.v * hj.v - vec_set1( 1.0f ) ); - xj.v = r.v * hj_inv.v; - - kernel_deval_vec( &xi , &wi , &wi_dx ); - kernel_deval_vec( &xj , &wj , &wj_dx ); - - /* Compute dv. 
*/ - dv[0].v = vi[0].v - vj[0].v; - dv[1].v = vi[1].v - vj[1].v; - dv[2].v = vi[2].v - vj[2].v; - - /* Compute dv dot r */ - dvdr.v = ( dv[0].v * dx[0].v ) + ( dv[1].v * dx[1].v ) + ( dv[2].v * dx[2].v ); - dvdr.v = dvdr.v * ri.v; - - /* Compute dv cross r */ - curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v; - curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v; - curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v; - for ( k = 0 ; k < 3 ; k++ ) - curlvr[k].v *= ri.v; - - rhoi.v = mj.v * wi.v; - rhoi_dh.v = mj.v * ( vec_set1( 3.0f ) * wi.v + xi.v * wi_dx.v ); - wcounti.v = wi.v; - wcounti_dh.v = xi.v * wi_dx.v; - div_vi.v = mj.v * dvdr.v * wi_dx.v; - for ( k = 0 ; k < 3 ; k++ ) - curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v; - - rhoj.v = mi.v * wj.v; - rhoj_dh.v = mi.v * ( vec_set1( 3.0f ) * wj.v + xj.v * wj_dx.v ); - wcountj.v = wj.v; - wcountj_dh.v = xj.v * wj_dx.v; - div_vj.v = mi.v * dvdr.v * wj_dx.v; - for ( k = 0 ; k < 3 ; k++ ) - curl_vj[k].v = mi.v * curlvr[k].v * wj_dx.v; - - - for ( k = 0 ; k < VEC_SIZE ; k++ ) { - pi[k]->rho += rhoi.f[k]; - pi[k]->rho_dh -= rhoi_dh.f[k]; - pi[k]->density.wcount += wcounti.f[k]; - pi[k]->density.wcount_dh -= wcounti_dh.f[k]; - pi[k]->density.div_v += div_vi.f[k]; - for( j = 0 ; j < 3 ; j++ ) - pi[k]->density.curl_v[j] += curl_vi[j].f[k]; - pj[k]->rho += rhoj.f[k]; - pj[k]->rho_dh -= rhoj_dh.f[k]; - pj[k]->density.wcount += wcountj.f[k]; - pj[k]->density.wcount_dh -= wcountj_dh.f[k]; - pj[k]->density.div_v += div_vj.f[k]; - for( j = 0 ; j < 3 ; j++ ) - pj[k]->density.curl_v[j] += curl_vj[j].f[k]; - } - + vector r, ri, r2, xi, xj, hi, hj, hi_inv, hj_inv, wi, wj, wi_dx, wj_dx; + vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh; + vector mi, mj; + vector dx[3], dv[3]; + vector vi[3], vj[3]; + vector dvdr, div_vi, div_vj; + vector curlvr[3], curl_vi[3], curl_vj[3]; + int k, j; + +#if VEC_SIZE == 8 + mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass, + pi[4]->mass, pi[5]->mass, 
pi[6]->mass, pi[7]->mass); + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass, + pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass); + for (k = 0; k < 3; k++) { + vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k], + pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]); + vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k], + pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]); + } + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k], + Dx[15 + k], Dx[18 + k], Dx[21 + k]); +#elif VEC_SIZE == 4 + mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass); + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass); + for (k = 0; k < 3; k++) { + vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]); + vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]); + } + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]); +#endif + + /* Get the radius and inverse radius. */ + r2.v = vec_load(R2); + ri.v = vec_rsqrt(r2.v); + ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f)); + r.v = r2.v * ri.v; + + hi.v = vec_load(Hi); + hi_inv.v = vec_rcp(hi.v); + hi_inv.v = hi_inv.v - hi_inv.v * (hi_inv.v * hi.v - vec_set1(1.0f)); + xi.v = r.v * hi_inv.v; + + hj.v = vec_load(Hj); + hj_inv.v = vec_rcp(hj.v); + hj_inv.v = hj_inv.v - hj_inv.v * (hj_inv.v * hj.v - vec_set1(1.0f)); + xj.v = r.v * hj_inv.v; + + kernel_deval_vec(&xi, &wi, &wi_dx); + kernel_deval_vec(&xj, &wj, &wj_dx); + + /* Compute dv. 
*/ + dv[0].v = vi[0].v - vj[0].v; + dv[1].v = vi[1].v - vj[1].v; + dv[2].v = vi[2].v - vj[2].v; + + /* Compute dv dot r */ + dvdr.v = (dv[0].v * dx[0].v) + (dv[1].v * dx[1].v) + (dv[2].v * dx[2].v); + dvdr.v = dvdr.v * ri.v; + + /* Compute dv cross r */ + curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v; + curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v; + curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v; + for (k = 0; k < 3; k++) curlvr[k].v *= ri.v; + + rhoi.v = mj.v * wi.v; + rhoi_dh.v = mj.v * (vec_set1(3.0f) * wi.v + xi.v * wi_dx.v); + wcounti.v = wi.v; + wcounti_dh.v = xi.v * wi_dx.v; + div_vi.v = mj.v * dvdr.v * wi_dx.v; + for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v; + + rhoj.v = mi.v * wj.v; + rhoj_dh.v = mi.v * (vec_set1(3.0f) * wj.v + xj.v * wj_dx.v); + wcountj.v = wj.v; + wcountj_dh.v = xj.v * wj_dx.v; + div_vj.v = mi.v * dvdr.v * wj_dx.v; + for (k = 0; k < 3; k++) curl_vj[k].v = mi.v * curlvr[k].v * wj_dx.v; + + for (k = 0; k < VEC_SIZE; k++) { + pi[k]->rho += rhoi.f[k]; + pi[k]->rho_dh -= rhoi_dh.f[k]; + pi[k]->density.wcount += wcounti.f[k]; + pi[k]->density.wcount_dh -= wcounti_dh.f[k]; + pi[k]->density.div_v += div_vi.f[k]; + for (j = 0; j < 3; j++) pi[k]->density.curl_v[j] += curl_vi[j].f[k]; + pj[k]->rho += rhoj.f[k]; + pj[k]->rho_dh -= rhoj_dh.f[k]; + pj[k]->density.wcount += wcountj.f[k]; + pj[k]->density.wcount_dh -= wcountj_dh.f[k]; + pj[k]->density.div_v += div_vj.f[k]; + for (j = 0; j < 3; j++) pj[k]->density.curl_v[j] += curl_vj[j].f[k]; + } + #else - for ( int k = 0 ; k < VEC_SIZE ; k++ ) - runner_iact_density( R2[k] , &Dx[3*k] , Hi[k] , Hj[k] , pi[k] , pj[k] ); - -#endif - - } - + for (int k = 0; k < VEC_SIZE; k++) + runner_iact_density(R2[k], &Dx[3 * k], Hi[k], Hj[k], pi[k], pj[k]); +#endif +} /** * @brief Density loop (non-symmetric version) */ -__attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_density ( float r2 , float *dx , float hi , float hj , struct part *pi , struct part *pj ) 
{ - - float r, ri; - float xi; - float h_inv; - float wi, wi_dx; - float mj; - float dvdr; - float dv[3], curlvr[3]; - int k; - - /* Get the masses. */ - mj = pj->mass; - - /* Get r and r inverse. */ - r = sqrtf( r2 ); - ri = 1.0f / r; - - /* Compute dv dot r */ - dv[0] = pi->v[0] - pj->v[0]; - dv[1] = pi->v[1] - pj->v[1]; - dv[2] = pi->v[2] - pj->v[2]; - dvdr = dv[0]*dx[0] + dv[1]*dx[1] + dv[2]*dx[2]; - dvdr *= ri; - - /* Compute dv cross r */ - curlvr[0] = dv[1]*dx[2] - dv[2]*dx[1]; - curlvr[1] = dv[2]*dx[0] - dv[0]*dx[2]; - curlvr[2] = dv[0]*dx[1] - dv[1]*dx[0]; - for ( k = 0 ; k < 3 ; k++ ) - curlvr[k] *= ri; - - h_inv = 1.0 / hi; - xi = r * h_inv; - kernel_deval( xi , &wi , &wi_dx ); - - pi->rho += mj * wi; - pi->rho_dh -= mj * ( 3.0*wi + xi*wi_dx ); - pi->density.wcount += wi; - pi->density.wcount_dh -= xi * wi_dx; - - pi->density.div_v += mj * dvdr * wi_dx; - for ( k = 0 ; k < 3 ; k++ ) - pi->density.curl_v[k] += mj * curlvr[k] * wi_dx; - - } - +__attribute__((always_inline)) INLINE static void runner_iact_nonsym_density( + float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { + + float r, ri; + float xi; + float h_inv; + float wi, wi_dx; + float mj; + float dvdr; + float dv[3], curlvr[3]; + int k; + + /* Get the masses. */ + mj = pj->mass; + + /* Get r and r inverse. 
*/ + r = sqrtf(r2); + ri = 1.0f / r; + + /* Compute dv dot r */ + dv[0] = pi->v[0] - pj->v[0]; + dv[1] = pi->v[1] - pj->v[1]; + dv[2] = pi->v[2] - pj->v[2]; + dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2]; + dvdr *= ri; + + /* Compute dv cross r */ + curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1]; + curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2]; + curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0]; + for (k = 0; k < 3; k++) curlvr[k] *= ri; + + h_inv = 1.0 / hi; + xi = r * h_inv; + kernel_deval(xi, &wi, &wi_dx); + + pi->rho += mj * wi; + pi->rho_dh -= mj * (3.0 * wi + xi * wi_dx); + pi->density.wcount += wi; + pi->density.wcount_dh -= xi * wi_dx; + + pi->density.div_v += mj * dvdr * wi_dx; + for (k = 0; k < 3; k++) pi->density.curl_v[k] += mj * curlvr[k] * wi_dx; +} + /** * @brief Density loop (non-symmetric vectorized version) */ -__attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_density ( float *R2 , float *Dx , float *Hi , float *Hj , struct part **pi , struct part **pj ) { +__attribute__((always_inline)) + INLINE static void runner_iact_nonsym_vec_density(float *R2, float *Dx, + float *Hi, float *Hj, + struct part **pi, + struct part **pj) { #ifdef VECTORIZE - vector r, ri, r2, xi, hi, hi_inv, wi, wi_dx; - vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi; - vector mj; - vector dx[3], dv[3]; - vector vi[3], vj[3]; - vector dvdr; - vector curlvr[3], curl_vi[3]; - int k, j; - - #if VEC_SIZE==8 - mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass ); - for ( k = 0 ; k < 3 ; k++ ) { - vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] , pi[4]->v[k] , pi[5]->v[k] , pi[6]->v[k] , pi[7]->v[k] ); - vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] , pj[4]->v[k] , pj[5]->v[k] , pj[6]->v[k] , pj[7]->v[k] ); - } - for ( k = 0 ; k < 3 ; k++ ) - dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] , Dx[12+k] , Dx[15+k] , Dx[18+k] , Dx[21+k] 
); - #elif VEC_SIZE==4 - mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass ); - for ( k = 0 ; k < 3 ; k++ ) { - vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] ); - vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] ); - } - for ( k = 0 ; k < 3 ; k++ ) - dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] ); - #endif - - /* Get the radius and inverse radius. */ - r2.v = vec_load( R2 ); - ri.v = vec_rsqrt( r2.v ); - ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) ); - r.v = r2.v * ri.v; - - hi.v = vec_load( Hi ); - hi_inv.v = vec_rcp( hi.v ); - hi_inv.v = hi_inv.v - hi_inv.v * ( hi_inv.v * hi.v - vec_set1( 1.0f ) ); - xi.v = r.v * hi_inv.v; - - kernel_deval_vec( &xi , &wi , &wi_dx ); - - /* Compute dv. */ - dv[0].v = vi[0].v - vj[0].v; - dv[1].v = vi[1].v - vj[1].v; - dv[2].v = vi[2].v - vj[2].v; - - /* Compute dv dot r */ - dvdr.v = ( dv[0].v * dx[0].v ) + ( dv[1].v * dx[1].v ) + ( dv[2].v * dx[2].v ); - dvdr.v = dvdr.v * ri.v; - - /* Compute dv cross r */ - curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v; - curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v; - curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v; - for ( k = 0 ; k < 3 ; k++ ) - curlvr[k].v *= ri.v; - - rhoi.v = mj.v * wi.v; - rhoi_dh.v = mj.v * ( vec_set1( 3.0f ) * wi.v + xi.v * wi_dx.v ); - wcounti.v = wi.v; - wcounti_dh.v = xi.v * wi_dx.v; - div_vi.v = mj.v * dvdr.v * wi_dx.v; - for ( k = 0 ; k < 3 ; k++ ) - curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v; - - for ( k = 0 ; k < VEC_SIZE ; k++ ) { - pi[k]->rho += rhoi.f[k]; - pi[k]->rho_dh -= rhoi_dh.f[k]; - pi[k]->density.wcount += wcounti.f[k]; - pi[k]->density.wcount_dh -= wcounti_dh.f[k]; - pi[k]->density.div_v += div_vi.f[k]; - for( j = 0 ; j < 3 ; j++ ) - pi[k]->density.curl_v[j] += curl_vi[j].f[k]; - } - + vector r, ri, r2, xi, hi, hi_inv, wi, wi_dx; + vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi; + vector mj; + vector dx[3], dv[3]; 
+ vector vi[3], vj[3]; + vector dvdr; + vector curlvr[3], curl_vi[3]; + int k, j; + +#if VEC_SIZE == 8 + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass, + pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass); + for (k = 0; k < 3; k++) { + vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k], + pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]); + vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k], + pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]); + } + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k], + Dx[15 + k], Dx[18 + k], Dx[21 + k]); +#elif VEC_SIZE == 4 + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass); + for (k = 0; k < 3; k++) { + vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]); + vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]); + } + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]); +#endif + + /* Get the radius and inverse radius. */ + r2.v = vec_load(R2); + ri.v = vec_rsqrt(r2.v); + ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f)); + r.v = r2.v * ri.v; + + hi.v = vec_load(Hi); + hi_inv.v = vec_rcp(hi.v); + hi_inv.v = hi_inv.v - hi_inv.v * (hi_inv.v * hi.v - vec_set1(1.0f)); + xi.v = r.v * hi_inv.v; + + kernel_deval_vec(&xi, &wi, &wi_dx); + + /* Compute dv. 
*/ + dv[0].v = vi[0].v - vj[0].v; + dv[1].v = vi[1].v - vj[1].v; + dv[2].v = vi[2].v - vj[2].v; + + /* Compute dv dot r */ + dvdr.v = (dv[0].v * dx[0].v) + (dv[1].v * dx[1].v) + (dv[2].v * dx[2].v); + dvdr.v = dvdr.v * ri.v; + + /* Compute dv cross r */ + curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v; + curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v; + curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v; + for (k = 0; k < 3; k++) curlvr[k].v *= ri.v; + + rhoi.v = mj.v * wi.v; + rhoi_dh.v = mj.v * (vec_set1(3.0f) * wi.v + xi.v * wi_dx.v); + wcounti.v = wi.v; + wcounti_dh.v = xi.v * wi_dx.v; + div_vi.v = mj.v * dvdr.v * wi_dx.v; + for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v; + + for (k = 0; k < VEC_SIZE; k++) { + pi[k]->rho += rhoi.f[k]; + pi[k]->rho_dh -= rhoi_dh.f[k]; + pi[k]->density.wcount += wcounti.f[k]; + pi[k]->density.wcount_dh -= wcounti_dh.f[k]; + pi[k]->density.div_v += div_vi.f[k]; + for (j = 0; j < 3; j++) pi[k]->density.curl_v[j] += curl_vi[j].f[k]; + } + #else - for ( int k = 0 ; k < VEC_SIZE ; k++ ) - runner_iact_nonsym_density( R2[k] , &Dx[3*k] , Hi[k] , Hj[k] , pi[k] , pj[k] ); + for (int k = 0; k < VEC_SIZE; k++) + runner_iact_nonsym_density(R2[k], &Dx[3 * k], Hi[k], Hj[k], pi[k], pj[k]); #endif - - } - +} /** * @brief Force loop */ -__attribute__ ((always_inline)) INLINE static void runner_iact_force ( float r2 , float *dx , float hi , float hj , struct part *pi , struct part *pj ) { - - float r = sqrtf( r2 ), ri = 1.0f / r; - float xi, xj; - float hi_inv, hi2_inv; - float hj_inv, hj2_inv; - float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr; - float mi, mj, POrho2i, POrho2j, rhoi, rhoj; - float v_sig, omega_ij, Pi_ij, alpha_ij, tc, v_sig_u; - // float dt_max; - float f; - int k; - - /* Get some values in local variables. */ - mi = pi->mass; mj = pj->mass; - rhoi = pi->rho; rhoj = pj->rho; - POrho2i = pi->force.POrho2; - POrho2j = pj->force.POrho2; - - /* Get the kernel for hi. 
*/ - hi_inv = 1.0f / hi; - hi2_inv = hi_inv * hi_inv; - xi = r * hi_inv; - kernel_deval( xi , &wi , &wi_dx ); - wi_dr = hi2_inv * hi2_inv * wi_dx; - - /* Get the kernel for hj. */ - hj_inv = 1.0f / hj; - hj2_inv = hj_inv * hj_inv; - xj = r * hj_inv; - kernel_deval( xj , &wj , &wj_dx ); - wj_dr = hj2_inv * hj2_inv * wj_dx; - - /* Compute dv dot r. */ - dvdr = ( pi->v[0] - pj->v[0] ) * dx[0] + ( pi->v[1] - pj->v[1] ) * dx[1] + ( pi->v[2] - pj->v[2] ) * dx[2]; - dvdr *= ri; - - /* Compute the relative velocity. (This is 0 if the particles move away from each other and negative otherwise) */ - omega_ij = fminf( dvdr , 0.f ); - - /* Compute signal velocity */ - v_sig = pi->force.c + pj->force.c - 2.0f*omega_ij; - - /* Compute viscosity parameter */ - alpha_ij = -0.5f * ( pi->alpha + pj->alpha ); - - /* Compute viscosity tensor */ - Pi_ij = alpha_ij * v_sig * omega_ij / ( rhoi + rhoj ); - - /* Apply balsara switch */ - Pi_ij *= ( pi->force.balsara + pj->force.balsara ); - - /* Termal conductivity */ - v_sig_u = sqrtf( 2.f * ( const_hydro_gamma - 1.f ) * fabs( rhoi * pi->u - rhoj * pj->u ) / ( rhoi + rhoj ) ); - tc = const_conductivity_alpha * v_sig_u / ( rhoi + rhoj ); - tc *= ( wi_dr + wj_dr ); - - /* Get the common factor out. */ - w = ri * ( ( POrho2i * wi_dr + POrho2j * wj_dr ) + 0.25f * Pi_ij * ( wi_dr + wj_dr ) ); - - /* Use the force, Luke! */ - for ( k = 0 ; k < 3 ; k++ ) { - f = dx[k] * w; - pi->a[k] -= mj * f; - pj->a[k] += mi * f; - } - - /* Get the time derivative for u. */ - pi->force.u_dt += mj * dvdr * ( POrho2i * wi_dr + 0.125f * Pi_ij * ( wi_dr + wj_dr ) ); - pj->force.u_dt += mi * dvdr * ( POrho2j * wj_dr + 0.125f * Pi_ij * ( wi_dr + wj_dr ) ); - - /* Add the thermal conductivity */ - pi->force.u_dt += mj * tc * ( pi->u - pj->u ); - pj->force.u_dt += mi * tc * ( pj->u - pi->u ); - - /* Get the time derivative for h. */ - pi->force.h_dt -= mj * dvdr / rhoj * wi_dr; - pj->force.h_dt -= mi * dvdr / rhoi * wj_dr; - - /* Update the signal velocity. 
*/ - pi->force.v_sig = fmaxf( pi->force.v_sig , v_sig ); - pj->force.v_sig = fmaxf( pj->force.v_sig , v_sig ); - - } - +__attribute__((always_inline)) INLINE static void runner_iact_force( + float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { + + float r = sqrtf(r2), ri = 1.0f / r; + float xi, xj; + float hi_inv, hi2_inv; + float hj_inv, hj2_inv; + float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr; + float mi, mj, POrho2i, POrho2j, rhoi, rhoj; + float v_sig, omega_ij, Pi_ij, alpha_ij, tc, v_sig_u; + // float dt_max; + float f; + int k; + + /* Get some values in local variables. */ + mi = pi->mass; + mj = pj->mass; + rhoi = pi->rho; + rhoj = pj->rho; + POrho2i = pi->force.POrho2; + POrho2j = pj->force.POrho2; + + /* Get the kernel for hi. */ + hi_inv = 1.0f / hi; + hi2_inv = hi_inv * hi_inv; + xi = r * hi_inv; + kernel_deval(xi, &wi, &wi_dx); + wi_dr = hi2_inv * hi2_inv * wi_dx; + + /* Get the kernel for hj. */ + hj_inv = 1.0f / hj; + hj2_inv = hj_inv * hj_inv; + xj = r * hj_inv; + kernel_deval(xj, &wj, &wj_dx); + wj_dr = hj2_inv * hj2_inv * wj_dx; + + /* Compute dv dot r. */ + dvdr = (pi->v[0] - pj->v[0]) * dx[0] + (pi->v[1] - pj->v[1]) * dx[1] + + (pi->v[2] - pj->v[2]) * dx[2]; + dvdr *= ri; + + /* Compute the relative velocity. (This is 0 if the particles move away from + * each other and negative otherwise) */ + omega_ij = fminf(dvdr, 0.f); + + /* Compute signal velocity */ + v_sig = pi->force.c + pj->force.c - 2.0f * omega_ij; + + /* Compute viscosity parameter */ + alpha_ij = -0.5f * (pi->alpha + pj->alpha); + + /* Compute viscosity tensor */ + Pi_ij = alpha_ij * v_sig * omega_ij / (rhoi + rhoj); + + /* Apply balsara switch */ + Pi_ij *= (pi->force.balsara + pj->force.balsara); + + /* Termal conductivity */ + v_sig_u = sqrtf(2.f * (const_hydro_gamma - 1.f) * + fabs(rhoi * pi->u - rhoj * pj->u) / (rhoi + rhoj)); + tc = const_conductivity_alpha * v_sig_u / (rhoi + rhoj); + tc *= (wi_dr + wj_dr); + + /* Get the common factor out. 
*/ + w = ri * + ((POrho2i * wi_dr + POrho2j * wj_dr) + 0.25f * Pi_ij * (wi_dr + wj_dr)); + + /* Use the force, Luke! */ + for (k = 0; k < 3; k++) { + f = dx[k] * w; + pi->a[k] -= mj * f; + pj->a[k] += mi * f; + } + + /* Get the time derivative for u. */ + pi->force.u_dt += + mj * dvdr * (POrho2i * wi_dr + 0.125f * Pi_ij * (wi_dr + wj_dr)); + pj->force.u_dt += + mi * dvdr * (POrho2j * wj_dr + 0.125f * Pi_ij * (wi_dr + wj_dr)); + + /* Add the thermal conductivity */ + pi->force.u_dt += mj * tc * (pi->u - pj->u); + pj->force.u_dt += mi * tc * (pj->u - pi->u); + + /* Get the time derivative for h. */ + pi->force.h_dt -= mj * dvdr / rhoj * wi_dr; + pj->force.h_dt -= mi * dvdr / rhoi * wj_dr; + + /* Update the signal velocity. */ + pi->force.v_sig = fmaxf(pi->force.v_sig, v_sig); + pj->force.v_sig = fmaxf(pj->force.v_sig, v_sig); +} /** * @brief Force loop (Vectorized version) */ -__attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float *R2 , float *Dx , float *Hi , float *Hj , struct part **pi , struct part **pj ) { +__attribute__((always_inline)) INLINE static void runner_iact_vec_force( + float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, + struct part **pj) { #ifdef VECTORIZE - vector r, r2, ri; - vector xi, xj; - vector hi, hj, hi_inv, hj_inv; - vector hi2_inv, hj2_inv; - vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr; - vector w; - vector piPOrho2, pjPOrho2, pirho, pjrho, piu, pju; - vector mi, mj; - vector f; - vector dx[3]; - vector vi[3], vj[3]; - vector pia[3], pja[3]; - vector piu_dt, pju_dt; - vector pih_dt, pjh_dt; - vector ci, cj, v_sig, vi_sig, vj_sig; - vector omega_ij, Pi_ij, balsara; - vector pialpha, pjalpha, alpha_ij, v_sig_u, tc; - int j, k; - - /* Load stuff. 
*/ - #if VEC_SIZE==8 - mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass , pi[4]->mass , pi[5]->mass , pi[6]->mass , pi[7]->mass ); - mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass ); - piPOrho2.v = vec_set( pi[0]->force.POrho2 , pi[1]->force.POrho2 , pi[2]->force.POrho2 , pi[3]->force.POrho2 , pi[4]->force.POrho2 , pi[5]->force.POrho2 , pi[6]->force.POrho2 , pi[7]->force.POrho2 ); - pjPOrho2.v = vec_set( pj[0]->force.POrho2 , pj[1]->force.POrho2 , pj[2]->force.POrho2 , pj[3]->force.POrho2 , pj[4]->force.POrho2 , pj[5]->force.POrho2 , pj[6]->force.POrho2 , pj[7]->force.POrho2 ); - pirho.v = vec_set( pi[0]->rho , pi[1]->rho , pi[2]->rho , pi[3]->rho , pi[4]->rho , pi[5]->rho , pi[6]->rho , pi[7]->rho ); - pjrho.v = vec_set( pj[0]->rho , pj[1]->rho , pj[2]->rho , pj[3]->rho , pj[4]->rho , pj[5]->rho , pj[6]->rho , pj[7]->rho ); - piu.v = vec_set( pi[0]->u , pi[1]->u , pi[2]->u , pi[3]->u , pi[4]->u , pi[5]->u , pi[6]->u , pi[7]->u ); - pju.v = vec_set( pj[0]->u , pj[1]->u , pj[2]->u , pj[3]->u , pj[4]->u , pj[5]->u , pj[6]->u , pj[7]->u ); - ci.v = vec_set( pi[0]->force.c , pi[1]->force.c , pi[2]->force.c , pi[3]->force.c , pi[4]->force.c , pi[5]->force.c , pi[6]->force.c , pi[7]->force.c ); - cj.v = vec_set( pj[0]->force.c , pj[1]->force.c , pj[2]->force.c , pj[3]->force.c , pj[4]->force.c , pj[5]->force.c , pj[6]->force.c , pj[7]->force.c ); - vi_sig.v = vec_set( pi[0]->force.v_sig , pi[1]->force.v_sig , pi[2]->force.v_sig , pi[3]->force.v_sig , pi[4]->force.v_sig , pi[5]->force.v_sig , pi[6]->force.v_sig , pi[7]->force.v_sig ); - vj_sig.v = vec_set( pj[0]->force.v_sig , pj[1]->force.v_sig , pj[2]->force.v_sig , pj[3]->force.v_sig , pj[4]->force.v_sig , pj[5]->force.v_sig , pj[6]->force.v_sig , pj[7]->force.v_sig ); - for ( k = 0 ; k < 3 ; k++ ) { - vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] , pi[4]->v[k] , pi[5]->v[k] , pi[6]->v[k] , 
pi[7]->v[k] ); - vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] , pj[4]->v[k] , pj[5]->v[k] , pj[6]->v[k] , pj[7]->v[k] ); - } - for ( k = 0 ; k < 3 ; k++ ) - dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] , Dx[12+k] , Dx[15+k] , Dx[18+k] , Dx[21+k] ); - balsara.v = vec_set( pi[0]->force.balsara , pi[1]->force.balsara , pi[2]->force.balsara , pi[3]->force.balsara , pi[4]->force.balsara , pi[5]->force.balsara , pi[6]->force.balsara , pi[7]->force.balsara ) + - vec_set( pj[0]->force.balsara , pj[1]->force.balsara , pj[2]->force.balsara , pj[3]->force.balsara , pj[4]->force.balsara , pj[5]->force.balsara , pj[6]->force.balsara , pj[7]->force.balsara ); - pialpha.v = vec_set( pi[0]->alpha, pi[1]->alpha, pi[2]->alpha, pi[3]->alpha, pi[4]->alpha, pi[5]->alpha , pi[6]->alpha, pi[7]->alpha ); - pjalpha.v = vec_set( pj[0]->alpha, pj[1]->alpha, pj[2]->alpha, pj[3]->alpha, pj[4]->alpha, pj[5]->alpha , pj[6]->alpha, pj[7]->alpha ); - #elif VEC_SIZE==4 - mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass ); - mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass ); - piPOrho2.v = vec_set( pi[0]->force.POrho2 , pi[1]->force.POrho2 , pi[2]->force.POrho2 , pi[3]->force.POrho2 ); - pjPOrho2.v = vec_set( pj[0]->force.POrho2 , pj[1]->force.POrho2 , pj[2]->force.POrho2 , pj[3]->force.POrho2 ); - pirho.v = vec_set( pi[0]->rho , pi[1]->rho , pi[2]->rho , pi[3]->rho ); - pjrho.v = vec_set( pj[0]->rho , pj[1]->rho , pj[2]->rho , pj[3]->rho ); - piu.v = vec_set( pi[0]->u , pi[1]->u , pi[2]->u , pi[3]->u ); - pju.v = vec_set( pj[0]->u , pj[1]->u , pj[2]->u , pj[3]->u ); - ci.v = vec_set( pi[0]->force.c , pi[1]->force.c , pi[2]->force.c , pi[3]->force.c ); - cj.v = vec_set( pj[0]->force.c , pj[1]->force.c , pj[2]->force.c , pj[3]->force.c ); - vi_sig.v = vec_set( pi[0]->force.v_sig , pi[1]->force.v_sig , pi[2]->force.v_sig , pi[3]->force.v_sig ); - vj_sig.v = vec_set( pj[0]->force.v_sig , pj[1]->force.v_sig , 
pj[2]->force.v_sig , pj[3]->force.v_sig ); - for ( k = 0 ; k < 3 ; k++ ) { - vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] ); - vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] ); - } - for ( k = 0 ; k < 3 ; k++ ) - dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] ); - balsara.v = vec_set( pi[0]->force.balsara , pi[1]->force.balsara , pi[2]->force.balsara , pi[3]->force.balsara ) + - vec_set( pj[0]->force.balsara , pj[1]->force.balsara , pj[2]->force.balsara , pj[3]->force.balsara ); - pialpha.v = vec_set( pi[0]->alpha, pi[1]->alpha, pi[2]->alpha, pi[3]->alpha ); - pjalpha.v = vec_set( pj[0]->alpha, pj[1]->alpha, pj[2]->alpha, pj[3]->alpha ); - #else - #error - #endif - - /* Get the radius and inverse radius. */ - r2.v = vec_load( R2 ); - ri.v = vec_rsqrt( r2.v ); - ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) ); - r.v = r2.v * ri.v; - - /* Get the kernel for hi. */ - hi.v = vec_load( Hi ); - hi_inv.v = vec_rcp( hi.v ); - hi_inv.v = hi_inv.v - hi_inv.v * ( hi.v * hi_inv.v - vec_set1( 1.0f ) ); - hi2_inv.v = hi_inv.v * hi_inv.v; - xi.v = r.v * hi_inv.v; - kernel_deval_vec( &xi , &wi , &wi_dx ); - wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v; - - /* Get the kernel for hj. */ - hj.v = vec_load( Hj ); - hj_inv.v = vec_rcp( hj.v ); - hj_inv.v = hj_inv.v - hj_inv.v * ( hj.v * hj_inv.v - vec_set1( 1.0f ) ); - hj2_inv.v = hj_inv.v * hj_inv.v; - xj.v = r.v * hj_inv.v; - kernel_deval_vec( &xj , &wj , &wj_dx ); - wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v; - - /* Compute dv dot r. */ - dvdr.v = ( (vi[0].v - vj[0].v) * dx[0].v ) + ( (vi[1].v - vj[1].v) * dx[1].v ) + ( (vi[2].v - vj[2].v) * dx[2].v ); - dvdr.v = dvdr.v * ri.v; - - /* Get the time derivative for h. */ - pih_dt.v = mj.v / pjrho.v * dvdr.v * wi_dr.v; - pjh_dt.v = mi.v / pirho.v * dvdr.v * wj_dr.v; - - /* Compute the relative velocity. 
(This is 0 if the particles move away from each other and negative otherwise) */ - omega_ij.v = vec_fmin( dvdr.v , vec_set1( 0.0f ) ); - - /* Compute signal velocity */ - v_sig.v = ci.v + cj.v - vec_set1( 2.0f )*omega_ij.v; - - /* Compute viscosity parameter */ - alpha_ij.v = vec_set1(-0.5f) * ( pialpha.v + pjalpha.v ); - - /* Compute viscosity tensor */ - Pi_ij.v = balsara.v * alpha_ij.v * v_sig.v * omega_ij.v / (pirho.v + pjrho.v); - Pi_ij.v *= ( wi_dr.v + wj_dr.v ); - - /* Termal conductivity */ - v_sig_u.v = vec_sqrt( vec_set1( 2.f * ( const_hydro_gamma - 1.f ) ) * vec_fabs( pirho.v * piu.v - pjrho.v * pju.v ) / ( pirho.v + pjrho.v ) ); - tc.v = vec_set1( const_conductivity_alpha ) * v_sig_u.v / ( pirho.v + pjrho.v ); - tc.v *= ( wi_dr.v + wj_dr.v ); - - /* Get the common factor out. */ - w.v = ri.v * ( ( piPOrho2.v * wi_dr.v + pjPOrho2.v * wj_dr.v ) + vec_set1( 0.25f ) * Pi_ij.v ); - - /* Use the force, Luke! */ - for ( k = 0 ; k < 3 ; k++ ) { - f.v = dx[k].v * w.v; - pia[k].v = mj.v * f.v; - pja[k].v = mi.v * f.v; - } - - /* Get the time derivative for u. */ - piu_dt.v = mj.v * dvdr.v * ( piPOrho2.v * wi_dr.v + vec_set1( 0.125f ) * Pi_ij.v ); - pju_dt.v = mi.v * dvdr.v * ( pjPOrho2.v * wj_dr.v + vec_set1( 0.125f ) * Pi_ij.v ); - - /* Add the thermal conductivity */ - piu_dt.v += mj.v * tc.v * ( piu.v - pju.v ); - pju_dt.v += mi.v * tc.v * ( pju.v - piu.v ); - - /* compute the signal velocity (this is always symmetrical). */ - vi_sig.v = vec_fmax( vi_sig.v , v_sig.v ); - vj_sig.v = vec_fmax( vj_sig.v , v_sig.v ); - - /* Store the forces back on the particles. 
*/ - for ( k = 0 ; k < VEC_SIZE ; k++ ) { - pi[k]->force.u_dt += piu_dt.f[k]; - pj[k]->force.u_dt += pju_dt.f[k]; - pi[k]->force.h_dt -= pih_dt.f[k]; - pj[k]->force.h_dt -= pjh_dt.f[k]; - pi[k]->force.v_sig = vi_sig.f[k]; - pj[k]->force.v_sig = vj_sig.f[k]; - for ( j = 0 ; j < 3 ; j++ ) { - pi[k]->a[j] -= pia[j].f[k]; - pj[k]->a[j] += pja[j].f[k]; - } - } - + vector r, r2, ri; + vector xi, xj; + vector hi, hj, hi_inv, hj_inv; + vector hi2_inv, hj2_inv; + vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr; + vector w; + vector piPOrho2, pjPOrho2, pirho, pjrho, piu, pju; + vector mi, mj; + vector f; + vector dx[3]; + vector vi[3], vj[3]; + vector pia[3], pja[3]; + vector piu_dt, pju_dt; + vector pih_dt, pjh_dt; + vector ci, cj, v_sig, vi_sig, vj_sig; + vector omega_ij, Pi_ij, balsara; + vector pialpha, pjalpha, alpha_ij, v_sig_u, tc; + int j, k; + +/* Load stuff. */ +#if VEC_SIZE == 8 + mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass, + pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass); + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass, + pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass); + piPOrho2.v = + vec_set(pi[0]->force.POrho2, pi[1]->force.POrho2, pi[2]->force.POrho2, + pi[3]->force.POrho2, pi[4]->force.POrho2, pi[5]->force.POrho2, + pi[6]->force.POrho2, pi[7]->force.POrho2); + pjPOrho2.v = + vec_set(pj[0]->force.POrho2, pj[1]->force.POrho2, pj[2]->force.POrho2, + pj[3]->force.POrho2, pj[4]->force.POrho2, pj[5]->force.POrho2, + pj[6]->force.POrho2, pj[7]->force.POrho2); + pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho, + pi[5]->rho, pi[6]->rho, pi[7]->rho); + pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho, + pj[5]->rho, pj[6]->rho, pj[7]->rho); + piu.v = vec_set(pi[0]->u, pi[1]->u, pi[2]->u, pi[3]->u, pi[4]->u, pi[5]->u, + pi[6]->u, pi[7]->u); + pju.v = vec_set(pj[0]->u, pj[1]->u, pj[2]->u, pj[3]->u, pj[4]->u, pj[5]->u, + pj[6]->u, pj[7]->u); + ci.v = + 
vec_set(pi[0]->force.c, pi[1]->force.c, pi[2]->force.c, pi[3]->force.c, + pi[4]->force.c, pi[5]->force.c, pi[6]->force.c, pi[7]->force.c); + cj.v = + vec_set(pj[0]->force.c, pj[1]->force.c, pj[2]->force.c, pj[3]->force.c, + pj[4]->force.c, pj[5]->force.c, pj[6]->force.c, pj[7]->force.c); + vi_sig.v = vec_set(pi[0]->force.v_sig, pi[1]->force.v_sig, pi[2]->force.v_sig, + pi[3]->force.v_sig, pi[4]->force.v_sig, pi[5]->force.v_sig, + pi[6]->force.v_sig, pi[7]->force.v_sig); + vj_sig.v = vec_set(pj[0]->force.v_sig, pj[1]->force.v_sig, pj[2]->force.v_sig, + pj[3]->force.v_sig, pj[4]->force.v_sig, pj[5]->force.v_sig, + pj[6]->force.v_sig, pj[7]->force.v_sig); + for (k = 0; k < 3; k++) { + vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k], + pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]); + vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k], + pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]); + } + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k], + Dx[15 + k], Dx[18 + k], Dx[21 + k]); + balsara.v = + vec_set(pi[0]->force.balsara, pi[1]->force.balsara, pi[2]->force.balsara, + pi[3]->force.balsara, pi[4]->force.balsara, pi[5]->force.balsara, + pi[6]->force.balsara, pi[7]->force.balsara) + + vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara, + pj[3]->force.balsara, pj[4]->force.balsara, pj[5]->force.balsara, + pj[6]->force.balsara, pj[7]->force.balsara); + pialpha.v = vec_set(pi[0]->alpha, pi[1]->alpha, pi[2]->alpha, pi[3]->alpha, + pi[4]->alpha, pi[5]->alpha, pi[6]->alpha, pi[7]->alpha); + pjalpha.v = vec_set(pj[0]->alpha, pj[1]->alpha, pj[2]->alpha, pj[3]->alpha, + pj[4]->alpha, pj[5]->alpha, pj[6]->alpha, pj[7]->alpha); +#elif VEC_SIZE == 4 + mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass); + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass); + piPOrho2.v = vec_set(pi[0]->force.POrho2, pi[1]->force.POrho2, + 
pi[2]->force.POrho2, pi[3]->force.POrho2); + pjPOrho2.v = vec_set(pj[0]->force.POrho2, pj[1]->force.POrho2, + pj[2]->force.POrho2, pj[3]->force.POrho2); + pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho); + pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho); + piu.v = vec_set(pi[0]->u, pi[1]->u, pi[2]->u, pi[3]->u); + pju.v = vec_set(pj[0]->u, pj[1]->u, pj[2]->u, pj[3]->u); + ci.v = + vec_set(pi[0]->force.c, pi[1]->force.c, pi[2]->force.c, pi[3]->force.c); + cj.v = + vec_set(pj[0]->force.c, pj[1]->force.c, pj[2]->force.c, pj[3]->force.c); + vi_sig.v = vec_set(pi[0]->force.v_sig, pi[1]->force.v_sig, pi[2]->force.v_sig, + pi[3]->force.v_sig); + vj_sig.v = vec_set(pj[0]->force.v_sig, pj[1]->force.v_sig, pj[2]->force.v_sig, + pj[3]->force.v_sig); + for (k = 0; k < 3; k++) { + vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]); + vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]); + } + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]); + balsara.v = vec_set(pi[0]->force.balsara, pi[1]->force.balsara, + pi[2]->force.balsara, pi[3]->force.balsara) + + vec_set(pj[0]->force.balsara, pj[1]->force.balsara, + pj[2]->force.balsara, pj[3]->force.balsara); + pialpha.v = vec_set(pi[0]->alpha, pi[1]->alpha, pi[2]->alpha, pi[3]->alpha); + pjalpha.v = vec_set(pj[0]->alpha, pj[1]->alpha, pj[2]->alpha, pj[3]->alpha); +#else +#error +#endif + + /* Get the radius and inverse radius. */ + r2.v = vec_load(R2); + ri.v = vec_rsqrt(r2.v); + ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f)); + r.v = r2.v * ri.v; + + /* Get the kernel for hi. */ + hi.v = vec_load(Hi); + hi_inv.v = vec_rcp(hi.v); + hi_inv.v = hi_inv.v - hi_inv.v * (hi.v * hi_inv.v - vec_set1(1.0f)); + hi2_inv.v = hi_inv.v * hi_inv.v; + xi.v = r.v * hi_inv.v; + kernel_deval_vec(&xi, &wi, &wi_dx); + wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v; + + /* Get the kernel for hj. 
*/ + hj.v = vec_load(Hj); + hj_inv.v = vec_rcp(hj.v); + hj_inv.v = hj_inv.v - hj_inv.v * (hj.v * hj_inv.v - vec_set1(1.0f)); + hj2_inv.v = hj_inv.v * hj_inv.v; + xj.v = r.v * hj_inv.v; + kernel_deval_vec(&xj, &wj, &wj_dx); + wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v; + + /* Compute dv dot r. */ + dvdr.v = ((vi[0].v - vj[0].v) * dx[0].v) + ((vi[1].v - vj[1].v) * dx[1].v) + + ((vi[2].v - vj[2].v) * dx[2].v); + dvdr.v = dvdr.v * ri.v; + + /* Get the time derivative for h. */ + pih_dt.v = mj.v / pjrho.v * dvdr.v * wi_dr.v; + pjh_dt.v = mi.v / pirho.v * dvdr.v * wj_dr.v; + + /* Compute the relative velocity. (This is 0 if the particles move away from + * each other and negative otherwise) */ + omega_ij.v = vec_fmin(dvdr.v, vec_set1(0.0f)); + + /* Compute signal velocity */ + v_sig.v = ci.v + cj.v - vec_set1(2.0f) * omega_ij.v; + + /* Compute viscosity parameter */ + alpha_ij.v = vec_set1(-0.5f) * (pialpha.v + pjalpha.v); + + /* Compute viscosity tensor */ + Pi_ij.v = balsara.v * alpha_ij.v * v_sig.v * omega_ij.v / (pirho.v + pjrho.v); + Pi_ij.v *= (wi_dr.v + wj_dr.v); + + /* Termal conductivity */ + v_sig_u.v = vec_sqrt(vec_set1(2.f * (const_hydro_gamma - 1.f)) * + vec_fabs(pirho.v * piu.v - pjrho.v * pju.v) / + (pirho.v + pjrho.v)); + tc.v = vec_set1(const_conductivity_alpha) * v_sig_u.v / (pirho.v + pjrho.v); + tc.v *= (wi_dr.v + wj_dr.v); + + /* Get the common factor out. */ + w.v = ri.v * ((piPOrho2.v * wi_dr.v + pjPOrho2.v * wj_dr.v) + + vec_set1(0.25f) * Pi_ij.v); + + /* Use the force, Luke! */ + for (k = 0; k < 3; k++) { + f.v = dx[k].v * w.v; + pia[k].v = mj.v * f.v; + pja[k].v = mi.v * f.v; + } + + /* Get the time derivative for u. 
*/ + piu_dt.v = + mj.v * dvdr.v * (piPOrho2.v * wi_dr.v + vec_set1(0.125f) * Pi_ij.v); + pju_dt.v = + mi.v * dvdr.v * (pjPOrho2.v * wj_dr.v + vec_set1(0.125f) * Pi_ij.v); + + /* Add the thermal conductivity */ + piu_dt.v += mj.v * tc.v * (piu.v - pju.v); + pju_dt.v += mi.v * tc.v * (pju.v - piu.v); + + /* compute the signal velocity (this is always symmetrical). */ + vi_sig.v = vec_fmax(vi_sig.v, v_sig.v); + vj_sig.v = vec_fmax(vj_sig.v, v_sig.v); + + /* Store the forces back on the particles. */ + for (k = 0; k < VEC_SIZE; k++) { + pi[k]->force.u_dt += piu_dt.f[k]; + pj[k]->force.u_dt += pju_dt.f[k]; + pi[k]->force.h_dt -= pih_dt.f[k]; + pj[k]->force.h_dt -= pjh_dt.f[k]; + pi[k]->force.v_sig = vi_sig.f[k]; + pj[k]->force.v_sig = vj_sig.f[k]; + for (j = 0; j < 3; j++) { + pi[k]->a[j] -= pia[j].f[k]; + pj[k]->a[j] += pja[j].f[k]; + } + } + #else - for ( int k = 0 ; k < VEC_SIZE ; k++ ) - runner_iact_force( R2[k] , &Dx[3*k] , Hi[k] , Hj[k] , pi[k] , pj[k] ); + for (int k = 0; k < VEC_SIZE; k++) + runner_iact_force(R2[k], &Dx[3 * k], Hi[k], Hj[k], pi[k], pj[k]); #endif - - } - +} /** * @brief Force loop (non-symmetric version) */ -__attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_force ( float r2 , float *dx , float hi , float hj , struct part *pi , struct part *pj ) { - - float r = sqrtf( r2 ), ri = 1.0f / r; - float xi, xj; - float hi_inv, hi2_inv; - float hj_inv, hj2_inv; - float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr; - float /*mi,*/ mj, POrho2i, POrho2j, rhoi, rhoj; - float v_sig, omega_ij, Pi_ij, alpha_ij, tc, v_sig_u; - // float dt_max; - float f; - int k; - - /* Get some values in local variables. */ - // mi = pi->mass; - mj = pj->mass; - rhoi = pi->rho; rhoj = pj->rho; - POrho2i = pi->force.POrho2; - POrho2j = pj->force.POrho2; - - /* Get the kernel for hi. */ - hi_inv = 1.0f / hi; - hi2_inv = hi_inv * hi_inv; - xi = r * hi_inv; - kernel_deval( xi , &wi , &wi_dx ); - wi_dr = hi2_inv * hi2_inv * wi_dx; - - /* Get the kernel for hj. 
*/ - hj_inv = 1.0f / hj; - hj2_inv = hj_inv * hj_inv; - xj = r * hj_inv; - kernel_deval( xj , &wj , &wj_dx ); - wj_dr = hj2_inv * hj2_inv * wj_dx; - - /* Compute dv dot r. */ - dvdr = ( pi->v[0] - pj->v[0] ) * dx[0] + ( pi->v[1] - pj->v[1] ) * dx[1] + ( pi->v[2] - pj->v[2] ) * dx[2]; - dvdr *= ri; - - /* Compute the relative velocity. (This is 0 if the particles move away from each other and negative otherwise) */ - omega_ij = fminf( dvdr , 0.f ); - - /* Compute signal velocity */ - v_sig = pi->force.c + pj->force.c - 2.0f*omega_ij; - - /* Compute viscosity parameter */ - alpha_ij = -0.5f * ( pi->alpha + pj->alpha ); - - /* Compute viscosity tensor */ - Pi_ij = alpha_ij * v_sig * omega_ij / ( rhoi + rhoj ); - - /* Apply balsara switch */ - Pi_ij *= ( pi->force.balsara + pj->force.balsara ); - - /* Termal conductivity */ - v_sig_u = sqrtf( 2.f * ( const_hydro_gamma - 1.f ) * fabs( rhoi * pi->u - rhoj * pj->u ) / ( rhoi + rhoj ) ); - tc = const_conductivity_alpha * v_sig_u / ( rhoi + rhoj ); - tc *= ( wi_dr + wj_dr ); - - /* Get the common factor out. */ - w = ri * ( ( POrho2i * wi_dr + POrho2j * wj_dr ) + 0.25f * Pi_ij * ( wi_dr + wj_dr ) ); - - /* Use the force, Luke! */ - for ( k = 0 ; k < 3 ; k++ ) { - f = dx[k] * w; - pi->a[k] -= mj * f; - } - - /* Get the time derivative for u. */ - pi->force.u_dt += mj * dvdr * ( POrho2i * wi_dr + 0.125f * Pi_ij * ( wi_dr + wj_dr ) ); - - /* Add the thermal conductivity */ - pi->force.u_dt += mj * tc * ( pi->u - pj->u ); - - /* Get the time derivative for h. */ - pi->force.h_dt -= mj * dvdr / rhoj * wi_dr; - - /* Update the signal velocity. 
*/ - pi->force.v_sig = fmaxf( pi->force.v_sig , v_sig ); - pj->force.v_sig = fmaxf( pj->force.v_sig , v_sig ); - - } - +__attribute__((always_inline)) INLINE static void runner_iact_nonsym_force( + float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { + + float r = sqrtf(r2), ri = 1.0f / r; + float xi, xj; + float hi_inv, hi2_inv; + float hj_inv, hj2_inv; + float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr; + float /*mi,*/ mj, POrho2i, POrho2j, rhoi, rhoj; + float v_sig, omega_ij, Pi_ij, alpha_ij, tc, v_sig_u; + // float dt_max; + float f; + int k; + + /* Get some values in local variables. */ + // mi = pi->mass; + mj = pj->mass; + rhoi = pi->rho; + rhoj = pj->rho; + POrho2i = pi->force.POrho2; + POrho2j = pj->force.POrho2; + + /* Get the kernel for hi. */ + hi_inv = 1.0f / hi; + hi2_inv = hi_inv * hi_inv; + xi = r * hi_inv; + kernel_deval(xi, &wi, &wi_dx); + wi_dr = hi2_inv * hi2_inv * wi_dx; + + /* Get the kernel for hj. */ + hj_inv = 1.0f / hj; + hj2_inv = hj_inv * hj_inv; + xj = r * hj_inv; + kernel_deval(xj, &wj, &wj_dx); + wj_dr = hj2_inv * hj2_inv * wj_dx; + + /* Compute dv dot r. */ + dvdr = (pi->v[0] - pj->v[0]) * dx[0] + (pi->v[1] - pj->v[1]) * dx[1] + + (pi->v[2] - pj->v[2]) * dx[2]; + dvdr *= ri; + + /* Compute the relative velocity. 
(This is 0 if the particles move away from + * each other and negative otherwise) */ + omega_ij = fminf(dvdr, 0.f); + + /* Compute signal velocity */ + v_sig = pi->force.c + pj->force.c - 2.0f * omega_ij; + + /* Compute viscosity parameter */ + alpha_ij = -0.5f * (pi->alpha + pj->alpha); + + /* Compute viscosity tensor */ + Pi_ij = alpha_ij * v_sig * omega_ij / (rhoi + rhoj); + + /* Apply balsara switch */ + Pi_ij *= (pi->force.balsara + pj->force.balsara); + + /* Termal conductivity */ + v_sig_u = sqrtf(2.f * (const_hydro_gamma - 1.f) * + fabs(rhoi * pi->u - rhoj * pj->u) / (rhoi + rhoj)); + tc = const_conductivity_alpha * v_sig_u / (rhoi + rhoj); + tc *= (wi_dr + wj_dr); + + /* Get the common factor out. */ + w = ri * + ((POrho2i * wi_dr + POrho2j * wj_dr) + 0.25f * Pi_ij * (wi_dr + wj_dr)); + + /* Use the force, Luke! */ + for (k = 0; k < 3; k++) { + f = dx[k] * w; + pi->a[k] -= mj * f; + } + + /* Get the time derivative for u. */ + pi->force.u_dt += + mj * dvdr * (POrho2i * wi_dr + 0.125f * Pi_ij * (wi_dr + wj_dr)); + + /* Add the thermal conductivity */ + pi->force.u_dt += mj * tc * (pi->u - pj->u); + + /* Get the time derivative for h. */ + pi->force.h_dt -= mj * dvdr / rhoj * wi_dr; + + /* Update the signal velocity. 
*/ + pi->force.v_sig = fmaxf(pi->force.v_sig, v_sig); + pj->force.v_sig = fmaxf(pj->force.v_sig, v_sig); +} /** * @brief Force loop (Vectorized non-symmetric version) */ -__attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force ( float *R2 , float *Dx , float *Hi , float *Hj , struct part **pi , struct part **pj ) { +__attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( + float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, + struct part **pj) { #ifdef VECTORIZE - vector r, r2, ri; - vector xi, xj; - vector hi, hj, hi_inv, hj_inv; - vector hi2_inv, hj2_inv; - vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr; - vector w; - vector piPOrho2, pjPOrho2, pirho, pjrho, piu, pju; - vector mj; - vector f; - vector dx[3]; - vector vi[3], vj[3]; - vector pia[3]; - vector piu_dt; - vector pih_dt; - vector ci, cj, v_sig, vi_sig, vj_sig; - vector omega_ij, Pi_ij, balsara; - vector pialpha, pjalpha, alpha_ij, v_sig_u, tc; - int j, k; - - /* Load stuff. 
*/ - #if VEC_SIZE==8 - mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass ); - piPOrho2.v = vec_set( pi[0]->force.POrho2 , pi[1]->force.POrho2 , pi[2]->force.POrho2 , pi[3]->force.POrho2 , pi[4]->force.POrho2 , pi[5]->force.POrho2 , pi[6]->force.POrho2 , pi[7]->force.POrho2 ); - pjPOrho2.v = vec_set( pj[0]->force.POrho2 , pj[1]->force.POrho2 , pj[2]->force.POrho2 , pj[3]->force.POrho2 , pj[4]->force.POrho2 , pj[5]->force.POrho2 , pj[6]->force.POrho2 , pj[7]->force.POrho2 ); - pirho.v = vec_set( pi[0]->rho , pi[1]->rho , pi[2]->rho , pi[3]->rho , pi[4]->rho , pi[5]->rho , pi[6]->rho , pi[7]->rho ); - pjrho.v = vec_set( pj[0]->rho , pj[1]->rho , pj[2]->rho , pj[3]->rho , pj[4]->rho , pj[5]->rho , pj[6]->rho , pj[7]->rho ); - piu.v = vec_set( pi[0]->u , pi[1]->u , pi[2]->u , pi[3]->u , pi[4]->u , pi[5]->u , pi[6]->u , pi[7]->u ); - pju.v = vec_set( pj[0]->u , pj[1]->u , pj[2]->u , pj[3]->u , pj[4]->u , pj[5]->u , pj[6]->u , pj[7]->u ); - ci.v = vec_set( pi[0]->force.c , pi[1]->force.c , pi[2]->force.c , pi[3]->force.c , pi[4]->force.c , pi[5]->force.c , pi[6]->force.c , pi[7]->force.c ); - cj.v = vec_set( pj[0]->force.c , pj[1]->force.c , pj[2]->force.c , pj[3]->force.c , pj[4]->force.c , pj[5]->force.c , pj[6]->force.c , pj[7]->force.c ); - vi_sig.v = vec_set( pi[0]->force.v_sig , pi[1]->force.v_sig , pi[2]->force.v_sig , pi[3]->force.v_sig , pi[4]->force.v_sig , pi[5]->force.v_sig , pi[6]->force.v_sig , pi[7]->force.v_sig ); - vj_sig.v = vec_set( pj[0]->force.v_sig , pj[1]->force.v_sig , pj[2]->force.v_sig , pj[3]->force.v_sig , pj[4]->force.v_sig , pj[5]->force.v_sig , pj[6]->force.v_sig , pj[7]->force.v_sig ); - for ( k = 0 ; k < 3 ; k++ ) { - vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] , pi[4]->v[k] , pi[5]->v[k] , pi[6]->v[k] , pi[7]->v[k] ); - vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] , pj[4]->v[k] , pj[5]->v[k] , pj[6]->v[k] , 
pj[7]->v[k] ); - } - for ( k = 0 ; k < 3 ; k++ ) - dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] , Dx[12+k] , Dx[15+k] , Dx[18+k] , Dx[21+k] ); - balsara.v = vec_set( pi[0]->force.balsara , pi[1]->force.balsara , pi[2]->force.balsara , pi[3]->force.balsara , pi[4]->force.balsara , pi[5]->force.balsara , pi[6]->force.balsara , pi[7]->force.balsara ) + - vec_set( pj[0]->force.balsara , pj[1]->force.balsara , pj[2]->force.balsara , pj[3]->force.balsara , pj[4]->force.balsara , pj[5]->force.balsara , pj[6]->force.balsara , pj[7]->force.balsara ); - pialpha.v = vec_set( pi[0]->alpha, pi[1]->alpha, pi[2]->alpha, pi[3]->alpha, pi[4]->alpha, pi[5]->alpha , pi[6]->alpha, pi[7]->alpha ); - pjalpha.v = vec_set( pj[0]->alpha, pj[1]->alpha, pj[2]->alpha, pj[3]->alpha, pj[4]->alpha, pj[5]->alpha , pj[6]->alpha, pj[7]->alpha ); - #elif VEC_SIZE==4 - mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass ); - piPOrho2.v = vec_set( pi[0]->force.POrho2 , pi[1]->force.POrho2 , pi[2]->force.POrho2 , pi[3]->force.POrho2 ); - pjPOrho2.v = vec_set( pj[0]->force.POrho2 , pj[1]->force.POrho2 , pj[2]->force.POrho2 , pj[3]->force.POrho2 ); - pirho.v = vec_set( pi[0]->rho , pi[1]->rho , pi[2]->rho , pi[3]->rho ); - pjrho.v = vec_set( pj[0]->rho , pj[1]->rho , pj[2]->rho , pj[3]->rho ); - piu.v = vec_set( pi[0]->u , pi[1]->u , pi[2]->u , pi[3]->u ); - pju.v = vec_set( pj[0]->u , pj[1]->u , pj[2]->u , pj[3]->u ); - ci.v = vec_set( pi[0]->force.c , pi[1]->force.c , pi[2]->force.c , pi[3]->force.c ); - cj.v = vec_set( pj[0]->force.c , pj[1]->force.c , pj[2]->force.c , pj[3]->force.c ); - vi_sig.v = vec_set( pi[0]->force.v_sig , pi[1]->force.v_sig , pi[2]->force.v_sig , pi[3]->force.v_sig ); - vj_sig.v = vec_set( pj[0]->force.v_sig , pj[1]->force.v_sig , pj[2]->force.v_sig , pj[3]->force.v_sig ); - for ( k = 0 ; k < 3 ; k++ ) { - vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] ); - vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , 
pj[3]->v[k] ); - } - for ( k = 0 ; k < 3 ; k++ ) - dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] ); - balsara.v = vec_set( pi[0]->force.balsara , pi[1]->force.balsara , pi[2]->force.balsara , pi[3]->force.balsara ) + - vec_set( pj[0]->force.balsara , pj[1]->force.balsara , pj[2]->force.balsara , pj[3]->force.balsara ); - pialpha.v = vec_set( pi[0]->alpha, pi[1]->alpha, pi[2]->alpha, pi[3]->alpha ); - pjalpha.v = vec_set( pj[0]->alpha, pj[1]->alpha, pj[2]->alpha, pj[3]->alpha ); - #else - #error - #endif - - /* Get the radius and inverse radius. */ - r2.v = vec_load( R2 ); - ri.v = vec_rsqrt( r2.v ); - ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) ); - r.v = r2.v * ri.v; - - /* Get the kernel for hi. */ - hi.v = vec_load( Hi ); - hi_inv.v = vec_rcp( hi.v ); - hi_inv.v = hi_inv.v - hi_inv.v * ( hi.v * hi_inv.v - vec_set1( 1.0f ) ); - hi2_inv.v = hi_inv.v * hi_inv.v; - xi.v = r.v * hi_inv.v; - kernel_deval_vec( &xi , &wi , &wi_dx ); - wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v; - - /* Get the kernel for hj. */ - hj.v = vec_load( Hj ); - hj_inv.v = vec_rcp( hj.v ); - hj_inv.v = hj_inv.v - hj_inv.v * ( hj.v * hj_inv.v - vec_set1( 1.0f ) ); - hj2_inv.v = hj_inv.v * hj_inv.v; - xj.v = r.v * hj_inv.v; - kernel_deval_vec( &xj , &wj , &wj_dx ); - wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v; - - /* Compute dv dot r. */ - dvdr.v = ( (vi[0].v - vj[0].v) * dx[0].v ) + ( (vi[1].v - vj[1].v) * dx[1].v ) + ( (vi[2].v - vj[2].v) * dx[2].v ); - dvdr.v = dvdr.v * ri.v; - - /* Get the time derivative for h. */ - pih_dt.v = mj.v / pjrho.v * dvdr.v * wi_dr.v; - - /* Compute the relative velocity. 
(This is 0 if the particles move away from each other and negative otherwise) */ - omega_ij.v = vec_fmin( dvdr.v , vec_set1( 0.0f ) ); - - /* Compute signal velocity */ - v_sig.v = ci.v + cj.v - vec_set1( 2.0f )*omega_ij.v; - - /* Compute viscosity parameter */ - alpha_ij.v = vec_set1(-0.5f) * ( pialpha.v + pjalpha.v ); - - /* Compute viscosity tensor */ - Pi_ij.v = balsara.v * alpha_ij.v * v_sig.v * omega_ij.v / (pirho.v + pjrho.v); - Pi_ij.v *= ( wi_dr.v + wj_dr.v ); - - /* Termal conductivity */ - v_sig_u.v = vec_sqrt( vec_set1( 2.f * ( const_hydro_gamma - 1.f ) ) * vec_fabs( pirho.v * piu.v - pjrho.v * pju.v ) / ( pirho.v + pjrho.v ) ); - tc.v = vec_set1( const_conductivity_alpha ) * v_sig_u.v / ( pirho.v + pjrho.v ); - tc.v *= ( wi_dr.v + wj_dr.v ); - - /* Get the common factor out. */ - w.v = ri.v * ( ( piPOrho2.v * wi_dr.v + pjPOrho2.v * wj_dr.v ) + vec_set1( 0.25f ) * Pi_ij.v ); - - /* Use the force, Luke! */ - for ( k = 0 ; k < 3 ; k++ ) { - f.v = dx[k].v * w.v; - pia[k].v = mj.v * f.v; - } - - /* Get the time derivative for u. */ - piu_dt.v = mj.v * dvdr.v * ( piPOrho2.v * wi_dr.v + vec_set1( 0.125f ) * Pi_ij.v ); - - /* Add the thermal conductivity */ - piu_dt.v += mj.v * tc.v * ( piu.v - pju.v ); - - /* compute the signal velocity (this is always symmetrical). */ - vi_sig.v = vec_fmax( vi_sig.v , v_sig.v ); - vj_sig.v = vec_fmax( vj_sig.v , v_sig.v ); - - /* Store the forces back on the particles. 
*/ - for ( k = 0 ; k < VEC_SIZE ; k++ ) { - pi[k]->force.u_dt += piu_dt.f[k]; - pi[k]->force.h_dt -= pih_dt.f[k]; - pi[k]->force.v_sig = vi_sig.f[k]; - pj[k]->force.v_sig = vj_sig.f[k]; - for ( j = 0 ; j < 3 ; j++ ) - pi[k]->a[j] -= pia[j].f[k]; - } - + vector r, r2, ri; + vector xi, xj; + vector hi, hj, hi_inv, hj_inv; + vector hi2_inv, hj2_inv; + vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr; + vector w; + vector piPOrho2, pjPOrho2, pirho, pjrho, piu, pju; + vector mj; + vector f; + vector dx[3]; + vector vi[3], vj[3]; + vector pia[3]; + vector piu_dt; + vector pih_dt; + vector ci, cj, v_sig, vi_sig, vj_sig; + vector omega_ij, Pi_ij, balsara; + vector pialpha, pjalpha, alpha_ij, v_sig_u, tc; + int j, k; + +/* Load stuff. */ +#if VEC_SIZE == 8 + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass, + pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass); + piPOrho2.v = + vec_set(pi[0]->force.POrho2, pi[1]->force.POrho2, pi[2]->force.POrho2, + pi[3]->force.POrho2, pi[4]->force.POrho2, pi[5]->force.POrho2, + pi[6]->force.POrho2, pi[7]->force.POrho2); + pjPOrho2.v = + vec_set(pj[0]->force.POrho2, pj[1]->force.POrho2, pj[2]->force.POrho2, + pj[3]->force.POrho2, pj[4]->force.POrho2, pj[5]->force.POrho2, + pj[6]->force.POrho2, pj[7]->force.POrho2); + pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho, + pi[5]->rho, pi[6]->rho, pi[7]->rho); + pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho, + pj[5]->rho, pj[6]->rho, pj[7]->rho); + piu.v = vec_set(pi[0]->u, pi[1]->u, pi[2]->u, pi[3]->u, pi[4]->u, pi[5]->u, + pi[6]->u, pi[7]->u); + pju.v = vec_set(pj[0]->u, pj[1]->u, pj[2]->u, pj[3]->u, pj[4]->u, pj[5]->u, + pj[6]->u, pj[7]->u); + ci.v = + vec_set(pi[0]->force.c, pi[1]->force.c, pi[2]->force.c, pi[3]->force.c, + pi[4]->force.c, pi[5]->force.c, pi[6]->force.c, pi[7]->force.c); + cj.v = + vec_set(pj[0]->force.c, pj[1]->force.c, pj[2]->force.c, pj[3]->force.c, + pj[4]->force.c, pj[5]->force.c, pj[6]->force.c, 
pj[7]->force.c); + vi_sig.v = vec_set(pi[0]->force.v_sig, pi[1]->force.v_sig, pi[2]->force.v_sig, + pi[3]->force.v_sig, pi[4]->force.v_sig, pi[5]->force.v_sig, + pi[6]->force.v_sig, pi[7]->force.v_sig); + vj_sig.v = vec_set(pj[0]->force.v_sig, pj[1]->force.v_sig, pj[2]->force.v_sig, + pj[3]->force.v_sig, pj[4]->force.v_sig, pj[5]->force.v_sig, + pj[6]->force.v_sig, pj[7]->force.v_sig); + for (k = 0; k < 3; k++) { + vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k], + pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]); + vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k], + pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]); + } + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k], + Dx[15 + k], Dx[18 + k], Dx[21 + k]); + balsara.v = + vec_set(pi[0]->force.balsara, pi[1]->force.balsara, pi[2]->force.balsara, + pi[3]->force.balsara, pi[4]->force.balsara, pi[5]->force.balsara, + pi[6]->force.balsara, pi[7]->force.balsara) + + vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara, + pj[3]->force.balsara, pj[4]->force.balsara, pj[5]->force.balsara, + pj[6]->force.balsara, pj[7]->force.balsara); + pialpha.v = vec_set(pi[0]->alpha, pi[1]->alpha, pi[2]->alpha, pi[3]->alpha, + pi[4]->alpha, pi[5]->alpha, pi[6]->alpha, pi[7]->alpha); + pjalpha.v = vec_set(pj[0]->alpha, pj[1]->alpha, pj[2]->alpha, pj[3]->alpha, + pj[4]->alpha, pj[5]->alpha, pj[6]->alpha, pj[7]->alpha); +#elif VEC_SIZE == 4 + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass); + piPOrho2.v = vec_set(pi[0]->force.POrho2, pi[1]->force.POrho2, + pi[2]->force.POrho2, pi[3]->force.POrho2); + pjPOrho2.v = vec_set(pj[0]->force.POrho2, pj[1]->force.POrho2, + pj[2]->force.POrho2, pj[3]->force.POrho2); + pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho); + pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho); + piu.v = vec_set(pi[0]->u, pi[1]->u, pi[2]->u, pi[3]->u); + 
pju.v = vec_set(pj[0]->u, pj[1]->u, pj[2]->u, pj[3]->u); + ci.v = + vec_set(pi[0]->force.c, pi[1]->force.c, pi[2]->force.c, pi[3]->force.c); + cj.v = + vec_set(pj[0]->force.c, pj[1]->force.c, pj[2]->force.c, pj[3]->force.c); + vi_sig.v = vec_set(pi[0]->force.v_sig, pi[1]->force.v_sig, pi[2]->force.v_sig, + pi[3]->force.v_sig); + vj_sig.v = vec_set(pj[0]->force.v_sig, pj[1]->force.v_sig, pj[2]->force.v_sig, + pj[3]->force.v_sig); + for (k = 0; k < 3; k++) { + vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]); + vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]); + } + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]); + balsara.v = vec_set(pi[0]->force.balsara, pi[1]->force.balsara, + pi[2]->force.balsara, pi[3]->force.balsara) + + vec_set(pj[0]->force.balsara, pj[1]->force.balsara, + pj[2]->force.balsara, pj[3]->force.balsara); + pialpha.v = vec_set(pi[0]->alpha, pi[1]->alpha, pi[2]->alpha, pi[3]->alpha); + pjalpha.v = vec_set(pj[0]->alpha, pj[1]->alpha, pj[2]->alpha, pj[3]->alpha); #else +#error +#endif - for ( int k = 0 ; k < VEC_SIZE ; k++ ) - runner_iact_nonsym_force( R2[k] , &Dx[3*k] , Hi[k] , Hj[k] , pi[k] , pj[k] ); + /* Get the radius and inverse radius. */ + r2.v = vec_load(R2); + ri.v = vec_rsqrt(r2.v); + ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f)); + r.v = r2.v * ri.v; + + /* Get the kernel for hi. */ + hi.v = vec_load(Hi); + hi_inv.v = vec_rcp(hi.v); + hi_inv.v = hi_inv.v - hi_inv.v * (hi.v * hi_inv.v - vec_set1(1.0f)); + hi2_inv.v = hi_inv.v * hi_inv.v; + xi.v = r.v * hi_inv.v; + kernel_deval_vec(&xi, &wi, &wi_dx); + wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v; + + /* Get the kernel for hj. 
*/ + hj.v = vec_load(Hj); + hj_inv.v = vec_rcp(hj.v); + hj_inv.v = hj_inv.v - hj_inv.v * (hj.v * hj_inv.v - vec_set1(1.0f)); + hj2_inv.v = hj_inv.v * hj_inv.v; + xj.v = r.v * hj_inv.v; + kernel_deval_vec(&xj, &wj, &wj_dx); + wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v; + + /* Compute dv dot r. */ + dvdr.v = ((vi[0].v - vj[0].v) * dx[0].v) + ((vi[1].v - vj[1].v) * dx[1].v) + + ((vi[2].v - vj[2].v) * dx[2].v); + dvdr.v = dvdr.v * ri.v; + + /* Get the time derivative for h. */ + pih_dt.v = mj.v / pjrho.v * dvdr.v * wi_dr.v; + + /* Compute the relative velocity. (This is 0 if the particles move away from + * each other and negative otherwise) */ + omega_ij.v = vec_fmin(dvdr.v, vec_set1(0.0f)); + + /* Compute signal velocity */ + v_sig.v = ci.v + cj.v - vec_set1(2.0f) * omega_ij.v; + + /* Compute viscosity parameter */ + alpha_ij.v = vec_set1(-0.5f) * (pialpha.v + pjalpha.v); + + /* Compute viscosity tensor */ + Pi_ij.v = balsara.v * alpha_ij.v * v_sig.v * omega_ij.v / (pirho.v + pjrho.v); + Pi_ij.v *= (wi_dr.v + wj_dr.v); + + /* Termal conductivity */ + v_sig_u.v = vec_sqrt(vec_set1(2.f * (const_hydro_gamma - 1.f)) * + vec_fabs(pirho.v * piu.v - pjrho.v * pju.v) / + (pirho.v + pjrho.v)); + tc.v = vec_set1(const_conductivity_alpha) * v_sig_u.v / (pirho.v + pjrho.v); + tc.v *= (wi_dr.v + wj_dr.v); + + /* Get the common factor out. */ + w.v = ri.v * ((piPOrho2.v * wi_dr.v + pjPOrho2.v * wj_dr.v) + + vec_set1(0.25f) * Pi_ij.v); + + /* Use the force, Luke! */ + for (k = 0; k < 3; k++) { + f.v = dx[k].v * w.v; + pia[k].v = mj.v * f.v; + } + + /* Get the time derivative for u. */ + piu_dt.v = + mj.v * dvdr.v * (piPOrho2.v * wi_dr.v + vec_set1(0.125f) * Pi_ij.v); + + /* Add the thermal conductivity */ + piu_dt.v += mj.v * tc.v * (piu.v - pju.v); + + /* compute the signal velocity (this is always symmetrical). */ + vi_sig.v = vec_fmax(vi_sig.v, v_sig.v); + vj_sig.v = vec_fmax(vj_sig.v, v_sig.v); + + /* Store the forces back on the particles. 
*/ + for (k = 0; k < VEC_SIZE; k++) { + pi[k]->force.u_dt += piu_dt.f[k]; + pi[k]->force.h_dt -= pih_dt.f[k]; + pi[k]->force.v_sig = vi_sig.f[k]; + pj[k]->force.v_sig = vj_sig.f[k]; + for (j = 0; j < 3; j++) pi[k]->a[j] -= pia[j].f[k]; + } -#endif - - } - +#else + for (int k = 0; k < VEC_SIZE; k++) + runner_iact_nonsym_force(R2[k], &Dx[3 * k], Hi[k], Hj[k], pi[k], pj[k]); +#endif +} +#endif /* SWIFT_RUNNER_IACT_H */ diff --git a/src/runner_iact_grav.h b/src/runner_iact_grav.h index da1f552ae073aab3575de03255a3919d7a14cf95..2fd30c1c3854db56564300f0a3e1a13a6dc31251 100644 --- a/src/runner_iact_grav.h +++ b/src/runner_iact_grav.h @@ -2,22 +2,26 @@ * This file is part of SWIFT. * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk) * Matthieu Schaller (matthieu.schaller@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_RUNNER_IACT_GRAV_H +#define SWIFT_RUNNER_IACT_GRAV_H +/* Includes. 
*/ +#include "const.h" #include "kernel.h" #include "vector.h" @@ -27,93 +31,93 @@ * */ - /** * @brief Gravity potential */ -__attribute__ ((always_inline)) INLINE static void runner_iact_grav ( float r2 , float *dx , struct gpart *pi , struct gpart *pj ) { - - float ir, r; - float w, acc; - float mi = pi->mass, mj = pj->mass; - int k; - - /* Get the absolute distance. */ - ir = 1.0f / sqrtf( r2 ); - r = r2 * ir; - - /* Evaluate the gravity kernel. */ - kernel_grav_eval( r , &acc ); - - /* Scale the acceleration. */ - acc *= const_G * ir * ir * ir; - - /* Aggregate the accellerations. */ - for ( k = 0 ; k < 3 ; k++ ) { - w = acc * dx[k]; - pi->a[k] -= w * mj; - pj->a[k] += w * mi; - } +__attribute__((always_inline)) INLINE static void runner_iact_grav( + float r2, float *dx, struct gpart *pi, struct gpart *pj) { + + float ir, r; + float w, acc; + float mi = pi->mass, mj = pj->mass; + int k; + + /* Get the absolute distance. */ + ir = 1.0f / sqrtf(r2); + r = r2 * ir; + + /* Evaluate the gravity kernel. */ + kernel_grav_eval(r, &acc); + + /* Scale the acceleration. */ + acc *= const_G * ir * ir * ir; + + /* Aggregate the accellerations. 
*/ + for (k = 0; k < 3; k++) { + w = acc * dx[k]; + pi->a[k] -= w * mj; + pj->a[k] += w * mi; + } +} - } - - /** * @brief Gravity potential (Vectorized version) */ -__attribute__ ((always_inline)) INLINE static void runner_iact_vec_grav ( float *R2 , float *Dx , struct gpart **pi , struct gpart **pj ) { +__attribute__((always_inline)) INLINE static void runner_iact_vec_grav( + float *R2, float *Dx, struct gpart **pi, struct gpart **pj) { #ifdef VECTORIZE - vector ir, r, r2, dx[3]; - vector w, acc, ai, aj; - vector mi, mj; - int j, k; - - #if VEC_SIZE==8 - mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass , pi[4]->mass , pi[5]->mass , pi[6]->mass , pi[7]->mass ); - mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass ); - for ( k = 0 ; k < 3 ; k++ ) - dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] , Dx[12+k] , Dx[15+k] , Dx[18+k] , Dx[21+k] ); - #elif VEC_SIZE==4 - mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass ); - mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass ); - for ( k = 0 ; k < 3 ; k++ ) - dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] ); - #endif - - - /* Get the radius and inverse radius. */ - r2.v = vec_load( R2 ); - ir.v = vec_rsqrt( r2.v ); - ir.v = ir.v - vec_set1( 0.5f ) * ir.v * ( r2.v * ir.v * ir.v - vec_set1( 1.0f ) ); - r.v = r2.v * ir.v; - - /* Evaluate the gravity kernel. */ - blender_eval_vec( &r , &acc ); - - /* Scale the acceleration. */ - acc.v *= vec_set1( const_G ) * ir.v * ir.v * ir.v; - - /* Aggregate the accellerations. 
*/ - for ( k = 0 ; k < 3 ; k++ ) { - w.v = acc.v * dx[k].v; - ai.v = w.v * mj.v; - aj.v = w.v * mi.v; - for ( j = 0 ; j < VEC_SIZE ; j++ ) { - pi[j]->a[k] -= ai.f[j]; - pj[j]->a[k] += aj.f[j]; - } - } + vector ir, r, r2, dx[3]; + vector w, acc, ai, aj; + vector mi, mj; + int j, k; + +#if VEC_SIZE == 8 + mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass, + pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass); + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass, + pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass); + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k], + Dx[15 + k], Dx[18 + k], Dx[21 + k]); +#elif VEC_SIZE == 4 + mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass); + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass); + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]); +#endif + + /* Get the radius and inverse radius. */ + r2.v = vec_load(R2); + ir.v = vec_rsqrt(r2.v); + ir.v = ir.v - vec_set1(0.5f) * ir.v * (r2.v * ir.v * ir.v - vec_set1(1.0f)); + r.v = r2.v * ir.v; + + /* Evaluate the gravity kernel. */ + blender_eval_vec(&r, &acc); + + /* Scale the acceleration. */ + acc.v *= vec_set1(const_G) * ir.v * ir.v * ir.v; + + /* Aggregate the accellerations. 
*/ + for (k = 0; k < 3; k++) { + w.v = acc.v * dx[k].v; + ai.v = w.v * mj.v; + aj.v = w.v * mi.v; + for (j = 0; j < VEC_SIZE; j++) { + pi[j]->a[k] -= ai.f[j]; + pj[j]->a[k] += aj.f[j]; + } + } #else - for ( int k = 0 ; k < VEC_SIZE ; k++ ) - runner_iact_grav( R2[k] , &Dx[3*k] , pi[k] , pj[k] ); - + for (int k = 0; k < VEC_SIZE; k++) + runner_iact_grav(R2[k], &Dx[3 * k], pi[k], pj[k]); + #endif - - } - +} +#endif /* SWIFT_RUNNER_IACT_GRAV_H */ diff --git a/src/runner_iact_legacy.h b/src/runner_iact_legacy.h index aa50cc1fe2c09fa558a21eaf8b9079ffc08b6cbb..3f5df4cd40668862a2e2c0a01c5b28069f184377 100644 --- a/src/runner_iact_legacy.h +++ b/src/runner_iact_legacy.h @@ -2,809 +2,900 @@ * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) * Matthieu Schaller (matthieu.schaller@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_RUNNER_IACT_LEGACY_H +#define SWIFT_RUNNER_IACT_LEGACY_H +/* Includes. */ +#include "const.h" #include "kernel.h" +#include "part.h" #include "vector.h" /** * @file runner_iact_legacy.h * @brief SPH interaction functions following the Gadget-2 version of SPH. 
* - * The interactions computed here are the ones presented in the Gadget-2 paper and use the same - * numerical coefficients as the Gadget-2 code. When used with the Spline-3 kernel, the results - * should be equivalent to the ones obtained with Gadget-2 up to the rounding errors and interactions + * The interactions computed here are the ones presented in the Gadget-2 paper + *and use the same + * numerical coefficients as the Gadget-2 code. When used with the Spline-3 + *kernel, the results + * should be equivalent to the ones obtained with Gadget-2 up to the rounding + *errors and interactions * missed by the Gadget-2 tree-code neighbours search. * - * The code uses internal energy instead of entropy as a thermodynamical variable. + * The code uses internal energy instead of entropy as a thermodynamical + *variable. */ - /** * @brief Density loop */ -__attribute__ ((always_inline)) INLINE static void runner_iact_density ( float r2 , float *dx , float hi , float hj , struct part *pi , struct part *pj ) { - - float r = sqrtf( r2 ), ri = 1.0f / r; - float xi, xj; - float h_inv; - float wi, wj, wi_dx, wj_dx; - float mi, mj; - float dvdr; - float dv[3], curlvr[3]; - int k; - - /* Get the masses. */ - mi = pi->mass; mj = pj->mass; - - /* Compute dv dot r */ - dv[0] = pi->v[0] - pj->v[0]; - dv[1] = pi->v[1] - pj->v[1]; - dv[2] = pi->v[2] - pj->v[2]; - dvdr = dv[0]*dx[0] + dv[1]*dx[1] + dv[2]*dx[2]; - dvdr *= ri; - - /* Compute dv cross r */ - curlvr[0] = dv[1]*dx[2] - dv[2]*dx[1]; - curlvr[1] = dv[2]*dx[0] - dv[0]*dx[2]; - curlvr[2] = dv[0]*dx[1] - dv[1]*dx[0]; - for ( k = 0 ; k < 3 ; k++ ) - curlvr[k] *= ri; - - /* Compute density of pi. 
*/ - h_inv = 1.0 / hi; - xi = r * h_inv; - kernel_deval( xi , &wi , &wi_dx ); - - pi->rho += mj * wi; - pi->rho_dh -= mj * ( 3.0*wi + xi*wi_dx ); - pi->density.wcount += wi; - pi->density.wcount_dh -= xi * wi_dx; - - pi->density.div_v += mj * dvdr * wi_dx; - for ( k = 0 ; k < 3 ; k++ ) - pi->density.curl_v[k] += mj * curlvr[k] * wi_dx; - - /* Compute density of pj. */ - h_inv = 1.0 / hj; - xj = r * h_inv; - kernel_deval( xj , &wj , &wj_dx ); - - pj->rho += mi * wj; - pj->rho_dh -= mi * ( 3.0*wj + xj*wj_dx ); - pj->density.wcount += wj; - pj->density.wcount_dh -= xj * wj_dx; - - pj->density.div_v += mi * dvdr * wj_dx; - for ( k = 0 ; k < 3 ; k++ ) - pj->density.curl_v[k] += mi * curlvr[k] * wj_dx; - - } - +__attribute__((always_inline)) INLINE static void runner_iact_density( + float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { + + float r = sqrtf(r2), ri = 1.0f / r; + float xi, xj; + float h_inv; + float wi, wj, wi_dx, wj_dx; + float mi, mj; + float dvdr; + float dv[3], curlvr[3]; + int k; + + /* Get the masses. */ + mi = pi->mass; + mj = pj->mass; + + /* Compute dv dot r */ + dv[0] = pi->v[0] - pj->v[0]; + dv[1] = pi->v[1] - pj->v[1]; + dv[2] = pi->v[2] - pj->v[2]; + dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2]; + dvdr *= ri; + + /* Compute dv cross r */ + curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1]; + curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2]; + curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0]; + for (k = 0; k < 3; k++) curlvr[k] *= ri; + + /* Compute density of pi. */ + h_inv = 1.0 / hi; + xi = r * h_inv; + kernel_deval(xi, &wi, &wi_dx); + + pi->rho += mj * wi; + pi->rho_dh -= mj * (3.0 * wi + xi * wi_dx); + pi->density.wcount += wi; + pi->density.wcount_dh -= xi * wi_dx; + + pi->density.div_v += mj * dvdr * wi_dx; + for (k = 0; k < 3; k++) pi->density.curl_v[k] += mj * curlvr[k] * wi_dx; + + /* Compute density of pj. 
*/ + h_inv = 1.0 / hj; + xj = r * h_inv; + kernel_deval(xj, &wj, &wj_dx); + + pj->rho += mi * wj; + pj->rho_dh -= mi * (3.0 * wj + xj * wj_dx); + pj->density.wcount += wj; + pj->density.wcount_dh -= xj * wj_dx; + + pj->density.div_v += mi * dvdr * wj_dx; + for (k = 0; k < 3; k++) pj->density.curl_v[k] += mi * curlvr[k] * wj_dx; +} + /** * @brief Density loop (Vectorized version) */ -__attribute__ ((always_inline)) INLINE static void runner_iact_vec_density ( float *R2 , float *Dx , float *Hi , float *Hj , struct part **pi , struct part **pj ) { +__attribute__((always_inline)) INLINE static void runner_iact_vec_density( + float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, + struct part **pj) { #ifdef VECTORIZE - vector r, r2, ri, xi, xj, hi, hj, hi_inv, hj_inv, wi, wj, wi_dx, wj_dx; - vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh; - vector mi, mj; - vector dx[3], dv[3]; - vector vi[3], vj[3]; - vector dvdr, div_vi, div_vj; - vector curlvr[3], curl_vi[3], curl_vj[3]; - int k, j; - - #if VEC_SIZE==8 - mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass , pi[4]->mass , pi[5]->mass , pi[6]->mass , pi[7]->mass ); - mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass ); - for ( k = 0 ; k < 3 ; k++ ) { - vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] , pi[4]->v[k] , pi[5]->v[k] , pi[6]->v[k] , pi[7]->v[k] ); - vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] , pj[4]->v[k] , pj[5]->v[k] , pj[6]->v[k] , pj[7]->v[k] ); - } - for ( k = 0 ; k < 3 ; k++ ) - dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] , Dx[12+k] , Dx[15+k] , Dx[18+k] , Dx[21+k] ); - #elif VEC_SIZE==4 - mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass ); - mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass ); - for ( k = 0 ; k < 3 ; k++ ) { - vi[k].v = vec_set( pi[0]->v[k] , 
pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] ); - vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] ); - } - for ( k = 0 ; k < 3 ; k++ ) - dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] ); - #endif - - /* Get the radius and inverse radius. */ - r2.v = vec_load( R2 ); - ri.v = vec_rsqrt( r2.v ); - ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) ); - r.v = r2.v * ri.v; - - hi.v = vec_load( Hi ); - hi_inv.v = vec_rcp( hi.v ); - hi_inv.v = hi_inv.v - hi_inv.v * ( hi_inv.v * hi.v - vec_set1( 1.0f ) ); - xi.v = r.v * hi_inv.v; - - hj.v = vec_load( Hj ); - hj_inv.v = vec_rcp( hj.v ); - hj_inv.v = hj_inv.v - hj_inv.v * ( hj_inv.v * hj.v - vec_set1( 1.0f ) ); - xj.v = r.v * hj_inv.v; - - kernel_deval_vec( &xi , &wi , &wi_dx ); - kernel_deval_vec( &xj , &wj , &wj_dx ); - - /* Compute dv. */ - dv[0].v = vi[0].v - vj[0].v; - dv[1].v = vi[1].v - vj[1].v; - dv[2].v = vi[2].v - vj[2].v; - - /* Compute dv dot r */ - dvdr.v = ( dv[0].v * dx[0].v ) + ( dv[1].v * dx[1].v ) + ( dv[2].v * dx[2].v ); - dvdr.v = dvdr.v * ri.v; - - /* Compute dv cross r */ - curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v; - curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v; - curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v; - for ( k = 0 ; k < 3 ; k++ ) - curlvr[k].v *= ri.v; - - rhoi.v = mj.v * wi.v; - rhoi_dh.v = mj.v * ( vec_set1( 3.0f ) * wi.v + xi.v * wi_dx.v ); - wcounti.v = wi.v; - wcounti_dh.v = xi.v * wi_dx.v; - div_vi.v = mj.v * dvdr.v * wi_dx.v; - for ( k = 0 ; k < 3 ; k++ ) - curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v; - - rhoj.v = mi.v * wj.v; - rhoj_dh.v = mi.v * ( vec_set1( 3.0f ) * wj.v + xj.v * wj_dx.v ); - wcountj.v = wj.v; - wcountj_dh.v = xj.v * wj_dx.v; - div_vj.v = mi.v * dvdr.v * wj_dx.v; - for ( k = 0 ; k < 3 ; k++ ) - curl_vj[k].v = mi.v * curlvr[k].v * wj_dx.v; - - - for ( k = 0 ; k < VEC_SIZE ; k++ ) { - pi[k]->rho += rhoi.f[k]; - pi[k]->rho_dh -= rhoi_dh.f[k]; - pi[k]->density.wcount += wcounti.f[k]; - 
pi[k]->density.wcount_dh -= wcounti_dh.f[k]; - pi[k]->density.div_v += div_vi.f[k]; - for( j = 0 ; j < 3 ; j++ ) - pi[k]->density.curl_v[j] += curl_vi[j].f[k]; - pj[k]->rho += rhoj.f[k]; - pj[k]->rho_dh -= rhoj_dh.f[k]; - pj[k]->density.wcount += wcountj.f[k]; - pj[k]->density.wcount_dh -= wcountj_dh.f[k]; - pj[k]->density.div_v += div_vj.f[k]; - for( j = 0 ; j < 3 ; j++ ) - pj[k]->density.curl_v[j] += curl_vj[j].f[k]; - } - + vector r, r2, ri, xi, xj, hi, hj, hi_inv, hj_inv, wi, wj, wi_dx, wj_dx; + vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh; + vector mi, mj; + vector dx[3], dv[3]; + vector vi[3], vj[3]; + vector dvdr, div_vi, div_vj; + vector curlvr[3], curl_vi[3], curl_vj[3]; + int k, j; + +#if VEC_SIZE == 8 + mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass, + pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass); + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass, + pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass); + for (k = 0; k < 3; k++) { + vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k], + pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]); + vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k], + pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]); + } + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k], + Dx[15 + k], Dx[18 + k], Dx[21 + k]); +#elif VEC_SIZE == 4 + mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass); + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass); + for (k = 0; k < 3; k++) { + vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]); + vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]); + } + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]); +#endif + + /* Get the radius and inverse radius. 
*/ + r2.v = vec_load(R2); + ri.v = vec_rsqrt(r2.v); + ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f)); + r.v = r2.v * ri.v; + + hi.v = vec_load(Hi); + hi_inv.v = vec_rcp(hi.v); + hi_inv.v = hi_inv.v - hi_inv.v * (hi_inv.v * hi.v - vec_set1(1.0f)); + xi.v = r.v * hi_inv.v; + + hj.v = vec_load(Hj); + hj_inv.v = vec_rcp(hj.v); + hj_inv.v = hj_inv.v - hj_inv.v * (hj_inv.v * hj.v - vec_set1(1.0f)); + xj.v = r.v * hj_inv.v; + + kernel_deval_vec(&xi, &wi, &wi_dx); + kernel_deval_vec(&xj, &wj, &wj_dx); + + /* Compute dv. */ + dv[0].v = vi[0].v - vj[0].v; + dv[1].v = vi[1].v - vj[1].v; + dv[2].v = vi[2].v - vj[2].v; + + /* Compute dv dot r */ + dvdr.v = (dv[0].v * dx[0].v) + (dv[1].v * dx[1].v) + (dv[2].v * dx[2].v); + dvdr.v = dvdr.v * ri.v; + + /* Compute dv cross r */ + curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v; + curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v; + curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v; + for (k = 0; k < 3; k++) curlvr[k].v *= ri.v; + + rhoi.v = mj.v * wi.v; + rhoi_dh.v = mj.v * (vec_set1(3.0f) * wi.v + xi.v * wi_dx.v); + wcounti.v = wi.v; + wcounti_dh.v = xi.v * wi_dx.v; + div_vi.v = mj.v * dvdr.v * wi_dx.v; + for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v; + + rhoj.v = mi.v * wj.v; + rhoj_dh.v = mi.v * (vec_set1(3.0f) * wj.v + xj.v * wj_dx.v); + wcountj.v = wj.v; + wcountj_dh.v = xj.v * wj_dx.v; + div_vj.v = mi.v * dvdr.v * wj_dx.v; + for (k = 0; k < 3; k++) curl_vj[k].v = mi.v * curlvr[k].v * wj_dx.v; + + for (k = 0; k < VEC_SIZE; k++) { + pi[k]->rho += rhoi.f[k]; + pi[k]->rho_dh -= rhoi_dh.f[k]; + pi[k]->density.wcount += wcounti.f[k]; + pi[k]->density.wcount_dh -= wcounti_dh.f[k]; + pi[k]->density.div_v += div_vi.f[k]; + for (j = 0; j < 3; j++) pi[k]->density.curl_v[j] += curl_vi[j].f[k]; + pj[k]->rho += rhoj.f[k]; + pj[k]->rho_dh -= rhoj_dh.f[k]; + pj[k]->density.wcount += wcountj.f[k]; + pj[k]->density.wcount_dh -= wcountj_dh.f[k]; + pj[k]->density.div_v += div_vj.f[k]; + 
for (j = 0; j < 3; j++) pj[k]->density.curl_v[j] += curl_vj[j].f[k]; + } + #else - for ( int k = 0 ; k < VEC_SIZE ; k++ ) - runner_iact_density( R2[k] , &Dx[3*k] , Hi[k] , Hj[k] , pi[k] , pj[k] ); - -#endif - - } - + for (int k = 0; k < VEC_SIZE; k++) + runner_iact_density(R2[k], &Dx[3 * k], Hi[k], Hj[k], pi[k], pj[k]); +#endif +} /** * @brief Density loop (non-symmetric version) */ -__attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_density ( float r2 , float *dx , float hi , float hj , struct part *pi , struct part *pj ) { - - float r, ri; - float xi; - float h_inv; - float wi, wi_dx; - float mj; - float dvdr; - float dv[3], curlvr[3]; - int k; - - /* Get the masses. */ - mj = pj->mass; - - /* Get r and r inverse. */ - r = sqrtf( r2 ); - ri = 1.0f / r; - - /* Compute dv dot r */ - dv[0] = pi->v[0] - pj->v[0]; - dv[1] = pi->v[1] - pj->v[1]; - dv[2] = pi->v[2] - pj->v[2]; - dvdr = dv[0]*dx[0] + dv[1]*dx[1] + dv[2]*dx[2]; - dvdr *= ri; - - /* Compute dv cross r */ - curlvr[0] = dv[1]*dx[2] - dv[2]*dx[1]; - curlvr[1] = dv[2]*dx[0] - dv[0]*dx[2]; - curlvr[2] = dv[0]*dx[1] - dv[1]*dx[0]; - for ( k = 0 ; k < 3 ; k++ ) - curlvr[k] *= ri; - - h_inv = 1.0 / hi; - xi = r * h_inv; - kernel_deval( xi , &wi , &wi_dx ); - - pi->rho += mj * wi; - pi->rho_dh -= mj * ( 3.0*wi + xi*wi_dx ); - pi->density.wcount += wi; - pi->density.wcount_dh -= xi * wi_dx; - - pi->density.div_v += mj * dvdr * wi_dx; - for ( k = 0 ; k < 3 ; k++ ) - pi->density.curl_v[k] += mj * curlvr[k] * wi_dx; - - } - +__attribute__((always_inline)) INLINE static void runner_iact_nonsym_density( + float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { + + float r, ri; + float xi; + float h_inv; + float wi, wi_dx; + float mj; + float dvdr; + float dv[3], curlvr[3]; + int k; + + /* Get the masses. */ + mj = pj->mass; + + /* Get r and r inverse. 
*/ + r = sqrtf(r2); + ri = 1.0f / r; + + /* Compute dv dot r */ + dv[0] = pi->v[0] - pj->v[0]; + dv[1] = pi->v[1] - pj->v[1]; + dv[2] = pi->v[2] - pj->v[2]; + dvdr = dv[0] * dx[0] + dv[1] * dx[1] + dv[2] * dx[2]; + dvdr *= ri; + + /* Compute dv cross r */ + curlvr[0] = dv[1] * dx[2] - dv[2] * dx[1]; + curlvr[1] = dv[2] * dx[0] - dv[0] * dx[2]; + curlvr[2] = dv[0] * dx[1] - dv[1] * dx[0]; + for (k = 0; k < 3; k++) curlvr[k] *= ri; + + h_inv = 1.0 / hi; + xi = r * h_inv; + kernel_deval(xi, &wi, &wi_dx); + + pi->rho += mj * wi; + pi->rho_dh -= mj * (3.0 * wi + xi * wi_dx); + pi->density.wcount += wi; + pi->density.wcount_dh -= xi * wi_dx; + + pi->density.div_v += mj * dvdr * wi_dx; + for (k = 0; k < 3; k++) pi->density.curl_v[k] += mj * curlvr[k] * wi_dx; +} + /** * @brief Density loop (non-symmetric vectorized version) */ -__attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_density ( float *R2 , float *Dx , float *Hi , float *Hj , struct part **pi , struct part **pj ) { +__attribute__((always_inline)) + INLINE static void runner_iact_nonsym_vec_density(float *R2, float *Dx, + float *Hi, float *Hj, + struct part **pi, + struct part **pj) { #ifdef VECTORIZE - vector r, r2, ri, xi, hi, hi_inv, wi, wi_dx; - vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi; - vector mj; - vector dx[3], dv[3]; - vector vi[3], vj[3]; - vector dvdr; - vector curlvr[3], curl_vi[3]; - int k, j; - - #if VEC_SIZE==8 - mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass ); - for ( k = 0 ; k < 3 ; k++ ) { - vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] , pi[4]->v[k] , pi[5]->v[k] , pi[6]->v[k] , pi[7]->v[k] ); - vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] , pj[4]->v[k] , pj[5]->v[k] , pj[6]->v[k] , pj[7]->v[k] ); - } - for ( k = 0 ; k < 3 ; k++ ) - dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] , Dx[12+k] , Dx[15+k] , Dx[18+k] , Dx[21+k] 
); - #elif VEC_SIZE==4 - mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass ); - for ( k = 0 ; k < 3 ; k++ ) { - vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] ); - vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] ); - } - for ( k = 0 ; k < 3 ; k++ ) - dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] ); - #endif - - /* Get the radius and inverse radius. */ - r2.v = vec_load( R2 ); - ri.v = vec_rsqrt( r2.v ); - ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) ); - r.v = r2.v * ri.v; - - hi.v = vec_load( Hi ); - hi_inv.v = vec_rcp( hi.v ); - hi_inv.v = hi_inv.v - hi_inv.v * ( hi_inv.v * hi.v - vec_set1( 1.0f ) ); - xi.v = r.v * hi_inv.v; - - kernel_deval_vec( &xi , &wi , &wi_dx ); - - /* Compute dv. */ - dv[0].v = vi[0].v - vj[0].v; - dv[1].v = vi[1].v - vj[1].v; - dv[2].v = vi[2].v - vj[2].v; - - /* Compute dv dot r */ - dvdr.v = ( dv[0].v * dx[0].v ) + ( dv[1].v * dx[1].v ) + ( dv[2].v * dx[2].v ); - dvdr.v = dvdr.v * ri.v; - - /* Compute dv cross r */ - curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v; - curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v; - curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v; - for ( k = 0 ; k < 3 ; k++ ) - curlvr[k].v *= ri.v; - - rhoi.v = mj.v * wi.v; - rhoi_dh.v = mj.v * ( vec_set1( 3.0f ) * wi.v + xi.v * wi_dx.v ); - wcounti.v = wi.v; - wcounti_dh.v = xi.v * wi_dx.v; - div_vi.v = mj.v * dvdr.v * wi_dx.v; - for ( k = 0 ; k < 3 ; k++ ) - curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v; - - for ( k = 0 ; k < VEC_SIZE ; k++ ) { - pi[k]->rho += rhoi.f[k]; - pi[k]->rho_dh -= rhoi_dh.f[k]; - pi[k]->density.wcount += wcounti.f[k]; - pi[k]->density.wcount_dh -= wcounti_dh.f[k]; - pi[k]->density.div_v += div_vi.f[k]; - for( j = 0 ; j < 3 ; j++ ) - pi[k]->density.curl_v[j] += curl_vi[j].f[k]; - } - + vector r, r2, ri, xi, hi, hi_inv, wi, wi_dx; + vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi; + vector mj; + vector dx[3], dv[3]; 
+ vector vi[3], vj[3]; + vector dvdr; + vector curlvr[3], curl_vi[3]; + int k, j; + +#if VEC_SIZE == 8 + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass, + pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass); + for (k = 0; k < 3; k++) { + vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k], + pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]); + vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k], + pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]); + } + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k], + Dx[15 + k], Dx[18 + k], Dx[21 + k]); +#elif VEC_SIZE == 4 + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass); + for (k = 0; k < 3; k++) { + vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]); + vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]); + } + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]); +#endif + + /* Get the radius and inverse radius. */ + r2.v = vec_load(R2); + ri.v = vec_rsqrt(r2.v); + ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f)); + r.v = r2.v * ri.v; + + hi.v = vec_load(Hi); + hi_inv.v = vec_rcp(hi.v); + hi_inv.v = hi_inv.v - hi_inv.v * (hi_inv.v * hi.v - vec_set1(1.0f)); + xi.v = r.v * hi_inv.v; + + kernel_deval_vec(&xi, &wi, &wi_dx); + + /* Compute dv. 
*/ + dv[0].v = vi[0].v - vj[0].v; + dv[1].v = vi[1].v - vj[1].v; + dv[2].v = vi[2].v - vj[2].v; + + /* Compute dv dot r */ + dvdr.v = (dv[0].v * dx[0].v) + (dv[1].v * dx[1].v) + (dv[2].v * dx[2].v); + dvdr.v = dvdr.v * ri.v; + + /* Compute dv cross r */ + curlvr[0].v = dv[1].v * dx[2].v - dv[2].v * dx[1].v; + curlvr[1].v = dv[2].v * dx[0].v - dv[0].v * dx[2].v; + curlvr[2].v = dv[0].v * dx[1].v - dv[1].v * dx[0].v; + for (k = 0; k < 3; k++) curlvr[k].v *= ri.v; + + rhoi.v = mj.v * wi.v; + rhoi_dh.v = mj.v * (vec_set1(3.0f) * wi.v + xi.v * wi_dx.v); + wcounti.v = wi.v; + wcounti_dh.v = xi.v * wi_dx.v; + div_vi.v = mj.v * dvdr.v * wi_dx.v; + for (k = 0; k < 3; k++) curl_vi[k].v = mj.v * curlvr[k].v * wi_dx.v; + + for (k = 0; k < VEC_SIZE; k++) { + pi[k]->rho += rhoi.f[k]; + pi[k]->rho_dh -= rhoi_dh.f[k]; + pi[k]->density.wcount += wcounti.f[k]; + pi[k]->density.wcount_dh -= wcounti_dh.f[k]; + pi[k]->density.div_v += div_vi.f[k]; + for (j = 0; j < 3; j++) pi[k]->density.curl_v[j] += curl_vi[j].f[k]; + } + #else - for ( int k = 0 ; k < VEC_SIZE ; k++ ) - runner_iact_nonsym_density( R2[k] , &Dx[3*k] , Hi[k] , Hj[k] , pi[k] , pj[k] ); + for (int k = 0; k < VEC_SIZE; k++) + runner_iact_nonsym_density(R2[k], &Dx[3 * k], Hi[k], Hj[k], pi[k], pj[k]); #endif - - } - +} /** * @brief Force loop */ -__attribute__ ((always_inline)) INLINE static void runner_iact_force ( float r2 , float *dx , float hi , float hj , struct part *pi , struct part *pj ) { - - float r = sqrtf( r2 ), ri = 1.0f / r; - float xi, xj; - float hi_inv, hi2_inv; - float hj_inv, hj2_inv; - float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr; - float mi, mj, POrho2i, POrho2j, rhoi, rhoj; - float v_sig, omega_ij, Pi_ij; - // float dt_max; - float f; - int k; - - /* Get some values in local variables. */ - mi = pi->mass; mj = pj->mass; - rhoi = pi->rho; rhoj = pj->rho; - POrho2i = pi->force.POrho2; - POrho2j = pj->force.POrho2; - - /* Get the kernel for hi. 
*/ - hi_inv = 1.0f / hi; - hi2_inv = hi_inv * hi_inv; - xi = r * hi_inv; - kernel_deval( xi , &wi , &wi_dx ); - wi_dr = hi2_inv * hi2_inv * wi_dx; - - /* Get the kernel for hj. */ - hj_inv = 1.0f / hj; - hj2_inv = hj_inv * hj_inv; - xj = r * hj_inv; - kernel_deval( xj , &wj , &wj_dx ); - wj_dr = hj2_inv * hj2_inv * wj_dx; - - /* Compute dv dot r. */ - dvdr = ( pi->v[0] - pj->v[0] ) * dx[0] + ( pi->v[1] - pj->v[1] ) * dx[1] + ( pi->v[2] - pj->v[2] ) * dx[2]; - dvdr *= ri; - - /* Compute the relative velocity. (This is 0 if the particles move away from each other and negative otherwise) */ - omega_ij = fminf( dvdr , 0.f ); - - /* Compute signal velocity */ - v_sig = pi->force.c + pj->force.c - 3.0f*omega_ij; - - /* Compute viscosity tensor */ - Pi_ij = -const_viscosity_alpha * v_sig * omega_ij / ( rhoi + rhoj ); - - /* Apply balsara switch */ - Pi_ij *= ( pi->force.balsara + pj->force.balsara ); - - /* Get the common factor out. */ - w = ri * ( ( POrho2i * wi_dr + POrho2j * wj_dr ) + 0.25f * Pi_ij * ( wi_dr + wj_dr ) ); - - /* Use the force, Luke! */ - for ( k = 0 ; k < 3 ; k++ ) { - f = dx[k] * w; - pi->a[k] -= mj * f; - pj->a[k] += mi * f; - } - - /* Get the time derivative for u. */ - pi->force.u_dt += mj * dvdr * ( POrho2i * wi_dr + 0.125f * Pi_ij * ( wi_dr + wj_dr ) ); - pj->force.u_dt += mi * dvdr * ( POrho2j * wj_dr + 0.125f * Pi_ij * ( wi_dr + wj_dr ) ); - - /* Get the time derivative for h. */ - pi->force.h_dt -= mj * dvdr / rhoj * wi_dr; - pj->force.h_dt -= mi * dvdr / rhoi * wj_dr; - - /* Update the signal velocity. 
*/ - pi->force.v_sig = fmaxf( pi->force.v_sig , v_sig ); - pj->force.v_sig = fmaxf( pj->force.v_sig , v_sig ); - - } - +__attribute__((always_inline)) INLINE static void runner_iact_force( + float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { + + float r = sqrtf(r2), ri = 1.0f / r; + float xi, xj; + float hi_inv, hi2_inv; + float hj_inv, hj2_inv; + float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr; + float mi, mj, POrho2i, POrho2j, rhoi, rhoj; + float v_sig, omega_ij, Pi_ij; + // float dt_max; + float f; + int k; + + /* Get some values in local variables. */ + mi = pi->mass; + mj = pj->mass; + rhoi = pi->rho; + rhoj = pj->rho; + POrho2i = pi->force.POrho2; + POrho2j = pj->force.POrho2; + + /* Get the kernel for hi. */ + hi_inv = 1.0f / hi; + hi2_inv = hi_inv * hi_inv; + xi = r * hi_inv; + kernel_deval(xi, &wi, &wi_dx); + wi_dr = hi2_inv * hi2_inv * wi_dx; + + /* Get the kernel for hj. */ + hj_inv = 1.0f / hj; + hj2_inv = hj_inv * hj_inv; + xj = r * hj_inv; + kernel_deval(xj, &wj, &wj_dx); + wj_dr = hj2_inv * hj2_inv * wj_dx; + + /* Compute dv dot r. */ + dvdr = (pi->v[0] - pj->v[0]) * dx[0] + (pi->v[1] - pj->v[1]) * dx[1] + + (pi->v[2] - pj->v[2]) * dx[2]; + dvdr *= ri; + + /* Compute the relative velocity. (This is 0 if the particles move away from + * each other and negative otherwise) */ + omega_ij = fminf(dvdr, 0.f); + + /* Compute signal velocity */ + v_sig = pi->force.c + pj->force.c - 3.0f * omega_ij; + + /* Compute viscosity tensor */ + Pi_ij = -const_viscosity_alpha * v_sig * omega_ij / (rhoi + rhoj); + + /* Apply balsara switch */ + Pi_ij *= (pi->force.balsara + pj->force.balsara); + + /* Get the common factor out. */ + w = ri * + ((POrho2i * wi_dr + POrho2j * wj_dr) + 0.25f * Pi_ij * (wi_dr + wj_dr)); + + /* Use the force, Luke! */ + for (k = 0; k < 3; k++) { + f = dx[k] * w; + pi->a[k] -= mj * f; + pj->a[k] += mi * f; + } + + /* Get the time derivative for u. 
*/ + pi->force.u_dt += + mj * dvdr * (POrho2i * wi_dr + 0.125f * Pi_ij * (wi_dr + wj_dr)); + pj->force.u_dt += + mi * dvdr * (POrho2j * wj_dr + 0.125f * Pi_ij * (wi_dr + wj_dr)); + + /* Get the time derivative for h. */ + pi->force.h_dt -= mj * dvdr / rhoj * wi_dr; + pj->force.h_dt -= mi * dvdr / rhoi * wj_dr; + + /* Update the signal velocity. */ + pi->force.v_sig = fmaxf(pi->force.v_sig, v_sig); + pj->force.v_sig = fmaxf(pj->force.v_sig, v_sig); +} /** * @brief Force loop (Vectorized version) */ -__attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float *R2 , float *Dx , float *Hi , float *Hj , struct part **pi , struct part **pj ) { +__attribute__((always_inline)) INLINE static void runner_iact_vec_force( + float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, + struct part **pj) { #ifdef VECTORIZE - vector r, r2, ri; - vector xi, xj; - vector hi, hj, hi_inv, hj_inv; - vector hi2_inv, hj2_inv; - vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr; - vector w; - vector piPOrho2, pjPOrho2, pirho, pjrho; - vector mi, mj; - vector f; - vector dx[3]; - vector vi[3], vj[3]; - vector pia[3], pja[3]; - vector piu_dt, pju_dt; - vector pih_dt, pjh_dt; - vector ci, cj, v_sig, vi_sig, vj_sig; - vector omega_ij, Pi_ij, balsara; - int j, k; - - /* Load stuff. 
*/ - #if VEC_SIZE==8 - mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass , pi[4]->mass , pi[5]->mass , pi[6]->mass , pi[7]->mass ); - mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass ); - piPOrho2.v = vec_set( pi[0]->force.POrho2 , pi[1]->force.POrho2 , pi[2]->force.POrho2 , pi[3]->force.POrho2 , pi[4]->force.POrho2 , pi[5]->force.POrho2 , pi[6]->force.POrho2 , pi[7]->force.POrho2 ); - pjPOrho2.v = vec_set( pj[0]->force.POrho2 , pj[1]->force.POrho2 , pj[2]->force.POrho2 , pj[3]->force.POrho2 , pj[4]->force.POrho2 , pj[5]->force.POrho2 , pj[6]->force.POrho2 , pj[7]->force.POrho2 ); - pirho.v = vec_set( pi[0]->rho , pi[1]->rho , pi[2]->rho , pi[3]->rho , pi[4]->rho , pi[5]->rho , pi[6]->rho , pi[7]->rho ); - pjrho.v = vec_set( pj[0]->rho , pj[1]->rho , pj[2]->rho , pj[3]->rho , pj[4]->rho , pj[5]->rho , pj[6]->rho , pj[7]->rho ); - ci.v = vec_set( pi[0]->force.c , pi[1]->force.c , pi[2]->force.c , pi[3]->force.c , pi[4]->force.c , pi[5]->force.c , pi[6]->force.c , pi[7]->force.c ); - cj.v = vec_set( pj[0]->force.c , pj[1]->force.c , pj[2]->force.c , pj[3]->force.c , pj[4]->force.c , pj[5]->force.c , pj[6]->force.c , pj[7]->force.c ); - vi_sig.v = vec_set( pi[0]->force.v_sig , pi[1]->force.v_sig , pi[2]->force.v_sig , pi[3]->force.v_sig , pi[4]->force.v_sig , pi[5]->force.v_sig , pi[6]->force.v_sig , pi[7]->force.v_sig ); - vj_sig.v = vec_set( pj[0]->force.v_sig , pj[1]->force.v_sig , pj[2]->force.v_sig , pj[3]->force.v_sig , pj[4]->force.v_sig , pj[5]->force.v_sig , pj[6]->force.v_sig , pj[7]->force.v_sig ); - for ( k = 0 ; k < 3 ; k++ ) { - vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] , pi[4]->v[k] , pi[5]->v[k] , pi[6]->v[k] , pi[7]->v[k] ); - vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] , pj[4]->v[k] , pj[5]->v[k] , pj[6]->v[k] , pj[7]->v[k] ); - } - for ( k = 0 ; k < 3 ; k++ ) - dx[k].v = vec_set( Dx[0+k] , 
Dx[3+k] , Dx[6+k] , Dx[9+k] , Dx[12+k] , Dx[15+k] , Dx[18+k] , Dx[21+k] ); - balsara.v = vec_set( pi[0]->force.balsara , pi[1]->force.balsara , pi[2]->force.balsara , pi[3]->force.balsara , pi[4]->force.balsara , pi[5]->force.balsara , pi[6]->force.balsara , pi[7]->force.balsara ) + - vec_set( pj[0]->force.balsara , pj[1]->force.balsara , pj[2]->force.balsara , pj[3]->force.balsara , pj[4]->force.balsara , pj[5]->force.balsara , pj[6]->force.balsara , pj[7]->force.balsara ); - #elif VEC_SIZE==4 - mi.v = vec_set( pi[0]->mass , pi[1]->mass , pi[2]->mass , pi[3]->mass ); - mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass ); - piPOrho2.v = vec_set( pi[0]->force.POrho2 , pi[1]->force.POrho2 , pi[2]->force.POrho2 , pi[3]->force.POrho2 ); - pjPOrho2.v = vec_set( pj[0]->force.POrho2 , pj[1]->force.POrho2 , pj[2]->force.POrho2 , pj[3]->force.POrho2 ); - pirho.v = vec_set( pi[0]->rho , pi[1]->rho , pi[2]->rho , pi[3]->rho ); - pjrho.v = vec_set( pj[0]->rho , pj[1]->rho , pj[2]->rho , pj[3]->rho ); - ci.v = vec_set( pi[0]->force.c , pi[1]->force.c , pi[2]->force.c , pi[3]->force.c ); - cj.v = vec_set( pj[0]->force.c , pj[1]->force.c , pj[2]->force.c , pj[3]->force.c ); - vi_sig.v = vec_set( pi[0]->force.v_sig , pi[1]->force.v_sig , pi[2]->force.v_sig , pi[3]->force.v_sig ); - vj_sig.v = vec_set( pj[0]->force.v_sig , pj[1]->force.v_sig , pj[2]->force.v_sig , pj[3]->force.v_sig ); - for ( k = 0 ; k < 3 ; k++ ) { - vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] ); - vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] ); - } - for ( k = 0 ; k < 3 ; k++ ) - dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] ); - balsara.v = vec_set( pi[0]->force.balsara , pi[1]->force.balsara , pi[2]->force.balsara , pi[3]->force.balsara ) + - vec_set( pj[0]->force.balsara , pj[1]->force.balsara , pj[2]->force.balsara , pj[3]->force.balsara ); - #else - #error - #endif - - /* Get the radius and inverse radius. 
*/ - r2.v = vec_load( R2 ); - ri.v = vec_rsqrt( r2.v ); - ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) ); - r.v = r2.v * ri.v; - - /* Get the kernel for hi. */ - hi.v = vec_load( Hi ); - hi_inv.v = vec_rcp( hi.v ); - hi_inv.v = hi_inv.v - hi_inv.v * ( hi.v * hi_inv.v - vec_set1( 1.0f ) ); - hi2_inv.v = hi_inv.v * hi_inv.v; - xi.v = r.v * hi_inv.v; - kernel_deval_vec( &xi , &wi , &wi_dx ); - wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v; - - /* Get the kernel for hj. */ - hj.v = vec_load( Hj ); - hj_inv.v = vec_rcp( hj.v ); - hj_inv.v = hj_inv.v - hj_inv.v * ( hj.v * hj_inv.v - vec_set1( 1.0f ) ); - hj2_inv.v = hj_inv.v * hj_inv.v; - xj.v = r.v * hj_inv.v; - kernel_deval_vec( &xj , &wj , &wj_dx ); - wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v; - - /* Compute dv dot r. */ - dvdr.v = ( (vi[0].v - vj[0].v) * dx[0].v ) + ( (vi[1].v - vj[1].v) * dx[1].v ) + ( (vi[2].v - vj[2].v) * dx[2].v ); - dvdr.v = dvdr.v * ri.v; - - /* Get the time derivative for h. */ - pih_dt.v = mj.v / pjrho.v * dvdr.v * wi_dr.v; - pjh_dt.v = mi.v / pirho.v * dvdr.v * wj_dr.v; - - /* Compute the relative velocity. (This is 0 if the particles move away from each other and negative otherwise) */ - omega_ij.v = vec_fmin( dvdr.v , vec_set1( 0.0f ) ); - - /* Compute signal velocity */ - v_sig.v = ci.v + cj.v - vec_set1( 3.0f )*omega_ij.v; - - /* Compute viscosity tensor */ - Pi_ij.v = -balsara.v * vec_set1( const_viscosity_alpha ) * v_sig.v * omega_ij.v / (pirho.v + pjrho.v); - Pi_ij.v *= ( wi_dr.v + wj_dr.v ); - - /* Get the common factor out. */ - w.v = ri.v * ( ( piPOrho2.v * wi_dr.v + pjPOrho2.v * wj_dr.v ) + vec_set1( 0.25f ) * Pi_ij.v ); - - /* Use the force, Luke! */ - for ( k = 0 ; k < 3 ; k++ ) { - f.v = dx[k].v * w.v; - pia[k].v = mj.v * f.v; - pja[k].v = mi.v * f.v; - } - - /* Get the time derivative for u. 
*/ - piu_dt.v = mj.v * dvdr.v * ( piPOrho2.v * wi_dr.v + vec_set1( 0.125f ) * Pi_ij.v ); - pju_dt.v = mi.v * dvdr.v * ( pjPOrho2.v * wj_dr.v + vec_set1( 0.125f ) * Pi_ij.v ); - - /* compute the signal velocity (this is always symmetrical). */ - vi_sig.v = vec_fmax( vi_sig.v , v_sig.v ); - vj_sig.v = vec_fmax( vj_sig.v , v_sig.v ); - - /* Store the forces back on the particles. */ - for ( k = 0 ; k < VEC_SIZE ; k++ ) { - pi[k]->force.u_dt += piu_dt.f[k]; - pj[k]->force.u_dt += pju_dt.f[k]; - pi[k]->force.h_dt -= pih_dt.f[k]; - pj[k]->force.h_dt -= pjh_dt.f[k]; - pi[k]->force.v_sig = vi_sig.f[k]; - pj[k]->force.v_sig = vj_sig.f[k]; - for ( j = 0 ; j < 3 ; j++ ) { - pi[k]->a[j] -= pia[j].f[k]; - pj[k]->a[j] += pja[j].f[k]; - } - } - + vector r, r2, ri; + vector xi, xj; + vector hi, hj, hi_inv, hj_inv; + vector hi2_inv, hj2_inv; + vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr; + vector w; + vector piPOrho2, pjPOrho2, pirho, pjrho; + vector mi, mj; + vector f; + vector dx[3]; + vector vi[3], vj[3]; + vector pia[3], pja[3]; + vector piu_dt, pju_dt; + vector pih_dt, pjh_dt; + vector ci, cj, v_sig, vi_sig, vj_sig; + vector omega_ij, Pi_ij, balsara; + int j, k; + +/* Load stuff. 
*/ +#if VEC_SIZE == 8 + mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass, + pi[4]->mass, pi[5]->mass, pi[6]->mass, pi[7]->mass); + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass, + pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass); + piPOrho2.v = + vec_set(pi[0]->force.POrho2, pi[1]->force.POrho2, pi[2]->force.POrho2, + pi[3]->force.POrho2, pi[4]->force.POrho2, pi[5]->force.POrho2, + pi[6]->force.POrho2, pi[7]->force.POrho2); + pjPOrho2.v = + vec_set(pj[0]->force.POrho2, pj[1]->force.POrho2, pj[2]->force.POrho2, + pj[3]->force.POrho2, pj[4]->force.POrho2, pj[5]->force.POrho2, + pj[6]->force.POrho2, pj[7]->force.POrho2); + pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho, + pi[5]->rho, pi[6]->rho, pi[7]->rho); + pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho, + pj[5]->rho, pj[6]->rho, pj[7]->rho); + ci.v = + vec_set(pi[0]->force.c, pi[1]->force.c, pi[2]->force.c, pi[3]->force.c, + pi[4]->force.c, pi[5]->force.c, pi[6]->force.c, pi[7]->force.c); + cj.v = + vec_set(pj[0]->force.c, pj[1]->force.c, pj[2]->force.c, pj[3]->force.c, + pj[4]->force.c, pj[5]->force.c, pj[6]->force.c, pj[7]->force.c); + vi_sig.v = vec_set(pi[0]->force.v_sig, pi[1]->force.v_sig, pi[2]->force.v_sig, + pi[3]->force.v_sig, pi[4]->force.v_sig, pi[5]->force.v_sig, + pi[6]->force.v_sig, pi[7]->force.v_sig); + vj_sig.v = vec_set(pj[0]->force.v_sig, pj[1]->force.v_sig, pj[2]->force.v_sig, + pj[3]->force.v_sig, pj[4]->force.v_sig, pj[5]->force.v_sig, + pj[6]->force.v_sig, pj[7]->force.v_sig); + for (k = 0; k < 3; k++) { + vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k], + pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]); + vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k], + pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]); + } + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k], + Dx[15 + k], Dx[18 + k], Dx[21 + 
k]); + balsara.v = + vec_set(pi[0]->force.balsara, pi[1]->force.balsara, pi[2]->force.balsara, + pi[3]->force.balsara, pi[4]->force.balsara, pi[5]->force.balsara, + pi[6]->force.balsara, pi[7]->force.balsara) + + vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara, + pj[3]->force.balsara, pj[4]->force.balsara, pj[5]->force.balsara, + pj[6]->force.balsara, pj[7]->force.balsara); +#elif VEC_SIZE == 4 + mi.v = vec_set(pi[0]->mass, pi[1]->mass, pi[2]->mass, pi[3]->mass); + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass); + piPOrho2.v = vec_set(pi[0]->force.POrho2, pi[1]->force.POrho2, + pi[2]->force.POrho2, pi[3]->force.POrho2); + pjPOrho2.v = vec_set(pj[0]->force.POrho2, pj[1]->force.POrho2, + pj[2]->force.POrho2, pj[3]->force.POrho2); + pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho); + pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho); + ci.v = + vec_set(pi[0]->force.c, pi[1]->force.c, pi[2]->force.c, pi[3]->force.c); + cj.v = + vec_set(pj[0]->force.c, pj[1]->force.c, pj[2]->force.c, pj[3]->force.c); + vi_sig.v = vec_set(pi[0]->force.v_sig, pi[1]->force.v_sig, pi[2]->force.v_sig, + pi[3]->force.v_sig); + vj_sig.v = vec_set(pj[0]->force.v_sig, pj[1]->force.v_sig, pj[2]->force.v_sig, + pj[3]->force.v_sig); + for (k = 0; k < 3; k++) { + vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]); + vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]); + } + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]); + balsara.v = vec_set(pi[0]->force.balsara, pi[1]->force.balsara, + pi[2]->force.balsara, pi[3]->force.balsara) + + vec_set(pj[0]->force.balsara, pj[1]->force.balsara, + pj[2]->force.balsara, pj[3]->force.balsara); +#else +#error +#endif + + /* Get the radius and inverse radius. 
*/ + r2.v = vec_load(R2); + ri.v = vec_rsqrt(r2.v); + ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f)); + r.v = r2.v * ri.v; + + /* Get the kernel for hi. */ + hi.v = vec_load(Hi); + hi_inv.v = vec_rcp(hi.v); + hi_inv.v = hi_inv.v - hi_inv.v * (hi.v * hi_inv.v - vec_set1(1.0f)); + hi2_inv.v = hi_inv.v * hi_inv.v; + xi.v = r.v * hi_inv.v; + kernel_deval_vec(&xi, &wi, &wi_dx); + wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v; + + /* Get the kernel for hj. */ + hj.v = vec_load(Hj); + hj_inv.v = vec_rcp(hj.v); + hj_inv.v = hj_inv.v - hj_inv.v * (hj.v * hj_inv.v - vec_set1(1.0f)); + hj2_inv.v = hj_inv.v * hj_inv.v; + xj.v = r.v * hj_inv.v; + kernel_deval_vec(&xj, &wj, &wj_dx); + wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v; + + /* Compute dv dot r. */ + dvdr.v = ((vi[0].v - vj[0].v) * dx[0].v) + ((vi[1].v - vj[1].v) * dx[1].v) + + ((vi[2].v - vj[2].v) * dx[2].v); + dvdr.v = dvdr.v * ri.v; + + /* Get the time derivative for h. */ + pih_dt.v = mj.v / pjrho.v * dvdr.v * wi_dr.v; + pjh_dt.v = mi.v / pirho.v * dvdr.v * wj_dr.v; + + /* Compute the relative velocity. (This is 0 if the particles move away from + * each other and negative otherwise) */ + omega_ij.v = vec_fmin(dvdr.v, vec_set1(0.0f)); + + /* Compute signal velocity */ + v_sig.v = ci.v + cj.v - vec_set1(3.0f) * omega_ij.v; + + /* Compute viscosity tensor */ + Pi_ij.v = -balsara.v * vec_set1(const_viscosity_alpha) * v_sig.v * + omega_ij.v / (pirho.v + pjrho.v); + Pi_ij.v *= (wi_dr.v + wj_dr.v); + + /* Get the common factor out. */ + w.v = ri.v * ((piPOrho2.v * wi_dr.v + pjPOrho2.v * wj_dr.v) + + vec_set1(0.25f) * Pi_ij.v); + + /* Use the force, Luke! */ + for (k = 0; k < 3; k++) { + f.v = dx[k].v * w.v; + pia[k].v = mj.v * f.v; + pja[k].v = mi.v * f.v; + } + + /* Get the time derivative for u. 
*/ + piu_dt.v = + mj.v * dvdr.v * (piPOrho2.v * wi_dr.v + vec_set1(0.125f) * Pi_ij.v); + pju_dt.v = + mi.v * dvdr.v * (pjPOrho2.v * wj_dr.v + vec_set1(0.125f) * Pi_ij.v); + + /* compute the signal velocity (this is always symmetrical). */ + vi_sig.v = vec_fmax(vi_sig.v, v_sig.v); + vj_sig.v = vec_fmax(vj_sig.v, v_sig.v); + + /* Store the forces back on the particles. */ + for (k = 0; k < VEC_SIZE; k++) { + pi[k]->force.u_dt += piu_dt.f[k]; + pj[k]->force.u_dt += pju_dt.f[k]; + pi[k]->force.h_dt -= pih_dt.f[k]; + pj[k]->force.h_dt -= pjh_dt.f[k]; + pi[k]->force.v_sig = vi_sig.f[k]; + pj[k]->force.v_sig = vj_sig.f[k]; + for (j = 0; j < 3; j++) { + pi[k]->a[j] -= pia[j].f[k]; + pj[k]->a[j] += pja[j].f[k]; + } + } + #else - for ( int k = 0 ; k < VEC_SIZE ; k++ ) - runner_iact_force( R2[k] , &Dx[3*k] , Hi[k] , Hj[k] , pi[k] , pj[k] ); + for (int k = 0; k < VEC_SIZE; k++) + runner_iact_force(R2[k], &Dx[3 * k], Hi[k], Hj[k], pi[k], pj[k]); #endif - - } - +} /** * @brief Force loop (non-symmetric version) */ -__attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_force ( float r2 , float *dx , float hi , float hj , struct part *pi , struct part *pj ) { - - float r = sqrtf( r2 ), ri = 1.0f / r; - float xi, xj; - float hi_inv, hi2_inv; - float hj_inv, hj2_inv; - float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr; - float /*mi,*/ mj, POrho2i, POrho2j, rhoi, rhoj; - float v_sig, omega_ij, Pi_ij; - // float dt_max; - float f; - int k; - - /* Get some values in local variables. */ - // mi = pi->mass; - mj = pj->mass; - rhoi = pi->rho; rhoj = pj->rho; - POrho2i = pi->force.POrho2; - POrho2j = pj->force.POrho2; - - /* Get the kernel for hi. */ - hi_inv = 1.0f / hi; - hi2_inv = hi_inv * hi_inv; - xi = r * hi_inv; - kernel_deval( xi , &wi , &wi_dx ); - wi_dr = hi2_inv * hi2_inv * wi_dx; - - /* Get the kernel for hj. 
*/ - hj_inv = 1.0f / hj; - hj2_inv = hj_inv * hj_inv; - xj = r * hj_inv; - kernel_deval( xj , &wj , &wj_dx ); - wj_dr = hj2_inv * hj2_inv * wj_dx; - - /* Compute dv dot r. */ - dvdr = ( pi->v[0] - pj->v[0] ) * dx[0] + ( pi->v[1] - pj->v[1] ) * dx[1] + ( pi->v[2] - pj->v[2] ) * dx[2]; - dvdr *= ri; - - /* Compute the relative velocity. (This is 0 if the particles move away from each other and negative otherwise) */ - omega_ij = fminf( dvdr , 0.f ); - - /* Compute signal velocity */ - v_sig = pi->force.c + pj->force.c - 3.0f*omega_ij; - - /* Compute viscosity tensor */ - Pi_ij = -const_viscosity_alpha * v_sig * omega_ij / ( rhoi + rhoj ); - - /* Apply balsara switch */ - Pi_ij *= ( pi->force.balsara + pj->force.balsara ); - - /* Get the common factor out. */ - w = ri * ( ( POrho2i * wi_dr + POrho2j * wj_dr ) + 0.25f * Pi_ij * ( wi_dr + wj_dr ) ); - - /* Use the force, Luke! */ - for ( k = 0 ; k < 3 ; k++ ) { - f = dx[k] * w; - pi->a[k] -= mj * f; - } - - /* Get the time derivative for u. */ - pi->force.u_dt += mj * dvdr * ( POrho2i * wi_dr + 0.125f * Pi_ij * ( wi_dr + wj_dr ) ); - - /* Get the time derivative for h. */ - pi->force.h_dt -= mj * dvdr / rhoj * wi_dr; - - /* Update the signal velocity. */ - pi->force.v_sig = fmaxf( pi->force.v_sig , v_sig ); - pj->force.v_sig = fmaxf( pj->force.v_sig , v_sig ); - - } - +__attribute__((always_inline)) INLINE static void runner_iact_nonsym_force( + float r2, float *dx, float hi, float hj, struct part *pi, struct part *pj) { + + float r = sqrtf(r2), ri = 1.0f / r; + float xi, xj; + float hi_inv, hi2_inv; + float hj_inv, hj2_inv; + float wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, w, dvdr; + float /*mi,*/ mj, POrho2i, POrho2j, rhoi, rhoj; + float v_sig, omega_ij, Pi_ij; + // float dt_max; + float f; + int k; + + /* Get some values in local variables. */ + // mi = pi->mass; + mj = pj->mass; + rhoi = pi->rho; + rhoj = pj->rho; + POrho2i = pi->force.POrho2; + POrho2j = pj->force.POrho2; + + /* Get the kernel for hi. 
*/ + hi_inv = 1.0f / hi; + hi2_inv = hi_inv * hi_inv; + xi = r * hi_inv; + kernel_deval(xi, &wi, &wi_dx); + wi_dr = hi2_inv * hi2_inv * wi_dx; + + /* Get the kernel for hj. */ + hj_inv = 1.0f / hj; + hj2_inv = hj_inv * hj_inv; + xj = r * hj_inv; + kernel_deval(xj, &wj, &wj_dx); + wj_dr = hj2_inv * hj2_inv * wj_dx; + + /* Compute dv dot r. */ + dvdr = (pi->v[0] - pj->v[0]) * dx[0] + (pi->v[1] - pj->v[1]) * dx[1] + + (pi->v[2] - pj->v[2]) * dx[2]; + dvdr *= ri; + + /* Compute the relative velocity. (This is 0 if the particles move away from + * each other and negative otherwise) */ + omega_ij = fminf(dvdr, 0.f); + + /* Compute signal velocity */ + v_sig = pi->force.c + pj->force.c - 3.0f * omega_ij; + + /* Compute viscosity tensor */ + Pi_ij = -const_viscosity_alpha * v_sig * omega_ij / (rhoi + rhoj); + + /* Apply balsara switch */ + Pi_ij *= (pi->force.balsara + pj->force.balsara); + + /* Get the common factor out. */ + w = ri * + ((POrho2i * wi_dr + POrho2j * wj_dr) + 0.25f * Pi_ij * (wi_dr + wj_dr)); + + /* Use the force, Luke! */ + for (k = 0; k < 3; k++) { + f = dx[k] * w; + pi->a[k] -= mj * f; + } + + /* Get the time derivative for u. */ + pi->force.u_dt += + mj * dvdr * (POrho2i * wi_dr + 0.125f * Pi_ij * (wi_dr + wj_dr)); + + /* Get the time derivative for h. */ + pi->force.h_dt -= mj * dvdr / rhoj * wi_dr; + + /* Update the signal velocity. 
*/ + pi->force.v_sig = fmaxf(pi->force.v_sig, v_sig); + pj->force.v_sig = fmaxf(pj->force.v_sig, v_sig); +} /** * @brief Force loop (Vectorized non-symmetric version) */ -__attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force ( float *R2 , float *Dx , float *Hi , float *Hj , struct part **pi , struct part **pj ) { +__attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( + float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, + struct part **pj) { #ifdef VECTORIZE - vector r, r2, ri; - vector xi, xj; - vector hi, hj, hi_inv, hj_inv; - vector hi2_inv, hj2_inv; - vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr; - vector w; - vector piPOrho2, pjPOrho2, pirho, pjrho; - vector mj; - vector f; - vector dx[3]; - vector vi[3], vj[3]; - vector pia[3]; - vector piu_dt; - vector pih_dt; - vector ci, cj, v_sig, vi_sig, vj_sig; - vector omega_ij, Pi_ij, balsara; - int j, k; - - /* Load stuff. */ - #if VEC_SIZE==8 - mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass , pj[4]->mass , pj[5]->mass , pj[6]->mass , pj[7]->mass ); - piPOrho2.v = vec_set( pi[0]->force.POrho2 , pi[1]->force.POrho2 , pi[2]->force.POrho2 , pi[3]->force.POrho2 , pi[4]->force.POrho2 , pi[5]->force.POrho2 , pi[6]->force.POrho2 , pi[7]->force.POrho2 ); - pjPOrho2.v = vec_set( pj[0]->force.POrho2 , pj[1]->force.POrho2 , pj[2]->force.POrho2 , pj[3]->force.POrho2 , pj[4]->force.POrho2 , pj[5]->force.POrho2 , pj[6]->force.POrho2 , pj[7]->force.POrho2 ); - pirho.v = vec_set( pi[0]->rho , pi[1]->rho , pi[2]->rho , pi[3]->rho , pi[4]->rho , pi[5]->rho , pi[6]->rho , pi[7]->rho ); - pjrho.v = vec_set( pj[0]->rho , pj[1]->rho , pj[2]->rho , pj[3]->rho , pj[4]->rho , pj[5]->rho , pj[6]->rho , pj[7]->rho ); - ci.v = vec_set( pi[0]->force.c , pi[1]->force.c , pi[2]->force.c , pi[3]->force.c , pi[4]->force.c , pi[5]->force.c , pi[6]->force.c , pi[7]->force.c ); - cj.v = vec_set( pj[0]->force.c , pj[1]->force.c , pj[2]->force.c , pj[3]->force.c , 
pj[4]->force.c , pj[5]->force.c , pj[6]->force.c , pj[7]->force.c ); - vi_sig.v = vec_set( pi[0]->force.v_sig , pi[1]->force.v_sig , pi[2]->force.v_sig , pi[3]->force.v_sig , pi[4]->force.v_sig , pi[5]->force.v_sig , pi[6]->force.v_sig , pi[7]->force.v_sig ); - vj_sig.v = vec_set( pj[0]->force.v_sig , pj[1]->force.v_sig , pj[2]->force.v_sig , pj[3]->force.v_sig , pj[4]->force.v_sig , pj[5]->force.v_sig , pj[6]->force.v_sig , pj[7]->force.v_sig ); - for ( k = 0 ; k < 3 ; k++ ) { - vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] , pi[4]->v[k] , pi[5]->v[k] , pi[6]->v[k] , pi[7]->v[k] ); - vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] , pj[4]->v[k] , pj[5]->v[k] , pj[6]->v[k] , pj[7]->v[k] ); - } - for ( k = 0 ; k < 3 ; k++ ) - dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] , Dx[12+k] , Dx[15+k] , Dx[18+k] , Dx[21+k] ); - balsara.v = vec_set( pi[0]->force.balsara , pi[1]->force.balsara , pi[2]->force.balsara , pi[3]->force.balsara , pi[4]->force.balsara , pi[5]->force.balsara , pi[6]->force.balsara , pi[7]->force.balsara ) + - vec_set( pj[0]->force.balsara , pj[1]->force.balsara , pj[2]->force.balsara , pj[3]->force.balsara , pj[4]->force.balsara , pj[5]->force.balsara , pj[6]->force.balsara , pj[7]->force.balsara ); - #elif VEC_SIZE==4 - mj.v = vec_set( pj[0]->mass , pj[1]->mass , pj[2]->mass , pj[3]->mass ); - piPOrho2.v = vec_set( pi[0]->force.POrho2 , pi[1]->force.POrho2 , pi[2]->force.POrho2 , pi[3]->force.POrho2 ); - pjPOrho2.v = vec_set( pj[0]->force.POrho2 , pj[1]->force.POrho2 , pj[2]->force.POrho2 , pj[3]->force.POrho2 ); - pirho.v = vec_set( pi[0]->rho , pi[1]->rho , pi[2]->rho , pi[3]->rho ); - pjrho.v = vec_set( pj[0]->rho , pj[1]->rho , pj[2]->rho , pj[3]->rho ); - ci.v = vec_set( pi[0]->force.c , pi[1]->force.c , pi[2]->force.c , pi[3]->force.c ); - cj.v = vec_set( pj[0]->force.c , pj[1]->force.c , pj[2]->force.c , pj[3]->force.c ); - vi_sig.v = vec_set( pi[0]->force.v_sig , 
pi[1]->force.v_sig , pi[2]->force.v_sig , pi[3]->force.v_sig ); - vj_sig.v = vec_set( pj[0]->force.v_sig , pj[1]->force.v_sig , pj[2]->force.v_sig , pj[3]->force.v_sig ); - for ( k = 0 ; k < 3 ; k++ ) { - vi[k].v = vec_set( pi[0]->v[k] , pi[1]->v[k] , pi[2]->v[k] , pi[3]->v[k] ); - vj[k].v = vec_set( pj[0]->v[k] , pj[1]->v[k] , pj[2]->v[k] , pj[3]->v[k] ); - } - for ( k = 0 ; k < 3 ; k++ ) - dx[k].v = vec_set( Dx[0+k] , Dx[3+k] , Dx[6+k] , Dx[9+k] ); - balsara.v = vec_set( pi[0]->force.balsara , pi[1]->force.balsara , pi[2]->force.balsara , pi[3]->force.balsara ) + - vec_set( pj[0]->force.balsara , pj[1]->force.balsara , pj[2]->force.balsara , pj[3]->force.balsara ); - #else - #error - #endif - - /* Get the radius and inverse radius. */ - r2.v = vec_load( R2 ); - ri.v = vec_rsqrt( r2.v ); - ri.v = ri.v - vec_set1( 0.5f ) * ri.v * ( r2.v * ri.v * ri.v - vec_set1( 1.0f ) ); - r.v = r2.v * ri.v; - - /* Get the kernel for hi. */ - hi.v = vec_load( Hi ); - hi_inv.v = vec_rcp( hi.v ); - hi_inv.v = hi_inv.v - hi_inv.v * ( hi.v * hi_inv.v - vec_set1( 1.0f ) ); - hi2_inv.v = hi_inv.v * hi_inv.v; - xi.v = r.v * hi_inv.v; - kernel_deval_vec( &xi , &wi , &wi_dx ); - wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v; - - /* Get the kernel for hj. */ - hj.v = vec_load( Hj ); - hj_inv.v = vec_rcp( hj.v ); - hj_inv.v = hj_inv.v - hj_inv.v * ( hj.v * hj_inv.v - vec_set1( 1.0f ) ); - hj2_inv.v = hj_inv.v * hj_inv.v; - xj.v = r.v * hj_inv.v; - kernel_deval_vec( &xj , &wj , &wj_dx ); - wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v; - - /* Compute dv dot r. */ - dvdr.v = ( (vi[0].v - vj[0].v) * dx[0].v ) + ( (vi[1].v - vj[1].v) * dx[1].v ) + ( (vi[2].v - vj[2].v) * dx[2].v ); - dvdr.v = dvdr.v * ri.v; - - /* Get the time derivative for h. */ - pih_dt.v = mj.v / pjrho.v * dvdr.v * wi_dr.v; - - /* Compute the relative velocity. 
(This is 0 if the particles move away from each other and negative otherwise) */ - omega_ij.v = vec_fmin( dvdr.v , vec_set1( 0.0f ) ); - - /* Compute signal velocity */ - v_sig.v = ci.v + cj.v - vec_set1( 3.0f )*omega_ij.v; - - /* Compute viscosity tensor */ - Pi_ij.v = -balsara.v * vec_set1( const_viscosity_alpha ) * v_sig.v * omega_ij.v / (pirho.v + pjrho.v); - Pi_ij.v *= ( wi_dr.v + wj_dr.v ); - - /* Get the common factor out. */ - w.v = ri.v * ( ( piPOrho2.v * wi_dr.v + pjPOrho2.v * wj_dr.v ) + vec_set1( 0.25f ) * Pi_ij.v ); - - /* Use the force, Luke! */ - for ( k = 0 ; k < 3 ; k++ ) { - f.v = dx[k].v * w.v; - pia[k].v = mj.v * f.v; - } - - /* Get the time derivative for u. */ - piu_dt.v = mj.v * dvdr.v * ( piPOrho2.v * wi_dr.v + vec_set1( 0.125f ) * Pi_ij.v ); - - /* compute the signal velocity (this is always symmetrical). */ - vi_sig.v = vec_fmax( vi_sig.v , v_sig.v ); - vj_sig.v = vec_fmax( vj_sig.v , v_sig.v ); - - /* Store the forces back on the particles. */ - for ( k = 0 ; k < VEC_SIZE ; k++ ) { - pi[k]->force.u_dt += piu_dt.f[k]; - pi[k]->force.h_dt -= pih_dt.f[k]; - pi[k]->force.v_sig = vi_sig.f[k]; - pj[k]->force.v_sig = vj_sig.f[k]; - for ( j = 0 ; j < 3 ; j++ ) - pi[k]->a[j] -= pia[j].f[k]; - } - + vector r, r2, ri; + vector xi, xj; + vector hi, hj, hi_inv, hj_inv; + vector hi2_inv, hj2_inv; + vector wi, wj, wi_dx, wj_dx, wi_dr, wj_dr, dvdr; + vector w; + vector piPOrho2, pjPOrho2, pirho, pjrho; + vector mj; + vector f; + vector dx[3]; + vector vi[3], vj[3]; + vector pia[3]; + vector piu_dt; + vector pih_dt; + vector ci, cj, v_sig, vi_sig, vj_sig; + vector omega_ij, Pi_ij, balsara; + int j, k; + +/* Load stuff. 
*/ +#if VEC_SIZE == 8 + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass, + pj[4]->mass, pj[5]->mass, pj[6]->mass, pj[7]->mass); + piPOrho2.v = + vec_set(pi[0]->force.POrho2, pi[1]->force.POrho2, pi[2]->force.POrho2, + pi[3]->force.POrho2, pi[4]->force.POrho2, pi[5]->force.POrho2, + pi[6]->force.POrho2, pi[7]->force.POrho2); + pjPOrho2.v = + vec_set(pj[0]->force.POrho2, pj[1]->force.POrho2, pj[2]->force.POrho2, + pj[3]->force.POrho2, pj[4]->force.POrho2, pj[5]->force.POrho2, + pj[6]->force.POrho2, pj[7]->force.POrho2); + pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho, pi[4]->rho, + pi[5]->rho, pi[6]->rho, pi[7]->rho); + pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho, pj[4]->rho, + pj[5]->rho, pj[6]->rho, pj[7]->rho); + ci.v = + vec_set(pi[0]->force.c, pi[1]->force.c, pi[2]->force.c, pi[3]->force.c, + pi[4]->force.c, pi[5]->force.c, pi[6]->force.c, pi[7]->force.c); + cj.v = + vec_set(pj[0]->force.c, pj[1]->force.c, pj[2]->force.c, pj[3]->force.c, + pj[4]->force.c, pj[5]->force.c, pj[6]->force.c, pj[7]->force.c); + vi_sig.v = vec_set(pi[0]->force.v_sig, pi[1]->force.v_sig, pi[2]->force.v_sig, + pi[3]->force.v_sig, pi[4]->force.v_sig, pi[5]->force.v_sig, + pi[6]->force.v_sig, pi[7]->force.v_sig); + vj_sig.v = vec_set(pj[0]->force.v_sig, pj[1]->force.v_sig, pj[2]->force.v_sig, + pj[3]->force.v_sig, pj[4]->force.v_sig, pj[5]->force.v_sig, + pj[6]->force.v_sig, pj[7]->force.v_sig); + for (k = 0; k < 3; k++) { + vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k], + pi[4]->v[k], pi[5]->v[k], pi[6]->v[k], pi[7]->v[k]); + vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k], + pj[4]->v[k], pj[5]->v[k], pj[6]->v[k], pj[7]->v[k]); + } + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k], Dx[12 + k], + Dx[15 + k], Dx[18 + k], Dx[21 + k]); + balsara.v = + vec_set(pi[0]->force.balsara, pi[1]->force.balsara, pi[2]->force.balsara, + pi[3]->force.balsara, 
pi[4]->force.balsara, pi[5]->force.balsara, + pi[6]->force.balsara, pi[7]->force.balsara) + + vec_set(pj[0]->force.balsara, pj[1]->force.balsara, pj[2]->force.balsara, + pj[3]->force.balsara, pj[4]->force.balsara, pj[5]->force.balsara, + pj[6]->force.balsara, pj[7]->force.balsara); +#elif VEC_SIZE == 4 + mj.v = vec_set(pj[0]->mass, pj[1]->mass, pj[2]->mass, pj[3]->mass); + piPOrho2.v = vec_set(pi[0]->force.POrho2, pi[1]->force.POrho2, + pi[2]->force.POrho2, pi[3]->force.POrho2); + pjPOrho2.v = vec_set(pj[0]->force.POrho2, pj[1]->force.POrho2, + pj[2]->force.POrho2, pj[3]->force.POrho2); + pirho.v = vec_set(pi[0]->rho, pi[1]->rho, pi[2]->rho, pi[3]->rho); + pjrho.v = vec_set(pj[0]->rho, pj[1]->rho, pj[2]->rho, pj[3]->rho); + ci.v = + vec_set(pi[0]->force.c, pi[1]->force.c, pi[2]->force.c, pi[3]->force.c); + cj.v = + vec_set(pj[0]->force.c, pj[1]->force.c, pj[2]->force.c, pj[3]->force.c); + vi_sig.v = vec_set(pi[0]->force.v_sig, pi[1]->force.v_sig, pi[2]->force.v_sig, + pi[3]->force.v_sig); + vj_sig.v = vec_set(pj[0]->force.v_sig, pj[1]->force.v_sig, pj[2]->force.v_sig, + pj[3]->force.v_sig); + for (k = 0; k < 3; k++) { + vi[k].v = vec_set(pi[0]->v[k], pi[1]->v[k], pi[2]->v[k], pi[3]->v[k]); + vj[k].v = vec_set(pj[0]->v[k], pj[1]->v[k], pj[2]->v[k], pj[3]->v[k]); + } + for (k = 0; k < 3; k++) + dx[k].v = vec_set(Dx[0 + k], Dx[3 + k], Dx[6 + k], Dx[9 + k]); + balsara.v = vec_set(pi[0]->force.balsara, pi[1]->force.balsara, + pi[2]->force.balsara, pi[3]->force.balsara) + + vec_set(pj[0]->force.balsara, pj[1]->force.balsara, + pj[2]->force.balsara, pj[3]->force.balsara); #else +#error +#endif - for ( int k = 0 ; k < VEC_SIZE ; k++ ) - runner_iact_nonsym_force( R2[k] , &Dx[3*k] , Hi[k] , Hj[k] , pi[k] , pj[k] ); + /* Get the radius and inverse radius. */ + r2.v = vec_load(R2); + ri.v = vec_rsqrt(r2.v); + ri.v = ri.v - vec_set1(0.5f) * ri.v * (r2.v * ri.v * ri.v - vec_set1(1.0f)); + r.v = r2.v * ri.v; + + /* Get the kernel for hi. 
*/ + hi.v = vec_load(Hi); + hi_inv.v = vec_rcp(hi.v); + hi_inv.v = hi_inv.v - hi_inv.v * (hi.v * hi_inv.v - vec_set1(1.0f)); + hi2_inv.v = hi_inv.v * hi_inv.v; + xi.v = r.v * hi_inv.v; + kernel_deval_vec(&xi, &wi, &wi_dx); + wi_dr.v = hi2_inv.v * hi2_inv.v * wi_dx.v; + + /* Get the kernel for hj. */ + hj.v = vec_load(Hj); + hj_inv.v = vec_rcp(hj.v); + hj_inv.v = hj_inv.v - hj_inv.v * (hj.v * hj_inv.v - vec_set1(1.0f)); + hj2_inv.v = hj_inv.v * hj_inv.v; + xj.v = r.v * hj_inv.v; + kernel_deval_vec(&xj, &wj, &wj_dx); + wj_dr.v = hj2_inv.v * hj2_inv.v * wj_dx.v; + + /* Compute dv dot r. */ + dvdr.v = ((vi[0].v - vj[0].v) * dx[0].v) + ((vi[1].v - vj[1].v) * dx[1].v) + + ((vi[2].v - vj[2].v) * dx[2].v); + dvdr.v = dvdr.v * ri.v; + + /* Get the time derivative for h. */ + pih_dt.v = mj.v / pjrho.v * dvdr.v * wi_dr.v; + + /* Compute the relative velocity. (This is 0 if the particles move away from + * each other and negative otherwise) */ + omega_ij.v = vec_fmin(dvdr.v, vec_set1(0.0f)); + + /* Compute signal velocity */ + v_sig.v = ci.v + cj.v - vec_set1(3.0f) * omega_ij.v; + + /* Compute viscosity tensor */ + Pi_ij.v = -balsara.v * vec_set1(const_viscosity_alpha) * v_sig.v * + omega_ij.v / (pirho.v + pjrho.v); + Pi_ij.v *= (wi_dr.v + wj_dr.v); + + /* Get the common factor out. */ + w.v = ri.v * ((piPOrho2.v * wi_dr.v + pjPOrho2.v * wj_dr.v) + + vec_set1(0.25f) * Pi_ij.v); + + /* Use the force, Luke! */ + for (k = 0; k < 3; k++) { + f.v = dx[k].v * w.v; + pia[k].v = mj.v * f.v; + } + + /* Get the time derivative for u. */ + piu_dt.v = + mj.v * dvdr.v * (piPOrho2.v * wi_dr.v + vec_set1(0.125f) * Pi_ij.v); + + /* compute the signal velocity (this is always symmetrical). */ + vi_sig.v = vec_fmax(vi_sig.v, v_sig.v); + vj_sig.v = vec_fmax(vj_sig.v, v_sig.v); + + /* Store the forces back on the particles. 
*/ + for (k = 0; k < VEC_SIZE; k++) { + pi[k]->force.u_dt += piu_dt.f[k]; + pi[k]->force.h_dt -= pih_dt.f[k]; + pi[k]->force.v_sig = vi_sig.f[k]; + pj[k]->force.v_sig = vj_sig.f[k]; + for (j = 0; j < 3; j++) pi[k]->a[j] -= pia[j].f[k]; + } -#endif - - } - +#else + for (int k = 0; k < VEC_SIZE; k++) + runner_iact_nonsym_force(R2[k], &Dx[3 * k], Hi[k], Hj[k], pi[k], pj[k]); +#endif +} +#endif /* SWIFT_RUNNER_IACT_LEGACY_H */ diff --git a/src/scheduler.c b/src/scheduler.c index 4c45303f1fb60a4ae2daf80dfda70de1c21361bc..02defc31710e3ab0de067cf359d4281b94c17d90 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -1,56 +1,48 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ /* Config parameters. */ #include "../config.h" /* Some standard headers. */ +#include <limits.h> +#include <math.h> +#include <pthread.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <math.h> -#include <pthread.h> -#include <limits.h> /* MPI headers. */ #ifdef WITH_MPI - #include <mpi.h> +#include <mpi.h> #endif +/* This object's header. */ +#include "scheduler.h" + /* Local headers. 
*/ -#include "error.h" -#include "cycle.h" #include "atomic.h" -#include "timers.h" #include "const.h" -#include "vector.h" -#include "lock.h" -#include "task.h" -#include "part.h" -#include "debug.h" -#include "space.h" -#include "multipole.h" -#include "cell.h" -#include "queue.h" +#include "cycle.h" +#include "error.h" #include "kernel.h" -#include "scheduler.h" - +#include "timers.h" /** * @brief Add an unlock_task to the given task. @@ -59,490 +51,588 @@ * @param ta The unlocking #task. * @param tb The #task that will be unlocked. */ - -void scheduler_addunlock ( struct scheduler *s , struct task *ta , struct task *tb ) { - - /* Main loop. */ - while ( 1 ) { - /* Follow the links. */ - while ( ta->nr_unlock_tasks == task_maxunlock+1 ) - ta = ta->unlock_tasks[ task_maxunlock ]; +void scheduler_addunlock(struct scheduler *s, struct task *ta, + struct task *tb) { - /* Get the index of the next free task. */ - int ind = atomic_inc( &ta->nr_unlock_tasks ); + /* Main loop. */ + while (1) { - /* Is there room in this task? */ - if ( ind < task_maxunlock ) { - ta->unlock_tasks[ ind ] = tb; - break; - } + /* Follow the links. */ + while (ta->nr_unlock_tasks == task_maxunlock + 1) + ta = ta->unlock_tasks[task_maxunlock]; - /* Otherwise, generate a link task. */ - else { - - /* Only one thread should have to do this. */ - if ( ind == task_maxunlock ) { - ta->unlock_tasks[ task_maxunlock ] = scheduler_addtask( s , task_type_link , task_subtype_none , ta->flags , 0 , ta->ci , ta->cj , 0 ); - ta->unlock_tasks[ task_maxunlock ]->implicit = 1; - } + /* Get the index of the next free task. */ + int ind = atomic_inc(&ta->nr_unlock_tasks); - /* Otherwise, reduce the count. */ - else - atomic_dec( &ta->nr_unlock_tasks ); + /* Is there room in this task? */ + if (ind < task_maxunlock) { + ta->unlock_tasks[ind] = tb; + break; + } - } - - } + /* Otherwise, generate a link task. */ + else { + /* Only one thread should have to do this. 
*/ + if (ind == task_maxunlock) { + ta->unlock_tasks[task_maxunlock] = + scheduler_addtask(s, task_type_link, task_subtype_none, ta->flags, + 0, ta->ci, ta->cj, 0); + ta->unlock_tasks[task_maxunlock]->implicit = 1; + } + + /* Otherwise, reduce the count. */ + else + atomic_dec(&ta->nr_unlock_tasks); } - + } +} /** * @brief Split tasks that may be too large. * * @param s The #scheduler we are working in. */ - -void scheduler_splittasks ( struct scheduler *s ) { - - int j, k, ind, sid, tid = 0, redo; - struct cell *ci, *cj; - double hi, hj, shift[3]; - struct task *t, *t_old; - // float dt_step = s->dt_step; - int pts[7][8] = { { -1 , 12 , 10 , 9 , 4 , 3 , 1 , 0 } , - { -1 , -1 , 11 , 10 , 5 , 4 , 2 , 1 } , - { -1 , -1 , -1 , 12 , 7 , 6 , 4 , 3 } , - { -1 , -1 , -1 , -1 , 8 , 7 , 5 , 4 } , - { -1 , -1 , -1 , -1 , -1 , 12 , 10 , 9 } , - { -1 , -1 , -1 , -1 , -1 , -1 , 11 , 10 } , - { -1 , -1 , -1 , -1 , -1 , -1 , -1 , 12 } }; - float sid_scale[13] = { 0.1897 , 0.4025 , 0.1897 , 0.4025 , 0.5788 , 0.4025 , - 0.1897 , 0.4025 , 0.1897 , 0.4025 , 0.5788 , 0.4025 , - 0.5788 }; - - /* Loop through the tasks... */ - redo = 0; t_old = t = NULL; - while ( 1 ) { - - /* Get a pointer on the task. */ - if ( redo ) { - redo = 0; - t = t_old; - } + +void scheduler_splittasks(struct scheduler *s) { + + int j, k, ind, sid, tid = 0, redo; + struct cell *ci, *cj; + double hi, hj, shift[3]; + struct task *t, *t_old; + // float dt_step = s->dt_step; + int pts[7][8] = {{-1, 12, 10, 9, 4, 3, 1, 0}, + {-1, -1, 11, 10, 5, 4, 2, 1}, + {-1, -1, -1, 12, 7, 6, 4, 3}, + {-1, -1, -1, -1, 8, 7, 5, 4}, + {-1, -1, -1, -1, -1, 12, 10, 9}, + {-1, -1, -1, -1, -1, -1, 11, 10}, + {-1, -1, -1, -1, -1, -1, -1, 12}}; + float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788, 0.4025, 0.1897, + 0.4025, 0.1897, 0.4025, 0.5788, 0.4025, 0.5788}; + + /* Loop through the tasks... */ + redo = 0; + t_old = t = NULL; + while (1) { + + /* Get a pointer on the task. 
*/ + if (redo) { + redo = 0; + t = t_old; + } else { + if ((ind = atomic_inc(&tid)) < s->nr_tasks) + t_old = t = &s->tasks[s->tasks_ind[ind]]; + else + break; + } + + /* Empty task? */ + if (t->ci == NULL || (t->type == task_type_pair && t->cj == NULL)) { + t->type = task_type_none; + t->skip = 1; + continue; + } + + /* Non-local kick task? */ + if ((t->type == task_type_kick1 || t->type == task_type_kick2) && + t->ci->nodeID != s->nodeID) { + t->type = task_type_none; + t->skip = 1; + continue; + } + + /* Self-interaction? */ + if (t->type == task_type_self) { + + /* Get a handle on the cell involved. */ + ci = t->ci; + + /* Foreign task? */ + if (ci->nodeID != s->nodeID) { + t->skip = 1; + continue; + } + + /* Is this cell even split? */ + if (ci->split) { + + /* Make a sub? */ + if (scheduler_dosub && ci->count < space_subsize / ci->count) { + + /* convert to a self-subtask. */ + t->type = task_type_sub; + + } + + /* Otherwise, make tasks explicitly. */ else { - if ( ( ind = atomic_inc( &tid ) ) < s->nr_tasks ) - t_old = t = &s->tasks[ s->tasks_ind[ ind ] ]; - else - break; - } - - /* Empty task? */ - if ( t->ci == NULL || ( t->type == task_type_pair && t->cj == NULL ) ) { - t->type = task_type_none; - t->skip = 1; - continue; - } - - /* Non-local kick task? */ - if ( (t->type == task_type_kick1 || t->type == task_type_kick2 ) && - t->ci->nodeID != s->nodeID ) { + + /* Take a step back (we're going to recycle the current task)... */ + redo = 1; + + /* Add the self taks. */ + for (k = 0; ci->progeny[k] == NULL; k++) + ; + t->ci = ci->progeny[k]; + for (k += 1; k < 8; k++) + if (ci->progeny[k] != NULL) + scheduler_addtask(s, task_type_self, task_subtype_density, 0, 0, + ci->progeny[k], NULL, 0); + + /* Make a task for each pair of progeny. 
*/ + for (j = 0; j < 8; j++) + if (ci->progeny[j] != NULL) + for (k = j + 1; k < 8; k++) + if (ci->progeny[k] != NULL) + scheduler_addtask(s, task_type_pair, task_subtype_density, + pts[j][k], 0, ci->progeny[j], + ci->progeny[k], 0); + } + } + + } + + /* Pair interaction? */ + else if (t->type == task_type_pair) { + + /* Get a handle on the cells involved. */ + ci = t->ci; + cj = t->cj; + hi = ci->dmin; + hj = cj->dmin; + + /* Foreign task? */ + if (ci->nodeID != s->nodeID && cj->nodeID != s->nodeID) { + t->skip = 1; + continue; + } + + /* Get the sort ID, use space_getsid and not t->flags + to make sure we get ci and cj swapped if needed. */ + sid = space_getsid(s->space, &ci, &cj, shift); + + /* Should this task be split-up? */ + if (ci->split && cj->split && + ci->h_max * kernel_gamma * space_stretch < hi / 2 && + cj->h_max * kernel_gamma * space_stretch < hj / 2) { + + /* Replace by a single sub-task? */ + if (scheduler_dosub && + ci->count * sid_scale[sid] < space_subsize / cj->count && + sid != 0 && sid != 2 && sid != 6 && sid != 8) { + + /* Make this task a sub task. */ + t->type = task_type_sub; + + } + + /* Otherwise, split it. */ + else { + + /* Take a step back (we're going to recycle the current task)... */ + redo = 1; + + /* For each different sorting type... 
*/ + switch (sid) { + + case 0: /* ( 1 , 1 , 1 ) */ + t->ci = ci->progeny[7]; + t->cj = cj->progeny[0]; + t->flags = 0; + break; + + case 1: /* ( 1 , 1 , 0 ) */ + t->ci = ci->progeny[6]; + t->cj = cj->progeny[0]; + t->flags = 1; + t->tight = 1; + t = scheduler_addtask(s, task_type_pair, t->subtype, 1, 0, + ci->progeny[7], cj->progeny[1], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 0, 0, + ci->progeny[6], cj->progeny[1], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 2, 0, + ci->progeny[7], cj->progeny[0], 1); + break; + + case 2: /* ( 1 , 1 , -1 ) */ + t->ci = ci->progeny[6]; + t->cj = cj->progeny[1]; + t->flags = 2; + t->tight = 1; + break; + + case 3: /* ( 1 , 0 , 1 ) */ + t->ci = ci->progeny[5]; + t->cj = cj->progeny[0]; + t->flags = 3; + t->tight = 1; + t = scheduler_addtask(s, task_type_pair, t->subtype, 3, 0, + ci->progeny[7], cj->progeny[2], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 0, 0, + ci->progeny[5], cj->progeny[2], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 6, 0, + ci->progeny[7], cj->progeny[0], 1); + break; + + case 4: /* ( 1 , 0 , 0 ) */ + t->ci = ci->progeny[4]; + t->cj = cj->progeny[0]; + t->flags = 4; + t->tight = 1; + t = scheduler_addtask(s, task_type_pair, t->subtype, 5, 0, + ci->progeny[5], cj->progeny[0], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 7, 0, + ci->progeny[6], cj->progeny[0], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 8, 0, + ci->progeny[7], cj->progeny[0], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 3, 0, + ci->progeny[4], cj->progeny[1], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 4, 0, + ci->progeny[5], cj->progeny[1], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 6, 0, + ci->progeny[6], cj->progeny[1], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 7, 0, + ci->progeny[7], cj->progeny[1], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 1, 0, + ci->progeny[4], 
cj->progeny[2], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 2, 0, + ci->progeny[5], cj->progeny[2], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 4, 0, + ci->progeny[6], cj->progeny[2], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 5, 0, + ci->progeny[7], cj->progeny[2], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 0, 0, + ci->progeny[4], cj->progeny[3], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 1, 0, + ci->progeny[5], cj->progeny[3], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 3, 0, + ci->progeny[6], cj->progeny[3], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 4, 0, + ci->progeny[7], cj->progeny[3], 1); + break; + + case 5: /* ( 1 , 0 , -1 ) */ + t->ci = ci->progeny[4]; + t->cj = cj->progeny[1]; + t->flags = 5; + t->tight = 1; + t = scheduler_addtask(s, task_type_pair, t->subtype, 5, 0, + ci->progeny[6], cj->progeny[3], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 2, 0, + ci->progeny[4], cj->progeny[3], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 8, 0, + ci->progeny[6], cj->progeny[1], 1); + break; + + case 6: /* ( 1 , -1 , 1 ) */ + t->ci = ci->progeny[5]; + t->cj = cj->progeny[2]; + t->flags = 6; + t->tight = 1; + break; + + case 7: /* ( 1 , -1 , 0 ) */ + t->ci = ci->progeny[4]; + t->cj = cj->progeny[3]; + t->flags = 6; + t->tight = 1; + t = scheduler_addtask(s, task_type_pair, t->subtype, 8, 0, + ci->progeny[5], cj->progeny[2], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 7, 0, + ci->progeny[4], cj->progeny[2], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 7, 0, + ci->progeny[5], cj->progeny[3], 1); + break; + + case 8: /* ( 1 , -1 , -1 ) */ + t->ci = ci->progeny[4]; + t->cj = cj->progeny[3]; + t->flags = 8; + t->tight = 1; + break; + + case 9: /* ( 0 , 1 , 1 ) */ + t->ci = ci->progeny[3]; + t->cj = cj->progeny[0]; + t->flags = 9; + t->tight = 1; + t = scheduler_addtask(s, task_type_pair, t->subtype, 9, 0, + 
ci->progeny[7], cj->progeny[4], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 0, 0, + ci->progeny[3], cj->progeny[4], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 8, 0, + ci->progeny[7], cj->progeny[0], 1); + break; + + case 10: /* ( 0 , 1 , 0 ) */ + t->ci = ci->progeny[2]; + t->cj = cj->progeny[0]; + t->flags = 10; + t->tight = 1; + t = scheduler_addtask(s, task_type_pair, t->subtype, 11, 0, + ci->progeny[3], cj->progeny[0], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 7, 0, + ci->progeny[6], cj->progeny[0], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 6, 0, + ci->progeny[7], cj->progeny[0], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 9, 0, + ci->progeny[2], cj->progeny[1], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 10, 0, + ci->progeny[3], cj->progeny[1], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 8, 0, + ci->progeny[6], cj->progeny[1], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 7, 0, + ci->progeny[7], cj->progeny[1], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 1, 0, + ci->progeny[2], cj->progeny[4], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 2, 0, + ci->progeny[3], cj->progeny[4], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 10, 0, + ci->progeny[6], cj->progeny[4], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 11, 0, + ci->progeny[7], cj->progeny[4], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 0, 0, + ci->progeny[2], cj->progeny[5], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 1, 0, + ci->progeny[3], cj->progeny[5], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 9, 0, + ci->progeny[6], cj->progeny[5], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 10, 0, + ci->progeny[7], cj->progeny[5], 1); + break; + + case 11: /* ( 0 , 1 , -1 ) */ + t->ci = ci->progeny[2]; + t->cj = cj->progeny[1]; + t->flags = 11; + t->tight = 1; + t = 
scheduler_addtask(s, task_type_pair, t->subtype, 11, 0, + ci->progeny[6], cj->progeny[5], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 2, 0, + ci->progeny[2], cj->progeny[5], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 6, 0, + ci->progeny[6], cj->progeny[1], 1); + break; + + case 12: /* ( 0 , 0 , 1 ) */ + t->ci = ci->progeny[1]; + t->cj = cj->progeny[0]; + t->flags = 12; + t->tight = 1; + t = scheduler_addtask(s, task_type_pair, t->subtype, 11, 0, + ci->progeny[3], cj->progeny[0], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 5, 0, + ci->progeny[5], cj->progeny[0], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 2, 0, + ci->progeny[7], cj->progeny[0], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 9, 0, + ci->progeny[1], cj->progeny[2], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 12, 0, + ci->progeny[3], cj->progeny[2], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 8, 0, + ci->progeny[5], cj->progeny[2], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 5, 0, + ci->progeny[7], cj->progeny[2], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 3, 0, + ci->progeny[1], cj->progeny[4], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 6, 0, + ci->progeny[3], cj->progeny[4], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 12, 0, + ci->progeny[5], cj->progeny[4], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 11, 0, + ci->progeny[7], cj->progeny[4], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 0, 0, + ci->progeny[1], cj->progeny[6], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 3, 0, + ci->progeny[3], cj->progeny[6], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 9, 0, + ci->progeny[5], cj->progeny[6], 1); + t = scheduler_addtask(s, task_type_pair, t->subtype, 12, 0, + ci->progeny[7], cj->progeny[6], 1); + break; + } + } + + } /* split this task? 
*/ + + /* Otherwise, break it up if it is too large? */ + else if (scheduler_doforcesplit && ci->split && cj->split && + (ci->count > space_maxsize / cj->count)) { + + // message( "force splitting pair with %i and %i parts." , ci->count , + // cj->count ); + + /* Replace the current task. */ + t->type = task_type_none; + + for (j = 0; j < 8; j++) + if (ci->progeny[j] != NULL) + for (k = 0; k < 8; k++) + if (cj->progeny[k] != NULL) { + t = scheduler_addtask(s, task_type_pair, t->subtype, 0, 0, + ci->progeny[j], cj->progeny[k], 0); + t->flags = space_getsid(s->space, &t->ci, &t->cj, shift); + } + + } + + /* Otherwise, if not spilt, stitch-up the sorting. */ + else { + + /* Create the sort for ci. */ + // lock_lock( &ci->lock ); + if (ci->sorts == NULL) + ci->sorts = + scheduler_addtask(s, task_type_sort, 0, 1 << sid, 0, ci, NULL, 0); + else + ci->sorts->flags |= (1 << sid); + // lock_unlock_blind( &ci->lock ); + scheduler_addunlock(s, ci->sorts, t); + + /* Create the sort for cj. */ + // lock_lock( &cj->lock ); + if (cj->sorts == NULL) + cj->sorts = + scheduler_addtask(s, task_type_sort, 0, 1 << sid, 0, cj, NULL, 0); + else + cj->sorts->flags |= (1 << sid); + // lock_unlock_blind( &cj->lock ); + scheduler_addunlock(s, cj->sorts, t); + } + + } /* pair interaction? */ + + /* Gravity interaction? */ + else if (t->type == task_type_grav_mm) { + + /* Get a handle on the cells involved. */ + ci = t->ci; + cj = t->cj; + + /* Self-interaction? */ + if (cj == NULL) { + + /* Ignore this task if the cell has no gparts. */ + if (ci->gcount == 0) t->type = task_type_none; + + /* If the cell is split, recurse. */ + else if (ci->split) { + + /* Make a single sub-task? */ + if (scheduler_dosub && ci->count < space_subsize / ci->count) { + + t->type = task_type_sub; + t->subtype = task_subtype_grav; + + } + + /* Otherwise, just split the task. */ + else { + + /* Split this task into tasks on its progeny. 
*/ t->type = task_type_none; - t->skip = 1; - continue; - } - - /* Self-interaction? */ - if ( t->type == task_type_self ) { - - /* Get a handle on the cell involved. */ - ci = t->ci; - - /* Foreign task? */ - if ( ci->nodeID != s->nodeID ) { - t->skip = 1; - continue; - } - - /* Is this cell even split? */ - if ( ci->split ) { - - /* Make a sub? */ - if ( scheduler_dosub && ci->count < space_subsize/ci->count ) { + for (j = 0; j < 8; j++) + if (ci->progeny[j] != NULL && ci->progeny[j]->gcount > 0) { + if (t->type == task_type_none) { + t->type = task_type_grav_mm; + t->ci = ci->progeny[j]; + t->cj = NULL; + } else + t = scheduler_addtask(s, task_type_grav_mm, task_subtype_none, + 0, 0, ci->progeny[j], NULL, 0); + for (k = j + 1; k < 8; k++) + if (ci->progeny[k] != NULL && ci->progeny[k]->gcount > 0) { + if (t->type == task_type_none) { + t->type = task_type_grav_mm; + t->ci = ci->progeny[j]; + t->cj = ci->progeny[k]; + } else + t = scheduler_addtask(s, task_type_grav_mm, + task_subtype_none, 0, 0, + ci->progeny[j], ci->progeny[k], 0); + } + } + redo = (t->type != task_type_none); + } - /* convert to a self-subtask. */ - t->type = task_type_sub; + } - } + /* Otherwise, just make a pp task out of it. */ + else + t->type = task_type_grav_pp; - /* Otherwise, make tasks explicitly. */ - else { - - /* Take a step back (we're going to recycle the current task)... */ - redo = 1; - - /* Add the self taks. */ - for ( k = 0 ; ci->progeny[k] == NULL ; k++ ); - t->ci = ci->progeny[k]; - for ( k += 1 ; k < 8 ; k++ ) - if ( ci->progeny[k] != NULL ) - scheduler_addtask( s , task_type_self , task_subtype_density , 0 , 0 , ci->progeny[k] , NULL , 0 ); - - /* Make a task for each pair of progeny. 
*/ - for ( j = 0 ; j < 8 ; j++ ) - if ( ci->progeny[j] != NULL ) - for ( k = j + 1 ; k < 8 ; k++ ) - if ( ci->progeny[k] != NULL ) - scheduler_addtask( s , task_type_pair , task_subtype_density , pts[j][k] , 0 , ci->progeny[j] , ci->progeny[k] , 0 ); - } + } - } - - } - - /* Pair interaction? */ - else if ( t->type == task_type_pair ) { - - /* Get a handle on the cells involved. */ - ci = t->ci; - cj = t->cj; - hi = ci->dmin; - hj = cj->dmin; - - /* Foreign task? */ - if ( ci->nodeID != s->nodeID && cj->nodeID != s->nodeID ) { - t->skip = 1; - continue; - } - - /* Get the sort ID, use space_getsid and not t->flags - to make sure we get ci and cj swapped if needed. */ - sid = space_getsid( s->space , &ci , &cj , shift ); - - /* Should this task be split-up? */ - if ( ci->split && cj->split && - ci->h_max*kernel_gamma*space_stretch < hi/2 && - cj->h_max*kernel_gamma*space_stretch < hj/2 ) { - - /* Replace by a single sub-task? */ - if ( scheduler_dosub && - ci->count * sid_scale[sid] < space_subsize/cj->count && - sid != 0 && sid != 2 && sid != 6 && sid != 8 ) { - - /* Make this task a sub task. */ - t->type = task_type_sub; + /* Nope, pair. */ + else { - } - - /* Otherwise, split it. */ - else { - - /* Take a step back (we're going to recycle the current task)... */ - redo = 1; - - /* For each different sorting type... 
*/ - switch ( sid ) { - - case 0: /* ( 1 , 1 , 1 ) */ - t->ci = ci->progeny[7]; t->cj = cj->progeny[0]; t->flags = 0; - break; - - case 1: /* ( 1 , 1 , 0 ) */ - t->ci = ci->progeny[6]; t->cj = cj->progeny[0]; t->flags = 1; t->tight = 1; - t = scheduler_addtask( s , task_type_pair , t->subtype , 1 , 0 , ci->progeny[7] , cj->progeny[1] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 0 , 0 , ci->progeny[6] , cj->progeny[1] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 2 , 0 , ci->progeny[7] , cj->progeny[0] , 1 ); - break; - - case 2: /* ( 1 , 1 , -1 ) */ - t->ci = ci->progeny[6]; t->cj = cj->progeny[1]; t->flags = 2; t->tight = 1; - break; - - case 3: /* ( 1 , 0 , 1 ) */ - t->ci = ci->progeny[5]; t->cj = cj->progeny[0]; t->flags = 3; t->tight = 1; - t = scheduler_addtask( s , task_type_pair , t->subtype , 3 , 0 , ci->progeny[7] , cj->progeny[2] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 0 , 0 , ci->progeny[5] , cj->progeny[2] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 6 , 0 , ci->progeny[7] , cj->progeny[0] , 1 ); - break; - - case 4: /* ( 1 , 0 , 0 ) */ - t->ci = ci->progeny[4]; t->cj = cj->progeny[0]; t->flags = 4; t->tight = 1; - t = scheduler_addtask( s , task_type_pair , t->subtype , 5 , 0 , ci->progeny[5] , cj->progeny[0] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 7 , 0 , ci->progeny[6] , cj->progeny[0] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 8 , 0 , ci->progeny[7] , cj->progeny[0] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 3 , 0 , ci->progeny[4] , cj->progeny[1] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 4 , 0 , ci->progeny[5] , cj->progeny[1] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 6 , 0 , ci->progeny[6] , cj->progeny[1] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 7 , 0 , ci->progeny[7] , cj->progeny[1] , 1 ); - t = scheduler_addtask( s 
, task_type_pair , t->subtype , 1 , 0 , ci->progeny[4] , cj->progeny[2] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 2 , 0 , ci->progeny[5] , cj->progeny[2] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 4 , 0 , ci->progeny[6] , cj->progeny[2] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 5 , 0 , ci->progeny[7] , cj->progeny[2] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 0 , 0 , ci->progeny[4] , cj->progeny[3] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 1 , 0 , ci->progeny[5] , cj->progeny[3] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 3 , 0 , ci->progeny[6] , cj->progeny[3] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 4 , 0 , ci->progeny[7] , cj->progeny[3] , 1 ); - break; - - case 5: /* ( 1 , 0 , -1 ) */ - t->ci = ci->progeny[4]; t->cj = cj->progeny[1]; t->flags = 5; t->tight = 1; - t = scheduler_addtask( s , task_type_pair , t->subtype , 5 , 0 , ci->progeny[6] , cj->progeny[3] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 2 , 0 , ci->progeny[4] , cj->progeny[3] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 8 , 0 , ci->progeny[6] , cj->progeny[1] , 1 ); - break; - - case 6: /* ( 1 , -1 , 1 ) */ - t->ci = ci->progeny[5]; t->cj = cj->progeny[2]; t->flags = 6; t->tight = 1; - break; - - case 7: /* ( 1 , -1 , 0 ) */ - t->ci = ci->progeny[4]; t->cj = cj->progeny[3]; t->flags = 6; t->tight = 1; - t = scheduler_addtask( s , task_type_pair , t->subtype , 8 , 0 , ci->progeny[5] , cj->progeny[2] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 7 , 0 , ci->progeny[4] , cj->progeny[2] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 7 , 0 , ci->progeny[5] , cj->progeny[3] , 1 ); - break; - - case 8: /* ( 1 , -1 , -1 ) */ - t->ci = ci->progeny[4]; t->cj = cj->progeny[3]; t->flags = 8; t->tight = 1; - break; - - case 9: /* ( 0 , 1 , 1 ) */ - t->ci = 
ci->progeny[3]; t->cj = cj->progeny[0]; t->flags = 9; t->tight = 1; - t = scheduler_addtask( s , task_type_pair , t->subtype , 9 , 0 , ci->progeny[7] , cj->progeny[4] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 0 , 0 , ci->progeny[3] , cj->progeny[4] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 8 , 0 , ci->progeny[7] , cj->progeny[0] , 1 ); - break; - - case 10: /* ( 0 , 1 , 0 ) */ - t->ci = ci->progeny[2]; t->cj = cj->progeny[0]; t->flags = 10; t->tight = 1; - t = scheduler_addtask( s , task_type_pair , t->subtype , 11 , 0 , ci->progeny[3] , cj->progeny[0] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 7 , 0 , ci->progeny[6] , cj->progeny[0] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 6 , 0 , ci->progeny[7] , cj->progeny[0] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 9 , 0 , ci->progeny[2] , cj->progeny[1] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 10 , 0 , ci->progeny[3] , cj->progeny[1] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 8 , 0 , ci->progeny[6] , cj->progeny[1] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 7 , 0 , ci->progeny[7] , cj->progeny[1] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 1 , 0 , ci->progeny[2] , cj->progeny[4] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 2 , 0 , ci->progeny[3] , cj->progeny[4] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 10 , 0 , ci->progeny[6] , cj->progeny[4] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 11 , 0 , ci->progeny[7] , cj->progeny[4] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 0 , 0 , ci->progeny[2] , cj->progeny[5] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 1 , 0 , ci->progeny[3] , cj->progeny[5] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 9 , 0 , ci->progeny[6] , cj->progeny[5] , 1 ); - t 
= scheduler_addtask( s , task_type_pair , t->subtype , 10 , 0 , ci->progeny[7] , cj->progeny[5] , 1 ); - break; - - case 11: /* ( 0 , 1 , -1 ) */ - t->ci = ci->progeny[2]; t->cj = cj->progeny[1]; t->flags = 11; t->tight = 1; - t = scheduler_addtask( s , task_type_pair , t->subtype , 11 , 0 , ci->progeny[6] , cj->progeny[5] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 2 , 0 , ci->progeny[2] , cj->progeny[5] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 6 , 0 , ci->progeny[6] , cj->progeny[1] , 1 ); - break; - - case 12: /* ( 0 , 0 , 1 ) */ - t->ci = ci->progeny[1]; t->cj = cj->progeny[0]; t->flags = 12; t->tight = 1; - t = scheduler_addtask( s , task_type_pair , t->subtype , 11 , 0 , ci->progeny[3] , cj->progeny[0] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 5 , 0 , ci->progeny[5] , cj->progeny[0] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 2 , 0 , ci->progeny[7] , cj->progeny[0] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 9 , 0 , ci->progeny[1] , cj->progeny[2] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 12 , 0 , ci->progeny[3] , cj->progeny[2] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 8 , 0 , ci->progeny[5] , cj->progeny[2] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 5 , 0 , ci->progeny[7] , cj->progeny[2] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 3 , 0 , ci->progeny[1] , cj->progeny[4] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 6 , 0 , ci->progeny[3] , cj->progeny[4] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 12 , 0 , ci->progeny[5] , cj->progeny[4] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 11 , 0 , ci->progeny[7] , cj->progeny[4] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 0 , 0 , ci->progeny[1] , cj->progeny[6] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 
3 , 0 , ci->progeny[3] , cj->progeny[6] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 9 , 0 , ci->progeny[5] , cj->progeny[6] , 1 ); - t = scheduler_addtask( s , task_type_pair , t->subtype , 12 , 0 , ci->progeny[7] , cj->progeny[6] , 1 ); - break; - - } - - } + /* Make a sub-task? */ + if (scheduler_dosub && ci->count < space_subsize / cj->count) { - } /* split this task? */ - - /* Otherwise, break it up if it is too large? */ - else if ( scheduler_doforcesplit && ci->split && cj->split && - ( ci->count > space_maxsize / cj->count ) ) { - - // message( "force splitting pair with %i and %i parts." , ci->count , cj->count ); - - /* Replace the current task. */ - t->type = task_type_none; - - for ( j = 0 ; j < 8 ; j++ ) - if ( ci->progeny[j] != NULL ) - for ( k = 0 ; k < 8 ; k++ ) - if ( cj->progeny[k] != NULL ) { - t = scheduler_addtask( s , task_type_pair , t->subtype , 0 , 0 , ci->progeny[j] , cj->progeny[k] , 0 ); - t->flags = space_getsid( s->space , &t->ci , &t->cj , shift ); - } - - } - - /* Otherwise, if not spilt, stitch-up the sorting. */ - else { - - /* Create the sort for ci. */ - // lock_lock( &ci->lock ); - if ( ci->sorts == NULL ) - ci->sorts = scheduler_addtask( s , task_type_sort , 0 , 1 << sid , 0 , ci , NULL , 0 ); - else - ci->sorts->flags |= (1 << sid); - // lock_unlock_blind( &ci->lock ); - scheduler_addunlock( s , ci->sorts , t ); - - /* Create the sort for cj. */ - // lock_lock( &cj->lock ); - if ( cj->sorts == NULL ) - cj->sorts = scheduler_addtask( s , task_type_sort , 0 , 1 << sid , 0 , cj , NULL , 0 ); - else - cj->sorts->flags |= (1 << sid); - // lock_unlock_blind( &cj->lock ); - scheduler_addunlock( s , cj->sorts , t ); - - } - - } /* pair interaction? */ - - /* Gravity interaction? */ - else if ( t->type == task_type_grav_mm ) { - - /* Get a handle on the cells involved. */ - ci = t->ci; - cj = t->cj; - - /* Self-interaction? */ - if ( cj == NULL ) { - - /* Ignore this task if the cell has no gparts. 
*/ - if ( ci->gcount == 0 ) - t->type = task_type_none; - - /* If the cell is split, recurse. */ - else if ( ci->split ) { - - /* Make a single sub-task? */ - if ( scheduler_dosub && ci->count < space_subsize/ci->count ) { - - t->type = task_type_sub; - t->subtype = task_subtype_grav; - - } - - /* Otherwise, just split the task. */ - else { - - /* Split this task into tasks on its progeny. */ - t->type = task_type_none; - for ( j = 0 ; j < 8 ; j++ ) - if ( ci->progeny[j] != NULL && ci->progeny[j]->gcount > 0 ) { - if ( t->type == task_type_none ) { - t->type = task_type_grav_mm; - t->ci = ci->progeny[j]; - t->cj = NULL; - } - else - t = scheduler_addtask( s , task_type_grav_mm , task_subtype_none , 0 , 0 , ci->progeny[j] , NULL , 0 ); - for ( k = j+1 ; k < 8 ; k++ ) - if ( ci->progeny[k] != NULL && ci->progeny[k]->gcount > 0 ) { - if ( t->type == task_type_none ) { - t->type = task_type_grav_mm; - t->ci = ci->progeny[j]; - t->cj = ci->progeny[k]; - } - else - t = scheduler_addtask( s , task_type_grav_mm , task_subtype_none , 0 , 0 , ci->progeny[j] , ci->progeny[k] , 0 ); - } - } - redo = ( t->type != task_type_none ); - - } - + t->type = task_type_sub; + t->subtype = task_subtype_grav; + + } + + /* Otherwise, split the task. */ + else { + + /* Get the opening angle theta. */ + float dx[3], theta; + for (k = 0; k < 3; k++) { + dx[k] = fabsf(ci->loc[k] - cj->loc[k]); + if (s->space->periodic && dx[k] > 0.5 * s->space->dim[k]) + dx[k] = -dx[k] + s->space->dim[k]; + if (dx[k] > 0.0f) dx[k] -= ci->h[k]; + } + theta = + (dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]) / + (ci->h[0] * ci->h[0] + ci->h[1] * ci->h[1] + ci->h[2] * ci->h[2]); + + /* Ignore this task if the cell has no gparts. */ + if (ci->gcount == 0 || cj->gcount == 0) t->type = task_type_none; + + /* Split the interacton? */ + else if (theta < const_theta_max * const_theta_max) { + + /* Are both ci and cj split? */ + if (ci->split && cj->split) { + + /* Split this task into tasks on its progeny. 
*/ + t->type = task_type_none; + for (j = 0; j < 8; j++) + if (ci->progeny[j] != NULL && ci->progeny[j]->gcount > 0) { + for (k = 0; k < 8; k++) + if (cj->progeny[k] != NULL && cj->progeny[k]->gcount > 0) { + if (t->type == task_type_none) { + t->type = task_type_grav_mm; + t->ci = ci->progeny[j]; + t->cj = cj->progeny[k]; + } else + t = scheduler_addtask( + s, task_type_grav_mm, task_subtype_none, 0, 0, + ci->progeny[j], cj->progeny[k], 0); } - - /* Otherwise, just make a pp task out of it. */ - else - t->type = task_type_grav_pp; - } - - /* Nope, pair. */ - else { - - /* Make a sub-task? */ - if ( scheduler_dosub && ci->count < space_subsize/cj->count ) { - - t->type = task_type_sub; - t->subtype = task_subtype_grav; - - } - - /* Otherwise, split the task. */ - else { - - /* Get the opening angle theta. */ - float dx[3], theta; - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = fabsf( ci->loc[k] - cj->loc[k] ); - if ( s->space->periodic && dx[k] > 0.5*s->space->dim[k] ) - dx[k] = -dx[k] + s->space->dim[k]; - if ( dx[k] > 0.0f ) - dx[k] -= ci->h[k]; - } - theta = ( dx[0]*dx[0] + dx[1]*dx[1] + dx[2]*dx[2] ) / - ( ci->h[0]*ci->h[0] + ci->h[1]*ci->h[1] + ci->h[2]*ci->h[2] ); - - /* Ignore this task if the cell has no gparts. */ - if ( ci->gcount == 0 || cj->gcount == 0 ) - t->type = task_type_none; - - /* Split the interacton? */ - else if ( theta < const_theta_max*const_theta_max ) { - - /* Are both ci and cj split? */ - if ( ci->split && cj->split ) { - - /* Split this task into tasks on its progeny. 
*/ - t->type = task_type_none; - for ( j = 0 ; j < 8 ; j++ ) - if ( ci->progeny[j] != NULL && ci->progeny[j]->gcount > 0 ) { - for ( k = 0 ; k < 8 ; k++ ) - if ( cj->progeny[k] != NULL && cj->progeny[k]->gcount > 0 ) { - if ( t->type == task_type_none ) { - t->type = task_type_grav_mm; - t->ci = ci->progeny[j]; - t->cj = cj->progeny[k]; - } - else - t = scheduler_addtask( s , task_type_grav_mm , task_subtype_none , 0 , 0 , ci->progeny[j] , cj->progeny[k] , 0 ); - } - } - redo = ( t->type != task_type_none ); - - } - - /* Otherwise, make a pp task out of it. */ - else - t->type = task_type_grav_pp; - - } - - } - - } /* gravity pair interaction? */ - - } /* gravity interaction? */ - - } /* loop over all tasks. */ - - } - - + redo = (t->type != task_type_none); + + } + + /* Otherwise, make a pp task out of it. */ + else + t->type = task_type_grav_pp; + } + } + + } /* gravity pair interaction? */ + + } /* gravity interaction? */ + + } /* loop over all tasks. */ +} + /** * @brief Add a #task to the #scheduler. * @@ -550,116 +640,111 @@ void scheduler_splittasks ( struct scheduler *s ) { * @param type The type of the task. * @param subtype The sub-type of the task. * @param flags The flags of the task. - * @param wait + * @param wait * @param ci The first cell to interact. * @param cj The second cell to interact. * @param tight */ - -struct task *scheduler_addtask ( struct scheduler *s , int type , int subtype , int flags , int wait , struct cell *ci , struct cell *cj , int tight ) { - - int ind; - struct task *t; - - /* Get the next free task. */ - ind = atomic_inc( &s->tasks_next ); - - /* Overflow? */ - if ( ind >= s->size ) - error( "Task list overflow." ); - - /* Get a pointer to the new task. */ - t = &s->tasks[ ind ]; - - /* Copy the data. 
*/ - t->type = type; - t->subtype = subtype; - t->flags = flags; - t->wait = wait; - t->ci = ci; - t->cj = cj; - t->skip = 0; - t->tight = tight; - t->implicit = 0; - t->weight = 0; - t->rank = 0; - t->tic = 0; - t->toc = 0; - t->nr_unlock_tasks = 0; - - /* Init the lock. */ - lock_init( &t->lock ); - - /* Add an index for it. */ - // lock_lock( &s->lock ); - s->tasks_ind[ atomic_inc( &s->nr_tasks ) ] = ind; - // lock_unlock_blind( &s->lock ); - - /* Return a pointer to the new task. */ - return t; - - } - +struct task *scheduler_addtask(struct scheduler *s, int type, int subtype, + int flags, int wait, struct cell *ci, + struct cell *cj, int tight) { + + int ind; + struct task *t; + + /* Get the next free task. */ + ind = atomic_inc(&s->tasks_next); + + /* Overflow? */ + if (ind >= s->size) error("Task list overflow."); + + /* Get a pointer to the new task. */ + t = &s->tasks[ind]; + + /* Copy the data. */ + t->type = type; + t->subtype = subtype; + t->flags = flags; + t->wait = wait; + t->ci = ci; + t->cj = cj; + t->skip = 0; + t->tight = tight; + t->implicit = 0; + t->weight = 0; + t->rank = 0; + t->tic = 0; + t->toc = 0; + t->nr_unlock_tasks = 0; + + /* Init the lock. */ + lock_init(&t->lock); + + /* Add an index for it. */ + // lock_lock( &s->lock ); + s->tasks_ind[atomic_inc(&s->nr_tasks)] = ind; + // lock_unlock_blind( &s->lock ); + + /* Return a pointer to the new task. */ + return t; +} -/** +/** * @brief Sort the tasks in topological order over all queues. * * @param s The #scheduler. */ - -void scheduler_ranktasks ( struct scheduler *s ) { - - int i, j = 0, k, temp, left = 0, rank; - struct task *t, *tasks = s->tasks; - int *tid = s->tasks_ind, nr_tasks = s->nr_tasks; - - /* Run throught the tasks and get all the waits right. */ - for ( i = 0 , k = 0 ; k < nr_tasks ; k++ ) { - tid[k] = k; - for ( j = 0 ; j < tasks[k].nr_unlock_tasks ; j++ ) - tasks[k].unlock_tasks[j]->wait += 1; - } - - /* Main loop. 
*/ - for ( j = 0 , rank = 0 ; left < nr_tasks ; rank++ ) { - - /* Load the tids of tasks with no waits. */ - for ( k = left ; k < nr_tasks ; k++ ) - if ( tasks[ tid[k] ].wait == 0 ) { - temp = tid[j]; tid[j] = tid[k]; tid[k] = temp; - j += 1; - } - - /* Did we get anything? */ - if ( j == left ) - error( "Unsatisfiable task dependencies detected." ); - - /* Unlock the next layer of tasks. */ - for ( i = left ; i < j ; i++ ) { - t = &tasks[ tid[i] ]; - t->rank = rank; - tid[i] = t - tasks; - if ( tid[i] >= nr_tasks ) - error( "Task index overshoot." ); - /* message( "task %i of type %s has rank %i." , i , - (t->type == task_type_self) ? "self" : (t->type == task_type_pair) ? "pair" : "sort" , rank ); */ - for ( k = 0 ; k < t->nr_unlock_tasks ; k++ ) - t->unlock_tasks[k]->wait -= 1; - } - - /* The new left (no, not tony). */ - left = j; - - } - - /* Verify that the tasks were ranked correctly. */ - /* for ( k = 1 ; k < s->nr_tasks ; k++ ) - if ( tasks[ tid[k-1] ].rank > tasks[ tid[k-1] ].rank ) - error( "Task ranking failed." ); */ - + +void scheduler_ranktasks(struct scheduler *s) { + + int i, j = 0, k, temp, left = 0, rank; + struct task *t, *tasks = s->tasks; + int *tid = s->tasks_ind, nr_tasks = s->nr_tasks; + + /* Run throught the tasks and get all the waits right. */ + for (i = 0, k = 0; k < nr_tasks; k++) { + tid[k] = k; + for (j = 0; j < tasks[k].nr_unlock_tasks; j++) + tasks[k].unlock_tasks[j]->wait += 1; + } + + /* Main loop. */ + for (j = 0, rank = 0; left < nr_tasks; rank++) { + + /* Load the tids of tasks with no waits. */ + for (k = left; k < nr_tasks; k++) + if (tasks[tid[k]].wait == 0) { + temp = tid[j]; + tid[j] = tid[k]; + tid[k] = temp; + j += 1; + } + + /* Did we get anything? */ + if (j == left) error("Unsatisfiable task dependencies detected."); + + /* Unlock the next layer of tasks. 
*/ + for (i = left; i < j; i++) { + t = &tasks[tid[i]]; + t->rank = rank; + tid[i] = t - tasks; + if (tid[i] >= nr_tasks) error("Task index overshoot."); + /* message( "task %i of type %s has rank %i." , i , + (t->type == task_type_self) ? "self" : (t->type == task_type_pair) ? + "pair" : "sort" , rank ); */ + for (k = 0; k < t->nr_unlock_tasks; k++) t->unlock_tasks[k]->wait -= 1; } + /* The new left (no, not tony). */ + left = j; + } + + /* Verify that the tasks were ranked correctly. */ + /* for ( k = 1 ; k < s->nr_tasks ; k++ ) + if ( tasks[ tid[k-1] ].rank > tasks[ tid[k-1] ].rank ) + error( "Task ranking failed." ); */ +} /** * @brief (Re)allocate the task arrays. @@ -667,131 +752,123 @@ void scheduler_ranktasks ( struct scheduler *s ) { * @param s The #scheduler. * @param size The maximum number of tasks in the #scheduler. */ - -void scheduler_reset ( struct scheduler *s , int size ) { - int k; +void scheduler_reset(struct scheduler *s, int size) { - /* Do we need to re-allocate? */ - if ( size > s->size ) { + int k; - /* Free exising task lists if necessary. */ - if ( s->tasks != NULL ) - free( s->tasks ); - if ( s->tasks_ind != NULL ) - free( s->tasks_ind ); + /* Do we need to re-allocate? */ + if (size > s->size) { - /* Allocate the new lists. */ - if ( ( s->tasks = (struct task *)malloc( sizeof(struct task) * size ) ) == NULL || - ( s->tasks_ind = (int *)malloc( sizeof(int) * size ) ) == NULL ) - error( "Failed to allocate task lists." ); - - } - - /* Reset the task data. */ - bzero( s->tasks , sizeof(struct task) * size ); - - /* Reset the counters. */ - s->size = size; - s->nr_tasks = 0; - s->tasks_next = 0; - s->waiting = 0; - - /* Set the task pointers in the queues. */ - for ( k = 0 ; k < s->nr_queues ; k++ ) - s->queues[k].tasks = s->tasks; + /* Free exising task lists if necessary. */ + if (s->tasks != NULL) free(s->tasks); + if (s->tasks_ind != NULL) free(s->tasks_ind); - } + /* Allocate the new lists. 
*/ + if ((s->tasks = (struct task *)malloc(sizeof(struct task) *size)) == NULL || + (s->tasks_ind = (int *)malloc(sizeof(int) * size)) == NULL) + error("Failed to allocate task lists."); + } + /* Reset the task data. */ + bzero(s->tasks, sizeof(struct task) * size); + + /* Reset the counters. */ + s->size = size; + s->nr_tasks = 0; + s->tasks_next = 0; + s->waiting = 0; + + /* Set the task pointers in the queues. */ + for (k = 0; k < s->nr_queues; k++) s->queues[k].tasks = s->tasks; +} /** * @brief Compute the task weights * * @param s The #scheduler. */ - -void scheduler_reweight ( struct scheduler *s ) { - - int k, j, nr_tasks = s->nr_tasks, *tid = s->tasks_ind; - struct task *t, *tasks = s->tasks; - int nodeID = s->nodeID; - float sid_scale[13] = { 0.1897 , 0.4025 , 0.1897 , 0.4025 , 0.5788 , 0.4025 , - 0.1897 , 0.4025 , 0.1897 , 0.4025 , 0.5788 , 0.4025 , - 0.5788 }; - float wscale = 0.001; - // ticks tic; - - /* Run throught the tasks backwards and set their waits and - weights. */ - // tic = getticks(); - for ( k = nr_tasks-1 ; k >= 0 ; k-- ) { - t = &tasks[ tid[k] ]; - t->weight = 0; - for ( j = 0 ; j < t->nr_unlock_tasks ; j++ ) - if ( t->unlock_tasks[j]->weight > t->weight ) - t->weight = t->unlock_tasks[j]->weight; - if ( !t->implicit && t->tic > 0 ) - t->weight += wscale * (t->toc - t->tic); - else - switch ( t->type ) { - case task_type_sort: - t->weight += wscale * __builtin_popcount( t->flags ) * t->ci->count * ( sizeof(int)*8 - __builtin_clz( t->ci->count ) ); - break; - case task_type_self: - t->weight += 1 * t->ci->count * t->ci->count; - break; - case task_type_pair: - if ( t->ci->nodeID != nodeID || t->cj->nodeID != nodeID ) - t->weight += 3 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ]; - else - t->weight += 2 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ]; - break; - case task_type_sub: - if ( t->cj != NULL ) { - if ( t->ci->nodeID != nodeID || t->cj->nodeID != nodeID ) { - if ( t->flags < 0 ) - t->weight += 3 * 
wscale * t->ci->count * t->cj->count; - else - t->weight += 3 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ]; - } - else { - if ( t->flags < 0 ) - t->weight += 2 * wscale * t->ci->count * t->cj->count; - else - t->weight += 2 * wscale * t->ci->count * t->cj->count * sid_scale[ t->flags ]; - } - } - else - t->weight += 1 * wscale * t->ci->count * t->ci->count; - break; - case task_type_ghost: - if ( t->ci == t->ci->super ) - t->weight += wscale * t->ci->count; - break; - case task_type_kick1: - case task_type_kick2: - t->weight += wscale * t->ci->count; - break; - default: - break; - } - if ( t->type == task_type_send ) - t->weight = INT_MAX / 8; - if ( t->type == task_type_recv ) - t->weight *= 1.41; - } - // message( "weighting tasks took %.3f ms." , (double)( getticks() - tic ) / CPU_TPS * 1000 ); - - /* int min = tasks[0].weight, max = tasks[0].weight; - for ( k = 1 ; k < nr_tasks ; k++ ) - if ( tasks[k].weight < min ) - min = tasks[k].weight; - else if ( tasks[k].weight > max ) - max = tasks[k].weight; - message( "task weights are in [ %i , %i ]." , min , max ); */ - - } +void scheduler_reweight(struct scheduler *s) { + + int k, j, nr_tasks = s->nr_tasks, *tid = s->tasks_ind; + struct task *t, *tasks = s->tasks; + int nodeID = s->nodeID; + float sid_scale[13] = {0.1897, 0.4025, 0.1897, 0.4025, 0.5788, 0.4025, 0.1897, + 0.4025, 0.1897, 0.4025, 0.5788, 0.4025, 0.5788}; + float wscale = 0.001; + // ticks tic; + + /* Run throught the tasks backwards and set their waits and + weights. 
*/ + // tic = getticks(); + for (k = nr_tasks - 1; k >= 0; k--) { + t = &tasks[tid[k]]; + t->weight = 0; + for (j = 0; j < t->nr_unlock_tasks; j++) + if (t->unlock_tasks[j]->weight > t->weight) + t->weight = t->unlock_tasks[j]->weight; + if (!t->implicit && t->tic > 0) + t->weight += wscale * (t->toc - t->tic); + else + switch (t->type) { + case task_type_sort: + t->weight += wscale * __builtin_popcount(t->flags) * t->ci->count * + (sizeof(int) * 8 - __builtin_clz(t->ci->count)); + break; + case task_type_self: + t->weight += 1 * t->ci->count * t->ci->count; + break; + case task_type_pair: + if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) + t->weight += + 3 * wscale * t->ci->count * t->cj->count * sid_scale[t->flags]; + else + t->weight += + 2 * wscale * t->ci->count * t->cj->count * sid_scale[t->flags]; + break; + case task_type_sub: + if (t->cj != NULL) { + if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) { + if (t->flags < 0) + t->weight += 3 * wscale * t->ci->count * t->cj->count; + else + t->weight += 3 * wscale * t->ci->count * t->cj->count * + sid_scale[t->flags]; + } else { + if (t->flags < 0) + t->weight += 2 * wscale * t->ci->count * t->cj->count; + else + t->weight += 2 * wscale * t->ci->count * t->cj->count * + sid_scale[t->flags]; + } + } else + t->weight += 1 * wscale * t->ci->count * t->ci->count; + break; + case task_type_ghost: + if (t->ci == t->ci->super) t->weight += wscale * t->ci->count; + break; + case task_type_kick1: + case task_type_kick2: + t->weight += wscale * t->ci->count; + break; + default: + break; + } + if (t->type == task_type_send) t->weight = INT_MAX / 8; + if (t->type == task_type_recv) t->weight *= 1.41; + } + // message( "weighting tasks took %.3f ms." 
, (double)( getticks() - tic ) / + // CPU_TPS * 1000 ); + + /* int min = tasks[0].weight, max = tasks[0].weight; + for ( k = 1 ; k < nr_tasks ; k++ ) + if ( tasks[k].weight < min ) + min = tasks[k].weight; + else if ( tasks[k].weight > max ) + max = tasks[k].weight; + message( "task weights are in [ %i , %i ]." , min , max ); */ +} /** * @brief Start the scheduler, i.e. fill the queues with ready tasks. @@ -799,46 +876,44 @@ void scheduler_reweight ( struct scheduler *s ) { * @param s The #scheduler. * @param mask The task types to enqueue. */ - -void scheduler_start ( struct scheduler *s , unsigned int mask ) { - - int k, j, nr_tasks = s->nr_tasks, *tid = s->tasks_ind; - struct task *t, *tasks = s->tasks; - // ticks tic; - - /* Run throught the tasks and set their waits. */ - // tic = getticks(); - for ( k = nr_tasks - 1 ; k >= 0 ; k-- ) { - t = &tasks[ tid[k] ]; - t->wait = 0; - t->rid = -1; - if ( !( (1 << t->type) & mask ) || t->skip ) - continue; - for ( j = 0 ; j < t->nr_unlock_tasks ; j++ ) - atomic_inc( &t->unlock_tasks[j]->wait ); - } - // message( "waiting tasks took %.3f ms." , (double)( getticks() - tic ) / CPU_TPS * 1000 ); - - /* Don't enqueue link tasks directly. */ - mask &= ~(1 << task_type_link); - - /* Loop over the tasks and enqueue whoever is ready. */ - // tic = getticks(); - for ( k = 0 ; k < nr_tasks ; k++) { - t = &tasks[ tid[k] ]; - if ( ( (1 << t->type) & mask ) && !t->skip ) { - if ( t->wait == 0 ) { - scheduler_enqueue( s , t ); - pthread_cond_broadcast( &s->sleep_cond ); - } - else - break; - } - } - // message( "enqueueing tasks took %.3f ms." , (double)( getticks() - tic ) / CPU_TPS * 1000 ); - - } +void scheduler_start(struct scheduler *s, unsigned int mask) { + + int k, j, nr_tasks = s->nr_tasks, *tid = s->tasks_ind; + struct task *t, *tasks = s->tasks; + // ticks tic; + + /* Run throught the tasks and set their waits. 
*/ + // tic = getticks(); + for (k = nr_tasks - 1; k >= 0; k--) { + t = &tasks[tid[k]]; + t->wait = 0; + t->rid = -1; + if (!((1 << t->type) & mask) || t->skip) continue; + for (j = 0; j < t->nr_unlock_tasks; j++) + atomic_inc(&t->unlock_tasks[j]->wait); + } + // message( "waiting tasks took %.3f ms." , (double)( getticks() - tic ) / + // CPU_TPS * 1000 ); + + /* Don't enqueue link tasks directly. */ + mask &= ~(1 << task_type_link); + + /* Loop over the tasks and enqueue whoever is ready. */ + // tic = getticks(); + for (k = 0; k < nr_tasks; k++) { + t = &tasks[tid[k]]; + if (((1 << t->type) & mask) && !t->skip) { + if (t->wait == 0) { + scheduler_enqueue(s, t); + pthread_cond_broadcast(&s->sleep_cond); + } else + break; + } + } + // message( "enqueueing tasks took %.3f ms." , (double)( getticks() - tic ) / + // CPU_TPS * 1000 ); +} /** * @brief Put a task on one of the queues. @@ -846,97 +921,97 @@ void scheduler_start ( struct scheduler *s , unsigned int mask ) { * @param s The #scheduler. * @param t The #task. */ - -void scheduler_enqueue ( struct scheduler *s , struct task *t ) { - - int qid = -1; - #ifdef WITH_MPI - int err; - #endif - - /* Ignore skipped tasks. */ - if ( t->skip || atomic_cas( &t->rid , -1 , 0 ) != -1 ) - return; - - /* If this is an implicit task, just pretend it's done. */ - if ( t->implicit ) { - for ( int j = 0 ; j < t->nr_unlock_tasks ; j++ ) { - struct task *t2 = t->unlock_tasks[j]; - if ( atomic_dec( &t2->wait ) == 1 && !t2->skip ) - scheduler_enqueue( s , t2 ); - } + +void scheduler_enqueue(struct scheduler *s, struct task *t) { + + int qid = -1; +#ifdef WITH_MPI + int err; +#endif + + /* Ignore skipped tasks. */ + if (t->skip || atomic_cas(&t->rid, -1, 0) != -1) return; + + /* If this is an implicit task, just pretend it's done. 
*/ + if (t->implicit) { + for (int j = 0; j < t->nr_unlock_tasks; j++) { + struct task *t2 = t->unlock_tasks[j]; + if (atomic_dec(&t2->wait) == 1 && !t2->skip) scheduler_enqueue(s, t2); + } + } + + /* Otherwise, look for a suitable queue. */ + else { + + /* Find the previous owner for each task type, and do + any pre-processing needed. */ + switch (t->type) { + case task_type_self: + case task_type_sort: + case task_type_ghost: + case task_type_kick2: + qid = t->ci->super->owner; + break; + case task_type_pair: + case task_type_sub: + qid = t->ci->super->owner; + if (t->cj != NULL && + (qid < 0 || + s->queues[qid].count > s->queues[t->cj->super->owner].count)) + qid = t->cj->super->owner; + break; + case task_type_recv: +#ifdef WITH_MPI + if ((err = MPI_Irecv(t->ci->parts, sizeof(struct part) * t->ci->count, + MPI_BYTE, t->ci->nodeID, t->flags, MPI_COMM_WORLD, + &t->req)) != MPI_SUCCESS) { + char buff[MPI_MAX_ERROR_STRING]; + int len; + MPI_Error_string(err, buff, &len); + error("Failed to emit irecv for particle data (%s).", buff); } - - /* Otherwise, look for a suitable queue. */ - else { - - /* Find the previous owner for each task type, and do - any pre-processing needed. */ - switch ( t->type ) { - case task_type_self: - case task_type_sort: - case task_type_ghost: - case task_type_kick2: - qid = t->ci->super->owner; - break; - case task_type_pair: - case task_type_sub: - qid = t->ci->super->owner; - if ( t->cj != NULL && - ( qid < 0 || s->queues[qid].count > s->queues[t->cj->super->owner].count ) ) - qid = t->cj->super->owner; - break; - case task_type_recv: - #ifdef WITH_MPI - if ( ( err = MPI_Irecv( t->ci->parts , sizeof(struct part) * t->ci->count , MPI_BYTE , t->ci->nodeID , t->flags , MPI_COMM_WORLD , &t->req ) ) != MPI_SUCCESS ) { - char buff[ MPI_MAX_ERROR_STRING ]; - int len; - MPI_Error_string( err , buff , &len ); - error( "Failed to emit irecv for particle data (%s)." , buff ); - } - // message( "recieving %i parts with tag=%i from %i to %i." 
, - // t->ci->count , t->flags , t->ci->nodeID , s->nodeID ); fflush(stdout); - qid = 1 % s->nr_queues; - #else - error( "SWIFT was not compiled with MPI support." ); - #endif - break; - case task_type_send: - #ifdef WITH_MPI - if ( ( err = MPI_Isend( t->ci->parts , sizeof(struct part) * t->ci->count , MPI_BYTE , t->cj->nodeID , t->flags , MPI_COMM_WORLD , &t->req ) ) != MPI_SUCCESS ) { - char buff[ MPI_MAX_ERROR_STRING ]; - int len; - MPI_Error_string( err , buff , &len ); - error( "Failed to emit isend for particle data (%s)." , buff ); - } - // message( "sending %i parts with tag=%i from %i to %i." , - // t->ci->count , t->flags , s->nodeID , t->cj->nodeID ); fflush(stdout); - qid = 0; - #else - error( "SWIFT was not compiled with MPI support." ); - #endif - break; - default: - qid = -1; - } - - if ( qid >= s->nr_queues ) - error( "Bad computed qid." ); - - /* If no previous owner, find the shortest queue. */ - if ( qid < 0 ) - qid = rand() % s->nr_queues; - - /* Increase the waiting counter. */ - atomic_inc( &s->waiting ); - - /* Insert the task into that queue. */ - queue_insert( &s->queues[qid] , t ); - + // message( "recieving %i parts with tag=%i from %i to %i." , + // t->ci->count , t->flags , t->ci->nodeID , s->nodeID ); + // fflush(stdout); + qid = 1 % s->nr_queues; +#else + error("SWIFT was not compiled with MPI support."); +#endif + break; + case task_type_send: +#ifdef WITH_MPI + if ((err = MPI_Isend(t->ci->parts, sizeof(struct part) * t->ci->count, + MPI_BYTE, t->cj->nodeID, t->flags, MPI_COMM_WORLD, + &t->req)) != MPI_SUCCESS) { + char buff[MPI_MAX_ERROR_STRING]; + int len; + MPI_Error_string(err, buff, &len); + error("Failed to emit isend for particle data (%s).", buff); } - + // message( "sending %i parts with tag=%i from %i to %i." 
, + // t->ci->count , t->flags , s->nodeID , t->cj->nodeID ); + // fflush(stdout); + qid = 0; +#else + error("SWIFT was not compiled with MPI support."); +#endif + break; + default: + qid = -1; } + if (qid >= s->nr_queues) error("Bad computed qid."); + + /* If no previous owner, find the shortest queue. */ + if (qid < 0) qid = rand() % s->nr_queues; + + /* Increase the waiting counter. */ + atomic_inc(&s->waiting); + + /* Insert the task into that queue. */ + queue_insert(&s->queues[qid], t); + } +} /** * @brief Take care of a tasks dependencies. @@ -947,58 +1022,49 @@ void scheduler_enqueue ( struct scheduler *s , struct task *t ) { * @return A pointer to the next task, if a suitable one has * been identified. */ - -struct task *scheduler_done ( struct scheduler *s , struct task *t ) { - - int k, res; - struct task *t2, *next = NULL; - struct cell *super = t->ci->super; - - /* Release whatever locks this task held. */ - if ( !t->implicit ) - task_unlock( t ); - - /* Loop through the dependencies and add them to a queue if - they are ready. */ - for ( k = 0 ; k < t->nr_unlock_tasks ; k++ ) { - t2 = t->unlock_tasks[k]; - if ( ( res = atomic_dec( &t2->wait ) ) < 1 ) - error( "Negative wait!" ); - if ( res == 1 && !t2->skip ) { - if ( 0 && !t2->implicit && - t2->ci->super == super && - ( next == NULL || t2->weight > next->weight ) && - task_lock( t2 ) ) { - if ( next != NULL ) { - task_unlock( next ); - scheduler_enqueue( s , next ); - } - next = t2; - } - else - scheduler_enqueue( s , t2 ); - } - } - - /* Task definitely done. */ - if ( !t->implicit ) { - t->toc = getticks(); - pthread_mutex_lock( &s->sleep_mutex ); - if ( next == NULL ) - atomic_dec( &s->waiting ); - pthread_cond_broadcast( &s->sleep_cond ); - pthread_mutex_unlock( &s->sleep_mutex ); + +struct task *scheduler_done(struct scheduler *s, struct task *t) { + + int k, res; + struct task *t2, *next = NULL; + struct cell *super = t->ci->super; + + /* Release whatever locks this task held. 
*/ + if (!t->implicit) task_unlock(t); + + /* Loop through the dependencies and add them to a queue if + they are ready. */ + for (k = 0; k < t->nr_unlock_tasks; k++) { + t2 = t->unlock_tasks[k]; + if ((res = atomic_dec(&t2->wait)) < 1) error("Negative wait!"); + if (res == 1 && !t2->skip) { + if (0 && !t2->implicit && t2->ci->super == super && + (next == NULL || t2->weight > next->weight) && task_lock(t2)) { + if (next != NULL) { + task_unlock(next); + scheduler_enqueue(s, next); } + next = t2; + } else + scheduler_enqueue(s, t2); + } + } - /* Start the clock on the follow-up task. */ - if ( next != NULL ) - next->tic = getticks(); - - /* Return the next best task. */ - return next; + /* Task definitely done. */ + if (!t->implicit) { + t->toc = getticks(); + pthread_mutex_lock(&s->sleep_mutex); + if (next == NULL) atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + } - } + /* Start the clock on the follow-up task. */ + if (next != NULL) next->tic = getticks(); + /* Return the next best task. */ + return next; +} /** * @brief Resolve a single dependency by hand. @@ -1009,41 +1075,35 @@ struct task *scheduler_done ( struct scheduler *s , struct task *t ) { * @return A pointer to the next task, if a suitable one has * been identified. */ - -struct task *scheduler_unlock ( struct scheduler *s , struct task *t ) { - - int k, res; - struct task *t2, *next = NULL; - - /* Loop through the dependencies and add them to a queue if - they are ready. */ - for ( k = 0 ; k < t->nr_unlock_tasks ; k++ ) { - t2 = t->unlock_tasks[k]; - if ( ( res = atomic_dec( &t2->wait ) ) < 1 ) - error( "Negative wait!" ); - if ( res == 1 && !t2->skip ) - scheduler_enqueue( s , t2 ); - } - - /* Task definitely done. 
*/ - if ( !t->implicit ) { - t->toc = getticks(); - pthread_mutex_lock( &s->sleep_mutex ); - if ( next == NULL ) - atomic_dec( &s->waiting ); - pthread_cond_broadcast( &s->sleep_cond ); - pthread_mutex_unlock( &s->sleep_mutex ); - } - /* Start the clock on the follow-up task. */ - if ( next != NULL ) - next->tic = getticks(); - - /* Return the next best task. */ - return next; +struct task *scheduler_unlock(struct scheduler *s, struct task *t) { - } + int k, res; + struct task *t2, *next = NULL; + + /* Loop through the dependencies and add them to a queue if + they are ready. */ + for (k = 0; k < t->nr_unlock_tasks; k++) { + t2 = t->unlock_tasks[k]; + if ((res = atomic_dec(&t2->wait)) < 1) error("Negative wait!"); + if (res == 1 && !t2->skip) scheduler_enqueue(s, t2); + } + /* Task definitely done. */ + if (!t->implicit) { + t->toc = getticks(); + pthread_mutex_lock(&s->sleep_mutex); + if (next == NULL) atomic_dec(&s->waiting); + pthread_cond_broadcast(&s->sleep_cond); + pthread_mutex_unlock(&s->sleep_mutex); + } + + /* Start the clock on the follow-up task. */ + if (next != NULL) next->tic = getticks(); + + /* Return the next best task. */ + return next; +} /** * @brief Get a task, preferably from the given queue. @@ -1054,79 +1114,72 @@ struct task *scheduler_unlock ( struct scheduler *s , struct task *t ) { * * @return A pointer to a #task or @c NULL if there are no available tasks. */ - -struct task *scheduler_gettask ( struct scheduler *s , int qid , struct cell *super ) { - - struct task *res = NULL; - int k, nr_queues = s->nr_queues; - unsigned int seed = qid; - - /* Check qid. */ - if ( qid >= nr_queues || qid < 0 ) - error( "Bad queue ID." ); - - /* Loop as long as there are tasks... */ - while ( s->waiting > 0 && res == NULL ) { - - /* Try more than once before sleeping. */ - for ( int tries = 0 ; res == NULL && s->waiting && tries < scheduler_maxtries ; tries++ ) { - - /* Try to get a task from the suggested queue. 
*/ - if ( s->queues[qid].count > 0 ) { - TIMER_TIC - res = queue_gettask( &s->queues[qid] , super , 0 ); - TIMER_TOC( timer_qget ); - if ( res != NULL ) - break; - } - /* If unsucessful, try stealing from the other queues. */ - if ( s->flags & scheduler_flag_steal ) { - int count = 0, qids[ nr_queues ]; - for ( k = 0 ; k < nr_queues ; k++ ) - if ( s->queues[k].count > 0 ) - qids[ count++ ] = k; - for ( k = 0 ; k < scheduler_maxsteal && count > 0 ; k++ ) { - int ind = rand_r( &seed ) % count; - TIMER_TIC - res = queue_gettask( &s->queues[ qids[ ind ] ] , super , 0 ); - TIMER_TOC( timer_qsteal ); - if ( res != NULL ) - break; - else - qids[ ind ] = qids[ --count ]; - } - if ( res != NULL ) - break; - } - - } - - /* If we failed, take a short nap. */ - #ifdef WITH_MPI - if ( res == NULL && qid > 1 ) { - #else - if ( res == NULL ) { - #endif - pthread_mutex_lock( &s->sleep_mutex ); - if ( s->waiting > 0 ) - pthread_cond_wait( &s->sleep_cond , &s->sleep_mutex ); - pthread_mutex_unlock( &s->sleep_mutex ); - } - - } - - /* Start the timer on this task, if we got one. */ - if ( res != NULL ) { - res->tic = getticks(); - res->rid = qid; +struct task *scheduler_gettask(struct scheduler *s, int qid, + struct cell *super) { + + struct task *res = NULL; + int k, nr_queues = s->nr_queues; + unsigned int seed = qid; + + /* Check qid. */ + if (qid >= nr_queues || qid < 0) error("Bad queue ID."); + + /* Loop as long as there are tasks... */ + while (s->waiting > 0 && res == NULL) { + + /* Try more than once before sleeping. */ + for (int tries = 0; res == NULL && s->waiting && tries < scheduler_maxtries; + tries++) { + + /* Try to get a task from the suggested queue. */ + if (s->queues[qid].count > 0) { + TIMER_TIC + res = queue_gettask(&s->queues[qid], super, 0); + TIMER_TOC(timer_qget); + if (res != NULL) break; + } + + /* If unsucessful, try stealing from the other queues. 
*/ + if (s->flags & scheduler_flag_steal) { + int count = 0, qids[nr_queues]; + for (k = 0; k < nr_queues; k++) + if (s->queues[k].count > 0) qids[count++] = k; + for (k = 0; k < scheduler_maxsteal && count > 0; k++) { + int ind = rand_r(&seed) % count; + TIMER_TIC + res = queue_gettask(&s->queues[qids[ind]], super, 0); + TIMER_TOC(timer_qsteal); + if (res != NULL) + break; + else + qids[ind] = qids[--count]; } - - /* No milk today. */ - return res; + if (res != NULL) break; + } + } +/* If we failed, take a short nap. */ +#ifdef WITH_MPI + if (res == NULL && qid > 1) { +#else + if (res == NULL) { +#endif + pthread_mutex_lock(&s->sleep_mutex); + if (s->waiting > 0) pthread_cond_wait(&s->sleep_cond, &s->sleep_mutex); + pthread_mutex_unlock(&s->sleep_mutex); } + } + + /* Start the timer on this task, if we got one. */ + if (res != NULL) { + res->tic = getticks(); + res->rid = qid; + } + /* No milk today. */ + return res; +} /** * @brief Initialize the #scheduler. @@ -1137,40 +1190,39 @@ struct task *scheduler_gettask ( struct scheduler *s , int qid , struct cell *su * @param flags The #scheduler flags. * @param nodeID The MPI rank */ - -void scheduler_init ( struct scheduler *s , struct space *space , int nr_queues , unsigned int flags , int nodeID ) { - - int k; - - /* Init the lock. */ - lock_init( &s->lock ); - - /* Allocate the queues. */ - if ( ( s->queues = (struct queue *)malloc( sizeof(struct queue) * nr_queues ) ) == NULL ) - error( "Failed to allocate queues." ); - - /* Initialize each queue. */ - for ( k = 0 ; k < nr_queues ; k++ ) - queue_init( &s->queues[k] , NULL ); - - /* Init the sleep mutex and cond. */ - if ( pthread_cond_init( &s->sleep_cond , NULL ) != 0 || - pthread_mutex_init( &s->sleep_mutex , NULL ) != 0 ) - error( "Failed to initialize sleep barrier." ); - - /* Set the scheduler variables. */ - s->nr_queues = nr_queues; - s->flags = flags; - s->space = space; - s->nodeID = nodeID; - - /* Init other values. 
*/ - s->tasks = NULL; - s->tasks_ind = NULL; - s->waiting = 0; - s->size = 0; - s->nr_tasks = 0; - s->tasks_next = 0; - - } +void scheduler_init(struct scheduler *s, struct space *space, int nr_queues, + unsigned int flags, int nodeID) { + + int k; + + /* Init the lock. */ + lock_init(&s->lock); + + /* Allocate the queues. */ + if ((s->queues = (struct queue *)malloc(sizeof(struct queue) * nr_queues)) == + NULL) + error("Failed to allocate queues."); + + /* Initialize each queue. */ + for (k = 0; k < nr_queues; k++) queue_init(&s->queues[k], NULL); + + /* Init the sleep mutex and cond. */ + if (pthread_cond_init(&s->sleep_cond, NULL) != 0 || + pthread_mutex_init(&s->sleep_mutex, NULL) != 0) + error("Failed to initialize sleep barrier."); + + /* Set the scheduler variables. */ + s->nr_queues = nr_queues; + s->flags = flags; + s->space = space; + s->nodeID = nodeID; + + /* Init other values. */ + s->tasks = NULL; + s->tasks_ind = NULL; + s->waiting = 0; + s->size = 0; + s->nr_tasks = 0; + s->tasks_next = 0; +} diff --git a/src/scheduler.h b/src/scheduler.h index a71683db102a65e7c053677a3f3789c979626198..620b712885a1653397b3e9fd0e632cc0e562cf19 100644 --- a/src/scheduler.h +++ b/src/scheduler.h @@ -1,88 +1,103 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2013 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
- * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_SCHEDULER_H +#define SWIFT_SCHEDULER_H +/* Some standard headers. */ +#include <pthread.h> + +/* Includes. */ +#include "cell.h" +#include "lock.h" +#include "queue.h" +#include "space.h" +#include "task.h" /* Some constants. */ -#define scheduler_maxwait 3 -#define scheduler_maxunlock 40 -#define scheduler_dosub 1 -#define scheduler_maxsteal 10 -#define scheduler_maxtries 2 -#define scheduler_doforcesplit 0 /* Beware: switching this on can/will - break engine_addlink as it assumes - a maximum number of tasks per cell. */ +#define scheduler_maxwait 3 +#define scheduler_maxunlock 40 +#define scheduler_dosub 1 +#define scheduler_maxsteal 10 +#define scheduler_maxtries 2 +#define scheduler_doforcesplit \ + 0 /* Beware: switching this on can/will \ + break engine_addlink as it assumes \ + a maximum number of tasks per cell. */ /* Flags . */ -#define scheduler_flag_none 0 -#define scheduler_flag_steal 1 - +#define scheduler_flag_none 0 +#define scheduler_flag_steal 1 /* Data of a scheduler. */ struct scheduler { - /* Scheduler flags. */ - unsigned int flags; - - /* Number of queues in this scheduler. */ - int nr_queues; - - /* Array of queues. */ - struct queue *queues; - - /* Total number of tasks. */ - int nr_tasks, size, tasks_next; - - /* Total number of waiting tasks. */ - int waiting; - - /* The task array. */ - struct task *tasks; - - /* The task indices. */ - int *tasks_ind; - - /* Lock for this scheduler. */ - lock_type lock; - - /* Waiting queue. */ - pthread_mutex_t sleep_mutex; - pthread_cond_t sleep_cond; - - /* The space associated with this scheduler. */ - struct space *space; - - /* The node we are working on. */ - int nodeID; - - }; + /* Scheduler flags. 
*/ + unsigned int flags; + + /* Number of queues in this scheduler. */ + int nr_queues; + + /* Array of queues. */ + struct queue *queues; + + /* Total number of tasks. */ + int nr_tasks, size, tasks_next; + + /* Total number of waiting tasks. */ + int waiting; + /* The task array. */ + struct task *tasks; + + /* The task indices. */ + int *tasks_ind; + + /* Lock for this scheduler. */ + lock_type lock; + + /* Waiting queue. */ + pthread_mutex_t sleep_mutex; + pthread_cond_t sleep_cond; + + /* The space associated with this scheduler. */ + struct space *space; + + /* The node we are working on. */ + int nodeID; +}; /* Function prototypes. */ -void scheduler_init ( struct scheduler *s , struct space *space , int nr_queues , unsigned int flags , int nodeID ); -struct task *scheduler_gettask ( struct scheduler *s , int qid , struct cell *super ); -void scheduler_enqueue ( struct scheduler *s , struct task *t ); -void scheduler_start ( struct scheduler *s , unsigned int mask ); -void scheduler_reset ( struct scheduler *s , int nr_tasks ); -void scheduler_ranktasks ( struct scheduler *s ); -void scheduler_reweight ( struct scheduler *s ); -struct task *scheduler_addtask ( struct scheduler *s , int type , int subtype , int flags , int wait , struct cell *ci , struct cell *cj , int tight ); -void scheduler_splittasks ( struct scheduler *s ); -struct task *scheduler_done ( struct scheduler *s , struct task *t ); -struct task *scheduler_unlock ( struct scheduler *s , struct task *t ); -void scheduler_addunlock ( struct scheduler *s , struct task *ta , struct task *tb ); +void scheduler_init(struct scheduler *s, struct space *space, int nr_queues, + unsigned int flags, int nodeID); +struct task *scheduler_gettask(struct scheduler *s, int qid, + struct cell *super); +void scheduler_enqueue(struct scheduler *s, struct task *t); +void scheduler_start(struct scheduler *s, unsigned int mask); +void scheduler_reset(struct scheduler *s, int nr_tasks); +void 
scheduler_ranktasks(struct scheduler *s); +void scheduler_reweight(struct scheduler *s); +struct task *scheduler_addtask(struct scheduler *s, int type, int subtype, + int flags, int wait, struct cell *ci, + struct cell *cj, int tight); +void scheduler_splittasks(struct scheduler *s); +struct task *scheduler_done(struct scheduler *s, struct task *t); +struct task *scheduler_unlock(struct scheduler *s, struct task *t); +void scheduler_addunlock(struct scheduler *s, struct task *ta, struct task *tb); + +#endif /* SWIFT_SCHEDULER_H */ diff --git a/src/serial_io.c b/src/serial_io.c index fa54be0b30a3f9e9c33221ab4ba5f668d274b209..f771f3d4f1b0ed5b94ff27e9e54fea484e02f381 100644 --- a/src/serial_io.c +++ b/src/serial_io.c @@ -2,20 +2,20 @@ * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk), * Matthieu Schaller (matthieu.schaller@durham.ac.uk). - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ /* Config parameters. */ @@ -23,30 +23,25 @@ #if defined(HAVE_HDF5) && defined(WITH_MPI) && !defined(HAVE_PARALLEL_HDF5) - /* Some standard headers. 
*/ +#include <hdf5.h> +#include <math.h> +#include <stddef.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <stddef.h> -#include <hdf5.h> -#include <math.h> -#include "mpi.h" - -#include "const.h" -#include "cycle.h" -#include "lock.h" -#include "task.h" -#include "part.h" -#include "space.h" -#include "scheduler.h" -#include "engine.h" -#include "error.h" -#include "kernel.h" -#include "common_io.h" +/* MPI headers. */ +#ifdef WITH_MPI +#include <mpi.h> +#endif +/* This object's header. */ +#include "serial_io.h" +/* Local includes. */ +#include "common_io.h" +#include "error.h" /*----------------------------------------------------------------------------- * Routines reading an IC file @@ -60,21 +55,25 @@ * @param type The #DATA_TYPE of the attribute. * @param N The number of particles. * @param dim The dimension of the data (1 for scalar, 3 for vector) - * @param part_c A (char*) pointer on the first occurence of the field of interest in the parts array - * @param importance If COMPULSORY, the data must be present in the IC file. If OPTIONAL, the array will be zeroed when the data is not present. + * @param part_c A (char*) pointer on the first occurence of the field of + *interest in the parts array + * @param importance If COMPULSORY, the data must be present in the IC file. If + *OPTIONAL, the array will be zeroed when the data is not present. * - * @todo A better version using HDF5 hyperslabs to read the file directly into the part array + * @todo A better version using HDF5 hyperslabs to read the file directly into + *the part array * will be written once the strucutres have been stabilized. - * + * * Calls #error() if an error occurs. 
*/ -void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim, long long N_total, long long offset, char* part_c, enum DATA_IMPORTANCE importance) -{ - hid_t h_data=0, h_err=0, h_type=0, h_memspace=0, h_filespace=0; +void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, + int dim, long long N_total, long long offset, + char* part_c, enum DATA_IMPORTANCE importance) { + hid_t h_data = 0, h_err = 0, h_type = 0, h_memspace = 0, h_filespace = 0; hsize_t shape[2], offsets[2]; - htri_t exist=0; + htri_t exist = 0; void* temp; - int i=0, rank=0; + int i = 0, rank = 0; const size_t typeSize = sizeOfType(type); const size_t copySize = typeSize * dim; const size_t partSize = sizeof(struct part); @@ -82,56 +81,48 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim /* Check whether the dataspace exists or not */ exist = H5Lexists(grp, name, 0); - if(exist < 0) - { - error( "Error while checking the existence of data set '%s'." , name ); - } - else if(exist == 0) - { - if(importance == COMPULSORY) - { - error( "Compulsory data set '%s' not present in the file." , name ); - } - else - { - for(i=0; i<N; ++i) - memset(part_c+i*partSize, 0, copySize); - return; - } + if (exist < 0) { + error("Error while checking the existence of data set '%s'.", name); + } else if (exist == 0) { + if (importance == COMPULSORY) { + error("Compulsory data set '%s' not present in the file.", name); + } else { + for (i = 0; i < N; ++i) memset(part_c + i * partSize, 0, copySize); + return; } + } - /* message( "Reading %s '%s' array...", importance == COMPULSORY ? "compulsory": "optional ", name); */ + /* message( "Reading %s '%s' array...", importance == COMPULSORY ? + * "compulsory": "optional ", name); */ /* Open data space */ h_data = H5Dopen1(grp, name); - if(h_data < 0) - error( "Error while opening data space '%s'." 
, name ); + if (h_data < 0) error("Error while opening data space '%s'.", name); /* Check data type */ h_type = H5Dget_type(h_data); - if(h_type < 0) - error("Unable to retrieve data type from the file"); - if(!H5Tequal(h_type, hdf5Type(type))) + if (h_type < 0) error("Unable to retrieve data type from the file"); + if (!H5Tequal(h_type, hdf5Type(type))) error("Non-matching types between the code and the file"); - + /* Allocate temporary buffer */ temp = malloc(N * dim * sizeOfType(type)); - if(temp == NULL) - error("Unable to allocate memory for temporary buffer"); + if (temp == NULL) error("Unable to allocate memory for temporary buffer"); /* Prepare information for hyperslab */ - if(dim > 1) - { - rank = 2; - shape[0] = N; shape[1] = dim; - offsets[0] = offset; offsets[1] = 0; - } - else - { - rank = 1; - shape[0] = N; shape[1] = 0; - offsets[0] = offset; offsets[1] = 0; - } + if (dim > 1) { + rank = 2; + shape[0] = N; + shape[1] = dim; + offsets[0] = offset; + offsets[1] = 0; + } else { + rank = 1; + shape[0] = N; + shape[1] = 0; + offsets[0] = offset; + offsets[1] = 0; + } /* Create data space in memory */ h_memspace = H5Screate_simple(rank, shape, NULL); @@ -140,21 +131,20 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim h_filespace = H5Dget_space(h_data); H5Sselect_hyperslab(h_filespace, H5S_SELECT_SET, offsets, NULL, shape, NULL); - /* Read HDF5 dataspace in temporary buffer */ /* Dirty version that happens to work for vectors but should be improved */ /* Using HDF5 dataspaces would be better */ - h_err = H5Dread(h_data, hdf5Type(type), h_memspace, h_filespace, H5P_DEFAULT, temp); - if(h_err < 0) - { - error( "Error while reading data array '%s'." 
, name ); - } + h_err = H5Dread(h_data, hdf5Type(type), h_memspace, h_filespace, H5P_DEFAULT, + temp); + if (h_err < 0) { + error("Error while reading data array '%s'.", name); + } /* Copy temporary buffer to particle data */ temp_c = temp; - for(i=0; i<N; ++i) - memcpy(part_c+i*partSize, &temp_c[i*copySize], copySize); - + for (i = 0; i < N; ++i) + memcpy(part_c + i * partSize, &temp_c[i * copySize], copySize); + /* Free and close everything */ free(temp); H5Sclose(h_filespace); @@ -178,7 +168,10 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim * @param importance Is the data compulsory or not * */ -#define readArray(grp, name, type, N, dim, part, N_total, offset, field, importance) readArrayBackEnd(grp, name, type, N, dim, N_total, offset, (char*)(&(part[0]).field), importance) +#define readArray(grp, name, type, N, dim, part, N_total, offset, field, \ + importance) \ + readArrayBackEnd(grp, name, type, N, dim, N_total, offset, \ + (char*)(&(part[0]).field), importance) /** * @brief Reads an HDF5 initial condition file (GADGET-3 type) @@ -199,12 +192,15 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim * Calls #error() if an error occurs. * */ -void read_ic_serial ( char* fileName, double dim[3], struct part **parts, int* N, int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm, MPI_Info info) -{ - hid_t h_file=0, h_grp=0; - double boxSize[3]={0.0,-1.0,-1.0}; /* GADGET has only cubic boxes (in cosmological mode) */ - int numParticles[6]={0}; /* GADGET has 6 particle types. We only keep the type 0*/ - int numParticles_highWord[6]={0}; +void read_ic_serial(char* fileName, double dim[3], struct part** parts, int* N, + int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm, + MPI_Info info) { + hid_t h_file = 0, h_grp = 0; + double boxSize[3] = { + 0.0, -1.0, -1.0}; /* GADGET has only cubic boxes (in cosmological mode) */ + int numParticles[6] = { + 0}; /* GADGET has 6 particle types. 
We only keep the type 0*/ + int numParticles_highWord[6] = {0}; long long offset = 0; long long N_total = 0; int rank; @@ -215,40 +211,39 @@ void read_ic_serial ( char* fileName, double dim[3], struct part **parts, int* /* Open file */ /* message("Opening file '%s' as IC.", fileName); */ h_file = H5Fopen(fileName, H5F_ACC_RDONLY, H5P_DEFAULT); - if(h_file < 0) - error( "Error while opening file '%s' for inital read." , fileName ); - + if (h_file < 0) + error("Error while opening file '%s' for inital read.", fileName); + /* Open header to read simulation properties */ /* message("Reading runtime parameters..."); */ h_grp = H5Gopen1(h_file, "/RuntimePars"); - if(h_grp < 0) - error("Error while opening runtime parameters\n"); - + if (h_grp < 0) error("Error while opening runtime parameters\n"); + /* Read the relevant information */ readAttribute(h_grp, "PeriodicBoundariesOn", INT, periodic); /* Close runtime parameters */ H5Gclose(h_grp); - + /* Open header to read simulation properties */ /* message("Reading file header..."); */ h_grp = H5Gopen1(h_file, "/Header"); - if(h_grp < 0) - error("Error while opening file header\n"); - + if (h_grp < 0) error("Error while opening file header\n"); + /* Read the relevant information and print status */ readAttribute(h_grp, "BoxSize", DOUBLE, boxSize); readAttribute(h_grp, "NumPart_Total", UINT, numParticles); readAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticles_highWord); - N_total = ((long long) numParticles[0]) + ((long long) numParticles_highWord[0] << 32); + N_total = ((long long)numParticles[0]) + + ((long long)numParticles_highWord[0] << 32); dim[0] = boxSize[0]; - dim[1] = ( boxSize[1] < 0 ) ? boxSize[0] : boxSize[1]; - dim[2] = ( boxSize[2] < 0 ) ? boxSize[0] : boxSize[2]; + dim[1] = (boxSize[1] < 0) ? boxSize[0] : boxSize[1]; + dim[2] = (boxSize[2] < 0) ? boxSize[0] : boxSize[2]; /* message("Found %d particles in a %speriodic box of size [%f %f %f].", */ /* *N, (periodic ? 
"": "non-"), dim[0], dim[1], dim[2]); */ - + /* Close header */ H5Gclose(h_grp); @@ -256,78 +251,82 @@ void read_ic_serial ( char* fileName, double dim[3], struct part **parts, int* H5Fclose(h_file); } - /* Now need to broadcast that information to all ranks. */ MPI_Bcast(periodic, 1, MPI_INT, 0, comm); MPI_Bcast(&N_total, 1, MPI_LONG_LONG, 0, comm); MPI_Bcast(dim, 3, MPI_DOUBLE, 0, comm); - /* Divide the particles among the tasks. */ offset = mpi_rank * N_total / mpi_size; *N = (mpi_rank + 1) * N_total / mpi_size - offset; - /* Allocate memory to store particles */ - if(posix_memalign( (void*)parts , part_align , (*N) * sizeof(struct part)) != 0) + if (posix_memalign((void*)parts, part_align, (*N) * sizeof(struct part)) != 0) error("Error while allocating memory for particles"); - bzero( *parts , *N * sizeof(struct part) ); - /* message("Allocated %8.2f MB for particles.", *N * sizeof(struct part) / (1024.*1024.)); */ + bzero(*parts, *N * sizeof(struct part)); + /* message("Allocated %8.2f MB for particles.", *N * sizeof(struct part) / + * (1024.*1024.)); */ /* Now loop over ranks and read the data */ - for ( rank = 0; rank < mpi_size ; ++ rank ) { + for (rank = 0; rank < mpi_size; ++rank) { /* Is it this rank's turn to read ? */ - if ( rank == mpi_rank ) { + if (rank == mpi_rank) { h_file = H5Fopen(fileName, H5F_ACC_RDONLY, H5P_DEFAULT); - if(h_file < 0) - error( "Error while opening file '%s' on rank %d." 
, fileName, mpi_rank ); - + if (h_file < 0) + error("Error while opening file '%s' on rank %d.", fileName, mpi_rank); + /* Open SPH particles group */ /* message("Reading particle arrays..."); */ h_grp = H5Gopen1(h_file, "/PartType0"); - if(h_grp < 0) - error( "Error while opening particle group on rank %d.\n", mpi_rank); - + if (h_grp < 0) + error("Error while opening particle group on rank %d.\n", mpi_rank); + /* Read arrays */ - readArray(h_grp, "Coordinates", DOUBLE, *N, 3, *parts, N_total, offset, x, COMPULSORY); - readArray(h_grp, "Velocities", FLOAT, *N, 3, *parts, N_total, offset, v, COMPULSORY); - readArray(h_grp, "Masses", FLOAT, *N, 1, *parts, N_total, offset, mass, COMPULSORY); - readArray(h_grp, "SmoothingLength", FLOAT, *N, 1, *parts, N_total, offset, h, COMPULSORY); - readArray(h_grp, "InternalEnergy", FLOAT, *N, 1, *parts, N_total, offset, u, COMPULSORY); - readArray(h_grp, "ParticleIDs", ULONGLONG, *N, 1, *parts, N_total, offset, id, COMPULSORY); - readArray(h_grp, "TimeStep", FLOAT, *N, 1, *parts, N_total, offset, dt, OPTIONAL); - readArray(h_grp, "Acceleration", FLOAT, *N, 3, *parts, N_total, offset, a, OPTIONAL); - readArray(h_grp, "Density", FLOAT, *N, 1, *parts, N_total, offset, rho, OPTIONAL ); - + readArray(h_grp, "Coordinates", DOUBLE, *N, 3, *parts, N_total, offset, x, + COMPULSORY); + readArray(h_grp, "Velocities", FLOAT, *N, 3, *parts, N_total, offset, v, + COMPULSORY); + readArray(h_grp, "Masses", FLOAT, *N, 1, *parts, N_total, offset, mass, + COMPULSORY); + readArray(h_grp, "SmoothingLength", FLOAT, *N, 1, *parts, N_total, offset, + h, COMPULSORY); + readArray(h_grp, "InternalEnergy", FLOAT, *N, 1, *parts, N_total, offset, + u, COMPULSORY); + readArray(h_grp, "ParticleIDs", ULONGLONG, *N, 1, *parts, N_total, offset, + id, COMPULSORY); + readArray(h_grp, "TimeStep", FLOAT, *N, 1, *parts, N_total, offset, dt, + OPTIONAL); + readArray(h_grp, "Acceleration", FLOAT, *N, 3, *parts, N_total, offset, a, + OPTIONAL); + readArray(h_grp, 
"Density", FLOAT, *N, 1, *parts, N_total, offset, rho, + OPTIONAL); + /* Close particle group */ H5Gclose(h_grp); /* Close file */ H5Fclose(h_file); - } /* Wait for the read of the reading to complete */ MPI_Barrier(comm); - } /* message("Done Reading particles..."); */ - } - /*----------------------------------------------------------------------------- * Routines writing an output file *-----------------------------------------------------------------------------*/ -void prepareArray(hid_t grp, char* fileName, FILE* xmfFile, char* name, enum DATA_TYPE type, long long N_total, int dim, struct UnitSystem* us, enum UnitConversionFactor convFactor) -{ - hid_t h_data=0, h_err=0, h_space=0; +void prepareArray(hid_t grp, char* fileName, FILE* xmfFile, char* name, + enum DATA_TYPE type, long long N_total, int dim, + struct UnitSystem* us, enum UnitConversionFactor convFactor) { + hid_t h_data = 0, h_err = 0, h_space = 0; void* temp = 0; - int i=0, rank=0; + int i = 0, rank = 0; const size_t typeSize = sizeOfType(type); const size_t copySize = typeSize * dim; const size_t partSize = sizeof(struct part); @@ -337,45 +336,42 @@ void prepareArray(hid_t grp, char* fileName, FILE* xmfFile, char* name, enum DAT /* Create data space */ h_space = H5Screate(H5S_SIMPLE); - if(h_space < 0) - { - error( "Error while creating data space for field '%s'." , name ); - } - - if(dim > 1) - { - rank = 2; - shape[0] = N_total; shape[1] = dim; - } - else - { - rank = 1; - shape[0] = N_total; shape[1] = 0; - } - + if (h_space < 0) { + error("Error while creating data space for field '%s'.", name); + } + + if (dim > 1) { + rank = 2; + shape[0] = N_total; + shape[1] = dim; + } else { + rank = 1; + shape[0] = N_total; + shape[1] = 0; + } + /* Change shape of data space */ h_err = H5Sset_extent_simple(h_space, rank, shape, NULL); - if(h_err < 0) - { - error( "Error while changing data space shape for field '%s'." 
, name ); - } - + if (h_err < 0) { + error("Error while changing data space shape for field '%s'.", name); + } + /* Create dataset */ h_data = H5Dcreate1(grp, name, hdf5Type(type), h_space, H5P_DEFAULT); - if(h_data < 0) - { - error( "Error while creating dataspace '%s'." , name ); - } + if (h_data < 0) { + error("Error while creating dataspace '%s'.", name); + } /* Write XMF description for this data set */ writeXMFline(xmfFile, fileName, name, N_total, dim, type); /* Write unit conversion factors for this data set */ - conversionString( buffer, us, convFactor ); - writeAttribute_d( h_data, "CGS conversion factor", conversionFactor( us, convFactor ) ); - writeAttribute_f( h_data, "h-scale exponant", hFactor( us, convFactor ) ); - writeAttribute_f( h_data, "a-scale exponant", aFactor( us, convFactor ) ); - writeAttribute_s( h_data, "Conversion factor", buffer ); + conversionString(buffer, us, convFactor); + writeAttribute_d(h_data, "CGS conversion factor", + conversionFactor(us, convFactor)); + writeAttribute_f(h_data, "h-scale exponant", hFactor(us, convFactor)); + writeAttribute_f(h_data, "a-scale exponant", aFactor(us, convFactor)); + writeAttribute_s(h_data, "Conversion factor", buffer); H5Dclose(h_data); H5Sclose(h_space); @@ -391,19 +387,21 @@ void prepareArray(hid_t grp, char* fileName, FILE* xmfFile, char* name, enum DAT * @param type The #DATA_TYPE of the array. * @param N The number of particles to write. * @param dim The dimension of the data (1 for scalar, 3 for vector) - * @param part_c A (char*) pointer on the first occurence of the field of interest in the parts array + * @param part_c A (char*) pointer on the first occurence of the field of + *interest in the parts array * @param us The UnitSystem currently in use * @param convFactor The UnitConversionFactor for this array * * * Calls #error() if an error occurs. 
*/ -void writeArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim, long long N_total, long long offset, char* part_c) -{ - hid_t h_data=0, h_err=0, h_memspace=0, h_filespace=0; +void writeArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, + int dim, long long N_total, long long offset, + char* part_c) { + hid_t h_data = 0, h_err = 0, h_memspace = 0, h_filespace = 0; hsize_t shape[2], shape_total[2], offsets[2]; void* temp = 0; - int i=0, rank=0; + int i = 0, rank = 0; const size_t typeSize = sizeOfType(type); const size_t copySize = typeSize * dim; const size_t partSize = sizeof(struct part); @@ -413,55 +411,56 @@ void writeArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int di /* Allocate temporary buffer */ temp = malloc(N * dim * sizeOfType(type)); - if(temp == NULL) - error("Unable to allocate memory for temporary buffer"); + if (temp == NULL) error("Unable to allocate memory for temporary buffer"); /* Copy particle data to temporary buffer */ temp_c = temp; - for(i=0; i<N; ++i) - memcpy(&temp_c[i*copySize], part_c+i*partSize, copySize); + for (i = 0; i < N; ++i) + memcpy(&temp_c[i * copySize], part_c + i * partSize, copySize); /* Construct information for the hyperslab */ - if(dim > 1) - { - rank = 2; - shape[0] = N; shape[1] = dim; - shape_total[0] = N_total; shape_total[1] = dim; - offsets[0] = offset; offsets[1] = 0; - } - else - { - rank = 1; - shape[0] = N; shape[1] = 0; - shape_total[0] = N_total; shape_total[1] = 0; - offsets[0] = offset; offsets[1] = 0; - } + if (dim > 1) { + rank = 2; + shape[0] = N; + shape[1] = dim; + shape_total[0] = N_total; + shape_total[1] = dim; + offsets[0] = offset; + offsets[1] = 0; + } else { + rank = 1; + shape[0] = N; + shape[1] = 0; + shape_total[0] = N_total; + shape_total[1] = 0; + offsets[0] = offset; + offsets[1] = 0; + } - /* Create data space in memory */ h_memspace = H5Screate(H5S_SIMPLE); - if(h_memspace < 0) - error( "Error while creating data space (memory) for 
field '%s'." , name ); + if (h_memspace < 0) + error("Error while creating data space (memory) for field '%s'.", name); /* Change shape of memory data space */ h_err = H5Sset_extent_simple(h_memspace, rank, shape, NULL); - if(h_err < 0) - error( "Error while changing data space (memory) shape for field '%s'." , name ); - + if (h_err < 0) + error("Error while changing data space (memory) shape for field '%s'.", + name); + /* Open pre-existing data set */ h_data = H5Dopen(grp, name, H5P_DEFAULT); - if(h_data < 0) - error( "Error while opening dataset '%s'." , name ); + if (h_data < 0) error("Error while opening dataset '%s'.", name); /* Select data space in that data set */ h_filespace = H5Dget_space(h_data); H5Sselect_hyperslab(h_filespace, H5S_SELECT_SET, offsets, NULL, shape, NULL); /* Write temporary buffer to HDF5 dataspace */ - h_err = H5Dwrite(h_data, hdf5Type(type), h_memspace, h_filespace, H5P_DEFAULT, temp); - if(h_err < 0) - error( "Error while writing data array '%s'." , name ); - + h_err = H5Dwrite(h_data, hdf5Type(type), h_memspace, h_filespace, H5P_DEFAULT, + temp); + if (h_err < 0) error("Error while writing data array '%s'.", name); + /* Free and close everything */ free(temp); H5Dclose(h_data); @@ -479,14 +478,16 @@ void writeArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int di * @param type The #DATA_TYPE of the array. * @param N The number of particles to write. * @param dim The dimension of the data (1 for scalar, 3 for vector) - * @param part A (char*) pointer on the first occurence of the field of interest in the parts array + * @param part A (char*) pointer on the first occurence of the field of interest + *in the parts array * @param field The name (code name) of the field to read from. 
* @param us The UnitSystem currently in use * @param convFactor The UnitConversionFactor for this array * */ -#define writeArray(grp, name, type, N, dim, N_total, offset, part, field) writeArrayBackEnd(grp, name, type, N, dim, N_total, offset, (char*)(&(part[0]).field)) - +#define writeArray(grp, name, type, N, dim, N_total, offset, part, field) \ + writeArrayBackEnd(grp, name, type, N, dim, N_total, offset, \ + (char*)(&(part[0]).field)) /** * @brief Writes an HDF5 output file (GADGET-3 type) with its XMF descriptor @@ -496,20 +497,20 @@ void writeArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int di * * Creates an HDF5 output file and writes the particles contained * in the engine. If such a file already exists, it is erased and replaced - * by the new one. + * by the new one. * The companion XMF file is also updated accordingly. * * Calls #error() if an error occurs. * */ -void write_output_serial ( struct engine* e, struct UnitSystem* us, int mpi_rank, int mpi_size, MPI_Comm comm, MPI_Info info ) -{ - hid_t h_file=0, h_grp=0; +void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank, + int mpi_size, MPI_Comm comm, MPI_Info info) { + hid_t h_file = 0, h_grp = 0; int N = e->s->nr_parts; int periodic = e->s->periodic; - int numParticles[6]={N,0}; - int numParticlesHighWord[6]={0}; - unsigned int flagEntropy[6]={0}; + int numParticles[6] = {N, 0}; + int numParticlesHighWord[6] = {0}; + unsigned int flagEntropy[6] = {0}; long long N_total = 0, offset = 0; double offset_d = 0., N_d = 0., N_total_d = 0.; int numFiles = 1; @@ -517,7 +518,7 @@ void write_output_serial ( struct engine* e, struct UnitSystem* us, int mpi_rank struct part* parts = e->s->parts; FILE* xmfFile = 0; static int outputCount = 0; - + /* File name */ char fileName[200]; sprintf(fileName, "output_%03i.hdf5", outputCount); @@ -527,62 +528,60 @@ void write_output_serial ( struct engine* e, struct UnitSystem* us, int mpi_rank N_d = (double)N; MPI_Exscan(&N_d, 
&offset_d, 1, MPI_DOUBLE, MPI_SUM, comm); N_total_d = offset_d + N_d; - MPI_Bcast(&N_total_d, 1, MPI_DOUBLE, mpi_size-1, comm); - if(N_total_d > 1.e15) - error("Error while computing the offest for parallel output: Simulation has more than 10^15 particles.\n"); - N_total = (long long) N_total_d; - offset = (long long) offset_d; - + MPI_Bcast(&N_total_d, 1, MPI_DOUBLE, mpi_size - 1, comm); + if (N_total_d > 1.e15) + error( + "Error while computing the offest for parallel output: Simulation has " + "more than 10^15 particles.\n"); + N_total = (long long)N_total_d; + offset = (long long)offset_d; /* Do common stuff first */ - if ( mpi_rank == 0 ) { + if (mpi_rank == 0) { /* First time, we need to create the XMF file */ - if(outputCount == 0) - createXMFfile(); - + if (outputCount == 0) createXMFfile(); + /* Prepare the XMF file for the new entry */ xmfFile = prepareXMFfile(); - + /* Write the part corresponding to this specific output */ writeXMFheader(xmfFile, N_total, fileName, e->time); /* Open file */ /* message("Opening file '%s'.", fileName); */ - h_file = H5Fcreate(fileName, H5F_ACC_TRUNC, H5P_DEFAULT,H5P_DEFAULT); - if(h_file < 0) - { - error( "Error while opening file '%s'." 
, fileName ); - } + h_file = H5Fcreate(fileName, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT); + if (h_file < 0) { + error("Error while opening file '%s'.", fileName); + } /* Open header to write simulation properties */ /* message("Writing runtime parameters..."); */ h_grp = H5Gcreate1(h_file, "/RuntimePars", 0); - if(h_grp < 0) - error("Error while creating runtime parameters group\n"); + if (h_grp < 0) error("Error while creating runtime parameters group\n"); /* Write the relevant information */ writeAttribute(h_grp, "PeriodicBoundariesOn", INT, &periodic, 1); /* Close runtime parameters */ H5Gclose(h_grp); - + /* Open header to write simulation properties */ /* message("Writing file header..."); */ h_grp = H5Gcreate1(h_file, "/Header", 0); - if(h_grp < 0) - error("Error while creating file header\n"); - + if (h_grp < 0) error("Error while creating file header\n"); + /* Print the relevant information and print status */ writeAttribute(h_grp, "BoxSize", DOUBLE, e->s->dim, 3); writeAttribute(h_grp, "Time", DOUBLE, &e->time, 1); /* GADGET-2 legacy values */ - numParticles[0] = (unsigned int) N_total ; + numParticles[0] = (unsigned int)N_total; writeAttribute(h_grp, "NumPart_ThisFile", UINT, numParticles, 6); writeAttribute(h_grp, "NumPart_Total", UINT, numParticles, 6); - numParticlesHighWord[0] = (unsigned int) (N_total >> 32); - writeAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticlesHighWord, 6); + numParticlesHighWord[0] = (unsigned int)(N_total >> 32); + writeAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticlesHighWord, + 6); double MassTable[6] = {0., 0., 0., 0., 0., 0.}; writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, 6); writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, flagEntropy, 6); @@ -596,24 +595,32 @@ void write_output_serial ( struct engine* e, struct UnitSystem* us, int mpi_rank /* Print the system of Units */ writeUnitSystem(h_file, us); - + /* Create SPH particles group */ /* message("Writing particle arrays..."); */ h_grp = 
H5Gcreate1(h_file, "/PartType0", 0); - if(h_grp < 0) - error( "Error while creating particle group.\n"); + if (h_grp < 0) error("Error while creating particle group.\n"); /* Prepare the arrays in the file */ - prepareArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, N_total, 3, us, UNIT_CONV_LENGTH); - prepareArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, N_total, 3, us, UNIT_CONV_SPEED); - prepareArray(h_grp, fileName, xmfFile, "Masses", FLOAT, N_total, 1, us, UNIT_CONV_MASS); - prepareArray(h_grp, fileName, xmfFile, "SmoothingLength", FLOAT, N_total, 1, us, UNIT_CONV_LENGTH); - prepareArray(h_grp, fileName, xmfFile, "InternalEnergy", FLOAT, N_total, 1, us, UNIT_CONV_ENERGY_PER_UNIT_MASS); - prepareArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, N_total, 1, us, UNIT_CONV_NO_UNITS); - prepareArray(h_grp, fileName, xmfFile, "TimeStep", FLOAT, N_total, 1, us, UNIT_CONV_TIME); - prepareArray(h_grp, fileName, xmfFile, "Acceleration", FLOAT, N_total, 3, us, UNIT_CONV_ACCELERATION); - prepareArray(h_grp, fileName, xmfFile, "Density", FLOAT, N_total, 1, us, UNIT_CONV_DENSITY); - + prepareArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, N_total, 3, + us, UNIT_CONV_LENGTH); + prepareArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, N_total, 3, us, + UNIT_CONV_SPEED); + prepareArray(h_grp, fileName, xmfFile, "Masses", FLOAT, N_total, 1, us, + UNIT_CONV_MASS); + prepareArray(h_grp, fileName, xmfFile, "SmoothingLength", FLOAT, N_total, 1, + us, UNIT_CONV_LENGTH); + prepareArray(h_grp, fileName, xmfFile, "InternalEnergy", FLOAT, N_total, 1, + us, UNIT_CONV_ENERGY_PER_UNIT_MASS); + prepareArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, N_total, 1, + us, UNIT_CONV_NO_UNITS); + prepareArray(h_grp, fileName, xmfFile, "TimeStep", FLOAT, N_total, 1, us, + UNIT_CONV_TIME); + prepareArray(h_grp, fileName, xmfFile, "Acceleration", FLOAT, N_total, 3, + us, UNIT_CONV_ACCELERATION); + prepareArray(h_grp, fileName, xmfFile, "Density", FLOAT, N_total, 
1, us, + UNIT_CONV_DENSITY); + /* Close particle group */ H5Gclose(h_grp); @@ -624,31 +631,32 @@ void write_output_serial ( struct engine* e, struct UnitSystem* us, int mpi_rank writeXMFfooter(xmfFile); } - - /* Now loop over ranks and write the data */ - for ( rank = 0; rank < mpi_size ; ++ rank ) { + for (rank = 0; rank < mpi_size; ++rank) { /* Is it this rank's turn to write ? */ - if ( rank == mpi_rank ) { + if (rank == mpi_rank) { h_file = H5Fopen(fileName, H5F_ACC_RDWR, H5P_DEFAULT); - if(h_file < 0) - error( "Error while opening file '%s' on rank %d." , fileName, mpi_rank ); + if (h_file < 0) + error("Error while opening file '%s' on rank %d.", fileName, mpi_rank); /* Open SPH particles group */ /* message("Reading particle arrays..."); */ h_grp = H5Gopen1(h_file, "/PartType0"); - if(h_grp < 0) - error( "Error while opening particle group on rank %d.\n", mpi_rank); + if (h_grp < 0) + error("Error while opening particle group on rank %d.\n", mpi_rank); /* Write arrays */ writeArray(h_grp, "Coordinates", DOUBLE, N, 3, N_total, offset, parts, x); writeArray(h_grp, "Velocities", FLOAT, N, 3, N_total, offset, parts, v); writeArray(h_grp, "Masses", FLOAT, N, 1, N_total, offset, parts, mass); - writeArray(h_grp, "SmoothingLength", FLOAT, N, 1, N_total, offset, parts, h); - writeArray(h_grp, "InternalEnergy", FLOAT, N, 1, N_total, offset, parts, u); - writeArray(h_grp, "ParticleIDs", ULONGLONG, N, 1, N_total, offset, parts, id); + writeArray(h_grp, "SmoothingLength", FLOAT, N, 1, N_total, offset, parts, + h); + writeArray(h_grp, "InternalEnergy", FLOAT, N, 1, N_total, offset, parts, + u); + writeArray(h_grp, "ParticleIDs", ULONGLONG, N, 1, N_total, offset, parts, + id); writeArray(h_grp, "TimeStep", FLOAT, N, 1, N_total, offset, parts, dt); writeArray(h_grp, "Acceleration", FLOAT, N, 3, N_total, offset, parts, a); writeArray(h_grp, "Density", FLOAT, N, 1, N_total, offset, parts, rho); @@ -658,19 +666,14 @@ void write_output_serial ( struct engine* e, struct 
UnitSystem* us, int mpi_rank /* Close file */ H5Fclose(h_file); - } /* Wait for the read of the reading to complete */ MPI_Barrier(comm); - } /* message("Done writing particles..."); */ ++outputCount; } - -#endif /* HAVE_HDF5 */ - - +#endif /* HAVE_HDF5 */ diff --git a/src/serial_io.h b/src/serial_io.h index 3349f221531ce7c4a2a290b121500e5d4336ed6b..bb05fc61bdca1b0db36386e6773a034cc17ea6b9 100644 --- a/src/serial_io.h +++ b/src/serial_io.h @@ -1,28 +1,43 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Matthieu Schaller (matthieu.schaller@durham.ac.uk). - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_SERIAL_IO_H +#define SWIFT_SERIAL_IO_H +/* MPI headers. */ +#ifdef WITH_MPI +#include <mpi.h> +#endif + +/* Includes. 
*/ +#include "engine.h" +#include "part.h" +#include "units.h" #if defined(HAVE_HDF5) && defined(WITH_MPI) && !defined(HAVE_PARALLEL_HDF5) -void read_ic_serial ( char* fileName, double dim[3], struct part **parts, int* N, int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm, MPI_Info info); +void read_ic_serial(char* fileName, double dim[3], struct part** parts, int* N, + int* periodic, int mpi_rank, int mpi_size, MPI_Comm comm, + MPI_Info info); -void write_output_serial ( struct engine* e, struct UnitSystem* us, int mpi_rank, int mpi_size, MPI_Comm comm, MPI_Info info ); +void write_output_serial(struct engine* e, struct UnitSystem* us, int mpi_rank, + int mpi_size, MPI_Comm comm, MPI_Info info); #endif +#endif /* SWIFT_SERIAL_IO_H */ diff --git a/src/single_io.c b/src/single_io.c index 485cb60aa51140682ef868d0323b31f00ce4ed9e..0874442982df747ed0eff38bf060e50e0d205034 100644 --- a/src/single_io.c +++ b/src/single_io.c @@ -2,20 +2,20 @@ * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk), * Matthieu Schaller (matthieu.schaller@durham.ac.uk). - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ /* Config parameters. */ @@ -23,28 +23,20 @@ #if defined(HAVE_HDF5) && !defined(WITH_MPI) - /* Some standard headers. 
*/ +#include <hdf5.h> +#include <math.h> +#include <stddef.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <stddef.h> -#include <hdf5.h> -#include <math.h> - -#include "const.h" -#include "cycle.h" -#include "lock.h" -#include "task.h" -#include "part.h" -#include "space.h" -#include "scheduler.h" -#include "engine.h" -#include "error.h" -#include "kernel.h" -#include "common_io.h" +/* This object's header. */ +#include "single_io.h" +/* Local includes. */ +#include "common_io.h" +#include "error.h" /*----------------------------------------------------------------------------- * Routines reading an IC file @@ -58,20 +50,23 @@ * @param type The #DATA_TYPE of the attribute. * @param N The number of particles. * @param dim The dimension of the data (1 for scalar, 3 for vector) - * @param part_c A (char*) pointer on the first occurence of the field of interest in the parts array - * @param importance If COMPULSORY, the data must be present in the IC file. If OPTIONAL, the array will be zeroed when the data is not present. + * @param part_c A (char*) pointer on the first occurence of the field of + *interest in the parts array + * @param importance If COMPULSORY, the data must be present in the IC file. If + *OPTIONAL, the array will be zeroed when the data is not present. * - * @todo A better version using HDF5 hyperslabs to read the file directly into the part array + * @todo A better version using HDF5 hyperslabs to read the file directly into + *the part array * will be written once the strucutres have been stabilized. - * + * * Calls #error() if an error occurs. 
*/ -void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim, char* part_c, enum DATA_IMPORTANCE importance) -{ - hid_t h_data=0, h_err=0, h_type=0; - htri_t exist=0; +void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, + int dim, char* part_c, enum DATA_IMPORTANCE importance) { + hid_t h_data = 0, h_err = 0, h_type = 0; + htri_t exist = 0; void* temp; - int i=0; + int i = 0; const size_t typeSize = sizeOfType(type); const size_t copySize = typeSize * dim; const size_t partSize = sizeof(struct part); @@ -79,62 +74,53 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim /* Check whether the dataspace exists or not */ exist = H5Lexists(grp, name, 0); - if(exist < 0) - { - error( "Error while checking the existence of data set '%s'." , name ); + if (exist < 0) { + error("Error while checking the existence of data set '%s'.", name); + } else if (exist == 0) { + if (importance == COMPULSORY) { + error("Compulsory data set '%s' not present in the file.", name); + } else { + /* message("Optional data set '%s' not present. Zeroing this particle + * field...", name); */ + + for (i = 0; i < N; ++i) memset(part_c + i * partSize, 0, copySize); + + return; } - else if(exist == 0) - { - if(importance == COMPULSORY) - { - error( "Compulsory data set '%s' not present in the file." , name ); - } - else - { - /* message("Optional data set '%s' not present. Zeroing this particle field...", name); */ - - for(i=0; i<N; ++i) - memset(part_c+i*partSize, 0, copySize); - - return; - } - } - - /* message( "Reading %s '%s' array...", importance == COMPULSORY ? "compulsory": "optional ", name); */ + } + + /* message( "Reading %s '%s' array...", importance == COMPULSORY ? + * "compulsory": "optional ", name); */ /* Open data space */ h_data = H5Dopen1(grp, name); - if(h_data < 0) - { - error( "Error while opening data space '%s'." 
, name ); - } + if (h_data < 0) { + error("Error while opening data space '%s'.", name); + } /* Check data type */ h_type = H5Dget_type(h_data); - if(h_type < 0) - error("Unable to retrieve data type from the file"); - if(!H5Tequal(h_type, hdf5Type(type))) + if (h_type < 0) error("Unable to retrieve data type from the file"); + if (!H5Tequal(h_type, hdf5Type(type))) error("Non-matching types between the code and the file"); - + /* Allocate temporary buffer */ temp = malloc(N * dim * sizeOfType(type)); - if(temp == NULL) - error("Unable to allocate memory for temporary buffer"); + if (temp == NULL) error("Unable to allocate memory for temporary buffer"); /* Read HDF5 dataspace in temporary buffer */ /* Dirty version that happens to work for vectors but should be improved */ /* Using HDF5 dataspaces would be better */ h_err = H5Dread(h_data, hdf5Type(type), H5S_ALL, H5S_ALL, H5P_DEFAULT, temp); - if(h_err < 0) - { - error( "Error while reading data array '%s'." , name ); - } + if (h_err < 0) { + error("Error while reading data array '%s'.", name); + } /* Copy temporary buffer to particle data */ temp_c = temp; - for(i=0; i<N; ++i) - memcpy(part_c+i*partSize, &temp_c[i*copySize], copySize); - + for (i = 0; i < N; ++i) + memcpy(part_c + i * partSize, &temp_c[i * copySize], copySize); + /* Free and close everything */ free(temp); H5Tclose(h_type); @@ -154,7 +140,9 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim * @param importance Is the data compulsory or not * */ -#define readArray(grp, name, type, N, dim, part, field, importance) readArrayBackEnd(grp, name, type, N, dim, (char*)(&(part[0]).field), importance) +#define readArray(grp, name, type, N, dim, part, field, importance) \ + readArrayBackEnd(grp, name, type, N, dim, (char*)(&(part[0]).field), \ + importance) /** * @brief Reads an HDF5 initial condition file (GADGET-3 type) @@ -175,46 +163,45 @@ void readArrayBackEnd(hid_t grp, char* name, enum DATA_TYPE type, int N, int dim * 
Calls #error() if an error occurs. * */ -void read_ic_single ( char* fileName, double dim[3], struct part **parts, int* N, int* periodic) -{ - hid_t h_file=0, h_grp=0; - double boxSize[3]={0.0,-1.0,-1.0}; /* GADGET has only cubic boxes (in cosmological mode) */ - int numParticles[6]={0}; /* GADGET has 6 particle types. We only keep the type 0*/ +void read_ic_single(char* fileName, double dim[3], struct part** parts, int* N, + int* periodic) { + hid_t h_file = 0, h_grp = 0; + double boxSize[3] = { + 0.0, -1.0, -1.0}; /* GADGET has only cubic boxes (in cosmological mode) */ + int numParticles[6] = { + 0}; /* GADGET has 6 particle types. We only keep the type 0*/ /* Open file */ /* message("Opening file '%s' as IC.", fileName); */ h_file = H5Fopen(fileName, H5F_ACC_RDONLY, H5P_DEFAULT); - if(h_file < 0) - { - error( "Error while opening file '%s'." , fileName ); - } + if (h_file < 0) { + error("Error while opening file '%s'.", fileName); + } /* Open header to read simulation properties */ /* message("Reading runtime parameters..."); */ h_grp = H5Gopen1(h_file, "/RuntimePars"); - if(h_grp < 0) - error("Error while opening runtime parameters\n"); + if (h_grp < 0) error("Error while opening runtime parameters\n"); /* Read the relevant information */ readAttribute(h_grp, "PeriodicBoundariesOn", INT, periodic); /* Close runtime parameters */ H5Gclose(h_grp); - + /* Open header to read simulation properties */ /* message("Reading file header..."); */ h_grp = H5Gopen1(h_file, "/Header"); - if(h_grp < 0) - error("Error while opening file header\n"); - + if (h_grp < 0) error("Error while opening file header\n"); + /* Read the relevant information and print status */ readAttribute(h_grp, "BoxSize", DOUBLE, boxSize); readAttribute(h_grp, "NumPart_Total", UINT, numParticles); *N = numParticles[0]; dim[0] = boxSize[0]; - dim[1] = ( boxSize[1] < 0 ) ? boxSize[0] : boxSize[1]; - dim[2] = ( boxSize[2] < 0 ) ? boxSize[0] : boxSize[2]; + dim[1] = (boxSize[1] < 0) ? 
boxSize[0] : boxSize[1]; + dim[2] = (boxSize[2] < 0) ? boxSize[0] : boxSize[2]; /* message("Found %d particles in a %speriodic box of size [%f %f %f].", */ /* *N, (periodic ? "": "non-"), dim[0], dim[1], dim[2]); */ @@ -223,17 +210,17 @@ void read_ic_single ( char* fileName, double dim[3], struct part **parts, int* H5Gclose(h_grp); /* Allocate memory to store particles */ - if(posix_memalign( (void*)parts , part_align , *N * sizeof(struct part)) != 0) + if (posix_memalign((void*)parts, part_align, *N * sizeof(struct part)) != 0) error("Error while allocating memory for particles"); - bzero( *parts , *N * sizeof(struct part) ); + bzero(*parts, *N * sizeof(struct part)); + + /* message("Allocated %8.2f MB for particles.", *N * sizeof(struct part) / + * (1024.*1024.)); */ - /* message("Allocated %8.2f MB for particles.", *N * sizeof(struct part) / (1024.*1024.)); */ - /* Open SPH particles group */ /* message("Reading particle arrays..."); */ h_grp = H5Gopen1(h_file, "/PartType0"); - if(h_grp < 0) - error( "Error while opening particle group.\n"); + if (h_grp < 0) error("Error while opening particle group.\n"); /* Read arrays */ readArray(h_grp, "Coordinates", DOUBLE, *N, 3, *parts, x, COMPULSORY); @@ -244,7 +231,7 @@ void read_ic_single ( char* fileName, double dim[3], struct part **parts, int* readArray(h_grp, "ParticleIDs", ULONGLONG, *N, 1, *parts, id, COMPULSORY); readArray(h_grp, "TimeStep", FLOAT, *N, 1, *parts, dt, OPTIONAL); readArray(h_grp, "Acceleration", FLOAT, *N, 3, *parts, a, OPTIONAL); - readArray(h_grp, "Density", FLOAT, *N, 1, *parts, rho, OPTIONAL ); + readArray(h_grp, "Density", FLOAT, *N, 1, *parts, rho, OPTIONAL); /* Close particle group */ H5Gclose(h_grp); @@ -255,7 +242,6 @@ void read_ic_single ( char* fileName, double dim[3], struct part **parts, int* H5Fclose(h_file); } - /*----------------------------------------------------------------------------- * Routines writing an output file 
*-----------------------------------------------------------------------------*/ @@ -270,20 +256,24 @@ void read_ic_single ( char* fileName, double dim[3], struct part **parts, int* * @param type The #DATA_TYPE of the array. * @param N The number of particles to write. * @param dim The dimension of the data (1 for scalar, 3 for vector) - * @param part_c A (char*) pointer on the first occurence of the field of interest in the parts array + * @param part_c A (char*) pointer on the first occurence of the field of + *interest in the parts array * @param us The UnitSystem currently in use * @param convFactor The UnitConversionFactor for this array * - * @todo A better version using HDF5 hyperslabs to write the file directly from the part array + * @todo A better version using HDF5 hyperslabs to write the file directly from + *the part array * will be written once the strucutres have been stabilized. * * Calls #error() if an error occurs. */ -void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name, enum DATA_TYPE type, int N, int dim, char* part_c, struct UnitSystem* us, enum UnitConversionFactor convFactor) -{ - hid_t h_data=0, h_err=0, h_space=0; +void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name, + enum DATA_TYPE type, int N, int dim, char* part_c, + struct UnitSystem* us, + enum UnitConversionFactor convFactor) { + hid_t h_data = 0, h_err = 0, h_space = 0; void* temp = 0; - int i=0, rank=0; + int i = 0, rank = 0; const size_t typeSize = sizeOfType(type); const size_t copySize = typeSize * dim; const size_t partSize = sizeof(struct part); @@ -295,63 +285,58 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name, enu /* Allocate temporary buffer */ temp = malloc(N * dim * sizeOfType(type)); - if(temp == NULL) - error("Unable to allocate memory for temporary buffer"); + if (temp == NULL) error("Unable to allocate memory for temporary buffer"); /* Copy particle data to temporary buffer */ temp_c = 
temp; - for(i=0; i<N; ++i) - memcpy(&temp_c[i*copySize], part_c+i*partSize, copySize); + for (i = 0; i < N; ++i) + memcpy(&temp_c[i * copySize], part_c + i * partSize, copySize); /* Create data space */ h_space = H5Screate(H5S_SIMPLE); - if(h_space < 0) - { - error( "Error while creating data space for field '%s'." , name ); - } - - if(dim > 1) - { - rank = 2; - shape[0] = N; shape[1] = dim; - } - else - { - rank = 1; - shape[0] = N; shape[1] = 0; - } - + if (h_space < 0) { + error("Error while creating data space for field '%s'.", name); + } + + if (dim > 1) { + rank = 2; + shape[0] = N; + shape[1] = dim; + } else { + rank = 1; + shape[0] = N; + shape[1] = 0; + } + /* Change shape of data space */ h_err = H5Sset_extent_simple(h_space, rank, shape, NULL); - if(h_err < 0) - { - error( "Error while changing data space shape for field '%s'." , name ); - } - + if (h_err < 0) { + error("Error while changing data space shape for field '%s'.", name); + } + /* Create dataset */ h_data = H5Dcreate1(grp, name, hdf5Type(type), h_space, H5P_DEFAULT); - if(h_data < 0) - { - error( "Error while creating dataspace '%s'." , name ); - } - + if (h_data < 0) { + error("Error while creating dataspace '%s'.", name); + } + /* Write temporary buffer to HDF5 dataspace */ h_err = H5Dwrite(h_data, hdf5Type(type), h_space, H5S_ALL, H5P_DEFAULT, temp); - if(h_err < 0) - { - error( "Error while writing data array '%s'." 
, name ); - } + if (h_err < 0) { + error("Error while writing data array '%s'.", name); + } /* Write XMF description for this data set */ writeXMFline(xmfFile, fileName, name, N, dim, type); /* Write unit conversion factors for this data set */ - conversionString( buffer, us, convFactor ); - writeAttribute_d( h_data, "CGS conversion factor", conversionFactor( us, convFactor ) ); - writeAttribute_f( h_data, "h-scale exponant", hFactor( us, convFactor ) ); - writeAttribute_f( h_data, "a-scale exponant", aFactor( us, convFactor ) ); - writeAttribute_s( h_data, "Conversion factor", buffer ); - + conversionString(buffer, us, convFactor); + writeAttribute_d(h_data, "CGS conversion factor", + conversionFactor(us, convFactor)); + writeAttribute_f(h_data, "h-scale exponant", hFactor(us, convFactor)); + writeAttribute_f(h_data, "a-scale exponant", aFactor(us, convFactor)); + writeAttribute_s(h_data, "Conversion factor", buffer); + /* Free and close everything */ free(temp); H5Dclose(h_data); @@ -368,13 +353,17 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name, enu * @param type The #DATA_TYPE of the array. * @param N The number of particles to write. * @param dim The dimension of the data (1 for scalar, 3 for vector) - * @param part A (char*) pointer on the first occurence of the field of interest in the parts array + * @param part A (char*) pointer on the first occurence of the field of interest + *in the parts array * @param field The name (code name) of the field to read from. 
* @param us The UnitSystem currently in use * @param convFactor The UnitConversionFactor for this array * */ -#define writeArray(grp, fileName, xmfFile, name, type, N, dim, part, field, us, convFactor) writeArrayBackEnd(grp, fileName, xmfFile, name, type, N, dim, (char*)(&(part[0]).field), us, convFactor) +#define writeArray(grp, fileName, xmfFile, name, type, N, dim, part, field, \ + us, convFactor) \ + writeArrayBackEnd(grp, fileName, xmfFile, name, type, N, dim, \ + (char*)(&(part[0]).field), us, convFactor) /** * @brief Writes an HDF5 output file (GADGET-3 type) with its XMF descriptor @@ -384,66 +373,60 @@ void writeArrayBackEnd(hid_t grp, char* fileName, FILE* xmfFile, char* name, enu * * Creates an HDF5 output file and writes the particles contained * in the engine. If such a file already exists, it is erased and replaced - * by the new one. + * by the new one. * The companion XMF file is also updated accordingly. * * Calls #error() if an error occurs. * */ -void write_output_single (struct engine *e, struct UnitSystem* us) -{ - - hid_t h_file=0, h_grp=0; +void write_output_single(struct engine* e, struct UnitSystem* us) { + + hid_t h_file = 0, h_grp = 0; int N = e->s->nr_parts; int periodic = e->s->periodic; - int numParticles[6]={N,0}; - int numParticlesHighWord[6]={0}; + int numParticles[6] = {N, 0}; + int numParticlesHighWord[6] = {0}; int numFiles = 1; struct part* parts = e->s->parts; FILE* xmfFile = 0; static int outputCount = 0; - + /* File name */ char fileName[200]; sprintf(fileName, "output_%03i.hdf5", outputCount); /* First time, we need to create the XMF file */ - if(outputCount == 0) - createXMFfile(); - + if (outputCount == 0) createXMFfile(); + /* Prepare the XMF file for the new entry */ xmfFile = prepareXMFfile(); /* Write the part corresponding to this specific output */ writeXMFheader(xmfFile, N, fileName, e->time); - /* Open file */ /* message("Opening file '%s'.", fileName); */ - h_file = H5Fcreate(fileName, H5F_ACC_TRUNC, 
H5P_DEFAULT,H5P_DEFAULT); - if(h_file < 0) - { - error( "Error while opening file '%s'." , fileName ); - } + h_file = H5Fcreate(fileName, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT); + if (h_file < 0) { + error("Error while opening file '%s'.", fileName); + } /* Open header to write simulation properties */ /* message("Writing runtime parameters..."); */ h_grp = H5Gcreate1(h_file, "/RuntimePars", 0); - if(h_grp < 0) - error("Error while creating runtime parameters group\n"); + if (h_grp < 0) error("Error while creating runtime parameters group\n"); /* Write the relevant information */ writeAttribute(h_grp, "PeriodicBoundariesOn", INT, &periodic, 1); /* Close runtime parameters */ H5Gclose(h_grp); - + /* Open header to write simulation properties */ /* message("Writing file header..."); */ h_grp = H5Gcreate1(h_file, "/Header", 0); - if(h_grp < 0) - error("Error while creating file header\n"); - + if (h_grp < 0) error("Error while creating file header\n"); + /* Print the relevant information and print status */ writeAttribute(h_grp, "BoxSize", DOUBLE, e->s->dim, 3); writeAttribute(h_grp, "NumPart_ThisFile", UINT, numParticles, 6); @@ -451,7 +434,8 @@ void write_output_single (struct engine *e, struct UnitSystem* us) /* GADGET-2 legacy values */ writeAttribute(h_grp, "NumPart_Total", UINT, numParticles, 6); - writeAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticlesHighWord, 6); + writeAttribute(h_grp, "NumPart_Total_HighWord", UINT, numParticlesHighWord, + 6); double MassTable[6] = {0., 0., 0., 0., 0., 0.}; writeAttribute(h_grp, "MassTable", DOUBLE, MassTable, 6); writeAttribute(h_grp, "Flag_Entropy_ICs", UINT, numParticlesHighWord, 6); @@ -465,23 +449,31 @@ void write_output_single (struct engine *e, struct UnitSystem* us) /* Print the system of Units */ writeUnitSystem(h_file, us); - + /* Create SPH particles group */ /* message("Writing particle arrays..."); */ h_grp = H5Gcreate1(h_file, "/PartType0", 0); - if(h_grp < 0) - error( "Error while creating 
particle group.\n"); + if (h_grp < 0) error("Error while creating particle group.\n"); /* Write arrays */ - writeArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, N, 3, parts, x, us, UNIT_CONV_LENGTH); - writeArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, N, 3, parts, v, us, UNIT_CONV_SPEED); - writeArray(h_grp, fileName, xmfFile, "Masses", FLOAT, N, 1, parts, mass, us, UNIT_CONV_MASS); - writeArray(h_grp, fileName, xmfFile, "SmoothingLength", FLOAT, N, 1, parts, h, us, UNIT_CONV_LENGTH); - writeArray(h_grp, fileName, xmfFile, "InternalEnergy", FLOAT, N, 1, parts, u, us, UNIT_CONV_ENERGY_PER_UNIT_MASS); - writeArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, N, 1, parts, id, us, UNIT_CONV_NO_UNITS); - writeArray(h_grp, fileName, xmfFile, "TimeStep", FLOAT, N, 1, parts, dt, us, UNIT_CONV_TIME); - writeArray(h_grp, fileName, xmfFile, "Acceleration", FLOAT, N, 3, parts, a, us, UNIT_CONV_ACCELERATION); - writeArray(h_grp, fileName, xmfFile, "Density", FLOAT, N, 1, parts, rho, us, UNIT_CONV_DENSITY); + writeArray(h_grp, fileName, xmfFile, "Coordinates", DOUBLE, N, 3, parts, x, + us, UNIT_CONV_LENGTH); + writeArray(h_grp, fileName, xmfFile, "Velocities", FLOAT, N, 3, parts, v, us, + UNIT_CONV_SPEED); + writeArray(h_grp, fileName, xmfFile, "Masses", FLOAT, N, 1, parts, mass, us, + UNIT_CONV_MASS); + writeArray(h_grp, fileName, xmfFile, "SmoothingLength", FLOAT, N, 1, parts, h, + us, UNIT_CONV_LENGTH); + writeArray(h_grp, fileName, xmfFile, "InternalEnergy", FLOAT, N, 1, parts, u, + us, UNIT_CONV_ENERGY_PER_UNIT_MASS); + writeArray(h_grp, fileName, xmfFile, "ParticleIDs", ULONGLONG, N, 1, parts, + id, us, UNIT_CONV_NO_UNITS); + writeArray(h_grp, fileName, xmfFile, "TimeStep", FLOAT, N, 1, parts, dt, us, + UNIT_CONV_TIME); + writeArray(h_grp, fileName, xmfFile, "Acceleration", FLOAT, N, 3, parts, a, + us, UNIT_CONV_ACCELERATION); + writeArray(h_grp, fileName, xmfFile, "Density", FLOAT, N, 1, parts, rho, us, + UNIT_CONV_DENSITY); /* Close particle group 
*/ H5Gclose(h_grp); @@ -497,7 +489,4 @@ void write_output_single (struct engine *e, struct UnitSystem* us) ++outputCount; } - -#endif /* HAVE_HDF5 */ - - +#endif /* HAVE_HDF5 */ diff --git a/src/single_io.h b/src/single_io.h index 3cc58a46cc5398affd63e5d7e22b317ae79db3f5..91d229178bbd45df3ba358172d1f52c70008adb7 100644 --- a/src/single_io.h +++ b/src/single_io.h @@ -1,28 +1,36 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Matthieu Schaller (matthieu.schaller@durham.ac.uk). - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_SINGLE_IO_H +#define SWIFT_SINGLE_IO_H +/* Includes. 
*/ +#include "engine.h" +#include "part.h" +#include "units.h" #if defined(HAVE_HDF5) && !defined(WITH_MPI) -void read_ic_single ( char* fileName, double dim[3], struct part **parts, int* N, int* periodic); +void read_ic_single(char* fileName, double dim[3], struct part** parts, int* N, + int* periodic); -void write_output_single ( struct engine* e, struct UnitSystem* us ); +void write_output_single(struct engine* e, struct UnitSystem* us); #endif +#endif /* SWIFT_SINGLE_IO_H */ diff --git a/src/space.c b/src/space.c index f9aa0d142a55007a9aa1bdaa83147034d7111048..fcdbfa24906153252f3b8a8855ca63703c648da6 100644 --- a/src/space.c +++ b/src/space.c @@ -1,54 +1,46 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ /* Config parameters. */ #include "../config.h" /* Some standard headers. */ -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <pthread.h> #include <float.h> #include <limits.h> #include <math.h> - +#include <string.h> /* MPI headers. */ #ifdef WITH_MPI - #include <mpi.h> +#include <mpi.h> #endif -/* Local headers. 
*/ -#include "const.h" -#include "cycle.h" -#include "lock.h" -#include "task.h" -#include "kernel.h" -#include "part.h" +/* This object's header. */ #include "space.h" -#include "multipole.h" -#include "cell.h" -#include "scheduler.h" + +/* Local headers. */ +#include "atomic.h" #include "engine.h" -#include "runner.h" #include "error.h" +#include "kernel.h" +#include "lock.h" +#include "runner.h" /* Split size. */ int space_splitsize = space_splitsize_default; @@ -57,36 +49,34 @@ int space_maxsize = space_maxsize_default; /* Map shift vector to sortlist. */ const int sortlistID[27] = { - /* ( -1 , -1 , -1 ) */ 0 , - /* ( -1 , -1 , 0 ) */ 1 , - /* ( -1 , -1 , 1 ) */ 2 , - /* ( -1 , 0 , -1 ) */ 3 , - /* ( -1 , 0 , 0 ) */ 4 , - /* ( -1 , 0 , 1 ) */ 5 , - /* ( -1 , 1 , -1 ) */ 6 , - /* ( -1 , 1 , 0 ) */ 7 , - /* ( -1 , 1 , 1 ) */ 8 , - /* ( 0 , -1 , -1 ) */ 9 , - /* ( 0 , -1 , 0 ) */ 10 , - /* ( 0 , -1 , 1 ) */ 11 , - /* ( 0 , 0 , -1 ) */ 12 , - /* ( 0 , 0 , 0 ) */ 0 , - /* ( 0 , 0 , 1 ) */ 12 , - /* ( 0 , 1 , -1 ) */ 11 , - /* ( 0 , 1 , 0 ) */ 10 , - /* ( 0 , 1 , 1 ) */ 9 , - /* ( 1 , -1 , -1 ) */ 8 , - /* ( 1 , -1 , 0 ) */ 7 , - /* ( 1 , -1 , 1 ) */ 6 , - /* ( 1 , 0 , -1 ) */ 5 , - /* ( 1 , 0 , 0 ) */ 4 , - /* ( 1 , 0 , 1 ) */ 3 , - /* ( 1 , 1 , -1 ) */ 2 , - /* ( 1 , 1 , 0 ) */ 1 , - /* ( 1 , 1 , 1 ) */ 0 - }; - - + /* ( -1 , -1 , -1 ) */ 0, + /* ( -1 , -1 , 0 ) */ 1, + /* ( -1 , -1 , 1 ) */ 2, + /* ( -1 , 0 , -1 ) */ 3, + /* ( -1 , 0 , 0 ) */ 4, + /* ( -1 , 0 , 1 ) */ 5, + /* ( -1 , 1 , -1 ) */ 6, + /* ( -1 , 1 , 0 ) */ 7, + /* ( -1 , 1 , 1 ) */ 8, + /* ( 0 , -1 , -1 ) */ 9, + /* ( 0 , -1 , 0 ) */ 10, + /* ( 0 , -1 , 1 ) */ 11, + /* ( 0 , 0 , -1 ) */ 12, + /* ( 0 , 0 , 0 ) */ 0, + /* ( 0 , 0 , 1 ) */ 12, + /* ( 0 , 1 , -1 ) */ 11, + /* ( 0 , 1 , 0 ) */ 10, + /* ( 0 , 1 , 1 ) */ 9, + /* ( 1 , -1 , -1 ) */ 8, + /* ( 1 , -1 , 0 ) */ 7, + /* ( 1 , -1 , 1 ) */ 6, + /* ( 1 , 0 , -1 ) */ 5, + /* ( 1 , 0 , 0 ) */ 4, + /* ( 1 , 0 , 1 ) */ 3, + /* ( 1 , 1 , -1 ) */ 2, + /* 
( 1 , 1 , 0 ) */ 1, + /* ( 1 , 1 , 1 ) */ 0}; + /** * @brief Get the shift-id of the given pair of cells, swapping them * if need be. @@ -98,199 +88,200 @@ const int sortlistID[27] = { * * @return The shift ID and set shift, may or may not swap ci and cj. */ - -int space_getsid ( struct space *s , struct cell **ci , struct cell **cj , double *shift ) { - - int k, sid = 0, periodic = s->periodic; - struct cell *temp; - double dx[3]; - - /* Get the relative distance between the pairs, wrapping. */ - for ( k = 0 ; k < 3 ; k++ ) { - dx[k] = (*cj)->loc[k] - (*ci)->loc[k]; - if ( periodic && dx[k] < -s->dim[k]/2 ) - shift[k] = s->dim[k]; - else if ( periodic && dx[k] > s->dim[k]/2 ) - shift[k] = -s->dim[k]; - else - shift[k] = 0.0; - dx[k] += shift[k]; - } - - /* Get the sorting index. */ - for ( k = 0 ; k < 3 ; k++ ) - sid = 3*sid + ( (dx[k] < 0.0) ? 0 : ( (dx[k] > 0.0) ? 2 : 1 ) ); - - /* Switch the cells around? */ - if ( runner_flip[sid] ) { - temp = *ci; *ci = *cj; *cj = temp; - for ( k = 0 ; k < 3 ; k++ ) - shift[k] = -shift[k]; - } - sid = sortlistID[sid]; - - /* Return the sort ID. */ - return sid; - - } +int space_getsid(struct space *s, struct cell **ci, struct cell **cj, + double *shift) { + + int k, sid = 0, periodic = s->periodic; + struct cell *temp; + double dx[3]; + + /* Get the relative distance between the pairs, wrapping. */ + for (k = 0; k < 3; k++) { + dx[k] = (*cj)->loc[k] - (*ci)->loc[k]; + if (periodic && dx[k] < -s->dim[k] / 2) + shift[k] = s->dim[k]; + else if (periodic && dx[k] > s->dim[k] / 2) + shift[k] = -s->dim[k]; + else + shift[k] = 0.0; + dx[k] += shift[k]; + } + + /* Get the sorting index. */ + for (k = 0; k < 3; k++) + sid = 3 * sid + ((dx[k] < 0.0) ? 0 : ((dx[k] > 0.0) ? 2 : 1)); + + /* Switch the cells around? */ + if (runner_flip[sid]) { + temp = *ci; + *ci = *cj; + *cj = temp; + for (k = 0; k < 3; k++) shift[k] = -shift[k]; + } + sid = sortlistID[sid]; + + /* Return the sort ID. 
*/ + return sid; +} /** * @brief Recursively dismantle a cell tree. * */ - -void space_rebuild_recycle ( struct space *s , struct cell *c ) { - - int k; - - if ( c->split ) - for ( k = 0 ; k < 8 ; k++ ) - if ( c->progeny[k] != NULL ) { - space_rebuild_recycle( s , c->progeny[k] ); - space_recycle( s , c->progeny[k] ); - c->progeny[k] = NULL; - } - - } - - + +void space_rebuild_recycle(struct space *s, struct cell *c) { + + int k; + + if (c->split) + for (k = 0; k < 8; k++) + if (c->progeny[k] != NULL) { + space_rebuild_recycle(s, c->progeny[k]); + space_recycle(s, c->progeny[k]); + c->progeny[k] = NULL; + } +} + /** * @brief Re-build the cell grid. * * @param s The #space. * @param cell_max Maximum cell edge length. */ - -void space_regrid ( struct space *s , double cell_max ) { - - float h_max = s->cell_min / kernel_gamma / space_stretch, dmin; - int i, j, k, cdim[3], nr_parts = s->nr_parts; - struct cell *restrict c; - // ticks tic; - - /* Run through the parts and get the current h_max. */ - // tic = getticks(); - if ( s->cells != NULL ) { - for ( k = 0 ; k < s->nr_cells ; k++ ) { - if ( s->cells[k].h_max > h_max ) - h_max = s->cells[k].h_max; - } - } - else { - for ( k = 0 ; k < nr_parts ; k++ ) { - if ( s->parts[k].h > h_max ) - h_max = s->parts[k].h; - } - s->h_max = h_max; - } - - /* If we are running in parallel, make sure everybody agrees on - how large the largest cell should be. */ - #ifdef WITH_MPI - { - float buff; - if ( MPI_Allreduce( &h_max , &buff , 1 , MPI_FLOAT , MPI_MAX , MPI_COMM_WORLD ) != MPI_SUCCESS ) - error( "Failed to aggreggate the rebuild flag accross nodes." ); - h_max = buff; + +void space_regrid(struct space *s, double cell_max) { + + float h_max = s->cell_min / kernel_gamma / space_stretch, dmin; + int i, j, k, cdim[3], nr_parts = s->nr_parts; + struct cell *restrict c; + // ticks tic; + + /* Run through the parts and get the current h_max. 
*/ + // tic = getticks(); + if (s->cells != NULL) { + for (k = 0; k < s->nr_cells; k++) { + if (s->cells[k].h_max > h_max) h_max = s->cells[k].h_max; + } + } else { + for (k = 0; k < nr_parts; k++) { + if (s->parts[k].h > h_max) h_max = s->parts[k].h; } - #endif - message( "h_max is %.3e (cell_max=%.3e)." , h_max , cell_max ); - - /* Get the new putative cell dimensions. */ - for ( k = 0 ; k < 3 ; k++ ) - cdim[k] = floor( s->dim[k] / fmax( h_max*kernel_gamma*space_stretch , cell_max ) ); - - /* Check if we have enough cells for periodicity. */ - if ( s->periodic && (cdim[0] < 3 || cdim[1] < 3 || cdim[2] < 3) ) - error( "Must have at least 3 cells in each spatial dimension when periodicity is switched on." ); - - /* In MPI-Land, we're not allowed to change the top-level cell size. */ - #ifdef WITH_MPI - if ( cdim[0] < s->cdim[0] || cdim[1] < s->cdim[1] || cdim[2] < s->cdim[2] ) - error( "Root-level change of cell size not allowed." ); - #endif - - /* Do we need to re-build the upper-level cells? */ - // tic = getticks(); - if ( s->cells == NULL || - cdim[0] < s->cdim[0] || cdim[1] < s->cdim[1] || cdim[2] < s->cdim[2] ) { - - /* Free the old cells, if they were allocated. */ - if ( s->cells != NULL ) { - for ( k = 0 ; k < s->nr_cells ; k++ ) { - space_rebuild_recycle( s , &s->cells[k] ); - if ( s->cells[k].sort != NULL ) - free( s->cells[k].sort ); - } - free( s->cells ); - s->maxdepth = 0; - } - - /* Set the new cell dimensions only if smaller. */ - for ( k = 0 ; k < 3 ; k++ ) { - s->cdim[k] = cdim[k]; - s->h[k] = s->dim[k] / cdim[k]; - s->ih[k] = 1.0 / s->h[k]; - } - dmin = fminf( s->h[0] , fminf( s->h[1] , s->h[2] ) ); - - /* Allocate the highest level of cells. */ - s->tot_cells = s->nr_cells = cdim[0] * cdim[1] * cdim[2]; - if ( posix_memalign( (void *)&s->cells , 64 , s->nr_cells * sizeof(struct cell) ) != 0 ) - error( "Failed to allocate cells." 
); - bzero( s->cells , s->nr_cells * sizeof(struct cell) ); - for ( k = 0 ; k < s->nr_cells ; k++ ) - if ( lock_init( &s->cells[k].lock ) != 0 ) - error( "Failed to init spinlock." ); - - /* Set the cell location and sizes. */ - for ( i = 0 ; i < cdim[0] ; i++ ) - for ( j = 0 ; j < cdim[1] ; j++ ) - for ( k = 0 ; k < cdim[2] ; k++ ) { - c = &s->cells[ cell_getid( cdim , i , j , k ) ]; - c->loc[0] = i*s->h[0]; c->loc[1] = j*s->h[1]; c->loc[2] = k*s->h[2]; - c->h[0] = s->h[0]; c->h[1] = s->h[1]; c->h[2] = s->h[2]; - c->dmin = dmin; - c->depth = 0; - c->count = 0; - c->gcount = 0; - c->super = c; - lock_init( &c->lock ); - } - - /* Be verbose about the change. */ - message( "set cell dimensions to [ %i %i %i ]." , cdim[0] , cdim[1] , cdim[2] ); fflush(stdout); - - } /* re-build upper-level cells? */ - // message( "rebuilding upper-level cells took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 ); - - /* Otherwise, just clean up the cells. */ - else { - - /* Free the old cells, if they were allocated. */ - for ( k = 0 ; k < s->nr_cells ; k++ ) { - space_rebuild_recycle( s , &s->cells[k] ); - s->cells[k].sorts = NULL; - s->cells[k].nr_tasks = 0; - s->cells[k].nr_density = 0; - s->cells[k].nr_force = 0; - s->cells[k].density = NULL; - s->cells[k].force = NULL; - s->cells[k].dx_max = 0.0f; - s->cells[k].sorted = 0; - s->cells[k].count = 0; - s->cells[k].gcount = 0; - s->cells[k].kick1 = NULL; - s->cells[k].kick2 = NULL; - s->cells[k].super = &s->cells[k]; - } - s->maxdepth = 0; - + s->h_max = h_max; + } + +/* If we are running in parallel, make sure everybody agrees on + how large the largest cell should be. */ +#ifdef WITH_MPI + { + float buff; + if (MPI_Allreduce(&h_max, &buff, 1, MPI_FLOAT, MPI_MAX, MPI_COMM_WORLD) != + MPI_SUCCESS) + error("Failed to aggreggate the rebuild flag accross nodes."); + h_max = buff; + } +#endif + message("h_max is %.3e (cell_max=%.3e).", h_max, cell_max); + + /* Get the new putative cell dimensions. 
*/ + for (k = 0; k < 3; k++) + cdim[k] = + floor(s->dim[k] / fmax(h_max * kernel_gamma * space_stretch, cell_max)); + + /* Check if we have enough cells for periodicity. */ + if (s->periodic && (cdim[0] < 3 || cdim[1] < 3 || cdim[2] < 3)) + error( + "Must have at least 3 cells in each spatial dimension when periodicity " + "is switched on."); + +/* In MPI-Land, we're not allowed to change the top-level cell size. */ +#ifdef WITH_MPI + if (cdim[0] < s->cdim[0] || cdim[1] < s->cdim[1] || cdim[2] < s->cdim[2]) + error("Root-level change of cell size not allowed."); +#endif + + /* Do we need to re-build the upper-level cells? */ + // tic = getticks(); + if (s->cells == NULL || cdim[0] < s->cdim[0] || cdim[1] < s->cdim[1] || + cdim[2] < s->cdim[2]) { + + /* Free the old cells, if they were allocated. */ + if (s->cells != NULL) { + for (k = 0; k < s->nr_cells; k++) { + space_rebuild_recycle(s, &s->cells[k]); + if (s->cells[k].sort != NULL) free(s->cells[k].sort); + } + free(s->cells); + s->maxdepth = 0; + } + + /* Set the new cell dimensions only if smaller. */ + for (k = 0; k < 3; k++) { + s->cdim[k] = cdim[k]; + s->h[k] = s->dim[k] / cdim[k]; + s->ih[k] = 1.0 / s->h[k]; + } + dmin = fminf(s->h[0], fminf(s->h[1], s->h[2])); + + /* Allocate the highest level of cells. */ + s->tot_cells = s->nr_cells = cdim[0] * cdim[1] * cdim[2]; + if (posix_memalign((void *)&s->cells, 64, + s->nr_cells * sizeof(struct cell)) != 0) + error("Failed to allocate cells."); + bzero(s->cells, s->nr_cells * sizeof(struct cell)); + for (k = 0; k < s->nr_cells; k++) + if (lock_init(&s->cells[k].lock) != 0) error("Failed to init spinlock."); + + /* Set the cell location and sizes. 
*/ + for (i = 0; i < cdim[0]; i++) + for (j = 0; j < cdim[1]; j++) + for (k = 0; k < cdim[2]; k++) { + c = &s->cells[cell_getid(cdim, i, j, k)]; + c->loc[0] = i * s->h[0]; + c->loc[1] = j * s->h[1]; + c->loc[2] = k * s->h[2]; + c->h[0] = s->h[0]; + c->h[1] = s->h[1]; + c->h[2] = s->h[2]; + c->dmin = dmin; + c->depth = 0; + c->count = 0; + c->gcount = 0; + c->super = c; + lock_init(&c->lock); } - + + /* Be verbose about the change. */ + message("set cell dimensions to [ %i %i %i ].", cdim[0], cdim[1], cdim[2]); + fflush(stdout); + + } /* re-build upper-level cells? */ + // message( "rebuilding upper-level cells took %.3f ms." , (double)(getticks() + // - tic) / CPU_TPS * 1000 ); + + /* Otherwise, just clean up the cells. */ + else { + + /* Free the old cells, if they were allocated. */ + for (k = 0; k < s->nr_cells; k++) { + space_rebuild_recycle(s, &s->cells[k]); + s->cells[k].sorts = NULL; + s->cells[k].nr_tasks = 0; + s->cells[k].nr_density = 0; + s->cells[k].nr_force = 0; + s->cells[k].density = NULL; + s->cells[k].force = NULL; + s->cells[k].dx_max = 0.0f; + s->cells[k].sorted = 0; + s->cells[k].count = 0; + s->cells[k].gcount = 0; + s->cells[k].kick1 = NULL; + s->cells[k].kick2 = NULL; + s->cells[k].super = &s->cells[k]; } - + s->maxdepth = 0; + } +} /** * @brief Re-build the cells as well as the tasks. @@ -299,177 +290,187 @@ void space_regrid ( struct space *s , double cell_max ) { * @param cell_max Maximal cell size. * */ - -void space_rebuild ( struct space *s , double cell_max ) { - - int j, k, cdim[3], nr_parts = s->nr_parts, nr_gparts = s->nr_gparts; - struct cell *restrict c, *restrict cells; - struct part *restrict finger, *restrict p, *parts = s->parts; - struct xpart *xfinger, *xparts = s->xparts; - struct gpart *gp, *gparts = s->gparts, *gfinger; - int *ind; - double ih[3], dim[3]; - // ticks tic; - - /* Be verbose about this. */ - // message( "re)building space..." ); fflush(stdout); - - /* Re-grid if necessary, or just re-set the cell data. 
*/ - space_regrid( s , cell_max ); - cells = s->cells; - - /* Run through the particles and get their cell index. */ - // tic = getticks(); - const int ind_size = s->size_parts; - if ( ( ind = (int *)malloc( sizeof(int) * ind_size ) ) == NULL ) - error( "Failed to allocate temporary particle indices." ); - ih[0] = s->ih[0]; ih[1] = s->ih[1]; ih[2] = s->ih[2]; - dim[0] = s->dim[0]; dim[1] = s->dim[1]; dim[2] = s->dim[2]; - cdim[0] = s->cdim[0]; cdim[1] = s->cdim[1]; cdim[2] = s->cdim[2]; - for ( k = 0 ; k < nr_parts ; k++ ) { - p = &parts[k]; - for ( j = 0 ; j < 3 ; j++ ) - if ( p->x[j] < 0.0 ) - p->x[j] += dim[j]; - else if ( p->x[j] >= dim[j] ) - p->x[j] -= dim[j]; - ind[k] = cell_getid( cdim , p->x[0]*ih[0] , p->x[1]*ih[1] , p->x[2]*ih[2] ); - cells[ ind[k] ].count++; - } - // message( "getting particle indices took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 ); - - - #ifdef WITH_MPI - /* Move non-local parts to the end of the list. */ - int nodeID = s->e->nodeID; - for ( k = 0 ; k < nr_parts ; k++ ) - if ( cells[ ind[k] ].nodeID != nodeID ) { - cells[ ind[k] ].count -= 1; - nr_parts -= 1; - struct part tp = parts[k]; - parts[k] = parts[ nr_parts ]; - parts[ nr_parts ] = tp; - struct xpart txp = xparts[k]; - xparts[k] = xparts[ nr_parts ]; - xparts[ nr_parts ] = txp; - int t = ind[k]; - ind[k] = ind[ nr_parts ]; - ind[ nr_parts ] = t; - } - - /* Exchange the strays, note that this potentially re-allocates - the parts arrays. */ - s->nr_parts = nr_parts + engine_exchange_strays( s->e , nr_parts , &ind[nr_parts] , s->nr_parts - nr_parts ); - parts = s->parts; - xparts = s->xparts; - - /* Re-allocate the index array if needed.. */ - if (s->nr_parts > ind_size) { - int *ind_new; - if ( ( ind_new = (int *)malloc( sizeof(int) * s->nr_parts ) ) == NULL ) - error( "Failed to allocate temporary particle indices." ); - memcpy(ind_new, ind, sizeof(int) * nr_parts); - free(ind); ind = ind_new; - } - - /* Assign each particle to its cell. 
*/ - for ( k = nr_parts ; k < s->nr_parts ; k++ ) { - p = &parts[k]; - ind[k] = cell_getid( cdim , p->x[0]*ih[0] , p->x[1]*ih[1] , p->x[2]*ih[2] ); - cells[ ind[k] ].count += 1; - /* if ( cells[ ind[k] ].nodeID != nodeID ) - error( "Received part that does not belong to me (nodeID=%i)." , cells[ ind[k] ].nodeID ); */ - } - nr_parts = s->nr_parts; - #endif - - - /* Sort the parts according to their cells. */ - // tic = getticks(); - parts_sort( parts , xparts , ind , nr_parts , 0 , s->nr_cells-1 ); - // message( "parts_sort took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 ); - - /* Re-link the gparts. */ - for ( k = 0 ; k < nr_parts ; k++ ) - if ( parts[k].gpart != NULL ) - parts[k].gpart->part = &parts[k]; - - /* Verify sort. */ - /* for ( k = 1 ; k < nr_parts ; k++ ) { - if ( ind[k-1] > ind[k] ) { - error( "Sort failed!" ); - } - else if ( ind[k] != cell_getid( cdim , parts[k].x[0]*ih[0] , parts[k].x[1]*ih[1] , parts[k].x[2]*ih[2] ) ) - error( "Incorrect indices!" ); - } */ - - /* We no longer need the indices as of here. */ - free( ind ); - - - - /* Run through the gravity particles and get their cell index. */ - // tic = getticks(); - if ( ( ind = (int *)malloc( sizeof(int) * s->size_gparts ) ) == NULL ) - error( "Failed to allocate temporary particle indices." ); - for ( k = 0 ; k < nr_gparts ; k++ ) { - gp = &gparts[k]; - for ( j = 0 ; j < 3 ; j++ ) - if ( gp->x[j] < 0.0 ) - gp->x[j] += dim[j]; - else if ( gp->x[j] >= dim[j] ) - gp->x[j] -= dim[j]; - ind[k] = cell_getid( cdim , gp->x[0]*ih[0] , gp->x[1]*ih[1] , gp->x[2]*ih[2] ); - cells[ ind[k] ].gcount++; - } - // message( "getting particle indices took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 ); - - /* TODO: Here we should exchange the gparts as well! */ - - /* Sort the parts according to their cells. */ - // tic = getticks(); - gparts_sort( gparts ,ind , nr_gparts , 0 , s->nr_cells-1 ); - // message( "gparts_sort took %.3f ms." 
, (double)(getticks() - tic) / CPU_TPS * 1000 ); - - /* Re-link the parts. */ - for ( k = 0 ; k < nr_gparts ; k++ ) - if ( gparts[k].id > 0 ) - gparts[k].part->gpart = &gparts[k]; - - /* We no longer need the indices as of here. */ - free( ind ); - - - - /* Hook the cells up to the parts. */ - // tic = getticks(); - finger = parts; - xfinger = xparts; - gfinger = gparts; - for ( k = 0 ; k < s->nr_cells ; k++ ) { - c = &cells[ k ]; - c->parts = finger; - c->xparts = xfinger; - c->gparts = gfinger; - finger = &finger[ c->count ]; - xfinger = &xfinger[ c->count ]; - gfinger = &gfinger[ c->gcount ]; - } - // message( "hooking up cells took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 ); - - /* At this point, we have the upper-level cells, old or new. Now make - sure that the parts in each cell are ok. */ - // tic = getticks(); - for ( k = 0; k < s->nr_cells; k++ ) - space_split( s , &cells[k] ); - - // message( "space_split took %.3f ms." , (double)(getticks() - tic) / CPU_TPS * 1000 ); - + +void space_rebuild(struct space *s, double cell_max) { + + int j, k, cdim[3], nr_parts = s->nr_parts, nr_gparts = s->nr_gparts; + struct cell *restrict c, *restrict cells; + struct part *restrict finger, *restrict p, *parts = s->parts; + struct xpart *xfinger, *xparts = s->xparts; + struct gpart *gp, *gparts = s->gparts, *gfinger; + int *ind; + double ih[3], dim[3]; + // ticks tic; + + /* Be verbose about this. */ + // message( "re)building space..." ); fflush(stdout); + + /* Re-grid if necessary, or just re-set the cell data. */ + space_regrid(s, cell_max); + cells = s->cells; + + /* Run through the particles and get their cell index. 
*/ + // tic = getticks(); + const int ind_size = s->size_parts; + if ((ind = (int *)malloc(sizeof(int) * ind_size)) == NULL) + error("Failed to allocate temporary particle indices."); + ih[0] = s->ih[0]; + ih[1] = s->ih[1]; + ih[2] = s->ih[2]; + dim[0] = s->dim[0]; + dim[1] = s->dim[1]; + dim[2] = s->dim[2]; + cdim[0] = s->cdim[0]; + cdim[1] = s->cdim[1]; + cdim[2] = s->cdim[2]; + for (k = 0; k < nr_parts; k++) { + p = &parts[k]; + for (j = 0; j < 3; j++) + if (p->x[j] < 0.0) + p->x[j] += dim[j]; + else if (p->x[j] >= dim[j]) + p->x[j] -= dim[j]; + ind[k] = + cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]); + cells[ind[k]].count++; + } +// message( "getting particle indices took %.3f ms." , (double)(getticks() - +// tic) / CPU_TPS * 1000 ); + +#ifdef WITH_MPI + /* Move non-local parts to the end of the list. */ + int nodeID = s->e->nodeID; + for (k = 0; k < nr_parts; k++) + if (cells[ind[k]].nodeID != nodeID) { + cells[ind[k]].count -= 1; + nr_parts -= 1; + struct part tp = parts[k]; + parts[k] = parts[nr_parts]; + parts[nr_parts] = tp; + struct xpart txp = xparts[k]; + xparts[k] = xparts[nr_parts]; + xparts[nr_parts] = txp; + int t = ind[k]; + ind[k] = ind[nr_parts]; + ind[nr_parts] = t; } + /* Exchange the strays, note that this potentially re-allocates + the parts arrays. */ + s->nr_parts = + nr_parts + engine_exchange_strays(s->e, nr_parts, &ind[nr_parts], + s->nr_parts - nr_parts); + parts = s->parts; + xparts = s->xparts; + + /* Re-allocate the index array if needed.. */ + if (s->nr_parts > ind_size) { + int *ind_new; + if ((ind_new = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL) + error("Failed to allocate temporary particle indices."); + memcpy(ind_new, ind, sizeof(int) * nr_parts); + free(ind); + ind = ind_new; + } + + /* Assign each particle to its cell. 
*/ + for (k = nr_parts; k < s->nr_parts; k++) { + p = &parts[k]; + ind[k] = + cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]); + cells[ind[k]].count += 1; + /* if ( cells[ ind[k] ].nodeID != nodeID ) + error( "Received part that does not belong to me (nodeID=%i)." , cells[ + ind[k] ].nodeID ); */ + } + nr_parts = s->nr_parts; +#endif + + /* Sort the parts according to their cells. */ + // tic = getticks(); + parts_sort(parts, xparts, ind, nr_parts, 0, s->nr_cells - 1); + // message( "parts_sort took %.3f ms." , (double)(getticks() - tic) / CPU_TPS + // * 1000 ); + + /* Re-link the gparts. */ + for (k = 0; k < nr_parts; k++) + if (parts[k].gpart != NULL) parts[k].gpart->part = &parts[k]; + + /* Verify sort. */ + /* for ( k = 1 ; k < nr_parts ; k++ ) { + if ( ind[k-1] > ind[k] ) { + error( "Sort failed!" ); + } + else if ( ind[k] != cell_getid( cdim , parts[k].x[0]*ih[0] , + parts[k].x[1]*ih[1] , parts[k].x[2]*ih[2] ) ) + error( "Incorrect indices!" ); + } */ + + /* We no longer need the indices as of here. */ + free(ind); + + /* Run through the gravity particles and get their cell index. */ + // tic = getticks(); + if ((ind = (int *)malloc(sizeof(int) * s->size_gparts)) == NULL) + error("Failed to allocate temporary particle indices."); + for (k = 0; k < nr_gparts; k++) { + gp = &gparts[k]; + for (j = 0; j < 3; j++) + if (gp->x[j] < 0.0) + gp->x[j] += dim[j]; + else if (gp->x[j] >= dim[j]) + gp->x[j] -= dim[j]; + ind[k] = + cell_getid(cdim, gp->x[0] * ih[0], gp->x[1] * ih[1], gp->x[2] * ih[2]); + cells[ind[k]].gcount++; + } + // message( "getting particle indices took %.3f ms." , (double)(getticks() - + // tic) / CPU_TPS * 1000 ); + + /* TODO: Here we should exchange the gparts as well! */ + + /* Sort the parts according to their cells. */ + // tic = getticks(); + gparts_sort(gparts, ind, nr_gparts, 0, s->nr_cells - 1); + // message( "gparts_sort took %.3f ms." , (double)(getticks() - tic) / CPU_TPS + // * 1000 ); + + /* Re-link the parts. 
*/ + for (k = 0; k < nr_gparts; k++) + if (gparts[k].id > 0) gparts[k].part->gpart = &gparts[k]; + + /* We no longer need the indices as of here. */ + free(ind); + + /* Hook the cells up to the parts. */ + // tic = getticks(); + finger = parts; + xfinger = xparts; + gfinger = gparts; + for (k = 0; k < s->nr_cells; k++) { + c = &cells[k]; + c->parts = finger; + c->xparts = xfinger; + c->gparts = gfinger; + finger = &finger[c->count]; + xfinger = &xfinger[c->count]; + gfinger = &gfinger[c->gcount]; + } + // message( "hooking up cells took %.3f ms." , (double)(getticks() - tic) / + // CPU_TPS * 1000 ); + + /* At this point, we have the upper-level cells, old or new. Now make + sure that the parts in each cell are ok. */ + // tic = getticks(); + for (k = 0; k < s->nr_cells; k++) space_split(s, &cells[k]); + + // message( "space_split took %.3f ms." , (double)(getticks() - tic) / CPU_TPS + // * 1000 ); +} /** - * @brief Sort the particles and condensed particles according to the given indices. + * @brief Sort the particles and condensed particles according to the given + *indices. * * @param parts The list of #part * @param xparts The list of reduced particles @@ -478,313 +479,303 @@ void space_rebuild ( struct space *s , double cell_max ) { * @param min Lowest index. * @param max highest index. */ - -void parts_sort ( struct part *parts , struct xpart *xparts , int *ind , int N , int min , int max ) { - - struct qstack { - volatile int i, j, min, max; - volatile int ready; - }; - struct qstack *qstack; - int qstack_size = 2*(max-min) + 10; - volatile unsigned int first, last, waiting; - - int pivot; - int i, ii, j, jj, temp_i, qid; - struct part temp_p; - struct xpart temp_xp; - - /* for ( int k = 0 ; k < N ; k++ ) - if ( ind[k] > max || ind[k] < min ) - error( "ind[%i]=%i is not in [%i,%i]." , k , ind[k] , min , max ); */ - - /* Allocate the stack. */ - if ( ( qstack = malloc( sizeof(struct qstack) * qstack_size ) ) == NULL ) - error( "Failed to allocate qstack." 
); - - /* Init the interval stack. */ - qstack[0].i = 0; - qstack[0].j = N-1; - qstack[0].min = min; - qstack[0].max = max; - qstack[0].ready = 1; - for ( i = 1 ; i < qstack_size ; i++ ) - qstack[i].ready = 0; - first = 0; last = 1; waiting = 1; - - /* Main loop. */ - while ( waiting > 0 ) { - - /* Grab an interval off the queue. */ - qid = ( first++ ) % qstack_size; - - - /* Get the stack entry. */ - i = qstack[qid].i; - j = qstack[qid].j; - min = qstack[qid].min; - max = qstack[qid].max; - qstack[qid].ready = 0; - - - /* Loop over sub-intervals. */ - while ( 1 ) { - - /* Bring beer. */ - pivot = (min + max) / 2; - - /* One pass of QuickSort's partitioning. */ - ii = i; jj = j; - while ( ii < jj ) { - while ( ii <= j && ind[ii] <= pivot ) - ii++; - while ( jj >= i && ind[jj] > pivot ) - jj--; - if ( ii < jj ) { - temp_i = ind[ii]; ind[ii] = ind[jj]; ind[jj] = temp_i; - temp_p = parts[ii]; parts[ii] = parts[jj]; parts[jj] = temp_p; - temp_xp = xparts[ii]; xparts[ii] = xparts[jj]; xparts[jj] = temp_xp; - } - } - - /* Verify sort. */ - /* for ( int k = i ; k <= jj ; k++ ) - if ( ind[k] > pivot ) { - message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, N=%i." , k , ind[k] , pivot , i , j , N ); - error( "Partition failed (<=pivot)." ); - } - for ( int k = jj+1 ; k <= j ; k++ ) - if ( ind[k] <= pivot ) { - message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, N=%i." , k , ind[k] , pivot , i , j , N ); - error( "Partition failed (>pivot)." ); - } */ - - /* Split-off largest interval. */ - if ( jj - i > j - jj+1 ) { - - /* Recurse on the left? */ - if ( jj > i && pivot > min ) { - qid = ( last++ ) % qstack_size; - qstack[qid].i = i; - qstack[qid].j = jj; - qstack[qid].min = min; - qstack[qid].max = pivot; - qstack[qid].ready = 1; - if ( waiting++ >= qstack_size ) - error( "Qstack overflow." ); - } - - /* Recurse on the right? 
*/ - if ( jj+1 < j && pivot+1 < max ) { - i = jj+1; - min = pivot+1; - } - else - break; - - } - - else { - - /* Recurse on the right? */ - if ( jj+1 < j && pivot+1 < max ) { - qid = ( last++ ) % qstack_size; - qstack[qid].i = jj+1; - qstack[qid].j = j; - qstack[qid].min = pivot+1; - qstack[qid].max = max; - qstack[qid].ready = 1; - if ( ( waiting++ ) >= qstack_size ) - error( "Qstack overflow." ); - } - - /* Recurse on the left? */ - if ( jj > i && pivot > min ) { - j = jj; - max = pivot; - } - else - break; - - } - - } /* loop over sub-intervals. */ - - waiting--; - - } /* main loop. */ - - - /* Verify sort. */ - /* for ( i = 1 ; i < N ; i++ ) - if ( ind[i-1] > ind[i] ) - error( "Sorting failed (ind[%i]=%i,ind[%i]=%i)." , i-1 , ind[i-1] , i , ind[i] ); */ - - /* Clean up. */ - free( qstack ); - } +void parts_sort(struct part *parts, struct xpart *xparts, int *ind, int N, + int min, int max) { + + struct qstack { + volatile int i, j, min, max; + volatile int ready; + }; + struct qstack *qstack; + int qstack_size = 2 * (max - min) + 10; + volatile unsigned int first, last, waiting; + + int pivot; + int i, ii, j, jj, temp_i, qid; + struct part temp_p; + struct xpart temp_xp; + + /* for ( int k = 0 ; k < N ; k++ ) + if ( ind[k] > max || ind[k] < min ) + error( "ind[%i]=%i is not in [%i,%i]." , k , ind[k] , min , max ); */ + + /* Allocate the stack. */ + if ((qstack = malloc(sizeof(struct qstack) * qstack_size)) == NULL) + error("Failed to allocate qstack."); + + /* Init the interval stack. */ + qstack[0].i = 0; + qstack[0].j = N - 1; + qstack[0].min = min; + qstack[0].max = max; + qstack[0].ready = 1; + for (i = 1; i < qstack_size; i++) qstack[i].ready = 0; + first = 0; + last = 1; + waiting = 1; + + /* Main loop. */ + while (waiting > 0) { + + /* Grab an interval off the queue. */ + qid = (first++) % qstack_size; + + /* Get the stack entry. 
*/ + i = qstack[qid].i; + j = qstack[qid].j; + min = qstack[qid].min; + max = qstack[qid].max; + qstack[qid].ready = 0; + + /* Loop over sub-intervals. */ + while (1) { + + /* Bring beer. */ + pivot = (min + max) / 2; + + /* One pass of QuickSort's partitioning. */ + ii = i; + jj = j; + while (ii < jj) { + while (ii <= j && ind[ii] <= pivot) ii++; + while (jj >= i && ind[jj] > pivot) jj--; + if (ii < jj) { + temp_i = ind[ii]; + ind[ii] = ind[jj]; + ind[jj] = temp_i; + temp_p = parts[ii]; + parts[ii] = parts[jj]; + parts[jj] = temp_p; + temp_xp = xparts[ii]; + xparts[ii] = xparts[jj]; + xparts[jj] = temp_xp; + } + } + /* Verify sort. */ + /* for ( int k = i ; k <= jj ; k++ ) + if ( ind[k] > pivot ) { + message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, + N=%i." , k , ind[k] , pivot , i , j , N ); + error( "Partition failed (<=pivot)." ); + } + for ( int k = jj+1 ; k <= j ; k++ ) + if ( ind[k] <= pivot ) { + message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, + N=%i." , k , ind[k] , pivot , i , j , N ); + error( "Partition failed (>pivot)." ); + } */ + + /* Split-off largest interval. */ + if (jj - i > j - jj + 1) { + + /* Recurse on the left? */ + if (jj > i && pivot > min) { + qid = (last++) % qstack_size; + qstack[qid].i = i; + qstack[qid].j = jj; + qstack[qid].min = min; + qstack[qid].max = pivot; + qstack[qid].ready = 1; + if (waiting++ >= qstack_size) error("Qstack overflow."); + } -void gparts_sort ( struct gpart *gparts , int *ind , int N , int min , int max ) { - - struct qstack { - volatile int i, j, min, max; - volatile int ready; - }; - struct qstack *qstack; - int qstack_size = 2*(max-min) + 10; - volatile unsigned int first, last, waiting; - - int pivot; - int i, ii, j, jj, temp_i, qid; - struct gpart temp_p; - - /* for ( int k = 0 ; k < N ; k++ ) - if ( ind[k] > max || ind[k] < min ) - error( "ind[%i]=%i is not in [%i,%i]." , k , ind[k] , min , max ); */ - - /* Allocate the stack. 
*/ - if ( ( qstack = malloc( sizeof(struct qstack) * qstack_size ) ) == NULL ) - error( "Failed to allocate qstack." ); - - /* Init the interval stack. */ - qstack[0].i = 0; - qstack[0].j = N-1; - qstack[0].min = min; - qstack[0].max = max; - qstack[0].ready = 1; - for ( i = 1 ; i < qstack_size ; i++ ) - qstack[i].ready = 0; - first = 0; last = 1; waiting = 1; - - /* Main loop. */ - while ( waiting > 0 ) { - - /* Grab an interval off the queue. */ - qid = ( first++ ) % qstack_size; - - - /* Get the stack entry. */ - i = qstack[qid].i; - j = qstack[qid].j; - min = qstack[qid].min; - max = qstack[qid].max; - qstack[qid].ready = 0; - - - /* Loop over sub-intervals. */ - while ( 1 ) { - - /* Bring beer. */ - pivot = (min + max) / 2; - - /* One pass of QuickSort's partitioning. */ - ii = i; jj = j; - while ( ii < jj ) { - while ( ii <= j && ind[ii] <= pivot ) - ii++; - while ( jj >= i && ind[jj] > pivot ) - jj--; - if ( ii < jj ) { - temp_i = ind[ii]; ind[ii] = ind[jj]; ind[jj] = temp_i; - temp_p = gparts[ii]; gparts[ii] = gparts[jj]; gparts[jj] = temp_p; - } - } - - /* Verify sort. */ - /* for ( int k = i ; k <= jj ; k++ ) - if ( ind[k] > pivot ) { - message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, N=%i." , k , ind[k] , pivot , i , j , N ); - error( "Partition failed (<=pivot)." ); - } - for ( int k = jj+1 ; k <= j ; k++ ) - if ( ind[k] <= pivot ) { - message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, N=%i." , k , ind[k] , pivot , i , j , N ); - error( "Partition failed (>pivot)." ); - } */ - - /* Split-off largest interval. */ - if ( jj - i > j - jj+1 ) { - - /* Recurse on the left? */ - if ( jj > i && pivot > min ) { - qid = ( last++ ) % qstack_size; - qstack[qid].i = i; - qstack[qid].j = jj; - qstack[qid].min = min; - qstack[qid].max = pivot; - qstack[qid].ready = 1; - if ( ( waiting++ ) >= qstack_size ) - error( "Qstack overflow." ); - } - - /* Recurse on the right? 
*/ - if ( jj+1 < j && pivot+1 < max ) { - i = jj+1; - min = pivot+1; - } - else - break; - - } - - else { - - /* Recurse on the right? */ - if ( jj+1 < j && pivot+1 < max ) { - qid = ( last++ ) % qstack_size; - qstack[qid].i = jj+1; - qstack[qid].j = j; - qstack[qid].min = pivot+1; - qstack[qid].max = max; - qstack[qid].ready = 1; - if ( ( waiting++ ) >= qstack_size ) - error( "Qstack overflow." ); - } - - /* Recurse on the left? */ - if ( jj > i && pivot > min ) { - j = jj; - max = pivot; - } - else - break; - - } - - } /* loop over sub-intervals. */ - - waiting--; - - } /* main loop. */ - - - - /* Verify sort. */ - /* for ( i = 1 ; i < N ; i++ ) - if ( ind[i-1] > ind[i] ) - error( "Sorting failed (ind[%i]=%i,ind[%i]=%i)." , i-1 , ind[i-1] , i , ind[i] ); */ - - /* Clean up. */ - free( qstack ); + /* Recurse on the right? */ + if (jj + 1 < j && pivot + 1 < max) { + i = jj + 1; + min = pivot + 1; + } else + break; + + } else { + + /* Recurse on the right? */ + if (jj + 1 < j && pivot + 1 < max) { + qid = (last++) % qstack_size; + qstack[qid].i = jj + 1; + qstack[qid].j = j; + qstack[qid].min = pivot + 1; + qstack[qid].max = max; + qstack[qid].ready = 1; + if ((waiting++) >= qstack_size) error("Qstack overflow."); + } - } + /* Recurse on the left? */ + if (jj > i && pivot > min) { + j = jj; + max = pivot; + } else + break; + } + } /* loop over sub-intervals. */ + + waiting--; + + } /* main loop. */ + + /* Verify sort. */ + /* for ( i = 1 ; i < N ; i++ ) + if ( ind[i-1] > ind[i] ) + error( "Sorting failed (ind[%i]=%i,ind[%i]=%i)." , i-1 , ind[i-1] , i + , ind[i] ); */ + + /* Clean up. 
*/ + free(qstack); +} + +void gparts_sort(struct gpart *gparts, int *ind, int N, int min, int max) { + + struct qstack { + volatile int i, j, min, max; + volatile int ready; + }; + struct qstack *qstack; + int qstack_size = 2 * (max - min) + 10; + volatile unsigned int first, last, waiting; + + int pivot; + int i, ii, j, jj, temp_i, qid; + struct gpart temp_p; + + /* for ( int k = 0 ; k < N ; k++ ) + if ( ind[k] > max || ind[k] < min ) + error( "ind[%i]=%i is not in [%i,%i]." , k , ind[k] , min , max ); */ + + /* Allocate the stack. */ + if ((qstack = malloc(sizeof(struct qstack) * qstack_size)) == NULL) + error("Failed to allocate qstack."); + + /* Init the interval stack. */ + qstack[0].i = 0; + qstack[0].j = N - 1; + qstack[0].min = min; + qstack[0].max = max; + qstack[0].ready = 1; + for (i = 1; i < qstack_size; i++) qstack[i].ready = 0; + first = 0; + last = 1; + waiting = 1; + + /* Main loop. */ + while (waiting > 0) { + + /* Grab an interval off the queue. */ + qid = (first++) % qstack_size; + + /* Get the stack entry. */ + i = qstack[qid].i; + j = qstack[qid].j; + min = qstack[qid].min; + max = qstack[qid].max; + qstack[qid].ready = 0; + + /* Loop over sub-intervals. */ + while (1) { + + /* Bring beer. */ + pivot = (min + max) / 2; + + /* One pass of QuickSort's partitioning. */ + ii = i; + jj = j; + while (ii < jj) { + while (ii <= j && ind[ii] <= pivot) ii++; + while (jj >= i && ind[jj] > pivot) jj--; + if (ii < jj) { + temp_i = ind[ii]; + ind[ii] = ind[jj]; + ind[jj] = temp_i; + temp_p = gparts[ii]; + gparts[ii] = gparts[jj]; + gparts[jj] = temp_p; + } + } + + /* Verify sort. */ + /* for ( int k = i ; k <= jj ; k++ ) + if ( ind[k] > pivot ) { + message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, + N=%i." , k , ind[k] , pivot , i , j , N ); + error( "Partition failed (<=pivot)." ); + } + for ( int k = jj+1 ; k <= j ; k++ ) + if ( ind[k] <= pivot ) { + message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, + N=%i." 
, k , ind[k] , pivot , i , j , N ); + error( "Partition failed (>pivot)." ); + } */ + + /* Split-off largest interval. */ + if (jj - i > j - jj + 1) { + + /* Recurse on the left? */ + if (jj > i && pivot > min) { + qid = (last++) % qstack_size; + qstack[qid].i = i; + qstack[qid].j = jj; + qstack[qid].min = min; + qstack[qid].max = pivot; + qstack[qid].ready = 1; + if ((waiting++) >= qstack_size) error("Qstack overflow."); + } + + /* Recurse on the right? */ + if (jj + 1 < j && pivot + 1 < max) { + i = jj + 1; + min = pivot + 1; + } else + break; + + } else { + + /* Recurse on the right? */ + if (jj + 1 < j && pivot + 1 < max) { + qid = (last++) % qstack_size; + qstack[qid].i = jj + 1; + qstack[qid].j = j; + qstack[qid].min = pivot + 1; + qstack[qid].max = max; + qstack[qid].ready = 1; + if ((waiting++) >= qstack_size) error("Qstack overflow."); + } + + /* Recurse on the left? */ + if (jj > i && pivot > min) { + j = jj; + max = pivot; + } else + break; + } + + } /* loop over sub-intervals. */ + + waiting--; + + } /* main loop. */ + + /* Verify sort. */ + /* for ( i = 1 ; i < N ; i++ ) + if ( ind[i-1] > ind[i] ) + error( "Sorting failed (ind[%i]=%i,ind[%i]=%i)." , i-1 , ind[i-1] , i + , ind[i] ); */ + + /* Clean up. */ + free(qstack); +} /** * @brief Mapping function to free the sorted indices buffers. */ -void space_map_clearsort ( struct cell *c , void *data ) { - - if ( c->sort != NULL ) { - free( c->sort ); - c->sort = NULL; - } - - } +void space_map_clearsort(struct cell *c, void *data) { + if (c->sort != NULL) { + free(c->sort); + c->sort = NULL; + } +} /** * @brief Map a function to all particles in a aspace. @@ -793,34 +784,30 @@ void space_map_clearsort ( struct cell *c , void *data ) { * @param fun Function pointer to apply on the cells. * @param data Data passed to the function fun. 
*/ - -void space_map_parts ( struct space *s , void (*fun)( struct part *p , struct cell *c , void *data ) , void *data ) { - - int cid = 0; - - void rec_map ( struct cell *c ) { - - int k; - - /* No progeny? */ - if ( !c->split ) - for ( k = 0 ; k < c->count ; k++ ) - fun( &c->parts[k] , c , data ); - - /* Otherwise, recurse. */ - else - for ( k = 0 ; k < 8 ; k++ ) - if ( c->progeny[k] != NULL ) - rec_map( c->progeny[k] ); - - } - - /* Call the recursive function on all higher-level cells. */ - for( cid = 0; cid < s->nr_cells; cid++ ) - rec_map( &s->cells[cid] ); - - } +void space_map_parts(struct space *s, + void (*fun)(struct part *p, struct cell *c, void *data), + void *data) { + + int cid = 0; + + void rec_map(struct cell * c) { + + int k; + + /* No progeny? */ + if (!c->split) + for (k = 0; k < c->count; k++) fun(&c->parts[k], c, data); + + /* Otherwise, recurse. */ + else + for (k = 0; k < 8; k++) + if (c->progeny[k] != NULL) rec_map(c->progeny[k]); + } + + /* Call the recursive function on all higher-level cells. */ + for (cid = 0; cid < s->nr_cells; cid++) rec_map(&s->cells[cid]); +} /** * @brief Map a function to all particles in a aspace. @@ -830,61 +817,50 @@ void space_map_parts ( struct space *s , void (*fun)( struct part *p , struct ce * @param fun Function pointer to apply on the cells. * @param data Data passed to the function fun. */ - -void space_map_cells_post ( struct space *s , int full , void (*fun)( struct cell *c , void *data ) , void *data ) { - - int cid = 0; - - void rec_map ( struct cell *c ) { - - int k; - - /* Recurse. */ - if ( c->split ) - for ( k = 0 ; k < 8 ; k++ ) - if ( c->progeny[k] != NULL ) - rec_map( c->progeny[k] ); - - /* No progeny? */ - if ( full || !c->split ) - fun( c , data ); - - } - - /* Call the recursive function on all higher-level cells. 
*/ - for ( cid = 0; cid < s->nr_cells; cid++ ) - rec_map( &s->cells[cid] ); - } +void space_map_cells_post(struct space *s, int full, + void (*fun)(struct cell *c, void *data), void *data) { + int cid = 0; -void space_map_cells_pre ( struct space *s , int full , void (*fun)( struct cell *c , void *data ) , void *data ) { - - int cid = 0; - - void rec_map ( struct cell *c ) { - - int k; - - /* No progeny? */ - if ( full || !c->split ) - fun( c , data ); - - /* Recurse. */ - if ( c->split ) - for ( k = 0 ; k < 8 ; k++ ) - if ( c->progeny[k] != NULL ) - rec_map( c->progeny[k] ); - - } - - /* Call the recursive function on all higher-level cells. */ - for (cid = 0; cid < s->nr_cells; cid++ ) - rec_map( &s->cells[cid] ); - + void rec_map(struct cell * c) { - } + int k; + + /* Recurse. */ + if (c->split) + for (k = 0; k < 8; k++) + if (c->progeny[k] != NULL) rec_map(c->progeny[k]); + + /* No progeny? */ + if (full || !c->split) fun(c, data); + } + + /* Call the recursive function on all higher-level cells. */ + for (cid = 0; cid < s->nr_cells; cid++) rec_map(&s->cells[cid]); +} +void space_map_cells_pre(struct space *s, int full, + void (*fun)(struct cell *c, void *data), void *data) { + + int cid = 0; + + void rec_map(struct cell * c) { + + int k; + + /* No progeny? */ + if (full || !c->split) fun(c, data); + + /* Recurse. */ + if (c->split) + for (k = 0; k < 8; k++) + if (c->progeny[k] != NULL) rec_map(c->progeny[k]); + } + + /* Call the recursive function on all higher-level cells. */ + for (cid = 0; cid < s->nr_cells; cid++) rec_map(&s->cells[cid]); +} /** * @brief Split cells that contain too many particles. @@ -892,114 +868,103 @@ void space_map_cells_pre ( struct space *s , int full , void (*fun)( struct cell * @param s The #space we are working in. * @param c The #cell under consideration. 
*/ - -void space_split ( struct space *s , struct cell *c ) { - - int k, count = c->count, gcount = c->gcount, maxdepth = 0; - float h, h_max = 0.0f, dt, dt_min = c->parts[0].dt, dt_max = dt_min; - struct cell *temp; - struct part *p, *parts = c->parts; - struct xpart *xp, *xparts = c->xparts; - - /* Check the depth. */ - if ( c->depth > s->maxdepth ) - s->maxdepth = c->depth; - - /* Split or let it be? */ - if ( count > space_splitsize || gcount > space_splitsize ) { - - /* No longer just a leaf. */ - c->split = 1; - - /* Create the cell's progeny. */ - for ( k = 0 ; k < 8 ; k++ ) { - temp = space_getcell( s ); - temp->count = 0; - temp->gcount = 0; - temp->loc[0] = c->loc[0]; - temp->loc[1] = c->loc[1]; - temp->loc[2] = c->loc[2]; - temp->h[0] = c->h[0]/2; - temp->h[1] = c->h[1]/2; - temp->h[2] = c->h[2]/2; - temp->dmin = c->dmin/2; - if ( k & 4 ) - temp->loc[0] += temp->h[0]; - if ( k & 2 ) - temp->loc[1] += temp->h[1]; - if ( k & 1 ) - temp->loc[2] += temp->h[2]; - temp->depth = c->depth + 1; - temp->split = 0; - temp->h_max = 0.0; - temp->dx_max = 0.0; - temp->nodeID = c->nodeID; - temp->parent = c; - c->progeny[k] = temp; - } - - /* Split the cell data. */ - cell_split( c ); - - /* Remove any progeny with zero parts. */ - for ( k = 0 ; k < 8 ; k++ ) - if ( c->progeny[k]->count == 0 && c->progeny[k]->gcount == 0 ) { - space_recycle( s , c->progeny[k] ); - c->progeny[k] = NULL; - } - else { - space_split( s , c->progeny[k] ); - h_max = fmaxf( h_max , c->progeny[k]->h_max ); - dt_min = fminf( dt_min , c->progeny[k]->dt_min ); - dt_max = fmaxf( dt_max , c->progeny[k]->dt_max ); - if ( c->progeny[k]->maxdepth > maxdepth ) - maxdepth = c->progeny[k]->maxdepth; - } - - /* Set the values for this cell. */ - c->h_max = h_max; - c->dt_min = dt_min; - c->dt_max = dt_max; - c->maxdepth = maxdepth; - - } - - /* Otherwise, collect the data for this cell. */ - else { - - /* Clear the progeny. 
*/ - bzero( c->progeny , sizeof(struct cell *) * 8 ); - c->split = 0; - c->maxdepth = c->depth; - - /* Get dt_min/dt_max. */ - - for ( k = 0 ; k < count ; k++ ) { - p = &parts[k]; - xp = &xparts[k]; - xp->x_old[0] = p->x[0]; - xp->x_old[1] = p->x[1]; - xp->x_old[2] = p->x[2]; - dt = p->dt; - h = p->h; - if ( h > h_max ) - h_max = h; - if ( dt < dt_min ) - dt_min = dt; - if ( dt > dt_max ) - dt_max = dt; - } - c->h_max = h_max; - c->dt_min = dt_min; - c->dt_max = dt_max; - - } - - /* Set ownership accorind to the start of the parts array. */ - c->owner = ( ( c->parts - s->parts ) % s->nr_parts ) * s->nr_queues / s->nr_parts; +void space_split(struct space *s, struct cell *c) { + + int k, count = c->count, gcount = c->gcount, maxdepth = 0; + float h, h_max = 0.0f, dt, dt_min = c->parts[0].dt, dt_max = dt_min; + struct cell *temp; + struct part *p, *parts = c->parts; + struct xpart *xp, *xparts = c->xparts; + + /* Check the depth. */ + if (c->depth > s->maxdepth) s->maxdepth = c->depth; + + /* Split or let it be? */ + if (count > space_splitsize || gcount > space_splitsize) { + + /* No longer just a leaf. */ + c->split = 1; + + /* Create the cell's progeny. */ + for (k = 0; k < 8; k++) { + temp = space_getcell(s); + temp->count = 0; + temp->gcount = 0; + temp->loc[0] = c->loc[0]; + temp->loc[1] = c->loc[1]; + temp->loc[2] = c->loc[2]; + temp->h[0] = c->h[0] / 2; + temp->h[1] = c->h[1] / 2; + temp->h[2] = c->h[2] / 2; + temp->dmin = c->dmin / 2; + if (k & 4) temp->loc[0] += temp->h[0]; + if (k & 2) temp->loc[1] += temp->h[1]; + if (k & 1) temp->loc[2] += temp->h[2]; + temp->depth = c->depth + 1; + temp->split = 0; + temp->h_max = 0.0; + temp->dx_max = 0.0; + temp->nodeID = c->nodeID; + temp->parent = c; + c->progeny[k] = temp; } + /* Split the cell data. */ + cell_split(c); + + /* Remove any progeny with zero parts. 
*/ + for (k = 0; k < 8; k++) + if (c->progeny[k]->count == 0 && c->progeny[k]->gcount == 0) { + space_recycle(s, c->progeny[k]); + c->progeny[k] = NULL; + } else { + space_split(s, c->progeny[k]); + h_max = fmaxf(h_max, c->progeny[k]->h_max); + dt_min = fminf(dt_min, c->progeny[k]->dt_min); + dt_max = fmaxf(dt_max, c->progeny[k]->dt_max); + if (c->progeny[k]->maxdepth > maxdepth) + maxdepth = c->progeny[k]->maxdepth; + } + + /* Set the values for this cell. */ + c->h_max = h_max; + c->dt_min = dt_min; + c->dt_max = dt_max; + c->maxdepth = maxdepth; + + } + + /* Otherwise, collect the data for this cell. */ + else { + + /* Clear the progeny. */ + bzero(c->progeny, sizeof(struct cell *) * 8); + c->split = 0; + c->maxdepth = c->depth; + + /* Get dt_min/dt_max. */ + + for (k = 0; k < count; k++) { + p = &parts[k]; + xp = &xparts[k]; + xp->x_old[0] = p->x[0]; + xp->x_old[1] = p->x[1]; + xp->x_old[2] = p->x[2]; + dt = p->dt; + h = p->h; + if (h > h_max) h_max = h; + if (dt < dt_min) dt_min = dt; + if (dt > dt_max) dt_max = dt; + } + c->h_max = h_max; + c->dt_min = dt_min; + c->dt_max = dt_max; + } + + /* Set ownership accorind to the start of the parts array. */ + c->owner = ((c->parts - s->parts) % s->nr_parts) * s->nr_queues / s->nr_parts; +} /** * @brief Return a used cell to the cell buffer. @@ -1007,77 +972,71 @@ void space_split ( struct space *s , struct cell *c ) { * @param s The #space. * @param c The #cell. */ - -void space_recycle ( struct space *s , struct cell *c ) { - - /* Lock the space. */ - lock_lock( &s->lock ); - - /* Clear the cell. */ - if ( lock_destroy( &c->lock ) != 0 ) - error( "Failed to destroy spinlock." ); - - /* Clear this cell's sort arrays. */ - if ( c->sort != NULL ) - free( c->sort ); - - /* Clear the cell data. */ - bzero( c , sizeof(struct cell) ); - - /* Hook this cell into the buffer. */ - c->next = s->cells_new; - s->cells_new = c; - s->tot_cells -= 1; - - /* Unlock the space. 
*/ - lock_unlock_blind( &s->lock ); - - } +void space_recycle(struct space *s, struct cell *c) { + + /* Lock the space. */ + lock_lock(&s->lock); + + /* Clear the cell. */ + if (lock_destroy(&c->lock) != 0) error("Failed to destroy spinlock."); + + /* Clear this cell's sort arrays. */ + if (c->sort != NULL) free(c->sort); + + /* Clear the cell data. */ + bzero(c, sizeof(struct cell)); + + /* Hook this cell into the buffer. */ + c->next = s->cells_new; + s->cells_new = c; + s->tot_cells -= 1; + + /* Unlock the space. */ + lock_unlock_blind(&s->lock); +} /** * @brief Get a new empty cell. * * @param s The #space. */ - -struct cell *space_getcell ( struct space *s ) { - struct cell *c; - int k; - - /* Lock the space. */ - lock_lock( &s->lock ); - - /* Is the buffer empty? */ - if ( s->cells_new == NULL ) { - if ( posix_memalign( (void *)&s->cells_new , 64 , space_cellallocchunk * sizeof(struct cell) ) != 0 ) - error( "Failed to allocate more cells." ); - bzero( s->cells_new , space_cellallocchunk * sizeof(struct cell) ); - for ( k = 0 ; k < space_cellallocchunk-1 ; k++ ) - s->cells_new[k].next = &s->cells_new[k+1]; - s->cells_new[ space_cellallocchunk-1 ].next = NULL; - } +struct cell *space_getcell(struct space *s) { - /* Pick off the next cell. */ - c = s->cells_new; - s->cells_new = c->next; - s->tot_cells += 1; - - /* Unlock the space. */ - lock_unlock_blind( &s->lock ); - - /* Init some things in the cell. */ - bzero( c , sizeof(struct cell) ); - c->nodeID = -1; - if ( lock_init( &c->lock ) != 0 || - lock_init( &c->glock ) != 0 ) - error( "Failed to initialize cell spinlocks." ); - - return c; + struct cell *c; + int k; - } + /* Lock the space. */ + lock_lock(&s->lock); + + /* Is the buffer empty? 
*/ + if (s->cells_new == NULL) { + if (posix_memalign((void *)&s->cells_new, 64, + space_cellallocchunk * sizeof(struct cell)) != 0) + error("Failed to allocate more cells."); + bzero(s->cells_new, space_cellallocchunk * sizeof(struct cell)); + for (k = 0; k < space_cellallocchunk - 1; k++) + s->cells_new[k].next = &s->cells_new[k + 1]; + s->cells_new[space_cellallocchunk - 1].next = NULL; + } + + /* Pick off the next cell. */ + c = s->cells_new; + s->cells_new = c->next; + s->tot_cells += 1; + /* Unlock the space. */ + lock_unlock_blind(&s->lock); + + /* Init some things in the cell. */ + bzero(c, sizeof(struct cell)); + c->nodeID = -1; + if (lock_init(&c->lock) != 0 || lock_init(&c->glock) != 0) + error("Failed to initialize cell spinlocks."); + + return c; +} /** * @brief Split the space into cells given the array of particles. @@ -1095,78 +1054,76 @@ struct cell *space_getcell ( struct space *s ) { * recursively. */ - -void space_init ( struct space *s , double dim[3] , struct part *parts , int N , int periodic , double h_max ) { - - /* Store eveything in the space. */ - s->dim[0] = dim[0]; s->dim[1] = dim[1]; s->dim[2] = dim[2]; - s->periodic = periodic; - s->nr_parts = N; - s->size_parts = N; - s->parts = parts; - s->cell_min = h_max; - s->nr_queues = 1; - s->size_parts_foreign = 0; - - /* Check that all the particle positions are reasonable, wrap if periodic. */ - if ( periodic ) { - for ( int k = 0 ; k < N ; k++ ) - for ( int j = 0 ; j < 3 ; j++ ) { - while ( parts[k].x[j] < 0 ) parts[k].x[j] += dim[j]; - while ( parts[k].x[j] >= dim[j] ) parts[k].x[j] -= dim[j]; - } +void space_init(struct space *s, double dim[3], struct part *parts, int N, + int periodic, double h_max) { + + /* Store eveything in the space. 
*/ + s->dim[0] = dim[0]; + s->dim[1] = dim[1]; + s->dim[2] = dim[2]; + s->periodic = periodic; + s->nr_parts = N; + s->size_parts = N; + s->parts = parts; + s->cell_min = h_max; + s->nr_queues = 1; + s->size_parts_foreign = 0; + + /* Check that all the particle positions are reasonable, wrap if periodic. */ + if (periodic) { + for (int k = 0; k < N; k++) + for (int j = 0; j < 3; j++) { + while (parts[k].x[j] < 0) parts[k].x[j] += dim[j]; + while (parts[k].x[j] >= dim[j]) parts[k].x[j] -= dim[j]; } - else { - for ( int k = 0 ; k < N ; k++ ) - for ( int j = 0 ; j < 3 ; j++ ) - if ( parts[k].x[j] < 0 || parts[k].x[j] >= dim[j] ) - error( "Not all particles are within the specified domain." ); + } else { + for (int k = 0; k < N; k++) + for (int j = 0; j < 3; j++) + if (parts[k].x[j] < 0 || parts[k].x[j] >= dim[j]) + error("Not all particles are within the specified domain."); + } + + /* Allocate the xtra parts array. */ + if (posix_memalign((void *)&s->xparts, part_align, + N * sizeof(struct xpart)) != 0) + error("Failed to allocate xparts."); + bzero(s->xparts, N * sizeof(struct xpart)); + + /* Initialize the velocities and internal energies. */ + for (int k = 0; k < N; k++) { + struct part *p = &parts[k]; + struct xpart *xp = &s->xparts[k]; + xp->v_hdt[0] = p->v[0]; + xp->v_hdt[1] = p->v[1]; + xp->v_hdt[2] = p->v[2]; + xp->u_hdt = p->u; + } + + /* For now, clone the parts to make gparts. 
*/ + if (posix_memalign((void *)&s->gparts, part_align, + N * sizeof(struct gpart)) != 0) + error("Failed to allocate gparts."); + bzero(s->gparts, N * sizeof(struct gpart)); + /* for ( int k = 0 ; k < N ; k++ ) { + s->gparts[k].x[0] = s->parts[k].x[0]; + s->gparts[k].x[1] = s->parts[k].x[1]; + s->gparts[k].x[2] = s->parts[k].x[2]; + s->gparts[k].v[0] = s->parts[k].v[0]; + s->gparts[k].v[1] = s->parts[k].v[1]; + s->gparts[k].v[2] = s->parts[k].v[2]; + s->gparts[k].mass = s->parts[k].mass; + s->gparts[k].dt = s->parts[k].dt; + s->gparts[k].id = s->parts[k].id; + s->gparts[k].part = &s->parts[k]; + s->parts[k].gpart = &s->gparts[k]; } - - /* Allocate the xtra parts array. */ - if ( posix_memalign( (void *)&s->xparts , part_align , N * sizeof(struct xpart) ) != 0 ) - error( "Failed to allocate xparts." ); - bzero( s->xparts , N * sizeof(struct xpart) ); - - /* Initialize the velocities and internal energies. */ - for ( int k = 0 ; k < N ; k++ ) { - struct part *p = &parts[k]; - struct xpart *xp = &s->xparts[k]; - xp->v_hdt[0] = p->v[0]; - xp->v_hdt[1] = p->v[1]; - xp->v_hdt[2] = p->v[2]; - xp->u_hdt = p->u; - } - - - /* For now, clone the parts to make gparts. */ - if ( posix_memalign( (void *)&s->gparts , part_align , N * sizeof(struct gpart) ) != 0 ) - error( "Failed to allocate gparts." ); - bzero( s->gparts , N * sizeof(struct gpart) ); - /* for ( int k = 0 ; k < N ; k++ ) { - s->gparts[k].x[0] = s->parts[k].x[0]; - s->gparts[k].x[1] = s->parts[k].x[1]; - s->gparts[k].x[2] = s->parts[k].x[2]; - s->gparts[k].v[0] = s->parts[k].v[0]; - s->gparts[k].v[1] = s->parts[k].v[1]; - s->gparts[k].v[2] = s->parts[k].v[2]; - s->gparts[k].mass = s->parts[k].mass; - s->gparts[k].dt = s->parts[k].dt; - s->gparts[k].id = s->parts[k].id; - s->gparts[k].part = &s->parts[k]; - s->parts[k].gpart = &s->gparts[k]; - } - s->nr_gparts = s->nr_parts; */ - s->nr_gparts = 0; - s->size_gparts = s->size_parts; - - - /* Init the space lock. 
*/ - if ( lock_init( &s->lock ) != 0 ) - error( "Failed to create space spin-lock." ); - - /* Build the cells and the tasks. */ - space_regrid( s , h_max ); - - } + s->nr_gparts = s->nr_parts; */ + s->nr_gparts = 0; + s->size_gparts = s->size_parts; + + /* Init the space lock. */ + if (lock_init(&s->lock) != 0) error("Failed to create space spin-lock."); + /* Build the cells and the tasks. */ + space_regrid(s, h_max); +} diff --git a/src/space.h b/src/space.h index 9d1f849d3b29b26d80b12a9767d6505040a1c74c..e0bad6773547f813d70943c2ca2703529a0306a8 100644 --- a/src/space.h +++ b/src/space.h @@ -1,39 +1,45 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_SPACE_H +#define SWIFT_SPACE_H +/* Includes. */ +#include "cell.h" +#include "part.h" - +/* Forward-declare the engine to avoid cyclic includes. */ +struct engine; /* Some constants. 
*/ -#define space_maxdepth 10 -#define space_cellallocchunk 1000 -#define space_splitratio 0.875f -#define space_splitsize_default 400 -#define space_maxsize_default 8000000 -#define space_subsize_default 8000000 -#define space_stretch 1.10f -#define space_maxreldx 0.25f -#define space_qstack 2048 - +#define space_maxdepth 10 +#define space_cellallocchunk 1000 +#define space_splitratio 0.875f +#define space_splitsize_default 400 +#define space_maxsize_default 8000000 +#define space_subsize_default 8000000 +#define space_stretch 1.10f +#define space_maxreldx 0.25f +#define space_qstack 2048 /* Convert cell location to ID. */ -#define cell_getid( cdim , i , j , k ) ( (int)(k) + (cdim)[2]*( (int)(j) + (cdim)[1]*(int)(i) ) ) +#define cell_getid(cdim, i, j, k) \ + ((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i))) /* Split size. */ extern int space_splitsize; @@ -42,83 +48,87 @@ extern int space_subsize; /* Map shift vector to sortlist. */ extern const int sortlistID[27]; - - + /* Entry in a list of sorted indices. */ struct entry { - float d; - int i; - }; - - + float d; + int i; +}; + /* The space in which the cells reside. */ struct space { - /* Spatial extent. */ - double dim[3]; - - /* Cell widths. */ - double h[3], ih[3]; - - /* The minimum and maximum cutoff radii. */ - double h_max, cell_min; - - /* Current time step for particles. */ - float dt_step; - - /* Current maximum displacement for particles. */ - float dx_max; - - /* Number of cells. */ - int nr_cells, tot_cells; - - /* Space dimensions in number of cells. */ - int maxdepth, cdim[3]; - - /* The (level 0) cells themselves. */ - struct cell *cells; - - /* Buffer of unused cells. */ - struct cell *cells_new; - - /* The particle data (cells have pointers to this). */ - struct part *parts; - struct xpart *xparts; - struct gpart *gparts; - - /* The total number of parts in the space. */ - int nr_parts, size_parts; - int nr_gparts, size_gparts; - - /* Is the space periodic? 
*/ - int periodic; - - /* General-purpose lock for this space. */ - lock_type lock; - - /* Number of queues in the system. */ - int nr_queues; - - /* The associated engine. */ - struct engine *e; - - /* Buffers for parts that we will receive from foreign cells. */ - struct part *parts_foreign; - int nr_parts_foreign, size_parts_foreign; - - }; + /* Spatial extent. */ + double dim[3]; + + /* Cell widths. */ + double h[3], ih[3]; + + /* The minimum and maximum cutoff radii. */ + double h_max, cell_min; + + /* Current time step for particles. */ + float dt_step; + + /* Current maximum displacement for particles. */ + float dx_max; + + /* Number of cells. */ + int nr_cells, tot_cells; + + /* Space dimensions in number of cells. */ + int maxdepth, cdim[3]; + + /* The (level 0) cells themselves. */ + struct cell *cells; + + /* Buffer of unused cells. */ + struct cell *cells_new; + + /* The particle data (cells have pointers to this). */ + struct part *parts; + struct xpart *xparts; + struct gpart *gparts; + + /* The total number of parts in the space. */ + int nr_parts, size_parts; + int nr_gparts, size_gparts; + + /* Is the space periodic? */ + int periodic; + + /* General-purpose lock for this space. */ + lock_type lock; + + /* Number of queues in the system. */ + int nr_queues; + + /* The associated engine. */ + struct engine *e; + /* Buffers for parts that we will receive from foreign cells. */ + struct part *parts_foreign; + int nr_parts_foreign, size_parts_foreign; +}; /* function prototypes. 
*/ -void parts_sort ( struct part *parts , struct xpart *xparts , int *ind , int N , int min , int max ); -void gparts_sort ( struct gpart *gparts , int *ind , int N , int min , int max ); -struct cell *space_getcell ( struct space *s ); -int space_getsid ( struct space *s , struct cell **ci , struct cell **cj , double *shift ); -void space_init ( struct space *s , double dim[3] , struct part *parts , int N , int periodic , double h_max ); -void space_map_cells_pre ( struct space *s , int full , void (*fun)( struct cell *c , void *data ) , void *data ); -void space_map_parts ( struct space *s , void (*fun)( struct part *p , struct cell *c , void *data ) , void *data ); -void space_map_cells_post ( struct space *s , int full , void (*fun)( struct cell *c , void *data ) , void *data ); -void space_rebuild ( struct space *s , double h_max ); -void space_recycle ( struct space *s , struct cell *c ); -void space_split ( struct space *s , struct cell *c ); +void parts_sort(struct part *parts, struct xpart *xparts, int *ind, int N, + int min, int max); +void gparts_sort(struct gpart *gparts, int *ind, int N, int min, int max); +struct cell *space_getcell(struct space *s); +int space_getsid(struct space *s, struct cell **ci, struct cell **cj, + double *shift); +void space_init(struct space *s, double dim[3], struct part *parts, int N, + int periodic, double h_max); +void space_map_cells_pre(struct space *s, int full, + void (*fun)(struct cell *c, void *data), void *data); +void space_map_parts(struct space *s, + void (*fun)(struct part *p, struct cell *c, void *data), + void *data); +void space_map_cells_post(struct space *s, int full, + void (*fun)(struct cell *c, void *data), void *data); +void space_rebuild(struct space *s, double h_max); +void space_recycle(struct space *s, struct cell *c); +void space_split(struct space *s, struct cell *c); +#endif /* SWIFT_SPACE_H */ diff --git a/src/swift.h b/src/swift.h index 
7652191b2e9cfb864cc64e157eeda98627cdccdc..b302bca9b007cec47c96e1ab07770a3a3dc84966 100644 --- a/src/swift.h +++ b/src/swift.h @@ -1,47 +1,49 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_SWIFT_H +#define SWIFT_SWIFT_H /* Config parameters. */ #include "../config.h" /* Local headers. 
*/ +#include "atomic.h" +#include "cell.h" #include "const.h" -#include "error.h" -#include "cycle.h" -#include "timers.h" #include "const.h" -#include "atomic.h" +#include "cycle.h" +#include "debug.h" +#include "engine.h" +#include "error.h" #include "lock.h" -#include "task.h" -#include "scheduler.h" -#include "part.h" #include "multipole.h" -#include "cell.h" -#include "space.h" +#include "parallel_io.h" +#include "part.h" #include "queue.h" #include "runner.h" -#include "engine.h" -#include "units.h" -#include "single_io.h" +#include "scheduler.h" #include "serial_io.h" -#include "parallel_io.h" -#include "debug.h" +#include "single_io.h" +#include "space.h" +#include "task.h" +#include "timers.h" +#include "units.h" #include "version.h" #ifdef LEGACY_GADGET2_SPH @@ -50,3 +52,5 @@ #include "runner_iact.h" #endif #include "runner_iact_grav.h" + +#endif /* SWIFT_SWIFT_H */ diff --git a/src/task.c b/src/task.c index 94bacd3766d33865da8a6cbf64a0eb2f3aa2bad2..949caab56c4c4d8a0e3c73d05014ebc5ad68657a 100644 --- a/src/task.c +++ b/src/task.c @@ -1,165 +1,144 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- * + * ******************************************************************************/ /* Config parameters. */ #include "../config.h" /* Some standard headers. */ -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <pthread.h> -#include <math.h> #include <float.h> #include <limits.h> #include <sched.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> /* MPI headers. */ #ifdef WITH_MPI - #include <mpi.h> +#include <mpi.h> #endif +/* This object's header. */ +#include "task.h" + /* Local headers. */ -#include "const.h" -#include "cycle.h" #include "atomic.h" -#include "lock.h" -#include "space.h" -#include "part.h" -#include "multipole.h" -#include "cell.h" -#include "task.h" #include "error.h" +#include "lock.h" /* Task type names. */ -const char *taskID_names[task_type_count] = { - "none" , "sort" , "self" , "pair" , "sub" , "ghost" , - "kick1" , "kick2" , "send" , "recv" , "link" , "grav_pp" , - "grav_mm" , "grav_up" , "grav_down" }; - +const char *taskID_names[task_type_count] = { + "none", "sort", "self", "pair", "sub", + "ghost", "kick1", "kick2", "send", "recv", + "link", "grav_pp", "grav_mm", "grav_up", "grav_down"}; /** * @brief Unlock the cell held by this task. - * + * * @param t The #task. */ - -void task_unlock ( struct task *t ) { - - /* Act based on task type. */ - switch ( t->type ) { - case task_type_self: - case task_type_sort: - cell_unlocktree( t->ci ); - break; - case task_type_pair: - case task_type_sub: - cell_unlocktree( t->ci ); - if ( t->cj != NULL ) - cell_unlocktree( t->cj ); - break; - case task_type_grav_pp: - case task_type_grav_mm: - case task_type_grav_down: - cell_gunlocktree( t->ci ); - if ( t->cj != NULL ) - cell_gunlocktree( t->cj ); - break; - default: - break; - } - - } +void task_unlock(struct task *t) { + + /* Act based on task type. 
*/ + switch (t->type) { + case task_type_self: + case task_type_sort: + cell_unlocktree(t->ci); + break; + case task_type_pair: + case task_type_sub: + cell_unlocktree(t->ci); + if (t->cj != NULL) cell_unlocktree(t->cj); + break; + case task_type_grav_pp: + case task_type_grav_mm: + case task_type_grav_down: + cell_gunlocktree(t->ci); + if (t->cj != NULL) cell_gunlocktree(t->cj); + break; + default: + break; + } +} /** * @brief Try to lock the cells associated with this task. * * @param t the #task. */ - -int task_lock ( struct task *t ) { - - int type = t->type; - struct cell *ci = t->ci, *cj = t->cj; - - /* Communication task? */ - if ( type == task_type_recv || - type == task_type_send ) { - - #ifdef WITH_MPI - /* Check the status of the MPI request. */ - int res, err; - MPI_Status stat; - if ( ( err = MPI_Test( &t->req , &res , &stat ) ) != MPI_SUCCESS ) { - char buff[ MPI_MAX_ERROR_STRING ]; - int len; - MPI_Error_string( err , buff , &len ); - error( "Failed to test request on send/recv task (tag=%i, %s)." , t->flags , buff ); - } - return res; - #else - error( "SWIFT was not compiled with MPI support." ); - #endif - - } - - /* Unary lock? */ - else if ( type == task_type_self || - type == task_type_sort || - (type == task_type_sub && cj == NULL) ) { - if ( cell_locktree( ci ) != 0 ) - return 0; - } - - /* Otherwise, binary lock. */ - else if ( type == task_type_pair || - ( type == task_type_sub && cj != NULL ) ) { - if ( ci->hold || cj->hold ) - return 0; - if ( cell_locktree( ci ) != 0 ) - return 0; - if ( cell_locktree( cj ) != 0 ) { - cell_unlocktree( ci ); - return 0; - } - } - - /* Gravity tasks? */ - else if ( type == task_type_grav_mm || - type == task_type_grav_pp || - type == task_type_grav_down ) { - if ( ci->ghold || ( cj != NULL && cj->ghold ) ) - return 0; - if ( cell_glocktree( ci ) != 0 ) - return 0; - if ( cj != NULL && cell_glocktree( cj ) != 0 ) { - cell_gunlocktree( ci ); - return 0; - } - } - - /* If we made it this far, we've got a lock. 
*/ - return 1; - + +int task_lock(struct task *t) { + + int type = t->type; + struct cell *ci = t->ci, *cj = t->cj; + + /* Communication task? */ + if (type == task_type_recv || type == task_type_send) { + +#ifdef WITH_MPI + /* Check the status of the MPI request. */ + int res, err; + MPI_Status stat; + if ((err = MPI_Test(&t->req, &res, &stat)) != MPI_SUCCESS) { + char buff[MPI_MAX_ERROR_STRING]; + int len; + MPI_Error_string(err, buff, &len); + error("Failed to test request on send/recv task (tag=%i, %s).", t->flags, + buff); } + return res; +#else + error("SWIFT was not compiled with MPI support."); +#endif + + } + + /* Unary lock? */ + else if (type == task_type_self || type == task_type_sort || + (type == task_type_sub && cj == NULL)) { + if (cell_locktree(ci) != 0) return 0; + } + + /* Otherwise, binary lock. */ + else if (type == task_type_pair || (type == task_type_sub && cj != NULL)) { + if (ci->hold || cj->hold) return 0; + if (cell_locktree(ci) != 0) return 0; + if (cell_locktree(cj) != 0) { + cell_unlocktree(ci); + return 0; + } + } + + /* Gravity tasks? */ + else if (type == task_type_grav_mm || type == task_type_grav_pp || + type == task_type_grav_down) { + if (ci->ghold || (cj != NULL && cj->ghold)) return 0; + if (cell_glocktree(ci) != 0) return 0; + if (cj != NULL && cell_glocktree(cj) != 0) { + cell_gunlocktree(ci); + return 0; + } + } + /* If we made it this far, we've got a lock. */ + return 1; +} /** * @brief Remove all unlocks to tasks that are of the given type. @@ -167,23 +146,21 @@ int task_lock ( struct task *t ) { * @param t The #task. * @param type The task type ID to remove. 
*/ - -void task_cleanunlock ( struct task *t , int type ) { - - int k; - - lock_lock( &t->lock ); - - for ( k = 0 ; k < t->nr_unlock_tasks ; k++ ) - if ( t->unlock_tasks[k]->type == type ) { - t->nr_unlock_tasks -= 1; - t->unlock_tasks[k] = t->unlock_tasks[ t->nr_unlock_tasks ]; - } - - lock_unlock_blind( &t->lock ); - + +void task_cleanunlock(struct task *t, int type) { + + int k; + + lock_lock(&t->lock); + + for (k = 0; k < t->nr_unlock_tasks; k++) + if (t->unlock_tasks[k]->type == type) { + t->nr_unlock_tasks -= 1; + t->unlock_tasks[k] = t->unlock_tasks[t->nr_unlock_tasks]; } + lock_unlock_blind(&t->lock); +} /** * @brief Remove an unlock_task from the given task. @@ -191,24 +168,22 @@ void task_cleanunlock ( struct task *t , int type ) { * @param ta The unlocking #task. * @param tb The #task that will be unlocked. */ - -void task_rmunlock ( struct task *ta , struct task *tb ) { - - int k; - - lock_lock( &ta->lock ); - - for ( k = 0 ; k < ta->nr_unlock_tasks ; k++ ) - if ( ta->unlock_tasks[k] == tb ) { - ta->nr_unlock_tasks -= 1; - ta->unlock_tasks[k] = ta->unlock_tasks[ ta->nr_unlock_tasks ]; - lock_unlock_blind( &ta->lock ); - return; - } - error( "Task not found." ); +void task_rmunlock(struct task *ta, struct task *tb) { + + int k; + + lock_lock(&ta->lock); + + for (k = 0; k < ta->nr_unlock_tasks; k++) + if (ta->unlock_tasks[k] == tb) { + ta->nr_unlock_tasks -= 1; + ta->unlock_tasks[k] = ta->unlock_tasks[ta->nr_unlock_tasks]; + lock_unlock_blind(&ta->lock); + return; } - + error("Task not found."); +} /** * @brief Remove an unlock_task from the given task. @@ -219,24 +194,22 @@ void task_rmunlock ( struct task *ta , struct task *tb ) { * Differs from #task_rmunlock in that it will not fail if * the task @c tb is not in the unlocks of @c ta. 
*/ - -void task_rmunlock_blind ( struct task *ta , struct task *tb ) { - - int k; - - lock_lock( &ta->lock ); - - for ( k = 0 ; k < ta->nr_unlock_tasks ; k++ ) - if ( ta->unlock_tasks[k] == tb ) { - ta->nr_unlock_tasks -= 1; - ta->unlock_tasks[k] = ta->unlock_tasks[ ta->nr_unlock_tasks ]; - break; - } - - lock_unlock_blind( &ta->lock ); +void task_rmunlock_blind(struct task *ta, struct task *tb) { + + int k; + + lock_lock(&ta->lock); + + for (k = 0; k < ta->nr_unlock_tasks; k++) + if (ta->unlock_tasks[k] == tb) { + ta->nr_unlock_tasks -= 1; + ta->unlock_tasks[k] = ta->unlock_tasks[ta->nr_unlock_tasks]; + break; } - + + lock_unlock_blind(&ta->lock); +} /** * @brief Add an unlock_task to the given task. @@ -244,43 +217,38 @@ void task_rmunlock_blind ( struct task *ta , struct task *tb ) { * @param ta The unlocking #task. * @param tb The #task that will be unlocked. */ - -void task_addunlock ( struct task *ta , struct task *tb ) { - error( "Use sched_addunlock instead." ); +void task_addunlock(struct task *ta, struct task *tb) { - /* Add the lock atomically. */ - ta->unlock_tasks[ atomic_inc( &ta->nr_unlock_tasks ) ] = tb; + error("Use sched_addunlock instead."); - /* Check a posteriori if we did not overshoot. */ - if ( ta->nr_unlock_tasks > task_maxunlock ) - error( "Too many unlock_tasks in task." ); - - } - - -void task_addunlock_old ( struct task *ta , struct task *tb ) { - - int k; - - lock_lock( &ta->lock ); - - /* Check if ta already unlocks tb. */ - for ( k = 0 ; k < ta->nr_unlock_tasks ; k++ ) - if ( ta->unlock_tasks[k] == tb ) { - error( "Duplicate unlock." ); - lock_unlock_blind( &ta->lock ); - return; - } - - if ( ta->nr_unlock_tasks == task_maxunlock ) - error( "Too many unlock_tasks in task." ); - - ta->unlock_tasks[ ta->nr_unlock_tasks] = tb; - ta->nr_unlock_tasks += 1; - - lock_unlock_blind( &ta->lock ); - + /* Add the lock atomically. */ + ta->unlock_tasks[atomic_inc(&ta->nr_unlock_tasks)] = tb; + + /* Check a posteriori if we did not overshoot. 
*/ + if (ta->nr_unlock_tasks > task_maxunlock) + error("Too many unlock_tasks in task."); +} + +void task_addunlock_old(struct task *ta, struct task *tb) { + + int k; + + lock_lock(&ta->lock); + + /* Check if ta already unlocks tb. */ + for (k = 0; k < ta->nr_unlock_tasks; k++) + if (ta->unlock_tasks[k] == tb) { + error("Duplicate unlock."); + lock_unlock_blind(&ta->lock); + return; } - + if (ta->nr_unlock_tasks == task_maxunlock) + error("Too many unlock_tasks in task."); + + ta->unlock_tasks[ta->nr_unlock_tasks] = tb; + ta->nr_unlock_tasks += 1; + + lock_unlock_blind(&ta->lock); +} diff --git a/src/task.h b/src/task.h index 0505815ff2d5dcc186b30011a906458947589bd8..0d3a68e1e8a892d554f8fb83f8f16d7030d5a54c 100644 --- a/src/task.h +++ b/src/task.h @@ -1,92 +1,94 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_TASK_H +#define SWIFT_TASK_H +/* Includes. */ +#include "cell.h" +#include "cycle.h" /* Some constants. */ -#define task_maxwait 3 -#define task_maxunlock 15 - +#define task_maxwait 3 +#define task_maxunlock 15 /* The different task types. 
*/ enum task_types { - task_type_none = 0, - task_type_sort, - task_type_self, - task_type_pair, - task_type_sub, - task_type_ghost, - task_type_kick1, - task_type_kick2, - task_type_send, - task_type_recv, - task_type_link, - task_type_grav_pp, - task_type_grav_mm, - task_type_grav_up, - task_type_grav_down, - task_type_count - }; - + task_type_none = 0, + task_type_sort, + task_type_self, + task_type_pair, + task_type_sub, + task_type_ghost, + task_type_kick1, + task_type_kick2, + task_type_send, + task_type_recv, + task_type_link, + task_type_grav_pp, + task_type_grav_mm, + task_type_grav_up, + task_type_grav_down, + task_type_count +}; + extern const char *taskID_names[]; - - + /* The different task sub-types. */ enum task_subtypes { - task_subtype_none = 0, - task_subtype_density, - task_subtype_force, - task_subtype_grav, - task_subtype_count - }; - + task_subtype_none = 0, + task_subtype_density, + task_subtype_force, + task_subtype_grav, + task_subtype_count +}; + extern const char *taskID_names[]; - - + /* Data of a task. */ struct task { - enum task_types type; - enum task_subtypes subtype; - char skip, tight, implicit; - int flags, wait, rank, weight; - - lock_type lock; - - struct cell *ci, *cj; - - #ifdef WITH_MPI - MPI_Request req; - #endif - - int rid; - ticks tic, toc; - - int nr_unlock_tasks; - struct task *unlock_tasks[ task_maxunlock + 1 ]; + enum task_types type; + enum task_subtypes subtype; + char skip, tight, implicit; + int flags, wait, rank, weight; - }; + lock_type lock; + struct cell *ci, *cj; + +#ifdef WITH_MPI + MPI_Request req; +#endif + + int rid; + ticks tic, toc; + + int nr_unlock_tasks; + struct task *unlock_tasks[task_maxunlock + 1]; +}; /* Function prototypes. 
*/ -void task_rmunlock( struct task *ta , struct task *tb ); -void task_rmunlock_blind( struct task *ta , struct task *tb ); -void task_cleanunlock ( struct task *t , int type ); -void task_addunlock( struct task *ta , struct task *tb ); -void task_unlock ( struct task *t ); -int task_lock ( struct task *t ); +void task_rmunlock(struct task *ta, struct task *tb); +void task_rmunlock_blind(struct task *ta, struct task *tb); +void task_cleanunlock(struct task *t, int type); +void task_addunlock(struct task *ta, struct task *tb); +void task_unlock(struct task *t); +int task_lock(struct task *t); + +#endif /* SWIFT_TASK_H */ diff --git a/src/timers.c b/src/timers.c index 0bc1a85d92cacaa79837f59ad3fc17c7d66f3259..01a77d7804241f108b092f7d6857c90be3861cd0 100644 --- a/src/timers.c +++ b/src/timers.c @@ -1,33 +1,30 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ /* Config parameters. */ #include "../config.h" -/* Local headers. */ -#include "cycle.h" +/* This object's header. */ #include "timers.h" - /* The timers. */ -ticks timers[ timer_count ]; - +ticks timers[timer_count]; /** * @brief Re-set the timers. 
@@ -36,14 +33,12 @@ ticks timers[ timer_count ]; * * To reset all timers, use the mask #timers_mask_all. */ - -void timers_reset ( unsigned int mask ) { - int k; - - /* Loop over the timers and set the masked ones to zero. */ - for ( k = 0 ; k < timer_count ; k++ ) - if ( mask & ( 1 << k ) ) - timers[ k ] = 0; +void timers_reset(unsigned int mask) { + + int k; - } + /* Loop over the timers and set the masked ones to zero. */ + for (k = 0; k < timer_count; k++) + if (mask & (1 << k)) timers[k] = 0; +} diff --git a/src/timers.h b/src/timers.h index 58c48ac2444e2cd615f711213474d729b4bbbe70..38ca81222ffb33b1558dcc4d7ee3a0cc1a71cd20 100644 --- a/src/timers.h +++ b/src/timers.h @@ -1,82 +1,86 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_TIMERS_H +#define SWIFT_TIMERS_H +/* Includes. */ +#include "cycle.h" #include "inline.h" /* The timers themselves. 
*/ enum { - timer_none = 0, - timer_prepare, - timer_kick1, - timer_kick2, - timer_dosort, - timer_doself_density, - timer_doself_force, - timer_doself_grav, - timer_dopair_density, - timer_dopair_force, - timer_dopair_grav, - timer_dosub_density, - timer_dosub_force, - timer_dosub_grav, - timer_dopair_subset, - timer_doghost, - timer_gettask, - timer_qget, - timer_qsteal, - timer_runners, - timer_step, - timer_count, - }; - + timer_none = 0, + timer_prepare, + timer_kick1, + timer_kick2, + timer_dosort, + timer_doself_density, + timer_doself_force, + timer_doself_grav, + timer_dopair_density, + timer_dopair_force, + timer_dopair_grav, + timer_dosub_density, + timer_dosub_force, + timer_dosub_grav, + timer_dopair_subset, + timer_doghost, + timer_gettask, + timer_qget, + timer_qsteal, + timer_runners, + timer_step, + timer_count, +}; + /* The timers. */ -extern ticks timers[ timer_count ]; +extern ticks timers[timer_count]; /* Mask for all timers. */ -#define timers_mask_all ( (1 << timer_count) - 1 ) - +#define timers_mask_all ((1 << timer_count) - 1) /* Define the timer macros. 
*/ #ifdef TIMER_VERBOSE - #ifndef TIMER - #define TIMER - #endif +#ifndef TIMER +#define TIMER +#endif #endif #ifdef TIMER - #define TIMER_TIC_ND tic = getticks(); - #define TIMER_TIC2_ND ticks tic2 = getticks(); - #define TIMER_TIC ticks tic = getticks(); - #define TIMER_TOC(t) timers_toc( t , tic ) - #define TIMER_TIC2 ticks tic2 = getticks(); - #define TIMER_TOC2(t) timers_toc( t , tic2 ) - INLINE static ticks timers_toc ( int t , ticks tic ) { - ticks d = (getticks() - tic); - __sync_add_and_fetch( &timers[t] , d ); - return d; - } +#define TIMER_TIC_ND tic = getticks(); +#define TIMER_TIC2_ND ticks tic2 = getticks(); +#define TIMER_TIC ticks tic = getticks(); +#define TIMER_TOC(t) timers_toc(t, tic) +#define TIMER_TIC2 ticks tic2 = getticks(); +#define TIMER_TOC2(t) timers_toc(t, tic2) +INLINE static ticks timers_toc(int t, ticks tic) { + ticks d = (getticks() - tic); + __sync_add_and_fetch(&timers[t], d); + return d; +} #else - #define TIMER_TIC - #define TIMER_TOC(t) - #define TIMER_TIC2 - #define TIMER_TOC2(t) +#define TIMER_TIC +#define TIMER_TOC(t) +#define TIMER_TIC2 +#define TIMER_TOC2(t) #endif - /* Function prototypes. */ -void timers_reset ( unsigned int mask ); +void timers_reset(unsigned int mask); + +#endif /* SWIFT_TIMERS_H */ diff --git a/src/units.c b/src/units.c index ffca1974205936fe50dc770ba7eaa73895273737..af705323bdd8089c5ae22f11c49975bfe01c5f83 100644 --- a/src/units.c +++ b/src/units.c @@ -2,52 +2,56 @@ * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk), * Matthieu Schaller (matthieu.schaller@durham.ac.uk). - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
- * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ /* Config parameters. */ #include "../config.h" - +/* Some standard headers. */ +#include <math.h> +#include <stddef.h> #include <stdio.h> #include <stdlib.h> #include <string.h> -#include <stddef.h> -#include <math.h> + +/* MPI headers. */ #ifdef WITH_MPI #include <mpi.h> #endif +/* This object's header. */ +#include "units.h" + +/* Includes. */ #include "const.h" -#include "cycle.h" -#include "part.h" #include "error.h" #include "units.h" - /** - * @brief Initialises the UnitSystem structure with the constants given in const.h + * @brief Initialises the UnitSystem structure with the constants given in + * const.h * @param us The UnitSystem to initialize */ -void initUnitSystem(struct UnitSystem* us) -{ +void initUnitSystem(struct UnitSystem* us) { us->UnitMass_in_cgs = const_unit_mass_in_cgs; us->UnitLength_in_cgs = const_unit_length_in_cgs; - us->UnitTime_in_cgs = 1. / ((double) const_unit_velocity_in_cgs / ( (double)const_unit_length_in_cgs )); + us->UnitTime_in_cgs = 1. 
/ ((double)const_unit_velocity_in_cgs / + ((double)const_unit_length_in_cgs)); us->UnitCurrent_in_cgs = 1.; us->UnitTemperature_in_cgs = 1.; } @@ -55,147 +59,214 @@ void initUnitSystem(struct UnitSystem* us) /** * @brief Returns the base unit conversion factor for a given unit system * @param us The UnitSystem used - * @param baseUnit The base unit + * @param baseUnit The base unit */ -double getBaseUnit(struct UnitSystem* us, enum BaseUnits baseUnit) -{ - switch(baseUnit) - { - case UNIT_MASS: return us->UnitMass_in_cgs; - case UNIT_LENGTH: return us->UnitLength_in_cgs; - case UNIT_TIME: return us->UnitTime_in_cgs; - case UNIT_CURRENT: return us->UnitCurrent_in_cgs; - case UNIT_TEMPERATURE: return us->UnitTemperature_in_cgs; - default: error( "Invalid base Unit" ); - } +double getBaseUnit(struct UnitSystem* us, enum BaseUnits baseUnit) { + switch (baseUnit) { + case UNIT_MASS: + return us->UnitMass_in_cgs; + case UNIT_LENGTH: + return us->UnitLength_in_cgs; + case UNIT_TIME: + return us->UnitTime_in_cgs; + case UNIT_CURRENT: + return us->UnitCurrent_in_cgs; + case UNIT_TEMPERATURE: + return us->UnitTemperature_in_cgs; + default: + error("Invalid base Unit"); + } } /** * @brief Returns the base unit symbol - * @param baseUnit The base unit + * @param baseUnit The base unit */ -const char* getBaseUnitSymbol(enum BaseUnits baseUnit) -{ - switch(baseUnit) - { - case UNIT_MASS: return "U_M"; - case UNIT_LENGTH: return "U_L"; - case UNIT_TIME: return "U_t"; - case UNIT_CURRENT: return "U_I"; - case UNIT_TEMPERATURE: return "U_T"; - default: error( "Invalid base Unit" ); - } +const char* getBaseUnitSymbol(enum BaseUnits baseUnit) { + switch (baseUnit) { + case UNIT_MASS: + return "U_M"; + case UNIT_LENGTH: + return "U_L"; + case UNIT_TIME: + return "U_t"; + case UNIT_CURRENT: + return "U_I"; + case UNIT_TEMPERATURE: + return "U_T"; + default: + error("Invalid base Unit"); + } } - /** * @brief Returns the base unit symbol in the cgs system - * @param baseUnit The base 
unit + * @param baseUnit The base unit */ -const char* getBaseUnitCGSSymbol(enum BaseUnits baseUnit) -{ - switch(baseUnit) - { - case UNIT_MASS: return "g"; - case UNIT_LENGTH: return "cm"; - case UNIT_TIME: return "s"; - case UNIT_CURRENT: return "A"; - case UNIT_TEMPERATURE: return "K"; - default: error( "Invalid base Unit" ); - } +const char* getBaseUnitCGSSymbol(enum BaseUnits baseUnit) { + switch (baseUnit) { + case UNIT_MASS: + return "g"; + case UNIT_LENGTH: + return "cm"; + case UNIT_TIME: + return "s"; + case UNIT_CURRENT: + return "A"; + case UNIT_TEMPERATURE: + return "K"; + default: + error("Invalid base Unit"); + } } - -void getBaseUnitExponantsArray(float baseUnitsExp[5], enum UnitConversionFactor unit) -{ - switch( unit ) - { +void getBaseUnitExponantsArray(float baseUnitsExp[5], + enum UnitConversionFactor unit) { + switch (unit) { case UNIT_CONV_NO_UNITS: break; - case UNIT_CONV_MASS: - baseUnitsExp[UNIT_MASS] = 1.f; break; + case UNIT_CONV_MASS: + baseUnitsExp[UNIT_MASS] = 1.f; + break; - case UNIT_CONV_LENGTH: - baseUnitsExp[UNIT_LENGTH] = 1.f; break; + case UNIT_CONV_LENGTH: + baseUnitsExp[UNIT_LENGTH] = 1.f; + break; - case UNIT_CONV_TIME: - baseUnitsExp[UNIT_TIME] = 1.f; break; + case UNIT_CONV_TIME: + baseUnitsExp[UNIT_TIME] = 1.f; + break; - case UNIT_CONV_FREQUENCY: - baseUnitsExp[UNIT_TIME] = -1.f; break; + case UNIT_CONV_FREQUENCY: + baseUnitsExp[UNIT_TIME] = -1.f; + break; - case UNIT_CONV_DENSITY: - baseUnitsExp[UNIT_MASS] = 1.f; baseUnitsExp[UNIT_LENGTH] = -3.f; break; + case UNIT_CONV_DENSITY: + baseUnitsExp[UNIT_MASS] = 1.f; + baseUnitsExp[UNIT_LENGTH] = -3.f; + break; - case UNIT_CONV_SPEED: - baseUnitsExp[UNIT_LENGTH] = 1.f; baseUnitsExp[UNIT_TIME] = -1.f; break; + case UNIT_CONV_SPEED: + baseUnitsExp[UNIT_LENGTH] = 1.f; + baseUnitsExp[UNIT_TIME] = -1.f; + break; - case UNIT_CONV_ACCELERATION: - baseUnitsExp[UNIT_LENGTH] = 1.f; baseUnitsExp[UNIT_TIME] = -2.f; break; + case UNIT_CONV_ACCELERATION: + baseUnitsExp[UNIT_LENGTH] = 1.f; 
+ baseUnitsExp[UNIT_TIME] = -2.f; + break; - case UNIT_CONV_FORCE: - baseUnitsExp[UNIT_MASS] = 1.f; baseUnitsExp[UNIT_LENGTH] = 1.f; baseUnitsExp[UNIT_TIME] = -2.f; break; + case UNIT_CONV_FORCE: + baseUnitsExp[UNIT_MASS] = 1.f; + baseUnitsExp[UNIT_LENGTH] = 1.f; + baseUnitsExp[UNIT_TIME] = -2.f; + break; - case UNIT_CONV_ENERGY: - baseUnitsExp[UNIT_MASS] = 1.f; baseUnitsExp[UNIT_LENGTH] = 2.f; baseUnitsExp[UNIT_TIME] = -2.f; break; + case UNIT_CONV_ENERGY: + baseUnitsExp[UNIT_MASS] = 1.f; + baseUnitsExp[UNIT_LENGTH] = 2.f; + baseUnitsExp[UNIT_TIME] = -2.f; + break; - case UNIT_CONV_ENERGY_PER_UNIT_MASS: - baseUnitsExp[UNIT_LENGTH] = 2.f; baseUnitsExp[UNIT_TIME] = -2.f; break; + case UNIT_CONV_ENERGY_PER_UNIT_MASS: + baseUnitsExp[UNIT_LENGTH] = 2.f; + baseUnitsExp[UNIT_TIME] = -2.f; + break; - case UNIT_CONV_ENTROPY: - baseUnitsExp[UNIT_MASS] = 1.f - const_hydro_gamma; baseUnitsExp[UNIT_LENGTH] = 3.f * const_hydro_gamma - 1.f; baseUnitsExp[UNIT_TIME] = -2.f; break; + case UNIT_CONV_ENTROPY: + baseUnitsExp[UNIT_MASS] = 1.f - const_hydro_gamma; + baseUnitsExp[UNIT_LENGTH] = 3.f * const_hydro_gamma - 1.f; + baseUnitsExp[UNIT_TIME] = -2.f; + break; - case UNIT_CONV_ENTROPY_PER_UNIT_MASS: - baseUnitsExp[UNIT_MASS] = -const_hydro_gamma; baseUnitsExp[UNIT_LENGTH] = 3.f * const_hydro_gamma - 1.f; baseUnitsExp[UNIT_TIME] = -2.f; break; + case UNIT_CONV_ENTROPY_PER_UNIT_MASS: + baseUnitsExp[UNIT_MASS] = -const_hydro_gamma; + baseUnitsExp[UNIT_LENGTH] = 3.f * const_hydro_gamma - 1.f; + baseUnitsExp[UNIT_TIME] = -2.f; + break; - case UNIT_CONV_POWER: - baseUnitsExp[UNIT_MASS] = 1.f; baseUnitsExp[UNIT_LENGTH] = 2.f; baseUnitsExp[UNIT_TIME] = -3.f; break; + case UNIT_CONV_POWER: + baseUnitsExp[UNIT_MASS] = 1.f; + baseUnitsExp[UNIT_LENGTH] = 2.f; + baseUnitsExp[UNIT_TIME] = -3.f; + break; - case UNIT_CONV_PRESSURE: - baseUnitsExp[UNIT_MASS] = 1.f; baseUnitsExp[UNIT_LENGTH] = -1.f; baseUnitsExp[UNIT_TIME] = -2.f; break; + case UNIT_CONV_PRESSURE: + baseUnitsExp[UNIT_MASS] = 1.f; + 
baseUnitsExp[UNIT_LENGTH] = -1.f; + baseUnitsExp[UNIT_TIME] = -2.f; + break; case UNIT_CONV_ELECTRIC_CHARGE: - baseUnitsExp[UNIT_TIME] = 1.f; baseUnitsExp[UNIT_CURRENT] = 1.f; break; + baseUnitsExp[UNIT_TIME] = 1.f; + baseUnitsExp[UNIT_CURRENT] = 1.f; + break; case UNIT_CONV_ELECTRIC_VOLTAGE: - baseUnitsExp[UNIT_MASS] = 1.f; baseUnitsExp[UNIT_LENGTH] = 2.f; baseUnitsExp[UNIT_TIME] = -3.f; baseUnitsExp[UNIT_CURRENT] = -1.f; break; - + baseUnitsExp[UNIT_MASS] = 1.f; + baseUnitsExp[UNIT_LENGTH] = 2.f; + baseUnitsExp[UNIT_TIME] = -3.f; + baseUnitsExp[UNIT_CURRENT] = -1.f; + break; + case UNIT_CONV_ELECTRIC_CAPACITANCE: - baseUnitsExp[UNIT_MASS] = -1.f; baseUnitsExp[UNIT_LENGTH] = -2.f; baseUnitsExp[UNIT_TIME] = 4; baseUnitsExp[UNIT_CURRENT] = 2.f; break; + baseUnitsExp[UNIT_MASS] = -1.f; + baseUnitsExp[UNIT_LENGTH] = -2.f; + baseUnitsExp[UNIT_TIME] = 4; + baseUnitsExp[UNIT_CURRENT] = 2.f; + break; case UNIT_CONV_ELECTRIC_RESISTANCE: - baseUnitsExp[UNIT_MASS] = 1.f; baseUnitsExp[UNIT_LENGTH] = 2.f; baseUnitsExp[UNIT_TIME] = -3.f; baseUnitsExp[UNIT_CURRENT] = -2.f; break; + baseUnitsExp[UNIT_MASS] = 1.f; + baseUnitsExp[UNIT_LENGTH] = 2.f; + baseUnitsExp[UNIT_TIME] = -3.f; + baseUnitsExp[UNIT_CURRENT] = -2.f; + break; case UNIT_CONV_ELECTRIC_CONDUCTANCE: - baseUnitsExp[UNIT_MASS] = -1.f; baseUnitsExp[UNIT_LENGTH] = -2.f; baseUnitsExp[UNIT_TIME] = 3.f; baseUnitsExp[UNIT_CURRENT] = 2.f; break; - + baseUnitsExp[UNIT_MASS] = -1.f; + baseUnitsExp[UNIT_LENGTH] = -2.f; + baseUnitsExp[UNIT_TIME] = 3.f; + baseUnitsExp[UNIT_CURRENT] = 2.f; + break; + case UNIT_CONV_MAGNETIC_FLUX: - baseUnitsExp[UNIT_MASS] = 1.f; baseUnitsExp[UNIT_LENGTH] = 2.f; baseUnitsExp[UNIT_TIME] = -2.f; baseUnitsExp[UNIT_CURRENT] = -1.f; break; - + baseUnitsExp[UNIT_MASS] = 1.f; + baseUnitsExp[UNIT_LENGTH] = 2.f; + baseUnitsExp[UNIT_TIME] = -2.f; + baseUnitsExp[UNIT_CURRENT] = -1.f; + break; + case UNIT_CONV_MAGNETIC_FIELD: - baseUnitsExp[UNIT_MASS] = 1.f; baseUnitsExp[UNIT_TIME] = -2.f; 
baseUnitsExp[UNIT_CURRENT] = -1.f; break; + baseUnitsExp[UNIT_MASS] = 1.f; + baseUnitsExp[UNIT_TIME] = -2.f; + baseUnitsExp[UNIT_CURRENT] = -1.f; + break; case UNIT_CONV_MAGNETIC_INDUCTANCE: - baseUnitsExp[UNIT_MASS] = 1.f; baseUnitsExp[UNIT_LENGTH] = 2.f; baseUnitsExp[UNIT_TIME] = -2.f; baseUnitsExp[UNIT_CURRENT] = -2.f; break; + baseUnitsExp[UNIT_MASS] = 1.f; + baseUnitsExp[UNIT_LENGTH] = 2.f; + baseUnitsExp[UNIT_TIME] = -2.f; + baseUnitsExp[UNIT_CURRENT] = -2.f; + break; case UNIT_CONV_TEMPERATURE: baseUnitsExp[UNIT_TEMPERATURE] = 1.f; - } + } } - /** - * @brief Returns the conversion factor for a given unit in the chosen unit system + * @brief Returns the conversion factor for a given unit in the chosen unit + * system * @param us The system of units in use * @param unit The unit to convert */ -double conversionFactor(struct UnitSystem* us, enum UnitConversionFactor unit) -{ - float baseUnitsExp[5] = { 0.f }; +double conversionFactor(struct UnitSystem* us, enum UnitConversionFactor unit) { + float baseUnitsExp[5] = {0.f}; getBaseUnitExponantsArray(baseUnitsExp, unit); - + return generalConversionFactor(us, baseUnitsExp); } @@ -204,102 +275,101 @@ double conversionFactor(struct UnitSystem* us, enum UnitConversionFactor unit) * @param us The system of units in use * @param unit The unit to convert */ -float hFactor(struct UnitSystem* us, enum UnitConversionFactor unit) -{ - float baseUnitsExp[5] = { 0.f }; +float hFactor(struct UnitSystem* us, enum UnitConversionFactor unit) { + float baseUnitsExp[5] = {0.f}; getBaseUnitExponantsArray(baseUnitsExp, unit); - - return generalhFactor(us, baseUnitsExp); + return generalhFactor(us, baseUnitsExp); } - /** * @brief Returns the scaling factor exponentiation for a given unit * @param us The system of units in use * @param unit The unit to convert */ -float aFactor(struct UnitSystem* us, enum UnitConversionFactor unit) -{ - float baseUnitsExp[5] = { 0.f }; +float aFactor(struct UnitSystem* us, enum UnitConversionFactor 
unit) { + float baseUnitsExp[5] = {0.f}; getBaseUnitExponantsArray(baseUnitsExp, unit); - - return generalaFactor(us, baseUnitsExp); + return generalaFactor(us, baseUnitsExp); } - /** - * @brief Returns a string containg the exponants of the base units making up the conversion factors + * @brief Returns a string containg the exponants of the base units making up + * the conversion factors */ -void conversionString(char * buffer, struct UnitSystem* us, enum UnitConversionFactor unit) -{ - float baseUnitsExp[5] = { 0.f }; +void conversionString(char* buffer, struct UnitSystem* us, + enum UnitConversionFactor unit) { + float baseUnitsExp[5] = {0.f}; getBaseUnitExponantsArray(baseUnitsExp, unit); - + generalConversionString(buffer, us, baseUnitsExp); } - - /** - * @brief Returns the conversion factor for a given unit (expressed in terms of the 5 fundamental units) in the chosen unit system + * @brief Returns the conversion factor for a given unit (expressed in terms of + * the 5 fundamental units) in the chosen unit system * @param us The unit system used - * @param baseUnitsExponants The exponant of each base units required to form the desired quantity. See conversionFactor() for a working example + * @param baseUnitsExponants The exponant of each base units required to form + * the desired quantity. 
See conversionFactor() for a working example */ -double generalConversionFactor(struct UnitSystem* us, float baseUnitsExponants[5]) -{ +double generalConversionFactor(struct UnitSystem* us, + float baseUnitsExponants[5]) { double factor = 1.; int i; - for(i = 0 ; i < 5 ; ++i ) - if(baseUnitsExponants[i] != 0) - factor *= pow( getBaseUnit( us, i ) , baseUnitsExponants[i] ); - return factor; + for (i = 0; i < 5; ++i) + if (baseUnitsExponants[i] != 0) + factor *= pow(getBaseUnit(us, i), baseUnitsExponants[i]); + return factor; } - /** - * @brief Returns the h factor exponentiation for a given unit (expressed in terms of the 5 fundamental units) + * @brief Returns the h factor exponentiation for a given unit (expressed in + * terms of the 5 fundamental units) * @param us The unit system used - * @param baseUnitsExponants The exponant of each base units required to form the desired quantity. See conversionFactor() for a working example + * @param baseUnitsExponants The exponant of each base units required to form + * the desired quantity. See conversionFactor() for a working example */ -float generalhFactor(struct UnitSystem* us, float baseUnitsExponants[5]) -{ +float generalhFactor(struct UnitSystem* us, float baseUnitsExponants[5]) { float factor_exp = 0.f; - + factor_exp += -baseUnitsExponants[UNIT_MASS]; factor_exp += -baseUnitsExponants[UNIT_LENGTH]; factor_exp += -baseUnitsExponants[UNIT_TIME]; - + return factor_exp; } /** - * @brief Returns the scaling factor exponentiation for a given unit (expressed in terms of the 5 fundamental units) + * @brief Returns the scaling factor exponentiation for a given unit (expressed + * in terms of the 5 fundamental units) * @param us The unit system used - * @param baseUnitsExponants The exponant of each base units required to form the desired quantity. See conversionFactor() for a working example + * @param baseUnitsExponants The exponant of each base units required to form + * the desired quantity. 
See conversionFactor() for a working example */ -float generalaFactor(struct UnitSystem* us, float baseUnitsExponants[5]) -{ +float generalaFactor(struct UnitSystem* us, float baseUnitsExponants[5]) { float factor_exp = 0.f; - + factor_exp += baseUnitsExponants[UNIT_LENGTH]; - - return factor_exp; + + return factor_exp; } /** - * @brief Returns a string containg the exponants of the base units making up the conversion factors (expressed in terms of the 5 fundamental units) - * @param buffer The buffer in which to write (The buffer must be long enough, 140 chars at most) + * @brief Returns a string containg the exponants of the base units making up + * the conversion factors (expressed in terms of the 5 fundamental units) + * @param buffer The buffer in which to write (The buffer must be long enough, + * 140 chars at most) * @param us The UnistSystem in use. - * @param baseUnitsExponants The exponant of each base units required to form the desired quantity. See conversionFactor() for a working example + * @param baseUnitsExponants The exponant of each base units required to form + * the desired quantity. See conversionFactor() for a working example */ -void generalConversionString(char * buffer, struct UnitSystem* us, float baseUnitsExponants[5]) -{ +void generalConversionString(char* buffer, struct UnitSystem* us, + float baseUnitsExponants[5]) { char temp[14]; double a_exp = generalaFactor(us, baseUnitsExponants); double h_exp = generalhFactor(us, baseUnitsExponants); @@ -307,72 +377,68 @@ void generalConversionString(char * buffer, struct UnitSystem* us, float baseUni /* Check whether we are unitless or not */ char isAllNonZero = 1; - for(i = 0 ; i < 5 ; ++i ) - if( baseUnitsExponants[i] != 0.) - isAllNonZero = 0; - - if( isAllNonZero ) - { - sprintf(buffer, "[ - ] "); - return; - } + for (i = 0; i < 5; ++i) + if (baseUnitsExponants[i] != 0.) 
isAllNonZero = 0; + if (isAllNonZero) { + sprintf(buffer, "[ - ] "); + return; + } /* Add a-factor */ - if(a_exp == 0) + if (a_exp == 0) sprintf(buffer, " "); - else if(a_exp == 1) + else if (a_exp == 1) sprintf(buffer, "a "); - else if(remainder(a_exp, 1.) == 0) - sprintf(buffer, "a^%d ", (int) a_exp); + else if (remainder(a_exp, 1.) == 0) + sprintf(buffer, "a^%d ", (int)a_exp); else sprintf(buffer, "a^%7.4f ", a_exp); /* Add h-factor */ - if(h_exp == 0) + if (h_exp == 0) sprintf(temp, " "); - else if(h_exp == 1) + else if (h_exp == 1) sprintf(temp, "h "); - else if(remainder(h_exp, 1.) == 0) - sprintf(temp, "h^%d ", (int) h_exp); + else if (remainder(h_exp, 1.) == 0) + sprintf(temp, "h^%d ", (int)h_exp); else sprintf(temp, "h^%7.4f ", h_exp); strncat(buffer, temp, 12); /* Add conversion units */ - for(i = 0 ; i < 5 ; ++i ) - if(baseUnitsExponants[i] != 0) - { - if(baseUnitsExponants[i] == 0.) - sprintf(temp, " "); - else if(baseUnitsExponants[i] == 1.) - sprintf(temp, "%s ", getBaseUnitSymbol(i)); - else if(remainder(baseUnitsExponants[i], 1.) == 0) - sprintf(temp, "%s^%d ", getBaseUnitSymbol(i), (int) baseUnitsExponants[i]); - else - sprintf(temp, "%s^%7.4f ", getBaseUnitSymbol(i), baseUnitsExponants[i]); - strncat(buffer, temp, 12); - } - + for (i = 0; i < 5; ++i) + if (baseUnitsExponants[i] != 0) { + if (baseUnitsExponants[i] == 0.) + sprintf(temp, " "); + else if (baseUnitsExponants[i] == 1.) + sprintf(temp, "%s ", getBaseUnitSymbol(i)); + else if (remainder(baseUnitsExponants[i], 1.) == 0) + sprintf(temp, "%s^%d ", getBaseUnitSymbol(i), + (int)baseUnitsExponants[i]); + else + sprintf(temp, "%s^%7.4f ", getBaseUnitSymbol(i), baseUnitsExponants[i]); + strncat(buffer, temp, 12); + } /* Add CGS units */ strncat(buffer, " [ ", 3); - - for(i = 0 ; i < 5 ; ++i ) - { - if(baseUnitsExponants[i] != 0) - { - if(baseUnitsExponants[i] == 0.) - continue; - else if(baseUnitsExponants[i] == 1.) 
- sprintf(temp, "%s ", getBaseUnitCGSSymbol(i)); - else if(remainder(baseUnitsExponants[i], 1.) == 0) - sprintf(temp, "%s^%d ", getBaseUnitCGSSymbol(i), (int) baseUnitsExponants[i]); - else - sprintf(temp, "%s^%7.4f ", getBaseUnitCGSSymbol(i), baseUnitsExponants[i]); - strncat(buffer, temp, 12); - } + + for (i = 0; i < 5; ++i) { + if (baseUnitsExponants[i] != 0) { + if (baseUnitsExponants[i] == 0.) + continue; + else if (baseUnitsExponants[i] == 1.) + sprintf(temp, "%s ", getBaseUnitCGSSymbol(i)); + else if (remainder(baseUnitsExponants[i], 1.) == 0) + sprintf(temp, "%s^%d ", getBaseUnitCGSSymbol(i), + (int)baseUnitsExponants[i]); + else + sprintf(temp, "%s^%7.4f ", getBaseUnitCGSSymbol(i), + baseUnitsExponants[i]); + strncat(buffer, temp, 12); } - + } + strncat(buffer, "]", 2); } diff --git a/src/units.h b/src/units.h index 40eb88f4fa0255849a5bdab3d6bebd59c5d9dad7..ba69443ed883446e34894e4f6a47dfc28694ea31 100644 --- a/src/units.h +++ b/src/units.h @@ -1,90 +1,95 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Matthieu Schaller (matthieu.schaller@durham.ac.uk). - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- * + * ******************************************************************************/ - +#ifndef SWIFT_UNITS_H +#define SWIFT_UNITS_H /** * @brief The unit system used internally. * - * This structure contains the conversion factors to the 7 cgs base units to the internal units. + * This structure contains the conversion factors to the 7 cgs base units to the + *internal units. * It is used everytime a conversion is performed or an i/o function is called. * **/ -struct UnitSystem -{ - double UnitMass_in_cgs; /*< Conversion factor from grams to internal mass units */ +struct UnitSystem { + double UnitMass_in_cgs; /*< Conversion factor from grams to internal mass + units */ - double UnitLength_in_cgs; /*< Conversion factor from centimeters to internal length units. */ + double UnitLength_in_cgs; /*< Conversion factor from centimeters to internal + length units. */ - double UnitTime_in_cgs; /*< Conversion factor from seconds to internal time units. */ + double UnitTime_in_cgs; /*< Conversion factor from seconds to internal time + units. */ - double UnitCurrent_in_cgs; /*< Conversion factor from Ampere to internal current units. */ + double UnitCurrent_in_cgs; /*< Conversion factor from Ampere to internal + current units. */ - double UnitTemperature_in_cgs; /*< Conversion factor from Kelvins to internal temperature units. */ + double + UnitTemperature_in_cgs; /*< Conversion factor from Kelvins to internal + temperature units. */ }; /** - * @brief The base units used in the cgs (and internal) system. All units are derived from those. + * @brief The base units used in the cgs (and internal) system. All units are + * derived from those. 
*/ -enum BaseUnits - { - UNIT_MASS = 0, - UNIT_LENGTH = 1, - UNIT_TIME = 2, - UNIT_CURRENT = 3, - UNIT_TEMPERATURE = 4 - }; - +enum BaseUnits { + UNIT_MASS = 0, + UNIT_LENGTH = 1, + UNIT_TIME = 2, + UNIT_CURRENT = 3, + UNIT_TEMPERATURE = 4 +}; /** * @brief The different conversion factors supported by default */ -enum UnitConversionFactor - { - UNIT_CONV_NO_UNITS, - UNIT_CONV_MASS, - UNIT_CONV_LENGTH, - UNIT_CONV_TIME, - UNIT_CONV_DENSITY, - UNIT_CONV_SPEED, - UNIT_CONV_ACCELERATION, - UNIT_CONV_FORCE, - UNIT_CONV_ENERGY, - UNIT_CONV_ENERGY_PER_UNIT_MASS, - UNIT_CONV_ENTROPY, - UNIT_CONV_ENTROPY_PER_UNIT_MASS, - UNIT_CONV_POWER, - UNIT_CONV_PRESSURE, - UNIT_CONV_FREQUENCY, - UNIT_CONV_ELECTRIC_CHARGE, - UNIT_CONV_ELECTRIC_VOLTAGE, - UNIT_CONV_ELECTRIC_CAPACITANCE, - UNIT_CONV_ELECTRIC_RESISTANCE, - UNIT_CONV_ELECTRIC_CONDUCTANCE, - UNIT_CONV_MAGNETIC_FLUX, - UNIT_CONV_MAGNETIC_FIELD, - UNIT_CONV_MAGNETIC_INDUCTANCE, - UNIT_CONV_TEMPERATURE - }; - +enum UnitConversionFactor { + UNIT_CONV_NO_UNITS, + UNIT_CONV_MASS, + UNIT_CONV_LENGTH, + UNIT_CONV_TIME, + UNIT_CONV_DENSITY, + UNIT_CONV_SPEED, + UNIT_CONV_ACCELERATION, + UNIT_CONV_FORCE, + UNIT_CONV_ENERGY, + UNIT_CONV_ENERGY_PER_UNIT_MASS, + UNIT_CONV_ENTROPY, + UNIT_CONV_ENTROPY_PER_UNIT_MASS, + UNIT_CONV_POWER, + UNIT_CONV_PRESSURE, + UNIT_CONV_FREQUENCY, + UNIT_CONV_ELECTRIC_CHARGE, + UNIT_CONV_ELECTRIC_VOLTAGE, + UNIT_CONV_ELECTRIC_CAPACITANCE, + UNIT_CONV_ELECTRIC_RESISTANCE, + UNIT_CONV_ELECTRIC_CONDUCTANCE, + UNIT_CONV_MAGNETIC_FLUX, + UNIT_CONV_MAGNETIC_FIELD, + UNIT_CONV_MAGNETIC_INDUCTANCE, + UNIT_CONV_TEMPERATURE +}; /** - * @brief Initialises the UnitSystem structure with the constants given in const.h + * @brief Initialises the UnitSystem structure with the constants given in + * const.h */ void initUnitSystem(struct UnitSystem*); @@ -103,50 +108,53 @@ const char* getBaseUnitSymbol(enum BaseUnits); */ const char* getBaseUnitCGSSymbol(enum BaseUnits); - /** - * @brief Returns the conversion factor for a 
given unit (expressed in terms of the 5 fundamental units) in the chosen unit system + * @brief Returns the conversion factor for a given unit (expressed in terms of + * the 5 fundamental units) in the chosen unit system */ -double generalConversionFactor(struct UnitSystem* us, float baseUnitsExponants[5]); - +double generalConversionFactor(struct UnitSystem* us, + float baseUnitsExponants[5]); /** - * @brief Returns the conversion factor for a given unit in the chosen unit system + * @brief Returns the conversion factor for a given unit in the chosen unit + * system */ double conversionFactor(struct UnitSystem* us, enum UnitConversionFactor unit); - /** - * @brief Returns the h factor for a given unit (expressed in terms of the 5 fundamental units) in the chosen unit system + * @brief Returns the h factor for a given unit (expressed in terms of the 5 + * fundamental units) in the chosen unit system */ float generalhFactor(struct UnitSystem* us, float baseUnitsExponants[5]); - /** * @brief Returns the h factor for a given unit in the chosen unit system */ float hFactor(struct UnitSystem* us, enum UnitConversionFactor unit); - /** - * @brief Returns the scaling factor for a given unit (expressed in terms of the 5 fundamental units) in the chosen unit system + * @brief Returns the scaling factor for a given unit (expressed in terms of the + * 5 fundamental units) in the chosen unit system */ float generalaFactor(struct UnitSystem* us, float baseUnitsExponants[5]); - /** * @brief Returns the scaling factor for a given unit in the chosen unit system */ float aFactor(struct UnitSystem* us, enum UnitConversionFactor unit); - /** - * @brief Returns a string containg the exponants of the base units making up the conversion factors (expressed in terms of the 5 fundamental units) + * @brief Returns a string containg the exponants of the base units making up + * the conversion factors (expressed in terms of the 5 fundamental units) */ -void generalConversionString(char * 
buffer, struct UnitSystem* us, float baseUnitsExponants[5]); - +void generalConversionString(char* buffer, struct UnitSystem* us, + float baseUnitsExponants[5]); /** - * @brief Returns a string containg the exponants of the base units making up the conversion factors + * @brief Returns a string containg the exponants of the base units making up + * the conversion factors */ -void conversionString(char * buffer, struct UnitSystem* us, enum UnitConversionFactor unit); +void conversionString(char* buffer, struct UnitSystem* us, + enum UnitConversionFactor unit); + +#endif /* SWIFT_UNITS_H */ diff --git a/src/vector.h b/src/vector.h index 81efefe9f6218a17149869f3ec5535a4361f6b5c..34eb41eea31f821e03ddf24e871faec98095c920 100644 --- a/src/vector.h +++ b/src/vector.h @@ -1,137 +1,152 @@ /******************************************************************************* * This file is part of SWIFT. * Coypright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. - * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +#ifndef SWIFT_VECTOR_H +#define SWIFT_VECTOR_H /* Have I already read this file? */ #ifndef VEC_MACRO - /* Include the header file with the intrinsics. */ - #include <immintrin.h> - - /* Define the vector macro. 
*/ - #define VEC_MACRO(elcount, type) __attribute__((vector_size((elcount)*sizeof(type)))) type +/* Include the header file with the intrinsics. */ +#include <immintrin.h> - /* So what will the vector size be? */ - #ifdef __MIC__ - #define VECTORIZE - #define VEC_HAVE_GATHER - #define VEC_SIZE 16 - #define VEC_FLOAT __m512 - #define VEC_DBL __m512d - #define VEC_INT __m512i - #define vec_load(a) _mm512_load_ps(a) - #define vec_set1(a) _mm512_set1_ps(a) - #define vec_set(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) _mm512_set_ps(p,o,n,m,l,k,j,i,h,g,f,e,d,c,b,a) - #define vec_dbl_set(a,b,c,d,e,f,g,h) _mm512_set_pd(h,g,f,e,d,c,b,a) - #define vec_sqrt(a) _mm512_sqrt_ps(a) - #define vec_rcp(a) _mm512_rcp_ps(a) - #define vec_rsqrt(a) _mm512_rsqrt_ps(a) - #define vec_ftoi(a) _mm512_cvttps_epi32(a) - #define vec_fmin(a,b) _mm512_min_ps(a,b) - #define vec_fmax(a,b) _mm512_max_ps(a,b) - #define vec_fabs(a) _mm512_andnot_ps(_mm512_set1_ps(-0.f), a) - #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a,0)) - #define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a,1)) - #define vec_dbl_tofloat(a,b) _mm512_insertf128( _mm512_castps128_ps512(a) , b , 1 ) - #define vec_dbl_load(a) _mm512_load_pd(a) - #define vec_dbl_set1(a) _mm512_set1_pd(a) - #define vec_dbl_sqrt(a) _mm512_sqrt_pd(a) - #define vec_dbl_rcp(a) _mm512_rcp_pd(a) - #define vec_dbl_rsqrt(a) _mm512_rsqrt_pd(a) - #define vec_dbl_ftoi(a) _mm512_cvttpd_epi32(a) - #define vec_dbl_fmin(a,b) _mm512_min_pd(a,b) - #define vec_dbl_fmax(a,b) _mm512_max_pd(a,b) - #define vec_getoffsets(ptrs) _mm512_insertf64x4( _mm512_insertf64x4( _mm512_setzero_pd() , _mm512_cvtepi64_epi32( _mm512_load_epi64(ptrs) - _mm512_set1_epi64(ptrs[0]) ) , 0 ) , _mm512_cvtepi64_epi32( _mm512_load_epi64(&ptrs[4]) - _mm512_set1_epi64(ptrs[0]) ) , 1 ) - #define vec_gather(base,offsets) _mm512_i32gather_ps( offsets.m , base , 1 ) - #elif defined( NO__AVX__ ) - #define VECTORIZE - #define VEC_SIZE 8 - #define VEC_FLOAT __m256 - #define VEC_DBL __m256d - 
#define VEC_INT __m256i - #define vec_load(a) _mm256_load_ps(a) - #define vec_set1(a) _mm256_set1_ps(a) - #define vec_set(a,b,c,d,e,f,g,h) _mm256_set_ps(h,g,f,e,d,c,b,a) - #define vec_dbl_set(a,b,c,d) _mm256_set_pd(d,c,b,a) - #define vec_sqrt(a) _mm256_sqrt_ps(a) - #define vec_rcp(a) _mm256_rcp_ps(a) - #define vec_rsqrt(a) _mm256_rsqrt_ps(a) - #define vec_ftoi(a) _mm256_cvttps_epi32(a) - #define vec_fmin(a,b) _mm256_min_ps(a,b) - #define vec_fmax(a,b) _mm256_max_ps(a,b) - #define vec_fabs(a) _mm256_andnot_ps(_mm256_set1_ps(-0.f), a) - #define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a,0)) - #define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a,1)) - #define vec_dbl_tofloat(a,b) _mm256_insertf128( _mm256_castps128_ps256(a) , b , 1 ) - #define vec_dbl_load(a) _mm256_load_pd(a) - #define vec_dbl_set1(a) _mm256_set1_pd(a) - #define vec_dbl_sqrt(a) _mm256_sqrt_pd(a) - #define vec_dbl_rcp(a) _mm256_rcp_pd(a) - #define vec_dbl_rsqrt(a) _mm256_rsqrt_pd(a) - #define vec_dbl_ftoi(a) _mm256_cvttpd_epi32(a) - #define vec_dbl_fmin(a,b) _mm256_min_pd(a,b) - #define vec_dbl_fmax(a,b) _mm256_max_pd(a,b) - #ifdef __AVX2__ - #define VEC_HAVE_GATHER - #define vec_gather(base,offsets) _mm256_i32gather_ps( base , offsets.m , 1 ) - #endif - #elif defined( NO__SSE2__ ) - #define VECTORIZE - #define VEC_SIZE 4 - #define VEC_FLOAT __m128 - #define VEC_DBL __m128d - #define VEC_INT __m128i - #define vec_load(a) _mm_load_ps(a) - #define vec_set1(a) _mm_set1_ps(a) - #define vec_set(a,b,c,d) _mm_set_ps(d,c,b,a) - #define vec_dbl_set(a,b) _mm_set_pd(b,a) - #define vec_sqrt(a) _mm_sqrt_ps(a) - #define vec_rcp(a) _mm_rcp_ps(a) - #define vec_rsqrt(a) _mm_rsqrt_ps(a) - #define vec_ftoi(a) _mm_cvttps_epi32(a) - #define vec_fmin(a,b) _mm_min_ps(a,b) - #define vec_fmax(a,b) _mm_max_ps(a,b) - #define vec_fabs(a) _mm_andnot_ps(_mm_set1_ps(-0.f), a) - #define vec_todbl_lo(a) _mm_cvtps_pd(a) - #define vec_todbl_hi(a) _mm_cvtps_pd(_mm_movehl_ps(a,a)) - #define vec_dbl_tofloat(a,b) 
_mm_movelh_ps( _mm_cvtpd_ps(a) , _mm_cvtpd_ps(b) ) - #define vec_dbl_load(a) _mm_load_pd(a) - #define vec_dbl_set1(a) _mm_set1_pd(a) - #define vec_dbl_sqrt(a) _mm_sqrt_pd(a) - #define vec_dbl_rcp(a) _mm_rcp_pd(a) - #define vec_dbl_rsqrt(a) _mm_rsqrt_pd(a) - #define vec_dbl_ftoi(a) _mm_cvttpd_epi32(a) - #define vec_dbl_fmin(a,b) _mm_min_pd(a,b) - #define vec_dbl_fmax(a,b) _mm_max_pd(a,b) - #else - #define VEC_SIZE 4 - #endif +/* Define the vector macro. */ +#define VEC_MACRO(elcount, type) \ + __attribute__((vector_size((elcount) * sizeof(type)))) type - /* Define the composite types for element access. */ - #ifdef VECTORIZE - typedef union { - VEC_FLOAT v; - VEC_DBL vd; - VEC_INT m; - float f[VEC_SIZE]; - double d[VEC_SIZE/2]; - int i[VEC_SIZE]; - } vector; - #endif +/* So what will the vector size be? */ +#ifdef __MIC__ +#define VECTORIZE +#define VEC_HAVE_GATHER +#define VEC_SIZE 16 +#define VEC_FLOAT __m512 +#define VEC_DBL __m512d +#define VEC_INT __m512i +#define vec_load(a) _mm512_load_ps(a) +#define vec_set1(a) _mm512_set1_ps(a) +#define vec_set(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \ + _mm512_set_ps(p, o, n, m, l, k, j, i, h, g, f, e, d, c, b, a) +#define vec_dbl_set(a, b, c, d, e, f, g, h) \ + _mm512_set_pd(h, g, f, e, d, c, b, a) +#define vec_sqrt(a) _mm512_sqrt_ps(a) +#define vec_rcp(a) _mm512_rcp_ps(a) +#define vec_rsqrt(a) _mm512_rsqrt_ps(a) +#define vec_ftoi(a) _mm512_cvttps_epi32(a) +#define vec_fmin(a, b) _mm512_min_ps(a, b) +#define vec_fmax(a, b) _mm512_max_ps(a, b) +#define vec_fabs(a) _mm512_andnot_ps(_mm512_set1_ps(-0.f), a) +#define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0)) +#define vec_todbl_hi(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 1)) +#define vec_dbl_tofloat(a, b) _mm512_insertf128(_mm512_castps128_ps512(a), b, 1) +#define vec_dbl_load(a) _mm512_load_pd(a) +#define vec_dbl_set1(a) _mm512_set1_pd(a) +#define vec_dbl_sqrt(a) _mm512_sqrt_pd(a) +#define vec_dbl_rcp(a) _mm512_rcp_pd(a) +#define vec_dbl_rsqrt(a) 
_mm512_rsqrt_pd(a) +#define vec_dbl_ftoi(a) _mm512_cvttpd_epi32(a) +#define vec_dbl_fmin(a, b) _mm512_min_pd(a, b) +#define vec_dbl_fmax(a, b) _mm512_max_pd(a, b) +#define vec_getoffsets(ptrs) \ + _mm512_insertf64x4( \ + _mm512_insertf64x4(_mm512_setzero_pd(), \ + _mm512_cvtepi64_epi32(_mm512_load_epi64(ptrs) - \ + _mm512_set1_epi64(ptrs[0])), \ + 0), \ + _mm512_cvtepi64_epi32(_mm512_load_epi64(&ptrs[4]) - \ + _mm512_set1_epi64(ptrs[0])), \ + 1) +#define vec_gather(base, offsets) _mm512_i32gather_ps(offsets.m, base, 1) +#elif defined(NO__AVX__) +#define VECTORIZE +#define VEC_SIZE 8 +#define VEC_FLOAT __m256 +#define VEC_DBL __m256d +#define VEC_INT __m256i +#define vec_load(a) _mm256_load_ps(a) +#define vec_set1(a) _mm256_set1_ps(a) +#define vec_set(a, b, c, d, e, f, g, h) _mm256_set_ps(h, g, f, e, d, c, b, a) +#define vec_dbl_set(a, b, c, d) _mm256_set_pd(d, c, b, a) +#define vec_sqrt(a) _mm256_sqrt_ps(a) +#define vec_rcp(a) _mm256_rcp_ps(a) +#define vec_rsqrt(a) _mm256_rsqrt_ps(a) +#define vec_ftoi(a) _mm256_cvttps_epi32(a) +#define vec_fmin(a, b) _mm256_min_ps(a, b) +#define vec_fmax(a, b) _mm256_max_ps(a, b) +#define vec_fabs(a) _mm256_andnot_ps(_mm256_set1_ps(-0.f), a) +#define vec_todbl_lo(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 0)) +#define vec_todbl_hi(a) _mm256_cvtps_pd(_mm256_extract128_ps(a, 1)) +#define vec_dbl_tofloat(a, b) _mm256_insertf128(_mm256_castps128_ps256(a), b, 1) +#define vec_dbl_load(a) _mm256_load_pd(a) +#define vec_dbl_set1(a) _mm256_set1_pd(a) +#define vec_dbl_sqrt(a) _mm256_sqrt_pd(a) +#define vec_dbl_rcp(a) _mm256_rcp_pd(a) +#define vec_dbl_rsqrt(a) _mm256_rsqrt_pd(a) +#define vec_dbl_ftoi(a) _mm256_cvttpd_epi32(a) +#define vec_dbl_fmin(a, b) _mm256_min_pd(a, b) +#define vec_dbl_fmax(a, b) _mm256_max_pd(a, b) +#ifdef __AVX2__ +#define VEC_HAVE_GATHER +#define vec_gather(base, offsets) _mm256_i32gather_ps(base, offsets.m, 1) +#endif +#elif defined(NO__SSE2__) +#define VECTORIZE +#define VEC_SIZE 4 +#define VEC_FLOAT __m128 +#define 
VEC_DBL __m128d +#define VEC_INT __m128i +#define vec_load(a) _mm_load_ps(a) +#define vec_set1(a) _mm_set1_ps(a) +#define vec_set(a, b, c, d) _mm_set_ps(d, c, b, a) +#define vec_dbl_set(a, b) _mm_set_pd(b, a) +#define vec_sqrt(a) _mm_sqrt_ps(a) +#define vec_rcp(a) _mm_rcp_ps(a) +#define vec_rsqrt(a) _mm_rsqrt_ps(a) +#define vec_ftoi(a) _mm_cvttps_epi32(a) +#define vec_fmin(a, b) _mm_min_ps(a, b) +#define vec_fmax(a, b) _mm_max_ps(a, b) +#define vec_fabs(a) _mm_andnot_ps(_mm_set1_ps(-0.f), a) +#define vec_todbl_lo(a) _mm_cvtps_pd(a) +#define vec_todbl_hi(a) _mm_cvtps_pd(_mm_movehl_ps(a, a)) +#define vec_dbl_tofloat(a, b) _mm_movelh_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b)) +#define vec_dbl_load(a) _mm_load_pd(a) +#define vec_dbl_set1(a) _mm_set1_pd(a) +#define vec_dbl_sqrt(a) _mm_sqrt_pd(a) +#define vec_dbl_rcp(a) _mm_rcp_pd(a) +#define vec_dbl_rsqrt(a) _mm_rsqrt_pd(a) +#define vec_dbl_ftoi(a) _mm_cvttpd_epi32(a) +#define vec_dbl_fmin(a, b) _mm_min_pd(a, b) +#define vec_dbl_fmax(a, b) _mm_max_pd(a, b) +#else +#define VEC_SIZE 4 +#endif +/* Define the composite types for element access. */ +#ifdef VECTORIZE +typedef union { + VEC_FLOAT v; + VEC_DBL vd; + VEC_INT m; + float f[VEC_SIZE]; + double d[VEC_SIZE / 2]; + int i[VEC_SIZE]; +} vector; #endif + +#endif + +#endif /* SWIFT_VECTOR_H */ diff --git a/src/version.c b/src/version.c index 705018b8726605e214cb02468f3baee628d8cf54..eb622b85571786c6827dcdff0a869e9dc833b4e5 100644 --- a/src/version.c +++ b/src/version.c @@ -2,57 +2,57 @@ * This file is part of SWIFT. * Copyright (C) 2012 Matthieu Schaller (matthieu.schaller@durham.ac.uk). * Copyright (C) 2015 Peter W. Draper (p.w.draper@durham.ac.uk). - * + * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. 
- * + * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. - * + * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. - * + * ******************************************************************************/ +/* Some standard headers. */ #include <stdio.h> + +/* This object's header. */ #include "version.h" /** * @brief Return the source code git revision * - * @details The SHA of the code checked out when the library was last built. + * @details The SHA of the code checked out when the library was last built. * Will include -dirty if they are local modifications. */ -const char *git_revision( void ) -{ - static const char *revision = GIT_REVISION; - return revision; +const char *git_revision(void) { + static const char *revision = GIT_REVISION; + return revision; } /** * @brief The version of SWIFT */ -const char *package_version( void ) -{ - static const char *version = PACKAGE_VERSION; - return version; +const char *package_version(void) { + static const char *version = PACKAGE_VERSION; + return version; } /** * @brief A description of the package version and code status. */ -const char *package_description( void ) -{ - static char buf[256]; - static int initialised = 0; - if ( ! initialised ) { - sprintf( buf, "SWIFT version: %s, at revision: %s", - PACKAGE_VERSION, GIT_REVISION ); - initialised = 1; - } - return buf; +const char *package_description(void) { + static char buf[256]; + static int initialised = 0; + if (!initialised) { + sprintf(buf, "SWIFT version: %s, at revision: %s", PACKAGE_VERSION, + GIT_REVISION); + initialised = 1; + } + return buf; }