Commit f8c01941 authored by Pedro Gonnet's avatar Pedro Gonnet
Browse files

added cparts, which contain only x, h, and dt of each part for faster, more cache-efficient access.


Former-commit-id: dbe48ffac91fdca4909a9de4484b9c5385e8f2cf
parent acf43028
......@@ -258,8 +258,18 @@ void cell_split ( struct cell *c ) {
for ( k = 0 ; k < 8 ; k++ ) {
c->progeny[k]->count = right[k] - left[k] + 1;
c->progeny[k]->parts = &c->parts[ left[k] ];
c->progeny[k]->cparts = &c->cparts[ left[k] ];
}
/* Update the condensed particle data. */
for ( k = 0 ; k < c->count ; k++ ) {
c->cparts[k].x[0] = c->parts[k].x[0];
c->cparts[k].x[1] = c->parts[k].x[1];
c->cparts[k].x[2] = c->parts[k].x[2];
c->cparts[k].h = c->parts[k].h;
c->cparts[k].dt = c->parts[k].dt;
}
/* Verify a few sub-cells. */
/* for ( k = 0 ; k < c->progeny[0]->count ; k++ )
if ( c->progeny[0]->parts[k].x[0] > pivot[0] ||
......
......@@ -49,6 +49,7 @@ struct cell {
/* Pointers to the particle data. */
struct part *parts;
struct cpart *cparts;
/* Pointers for the sorted indices. */
struct entry *sort;
......
......@@ -24,9 +24,27 @@
#define part_dtmax 10
/* Condensed data of a single particle. */
struct cpart {
/* Particle position. */
double x[3];
/* Particle cutoff radius. */
float h;
/* Particle time-step. */
float dt;
} __attribute__((aligned (32)));
/* Data of a single particle. */
struct part {
/* Particle position. */
double x[3];
/* Particle cutoff radius. */
float h;
......@@ -39,9 +57,6 @@ struct part {
/* Particle ID. */
unsigned long long id;
/* Particle position. */
double x[3];
/* Particle velocity. */
float v[3];
......@@ -70,7 +85,7 @@ struct part {
float rho_dh;
/* Particle number density. */
int icount;
// int icount;
float wcount;
float wcount_dh;
......
......@@ -172,7 +172,7 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags ) {
struct entry *finger;
struct entry *fingers[8];
struct part *parts = c->parts;
struct cpart *cparts = c->cparts;
int j, k, count = c->count;
int i, ind, off[8], inds[8], temp_i;
// float shift[3];
......@@ -262,9 +262,9 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags ) {
/* Fill the sort array. */
for ( k = 0 ; k < count ; k++ ) {
px[0] = parts[k].x[0];
px[1] = parts[k].x[1];
px[2] = parts[k].x[2];
px[0] = cparts[k].x[0];
px[1] = cparts[k].x[1];
px[2] = cparts[k].x[2];
for ( j = 0 ; j < 13 ; j++ )
if ( flags & (1 << j) ) {
c->sort[ j*(count + 1) + k].i = k;
......@@ -272,7 +272,7 @@ void runner_dosort ( struct runner *r , struct cell *c , int flags ) {
}
if ( flags & (1 << 14) ) {
c->sort[ 14*(count + 1) + k ].i = k;
c->sort[ 14*(count + 1) + k ].d = parts[k].dt;
c->sort[ 14*(count + 1) + k ].d = cparts[k].dt;
}
}
......
......@@ -82,8 +82,9 @@ void DOPAIR_NAIVE ( struct runner *r , struct cell *restrict ci , struct cell *r
struct engine *e = r->e;
int pid, pjd, k, count_i = ci->count, count_j = cj->count;
double shift[3] = { 0.0 , 0.0 , 0.0 };
struct part *restrict pi, *restrict pj;
struct part *restrict parts_i = ci->parts, *restrict parts_j = cj->parts;
struct cpart *restrict cpj, *restrict cparts_j = cj->cparts;
struct cpart *restrict cpi, *restrict cparts_i = ci->cparts;
double pix[3];
float dx[3], hi, hi2, r2;
TIMER_TIC
......@@ -105,29 +106,29 @@ void DOPAIR_NAIVE ( struct runner *r , struct cell *restrict ci , struct cell *r
for ( pid = 0 ; pid < count_i ; pid++ ) {
/* Get a hold of the ith part in ci. */
pi = &parts_i[ pid ];
cpi = &cparts_i[ pid ];
for ( k = 0 ; k < 3 ; k++ )
pix[k] = pi->x[k] - shift[k];
hi = pi->h;
pix[k] = cpi->x[k] - shift[k];
hi = cpi->h;
hi2 = hi * hi;
/* Loop over the parts in cj. */
for ( pjd = 0 ; pjd < count_j ; pjd++ ) {
/* Get a pointer to the jth particle. */
pj = &parts_j[ pjd ];
cpj = &cparts_j[ pjd ];
/* Compute the pairwise distance. */
r2 = 0.0f;
for ( k = 0 ; k < 3 ; k++ ) {
dx[k] = pix[k] - pj->x[k];
dx[k] = pix[k] - cpj->x[k];
r2 += dx[k]*dx[k];
}
/* Hit or miss? */
if ( r2 < hi2 || r2 < pj->h*pj->h ) {
if ( r2 < hi2 || r2 < cpj->h*cpj->h ) {
IACT( r2 , dx , hi , pj->h , pi , pj );
IACT( r2 , dx , hi , cpj->h , &parts_i[ pid ] , &parts_j[pjd] );
}
......@@ -162,7 +163,8 @@ void DOPAIR_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
struct engine *e = r->e;
int pid, pjd, sid, k, count_j = cj->count, flipped;
double shift[3] = { 0.0 , 0.0 , 0.0 };
struct part *restrict pi, *restrict pj, *restrict parts_j = cj->parts;
struct part *restrict pi, *restrict parts_j = cj->parts;
struct cpart *restrict cpj, *restrict cparts_j = cj->cparts;
double pix[3];
float dx[3], hi, hi2, r2, di;
struct entry *sort_j;
......@@ -210,19 +212,19 @@ void DOPAIR_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
for ( pjd = 0 ; pjd < count_j && sort_j[ pjd ].d < di ; pjd++ ) {
/* Get a pointer to the jth particle. */
pj = &parts_j[ sort_j[ pjd ].i ];
cpj = &cparts_j[ sort_j[ pjd ].i ];
/* Compute the pairwise distance. */
r2 = 0.0f;
for ( k = 0 ; k < 3 ; k++ ) {
dx[k] = pix[k] - pj->x[k];
dx[k] = pix[k] - cpj->x[k];
r2 += dx[k]*dx[k];
}
/* Hit or miss? */
if ( r2 < hi2 ) {
IACT_NONSYM( r2 , dx , hi , pj->h , pi , pj );
IACT_NONSYM( r2 , dx , hi , cpj->h , pi , &parts_j[ sort_j[ pjd ].i ] );
}
......@@ -250,19 +252,19 @@ void DOPAIR_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
for ( pjd = count_j-1 ; pjd >= 0 && di < sort_j[ pjd ].d ; pjd-- ) {
/* Get a pointer to the jth particle. */
pj = &parts_j[ sort_j[ pjd ].i ];
cpj = &cparts_j[ sort_j[ pjd ].i ];
/* Compute the pairwise distance. */
r2 = 0.0f;
for ( k = 0 ; k < 3 ; k++ ) {
dx[k] = pix[k] - pj->x[k];
dx[k] = pix[k] - cpj->x[k];
r2 += dx[k]*dx[k];
}
/* Hit or miss? */
if ( r2 < hi2 ) {
IACT_NONSYM( r2 , dx , hi , pj->h , pi , pj );
IACT_NONSYM( r2 , dx , hi , cpj->h , pi , &parts_j[ sort_j[ pjd ].i ] );
}
......@@ -297,7 +299,9 @@ void DOPAIR_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
void DOSELF_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *restrict parts , int *restrict ind , int count ) {
int pid, pjd, k, count_i = ci->count;
struct part *restrict pi, *restrict pj, *restrict parts_i = ci->parts;
struct part *restrict parts_i = ci->parts;
struct part *restrict pi;
struct cpart *restrict cpj, *restrict cparts = ci->cparts;
double pix[3];
float dx[3], hi, hi2, r2;
TIMER_TIC
......@@ -321,23 +325,19 @@ void DOSELF_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
for ( pjd = 0 ; pjd < count_i ; pjd++ ) {
/* Get a pointer to the jth particle. */
pj = &parts_i[ pjd ];
cpj = &cparts[ pjd ];
/* Skip the particle itself. */
if ( pj == pi )
continue;
/* Compute the pairwise distance. */
r2 = 0.0f;
for ( k = 0 ; k < 3 ; k++ ) {
dx[k] = pix[k] - pj->x[k];
dx[k] = pix[k] - cpj->x[k];
r2 += dx[k]*dx[k];
}
/* Hit or miss? */
if ( r2 < hi2 ) {
if ( r2 > 0.0f && r2 < hi2 ) {
IACT_NONSYM( r2 , dx , hi , pj->h , pi , pj );
IACT_NONSYM( r2 , dx , hi , cpj->h , pi , &parts_i[ pjd ] );
}
......@@ -370,7 +370,9 @@ void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restric
double rshift, shift[3] = { 0.0 , 0.0 , 0.0 };
struct cell *temp;
struct entry *restrict sort_i, *restrict sort_j;
struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j;
struct part *restrict parts_i, *restrict parts_j;
struct cpart *restrict cpi, *restrict cparts_i;
struct cpart *restrict cpj, *restrict cparts_j;
double pix[3], pjx[3], di, dj;
float dx[3], hi, hi2, hj, hj2, r2;
double hi_max, hj_max;
......@@ -427,6 +429,7 @@ void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restric
hi_max = ci->h_max - rshift; hj_max = cj->h_max;
count_i = ci->count; count_j = cj->count;
parts_i = ci->parts; parts_j = cj->parts;
cparts_i = ci->cparts; cparts_j = cj->cparts;
di_max = sort_i[count_i-1].d - rshift;
dj_min = sort_j[0].d;
......@@ -437,33 +440,33 @@ void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restric
for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d + hi_max > dj_min ; pid-- ) {
/* Get a hold of the ith part in ci. */
pi = &parts_i[ sort_i[ pid ].i ];
hi = pi->h;
cpi = &cparts_i[ sort_i[ pid ].i ];
hi = cpi->h;
di = sort_i[pid].d + hi - rshift;
if ( di < dj_min )
continue;
hi2 = pi->h * pi->h;
hi2 = hi * hi;
for ( k = 0 ; k < 3 ; k++ )
pix[k] = pi->x[k] - shift[k];
pix[k] = cpi->x[k] - shift[k];
/* Loop over the parts in cj. */
for ( pjd = 0 ; pjd < count_j && sort_j[pjd].d < di ; pjd++ ) {
/* Get a pointer to the jth particle. */
pj = &parts_j[ sort_j[pjd].i ];
cpj = &cparts_j[ sort_j[pjd].i ];
/* Compute the pairwise distance. */
r2 = 0.0f;
for ( k = 0 ; k < 3 ; k++ ) {
dx[k] = pix[k] - pj->x[k];
dx[k] = pix[k] - cpj->x[k];
r2 += dx[k]*dx[k];
}
/* Hit or miss? */
if ( r2 < hi2 ) {
IACT( r2 , dx , hi , pj->h , pi , pj );
IACT( r2 , dx , hi , cpj->h , &parts_i[ sort_i[ pid ].i ] , &parts_j[ sort_j[pjd].i ] );
}
......@@ -478,33 +481,33 @@ void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restric
for ( pjd = 0 ; pjd < count_j && sort_j[pjd].d - hj_max < di_max ; pjd++ ) {
/* Get a hold of the jth part in cj. */
pj = &parts_j[ sort_j[ pjd ].i ];
hj = pj->h;
cpj = &cparts_j[ sort_j[ pjd ].i ];
hj = cpj->h;
dj = sort_j[pjd].d - hj - rshift;
if ( dj > di_max )
continue;
for ( k = 0 ; k < 3 ; k++ )
pjx[k] = pj->x[k] + shift[k];
hj2 = pj->h * pj->h;
pjx[k] = cpj->x[k] + shift[k];
hj2 = hj * hj;
/* Loop over the parts in ci. */
for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d > dj ; pid-- ) {
/* Get a pointer to the jth particle. */
pi = &parts_i[ sort_i[pid].i ];
cpi = &cparts_i[ sort_i[pid].i ];
/* Compute the pairwise distance. */
r2 = 0.0f;
for ( k = 0 ; k < 3 ; k++ ) {
dx[k] = pi->x[k] - pjx[k];
dx[k] = cpi->x[k] - pjx[k];
r2 += dx[k]*dx[k];
}
/* Hit or miss? */
if ( r2 < hj2 && r2 > pi->h*pi->h ) {
if ( r2 < hj2 && r2 > cpi->h*cpi->h ) {
IACT( r2 , dx , pi->h , hj , pi , pj );
IACT( r2 , dx , cpi->h , hj , &parts_i[ sort_i[pid].i ] , &parts_j[ sort_j[ pjd ].i ] );
}
......@@ -533,7 +536,8 @@ void DOSELF ( struct runner *r , struct cell *restrict c ) {
int k, pid, pjd, count = c->count;
double pix[3];
float dx[3], hi, hi2, r2;
struct part *restrict pi, *restrict pj, *restrict parts = c->parts;
struct part *restrict parts = c->parts;
struct cpart *restrict cpi, *restrict cpj, *restrict cparts = c->cparts;
TIMER_TIC
if ( c->split )
......@@ -543,31 +547,31 @@ void DOSELF ( struct runner *r , struct cell *restrict c ) {
for ( pid = 0 ; pid < count ; pid++ ) {
/* Get a pointer to the ith particle. */
pi = &parts[pid];
cpi = &cparts[pid];
/* Get the particle position and radius. */
for ( k = 0 ; k < 3 ; k++ )
pix[k] = pi->x[k];
hi = pi->h;
pix[k] = cpi->x[k];
hi = cpi->h;
hi2 = hi * hi;
/* Loop over the other particles .*/
for ( pjd = pid+1 ; pjd < count ; pjd++ ) {
/* Get a pointer to the jth particle. */
pj = &parts[pjd];
cpj = &cparts[pjd];
/* Compute the pairwise distance. */
r2 = 0.0f;
for ( k = 0 ; k < 3 ; k++ ) {
dx[k] = pix[k] - pj->x[k];
dx[k] = pix[k] - cpj->x[k];
r2 += dx[k]*dx[k];
}
/* Hit or miss? */
if ( r2 < hi2 || r2 < pj->h*pj->h ) {
if ( r2 < hi2 || r2 < cpj->h*cpj->h ) {
IACT( r2 , dx , hi , pj->h , pi , pj );
IACT( r2 , dx , hi , cpj->h , &parts[pid] , &parts[pjd] );
}
......
......@@ -83,7 +83,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_density ( float r
pi->rho_dh += -pj->mass * ( 3.0*wi + xi*wi_dx );
pi->wcount += wi * ( 4.0f * M_PI / 3.0f * kernel_igamma3 );
pi->wcount_dh -= xi * h_inv * wi_dx * ( 4.0f * M_PI / 3.0f * kernel_igamma3 );
pi->icount += 1;
// pi->icount += 1;
}
......@@ -98,7 +98,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_density ( float r
pj->rho_dh += -pi->mass * ( 3.0*wj + xj*wj_dx );
pj->wcount += wj * ( 4.0f * M_PI / 3.0f * kernel_igamma3 );
pj->wcount_dh -= xj * h_inv * wj_dx * ( 4.0f * M_PI / 3.0f * kernel_igamma3 );
pj->icount += 1;
// pj->icount += 1;
}
......@@ -135,7 +135,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_density (
pi->rho_dh += -pj->mass * ( 3.0*wi + xi*wi_dx );
pi->wcount += wi * ( 4.0f * M_PI / 3.0f * kernel_igamma3 );
pi->wcount_dh -= xi * h_inv * wi_dx * ( 4.0f * M_PI / 3.0f * kernel_igamma3 );
pi->icount += 1;
// pi->icount += 1;
}
......
......@@ -118,7 +118,7 @@ int space_rebuild_recurse ( struct space *s , struct cell *c ) {
/* Count the particles below that. */
for ( count = 0 , k = 0 ; k < c->count ; k++ ) {
h = c->parts[k].h;
h = c->cparts[k].h;
if ( h <= h_limit )
count += 1;
if ( h > h_max )
......@@ -212,14 +212,16 @@ int space_rebuild ( struct space *s , int force , double cell_max ) {
int i, j, k, cdim[3];
struct cell *c;
struct part *finger;
struct cpart *cfinger;
int *ind, changes = 0;
/* Run through the parts and get the current h_max. */
for ( k = 0 ; k < s->nr_parts ; k++ )
for ( k = 0 ; k < s->nr_parts ; k++ ) {
if ( s->parts[k].h > h_max )
h_max = s->parts[k].h;
else if ( s->parts[k].h < h_min )
h_min = s->parts[k].h;
}
s->h_min = h_min;
s->h_max = h_max;
......@@ -264,28 +266,43 @@ int space_rebuild ( struct space *s , int force , double cell_max ) {
c->depth = 0;
}
/* Run through the particles and get their cell index. */
ind = (int *)alloca( sizeof(int) * s->nr_parts );
for ( k = 0 ; k < s->nr_parts ; k++ ) {
ind[k] = cell_getid( cdim , s->parts[k].x[0]*s->ih[0] , s->parts[k].x[1]*s->ih[1] , s->parts[k].x[2]*s->ih[2] );
s->cells[ ind[k] ].count += 1;
}
/* Sort the parts according to their cells. */
parts_sort( s->parts , ind , s->nr_parts , 0 , s->nr_cells );
/* Hook the cells up to the parts. */
for ( finger = s->parts , k = 0 ; k < s->nr_cells ; k++ ) {
c = &s->cells[ k ];
c->parts = finger;
finger = &finger[ c->count ];
}
/* There were massive changes. */
changes = 1;
} /* re-build upper-level cells? */
/* Run through the particles and get their cell index. */
ind = (int *)alloca( sizeof(int) * s->nr_parts );
for ( k = 0 ; k < s->nr_cells ; k++ )
s->cells[ k ].count = 0;
for ( k = 0 ; k < s->nr_parts ; k++ ) {
ind[k] = cell_getid( s->cdim , s->parts[k].x[0]*s->ih[0] , s->parts[k].x[1]*s->ih[1] , s->parts[k].x[2]*s->ih[2] );
s->cells[ ind[k] ].count += 1;
}
/* Sort the parts according to their cells. */
parts_sort( s->parts , ind , s->nr_parts , 0 , s->nr_cells );
/* Update the condensed particle data. */
for ( k = 0 ; k < s->nr_parts ; k++ ) {
s->cparts[k].x[0] = s->parts[k].x[0];
s->cparts[k].x[1] = s->parts[k].x[1];
s->cparts[k].x[2] = s->parts[k].x[2];
s->cparts[k].h = s->parts[k].h;
s->cparts[k].dt = s->parts[k].dt;
}
/* Hook the cells up to the parts. */
for ( finger = s->parts , cfinger = s->cparts , k = 0 ; k < s->nr_cells ; k++ ) {
c = &s->cells[ k ];
c->parts = finger;
c->cparts = cfinger;
finger = &finger[ c->count ];
cfinger = &cfinger[ c->count ];
}
/* At this point, we have the upper-level cells, old or new. Now make
sure that the parts in each cell are ok. */
#pragma omp parallel for shared(s) reduction(+:changes)
......@@ -303,7 +320,7 @@ int space_rebuild ( struct space *s , int force , double cell_max ) {
/**
* @brief Sort the particles according to the given indices.
* @brief Sort the particles and condensed particles according to the given indices.
*
* @param parts The list of #part
* @param ind The indices with respect to which the parts are sorted.
......@@ -321,57 +338,79 @@ void parts_sort ( struct part *parts , int *ind , int N , int min , int max ) {
int temp_i;
struct part temp_p;
/* One pass of quicksort. */
while ( i < j ) {
while ( i < N && ind[i] <= pivot )
i++;
while ( j >= 0 && ind[j] > pivot )
j--;
if ( i < j ) {
temp_i = ind[i]; ind[i] = ind[j]; ind[j] = temp_i;
temp_p = parts[i]; parts[i] = parts[j]; parts[j] = temp_p;
}
/* If N is small enough, just do insert sort. */
if ( N < 16 ) {
for ( i = 1 ; i < N ; i++ )
if ( ind[i] < ind[i-1] ) {
temp_i = ind[i];
temp_p = parts[j];
for ( j = i ; j > 0 && ind[j-1] > temp_i ; j-- ) {
ind[j] = ind[j-1];
parts[j] = parts[j-1];
}
ind[j] = temp_i;
parts[j] = temp_p;
}
}
/* Verify sort. */
for ( int k = 0 ; k <= j ; k++ )
if ( ind[k] > pivot ) {
printf( "parts_sort: sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, N=%i.\n" , k , ind[k] , pivot , i , j , N );
error( "Sorting failed (<=pivot)." );
/* Otherwise, recurse with Quicksort. */
else {
/* One pass of quicksort. */
while ( i < j ) {
while ( i < N && ind[i] <= pivot )
i++;
while ( j >= 0 && ind[j] > pivot )
j--;
if ( i < j ) {
temp_i = ind[i]; ind[i] = ind[j]; ind[j] = temp_i;
temp_p = parts[i]; parts[i] = parts[j]; parts[j] = temp_p;
}
}
for ( int k = j+1 ; k < N ; k++ )
if ( ind[k] <= pivot ) {
printf( "parts_sort: sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, N=%i.\n" , k , ind[k] , pivot , i , j , N );
error( "Sorting failed (>pivot)." );
/* Verify sort. */
for ( int k = 0 ; k <= j ; k++ )
if ( ind[k] > pivot ) {
printf( "parts_sort: sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, N=%i.\n" , k , ind[k] , pivot , i , j , N );
error( "Sorting failed (<=pivot)." );
}
for ( int k = j+1 ; k < N ; k++ )
if ( ind[k] <= pivot ) {
printf( "parts_sort: sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, N=%i.\n" , k , ind[k] , pivot , i , j , N );
error( "Sorting failed (>pivot)." );
}
/* Try to recurse in parallel. */
if ( N < 100 ) {
/* Recurse on the left? */
if ( j > 0 && pivot > min )
parts_sort( parts , ind , j+1 , min , pivot );
/* Recurse on the right? */
if ( i < N && pivot+1 < max )
parts_sort( &parts[i], &ind[i], N-i , pivot+1 , max );
}
/* Try to recurse in parallel. */