Commit 3d84436f authored by Pedro Gonnet's avatar Pedro Gonnet
Browse files

add vectorization for kernels.


Former-commit-id: 152df3a9b0832933bb9cdc98973ec7fbc4a0a583
parent 1952cdc2
......@@ -68,6 +68,13 @@
#define TIMER_DOPAIR_SUBSET2(f) PASTE(runner_timer_dopair_subset,f)
#define TIMER_DOPAIR_SUBSET TIMER_DOPAIR_SUBSET2(FUNCTION)
#define IACT_NONSYM_VEC2(f) PASTE(runner_iact_nonsym_vec,f)
#define IACT_NONSYM_VEC IACT_NONSYM_VEC2(FUNCTION)
#define IACT_VEC2(f) PASTE(runner_iact_vec,f)
#define IACT_VEC IACT_VEC2(FUNCTION)
/**
* @brief Compute the interactions between a cell pair.
......@@ -87,6 +94,14 @@ void DOPAIR_NAIVE ( struct runner *r , struct cell *restrict ci , struct cell *r
struct cpart *restrict cpi, *restrict cparts_i = ci->cparts;
double pix[3];
float dx[3], hi, hi2, r2;
#ifdef VECTORIZE
int icount = 0;
float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
float hiq[VEC_SIZE] __attribute__ ((aligned (16)));
float hjq[VEC_SIZE] __attribute__ ((aligned (16)));
float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
TIMER_TIC
/* Get the relative distance between the pairs, wrapping. */
......@@ -128,14 +143,44 @@ void DOPAIR_NAIVE ( struct runner *r , struct cell *restrict ci , struct cell *r
/* Hit or miss? */
if ( r2 < hi2 || r2 < cpj->h*cpj->h ) {
IACT( r2 , dx , hi , cpj->h , &parts_i[ pid ] , &parts_j[pjd] );
#ifndef VECTORIZE
IACT( r2 , dx , hi , cpj->h , &parts_i[ pid ] , &parts_j[pjd] );
#else
/* Add this interaction to the queue. */
r2q[icount] = r2;
dxq[3*icount+0] = dx[0];
dxq[3*icount+1] = dx[1];
dxq[3*icount+2] = dx[2];
hiq[icount] = hi;
hjq[icount] = cpj->h;
piq[icount] = &parts_i[ pid ];
pjq[icount] = &parts_j[ pjd ];
icount += 1;
/* Flush? */
if ( icount == VEC_SIZE ) {
IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq );
icount = 0;
}
#endif
}
} /* loop over the parts in cj. */
} /* loop over the parts in ci. */
#ifdef VECTORIZE
/* Pick up any leftovers. */
if ( icount > 0 )
for ( k = 0 ; k < icount ; k++ )
IACT( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] );
#endif
#ifdef TIMER_VERBOSE
printf( "runner_dopair_naive[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 );
#else
......@@ -168,6 +213,14 @@ void DOPAIR_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
double pix[3];
float dx[3], hi, hi2, r2, di;
struct entry *sort_j;
#ifdef VECTORIZE
int icount = 0;
float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
float hiq[VEC_SIZE] __attribute__ ((aligned (16)));
float hjq[VEC_SIZE] __attribute__ ((aligned (16)));
float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
TIMER_TIC
/* Get the relative distance between the pairs, wrapping. */
......@@ -224,7 +277,30 @@ void DOPAIR_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
/* Hit or miss? */
if ( r2 < hi2 ) {
IACT_NONSYM( r2 , dx , hi , cpj->h , pi , &parts_j[ sort_j[ pjd ].i ] );
#ifndef VECTORIZE
IACT_NONSYM( r2 , dx , hi , cpj->h , pi , &parts_j[ sort_j[ pjd ].i ] );
#else
/* Add this interaction to the queue. */
r2q[icount] = r2;
dxq[3*icount+0] = dx[0];
dxq[3*icount+1] = dx[1];
dxq[3*icount+2] = dx[2];
hiq[icount] = hi;
hjq[icount] = cpj->h;
piq[icount] = pi;
pjq[icount] = &parts_j[ sort_j[ pjd ].i ];
icount += 1;
/* Flush? */
if ( icount == VEC_SIZE ) {
IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq );
icount = 0;
}
#endif
}
......@@ -264,7 +340,30 @@ void DOPAIR_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
/* Hit or miss? */
if ( r2 < hi2 ) {
IACT_NONSYM( r2 , dx , hi , cpj->h , pi , &parts_j[ sort_j[ pjd ].i ] );
#ifndef VECTORIZE
IACT_NONSYM( r2 , dx , hi , cpj->h , pi , &parts_j[ sort_j[ pjd ].i ] );
#else
/* Add this interaction to the queue. */
r2q[icount] = r2;
dxq[3*icount+0] = dx[0];
dxq[3*icount+1] = dx[1];
dxq[3*icount+2] = dx[2];
hiq[icount] = hi;
hjq[icount] = cpj->h;
piq[icount] = pi;
pjq[icount] = &parts_j[ sort_j[ pjd ].i ];
icount += 1;
/* Flush? */
if ( icount == VEC_SIZE ) {
IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq );
icount = 0;
}
#endif
}
......@@ -274,6 +373,13 @@ void DOPAIR_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
}
#ifdef VECTORIZE
/* Pick up any leftovers. */
if ( icount > 0 )
for ( k = 0 ; k < icount ; k++ )
IACT_NONSYM( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] );
#endif
#ifdef TIMER_VERBOSE
printf( "runner_dopair_subset[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 );
#else
......@@ -304,6 +410,14 @@ void DOSELF_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
struct cpart *restrict cpj, *restrict cparts = ci->cparts;
double pix[3];
float dx[3], hi, hi2, r2;
#ifdef VECTORIZE
int icount = 0;
float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
float hiq[VEC_SIZE] __attribute__ ((aligned (16)));
float hjq[VEC_SIZE] __attribute__ ((aligned (16)));
float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
TIMER_TIC
/* printf( "runner_dopair_naive: doing pair [ %g %g %g ]/[ %g %g %g ] with %i/%i parts and shift = [ %g %g %g ].\n" ,
......@@ -337,7 +451,30 @@ void DOSELF_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
/* Hit or miss? */
if ( r2 > 0.0f && r2 < hi2 ) {
IACT_NONSYM( r2 , dx , hi , cpj->h , pi , &parts_i[ pjd ] );
#ifndef VECTORIZE
IACT_NONSYM( r2 , dx , hi , cpj->h , pi , &parts_i[ pjd ] );
#else
/* Add this interaction to the queue. */
r2q[icount] = r2;
dxq[3*icount+0] = dx[0];
dxq[3*icount+1] = dx[1];
dxq[3*icount+2] = dx[2];
hiq[icount] = hi;
hjq[icount] = cpj->h;
piq[icount] = pi;
pjq[icount] = &parts_i[ pjd ];
icount += 1;
/* Flush? */
if ( icount == VEC_SIZE ) {
IACT_NONSYM_VEC( r2q , dxq , hiq , hjq , piq , pjq );
icount = 0;
}
#endif
}
......@@ -345,6 +482,13 @@ void DOSELF_SUBSET ( struct runner *r , struct cell *restrict ci , struct part *
} /* loop over the parts in ci. */
#ifdef VECTORIZE
/* Pick up any leftovers. */
if ( icount > 0 )
for ( k = 0 ; k < icount ; k++ )
IACT_NONSYM( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] );
#endif
#ifdef TIMER_VERBOSE
printf( "runner_doself_subset[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , ((double)TIMER_TOC(TIMER_DOPAIR)) / CPU_TPS * 1000 );
#else
......@@ -370,7 +514,7 @@ void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restric
double rshift, shift[3] = { 0.0 , 0.0 , 0.0 };
struct cell *temp;
struct entry *restrict sort_i, *restrict sort_j;
struct part *restrict parts_i, *restrict parts_j;
struct part *restrict pi, *restrict pj, *restrict parts_i, *restrict parts_j;
struct cpart *restrict cpi, *restrict cparts_i;
struct cpart *restrict cpj, *restrict cparts_j;
double pix[3], pjx[3], di, dj;
......@@ -378,6 +522,14 @@ void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restric
double hi_max, hj_max;
double di_max, dj_min;
int count_i, count_j;
#ifdef VECTORIZE
int icount = 0;
float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
float hiq[VEC_SIZE] __attribute__ ((aligned (16)));
float hjq[VEC_SIZE] __attribute__ ((aligned (16)));
float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
TIMER_TIC
/* Get the relative distance between the pairs, wrapping. */
......@@ -440,6 +592,7 @@ void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restric
for ( pid = count_i-1 ; pid >= 0 && sort_i[pid].d + hi_max > dj_min ; pid-- ) {
/* Get a hold of the ith part in ci. */
pi = &parts_i[ sort_i[ pid ].i ];
cpi = &cparts_i[ sort_i[ pid ].i ];
hi = cpi->h;
di = sort_i[pid].d + hi - rshift;
......@@ -466,7 +619,30 @@ void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restric
/* Hit or miss? */
if ( r2 < hi2 ) {
IACT( r2 , dx , hi , cpj->h , &parts_i[ sort_i[ pid ].i ] , &parts_j[ sort_j[pjd].i ] );
#ifndef VECTORIZE
IACT( r2 , dx , hi , cpj->h , &parts_i[ sort_i[ pid ].i ] , &parts_j[ sort_j[pjd].i ] );
#else
/* Add this interaction to the queue. */
r2q[icount] = r2;
dxq[3*icount+0] = dx[0];
dxq[3*icount+1] = dx[1];
dxq[3*icount+2] = dx[2];
hiq[icount] = hi;
hjq[icount] = cpj->h;
piq[icount] = pi;
pjq[icount] = &parts_j[ sort_j[ pjd ].i ];
icount += 1;
/* Flush? */
if ( icount == VEC_SIZE ) {
IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq );
icount = 0;
}
#endif
}
......@@ -481,6 +657,7 @@ void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restric
for ( pjd = 0 ; pjd < count_j && sort_j[pjd].d - hj_max < di_max ; pjd++ ) {
/* Get a hold of the jth part in cj. */
pj = &parts_j[ sort_j[ pjd ].i ];
cpj = &cparts_j[ sort_j[ pjd ].i ];
hj = cpj->h;
dj = sort_j[pjd].d - hj - rshift;
......@@ -507,7 +684,30 @@ void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restric
/* Hit or miss? */
if ( r2 < hj2 && r2 > cpi->h*cpi->h ) {
IACT( r2 , dx , cpi->h , hj , &parts_i[ sort_i[pid].i ] , &parts_j[ sort_j[ pjd ].i ] );
#ifndef VECTORIZE
IACT( r2 , dx , hj , cpi->h , &parts_j[ sort_j[ pjd ].i ] , &parts_i[ sort_i[pid].i ] );
#else
/* Add this interaction to the queue. */
r2q[icount] = r2;
dxq[3*icount+0] = dx[0];
dxq[3*icount+1] = dx[1];
dxq[3*icount+2] = dx[2];
hiq[icount] = hj;
hjq[icount] = cpi->h;
piq[icount] = pj;
pjq[icount] = &parts_i[ sort_i[ pid ].i ];
icount += 1;
/* Flush? */
if ( icount == VEC_SIZE ) {
IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq );
icount = 0;
}
#endif
}
......@@ -515,6 +715,13 @@ void DOPAIR ( struct runner *r , struct cell *restrict ci , struct cell *restric
} /* loop over the parts in ci. */
#ifdef VECTORIZE
/* Pick up any leftovers. */
if ( icount > 0 )
for ( k = 0 ; k < icount ; k++ )
IACT( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] );
#endif
#ifdef TIMER_VERBOSE
printf( "runner_dopair[%02i]: %i/%i parts at depth %i (r_max=%.3f/%.3f, h=%.3f) took %.3f ms.\n" , r->id , count_i , count_j , ci->depth , ci->h_max , cj->h_max , fmax(ci->h[0],fmax(ci->h[1],ci->h[2])) , ((double)(TIMER_TOC(TIMER_DOPAIR))) / CPU_TPS * 1000 );
#else
......@@ -538,6 +745,14 @@ void DOSELF ( struct runner *r , struct cell *restrict c ) {
float dx[3], hi, hi2, r2;
struct part *restrict parts = c->parts;
struct cpart *restrict cpi, *restrict cpj, *restrict cparts = c->cparts;
#ifdef VECTORIZE
int icount = 0;
float r2q[VEC_SIZE] __attribute__ ((aligned (16)));
float hiq[VEC_SIZE] __attribute__ ((aligned (16)));
float hjq[VEC_SIZE] __attribute__ ((aligned (16)));
float dxq[3*VEC_SIZE] __attribute__ ((aligned (16)));
struct part *piq[VEC_SIZE], *pjq[VEC_SIZE];
#endif
TIMER_TIC
/* Loop over the particles in the cell. */
......@@ -568,7 +783,30 @@ void DOSELF ( struct runner *r , struct cell *restrict c ) {
/* Hit or miss? */
if ( r2 < hi2 || r2 < cpj->h*cpj->h ) {
IACT( r2 , dx , hi , cpj->h , &parts[pid] , &parts[pjd] );
#ifndef VECTORIZE
IACT( r2 , dx , hi , cpj->h , &parts[pid] , &parts[pjd] );
#else
/* Add this interaction to the queue. */
r2q[icount] = r2;
dxq[3*icount+0] = dx[0];
dxq[3*icount+1] = dx[1];
dxq[3*icount+2] = dx[2];
hiq[icount] = hi;
hjq[icount] = cpj->h;
piq[icount] = &parts[pid];
pjq[icount] = &parts[pjd];
icount += 1;
/* Flush? */
if ( icount == VEC_SIZE ) {
IACT_VEC( r2q , dxq , hiq , hjq , piq , pjq );
icount = 0;
}
#endif
}
......@@ -576,6 +814,13 @@ void DOSELF ( struct runner *r , struct cell *restrict c ) {
} /* loop over all particles. */
#ifdef VECTORIZE
/* Pick up any leftovers. */
if ( icount > 0 )
for ( k = 0 ; k < icount ; k++ )
IACT( r2q[k] , &dxq[3*k] , hiq[k] , hjq[k] , piq[k] , pjq[k] );
#endif
#ifdef TIMER_VERBOSE
printf( "runner_doself[%02i]: %i parts at depth %i took %.3f ms.\n" , r->id , count , c->depth , ((double)TIMER_TOC(TIMER_DOSELF)) / CPU_TPS * 1000 );
#else
......
......@@ -18,15 +18,53 @@
******************************************************************************/
/* Coefficients for the kernel. */
/* Load the vector stuff. */
#ifdef __SSE2__
#define VECTORIZE
#include <immintrin.h>
#ifdef __AVX__
typedef union {
__m256 v;
__m256i m;
float f[8];
int i[8];
} vector;
#define VEC_SIZE 8
#define vec_load(a) _mm256_load_ps(a)
#define vec_set1(a) _mm256_set1_ps(a)
#define vec_sqrt(a) _mm256_sqrt_ps(a)
#define vec_rcp(a) _mm256_rcp_ps(a)
#define vec_rsqrt(a) _mm256_rsqrt_ps(a)
#define vec_ftoi(a) _mm256_cvttps_epi32(a)
#define vec_fmin(a,b) _mm256_min_ps(a,b)
#else
typedef union {
__m128 v;
__m128i m;
float f[4];
int i[4];
} vector;
#define VEC_SIZE 4
#define vec_load(a) _mm_load_ps(a)
#define vec_set1(a) _mm_set1_ps(a)
#define vec_sqrt(a) _mm_sqrt_ps(a)
#define vec_rcp(a) _mm_rcp_ps(a)
#define vec_rsqrt(a) _mm_rsqrt_ps(a)
#define vec_ftoi(a) _mm_cvttps_epi32(a)
#define vec_fmin(a,b) _mm_min_ps(a,b)
#endif
#else
#define VEC_SIZE 4
#endif
/* Coefficients for the kernel. */
#define kernel_degree 3
#define kernel_ivals 2
#define kernel_gamma 0.5f
#define kernel_igamma 2.0f
#define kernel_igamma3 8.0f
#define kernel_igamma4 16.0f
static float kernel_coeffs[ (kernel_degree + 1) * (kernel_ivals + 1) ] =
static float kernel_coeffs[ (kernel_degree + 1) * (kernel_ivals + 1) ] __attribute__ ((aligned (16))) =
{ 3.0/4.0*M_1_PI , -3.0/2.0*M_1_PI , 0.0 , M_1_PI ,
-0.25*M_1_PI , 3.0/2.0*M_1_PI , -3.0*M_1_PI , M_2_PI ,
0.0 , 0.0 , 0.0 , 0.0 };
......@@ -51,6 +89,41 @@ __attribute__ ((always_inline)) INLINE static void kernel_deval ( float x , floa
}
__attribute__ ((always_inline)) INLINE static void kernel_deval_vec ( vector *x , vector *w , vector *dw_dx ) {
#ifdef VECTORIZE
vector ind, c[kernel_degree+1];
int j, k;
/* Load x and get the interval id. */
ind.m = vec_ftoi( vec_fmin( x->v , vec_set1( (float)kernel_ivals ) ) );
/* load the coefficients. */
for ( k = 0 ; k < VEC_SIZE ; k++ )
for ( j = 0 ; j < kernel_degree+1 ; j++ )
c[j].f[k] = kernel_coeffs[ ind.i[k]*(kernel_degree + 1) + j ];
/* Init the iteration for Horner's scheme. */
w->v = ( c[0].v * x->v ) + c[1].v;
dw_dx->v = c[0].v;
/* And we're off! */
for ( int k = 2 ; k <= kernel_degree ; k++ ) {
dw_dx->v = ( dw_dx->v * x->v ) + w->v;
w->v = ( x->v * w->v ) + c[k].v;
}
#else
/* Fake vectorization, i.e. hope the compiler gets it. */
for ( int k = 0 ; k < VEC_SIZE ; k++ )
kernel_deval( x.f[k] , &w.f[k] , &dw_dx.f[k] );
#endif
}
__attribute__ ((always_inline)) INLINE static void kernel_eval ( float x , float *W ) {
int ind = fmin( x , kernel_ivals );
float *coeffs = &kernel_coeffs[ ind*(kernel_degree + 1) ];
......@@ -112,6 +185,71 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_density ( float r
}
__attribute__ ((always_inline)) INLINE static void runner_iact_vec_density ( float *R2 , float *Dx , float *Hi , float *Hj , struct part **pi , struct part **pj ) {
#ifdef VECTORIZE
vector r, xi, xj, hi, hj, hi_inv, hj_inv, hig_inv, hjg_inv, wi, wj, wi_dx, wj_dx;
vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh;
vector mi, mj, wscale;
int k;
r.v = vec_sqrt( vec_load( R2 ) );
#if VEC_SIZE==8
mi.v = _mm256_set_ps( pi[7]->mass , pi[6]->mass , pi[5]->mass , pi[4]->mass , pi[3]->mass , pi[2]->mass , pi[1]->mass , pi[0]->mass );
mj.v = _mm256_set_ps( pj[7]->mass , pj[6]->mass , pj[5]->mass , pj[4]->mass , pj[3]->mass , pj[2]->mass , pj[1]->mass , pj[0]->mass );
#elif VEC_SIZE==4
mi.v = _mm_set_ps( pi[3]->mass , pi[2]->mass , pi[1]->mass , pi[0]->mass );
mj.v = _mm_set_ps( pj[3]->mass , pj[2]->mass , pj[1]->mass , pj[0]->mass );
#endif
wscale.v = vec_set1( 4.0f * M_PI / 3.0f * kernel_igamma3 );
hi.v = vec_load( Hi );
hi_inv.v = vec_rcp( hi.v );
hi_inv.v = hi_inv.v - hi_inv.v * ( hi_inv.v * hi.v - vec_set1( 1.0f ) );
hig_inv.v = hi_inv.v * vec_set1( kernel_igamma );
xi.v = r.v * hig_inv.v;
hj.v = vec_load( Hj );
hj_inv.v = vec_rcp( hj.v );
hj_inv.v = hj_inv.v - hj_inv.v * ( hj_inv.v * hj.v - vec_set1( 1.0f ) );
hjg_inv.v = hj_inv.v * vec_set1( kernel_igamma );
xj.v = r.v * hjg_inv.v;
kernel_deval_vec( &xi , &wi , &wi_dx );
kernel_deval_vec( &xj , &wj , &wj_dx );
rhoi.v = mj.v * wi.v;
rhoi_dh.v = mj.v * ( vec_set1( 3.0f ) * wi.v + xi.v * wi_dx.v );
wcounti.v = wi.v * wscale.v;
wcounti_dh.v = xi.v * hi_inv.v * wi_dx.v * wscale.v;
rhoj.v = mi.v * wj.v;
rhoj_dh.v = mi.v * ( vec_set1( 3.0f ) * wj.v + xj.v * wj_dx.v );
wcountj.v = wj.v * wscale.v;
wcountj_dh.v = xj.v * hj_inv.v * wj_dx.v * wscale.v;
for ( k = 0 ; k < VEC_SIZE ; k++ ) {
pi[k]->rho += rhoi.f[k];
pi[k]->rho_dh -= rhoi_dh.f[k];
pi[k]->wcount += wcounti.f[k];
pi[k]->wcount_dh -= wcounti_dh.f[k];
pj[k]->rho += rhoj.f[k];
pj[k]->rho_dh -= rhoj_dh.f[k];
pj[k]->wcount += wcountj.f[k];
pj[k]->wcount_dh -= wcountj_dh.f[k];
}
#else
for ( int k = 0 ; k < VEC_SIZE ; k++ )
runner_iact_density( R2[k] , &Dx[3*k] , Hi[k] , Hj[k] , pi[k] , pj[k] );
#endif
}
/**
* @brief Density kernel
......@@ -142,6 +280,53 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_density (
}
__attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_density ( float *R2 , float *Dx , float *Hi , float *Hj , struct part **pi , struct part **pj ) {
#ifdef VECTORIZE
vector r, xi, hi, hi_inv, hig_inv, wi, wi_dx;
vector rhoi, rhoi_dh, wcounti, wcounti_dh;
vector mj, wscale;
int k;
r.v = vec_sqrt( vec_load( R2 ) );
#if VEC_SIZE==8
mj.v = _mm256_set_ps( pj[7]->mass , pj[6]->mass , pj[5]->mass , pj[4]->mass , pj[3]->mass , pj[2]->mass , pj[1]->mass , pj[0]->mass );
#elif VEC_SIZE==4
mj.v = _mm_set_ps( pj[3]->mass , pj[2]->mass , pj[1]->mass , pj[0]->mass );
#endif
wscale.v = vec_set1( 4.0f * M_PI / 3.0f * kernel_igamma3 );
hi.v = vec_load( Hi );
hi_inv.v = vec_rcp( hi.v );
hi_inv.v = hi_inv.v - hi_inv.v * ( hi_inv.v * hi.v - vec_set1( 1.0f ) );
hig_inv.v = hi_inv.v * vec_set1( kernel_igamma );
xi.v = r.v * hig_inv.v;
kernel_deval_vec( &xi , &wi , &wi_dx );
rhoi.v = mj.v * wi.v;
rhoi_dh.v = mj.v * ( vec_set1( 3.0f ) * wi.v + xi.v * wi_dx.v );
wcounti.v = wi.v * wscale.v;
wcounti_dh.v = xi.v * hi_inv.v * wi_dx.v * wscale.v;
for ( k = 0 ; k < VEC_SIZE ; k++ ) {
pi[k]->rho += rhoi.f[k];