diff --git a/src/runner_iact.h b/src/runner_iact.h index 70d07a6b658554e1583b5ac89dc5d30b19863f3e..a13ccaaae5421ec5855c3cf145daedf70f0603fa 100644 --- a/src/runner_iact.h +++ b/src/runner_iact.h @@ -404,6 +404,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float vector pia[3], pja[3]; vector piu_dt, pju_dt; vector pih_dt, pjh_dt; + vector ci, cj, vi_sig, vj_sig; int j, k; /* Load stuff. */ @@ -414,6 +415,10 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float pjPOrho2.v = _mm256_set_ps( pj[7]->POrho2 , pj[6]->POrho2 , pj[5]->POrho2 , pj[4]->POrho2 , pj[3]->POrho2 , pj[2]->POrho2 , pj[1]->POrho2 , pj[0]->POrho2 ); pirho.v = _mm256_set_ps( pi[7]->rho , pi[6]->rho , pi[5]->rho , pi[4]->rho , pi[3]->rho , pi[2]->rho , pi[1]->rho , pi[0]->rho ); pjrho.v = _mm256_set_ps( pj[7]->rho , pj[6]->rho , pj[5]->rho , pj[4]->rho , pj[3]->rho , pj[2]->rho , pj[1]->rho , pj[0]->rho ); + ci.v = _mm256_set_ps( pi[7]->c , pi[6]->c , pi[5]->c , pi[4]->c , pi[3]->c , pi[2]->c , pi[1]->c , pi[0]->c ); + cj.v = _mm256_set_ps( pj[7]->c , pj[6]->c , pj[5]->c , pj[4]->c , pj[3]->c , pj[2]->c , pj[1]->c , pj[0]->c ); + vi_sig.v = _mm256_set_ps( pi[7]->v_sig , pi[6]->v_sig , pi[5]->v_sig , pi[4]->v_sig , pi[3]->v_sig , pi[2]->v_sig , pi[1]->v_sig , pi[0]->v_sig ); + vj_sig.v = _mm256_set_ps( pj[7]->v_sig , pj[6]->v_sig , pj[5]->v_sig , pj[4]->v_sig , pj[3]->v_sig , pj[2]->v_sig , pj[1]->v_sig , pj[0]->v_sig ); for ( k = 0 ; k < 3 ; k++ ) { vi[k].v = _mm256_set_ps( pi[7]->v[k] , pi[6]->v[k] , pi[5]->v[k] , pi[4]->v[k] , pi[3]->v[k] , pi[2]->v[k] , pi[1]->v[k] , pi[0]->v[k] ); vj[k].v = _mm256_set_ps( pj[7]->v[k] , pj[6]->v[k] , pj[5]->v[k] , pj[4]->v[k] , pj[3]->v[k] , pj[2]->v[k] , pj[1]->v[k] , pj[0]->v[k] ); @@ -427,6 +432,10 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float pjPOrho2.v = _mm_set_ps( pj[3]->POrho2 , pj[2]->POrho2 , pj[1]->POrho2 , pj[0]->POrho2 ); pirho.v = _mm_set_ps( pi[3]->rho , pi[2]->rho , pi[1]->rho , pi[0]->rho ); pjrho.v = _mm_set_ps( pj[3]->rho , pj[2]->rho , pj[1]->rho , pj[0]->rho ); + ci.v = _mm_set_ps( pi[3]->c , pi[2]->c , pi[1]->c , pi[0]->c ); + cj.v = _mm_set_ps( pj[3]->c , pj[2]->c , pj[1]->c , pj[0]->c ); + vi_sig.v = _mm_set_ps( pi[3]->v_sig , pi[2]->v_sig , pi[1]->v_sig , pi[0]->v_sig ); + vj_sig.v = _mm_set_ps( pj[3]->v_sig , pj[2]->v_sig , pj[1]->v_sig , pj[0]->v_sig ); for ( k = 0 ; k < 3 ; k++ ) { vi[k].v = _mm_set_ps( pi[3]->v[k] , pi[2]->v[k] , pi[1]->v[k] , pi[0]->v[k] ); vj[k].v = _mm_set_ps( pj[3]->v[k] , pj[2]->v[k] , pj[1]->v[k] , pj[0]->v[k] ); @@ -485,12 +494,18 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_vec_force ( float pih_dt.v = mj.v / pjrho.v * dvdr.v * wi_dr.v; pjh_dt.v = mi.v / pirho.v * dvdr.v * wj_dr.v; + /* compute the signal velocity (this is always symmetrical). */ + vi_sig.v = vec_fmax( vi_sig.v , cj.v - 3.0f*dvdr.v ); + vj_sig.v = vec_fmax( vj_sig.v , ci.v - 3.0f*dvdr.v ); + /* Store the forces back on the particles. */ for ( k = 0 ; k < VEC_SIZE ; k++ ) { pi[k]->u_dt += piu_dt.f[k]; pj[k]->u_dt += pju_dt.f[k]; pi[k]->h_dt -= pih_dt.f[k]; pj[k]->h_dt -= pjh_dt.f[k]; + pi[k]->v_sig = vi_sig.f[k]; + pj[k]->v_sig = vj_sig.f[k]; for ( j = 0 ; j < 3 ; j++ ) { pi[k]->a[j] -= pia[j].f[k]; pj[k]->a[j] += pja[j].f[k]; @@ -577,6 +592,7 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force vector pia[3]; vector piu_dt; vector pih_dt; + vector ci, cj, vi_sig, vj_sig; int j, k; /* Load stuff. */ @@ -585,6 +601,10 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force piPOrho2.v = _mm256_set_ps( pi[7]->POrho2 , pi[6]->POrho2 , pi[5]->POrho2 , pi[4]->POrho2 , pi[3]->POrho2 , pi[2]->POrho2 , pi[1]->POrho2 , pi[0]->POrho2 ); pjPOrho2.v = _mm256_set_ps( pj[7]->POrho2 , pj[6]->POrho2 , pj[5]->POrho2 , pj[4]->POrho2 , pj[3]->POrho2 , pj[2]->POrho2 , pj[1]->POrho2 , pj[0]->POrho2 ); pjrho.v = _mm256_set_ps( pj[7]->rho , pj[6]->rho , pj[5]->rho , pj[4]->rho , pj[3]->rho , pj[2]->rho , pj[1]->rho , pj[0]->rho ); + ci.v = _mm256_set_ps( pi[7]->c , pi[6]->c , pi[5]->c , pi[4]->c , pi[3]->c , pi[2]->c , pi[1]->c , pi[0]->c ); + cj.v = _mm256_set_ps( pj[7]->c , pj[6]->c , pj[5]->c , pj[4]->c , pj[3]->c , pj[2]->c , pj[1]->c , pj[0]->c ); + vi_sig.v = _mm256_set_ps( pi[7]->v_sig , pi[6]->v_sig , pi[5]->v_sig , pi[4]->v_sig , pi[3]->v_sig , pi[2]->v_sig , pi[1]->v_sig , pi[0]->v_sig ); + vj_sig.v = _mm256_set_ps( pj[7]->v_sig , pj[6]->v_sig , pj[5]->v_sig , pj[4]->v_sig , pj[3]->v_sig , pj[2]->v_sig , pj[1]->v_sig , pj[0]->v_sig ); for ( k = 0 ; k < 3 ; k++ ) { vi[k].v = _mm256_set_ps( pi[7]->v[k] , pi[6]->v[k] , pi[5]->v[k] , pi[4]->v[k] , pi[3]->v[k] , pi[2]->v[k] , pi[1]->v[k] , pi[0]->v[k] ); vj[k].v = _mm256_set_ps( pj[7]->v[k] , pj[6]->v[k] , pj[5]->v[k] , pj[4]->v[k] , pj[3]->v[k] , pj[2]->v[k] , pj[1]->v[k] , pj[0]->v[k] ); @@ -596,6 +616,10 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force piPOrho2.v = _mm_set_ps( pi[3]->POrho2 , pi[2]->POrho2 , pi[1]->POrho2 , pi[0]->POrho2 ); pjPOrho2.v = _mm_set_ps( pj[3]->POrho2 , pj[2]->POrho2 , pj[1]->POrho2 , pj[0]->POrho2 ); pjrho.v = _mm_set_ps( pj[3]->rho , pj[2]->rho , pj[1]->rho , pj[0]->rho ); + ci.v = _mm_set_ps( pi[3]->c , pi[2]->c , pi[1]->c , pi[0]->c ); + cj.v = _mm_set_ps( pj[3]->c , pj[2]->c , pj[1]->c , pj[0]->c ); + vi_sig.v = _mm_set_ps( pi[3]->v_sig , pi[2]->v_sig , pi[1]->v_sig , pi[0]->v_sig ); + vj_sig.v = _mm_set_ps( pj[3]->v_sig , pj[2]->v_sig , pj[1]->v_sig , pj[0]->v_sig ); for ( k = 0 ; k < 3 ; k++ ) { vi[k].v = _mm_set_ps( pi[3]->v[k] , pi[2]->v[k] , pi[1]->v[k] , pi[0]->v[k] ); vj[k].v = _mm_set_ps( pj[3]->v[k] , pj[2]->v[k] , pj[1]->v[k] , pj[0]->v[k] ); @@ -651,10 +675,16 @@ __attribute__ ((always_inline)) INLINE static void runner_iact_nonsym_vec_force /* Get the time derivative for h. */ pih_dt.v = mj.v / pjrho.v * dvdr.v * wi_dr.v; + /* compute the signal velocity (this is always symmetrical). */ + vi_sig.v = vec_fmax( vi_sig.v , cj.v - 3.0f*dvdr.v ); + vj_sig.v = vec_fmax( vj_sig.v , ci.v - 3.0f*dvdr.v ); + /* Store the forces back on the particles. */ for ( k = 0 ; k < VEC_SIZE ; k++ ) { pi[k]->u_dt += piu_dt.f[k]; pi[k]->h_dt -= pih_dt.f[k]; + pi[k]->v_sig = vi_sig.f[k]; + pj[k]->v_sig = vj_sig.f[k]; for ( j = 0 ; j < 3 ; j++ ) pi[k]->a[j] -= pia[j].f[k]; } diff --git a/src/vector.h b/src/vector.h index 96534c6866d9a413967cebd917d2437486845e97..144752fd1bd2ace4c78a33b17ba7383955b66b33 100644 --- a/src/vector.h +++ b/src/vector.h @@ -27,7 +27,7 @@ #define VEC_MACRO(elcount, type) __attribute__((vector_size((elcount)*sizeof(type)))) type /* So what will the vector size be? */ - #ifdef NO__AVX__ + #ifdef __AVX__ #define VECTORIZE #define VEC_SIZE 8 #define VEC_FLOAT __m256 @@ -39,7 +39,8 @@ #define vec_rsqrt(a) _mm256_rsqrt_ps(a) #define vec_ftoi(a) _mm256_cvttps_epi32(a) #define vec_fmin(a,b) _mm256_min_ps(a,b) - #elif defined( NO__SSE2__ ) + #define vec_fmax(a,b) _mm256_max_ps(a,b) + #elif defined( __SSE2__ ) #define VECTORIZE #define VEC_SIZE 4 #define VEC_FLOAT __m128 @@ -51,6 +52,7 @@ #define vec_rsqrt(a) _mm_rsqrt_ps(a) #define vec_ftoi(a) _mm_cvttps_epi32(a) #define vec_fmin(a,b) _mm_min_ps(a,b) + #define vec_fmax(a,b) _mm_max_ps(a,b) #else #define VEC_SIZE 4 #endif