diff --git a/src/cache.h b/src/cache.h index 11d64206d61423895eeab22a0dbca3f07eb87ada..6bb49109ab516010b0989ecb916de325064166bb 100644 --- a/src/cache.h +++ b/src/cache.h @@ -64,7 +64,7 @@ struct cache { /* Maximum index into neighbouring cell for particles that are in range. */ int *restrict max_index SWIFT_CACHE_ALIGN; - + /* Particle density. */ float *restrict rho SWIFT_CACHE_ALIGN; @@ -178,10 +178,14 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c, error += posix_memalign((void **)&c->max_index, SWIFT_CACHE_ALIGNMENT, sizeIntBytes); error += posix_memalign((void **)&c->rho, SWIFT_CACHE_ALIGNMENT, sizeBytes); - error += posix_memalign((void **)&c->grad_h, SWIFT_CACHE_ALIGNMENT, sizeBytes); - error += posix_memalign((void **)&c->pOrho2, SWIFT_CACHE_ALIGNMENT, sizeBytes); - error += posix_memalign((void **)&c->balsara, SWIFT_CACHE_ALIGNMENT, sizeBytes); - error += posix_memalign((void **)&c->soundspeed, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += + posix_memalign((void **)&c->grad_h, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += + posix_memalign((void **)&c->pOrho2, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += + posix_memalign((void **)&c->balsara, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += + posix_memalign((void **)&c->soundspeed, SWIFT_CACHE_ALIGNMENT, sizeBytes); if (error != 0) error("Couldn't allocate cache, no. of particles: %d", (int)count); @@ -235,7 +239,8 @@ __attribute__((always_inline)) INLINE void cache_read_particles( } /** - * @brief Populate cache for force interactions by reading in the particles in unsorted order. + * @brief Populate cache for force interactions by reading in the particles in + * unsorted order. * * @param ci The #cell. * @param ci_cache The cache. @@ -257,10 +262,14 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles( swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT); swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); swift_declare_aligned_ptr(float, rho, ci_cache->rho, SWIFT_CACHE_ALIGNMENT); - swift_declare_aligned_ptr(float, grad_h, ci_cache->grad_h, SWIFT_CACHE_ALIGNMENT); - swift_declare_aligned_ptr(float, pOrho2, ci_cache->pOrho2, SWIFT_CACHE_ALIGNMENT); - swift_declare_aligned_ptr(float, balsara, ci_cache->balsara, SWIFT_CACHE_ALIGNMENT); - swift_declare_aligned_ptr(float, soundspeed, ci_cache->soundspeed, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, grad_h, ci_cache->grad_h, + SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, pOrho2, ci_cache->pOrho2, + SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, balsara, ci_cache->balsara, + SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, soundspeed, ci_cache->soundspeed, + SWIFT_CACHE_ALIGNMENT); const struct part *restrict parts = ci->parts; double loc[3]; @@ -280,7 +289,7 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles( vx[i] = parts[i].v[0]; vy[i] = parts[i].v[1]; vz[i] = parts[i].v[2]; - + rho[i] = parts[i].rho; grad_h[i] = parts[i].force.f; pOrho2[i] = parts[i].force.P_over_rho2; diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h index e36c2e73404b7b0d0a8af23e63e8f70dd7078117..c7d51ec695b959de4378ef3904abb75bebe469e0 100644 --- a/src/hydro/Gadget2/hydro_iact.h +++ b/src/hydro/Gadget2/hydro_iact.h @@ -1168,7 +1168,8 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( } #ifdef WITH_VECTORIZATION - static const vector const_viscosity_alpha_fac = FILL_VEC(-0.25f * const_viscosity_alpha); +static const vector const_viscosity_alpha_fac = + FILL_VEC(-0.25f * const_viscosity_alpha); /** * @brief Force interaction computed using 1 vector @@ -1177,12 +1178,12 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( __attribute__((always_inline)) INLINE static void runner_iact_nonsym_1_vec_force( vector *r2, vector *dx, vector *dy, vector *dz, vector vix, vector viy, - vector viz, vector pirho, vector grad_hi, vector piPOrho2, - vector balsara_i, vector ci, float *Vjx, float *Vjy, float *Vjz, - float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj, - float *Mj, vector hi_inv, float *Hj_inv, vector *a_hydro_xSum, - vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, - vector *v_sigSum, vector *entropy_dtSum, mask_t mask) { + vector viz, vector pirho, vector grad_hi, vector piPOrho2, vector balsara_i, + vector ci, float *Vjx, float *Vjy, float *Vjz, float *Pjrho, float *Grad_hj, + float *PjPOrho2, float *Balsara_j, float *Cj, float *Mj, vector hi_inv, + float *Hj_inv, vector *a_hydro_xSum, vector *a_hydro_ySum, + vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, + vector *entropy_dtSum, mask_t mask) { #ifdef WITH_VECTORIZATION @@ -1246,21 +1247,28 @@ runner_iact_nonsym_1_vec_force( /* Compute the relative velocity. (This is 0 if the particles move away from * each other and negative otherwise) */ omega_ij.v = vec_fmin(dvdr.v, vec_setzero()); - mu_ij.v = vec_mul(fac_mu.v, vec_mul(ri.v, omega_ij.v)); /* This is 0 or negative */ + mu_ij.v = + vec_mul(fac_mu.v, vec_mul(ri.v, omega_ij.v)); /* This is 0 or negative */ /* Compute signal velocity */ v_sig.v = vec_fnma(vec_set1(3.f), mu_ij.v, vec_add(ci.v, cj.v)); /* Now construct the full viscosity term */ rho_ij.v = vec_mul(vec_set1(0.5f), vec_add(pirho.v, pjrho.v)); - visc.v = vec_div(vec_mul(const_viscosity_alpha_fac.v, vec_mul(v_sig.v, vec_mul(mu_ij.v, balsara.v))), rho_ij.v); + visc.v = vec_div(vec_mul(const_viscosity_alpha_fac.v, + vec_mul(v_sig.v, vec_mul(mu_ij.v, balsara.v))), + rho_ij.v); /* Now, convolve with the kernel */ - visc_term.v = vec_mul(vec_set1(0.5f), vec_mul(visc.v, vec_mul(vec_add(wi_dr.v, wj_dr.v), ri.v))); + visc_term.v = + vec_mul(vec_set1(0.5f), + vec_mul(visc.v, vec_mul(vec_add(wi_dr.v, wj_dr.v), ri.v))); sph_term.v = - vec_mul(vec_fma(vec_mul(grad_hi.v, piPOrho2.v), wi_dr.v, vec_mul(grad_hj.v, vec_mul(pjPOrho2.v, wj_dr.v))), ri.v); - + vec_mul(vec_fma(vec_mul(grad_hi.v, piPOrho2.v), wi_dr.v, + vec_mul(grad_hj.v, vec_mul(pjPOrho2.v, wj_dr.v))), + ri.v); + /* Eventually get the acceleration */ acc.v = vec_add(visc_term.v, sph_term.v); @@ -1270,7 +1278,8 @@ runner_iact_nonsym_1_vec_force( piaz.v = vec_mul(mj.v, vec_mul(dz->v, acc.v)); /* Get the time derivative for h. */ - pih_dt.v = vec_div(vec_mul(mj.v, vec_mul(dvdr.v, vec_mul(ri.v, wi_dr.v))), pjrho.v); + pih_dt.v = + vec_div(vec_mul(mj.v, vec_mul(dvdr.v, vec_mul(ri.v, wi_dr.v))), pjrho.v); /* Change in entropy */ entropy_dt.v = vec_mul(mj.v, vec_mul(visc_term.v, dvdr.v)); @@ -1299,13 +1308,12 @@ runner_iact_nonsym_1_vec_force( __attribute__((always_inline)) INLINE static void runner_iact_nonsym_2_vec_force( float *R2, float *Dx, float *Dy, float *Dz, vector vix, vector viy, - vector viz, vector pirho, vector grad_hi, vector piPOrho2, - vector balsara_i, vector ci, float *Vjx, float *Vjy, float *Vjz, - float *Pjrho, float *Grad_hj, float *PjPOrho2, float *Balsara_j, float *Cj, - float *Mj, vector hi_inv, float *Hj_inv, vector *a_hydro_xSum, - vector *a_hydro_ySum, vector *a_hydro_zSum, vector *h_dtSum, - vector *v_sigSum, vector *entropy_dtSum, mask_t mask, mask_t mask_2, - short mask_cond) { + vector viz, vector pirho, vector grad_hi, vector piPOrho2, vector balsara_i, + vector ci, float *Vjx, float *Vjy, float *Vjz, float *Pjrho, float *Grad_hj, + float *PjPOrho2, float *Balsara_j, float *Cj, float *Mj, vector hi_inv, + float *Hj_inv, vector *a_hydro_xSum, vector *a_hydro_ySum, + vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, + vector *entropy_dtSum, mask_t mask, mask_t mask_2, short mask_cond) { #ifdef WITH_VECTORIZATION @@ -1408,18 +1416,20 @@ runner_iact_nonsym_2_vec_force( dvy_2.v = vec_sub(viy.v, vjy_2.v); dvz.v = vec_sub(viz.v, vjz.v); dvz_2.v = vec_sub(viz.v, vjz_2.v); - + /* Compute dv dot r. */ dvdr.v = vec_fma(dvx.v, dx.v, vec_fma(dvy.v, dy.v, vec_mul(dvz.v, dz.v))); - dvdr_2.v = - vec_fma(dvx_2.v, dx_2.v, vec_fma(dvy_2.v, dy_2.v, vec_mul(dvz_2.v, dz_2.v))); + dvdr_2.v = vec_fma(dvx_2.v, dx_2.v, + vec_fma(dvy_2.v, dy_2.v, vec_mul(dvz_2.v, dz_2.v))); /* Compute the relative velocity. (This is 0 if the particles move away from * each other and negative otherwise) */ omega_ij.v = vec_fmin(dvdr.v, vec_setzero()); omega_ij_2.v = vec_fmin(dvdr_2.v, vec_setzero()); - mu_ij.v = vec_mul(fac_mu.v, vec_mul(ri.v, omega_ij.v)); /* This is 0 or negative */ - mu_ij_2.v = vec_mul(fac_mu.v, vec_mul(ri_2.v, omega_ij_2.v)); /* This is 0 or negative */ + mu_ij.v = + vec_mul(fac_mu.v, vec_mul(ri.v, omega_ij.v)); /* This is 0 or negative */ + mu_ij_2.v = vec_mul( + fac_mu.v, vec_mul(ri_2.v, omega_ij_2.v)); /* This is 0 or negative */ /* Compute signal velocity */ v_sig.v = vec_fnma(vec_set1(3.f), mu_ij.v, vec_add(ci.v, cj.v)); @@ -1429,19 +1439,33 @@ runner_iact_nonsym_2_vec_force( rho_ij.v = vec_mul(vec_set1(0.5f), vec_add(pirho.v, pjrho.v)); rho_ij_2.v = vec_mul(vec_set1(0.5f), vec_add(pirho.v, pjrho_2.v)); - visc.v = vec_div(vec_mul(const_viscosity_alpha_fac.v, vec_mul(v_sig.v, vec_mul(mu_ij.v, balsara.v))), rho_ij.v); - visc_2.v = vec_div(vec_mul(const_viscosity_alpha_fac.v, vec_mul(v_sig_2.v, vec_mul(mu_ij_2.v, balsara_2.v))), rho_ij_2.v); + visc.v = vec_div(vec_mul(const_viscosity_alpha_fac.v, + vec_mul(v_sig.v, vec_mul(mu_ij.v, balsara.v))), + rho_ij.v); + visc_2.v = + vec_div(vec_mul(const_viscosity_alpha_fac.v, + vec_mul(v_sig_2.v, vec_mul(mu_ij_2.v, balsara_2.v))), + rho_ij_2.v); /* Now, convolve with the kernel */ - visc_term.v = vec_mul(vec_set1(0.5f), vec_mul(visc.v, vec_mul(vec_add(wi_dr.v, wj_dr.v), ri.v))); - visc_term_2.v = vec_mul(vec_set1(0.5f), vec_mul(visc_2.v, vec_mul(vec_add(wi_dr_2.v, wj_dr_2.v), ri_2.v))); - + visc_term.v = + vec_mul(vec_set1(0.5f), + vec_mul(visc.v, vec_mul(vec_add(wi_dr.v, wj_dr.v), ri.v))); + visc_term_2.v = vec_mul( + vec_set1(0.5f), + vec_mul(visc_2.v, vec_mul(vec_add(wi_dr_2.v, wj_dr_2.v), ri_2.v))); + vector grad_hi_mul_piPOrho2; grad_hi_mul_piPOrho2.v = vec_mul(grad_hi.v, piPOrho2.v); sph_term.v = - vec_mul(vec_fma(grad_hi_mul_piPOrho2.v, wi_dr.v, vec_mul(grad_hj.v, vec_mul(pjPOrho2.v, wj_dr.v))), ri.v); - sph_term_2.v = vec_mul(vec_fma(grad_hi_mul_piPOrho2.v, wi_dr_2.v, vec_mul(grad_hj_2.v, vec_mul(pjPOrho2_2.v, wj_dr_2.v))), ri_2.v); + vec_mul(vec_fma(grad_hi_mul_piPOrho2.v, wi_dr.v, + vec_mul(grad_hj.v, vec_mul(pjPOrho2.v, wj_dr.v))), + ri.v); + sph_term_2.v = + vec_mul(vec_fma(grad_hi_mul_piPOrho2.v, wi_dr_2.v, + vec_mul(grad_hj_2.v, vec_mul(pjPOrho2_2.v, wj_dr_2.v))), + ri_2.v); /* Eventually get the acceleration */ acc.v = vec_add(visc_term.v, sph_term.v); @@ -1456,8 +1480,11 @@ runner_iact_nonsym_2_vec_force( piaz_2.v = vec_mul(mj_2.v, vec_mul(dz_2.v, acc_2.v)); /* Get the time derivative for h. */ - pih_dt.v = vec_div(vec_mul(mj.v, vec_mul(dvdr.v, vec_mul(ri.v, wi_dr.v))), pjrho.v); - pih_dt_2.v = vec_div(vec_mul(mj_2.v, vec_mul(dvdr_2.v, vec_mul(ri_2.v, wi_dr_2.v))), pjrho_2.v); + pih_dt.v = + vec_div(vec_mul(mj.v, vec_mul(dvdr.v, vec_mul(ri.v, wi_dr.v))), pjrho.v); + pih_dt_2.v = + vec_div(vec_mul(mj_2.v, vec_mul(dvdr_2.v, vec_mul(ri_2.v, wi_dr_2.v))), + pjrho_2.v); /* Change in entropy */ entropy_dt.v = vec_mul(mj.v, vec_mul(visc_term.v, dvdr.v)); diff --git a/src/runner_doiact.h b/src/runner_doiact.h index c07d70f3e48bb6f1c9e7e343a50cdbba71da0785..14eecaa2c0c126b721a400e63298e1d55a9c98da 100644 --- a/src/runner_doiact.h +++ b/src/runner_doiact.h @@ -1039,7 +1039,7 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid, const float hj = pj->h; const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max + rshift; if (dj - rshift > di_max) continue; - + double pjx[3]; for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k]; const float hjg2 = hj * hj * kernel_gamma2; @@ -1451,7 +1451,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { const float hj = pj->h; const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max + rshift; if (dj - rshift > di_max) continue; - + double pjx[3]; for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k]; const float hjg2 = hj * hj * kernel_gamma2; diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index 1bf14f38370ecb80c66d1fd20a255f7c00c2c6f7..a8b7ecb18a3b4615f1ff9b9618bec3284cb5b9ad 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -226,20 +226,27 @@ __attribute__((always_inline)) INLINE static void storeInteractions( } /** - * @brief Compute the vector remainder force interactions from the secondary cache. + * @brief Compute the vector remainder force interactions from the secondary + * cache. * * @param int_cache (return) secondary #cache of interactions between two * particles. * @param icount Interaction count. - * @param a_hydro_xSum (return) #vector holding the cumulative sum of the x acceleration + * @param a_hydro_xSum (return) #vector holding the cumulative sum of the x + * acceleration * update on pi. - * @param a_hydro_ySum (return) #vector holding the cumulative sum of the y acceleration + * @param a_hydro_ySum (return) #vector holding the cumulative sum of the y + * acceleration * update on pi. - * @param a_hydro_zSum (return) #vector holding the cumulative sum of the z acceleration + * @param a_hydro_zSum (return) #vector holding the cumulative sum of the z + * acceleration * update on pi. - * @param h_dtSum (return) #vector holding the cumulative sum of the time derivative of the smoothing length update on pi. - * @param v_sigSum (return) #vector holding the maximum of the signal velocity update on pi. - * @param entropy_dtSum (return) #vector holding the cumulative sum of the time derivative of the entropy + * @param h_dtSum (return) #vector holding the cumulative sum of the time + * derivative of the smoothing length update on pi. + * @param v_sigSum (return) #vector holding the maximum of the signal velocity + * update on pi. + * @param entropy_dtSum (return) #vector holding the cumulative sum of the time + * derivative of the entropy * update on pi. * @param v_hi_inv #vector of 1/h for pi. * @param v_vix #vector of x velocity of pi. @@ -251,7 +258,8 @@ __attribute__((always_inline)) INLINE static void storeInteractions( * @param v_balsara_i #vector of balsara switch of pi. * @param v_ci #vector of sound speed of pi. * @param icount_align (return) Interaction count after the remainder - * @param num_vec_proc #int of the number of vectors to use to perform interaction. + * @param num_vec_proc #int of the number of vectors to use to perform + * interaction. * interactions have been performed, should be a multiple of the vector length. */ __attribute__((always_inline)) INLINE static void calcRemForceInteractions( @@ -333,15 +341,21 @@ __attribute__((always_inline)) INLINE static void calcRemForceInteractions( * @param int_cache (return) secondary #cache of interactions between two * particles. * @param icount Interaction count. - * @param a_hydro_xSum (return) #vector holding the cumulative sum of the x acceleration + * @param a_hydro_xSum (return) #vector holding the cumulative sum of the x + * acceleration + * update on pi. + * @param a_hydro_ySum (return) #vector holding the cumulative sum of the y + * acceleration * update on pi. - * @param a_hydro_ySum (return) #vector holding the cumulative sum of the y acceleration + * @param a_hydro_zSum (return) #vector holding the cumulative sum of the z + * acceleration * update on pi. - * @param a_hydro_zSum (return) #vector holding the cumulative sum of the z acceleration + * @param h_dtSum (return) #vector holding the cumulative sum of the time + * derivative of the smoothing length update on pi. + * @param v_sigSum (return) #vector holding the maximum of the signal velocity * update on pi. - * @param h_dtSum (return) #vector holding the cumulative sum of the time derivative of the smoothing length update on pi. - * @param v_sigSum (return) #vector holding the maximum of the signal velocity update on pi. - * @param entropy_dtSum (return) #vector holding the cumulative sum of the time derivative of the entropy + * @param entropy_dtSum (return) #vector holding the cumulative sum of the time + * derivative of the entropy * update on pi. * @param v_hi_inv #vector of 1/h for pi. * @param v_vix #vector of x velocity of pi. @@ -352,7 +366,8 @@ __attribute__((always_inline)) INLINE static void calcRemForceInteractions( * @param v_pOrhoi2 #vector of pressure over density squared of pi. * @param v_balsara_i #vector of balsara switch of pi. * @param v_ci #vector of sound speed of pi. - * @param num_vec_proc #int of the number of vectors to use to perform interaction. + * @param num_vec_proc #int of the number of vectors to use to perform + * interaction. * interactions have been performed, should be a multiple of the vector length. */ __attribute__((always_inline)) INLINE static void storeForceInteractions( @@ -993,17 +1008,17 @@ for (int pid = 0; pid < count; pid++) { storeForceInteractions( doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp, cell_cache, &int_cache, &icount, &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, - &h_dtSum, &v_sigSum, &entropy_dtSum, v_hi_inv, v_vix, v_viy, - v_viz, v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci, 2); + &h_dtSum, &v_sigSum, &entropy_dtSum, v_hi_inv, v_vix, v_viy, v_viz, + v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci, 2); } } /* Loop over all other particles. */ /* Perform padded vector remainder interactions if any are present. */ - calcRemForceInteractions( - &int_cache, icount, &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, &h_dtSum, - &v_sigSum, &entropy_dtSum, v_hi_inv, v_vix, v_viy, v_viz, v_rhoi, - v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci, &icount_align, 2); + calcRemForceInteractions(&int_cache, icount, &a_hydro_xSum, &a_hydro_ySum, + &a_hydro_zSum, &h_dtSum, &v_sigSum, &entropy_dtSum, + v_hi_inv, v_vix, v_viy, v_viz, v_rhoi, v_grad_hi, + v_pOrhoi2, v_balsara_i, v_ci, &icount_align, 2); /* Initialise masks to true in case remainder interactions have been * performed. */ @@ -1015,14 +1030,13 @@ for (int pid = 0; pid < count; pid++) { for (int pjd = 0; pjd < icount_align; pjd += (2 * VEC_SIZE)) { runner_iact_nonsym_2_vec_force( &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd], - &int_cache.dzq[pjd], v_vix, v_viy, v_viz, v_rhoi, v_grad_hi, - v_pOrhoi2, v_balsara_i, v_ci, &int_cache.vxq[pjd], - &int_cache.vyq[pjd], &int_cache.vzq[pjd], &int_cache.rhoq[pjd], - &int_cache.grad_hq[pjd], &int_cache.pOrho2q[pjd], - &int_cache.balsaraq[pjd], &int_cache.soundspeedq[pjd], - &int_cache.mq[pjd], v_hi_inv, &int_cache.h_invq[pjd], &a_hydro_xSum, - &a_hydro_ySum, &a_hydro_zSum, &h_dtSum, &v_sigSum, &entropy_dtSum, - int_mask, int_mask2, 0); + &int_cache.dzq[pjd], v_vix, v_viy, v_viz, v_rhoi, v_grad_hi, v_pOrhoi2, + v_balsara_i, v_ci, &int_cache.vxq[pjd], &int_cache.vyq[pjd], + &int_cache.vzq[pjd], &int_cache.rhoq[pjd], &int_cache.grad_hq[pjd], + &int_cache.pOrho2q[pjd], &int_cache.balsaraq[pjd], + &int_cache.soundspeedq[pjd], &int_cache.mq[pjd], v_hi_inv, + &int_cache.h_invq[pjd], &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, + &h_dtSum, &v_sigSum, &entropy_dtSum, int_mask, int_mask2, 0); } VEC_HADD(a_hydro_xSum, pi->a_hydro[0]); @@ -1258,8 +1272,10 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, vector v_dx, v_dy, v_dz, v_r2; #ifdef SWIFT_DEBUG_CHECKS - if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0 || cj_cache_idx + (VEC_SIZE - 1) > (last_pj_align + 1 + VEC_SIZE)) { - error("Unaligned read!!! cj_cache_idx=%d, last_pj_align=%d", cj_cache_idx, last_pj_align); + if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0 || + cj_cache_idx + (VEC_SIZE - 1) > (last_pj_align + 1 + VEC_SIZE)) { + error("Unaligned read!!! cj_cache_idx=%d, last_pj_align=%d", + cj_cache_idx, last_pj_align); } #endif @@ -1384,8 +1400,13 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, ci_cache_idx < ci_cache_count; ci_cache_idx += VEC_SIZE) { #ifdef SWIFT_DEBUG_CHECKS - if (ci_cache_idx % VEC_SIZE != 0 || ci_cache_idx < 0 || ci_cache_idx + (VEC_SIZE - 1) > (count_i - first_pi_align + VEC_SIZE)) { - error("Unaligned read!!! ci_cache_idx=%d, first_pi_align=%d, count_i=%d", ci_cache_idx, first_pi_align, count_i); + if (ci_cache_idx % VEC_SIZE != 0 || ci_cache_idx < 0 || + ci_cache_idx + (VEC_SIZE - 1) > + (count_i - first_pi_align + VEC_SIZE)) { + error( + "Unaligned read!!! ci_cache_idx=%d, first_pi_align=%d, " + "count_i=%d", + ci_cache_idx, first_pi_align, count_i); } #endif diff --git a/tests/testInteractions.c b/tests/testInteractions.c index 8e8b2c26ec599f90ba8f2cc9f524a54f3a9ba2d2..2a0027574f65a9709a2f3ab6734414cd9227e642 100644 --- a/tests/testInteractions.c +++ b/tests/testInteractions.c @@ -126,37 +126,29 @@ void dump_indv_particle_fields(char *fileName, struct part *p) { FILE *file = fopen(fileName, "a"); fprintf(file, - "%6llu %8.5f %8.5f %8.5f %8.5f %8.5f %8.5f %8.5f %8.5f %8.5f %8.5f " - "%8.5f " - "%8.5f %8.5f %13e %13e %13e %13e %13e %8.5f %8.5f\n", - p->id, p->x[0], - p->x[1], p->x[2], - p->v[0], p->v[1], - p->v[2], p->h, - hydro_get_density(p), + "%6llu %8.5f %8.5f %8.5f %8.5f %8.5f %8.5f %8.5f %8.5f %8.5f %8.5f " + "%8.5f " + "%8.5f %8.5f %13e %13e %13e %13e %13e %8.5f %8.5f\n", + p->id, p->x[0], p->x[1], p->x[2], p->v[0], p->v[1], p->v[2], p->h, + hydro_get_density(p), #if defined(MINIMAL_SPH) || defined(SHADOWFAX_SPH) - 0.f, + 0.f, #else - p->density.div_v, + p->density.div_v, #endif - hydro_get_entropy(p), - hydro_get_internal_energy(p), - hydro_get_pressure(p), - hydro_get_soundspeed(p), - p->a_hydro[0], p->a_hydro[1], - p->a_hydro[2], p->force.h_dt, + hydro_get_entropy(p), hydro_get_internal_energy(p), + hydro_get_pressure(p), hydro_get_soundspeed(p), p->a_hydro[0], + p->a_hydro[1], p->a_hydro[2], p->force.h_dt, #if defined(GADGET2_SPH) - p->force.v_sig, p->entropy_dt, - 0.f + p->force.v_sig, p->entropy_dt, 0.f #elif defined(DEFAULT_SPH) - p->force.v_sig, 0.f, - p->force.u_dt + p->force.v_sig, 0.f, p->force.u_dt #elif defined(MINIMAL_SPH) - p->force.v_sig, 0.f, p->u_dt + p->force.v_sig, 0.f, p->u_dt #else - 0.f, 0.f, 0.f + 0.f, 0.f, 0.f #endif - ); + ); fclose(file); } @@ -174,7 +166,7 @@ void write_header(char *fileName) { "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "h", "rho", "div_v", "S", "u", "P", "c", "a_x", "a_y", "a_z", "h_dt", "v_sig", "dS/dt", "du/dt"); - + fclose(file); } @@ -415,8 +407,9 @@ void test_interactions(struct part test_part, struct part *parts, size_t count, * @param runs No. of times to call interactions * */ -void test_force_interactions(struct part test_part, struct part *parts, size_t count, - char *filePrefix, int runs, int num_vec_proc) { +void test_force_interactions(struct part test_part, struct part *parts, + size_t count, char *filePrefix, int runs, + int num_vec_proc) { ticks serial_time = 0; ticks vec_time = 0; @@ -449,7 +442,7 @@ void test_force_interactions(struct part test_part, struct part *parts, size_t c float dxq[count] __attribute__((aligned(array_align))); float dyq[count] __attribute__((aligned(array_align))); float dzq[count] __attribute__((aligned(array_align))); - + float hiq[count] __attribute__((aligned(array_align))); float vixq[count] __attribute__((aligned(array_align))); float viyq[count] __attribute__((aligned(array_align))); @@ -459,7 +452,7 @@ void test_force_interactions(struct part test_part, struct part *parts, size_t c float pOrhoi2q[count] __attribute__((aligned(array_align))); float balsaraiq[count] __attribute__((aligned(array_align))); float ciq[count] __attribute__((aligned(array_align))); - + float hj_invq[count] __attribute__((aligned(array_align))); float mjq[count] __attribute__((aligned(array_align))); float vjxq[count] __attribute__((aligned(array_align))); @@ -502,8 +495,8 @@ void test_force_interactions(struct part test_part, struct part *parts, size_t c #pragma novector #endif for (size_t i = 0; i < count; i++) { - runner_iact_nonsym_force(r2[i], &(dx[3 * i]), pi_serial.h, pj_serial[i].h, &pi_serial, - &pj_serial[i]); + runner_iact_nonsym_force(r2[i], &(dx[3 * i]), pi_serial.h, pj_serial[i].h, + &pi_serial, &pj_serial[i]); } serial_time += getticks() - tic; } @@ -540,7 +533,7 @@ void test_force_interactions(struct part test_part, struct part *parts, size_t c dxq[i] = dx[0]; dyq[i] = dx[1]; dzq[i] = dx[2]; - + hiq[i] = pi_vec.h; vixq[i] = pi_vec.v[0]; viyq[i] = pi_vec.v[1]; @@ -550,7 +543,7 @@ void test_force_interactions(struct part test_part, struct part *parts, size_t c pOrhoi2q[i] = pi_vec.force.P_over_rho2; balsaraiq[i] = pi_vec.force.balsara; ciq[i] = pi_vec.force.soundspeed; - + hj_invq[i] = 1.f / pj_vec[i].h; mjq[i] = pj_vec[i].mass; vjxq[i] = pj_vec[i].v[0]; @@ -561,7 +554,6 @@ void test_force_interactions(struct part test_part, struct part *parts, size_t c pOrhoj2q[i] = pj_vec[i].force.P_over_rho2; balsarajq[i] = pj_vec[i].force.balsara; cjq[i] = pj_vec[i].force.soundspeed; - } /* Only dump data on first run. */ @@ -572,9 +564,11 @@ void test_force_interactions(struct part test_part, struct part *parts, size_t c dump_indv_particle_fields(vec_filename, pjq[i]); } -/* Perform vector interaction. */ - vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec, rhoi_vec, grad_hi_vec, pOrhoi2_vec, balsara_i_vec, ci_vec; - vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, entropy_dtSum; + /* Perform vector interaction. */ + vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec, rhoi_vec, grad_hi_vec, + pOrhoi2_vec, balsara_i_vec, ci_vec; + vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, + entropy_dtSum; a_hydro_xSum.v = vec_setzero(); a_hydro_ySum.v = vec_setzero(); @@ -594,20 +588,23 @@ void test_force_interactions(struct part test_part, struct part *parts, size_t c ci_vec.v = vec_load(&ciq[0]); hi_inv_vec = vec_reciprocal(hi_vec); - + mask_t mask, mask2; vec_init_mask_true(mask); vec_init_mask_true(mask2); - + const ticks vec_tic = getticks(); for (size_t i = 0; i < count; i += num_vec_proc * VEC_SIZE) { if (num_vec_proc == 2) { - runner_iact_nonsym_2_vec_force(&(r2q[i]), &(dxq[i]), &(dyq[i]), &(dzq[i]), - (vix_vec), (viy_vec), (viz_vec), rhoi_vec, grad_hi_vec, pOrhoi2_vec, balsara_i_vec, ci_vec, &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(rhojq[i]), &(grad_hjq[i]), &(pOrhoj2q[i]), &(balsarajq[i]), &(cjq[i]), &(mjq[i]), hi_inv_vec, &(hj_invq[i]), &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, - &h_dtSum, &v_sigSum, &entropy_dtSum, - mask, mask2, 0); + runner_iact_nonsym_2_vec_force( + &(r2q[i]), &(dxq[i]), &(dyq[i]), &(dzq[i]), (vix_vec), (viy_vec), + (viz_vec), rhoi_vec, grad_hi_vec, pOrhoi2_vec, balsara_i_vec, + ci_vec, &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(rhojq[i]), + &(grad_hjq[i]), &(pOrhoj2q[i]), &(balsarajq[i]), &(cjq[i]), + &(mjq[i]), hi_inv_vec, &(hj_invq[i]), &a_hydro_xSum, &a_hydro_ySum, + &a_hydro_zSum, &h_dtSum, &v_sigSum, &entropy_dtSum, mask, mask2, 0); } else { /* Only use one vector for interaction. */ vector r2, dx, dy, dz; @@ -617,11 +614,13 @@ void test_force_interactions(struct part test_part, struct part *parts, size_t c dz.v = vec_load(&(dzq[i])); runner_iact_nonsym_1_vec_force( - &r2, &dx, &dy, &dz, vix_vec, viy_vec, viz_vec, rhoi_vec, grad_hi_vec, pOrhoi2_vec, balsara_i_vec, ci_vec, &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(rhojq[i]), &(grad_hjq[i]), &(pOrhoj2q[i]), &(balsarajq[i]), &(cjq[i]), &(mjq[i]), hi_inv_vec, &(hj_invq[i]), &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, - &h_dtSum, &v_sigSum, &entropy_dtSum, - mask); + &r2, &dx, &dy, &dz, vix_vec, viy_vec, viz_vec, rhoi_vec, + grad_hi_vec, pOrhoi2_vec, balsara_i_vec, ci_vec, &(vjxq[i]), + &(vjyq[i]), &(vjzq[i]), &(rhojq[i]), &(grad_hjq[i]), &(pOrhoj2q[i]), + &(balsarajq[i]), &(cjq[i]), &(mjq[i]), hi_inv_vec, &(hj_invq[i]), + &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, &h_dtSum, &v_sigSum, + &entropy_dtSum, mask); } - } VEC_HADD(a_hydro_xSum, piq[0]->a_hydro[0]); @@ -716,9 +715,11 @@ int main(int argc, char *argv[]) { prepare_force(particles, count); - test_force_interactions(test_particle, &particles[1], count - 1, "test_nonsym_force", runs, 1); - test_force_interactions(test_particle, &particles[1], count - 1, "test_nonsym_force", runs, 2); - + test_force_interactions(test_particle, &particles[1], count - 1, + "test_nonsym_force", runs, 1); + test_force_interactions(test_particle, &particles[1], count - 1, + "test_nonsym_force", runs, 2); + return 0; }