diff --git a/.gitignore b/.gitignore index e3e17bb82a01d9af0ace6ed72d196cf2dba242f6..d3faf084a3e0a78b06f0bfa57742bfe8d9311a20 100644 --- a/.gitignore +++ b/.gitignore @@ -59,6 +59,11 @@ tests/brute_force_periodic_BC_perturbed.dat tests/swift_dopair_active.dat tests/test_nonsym_density_serial.dat tests/test_nonsym_density_vec.dat +tests/test_nonsym_force_serial.dat +tests/test_nonsym_density_1_vec.dat +tests/test_nonsym_density_2_vec.dat +tests/test_nonsym_force_1_vec.dat +tests/test_nonsym_force_2_vec.dat tests/testGreetings tests/testReading tests/input.hdf5 diff --git a/examples/main.c b/examples/main.c index ee1253062409ec2e787e064a5fb50da2c830d35d..a86cdc60a7473e84e8fba8c32417e7e19ebaeb60 100644 --- a/examples/main.c +++ b/examples/main.c @@ -677,6 +677,7 @@ int main(int argc, char *argv[]) { /* Write the state of the system before starting time integration. */ engine_dump_snapshot(&e); + engine_print_stats(&e); /* Legend */ if (myrank == 0) @@ -815,6 +816,7 @@ int main(int argc, char *argv[]) { /* Write final output. */ engine_drift_all(&e); + engine_print_stats(&e); engine_dump_snapshot(&e); #ifdef WITH_MPI diff --git a/src/active.h b/src/active.h index 58e88835b6f51ae15f9fd7270c0e1f89bbd6d61a..f9ed4a0d40127f1544c744a61b05515a19857b05 100644 --- a/src/active.h +++ b/src/active.h @@ -144,6 +144,14 @@ __attribute__((always_inline)) INLINE static int part_is_active( return (part_bin <= max_active_bin); } +__attribute__((always_inline)) INLINE static int part_is_active_no_debug( + const struct part *p, const timebin_t max_active_bin) { + + const timebin_t part_bin = p->time_bin; + + return (part_bin <= max_active_bin); +} + /** * @brief Is this g-particle finishing its time-step now ? * diff --git a/src/cache.h b/src/cache.h index 170eb8f5cb0939e85647bbb4622d66cc78ef3dbe..bb72149607f496dd378a848a4270fe74d3f75ad9 100644 --- a/src/cache.h +++ b/src/cache.h @@ -65,6 +65,21 @@ struct cache { /* Maximum index into neighbouring cell for particles that are in range. */ int *restrict max_index SWIFT_CACHE_ALIGN; + /* Particle density. */ + float *restrict rho SWIFT_CACHE_ALIGN; + + /* Particle smoothing length gradient. */ + float *restrict grad_h SWIFT_CACHE_ALIGN; + + /* Pressure over density squared. */ + float *restrict pOrho2 SWIFT_CACHE_ALIGN; + + /* Balsara switch. */ + float *restrict balsara SWIFT_CACHE_ALIGN; + + /* Particle sound speed. */ + float *restrict soundspeed SWIFT_CACHE_ALIGN; + /* Cache size. */ int count; }; @@ -127,6 +142,11 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c, free(c->vz); free(c->h); free(c->max_index); + free(c->rho); + free(c->grad_h); + free(c->pOrho2); + free(c->balsara); + free(c->soundspeed); } error += posix_memalign((void **)&c->x, SWIFT_CACHE_ALIGNMENT, sizeBytes); @@ -139,6 +159,15 @@ __attribute__((always_inline)) INLINE void cache_init(struct cache *c, error += posix_memalign((void **)&c->h, SWIFT_CACHE_ALIGNMENT, sizeBytes); error += posix_memalign((void **)&c->max_index, SWIFT_CACHE_ALIGNMENT, sizeIntBytes); + error += posix_memalign((void **)&c->rho, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += + posix_memalign((void **)&c->grad_h, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += + posix_memalign((void **)&c->pOrho2, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += + posix_memalign((void **)&c->balsara, SWIFT_CACHE_ALIGNMENT, sizeBytes); + error += + posix_memalign((void **)&c->soundspeed, SWIFT_CACHE_ALIGNMENT, sizeBytes); if (error != 0) error("Couldn't allocate cache, no. of particles: %d", (int)count); @@ -191,6 +220,68 @@ __attribute__((always_inline)) INLINE void cache_read_particles( #endif } +/** + * @brief Populate cache for force interactions by reading in the particles in + * unsorted order. + * + * @param ci The #cell. + * @param ci_cache The cache. + */ +__attribute__((always_inline)) INLINE void cache_read_force_particles( + const struct cell *restrict const ci, + struct cache *restrict const ci_cache) { + +#if defined(GADGET2_SPH) + + /* Let the compiler know that the data is aligned and create pointers to the + * arrays inside the cache. */ + swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, rho, ci_cache->rho, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, grad_h, ci_cache->grad_h, + SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, pOrho2, ci_cache->pOrho2, + SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, balsara, ci_cache->balsara, + SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, soundspeed, ci_cache->soundspeed, + SWIFT_CACHE_ALIGNMENT); + + const struct part *restrict parts = ci->parts; + double loc[3]; + loc[0] = ci->loc[0]; + loc[1] = ci->loc[1]; + loc[2] = ci->loc[2]; + + /* Shift the particles positions to a local frame so single precision can be + * used instead of double precision. */ + for (int i = 0; i < ci->count; i++) { + x[i] = (float)(parts[i].x[0] - loc[0]); + y[i] = (float)(parts[i].x[1] - loc[1]); + z[i] = (float)(parts[i].x[2] - loc[2]); + h[i] = parts[i].h; + + m[i] = parts[i].mass; + vx[i] = parts[i].v[0]; + vy[i] = parts[i].v[1]; + vz[i] = parts[i].v[2]; + + rho[i] = parts[i].rho; + grad_h[i] = parts[i].force.f; + pOrho2[i] = parts[i].force.P_over_rho2; + balsara[i] = parts[i].force.balsara; + soundspeed[i] = parts[i].force.soundspeed; + } + +#endif +} + /** * @brief Populate caches by only reading particles that are within range of * each other within the adjoining cell.Also read the particles into the cache diff --git a/src/engine.c b/src/engine.c index 1230abf848b3b63465bd5e7d4065dec36d46e46f..93c430d611cf573c643b4cf94325fb97333381a7 100644 --- a/src/engine.c +++ b/src/engine.c @@ -3378,6 +3378,9 @@ void engine_print_stats(struct engine *e) { struct statistics global_stats = stats; #endif + /* Finalize operations */ + stats_finalize(&stats); + /* Print info */ if (e->nodeID == 0) stats_print_to_file(e->file_stats, &global_stats, e->time); @@ -4372,7 +4375,7 @@ void engine_init(struct engine *e, struct space *s, e->file_timesteps = NULL; e->deltaTimeStatistics = parser_get_param_double(params, "Statistics:delta_time"); - e->timeLastStatistics = e->timeBegin - e->deltaTimeStatistics; + e->timeLastStatistics = 0; e->verbose = verbose; e->count_step = 0; e->wallclock_time = 0.f; @@ -4540,10 +4543,10 @@ void engine_init(struct engine *e, struct space *s, e->file_stats = fopen(energyfileName, "w"); fprintf(e->file_stats, "#%14s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s %14s " - "%14s %14s %14s\n", + "%14s %14s %14s %14s %14s %14s\n", "Time", "Mass", "E_tot", "E_kin", "E_int", "E_pot", "E_pot_self", "E_pot_ext", "E_radcool", "Entropy", "p_x", "p_y", "p_z", "ang_x", - "ang_y", "ang_z"); + "ang_y", "ang_z", "com_x", "com_y", "com_z"); fflush(e->file_stats); char timestepsfileName[200] = ""; diff --git a/src/engine.h b/src/engine.h index 90645b308529e49a370bf91f2b3d05195b2b08d8..2b5a4eee7cde974fc555460c6483d02b21bd6c04 100644 --- a/src/engine.h +++ b/src/engine.h @@ -261,6 +261,7 @@ void engine_unskip(struct engine *e); void engine_drift_all(struct engine *e); void engine_drift_top_multipoles(struct engine *e); void engine_reconstruct_multipoles(struct engine *e); +void engine_print_stats(struct engine *e); void engine_dump_snapshot(struct engine *e); void engine_init(struct engine *e, struct space *s, const struct swift_params *params, int nr_nodes, int nodeID, diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h index 9e06fec92f9667d21ce7dc196ba26f9870cb24f4..cce4fb03ee405fbdf2232184778a0f426e63014a 100644 --- a/src/hydro/Gadget2/hydro_iact.h +++ b/src/hydro/Gadget2/hydro_iact.h @@ -1167,4 +1167,365 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( #endif } +#ifdef WITH_VECTORIZATION +static const vector const_viscosity_alpha_fac = + FILL_VEC(-0.25f * const_viscosity_alpha); + +/** + * @brief Force interaction computed using 1 vector + * (non-symmetric vectorized version). + */ +__attribute__((always_inline)) INLINE static void +runner_iact_nonsym_1_vec_force( + vector *r2, vector *dx, vector *dy, vector *dz, vector vix, vector viy, + vector viz, vector pirho, vector grad_hi, vector piPOrho2, vector balsara_i, + vector ci, float *Vjx, float *Vjy, float *Vjz, float *Pjrho, float *Grad_hj, + float *PjPOrho2, float *Balsara_j, float *Cj, float *Mj, vector hi_inv, + vector hj_inv, vector *a_hydro_xSum, vector *a_hydro_ySum, + vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, + vector *entropy_dtSum, mask_t mask) { + +#ifdef WITH_VECTORIZATION + + vector r, ri; + vector vjx, vjy, vjz, dvx, dvy, dvz; + vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj; + vector xi, xj; + vector hid_inv, hjd_inv; + vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr; + vector piax, piay, piaz; + vector pih_dt; + vector v_sig; + vector omega_ij, mu_ij, fac_mu, balsara; + vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt; + + /* Fill vectors. */ + vjx.v = vec_load(Vjx); + vjy.v = vec_load(Vjy); + vjz.v = vec_load(Vjz); + mj.v = vec_load(Mj); + + pjrho.v = vec_load(Pjrho); + grad_hj.v = vec_load(Grad_hj); + pjPOrho2.v = vec_load(PjPOrho2); + balsara_j.v = vec_load(Balsara_j); + cj.v = vec_load(Cj); + + fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */ + + /* Load stuff. */ + balsara.v = vec_add(balsara_i.v, balsara_j.v); + + /* Get the radius and inverse radius. */ + ri = vec_reciprocal_sqrt(*r2); + r.v = vec_mul(r2->v, ri.v); + + /* Get the kernel for hi. */ + hid_inv = pow_dimension_plus_one_vec(hi_inv); + xi.v = vec_mul(r.v, hi_inv.v); + kernel_eval_dWdx_force_vec(&xi, &wi_dx); + wi_dr.v = vec_mul(hid_inv.v, wi_dx.v); + + /* Get the kernel for hj. */ + hjd_inv = pow_dimension_plus_one_vec(hj_inv); + xj.v = vec_mul(r.v, hj_inv.v); + + /* Calculate the kernel for two particles. */ + kernel_eval_dWdx_force_vec(&xj, &wj_dx); + + wj_dr.v = vec_mul(hjd_inv.v, wj_dx.v); + + /* Compute dv. */ + dvx.v = vec_sub(vix.v, vjx.v); + dvy.v = vec_sub(viy.v, vjy.v); + dvz.v = vec_sub(viz.v, vjz.v); + + /* Compute dv dot r. */ + dvdr.v = vec_fma(dvx.v, dx->v, vec_fma(dvy.v, dy->v, vec_mul(dvz.v, dz->v))); + + /* Compute the relative velocity. (This is 0 if the particles move away from + * each other and negative otherwise) */ + omega_ij.v = vec_fmin(dvdr.v, vec_setzero()); + mu_ij.v = + vec_mul(fac_mu.v, vec_mul(ri.v, omega_ij.v)); /* This is 0 or negative */ + + /* Compute signal velocity */ + v_sig.v = vec_fnma(vec_set1(3.f), mu_ij.v, vec_add(ci.v, cj.v)); + + /* Now construct the full viscosity term */ + rho_ij.v = vec_mul(vec_set1(0.5f), vec_add(pirho.v, pjrho.v)); + visc.v = vec_div(vec_mul(const_viscosity_alpha_fac.v, + vec_mul(v_sig.v, vec_mul(mu_ij.v, balsara.v))), + rho_ij.v); + + /* Now, convolve with the kernel */ + visc_term.v = + vec_mul(vec_set1(0.5f), + vec_mul(visc.v, vec_mul(vec_add(wi_dr.v, wj_dr.v), ri.v))); + + sph_term.v = + vec_mul(vec_fma(vec_mul(grad_hi.v, piPOrho2.v), wi_dr.v, + vec_mul(grad_hj.v, vec_mul(pjPOrho2.v, wj_dr.v))), + ri.v); + + /* Eventually get the acceleration */ + acc.v = vec_add(visc_term.v, sph_term.v); + + /* Use the force, Luke! */ + piax.v = vec_mul(mj.v, vec_mul(dx->v, acc.v)); + piay.v = vec_mul(mj.v, vec_mul(dy->v, acc.v)); + piaz.v = vec_mul(mj.v, vec_mul(dz->v, acc.v)); + + /* Get the time derivative for h. */ + pih_dt.v = + vec_div(vec_mul(mj.v, vec_mul(dvdr.v, vec_mul(ri.v, wi_dr.v))), pjrho.v); + + /* Change in entropy */ + entropy_dt.v = vec_mul(mj.v, vec_mul(visc_term.v, dvdr.v)); + + /* Store the forces back on the particles. */ + a_hydro_xSum->v = vec_mask_sub(a_hydro_xSum->v, piax.v, mask); + a_hydro_ySum->v = vec_mask_sub(a_hydro_ySum->v, piay.v, mask); + a_hydro_zSum->v = vec_mask_sub(a_hydro_zSum->v, piaz.v, mask); + h_dtSum->v = vec_mask_sub(h_dtSum->v, pih_dt.v, mask); + v_sigSum->v = vec_fmax(v_sigSum->v, vec_and_mask(v_sig.v, mask)); + entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt.v, mask); + +#else + + error( + "The Gadget2 serial version of runner_iact_nonsym_force was called when " + "the vectorised version should have been used."); + +#endif +} + +/** + * @brief Force interaction computed using 2 interleaved vectors + * (non-symmetric vectorized version). + */ +__attribute__((always_inline)) INLINE static void +runner_iact_nonsym_2_vec_force( + float *R2, float *Dx, float *Dy, float *Dz, vector vix, vector viy, + vector viz, vector pirho, vector grad_hi, vector piPOrho2, vector balsara_i, + vector ci, float *Vjx, float *Vjy, float *Vjz, float *Pjrho, float *Grad_hj, + float *PjPOrho2, float *Balsara_j, float *Cj, float *Mj, vector hi_inv, + float *Hj_inv, vector *a_hydro_xSum, vector *a_hydro_ySum, + vector *a_hydro_zSum, vector *h_dtSum, vector *v_sigSum, + vector *entropy_dtSum, mask_t mask, mask_t mask_2, short mask_cond) { + +#ifdef WITH_VECTORIZATION + + vector r, r2, ri; + vector dx, dy, dz, dvx, dvy, dvz; + vector vjx, vjy, vjz; + vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj, hj_inv; + vector ui, uj; + vector hid_inv, hjd_inv; + vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr; + vector piax, piay, piaz; + vector pih_dt; + vector v_sig; + vector omega_ij, mu_ij, fac_mu, balsara; + vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt; + + vector r_2, r2_2, ri_2; + vector dx_2, dy_2, dz_2, dvx_2, dvy_2, dvz_2; + vector vjx_2, vjy_2, vjz_2; + vector pjrho_2, grad_hj_2, pjPOrho2_2, balsara_j_2, cj_2, mj_2, hj_inv_2; + vector ui_2, uj_2; + vector hjd_inv_2; + vector wi_dx_2, wj_dx_2, wi_dr_2, wj_dr_2, dvdr_2; + vector piax_2, piay_2, piaz_2; + vector pih_dt_2; + vector v_sig_2; + vector omega_ij_2, mu_ij_2, balsara_2; + vector rho_ij_2, visc_2, visc_term_2, sph_term_2, acc_2, entropy_dt_2; + + /* Fill vectors. */ + mj.v = vec_load(Mj); + mj_2.v = vec_load(&Mj[VEC_SIZE]); + vjx.v = vec_load(Vjx); + vjx_2.v = vec_load(&Vjx[VEC_SIZE]); + vjy.v = vec_load(Vjy); + vjy_2.v = vec_load(&Vjy[VEC_SIZE]); + vjz.v = vec_load(Vjz); + vjz_2.v = vec_load(&Vjz[VEC_SIZE]); + dx.v = vec_load(Dx); + dx_2.v = vec_load(&Dx[VEC_SIZE]); + dy.v = vec_load(Dy); + dy_2.v = vec_load(&Dy[VEC_SIZE]); + dz.v = vec_load(Dz); + dz_2.v = vec_load(&Dz[VEC_SIZE]); + + /* Get the radius and inverse radius. */ + r2.v = vec_load(R2); + r2_2.v = vec_load(&R2[VEC_SIZE]); + ri = vec_reciprocal_sqrt(r2); + ri_2 = vec_reciprocal_sqrt(r2_2); + r.v = vec_mul(r2.v, ri.v); + r_2.v = vec_mul(r2_2.v, ri_2.v); + + /* Get remaining properties. */ + pjrho.v = vec_load(Pjrho); + pjrho_2.v = vec_load(&Pjrho[VEC_SIZE]); + grad_hj.v = vec_load(Grad_hj); + grad_hj_2.v = vec_load(&Grad_hj[VEC_SIZE]); + pjPOrho2.v = vec_load(PjPOrho2); + pjPOrho2_2.v = vec_load(&PjPOrho2[VEC_SIZE]); + balsara_j.v = vec_load(Balsara_j); + balsara_j_2.v = vec_load(&Balsara_j[VEC_SIZE]); + cj.v = vec_load(Cj); + cj_2.v = vec_load(&Cj[VEC_SIZE]); + hj_inv.v = vec_load(Hj_inv); + hj_inv_2.v = vec_load(&Hj_inv[VEC_SIZE]); + + fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */ + + /* Find the balsara switch. */ + balsara.v = vec_add(balsara_i.v, balsara_j.v); + balsara_2.v = vec_add(balsara_i.v, balsara_j_2.v); + + /* Get the kernel for hi. */ + hid_inv = pow_dimension_plus_one_vec(hi_inv); + ui.v = vec_mul(r.v, hi_inv.v); + ui_2.v = vec_mul(r_2.v, hi_inv.v); + kernel_eval_dWdx_force_vec(&ui, &wi_dx); + kernel_eval_dWdx_force_vec(&ui_2, &wi_dx_2); + wi_dr.v = vec_mul(hid_inv.v, wi_dx.v); + wi_dr_2.v = vec_mul(hid_inv.v, wi_dx_2.v); + + /* Get the kernel for hj. */ + hjd_inv = pow_dimension_plus_one_vec(hj_inv); + hjd_inv_2 = pow_dimension_plus_one_vec(hj_inv_2); + uj.v = vec_mul(r.v, hj_inv.v); + uj_2.v = vec_mul(r_2.v, hj_inv_2.v); + + /* Calculate the kernel for two particles. */ + kernel_eval_dWdx_force_vec(&uj, &wj_dx); + kernel_eval_dWdx_force_vec(&uj_2, &wj_dx_2); + + wj_dr.v = vec_mul(hjd_inv.v, wj_dx.v); + wj_dr_2.v = vec_mul(hjd_inv_2.v, wj_dx_2.v); + + /* Compute dv. */ + dvx.v = vec_sub(vix.v, vjx.v); + dvx_2.v = vec_sub(vix.v, vjx_2.v); + dvy.v = vec_sub(viy.v, vjy.v); + dvy_2.v = vec_sub(viy.v, vjy_2.v); + dvz.v = vec_sub(viz.v, vjz.v); + dvz_2.v = vec_sub(viz.v, vjz_2.v); + + /* Compute dv dot r. */ + dvdr.v = vec_fma(dvx.v, dx.v, vec_fma(dvy.v, dy.v, vec_mul(dvz.v, dz.v))); + dvdr_2.v = vec_fma(dvx_2.v, dx_2.v, + vec_fma(dvy_2.v, dy_2.v, vec_mul(dvz_2.v, dz_2.v))); + + /* Compute the relative velocity. (This is 0 if the particles move away from + * each other and negative otherwise) */ + omega_ij.v = vec_fmin(dvdr.v, vec_setzero()); + omega_ij_2.v = vec_fmin(dvdr_2.v, vec_setzero()); + mu_ij.v = + vec_mul(fac_mu.v, vec_mul(ri.v, omega_ij.v)); /* This is 0 or negative */ + mu_ij_2.v = vec_mul( + fac_mu.v, vec_mul(ri_2.v, omega_ij_2.v)); /* This is 0 or negative */ + + /* Compute signal velocity */ + v_sig.v = vec_fnma(vec_set1(3.f), mu_ij.v, vec_add(ci.v, cj.v)); + v_sig_2.v = vec_fnma(vec_set1(3.f), mu_ij_2.v, vec_add(ci.v, cj_2.v)); + + /* Now construct the full viscosity term */ + rho_ij.v = vec_mul(vec_set1(0.5f), vec_add(pirho.v, pjrho.v)); + rho_ij_2.v = vec_mul(vec_set1(0.5f), vec_add(pirho.v, pjrho_2.v)); + + visc.v = vec_div(vec_mul(const_viscosity_alpha_fac.v, + vec_mul(v_sig.v, vec_mul(mu_ij.v, balsara.v))), + rho_ij.v); + visc_2.v = + vec_div(vec_mul(const_viscosity_alpha_fac.v, + vec_mul(v_sig_2.v, vec_mul(mu_ij_2.v, balsara_2.v))), + rho_ij_2.v); + + /* Now, convolve with the kernel */ + visc_term.v = + vec_mul(vec_set1(0.5f), + vec_mul(visc.v, vec_mul(vec_add(wi_dr.v, wj_dr.v), ri.v))); + visc_term_2.v = vec_mul( + vec_set1(0.5f), + vec_mul(visc_2.v, vec_mul(vec_add(wi_dr_2.v, wj_dr_2.v), ri_2.v))); + + vector grad_hi_mul_piPOrho2; + grad_hi_mul_piPOrho2.v = vec_mul(grad_hi.v, piPOrho2.v); + + sph_term.v = + vec_mul(vec_fma(grad_hi_mul_piPOrho2.v, wi_dr.v, + vec_mul(grad_hj.v, vec_mul(pjPOrho2.v, wj_dr.v))), + ri.v); + sph_term_2.v = + vec_mul(vec_fma(grad_hi_mul_piPOrho2.v, wi_dr_2.v, + vec_mul(grad_hj_2.v, vec_mul(pjPOrho2_2.v, wj_dr_2.v))), + ri_2.v); + + /* Eventually get the acceleration */ + acc.v = vec_add(visc_term.v, sph_term.v); + acc_2.v = vec_add(visc_term_2.v, sph_term_2.v); + + /* Use the force, Luke! */ + piax.v = vec_mul(mj.v, vec_mul(dx.v, acc.v)); + piax_2.v = vec_mul(mj_2.v, vec_mul(dx_2.v, acc_2.v)); + piay.v = vec_mul(mj.v, vec_mul(dy.v, acc.v)); + piay_2.v = vec_mul(mj_2.v, vec_mul(dy_2.v, acc_2.v)); + piaz.v = vec_mul(mj.v, vec_mul(dz.v, acc.v)); + piaz_2.v = vec_mul(mj_2.v, vec_mul(dz_2.v, acc_2.v)); + + /* Get the time derivative for h. */ + pih_dt.v = + vec_div(vec_mul(mj.v, vec_mul(dvdr.v, vec_mul(ri.v, wi_dr.v))), pjrho.v); + pih_dt_2.v = + vec_div(vec_mul(mj_2.v, vec_mul(dvdr_2.v, vec_mul(ri_2.v, wi_dr_2.v))), + pjrho_2.v); + + /* Change in entropy */ + entropy_dt.v = vec_mul(mj.v, vec_mul(visc_term.v, dvdr.v)); + entropy_dt_2.v = vec_mul(mj_2.v, vec_mul(visc_term_2.v, dvdr_2.v)); + + /* Store the forces back on the particles. */ + if (mask_cond) { + a_hydro_xSum->v = vec_mask_sub(a_hydro_xSum->v, piax.v, mask); + a_hydro_xSum->v = vec_mask_sub(a_hydro_xSum->v, piax_2.v, mask_2); + a_hydro_ySum->v = vec_mask_sub(a_hydro_ySum->v, piay.v, mask); + a_hydro_ySum->v = vec_mask_sub(a_hydro_ySum->v, piay_2.v, mask_2); + a_hydro_zSum->v = vec_mask_sub(a_hydro_zSum->v, piaz.v, mask); + a_hydro_zSum->v = vec_mask_sub(a_hydro_zSum->v, piaz_2.v, mask_2); + h_dtSum->v = vec_mask_sub(h_dtSum->v, pih_dt.v, mask); + h_dtSum->v = vec_mask_sub(h_dtSum->v, pih_dt_2.v, mask_2); + v_sigSum->v = vec_fmax(v_sigSum->v, vec_and_mask(v_sig.v, mask)); + v_sigSum->v = vec_fmax(v_sigSum->v, vec_and_mask(v_sig_2.v, mask_2)); + entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt.v, mask); + entropy_dtSum->v = vec_mask_add(entropy_dtSum->v, entropy_dt_2.v, mask_2); + } else { + a_hydro_xSum->v = vec_sub(a_hydro_xSum->v, piax.v); + a_hydro_xSum->v = vec_sub(a_hydro_xSum->v, piax_2.v); + a_hydro_ySum->v = vec_sub(a_hydro_ySum->v, piay.v); + a_hydro_ySum->v = vec_sub(a_hydro_ySum->v, piay_2.v); + a_hydro_zSum->v = vec_sub(a_hydro_zSum->v, piaz.v); + a_hydro_zSum->v = vec_sub(a_hydro_zSum->v, piaz_2.v); + h_dtSum->v = vec_sub(h_dtSum->v, pih_dt.v); + h_dtSum->v = vec_sub(h_dtSum->v, pih_dt_2.v); + v_sigSum->v = vec_fmax(v_sigSum->v, v_sig.v); + v_sigSum->v = vec_fmax(v_sigSum->v, v_sig_2.v); + entropy_dtSum->v = vec_add(entropy_dtSum->v, entropy_dt.v); + entropy_dtSum->v = vec_add(entropy_dtSum->v, entropy_dt_2.v); + } +#else + + error( + "The Gadget2 serial version of runner_iact_nonsym_force was called when " + "the vectorised version should have been used."); + +#endif +} + +#endif + #endif /* SWIFT_GADGET2_HYDRO_IACT_H */ diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h index 5355c15f8f1a0d0c3d811c7039da04caf0522cc9..d3c905e50ea023ea63629505fd3566cbed3ed9b0 100644 --- a/src/kernel_hydro.h +++ b/src/kernel_hydro.h @@ -438,9 +438,7 @@ static const vector cond = FILL_VEC(0.5f); /** * @brief Computes the kernel function and its derivative for two particles - * using vectors. - * - * Return 0 if $u > \\gamma = H/h$ + * using vectors. The return value is undefined if $u > \\gamma = H/h$. * * @param u The ratio of the distance to the smoothing length $u = x/h$. * @param w (return) The value of the kernel function $W(x,h)$. @@ -518,9 +516,8 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec( /** * @brief Computes the kernel function and its derivative for two particles - * using interleaved vectors. - * - * Return 0 if $u > \\gamma = H/h$ + * using interleaved vectors. The return value is undefined if $u > \\gamma = + * H/h$. * * @param u The ratio of the distance to the smoothing length $u = x/h$. * @param w (return) The value of the kernel function $W(x,h)$. @@ -644,9 +641,7 @@ __attribute__((always_inline)) INLINE static void kernel_deval_2_vec( /** * @brief Computes the kernel function for two particles - * using vectors. - * - * Return 0 if $u > \\gamma = H/h$ + * using vectors. The return value is undefined if $u > \\gamma = H/h$. * * @param u The ratio of the distance to the smoothing length $u = x/h$. * @param w (return) The value of the kernel function $W(x,h)$. @@ -704,6 +699,65 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u, vec_mul(w->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_vec.v)); } +/** + * @brief Computes the kernel function derivative for two particles + * using vectors. The return value is undefined if $u > \\gamma = H/h$. + * + * @param u The ratio of the distance to the smoothing length $u = x/h$. + * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$. + */ +__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec( + vector *u, vector *dw_dx) { + + /* Go to the range [0,1[ from [0,H[ */ + vector x; + x.v = vec_mul(u->v, kernel_gamma_inv_vec.v); + +#ifdef WENDLAND_C2_KERNEL + /* Init the iteration for Horner's scheme. */ + dw_dx->v = vec_fma(wendland_dwdx_const_c0.v, x.v, wendland_dwdx_const_c1.v); + + /* Calculate the polynomial interleaving vector operations */ + dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c2.v); + + dw_dx->v = vec_fma(dw_dx->v, x.v, wendland_dwdx_const_c3.v); + + dw_dx->v = vec_mul(dw_dx->v, x.v); + +#elif defined(CUBIC_SPLINE_KERNEL) + vector dw_dx2; + mask_t mask_reg1, mask_reg2; + + /* Form a mask for each part of the kernel. */ + vec_create_mask(mask_reg1, vec_cmp_lt(x.v, cond.v)); /* 0 < x < 0.5 */ + vec_create_mask(mask_reg2, vec_cmp_gte(x.v, cond.v)); /* 0.5 < x < 1 */ + + /* Work out w for both regions of the kernel and combine the results together + * using masks. */ + + /* Init the iteration for Horner's scheme. */ + dw_dx->v = vec_fma(cubic_1_dwdx_const_c0.v, x.v, cubic_1_dwdx_const_c1.v); + dw_dx2.v = vec_fma(cubic_2_dwdx_const_c0.v, x.v, cubic_2_dwdx_const_c1.v); + + /* Calculate the polynomial interleaving vector operations. */ + dw_dx->v = vec_mul(dw_dx->v, x.v); /* cubic_1_dwdx_const_c2 is zero. */ + dw_dx2.v = vec_fma(dw_dx2.v, x.v, cubic_2_dwdx_const_c2.v); + + /* Mask out unneeded values. */ + dw_dx->v = vec_and_mask(dw_dx->v, mask_reg1); + dw_dx2.v = vec_and_mask(dw_dx2.v, mask_reg2); + + /* Added both dwdx and dwdx2 together to form complete result. */ + dw_dx->v = vec_add(dw_dx->v, dw_dx2.v); +#else +#error "Vectorisation not supported for this kernel!!!" +#endif + + /* Return everything */ + dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, + kernel_gamma_inv_dim_plus_one_vec.v)); +} + /** * @brief Computes the kernel function derivative for two particles * using vectors. @@ -713,7 +767,7 @@ __attribute__((always_inline)) INLINE static void kernel_eval_W_vec(vector *u, * @param u The ratio of the distance to the smoothing length $u = x/h$. * @param dw_dx (return) The norm of the gradient of $|\\nabla W(x,h)|$. */ -__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_vec( +__attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_vec( vector *u, vector *dw_dx) { /* Go to the range [0,1[ from [0,H[ */ @@ -855,6 +909,9 @@ __attribute__((always_inline)) INLINE static void kernel_eval_dWdx_force_2_vec( /* Return everything */ dw_dx->v = vec_mul(dw_dx->v, vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v)); + dw_dx_2->v = vec_mul( + dw_dx_2->v, + vec_mul(kernel_constant_vec.v, kernel_gamma_inv_dim_plus_one_vec.v)); } #endif /* WITH_VECTORIZATION */ diff --git a/src/runner.c b/src/runner.c index bb5f2685555504c24a40a32e5cbf635a7d4a7f1f..69c1512479e07da0aacad0a9e28bcaa6aafce104 100644 --- a/src/runner.c +++ b/src/runner.c @@ -1864,9 +1864,13 @@ void *runner_main(void *data) { else if (t->subtype == task_subtype_gradient) runner_doself1_gradient(r, ci); #endif - else if (t->subtype == task_subtype_force) + else if (t->subtype == task_subtype_force) { +#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) + runner_doself2_force_vec(r, ci); +#else runner_doself2_force(r, ci); - else if (t->subtype == task_subtype_grav) +#endif + } else if (t->subtype == task_subtype_grav) runner_doself_grav(r, ci, 1); else if (t->subtype == task_subtype_external_grav) runner_do_grav_external(r, ci, 1); diff --git a/src/runner_doiact.h b/src/runner_doiact.h index 0dd6b2cdfd430e471fc13087afb0b2fcde947b97..14eecaa2c0c126b721a400e63298e1d55a9c98da 100644 --- a/src/runner_doiact.h +++ b/src/runner_doiact.h @@ -2664,9 +2664,14 @@ void DOSUB_SELF2(struct runner *r, struct cell *ci, int gettimer) { } /* Otherwise, compute self-interaction. */ - else + else { +#if (DOSELF2 == runner_doself2_force) && defined(WITH_VECTORIZATION) && \ + defined(GADGET2_SPH) + runner_doself2_force_vec(r, ci); +#else DOSELF2(r, ci); - +#endif + } if (gettimer) TIMER_TOC(TIMER_DOSUB_SELF); } diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index 1557c1ec84863c3518f8671cdf0e9ca44732e94e..2f0b311218c86a60c5c0853c555cb48479249bce 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -27,6 +27,8 @@ #include "active.h" #ifdef WITH_VECTORIZATION +static const vector kernel_gamma2_vec = FILL_VEC(kernel_gamma2); + /** * @brief Compute the vector remainder interactions from the secondary cache. * @@ -241,14 +243,12 @@ __attribute__((always_inline)) INLINE static void storeInteractions( * @param di_max Maximal position on the axis that can interact in cell ci * @param dj_min Minimal position on the axis that can interact in cell ci * @param max_index_i array to hold the maximum distances of pi particles into - * cell - * cj + * #cell cj * @param max_index_j array to hold the maximum distances of pj particles into - * cell - * cj + * #cell cj * @param init_pi first pi to interact with a pj particle * @param init_pj last pj to interact with a pi particle - * @param e The #engine. + * @param max_active_bin The largest time-bin active during this step. */ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache( const struct cell *ci, const struct cell *cj, @@ -256,7 +256,7 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache( const float dx_max, const float rshift, const double hi_max, const double hj_max, const double di_max, const double dj_min, int *max_index_i, int *max_index_j, int *init_pi, int *init_pj, - const struct engine *e) { + const timebin_t max_active_bin) { const struct part *restrict parts_i = ci->parts; const struct part *restrict parts_j = cj->parts; @@ -271,7 +271,8 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache( while (first_pi > 0 && sort_i[first_pi - 1].d + dx_max + hi_max > dj_min) { first_pi--; /* Store the index of the particle if it is active. */ - if (part_is_active(&parts_i[sort_i[first_pi].i], e)) active_id = first_pi; + if (part_is_active_no_debug(&parts_i[sort_i[first_pi].i], max_active_bin)) + active_id = first_pi; } /* Set the first active pi in range of any particle in cell j. */ @@ -318,7 +319,8 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache( sort_j[last_pj + 1].d - hj_max - dx_max < di_max) { last_pj++; /* Store the index of the particle if it is active. */ - if (part_is_active(&parts_j[sort_j[last_pj].i], e)) active_id = last_pj; + if (part_is_active_no_debug(&parts_j[sort_j[last_pj].i], max_active_bin)) + active_id = last_pj; } /* Set the last active pj in range of any particle in cell i. */ @@ -381,6 +383,8 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( struct part *restrict parts = c->parts; const int count = c->count; + const timebin_t max_active_bin = e->max_active_bin; + vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2; TIMER_TIC @@ -411,7 +415,7 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( pi = &parts[pid]; /* Is the ith particle active? */ - if (!part_is_active(pi, e)) continue; + if (!part_is_active_no_debug(pi, max_active_bin)) continue; vector pix, piy, piz; @@ -481,22 +485,22 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]); /* Compute the pairwise distance. */ - vector v_dx_tmp, v_dy_tmp, v_dz_tmp; - vector v_dx_tmp2, v_dy_tmp2, v_dz_tmp2, v_r2_2; - - v_dx_tmp.v = vec_sub(pix.v, pjx.v); - v_dx_tmp2.v = vec_sub(pix.v, pjx2.v); - v_dy_tmp.v = vec_sub(piy.v, pjy.v); - v_dy_tmp2.v = vec_sub(piy.v, pjy2.v); - v_dz_tmp.v = vec_sub(piz.v, pjz.v); - v_dz_tmp2.v = vec_sub(piz.v, pjz2.v); - - v_r2.v = vec_mul(v_dx_tmp.v, v_dx_tmp.v); - v_r2_2.v = vec_mul(v_dx_tmp2.v, v_dx_tmp2.v); - v_r2.v = vec_fma(v_dy_tmp.v, v_dy_tmp.v, v_r2.v); - v_r2_2.v = vec_fma(v_dy_tmp2.v, v_dy_tmp2.v, v_r2_2.v); - v_r2.v = vec_fma(v_dz_tmp.v, v_dz_tmp.v, v_r2.v); - v_r2_2.v = vec_fma(v_dz_tmp2.v, v_dz_tmp2.v, v_r2_2.v); + vector v_dx, v_dy, v_dz; + vector v_dx_2, v_dy_2, v_dz_2, v_r2_2; + + v_dx.v = vec_sub(pix.v, pjx.v); + v_dx_2.v = vec_sub(pix.v, pjx2.v); + v_dy.v = vec_sub(piy.v, pjy.v); + v_dy_2.v = vec_sub(piy.v, pjy2.v); + v_dz.v = vec_sub(piz.v, pjz.v); + v_dz_2.v = vec_sub(piz.v, pjz2.v); + + v_r2.v = vec_mul(v_dx.v, v_dx.v); + v_r2_2.v = vec_mul(v_dx_2.v, v_dx_2.v); + v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); + v_r2_2.v = vec_fma(v_dy_2.v, v_dy_2.v, v_r2_2.v); + v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); + v_r2_2.v = vec_fma(v_dz_2.v, v_dz_2.v, v_r2_2.v); /* Form a mask from r2 < hig2 and r2 > 0.*/ mask_t v_doi_mask, v_doi_mask_self_check, v_doi_mask2, @@ -526,19 +530,18 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( /* If there are any interactions left pack interaction values into c2 * cache. */ if (doi_mask) { - storeInteractions(doi_mask, pjd, &v_r2, &v_dx_tmp, &v_dy_tmp, &v_dz_tmp, - cell_cache, &int_cache, &icount, &rhoSum, &rho_dhSum, - &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, - &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, - v_viz); - } - if (doi_mask2) { - storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_tmp2, - &v_dy_tmp2, &v_dz_tmp2, cell_cache, &int_cache, - &icount, &rhoSum, &rho_dhSum, &wcountSum, + storeInteractions(doi_mask, pjd, &v_r2, &v_dx, &v_dy, &v_dz, cell_cache, + &int_cache, &icount, &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); } + if (doi_mask2) { + storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_2, &v_dy_2, + &v_dz_2, cell_cache, &int_cache, &icount, &rhoSum, + &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, + &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix, + v_viy, v_viz); + } } /* Perform padded vector remainder interactions if any are present. */ @@ -583,6 +586,202 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( #endif /* WITH_VECTORIZATION */ } +/** + * @brief Compute the force cell self-interaction (non-symmetric) using vector + * intrinsics with one particle pi at a time. + * + * @param r The #runner. + * @param c The #cell. + */ +__attribute__((always_inline)) INLINE void runner_doself2_force_vec( + struct runner *r, struct cell *restrict c) { + +#ifdef WITH_VECTORIZATION + const struct engine *e = r->e; + struct part *restrict pi; + int count_align; + const int num_vec_proc = 1; + + const timebin_t max_active_bin = e->max_active_bin; + + struct part *restrict parts = c->parts; + const int count = c->count; + + vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2; + vector v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci; + + TIMER_TIC; + + if (!cell_is_active(c, e)) return; + + if (!cell_are_part_drifted(c, e)) error("Interacting undrifted cell."); + + /* Get the particle cache from the runner and re-allocate + * the cache if it is not big enough for the cell. */ + struct cache *restrict cell_cache = &r->ci_cache; + + if (cell_cache->count < count) { + cache_init(cell_cache, count); + } + + /* Read the particles from the cell and store them locally in the cache. */ + cache_read_force_particles(c, cell_cache); + +#ifdef SWIFT_DEBUG_CHECKS + for (int i = 0; i < count; i++) { + pi = &c->parts[i]; + /* Check that particles have been drifted to the current time */ + if (pi->ti_drift != e->ti_current) + error("Particle pi not drifted to current time"); + } +#endif + + /* Loop over the particles in the cell. */ + for (int pid = 0; pid < count; pid++) { + + /* Get a pointer to the ith particle. */ + pi = &parts[pid]; + + /* Is the ith particle active? */ + if (!part_is_active_no_debug(pi, max_active_bin)) continue; + + vector pix, piy, piz; + + const float hi = cell_cache->h[pid]; + + /* Fill particle pi vectors. */ + pix.v = vec_set1(cell_cache->x[pid]); + piy.v = vec_set1(cell_cache->y[pid]); + piz.v = vec_set1(cell_cache->z[pid]); + v_hi.v = vec_set1(hi); + v_vix.v = vec_set1(cell_cache->vx[pid]); + v_viy.v = vec_set1(cell_cache->vy[pid]); + v_viz.v = vec_set1(cell_cache->vz[pid]); + + v_rhoi.v = vec_set1(cell_cache->rho[pid]); + v_grad_hi.v = vec_set1(cell_cache->grad_h[pid]); + v_pOrhoi2.v = vec_set1(cell_cache->pOrho2[pid]); + v_balsara_i.v = vec_set1(cell_cache->balsara[pid]); + v_ci.v = vec_set1(cell_cache->soundspeed[pid]); + + const float hig2 = hi * hi * kernel_gamma2; + v_hig2.v = vec_set1(hig2); + + /* Reset cumulative sums of update vectors. */ + vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, + entropy_dtSum; + + /* Get the inverse of hi. */ + vector v_hi_inv; + + v_hi_inv = vec_reciprocal(v_hi); + + a_hydro_xSum.v = vec_setzero(); + a_hydro_ySum.v = vec_setzero(); + a_hydro_zSum.v = vec_setzero(); + h_dtSum.v = vec_setzero(); + v_sigSum.v = vec_set1(pi->force.v_sig); + entropy_dtSum.v = vec_setzero(); + + /* Pad cache if there is a serial remainder. */ + count_align = count; + int rem = count % (num_vec_proc * VEC_SIZE); + if (rem != 0) { + int pad = (num_vec_proc * VEC_SIZE) - rem; + + count_align += pad; + + /* Set positions to the same as particle pi so when the r2 > 0 mask is + * applied these extra contributions are masked out.*/ + for (int i = count; i < count_align; i++) { + cell_cache->x[i] = pix.f[0]; + cell_cache->y[i] = piy.f[0]; + cell_cache->z[i] = piz.f[0]; + cell_cache->h[i] = 1.f; + cell_cache->rho[i] = 1.f; + cell_cache->grad_h[i] = 1.f; + cell_cache->pOrho2[i] = 1.f; + cell_cache->balsara[i] = 1.f; + cell_cache->soundspeed[i] = 1.f; + } + } + + vector pjx, pjy, pjz, hj, hjg2; + + /* Find all of particle pi's interacions and store needed values in the + * secondary cache.*/ + for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) { + + /* Load 1 set of vectors from the particle cache. */ + pjx.v = vec_load(&cell_cache->x[pjd]); + pjy.v = vec_load(&cell_cache->y[pjd]); + pjz.v = vec_load(&cell_cache->z[pjd]); + hj.v = vec_load(&cell_cache->h[pjd]); + hjg2.v = vec_mul(vec_mul(hj.v, hj.v), kernel_gamma2_vec.v); + + /* Compute the pairwise distance. */ + vector v_dx, v_dy, v_dz; + + v_dx.v = vec_sub(pix.v, pjx.v); + v_dy.v = vec_sub(piy.v, pjy.v); + v_dz.v = vec_sub(piz.v, pjz.v); + + v_r2.v = vec_mul(v_dx.v, v_dx.v); + v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); + v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); + + /* Form r2 > 0 mask, r2 < hig2 mask and r2 < hjg2 mask. */ + mask_t v_doi_mask, v_doi_mask_self_check; + int doi_mask; + + /* Form r2 > 0 mask.*/ + vec_create_mask(v_doi_mask_self_check, vec_cmp_gt(v_r2.v, vec_setzero())); + + /* Form a mask from r2 < hig2 mask and r2 < hjg2 mask. */ + vector v_h2; + v_h2.v = vec_fmax(v_hig2.v, hjg2.v); + vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_h2.v)); + + /* Combine all 3 masks and form integer mask. */ + vec_combine_masks(v_doi_mask, v_doi_mask_self_check); + doi_mask = vec_form_int_mask(v_doi_mask); + + /* If there are any interactions perform them. */ + if (doi_mask) { + vector v_hj_inv; + v_hj_inv = vec_reciprocal(hj); + + /* To stop floating point exceptions for when particle separations are + * 0. + */ + v_r2.v = vec_add(v_r2.v, vec_set1(FLT_MIN)); + + runner_iact_nonsym_1_vec_force( + &v_r2, &v_dx, &v_dy, &v_dz, v_vix, v_viy, v_viz, v_rhoi, v_grad_hi, + v_pOrhoi2, v_balsara_i, v_ci, &cell_cache->vx[pjd], + &cell_cache->vy[pjd], &cell_cache->vz[pjd], &cell_cache->rho[pjd], + &cell_cache->grad_h[pjd], &cell_cache->pOrho2[pjd], + &cell_cache->balsara[pjd], &cell_cache->soundspeed[pjd], + &cell_cache->m[pjd], v_hi_inv, v_hj_inv, &a_hydro_xSum, + &a_hydro_ySum, &a_hydro_zSum, &h_dtSum, &v_sigSum, &entropy_dtSum, + v_doi_mask); + } + + } /* Loop over all other particles. */ + + VEC_HADD(a_hydro_xSum, pi->a_hydro[0]); + VEC_HADD(a_hydro_ySum, pi->a_hydro[1]); + VEC_HADD(a_hydro_zSum, pi->a_hydro[2]); + VEC_HADD(h_dtSum, pi->force.h_dt); + VEC_HMAX(v_sigSum, pi->force.v_sig); + VEC_HADD(entropy_dtSum, pi->entropy_dt); + + } /* loop over all particles. */ + + TIMER_TOC(timer_doself_force); +#endif /* WITH_VECTORIZATION */ +} + /** * @brief Compute the density interactions between a cell pair (non-symmetric) * using vector intrinsics. @@ -599,6 +798,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, #ifdef WITH_VECTORIZATION const struct engine *restrict e = r->e; + const timebin_t max_active_bin = e->max_active_bin; vector v_hi, v_vix, v_viy, v_viz, v_hig2; @@ -661,7 +861,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, for (int pid = count_i - 1; pid >= 0 && sort_i[pid].d + hi_max + dx_max > dj_min; pid--) { struct part *restrict pi = &parts_i[sort_i[pid].i]; - if (part_is_active(pi, e)) { + if (part_is_active_no_debug(pi, max_active_bin)) { numActive++; break; } @@ -671,7 +871,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, for (int pjd = 0; pjd < count_j && sort_j[pjd].d - hj_max - dx_max < di_max; pjd++) { struct part *restrict pj = &parts_j[sort_j[pjd].i]; - if (part_is_active(pj, e)) { + if (part_is_active_no_debug(pj, max_active_bin)) { numActive++; break; } @@ -700,12 +900,12 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, max_index_j = r->cj_cache.max_index; /* Find particles maximum index into cj, max_index_i[] and ci, max_index_j[]. - */ + */ /* Also find the first pi that interacts with any particle in cj and the last * pj that interacts with any particle in ci. */ populate_max_index_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, hi_max, hj_max, di_max, dj_min, max_index_i, max_index_j, - &first_pi, &last_pj, e); + &first_pi, &last_pj, max_active_bin); /* Limits of the outer loops. */ int first_pi_loop = first_pi; @@ -733,7 +933,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, /* Get a hold of the ith part in ci. */ struct part *restrict pi = &parts_i[sort_i[pid].i]; - if (!part_is_active(pi, e)) continue; + if (!part_is_active_no_debug(pi, max_active_bin)) continue; /* Set the cache index. */ int ci_cache_idx = pid - first_pi_align; @@ -801,8 +1001,10 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, vector v_dx, v_dy, v_dz, v_r2; #ifdef SWIFT_DEBUG_CHECKS - if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0) { - error("Unaligned read!!! cj_cache_idx=%d", cj_cache_idx); + if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0 || + cj_cache_idx + (VEC_SIZE - 1) > (last_pj_align + 1 + VEC_SIZE)) { + error("Unaligned read!!! cj_cache_idx=%d, last_pj_align=%d", + cj_cache_idx, last_pj_align); } #endif @@ -861,7 +1063,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, /* Get a hold of the jth part in cj. */ struct part *restrict pj = &parts_j[sort_j[pjd].i]; - if (!part_is_active(pj, e)) continue; + if (!part_is_active_no_debug(pj, max_active_bin)) continue; /* Set the cache index. */ int cj_cache_idx = pjd; @@ -927,8 +1129,13 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, ci_cache_idx < ci_cache_count; ci_cache_idx += VEC_SIZE) { #ifdef SWIFT_DEBUG_CHECKS - if (ci_cache_idx % VEC_SIZE != 0 || ci_cache_idx < 0) { - error("Unaligned read!!! ci_cache_idx=%d", ci_cache_idx); + if (ci_cache_idx % VEC_SIZE != 0 || ci_cache_idx < 0 || + ci_cache_idx + (VEC_SIZE - 1) > + (count_i - first_pi_align + VEC_SIZE)) { + error( + "Unaligned read!!! ci_cache_idx=%d, first_pi_align=%d, " + "count_i=%d", + ci_cache_idx, first_pi_align, count_i); } #endif @@ -969,7 +1176,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, } /* loop over the parts in ci. */ /* Perform horizontal adds on vector sums and store result in particle pj. - */ + */ VEC_HADD(rhoSum, pj->rho); VEC_HADD(rho_dhSum, pj->density.rho_dh); VEC_HADD(wcountSum, pj->density.wcount); diff --git a/src/runner_doiact_vec.h b/src/runner_doiact_vec.h index 13c88cebaa47c5051b4119033a055e49a9a1079c..09dc76ef04df5d29ea32f4af24efdc09e433aa73 100644 --- a/src/runner_doiact_vec.h +++ b/src/runner_doiact_vec.h @@ -35,6 +35,7 @@ /* Function prototypes. */ void runner_doself1_density_vec(struct runner *r, struct cell *restrict c); +void runner_doself2_force_vec(struct runner *r, struct cell *restrict c); void runner_dopair1_density_vec(struct runner *r, struct cell *restrict ci, struct cell *restrict cj, const int sid, const double *shift); diff --git a/src/statistics.c b/src/statistics.c index 5a3f1ff4f9a2232a14817e7e55fd2cff5bdcd80e..b06f087c70877fa5dda451300fcb4bf665b011be 100644 --- a/src/statistics.c +++ b/src/statistics.c @@ -74,6 +74,9 @@ void stats_add(struct statistics *a, const struct statistics *b) { a->ang_mom[0] += b->ang_mom[0]; a->ang_mom[1] += b->ang_mom[1]; a->ang_mom[2] += b->ang_mom[2]; + a->centre_of_mass[0] += b->centre_of_mass[0]; + a->centre_of_mass[1] += b->centre_of_mass[1]; + a->centre_of_mass[2] += b->centre_of_mass[2]; } /** @@ -129,8 +132,8 @@ void stats_collect_part_mapper(void *map_data, int nr_parts, void *extra_data) { /* Get useful time variables */ const integertime_t ti_begin = get_integer_time_begin(ti_current, p->time_bin); - const integertime_t ti_step = get_integer_timestep(p->time_bin); - const float dt = (ti_current - (ti_begin + ti_step / 2)) * timeBase; + const integertime_t ti_end = get_integer_time_end(ti_current, p->time_bin); + const float dt = (ti_current - ((ti_begin + ti_end) / 2)) * timeBase; /* Get the total acceleration */ float a_tot[3] = {p->a_hydro[0], p->a_hydro[1], p->a_hydro[2]}; @@ -147,10 +150,17 @@ void stats_collect_part_mapper(void *map_data, int nr_parts, void *extra_data) { const float m = hydro_get_mass(p); const double x[3] = {p->x[0], p->x[1], p->x[2]}; + const float rho = hydro_get_density(p); + const float u_int = hydro_get_internal_energy(p); /* Collect mass */ stats.mass += m; + /* Collect centre of mass */ + stats.centre_of_mass[0] += m * x[0]; + stats.centre_of_mass[1] += m * x[1]; + stats.centre_of_mass[2] += m * x[2]; + /* Collect momentum */ stats.mom[0] += m * v[0]; stats.mom[1] += m * v[1]; @@ -163,7 +173,7 @@ void stats_collect_part_mapper(void *map_data, int nr_parts, void *extra_data) { /* Collect energies. */ stats.E_kin += 0.5f * m * (v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); - stats.E_int += m * hydro_get_internal_energy(p); + stats.E_int += m * u_int; stats.E_rad += cooling_get_radiated_energy(xp); if (gp != NULL) { stats.E_pot_self += m * gravity_get_potential(gp); @@ -172,7 +182,7 @@ void stats_collect_part_mapper(void *map_data, int nr_parts, void *extra_data) { } /* Collect entropy */ - stats.entropy += m * hydro_get_entropy(p); + stats.entropy += m * gas_entropy_from_internal_energy(rho, u_int); } /* Now write back to memory */ @@ -219,8 +229,8 @@ void stats_collect_gpart_mapper(void *map_data, int nr_gparts, /* Get useful variables */ const integertime_t ti_begin = get_integer_time_begin(ti_current, gp->time_bin); - const integertime_t ti_step = get_integer_timestep(gp->time_bin); - const float dt = (ti_current - (ti_begin + ti_step / 2)) * timeBase; + const integertime_t ti_end = get_integer_time_end(ti_current, gp->time_bin); + const float dt = (ti_current - ((ti_begin + ti_end) / 2)) * timeBase; /* Extrapolate velocities */ const float v[3] = {gp->v_full[0] + gp->a_grav[0] * dt, @@ -233,6 +243,11 @@ void stats_collect_gpart_mapper(void *map_data, int nr_gparts, /* Collect mass */ stats.mass += m; + /* Collect centre of mass */ + stats.centre_of_mass[0] += m * x[0]; + stats.centre_of_mass[1] += m * x[1]; + stats.centre_of_mass[2] += m * x[2]; + /* Collect momentum */ stats.mom[0] += m * v[0]; stats.mom[1] += m * v[1]; @@ -279,6 +294,18 @@ void stats_collect(const struct space *s, struct statistics *stats) { s->nr_gparts, sizeof(struct gpart), 0, &extra_data); } +/** + * @brief Apply final opetations on the #stats. + * + * @param stats The #stats to work on. + */ +void stats_finalize(struct statistics *stats) { + + stats->centre_of_mass[0] /= stats->mass; + stats->centre_of_mass[1] /= stats->mass; + stats->centre_of_mass[2] /= stats->mass; +} + /** * @brief Prints the content of a #statistics aggregator to a file * @@ -294,11 +321,12 @@ void stats_print_to_file(FILE *file, const struct statistics *stats, fprintf(file, " %14e %14e %14e %14e %14e %14e %14e %14e %14e %14e %14e %14e %14e " - "%14e %14e %14e\n", + "%14e %14e %14e %14e %14e %14e\n", time, stats->mass, E_tot, stats->E_kin, stats->E_int, E_pot, stats->E_pot_self, stats->E_pot_ext, stats->E_rad, stats->entropy, stats->mom[0], stats->mom[1], stats->mom[2], stats->ang_mom[0], - stats->ang_mom[1], stats->ang_mom[2]); + stats->ang_mom[1], stats->ang_mom[2], stats->centre_of_mass[0], + stats->centre_of_mass[1], stats->centre_of_mass[2]); fflush(file); } diff --git a/src/statistics.h b/src/statistics.h index b007d1314c458254986862f99b7cb1669d995545..e8cddda1e855d156070b8a000cb15b515f127740 100644 --- a/src/statistics.h +++ b/src/statistics.h @@ -58,6 +58,9 @@ struct statistics { /*! Angular momentum */ double ang_mom[3]; + /*! Centre of mass */ + double centre_of_mass[3]; + /*! Lock for threaded access */ swift_lock_type lock; }; @@ -67,6 +70,7 @@ void stats_add(struct statistics* a, const struct statistics* b); void stats_print_to_file(FILE* file, const struct statistics* stats, double time); void stats_init(struct statistics* s); +void stats_finalize(struct statistics* s); #ifdef WITH_MPI extern MPI_Datatype statistics_mpi_type; diff --git a/src/vector.h b/src/vector.h index 6a7c6837989025785c1f9134004f2ebcc226a205..e463bbd2e62f0c6b51446d22a9805670b134e684 100644 --- a/src/vector.h +++ b/src/vector.h @@ -91,6 +91,8 @@ #define vec_init_mask_true(mask) ({ mask = 0xFFFF; }) #define vec_zero_mask(mask) ({ mask = 0; }) #define vec_create_mask(mask, cond) ({ mask = cond; }) +#define vec_combine_masks(mask1, mask2) \ + ({ mask1 = vec_mask_and(mask1, mask2); }) #define vec_pad_mask(mask, pad) ({ mask = mask >> (pad); }) #define vec_blend(mask, a, b) _mm512_mask_blend_ps(mask, a, b) #define vec_todbl_lo(a) _mm512_cvtps_pd(_mm512_extract128_ps(a, 0)) @@ -186,6 +188,8 @@ #define vec_and_mask(a, mask) _mm256_and_ps(a, mask.v) #define vec_init_mask_true(mask) mask.m = vec_setint1(0xFFFFFFFF) #define vec_create_mask(mask, cond) mask.v = cond +#define vec_combine_masks(mask1, mask2) \ + ({ mask1.v = vec_mask_and(mask1, mask2); }) #define vec_zero_mask(mask) mask.v = vec_setzero() #define vec_pad_mask(mask, pad) \ for (int i = VEC_SIZE - (pad); i < VEC_SIZE; i++) mask.i[i] = 0 diff --git a/src/xmf.c b/src/xmf.c index 67682b4794ade773c39a748eddf765e392c74865..3c8bb257cca224a06e5b516be46e92dd0006183a 100644 --- a/src/xmf.c +++ b/src/xmf.c @@ -205,6 +205,7 @@ void xmf_write_groupfooter(FILE* xmfFile, enum part_type ptype) { */ int xmf_precision(enum IO_DATA_TYPE type) { switch (type) { + case INT: case FLOAT: return 4; break; @@ -233,6 +234,7 @@ const char* xmf_type(enum IO_DATA_TYPE type) { case DOUBLE: return "Float"; break; + case INT: case ULONGLONG: case LONGLONG: return "Int"; diff --git a/tests/Makefile.am b/tests/Makefile.am index bf4c205b5ab5000e246693c4d45772d16f67a5e5..553980a93e907e83b65bb4539ca49c8bc1b7207b 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -32,7 +32,7 @@ TESTS = testGreetings testMaths testReading.sh testSingle testKernel testSymmetr check_PROGRAMS = testGreetings testReading testSingle testTimeIntegration \ testSPHStep testActivePair test27cells test125cells testParser \ testKernel testFFT testInteractions testMaths \ - testSymmetry testThreadpool benchmarkInteractions \ + testSymmetry testThreadpool \ testAdiabaticIndex testRiemannExact testRiemannTRRS \ testRiemannHLLC testMatrixInversion testDump testLogger \ testVoronoi1D testVoronoi2D testVoronoi3D testPeriodicBC \ @@ -72,8 +72,6 @@ testFFT_SOURCES = testFFT.c testInteractions_SOURCES = testInteractions.c -benchmarkInteractions_SOURCES = benchmarkInteractions.c - testAdiabaticIndex_SOURCES = testAdiabaticIndex.c testRiemannExact_SOURCES = testRiemannExact.c diff --git a/tests/benchmarkInteractions.c b/tests/benchmarkInteractions.c deleted file mode 100644 index 5dc7a4ae85e62313a84d0c0be88b65a1c0bb59df..0000000000000000000000000000000000000000 --- a/tests/benchmarkInteractions.c +++ /dev/null @@ -1,525 +0,0 @@ -/******************************************************************************* - * This file is part of SWIFT. - * Copyright (C) 2015 Matthieu Schaller (matthieu.schaller@durham.ac.uk). - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - ******************************************************************************/ - -#include <fenv.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <unistd.h> -#include "swift.h" - -#define array_align sizeof(float) * VEC_SIZE -#define ACC_THRESHOLD 1e-5 - -#ifdef NONSYM_DENSITY -#define IACT runner_iact_nonsym_density -#define IACT_VEC runner_iact_nonsym_2_vec_density -#define IACT_NAME "test_nonsym_density" -#define NUM_VEC_PROC_INT 2 -#endif - -#ifdef SYM_DENSITY -#define IACT runner_iact_density -#define IACT_VEC runner_iact_vec_density -#define IACT_NAME "test_sym_density" -#endif - -#ifdef NONSYM_FORCE -#define IACT runner_iact_nonsym_force -#define IACT_VEC runner_iact_nonsym_vec_force -#define IACT_NAME "test_nonsym_force" -#endif - -#ifdef SYM_FORCE -#define IACT runner_iact_force -#define IACT_VEC runner_iact_vec_force -#define IACT_NAME "test_sym_force" -#endif - -#ifndef IACT -#define IACT runner_iact_nonsym_density -#define IACT_VEC runner_iact_nonsym_1_vec_density -#define IACT_NAME "test_nonsym_density" -#define NUM_VEC_PROC_INT 1 -#endif - -/** - * @brief Constructs an array of particles in a valid state prior to - * a IACT_NONSYM and IACT_NONSYM_VEC call. - * - * @param count No. of particles to create - * @param offset The position of the particle offset from (0,0,0). - * @param spacing Particle spacing. - * @param h The smoothing length of the particles in units of the inter-particle - *separation. - * @param partId The running counter of IDs. - */ -struct part *make_particles(size_t count, double *offset, double spacing, - double h, long long *partId) { - - struct part *particles; - if (posix_memalign((void **)&particles, part_align, - count * sizeof(struct part)) != 0) { - error("couldn't allocate particles, no. of particles: %d", (int)count); - } - bzero(particles, count * sizeof(struct part)); - - /* Construct the particles */ - struct part *p; - - /* Set test particle at centre of unit sphere. */ - p = &particles[0]; - - /* Place the test particle at the centre of a unit sphere. */ - p->x[0] = 0.0f; - p->x[1] = 0.0f; - p->x[2] = 0.0f; - - p->h = h; - p->id = ++(*partId); - -#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH) - p->mass = 1.0f; -#endif - - /* Place rest of particles around the test particle - * with random position within a unit sphere. */ - for (size_t i = 1; i < count; ++i) { - p = &particles[i]; - - /* Randomise positions within a unit sphere. */ - p->x[0] = random_uniform(-1.0, 1.0); - p->x[1] = random_uniform(-1.0, 1.0); - p->x[2] = random_uniform(-1.0, 1.0); - - /* Randomise velocities. */ - p->v[0] = random_uniform(-0.05, 0.05); - p->v[1] = random_uniform(-0.05, 0.05); - p->v[2] = random_uniform(-0.05, 0.05); - - p->h = h; - p->id = ++(*partId); -#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH) - p->mass = 1.0f; -#endif - } - return particles; -} - -/** - * @brief Populates particle properties needed for the force calculation. - */ -void prepare_force(struct part *parts, size_t count) { - -#if !defined(GIZMO_SPH) && !defined(SHADOWFAX_SPH) && !defined(MINIMAL_SPH) - struct part *p; - for (size_t i = 0; i < count; ++i) { - p = &parts[i]; - p->rho = i + 1; - p->force.balsara = random_uniform(0.0, 1.0); - p->force.P_over_rho2 = i + 1; - p->force.soundspeed = random_uniform(2.0, 3.0); - p->force.v_sig = 0.0f; - p->force.h_dt = 0.0f; - } -#endif -} - -/** - * @brief Dumps all particle information to a file - */ -void dump_indv_particle_fields(char *fileName, struct part *p) { - - FILE *file = fopen(fileName, "a"); - - fprintf(file, - "%6llu %10f %10f %10f %10f %10f %10f %10e %10e %10e %13e %13e %13e " - "%13e %13e %13e %13e " - "%13e %13e %13e\n", - p->id, p->x[0], p->x[1], p->x[2], p->v[0], p->v[1], p->v[2], - p->a_hydro[0], p->a_hydro[1], p->a_hydro[2], -#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) - 0., 0., -#else - p->rho, p->density.rho_dh, -#endif - p->density.wcount, p->density.wcount_dh, p->force.h_dt, -#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) - 0., -#else - p->force.v_sig, -#endif -#if defined(MINIMAL_SPH) || defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) - 0., 0., 0., 0. -#else - p->density.div_v, p->density.rot_v[0], p->density.rot_v[1], - p->density.rot_v[2] -#endif - ); - fclose(file); -} - -/** - * @brief Creates a header for the output file - */ -void write_header(char *fileName) { - - FILE *file = fopen(fileName, "w"); - /* Write header */ - fprintf(file, - "# %4s %10s %10s %10s %10s %10s %10s %10s %10s %10s %13s %13s %13s " - "%13s %13s %13s %13s" - "%13s %13s %13s %13s\n", - "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "a_x", "a_y", - "a_z", "rho", "rho_dh", "wcount", "wcount_dh", "dh/dt", "v_sig", - "div_v", "curl_vx", "curl_vy", "curl_vz", "dS/dt"); - fprintf(file, "\n# PARTICLES BEFORE INTERACTION:\n"); - fclose(file); -} - -/** - * @brief Compares the vectorised result against - * the serial result of the interaction. - * - * @param serial_test_part Particle that has been updated serially - * @param serial_parts Particle array that has been interacted serially - * @param vec_test_part Particle that has been updated using vectors - * @param vec_parts Particle array to be interacted using vectors - * @param count No. of particles that have been interacted - * - * @return Non-zero value if difference found, 0 otherwise - */ -int check_results(struct part serial_test_part, struct part *serial_parts, - struct part vec_test_part, struct part *vec_parts, - int count) { - int result = 0; - result += compare_particles(serial_test_part, vec_test_part, ACC_THRESHOLD); - - for (int i = 0; i < count; i++) - result += compare_particles(serial_parts[i], vec_parts[i], ACC_THRESHOLD); - - return result; -} - -/* - * @brief Calls the serial and vectorised version of the non-symmetrical density - * interaction. - * - * @param test_part Particle that will be updated - * @param parts Particle array to be interacted - * @param count No. of particles to be interacted - * @param serial_inter_func Serial interaction function to be called - * @param vec_inter_func Vectorised interaction function to be called - * @param runs No. of times to call interactions - * - */ -void test_interactions(struct part test_part, struct part *parts, size_t count, - char *filePrefix, int runs) { - - ticks serial_time = 0; -#ifdef WITH_VECTORIZATION - ticks vec_time = 0; -#endif - - FILE *file; - char serial_filename[200] = ""; - char vec_filename[200] = ""; - - strcpy(serial_filename, filePrefix); - strcpy(vec_filename, filePrefix); - sprintf(serial_filename + strlen(serial_filename), "_serial.dat"); - sprintf(vec_filename + strlen(vec_filename), "_vec.dat"); - - write_header(serial_filename); - write_header(vec_filename); - - struct part pi_serial, pi_vec; - struct part pj_serial[count], pj_vec[count]; - - float r2[count] __attribute__((aligned(array_align))); - float dx[3 * count] __attribute__((aligned(array_align))); - -#ifdef WITH_VECTORIZATION - struct part *piq[count], *pjq[count]; - for (size_t k = 0; k < count; k++) { - piq[k] = NULL; - pjq[k] = NULL; - } - - float r2q[count] __attribute__((aligned(array_align))); - float hiq[count] __attribute__((aligned(array_align))); - float dxq[count] __attribute__((aligned(array_align))); - - float dyq[count] __attribute__((aligned(array_align))); - float dzq[count] __attribute__((aligned(array_align))); - float mjq[count] __attribute__((aligned(array_align))); - float vixq[count] __attribute__((aligned(array_align))); - float viyq[count] __attribute__((aligned(array_align))); - float vizq[count] __attribute__((aligned(array_align))); - float vjxq[count] __attribute__((aligned(array_align))); - float vjyq[count] __attribute__((aligned(array_align))); - float vjzq[count] __attribute__((aligned(array_align))); -#endif - - /* Call serial interaction a set number of times. */ - for (int k = 0; k < runs; k++) { - /* Reset particle to initial setup */ - pi_serial = test_part; - for (size_t i = 0; i < count; i++) pj_serial[i] = parts[i]; - - /* Only dump data on first run. */ - if (k == 0) { - /* Dump state of particles before serial interaction. */ - dump_indv_particle_fields(serial_filename, &pi_serial); - for (size_t i = 0; i < count; i++) - dump_indv_particle_fields(serial_filename, &pj_serial[i]); - } - - /* Perform serial interaction */ - for (size_t i = 0; i < count; i++) { - /* Compute the pairwise distance. */ - r2[i] = 0.0f; - for (int k = 0; k < 3; k++) { - int ind = (3 * i) + k; - dx[ind] = pi_serial.x[k] - pj_serial[i].x[k]; - r2[i] += dx[ind] * dx[ind]; - } - } - - const ticks tic = getticks(); -/* Perform serial interaction */ -#ifdef __ICC -#pragma novector -#endif - for (size_t i = 0; i < count; i++) { - IACT(r2[i], &(dx[3 * i]), pi_serial.h, pj_serial[i].h, &pi_serial, - &pj_serial[i]); - } - serial_time += getticks() - tic; - } - - file = fopen(serial_filename, "a"); - fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n"); - fclose(file); - - /* Dump result of serial interaction. */ - dump_indv_particle_fields(serial_filename, &pi_serial); - for (size_t i = 0; i < count; i++) - dump_indv_particle_fields(serial_filename, &pj_serial[i]); - - /* Call vector interaction a set number of times. */ - for (int k = 0; k < runs; k++) { - /* Reset particle to initial setup */ - pi_vec = test_part; - for (size_t i = 0; i < count; i++) pj_vec[i] = parts[i]; - - /* Setup arrays for vector interaction. */ - for (size_t i = 0; i < count; i++) { - /* Compute the pairwise distance. */ - float r2 = 0.0f; - float dx[3]; - for (int k = 0; k < 3; k++) { - dx[k] = pi_vec.x[k] - pj_vec[i].x[k]; - r2 += dx[k] * dx[k]; - } - -#ifdef WITH_VECTORIZATION - r2q[i] = r2; - dxq[i] = dx[0]; - hiq[i] = pi_vec.h; - piq[i] = &pi_vec; - pjq[i] = &pj_vec[i]; - - dyq[i] = dx[1]; - dzq[i] = dx[2]; - mjq[i] = pj_vec[i].mass; - vixq[i] = pi_vec.v[0]; - viyq[i] = pi_vec.v[1]; - vizq[i] = pi_vec.v[2]; - vjxq[i] = pj_vec[i].v[0]; - vjyq[i] = pj_vec[i].v[1]; - vjzq[i] = pj_vec[i].v[2]; -#endif - } - - /* Only dump data on first run. */ - if (k == 0) { -#ifdef WITH_VECTORIZATION - /* Dump state of particles before vector interaction. */ - dump_indv_particle_fields(vec_filename, piq[0]); - for (size_t i = 0; i < count; i++) - dump_indv_particle_fields(vec_filename, pjq[i]); -#endif - } - -/* Perform vector interaction. */ -#ifdef WITH_VECTORIZATION - vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec; - vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, - curlvySum, curlvzSum; - - rhoSum.v = vec_set1(0.f); - rho_dhSum.v = vec_set1(0.f); - wcountSum.v = vec_set1(0.f); - wcount_dhSum.v = vec_set1(0.f); - div_vSum.v = vec_set1(0.f); - curlvxSum.v = vec_set1(0.f); - curlvySum.v = vec_set1(0.f); - curlvzSum.v = vec_set1(0.f); - - hi_vec.v = vec_load(&hiq[0]); - vix_vec.v = vec_load(&vixq[0]); - viy_vec.v = vec_load(&viyq[0]); - viz_vec.v = vec_load(&vizq[0]); - - hi_inv_vec = vec_reciprocal(hi_vec); - - mask_t mask; - vec_init_mask_true(mask); -#if (NUM_VEC_PROC_INT == 2) - mask_t mask2; - vec_init_mask_true(mask2); -#endif - const ticks vec_tic = getticks(); - - for (size_t i = 0; i < count; i += NUM_VEC_PROC_INT * VEC_SIZE) { - -/* Interleave two vectors for interaction. */ -#if (NUM_VEC_PROC_INT == 2) - IACT_VEC(&(r2q[i]), &(dxq[i]), &(dyq[i]), &(dzq[i]), (hi_inv_vec), - (vix_vec), (viy_vec), (viz_vec), &(vjxq[i]), &(vjyq[i]), - &(vjzq[i]), &(mjq[i]), &rhoSum, &rho_dhSum, &wcountSum, - &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, - mask, mask2, 0); -#else /* Only use one vector for interaction. */ - vector r2, dx, dy, dz; - r2.v = vec_load(&(r2q[i])); - dx.v = vec_load(&(dxq[i])); - dy.v = vec_load(&(dyq[i])); - dz.v = vec_load(&(dzq[i])); - - IACT_VEC(&r2, &dx, &dy, &dz, (hi_inv_vec), (vix_vec), (viy_vec), - (viz_vec), &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(mjq[i]), - &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, - &curlvxSum, &curlvySum, &curlvzSum, mask); -#endif - } - - VEC_HADD(rhoSum, piq[0]->rho); - VEC_HADD(rho_dhSum, piq[0]->density.rho_dh); - VEC_HADD(wcountSum, piq[0]->density.wcount); - VEC_HADD(wcount_dhSum, piq[0]->density.wcount_dh); - VEC_HADD(div_vSum, piq[0]->density.div_v); - VEC_HADD(curlvxSum, piq[0]->density.rot_v[0]); - VEC_HADD(curlvySum, piq[0]->density.rot_v[1]); - VEC_HADD(curlvzSum, piq[0]->density.rot_v[2]); - - vec_time += getticks() - vec_tic; -#endif - } - - file = fopen(vec_filename, "a"); - fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n"); - fclose(file); - -#ifdef WITH_VECTORIZATION - /* Dump result of serial interaction. */ - dump_indv_particle_fields(vec_filename, piq[0]); - for (size_t i = 0; i < count; i++) - dump_indv_particle_fields(vec_filename, pjq[i]); -#endif - -#ifdef WITH_VECTORIZATION - /* Check serial results against the vectorised results. */ - if (check_results(pi_serial, pj_serial, pi_vec, pj_vec, count)) - message("Differences found..."); -#endif - - message("The serial interactions took : %15lli ticks.", - serial_time / runs); -#ifdef WITH_VECTORIZATION - message("The vectorised interactions took : %15lli ticks.", vec_time / runs); - message("Speed up: %15fx.", (double)(serial_time) / vec_time); -#endif -} - -/* And go... */ -int main(int argc, char *argv[]) { - size_t runs = 10000; - double h = 1.0, spacing = 0.5; - double offset[3] = {0.0, 0.0, 0.0}; - size_t count = 256; - - /* Get some randomness going */ - srand(0); - - char c; - while ((c = getopt(argc, argv, "h:s:n:r:")) != -1) { - switch (c) { - case 'h': - sscanf(optarg, "%lf", &h); - break; - case 's': - sscanf(optarg, "%lf", &spacing); - case 'n': - sscanf(optarg, "%zu", &count); - break; - case 'r': - sscanf(optarg, "%zu", &runs); - break; - case '?': - error("Unknown option."); - break; - } - } - - if (h < 0 || spacing < 0) { - printf( - "\nUsage: %s [OPTIONS...]\n" - "\nGenerates a particle array with equal particle separation." - "\nThese are then interacted using runner_iact_density and " - "runner_iact_vec_density." - "\n\nOptions:" - "\n-h DISTANCE=1.2348 - Smoothing length in units of <x>" - "\n-s SPACING=0.5 - Spacing between particles" - "\n-n NUMBER=9 - No. of particles", - argv[0]); - exit(1); - } - - /* Correct count so that VEC_SIZE of particles interact with the test - * particle. */ - count = count - (count % VEC_SIZE) + 1; - - /* Build the infrastructure */ - static long long partId = 0; - struct part test_particle; - struct part *particles = make_particles(count, offset, spacing, h, &partId); - -#if defined(NONSYM_FORCE) || defined(SYM_FORCE) - prepare_force(particles, count); -#endif - - test_particle = particles[0]; - /* Call the non-sym density test. */ - message("Testing %s interaction...", IACT_NAME); - test_interactions(test_particle, &particles[1], count - 1, IACT_NAME, runs); - - return 0; -} diff --git a/tests/test125cells.c b/tests/test125cells.c index 2b35f7b5d509a7f966d07d6da82616869f29ab66..87325b4e9a2e254a2549b93b1031bf8af4538295 100644 --- a/tests/test125cells.c +++ b/tests/test125cells.c @@ -31,6 +31,18 @@ /* Local headers. */ #include "swift.h" +#if defined(WITH_VECTORIZATION) +#define DOSELF2 runner_doself2_force_vec +#define DOSELF2_NAME "runner_doself2_force_vec" +#define DOPAIR2_NAME "runner_dopair2_force" +#endif + +#ifndef DOSELF2 +#define DOSELF2 runner_doself2_force +#define DOSELF2_NAME "runner_doself2_density" +#define DOPAIR2_NAME "runner_dopair2_force" +#endif + enum velocity_field { velocity_zero, velocity_const, @@ -436,6 +448,7 @@ void runner_dopair1_branch_density(struct runner *r, struct cell *ci, void runner_doself1_density(struct runner *r, struct cell *ci); void runner_dopair2_force(struct runner *r, struct cell *ci, struct cell *cj); void runner_doself2_force(struct runner *r, struct cell *ci); +void runner_doself2_force_vec(struct runner *r, struct cell *ci); /* And go... */ int main(int argc, char *argv[]) { @@ -516,6 +529,8 @@ int main(int argc, char *argv[]) { } /* Help users... */ + message("DOSELF2 function called: %s", DOSELF2_NAME); + message("DOPAIR2 function called: %s", DOPAIR2_NAME); message("Adiabatic index: ga = %f", hydro_gamma); message("Hydro implementation: %s", SPH_IMPLEMENTATION); message("Smoothing length: h = %f", h * size); @@ -602,8 +617,12 @@ int main(int argc, char *argv[]) { malloc(main_cell->count * sizeof(struct solution_part)); get_solution(main_cell, solution, rho, vel, press, size); + ticks timings[27]; + for (int i = 0; i < 27; i++) timings[i] = 0; + /* Start the test */ ticks time = 0; + ticks self_force_time = 0; for (size_t n = 0; n < runs; ++n) { const ticks tic = getticks(); @@ -674,6 +693,7 @@ int main(int argc, char *argv[]) { /* Do the force calculation */ #if !(defined(MINIMAL_SPH) && defined(WITH_VECTORIZATION)) + int ctr = 0; /* Do the pairs (for the central 27 cells) */ for (int i = 1; i < 4; i++) { for (int j = 1; j < 4; j++) { @@ -681,13 +701,32 @@ int main(int argc, char *argv[]) { struct cell *cj = cells[i * 25 + j * 5 + k]; - if (main_cell != cj) runner_dopair2_force(&runner, main_cell, cj); + if (main_cell != cj) { + + const ticks sub_tic = getticks(); + + runner_dopair2_force(&runner, main_cell, cj); + + const ticks sub_toc = getticks(); + timings[ctr++] += sub_toc - sub_tic; + } } } } +#ifdef WITH_VECTORIZATION + /* Initialise the cache. */ + runner.ci_cache.count = 0; + cache_init(&runner.ci_cache, 512); +#endif + + ticks self_tic = getticks(); + /* And now the self-interaction for the main cell */ - runner_doself2_force(&runner, main_cell); + DOSELF2(&runner, main_cell); + + self_force_time += getticks() - self_tic; + timings[26] += getticks() - self_tic; #endif /* Finally, give a gentle kick */ @@ -702,7 +741,6 @@ int main(int argc, char *argv[]) { dump_particle_fields(outputFileName, main_cell, solution, 0); } - /* Reset stuff */ for (int i = 0; i < 125; ++i) { for (int n = 0; n < cells[i]->count; ++n) hydro_init_part(&cells[i]->parts[n], &space.hs); @@ -710,7 +748,22 @@ int main(int argc, char *argv[]) { } /* Output timing */ - message("SWIFT calculation took : %15lli ticks.", time / runs); + ticks corner_time = timings[0] + timings[2] + timings[6] + timings[8] + + timings[17] + timings[19] + timings[23] + timings[25]; + + ticks edge_time = timings[1] + timings[3] + timings[5] + timings[7] + + timings[9] + timings[11] + timings[14] + timings[16] + + timings[18] + timings[20] + timings[22] + timings[24]; + + ticks face_time = timings[4] + timings[10] + timings[12] + timings[13] + + timings[15] + timings[21]; + + message("Corner calculations took : %15lli ticks.", corner_time / runs); + message("Edge calculations took : %15lli ticks.", edge_time / runs); + message("Face calculations took : %15lli ticks.", face_time / runs); + message("Self calculations took : %15lli ticks.", + self_force_time / runs); + message("SWIFT calculation took : %15lli ticks.", time / runs); for (int j = 0; j < 125; ++j) reset_particles(cells[j], &space.hs, vel, press, size, rho); diff --git a/tests/testInteractions.c b/tests/testInteractions.c index 54d1f38733a1f1647331166f1a37b40ed3511419..5bf036b17dce6755fc2b6f77aaea64851d613bb1 100644 --- a/tests/testInteractions.c +++ b/tests/testInteractions.c @@ -125,23 +125,28 @@ void dump_indv_particle_fields(char *fileName, struct part *p) { FILE *file = fopen(fileName, "a"); - /* Write header */ fprintf(file, - "%6llu %10f %10f %10f %10f %10f %10f %13e %13e %13e %13e %13e " - "%13e %13e %13e\n", - p->id, p->x[0], p->x[1], p->x[2], p->v[0], p->v[1], p->v[2], + "%6llu %8.5f %8.5f %8.5f %8.5f %8.5f %8.5f %8.5f %8.5f %8.5f %8.5f " + "%8.5f " + "%8.5f %8.5f %13e %13e %13e %13e %13e %8.5f %8.5f\n", + p->id, p->x[0], p->x[1], p->x[2], p->v[0], p->v[1], p->v[2], p->h, hydro_get_density(p), -#if defined(GIZMO_SPH) || defined(SHADOWFAX_SPH) +#if defined(MINIMAL_SPH) || defined(SHADOWFAX_SPH) 0.f, #else - p->density.rho_dh, + p->density.div_v, #endif - p->density.wcount, p->density.wcount_dh, -#if defined(GADGET2_SPH) || defined(DEFAULT_SPH) || defined(HOPKINS_PE_SPH) - p->density.div_v, p->density.rot_v[0], p->density.rot_v[1], - p->density.rot_v[2] + hydro_get_entropy(p), hydro_get_internal_energy(p), + hydro_get_pressure(p), hydro_get_soundspeed(p), p->a_hydro[0], + p->a_hydro[1], p->a_hydro[2], p->force.h_dt, +#if defined(GADGET2_SPH) + p->force.v_sig, p->entropy_dt, 0.f +#elif defined(DEFAULT_SPH) + p->force.v_sig, 0.f, p->force.u_dt +#elif defined(MINIMAL_SPH) + p->force.v_sig, 0.f, p->u_dt #else - 0., 0., 0., 0. + 0.f, 0.f, 0.f #endif ); @@ -156,10 +161,12 @@ void write_header(char *fileName) { FILE *file = fopen(fileName, "w"); /* Write header */ fprintf(file, - "# %4s %10s %10s %10s %10s %10s %10s %13s %13s %13s %13s %13s " - "%13s %13s %13s\n", - "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "rho", "rho_dh", - "wcount", "wcount_dh", "div_v", "curl_vx", "curl_vy", "curl_vz"); + "# %4s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %13s %13s " + "%13s %13s %13s %8s %8s\n", + "ID", "pos_x", "pos_y", "pos_z", "v_x", "v_y", "v_z", "h", "rho", + "div_v", "S", "u", "P", "c", "a_x", "a_y", "a_z", "h_dt", "v_sig", + "dS/dt", "du/dt"); + fclose(file); } @@ -212,7 +219,7 @@ void test_interactions(struct part test_part, struct part *parts, size_t count, strcpy(serial_filename, filePrefix); strcpy(vec_filename, filePrefix); sprintf(serial_filename + strlen(serial_filename), "_serial.dat"); - sprintf(vec_filename + strlen(vec_filename), "_vec.dat"); + sprintf(vec_filename + strlen(vec_filename), "_%d_vec.dat", num_vec_proc); write_header(serial_filename); write_header(vec_filename); @@ -388,6 +395,265 @@ void test_interactions(struct part test_part, struct part *parts, size_t count, message("Speed up: %15fx.", (double)(serial_time) / vec_time); } +/* + * @brief Calls the serial and vectorised version of the non-symmetrical force + * interaction. + * + * @param test_part Particle that will be updated + * @param parts Particle array to be interacted + * @param count No. of particles to be interacted + * @param serial_inter_func Serial interaction function to be called + * @param vec_inter_func Vectorised interaction function to be called + * @param runs No. of times to call interactions + * + */ +void test_force_interactions(struct part test_part, struct part *parts, + size_t count, char *filePrefix, int runs, + int num_vec_proc) { + + ticks serial_time = 0; + ticks vec_time = 0; + + FILE *file; + char serial_filename[200] = ""; + char vec_filename[200] = ""; + + strcpy(serial_filename, filePrefix); + strcpy(vec_filename, filePrefix); + sprintf(serial_filename + strlen(serial_filename), "_serial.dat"); + sprintf(vec_filename + strlen(vec_filename), "_%d_vec.dat", num_vec_proc); + + write_header(serial_filename); + write_header(vec_filename); + + struct part pi_serial, pi_vec; + struct part pj_serial[count], pj_vec[count]; + + float r2[count] __attribute__((aligned(array_align))); + float dx[3 * count] __attribute__((aligned(array_align))); + + struct part *piq[count], *pjq[count]; + for (size_t k = 0; k < count; k++) { + piq[k] = NULL; + pjq[k] = NULL; + } + + float r2q[count] __attribute__((aligned(array_align))); + float dxq[count] __attribute__((aligned(array_align))); + float dyq[count] __attribute__((aligned(array_align))); + float dzq[count] __attribute__((aligned(array_align))); + + float hiq[count] __attribute__((aligned(array_align))); + float vixq[count] __attribute__((aligned(array_align))); + float viyq[count] __attribute__((aligned(array_align))); + float vizq[count] __attribute__((aligned(array_align))); + float rhoiq[count] __attribute__((aligned(array_align))); + float grad_hiq[count] __attribute__((aligned(array_align))); + float pOrhoi2q[count] __attribute__((aligned(array_align))); + float balsaraiq[count] __attribute__((aligned(array_align))); + float ciq[count] __attribute__((aligned(array_align))); + + float hj_invq[count] __attribute__((aligned(array_align))); + float mjq[count] __attribute__((aligned(array_align))); + float vjxq[count] __attribute__((aligned(array_align))); + float vjyq[count] __attribute__((aligned(array_align))); + float vjzq[count] __attribute__((aligned(array_align))); + float rhojq[count] __attribute__((aligned(array_align))); + float grad_hjq[count] __attribute__((aligned(array_align))); + float pOrhoj2q[count] __attribute__((aligned(array_align))); + float balsarajq[count] __attribute__((aligned(array_align))); + float cjq[count] __attribute__((aligned(array_align))); + + /* Call serial interaction a set number of times. */ + for (int k = 0; k < runs; k++) { + /* Reset particle to initial setup */ + pi_serial = test_part; + for (size_t i = 0; i < count; i++) pj_serial[i] = parts[i]; + + /* Only dump data on first run. */ + if (k == 0) { + /* Dump state of particles before serial interaction. */ + dump_indv_particle_fields(serial_filename, &pi_serial); + for (size_t i = 0; i < count; i++) + dump_indv_particle_fields(serial_filename, &pj_serial[i]); + } + + /* Perform serial interaction */ + for (size_t i = 0; i < count; i++) { + /* Compute the pairwise distance. */ + r2[i] = 0.0f; + for (int k = 0; k < 3; k++) { + int ind = (3 * i) + k; + dx[ind] = pi_serial.x[k] - pj_serial[i].x[k]; + r2[i] += dx[ind] * dx[ind]; + } + } + + const ticks tic = getticks(); +/* Perform serial interaction */ +#ifdef __ICC +#pragma novector +#endif + for (size_t i = 0; i < count; i++) { + runner_iact_nonsym_force(r2[i], &(dx[3 * i]), pi_serial.h, pj_serial[i].h, + &pi_serial, &pj_serial[i]); + } + serial_time += getticks() - tic; + } + + file = fopen(serial_filename, "a"); + fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n"); + fclose(file); + + /* Dump result of serial interaction. */ + dump_indv_particle_fields(serial_filename, &pi_serial); + for (size_t i = 0; i < count; i++) + dump_indv_particle_fields(serial_filename, &pj_serial[i]); + + /* Call vector interaction a set number of times. */ + for (int k = 0; k < runs; k++) { + /* Reset particle to initial setup */ + pi_vec = test_part; + for (size_t i = 0; i < count; i++) pj_vec[i] = parts[i]; + + /* Setup arrays for vector interaction. */ + for (size_t i = 0; i < count; i++) { + /* Compute the pairwise distance. */ + float r2 = 0.0f; + float dx[3]; + for (int k = 0; k < 3; k++) { + dx[k] = pi_vec.x[k] - pj_vec[i].x[k]; + r2 += dx[k] * dx[k]; + } + + piq[i] = &pi_vec; + pjq[i] = &pj_vec[i]; + + r2q[i] = r2; + dxq[i] = dx[0]; + dyq[i] = dx[1]; + dzq[i] = dx[2]; + + hiq[i] = pi_vec.h; + vixq[i] = pi_vec.v[0]; + viyq[i] = pi_vec.v[1]; + vizq[i] = pi_vec.v[2]; + rhoiq[i] = pi_vec.rho; + grad_hiq[i] = pi_vec.force.f; + pOrhoi2q[i] = pi_vec.force.P_over_rho2; + balsaraiq[i] = pi_vec.force.balsara; + ciq[i] = pi_vec.force.soundspeed; + + hj_invq[i] = 1.f / pj_vec[i].h; + mjq[i] = pj_vec[i].mass; + vjxq[i] = pj_vec[i].v[0]; + vjyq[i] = pj_vec[i].v[1]; + vjzq[i] = pj_vec[i].v[2]; + rhojq[i] = pj_vec[i].rho; + grad_hjq[i] = pj_vec[i].force.f; + pOrhoj2q[i] = pj_vec[i].force.P_over_rho2; + balsarajq[i] = pj_vec[i].force.balsara; + cjq[i] = pj_vec[i].force.soundspeed; + } + + /* Only dump data on first run. */ + if (k == 0) { + /* Dump state of particles before vector interaction. */ + dump_indv_particle_fields(vec_filename, piq[0]); + for (size_t i = 0; i < count; i++) + dump_indv_particle_fields(vec_filename, pjq[i]); + } + + /* Perform vector interaction. */ + vector hi_vec, hi_inv_vec, vix_vec, viy_vec, viz_vec, rhoi_vec, grad_hi_vec, + pOrhoi2_vec, balsara_i_vec, ci_vec; + vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, + entropy_dtSum; + + a_hydro_xSum.v = vec_setzero(); + a_hydro_ySum.v = vec_setzero(); + a_hydro_zSum.v = vec_setzero(); + h_dtSum.v = vec_setzero(); + v_sigSum.v = vec_setzero(); + entropy_dtSum.v = vec_setzero(); + + hi_vec.v = vec_load(&hiq[0]); + vix_vec.v = vec_load(&vixq[0]); + viy_vec.v = vec_load(&viyq[0]); + viz_vec.v = vec_load(&vizq[0]); + rhoi_vec.v = vec_load(&rhoiq[0]); + grad_hi_vec.v = vec_load(&grad_hiq[0]); + pOrhoi2_vec.v = vec_load(&pOrhoi2q[0]); + balsara_i_vec.v = vec_load(&balsaraiq[0]); + ci_vec.v = vec_load(&ciq[0]); + + hi_inv_vec = vec_reciprocal(hi_vec); + + mask_t mask, mask2; + vec_init_mask_true(mask); + vec_init_mask_true(mask2); + + const ticks vec_tic = getticks(); + + for (size_t i = 0; i < count; i += num_vec_proc * VEC_SIZE) { + + if (num_vec_proc == 2) { + runner_iact_nonsym_2_vec_force( + &(r2q[i]), &(dxq[i]), &(dyq[i]), &(dzq[i]), (vix_vec), (viy_vec), + (viz_vec), rhoi_vec, grad_hi_vec, pOrhoi2_vec, balsara_i_vec, + ci_vec, &(vjxq[i]), &(vjyq[i]), &(vjzq[i]), &(rhojq[i]), + &(grad_hjq[i]), &(pOrhoj2q[i]), &(balsarajq[i]), &(cjq[i]), + &(mjq[i]), hi_inv_vec, &(hj_invq[i]), &a_hydro_xSum, &a_hydro_ySum, + &a_hydro_zSum, &h_dtSum, &v_sigSum, &entropy_dtSum, mask, mask2, 0); + } else { /* Only use one vector for interaction. */ + + vector r2, dx, dy, dz, hj, hj_inv; + r2.v = vec_load(&(r2q[i])); + dx.v = vec_load(&(dxq[i])); + dy.v = vec_load(&(dyq[i])); + dz.v = vec_load(&(dzq[i])); + hj.v = vec_load(&hj_invq[i]); + hj_inv = vec_reciprocal(hj); + + runner_iact_nonsym_1_vec_force( + &r2, &dx, &dy, &dz, vix_vec, viy_vec, viz_vec, rhoi_vec, + grad_hi_vec, pOrhoi2_vec, balsara_i_vec, ci_vec, &(vjxq[i]), + &(vjyq[i]), &(vjzq[i]), &(rhojq[i]), &(grad_hjq[i]), &(pOrhoj2q[i]), + &(balsarajq[i]), &(cjq[i]), &(mjq[i]), hi_inv_vec, hj_inv, + &a_hydro_xSum, &a_hydro_ySum, &a_hydro_zSum, &h_dtSum, &v_sigSum, + &entropy_dtSum, mask); + } + } + + VEC_HADD(a_hydro_xSum, piq[0]->a_hydro[0]); + VEC_HADD(a_hydro_ySum, piq[0]->a_hydro[1]); + VEC_HADD(a_hydro_zSum, piq[0]->a_hydro[2]); + VEC_HADD(h_dtSum, piq[0]->force.h_dt); + VEC_HMAX(v_sigSum, piq[0]->force.v_sig); + VEC_HADD(entropy_dtSum, piq[0]->entropy_dt); + + vec_time += getticks() - vec_tic; + } + + file = fopen(vec_filename, "a"); + fprintf(file, "\n# PARTICLES AFTER INTERACTION:\n"); + fclose(file); + + /* Dump result of serial interaction. */ + dump_indv_particle_fields(vec_filename, piq[0]); + for (size_t i = 0; i < count; i++) + dump_indv_particle_fields(vec_filename, pjq[i]); + + /* Check serial results against the vectorised results. */ + if (check_results(pi_serial, pj_serial, pi_vec, pj_vec, count)) + message("Differences found..."); + + message("The serial interactions took : %15lli ticks.", + serial_time / runs); + message("The vectorised interactions took : %15lli ticks.", vec_time / runs); + message("Speed up: %15fx.", (double)(serial_time) / vec_time); +} + /* And go... */ int main(int argc, char *argv[]) { size_t runs = 10000; @@ -449,6 +715,13 @@ int main(int argc, char *argv[]) { test_interactions(test_particle, &particles[1], count - 1, IACT_NAME, runs, 2); + prepare_force(particles, count); + + test_force_interactions(test_particle, &particles[1], count - 1, + "test_nonsym_force", runs, 1); + test_force_interactions(test_particle, &particles[1], count - 1, + "test_nonsym_force", runs, 2); + return 0; } diff --git a/tests/testInteractions.sh.in b/tests/testInteractions.sh.in index 4b002c56e37eff417c673ddac2e44b3edf17683a..2a0bba267dd3ba8468177ec5f91af108aba0f1e5 100644 --- a/tests/testInteractions.sh.in +++ b/tests/testInteractions.sh.in @@ -2,7 +2,7 @@ echo "" -rm -f test_nonsym_density_serial.dat test_nonsym_density_vec.dat +rm -f test_nonsym_density_serial.dat test_nonsym_density_1_vec.dat test_nonsym_density_2_vec.dat test_nonsym_force_1_vec.dat test_nonsym_force_2_vec.dat echo "Running ./testInteractions" @@ -13,15 +13,42 @@ if [ $? != 0 ]; then else if [ -e test_nonsym_density_serial.dat ] then - if python @srcdir@/difffloat.py test_nonsym_density_serial.dat test_nonsym_density_vec.dat @srcdir@/tolerance_testInteractions.dat + if python @srcdir@/difffloat.py test_nonsym_density_serial.dat test_nonsym_density_1_vec.dat @srcdir@/tolerance_testInteractions.dat then - echo "Accuracy test passed" + echo "Calculating density using 1 vector accuracy test passed" else - echo "Accuracy test failed" + echo "Calculating density using 1 vector accuracy test failed" + exit 1 + fi + if python @srcdir@/difffloat.py test_nonsym_density_serial.dat test_nonsym_density_2_vec.dat @srcdir@/tolerance_testInteractions.dat + then + echo "Calculating density using 2 vectors accuracy test passed" + else + echo "Calculating density using 2 vectors accuracy test failed" + exit 1 + fi + else + echo "Error Missing density test output file" + exit 1 + fi + if [ -e test_nonsym_force_serial.dat ] + then + if python @srcdir@/difffloat.py test_nonsym_force_serial.dat test_nonsym_force_1_vec.dat @srcdir@/tolerance_testInteractions.dat + then + echo "Calculating force using 1 vector accuracy test passed" + else + echo "Calculating force using 1 vector accuracy test failed" + exit 1 + fi + if python @srcdir@/difffloat.py test_nonsym_force_serial.dat test_nonsym_force_2_vec.dat @srcdir@/tolerance_testInteractions.dat + then + echo "Calculating force using 2 vectors accuracy test passed" + else + echo "Calculating force using 2 vectors accuracy test failed" exit 1 fi else - echo "Error Missing test output file" + echo "Error Missing force test output file" exit 1 fi fi diff --git a/tests/tolerance_testInteractions.dat b/tests/tolerance_testInteractions.dat index ebb376bf26bfdc0fb2107ab720bbf9eca5a35bce..0f11d03507b23c76b5703e118eede1359fe2afba 100644 --- a/tests/tolerance_testInteractions.dat +++ b/tests/tolerance_testInteractions.dat @@ -1,4 +1,4 @@ -# ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-5 4e-5 4e-4 1e-2 1e-5 6e-6 6e-6 6e-6 - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.2e-4 1e-4 2e-4 2e-4 1e-4 1e-4 1e-4 - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 +# ID pos_x pos_y pos_z v_x v_y v_z h rho div_v S u P c a_x a_y a_z h_dt v_sig dS/dt du/dt + 0 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 + 0 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 1e-4 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-5 1e-5 1e-5 1e-5 1e-5 1e-5 1e-5