diff --git a/.gitignore b/.gitignore index 9d0c3e8218a7672d5986f764f713ae299aaee44b..a390bdc87313904942dd17a417af2b96da4041b1 100644 --- a/.gitignore +++ b/.gitignore @@ -52,6 +52,13 @@ tests/testSPHStep tests/testKernel tests/testParser tests/parser_output.yml +tests/test27cells.sh +tests/test27cellsPerturbed.sh +tests/testPair.sh +tests/testPairPerturbed.sh +tests/testParser.sh +tests/testReading.sh + theory/latex/swift.pdf theory/kernel/kernels.pdf diff --git a/INSTALL.swift b/INSTALL.swift index bd49dfffd641a3ad59ec25284b14685afb9f7b24..a07d5b24c2d8c75778e2a24d90f77724459ab61f 100644 --- a/INSTALL.swift +++ b/INSTALL.swift @@ -39,7 +39,7 @@ SWIFT has been successfully built and tested with the following compilers: - clang 3.4.x More recent versions and slightly older ones should also be able to -built the software. +build the software. By default an attempt to choose suitable set of optimizing compiler flags will be made, targeted for the host machine of the build. If this doesn't @@ -67,6 +67,14 @@ for instance. GCC address sanitizer flags can be included using the option. Note this requires a GCC compiler version of at least 4.8. +By default vectorization is switched on. The highest instruction set +available on the platform will be automatically used. However, not all +implementations of SPH available in the code have vectorized +routines. Vectorization will have to be switched off for these. It can +also be switched off for benchmarking purposes. To do so, you can use: + + ./configure --disable-vec + Dependencies ============ diff --git a/configure.ac b/configure.ac index 09346ca9083e18df6d69ea09d23df62e17bcde09..5eccef2fde6323460cc0d55c109a054767a33313 100644 --- a/configure.ac +++ b/configure.ac @@ -240,7 +240,12 @@ if test "$enable_vec" = "no"; then else AC_MSG_WARN([Do not know how to disable vectorization for this compiler]) fi + HAVEVECTORIZATION=0 +else + AC_DEFINE([WITH_VECTORIZATION],1,[Enable vectorization]) + HAVEVECTORIZATION=1 fi +AM_CONDITIONAL([HAVEVECTORIZATION],[test -n "$HAVEVECTORIZATION"]) # Autoconf stuff. AC_PROG_INSTALL diff --git a/examples/main.c b/examples/main.c index 016f948bb1b031ba756a344d020d4b26c8a42bed..b7397dfc7a195f7ea1ac5630ab4f0b1ddc8029b1 100644 --- a/examples/main.c +++ b/examples/main.c @@ -266,6 +266,24 @@ int main(int argc, char *argv[]) { message("sizeof(struct cell) is %4zi bytes.", sizeof(struct cell)); } +/* Temporary abort to handle absence of vectorized functions */ +#ifdef WITH_VECTORIZATION + +#ifdef GADGET2_SPH + error( + "Vectorized version of Gadget SPH routines not implemented yet. " + "Reconfigure with --disable-vec and recompile or use DEFAULT_SPH."); +#endif + +#ifdef MINIMAL_SPH + error( + "Vectorized version of Minimal SPH routines not implemented yet. " + "Reconfigure with --disable-vec and recompile or use DEFAULT_SPH."); +#endif + +#endif + /* End temporary fix */ + /* How vocal are we ? */ const int talking = (verbose == 1 && myrank == 0) || (verbose == 2); diff --git a/src/gravity/Default/gravity_iact.h b/src/gravity/Default/gravity_iact.h index 62023345f174eb8cb9bae4d4438bdd50c9969494..dd6878841d7d0632ab8e4f218044516d9895cab8 100644 --- a/src/gravity/Default/gravity_iact.h +++ b/src/gravity/Default/gravity_iact.h @@ -60,7 +60,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_grav( __attribute__((always_inline)) INLINE static void runner_iact_vec_grav( float *R2, float *Dx, struct gpart **pi, struct gpart **pj) { -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION vector ir, r, r2, dx[3]; vector w, acc, ai, aj; diff --git a/src/hydro/Default/hydro_iact.h b/src/hydro/Default/hydro_iact.h index 0a577931b5e0ca67ab07dfe414a548da66e82cdd..cb7b1591a38a914ffa8ce528ed4d547fe6257a48 100644 --- a/src/hydro/Default/hydro_iact.h +++ b/src/hydro/Default/hydro_iact.h @@ -101,7 +101,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_density( float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION vector r, ri, r2, xi, xj, hi, hj, hi_inv, hj_inv, wi, wj, wi_dx, wj_dx; vector rhoi, rhoj, rhoi_dh, rhoj_dh, wcounti, wcountj, wcounti_dh, wcountj_dh; @@ -263,7 +263,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION vector r, ri, r2, xi, hi, hi_inv, wi, wi_dx; vector rhoi, rhoi_dh, wcounti, wcounti_dh, div_vi; @@ -450,7 +450,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force( float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION vector r, r2, ri; vector xi, xj; @@ -648,8 +648,8 @@ __attribute__((always_inline)) INLINE static void runner_iact_vec_force( pi[k]->force.v_sig = vi_sig.f[k]; pj[k]->force.v_sig = vj_sig.f[k]; for (j = 0; j < 3; j++) { - pi[k]->a[j] -= pia[j].f[k]; - pj[k]->a[j] += pja[j].f[k]; + pi[k]->a_hydro[j] -= pia[j].f[k]; + pj[k]->a_hydro[j] += pja[j].f[k]; } } @@ -758,7 +758,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, struct part **pj) { -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION vector r, r2, ri; vector xi, xj; @@ -945,7 +945,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( pi[k]->h_dt -= pih_dt.f[k]; pi[k]->force.v_sig = vi_sig.f[k]; pj[k]->force.v_sig = vj_sig.f[k]; - for (j = 0; j < 3; j++) pi[k]->a[j] -= pia[j].f[k]; + for (j = 0; j < 3; j++) pi[k]->a_hydro[j] -= pia[j].f[k]; } #else diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h index 8738b4be09931df4c938f1dff3adeed11468dcfc..daa6c37681131e197b7e1834d0792238a600eaaa 100644 --- a/src/hydro/Gadget2/hydro_iact.h +++ b/src/hydro/Gadget2/hydro_iact.h @@ -103,6 +103,17 @@ __attribute__((always_inline)) INLINE static void runner_iact_density( pj->density.rot_v[2] += facj * curlvr[2]; } +/** + * @brief Density loop (Vectorized version) + */ +__attribute__((always_inline)) INLINE static void runner_iact_vec_density( + float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, + struct part **pj) { + error( + "A vectorised version of the Gadget2 density interaction function does " + "not exist yet!"); +} + /** * @brief Density loop (non-symmetric version) */ @@ -151,6 +162,17 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density( pi->density.rot_v[2] += fac * curlvr[2]; } +/** + * @brief Density loop (non-symmetric vectorized version) + */ +__attribute__((always_inline)) INLINE static void +runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj, + struct part **pi, struct part **pj) { + error( + "A vectorised version of the Gadget2 non-symmetric density interaction " + "function does not exist yet!"); +} + /** * @brief Force loop */ @@ -248,6 +270,17 @@ __attribute__((always_inline)) INLINE static void runner_iact_force( pj->entropy_dt -= 0.5f * mi * visc_term * dvdr; } +/** + * @brief Force loop (Vectorized version) + */ +__attribute__((always_inline)) INLINE static void runner_iact_vec_force( + float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, + struct part **pj) { + error( + "A vectorised version of the Gadget2 force interaction function does not " + "exist yet!"); +} + /** * @brief Force loop (non-symmetric version) */ @@ -338,4 +371,15 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force( pi->entropy_dt += 0.5f * mj * visc_term * dvdr; } +/** + * @brief Force loop (Vectorized non-symmetric version) + */ +__attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( + float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, + struct part **pj) { + error( + "A vectorised version of the Gadget2 non-symmetric force interaction " + "function does not exist yet!"); +} + #endif /* SWIFT_RUNNER_IACT_LEGACY_H */ diff --git a/src/hydro/Minimal/hydro_iact.h b/src/hydro/Minimal/hydro_iact.h index c9da185b8a29eafe2a58420ae5de3a05ff043225..7196b236bb3e14c942730e5dfdba504042d0b5d4 100644 --- a/src/hydro/Minimal/hydro_iact.h +++ b/src/hydro/Minimal/hydro_iact.h @@ -61,6 +61,17 @@ __attribute__((always_inline)) INLINE static void runner_iact_density( pj->density.wcount_dh -= xj * wj_dx; } +/** + * @brief Density loop (Vectorized version) + */ +__attribute__((always_inline)) INLINE static void runner_iact_vec_density( + float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, + struct part **pj) { + error( + "A vectorised version of the Minimal density interaction function does " + "not exist yet!"); +} + /** * @brief Density loop (non-symmetric version) */ @@ -85,6 +96,17 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_density( pi->density.wcount_dh -= xi * wi_dx; } +/** + * @brief Density loop (non-symmetric vectorized version) + */ +__attribute__((always_inline)) INLINE static void +runner_iact_nonsym_vec_density(float *R2, float *Dx, float *Hi, float *Hj, + struct part **pi, struct part **pj) { + error( + "A vectorised version of the Minimal non-symmetric density interaction " + "function does not exist yet!"); +} + /** * @brief Force loop */ @@ -157,6 +179,17 @@ __attribute__((always_inline)) INLINE static void runner_iact_force( pj->force.v_sig = fmaxf(pj->force.v_sig, v_sig); } +/** + * @brief Force loop (Vectorized version) + */ +__attribute__((always_inline)) INLINE static void runner_iact_vec_force( + float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, + struct part **pj) { + error( + "A vectorised version of the Minimal force interaction function does not " + "exist yet!"); +} + /** * @brief Force loop (non-symmetric version) */ @@ -222,4 +255,15 @@ __attribute__((always_inline)) INLINE static void runner_iact_nonsym_force( pi->force.v_sig = fmaxf(pi->force.v_sig, v_sig); } +/** + * @brief Force loop (Vectorized non-symmetric version) + */ +__attribute__((always_inline)) INLINE static void runner_iact_nonsym_vec_force( + float *R2, float *Dx, float *Hi, float *Hj, struct part **pi, + struct part **pj) { + error( + "A vectorised version of the Minimal non-symmetric force interaction " + "function does not exist yet!"); +} + #endif /* SWIFT_RUNNER_IACT_MINIMAL_H */ diff --git a/src/kernel_gravity.h b/src/kernel_gravity.h index fedc046eed3bc64ed09120537cf6863179b171a6..0c786f5189b882ed4593ef4a927edbf2a8cdd6e7 100644 --- a/src/kernel_gravity.h +++ b/src/kernel_gravity.h @@ -79,7 +79,7 @@ __attribute__((always_inline)) INLINE static void kernel_grav_eval(float x, *W = w; } -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION /** * @brief Computes the gravity cubic spline for a given distance x (Vectorized @@ -155,7 +155,7 @@ __attribute__((always_inline)) INLINE static void blender_deval(float x, *dW_dx = dw_dx; } -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION /** * @brief Computes the cubic spline blender and its derivative for a given diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h index b1774d8f35b7eddb6c2fdb0c341fa6299de74582..8a1dac79cd7766b23989fd377c80cb33fc0cccde 100644 --- a/src/kernel_hydro.h +++ b/src/kernel_hydro.h @@ -225,7 +225,7 @@ __attribute__((always_inline)) INLINE static void kernel_eval( *W = w * kernel_constant * kernel_igamma3; } -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION static const vector kernel_igamma_vec = FILL_VEC((float)kernel_igamma); diff --git a/src/runner_doiact.h b/src/runner_doiact.h index 4da83b940d55460c4bf8d2f650534aedf94dbd5d..193666ad264ab2bc5efe092d4e5bebe525ba3c9a 100644 --- a/src/runner_doiact.h +++ b/src/runner_doiact.h @@ -113,7 +113,7 @@ void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci, error("Don't use in actual runs ! Slow code !"); -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION int icount = 0; float r2q[VEC_SIZE] __attribute__((aligned(16))); float hiq[VEC_SIZE] __attribute__((aligned(16))); @@ -169,7 +169,7 @@ void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci, /* Hit or miss? */ if (r2 < hig2 || r2 < pj->h * pj->h * kernel_gamma2) { -#ifndef VECTORIZE +#ifndef WITH_VECTORIZATION IACT(r2, dx, hi, pj->h, pi, pj); @@ -199,7 +199,7 @@ void DOPAIR_NAIVE(struct runner *r, struct cell *restrict ci, } /* loop over the parts in ci. */ -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION /* Pick up any leftovers. */ if (icount > 0) for (int k = 0; k < icount; k++) @@ -215,7 +215,7 @@ void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) { error("Don't use in actual runs ! Slow code !"); -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION int icount = 0; float r2q[VEC_SIZE] __attribute__((aligned(16))); float hiq[VEC_SIZE] __attribute__((aligned(16))); @@ -258,7 +258,7 @@ void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) { /* Hit or miss? */ if (r2 < hig2 || r2 < pj->h * pj->h * kernel_gamma2) { -#ifndef VECTORIZE +#ifndef WITH_VECTORIZATION IACT(r2, dx, hi, pj->h, pi, pj); @@ -288,7 +288,7 @@ void DOSELF_NAIVE(struct runner *r, struct cell *restrict c) { } /* loop over the parts in ci. */ -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION /* Pick up any leftovers. */ if (icount > 0) for (int k = 0; k < icount; k++) @@ -317,7 +317,7 @@ void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci, error("Don't use in actual runs ! Slow code !"); -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION int icount = 0; float r2q[VEC_SIZE] __attribute__((aligned(16))); float hiq[VEC_SIZE] __attribute__((aligned(16))); @@ -367,7 +367,7 @@ void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci, /* Hit or miss? */ if (r2 < hig2) { -#ifndef VECTORIZE +#ifndef WITH_VECTORIZATION IACT_NONSYM(r2, dx, hi, pj->h, pi, pj); @@ -397,7 +397,7 @@ void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci, } /* loop over the parts in ci. */ -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION /* Pick up any leftovers. */ if (icount > 0) for (int k = 0; k < icount; k++) @@ -424,7 +424,7 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, struct engine *e = r->e; -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION int icount = 0; float r2q[VEC_SIZE] __attribute__((aligned(16))); float hiq[VEC_SIZE] __attribute__((aligned(16))); @@ -499,7 +499,7 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, /* Hit or miss? */ if (r2 < hig2) { -#ifndef VECTORIZE +#ifndef WITH_VECTORIZATION IACT_NONSYM(r2, dx, hi, pj->h, pi, pj); @@ -564,7 +564,7 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, /* Hit or miss? */ if (r2 < hig2) { -#ifndef VECTORIZE +#ifndef WITH_VECTORIZATION IACT_NONSYM(r2, dx, hi, pj->h, pi, pj); @@ -595,7 +595,7 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, } /* loop over the parts in ci. */ } -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION /* Pick up any leftovers. */ if (icount > 0) for (int k = 0; k < icount; k++) @@ -618,7 +618,7 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci, struct part *restrict parts, int *restrict ind, int count) { -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION int icount = 0; float r2q[VEC_SIZE] __attribute__((aligned(16))); float hiq[VEC_SIZE] __attribute__((aligned(16))); @@ -658,7 +658,7 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci, /* Hit or miss? */ if (r2 > 0.0f && r2 < hig2) { -#ifndef VECTORIZE +#ifndef WITH_VECTORIZATION IACT_NONSYM(r2, dx, hi, pj->h, pi, pj); @@ -688,7 +688,7 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci, } /* loop over the parts in ci. */ -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION /* Pick up any leftovers. */ if (icount > 0) for (int k = 0; k < icount; k++) @@ -710,7 +710,7 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) { struct engine *restrict e = r->e; const int ti_current = e->ti_current; -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION int icount = 0; float r2q[VEC_SIZE] __attribute__((aligned(16))); float hiq[VEC_SIZE] __attribute__((aligned(16))); @@ -783,7 +783,7 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) { /* Hit or miss? */ if (r2 < hig2) { -#ifndef VECTORIZE +#ifndef WITH_VECTORIZATION IACT_NONSYM(r2, dx, hi, pj->h, pi, pj); @@ -845,7 +845,7 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) { /* Hit or miss? */ if (r2 < hjg2) { -#ifndef VECTORIZE +#ifndef WITH_VECTORIZATION IACT_NONSYM(r2, dx, hj, pi->h, pj, pi); @@ -875,7 +875,7 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj) { } /* loop over the parts in ci. */ -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION /* Pick up any leftovers. */ if (icount > 0) for (int k = 0; k < icount; k++) @@ -897,7 +897,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { struct engine *restrict e = r->e; const int ti_current = e->ti_current; -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION int icount1 = 0; float r2q1[VEC_SIZE] __attribute__((aligned(16))); float hiq1[VEC_SIZE] __attribute__((aligned(16))); @@ -1009,7 +1009,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { /* Hit or miss? */ if (r2 < hig2) { -#ifndef VECTORIZE +#ifndef WITH_VECTORIZATION IACT_NONSYM(r2, dx, hj, hi, pj, pi); @@ -1060,7 +1060,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { /* Hit or miss? */ if (r2 < hig2) { -#ifndef VECTORIZE +#ifndef WITH_VECTORIZATION /* Does pj need to be updated too? */ if (pj->ti_end <= ti_current) @@ -1153,7 +1153,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { /* Hit or miss? */ if (r2 < hjg2 && r2 > hi * hi * kernel_gamma2) { -#ifndef VECTORIZE +#ifndef WITH_VECTORIZATION IACT_NONSYM(r2, dx, hi, hj, pi, pj); @@ -1203,7 +1203,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { /* Hit or miss? */ if (r2 < hjg2 && r2 > hi * hi * kernel_gamma2) { -#ifndef VECTORIZE +#ifndef WITH_VECTORIZATION /* Does pi need to be updated too? */ if (pi->ti_end <= ti_current) @@ -1261,7 +1261,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { } /* loop over the parts in ci. */ -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION /* Pick up any leftovers. */ if (icount1 > 0) for (int k = 0; k < icount1; k++) @@ -1284,7 +1284,7 @@ void DOSELF1(struct runner *r, struct cell *restrict c) { const int ti_current = r->e->ti_current; -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION int icount1 = 0; float r2q1[VEC_SIZE] __attribute__((aligned(16))); float hiq1[VEC_SIZE] __attribute__((aligned(16))); @@ -1350,7 +1350,7 @@ void DOSELF1(struct runner *r, struct cell *restrict c) { /* Hit or miss? */ if (r2 < hj * hj * kernel_gamma2) { -#ifndef VECTORIZE +#ifndef WITH_VECTORIZATION IACT_NONSYM(r2, dx, hj, hi, pj, pi); @@ -1406,7 +1406,7 @@ void DOSELF1(struct runner *r, struct cell *restrict c) { /* Hit or miss? */ if (r2 < hig2 || doj) { -#ifndef VECTORIZE +#ifndef WITH_VECTORIZATION /* Which parts need to be updated? */ if (r2 < hig2 && doj) @@ -1489,7 +1489,7 @@ void DOSELF1(struct runner *r, struct cell *restrict c) { } /* loop over all particles. */ -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION /* Pick up any leftovers. */ if (icount1 > 0) for (int k = 0; k < icount1; k++) @@ -1512,7 +1512,7 @@ void DOSELF2(struct runner *r, struct cell *restrict c) { const int ti_current = r->e->ti_current; -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION int icount1 = 0; float r2q1[VEC_SIZE] __attribute__((aligned(16))); float hiq1[VEC_SIZE] __attribute__((aligned(16))); @@ -1578,7 +1578,7 @@ void DOSELF2(struct runner *r, struct cell *restrict c) { /* Hit or miss? */ if (r2 < hig2 || r2 < hj * hj * kernel_gamma2) { -#ifndef VECTORIZE +#ifndef WITH_VECTORIZATION IACT_NONSYM(r2, dx, hj, hi, pj, pi); @@ -1632,7 +1632,7 @@ void DOSELF2(struct runner *r, struct cell *restrict c) { /* Hit or miss? */ if (r2 < hig2 || r2 < hj * hj * kernel_gamma2) { -#ifndef VECTORIZE +#ifndef WITH_VECTORIZATION /* Does pj need to be updated too? */ if (pj->ti_end <= ti_current) @@ -1690,7 +1690,7 @@ void DOSELF2(struct runner *r, struct cell *restrict c) { } /* loop over all particles. */ -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION /* Pick up any leftovers. */ if (icount1 > 0) for (int k = 0; k < icount1; k++) diff --git a/src/runner_doiact_grav.h b/src/runner_doiact_grav.h index e3788dfa1123584c913bca6baa6fc2db6698e6d0..aaf416e2dac76a3694b9bdf13b767d3b2a5e8722 100644 --- a/src/runner_doiact_grav.h +++ b/src/runner_doiact_grav.h @@ -40,7 +40,7 @@ void runner_dopair_grav_new(struct runner *r, struct cell *ci, int count_i, count_j, cnj, cnj_new; const int ti_current = e->ti_current; struct multipole m; -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION int icount = 0; float r2q[VEC_SIZE] __attribute__((aligned(16))); float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); @@ -107,7 +107,7 @@ void runner_dopair_grav_new(struct runner *r, struct cell *ci, r2 += dx[k] * dx[k]; } -#ifndef VECTORIZE +#ifndef WITH_VECTORIZATION // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 ) // message( "interacting particles pi=%lli and pj=%lli with r=%.3e in @@ -157,7 +157,7 @@ void runner_dopair_grav_new(struct runner *r, struct cell *ci, } /* loop over the parts in ci. */ -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION /* Pick up any leftovers. */ if (icount > 0) for (k = 0; k < icount; k++) @@ -335,7 +335,7 @@ void runner_dopair_grav(struct runner *r, struct cell *restrict ci, double pix[3]; float dx[3], r2; const int ti_current = r->e->ti_current; -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION int icount = 0; float r2q[VEC_SIZE] __attribute__((aligned(16))); float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); @@ -376,7 +376,7 @@ void runner_dopair_grav(struct runner *r, struct cell *restrict ci, } /* Compute the interaction. */ -#ifndef VECTORIZE +#ifndef WITH_VECTORIZATION // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 ) // message( "interacting particles pi=%lli and pj=%lli with r=%.3e in @@ -409,7 +409,7 @@ void runner_dopair_grav(struct runner *r, struct cell *restrict ci, } /* loop over the parts in ci. */ -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION /* Pick up any leftovers. */ if (icount > 0) for (k = 0; k < icount; k++) @@ -434,7 +434,7 @@ void runner_doself_grav(struct runner *r, struct cell *restrict c) { double pix[3] = {0.0, 0.0, 0.0}; float dx[3], r2; const int ti_current = r->e->ti_current; -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION int icount = 0; float r2q[VEC_SIZE] __attribute__((aligned(16))); float dxq[3 * VEC_SIZE] __attribute__((aligned(16))); @@ -466,7 +466,7 @@ void runner_doself_grav(struct runner *r, struct cell *restrict c) { } /* Compute the interaction. */ -#ifndef VECTORIZE +#ifndef WITH_VECTORIZATION // if ( pi->part->id == 3473472412525 || pj->part->id == 3473472412525 ) // message( "interacting particles pi=%lli and pj=%lli with r=%.3e." , @@ -497,7 +497,7 @@ void runner_doself_grav(struct runner *r, struct cell *restrict c) { } /* loop over the parts in c. */ -#ifdef VECTORIZE +#ifdef WITH_VECTORIZATION /* Pick up any leftovers. */ if (icount > 0) for (k = 0; k < icount; k++) diff --git a/src/vector.h b/src/vector.h index fa311f121f7b702f2288be0d561e520b52330457..53869fd2594227d3332d7435f47cdff7cded224b 100644 --- a/src/vector.h +++ b/src/vector.h @@ -25,6 +25,8 @@ #include "../config.h" +#ifdef WITH_VECTORIZATION + /* Need to check whether compiler supports this (IBM does not) This will prevent the macros to be defined and switch off explicit vectorization if the compiled does not support it */ @@ -38,8 +40,7 @@ __attribute__((vector_size((elcount) * sizeof(type)))) type /* So what will the vector size be? */ -#ifdef __MIC__ -#define VECTORIZE +#ifdef HAVE_AVX512_F #define VEC_HAVE_GATHER #define VEC_SIZE 16 #define VEC_FLOAT __m512 @@ -85,8 +86,7 @@ .f[6] = a, .f[7] = a, .f[8] = a, .f[9] = a, .f[10] = a, .f[11] = a, \ .f[12] = a, .f[13] = a, .f[14] = a, .f[15] = a \ } -#elif defined(NO__AVX__) -#define VECTORIZE +#elif defined(HAVE_AVX) #define VEC_SIZE 8 #define VEC_FLOAT __m256 #define VEC_DBL __m256d @@ -118,12 +118,11 @@ .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a, .f[4] = a, .f[5] = a, \ .f[6] = a, .f[7] = a \ } -#ifdef __AVX2__ +#ifdef HAVE_AVX2 #define VEC_HAVE_GATHER #define vec_gather(base, offsets) _mm256_i32gather_ps(base, offsets.m, 1) #endif -#elif defined(NO__SSE2__) -#define VECTORIZE +#elif defined(HAVE_SSE2) #define VEC_SIZE 4 #define VEC_FLOAT __m128 #define VEC_DBL __m128d @@ -157,7 +156,6 @@ #endif /* Define the composite types for element access. */ -#ifdef VECTORIZE typedef union { VEC_FLOAT v; VEC_DBL vd; @@ -166,8 +164,12 @@ typedef union { double d[VEC_SIZE / 2]; int i[VEC_SIZE]; } vector; -#endif -#endif +#else +/* Needed for cache alignment. */ +#define VEC_SIZE 16 +#endif /* WITH_VECTORIZATION */ + +#endif /* VEC_MACRO */ #endif /* SWIFT_VECTOR_H */ diff --git a/tests/Makefile.am b/tests/Makefile.am index 163a9ad096ddc2a79489de2930bb1d4288d1eb3e..64496e0e420b3db591e4f00b8fd697050d761514 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -22,7 +22,7 @@ AM_LDFLAGS = ../src/.libs/libswiftsim.a $(HDF5_LDFLAGS) $(HDF5_LIBS) # List of programs and scripts to run in the test suite TESTS = testGreetings testReading.sh testSingle testPair.sh testPairPerturbed.sh \ - test27cells.sh test27cellsPerturbed.sh testParser.sh testKernel + test27cells.sh test27cellsPerturbed.sh testParser.sh testKernel testSPHStep # List of test programs to compile check_PROGRAMS = testGreetings testReading testSingle testTimeIntegration \ diff --git a/tests/test27cells.c b/tests/test27cells.c index 3e2f11c7aabac0a7ff2df19f2c48f9f81ea55df5..b3bfdd18bd9165d68ecb380e31b9dee094c80f12 100644 --- a/tests/test27cells.c +++ b/tests/test27cells.c @@ -126,7 +126,6 @@ struct cell *make_cell(size_t n, double *offset, double size, double h, cell->sorted = 0; cell->sort = NULL; cell->sortsize = 0; - runner_do_sort(NULL, cell, 0x1FFF, 0); return cell; } @@ -331,6 +330,8 @@ int main(int argc, char *argv[]) { double offset[3] = {i * size, j * size, k * size}; cells[i * 9 + j * 3 + k] = make_cell(particles, offset, size, h, rho, &partId, perturbation, vel); + + runner_do_sort(&runner, cells[i * 9 + j * 3 + k], 0x1FFF, 0); } } } @@ -345,6 +346,8 @@ int main(int argc, char *argv[]) { const ticks tic = getticks(); +#if defined(DEFAULT_SPH) || !defined(WITH_VECTORIZATION) + /* Run all the pairs */ for (int j = 0; j < 27; ++j) if (cells[j] != main_cell) @@ -353,6 +356,8 @@ int main(int argc, char *argv[]) { /* And now the self-interaction */ runner_doself1_density(&runner, main_cell); +#endif + const ticks toc = getticks(); time += toc - tic; @@ -377,6 +382,8 @@ int main(int argc, char *argv[]) { const ticks tic = getticks(); +#if defined(DEFAULT_SPH) || !defined(WITH_VECTORIZATION) + /* Run all the brute-force pairs */ for (int j = 0; j < 27; ++j) if (cells[j] != main_cell) pairs_all_density(&runner, main_cell, cells[j]); @@ -384,6 +391,8 @@ int main(int argc, char *argv[]) { /* And now the self-interaction */ self_all_density(&runner, main_cell); +#endif + const ticks toc = getticks(); /* Let's get physical ! */ diff --git a/tests/testKernel.c b/tests/testKernel.c index 182bae5334e1a5061e584212a31186dc4e7f0818..344c038be0020019cc989cca6969b4e65bfd973f 100644 --- a/tests/testKernel.c +++ b/tests/testKernel.c @@ -18,7 +18,6 @@ * ******************************************************************************/ -#define NO__AVX__ #include "kernel_hydro.h" #include "vector.h" @@ -53,6 +52,9 @@ int main() { printf("\nVector Output for VEC_SIZE=%d\n", VEC_SIZE); printf("-------------\n"); + +#ifdef WITH_VECORIZATION + for (int i = 0; i < numPoints; i += VEC_SIZE) { vector vx, vx_h; @@ -84,5 +86,7 @@ int main() { } printf("\nAll values are consistent\n"); + +#endif return 0; } diff --git a/tests/testPair.c b/tests/testPair.c index f9539fc1a444828c65b39e56618eb7bb98bd67de..2c0bd908cae4997250d40edb6ccfb8afeb8c5be1 100644 --- a/tests/testPair.c +++ b/tests/testPair.c @@ -91,7 +91,6 @@ struct cell *make_cell(size_t n, double *offset, double size, double h, cell->sorted = 0; cell->sort = NULL; cell->sortsize = 0; - runner_do_sort(NULL, cell, 0x1FFF, 0); return cell; } @@ -245,6 +244,9 @@ int main(int argc, char *argv[]) { for (size_t i = 0; i < type + 1; ++i) offset[i] = 1.; cj = make_cell(particles, offset, size, h, rho, &partId, perturbation); + runner_do_sort(&runner, ci, 0x1FFF, 0); + runner_do_sort(&runner, cj, 0x1FFF, 0); + time = 0; for (size_t i = 0; i < runs; ++i) { /* Zero the fields */ @@ -253,8 +255,10 @@ int main(int argc, char *argv[]) { tic = getticks(); +#if defined(DEFAULT_SPH) || !defined(WITH_VECTORIZATION) /* Run the test */ runner_dopair1_density(&runner, ci, cj); +#endif toc = getticks(); time += toc - tic; @@ -277,8 +281,10 @@ int main(int argc, char *argv[]) { tic = getticks(); +#if defined(DEFAULT_SPH) || !defined(WITH_VECTORIZATION) /* Run the brute-force test */ pairs_all_density(&runner, ci, cj); +#endif toc = getticks(); diff --git a/tests/testSPHStep.c b/tests/testSPHStep.c index 3af0c6ad1afdeab749a378153fd1a8e016f29659..12882347c2a7db1f8d89fd4b339722a8d0d43485 100644 --- a/tests/testSPHStep.c +++ b/tests/testSPHStep.c @@ -28,22 +28,23 @@ struct cell *make_cell(size_t N, float cellSize, int offset[3], int id_offset) { size_t count = N * N * N; struct cell *cell = malloc(sizeof(struct cell)); + bzero(cell, sizeof(struct cell)); struct part *part; struct xpart *xpart; float h; size_t x, y, z, size; size = count * sizeof(struct part); - if (posix_memalign((void **)&cell->parts, 32, size) != 0) { + if (posix_memalign((void **)&cell->parts, part_align, size) != 0) { error("couldn't allocate particles"); } size = count * sizeof(struct xpart); - if (posix_memalign((void **)&cell->xparts, 32, size) != 0) { + if (posix_memalign((void **)&cell->xparts, xpart_align, size) != 0) { error("couldn't allocate extended particles"); } - h = 1.127 * cellSize / N; + h = 1.2348 * cellSize / N; part = cell->parts; xpart = cell->xparts; @@ -61,6 +62,8 @@ struct cell *make_cell(size_t N, float cellSize, int offset[3], int id_offset) { part->h = h; part->id = x * N * N + y * N + z + id_offset; ++part; + part->ti_begin = 0; + part->ti_end = 1; } } } @@ -68,22 +71,35 @@ struct cell *make_cell(size_t N, float cellSize, int offset[3], int id_offset) { cell->split = 0; cell->h_max = h; cell->count = count; + cell->gcount = 0; + cell->dx_max = 0.; cell->h[0] = cellSize; cell->h[1] = cellSize; cell->h[2] = cellSize; + cell->ti_end_min = 1; + cell->ti_end_max = 1; + + cell->sorted = 0; + cell->sort = NULL; + cell->sortsize = 0; + return cell; } -#ifdef DEFAULT_SPH - /* Just a forward declaration... */ void runner_doself1_density(struct runner *r, struct cell *ci); void runner_doself2_force(struct runner *r, struct cell *ci); +void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj); +void runner_dopair2_force(struct runner *r, struct cell *ci, struct cell *cj); /* Run a full time step integration for one cell */ int main() { +#ifndef DEFAULT_SPH + return 0; +#else + int i, j, k, offset[3]; struct part *p; @@ -92,6 +108,13 @@ int main() { float rho = 2.; float P = 1.; + /* Initialize CPU frequency, this also starts time. */ + unsigned long long cpufreq = 0; + clocks_set_cpufreq(cpufreq); + + /* Get some randomness going */ + srand(0); + /* Create cells */ struct cell *cells[27]; for (i = 0; i < 3; i++) @@ -117,7 +140,13 @@ int main() { struct cell *ci = cells[13]; /* Create the infrastructure */ + struct space space; + space.periodic = 0; + space.h_max = 1.; + struct engine e; + e.s = &space; + struct runner r; r.e = &e; @@ -125,29 +154,44 @@ int main() { e.timeBegin = 0.; e.timeEnd = 1.; e.timeOld = 0.; - e.time = 0.; - e.dt_min = 0.; - e.dt_max = 1e10; + e.time = 0.1f; + e.ti_current = 1; /* The tracked particle */ p = &(ci->parts[N * N * N / 2 + N * N / 2 + N / 2]); message("Studying particle p->id=%lld", p->id); + /* Sort the particles */ + for (j = 0; j < 27; ++j) { + runner_do_sort(&r, cells[j], 0x1FFF, 0); + } + + message("Sorting done"); + /* Initialise the particles */ for (j = 0; j < 27; ++j) { - runner_doinit(&r, cells[j], 0); + runner_do_init(&r, cells[j], 0); } + message("Init done"); + /* Compute density */ runner_doself1_density(&r, ci); + message("Self done"); + for (int j = 0; j < 27; ++j) + if (cells[j] != ci) runner_dopair1_density(&r, ci, cells[j]); + + message("Density done"); + + /* Ghost task */ runner_do_ghost(&r, ci); message("h=%f rho=%f N_ngb=%f", p->h, p->rho, p->density.wcount); message("c=%f", p->force.c); runner_doself2_force(&r, ci); - runner_dokick(&r, ci, 1); + runner_do_kick(&r, ci, 1); message("ti_end=%d", p->ti_end); @@ -155,10 +199,6 @@ int main() { free(ci->xparts); return 0; -} - -#else - -int main() { return 0; } #endif +}