diff --git a/src/engine.c b/src/engine.c index eb936f61b296c94d933855de874cc3949dce89bc..121d1419c337a2644f349cb137169031aab83e4d 100644 --- a/src/engine.c +++ b/src/engine.c @@ -3903,61 +3903,61 @@ void engine_split(struct engine *e, struct partition *initial_partition) { * @param e The #engine. */ void engine_dump_snapshot(struct engine *e) { - - struct clocks_time time1, time2; - clocks_gettime(&time1); - -#ifdef SWIFT_DEBUG_CHECKS - /* Check that all cells have been drifted to the current time. - * That can include cells that have not - * previously been active on this rank. */ - space_check_drift_point(e->s, e->ti_current, /* check_mpole=*/0); - - /* Be verbose about this */ - if (e->nodeID == 0) { - if (e->policy & engine_policy_cosmology) - message("Dumping snapshot at a=%e", - exp(e->ti_current * e->time_base) * e->cosmology->a_begin); - else - message("Dumping snapshot at t=%e", - e->ti_current * e->time_base + e->time_begin); - } -#else - if (e->verbose) { - if (e->policy & engine_policy_cosmology) - message("Dumping snapshot at a=%e", - exp(e->ti_current * e->time_base) * e->cosmology->a_begin); - else - message("Dumping snapshot at t=%e", - e->ti_current * e->time_base + e->time_begin); - } -#endif - -/* Dump... */ -#if defined(HAVE_HDF5) -#if defined(WITH_MPI) -#if defined(HAVE_PARALLEL_HDF5) - write_output_parallel(e, e->snapshot_base_name, e->internal_units, - e->snapshot_units, e->nodeID, e->nr_nodes, - MPI_COMM_WORLD, MPI_INFO_NULL); -#else - write_output_serial(e, e->snapshot_base_name, e->internal_units, - e->snapshot_units, e->nodeID, e->nr_nodes, MPI_COMM_WORLD, - MPI_INFO_NULL); -#endif -#else - write_output_single(e, e->snapshot_base_name, e->internal_units, - e->snapshot_units); -#endif -#endif - - /* Flag that we dumped a snapshot */ - e->step_props |= engine_step_prop_snapshot; - - clocks_gettime(&time2); - if (e->verbose) - message("writing particle properties took %.3f %s.", - (float)clocks_diff(&time1, &time2), clocks_getunit()); +// +// struct clocks_time time1, time2; +// clocks_gettime(&time1); +// +//#ifdef SWIFT_DEBUG_CHECKS +// /* Check that all cells have been drifted to the current time. +// * That can include cells that have not +// * previously been active on this rank. */ +// space_check_drift_point(e->s, e->ti_current, /* check_mpole=*/0); +// +// /* Be verbose about this */ +// if (e->nodeID == 0) { +// if (e->policy & engine_policy_cosmology) +// message("Dumping snapshot at a=%e", +// exp(e->ti_current * e->time_base) * e->cosmology->a_begin); +// else +// message("Dumping snapshot at t=%e", +// e->ti_current * e->time_base + e->time_begin); +// } +//#else +// if (e->verbose) { +// if (e->policy & engine_policy_cosmology) +// message("Dumping snapshot at a=%e", +// exp(e->ti_current * e->time_base) * e->cosmology->a_begin); +// else +// message("Dumping snapshot at t=%e", +// e->ti_current * e->time_base + e->time_begin); +// } +//#endif +// +///* Dump... */ +//#if defined(HAVE_HDF5) +//#if defined(WITH_MPI) +//#if defined(HAVE_PARALLEL_HDF5) +// write_output_parallel(e, e->snapshot_base_name, e->internal_units, +// e->snapshot_units, e->nodeID, e->nr_nodes, +// MPI_COMM_WORLD, MPI_INFO_NULL); +//#else +// write_output_serial(e, e->snapshot_base_name, e->internal_units, +// e->snapshot_units, e->nodeID, e->nr_nodes, MPI_COMM_WORLD, +// MPI_INFO_NULL); +//#endif +//#else +// write_output_single(e, e->snapshot_base_name, e->internal_units, +// e->snapshot_units); +//#endif +//#endif +// +// /* Flag that we dumped a snapshot */ +// e->step_props |= engine_step_prop_snapshot; +// +// clocks_gettime(&time2); +// if (e->verbose) +// message("writing particle properties took %.3f %s.", +// (float)clocks_diff(&time1, &time2), clocks_getunit()); } /** diff --git a/src/kernel_hydro.h b/src/kernel_hydro.h index 3d5ec6ac84a77941739f5d3b57ed0340c831c061..bff83fa569a78b638decdf486ad4852538f05aa2 100644 --- a/src/kernel_hydro.h +++ b/src/kernel_hydro.h @@ -517,6 +517,7 @@ __attribute__((always_inline)) INLINE static void kernel_deval_1_vec( * when the mask is 0.*/ w->v = vec_blend(mask_reg, w->v, w2.v); dw_dx->v = vec_blend(mask_reg, dw_dx->v, dw_dx2.v); + print_vector(*w); #else #error "Vectorisation not supported for this kernel!!!" diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index 182e81e99c442cf5e27405ea71321e22e7f374e3..cb5c96cc3776f01b0903074266778188a0a76d9e 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -1491,6 +1491,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, v_r2.v = vec_mul(v_dx.v, v_dx.v); v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); + print_vector(v_r2); mask_t v_doi_mask; diff --git a/src/vector.h b/src/vector.h index 9e56e9e2e3545f8877df0c9098287dfdf1a30112..60798af8e59382cc40c4f0e811ee5e92c692eb3c 100644 --- a/src/vector.h +++ b/src/vector.h @@ -19,6 +19,7 @@ ******************************************************************************/ #ifndef SWIFT_VECTOR_H #define SWIFT_VECTOR_H +#include <stdio.h> /* Config parameters. */ #include "../config.h" @@ -405,24 +406,26 @@ #include <arm_neon.h> #define VEC_SIZE 4 #define VEC_FLOAT float32x4_t -#define VEC_DB float64x2 +#define VEC_DBL float64x2_t #define VEC_INT int32x4_t +#define VEC_UINT uint32x4_t #define vec_load(a) vld1q_f32(a) #define vec_store(a, adds) vst1q_f32(addr,a) -#define vec_set_zero() vmovq_n_f32(0.0f) -#define vec_set1(a) vld1q_dup_f32(a) +#define vec_setzero() vmovq_n_f32(0.0f) +#define vec_set1(a) vmovq_n_f32(a) +#define vec_setint1(a) vmovq_n_s32(a) #define vec_add(a, b) vaddq_f32(a,b) -#define vec_add_mask(a, b, mask) vec_add(a, ((vector)vec_and(((vector)b).m,mask)).v) +#define vec_mask_add(a, b, mask) vec_add(a, ((vector)vec_and(((vector)b).m,mask)).v) #define vec_sub(a, b) vsubq_f32(a, b) #define vec_mask_sub(a, b, mask) vec_sub(a, ((vector)vec_and(((vector)b).m,mask)).v) #define vec_mul(a, b) vmulq_f32(a, b) #define vec_div(a, b) vdivq_f32(a, b) -#define vec_sqrt(a) vsqrt_f32(a) +#define vec_sqrt(a) vsqrtq_f32(a) #define vec_rcp(a) vrecpeq_f32(a) -#define vec_rsqrt(a) vrsqrte_f32(a) +#define vec_rsqrt(a) vrsqrteq_f32(a) #define vec_ftoi(a) vcvtq_s32_f32(a) -#define vec_fmin(a, b) vpmin_f32(a, b) -#define vec_fmax(a, b) vpmax_f32(q, b) +#define vec_fmin(a, b) vpminq_f32(a, b) +#define vec_fmax(a, b) vpmaxq_f32(a, b) #define vec_fabs(a) vabsq_f32(a) #define vec_floor(a) vcvtq_f32_s32(vcvtmq_s32_f32(a)) #define vec_cmp_gt(a, b) vcgtzq_f32(vec_sub(a,b)) @@ -431,27 +434,30 @@ #define vec_cmp_lt(a, b) vcgtzq_f32(vec_sub(b,a)) #define vec_cmp_lte(a, b) vcgezq_f32(vec_sub(b,a)) #define vec_cmp_result(a) vec_not(vceqzq_f32(a)) -#define vec_is_mask_true(a) vec_not(vceqzq_f32(a.v)) -#define vec_and(a, b) vandq_s32(a, b) +#define vec_is_mask_true(a) vaddvq_s32(a.m) +// Write vector conversion function? +#define vec_create_mask(mask, cond) mask.m = ((vector)cond).m +#define vec_and(a, b) vandq_s32(((vector)a).m, ((vector)b).m) #define vec_mask_and(a, b) vec_and(a.v, b.v) -#define vec_and_mask(a, mask) vec_and( ((vector)a).v, mask ) +#define vec_and_mask(a, mask) vcvtq_f32_s32(vec_and(a, mask.v)) #define vec_init_mask_true(mask) mask.m = vec_setint1(0xFFFFFFFF) #define vec_combine_masks(mask1, mask2) \ - ({ mask1.v = vec_mask_and(mask1,mask2); }) + ({ mask1.v = vcvtq_f32_s32(vec_mask_and(mask1,mask2)); }) #define vec_zero_mask(mask) mask.v = vec_setzero() #define vec_pad_mask(mask, pad) \ for(int i = VEC_SIZE - (pad); i < VEC_SIZE; i++) mask.i[i] = 0 +// Change to vector conversion function? #define vec_blend(mask, a, b) \ - ((vector)vec_or( vec_and(mask.m, ((vector)b).m ), vec_and(vec_not(mask.v), ((vector)a).m ).v) -#define vec_or(a, b) vorrq_u32(a, b) -#define vec_not(a) vmvnq_u32(a) + ((vector)vec_or( vec_and(mask.m, ((vector)b).m ), vcvtq_s32_f32(((vector)vec_and(vec_not(mask.m), ((vector)a).m )).v))).v +#define vec_or(a, b) vorrq_s32(((vector)a).m, ((vector)b).m) +#define vec_not(a) vmvnq_s32(((vector)a).m) #define FILL_VEC(a) \ { .f[0] = a, .f[1] = a, .f[2] = a, .f[3] = a} -#define VEC_HADD(a,b) b += vaddvq_f32(a) +#define VEC_HADD(a,b) b += vaddvq_f32(((vector)a).v) -#define VEC_HMAX(a,b) b = max(b,vmaxvq_f32(a)) +#define VEC_HMAX(a,b) b = max(b,vmaxvq_f32(((vector)a).v)) #define vec_fma(a, b, c) vec_add(vec_mul(a,b),c) #define vec_fnma(a,b,c) vec_sub(c, vec_mul(a,b)) @@ -465,11 +471,16 @@ typedef union { VEC_FLOAT v; VEC_DBL vd; VEC_INT m; + VEC_UINT um; float f[VEC_SIZE]; double d[VEC_SIZE / 2]; int i[VEC_SIZE]; } vector; +inline void print_vector(vector v) { + printf("vector %.5e %.5e %.5e %.5e\n", v.f[0], v.f[1], v.f[2], v.f[3]); + fflush(stdout); +} /* Define the mask type depending on the instruction set used. */ #ifdef HAVE_AVX512_F typedef __mmask16 mask_t; diff --git a/tests/test27cells.c b/tests/test27cells.c index cc34f503304feb56799a2d31baa3416b940202d3..82ec184fafb711bc6ce3f710529a89ac0e52d148 100644 --- a/tests/test27cells.c +++ b/tests/test27cells.c @@ -385,13 +385,13 @@ int main(int argc, char *argv[]) { /* Choke on FP-exceptions */ #ifdef HAVE_FE_ENABLE_EXCEPT - feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); + //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); #endif /* Get some randomness going */ srand(0); - char c; + signed char c; while ((c = getopt(argc, argv, "m:s:h:p:n:r:t:d:f:v:")) != -1) { switch (c) { case 'h': @@ -594,11 +594,11 @@ int main(int argc, char *argv[]) { ticks face_time = timings[4] + timings[10] + timings[12] + timings[14] + timings[16] + timings[22]; - message("Corner calculations took : %15lli ticks.", corner_time / runs); - message("Edge calculations took : %15lli ticks.", edge_time / runs); - message("Face calculations took : %15lli ticks.", face_time / runs); - message("Self calculations took : %15lli ticks.", timings[13] / runs); - message("SWIFT calculation took : %15lli ticks.", time / runs); + message("Corner calculations took : %15lu ticks.", corner_time / runs); + message("Edge calculations took : %15lu ticks.", edge_time / runs); + message("Face calculations took : %15lu ticks.", face_time / runs); + message("Self calculations took : %15lu ticks.", timings[13] / runs); + message("SWIFT calculation took : %15lu ticks.", time / runs); /* Now perform a brute-force version for accuracy tests */ @@ -624,7 +624,7 @@ int main(int argc, char *argv[]) { dump_particle_fields(outputFileName, main_cell, cells); /* Output timing */ - message("Brute force calculation took : %15lli ticks.", toc - tic); + message("Brute force calculation took : %15lu ticks.", toc - tic); /* Clean things to make the sanitizer happy ... */ for (int i = 0; i < 27; ++i) clean_up(cells[i]); diff --git a/tests/testInteractions.c b/tests/testInteractions.c index e14fddd640764c7e22a217fb483791494ba4fae0..8046b734d2d38a64514c67df925f00d9cb06710e 100644 --- a/tests/testInteractions.c +++ b/tests/testInteractions.c @@ -405,9 +405,9 @@ void test_interactions(struct part test_part, struct part *parts, size_t count, if (check_results(pi_serial, pj_serial, pi_vec, pj_vec, count)) message("Differences found..."); - message("The serial interactions took : %15lli ticks.", + message("The serial interactions took : %15lu ticks.", serial_time / runs); - message("The vectorised interactions took : %15lli ticks.", vec_time / runs); + message("The vectorised interactions took : %15lu ticks.", vec_time / runs); message("Speed up: %15fx.", (double)(serial_time) / vec_time); } @@ -677,9 +677,9 @@ void test_force_interactions(struct part test_part, struct part *parts, if (check_results(pi_serial, pj_serial, pi_vec, pj_vec, count)) message("Differences found..."); - message("The serial interactions took : %15lli ticks.", + message("The serial interactions took : %15lu ticks.", serial_time / runs); - message("The vectorised interactions took : %15lli ticks.", vec_time / runs); + message("The vectorised interactions took : %15lu ticks.", vec_time / runs); message("Speed up: %15fx.", (double)(serial_time) / vec_time); } @@ -693,7 +693,7 @@ int main(int argc, char *argv[]) { /* Get some randomness going */ srand(0); - char c; + signed char c; while ((c = getopt(argc, argv, "h:s:n:r:")) != -1) { switch (c) { case 'h':