diff --git a/.gitignore b/.gitignore index c75a8f8fa2c16b8c092ab1a361b003d1e3384032..812aa06dd54ee8bb81e87796deb8ee05160fec72 100644 --- a/.gitignore +++ b/.gitignore @@ -55,7 +55,9 @@ tests/brute_force_125_standard.dat tests/swift_dopair_125_standard.dat tests/brute_force_125_perturbed.dat tests/swift_dopair_125_perturbed.dat -tests/brute_force_active.dat +tests/brute_force_pair_active.dat +tests/brute_force_dopair2_active.dat +tests/swift_dopair2_force_active.dat tests/brute_force_periodic_BC_perturbed.dat tests/swift_dopair_active.dat tests/test_nonsym_density_serial.dat diff --git a/src/cache.h b/src/cache.h index 22b79b1e49230c434a852da16512493e242ba3f9..3eb1e194dd4232319ac1d4a4323ca8099f044063 100644 --- a/src/cache.h +++ b/src/cache.h @@ -198,10 +198,7 @@ __attribute__((always_inline)) INLINE void cache_read_particles( swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); const struct part *restrict parts = ci->parts; - double loc[3]; - loc[0] = ci->loc[0]; - loc[1] = ci->loc[1]; - loc[2] = ci->loc[2]; + const double loc[3] = {ci->loc[0], ci->loc[1], ci->loc[2]}; /* Shift the particles positions to a local frame so single precision can be * used instead of double precision. */ @@ -210,7 +207,6 @@ __attribute__((always_inline)) INLINE void cache_read_particles( y[i] = (float)(parts[i].x[1] - loc[1]); z[i] = (float)(parts[i].x[2] - loc[2]); h[i] = parts[i].h; - m[i] = parts[i].mass; vx[i] = parts[i].v[0]; vy[i] = parts[i].v[1]; @@ -254,10 +250,7 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles( SWIFT_CACHE_ALIGNMENT); const struct part *restrict parts = ci->parts; - double loc[3]; - loc[0] = ci->loc[0]; - loc[1] = ci->loc[1]; - loc[2] = ci->loc[2]; + const double loc[3] = {ci->loc[0], ci->loc[1], ci->loc[2]}; /* Shift the particles positions to a local frame so single precision can be * used instead of double precision. */ @@ -266,12 +259,10 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles( y[i] = (float)(parts[i].x[1] - loc[1]); z[i] = (float)(parts[i].x[2] - loc[2]); h[i] = parts[i].h; - m[i] = parts[i].mass; vx[i] = parts[i].v[0]; vy[i] = parts[i].v[1]; vz[i] = parts[i].v[2]; - rho[i] = parts[i].rho; grad_h[i] = parts[i].force.f; pOrho2[i] = parts[i].force.P_over_rho2; @@ -296,7 +287,6 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles( * @param shift The amount to shift the particle positions to account for BCs * @param first_pi The first particle in cell ci that is in range. * @param last_pj The last particle in cell cj that is in range. - * @param num_vec_proc Number of vectors that will be used to process the * interaction. */ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( @@ -304,38 +294,39 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( struct cache *restrict const ci_cache, struct cache *restrict const cj_cache, const struct entry *restrict sort_i, const struct entry *restrict sort_j, const double *restrict const shift, - int *first_pi, int *last_pj, const int num_vec_proc) { + int *first_pi, int *last_pj) { + + /* Make the number of particles to be read a multiple of the vector size. + * This eliminates serial remainder loops where possible when populating the + * cache. */ - int idx; - /* Pad number of particles read to the vector size. */ - int rem = (ci->count - *first_pi) % (num_vec_proc * VEC_SIZE); + /* Is the number of particles to read a multiple of the vector size? */ + int rem = (ci->count - *first_pi) % VEC_SIZE; if (rem != 0) { - int pad = (num_vec_proc * VEC_SIZE) - rem; + int pad = VEC_SIZE - rem; + /* Decrease first_pi if there are particles in the cell left to read. */ if (*first_pi - pad >= 0) *first_pi -= pad; } - rem = *last_pj % (num_vec_proc * VEC_SIZE); + rem = (*last_pj + 1) % VEC_SIZE; if (rem != 0) { - int pad = (num_vec_proc * VEC_SIZE) - rem; + int pad = VEC_SIZE - rem; + /* Increase last_pj if there are particles in the cell left to read. */ if (*last_pj + pad < cj->count) *last_pj += pad; } - int first_pi_align = *first_pi; - int last_pj_align = *last_pj; + /* Get some local pointers */ + const int first_pi_align = *first_pi; + const int last_pj_align = *last_pj; const struct part *restrict parts_i = ci->parts; const struct part *restrict parts_j = cj->parts; - double loc[3]; - loc[0] = cj->loc[0]; - loc[1] = cj->loc[1]; - loc[2] = cj->loc[2]; - /* Shift ci particles for boundary conditions and location of cell.*/ - double total_ci_shift[3]; - total_ci_shift[0] = loc[0] + shift[0]; - total_ci_shift[1] = loc[1] + shift[1]; - total_ci_shift[2] = loc[2] + shift[2]; + /* Shift particles to the local frame and account for boundary conditions.*/ + const double total_ci_shift[3] = { + cj->loc[0] + shift[0], cj->loc[1] + shift[1], cj->loc[2] + shift[2]}; + const double total_cj_shift[3] = {cj->loc[0], cj->loc[1], cj->loc[2]}; /* Let the compiler know that the data is aligned and create pointers to the * arrays inside the cache. */ @@ -349,19 +340,15 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); int ci_cache_count = ci->count - first_pi_align; + /* Shift the particles positions to a local frame (ci frame) so single - * precision - * can be - * used instead of double precision. Also shift the cell ci, particles - * positions - * due to BCs but leave cell cj. */ + * precision can be used instead of double precision. */ for (int i = 0; i < ci_cache_count; i++) { - idx = sort_i[i + first_pi_align].i; + const int idx = sort_i[i + first_pi_align].i; x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]); y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]); z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]); h[i] = parts_i[idx].h; - m[i] = parts_i[idx].mass; vx[i] = parts_i[idx].v[0]; vy[i] = parts_i[idx].v[1]; @@ -384,36 +371,42 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( "is not within " "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + " "2*space_maxreldx)]. x=%f, ci->width[0]=%f", - ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, x[i], - ci->width[0]); + ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1], + cj->loc[2], i, x[i], ci->width[0]); if (y[i] > shift_threshold_y || y[i] < -shift_threshold_y) error( "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d y pos " "is not within " "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + " "2*space_maxreldx)]. y=%f, ci->width[1]=%f", - ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, y[i], - ci->width[1]); + ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1], + cj->loc[2], i, y[i], ci->width[1]); if (z[i] > shift_threshold_z || z[i] < -shift_threshold_z) error( "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d z pos " "is not within " "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + " "2*space_maxreldx)]. z=%f, ci->width[2]=%f", - ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, z[i], - ci->width[2]); + ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1], + cj->loc[2], i, z[i], ci->width[2]); } #endif /* Pad cache with fake particles that exist outside the cell so will not - * interact.*/ - float fake_pix = 2.0f * parts_i[sort_i[ci->count - 1].i].x[0]; + * interact. We use values of the same magnitude (but negative!) as the real + * particles to avoid overflow problems. */ + const double max_dx = max(ci->dx_max_part, cj->dx_max_part); + const float pos_padded[3] = {-(2. * ci->width[0] + max_dx), + -(2. * ci->width[1] + max_dx), + -(2. * ci->width[2] + max_dx)}; + const float h_padded = ci->parts[0].h; + for (int i = ci->count - first_pi_align; i < ci->count - first_pi_align + VEC_SIZE; i++) { - x[i] = fake_pix; - y[i] = 1.f; - z[i] = 1.f; - h[i] = 1.f; + x[i] = pos_padded[0]; + y[i] = pos_padded[1]; + z[i] = pos_padded[2]; + h[i] = h_padded; m[i] = 1.f; vx[i] = 1.f; @@ -433,12 +426,11 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT); for (int i = 0; i <= last_pj_align; i++) { - idx = sort_j[i].i; - xj[i] = (float)(parts_j[idx].x[0] - loc[0]); - yj[i] = (float)(parts_j[idx].x[1] - loc[1]); - zj[i] = (float)(parts_j[idx].x[2] - loc[2]); + const int idx = sort_j[i].i; + xj[i] = (float)(parts_j[idx].x[0] - total_cj_shift[0]); + yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]); + zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]); hj[i] = parts_j[idx].h; - mj[i] = parts_j[idx].mass; vxj[i] = parts_j[idx].v[0]; vyj[i] = parts_j[idx].v[1]; @@ -454,40 +446,228 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( "pos is not within " "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + " "2*space_maxreldx)]. xj=%f, ci->width[0]=%f", - ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, xj[i], - ci->width[0]); + ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1], + cj->loc[2], i, xj[i], ci->width[0]); if (yj[i] > shift_threshold_y || yj[i] < -shift_threshold_y) error( "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d yj " "pos is not within " "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + " "2*space_maxreldx)]. yj=%f, ci->width[1]=%f", - ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, yj[i], - ci->width[1]); + ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1], + cj->loc[2], i, yj[i], ci->width[1]); if (zj[i] > shift_threshold_z || zj[i] < -shift_threshold_z) error( "Error: ci->loc[%lf,%lf,%lf], cj->loc[%lf,%lf,%lf] Particle %d zj " "pos is not within " "[-4*ci->width*(1 + 2*space_maxreldx), 4*ci->width*(1 + " "2*space_maxreldx)]. zj=%f, ci->width[2]=%f", - ci->loc[0], ci->loc[1], ci->loc[2], loc[0], loc[1], loc[2], i, zj[i], - ci->width[2]); + ci->loc[0], ci->loc[1], ci->loc[2], cj->loc[0], cj->loc[1], + cj->loc[2], i, zj[i], ci->width[2]); } #endif /* Pad cache with fake particles that exist outside the cell so will not - * interact.*/ - float fake_pjx = 2.0f * cj->parts[sort_j[cj->count - 1].i].x[0]; + * interact. We use values of the same magnitude (but negative!) as the real + * particles to avoid overflow problems. */ + const float pos_padded_j[3] = {-(2. * cj->width[0] + max_dx), + -(2. * cj->width[1] + max_dx), + -(2. * cj->width[2] + max_dx)}; + const float h_padded_j = cj->parts[0].h; + for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) { - xj[i] = fake_pjx; - yj[i] = 1.f; - zj[i] = 1.f; - hj[i] = 1.f; + xj[i] = pos_padded_j[0]; + yj[i] = pos_padded_j[1]; + zj[i] = pos_padded_j[2]; + hj[i] = h_padded_j; + mj[i] = 1.f; + vxj[i] = 1.f; + vyj[i] = 1.f; + vzj[i] = 1.f; + } +} + +/** + * @brief Populate caches by only reading particles that are within range of + * each other within the adjoining cell.Also read the particles into the cache + * in sorted order. + * + * @param ci The i #cell. + * @param cj The j #cell. + * @param ci_cache The #cache for cell ci. + * @param cj_cache The #cache for cell cj. + * @param sort_i The array of sorted particle indices for cell ci. + * @param sort_j The array of sorted particle indices for cell ci. + * @param shift The amount to shift the particle positions to account for BCs + * @param first_pi The first particle in cell ci that is in range. + * @param last_pj The last particle in cell cj that is in range. + * interaction. + */ +__attribute__((always_inline)) INLINE void +cache_read_two_partial_cells_sorted_force( + const struct cell *const ci, const struct cell *const cj, + struct cache *const ci_cache, struct cache *const cj_cache, + const struct entry *restrict sort_i, const struct entry *restrict sort_j, + const double *const shift, int *first_pi, int *last_pj) { + + /* Make the number of particles to be read a multiple of the vector size. + * This eliminates serial remainder loops where possible when populating the + * cache. */ + + /* Is the number of particles to read a multiple of the vector size? */ + int rem = (ci->count - *first_pi) % VEC_SIZE; + if (rem != 0) { + int pad = VEC_SIZE - rem; + + /* Decrease first_pi if there are particles in the cell left to read. */ + if (*first_pi - pad >= 0) *first_pi -= pad; + } + + rem = (*last_pj + 1) % VEC_SIZE; + if (rem != 0) { + int pad = VEC_SIZE - rem; + + /* Increase last_pj if there are particles in the cell left to read. */ + if (*last_pj + pad < cj->count) *last_pj += pad; + } + + /* Get some local pointers */ + const int first_pi_align = *first_pi; + const int last_pj_align = *last_pj; + const struct part *restrict parts_i = ci->parts; + const struct part *restrict parts_j = cj->parts; + + /* Shift particles to the local frame and account for boundary conditions.*/ + const double total_ci_shift[3] = { + cj->loc[0] + shift[0], cj->loc[1] + shift[1], cj->loc[2] + shift[2]}; + const double total_cj_shift[3] = {cj->loc[0], cj->loc[1], cj->loc[2]}; + + /* Let the compiler know that the data is aligned and create pointers to the + * arrays inside the cache. */ + swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, rho, ci_cache->rho, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, grad_h, ci_cache->grad_h, + SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, pOrho2, ci_cache->pOrho2, + SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, balsara, ci_cache->balsara, + SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, soundspeed, ci_cache->soundspeed, + SWIFT_CACHE_ALIGNMENT); + + int ci_cache_count = ci->count - first_pi_align; + /* Shift the particles positions to a local frame (ci frame) so single + * precision can be used instead of double precision. */ + for (int i = 0; i < ci_cache_count; i++) { + + const int idx = sort_i[i + first_pi_align].i; + x[i] = (float)(parts_i[idx].x[0] - total_ci_shift[0]); + y[i] = (float)(parts_i[idx].x[1] - total_ci_shift[1]); + z[i] = (float)(parts_i[idx].x[2] - total_ci_shift[2]); + h[i] = parts_i[idx].h; + m[i] = parts_i[idx].mass; + vx[i] = parts_i[idx].v[0]; + vy[i] = parts_i[idx].v[1]; + vz[i] = parts_i[idx].v[2]; + rho[i] = parts_i[idx].rho; + grad_h[i] = parts_i[idx].force.f; + pOrho2[i] = parts_i[idx].force.P_over_rho2; + balsara[i] = parts_i[idx].force.balsara; + soundspeed[i] = parts_i[idx].force.soundspeed; + } + + /* Pad cache with fake particles that exist outside the cell so will not + * interact. We use values of the same magnitude (but negative!) as the real + * particles to avoid overflow problems. */ + const double max_dx = max(ci->dx_max_part, cj->dx_max_part); + const float pos_padded[3] = {-(2. * ci->width[0] + max_dx), + -(2. * ci->width[1] + max_dx), + -(2. * ci->width[2] + max_dx)}; + const float h_padded = ci->parts[0].h; + + for (int i = ci->count - first_pi_align; + i < ci->count - first_pi_align + VEC_SIZE; i++) { + x[i] = pos_padded[0]; + y[i] = pos_padded[1]; + z[i] = pos_padded[2]; + h[i] = h_padded; + m[i] = 1.f; + vx[i] = 1.f; + vy[i] = 1.f; + vz[i] = 1.f; + rho[i] = 1.f; + grad_h[i] = 1.f; + pOrho2[i] = 1.f; + balsara[i] = 1.f; + soundspeed[i] = 1.f; + } + + /* Let the compiler know that the data is aligned and create pointers to the + * arrays inside the cache. */ + swift_declare_aligned_ptr(float, xj, cj_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, yj, cj_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, zj, cj_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, hj, cj_cache->h, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, mj, cj_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vxj, cj_cache->vx, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vyj, cj_cache->vy, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vzj, cj_cache->vz, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, rhoj, cj_cache->rho, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, grad_hj, cj_cache->grad_h, + SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, pOrho2j, cj_cache->pOrho2, + SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, balsaraj, cj_cache->balsara, + SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, soundspeedj, cj_cache->soundspeed, + SWIFT_CACHE_ALIGNMENT); + + for (int i = 0; i <= last_pj_align; i++) { + const int idx = sort_j[i].i; + xj[i] = (float)(parts_j[idx].x[0] - total_cj_shift[0]); + yj[i] = (float)(parts_j[idx].x[1] - total_cj_shift[1]); + zj[i] = (float)(parts_j[idx].x[2] - total_cj_shift[2]); + hj[i] = parts_j[idx].h; + mj[i] = parts_j[idx].mass; + vxj[i] = parts_j[idx].v[0]; + vyj[i] = parts_j[idx].v[1]; + vzj[i] = parts_j[idx].v[2]; + rhoj[i] = parts_j[idx].rho; + grad_hj[i] = parts_j[idx].force.f; + pOrho2j[i] = parts_j[idx].force.P_over_rho2; + balsaraj[i] = parts_j[idx].force.balsara; + soundspeedj[i] = parts_j[idx].force.soundspeed; + } + /* Pad cache with fake particles that exist outside the cell so will not + * interact. We use values of the same magnitude (but negative!) as the real + * particles to avoid overflow problems. */ + const float pos_padded_j[3] = {-(2. * cj->width[0] + max_dx), + -(2. * cj->width[1] + max_dx), + -(2. * cj->width[2] + max_dx)}; + const float h_padded_j = cj->parts[0].h; + + for (int i = last_pj_align + 1; i < last_pj_align + 1 + VEC_SIZE; i++) { + xj[i] = pos_padded_j[0]; + yj[i] = pos_padded_j[1]; + zj[i] = pos_padded_j[2]; + hj[i] = h_padded_j; mj[i] = 1.f; vxj[i] = 1.f; vyj[i] = 1.f; vzj[i] = 1.f; + rhoj[i] = 1.f; + grad_hj[i] = 1.f; + pOrho2j[i] = 1.f; + balsaraj[i] = 1.f; + soundspeedj[i] = 1.f; } } @@ -506,6 +686,11 @@ static INLINE void cache_clean(struct cache *c) { free(c->vz); free(c->h); free(c->max_index); + free(c->rho); + free(c->grad_h); + free(c->pOrho2); + free(c->balsara); + free(c->soundspeed); } } diff --git a/src/hydro/Gadget2/hydro_iact.h b/src/hydro/Gadget2/hydro_iact.h index 3851dd6e894eafff29d35942ab0f364b5919e029..666e0be835f114959f2f2a074900700b383e10fe 100644 --- a/src/hydro/Gadget2/hydro_iact.h +++ b/src/hydro/Gadget2/hydro_iact.h @@ -170,17 +170,15 @@ runner_iact_nonsym_1_vec_density(vector *r2, vector *dx, vector *dy, vector *dz, mask_t mask) { vector r, ri, ui, wi, wi_dx; - vector mj; vector dvx, dvy, dvz; - vector vjx, vjy, vjz; vector dvdr; vector curlvrx, curlvry, curlvrz; /* Fill the vectors. */ - mj.v = vec_load(Mj); - vjx.v = vec_load(Vjx); - vjy.v = vec_load(Vjy); - vjz.v = vec_load(Vjz); + const vector mj = vector_load(Mj); + const vector vjx = vector_load(Vjx); + const vector vjy = vector_load(Vjy); + const vector vjz = vector_load(Vjz); /* Get the radius and inverse radius. */ ri = vec_reciprocal_sqrt(*r2); @@ -245,38 +243,34 @@ runner_iact_nonsym_2_vec_density(float *R2, float *Dx, float *Dy, float *Dz, vector *curlvySum, vector *curlvzSum, mask_t mask, mask_t mask2, short mask_cond) { - vector r, ri, r2, ui, wi, wi_dx; - vector mj; - vector dx, dy, dz, dvx, dvy, dvz; - vector vjx, vjy, vjz; + vector r, ri, ui, wi, wi_dx; + vector dvx, dvy, dvz; vector dvdr; vector curlvrx, curlvry, curlvrz; - vector r_2, ri2, r2_2, ui2, wi2, wi_dx2; - vector mj2; - vector dx2, dy2, dz2, dvx2, dvy2, dvz2; - vector vjx2, vjy2, vjz2; + vector r_2, ri2, ui2, wi2, wi_dx2; + vector dvx2, dvy2, dvz2; vector dvdr2; vector curlvrx2, curlvry2, curlvrz2; /* Fill the vectors. */ - mj.v = vec_load(Mj); - mj2.v = vec_load(&Mj[VEC_SIZE]); - vjx.v = vec_load(Vjx); - vjx2.v = vec_load(&Vjx[VEC_SIZE]); - vjy.v = vec_load(Vjy); - vjy2.v = vec_load(&Vjy[VEC_SIZE]); - vjz.v = vec_load(Vjz); - vjz2.v = vec_load(&Vjz[VEC_SIZE]); - dx.v = vec_load(Dx); - dx2.v = vec_load(&Dx[VEC_SIZE]); - dy.v = vec_load(Dy); - dy2.v = vec_load(&Dy[VEC_SIZE]); - dz.v = vec_load(Dz); - dz2.v = vec_load(&Dz[VEC_SIZE]); + const vector mj = vector_load(Mj); + const vector mj2 = vector_load(&Mj[VEC_SIZE]); + const vector vjx = vector_load(Vjx); + const vector vjx2 = vector_load(&Vjx[VEC_SIZE]); + const vector vjy = vector_load(Vjy); + const vector vjy2 = vector_load(&Vjy[VEC_SIZE]); + const vector vjz = vector_load(Vjz); + const vector vjz2 = vector_load(&Vjz[VEC_SIZE]); + const vector dx = vector_load(Dx); + const vector dx2 = vector_load(&Dx[VEC_SIZE]); + const vector dy = vector_load(Dy); + const vector dy2 = vector_load(&Dy[VEC_SIZE]); + const vector dz = vector_load(Dz); + const vector dz2 = vector_load(&Dz[VEC_SIZE]); /* Get the radius and inverse radius. */ - r2.v = vec_load(R2); - r2_2.v = vec_load(&R2[VEC_SIZE]); + const vector r2 = vector_load(R2); + const vector r2_2 = vector_load(&R2[VEC_SIZE]); ri = vec_reciprocal_sqrt(r2); ri2 = vec_reciprocal_sqrt(r2_2); r.v = vec_mul(r2.v, ri.v); @@ -592,30 +586,29 @@ runner_iact_nonsym_1_vec_force( #ifdef WITH_VECTORIZATION vector r, ri; - vector vjx, vjy, vjz, dvx, dvy, dvz; - vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj; + vector dvx, dvy, dvz; vector xi, xj; vector hid_inv, hjd_inv; vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr; vector piax, piay, piaz; vector pih_dt; vector v_sig; - vector omega_ij, mu_ij, fac_mu, balsara; + vector omega_ij, mu_ij, balsara; vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt; /* Fill vectors. */ - vjx.v = vec_load(Vjx); - vjy.v = vec_load(Vjy); - vjz.v = vec_load(Vjz); - mj.v = vec_load(Mj); - - pjrho.v = vec_load(Pjrho); - grad_hj.v = vec_load(Grad_hj); - pjPOrho2.v = vec_load(PjPOrho2); - balsara_j.v = vec_load(Balsara_j); - cj.v = vec_load(Cj); - - fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */ + const vector vjx = vector_load(Vjx); + const vector vjy = vector_load(Vjy); + const vector vjz = vector_load(Vjz); + const vector mj = vector_load(Mj); + const vector pjrho = vector_load(Pjrho); + const vector grad_hj = vector_load(Grad_hj); + const vector pjPOrho2 = vector_load(PjPOrho2); + const vector balsara_j = vector_load(Balsara_j); + const vector cj = vector_load(Cj); + + const vector fac_mu = + vector_set1(1.f); /* Will change with cosmological integration */ /* Load stuff. */ balsara.v = vec_add(balsara_i.v, balsara_j.v); @@ -634,7 +627,7 @@ runner_iact_nonsym_1_vec_force( hjd_inv = pow_dimension_plus_one_vec(hj_inv); xj.v = vec_mul(r.v, hj_inv.v); - /* Calculate the kernel for two particles. */ + /* Calculate the kernel. */ kernel_eval_dWdx_force_vec(&xj, &wj_dx); wj_dr.v = vec_mul(hjd_inv.v, wj_dx.v); @@ -720,23 +713,19 @@ runner_iact_nonsym_2_vec_force( #ifdef WITH_VECTORIZATION - vector r, r2, ri; - vector dx, dy, dz, dvx, dvy, dvz; - vector vjx, vjy, vjz; - vector pjrho, grad_hj, pjPOrho2, balsara_j, cj, mj, hj_inv; + vector r, ri; + vector dvx, dvy, dvz; vector ui, uj; vector hid_inv, hjd_inv; vector wi_dx, wj_dx, wi_dr, wj_dr, dvdr; vector piax, piay, piaz; vector pih_dt; vector v_sig; - vector omega_ij, mu_ij, fac_mu, balsara; + vector omega_ij, mu_ij, balsara; vector rho_ij, visc, visc_term, sph_term, acc, entropy_dt; - vector r_2, r2_2, ri_2; - vector dx_2, dy_2, dz_2, dvx_2, dvy_2, dvz_2; - vector vjx_2, vjy_2, vjz_2; - vector pjrho_2, grad_hj_2, pjPOrho2_2, balsara_j_2, cj_2, mj_2, hj_inv_2; + vector r_2, ri_2; + vector dvx_2, dvy_2, dvz_2; vector ui_2, uj_2; vector hjd_inv_2; vector wi_dx_2, wj_dx_2, wi_dr_2, wj_dr_2, dvdr_2; @@ -747,44 +736,45 @@ runner_iact_nonsym_2_vec_force( vector rho_ij_2, visc_2, visc_term_2, sph_term_2, acc_2, entropy_dt_2; /* Fill vectors. */ - mj.v = vec_load(Mj); - mj_2.v = vec_load(&Mj[VEC_SIZE]); - vjx.v = vec_load(Vjx); - vjx_2.v = vec_load(&Vjx[VEC_SIZE]); - vjy.v = vec_load(Vjy); - vjy_2.v = vec_load(&Vjy[VEC_SIZE]); - vjz.v = vec_load(Vjz); - vjz_2.v = vec_load(&Vjz[VEC_SIZE]); - dx.v = vec_load(Dx); - dx_2.v = vec_load(&Dx[VEC_SIZE]); - dy.v = vec_load(Dy); - dy_2.v = vec_load(&Dy[VEC_SIZE]); - dz.v = vec_load(Dz); - dz_2.v = vec_load(&Dz[VEC_SIZE]); + const vector mj = vector_load(Mj); + const vector mj_2 = vector_load(&Mj[VEC_SIZE]); + const vector vjx = vector_load(Vjx); + const vector vjx_2 = vector_load(&Vjx[VEC_SIZE]); + const vector vjy = vector_load(Vjy); + const vector vjy_2 = vector_load(&Vjy[VEC_SIZE]); + const vector vjz = vector_load(Vjz); + const vector vjz_2 = vector_load(&Vjz[VEC_SIZE]); + const vector dx = vector_load(Dx); + const vector dx_2 = vector_load(&Dx[VEC_SIZE]); + const vector dy = vector_load(Dy); + const vector dy_2 = vector_load(&Dy[VEC_SIZE]); + const vector dz = vector_load(Dz); + const vector dz_2 = vector_load(&Dz[VEC_SIZE]); /* Get the radius and inverse radius. */ - r2.v = vec_load(R2); - r2_2.v = vec_load(&R2[VEC_SIZE]); + const vector r2 = vector_load(R2); + const vector r2_2 = vector_load(&R2[VEC_SIZE]); ri = vec_reciprocal_sqrt(r2); ri_2 = vec_reciprocal_sqrt(r2_2); r.v = vec_mul(r2.v, ri.v); r_2.v = vec_mul(r2_2.v, ri_2.v); /* Get remaining properties. */ - pjrho.v = vec_load(Pjrho); - pjrho_2.v = vec_load(&Pjrho[VEC_SIZE]); - grad_hj.v = vec_load(Grad_hj); - grad_hj_2.v = vec_load(&Grad_hj[VEC_SIZE]); - pjPOrho2.v = vec_load(PjPOrho2); - pjPOrho2_2.v = vec_load(&PjPOrho2[VEC_SIZE]); - balsara_j.v = vec_load(Balsara_j); - balsara_j_2.v = vec_load(&Balsara_j[VEC_SIZE]); - cj.v = vec_load(Cj); - cj_2.v = vec_load(&Cj[VEC_SIZE]); - hj_inv.v = vec_load(Hj_inv); - hj_inv_2.v = vec_load(&Hj_inv[VEC_SIZE]); - - fac_mu.v = vec_set1(1.f); /* Will change with cosmological integration */ + const vector pjrho = vector_load(Pjrho); + const vector pjrho_2 = vector_load(&Pjrho[VEC_SIZE]); + const vector grad_hj = vector_load(Grad_hj); + const vector grad_hj_2 = vector_load(&Grad_hj[VEC_SIZE]); + const vector pjPOrho2 = vector_load(PjPOrho2); + const vector pjPOrho2_2 = vector_load(&PjPOrho2[VEC_SIZE]); + const vector balsara_j = vector_load(Balsara_j); + const vector balsara_j_2 = vector_load(&Balsara_j[VEC_SIZE]); + const vector cj = vector_load(Cj); + const vector cj_2 = vector_load(&Cj[VEC_SIZE]); + const vector hj_inv = vector_load(Hj_inv); + const vector hj_inv_2 = vector_load(&Hj_inv[VEC_SIZE]); + + const vector fac_mu = + vector_set1(1.f); /* Will change with cosmological integration */ /* Find the balsara switch. */ balsara.v = vec_add(balsara_i.v, balsara_j.v); diff --git a/src/runner.c b/src/runner.c index b7ebf652738948141f8f3983b56331bad20e5a9d..73d7b5517fa073bef8e6fc3764733c7ec3e1f732 100644 --- a/src/runner.c +++ b/src/runner.c @@ -1885,7 +1885,7 @@ void *runner_main(void *data) { runner_dopair1_branch_gradient(r, ci, cj); #endif else if (t->subtype == task_subtype_force) - runner_dopair2_force(r, ci, cj); + runner_dopair2_branch_force(r, ci, cj); else if (t->subtype == task_subtype_grav) runner_dopair_grav(r, ci, cj, 1); else diff --git a/src/runner_doiact.h b/src/runner_doiact.h index 3ea935b950a3989fbb79499a81a54f1bf8aca432..0d642b23c46aee7d31b9b3469afcca1a1041dbc6 100644 --- a/src/runner_doiact.h +++ b/src/runner_doiact.h @@ -32,6 +32,9 @@ #define _DOPAIR1(f) PASTE(runner_dopair1, f) #define DOPAIR1 _DOPAIR1(FUNCTION) +#define _DOPAIR2_BRANCH(f) PASTE(runner_dopair2_branch, f) +#define DOPAIR2_BRANCH _DOPAIR2_BRANCH(FUNCTION) + #define _DOPAIR2(f) PASTE(runner_dopair2, f) #define DOPAIR2 _DOPAIR2(FUNCTION) @@ -778,37 +781,6 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid, const struct entry *restrict sort_j = cj->sort[sid]; #ifdef SWIFT_DEBUG_CHECKS - /* Check that the dx_max_sort values in the cell are indeed an upper - bound on particle movement. */ - for (int pid = 0; pid < ci->count; pid++) { - const struct part *p = &ci->parts[sort_i[pid].i]; - const float d = p->x[0] * runner_shift[sid][0] + - p->x[1] * runner_shift[sid][1] + - p->x[2] * runner_shift[sid][2]; - if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort > - 1.0e-4 * max(fabsf(d), ci->dx_max_sort_old)) - error( - "particle shift diff exceeds dx_max_sort in cell ci. ci->nodeID=%d " - "cj->nodeID=%d d=%e sort_i[pid].d=%e ci->dx_max_sort=%e " - "ci->dx_max_sort_old=%e", - ci->nodeID, cj->nodeID, d, sort_i[pid].d, ci->dx_max_sort, - ci->dx_max_sort_old); - } - for (int pjd = 0; pjd < cj->count; pjd++) { - const struct part *p = &cj->parts[sort_j[pjd].i]; - const float d = p->x[0] * runner_shift[sid][0] + - p->x[1] * runner_shift[sid][1] + - p->x[2] * runner_shift[sid][2]; - if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort > - 1.0e-4 * max(fabsf(d), cj->dx_max_sort_old)) - error( - "particle shift diff exceeds dx_max_sort in cell cj. cj->nodeID=%d " - "ci->nodeID=%d d=%e sort_j[pjd].d=%e cj->dx_max_sort=%e " - "cj->dx_max_sort_old=%e", - cj->nodeID, ci->nodeID, d, sort_j[pjd].d, cj->dx_max_sort, - cj->dx_max_sort_old); - } - /* Some constants used to checks that the parts are in the right frame */ const float shift_threshold_x = 2. * ci->width[0] + 2. * max(ci->dx_max_part, cj->dx_max_part); @@ -816,7 +788,6 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid, 2. * ci->width[1] + 2. * max(ci->dx_max_part, cj->dx_max_part); const float shift_threshold_z = 2. * ci->width[2] + 2. * max(ci->dx_max_part, cj->dx_max_part); - #endif /* SWIFT_DEBUG_CHECKS */ /* Get some other useful values. */ @@ -1025,6 +996,43 @@ void DOPAIR1_BRANCH(struct runner *r, struct cell *ci, struct cell *cj) { cj->dx_max_sort_old > space_maxreldx * cj->dmin) error("Interacting unsorted cells."); +#ifdef SWIFT_DEBUG_CHECKS + /* Pick-out the sorted lists. */ + const struct entry *restrict sort_i = ci->sort[sid]; + const struct entry *restrict sort_j = cj->sort[sid]; + + /* Check that the dx_max_sort values in the cell are indeed an upper + bound on particle movement. */ + for (int pid = 0; pid < ci->count; pid++) { + const struct part *p = &ci->parts[sort_i[pid].i]; + const float d = p->x[0] * runner_shift[sid][0] + + p->x[1] * runner_shift[sid][1] + + p->x[2] * runner_shift[sid][2]; + if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort > + 1.0e-4 * max(fabsf(d), ci->dx_max_sort_old)) + error( + "particle shift diff exceeds dx_max_sort in cell ci. ci->nodeID=%d " + "cj->nodeID=%d d=%e sort_i[pid].d=%e ci->dx_max_sort=%e " + "ci->dx_max_sort_old=%e", + ci->nodeID, cj->nodeID, d, sort_i[pid].d, ci->dx_max_sort, + ci->dx_max_sort_old); + } + for (int pjd = 0; pjd < cj->count; pjd++) { + const struct part *p = &cj->parts[sort_j[pjd].i]; + const float d = p->x[0] * runner_shift[sid][0] + + p->x[1] * runner_shift[sid][1] + + p->x[2] * runner_shift[sid][2]; + if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort > + 1.0e-4 * max(fabsf(d), cj->dx_max_sort_old)) + error( + "particle shift diff exceeds dx_max_sort in cell cj. cj->nodeID=%d " + "ci->nodeID=%d d=%e sort_j[pjd].d=%e cj->dx_max_sort=%e " + "cj->dx_max_sort_old=%e", + cj->nodeID, ci->nodeID, d, sort_j[pjd].d, cj->dx_max_sort, + cj->dx_max_sort_old); + } +#endif /* SWIFT_DEBUG_CHECKS */ + #if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) && \ (DOPAIR1_BRANCH == runner_dopair1_density_branch) if (!sort_is_corner(sid)) @@ -1042,8 +1050,11 @@ void DOPAIR1_BRANCH(struct runner *r, struct cell *ci, struct cell *cj) { * @param r The #runner. * @param ci The first #cell. * @param cj The second #cell. + * @param sid The direction of the pair + * @param shift The shift vector to apply to the particles in ci. */ -void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { +void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj, const int sid, + const double *shift) { struct engine *restrict e = r->e; @@ -1054,24 +1065,6 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { TIMER_TIC; - /* Anything to do here? */ - if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; - - if (!cell_are_part_drifted(ci, e) || !cell_are_part_drifted(cj, e)) - error("Interacting undrifted cells."); - - /* Get the shift ID. */ - double shift[3] = {0.0, 0.0, 0.0}; - const int sid = space_getsid(e->s, &ci, &cj, shift); - - /* Have the cells been sorted? */ - if (!(ci->sorted & (1 << sid)) || - ci->dx_max_sort_old > space_maxreldx * ci->dmin) - error("Interacting unsorted cells."); - if (!(cj->sorted & (1 << sid)) || - cj->dx_max_sort_old > space_maxreldx * cj->dmin) - error("Interacting unsorted cells."); - /* Get the cutoff shift. */ double rshift = 0.0; for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k]; @@ -1081,37 +1074,6 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { struct entry *restrict sort_j = cj->sort[sid]; #ifdef SWIFT_DEBUG_CHECKS - /* Check that the dx_max_sort values in the cell are indeed an upper - bound on particle movement. */ - for (int pid = 0; pid < ci->count; pid++) { - const struct part *p = &ci->parts[sort_i[pid].i]; - const float d = p->x[0] * runner_shift[sid][0] + - p->x[1] * runner_shift[sid][1] + - p->x[2] * runner_shift[sid][2]; - if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort > - 1.0e-4 * max(fabsf(d), ci->dx_max_sort_old)) - error( - "particle shift diff exceeds dx_max_sort in cell ci. ci->nodeID=%d " - "cj->nodeID=%d d=%e sort_i[pid].d=%e ci->dx_max_sort=%e " - "ci->dx_max_sort_old=%e", - ci->nodeID, cj->nodeID, d, sort_i[pid].d, ci->dx_max_sort, - ci->dx_max_sort_old); - } - for (int pjd = 0; pjd < cj->count; pjd++) { - const struct part *p = &cj->parts[sort_j[pjd].i]; - const float d = p->x[0] * runner_shift[sid][0] + - p->x[1] * runner_shift[sid][1] + - p->x[2] * runner_shift[sid][2]; - if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort > - 1.0e-4 * max(fabsf(d), cj->dx_max_sort_old)) - error( - "particle shift diff exceeds dx_max_sort in cell cj. cj->nodeID=%d " - "ci->nodeID=%d d=%e sort_j[pjd].d=%e cj->dx_max_sort=%e " - "cj->dx_max_sort_old=%e", - cj->nodeID, ci->nodeID, d, sort_j[pjd].d, cj->dx_max_sort, - cj->dx_max_sort_old); - } - /* Some constants used to checks that the parts are in the right frame */ const float shift_threshold_x = 2. * ci->width[0] + 2. * max(ci->dx_max_part, cj->dx_max_part); @@ -1119,7 +1081,6 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { 2. * ci->width[1] + 2. * max(ci->dx_max_part, cj->dx_max_part); const float shift_threshold_z = 2. * ci->width[2] + 2. * max(ci->dx_max_part, cj->dx_max_part); - #endif /* SWIFT_DEBUG_CHECKS */ /* Get some other useful values. */ @@ -1490,6 +1451,86 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) { TIMER_TOC(TIMER_DOPAIR); } +/** + * @brief Determine which version of DOPAIR2 needs to be called depending on the + * orientation of the cells or whether DOPAIR2 needs to be called at all. + * + * @param r #runner + * @param ci #cell ci + * @param cj #cell cj + * + */ +void DOPAIR2_BRANCH(struct runner *r, struct cell *ci, struct cell *cj) { + + const struct engine *restrict e = r->e; + + /* Anything to do here? */ + if (!cell_is_active(ci, e) && !cell_is_active(cj, e)) return; + + /* Check that cells are drifted. */ + if (!cell_are_part_drifted(ci, e) || !cell_are_part_drifted(cj, e)) + error("Interacting undrifted cells."); + + /* Get the sort ID. */ + double shift[3] = {0.0, 0.0, 0.0}; + const int sid = space_getsid(e->s, &ci, &cj, shift); + + /* Have the cells been sorted? */ + if (!(ci->sorted & (1 << sid)) || + ci->dx_max_sort_old > space_maxreldx * ci->dmin) + error("Interacting unsorted cells."); + if (!(cj->sorted & (1 << sid)) || + cj->dx_max_sort_old > space_maxreldx * cj->dmin) + error("Interacting unsorted cells."); + +#ifdef SWIFT_DEBUG_CHECKS + /* Pick-out the sorted lists. */ + const struct entry *restrict sort_i = ci->sort[sid]; + const struct entry *restrict sort_j = cj->sort[sid]; + + /* Check that the dx_max_sort values in the cell are indeed an upper + bound on particle movement. */ + for (int pid = 0; pid < ci->count; pid++) { + const struct part *p = &ci->parts[sort_i[pid].i]; + const float d = p->x[0] * runner_shift[sid][0] + + p->x[1] * runner_shift[sid][1] + + p->x[2] * runner_shift[sid][2]; + if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort > + 1.0e-4 * max(fabsf(d), ci->dx_max_sort_old)) + error( + "particle shift diff exceeds dx_max_sort in cell ci. ci->nodeID=%d " + "cj->nodeID=%d d=%e sort_i[pid].d=%e ci->dx_max_sort=%e " + "ci->dx_max_sort_old=%e", + ci->nodeID, cj->nodeID, d, sort_i[pid].d, ci->dx_max_sort, + ci->dx_max_sort_old); + } + for (int pjd = 0; pjd < cj->count; pjd++) { + const struct part *p = &cj->parts[sort_j[pjd].i]; + const float d = p->x[0] * runner_shift[sid][0] + + p->x[1] * runner_shift[sid][1] + + p->x[2] * runner_shift[sid][2]; + if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort > + 1.0e-4 * max(fabsf(d), cj->dx_max_sort_old)) + error( + "particle shift diff exceeds dx_max_sort in cell cj. cj->nodeID=%d " + "ci->nodeID=%d d=%e sort_j[pjd].d=%e cj->dx_max_sort=%e " + "cj->dx_max_sort_old=%e", + cj->nodeID, ci->nodeID, d, sort_j[pjd].d, cj->dx_max_sort, + cj->dx_max_sort_old); + } +#endif /* SWIFT_DEBUG_CHECKS */ + +#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) && \ + (DOPAIR2_BRANCH == runner_dopair2_force_branch) + if (!sort_is_corner(sid)) + runner_dopair2_force_vec(r, ci, cj, sid, shift); + else + DOPAIR2(r, ci, cj, sid, shift); +#else + DOPAIR2(r, ci, cj, sid, shift); +#endif +} + /** * @brief Compute the cell self-interaction (non-symmetric). * @@ -2295,7 +2336,7 @@ void DOSUB_PAIR2(struct runner *r, struct cell *ci, struct cell *cj, int sid, error("Interacting unsorted cells."); /* Compute the interactions. */ - DOPAIR2(r, ci, cj); + DOPAIR2_BRANCH(r, ci, cj); } if (gettimer) TIMER_TOC(TIMER_DOSUB_PAIR); diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index 5e982830c609bcf2ce533c0423449a3e74e96bf4..7e34939ac4e6741dc9fd970f774bba01ca812593 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -35,21 +35,23 @@ static const vector kernel_gamma2_vec = FILL_VEC(kernel_gamma2); * @param int_cache (return) secondary #cache of interactions between two * particles. * @param icount Interaction count. - * @param rhoSum (return) #vector holding the cumulative sum of the density + * @param v_rhoSum (return) #vector holding the cumulative sum of the density * update on pi. - * @param rho_dhSum (return) #vector holding the cumulative sum of the density + * @param v_rho_dhSum (return) #vector holding the cumulative sum of the density * gradient update on pi. - * @param wcountSum (return) #vector holding the cumulative sum of the wcount + * @param v_wcountSum (return) #vector holding the cumulative sum of the wcount * update on pi. - * @param wcount_dhSum (return) #vector holding the cumulative sum of the wcount + * @param v_wcount_dhSum (return) #vector holding the cumulative sum of the + * wcount * gradient update on pi. - * @param div_vSum (return) #vector holding the cumulative sum of the divergence + * @param v_div_vSum (return) #vector holding the cumulative sum of the + * divergence * update on pi. - * @param curlvxSum (return) #vector holding the cumulative sum of the curl of + * @param v_curlvxSum (return) #vector holding the cumulative sum of the curl of * vx update on pi. - * @param curlvySum (return) #vector holding the cumulative sum of the curl of + * @param v_curlvySum (return) #vector holding the cumulative sum of the curl of * vy update on pi. - * @param curlvzSum (return) #vector holding the cumulative sum of the curl of + * @param v_curlvzSum (return) #vector holding the cumulative sum of the curl of * vz update on pi. * @param v_hi_inv #vector of 1/h for pi. * @param v_vix #vector of x velocity of pi. @@ -59,11 +61,11 @@ static const vector kernel_gamma2_vec = FILL_VEC(kernel_gamma2); * interactions have been performed, should be a multiple of the vector length. */ __attribute__((always_inline)) INLINE static void calcRemInteractions( - struct c2_cache *const int_cache, const int icount, vector *rhoSum, - vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, - vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, - vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz, - int *icount_align) { + struct c2_cache *const int_cache, const int icount, vector *v_rhoSum, + vector *v_rho_dhSum, vector *v_wcountSum, vector *v_wcount_dhSum, + vector *v_div_vSum, vector *v_curlvxSum, vector *v_curlvySum, + vector *v_curlvzSum, vector v_hi_inv, vector v_vix, vector v_viy, + vector v_viz, int *icount_align) { mask_t int_mask, int_mask2; @@ -107,9 +109,9 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( &int_cache->dyq[*icount_align], &int_cache->dzq[*icount_align], v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[*icount_align], &int_cache->vyq[*icount_align], &int_cache->vzq[*icount_align], - &int_cache->mq[*icount_align], rhoSum, rho_dhSum, wcountSum, - wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, int_mask, - int_mask2, 1); + &int_cache->mq[*icount_align], v_rhoSum, v_rho_dhSum, v_wcountSum, + v_wcount_dhSum, v_div_vSum, v_curlvxSum, v_curlvySum, v_curlvzSum, + int_mask, int_mask2, 1); } } @@ -127,20 +129,25 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( * @param int_cache (return) secondary #cache of interactions between two * particles. * @param icount Interaction count. - * @param rhoSum #vector holding the cumulative sum of the density update on pi. - * @param rho_dhSum #vector holding the cumulative sum of the density gradient + * @param v_rhoSum #vector holding the cumulative sum of the density update on + * pi. + * @param v_rho_dhSum #vector holding the cumulative sum of the density gradient * update on pi. - * @param wcountSum #vector holding the cumulative sum of the wcount update on + * @param v_wcountSum #vector holding the cumulative sum of the wcount update on * pi. - * @param wcount_dhSum #vector holding the cumulative sum of the wcount gradient + * @param v_wcount_dhSum #vector holding the cumulative sum of the wcount + * gradient * update on pi. - * @param div_vSum #vector holding the cumulative sum of the divergence update + * @param v_div_vSum #vector holding the cumulative sum of the divergence update * on pi. - * @param curlvxSum #vector holding the cumulative sum of the curl of vx update + * @param v_curlvxSum #vector holding the cumulative sum of the curl of vx + * update * on pi. - * @param curlvySum #vector holding the cumulative sum of the curl of vy update + * @param v_curlvySum #vector holding the cumulative sum of the curl of vy + * update * on pi. - * @param curlvzSum #vector holding the cumulative sum of the curl of vz update + * @param v_curlvzSum #vector holding the cumulative sum of the curl of vz + * update * on pi. * @param v_hi_inv #vector of 1/h for pi. * @param v_vix #vector of x velocity of pi. @@ -150,10 +157,11 @@ __attribute__((always_inline)) INLINE static void calcRemInteractions( __attribute__((always_inline)) INLINE static void storeInteractions( const int mask, const int pjd, vector *v_r2, vector *v_dx, vector *v_dy, vector *v_dz, const struct cache *const cell_cache, - struct c2_cache *const int_cache, int *icount, vector *rhoSum, - vector *rho_dhSum, vector *wcountSum, vector *wcount_dhSum, - vector *div_vSum, vector *curlvxSum, vector *curlvySum, vector *curlvzSum, - vector v_hi_inv, vector v_vix, vector v_viy, vector v_viz) { + struct c2_cache *const int_cache, int *icount, vector *v_rhoSum, + vector *v_rho_dhSum, vector *v_wcountSum, vector *v_wcount_dhSum, + vector *v_div_vSum, vector *v_curlvxSum, vector *v_curlvySum, + vector *v_curlvzSum, vector v_hi_inv, vector v_vix, vector v_viy, + vector v_viz) { /* Left-pack values needed into the secondary cache using the interaction mask. */ @@ -202,9 +210,10 @@ __attribute__((always_inline)) INLINE static void storeInteractions( int icount_align = *icount; /* Peform remainder interactions. */ - calcRemInteractions(int_cache, *icount, rhoSum, rho_dhSum, wcountSum, - wcount_dhSum, div_vSum, curlvxSum, curlvySum, curlvzSum, - v_hi_inv, v_vix, v_viy, v_viz, &icount_align); + calcRemInteractions(int_cache, *icount, v_rhoSum, v_rho_dhSum, v_wcountSum, + v_wcount_dhSum, v_div_vSum, v_curlvxSum, v_curlvySum, + v_curlvzSum, v_hi_inv, v_vix, v_viy, v_viz, + &icount_align); mask_t int_mask, int_mask2; vec_init_mask_true(int_mask); @@ -215,9 +224,9 @@ __attribute__((always_inline)) INLINE static void storeInteractions( runner_iact_nonsym_2_vec_density( &int_cache->r2q[j], &int_cache->dxq[j], &int_cache->dyq[j], &int_cache->dzq[j], v_hi_inv, v_vix, v_viy, v_viz, &int_cache->vxq[j], - &int_cache->vyq[j], &int_cache->vzq[j], &int_cache->mq[j], rhoSum, - rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, curlvySum, - curlvzSum, int_mask, int_mask2, 0); + &int_cache->vyq[j], &int_cache->vzq[j], &int_cache->mq[j], v_rhoSum, + v_rho_dhSum, v_wcountSum, v_wcount_dhSum, v_div_vSum, v_curlvxSum, + v_curlvySum, v_curlvzSum, int_mask, int_mask2, 0); } /* Reset interaction count. */ @@ -288,8 +297,9 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache( const float first_di = sort_i[first_pi].d + pi->h * kernel_gamma + dx_max - rshift; - /* Loop through particles in cell j until they are not in range of pi. */ - while (temp < cj->count && first_di > sort_j[temp].d) temp++; + /* Loop through particles in cell j until they are not in range of pi. + * Make sure that temp stays between 0 and cj->count - 1.*/ + while (temp < cj->count - 1 && first_di > sort_j[temp].d) temp++; max_index_i[first_pi] = temp; @@ -300,7 +310,8 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache( const float di = sort_i[i].d + pi->h * kernel_gamma + dx_max - rshift; - while (temp < cj->count && di > sort_j[temp].d) temp++; + /* Make sure that temp stays between 0 and cj->count - 1.*/ + while (temp < cj->count - 1 && di > sort_j[temp].d) temp++; max_index_i[i] = temp; } @@ -357,6 +368,153 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache( *init_pi = first_pi; *init_pj = last_pj; } + +/** + * @brief Populates the arrays max_index_i and max_index_j with the maximum + * indices of + * particles into their neighbouring cells. Also finds the first pi that + * interacts with any particle in cj and the last pj that interacts with any + * particle in ci. + * + * @param ci #cell pointer to ci + * @param cj #cell pointer to cj + * @param sort_i #entry array for particle distance in ci + * @param sort_j #entry array for particle distance in cj + * @param dx_max maximum particle movement allowed in cell + * @param rshift cutoff shift + * @param hi_max_raw Maximal smoothing length in cell ci + * @param hj_max_raw Maximal smoothing length in cell cj + * @param hi_max Maximal smoothing length in cell ci scaled by kernel_gamma + * @param hj_max Maximal smoothing length in cell cj scaled by kernel_gamma + * @param di_max Maximal position on the axis that can interact in cell ci + * @param dj_min Minimal position on the axis that can interact in cell ci + * @param max_index_i array to hold the maximum distances of pi particles into + * #cell cj + * @param max_index_j array to hold the maximum distances of pj particles into + * #cell cj + * @param init_pi first pi to interact with a pj particle + * @param init_pj last pj to interact with a pi particle + * @param max_active_bin The largest time-bin active during this step. + */ +__attribute__((always_inline)) INLINE static void +populate_max_index_no_cache_force(const struct cell *ci, const struct cell *cj, + const struct entry *restrict sort_i, + const struct entry *restrict sort_j, + const float dx_max, const float rshift, + const double hi_max_raw, + const double hj_max_raw, const double hi_max, + const double hj_max, const double di_max, + const double dj_min, int *max_index_i, + int *max_index_j, int *init_pi, int *init_pj, + const timebin_t max_active_bin) { + + const struct part *restrict parts_i = ci->parts; + const struct part *restrict parts_j = cj->parts; + + int first_pi = 0, last_pj = cj->count - 1; + int temp; + + /* Find the leftmost active particle in cell i that interacts with any + * particle in cell j. */ + first_pi = ci->count; + int active_id = first_pi - 1; + while (first_pi > 0 && + sort_i[first_pi - 1].d + dx_max + max(hi_max, hj_max) > dj_min) { + first_pi--; + /* Store the index of the particle if it is active. */ + if (part_is_active_no_debug(&parts_i[sort_i[first_pi].i], max_active_bin)) + active_id = first_pi; + } + + /* Set the first active pi in range of any particle in cell j. */ + first_pi = active_id; + + /* Find the maximum index into cell j for each particle in range in cell i. */ + if (first_pi < ci->count) { + + /* Start from the first particle in cell j. */ + temp = 0; + + const struct part *pi = &parts_i[sort_i[first_pi].i]; + const float first_di = sort_i[first_pi].d + + max(pi->h, hj_max_raw) * kernel_gamma + dx_max - + rshift; + + /* Loop through particles in cell j until they are not in range of pi. + * Make sure that temp stays between 0 and cj->count - 1.*/ + while (temp < cj->count - 1 && first_di > sort_j[temp].d) temp++; + + max_index_i[first_pi] = temp; + + /* Populate max_index_i for remaining particles that are within range. */ + for (int i = first_pi + 1; i < ci->count; i++) { + temp = max_index_i[i - 1]; + pi = &parts_i[sort_i[i].i]; + + const float di = + sort_i[i].d + max(pi->h, hj_max_raw) * kernel_gamma + dx_max - rshift; + + /* Make sure that temp stays between 0 and cj->count - 1.*/ + while (temp < cj->count - 1 && di > sort_j[temp].d) temp++; + + max_index_i[i] = temp; + } + } else { + /* Make sure that max index is set to first particle in cj.*/ + max_index_i[ci->count - 1] = 0; + } + + /* Find the rightmost active particle in cell j that interacts with any + * particle in cell i. */ + last_pj = -1; + active_id = last_pj; + while (last_pj < cj->count && + sort_j[last_pj + 1].d - max(hj_max, hi_max) - dx_max < di_max) { + last_pj++; + /* Store the index of the particle if it is active. */ + if (part_is_active_no_debug(&parts_j[sort_j[last_pj].i], max_active_bin)) + active_id = last_pj; + } + + /* Set the last active pj in range of any particle in cell i. */ + last_pj = active_id; + + /* Find the maximum index into cell i for each particle in range in cell j. */ + if (last_pj >= 0) { + + /* Start from the last particle in cell i. */ + temp = ci->count - 1; + + const struct part *pj = &parts_j[sort_j[last_pj].i]; + const float last_dj = sort_j[last_pj].d - dx_max - + max(pj->h, hi_max_raw) * kernel_gamma + rshift; + + /* Loop through particles in cell i until they are not in range of pj. */ + while (temp > 0 && last_dj < sort_i[temp].d) temp--; + + max_index_j[last_pj] = temp; + + /* Populate max_index_j for remaining particles that are within range. */ + for (int i = last_pj - 1; i >= 0; i--) { + temp = max_index_j[i + 1]; + pj = &parts_j[sort_j[i].i]; + + const float dj = sort_j[i].d - dx_max - + (max(pj->h, hi_max_raw) * kernel_gamma) + rshift; + + while (temp > 0 && dj < sort_i[temp].d) temp--; + + max_index_j[i] = temp; + } + } else { + /* Make sure that max index is set to last particle in ci.*/ + max_index_j[0] = ci->count - 1; + } + + *init_pi = first_pi; + *init_pj = last_pj; +} + #endif /* WITH_VECTORIZATION */ /** @@ -370,20 +528,17 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( struct runner *r, struct cell *restrict c) { #ifdef WITH_VECTORIZATION - const struct engine *e = r->e; - struct part *restrict pi; - int count_align; - int num_vec_proc = NUM_VEC_PROC; + const int num_vec_proc = NUM_VEC_PROC; + /* Get some local variables */ + const struct engine *e = r->e; + const timebin_t max_active_bin = e->max_active_bin; struct part *restrict parts = c->parts; const int count = c->count; - const timebin_t max_active_bin = e->max_active_bin; - - vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2; - - TIMER_TIC + TIMER_TIC; + /* Anything to do here? */ if (!cell_is_active(c, e)) return; if (!cell_are_part_drifted(c, e)) error("Interacting undrifted cell."); @@ -407,88 +562,78 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( for (int pid = 0; pid < count; pid++) { /* Get a pointer to the ith particle. */ - pi = &parts[pid]; + struct part *restrict pi = &parts[pid]; /* Is the ith particle active? */ if (!part_is_active_no_debug(pi, max_active_bin)) continue; - vector pix, piy, piz; + vector v_r2; const float hi = cell_cache->h[pid]; /* Fill particle pi vectors. */ - pix.v = vec_set1(cell_cache->x[pid]); - piy.v = vec_set1(cell_cache->y[pid]); - piz.v = vec_set1(cell_cache->z[pid]); - v_hi.v = vec_set1(hi); - v_vix.v = vec_set1(cell_cache->vx[pid]); - v_viy.v = vec_set1(cell_cache->vy[pid]); - v_viz.v = vec_set1(cell_cache->vz[pid]); + const vector v_pix = vector_set1(cell_cache->x[pid]); + const vector v_piy = vector_set1(cell_cache->y[pid]); + const vector v_piz = vector_set1(cell_cache->z[pid]); + const vector v_hi = vector_set1(hi); + const vector v_vix = vector_set1(cell_cache->vx[pid]); + const vector v_viy = vector_set1(cell_cache->vy[pid]); + const vector v_viz = vector_set1(cell_cache->vz[pid]); const float hig2 = hi * hi * kernel_gamma2; - v_hig2.v = vec_set1(hig2); - - /* Reset cumulative sums of update vectors. */ - vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, - curlvySum, curlvzSum; + const vector v_hig2 = vector_set1(hig2); /* Get the inverse of hi. */ - vector v_hi_inv; - - v_hi_inv = vec_reciprocal(v_hi); + vector v_hi_inv = vec_reciprocal(v_hi); - rhoSum.v = vec_setzero(); - rho_dhSum.v = vec_setzero(); - wcountSum.v = vec_setzero(); - wcount_dhSum.v = vec_setzero(); - div_vSum.v = vec_setzero(); - curlvxSum.v = vec_setzero(); - curlvySum.v = vec_setzero(); - curlvzSum.v = vec_setzero(); + /* Reset cumulative sums of update vectors. */ + vector v_rhoSum = vector_setzero(); + vector v_rho_dhSum = vector_setzero(); + vector v_wcountSum = vector_setzero(); + vector v_wcount_dhSum = vector_setzero(); + vector v_div_vSum = vector_setzero(); + vector v_curlvxSum = vector_setzero(); + vector v_curlvySum = vector_setzero(); + vector v_curlvzSum = vector_setzero(); /* Pad cache if there is a serial remainder. */ - count_align = count; - int rem = count % (num_vec_proc * VEC_SIZE); + int count_align = count; + const int rem = count % (num_vec_proc * VEC_SIZE); if (rem != 0) { - int pad = (num_vec_proc * VEC_SIZE) - rem; - - count_align += pad; + count_align += (num_vec_proc * VEC_SIZE) - rem; /* Set positions to the same as particle pi so when the r2 > 0 mask is * applied these extra contributions are masked out.*/ for (int i = count; i < count_align; i++) { - cell_cache->x[i] = pix.f[0]; - cell_cache->y[i] = piy.f[0]; - cell_cache->z[i] = piz.f[0]; + cell_cache->x[i] = v_pix.f[0]; + cell_cache->y[i] = v_piy.f[0]; + cell_cache->z[i] = v_piz.f[0]; } } - vector pjx, pjy, pjz; - vector pjx2, pjy2, pjz2; - /* Find all of particle pi's interacions and store needed values in the * secondary cache.*/ for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) { /* Load 2 sets of vectors from the particle cache. */ - pjx.v = vec_load(&cell_cache->x[pjd]); - pjy.v = vec_load(&cell_cache->y[pjd]); - pjz.v = vec_load(&cell_cache->z[pjd]); + const vector v_pjx = vector_load(&cell_cache->x[pjd]); + const vector v_pjy = vector_load(&cell_cache->y[pjd]); + const vector v_pjz = vector_load(&cell_cache->z[pjd]); - pjx2.v = vec_load(&cell_cache->x[pjd + VEC_SIZE]); - pjy2.v = vec_load(&cell_cache->y[pjd + VEC_SIZE]); - pjz2.v = vec_load(&cell_cache->z[pjd + VEC_SIZE]); + const vector v_pjx2 = vector_load(&cell_cache->x[pjd + VEC_SIZE]); + const vector v_pjy2 = vector_load(&cell_cache->y[pjd + VEC_SIZE]); + const vector v_pjz2 = vector_load(&cell_cache->z[pjd + VEC_SIZE]); /* Compute the pairwise distance. */ vector v_dx, v_dy, v_dz; vector v_dx_2, v_dy_2, v_dz_2, v_r2_2; - v_dx.v = vec_sub(pix.v, pjx.v); - v_dx_2.v = vec_sub(pix.v, pjx2.v); - v_dy.v = vec_sub(piy.v, pjy.v); - v_dy_2.v = vec_sub(piy.v, pjy2.v); - v_dz.v = vec_sub(piz.v, pjz.v); - v_dz_2.v = vec_sub(piz.v, pjz2.v); + v_dx.v = vec_sub(v_pix.v, v_pjx.v); + v_dx_2.v = vec_sub(v_pix.v, v_pjx2.v); + v_dy.v = vec_sub(v_piy.v, v_pjy.v); + v_dy_2.v = vec_sub(v_piy.v, v_pjy2.v); + v_dz.v = vec_sub(v_piz.v, v_pjz.v); + v_dz_2.v = vec_sub(v_piz.v, v_pjz2.v); v_r2.v = vec_mul(v_dx.v, v_dx.v); v_r2_2.v = vec_mul(v_dx_2.v, v_dx_2.v); @@ -500,7 +645,6 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( /* Form a mask from r2 < hig2 and r2 > 0.*/ mask_t v_doi_mask, v_doi_mask_self_check, v_doi_mask2, v_doi_mask2_self_check; - int doi_mask, doi_mask_self_check, doi_mask2, doi_mask2_self_check; /* Form r2 > 0 mask and r2 < hig2 mask. */ vec_create_mask(v_doi_mask_self_check, vec_cmp_gt(v_r2.v, vec_setzero())); @@ -511,39 +655,35 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( vec_cmp_gt(v_r2_2.v, vec_setzero())); vec_create_mask(v_doi_mask2, vec_cmp_lt(v_r2_2.v, v_hig2.v)); - /* Form integer masks. */ - doi_mask_self_check = vec_form_int_mask(v_doi_mask_self_check); - doi_mask = vec_form_int_mask(v_doi_mask); - - doi_mask2_self_check = vec_form_int_mask(v_doi_mask2_self_check); - doi_mask2 = vec_form_int_mask(v_doi_mask2); - - /* Combine the two masks. */ - doi_mask = doi_mask & doi_mask_self_check; - doi_mask2 = doi_mask2 & doi_mask2_self_check; + /* Combine two masks and form integer masks. */ + const int doi_mask = vec_is_mask_true(v_doi_mask) & + vec_is_mask_true(v_doi_mask_self_check); + const int doi_mask2 = vec_is_mask_true(v_doi_mask2) & + vec_is_mask_true(v_doi_mask2_self_check); /* If there are any interactions left pack interaction values into c2 * cache. */ if (doi_mask) { storeInteractions(doi_mask, pjd, &v_r2, &v_dx, &v_dy, &v_dz, cell_cache, - &int_cache, &icount, &rhoSum, &rho_dhSum, &wcountSum, - &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, - &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz); + &int_cache, &icount, &v_rhoSum, &v_rho_dhSum, + &v_wcountSum, &v_wcount_dhSum, &v_div_vSum, + &v_curlvxSum, &v_curlvySum, &v_curlvzSum, v_hi_inv, + v_vix, v_viy, v_viz); } if (doi_mask2) { storeInteractions(doi_mask2, pjd + VEC_SIZE, &v_r2_2, &v_dx_2, &v_dy_2, - &v_dz_2, cell_cache, &int_cache, &icount, &rhoSum, - &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, - &curlvxSum, &curlvySum, &curlvzSum, v_hi_inv, v_vix, - v_viy, v_viz); + &v_dz_2, cell_cache, &int_cache, &icount, &v_rhoSum, + &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum, + &v_div_vSum, &v_curlvxSum, &v_curlvySum, &v_curlvzSum, + v_hi_inv, v_vix, v_viy, v_viz); } } /* Perform padded vector remainder interactions if any are present. */ - calcRemInteractions(&int_cache, icount, &rhoSum, &rho_dhSum, &wcountSum, - &wcount_dhSum, &div_vSum, &curlvxSum, &curlvySum, - &curlvzSum, v_hi_inv, v_vix, v_viy, v_viz, - &icount_align); + calcRemInteractions(&int_cache, icount, &v_rhoSum, &v_rho_dhSum, + &v_wcountSum, &v_wcount_dhSum, &v_div_vSum, + &v_curlvxSum, &v_curlvySum, &v_curlvzSum, v_hi_inv, + v_vix, v_viy, v_viz, &icount_align); /* Initialise masks to true in case remainder interactions have been * performed. */ @@ -557,21 +697,21 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( &int_cache.r2q[pjd], &int_cache.dxq[pjd], &int_cache.dyq[pjd], &int_cache.dzq[pjd], v_hi_inv, v_vix, v_viy, v_viz, &int_cache.vxq[pjd], &int_cache.vyq[pjd], &int_cache.vzq[pjd], - &int_cache.mq[pjd], &rhoSum, &rho_dhSum, &wcountSum, &wcount_dhSum, - &div_vSum, &curlvxSum, &curlvySum, &curlvzSum, int_mask, int_mask2, - 0); + &int_cache.mq[pjd], &v_rhoSum, &v_rho_dhSum, &v_wcountSum, + &v_wcount_dhSum, &v_div_vSum, &v_curlvxSum, &v_curlvySum, + &v_curlvzSum, int_mask, int_mask2, 0); } /* Perform horizontal adds on vector sums and store result in particle pi. */ - VEC_HADD(rhoSum, pi->rho); - VEC_HADD(rho_dhSum, pi->density.rho_dh); - VEC_HADD(wcountSum, pi->density.wcount); - VEC_HADD(wcount_dhSum, pi->density.wcount_dh); - VEC_HADD(div_vSum, pi->density.div_v); - VEC_HADD(curlvxSum, pi->density.rot_v[0]); - VEC_HADD(curlvySum, pi->density.rot_v[1]); - VEC_HADD(curlvzSum, pi->density.rot_v[2]); + VEC_HADD(v_rhoSum, pi->rho); + VEC_HADD(v_rho_dhSum, pi->density.rho_dh); + VEC_HADD(v_wcountSum, pi->density.wcount); + VEC_HADD(v_wcount_dhSum, pi->density.wcount_dh); + VEC_HADD(v_div_vSum, pi->density.div_v); + VEC_HADD(v_curlvxSum, pi->density.rot_v[0]); + VEC_HADD(v_curlvySum, pi->density.rot_v[1]); + VEC_HADD(v_curlvzSum, pi->density.rot_v[2]); /* Reset interaction count. */ icount = 0; @@ -602,9 +742,6 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( struct part *restrict parts = c->parts; const int count = c->count; - vector v_hi, v_vix, v_viy, v_viz, v_hig2, v_r2; - vector v_rhoi, v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci; - TIMER_TIC; if (!cell_is_active(c, e)) return; @@ -640,43 +777,36 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( /* Is the ith particle active? */ if (!part_is_active_no_debug(pi, max_active_bin)) continue; - vector pix, piy, piz; - const float hi = cell_cache->h[pid]; /* Fill particle pi vectors. */ - pix.v = vec_set1(cell_cache->x[pid]); - piy.v = vec_set1(cell_cache->y[pid]); - piz.v = vec_set1(cell_cache->z[pid]); - v_hi.v = vec_set1(hi); - v_vix.v = vec_set1(cell_cache->vx[pid]); - v_viy.v = vec_set1(cell_cache->vy[pid]); - v_viz.v = vec_set1(cell_cache->vz[pid]); - - v_rhoi.v = vec_set1(cell_cache->rho[pid]); - v_grad_hi.v = vec_set1(cell_cache->grad_h[pid]); - v_pOrhoi2.v = vec_set1(cell_cache->pOrho2[pid]); - v_balsara_i.v = vec_set1(cell_cache->balsara[pid]); - v_ci.v = vec_set1(cell_cache->soundspeed[pid]); + const vector v_pix = vector_set1(cell_cache->x[pid]); + const vector v_piy = vector_set1(cell_cache->y[pid]); + const vector v_piz = vector_set1(cell_cache->z[pid]); + const vector v_hi = vector_set1(hi); + const vector v_vix = vector_set1(cell_cache->vx[pid]); + const vector v_viy = vector_set1(cell_cache->vy[pid]); + const vector v_viz = vector_set1(cell_cache->vz[pid]); + + const vector v_rhoi = vector_set1(cell_cache->rho[pid]); + const vector v_grad_hi = vector_set1(cell_cache->grad_h[pid]); + const vector v_pOrhoi2 = vector_set1(cell_cache->pOrho2[pid]); + const vector v_balsara_i = vector_set1(cell_cache->balsara[pid]); + const vector v_ci = vector_set1(cell_cache->soundspeed[pid]); const float hig2 = hi * hi * kernel_gamma2; - v_hig2.v = vec_set1(hig2); - - /* Reset cumulative sums of update vectors. */ - vector a_hydro_xSum, a_hydro_ySum, a_hydro_zSum, h_dtSum, v_sigSum, - entropy_dtSum; + const vector v_hig2 = vector_set1(hig2); /* Get the inverse of hi. */ - vector v_hi_inv; - - v_hi_inv = vec_reciprocal(v_hi); + vector v_hi_inv = vec_reciprocal(v_hi); - a_hydro_xSum.v = vec_setzero(); - a_hydro_ySum.v = vec_setzero(); - a_hydro_zSum.v = vec_setzero(); - h_dtSum.v = vec_setzero(); - v_sigSum.v = vec_set1(pi->force.v_sig); - entropy_dtSum.v = vec_setzero(); + /* Reset cumulative sums of update vectors. */ + vector v_a_hydro_xSum = vector_setzero(); + vector v_a_hydro_ySum = vector_setzero(); + vector v_a_hydro_zSum = vector_setzero(); + vector v_h_dtSum = vector_setzero(); + vector v_sigSum = vector_set1(pi->force.v_sig); + vector v_entropy_dtSum = vector_setzero(); /* Pad cache if there is a serial remainder. */ count_align = count; @@ -689,9 +819,9 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( /* Set positions to the same as particle pi so when the r2 > 0 mask is * applied these extra contributions are masked out.*/ for (int i = count; i < count_align; i++) { - cell_cache->x[i] = pix.f[0]; - cell_cache->y[i] = piy.f[0]; - cell_cache->z[i] = piz.f[0]; + cell_cache->x[i] = v_pix.f[0]; + cell_cache->y[i] = v_piy.f[0]; + cell_cache->z[i] = v_piz.f[0]; cell_cache->h[i] = 1.f; cell_cache->rho[i] = 1.f; cell_cache->grad_h[i] = 1.f; @@ -701,25 +831,23 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( } } - vector pjx, pjy, pjz, hj, hjg2; - /* Find all of particle pi's interacions and store needed values in the * secondary cache.*/ for (int pjd = 0; pjd < count_align; pjd += (num_vec_proc * VEC_SIZE)) { /* Load 1 set of vectors from the particle cache. */ - pjx.v = vec_load(&cell_cache->x[pjd]); - pjy.v = vec_load(&cell_cache->y[pjd]); - pjz.v = vec_load(&cell_cache->z[pjd]); - hj.v = vec_load(&cell_cache->h[pjd]); + vector hjg2; + const vector v_pjx = vector_load(&cell_cache->x[pjd]); + const vector v_pjy = vector_load(&cell_cache->y[pjd]); + const vector v_pjz = vector_load(&cell_cache->z[pjd]); + const vector hj = vector_load(&cell_cache->h[pjd]); hjg2.v = vec_mul(vec_mul(hj.v, hj.v), kernel_gamma2_vec.v); /* Compute the pairwise distance. */ - vector v_dx, v_dy, v_dz; - - v_dx.v = vec_sub(pix.v, pjx.v); - v_dy.v = vec_sub(piy.v, pjy.v); - v_dz.v = vec_sub(piz.v, pjz.v); + vector v_dx, v_dy, v_dz, v_r2; + v_dx.v = vec_sub(v_pix.v, v_pjx.v); + v_dy.v = vec_sub(v_piy.v, v_pjy.v); + v_dz.v = vec_sub(v_piz.v, v_pjz.v); v_r2.v = vec_mul(v_dx.v, v_dx.v); v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); @@ -727,7 +855,6 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( /* Form r2 > 0 mask, r2 < hig2 mask and r2 < hjg2 mask. */ mask_t v_doi_mask, v_doi_mask_self_check; - int doi_mask; /* Form r2 > 0 mask.*/ vec_create_mask(v_doi_mask_self_check, vec_cmp_gt(v_r2.v, vec_setzero())); @@ -737,18 +864,15 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( v_h2.v = vec_fmax(v_hig2.v, hjg2.v); vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_h2.v)); - /* Combine all 3 masks and form integer mask. */ + /* Combine all 3 masks. */ vec_combine_masks(v_doi_mask, v_doi_mask_self_check); - doi_mask = vec_form_int_mask(v_doi_mask); /* If there are any interactions perform them. */ - if (doi_mask) { - vector v_hj_inv; - v_hj_inv = vec_reciprocal(hj); + if (vec_is_mask_true(v_doi_mask)) { + vector v_hj_inv = vec_reciprocal(hj); /* To stop floating point exceptions for when particle separations are - * 0. - */ + * 0. */ v_r2.v = vec_add(v_r2.v, vec_set1(FLT_MIN)); runner_iact_nonsym_1_vec_force( @@ -757,19 +881,19 @@ __attribute__((always_inline)) INLINE void runner_doself2_force_vec( &cell_cache->vy[pjd], &cell_cache->vz[pjd], &cell_cache->rho[pjd], &cell_cache->grad_h[pjd], &cell_cache->pOrho2[pjd], &cell_cache->balsara[pjd], &cell_cache->soundspeed[pjd], - &cell_cache->m[pjd], v_hi_inv, v_hj_inv, &a_hydro_xSum, - &a_hydro_ySum, &a_hydro_zSum, &h_dtSum, &v_sigSum, &entropy_dtSum, - v_doi_mask); + &cell_cache->m[pjd], v_hi_inv, v_hj_inv, &v_a_hydro_xSum, + &v_a_hydro_ySum, &v_a_hydro_zSum, &v_h_dtSum, &v_sigSum, + &v_entropy_dtSum, v_doi_mask); } } /* Loop over all other particles. */ - VEC_HADD(a_hydro_xSum, pi->a_hydro[0]); - VEC_HADD(a_hydro_ySum, pi->a_hydro[1]); - VEC_HADD(a_hydro_zSum, pi->a_hydro[2]); - VEC_HADD(h_dtSum, pi->force.h_dt); + VEC_HADD(v_a_hydro_xSum, pi->a_hydro[0]); + VEC_HADD(v_a_hydro_ySum, pi->a_hydro[1]); + VEC_HADD(v_a_hydro_zSum, pi->a_hydro[2]); + VEC_HADD(v_h_dtSum, pi->force.h_dt); VEC_HMAX(v_sigSum, pi->force.v_sig); - VEC_HADD(entropy_dtSum, pi->entropy_dt); + VEC_HADD(v_entropy_dtSum, pi->entropy_dt); } /* loop over all particles. */ @@ -795,8 +919,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, const struct engine *restrict e = r->e; const timebin_t max_active_bin = e->max_active_bin; - vector v_hi, v_vix, v_viy, v_viz, v_hig2; - TIMER_TIC; /* Get the cutoff shift. */ @@ -807,39 +929,6 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, const struct entry *restrict sort_i = ci->sort[sid]; const struct entry *restrict sort_j = cj->sort[sid]; -#ifdef SWIFT_DEBUG_CHECKS - /* Check that the dx_max_sort values in the cell are indeed an upper - bound on particle movement. */ - for (int pid = 0; pid < ci->count; pid++) { - const struct part *p = &ci->parts[sort_i[pid].i]; - const float d = p->x[0] * runner_shift[sid][0] + - p->x[1] * runner_shift[sid][1] + - p->x[2] * runner_shift[sid][2]; - if (fabsf(d - sort_i[pid].d) - ci->dx_max_sort > - 1.0e-4 * max(fabsf(d), ci->dx_max_sort_old)) - error( - "particle shift diff exceeds dx_max_sort in cell ci. ci->nodeID=%d " - "cj->nodeID=%d d=%e sort_i[pid].d=%e ci->dx_max_sort=%e " - "ci->dx_max_sort_old=%e", - ci->nodeID, cj->nodeID, d, sort_i[pid].d, ci->dx_max_sort, - ci->dx_max_sort_old); - } - for (int pjd = 0; pjd < cj->count; pjd++) { - const struct part *p = &cj->parts[sort_j[pjd].i]; - const float d = p->x[0] * runner_shift[sid][0] + - p->x[1] * runner_shift[sid][1] + - p->x[2] * runner_shift[sid][2]; - if (fabsf(d - sort_j[pjd].d) - cj->dx_max_sort > - 1.0e-4 * max(fabsf(d), cj->dx_max_sort_old)) - error( - "particle shift diff exceeds dx_max_sort in cell cj. cj->nodeID=%d " - "ci->nodeID=%d d=%e sort_j[pjd].d=%e cj->dx_max_sort=%e " - "cj->dx_max_sort_old=%e", - cj->nodeID, ci->nodeID, d, sort_j[pjd].d, cj->dx_max_sort, - cj->dx_max_sort_old); - } -#endif /* SWIFT_DEBUG_CHECKS */ - /* Get some other useful values. */ const int count_i = ci->count; const int count_j = cj->count; @@ -853,7 +942,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, const int active_ci = cell_is_active(ci, e); const int active_cj = cell_is_active(cj, e); - /* Check if any particles are active and return if there are not. */ + /* Count number of particles that are in range and active*/ int numActive = 0; if (active_ci) { @@ -878,6 +967,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, } } + /* Return if there are no active particles within range */ if (numActive == 0) return; /* Get both particle caches from the runner and re-allocate @@ -900,8 +990,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, max_index_j = r->cj_cache.max_index; /* Find particles maximum index into cj, max_index_i[] and ci, max_index_j[]. - */ - /* Also find the first pi that interacts with any particle in cj and the last + * Also find the first pi that interacts with any particle in cj and the last * pj that interacts with any particle in ci. */ populate_max_index_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, hi_max, hj_max, di_max, dj_min, max_index_i, max_index_j, @@ -916,15 +1005,12 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, last_pj = max(last_pj, max_index_i[count_i - 1]); first_pi = min(first_pi, max_index_j[0]); - /* Read the needed particles into the two caches. */ - int first_pi_align = first_pi; - int last_pj_align = last_pj; + /* Read the required particles into the two caches. */ cache_read_two_partial_cells_sorted(ci, cj, ci_cache, cj_cache, sort_i, - sort_j, shift, &first_pi_align, - &last_pj_align, 1); + sort_j, shift, &first_pi, &last_pj); /* Get the number of particles read into the ci cache. */ - int ci_cache_count = count_i - first_pi_align; + const int ci_cache_count = count_i - first_pi; if (active_ci) { @@ -936,7 +1022,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, if (!part_is_active_no_debug(pi, max_active_bin)) continue; /* Set the cache index. */ - int ci_cache_idx = pid - first_pi_align; + const int ci_cache_idx = pid - first_pi; /* Skip this particle if no particle in cj is within range of it. */ const float hi = ci_cache->h[ci_cache_idx]; @@ -945,113 +1031,101 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, if (di_test < dj_min) continue; /* Determine the exit iteration of the interaction loop. */ - int exit_iteration = max_index_i[pid]; - - const float hig2 = hi * hi * kernel_gamma2; - - vector pix, piy, piz; + const int exit_iteration = max_index_i[pid]; /* Fill particle pi vectors. */ - pix.v = vec_set1(ci_cache->x[ci_cache_idx]); - piy.v = vec_set1(ci_cache->y[ci_cache_idx]); - piz.v = vec_set1(ci_cache->z[ci_cache_idx]); - v_hi.v = vec_set1(hi); - v_vix.v = vec_set1(ci_cache->vx[ci_cache_idx]); - v_viy.v = vec_set1(ci_cache->vy[ci_cache_idx]); - v_viz.v = vec_set1(ci_cache->vz[ci_cache_idx]); - - v_hig2.v = vec_set1(hig2); + const vector v_pix = vector_set1(ci_cache->x[ci_cache_idx]); + const vector v_piy = vector_set1(ci_cache->y[ci_cache_idx]); + const vector v_piz = vector_set1(ci_cache->z[ci_cache_idx]); + const vector v_hi = vector_set1(hi); + const vector v_vix = vector_set1(ci_cache->vx[ci_cache_idx]); + const vector v_viy = vector_set1(ci_cache->vy[ci_cache_idx]); + const vector v_viz = vector_set1(ci_cache->vz[ci_cache_idx]); - /* Reset cumulative sums of update vectors. */ - vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, - curlvySum, curlvzSum; + const float hig2 = hi * hi * kernel_gamma2; + const vector v_hig2 = vector_set1(hig2); /* Get the inverse of hi. */ - vector v_hi_inv; - - v_hi_inv = vec_reciprocal(v_hi); + vector v_hi_inv = vec_reciprocal(v_hi); - rhoSum.v = vec_setzero(); - rho_dhSum.v = vec_setzero(); - wcountSum.v = vec_setzero(); - wcount_dhSum.v = vec_setzero(); - div_vSum.v = vec_setzero(); - curlvxSum.v = vec_setzero(); - curlvySum.v = vec_setzero(); - curlvzSum.v = vec_setzero(); + /* Reset cumulative sums of update vectors. */ + vector v_rhoSum = vector_setzero(); + vector v_rho_dhSum = vector_setzero(); + vector v_wcountSum = vector_setzero(); + vector v_wcount_dhSum = vector_setzero(); + vector v_div_vSum = vector_setzero(); + vector v_curlvxSum = vector_setzero(); + vector v_curlvySum = vector_setzero(); + vector v_curlvzSum = vector_setzero(); /* Pad the exit iteration if there is a serial remainder. */ int exit_iteration_align = exit_iteration; - int rem = exit_iteration % VEC_SIZE; + const int rem = exit_iteration % VEC_SIZE; if (rem != 0) { - int pad = VEC_SIZE - rem; + const int pad = VEC_SIZE - rem; - if (exit_iteration_align + pad <= last_pj_align + 1) + if (exit_iteration_align + pad <= last_pj + 1) exit_iteration_align += pad; } - vector pjx, pjy, pjz; - - /* Loop over the parts in cj. */ - for (int pjd = 0; pjd < exit_iteration_align; pjd += VEC_SIZE) { + /* Loop over the parts in cj. Making sure to perform an iteration of the + * loop even if exit_iteration_align is zero and there is only one + * particle to interact with.*/ + for (int pjd = 0; pjd <= exit_iteration_align; pjd += VEC_SIZE) { /* Get the cache index to the jth particle. */ - int cj_cache_idx = pjd; + const int cj_cache_idx = pjd; vector v_dx, v_dy, v_dz, v_r2; #ifdef SWIFT_DEBUG_CHECKS if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0 || - cj_cache_idx + (VEC_SIZE - 1) > (last_pj_align + 1 + VEC_SIZE)) { - error("Unaligned read!!! cj_cache_idx=%d, last_pj_align=%d", - cj_cache_idx, last_pj_align); + cj_cache_idx + (VEC_SIZE - 1) > (last_pj + 1 + VEC_SIZE)) { + error("Unaligned read!!! cj_cache_idx=%d, last_pj=%d", cj_cache_idx, + last_pj); } #endif /* Load 2 sets of vectors from the particle cache. */ - pjx.v = vec_load(&cj_cache->x[cj_cache_idx]); - pjy.v = vec_load(&cj_cache->y[cj_cache_idx]); - pjz.v = vec_load(&cj_cache->z[cj_cache_idx]); + const vector v_pjx = vector_load(&cj_cache->x[cj_cache_idx]); + const vector v_pjy = vector_load(&cj_cache->y[cj_cache_idx]); + const vector v_pjz = vector_load(&cj_cache->z[cj_cache_idx]); /* Compute the pairwise distance. */ - v_dx.v = vec_sub(pix.v, pjx.v); - v_dy.v = vec_sub(piy.v, pjy.v); - v_dz.v = vec_sub(piz.v, pjz.v); + v_dx.v = vec_sub(v_pix.v, v_pjx.v); + v_dy.v = vec_sub(v_piy.v, v_pjy.v); + v_dz.v = vec_sub(v_piz.v, v_pjz.v); v_r2.v = vec_mul(v_dx.v, v_dx.v); v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); mask_t v_doi_mask; - int doi_mask; /* Form r2 < hig2 mask. */ vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v)); - /* Form integer mask. */ - doi_mask = vec_form_int_mask(v_doi_mask); - /* If there are any interactions perform them. */ - if (doi_mask) + if (vec_is_mask_true(v_doi_mask)) runner_iact_nonsym_1_vec_density( &v_r2, &v_dx, &v_dy, &v_dz, v_hi_inv, v_vix, v_viy, v_viz, &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx], - &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], &rhoSum, - &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, - &curlvySum, &curlvzSum, v_doi_mask); + &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], + &v_rhoSum, &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum, + &v_div_vSum, &v_curlvxSum, &v_curlvySum, &v_curlvzSum, + v_doi_mask); } /* loop over the parts in cj. */ - /* Perform horizontal adds on vector sums and store result in particle pi. - */ - VEC_HADD(rhoSum, pi->rho); - VEC_HADD(rho_dhSum, pi->density.rho_dh); - VEC_HADD(wcountSum, pi->density.wcount); - VEC_HADD(wcount_dhSum, pi->density.wcount_dh); - VEC_HADD(div_vSum, pi->density.div_v); - VEC_HADD(curlvxSum, pi->density.rot_v[0]); - VEC_HADD(curlvySum, pi->density.rot_v[1]); - VEC_HADD(curlvzSum, pi->density.rot_v[2]); + /* Perform horizontal adds on vector sums and store result in pi. */ + VEC_HADD(v_rhoSum, pi->rho); + VEC_HADD(v_rho_dhSum, pi->density.rho_dh); + VEC_HADD(v_wcountSum, pi->density.wcount); + VEC_HADD(v_wcount_dhSum, pi->density.wcount_dh); + VEC_HADD(v_div_vSum, pi->density.div_v); + VEC_HADD(v_curlvxSum, pi->density.rot_v[0]); + VEC_HADD(v_curlvySum, pi->density.rot_v[1]); + VEC_HADD(v_curlvzSum, pi->density.rot_v[2]); } /* loop over the parts in ci. */ } @@ -1066,7 +1140,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, if (!part_is_active_no_debug(pj, max_active_bin)) continue; /* Set the cache index. */ - int cj_cache_idx = pjd; + const int cj_cache_idx = pjd; /* Skip this particle if no particle in ci is within range of it. */ const float hj = cj_cache->h[cj_cache_idx]; @@ -1074,49 +1148,38 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, if (dj_test > di_max) continue; /* Determine the exit iteration of the interaction loop. */ - int exit_iteration = max_index_j[pjd]; - - const float hjg2 = hj * hj * kernel_gamma2; - - vector pjx, pjy, pjz; - vector v_hj, v_vjx, v_vjy, v_vjz, v_hjg2; + const int exit_iteration = max_index_j[pjd]; /* Fill particle pi vectors. */ - pjx.v = vec_set1(cj_cache->x[cj_cache_idx]); - pjy.v = vec_set1(cj_cache->y[cj_cache_idx]); - pjz.v = vec_set1(cj_cache->z[cj_cache_idx]); - v_hj.v = vec_set1(hj); - v_vjx.v = vec_set1(cj_cache->vx[cj_cache_idx]); - v_vjy.v = vec_set1(cj_cache->vy[cj_cache_idx]); - v_vjz.v = vec_set1(cj_cache->vz[cj_cache_idx]); + const vector v_pjx = vector_set1(cj_cache->x[cj_cache_idx]); + const vector v_pjy = vector_set1(cj_cache->y[cj_cache_idx]); + const vector v_pjz = vector_set1(cj_cache->z[cj_cache_idx]); + const vector v_hj = vector_set1(hj); + const vector v_vjx = vector_set1(cj_cache->vx[cj_cache_idx]); + const vector v_vjy = vector_set1(cj_cache->vy[cj_cache_idx]); + const vector v_vjz = vector_set1(cj_cache->vz[cj_cache_idx]); - v_hjg2.v = vec_set1(hjg2); - - /* Reset cumulative sums of update vectors. */ - vector rhoSum, rho_dhSum, wcountSum, wcount_dhSum, div_vSum, curlvxSum, - curlvySum, curlvzSum; + const float hjg2 = hj * hj * kernel_gamma2; + const vector v_hjg2 = vector_set1(hjg2); /* Get the inverse of hj. */ - vector v_hj_inv; + vector v_hj_inv = vec_reciprocal(v_hj); - v_hj_inv = vec_reciprocal(v_hj); - - rhoSum.v = vec_setzero(); - rho_dhSum.v = vec_setzero(); - wcountSum.v = vec_setzero(); - wcount_dhSum.v = vec_setzero(); - div_vSum.v = vec_setzero(); - curlvxSum.v = vec_setzero(); - curlvySum.v = vec_setzero(); - curlvzSum.v = vec_setzero(); - - vector pix, piy, piz; + /* Reset cumulative sums of update vectors. */ + vector v_rhoSum = vector_setzero(); + vector v_rho_dhSum = vector_setzero(); + vector v_wcountSum = vector_setzero(); + vector v_wcount_dhSum = vector_setzero(); + vector v_div_vSum = vector_setzero(); + vector v_curlvxSum = vector_setzero(); + vector v_curlvySum = vector_setzero(); + vector v_curlvzSum = vector_setzero(); /* Convert exit iteration to cache indices. */ - int exit_iteration_align = exit_iteration - first_pi_align; + int exit_iteration_align = exit_iteration - first_pi; /* Pad the exit iteration align so cache reads are aligned. */ - int rem = exit_iteration_align % VEC_SIZE; + const int rem = exit_iteration_align % VEC_SIZE; if (exit_iteration_align < VEC_SIZE) { exit_iteration_align = 0; } else @@ -1128,61 +1191,56 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, #ifdef SWIFT_DEBUG_CHECKS if (ci_cache_idx % VEC_SIZE != 0 || ci_cache_idx < 0 || - ci_cache_idx + (VEC_SIZE - 1) > - (count_i - first_pi_align + VEC_SIZE)) { + ci_cache_idx + (VEC_SIZE - 1) > (count_i - first_pi + VEC_SIZE)) { error( - "Unaligned read!!! ci_cache_idx=%d, first_pi_align=%d, " + "Unaligned read!!! ci_cache_idx=%d, first_pi=%d, " "count_i=%d", - ci_cache_idx, first_pi_align, count_i); + ci_cache_idx, first_pi, count_i); } #endif vector v_dx, v_dy, v_dz, v_r2; /* Load 2 sets of vectors from the particle cache. */ - pix.v = vec_load(&ci_cache->x[ci_cache_idx]); - piy.v = vec_load(&ci_cache->y[ci_cache_idx]); - piz.v = vec_load(&ci_cache->z[ci_cache_idx]); + const vector v_pix = vector_load(&ci_cache->x[ci_cache_idx]); + const vector v_piy = vector_load(&ci_cache->y[ci_cache_idx]); + const vector v_piz = vector_load(&ci_cache->z[ci_cache_idx]); /* Compute the pairwise distance. */ - v_dx.v = vec_sub(pjx.v, pix.v); - v_dy.v = vec_sub(pjy.v, piy.v); - v_dz.v = vec_sub(pjz.v, piz.v); + v_dx.v = vec_sub(v_pjx.v, v_pix.v); + v_dy.v = vec_sub(v_pjy.v, v_piy.v); + v_dz.v = vec_sub(v_pjz.v, v_piz.v); v_r2.v = vec_mul(v_dx.v, v_dx.v); v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); mask_t v_doj_mask; - int doj_mask; /* Form r2 < hig2 mask. */ vec_create_mask(v_doj_mask, vec_cmp_lt(v_r2.v, v_hjg2.v)); - /* Form integer mask. */ - doj_mask = vec_form_int_mask(v_doj_mask); - /* If there are any interactions perform them. */ - if (doj_mask) + if (vec_is_mask_true(v_doj_mask)) runner_iact_nonsym_1_vec_density( &v_r2, &v_dx, &v_dy, &v_dz, v_hj_inv, v_vjx, v_vjy, v_vjz, &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx], - &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], &rhoSum, - &rho_dhSum, &wcountSum, &wcount_dhSum, &div_vSum, &curlvxSum, - &curlvySum, &curlvzSum, v_doj_mask); + &ci_cache->vz[ci_cache_idx], &ci_cache->m[ci_cache_idx], + &v_rhoSum, &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum, + &v_div_vSum, &v_curlvxSum, &v_curlvySum, &v_curlvzSum, + v_doj_mask); } /* loop over the parts in ci. */ - /* Perform horizontal adds on vector sums and store result in particle pj. - */ - VEC_HADD(rhoSum, pj->rho); - VEC_HADD(rho_dhSum, pj->density.rho_dh); - VEC_HADD(wcountSum, pj->density.wcount); - VEC_HADD(wcount_dhSum, pj->density.wcount_dh); - VEC_HADD(div_vSum, pj->density.div_v); - VEC_HADD(curlvxSum, pj->density.rot_v[0]); - VEC_HADD(curlvySum, pj->density.rot_v[1]); - VEC_HADD(curlvzSum, pj->density.rot_v[2]); + /* Perform horizontal adds on vector sums and store result in pj. */ + VEC_HADD(v_rhoSum, pj->rho); + VEC_HADD(v_rho_dhSum, pj->density.rho_dh); + VEC_HADD(v_wcountSum, pj->density.wcount); + VEC_HADD(v_wcount_dhSum, pj->density.wcount_dh); + VEC_HADD(v_div_vSum, pj->density.div_v); + VEC_HADD(v_curlvxSum, pj->density.rot_v[0]); + VEC_HADD(v_curlvySum, pj->density.rot_v[1]); + VEC_HADD(v_curlvzSum, pj->density.rot_v[2]); } /* loop over the parts in cj. */ } @@ -1191,3 +1249,381 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, #endif /* WITH_VECTORIZATION */ } + +/** + * @brief Compute the force interactions between a cell pair (non-symmetric) + * using vector intrinsics. + * + * @param r The #runner. + * @param ci The first #cell. + * @param cj The second #cell. + * @param sid The direction of the pair + * @param shift The shift vector to apply to the particles in ci. + */ +void runner_dopair2_force_vec(struct runner *r, struct cell *ci, + struct cell *cj, const int sid, + const double *shift) { + +#ifdef WITH_VECTORIZATION + const struct engine *restrict e = r->e; + const timebin_t max_active_bin = e->max_active_bin; + + TIMER_TIC; + + /* Get the cutoff shift. */ + double rshift = 0.0; + for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k]; + + /* Pick-out the sorted lists. */ + const struct entry *restrict sort_i = ci->sort[sid]; + const struct entry *restrict sort_j = cj->sort[sid]; + + /* Get some other useful values. */ + const int count_i = ci->count; + const int count_j = cj->count; + const double hi_max = ci->h_max * kernel_gamma - rshift; + const double hj_max = cj->h_max * kernel_gamma; + const double hi_max_raw = ci->h_max; + const double hj_max_raw = cj->h_max; + struct part *restrict parts_i = ci->parts; + struct part *restrict parts_j = cj->parts; + const double di_max = sort_i[count_i - 1].d - rshift; + const double dj_min = sort_j[0].d; + const float dx_max = (ci->dx_max_sort + cj->dx_max_sort); + const int active_ci = cell_is_active(ci, e); + const int active_cj = cell_is_active(cj, e); + + /* Check if any particles are active and in range */ + int numActive = 0; + + /* Use the largest smoothing length to make sure that no interactions are + * missed. */ + const double h_max = max(hi_max, hj_max); + + if (active_ci) { + for (int pid = count_i - 1; + pid >= 0 && sort_i[pid].d + h_max + dx_max > dj_min; pid--) { + struct part *restrict pi = &parts_i[sort_i[pid].i]; + if (part_is_active(pi, e)) { + numActive++; + break; + } + } + } + + if (!numActive && active_cj) { + for (int pjd = 0; pjd < count_j && sort_j[pjd].d - h_max - dx_max < di_max; + pjd++) { + struct part *restrict pj = &parts_j[sort_j[pjd].i]; + if (part_is_active_no_debug(pj, max_active_bin)) { + numActive++; + break; + } + } + } + + /* Return if no active particle in range */ + if (numActive == 0) return; + + /* Get both particle caches from the runner and re-allocate + * them if they are not big enough for the cells. */ + struct cache *restrict ci_cache = &r->ci_cache; + struct cache *restrict cj_cache = &r->cj_cache; + + if (ci_cache->count < count_i) { + cache_init(ci_cache, count_i); + } + if (cj_cache->count < count_j) { + cache_init(cj_cache, count_j); + } + + int first_pi, last_pj; + int *max_index_i SWIFT_CACHE_ALIGN; + int *max_index_j SWIFT_CACHE_ALIGN; + + max_index_i = r->ci_cache.max_index; + max_index_j = r->cj_cache.max_index; + + /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */ + /* Also find the first pi that interacts with any particle in cj and the last + * pj that interacts with any particle in ci. */ + populate_max_index_no_cache_force(ci, cj, sort_i, sort_j, dx_max, rshift, + hi_max_raw, hj_max_raw, hi_max, hj_max, + di_max, dj_min, max_index_i, max_index_j, + &first_pi, &last_pj, max_active_bin); + + /* Limits of the outer loops. */ + const int first_pi_loop = first_pi; + const int last_pj_loop = last_pj; + + /* Take the max/min of both values calculated to work out how many particles + * to read into the cache. */ + last_pj = max(last_pj, max_index_i[count_i - 1]); + first_pi = min(first_pi, max_index_j[0]); + + /* Read the required particles into the two caches. */ + cache_read_two_partial_cells_sorted_force(ci, cj, ci_cache, cj_cache, sort_i, + sort_j, shift, &first_pi, &last_pj); + + /* Get the number of particles read into the ci cache. */ + const int ci_cache_count = count_i - first_pi; + + if (active_ci) { + + /* Loop over the parts in ci until nothing is within range in cj. */ + for (int pid = count_i - 1; pid >= first_pi_loop; pid--) { + + /* Get a hold of the ith part in ci. */ + struct part *restrict pi = &parts_i[sort_i[pid].i]; + if (!part_is_active(pi, e)) continue; + + /* Set the cache index. */ + const int ci_cache_idx = pid - first_pi; + + /* Skip this particle if no particle in cj is within range of it. */ + const float hi = ci_cache->h[ci_cache_idx]; + const double di_test = + sort_i[pid].d + max(hi, hj_max_raw) * kernel_gamma + dx_max - rshift; + if (di_test < dj_min) continue; + + /* Determine the exit iteration of the interaction loop. */ + const int exit_iteration = max_index_i[pid]; + + /* Fill particle pi vectors. */ + const vector v_pix = vector_set1(ci_cache->x[ci_cache_idx]); + const vector v_piy = vector_set1(ci_cache->y[ci_cache_idx]); + const vector v_piz = vector_set1(ci_cache->z[ci_cache_idx]); + const vector v_hi = vector_set1(hi); + const vector v_vix = vector_set1(ci_cache->vx[ci_cache_idx]); + const vector v_viy = vector_set1(ci_cache->vy[ci_cache_idx]); + const vector v_viz = vector_set1(ci_cache->vz[ci_cache_idx]); + const vector v_rhoi = vector_set1(ci_cache->rho[ci_cache_idx]); + const vector v_grad_hi = vector_set1(ci_cache->grad_h[ci_cache_idx]); + const vector v_pOrhoi2 = vector_set1(ci_cache->pOrho2[ci_cache_idx]); + const vector v_balsara_i = vector_set1(ci_cache->balsara[ci_cache_idx]); + const vector v_ci = vector_set1(ci_cache->soundspeed[ci_cache_idx]); + + const float hig2 = hi * hi * kernel_gamma2; + const vector v_hig2 = vector_set1(hig2); + + /* Get the inverse of hi. */ + vector v_hi_inv = vec_reciprocal(v_hi); + + /* Reset cumulative sums of update vectors. */ + vector v_a_hydro_xSum = vector_setzero(); + vector v_a_hydro_ySum = vector_setzero(); + vector v_a_hydro_zSum = vector_setzero(); + vector v_h_dtSum = vector_setzero(); + vector v_sigSum = vector_set1(pi->force.v_sig); + vector v_entropy_dtSum = vector_setzero(); + + /* Pad the exit iteration if there is a serial remainder. */ + int exit_iteration_align = exit_iteration; + const int rem = exit_iteration % VEC_SIZE; + if (rem != 0) { + int pad = VEC_SIZE - rem; + + if (exit_iteration_align + pad <= last_pj + 1) + exit_iteration_align += pad; + } + + /* Loop over the parts in cj. Making sure to perform an iteration of the + * loop even if exit_iteration_align is zero and there is only one + * particle to interact with.*/ + for (int pjd = 0; pjd <= exit_iteration_align; pjd += VEC_SIZE) { + + /* Get the cache index to the jth particle. */ + const int cj_cache_idx = pjd; + + vector v_dx, v_dy, v_dz, v_r2; + vector v_hjg2; + +#ifdef SWIFT_DEBUG_CHECKS + if (cj_cache_idx % VEC_SIZE != 0 || cj_cache_idx < 0 || + cj_cache_idx + (VEC_SIZE - 1) > (last_pj + 1 + VEC_SIZE)) { + error("Unaligned read!!! cj_cache_idx=%d, last_pj=%d", cj_cache_idx, + last_pj); + } +#endif + + /* Load 2 sets of vectors from the particle cache. */ + const vector v_pjx = vector_load(&cj_cache->x[cj_cache_idx]); + const vector v_pjy = vector_load(&cj_cache->y[cj_cache_idx]); + const vector v_pjz = vector_load(&cj_cache->z[cj_cache_idx]); + const vector v_hj = vector_load(&cj_cache->h[cj_cache_idx]); + v_hjg2.v = vec_mul(vec_mul(v_hj.v, v_hj.v), kernel_gamma2_vec.v); + + /* Compute the pairwise distance. */ + v_dx.v = vec_sub(v_pix.v, v_pjx.v); + v_dy.v = vec_sub(v_piy.v, v_pjy.v); + v_dz.v = vec_sub(v_piz.v, v_pjz.v); + + v_r2.v = vec_mul(v_dx.v, v_dx.v); + v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); + v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); + + mask_t v_doi_mask; + + /* Form a mask from r2 < hig2 mask and r2 < hjg2 mask. */ + vector v_h2; + v_h2.v = vec_fmax(v_hig2.v, v_hjg2.v); + vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_h2.v)); + + /* If there are any interactions perform them. */ + if (vec_is_mask_true(v_doi_mask)) { + vector v_hj_inv = vec_reciprocal(v_hj); + + runner_iact_nonsym_1_vec_force( + &v_r2, &v_dx, &v_dy, &v_dz, v_vix, v_viy, v_viz, v_rhoi, + v_grad_hi, v_pOrhoi2, v_balsara_i, v_ci, + &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx], + &cj_cache->vz[cj_cache_idx], &cj_cache->rho[cj_cache_idx], + &cj_cache->grad_h[cj_cache_idx], &cj_cache->pOrho2[cj_cache_idx], + &cj_cache->balsara[cj_cache_idx], + &cj_cache->soundspeed[cj_cache_idx], &cj_cache->m[cj_cache_idx], + v_hi_inv, v_hj_inv, &v_a_hydro_xSum, &v_a_hydro_ySum, + &v_a_hydro_zSum, &v_h_dtSum, &v_sigSum, &v_entropy_dtSum, + v_doi_mask); + } + + } /* loop over the parts in cj. */ + + /* Perform horizontal adds on vector sums and store result in pi. */ + VEC_HADD(v_a_hydro_xSum, pi->a_hydro[0]); + VEC_HADD(v_a_hydro_ySum, pi->a_hydro[1]); + VEC_HADD(v_a_hydro_zSum, pi->a_hydro[2]); + VEC_HADD(v_h_dtSum, pi->force.h_dt); + VEC_HMAX(v_sigSum, pi->force.v_sig); + VEC_HADD(v_entropy_dtSum, pi->entropy_dt); + + } /* loop over the parts in ci. */ + } + + if (active_cj) { + + /* Loop over the parts in cj until nothing is within range in ci. */ + for (int pjd = 0; pjd <= last_pj_loop; pjd++) { + + /* Get a hold of the jth part in cj. */ + struct part *restrict pj = &parts_j[sort_j[pjd].i]; + if (!part_is_active(pj, e)) continue; + + /* Set the cache index. */ + const int cj_cache_idx = pjd; + + /* Skip this particle if no particle in ci is within range of it. */ + const float hj = cj_cache->h[cj_cache_idx]; + const double dj_test = + sort_j[pjd].d - max(hj, hi_max_raw) * kernel_gamma - dx_max; + if (dj_test > di_max) continue; + + /* Determine the exit iteration of the interaction loop. */ + const int exit_iteration = max_index_j[pjd]; + + /* Fill particle pi vectors. */ + const vector v_pjx = vector_set1(cj_cache->x[cj_cache_idx]); + const vector v_pjy = vector_set1(cj_cache->y[cj_cache_idx]); + const vector v_pjz = vector_set1(cj_cache->z[cj_cache_idx]); + const vector v_hj = vector_set1(hj); + const vector v_vjx = vector_set1(cj_cache->vx[cj_cache_idx]); + const vector v_vjy = vector_set1(cj_cache->vy[cj_cache_idx]); + const vector v_vjz = vector_set1(cj_cache->vz[cj_cache_idx]); + const vector v_rhoj = vector_set1(cj_cache->rho[cj_cache_idx]); + const vector v_grad_hj = vector_set1(cj_cache->grad_h[cj_cache_idx]); + const vector v_pOrhoj2 = vector_set1(cj_cache->pOrho2[cj_cache_idx]); + const vector v_balsara_j = vector_set1(cj_cache->balsara[cj_cache_idx]); + const vector v_cj = vector_set1(cj_cache->soundspeed[cj_cache_idx]); + + const float hjg2 = hj * hj * kernel_gamma2; + const vector v_hjg2 = vector_set1(hjg2); + + /* Get the inverse of hj. */ + vector v_hj_inv = vec_reciprocal(v_hj); + + /* Reset cumulative sums of update vectors. */ + vector v_a_hydro_xSum = vector_setzero(); + vector v_a_hydro_ySum = vector_setzero(); + vector v_a_hydro_zSum = vector_setzero(); + vector v_h_dtSum = vector_setzero(); + vector v_sigSum = vector_set1(pj->force.v_sig); + vector v_entropy_dtSum = vector_setzero(); + + /* Convert exit iteration to cache indices. */ + int exit_iteration_align = exit_iteration - first_pi; + + /* Pad the exit iteration align so cache reads are aligned. */ + const int rem = exit_iteration_align % VEC_SIZE; + if (exit_iteration_align < VEC_SIZE) { + exit_iteration_align = 0; + } else + exit_iteration_align -= rem; + + /* Loop over the parts in ci. */ + for (int ci_cache_idx = exit_iteration_align; + ci_cache_idx < ci_cache_count; ci_cache_idx += VEC_SIZE) { + +#ifdef SWIFT_DEBUG_CHECKS + if (ci_cache_idx % VEC_SIZE != 0 || ci_cache_idx < 0) { + error("Unaligned read!!! ci_cache_idx=%d", ci_cache_idx); + } +#endif + + vector v_hig2; + vector v_dx, v_dy, v_dz, v_r2; + + /* Load 2 sets of vectors from the particle cache. */ + const vector v_pix = vector_load(&ci_cache->x[ci_cache_idx]); + const vector v_piy = vector_load(&ci_cache->y[ci_cache_idx]); + const vector v_piz = vector_load(&ci_cache->z[ci_cache_idx]); + const vector v_hi = vector_load(&ci_cache->h[ci_cache_idx]); + v_hig2.v = vec_mul(vec_mul(v_hi.v, v_hi.v), kernel_gamma2_vec.v); + + /* Compute the pairwise distance. */ + v_dx.v = vec_sub(v_pjx.v, v_pix.v); + v_dy.v = vec_sub(v_pjy.v, v_piy.v); + v_dz.v = vec_sub(v_pjz.v, v_piz.v); + + v_r2.v = vec_mul(v_dx.v, v_dx.v); + v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); + v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); + + mask_t v_doj_mask; + + /* Form a mask from r2 < hig2 mask and r2 < hjg2 mask. */ + vector v_h2; + v_h2.v = vec_fmax(v_hjg2.v, v_hig2.v); + vec_create_mask(v_doj_mask, vec_cmp_lt(v_r2.v, v_h2.v)); + + /* If there are any interactions perform them. */ + if (vec_is_mask_true(v_doj_mask)) { + vector v_hi_inv = vec_reciprocal(v_hi); + + runner_iact_nonsym_1_vec_force( + &v_r2, &v_dx, &v_dy, &v_dz, v_vjx, v_vjy, v_vjz, v_rhoj, + v_grad_hj, v_pOrhoj2, v_balsara_j, v_cj, + &ci_cache->vx[ci_cache_idx], &ci_cache->vy[ci_cache_idx], + &ci_cache->vz[ci_cache_idx], &ci_cache->rho[ci_cache_idx], + &ci_cache->grad_h[ci_cache_idx], &ci_cache->pOrho2[ci_cache_idx], + &ci_cache->balsara[ci_cache_idx], + &ci_cache->soundspeed[ci_cache_idx], &ci_cache->m[ci_cache_idx], + v_hj_inv, v_hi_inv, &v_a_hydro_xSum, &v_a_hydro_ySum, + &v_a_hydro_zSum, &v_h_dtSum, &v_sigSum, &v_entropy_dtSum, + v_doj_mask); + } + } /* loop over the parts in ci. */ + + /* Perform horizontal adds on vector sums and store result in pj. */ + VEC_HADD(v_a_hydro_xSum, pj->a_hydro[0]); + VEC_HADD(v_a_hydro_ySum, pj->a_hydro[1]); + VEC_HADD(v_a_hydro_zSum, pj->a_hydro[2]); + VEC_HADD(v_h_dtSum, pj->force.h_dt); + VEC_HMAX(v_sigSum, pj->force.v_sig); + VEC_HADD(v_entropy_dtSum, pj->entropy_dt); + + } /* loop over the parts in cj. */ + + TIMER_TOC(timer_dopair_density); + } + +#endif /* WITH_VECTORIZATION */ +} diff --git a/src/runner_doiact_vec.h b/src/runner_doiact_vec.h index 09dc76ef04df5d29ea32f4af24efdc09e433aa73..9d60c75b9545bfbee8023224e34d16d362560a3f 100644 --- a/src/runner_doiact_vec.h +++ b/src/runner_doiact_vec.h @@ -39,5 +39,8 @@ void runner_doself2_force_vec(struct runner *r, struct cell *restrict c); void runner_dopair1_density_vec(struct runner *r, struct cell *restrict ci, struct cell *restrict cj, const int sid, const double *shift); +void runner_dopair2_force_vec(struct runner *r, struct cell *restrict ci, + struct cell *restrict cj, const int sid, + const double *shift); #endif /* SWIFT_RUNNER_VEC_H */ diff --git a/src/tools.c b/src/tools.c index 3ee55db3d5f5348699372d2620b6d15af38b23d0..89d89e62ea092ba6c6ec661e423d3e0ee44eb7fe 100644 --- a/src/tools.c +++ b/src/tools.c @@ -254,6 +254,7 @@ void pairs_all_force(struct runner *r, struct cell *ci, struct cell *cj) { float r2, hi, hj, hig2, hjg2, dx[3]; struct part *pi, *pj; const double dim[3] = {r->e->s->dim[0], r->e->s->dim[1], r->e->s->dim[2]}; + const struct engine *e = r->e; /* Implements a double-for loop and checks every interaction */ for (int i = 0; i < ci->count; ++i) { @@ -262,6 +263,9 @@ void pairs_all_force(struct runner *r, struct cell *ci, struct cell *cj) { hi = pi->h; hig2 = hi * hi * kernel_gamma2; + /* Skip inactive particles. */ + if (!part_is_active(pi, e)) continue; + for (int j = 0; j < cj->count; ++j) { pj = &cj->parts[j]; @@ -292,6 +296,9 @@ void pairs_all_force(struct runner *r, struct cell *ci, struct cell *cj) { hj = pj->h; hjg2 = hj * hj * kernel_gamma2; + /* Skip inactive particles. */ + if (!part_is_active(pj, e)) continue; + for (int i = 0; i < ci->count; ++i) { pi = &ci->parts[i]; @@ -510,9 +517,7 @@ void shuffle_particles(struct part *parts, const int count) { parts[i] = particle; } - - } else - error("Array not big enough to shuffle!"); + } } /** diff --git a/src/vector.h b/src/vector.h index 3222a8a9b6a1bf7f5dc02e914a168d7c5c911b68..15a246d9b6a4d991638f7eb801e6173d977b1273 100644 --- a/src/vector.h +++ b/src/vector.h @@ -20,9 +20,6 @@ #ifndef SWIFT_VECTOR_H #define SWIFT_VECTOR_H -/* Have I already read this file? */ -#ifndef VEC_MACRO - /* Config parameters. */ #include "../config.h" @@ -84,7 +81,7 @@ #define vec_cmp_lte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ) #define vec_cmp_gte(a, b) _mm512_cmp_ps_mask(a, b, _CMP_GE_OQ) #define vec_cmp_result(a) ({ a; }) -#define vec_form_int_mask(a) ({ a; }) +#define vec_is_mask_true(a) ({ a; }) #define vec_and(a, b) _mm512_and_ps(a, b) #define vec_mask_and(a, b) _mm512_kand(a, b) #define vec_and_mask(a, mask) _mm512_maskz_mov_ps(mask, a) @@ -182,7 +179,7 @@ #define vec_cmp_lte(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ) #define vec_cmp_gte(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ) #define vec_cmp_result(a) _mm256_movemask_ps(a) -#define vec_form_int_mask(a) _mm256_movemask_ps(a.v) +#define vec_is_mask_true(a) _mm256_movemask_ps(a.v) #define vec_and(a, b) _mm256_and_ps(a, b) #define vec_mask_and(a, b) _mm256_and_ps(a.v, b.v) #define vec_and_mask(a, mask) _mm256_and_ps(a, mask.v) @@ -431,11 +428,48 @@ __attribute__((always_inline)) INLINE vector vec_reciprocal_sqrt(vector x) { return x_inv; } +/** + * @brief Returns a new vector with data loaded from a memory address. + * + * @param x memory address to load from. + * @return Loaded #vector. + */ +__attribute__((always_inline)) INLINE vector vector_load(float *const x) { + + vector temp; + temp.v = vec_load(x); + return temp; +} + +/** + * @brief Returns a vector filled with one value. + * + * @param x value to set each element. + * @return A #vector filled with a given constant. + */ +__attribute__((always_inline)) INLINE vector vector_set1(const float x) { + + vector temp; + temp.v = vec_set1(x); + return temp; +} + +/** + * @brief Returns a new vector filled with zeros. + * + * @return temp set #vector. + * @return A #vector filled with zeros. + */ +__attribute__((always_inline)) INLINE vector vector_setzero() { + + vector temp; + temp.v = vec_setzero(); + return temp; +} + #else /* Needed for cache alignment. */ #define VEC_SIZE 8 #endif /* WITH_VECTORIZATION */ -#endif /* VEC_MACRO */ - #endif /* SWIFT_VECTOR_H */ diff --git a/tests/Makefile.am b/tests/Makefile.am index 553980a93e907e83b65bb4539ca49c8bc1b7207b..f6f88b51cbdc5a7c5ab1d331b295418af66ade53 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -102,5 +102,5 @@ EXTRA_DIST = testReading.sh makeInput.py testActivePair.sh \ testPeriodicBCPerturbed.sh test125cells.sh test125cellsPerturbed.sh testParserInput.yaml \ difffloat.py tolerance_125_normal.dat tolerance_125_perturbed.dat \ tolerance_27_normal.dat tolerance_27_perturbed.dat tolerance_27_perturbed_h.dat tolerance_27_perturbed_h2.dat \ - tolerance_testInteractions.dat tolerance_pair_active.dat \ + tolerance_testInteractions.dat tolerance_pair_active.dat tolerance_pair_force_active.dat \ fft_params.yml tolerance_periodic_BC_normal.dat tolerance_periodic_BC_perturbed.dat diff --git a/tests/test125cells.c b/tests/test125cells.c index dbb3a7c3f5b052a3585aaf04b2cd8b24ded9adba..62ded5c1df47c9d40759dbc7aa239bb4a8a47dc4 100644 --- a/tests/test125cells.c +++ b/tests/test125cells.c @@ -33,13 +33,18 @@ #if defined(WITH_VECTORIZATION) #define DOSELF2 runner_doself2_force_vec +#define DOPAIR2 runner_dopair2_branch_force #define DOSELF2_NAME "runner_doself2_force_vec" -#define DOPAIR2_NAME "runner_dopair2_force" +#define DOPAIR2_NAME "runner_dopair2_force_vec" #endif #ifndef DOSELF2 #define DOSELF2 runner_doself2_force #define DOSELF2_NAME "runner_doself2_density" +#endif + +#ifndef DOPAIR2 +#define DOPAIR2 runner_dopair2_branch_force #define DOPAIR2_NAME "runner_dopair2_force" #endif @@ -445,11 +450,11 @@ void dump_particle_fields(char *fileName, struct cell *main_cell, } /* Just a forward declaration... */ -void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj); void runner_dopair1_branch_density(struct runner *r, struct cell *ci, struct cell *cj); void runner_doself1_density(struct runner *r, struct cell *ci); -void runner_dopair2_force(struct runner *r, struct cell *ci, struct cell *cj); +void runner_dopair2_branch_force(struct runner *r, struct cell *ci, + struct cell *cj); void runner_doself2_force(struct runner *r, struct cell *ci); void runner_doself2_force_vec(struct runner *r, struct cell *ci); @@ -625,7 +630,6 @@ int main(int argc, char *argv[]) { /* Start the test */ ticks time = 0; - ticks self_force_time = 0; for (size_t n = 0; n < runs; ++n) { const ticks tic = getticks(); @@ -696,6 +700,14 @@ int main(int argc, char *argv[]) { /* Do the force calculation */ #if !(defined(MINIMAL_SPH) && defined(WITH_VECTORIZATION)) +#ifdef WITH_VECTORIZATION + /* Initialise the cache. */ + runner.ci_cache.count = 0; + runner.cj_cache.count = 0; + cache_init(&runner.ci_cache, 512); + cache_init(&runner.cj_cache, 512); +#endif + int ctr = 0; /* Do the pairs (for the central 27 cells) */ for (int i = 1; i < 4; i++) { @@ -708,27 +720,19 @@ int main(int argc, char *argv[]) { const ticks sub_tic = getticks(); - runner_dopair2_force(&runner, main_cell, cj); + DOPAIR2(&runner, main_cell, cj); - const ticks sub_toc = getticks(); - timings[ctr++] += sub_toc - sub_tic; + timings[ctr++] += getticks() - sub_tic; } } } } -#ifdef WITH_VECTORIZATION - /* Initialise the cache. */ - runner.ci_cache.count = 0; - cache_init(&runner.ci_cache, 512); -#endif - ticks self_tic = getticks(); /* And now the self-interaction for the main cell */ DOSELF2(&runner, main_cell); - self_force_time += getticks() - self_tic; timings[26] += getticks() - self_tic; #endif @@ -761,11 +765,12 @@ int main(int argc, char *argv[]) { ticks face_time = timings[4] + timings[10] + timings[12] + timings[13] + timings[15] + timings[21]; + ticks self_time = timings[26]; + message("Corner calculations took : %15lli ticks.", corner_time / runs); message("Edge calculations took : %15lli ticks.", edge_time / runs); message("Face calculations took : %15lli ticks.", face_time / runs); - message("Self calculations took : %15lli ticks.", - self_force_time / runs); + message("Self calculations took : %15lli ticks.", self_time / runs); message("SWIFT calculation took : %15lli ticks.", time / runs); for (int j = 0; j < 125; ++j) diff --git a/tests/testActivePair.c b/tests/testActivePair.c index 1e0111b4f0e480d0f66463b4c2264cdd89bd28c8..c8e0233c2c317be321ed8b923a46a98e0905b36c 100644 --- a/tests/testActivePair.c +++ b/tests/testActivePair.c @@ -29,6 +29,9 @@ /* Local headers. */ #include "swift.h" +/* Typdef function pointer for interaction function. */ +typedef void (*interaction_func)(struct runner *, struct cell *, struct cell *); + /** * @brief Constructs a cell and all of its particle in a valid state prior to * a DOPAIR or DOSELF calcuation. @@ -108,6 +111,12 @@ struct cell *make_cell(size_t n, double *offset, double size, double h, else part->time_bin = num_time_bins + 1; + part->rho = 1.f; + part->force.f = 1.f; + part->force.P_over_rho2 = 1.f; + part->force.balsara = 1.f; + part->force.soundspeed = 1.f; + #ifdef SWIFT_DEBUG_CHECKS part->ti_drift = 8; part->ti_kick = 8; @@ -133,7 +142,7 @@ struct cell *make_cell(size_t n, double *offset, double size, double h, cell->ti_old_part = 8; cell->ti_end_min = 8; - cell->ti_end_max = 8; + cell->ti_end_max = 10; shuffle_particles(cell->parts, cell->count); @@ -156,6 +165,11 @@ void clean_up(struct cell *ci) { void zero_particle_fields(struct cell *c) { for (int pid = 0; pid < c->count; pid++) { hydro_init_part(&c->parts[pid], NULL); + c->parts[pid].rho = 1.f; + c->parts[pid].force.f = 1.f; + c->parts[pid].force.P_over_rho2 = 1.f; + c->parts[pid].force.balsara = 1.f; + c->parts[pid].force.soundspeed = 1.f; } } @@ -179,20 +193,20 @@ void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) { FILE *file = fopen(fileName, "a"); /* Write header */ - fprintf(file, "# %4s %13s\n", "ID", "wcount"); + fprintf(file, "# %4s %13s %13s\n", "ID", "wcount", "h_dt"); fprintf(file, "# ci --------------------------------------------\n"); for (int pid = 0; pid < ci->count; pid++) { - fprintf(file, "%6llu %13e\n", ci->parts[pid].id, - ci->parts[pid].density.wcount); + fprintf(file, "%6llu %13e %13e\n", ci->parts[pid].id, + ci->parts[pid].density.wcount, ci->parts[pid].force.h_dt); } fprintf(file, "# cj --------------------------------------------\n"); for (int pjd = 0; pjd < cj->count; pjd++) { - fprintf(file, "%6llu %13e\n", cj->parts[pjd].id, - cj->parts[pjd].density.wcount); + fprintf(file, "%6llu %13e %13e\n", cj->parts[pjd].id, + cj->parts[pjd].density.wcount, cj->parts[pjd].force.h_dt); } fclose(file); @@ -200,9 +214,13 @@ void dump_particle_fields(char *fileName, struct cell *ci, struct cell *cj) { /* Just a forward declaration... */ void runner_dopair1_density(struct runner *r, struct cell *ci, struct cell *cj); +void runner_dopair2_force_vec(struct runner *r, struct cell *ci, + struct cell *cj); void runner_doself1_density_vec(struct runner *r, struct cell *ci); void runner_dopair1_branch_density(struct runner *r, struct cell *ci, struct cell *cj); +void runner_dopair2_branch_force(struct runner *r, struct cell *ci, + struct cell *cj); /** * @brief Computes the pair interactions of two cells using SWIFT and a brute @@ -210,7 +228,9 @@ void runner_dopair1_branch_density(struct runner *r, struct cell *ci, */ void test_pair_interactions(struct runner *runner, struct cell **ci, struct cell **cj, char *swiftOutputFileName, - char *bruteForceOutputFileName) { + char *bruteForceOutputFileName, + interaction_func serial_interaction, + interaction_func vec_interaction) { runner_do_sort(runner, *ci, 0x1FFF, 0, 0); runner_do_sort(runner, *cj, 0x1FFF, 0, 0); @@ -220,7 +240,7 @@ void test_pair_interactions(struct runner *runner, struct cell **ci, zero_particle_fields(*cj); /* Run the test */ - runner_dopair1_branch_density(runner, *ci, *cj); + vec_interaction(runner, *ci, *cj); /* Let's get physical ! */ end_calculation(*ci); @@ -236,7 +256,7 @@ void test_pair_interactions(struct runner *runner, struct cell **ci, zero_particle_fields(*cj); /* Run the brute-force test */ - pairs_all_density(runner, *ci, *cj); + serial_interaction(runner, *ci, *cj); /* Let's get physical ! */ end_calculation(*ci); @@ -248,16 +268,26 @@ void test_pair_interactions(struct runner *runner, struct cell **ci, /** * @brief Computes the pair interactions of two cells in various configurations. */ -void test_all_pair_interactions(struct runner *runner, double *offset2, - size_t particles, double size, double h, - double rho, long long *partId, - double perturbation, double h_pert, - char *swiftOutputFileName, - char *bruteForceOutputFileName) { +void test_all_pair_interactions( + struct runner *runner, double *offset2, size_t particles, double size, + double h, double rho, long long *partId, double perturbation, double h_pert, + char *swiftOutputFileName, char *bruteForceOutputFileName, + interaction_func serial_interaction, interaction_func vec_interaction) { double offset1[3] = {0, 0, 0}; struct cell *ci, *cj; + /* Only one particle in each cell. */ + ci = make_cell(1, offset1, size, h, rho, partId, perturbation, h_pert, 1.); + cj = make_cell(1, offset2, size, h, rho, partId, perturbation, h_pert, 1.); + + test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, + bruteForceOutputFileName, serial_interaction, + vec_interaction); + + clean_up(ci); + clean_up(cj); + /* All active particles. */ ci = make_cell(particles, offset1, size, h, rho, partId, perturbation, h_pert, 1.); @@ -265,7 +295,8 @@ void test_all_pair_interactions(struct runner *runner, double *offset2, 1.); test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, - bruteForceOutputFileName); + bruteForceOutputFileName, serial_interaction, + vec_interaction); clean_up(ci); clean_up(cj); @@ -277,7 +308,8 @@ void test_all_pair_interactions(struct runner *runner, double *offset2, 0.5); test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, - bruteForceOutputFileName); + bruteForceOutputFileName, serial_interaction, + vec_interaction); clean_up(ci); clean_up(cj); @@ -289,7 +321,8 @@ void test_all_pair_interactions(struct runner *runner, double *offset2, 0.); test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, - bruteForceOutputFileName); + bruteForceOutputFileName, serial_interaction, + vec_interaction); clean_up(ci); clean_up(cj); @@ -301,7 +334,8 @@ void test_all_pair_interactions(struct runner *runner, double *offset2, 0.1); test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, - bruteForceOutputFileName); + bruteForceOutputFileName, serial_interaction, + vec_interaction); clean_up(ci); clean_up(cj); @@ -313,7 +347,8 @@ void test_all_pair_interactions(struct runner *runner, double *offset2, 0.); test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, - bruteForceOutputFileName); + bruteForceOutputFileName, serial_interaction, + vec_interaction); clean_up(ci); clean_up(cj); @@ -325,7 +360,8 @@ void test_all_pair_interactions(struct runner *runner, double *offset2, 1.0); test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, - bruteForceOutputFileName); + bruteForceOutputFileName, serial_interaction, + vec_interaction); clean_up(ci); clean_up(cj); @@ -335,7 +371,8 @@ void test_all_pair_interactions(struct runner *runner, double *offset2, cj = make_cell(2, offset2, size, h, rho, partId, perturbation, h_pert, 1.0); test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, - bruteForceOutputFileName); + bruteForceOutputFileName, serial_interaction, + vec_interaction); clean_up(ci); clean_up(cj); @@ -345,7 +382,8 @@ void test_all_pair_interactions(struct runner *runner, double *offset2, cj = make_cell(3, offset2, size, h, rho, partId, perturbation, h_pert, 0.75); test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, - bruteForceOutputFileName); + bruteForceOutputFileName, serial_interaction, + vec_interaction); clean_up(ci); clean_up(cj); @@ -357,7 +395,8 @@ void test_all_pair_interactions(struct runner *runner, double *offset2, 0.); test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, - bruteForceOutputFileName); + bruteForceOutputFileName, serial_interaction, + vec_interaction); clean_up(ci); clean_up(cj); @@ -369,7 +408,8 @@ void test_all_pair_interactions(struct runner *runner, double *offset2, 0.5); test_pair_interactions(runner, &ci, &cj, swiftOutputFileName, - bruteForceOutputFileName); + bruteForceOutputFileName, serial_interaction, + vec_interaction); /* Clean things to make the sanitizer happy ... */ clean_up(ci); @@ -470,7 +510,7 @@ int main(int argc, char *argv[]) { /* Create output file names. */ sprintf(swiftOutputFileName, "swift_dopair_%s.dat", outputFileNameExtension); - sprintf(bruteForceOutputFileName, "brute_force_%s.dat", + sprintf(bruteForceOutputFileName, "brute_force_pair_%s.dat", outputFileNameExtension); /* Delete files if they already exist. */ @@ -486,10 +526,56 @@ int main(int argc, char *argv[]) { double offset[3] = {1., 0., 0.}; + /* Define which interactions to call */ + interaction_func serial_inter_func = &pairs_all_density; + interaction_func vec_inter_func = &runner_dopair1_branch_density; + /* Test a pair of cells face-on. */ test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId, perturbation, h_pert, swiftOutputFileName, - bruteForceOutputFileName); + bruteForceOutputFileName, serial_inter_func, + vec_inter_func); + + /* Test a pair of cells edge-on. */ + offset[0] = 1.; + offset[1] = 1.; + offset[2] = 0.; + test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId, + perturbation, h_pert, swiftOutputFileName, + bruteForceOutputFileName, serial_inter_func, + vec_inter_func); + + /* Test a pair of cells corner-on. */ + offset[0] = 1.; + offset[1] = 1.; + offset[2] = 1.; + test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId, + perturbation, h_pert, swiftOutputFileName, + bruteForceOutputFileName, serial_inter_func, + vec_inter_func); + + /* Re-assign function pointers. */ + serial_inter_func = &pairs_all_force; + vec_inter_func = &runner_dopair2_branch_force; + + /* Create new output file names. */ + sprintf(swiftOutputFileName, "swift_dopair2_force_%s.dat", + outputFileNameExtension); + sprintf(bruteForceOutputFileName, "brute_force_dopair2_%s.dat", + outputFileNameExtension); + + /* Delete files if they already exist. */ + remove(swiftOutputFileName); + remove(bruteForceOutputFileName); + + /* Test a pair of cells face-on. */ + offset[0] = 1.; + offset[1] = 0.; + offset[2] = 0.; + test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId, + perturbation, h_pert, swiftOutputFileName, + bruteForceOutputFileName, serial_inter_func, + vec_inter_func); /* Test a pair of cells edge-on. */ offset[0] = 1.; @@ -497,7 +583,8 @@ int main(int argc, char *argv[]) { offset[2] = 0.; test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId, perturbation, h_pert, swiftOutputFileName, - bruteForceOutputFileName); + bruteForceOutputFileName, serial_inter_func, + vec_inter_func); /* Test a pair of cells corner-on. */ offset[0] = 1.; @@ -505,6 +592,7 @@ int main(int argc, char *argv[]) { offset[2] = 1.; test_all_pair_interactions(runner, offset, particles, size, h, rho, &partId, perturbation, h_pert, swiftOutputFileName, - bruteForceOutputFileName); + bruteForceOutputFileName, serial_inter_func, + vec_inter_func); return 0; } diff --git a/tests/testActivePair.sh.in b/tests/testActivePair.sh.in index 108da19c5c667a6a46a81707a7508a27f39f6b38..87d09cad7118c45b41c5a41058c331cbda92b613 100755 --- a/tests/testActivePair.sh.in +++ b/tests/testActivePair.sh.in @@ -8,7 +8,9 @@ echo "Running ./testActivePair -n 6 -r 1 -d 0 -f active" ./testActivePair -n 6 -r 1 -d 0 -f active -python @srcdir@/difffloat.py brute_force_active.dat swift_dopair_active.dat @srcdir@/tolerance_pair_active.dat +python @srcdir@/difffloat.py brute_force_pair_active.dat swift_dopair_active.dat @srcdir@/tolerance_pair_active.dat + +python @srcdir@/difffloat.py brute_force_dopair2_active.dat swift_dopair2_force_active.dat @srcdir@/tolerance_pair_force_active.dat rm -f brute_force_pair_active.dat swift_dopair_active.dat @@ -17,6 +19,8 @@ echo "Running ./testActivePair -n 6 -r 1 -d 0 -f active -s 1506434777" ./testActivePair -n 6 -r 1 -d 0 -f active -s 1506434777 -python @srcdir@/difffloat.py brute_force_active.dat swift_dopair_active.dat @srcdir@/tolerance_pair_active.dat +python @srcdir@/difffloat.py brute_force_pair_active.dat swift_dopair_active.dat @srcdir@/tolerance_pair_active.dat + +python @srcdir@/difffloat.py brute_force_dopair2_active.dat swift_dopair2_force_active.dat @srcdir@/tolerance_pair_force_active.dat exit $? diff --git a/tests/tolerance_27_perturbed_h.dat b/tests/tolerance_27_perturbed_h.dat index e25c3373afe6f2a8e7788dfc2d9eba0b74efdbee..5d6710c3946c0515f94c58eb3e0ecbfe14135b71 100644 --- a/tests/tolerance_27_perturbed_h.dat +++ b/tests/tolerance_27_perturbed_h.dat @@ -1,4 +1,4 @@ # ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 2.4e-6 1e-4 5e-4 1.2e-2 1.1e-5 3e-6 3e-6 8e-6 - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.2e-6 1.4e-2 1e-5 2e-3 2.5e-4 3e-3 3e-3 3e-3 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 3e-6 1e-4 5e-4 1.4e-2 1.1e-5 3e-6 3e-6 8e-6 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.5e-6 1.4e-2 1e-5 2e-3 2.5e-4 3e-3 3e-3 3e-3 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e0 1e-6 4e-6 4e-6 4e-6 diff --git a/tests/tolerance_27_perturbed_h2.dat b/tests/tolerance_27_perturbed_h2.dat index d83f36a887d92b55c90b21adae65ffe734a2f188..9ae7dfe979bce3d047c810be128521de52f8a8a9 100644 --- a/tests/tolerance_27_perturbed_h2.dat +++ b/tests/tolerance_27_perturbed_h2.dat @@ -1,4 +1,4 @@ # ID pos_x pos_y pos_z v_x v_y v_z rho rho_dh wcount wcount_dh div_v curl_vx curl_vy curl_vz 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 3e-6 1e-4 5e-4 1.5e-2 1.4e-5 3e-6 3e-6 1e-5 - 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.5e-6 1.57e-2 1e-5 5.86e-3 4.96e-4 3e-3 3e-3 3e-3 + 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1.5e-6 2.5e-2 1e-5 5.86e-3 4.96e-4 3e-3 3.7e-3 3e-3 0 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e-6 1e0 1e-6 4e-6 4e-6 4e-6 diff --git a/tests/tolerance_pair_active.dat b/tests/tolerance_pair_active.dat index b07697a686eb7801326ceaf77cf93fb3a1491c2e..087c33fb8fa73c6125231b3f58cc9412a96c771b 100644 --- a/tests/tolerance_pair_active.dat +++ b/tests/tolerance_pair_active.dat @@ -1,4 +1,4 @@ -# ID wcount - 0 1e-2 - 0 1e-2 - 0 1e-2 +# ID wcount h_dt + 0 1e-2 1 + 0 1e-2 1 + 0 1e-2 1 diff --git a/tests/tolerance_pair_force_active.dat b/tests/tolerance_pair_force_active.dat new file mode 100644 index 0000000000000000000000000000000000000000..c5b5620338b848f1b8f4658049e53a6424e93a60 --- /dev/null +++ b/tests/tolerance_pair_force_active.dat @@ -0,0 +1,4 @@ +# ID wcount h_dt + 0 1 2.4e-3 + 0 1 2.4e-3 + 0 1 1e-5