diff --git a/examples/EAGLE_100/eagle_100.yml b/examples/EAGLE_100/eagle_100.yml index 71cb0e96307cd6ce737986e3800824fd80313d48..651c906fb33634009f30f818e7a45d806c87fe46 100644 --- a/examples/EAGLE_100/eagle_100.yml +++ b/examples/EAGLE_100/eagle_100.yml @@ -40,4 +40,4 @@ SPH: # Parameters related to the initial conditions InitialConditions: file_name: ./EAGLE_ICs_100.hdf5 # The file to read - + cleanup_h: 1 diff --git a/examples/check_ngbs.py b/examples/check_ngbs.py index fca975e76457445480eb7b19e60e8f9174dee941..a4a07ce7bd6ffb817e8106b74d9895a0edbceca7 100644 --- a/examples/check_ngbs.py +++ b/examples/check_ngbs.py @@ -15,8 +15,13 @@ error = False inputFile1 = "" inputFile2 = "" +# Compare the values of two floats +def isclose(a, b, rel_tol=1e-09, abs_tol=0.0): + return abs(a-b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) + # Check list of density neighbours and check that they are correct. -def check_density_neighbours(pids, ngb_ids_naive, ngb_ids_sort, mask, pos, h, num_invalid, acc): +def check_density_neighbours(pids, ngb_ids_naive, ngb_ids_sort, mask, pos, + h_naive, h_sort, num_invalid, acc): for k in range(0,num_invalid): @@ -48,12 +53,17 @@ def check_density_neighbours(pids, ngb_ids_naive, ngb_ids_sort, mask, pos, h, nu pi_pos = pos[np.where(pids == pid)] pj_pos = pos[np.where(pids == pjd)] - hi = h[np.where(pids == pid)] + hi = h_naive[np.where(pids == pid)] dx = pi_pos[0][0] - pj_pos[0][0] dy = pi_pos[0][1] - pj_pos[0][1] dz = pi_pos[0][2] - pj_pos[0][2] - + + # Correct for BCs + dx = nearest(dx) + dy = nearest(dy) + dz = nearest(dz) + r2 = dx*dx + dy*dy + dz*dz hig2 = hi*hi*kernel_gamma2 @@ -65,12 +75,21 @@ def check_density_neighbours(pids, ngb_ids_naive, ngb_ids_sort, mask, pos, h, nu if diff < acc * hig2: print "Missing interaction due to precision issue will be ignored." else: - return True + hi_2 = h_sort[np.where(pids == pid)] + + # If a neigbour is missing and the particle has the same h throw + # an error. + if(isclose(hi,hi_2)): + print "Missing interaction found but particle has the same smoothing length (hi_1: %e, hi_2: %e)."%(hi, hi_2) + return True + else: + print "Missing interaction due to different smoothing lengths will be ignored (hi_1: %e, hi_2: %e)."%(hi, hi_2) return False # Check list of force neighbours and check that they are correct. -def check_force_neighbours(pids, ngb_ids_naive, ngb_ids_sort, mask, pos, h, num_invalid, acc): +def check_force_neighbours(pids, ngb_ids_naive, ngb_ids_sort, mask, pos, + h_naive, h_sort, num_invalid, acc): error_val = False @@ -90,13 +109,18 @@ def check_force_neighbours(pids, ngb_ids_naive, ngb_ids_sort, mask, pos, h, num_ pi_pos = pos[np.where(pids == pid)] pj_pos = pos[np.where(pids == pjd)] - hi = h[np.where(pids == pid)] - hj = h[np.where(pids == pjd)] + hi = h_naive[np.where(pids == pid)] + hj = h_naive[np.where(pids == pjd)] dx = pi_pos[0][0] - pj_pos[0][0] dy = pi_pos[0][1] - pj_pos[0][1] dz = pi_pos[0][2] - pj_pos[0][2] - + + # Correct for BCs + dx = nearest(dx) + dy = nearest(dy) + dz = nearest(dz) + r2 = dx*dx + dy*dy + dz*dz hig2 = hi*hi*kernel_gamma2 @@ -106,14 +130,26 @@ def check_force_neighbours(pids, ngb_ids_naive, ngb_ids_sort, mask, pos, h, num_ print "Particle {} is missing {}, hig2: {}, hjg2: {}, r2: {}, |r2 - max(hig2,hjg2)|: {}".format(pid,pjd,hig2, hjg2, r2, diff) - if diff < acc * max(hig2,hjg2): print "Missing interaction due to precision issue will be ignored." else: - error_val = True + hi_2 = h_sort[np.where(pids == pid)] + if(isclose(hi,hi_2)): + print "Missing interaction due to the same smoothing lengths will not be ignored (hi_1: %e, hi_2: %e)."%(hi, hi_2) + error_val = True + else: + print "Missing interaction due to different smoothing lengths will be ignored (hi_1: %e, hi_2: %e)."%(hi, hi_2) return error_val +def nearest(dx): + if(dx > 0.5 * box_size): + return dx - box_size + elif(dx < -0.5 * box_size): + return dx + box_size + else: + return dx + # Parse command line arguments if len(sys.argv) < 3: print "Error: pass input files as arguments" @@ -132,6 +168,8 @@ else: file_naive = h.File(inputFile1, "r") file_sort = h.File(inputFile2, "r") +box_size = file_naive["/Header"].attrs["BoxSize"][0] + # Read input file fields ids_naive = file_naive["/PartType0/ParticleIDs"][:] ids_sort = file_sort["/PartType0/ParticleIDs"][:] @@ -140,7 +178,7 @@ h_naive = file_naive["/PartType0/SmoothingLength"][:] h_sort = file_sort["/PartType0/SmoothingLength"][:] pos_naive = file_naive["/PartType0/Coordinates"][:,:] -pos_sort = file_sort["/PartType0/Coordinates"][:,:] +#pos_sort = file_sort["/PartType0/Coordinates"][:,:] num_density_naive = file_naive["/PartType0/Num_ngb_density"][:] num_density_sort = file_sort["/PartType0/Num_ngb_density"][:] @@ -154,6 +192,7 @@ neighbour_ids_density_sort = file_sort["/PartType0/Ids_ngb_density"][:] neighbour_ids_force_naive = file_naive["/PartType0/Ids_ngb_force"][:] neighbour_ids_force_sort = file_sort["/PartType0/Ids_ngb_force"][:] + #wcount_naive = file_naive["/PartType0/Wcount"][:] #wcount_sort = file_sort["/PartType0/Wcount"][:] # @@ -194,7 +233,7 @@ neighbour_ids_force_sort = neighbour_ids_force_sort[index_sort] h_naive = h_naive[index_naive] h_sort = h_sort[index_sort] pos_naive = pos_naive[index_naive] -pos_sort = pos_sort[index_sort] +#pos_sort = pos_sort[index_sort] neighbour_length_naive = len(neighbour_ids_density_naive[0]) neighbour_length_sort = len(neighbour_ids_density_sort[0]) @@ -234,7 +273,7 @@ print ids_naive[mask_density] # Check density neighbour lists error += check_density_neighbours(ids_naive, neighbour_ids_density_naive, - neighbour_ids_density_sort, mask_density, pos_naive, h_naive, + neighbour_ids_density_sort, mask_density, pos_naive, h_naive, h_sort, num_invalid_density, 2e-6) print "Num of density interactions", inputFile1 @@ -249,7 +288,7 @@ print ids_naive[mask_force] # Check force neighbour lists error += check_force_neighbours(ids_naive, neighbour_ids_force_naive, - neighbour_ids_force_sort, mask_force, pos_naive, h_naive, + neighbour_ids_force_sort, mask_force, pos_naive, h_naive, h_sort, num_invalid_force, 2e-6) print "Num of force interactions", inputFile1 diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml index 8975981fd0b3a4a1dab8e2daf278cdd40e2ddbf0..efaa5bf2d172f49954a4e8e118b8b8d555a6b380 100644 --- a/examples/parameter_example.yml +++ b/examples/parameter_example.yml @@ -8,14 +8,16 @@ InternalUnitSystem: # Parameters for the task scheduling Scheduler: - nr_queues: 0 # (Optional) The number of task queues to use. Use 0 to let the system decide. - cell_max_size: 8000000 # (Optional) Maximal number of interactions per task if we force the split (this is the default value). - cell_sub_size_pair: 256000000 # (Optional) Maximal number of interactions per sub-pair task (this is the default value). - cell_sub_size_self: 32000 # (Optional) Maximal number of interactions per sub-self task (this is the default value). - cell_split_size: 400 # (Optional) Maximal number of particles per cell (this is the default value). - max_top_level_cells: 12 # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value). - tasks_per_cell: 0 # (Optional) The average number of tasks per cell. If not large enough the simulation will fail (means guess...). - mpi_message_limit: 4096 # (Optional) Maximum MPI task message size to send non-buffered, KB. + nr_queues: 0 # (Optional) The number of task queues to use. Use 0 to let the system decide. + cell_max_size: 8000000 # (Optional) Maximal number of interactions per task if we force the split (this is the default value). + cell_sub_size_pair_hydro: 256000000 # (Optional) Maximal number of interactions per sub-pair hydro task (this is the default value). + cell_sub_size_self_hydro: 32000 # (Optional) Maximal number of interactions per sub-self hydro task (this is the default value). + cell_sub_size_pair_grav: 256000000 # (Optional) Maximal number of interactions per sub-pair gravity task (this is the default value). + cell_sub_size_self_grav: 32000 # (Optional) Maximal number of interactions per sub-self gravity task (this is the default value). + cell_split_size: 400 # (Optional) Maximal number of particles per cell (this is the default value). + max_top_level_cells: 12 # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value). + tasks_per_cell: 0 # (Optional) The average number of tasks per cell. If not large enough the simulation will fail (means guess...). + mpi_message_limit: 4096 # (Optional) Maximum MPI task message size to send non-buffered, KB. # Parameters governing the time integration (Set dt_min and dt_max to the same value for a fixed time-step run.) TimeIntegration: diff --git a/src/cache.h b/src/cache.h index c939da28589c1421e0e4241dca124a8320d5c87b..c41e11c34246ef0de93bb1ae7500277aab555b9e 100644 --- a/src/cache.h +++ b/src/cache.h @@ -216,6 +216,139 @@ __attribute__((always_inline)) INLINE void cache_read_particles( #endif } +/** + * @brief Populate cache by only reading particles that are within range of + * each other within the adjoining cell. Also read the particles into the cache + * in sorted order. + * + * @param ci The i #cell. + * @param ci_cache The #cache for cell ci. + * @param sort_i The array of sorted particle indices for cell ci. + * @param first_pi The first particle in cell ci that is in range. + * @param last_pi The last particle in cell ci that is in range. + * @param last_pi The last particle in cell ci that is in range. + * @param loc The cell location to remove from the particle positions. + * @param flipped Flag to check whether the cells have been flipped or not. + */ +__attribute__((always_inline)) INLINE void cache_read_particles_subset( + const struct cell *restrict const ci, struct cache *restrict const ci_cache, + const struct entry *restrict sort_i, int *first_pi, int *last_pi, + const double *loc, const int flipped) { + +#if defined(GADGET2_SPH) + + /* Let the compiler know that the data is aligned and create pointers to the + * arrays inside the cache. */ + swift_declare_aligned_ptr(float, x, ci_cache->x, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, y, ci_cache->y, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, z, ci_cache->z, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, h, ci_cache->h, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, m, ci_cache->m, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vx, ci_cache->vx, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vy, ci_cache->vy, SWIFT_CACHE_ALIGNMENT); + swift_declare_aligned_ptr(float, vz, ci_cache->vz, SWIFT_CACHE_ALIGNMENT); + + const struct part *restrict parts = ci->parts; + + /* The cell is on the right so read the particles + * into the cache from the start of the cell. */ + if (!flipped) { + const int rem = (*last_pi + 1) % VEC_SIZE; + if (rem != 0) { + const int pad = VEC_SIZE - rem; + + /* Increase last_pi if there are particles in the cell left to read. */ + if (*last_pi + pad < ci->count) *last_pi += pad; + } + + /* Shift the particles positions to a local frame so single precision can be + * used instead of double precision. */ + for (int i = 0; i < *last_pi; i++) { + const int idx = sort_i[i].i; + x[i] = (float)(parts[idx].x[0] - loc[0]); + y[i] = (float)(parts[idx].x[1] - loc[1]); + z[i] = (float)(parts[idx].x[2] - loc[2]); + h[i] = parts[idx].h; + m[i] = parts[idx].mass; + vx[i] = parts[idx].v[0]; + vy[i] = parts[idx].v[1]; + vz[i] = parts[idx].v[2]; + } + + /* Pad cache with fake particles that exist outside the cell so will not + * interact. We use values of the same magnitude (but negative!) as the real + * particles to avoid overflow problems. */ + const double max_dx = ci->dx_max_part; + const float pos_padded[3] = {-(2. * ci->width[0] + max_dx), + -(2. * ci->width[1] + max_dx), + -(2. * ci->width[2] + max_dx)}; + const float h_padded = ci->parts[0].h; + + for (int i = *last_pi; i < *last_pi + VEC_SIZE; i++) { + x[i] = pos_padded[0]; + y[i] = pos_padded[1]; + z[i] = pos_padded[2]; + h[i] = h_padded; + + m[i] = 1.f; + vx[i] = 1.f; + vy[i] = 1.f; + vz[i] = 1.f; + } + } + /* The cell is on the left so read the particles + * into the cache from the end of the cell. */ + else { + const int rem = (ci->count - *first_pi) % VEC_SIZE; + if (rem != 0) { + const int pad = VEC_SIZE - rem; + + /* Decrease first_pi if there are particles in the cell left to read. */ + if (*first_pi - pad >= 0) *first_pi -= pad; + } + + const int ci_cache_count = ci->count - *first_pi; + + /* Shift the particles positions to a local frame so single precision can be + * used instead of double precision. */ + for (int i = 0; i < ci_cache_count; i++) { + const int idx = sort_i[i + *first_pi].i; + x[i] = (float)(parts[idx].x[0] - loc[0]); + y[i] = (float)(parts[idx].x[1] - loc[1]); + z[i] = (float)(parts[idx].x[2] - loc[2]); + h[i] = parts[idx].h; + m[i] = parts[idx].mass; + vx[i] = parts[idx].v[0]; + vy[i] = parts[idx].v[1]; + vz[i] = parts[idx].v[2]; + } + + /* Pad cache with fake particles that exist outside the cell so will not + * interact. We use values of the same magnitude (but negative!) as the real + * particles to avoid overflow problems. */ + const double max_dx = ci->dx_max_part; + const float pos_padded[3] = {-(2. * ci->width[0] + max_dx), + -(2. * ci->width[1] + max_dx), + -(2. * ci->width[2] + max_dx)}; + const float h_padded = ci->parts[0].h; + + for (int i = ci->count - *first_pi; i < ci->count - *first_pi + VEC_SIZE; + i++) { + x[i] = pos_padded[0]; + y[i] = pos_padded[1]; + z[i] = pos_padded[2]; + h[i] = h_padded; + + m[i] = 1.f; + vx[i] = 1.f; + vy[i] = 1.f; + vz[i] = 1.f; + } + } + +#endif +} + /** * @brief Populate cache for force interactions by reading in the particles in * unsorted order. @@ -287,7 +420,6 @@ __attribute__((always_inline)) INLINE void cache_read_force_particles( * @param shift The amount to shift the particle positions to account for BCs * @param first_pi The first particle in cell ci that is in range. * @param last_pj The last particle in cell cj that is in range. - * interaction. */ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( const struct cell *restrict const ci, const struct cell *restrict const cj, @@ -505,7 +637,6 @@ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted( * @param shift The amount to shift the particle positions to account for BCs * @param first_pi The first particle in cell ci that is in range. * @param last_pj The last particle in cell cj that is in range. - * interaction. */ __attribute__((always_inline)) INLINE void cache_read_two_partial_cells_sorted_force( diff --git a/src/cell.c b/src/cell.c index 488637599f3019cd82128f63e58c303e49d9723e..a36b8fd2fd28157fd1b35df572c3c831f78d26a9 100644 --- a/src/cell.c +++ b/src/cell.c @@ -803,8 +803,8 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, ptrdiff_t sparts_offset, /* Fill the buffer with the indices. */ for (int k = 0; k < count; k++) { - const int bid = (buff[k].x[0] > pivot[0]) * 4 + - (buff[k].x[1] > pivot[1]) * 2 + (buff[k].x[2] > pivot[2]); + const int bid = (buff[k].x[0] >= pivot[0]) * 4 + + (buff[k].x[1] >= pivot[1]) * 2 + (buff[k].x[2] >= pivot[2]); bucket_count[bid]++; buff[k].ind = bid; } @@ -876,44 +876,44 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, ptrdiff_t sparts_offset, /* Verify a few sub-cells. */ for (int k = 0; k < c->progeny[0]->count; k++) - if (c->progeny[0]->parts[k].x[0] > pivot[0] || - c->progeny[0]->parts[k].x[1] > pivot[1] || - c->progeny[0]->parts[k].x[2] > pivot[2]) + if (c->progeny[0]->parts[k].x[0] >= pivot[0] || + c->progeny[0]->parts[k].x[1] >= pivot[1] || + c->progeny[0]->parts[k].x[2] >= pivot[2]) error("Sorting failed (progeny=0)."); for (int k = 0; k < c->progeny[1]->count; k++) - if (c->progeny[1]->parts[k].x[0] > pivot[0] || - c->progeny[1]->parts[k].x[1] > pivot[1] || - c->progeny[1]->parts[k].x[2] <= pivot[2]) + if (c->progeny[1]->parts[k].x[0] >= pivot[0] || + c->progeny[1]->parts[k].x[1] >= pivot[1] || + c->progeny[1]->parts[k].x[2] < pivot[2]) error("Sorting failed (progeny=1)."); for (int k = 0; k < c->progeny[2]->count; k++) - if (c->progeny[2]->parts[k].x[0] > pivot[0] || - c->progeny[2]->parts[k].x[1] <= pivot[1] || - c->progeny[2]->parts[k].x[2] > pivot[2]) + if (c->progeny[2]->parts[k].x[0] >= pivot[0] || + c->progeny[2]->parts[k].x[1] < pivot[1] || + c->progeny[2]->parts[k].x[2] >= pivot[2]) error("Sorting failed (progeny=2)."); for (int k = 0; k < c->progeny[3]->count; k++) - if (c->progeny[3]->parts[k].x[0] > pivot[0] || - c->progeny[3]->parts[k].x[1] <= pivot[1] || - c->progeny[3]->parts[k].x[2] <= pivot[2]) + if (c->progeny[3]->parts[k].x[0] >= pivot[0] || + c->progeny[3]->parts[k].x[1] < pivot[1] || + c->progeny[3]->parts[k].x[2] < pivot[2]) error("Sorting failed (progeny=3)."); for (int k = 0; k < c->progeny[4]->count; k++) - if (c->progeny[4]->parts[k].x[0] <= pivot[0] || - c->progeny[4]->parts[k].x[1] > pivot[1] || - c->progeny[4]->parts[k].x[2] > pivot[2]) + if (c->progeny[4]->parts[k].x[0] < pivot[0] || + c->progeny[4]->parts[k].x[1] >= pivot[1] || + c->progeny[4]->parts[k].x[2] >= pivot[2]) error("Sorting failed (progeny=4)."); for (int k = 0; k < c->progeny[5]->count; k++) - if (c->progeny[5]->parts[k].x[0] <= pivot[0] || - c->progeny[5]->parts[k].x[1] > pivot[1] || - c->progeny[5]->parts[k].x[2] <= pivot[2]) + if (c->progeny[5]->parts[k].x[0] < pivot[0] || + c->progeny[5]->parts[k].x[1] >= pivot[1] || + c->progeny[5]->parts[k].x[2] < pivot[2]) error("Sorting failed (progeny=5)."); for (int k = 0; k < c->progeny[6]->count; k++) - if (c->progeny[6]->parts[k].x[0] <= pivot[0] || - c->progeny[6]->parts[k].x[1] <= pivot[1] || - c->progeny[6]->parts[k].x[2] > pivot[2]) + if (c->progeny[6]->parts[k].x[0] < pivot[0] || + c->progeny[6]->parts[k].x[1] < pivot[1] || + c->progeny[6]->parts[k].x[2] >= pivot[2]) error("Sorting failed (progeny=6)."); for (int k = 0; k < c->progeny[7]->count; k++) - if (c->progeny[7]->parts[k].x[0] <= pivot[0] || - c->progeny[7]->parts[k].x[1] <= pivot[1] || - c->progeny[7]->parts[k].x[2] <= pivot[2]) + if (c->progeny[7]->parts[k].x[0] < pivot[0] || + c->progeny[7]->parts[k].x[1] < pivot[1] || + c->progeny[7]->parts[k].x[2] < pivot[2]) error("Sorting failed (progeny=7)."); #endif diff --git a/src/cooling/grackle/cooling.h b/src/cooling/grackle/cooling.h index 8e7a7dc86ff6a8f7be2193b43dea10ba933e3765..247df339c68eff6c15c5fcfe07d77afbea0722dd 100644 --- a/src/cooling/grackle/cooling.h +++ b/src/cooling/grackle/cooling.h @@ -26,8 +26,8 @@ /* Some standard headers. */ #include <float.h> -#include <math.h> #include <grackle.h> +#include <math.h> /* Local includes. */ #include "error.h" @@ -78,37 +78,34 @@ __attribute__((always_inline)) INLINE static float cooling_get_radiated_energy( return xp->cooling_data.radiated_energy; } - /** * @brief Prints the properties of the cooling model to stdout. * * @param cooling The properties of the cooling function. */ -__attribute__((always_inline))INLINE static void cooling_print_backend( +__attribute__((always_inline)) INLINE static void cooling_print_backend( const struct cooling_function_data* cooling) { message("Cooling function is 'Grackle'."); message("Using Grackle = %i", cooling->chemistry.use_grackle); - message("Chemical network = %i", cooling->chemistry.primordial_chemistry); - message("Radiative cooling = %i", cooling->chemistry.with_radiative_cooling); + message("Chemical network = %i", + cooling->chemistry.primordial_chemistry); + message("Radiative cooling = %i", + cooling->chemistry.with_radiative_cooling); message("Metal cooling = %i", cooling->chemistry.metal_cooling); - - message("CloudyTable = %s", - cooling->cloudy_table); + + message("CloudyTable = %s", cooling->cloudy_table); message("UVbackground = %d", cooling->uv_background); message("Redshift = %g", cooling->redshift); - message("Density Self Shielding = %g", - cooling->density_self_shielding); + message("Density Self Shielding = %g", cooling->density_self_shielding); message("Units:"); message("\tComoving = %i", cooling->units.comoving_coordinates); message("\tLength = %g", cooling->units.length_units); message("\tDensity = %g", cooling->units.density_units); message("\tTime = %g", cooling->units.time_units); message("\tScale Factor = %g", cooling->units.a_units); - } - /** * @brief Compute the cooling rate * @@ -126,8 +123,7 @@ __attribute__((always_inline)) INLINE static double cooling_rate( const struct cooling_function_data* restrict cooling, struct part* restrict p, float dt) { - if (cooling->chemistry.primordial_chemistry > 1) - error("Not implemented"); + if (cooling->chemistry.primordial_chemistry > 1) error("Not implemented"); /* set current time */ code_units units = cooling->units; @@ -144,7 +140,7 @@ __attribute__((always_inline)) INLINE static double cooling_rate( int grid_dimension[GRACKLE_RANK] = {GRACKLE_NPART, 1, 1}; int grid_start[GRACKLE_RANK] = {0, 0, 0}; int grid_end[GRACKLE_RANK] = {GRACKLE_NPART - 1, 0, 0}; - + data.grid_rank = GRACKLE_RANK; data.grid_dimension = grid_dimension; data.grid_start = grid_start; @@ -171,7 +167,7 @@ __attribute__((always_inline)) INLINE static double cooling_rate( /* gr_float HeII_density = 0.; */ /* gr_float HeIII_density = 0.; */ /* gr_float e_density = 0.; */ - + /* data.HI_density = &HI_density; */ /* data.HII_density = &HII_density; */ /* data.HeI_density = &HeI_density; */ @@ -183,16 +179,16 @@ __attribute__((always_inline)) INLINE static double cooling_rate( /* gr_float HM_density = 0.; */ /* gr_float H2I_density = 0.; */ /* gr_float H2II_density = 0.; */ - + /* data.HM_density = &HM_density; */ /* data.H2I_density = &H2I_density; */ /* data.H2II_density = &H2II_density; */ - + /* /\* primordial chemistry >= 3 *\/ */ /* gr_float DI_density = 0.; */ /* gr_float DII_density = 0.; */ /* gr_float HDI_density = 0.; */ - + /* data.DI_density = &DI_density; */ /* data.DII_density = &DII_density; */ /* data.HDI_density = &HDI_density; */ @@ -206,7 +202,7 @@ __attribute__((always_inline)) INLINE static double cooling_rate( /* gr_float volumetric_heating_rate = 0.; */ /* data.volumetric_heating_rate = &volumetric_heating_rate; */ - + /* /\* specific heating rate *\/ */ /* gr_float specific_heating_rate = 0.; */ @@ -216,7 +212,7 @@ __attribute__((always_inline)) INLINE static double cooling_rate( if (solve_chemistry(&units, &data, dt) == 0) { error("Error in solve_chemistry."); } - + return (energy - energy_before) / dt; } @@ -244,7 +240,7 @@ __attribute__((always_inline)) INLINE static void cooling_cool_part( const float du_dt = cooling_rate(phys_const, us, cooling, p, dt); /* record energy lost */ - xp->cooling_data.radiated_energy += - du_dt * dt * hydro_get_mass(p); + xp->cooling_data.radiated_energy += -du_dt * dt * hydro_get_mass(p); /* Update the internal energy */ hydro_set_internal_energy_dt(p, hydro_du_dt + du_dt); @@ -276,23 +272,23 @@ __attribute__((always_inline)) INLINE static float cooling_timestep( * @param phys_const The physical constants in internal units. * @param cooling The cooling properties to initialize */ -__attribute__((always_inline))INLINE static void cooling_init_backend( +__attribute__((always_inline)) INLINE static void cooling_init_backend( const struct swift_params* parameter_file, const struct unit_system* us, const struct phys_const* phys_const, struct cooling_function_data* cooling) { - /* read parameters */ + /* read parameters */ parser_get_param_string(parameter_file, "GrackleCooling:GrackleCloudyTable", cooling->cloudy_table); cooling->uv_background = - parser_get_param_int(parameter_file, "GrackleCooling:UVbackground"); + parser_get_param_int(parameter_file, "GrackleCooling:UVbackground"); cooling->redshift = - parser_get_param_double(parameter_file, "GrackleCooling:GrackleRedshift"); + parser_get_param_double(parameter_file, "GrackleCooling:GrackleRedshift"); cooling->density_self_shielding = parser_get_param_double( parameter_file, "GrackleCooling:GrackleHSShieldingDensityThreshold"); - + #ifdef SWIFT_DEBUG_CHECKS /* enable verbose for grackle */ grackle_verbose = 1; @@ -310,14 +306,16 @@ __attribute__((always_inline))INLINE static void cooling_init_backend( cooling->units.comoving_coordinates = 0; /* then units */ - cooling->units.density_units = us->UnitMass_in_cgs / pow(us->UnitLength_in_cgs, 3); + cooling->units.density_units = + us->UnitMass_in_cgs / pow(us->UnitLength_in_cgs, 3); cooling->units.length_units = us->UnitLength_in_cgs; cooling->units.time_units = us->UnitTime_in_cgs; - cooling->units.velocity_units = - cooling->units.a_units * cooling->units.length_units / cooling->units.time_units; + cooling->units.velocity_units = cooling->units.a_units * + cooling->units.length_units / + cooling->units.time_units; + + chemistry_data* chemistry = &cooling->chemistry; - chemistry_data *chemistry = &cooling->chemistry; - /* Create a chemistry object for parameters and rate data. */ if (set_default_chemistry_parameters(chemistry) == 0) { error("Error in set_default_chemistry_parameters."); @@ -341,7 +339,6 @@ __attribute__((always_inline))INLINE static void cooling_init_backend( error("Error in initialize_chemistry_data."); } - #ifdef SWIFT_DEBUG_CHECKS if (GRACKLE_NPART != 1) error("Grackle with multiple particles not implemented"); @@ -356,11 +353,9 @@ __attribute__((always_inline))INLINE static void cooling_init_backend( cooling_print_backend(cooling); message("Density Self Shielding = %g atom/cm3", threshold); - message(""); message("***************************************"); #endif - } #endif /* SWIFT_COOLING_GRACKLE_H */ diff --git a/src/debug.c b/src/debug.c index f63ed557d908607e6cd53edf8467675a938245d7..77d791a31787c7997a9a56e3204dac1160178976 100644 --- a/src/debug.c +++ b/src/debug.c @@ -224,8 +224,9 @@ int checkCellhdxmax(const struct cell *c, int *depth) { struct part *const p = &parts[k]; struct xpart *const xp = &xparts[k]; - if (p->x[0] < loc_min[0] || p->x[0] > loc_max[0] || p->x[1] < loc_min[1] || - p->x[1] > loc_max[1] || p->x[2] < loc_min[2] || p->x[2] > loc_max[2]) { + if (p->x[0] < loc_min[0] || p->x[0] >= loc_max[0] || p->x[1] < loc_min[1] || + p->x[1] >= loc_max[1] || p->x[2] < loc_min[2] || + p->x[2] >= loc_max[2]) { message( "Inconsistent part position p->x=[%e %e %e], c->loc=[%e %e %e] " diff --git a/src/engine.c b/src/engine.c index 70b3a1fb9b1208dd2dbd8c4657aca9eb1932271c..cef4e3e6f3aa00c40ba92e3980b9fc472b6898bc 100644 --- a/src/engine.c +++ b/src/engine.c @@ -2744,7 +2744,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, /* Now, build all the dependencies for the hydro */ engine_make_hydro_loops_dependencies(sched, t, t2, t3, t->ci, with_cooling); - scheduler_addunlock(sched, t3, t->ci->super->kick2); + scheduler_addunlock(sched, t3, t->ci->super->end_force); #else /* Start by constructing the task for the second hydro loop */ @@ -2791,14 +2791,14 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, if (t->ci->nodeID == nodeID) { engine_make_hydro_loops_dependencies(sched, t, t2, t3, t->ci, with_cooling); - scheduler_addunlock(sched, t3, t->ci->super->kick2); + scheduler_addunlock(sched, t3, t->ci->super->end_force); } if (t->cj->nodeID == nodeID) { if (t->ci->super_hydro != t->cj->super_hydro) engine_make_hydro_loops_dependencies(sched, t, t2, t3, t->cj, with_cooling); if (t->ci->super != t->cj->super) - scheduler_addunlock(sched, t3, t->cj->super->kick2); + scheduler_addunlock(sched, t3, t->cj->super->end_force); } #else @@ -2856,7 +2856,7 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, if (t->ci->nodeID == nodeID) { engine_make_hydro_loops_dependencies(sched, t, t2, t3, t->ci, with_cooling); - scheduler_addunlock(sched, t3, t->ci->super->kick2); + scheduler_addunlock(sched, t3, t->ci->super->end_force); } #else @@ -2913,14 +2913,14 @@ void engine_make_extra_hydroloop_tasks_mapper(void *map_data, int num_elements, if (t->ci->nodeID == nodeID) { engine_make_hydro_loops_dependencies(sched, t, t2, t3, t->ci, with_cooling); - scheduler_addunlock(sched, t3, t->ci->super->kick2); + scheduler_addunlock(sched, t3, t->ci->super->end_force); } if (t->cj->nodeID == nodeID) { if (t->ci->super_hydro != t->cj->super_hydro) engine_make_hydro_loops_dependencies(sched, t, t2, t3, t->cj, with_cooling); if (t->ci->super != t->cj->super) - scheduler_addunlock(sched, t3, t->cj->super->kick2); + scheduler_addunlock(sched, t3, t->cj->super->end_force); } #else @@ -4251,15 +4251,17 @@ void engine_init_particles(struct engine *e, int flag_entropy_ICs, /* Sorting should put the same positions next to each other... */ int failed = 0; double *prev_x = s->parts[0].x; + long long *prev_id = &s->parts[0].id; for (size_t k = 1; k < s->nr_parts; k++) { if (prev_x[0] == s->parts[k].x[0] && prev_x[1] == s->parts[k].x[1] && prev_x[2] == s->parts[k].x[2]) { if (e->verbose) - message("Two particles occupy location: %f %f %f", prev_x[0], - prev_x[1], prev_x[2]); + message("Two particles occupy location: %f %f %f id=%lld id=%lld", + prev_x[0], prev_x[1], prev_x[2], *prev_id, s->parts[k].id); failed++; } prev_x = s->parts[k].x; + prev_id = &s->parts[k].id; } if (failed > 0) error( diff --git a/src/partition.c b/src/partition.c index 297614e4b930851916e7fce7de6c9ca8faac2655..297ac71ac75ea4a5c5446033418b1bb4352b116b 100644 --- a/src/partition.c +++ b/src/partition.c @@ -580,7 +580,7 @@ static void repart_edge_metis(int partweights, int bothweights, int timebins, if (t->type == task_type_drift_part || t->type == task_type_drift_gpart || t->type == task_type_ghost || t->type == task_type_extra_ghost || t->type == task_type_kick1 || t->type == task_type_kick2 || - t->type == task_type_end_force || t->type == task_type_cooling || + t->type == task_type_end_force || t->type == task_type_cooling || t->type == task_type_timestep || t->type == task_type_init_grav || t->type == task_type_grav_down || t->type == task_type_grav_long_range) { diff --git a/src/periodic.h b/src/periodic.h index 5874b8742e89c5c93727111adb5b289cff4cb6a6..7df36ac4c3927363305a7ef9fe3b369f58e7dd85 100644 --- a/src/periodic.h +++ b/src/periodic.h @@ -31,12 +31,12 @@ * Only wraps once. If x > 2b, the returned value will be larger than b. * Similarly for x < -b. */ -#define box_wrap(x, a, b) \ - ({ \ - const __typeof__(x) _x = (x); \ - const __typeof__(a) _a = (a); \ - const __typeof__(b) _b = (b); \ - _x < _a ? (_x + _b) : ((_x > _b) ? (_x - _b) : _x); \ +#define box_wrap(x, a, b) \ + ({ \ + const __typeof__(x) _x = (x); \ + const __typeof__(a) _a = (a); \ + const __typeof__(b) _b = (b); \ + _x < _a ? (_x + _b) : ((_x >= _b) ? (_x - _b) : _x); \ }) /** diff --git a/src/runner.c b/src/runner.c index 289dd95dd027ebbf6d82483a954011de05c61a9f..592ccdb5b411dbfa0ea60dd3928a53a4a79c5cb5 100644 --- a/src/runner.c +++ b/src/runner.c @@ -774,23 +774,18 @@ void runner_do_ghost(struct runner *r, struct cell *c, int timer) { /* Self-interaction? */ if (l->t->type == task_type_self) -#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) - runner_doself_subset_density_vec(r, finger, parts, pid, count); -#else - runner_doself_subset_density(r, finger, parts, pid, count); -#endif + runner_doself_subset_branch_density(r, finger, parts, pid, count); /* Otherwise, pair interaction? */ else if (l->t->type == task_type_pair) { /* Left or right? */ if (l->t->ci == finger) - runner_dopair_subset_density(r, finger, parts, pid, count, - l->t->cj); + runner_dopair_subset_branch_density(r, finger, parts, pid, + count, l->t->cj); else - runner_dopair_subset_density(r, finger, parts, pid, count, - l->t->ci); - + runner_dopair_subset_branch_density(r, finger, parts, pid, + count, l->t->ci); } /* Otherwise, sub-self interaction? */ diff --git a/src/runner_doiact.h b/src/runner_doiact.h index 09b6a83ec29dd4f57b8a850ee77a2e5692e1c4ec..dda34a6dc3eb7cb3ee77789cfb4a8cd51db37266 100644 --- a/src/runner_doiact.h +++ b/src/runner_doiact.h @@ -41,6 +41,9 @@ #define _DOPAIR_SUBSET(f) PASTE(runner_dopair_subset, f) #define DOPAIR_SUBSET _DOPAIR_SUBSET(FUNCTION) +#define _DOPAIR_SUBSET_BRANCH(f) PASTE(runner_dopair_subset_branch, f) +#define DOPAIR_SUBSET_BRANCH _DOPAIR_SUBSET_BRANCH(FUNCTION) + #define _DOPAIR_SUBSET_NOSORT(f) PASTE(runner_dopair_subset_nosort, f) #define DOPAIR_SUBSET_NOSORT _DOPAIR_SUBSET_NOSORT(FUNCTION) @@ -74,6 +77,9 @@ #define _DOSELF_SUBSET(f) PASTE(runner_doself_subset, f) #define DOSELF_SUBSET _DOSELF_SUBSET(FUNCTION) +#define _DOSELF_SUBSET_BRANCH(f) PASTE(runner_doself_subset_branch, f) +#define DOSELF_SUBSET_BRANCH _DOSELF_SUBSET_BRANCH(FUNCTION) + #define _DOSUB_SELF1(f) PASTE(runner_dosub_self1, f) #define DOSUB_SELF1 _DOSUB_SELF1(FUNCTION) @@ -467,27 +473,18 @@ void DOSELF2_NAIVE(struct runner *r, struct cell *restrict c) { * @param ind The list of indices of particles in @c ci to interact with. * @param count The number of particles in @c ind. * @param cj The second #cell. + * @param shift The shift vector to apply to the particles in ci. */ void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci, struct part *restrict parts_i, int *restrict ind, - int count, struct cell *restrict cj) { - - const struct engine *e = r->e; + int count, struct cell *restrict cj, + const double *shift) { TIMER_TIC; const int count_j = cj->count; struct part *restrict parts_j = cj->parts; - /* Get the relative distance between the pairs, wrapping. */ - double shift[3] = {0.0, 0.0, 0.0}; - for (int k = 0; k < 3; k++) { - if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2) - shift[k] = e->s->dim[k]; - else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2) - shift[k] = -e->s->dim[k]; - } - /* Loop over the parts_i. */ for (int pid = 0; pid < count; pid++) { @@ -499,9 +496,9 @@ void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci, const float hig2 = hi * hi * kernel_gamma2; #ifdef SWIFT_DEBUG_CHECKS + const struct engine *e = r->e; if (!part_is_active(pi, e)) error("Trying to correct smoothing length of inactive particle !"); - #endif /* Loop over the parts in cj. */ @@ -547,48 +544,20 @@ void DOPAIR_SUBSET_NAIVE(struct runner *r, struct cell *restrict ci, * @param ind The list of indices of particles in @c ci to interact with. * @param count The number of particles in @c ind. * @param cj The second #cell. + * @param sid The direction of the pair. + * @param flipped Flag to check whether the cells have been flipped or not. + * @param shift The shift vector to apply to the particles in ci. */ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, struct part *restrict parts_i, int *restrict ind, int count, - struct cell *restrict cj) { - - const struct engine *e = r->e; - -#ifdef SWIFT_USE_NAIVE_INTERACTIONS - DOPAIR_SUBSET_NAIVE(r, ci, parts_i, ind, count, cj); - return; -#endif + struct cell *restrict cj, const int sid, const int flipped, + const double *shift) { TIMER_TIC; const int count_j = cj->count; struct part *restrict parts_j = cj->parts; - /* Get the relative distance between the pairs, wrapping. */ - double shift[3] = {0.0, 0.0, 0.0}; - for (int k = 0; k < 3; k++) { - if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2) - shift[k] = e->s->dim[k]; - else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2) - shift[k] = -e->s->dim[k]; - } - - /* Get the sorting index. */ - int sid = 0; - for (int k = 0; k < 3; k++) - sid = 3 * sid + ((cj->loc[k] - ci->loc[k] + shift[k] < 0) - ? 0 - : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1); - - /* Switch the cells around? */ - const int flipped = runner_flip[sid]; - sid = sortlistID[sid]; - - /* Has the cell cj been sorted? */ - if (!(cj->sorted & (1 << sid)) || - cj->dx_max_sort_old > space_maxreldx * cj->dmin) - error("Interacting unsorted cells."); - /* Pick-out the sorted lists. */ const struct entry *restrict sort_j = cj->sort[sid]; const float dxj = cj->dx_max_sort; @@ -624,6 +593,7 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]; #ifdef SWIFT_DEBUG_CHECKS + const struct engine *e = r->e; /* Check that particles have been drifted to the current time */ if (pi->ti_drift != e->ti_current) error("Particle pi not drifted to current time"); @@ -671,6 +641,7 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, const float r2 = dx[0] * dx[0] + dx[1] * dx[1] + dx[2] * dx[2]; #ifdef SWIFT_DEBUG_CHECKS + const struct engine *e = r->e; /* Check that particles have been drifted to the current time */ if (pi->ti_drift != e->ti_current) error("Particle pi not drifted to current time"); @@ -690,6 +661,64 @@ void DOPAIR_SUBSET(struct runner *r, struct cell *restrict ci, TIMER_TOC(timer_dopair_subset); } +/** + * @brief Determine which version of DOPAIR_SUBSET needs to be called depending + * on the + * orientation of the cells or whether DOPAIR_SUBSET needs to be called at all. + * + * @param r The #runner. + * @param ci The first #cell. + * @param parts_i The #part to interact with @c cj. + * @param ind The list of indices of particles in @c ci to interact with. + * @param count The number of particles in @c ind. + * @param cj The second #cell. + */ +void DOPAIR_SUBSET_BRANCH(struct runner *r, struct cell *restrict ci, + struct part *restrict parts_i, int *restrict ind, + int count, struct cell *restrict cj) { + + const struct engine *e = r->e; + + /* Get the relative distance between the pairs, wrapping. */ + double shift[3] = {0.0, 0.0, 0.0}; + for (int k = 0; k < 3; k++) { + if (cj->loc[k] - ci->loc[k] < -e->s->dim[k] / 2) + shift[k] = e->s->dim[k]; + else if (cj->loc[k] - ci->loc[k] > e->s->dim[k] / 2) + shift[k] = -e->s->dim[k]; + } + +#if !defined(SWIFT_USE_NAIVE_INTERACTIONS) + /* Get the sorting index. */ + int sid = 0; + for (int k = 0; k < 3; k++) + sid = 3 * sid + ((cj->loc[k] - ci->loc[k] + shift[k] < 0) + ? 0 + : (cj->loc[k] - ci->loc[k] + shift[k] > 0) ? 2 : 1); + + /* Switch the cells around? */ + const int flipped = runner_flip[sid]; + sid = sortlistID[sid]; + + /* Has the cell cj been sorted? */ + if (!(cj->sorted & (1 << sid)) || + cj->dx_max_sort_old > space_maxreldx * cj->dmin) + error("Interacting unsorted cells."); +#endif + +#if defined(SWIFT_USE_NAIVE_INTERACTIONS) + DOPAIR_SUBSET_NAIVE(r, ci, parts_i, ind, count, cj, shift); +#elif defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) + if (sort_is_face(sid)) + runner_dopair_subset_density_vec(r, ci, parts_i, ind, count, cj, sid, + flipped, shift); + else + DOPAIR_SUBSET(r, ci, parts_i, ind, count, cj, sid, flipped, shift); +#else + DOPAIR_SUBSET(r, ci, parts_i, ind, count, cj, sid, flipped, shift); +#endif +} + /** * @brief Compute the interactions between a cell pair, but only for the * given indices in ci. @@ -757,13 +786,34 @@ void DOSELF_SUBSET(struct runner *r, struct cell *restrict ci, TIMER_TOC(timer_doself_subset); } +/** + * @brief Determine which version of DOSELF_SUBSET needs to be called depending + * on the optimisation level. + + * @param r The #runner. + * @param ci The first #cell. + * @param parts The #part to interact. + * @param ind The list of indices of particles in @c ci to interact with. + * @param count The number of particles in @c ind. + */ +void DOSELF_SUBSET_BRANCH(struct runner *r, struct cell *restrict ci, + struct part *restrict parts, int *restrict ind, + int count) { + +#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) + runner_doself_subset_density_vec(r, ci, parts, ind, count); +#else + DOSELF_SUBSET(r, ci, parts, ind, count); +#endif +} + /** * @brief Compute the interactions between a cell pair (non-symmetric). * * @param r The #runner. * @param ci The first #cell. * @param cj The second #cell. - * @param sid The direction of the pair + * @param sid The direction of the pair. * @param shift The shift vector to apply to the particles in ci. */ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid, @@ -2461,11 +2511,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts, /* Otherwise, compute self-interaction. */ else -#if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) - runner_doself_subset_density_vec(r, ci, parts, ind, count); -#else - DOSELF_SUBSET(r, ci, parts, ind, count); -#endif + DOSELF_SUBSET_BRANCH(r, ci, parts, ind, count); } /* self-interaction. */ /* Otherwise, it's a pair interaction. */ @@ -2987,7 +3033,7 @@ void DOSUB_SUBSET(struct runner *r, struct cell *ci, struct part *parts, /* Do any of the cells need to be drifted first? */ if (!cell_are_part_drifted(cj, e)) error("Cell should be drifted!"); - DOPAIR_SUBSET(r, ci, parts, ind, count, cj); + DOPAIR_SUBSET_BRANCH(r, ci, parts, ind, count, cj); } } /* otherwise, pair interaction. */ diff --git a/src/runner_doiact_vec.c b/src/runner_doiact_vec.c index a31b39fc5c443132b7487ccd7d57dc47fd540132..aa6e6dc3c15f1846462b900076cf131e726cd883 100644 --- a/src/runner_doiact_vec.c +++ b/src/runner_doiact_vec.c @@ -262,7 +262,7 @@ __attribute__((always_inline)) INLINE static void storeInteractions( * @param active_ci Is any particle in cell ci active? * @param active_cj Is any particle in cell cj active? */ -__attribute__((always_inline)) INLINE static void populate_max_index_no_cache( +__attribute__((always_inline)) INLINE static void populate_max_index_density( const struct cell *ci, const struct cell *cj, const struct entry *restrict sort_i, const struct entry *restrict sort_j, const float dx_max, const float rshift, const double hi_max, @@ -421,8 +421,7 @@ __attribute__((always_inline)) INLINE static void populate_max_index_no_cache( * @param active_ci Is any particle in cell ci active? * @param active_cj Is any particle in cell cj active? */ -__attribute__((always_inline)) INLINE static void -populate_max_index_no_cache_force( +__attribute__((always_inline)) INLINE static void populate_max_index_force( const struct cell *ci, const struct cell *cj, const struct entry *restrict sort_i, const struct entry *restrict sort_j, const float dx_max, const float rshift, const double hi_max_raw, @@ -558,6 +557,88 @@ populate_max_index_no_cache_force( *init_pj = last_pj; } +/** + * @brief Populates the array max_index_i with the maximum + * index of + * particles into the neighbouring cell. Also finds the first/last pj that + * interacts with any particle in ci. + * + * @param count_i The number of particles in ci. + * @param count_j The number of particles in cj. + * @param parts_i The #part to interact with @c cj. + * @param ind The list of indices of particles in @c ci to interact with. + * @param total_ci_shift The shift vector to apply to the particles in ci. + * @param dxj Maximum particle movement allowed in cell cj. + * @param di_shift_correction The correction to di after the particles have been + * shifted to the frame of cell ci. + * @param runner_shift_x The runner_shift in the x direction. + * @param runner_shift_y The runner_shift in the y direction. + * @param runner_shift_z The runner_shift in the z direction. + * @param sort_j #entry array for particle distance in cj + * @param max_index_i array to hold the maximum distances of pi particles into + * #cell cj + * @param flipped Flag to check whether the cells have been flipped or not. + * @return first_pj/last_pj first or last pj to interact with any particle in ci + * depending whether the cells have been flipped or not. + */ +__attribute__((always_inline)) INLINE static int populate_max_index_subset( + const int count_i, const int count_j, struct part *restrict parts_i, + int *restrict ind, const double *total_ci_shift, const float dxj, + const double di_shift_correction, const double runner_shift_x, + const double runner_shift_y, const double runner_shift_z, + const struct entry *restrict sort_j, int *max_index_i, const int flipped) { + + /* The cell is on the right so read the particles + * into the cache from the start of the cell. */ + if (!flipped) { + + /* Find the rightmost particle in cell j that interacts with any + * particle in cell i. */ + int last_pj = 0; + + for (int pid = 0; pid < count_i; pid++) { + struct part *restrict pi = &parts_i[ind[pid]]; + const float pix = pi->x[0] - total_ci_shift[0]; + const float piy = pi->x[1] - total_ci_shift[1]; + const float piz = pi->x[2] - total_ci_shift[2]; + const float hi = pi->h; + + const double di = hi * kernel_gamma + dxj + pix * runner_shift_x + + piy * runner_shift_y + piz * runner_shift_z + + di_shift_correction; + + for (int pjd = last_pj; pjd < count_j && sort_j[pjd].d < di; pjd++) + last_pj++; + + max_index_i[pid] = last_pj; + } + return last_pj; + } + /* The cell is on the left so read the particles + * into the cache from the end of the cell. */ + else { + + int first_pj = count_j - 1; + + for (int pid = 0; pid < count_i; pid++) { + struct part *restrict pi = &parts_i[ind[pid]]; + const float pix = pi->x[0] - total_ci_shift[0]; + const float piy = pi->x[1] - total_ci_shift[1]; + const float piz = pi->x[2] - total_ci_shift[2]; + const float hi = pi->h; + + const double di = -hi * kernel_gamma - dxj + pix * runner_shift_x + + piy * runner_shift_y + piz * runner_shift_z + + di_shift_correction; + + for (int pjd = first_pj; pjd > 0 && di < sort_j[pjd].d; pjd--) first_pj--; + + max_index_i[pid] = first_pj; + } + return first_pj; + } +} + #endif /* WITH_VECTORIZATION && GADGET2_SPH */ /** @@ -567,8 +648,7 @@ populate_max_index_no_cache_force( * @param r The #runner. * @param c The #cell. */ -__attribute__((always_inline)) INLINE void runner_doself1_density_vec( - struct runner *r, struct cell *restrict c) { +void runner_doself1_density_vec(struct runner *r, struct cell *restrict c) { #if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) @@ -802,9 +882,9 @@ __attribute__((always_inline)) INLINE void runner_doself1_density_vec( * @param ind The list of indices of particles in @c c to interact with. * @param pi_count The number of particles in @c ind. */ -__attribute__((always_inline)) INLINE void runner_doself_subset_density_vec( - struct runner *r, struct cell *restrict c, struct part *restrict parts, - int *restrict ind, int pi_count) { +void runner_doself_subset_density_vec(struct runner *r, struct cell *restrict c, + struct part *restrict parts, + int *restrict ind, int pi_count) { #if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) @@ -1026,8 +1106,7 @@ __attribute__((always_inline)) INLINE void runner_doself_subset_density_vec( * @param r The #runner. * @param c The #cell. */ -__attribute__((always_inline)) INLINE void runner_doself2_force_vec( - struct runner *r, struct cell *restrict c) { +void runner_doself2_force_vec(struct runner *r, struct cell *restrict c) { #if defined(WITH_VECTORIZATION) && defined(GADGET2_SPH) @@ -1309,10 +1388,10 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, /* Find particles maximum index into cj, max_index_i[] and ci, max_index_j[]. * Also find the first pi that interacts with any particle in cj and the last * pj that interacts with any particle in ci. */ - populate_max_index_no_cache(ci, cj, sort_i, sort_j, dx_max, rshift, hi_max, - hj_max, di_max, dj_min, max_index_i, max_index_j, - &first_pi, &last_pj, max_active_bin, active_ci, - active_cj); + populate_max_index_density(ci, cj, sort_i, sort_j, dx_max, rshift, hi_max, + hj_max, di_max, dj_min, max_index_i, max_index_j, + &first_pi, &last_pj, max_active_bin, active_ci, + active_cj); /* Limits of the outer loops. */ const int first_pi_loop = first_pi; @@ -1364,7 +1443,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, const vector v_hig2 = vector_set1(hig2); /* Get the inverse of hi. */ - vector v_hi_inv = vec_reciprocal(v_hi); + const vector v_hi_inv = vec_reciprocal(v_hi); /* Reset cumulative sums of update vectors. */ vector v_rhoSum = vector_setzero(); @@ -1394,7 +1473,7 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, } #endif - /* Load 2 sets of vectors from the particle cache. */ + /* Load 1 set of vectors from the particle cache. */ const vector v_pjx = vector_load(&cj_cache->x[cj_cache_idx]); const vector v_pjy = vector_load(&cj_cache->y[cj_cache_idx]); const vector v_pjz = vector_load(&cj_cache->z[cj_cache_idx]); @@ -1584,6 +1663,314 @@ void runner_dopair1_density_vec(struct runner *r, struct cell *ci, #endif /* WITH_VECTORIZATION */ } +/** + * @brief Compute the interactions between a cell pair, but only for the + * given indices in ci. (Vectorised) + * + * @param r The #runner. + * @param ci The first #cell. + * @param parts_i The #part to interact with @c cj. + * @param ind The list of indices of particles in @c ci to interact with. + * @param count The number of particles in @c ind. + * @param cj The second #cell. + * @param sid The direction of the pair. + * @param flipped Flag to check whether the cells have been flipped or not. + * @param shift The shift vector to apply to the particles in ci. + */ +void runner_dopair_subset_density_vec(struct runner *r, + struct cell *restrict ci, + struct part *restrict parts_i, + int *restrict ind, int count, + struct cell *restrict cj, const int sid, + const int flipped, const double *shift) { + +#ifdef WITH_VECTORIZATION + + TIMER_TIC; + + const int count_j = cj->count; + + /* Pick-out the sorted lists. */ + const struct entry *restrict sort_j = cj->sort[sid]; + const float dxj = cj->dx_max_sort; + + /* Get both particle caches from the runner and re-allocate + * them if they are not big enough for the cells. */ + struct cache *restrict cj_cache = &r->cj_cache; + + if (cj_cache->count < count_j) cache_init(cj_cache, count_j); + + /* Pull each runner_shift from memory. */ + const double runner_shift_x = runner_shift[sid][0]; + const double runner_shift_y = runner_shift[sid][1]; + const double runner_shift_z = runner_shift[sid][2]; + + const double total_ci_shift[3] = { + ci->loc[0] + shift[0], ci->loc[1] + shift[1], ci->loc[2] + shift[2]}; + + /* Calculate the correction to di after the particles have been shifted to the + * frame of cell ci. */ + const double di_shift_correction = ci->loc[0] * runner_shift_x + + ci->loc[1] * runner_shift_y + + ci->loc[2] * runner_shift_z; + + double rshift = 0.0; + for (int k = 0; k < 3; k++) rshift += shift[k] * runner_shift[sid][k]; + + int *restrict max_index_i SWIFT_CACHE_ALIGN; + max_index_i = r->ci_cache.max_index; + + /* Parts are on the left? */ + if (!flipped) { + + int last_pj = populate_max_index_subset( + count, count_j, parts_i, ind, total_ci_shift, dxj, di_shift_correction, + runner_shift_x, runner_shift_y, runner_shift_z, sort_j, max_index_i, 0); + + /* Read the particles from the cell and store them locally in the cache. */ + cache_read_particles_subset(cj, cj_cache, sort_j, 0, &last_pj, ci->loc, 0); + + const double dj_min = sort_j[0].d; + + /* Loop over the parts_i. */ + for (int pid = 0; pid < count; pid++) { + + /* Get a hold of the ith part in ci. */ + struct part *restrict pi = &parts_i[ind[pid]]; + const float pix = pi->x[0] - total_ci_shift[0]; + const float piy = pi->x[1] - total_ci_shift[1]; + const float piz = pi->x[2] - total_ci_shift[2]; + const float hi = pi->h; + const float hig2 = hi * hi * kernel_gamma2; + + /* Skip this particle if no particle in cj is within range of it. */ + const double di = hi * kernel_gamma + dxj + pix * runner_shift_x + + piy * runner_shift_y + piz * runner_shift_z + + di_shift_correction; + if (di < dj_min) continue; + + /* Fill particle pi vectors. */ + const vector v_pix = vector_set1(pix); + const vector v_piy = vector_set1(piy); + const vector v_piz = vector_set1(piz); + const vector v_hi = vector_set1(hi); + const vector v_vix = vector_set1(pi->v[0]); + const vector v_viy = vector_set1(pi->v[1]); + const vector v_viz = vector_set1(pi->v[2]); + const vector v_hig2 = vector_set1(hig2); + + /* Get the inverse of hi. */ + vector v_hi_inv = vec_reciprocal(v_hi); + + /* Reset cumulative sums of update vectors. */ + vector v_rhoSum = vector_setzero(); + vector v_rho_dhSum = vector_setzero(); + vector v_wcountSum = vector_setzero(); + vector v_wcount_dhSum = vector_setzero(); + vector v_div_vSum = vector_setzero(); + vector v_curlvxSum = vector_setzero(); + vector v_curlvySum = vector_setzero(); + vector v_curlvzSum = vector_setzero(); + + int exit_iteration_end = max_index_i[pid] + 1; + + /* Loop over the parts in cj. */ + for (int pjd = 0; pjd < exit_iteration_end; pjd += VEC_SIZE) { + + /* Get the cache index to the jth particle. */ + const int cj_cache_idx = pjd; + + vector v_dx, v_dy, v_dz, v_r2; + + /* Load 1 set of vectors from the particle cache. */ + const vector v_pjx = vector_load(&cj_cache->x[cj_cache_idx]); + const vector v_pjy = vector_load(&cj_cache->y[cj_cache_idx]); + const vector v_pjz = vector_load(&cj_cache->z[cj_cache_idx]); + + /* Compute the pairwise distance. */ + v_dx.v = vec_sub(v_pix.v, v_pjx.v); + v_dy.v = vec_sub(v_piy.v, v_pjy.v); + v_dz.v = vec_sub(v_piz.v, v_pjz.v); + + v_r2.v = vec_mul(v_dx.v, v_dx.v); + v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); + v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); + + /* Form r2 < hig2 mask. */ + mask_t v_doi_mask; + vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v)); + +#ifdef DEBUG_INTERACTIONS_SPH + struct part *restrict parts_j = cj->parts; + for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) { + if (vec_is_mask_true(v_doi_mask) & (1 << bit_index)) { + if (pi->num_ngb_density < MAX_NUM_OF_NEIGHBOURS) { + pi->ids_ngbs_density[pi->num_ngb_density] = + parts_j[sort_j[pjd + bit_index].i].id; + } + ++pi->num_ngb_density; + } + } +#endif + + /* If there are any interactions perform them. */ + if (vec_is_mask_true(v_doi_mask)) + runner_iact_nonsym_1_vec_density( + &v_r2, &v_dx, &v_dy, &v_dz, v_hi_inv, v_vix, v_viy, v_viz, + &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx], + &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], + &v_rhoSum, &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum, + &v_div_vSum, &v_curlvxSum, &v_curlvySum, &v_curlvzSum, + v_doi_mask); + + } /* loop over the parts in cj. */ + + /* Perform horizontal adds on vector sums and store result in pi. */ + VEC_HADD(v_rhoSum, pi->rho); + VEC_HADD(v_rho_dhSum, pi->density.rho_dh); + VEC_HADD(v_wcountSum, pi->density.wcount); + VEC_HADD(v_wcount_dhSum, pi->density.wcount_dh); + VEC_HADD(v_div_vSum, pi->density.div_v); + VEC_HADD(v_curlvxSum, pi->density.rot_v[0]); + VEC_HADD(v_curlvySum, pi->density.rot_v[1]); + VEC_HADD(v_curlvzSum, pi->density.rot_v[2]); + + } /* loop over the parts in ci. */ + } + + /* Parts are on the right. */ + else { + + int first_pj = populate_max_index_subset( + count, count_j, parts_i, ind, total_ci_shift, dxj, di_shift_correction, + runner_shift_x, runner_shift_y, runner_shift_z, sort_j, max_index_i, 1); + + /* Read the particles from the cell and store them locally in the cache. */ + cache_read_particles_subset(cj, cj_cache, sort_j, &first_pj, 0, ci->loc, 1); + + /* Get the number of particles read into the ci cache. */ + const int cj_cache_count = count_j - first_pj; + + const double dj_max = sort_j[count_j - 1].d; + + /* Loop over the parts_i. */ + for (int pid = 0; pid < count; pid++) { + + /* Get a hold of the ith part in ci. */ + struct part *restrict pi = &parts_i[ind[pid]]; + const float pix = pi->x[0] - total_ci_shift[0]; + const float piy = pi->x[1] - total_ci_shift[1]; + const float piz = pi->x[2] - total_ci_shift[2]; + const float hi = pi->h; + const float hig2 = hi * hi * kernel_gamma2; + + /* Skip this particle if no particle in cj is within range of it. */ + const double di = -hi * kernel_gamma - dxj + pix * runner_shift_x + + piy * runner_shift_y + piz * runner_shift_z + + di_shift_correction; + if (di > dj_max) continue; + + /* Fill particle pi vectors. */ + const vector v_pix = vector_set1(pix); + const vector v_piy = vector_set1(piy); + const vector v_piz = vector_set1(piz); + const vector v_hi = vector_set1(hi); + const vector v_vix = vector_set1(pi->v[0]); + const vector v_viy = vector_set1(pi->v[1]); + const vector v_viz = vector_set1(pi->v[2]); + const vector v_hig2 = vector_set1(hig2); + + /* Get the inverse of hi. */ + vector v_hi_inv = vec_reciprocal(v_hi); + + /* Reset cumulative sums of update vectors. */ + vector v_rhoSum = vector_setzero(); + vector v_rho_dhSum = vector_setzero(); + vector v_wcountSum = vector_setzero(); + vector v_wcount_dhSum = vector_setzero(); + vector v_div_vSum = vector_setzero(); + vector v_curlvxSum = vector_setzero(); + vector v_curlvySum = vector_setzero(); + vector v_curlvzSum = vector_setzero(); + + int exit_iteration = max_index_i[pid]; + + /* Convert exit iteration to cache indices. */ + int exit_iteration_align = exit_iteration - first_pj; + + /* Pad the exit iteration align so cache reads are aligned. */ + const int rem = exit_iteration_align % VEC_SIZE; + if (exit_iteration_align < VEC_SIZE) { + exit_iteration_align = 0; + } else + exit_iteration_align -= rem; + + /* Loop over the parts in cj. */ + for (int cj_cache_idx = exit_iteration_align; + cj_cache_idx < cj_cache_count; cj_cache_idx += VEC_SIZE) { + + vector v_dx, v_dy, v_dz, v_r2; + + /* Load 1 set of vectors from the particle cache. */ + const vector v_pjx = vector_load(&cj_cache->x[cj_cache_idx]); + const vector v_pjy = vector_load(&cj_cache->y[cj_cache_idx]); + const vector v_pjz = vector_load(&cj_cache->z[cj_cache_idx]); + + /* Compute the pairwise distance. */ + v_dx.v = vec_sub(v_pix.v, v_pjx.v); + v_dy.v = vec_sub(v_piy.v, v_pjy.v); + v_dz.v = vec_sub(v_piz.v, v_pjz.v); + + v_r2.v = vec_mul(v_dx.v, v_dx.v); + v_r2.v = vec_fma(v_dy.v, v_dy.v, v_r2.v); + v_r2.v = vec_fma(v_dz.v, v_dz.v, v_r2.v); + + /* Form r2 < hig2 mask. */ + mask_t v_doi_mask; + vec_create_mask(v_doi_mask, vec_cmp_lt(v_r2.v, v_hig2.v)); + +#ifdef DEBUG_INTERACTIONS_SPH + struct part *restrict parts_j = cj->parts; + for (int bit_index = 0; bit_index < VEC_SIZE; bit_index++) { + if (vec_is_mask_true(v_doi_mask) & (1 << bit_index)) { + if (pi->num_ngb_density < MAX_NUM_OF_NEIGHBOURS) { + pi->ids_ngbs_density[pi->num_ngb_density] = + parts_j[sort_j[cj_cache_idx + first_pj + bit_index].i].id; + } + ++pi->num_ngb_density; + } + } +#endif + + /* If there are any interactions perform them. */ + if (vec_is_mask_true(v_doi_mask)) + runner_iact_nonsym_1_vec_density( + &v_r2, &v_dx, &v_dy, &v_dz, v_hi_inv, v_vix, v_viy, v_viz, + &cj_cache->vx[cj_cache_idx], &cj_cache->vy[cj_cache_idx], + &cj_cache->vz[cj_cache_idx], &cj_cache->m[cj_cache_idx], + &v_rhoSum, &v_rho_dhSum, &v_wcountSum, &v_wcount_dhSum, + &v_div_vSum, &v_curlvxSum, &v_curlvySum, &v_curlvzSum, + v_doi_mask); + + } /* loop over the parts in cj. */ + + /* Perform horizontal adds on vector sums and store result in pi. */ + VEC_HADD(v_rhoSum, pi->rho); + VEC_HADD(v_rho_dhSum, pi->density.rho_dh); + VEC_HADD(v_wcountSum, pi->density.wcount); + VEC_HADD(v_wcount_dhSum, pi->density.wcount_dh); + VEC_HADD(v_div_vSum, pi->density.div_v); + VEC_HADD(v_curlvxSum, pi->density.rot_v[0]); + VEC_HADD(v_curlvySum, pi->density.rot_v[1]); + VEC_HADD(v_curlvzSum, pi->density.rot_v[2]); + + } /* loop over the parts in ci. */ + } + + TIMER_TOC(timer_dopair_subset); +#endif /* WITH_VECTORIZATION */ +} + /** * @brief Compute the force interactions between a cell pair (non-symmetric) * using vector intrinsics. @@ -1691,10 +2078,10 @@ void runner_dopair2_force_vec(struct runner *r, struct cell *ci, /* Find particles maximum distance into cj, max_di[] and ci, max_dj[]. */ /* Also find the first pi that interacts with any particle in cj and the last * pj that interacts with any particle in ci. */ - populate_max_index_no_cache_force( - ci, cj, sort_i, sort_j, dx_max, rshift, hi_max_raw, hj_max_raw, h_max, - di_max, dj_min, max_index_i, max_index_j, &first_pi, &last_pj, - max_active_bin, active_ci, active_cj); + populate_max_index_force(ci, cj, sort_i, sort_j, dx_max, rshift, hi_max_raw, + hj_max_raw, h_max, di_max, dj_min, max_index_i, + max_index_j, &first_pi, &last_pj, max_active_bin, + active_ci, active_cj); /* Limits of the outer loops. */ const int first_pi_loop = first_pi; diff --git a/src/runner_doiact_vec.h b/src/runner_doiact_vec.h index 66240b1be542ba0c3d76c35e9d6e96a6849b02dc..6b01987c15513343d7a8784322803d7f834dea2c 100644 --- a/src/runner_doiact_vec.h +++ b/src/runner_doiact_vec.h @@ -40,6 +40,12 @@ void runner_doself_subset_density_vec(struct runner *r, int *restrict ind, int count); void runner_doself1_density_vec(struct runner *r, struct cell *restrict c); void runner_doself2_force_vec(struct runner *r, struct cell *restrict c); +void runner_dopair_subset_density_vec(struct runner *r, + struct cell *restrict ci, + struct part *restrict parts_i, + int *restrict ind, int count, + struct cell *restrict cj, const int sid, + const int flipped, const double *shift); void runner_dopair1_density_vec(struct runner *r, struct cell *restrict ci, struct cell *restrict cj, const int sid, const double *shift); diff --git a/src/scheduler.c b/src/scheduler.c index 6ef2a47d61e51a2ac6d2f103666a6cc9abd71a0d..0d96368e5711935813322107df4286176210552a 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -359,7 +359,7 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) { if (cell_can_split_self_task(ci)) { /* Make a sub? */ - if (scheduler_dosub && ci->count < space_subsize_self) { + if (scheduler_dosub && ci->count < space_subsize_self_hydro) { /* convert to a self-subtask. */ t->type = task_type_sub_self; @@ -419,7 +419,7 @@ static void scheduler_splittask_hydro(struct task *t, struct scheduler *s) { /* Replace by a single sub-task? */ if (scheduler_dosub && /* Use division to avoid integer overflow. */ - ci->count * sid_scale[sid] < space_subsize_pair / cj->count && + ci->count * sid_scale[sid] < space_subsize_pair_hydro / cj->count && !sort_is_corner(sid)) { /* Make this task a sub task. */ @@ -1179,7 +1179,7 @@ void scheduler_reweight(struct scheduler *s, int verbose) { int *tid = s->tasks_ind; struct task *tasks = s->tasks; const int nodeID = s->nodeID; - const float wscale = 0.001; + const float wscale = 0.001f; const ticks tic = getticks(); /* Run through the tasks backwards and set their weights. */ @@ -1198,47 +1198,47 @@ void scheduler_reweight(struct scheduler *s, int verbose) { case task_type_self: if (t->subtype == task_subtype_grav) - cost = 1 * wscale * t->ci->gcount * t->ci->gcount; + cost = 1.f * (wscale * t->ci->gcount) * t->ci->gcount; else if (t->subtype == task_subtype_external_grav) - cost = 1 * wscale * t->ci->gcount; + cost = 1.f * wscale * t->ci->gcount; else - cost = 1 * wscale * t->ci->count * t->ci->count; + cost = 1.f * (wscale * t->ci->count) * t->ci->count; break; case task_type_pair: if (t->subtype == task_subtype_grav) { if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) - cost = 3 * wscale * t->ci->gcount * t->cj->gcount; + cost = 3.f * (wscale * t->ci->gcount) * t->cj->gcount; else - cost = 2 * wscale * t->ci->gcount * t->cj->gcount; + cost = 2.f * (wscale * t->ci->gcount) * t->cj->gcount; } else { if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) - cost = - 3 * wscale * t->ci->count * t->cj->count * sid_scale[t->flags]; + cost = 3.f * (wscale * t->ci->count) * t->cj->count * + sid_scale[t->flags]; else - cost = - 2 * wscale * t->ci->count * t->cj->count * sid_scale[t->flags]; + cost = 2.f * (wscale * t->ci->count) * t->cj->count * + sid_scale[t->flags]; } break; case task_type_sub_pair: if (t->ci->nodeID != nodeID || t->cj->nodeID != nodeID) { if (t->flags < 0) - cost = 3 * wscale * t->ci->count * t->cj->count; + cost = 3.f * (wscale * t->ci->count) * t->cj->count; else - cost = - 3 * wscale * t->ci->count * t->cj->count * sid_scale[t->flags]; + cost = 3.f * (wscale * t->ci->count) * t->cj->count * + sid_scale[t->flags]; } else { if (t->flags < 0) - cost = 2 * wscale * t->ci->count * t->cj->count; + cost = 2.f * (wscale * t->ci->count) * t->cj->count; else - cost = - 2 * wscale * t->ci->count * t->cj->count * sid_scale[t->flags]; + cost = 2.f * (wscale * t->ci->count) * t->cj->count * + sid_scale[t->flags]; } break; case task_type_sub_self: - cost = 1 * wscale * t->ci->count * t->ci->count; + cost = 1.f * (wscale * t->ci->count) * t->ci->count; break; case task_type_ghost: if (t->ci == t->ci->super_hydro) cost = wscale * t->ci->count; @@ -1274,10 +1274,16 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = wscale * t->ci->count + wscale * t->ci->gcount; break; case task_type_send: - cost = 10 * wscale * t->ci->count * t->ci->count; + if (t->ci->count < 1e5) + cost = 10.f * (wscale * t->ci->count) * t->ci->count; + else + cost = 2e9; break; case task_type_recv: - cost = 5 * wscale * t->ci->count * t->ci->count; + if (t->ci->count < 1e5) + cost = 5.f * (wscale * t->ci->count) * t->ci->count; + else + cost = 1e9; break; default: cost = 0; diff --git a/src/space.c b/src/space.c index 02f285edb7b3069022b5e1e869e975abd3e896af..68d24ae689d1b0c36541b6ec96545276b2191363 100644 --- a/src/space.c +++ b/src/space.c @@ -60,8 +60,9 @@ /* Split size. */ int space_splitsize = space_splitsize_default; -int space_subsize_pair = space_subsize_pair_default; -int space_subsize_self = space_subsize_self_default; +int space_subsize_pair_hydro = space_subsize_pair_hydro_default; +int space_subsize_self_hydro = space_subsize_self_hydro_default; +int space_subsize_pair_grav = space_subsize_pair_grav_default; int space_subsize_self_grav = space_subsize_self_grav_default; int space_maxsize = space_maxsize_default; #ifdef SWIFT_DEBUG_CHECKS @@ -1081,7 +1082,11 @@ void space_parts_get_cell_index_mapper(void *map_data, int nr_parts, ind[k] = index; #ifdef SWIFT_DEBUG_CHECKS - if (pos_x > dim_x || pos_y > dim_y || pos_z > pos_z || pos_x < 0. || + if (index < 0 || index >= cdim[0] * cdim[1] * cdim[2]) + error("Invalid index=%d cdim=[%d %d %d] p->x=[%e %e %e]", index, cdim[0], + cdim[1], cdim[2], pos_x, pos_y, pos_z); + + if (pos_x >= dim_x || pos_y >= dim_y || pos_z >= dim_z || pos_x < 0. || pos_y < 0. || pos_z < 0.) error("Particle outside of simulation box. p->x=[%e %e %e]", pos_x, pos_y, pos_z); @@ -1138,6 +1143,17 @@ void space_gparts_get_cell_index_mapper(void *map_data, int nr_gparts, cell_getid(cdim, pos_x * ih_x, pos_y * ih_y, pos_z * ih_z); ind[k] = index; +#ifdef SWIFT_DEBUG_CHECKS + if (index < 0 || index >= cdim[0] * cdim[1] * cdim[2]) + error("Invalid index=%d cdim=[%d %d %d] p->x=[%e %e %e]", index, cdim[0], + cdim[1], cdim[2], pos_x, pos_y, pos_z); + + if (pos_x >= dim_x || pos_y >= dim_y || pos_z >= dim_z || pos_x < 0. || + pos_y < 0. || pos_z < 0.) + error("Particle outside of simulation box. p->x=[%e %e %e]", pos_x, pos_y, + pos_z); +#endif + /* Update the position */ gp->x[0] = pos_x; gp->x[1] = pos_y; @@ -1189,6 +1205,17 @@ void space_sparts_get_cell_index_mapper(void *map_data, int nr_sparts, cell_getid(cdim, pos_x * ih_x, pos_y * ih_y, pos_z * ih_z); ind[k] = index; +#ifdef SWIFT_DEBUG_CHECKS + if (index < 0 || index >= cdim[0] * cdim[1] * cdim[2]) + error("Invalid index=%d cdim=[%d %d %d] p->x=[%e %e %e]", index, cdim[0], + cdim[1], cdim[2], pos_x, pos_y, pos_z); + + if (pos_x >= dim_x || pos_y >= dim_y || pos_z >= dim_z || pos_x < 0. || + pos_y < 0. || pos_z < 0.) + error("Particle outside of simulation box. p->x=[%e %e %e]", pos_x, pos_y, + pos_z); +#endif + /* Update the position */ sp->x[0] = pos_x; sp->x[1] = pos_y; @@ -2612,8 +2639,8 @@ void space_first_init_parts(struct space *s) { hydro_first_init_part(&p[i], &xp[i]); #ifdef SWIFT_DEBUG_CHECKS - p->ti_drift = 0; - p->ti_kick = 0; + p[i].ti_drift = 0; + p[i].ti_kick = 0; #endif } } @@ -2861,21 +2888,29 @@ void space_init(struct space *s, const struct swift_params *params, /* Get the constants for the scheduler */ space_maxsize = parser_get_opt_param_int(params, "Scheduler:cell_max_size", space_maxsize_default); - space_subsize_pair = parser_get_opt_param_int( - params, "Scheduler:cell_sub_size_pair", space_subsize_pair_default); - space_subsize_self = parser_get_opt_param_int( - params, "Scheduler:cell_sub_size_self", space_subsize_self_default); + space_subsize_pair_hydro = + parser_get_opt_param_int(params, "Scheduler:cell_sub_size_pair_hydro", + space_subsize_pair_hydro_default); + space_subsize_self_hydro = + parser_get_opt_param_int(params, "Scheduler:cell_sub_size_self_hydro", + space_subsize_self_hydro_default); + space_subsize_pair_grav = + parser_get_opt_param_int(params, "Scheduler:cell_sub_size_pair_grav", + space_subsize_pair_grav_default); space_subsize_self_grav = parser_get_opt_param_int(params, "Scheduler:cell_sub_size_self_grav", space_subsize_self_grav_default); space_splitsize = parser_get_opt_param_int( params, "Scheduler:cell_split_size", space_splitsize_default); - if (verbose) - message( - "max_size set to %d, sub_size_pair set to %d, sub_size_self set to %d, " - "split_size set to %d", - space_maxsize, space_subsize_pair, space_subsize_self, space_splitsize); + if (verbose) { + message("max_size set to %d split_size set to %d", space_maxsize, + space_splitsize); + message("sub_size_pair_hydro set to %d, sub_size_self_hydro set to %d", + space_subsize_pair_hydro, space_subsize_self_hydro); + message("sub_size_pair_grav set to %d, sub_size_self_grav set to %d", + space_subsize_pair_grav, space_subsize_self_grav); + } /* Apply h scaling */ const double scaling = diff --git a/src/space.h b/src/space.h index c49dc37dc0df27cd3647044f219e36c299dcd73b..2d2a8e8ff43b5bfb586beada0a40d0af4d4e15fb 100644 --- a/src/space.h +++ b/src/space.h @@ -43,8 +43,9 @@ struct cell; #define space_cellallocchunk 1000 #define space_splitsize_default 400 #define space_maxsize_default 8000000 -#define space_subsize_pair_default 256000000 -#define space_subsize_self_default 32000 +#define space_subsize_pair_hydro_default 256000000 +#define space_subsize_self_hydro_default 32000 +#define space_subsize_pair_grav_default 256000000 #define space_subsize_self_grav_default 32000 #define space_max_top_level_cells_default 12 #define space_stretch 1.10f @@ -56,8 +57,9 @@ struct cell; /* Split size. */ extern int space_splitsize; extern int space_maxsize; -extern int space_subsize_pair; -extern int space_subsize_self; +extern int space_subsize_pair_hydro; +extern int space_subsize_self_hydro; +extern int space_subsize_pair_grav; extern int space_subsize_self_grav; /** diff --git a/src/timeline.h b/src/timeline.h index 352056cf340e7bbbce336e03e67a060f172155b1..0f38ff3e9421c122b61caf74290c170b62b2dd92 100644 --- a/src/timeline.h +++ b/src/timeline.h @@ -32,7 +32,7 @@ typedef long long integertime_t; typedef char timebin_t; /*! The number of time bins */ -#define num_time_bins 26 +#define num_time_bins 56 /*! The maximal number of timesteps in a simulation */ #define max_nr_timesteps (1LL << (num_time_bins + 1)) diff --git a/tests/test27cells.c b/tests/test27cells.c index c66a612d7325522213075d2d5ecd7e07ead6a17b..2d9cce7028026514e78da0ae09984ab39d7dce38 100644 --- a/tests/test27cells.c +++ b/tests/test27cells.c @@ -33,19 +33,24 @@ #if defined(WITH_VECTORIZATION) #define DOSELF1 runner_doself1_density_vec #define DOSELF1_SUBSET runner_doself_subset_density_vec +#define DOPAIR1_SUBSET runner_dopair_subset_branch_density #define DOPAIR1 runner_dopair1_branch_density -#ifdef DOSELF_SUBSET +#ifdef TEST_DOSELF_SUBSET #define DOSELF1_NAME "runner_doself_subset_density_vec" #else #define DOSELF1_NAME "runner_doself_density_vec" #endif -#define DOPAIR1_NAME "runner_dopair1_density_vec" +#ifdef TEST_DOPAIR_SUBSET +#define DOPAIR1_NAME "runner_dopair_subset_branch_density" +#else +#define DOPAIR1_NAME "runner_dopair_density_vec" +#endif #endif #ifndef DOSELF1 #define DOSELF1 runner_doself1_density #define DOSELF1_SUBSET runner_doself_subset_density -#ifdef DOSELF_SUBSET +#ifdef TEST_DOSELF_SUBSET #define DOSELF1_NAME "runner_doself1_subset_density" #else #define DOSELF1_NAME "runner_doself1_density" @@ -54,8 +59,13 @@ #ifndef DOPAIR1 #define DOPAIR1 runner_dopair1_branch_density +#define DOPAIR1_SUBSET runner_dopair_subset_branch_density +#ifdef TEST_DOPAIR_SUBSET +#define DOPAIR1_NAME "runner_dopair1_subset_branch_density" +#else #define DOPAIR1_NAME "runner_dopair1_density" #endif +#endif #define NODE_ID 1 @@ -316,10 +326,19 @@ void runner_dopair1_branch_density(struct runner *r, struct cell *ci, void runner_doself_subset_density(struct runner *r, struct cell *restrict ci, struct part *restrict parts, int *restrict ind, int count); +void runner_dopair_subset_density(struct runner *r, struct cell *restrict ci, + struct part *restrict parts_i, + int *restrict ind, int count, + struct cell *restrict cj); void runner_doself_subset_density_vec(struct runner *r, struct cell *restrict ci, struct part *restrict parts, int *restrict ind, int count); +void runner_dopair_subset_branch_density(struct runner *r, + struct cell *restrict ci, + struct part *restrict parts_i, + int *restrict ind, int count, + struct cell *restrict cj); /* And go... */ int main(int argc, char *argv[]) { @@ -480,18 +499,7 @@ int main(int argc, char *argv[]) { cache_init(&runner.cj_cache, 512); #endif - /* Run all the pairs */ - for (int j = 0; j < 27; ++j) { - if (cells[j] != main_cell) { - const ticks sub_tic = getticks(); - - DOPAIR1(&runner, main_cell, cells[j]); - - timings[j] += getticks() - sub_tic; - } - } - -#ifdef DOSELF_SUBSET +#if defined(TEST_DOSELF_SUBSET) || defined(TEST_DOPAIR_SUBSET) int *pid = NULL; int count = 0; if ((pid = malloc(sizeof(int) * main_cell->count)) == NULL) @@ -503,10 +511,26 @@ int main(int argc, char *argv[]) { } #endif + /* Run all the pairs */ + for (int j = 0; j < 27; ++j) { + if (cells[j] != main_cell) { + const ticks sub_tic = getticks(); + +#ifdef TEST_DOPAIR_SUBSET + DOPAIR1_SUBSET(&runner, main_cell, main_cell->parts, pid, count, + cells[j]); +#else + DOPAIR1(&runner, main_cell, cells[j]); +#endif + + timings[j] += getticks() - sub_tic; + } + } + /* And now the self-interaction */ const ticks self_tic = getticks(); -#ifdef DOSELF_SUBSET +#ifdef TEST_DOSELF_SUBSET DOSELF1_SUBSET(&runner, main_cell, main_cell->parts, pid, count); #else DOSELF1(&runner, main_cell);