/*******************************************************************************
 * This file is part of SWIFT.
 * Copyright (c) 2016 Matthieu Schaller (schaller@strw.leidenuniv.nl)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/

/* Config parameters. */
#include <config.h>

/* MPI headers. */
#ifdef WITH_MPI
#include <mpi.h>
#endif

/* This object's header. */
#include "mesh_gravity_mpi.h"

/* Local includes. */
#include "active.h"
#include "debug.h"
#include "engine.h"
#include "error.h"
#include "exchange_structs.h"
#include "lock.h"
#include "mesh_gravity_patch.h"
#include "mesh_gravity_sort.h"
#include "neutrino.h"
#include "part.h"
#include "periodic.h"
#include "row_major_id.h"
#include "space.h"
#include "threadpool.h"

/**
 * @brief Accumulate contributions from a cell to the density field.
 *
 * Initialises a temporary mesh patch which covers the top-level cell and
 * accumulates the mass contributions of the cell's particles to this patch.
 *
 * @param N The size of the mesh
 * @param fac Inverse of the cell size
 * @param dim The dimensions of the simulation box.
 * @param cell The #cell containing the particles.
 * @param patch The local mesh patch
 * @param nu_model Struct with neutrino constants
 */
void accumulate_cell_to_local_patch(const int N, const double fac,
                                    const double *dim, const struct cell *cell,
                                    struct pm_mesh_patch *patch,
                                    const struct neutrino_model *nu_model) {

  /* If the cell is empty, then there's nothing to do (and the code
     to find the extent of the cell would fail) */
  if (cell->grav.count == 0) return;

  /* Initialise the local mesh patch */
  pm_mesh_patch_init(patch, cell, N, fac, dim, /*boundary_size=*/1);
  pm_mesh_patch_zero(patch);

  /* Loop over particles in this cell */
  for (int ipart = 0; ipart < cell->grav.count; ipart++) {

    const struct gpart *gp = &(cell->grav.parts[ipart]);

    /* Box wrap the particle's position to the copy nearest the cell centre */
    const double pos_x =
        box_wrap(gp->x[0], patch->wrap_min[0], patch->wrap_max[0]);
    const double pos_y =
        box_wrap(gp->x[1], patch->wrap_min[1], patch->wrap_max[1]);
    const double pos_z =
        box_wrap(gp->x[2], patch->wrap_min[2], patch->wrap_max[2]);

    /* Work out the CIC coefficients */
    int i = (int)floor(fac * pos_x);
    const double dx = fac * pos_x - i;
    const double tx = 1. - dx;

    int j = (int)floor(fac * pos_y);
    const double dy = fac * pos_y - j;
    const double ty = 1. - dy;

    int k = (int)floor(fac * pos_z);
    const double dz = fac * pos_z - k;
    const double tz = 1. - dz;
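
    /* What follows deposits the particle onto the patch: the CIC helper is
     * assumed to spread the value over the 2x2x2 neighbouring cells with
     * the standard cloud-in-cell weights tx*ty*tz, dx*ty*tz, tx*dy*tz, ...,
     * dx*dy*dz. */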
    /* Get coordinates within the mesh patch */
    const int ii = i - patch->mesh_min[0];
    const int jj = j - patch->mesh_min[1];
    const int kk = k - patch->mesh_min[2];

    /* Compute weight (for neutrino delta-f weighting) */
    double weight = 1.0;
    if (gp->type == swift_type_neutrino)
      gpart_neutrino_weight_mesh_only(gp, nu_model, &weight);

    /* Accumulate contributions to the local mesh patch */
    const double mass = gp->mass;
    const double value = mass * weight;
    pm_mesh_patch_CIC_set(patch, ii, jj, kk, tx, ty, tz, dx, dy, dz, value);
  }
}

/**
 * @brief Shared information about the mesh to be used by all the threads in
 * the pool.
 */
struct accumulate_mapper_data {
  const struct cell *cells;
  const int *local_cells;
  struct pm_mesh_patch *local_patches;
  int N;
  double fac;
  double dim[3];
  struct neutrino_model *nu_model;
};

/**
 * @brief Threadpool mapper function for the mesh CIC assignment of a cell.
 *
 * @param map_data A chunk of the list of local cells.
 * @param num The number of cells in the chunk.
 * @param extra The information about the mesh and cells.
 */
void accumulate_cell_to_local_patches_mapper(void *map_data, int num,
                                             void *extra) {

  /* Unpack the shared information */
  const struct accumulate_mapper_data *data =
      (struct accumulate_mapper_data *)extra;
  const struct cell *cells = data->cells;
  const int N = data->N;
  const double fac = data->fac;
  const double dim[3] = {data->dim[0], data->dim[1], data->dim[2]};
  const struct neutrino_model *nu_model = data->nu_model;

  /* Pointer to the chunk to be processed */
  int *local_cells = (int *)map_data;

  /* Start at the same position in the list of local patches: patch i in
     this chunk corresponds to cell local_cells[i] */
  const size_t offset = local_cells - data->local_cells;
  struct pm_mesh_patch *local_patches = data->local_patches + offset;

  /* Loop over the elements assigned to this thread */
  for (int i = 0; i < num; ++i) {

    /* Pointer to local cell */
    const struct cell *c = &cells[local_cells[i]];

    /* Skip empty cells */
    if (c->grav.count == 0) continue;

    /* Assign this cell's content to the mesh */
    accumulate_cell_to_local_patch(N, fac, dim, c, &local_patches[i],
                                   nu_model);
  }
}

/**
 * @brief Accumulate local contributions to the density field.
 *
 * Fill the array of local patches with the data corresponding
 * to the local top-level cells.
 * The patches are stored in the order of the space->local_cells_top list.
 *
 * @param N The size of the mesh
 * @param fac Inverse of the cell size
 * @param s The #space containing the particles.
 * @param local_patches The array of *local* mesh patches.
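 * @param tp The #threadpool object.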
 */
void mpi_mesh_accumulate_gparts_to_local_patches(
    struct threadpool *tp, const int N, const double fac,
    const struct space *s, struct pm_mesh_patch *local_patches) {

#if defined(WITH_MPI) && defined(HAVE_MPI_FFTW)

  const int *local_cells = s->local_cells_top;
  const int nr_local_cells = s->nr_local_cells;
  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};

  /* Gather some neutrino constants if using delta-f weighting on the mesh */
  struct neutrino_model nu_model;
  bzero(&nu_model, sizeof(struct neutrino_model));
  if (s->e->neutrino_properties->use_delta_f_mesh_only)
    gather_neutrino_consts(s, &nu_model);

  /* Use the threadpool to parallelize over cells */
  struct accumulate_mapper_data data;
  data.cells = s->cells_top;
  data.local_cells = local_cells;
  data.local_patches = local_patches;
  data.N = N;
  data.fac = fac;
  data.dim[0] = dim[0];
  data.dim[1] = dim[1];
  data.dim[2] = dim[2];
  data.nu_model = &nu_model;
  threadpool_map(tp, accumulate_cell_to_local_patches_mapper,
                 (void *)local_cells, nr_local_cells, sizeof(int),
                 threadpool_auto_chunk_size, (void *)&data);

#else
  error("FFTW MPI not found - unable to use distributed mesh");
#endif
}

/**
 * @brief Flatten the mesh patches into an array of (key, value) pairs.
 *
 * The key of each entry is the cell's index in the padded global mesh;
 * the value is the density accumulated in the patch. Entries are written
 * in patch order, i.e. not yet sorted by key.
 *
 * @param local_patches The array of local mesh patches.
 * @param nr_patches The number of local patches.
 * @param array The output array of (key, value) pairs.
 * @param size The expected number of entries (used as a sanity check).
 */
void mesh_patches_to_sorted_array(const struct pm_mesh_patch *local_patches,
                                  const int nr_patches,
                                  struct mesh_key_value_rho *array,
                                  const size_t size) {

  size_t count = 0;
  for (int p = 0; p < nr_patches; ++p) {
    const struct pm_mesh_patch *patch = &local_patches[p];

    /* Loop over all cells in the patch */
    for (int i = 0; i < patch->mesh_size[0]; i++) {
      for (int j = 0; j < patch->mesh_size[1]; j++) {
        for (int k = 0; k < patch->mesh_size[2]; k++) {

          /* Find array index in the mesh patch */
          const int local_index = pm_mesh_patch_index(patch, i, j, k);

          /* Find index in the full mesh */
          const size_t global_index = row_major_id_periodic_size_t_padded(
              i + patch->mesh_min[0], j + patch->mesh_min[1],
              k + patch->mesh_min[2], patch->N);

          /* Get the value */
          const double value = patch->mesh[local_index];

          /* Store everything in the flattened array using
           * the global index as a key */
          array[count].value = value;
          array[count].key = global_index;
          ++count;
        }
      }
    }
  }

  /* Quick check... */
  if (count != size) error("Error flattening the mesh patches!");
}

/**
 * @brief Convert the array of local patches to a slab-distributed 3D mesh.
 *
 * For FFTW each rank needs to hold a slice of the full mesh.
 * This routine does the necessary communication to convert
 * the per-rank local patches into a slab-distributed mesh.
 *
 * This function will clean the memory allocated by each of the entries
 * in the local_patches array.
 *
 * @param N The size of the mesh
 * @param local_n0 The thickness of the slice to store on this rank
 * @param local_patches The array of local patches.
 * @param nr_patches The number of local patches.
 * @param mesh Pointer to the output data buffer.
 * @param tp The #threadpool object.
 * @param verbose Are we talkative?
 */
void mpi_mesh_local_patches_to_slices(const int N, const int local_n0,
                                      struct pm_mesh_patch *local_patches,
                                      const int nr_patches, double *mesh,
                                      struct threadpool *tp,
                                      const int verbose) {

#if defined(WITH_MPI) && defined(HAVE_MPI_FFTW)

  /* Determine rank, number of ranks */
  int nr_nodes, nodeID;
  MPI_Comm_size(MPI_COMM_WORLD, &nr_nodes);
  MPI_Comm_rank(MPI_COMM_WORLD, &nodeID);

  ticks tic = getticks();

  /* Count the total number of mesh cells we have.
   *
   * Note: There might be duplicates. We don't care at this point.
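   * Neighbouring patches overlap at their boundaries, so the same global
   * mesh cell can appear more than once; duplicates are simply summed
   * when they are deposited onto the receiving slab below.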
   */
  size_t count = 0;
  for (int i = 0; i < nr_patches; ++i) {
    const struct pm_mesh_patch *p = &local_patches[i];
    count += p->mesh_size[0] * p->mesh_size[1] * p->mesh_size[2];
  }

  /* Create an array to contain all the individual mesh cells we have
   * on this node. For now, this is in random order */
  struct mesh_key_value_rho *mesh_sendbuf_unsorted;
  if (swift_memalign("mesh_sendbuf_unsorted", (void **)&mesh_sendbuf_unsorted,
                     SWIFT_CACHE_ALIGNMENT,
                     count * sizeof(struct mesh_key_value_rho)) != 0)
    error("Failed to allocate array for unsorted mesh send buffer!");

  /* Make an array with the (key, value) pairs from the mesh patches.
   *
   * We're going to distribute them between ranks according to their
   * x coordinate, so we later need to put them in order of destination
   * rank. */
  mesh_patches_to_sorted_array(local_patches, nr_patches,
                               mesh_sendbuf_unsorted, count);

  if (verbose)
    message(" - Converting mesh patches to array took %.3f %s.",
            clocks_from_ticks(getticks() - tic), clocks_getunit());

  /* Clean the local patches array */
  for (int i = 0; i < nr_patches; ++i) pm_mesh_patch_clean(&local_patches[i]);

  tic = getticks();

  /* Now, create space for a sorted version of the array of mesh cells */
  struct mesh_key_value_rho *mesh_sendbuf;
  if (swift_memalign("mesh_sendbuf", (void **)&mesh_sendbuf,
                     SWIFT_CACHE_ALIGNMENT,
                     count * sizeof(struct mesh_key_value_rho)) != 0)
    error("Failed to allocate array for sorted mesh send buffer!");

  size_t *sorted_offsets = (size_t *)malloc(N * sizeof(size_t));

  /* Do a bucket sort of the mesh elements to have them sorted
   * by global x-coordinate (note we don't care about y,z at this stage).
   * Also recover the offsets where we switch from one bin to the next. */
  bucket_sort_mesh_key_value_rho(mesh_sendbuf_unsorted, count, N, tp,
                                 mesh_sendbuf, sorted_offsets);

  /* Let's free the unsorted array to keep things lean */
  swift_free("mesh_sendbuf_unsorted", mesh_sendbuf_unsorted);
  mesh_sendbuf_unsorted = NULL;

  if (verbose)
    message(" - Sorting of mesh cells took %.3f %s.",
            clocks_from_ticks(getticks() - tic), clocks_getunit());

  tic = getticks();

  /* Get width of the slice on each rank */
  int *slice_width = (int *)malloc(sizeof(int) * nr_nodes);
  MPI_Allgather(&local_n0, 1, MPI_INT, slice_width, 1, MPI_INT,
                MPI_COMM_WORLD);

  /* Determine offset to the slice on each rank */
  int *slice_offset = (int *)malloc(sizeof(int) * nr_nodes);
  slice_offset[0] = 0;
  for (int i = 1; i < nr_nodes; i++) {
    slice_offset[i] = slice_offset[i - 1] + slice_width[i - 1];
  }

  /* Compute how many elements are to be sent to each rank */
  size_t *nr_send = (size_t *)calloc(nr_nodes, sizeof(size_t));
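
  /* Example of the bucket walk below (an illustrative sketch, not tied to
   * any particular run): with N = 8 and three ranks holding slabs of
   * widths {3, 3, 2}, the slice offsets are {0, 3, 6}. Buckets x = 0..2
   * are credited to rank 0, x = 3..5 to rank 1 and x = 6..7 to rank 2;
   * each bucket contributes (next bucket's offset - its own offset)
   * elements, the last one (count - its own offset). */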
  /* Loop over the offsets */
  int dest_node = 0;
  for (int i = 0; i < N; ++i) {

    /* Find the first mesh cell in that bucket */
    const size_t j = sorted_offsets[i];

    /* Get the x coordinate of this mesh cell in the global mesh */
    const int mesh_x =
        get_xcoord_from_padded_row_major_id((size_t)mesh_sendbuf[j].key, N);

    /* Advance to the destination node that is to contain this x coordinate */
    while ((mesh_x >= slice_offset[dest_node] + slice_width[dest_node]) ||
           (slice_width[dest_node] == 0)) {
      dest_node++;
    }

    /* Add all the mesh cells in this bucket */
    if (i < N - 1)
      nr_send[dest_node] += sorted_offsets[i + 1] - sorted_offsets[i];
    else
      nr_send[dest_node] += count - sorted_offsets[i];
  }

#ifdef SWIFT_DEBUG_CHECKS
  size_t *nr_send_check = (size_t *)calloc(nr_nodes, sizeof(size_t));

  /* Brute-force list without using the offsets */
  int dest_node_check = 0;
  for (size_t i = 0; i < count; i++) {

    /* Get the x coordinate of this mesh cell in the global mesh */
    const int mesh_x =
        get_xcoord_from_padded_row_major_id((size_t)mesh_sendbuf[i].key, N);

    /* Advance to the destination node that is to contain this x coordinate */
    while ((mesh_x >=
            slice_offset[dest_node_check] + slice_width[dest_node_check]) ||
           (slice_width[dest_node_check] == 0)) {
      dest_node_check++;
    }
    nr_send_check[dest_node_check]++;
  }

  /* Verify the "smart" list is as good as the brute-force one */
  for (int i = 0; i < nr_nodes; ++i) {
    if (nr_send[i] != nr_send_check[i]) error("Invalid send list!");
  }
  free(nr_send_check);
#endif

  /* We don't need the sorted offsets any more from here onwards */
  free(sorted_offsets);

  /* Determine how many requests we'll receive from each MPI rank */
  size_t *nr_recv = (size_t *)malloc(sizeof(size_t) * nr_nodes);
  MPI_Alltoall(nr_send, sizeof(size_t), MPI_BYTE, nr_recv, sizeof(size_t),
               MPI_BYTE, MPI_COMM_WORLD);
  size_t nr_recv_tot = 0;
  for (int i = 0; i < nr_nodes; i++) {
    nr_recv_tot += nr_recv[i];
  }

  /* Allocate the receive buffer */
  struct mesh_key_value_rho *mesh_recvbuf;
  if (swift_memalign("mesh_recvbuf", (void **)&mesh_recvbuf,
                     SWIFT_CACHE_ALIGNMENT,
                     nr_recv_tot * sizeof(struct mesh_key_value_rho)) != 0)
    error("Failed to allocate receive buffer for constructing MPI FFT mesh");

  if (verbose)
    message(" - Preparing comms buffers took %.3f %s.",
            clocks_from_ticks(getticks() - tic), clocks_getunit());

  tic = getticks();

  /* Carry out the communication */
  exchange_structs(nr_send, (char *)mesh_sendbuf, nr_recv,
                   (char *)mesh_recvbuf, sizeof(struct mesh_key_value_rho));

  if (verbose)
    message(" - MPI exchange took %.3f %s.",
            clocks_from_ticks(getticks() - tic), clocks_getunit());

  tic = getticks();

  /* Copy received data to the output buffer.
   * This is now a local slice of the global mesh. */
  for (size_t i = 0; i < nr_recv_tot; i++) {

#ifdef SWIFT_DEBUG_CHECKS
    /* Verify that we indeed got a cell that should be in the local mesh
     * slice */
    const int xcoord =
        get_xcoord_from_padded_row_major_id(mesh_recvbuf[i].key, N);
    if (xcoord < slice_offset[nodeID])
      error("Received mesh cell is not in the local slice (xcoord too small)");
    if (xcoord >= slice_offset[nodeID] + slice_width[nodeID])
      error("Received mesh cell is not in the local slice (xcoord too large)");
#endif

    /* What cell are we looking at? */
    const size_t local_index = get_index_in_local_slice(
        (size_t)mesh_recvbuf[i].key, N, slice_offset[nodeID]);

    /* Add to the cell */
    mesh[local_index] += mesh_recvbuf[i].value;
  }

  if (verbose)
    message(" - Filling of the density values took %.3f %s.",
            clocks_from_ticks(getticks() - tic), clocks_getunit());

  /* Tidy up */
  free(slice_width);
  free(slice_offset);
  free(nr_send);
  free(nr_recv);
  swift_free("mesh_recvbuf", mesh_recvbuf);
  swift_free("mesh_sendbuf", mesh_sendbuf);

#else
  error("FFTW MPI not found - unable to use distributed mesh");
#endif
}

/**
 * @brief Count the number of mesh cells we will need to request from other
 * nodes.
 *
 * @param N the mesh size.
 * @param fac Inverse of the FFT mesh cell size
 * @param s The #space containing the particles.
 *
 * @return The number of mesh cells to request.
 */
size_t count_required_mesh_cells(const int N, const double fac,
                                 const struct space *s) {

  const int *local_cells = s->local_cells_top;
  const int nr_local_cells = s->nr_local_cells;

  size_t count = 0;

  /* Loop over our local top level cells */
  for (int icell = 0; icell < nr_local_cells; icell++) {
    struct cell *cell = &(s->cells_top[local_cells[icell]]);

    /* Skip empty cells */
    if (cell->grav.count == 0) continue;
    /* Determine the range of FFT mesh cells we need for particles in this
     * top-level cell. The 5-point stencil used for the accelerations
     * requires 2 neighbouring FFT mesh cells in each direction, and for
     * CIC evaluation of the accelerations we need one extra FFT mesh cell
     * in the +ve direction.
     *
     * We also have to add a small buffer to avoid problems with rounding.
     *
     * TODO: can we calculate exactly how big the rounding error can be?
     * Will just assume that 1% of a mesh cell is enough for now. */
    int ixmin[3];
    int ixmax[3];
    for (int idim = 0; idim < 3; idim++) {
      const double xmin = cell->loc[idim] - 2.01 / fac;
      const double xmax = cell->loc[idim] + cell->width[idim] + 3.01 / fac;
      ixmin[idim] = (int)floor(xmin * fac);
      ixmax[idim] = (int)floor(xmax * fac);
    }

    const int delta_i = (ixmax[0] - ixmin[0]) + 1;
    const int delta_j = (ixmax[1] - ixmin[1]) + 1;
    const int delta_k = (ixmax[2] - ixmin[2]) + 1;

#ifdef SWIFT_DEBUG_CHECKS
    if (delta_i > (1 << 12))
      error("Not enough bits to store local x-axis index");
    if (delta_j > (1 << 12))
      error("Not enough bits to store local y-axis index");
    if (delta_k > (1 << 12))
      error("Not enough bits to store local z-axis index");
#endif

    count += delta_i * delta_j * delta_k;
  }
  return count;
}

/**
 * @brief Initialise the array of mesh cells to request from other nodes.
 *
 * For each local top-level cell, lists the (key, cell_index, value) tuples
 * of the FFT mesh cells needed to evaluate the potential and accelerations
 * for its particles. The layout mirrors count_required_mesh_cells().
 *
 * @param N The mesh size.
 * @param fac Inverse of the FFT mesh cell size
 * @param s The #space containing the particles.
 * @param send_cells The array of mesh cells to initialise.
 *
 * @return The number of entries written (matches
 * count_required_mesh_cells()).
 */
size_t init_required_mesh_cells(const int N, const double fac,
                                const struct space *s,
                                struct mesh_key_value_pot *send_cells) {

  const int *local_cells = s->local_cells_top;
  const int nr_local_cells = s->nr_local_cells;

  size_t count = 0;

  /* Loop over our local top level cells */
  for (int icell = 0; icell < nr_local_cells; icell++) {
    struct cell *cell = &(s->cells_top[local_cells[icell]]);

    /* Skip empty cells */
    if (cell->grav.count == 0) continue;

    /* Determine the range of FFT mesh cells we need for particles in this
     * top-level cell. The 5-point stencil used for the accelerations
     * requires 2 neighbouring FFT mesh cells in each direction, and for
     * CIC evaluation of the accelerations we need one extra FFT mesh cell
     * in the +ve direction.
     *
     * We also have to add a small buffer to avoid problems with rounding.
     *
     * TODO: can we calculate exactly how big the rounding error can be?
     * Will just assume that 1% of a mesh cell is enough for now.
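     *
     * Hence the fixed paddings below: along each axis,
     * ixmin = floor((loc - 2.01 / fac) * fac) = floor(loc * fac - 2.01),
     * i.e. two whole cells plus the 1% rounding buffer below the lower
     * edge, and similarly 3.01 cells above the upper edge.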
     */
    int ixmin[3];
    int ixmax[3];
    for (int idim = 0; idim < 3; idim++) {
      const double xmin = cell->loc[idim] - 2.01 / fac;
      const double xmax = cell->loc[idim] + cell->width[idim] + 3.01 / fac;
      ixmin[idim] = (int)floor(xmin * fac);
      ixmax[idim] = (int)floor(xmax * fac);
    }

#ifdef SWIFT_DEBUG_CHECKS
    const int delta_i = (ixmax[0] - ixmin[0]) + 1;
    const int delta_j = (ixmax[1] - ixmin[1]) + 1;
    const int delta_k = (ixmax[2] - ixmin[2]) + 1;
    if (delta_i > (1 << 12))
      error("Not enough bits to store local x-axis index");
    if (delta_j > (1 << 12))
      error("Not enough bits to store local y-axis index");
    if (delta_k > (1 << 12))
      error("Not enough bits to store local z-axis index");
#endif

    /* Add the required cells to the map */
    for (int i = ixmin[0]; i <= ixmax[0]; i++) {
      for (int j = ixmin[1]; j <= ixmax[1]; j++) {
        for (int k = ixmin[2]; k <= ixmax[2]; k++) {

          const size_t index = row_major_id_periodic_size_t_padded(i, j, k, N);

          /* Indices relative to the patch */
          const int ii = i - ixmin[0];
          const int jj = j - ixmin[1];
          const int kk = k - ixmin[2];

          /* Generate a combined index */
          const size_t cell_index =
              cell_index_from_patch_index(icell, ii, jj, kk);

          send_cells[count].cell_index = cell_index;
          send_cells[count].value = 0.;
          send_cells[count].key = index;

#ifdef SWIFT_DEBUG_CHECKS
          int test_ci, test_ii, test_jj, test_kk;
          patch_index_from_cell_index(cell_index, &test_ci, &test_ii,
                                      &test_jj, &test_kk);
          if (icell != test_ci) error("Invalid cell_index!");
          if (ii != test_ii) error("Invalid ii!");
          if (jj != test_jj) error("Invalid jj!");
          if (kk != test_kk) error("Invalid kk!");
#endif

          ++count;
        }
      }
    }
  }
  return count;
}
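
/**
 * @brief Fill the local mesh patches with the potential values we received.
 *
 * The mesh_cells array must be sorted by top-level cell index so that the
 * entries for each patch form one contiguous block in patch layout order.
 *
 * @param N The mesh size.
 * @param fac Inverse of the FFT mesh cell size
 * @param s The #space containing the particles.
 * @param mesh_cells The sorted array of (key, cell_index, value) tuples.
 * @param local_patches The array of local mesh patches to fill.
 * @param nr_send_tot The total number of entries in mesh_cells.
 */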
void fill_local_patches_from_mesh_cells(
    const int N, const double fac, const struct space *s,
    const struct mesh_key_value_pot *mesh_cells,
    struct pm_mesh_patch *local_patches, const size_t nr_send_tot) {

  const int *local_cells = s->local_cells_top;
  const int nr_local_cells = s->nr_local_cells;
  const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]};

  size_t offset = 0;

  /* Loop over our local top level cells */
  for (int icell = 0; icell < nr_local_cells; icell++) {

    const struct cell *cell = &(s->cells_top[local_cells[icell]]);
    struct pm_mesh_patch *patch = &local_patches[icell];

    /* Skip empty cells */
    if (cell->grav.count == 0) continue;

    patch->N = N;
    patch->fac = fac;

    /* Will need to wrap particles to the position nearest the cell centre */
    for (int i = 0; i < 3; i++) {
      patch->wrap_min[i] = cell->loc[i] + 0.5 * cell->width[i] - 0.5 * dim[i];
      patch->wrap_max[i] = cell->loc[i] + 0.5 * cell->width[i] + 0.5 * dim[i];
    }

    int num_cells = 1;
    for (int i = 0; i < 3; i++) {
      const double xmin = cell->loc[i] - 2.01 / fac;
      const double xmax = cell->loc[i] + cell->width[i] + 3.01 / fac;
      patch->mesh_min[i] = (int)floor(xmin * fac);
      patch->mesh_max[i] = (int)floor(xmax * fac);
      patch->mesh_size[i] = patch->mesh_max[i] - patch->mesh_min[i] + 1;
      num_cells *= patch->mesh_size[i];
    }

    /* Allocate the mesh */
    if (swift_memalign("mesh_patch", (void **)&patch->mesh,
                       SWIFT_CACHE_ALIGNMENT,
                       num_cells * sizeof(double)) != 0)
      error("Failed to allocate array for mesh patch!");

#ifdef SWIFT_DEBUG_CHECKS
    int count = 0;
    for (size_t imesh = 0; imesh < nr_send_tot; ++imesh) {
      const size_t cell_index = mesh_cells[imesh].cell_index;
      const int temp = cell_index_extract_patch_index(cell_index);
      if (temp == icell) ++count;
    }
    if (count != num_cells)
      error(
          "Invalid number of cells to fill the patch! icell=%d count=%d "
          "num_cells=%d",
          icell, count, num_cells);
#endif

    /* Now, we can start filling the mesh patch cells from the array of
     * key-index-value tuples */
    for (size_t imesh = offset; imesh < offset + num_cells; ++imesh) {

      const size_t cell_index = mesh_cells[imesh].cell_index;
      const double pot = mesh_cells[imesh].value;

      /* Recover the patch index (should be icell) and the i,j,k indices */
      int patch_index, i, j, k;
      patch_index_from_cell_index(cell_index, &patch_index, &i, &j, &k);

#ifdef SWIFT_DEBUG_CHECKS
      const int temp = cell_index_extract_patch_index(cell_index);
      if (patch_index != icell)
        error(
            "mesh cell not sorted in the right order icell=%d patch_index=%d "
            "cell_index=%zd i=%d j=%d k=%d imesh=%zd temp=%d",
            icell, patch_index, cell_index, i, j, k, imesh, temp);
#endif

      /* Flatten out the i,j,k index */
      const int local_index =
          i * (patch->mesh_size[1] * patch->mesh_size[2]) +
          j * patch->mesh_size[2] + k;

      /* Store the potential */
      patch->mesh[local_index] = pot;
    }

    /* Move to the next cell */
    offset += num_cells;
  }
}

/**
 * @brief Retrieve the potential in the mesh cells we need to compute the
 * force on the particles on this MPI rank. The result is returned in the
 * supplied array of local patches.
 *
 * We need all cells containing points -2 and +3 mesh cell widths
 * away from each particle along each axis to compute the
 * potential gradient.
 *
 * @param N The size of the mesh
 * @param fac Inverse of the FFT mesh cell size
 * @param s The #space containing the particles.
 * @param local_0_start Offset to the first mesh x coordinate on this rank
 * @param local_n0 Width of the mesh slab on this rank
 * @param potential_slice Array with the potential on the local slice of the
 * mesh
 * @param tp The #threadpool object.
 * @param verbose Are we talkative?
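 * @param local_patches The array of local mesh patches to be filled (one
 * per local top-level cell).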
 */
void mpi_mesh_fetch_potential(const int N, const double fac,
                              const struct space *s, const int local_0_start,
                              const int local_n0, double *potential_slice,
                              struct pm_mesh_patch *local_patches,
                              struct threadpool *tp, const int verbose) {

#if defined(WITH_MPI) && defined(HAVE_MPI_FFTW)

  /* Determine rank, number of MPI ranks */
  int nr_nodes, nodeID;
  MPI_Comm_size(MPI_COMM_WORLD, &nr_nodes);
  MPI_Comm_rank(MPI_COMM_WORLD, &nodeID);

  ticks tic = getticks();

  /* Determine how many mesh cells we will need to request */
  const size_t nr_send_tot = count_required_mesh_cells(N, fac, s);

  if (verbose)
    message(" - Counting required mesh patches took %.3f %s.",
            clocks_from_ticks(getticks() - tic), clocks_getunit());

  struct mesh_key_value_pot *send_cells_unsorted;
  if (swift_memalign("send_cells_unsorted", (void **)&send_cells_unsorted,
                     SWIFT_CACHE_ALIGNMENT,
                     nr_send_tot * sizeof(struct mesh_key_value_pot)) != 0)
    error("Failed to allocate array for cells to request!");

  tic = getticks();

  /* Initialise the mesh cells we will request */
  const size_t check_count =
      init_required_mesh_cells(N, fac, s, send_cells_unsorted);
  if (nr_send_tot != check_count)
    error("Count and initialisation incompatible!");

  if (verbose)
    message(" - Init required mesh patches took %.3f %s.",
            clocks_from_ticks(getticks() - tic), clocks_getunit());

  tic = getticks();

  struct mesh_key_value_pot *send_cells;
  if (swift_memalign("send_cells", (void **)&send_cells,
                     SWIFT_CACHE_ALIGNMENT,
                     nr_send_tot * sizeof(struct mesh_key_value_pot)) != 0)
    error("Failed to allocate array for cells to request!");

  size_t *sorted_offsets = (size_t *)malloc(N * sizeof(size_t));

  /* Do a bucket sort of the mesh elements to have them sorted
   * by global x-coordinate (note we don't care about y,z at this stage) */
  bucket_sort_mesh_key_value_pot(send_cells_unsorted, nr_send_tot, N, tp,
                                 send_cells, sorted_offsets);

  swift_free("send_cells_unsorted", send_cells_unsorted);
  send_cells_unsorted = NULL;

  if (verbose)
    message(" - 1st mesh patches sort took %.3f %s.",
            clocks_from_ticks(getticks() - tic), clocks_getunit());

  tic = getticks();

  /* Get width of the mesh slice on each rank */
  int *slice_width = (int *)malloc(sizeof(int) * nr_nodes);
  MPI_Allgather(&local_n0, 1, MPI_INT, slice_width, 1, MPI_INT,
                MPI_COMM_WORLD);

  /* Determine first mesh x coordinate stored on each rank */
  int *slice_offset = (int *)malloc(sizeof(int) * nr_nodes);
  slice_offset[0] = 0;
  for (int i = 1; i < nr_nodes; i++) {
    slice_offset[i] = slice_offset[i - 1] + slice_width[i - 1];
  }

  /* Count how many mesh cells we need to request from each MPI rank,
   * walking the x-coordinate buckets as in
   * mpi_mesh_local_patches_to_slices() */
  size_t *nr_send = (size_t *)calloc(nr_nodes, sizeof(size_t));

  /* Loop over the offsets */
  int dest_node = 0;
  for (int i = 0; i < N; ++i) {

    /* Find the first mesh cell in that bucket */
    const size_t j = sorted_offsets[i];

    /* Get the x coordinate of this mesh cell in the global mesh */
    const int mesh_x =
        get_xcoord_from_padded_row_major_id((size_t)send_cells[j].key, N);

    /* Advance to the destination node that is to contain this x coordinate */
    while ((mesh_x >= slice_offset[dest_node] + slice_width[dest_node]) ||
           (slice_width[dest_node] == 0)) {
      dest_node++;
    }

    /* Add all the mesh cells in this bucket */
    if (i < N - 1)
      nr_send[dest_node] += sorted_offsets[i + 1] - sorted_offsets[i];
    else
      nr_send[dest_node] += nr_send_tot - sorted_offsets[i];
  }
#ifdef SWIFT_DEBUG_CHECKS
  size_t *nr_send_check = (size_t *)calloc(nr_nodes, sizeof(size_t));

  /* Brute-force list without using the offsets */
  int dest_node_check = 0;
  for (size_t i = 0; i < nr_send_tot; i++) {
    while (get_xcoord_from_padded_row_major_id(send_cells[i].key, N) >=
               (slice_offset[dest_node_check] +
                slice_width[dest_node_check]) ||
           slice_width[dest_node_check] == 0) {
      dest_node_check++;
    }
    if (dest_node_check >= nr_nodes || dest_node_check < 0)
      error("Destination node out of range");
    nr_send_check[dest_node_check]++;
  }

  /* Verify the "smart" list is as good as the brute-force one */
  for (int i = 0; i < nr_nodes; ++i) {
    if (nr_send[i] != nr_send_check[i]) error("Invalid send list!");
  }
  free(nr_send_check);
#endif

  /* We don't need the sorted offsets any more from here onwards */
  free(sorted_offsets);

  /* Determine how many requests we'll receive from each MPI rank */
  size_t *nr_recv = (size_t *)malloc(sizeof(size_t) * nr_nodes);
  MPI_Alltoall(nr_send, sizeof(size_t), MPI_BYTE, nr_recv, sizeof(size_t),
               MPI_BYTE, MPI_COMM_WORLD);
  size_t nr_recv_tot = 0;
  for (int i = 0; i < nr_nodes; i++) {
    nr_recv_tot += nr_recv[i];
  }

  if (verbose)
    message(" - Preparing comms buffers took %.3f %s.",
            clocks_from_ticks(getticks() - tic), clocks_getunit());

  tic = getticks();

  /* Allocate buffer to receive requests */
  struct mesh_key_value_pot *recv_cells;
  if (swift_memalign("recv_cells", (void **)&recv_cells,
                     SWIFT_CACHE_ALIGNMENT,
                     nr_recv_tot * sizeof(struct mesh_key_value_pot)) != 0)
    error("Failed to allocate array for mesh receive buffer!");

  /* Send requests for cells to other ranks */
  exchange_structs(nr_send, (char *)send_cells, nr_recv, (char *)recv_cells,
                   sizeof(struct mesh_key_value_pot));

  if (verbose)
    message(" - 1st exchange took %.3f %s.",
            clocks_from_ticks(getticks() - tic), clocks_getunit());

  tic = getticks();

  /* Look up the potential in the requested cells */
  for (size_t i = 0; i < nr_recv_tot; i++) {

#ifdef SWIFT_DEBUG_CHECKS
    const size_t cells_in_slab = ((size_t)N) * (2 * (N / 2 + 1));
    const size_t first_local_id = local_0_start * cells_in_slab;
    const size_t num_local_ids = local_n0 * cells_in_slab;
    if (recv_cells[i].key < first_local_id ||
        recv_cells[i].key >= first_local_id + num_local_ids) {
      error("Requested potential mesh cell ID is out of range");
    }
#endif

    const size_t local_id =
        get_index_in_local_slice(recv_cells[i].key, N, local_0_start);

#ifdef SWIFT_DEBUG_CHECKS
    const size_t Ns = N;
    if (local_id >= Ns * (2 * (Ns / 2 + 1)) * local_n0)
      error("Local potential mesh cell ID is out of range");
#endif

    recv_cells[i].value = potential_slice[local_id];
  }

  if (verbose)
    message(" - Filling of the potential values took %.3f %s.",
            clocks_from_ticks(getticks() - tic), clocks_getunit());

  tic = getticks();

  /* Return the results */
  exchange_structs(nr_recv, (char *)recv_cells, nr_send, (char *)send_cells,
                   sizeof(struct mesh_key_value_pot));

  if (verbose)
    message(" - 2nd exchange took %.3f %s.",
            clocks_from_ticks(getticks() - tic), clocks_getunit());

  tic = getticks();

  /* Tidy up */
  swift_free("recv_cells", recv_cells);
  free(slice_width);
  free(slice_offset);
  free(nr_send);
  free(nr_recv);

  struct mesh_key_value_pot *send_cells_sorted;
  if (swift_memalign("send_cells_sorted", (void **)&send_cells_sorted,
                     SWIFT_CACHE_ALIGNMENT,
                     nr_send_tot * sizeof(struct mesh_key_value_pot)) != 0)
    error("Failed to allocate array for cells to request!");
  /* Now sort the mesh cells by top-level cell index (i.e. by the patch
   * they belong to) */
  bucket_sort_mesh_key_value_pot_index(send_cells, nr_send_tot,
                                       s->nr_local_cells, tp,
                                       send_cells_sorted);

  swift_free("send_cells", send_cells);
  send_cells = NULL;

  if (verbose)
    message(" - 2nd mesh patches sort took %.3f %s.",
            clocks_from_ticks(getticks() - tic), clocks_getunit());

  tic = getticks();

  /* Initialise the local patches with the data we just received */
  fill_local_patches_from_mesh_cells(N, fac, s, send_cells_sorted,
                                     local_patches, nr_send_tot);

  if (verbose)
    message(" - Filling the local patches took %.3f %s.",
            clocks_from_ticks(getticks() - tic), clocks_getunit());

  swift_free("send_cells_sorted", send_cells_sorted);

#else
  error("FFTW MPI not found - unable to use distributed mesh");
#endif
}

/**
 * @brief Computes the potential on a gpart from a given mesh using the CIC
 * method.
 *
 * @param gp The #gpart.
 * @param patch The local mesh patch
 */
#if defined(WITH_MPI) && defined(HAVE_MPI_FFTW)
void mesh_patch_to_gparts_CIC(struct gpart *gp,
                              const struct pm_mesh_patch *patch) {

  const double fac = patch->fac;

  /* Box wrap the gpart's position to the copy nearest the cell centre */
  const double pos_x =
      box_wrap(gp->x[0], patch->wrap_min[0], patch->wrap_max[0]);
  const double pos_y =
      box_wrap(gp->x[1], patch->wrap_min[1], patch->wrap_max[1]);
  const double pos_z =
      box_wrap(gp->x[2], patch->wrap_min[2], patch->wrap_max[2]);

  /* Work out the CIC coefficients */
  int i = (int)floor(fac * pos_x);
  const double dx = fac * pos_x - i;
  const double tx = 1. - dx;

  int j = (int)floor(fac * pos_y);
  const double dy = fac * pos_y - j;
  const double ty = 1. - dy;

  int k = (int)floor(fac * pos_z);
  const double dz = fac * pos_z - k;
  const double tz = 1. - dz;

#ifdef SWIFT_GRAVITY_FORCE_CHECKS
  if (gp->a_grav_mesh[0] != 0.)
    error("Particle with non-initialised mesh acceleration");
#ifndef SWIFT_GRAVITY_NO_POTENTIAL
  if (gp->potential_mesh != 0.)
    error("Particle with non-initialised mesh potential");
#endif
#endif

  /* Some local accumulators */
  double p = 0.;
  double a[3] = {0.};

  /* Get coordinates within the mesh patch */
  const int ii = i - patch->mesh_min[0];
  const int jj = j - patch->mesh_min[1];
  const int kk = k - patch->mesh_min[2];

  /* Simple CIC for the potential itself */
  p += pm_mesh_patch_CIC_get(patch, ii, jj, kk, tx, ty, tz, dx, dy, dz);

  /* 5-point stencil along each axis for the accelerations */
  a[0] += (1. / 12.) *
          pm_mesh_patch_CIC_get(patch, ii + 2, jj, kk, tx, ty, tz, dx, dy, dz);
  a[0] -= (2. / 3.) *
          pm_mesh_patch_CIC_get(patch, ii + 1, jj, kk, tx, ty, tz, dx, dy, dz);
  a[0] += (2. / 3.) *
          pm_mesh_patch_CIC_get(patch, ii - 1, jj, kk, tx, ty, tz, dx, dy, dz);
  a[0] -= (1. / 12.) *
          pm_mesh_patch_CIC_get(patch, ii - 2, jj, kk, tx, ty, tz, dx, dy, dz);

  a[1] += (1. / 12.) *
          pm_mesh_patch_CIC_get(patch, ii, jj + 2, kk, tx, ty, tz, dx, dy, dz);
  a[1] -= (2. / 3.) *
          pm_mesh_patch_CIC_get(patch, ii, jj + 1, kk, tx, ty, tz, dx, dy, dz);
  a[1] += (2. / 3.) *
          pm_mesh_patch_CIC_get(patch, ii, jj - 1, kk, tx, ty, tz, dx, dy, dz);
  a[1] -= (1. / 12.) *
          pm_mesh_patch_CIC_get(patch, ii, jj - 2, kk, tx, ty, tz, dx, dy, dz);

  a[2] += (1. / 12.) *
          pm_mesh_patch_CIC_get(patch, ii, jj, kk + 2, tx, ty, tz, dx, dy, dz);
  a[2] -= (2. / 3.) *
          pm_mesh_patch_CIC_get(patch, ii, jj, kk + 1, tx, ty, tz, dx, dy, dz);
  a[2] += (2. / 3.) *
          pm_mesh_patch_CIC_get(patch, ii, jj, kk - 1, tx, ty, tz, dx, dy, dz);
  a[2] -= (1. / 12.) *
          pm_mesh_patch_CIC_get(patch, ii, jj, kk - 2, tx, ty, tz, dx, dy, dz);
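
  /* The stencil above is the standard 5-point first derivative,
   *   f'(x) ~ [-f(x+2h) + 8 f(x+h) - 8 f(x-h) + f(x-2h)] / (12 h),
   * applied with h = 1 mesh cell and an overall minus sign, so that
   * a = -dphi/dx in mesh units; the multiplication by fac = 1/h below
   * converts the result to physical units. */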
  /* Store things back */
  gp->a_grav_mesh[0] = fac * a[0];
  gp->a_grav_mesh[1] = fac * a[1];
  gp->a_grav_mesh[2] = fac * a[2];
  gravity_add_comoving_mesh_potential(gp, p);
}
#endif

/**
 * @brief Interpolate the forces and potential from the mesh to the #gpart.
 *
 * This is for the case where the mesh is distributed between MPI ranks
 * and the relevant section is stored in a local mesh patch. This function
 * updates the particles in one #cell.
 *
 * @param c The #cell containing the #gpart to update
 * @param patch The local mesh patch holding the potential to interpolate
 * from.
 * @param N Size of the full mesh
 * @param fac Inverse of the FFT mesh cell size
 * @param const_G Gravitational constant
 * @param dim Dimensions of the #space
 */
void cell_distributed_mesh_to_gpart_CIC(const struct cell *c,
                                        const struct pm_mesh_patch *patch,
                                        const int N, const double fac,
                                        const float const_G,
                                        const double dim[3]) {

#if defined(WITH_MPI) && defined(HAVE_MPI_FFTW)

  const int gcount = c->grav.count;
  struct gpart *gparts = c->grav.parts;

  /* Check for empty cell as this would cause problems finding the extent */
  if (gcount == 0) return;

  /* Get the potential from the mesh patch to the active gparts using CIC */
  for (int i = 0; i < gcount; ++i) {
    struct gpart *gp = &gparts[i];

    if (gp->time_bin == time_bin_inhibited) continue;

    gp->a_grav_mesh[0] = 0.f;
    gp->a_grav_mesh[1] = 0.f;
    gp->a_grav_mesh[2] = 0.f;
#ifndef SWIFT_GRAVITY_NO_POTENTIAL
    gp->potential_mesh = 0.f;
#endif

    mesh_patch_to_gparts_CIC(gp, patch);

    gp->a_grav_mesh[0] *= const_G;
    gp->a_grav_mesh[1] *= const_G;
    gp->a_grav_mesh[2] *= const_G;
#ifndef SWIFT_GRAVITY_NO_POTENTIAL
    gp->potential_mesh *= const_G;
#endif
  }

#else
  error("FFTW MPI not found - unable to use distributed mesh");
#endif
}

/**
 * @brief Shared information about the mesh to be used by all the threads in
 * the pool.
 */
struct distributed_cic_mapper_data {
  const struct cell *cells;
  const int *local_cells;
  const struct pm_mesh_patch *local_patches;
  int N;
  double fac;
  double dim[3];
  float const_G;
};

/**
 * @brief Threadpool mapper function for the mesh-to-gpart CIC interpolation
 * of a cell.
 *
 * @param map_data A chunk of the list of local cells.
 * @param num The number of cells in the chunk.
 * @param extra The information about the mesh and cells.
 */
void cell_distributed_mesh_to_gpart_CIC_mapper(void *map_data, int num,
                                               void *extra) {

#if defined(WITH_MPI) && defined(HAVE_MPI_FFTW)

  /* Unpack the shared information */
  const struct distributed_cic_mapper_data *data =
      (struct distributed_cic_mapper_data *)extra;
  const struct cell *cells = data->cells;
  const int N = data->N;
  const double fac = data->fac;
  const double dim[3] = {data->dim[0], data->dim[1], data->dim[2]};
  const float const_G = data->const_G;

  /* Pointer to the chunk to be processed */
  int *local_cells = (int *)map_data;

  /* Start at the same position in the list of local patches */
  const size_t offset = local_cells - data->local_cells;
  const struct pm_mesh_patch *local_patches = data->local_patches + offset;

  /* Loop over the elements assigned to this thread */
  for (int i = 0; i < num; ++i) {

    /* Pointer to local cell */
    const struct cell *c = &cells[local_cells[i]];

    /* Update acceleration and potential for gparts in this cell */
    cell_distributed_mesh_to_gpart_CIC(c, &local_patches[i], N, fac, const_G,
                                       dim);
  }

#else
  error("FFTW MPI not found - unable to use distributed mesh");
#endif
}

/**
 * @brief Interpolate the potential and accelerations from the distributed
 * mesh onto all the #gpart%s of this MPI rank.
 *
 * @param local_patches The array of local mesh patches.
 * @param s The #space containing the particles.
 * @param tp The #threadpool object.
 * @param N The size of the mesh.
 * @param cell_fac Inverse of the FFT mesh cell size.
 */
void mpi_mesh_update_gparts(struct pm_mesh_patch *local_patches,
                            const struct space *s, struct threadpool *tp,
                            const int N, const double cell_fac) {

#if defined(WITH_MPI) && defined(HAVE_MPI_FFTW)

  const int *local_cells = s->local_cells_top;
  const int nr_local_cells = s->nr_local_cells;

  /* Gather the mesh shared information to be used by the threads */
  struct distributed_cic_mapper_data data;
  data.cells = s->cells_top;
  data.local_cells = local_cells;
  data.local_patches = local_patches;
  data.N = N;
  data.fac = cell_fac;
  data.dim[0] = s->dim[0];
  data.dim[1] = s->dim[1];
  data.dim[2] = s->dim[2];
  data.const_G = s->e->physical_constants->const_newton_G;

  if (nr_local_cells == 0) {
    error("Distributed mesh not implemented without cells");
  } else {

    /* Evaluate acceleration and potential for each gpart */
    threadpool_map(tp, cell_distributed_mesh_to_gpart_CIC_mapper,
                   (void *)local_cells, nr_local_cells, sizeof(int),
                   threadpool_auto_chunk_size, (void *)&data);
  }

#else
  error("FFTW MPI not found - unable to use distributed mesh");
#endif
}