Commit 2ec982df authored by Matthieu Schaller
Browse files

Move the updated synchronised redistribute function to the new engine_redistribute.c file.

parent b52e6c84
This diff is collapsed.
...@@ -29,7 +29,6 @@ ...@@ -29,7 +29,6 @@
#include "memswap.h" #include "memswap.h"
#ifdef WITH_MPI #ifdef WITH_MPI
/** /**
* Do the exchange of one type of particles with all the other nodes. * Do the exchange of one type of particles with all the other nodes.
* *
...@@ -44,6 +43,8 @@ ...@@ -44,6 +43,8 @@
* @param mpi_type the MPI_Datatype for these particles. * @param mpi_type the MPI_Datatype for these particles.
* @param nr_nodes the number of nodes to exchange with. * @param nr_nodes the number of nodes to exchange with.
* @param nodeID the id of this node. * @param nodeID the id of this node.
* @param syncredist whether to use the slower, more memory-friendly synchronous
* exchanges.
* *
* @result new particle data constructed from all the exchanges with the * @result new particle data constructed from all the exchanges with the
* given alignment. * given alignment.
...@@ -51,7 +52,7 @@ ...@@ -51,7 +52,7 @@
static void *engine_do_redistribute(const char *label, int *counts, char *parts, static void *engine_do_redistribute(const char *label, int *counts, char *parts,
size_t new_nr_parts, size_t sizeofparts, size_t new_nr_parts, size_t sizeofparts,
size_t alignsize, MPI_Datatype mpi_type, size_t alignsize, MPI_Datatype mpi_type,
int nr_nodes, int nodeID) { int nr_nodes, int nodeID, int syncredist) {
/* Allocate a new particle array with some extra margin */ /* Allocate a new particle array with some extra margin */
char *parts_new = NULL; char *parts_new = NULL;
...@@ -60,100 +61,178 @@ static void *engine_do_redistribute(const char *label, int *counts, char *parts, ...@@ -60,100 +61,178 @@ static void *engine_do_redistribute(const char *label, int *counts, char *parts,
sizeofparts * new_nr_parts * engine_redistribute_alloc_margin) != 0) sizeofparts * new_nr_parts * engine_redistribute_alloc_margin) != 0)
error("Failed to allocate new particle data."); error("Failed to allocate new particle data.");
/* Prepare MPI requests for the asynchronous communications */ if (syncredist) {
MPI_Request *reqs;
if ((reqs = (MPI_Request *)malloc(sizeof(MPI_Request) * 2 * nr_nodes)) == /* Slow synchronous redistribute. */
NULL) size_t offset_send = 0, offset_recv = 0;
error("Failed to allocate MPI request list.");
/* Send and receive only "chunk" particles per request. So we need to /* Send and receive only "chunk" particles per request.
* loop as many times as necessary here. Make 2Gb/sizeofparts so we only * Fixing the message size to 2GB. */
* send 2Gb packets. */ const int chunk = INT_MAX / sizeofparts;
const int chunk = INT_MAX / sizeofparts; int res = 0;
int sent = 0; for (int k = 0; k < nr_nodes; k++) {
int recvd = 0; int kk = k;
/* Rank 0 decides the index of sending node */
MPI_Bcast(&kk, 1, MPI_INT, 0, MPI_COMM_WORLD);
int ind_recv = kk * nr_nodes + nodeID;
if (nodeID == kk) {
/* Send out our particles. */
offset_send = 0;
for (int j = 0; j < nr_nodes; j++) {
int ind_send = kk * nr_nodes + j;
/* Just copy our own parts */
if (counts[ind_send] > 0) {
if (j == nodeID) {
memcpy(&parts_new[offset_recv * sizeofparts],
&parts[offset_send * sizeofparts],
sizeofparts * counts[ind_recv]);
offset_send += counts[ind_send];
offset_recv += counts[ind_recv];
} else {
for (int i = 0, n = 0; i < counts[ind_send]; n++) {
/* Count and index, with chunk parts at most. */
size_t sendc = min(chunk, counts[ind_send] - i);
size_t sendo = offset_send + i;
res = MPI_Send(&parts[sendo * sizeofparts], sendc, mpi_type, j,
n, MPI_COMM_WORLD);
if (res != MPI_SUCCESS) {
mpi_error(res, "Failed to send parts to node %i from %i.", j,
nodeID);
}
i += sendc;
}
offset_send += counts[ind_send];
}
}
}
} else {
/* Listen for sends from kk. */
if (counts[ind_recv] > 0) {
for (int i = 0, n = 0; i < counts[ind_recv]; n++) {
/* Count and index, with chunk parts at most. */
size_t recvc = min(chunk, counts[ind_recv] - i);
size_t recvo = offset_recv + i;
MPI_Status status;
res = MPI_Recv(&parts_new[recvo * sizeofparts], recvc, mpi_type, kk,
n, MPI_COMM_WORLD, &status);
if (res != MPI_SUCCESS) {
mpi_error(res, "Failed to recv of parts from node %i to %i.", kk,
nodeID);
}
i += recvc;
}
offset_recv += counts[ind_recv];
}
}
}
int activenodes = 1; } else {
while (activenodes) { /* Asynchronous redistribute, can take a lot of memory. */
for (int k = 0; k < 2 * nr_nodes; k++) reqs[k] = MPI_REQUEST_NULL; /* Prepare MPI requests for the asynchronous communications */
MPI_Request *reqs;
if ((reqs = (MPI_Request *)malloc(sizeof(MPI_Request) * 2 * nr_nodes)) ==
NULL)
error("Failed to allocate MPI request list.");
/* Emit the sends and recvs for the data. */ /* Send and receive only "chunk" particles per request. So we need to
size_t offset_send = sent; * loop as many times as necessary here. Make 2Gb/sizeofparts so we only
size_t offset_recv = recvd; * send 2Gb packets. */
activenodes = 0; const int chunk = INT_MAX / sizeofparts;
int sent = 0;
int recvd = 0;
for (int k = 0; k < nr_nodes; k++) { int activenodes = 1;
while (activenodes) {
/* Indices in the count arrays of the node of interest */ for (int k = 0; k < 2 * nr_nodes; k++) reqs[k] = MPI_REQUEST_NULL;
const int ind_send = nodeID * nr_nodes + k;
const int ind_recv = k * nr_nodes + nodeID;
/* Are we sending any data this loop? */ /* Emit the sends and recvs for the data. */
int sending = counts[ind_send] - sent; size_t offset_send = sent;
if (sending > 0) { size_t offset_recv = recvd;
activenodes++; activenodes = 0;
if (sending > chunk) sending = chunk;
/* If the send and receive is local then just copy. */ for (int k = 0; k < nr_nodes; k++) {
if (k == nodeID) {
int receiving = counts[ind_recv] - recvd;
if (receiving > chunk) receiving = chunk;
memcpy(&parts_new[offset_recv * sizeofparts],
&parts[offset_send * sizeofparts], sizeofparts * receiving);
} else {
/* Otherwise send it. */
int res =
MPI_Isend(&parts[offset_send * sizeofparts], sending, mpi_type, k,
ind_send, MPI_COMM_WORLD, &reqs[2 * k + 0]);
if (res != MPI_SUCCESS)
mpi_error(res, "Failed to isend parts to node %i.", k);
}
}
/* If we're sending to this node, then move past it to next. */ /* Indices in the count arrays of the node of interest */
if (counts[ind_send] > 0) offset_send += counts[ind_send]; const int ind_send = nodeID * nr_nodes + k;
const int ind_recv = k * nr_nodes + nodeID;
/* Are we receiving any data from this node? Note already done if coming /* Are we sending any data this loop? */
* from this node. */ int sending = counts[ind_send] - sent;
if (k != nodeID) { if (sending > 0) {
int receiving = counts[ind_recv] - recvd;
if (receiving > 0) {
activenodes++; activenodes++;
if (receiving > chunk) receiving = chunk; if (sending > chunk) sending = chunk;
int res = MPI_Irecv(&parts_new[offset_recv * sizeofparts], receiving,
mpi_type, k, ind_recv, MPI_COMM_WORLD, /* If the send and receive is local then just copy. */
&reqs[2 * k + 1]); if (k == nodeID) {
if (res != MPI_SUCCESS) int receiving = counts[ind_recv] - recvd;
mpi_error(res, "Failed to emit irecv of parts from node %i.", k); if (receiving > chunk) receiving = chunk;
memcpy(&parts_new[offset_recv * sizeofparts],
&parts[offset_send * sizeofparts], sizeofparts * receiving);
} else {
/* Otherwise send it. */
int res =
MPI_Isend(&parts[offset_send * sizeofparts], sending, mpi_type,
k, ind_send, MPI_COMM_WORLD, &reqs[2 * k + 0]);
if (res != MPI_SUCCESS)
mpi_error(res, "Failed to isend parts to node %i.", k);
}
} }
}
/* If we're receiving from this node, then move past it to next. */ /* If we're sending to this node, then move past it to next. */
if (counts[ind_recv] > 0) offset_recv += counts[ind_recv]; if (counts[ind_send] > 0) offset_send += counts[ind_send];
}
/* Are we receiving any data from this node? Note already done if coming
* from this node. */
if (k != nodeID) {
int receiving = counts[ind_recv] - recvd;
if (receiving > 0) {
activenodes++;
if (receiving > chunk) receiving = chunk;
int res = MPI_Irecv(&parts_new[offset_recv * sizeofparts],
receiving, mpi_type, k, ind_recv,
MPI_COMM_WORLD, &reqs[2 * k + 1]);
if (res != MPI_SUCCESS)
mpi_error(res, "Failed to emit irecv of parts from node %i.", k);
}
}
/* If we're receiving from this node, then move past it to next. */
if (counts[ind_recv] > 0) offset_recv += counts[ind_recv];
}
/* Wait for all the sends and recvs to tumble in. */ /* Wait for all the sends and recvs to tumble in. */
MPI_Status stats[2 * nr_nodes]; MPI_Status stats[2 * nr_nodes];
int res; int res;
if ((res = MPI_Waitall(2 * nr_nodes, reqs, stats)) != MPI_SUCCESS) { if ((res = MPI_Waitall(2 * nr_nodes, reqs, stats)) != MPI_SUCCESS) {
for (int k = 0; k < 2 * nr_nodes; k++) { for (int k = 0; k < 2 * nr_nodes; k++) {
char buff[MPI_MAX_ERROR_STRING]; char buff[MPI_MAX_ERROR_STRING];
MPI_Error_string(stats[k].MPI_ERROR, buff, &res); MPI_Error_string(stats[k].MPI_ERROR, buff, &res);
message("request from source %i, tag %i has error '%s'.", message("request from source %i, tag %i has error '%s'.",
stats[k].MPI_SOURCE, stats[k].MPI_TAG, buff); stats[k].MPI_SOURCE, stats[k].MPI_TAG, buff);
}
error("Failed during waitall for part data.");
} }
error("Failed during waitall for part data.");
/* Move to next chunks. */
sent += chunk;
recvd += chunk;
} }
/* Move to next chunks. */ /* Free temps. */
sent += chunk; free(reqs);
recvd += chunk;
} }
/* Free temps. */
free(reqs);
/* And return new memory. */ /* And return new memory. */
return parts_new; return parts_new;
} }
...@@ -430,7 +509,8 @@ static void engine_redistribute_relink_mapper(void *map_data, int num_elements, ...@@ -430,7 +509,8 @@ static void engine_redistribute_relink_mapper(void *map_data, int num_elements,
* 3) The particles to send are placed in a temporary buffer in which the * 3) The particles to send are placed in a temporary buffer in which the
* part-gpart links are preserved. * part-gpart links are preserved.
* 4) Each node allocates enough space for the new particles. * 4) Each node allocates enough space for the new particles.
* 5) (Asynchronous) communications are issued to transfer the data. * 5) Asynchronous or synchronous communications are issued to transfer the
* data.
* *
* *
* @param e The #engine. * @param e The #engine.
...@@ -895,7 +975,7 @@ void engine_redistribute(struct engine *e) { ...@@ -895,7 +975,7 @@ void engine_redistribute(struct engine *e) {
/* SPH particles. */ /* SPH particles. */
void *new_parts = engine_do_redistribute( void *new_parts = engine_do_redistribute(
"parts", counts, (char *)s->parts, nr_parts_new, sizeof(struct part), "parts", counts, (char *)s->parts, nr_parts_new, sizeof(struct part),
part_align, part_mpi_type, nr_nodes, nodeID); part_align, part_mpi_type, nr_nodes, nodeID, e->syncredist);
swift_free("parts", s->parts); swift_free("parts", s->parts);
s->parts = (struct part *)new_parts; s->parts = (struct part *)new_parts;
s->nr_parts = nr_parts_new; s->nr_parts = nr_parts_new;
...@@ -904,32 +984,35 @@ void engine_redistribute(struct engine *e) { ...@@ -904,32 +984,35 @@ void engine_redistribute(struct engine *e) {
/* Extra SPH particle properties. */ /* Extra SPH particle properties. */
new_parts = engine_do_redistribute( new_parts = engine_do_redistribute(
"xparts", counts, (char *)s->xparts, nr_parts_new, sizeof(struct xpart), "xparts", counts, (char *)s->xparts, nr_parts_new, sizeof(struct xpart),
xpart_align, xpart_mpi_type, nr_nodes, nodeID); xpart_align, xpart_mpi_type, nr_nodes, nodeID, e->syncredist);
swift_free("xparts", s->xparts); swift_free("xparts", s->xparts);
s->xparts = (struct xpart *)new_parts; s->xparts = (struct xpart *)new_parts;
/* Gravity particles. */ /* Gravity particles. */
new_parts = engine_do_redistribute( new_parts =
"gparts", g_counts, (char *)s->gparts, nr_gparts_new, engine_do_redistribute("gparts", g_counts, (char *)s->gparts,
sizeof(struct gpart), gpart_align, gpart_mpi_type, nr_nodes, nodeID); nr_gparts_new, sizeof(struct gpart), gpart_align,
gpart_mpi_type, nr_nodes, nodeID, e->syncredist);
swift_free("gparts", s->gparts); swift_free("gparts", s->gparts);
s->gparts = (struct gpart *)new_parts; s->gparts = (struct gpart *)new_parts;
s->nr_gparts = nr_gparts_new; s->nr_gparts = nr_gparts_new;
s->size_gparts = engine_redistribute_alloc_margin * nr_gparts_new; s->size_gparts = engine_redistribute_alloc_margin * nr_gparts_new;
/* Star particles. */ /* Star particles. */
new_parts = engine_do_redistribute( new_parts =
"sparts", s_counts, (char *)s->sparts, nr_sparts_new, engine_do_redistribute("sparts", s_counts, (char *)s->sparts,
sizeof(struct spart), spart_align, spart_mpi_type, nr_nodes, nodeID); nr_sparts_new, sizeof(struct spart), spart_align,
spart_mpi_type, nr_nodes, nodeID, e->syncredist);
swift_free("sparts", s->sparts); swift_free("sparts", s->sparts);
s->sparts = (struct spart *)new_parts; s->sparts = (struct spart *)new_parts;
s->nr_sparts = nr_sparts_new; s->nr_sparts = nr_sparts_new;
s->size_sparts = engine_redistribute_alloc_margin * nr_sparts_new; s->size_sparts = engine_redistribute_alloc_margin * nr_sparts_new;
/* Black holes particles. */ /* Black holes particles. */
new_parts = engine_do_redistribute( new_parts =
"bparts", b_counts, (char *)s->bparts, nr_bparts_new, engine_do_redistribute("bparts", b_counts, (char *)s->bparts,
sizeof(struct bpart), bpart_align, bpart_mpi_type, nr_nodes, nodeID); nr_bparts_new, sizeof(struct bpart), bpart_align,
bpart_mpi_type, nr_nodes, nodeID, e->syncredist);
swift_free("bparts", s->bparts); swift_free("bparts", s->bparts);
s->bparts = (struct bpart *)new_parts; s->bparts = (struct bpart *)new_parts;
s->nr_bparts = nr_bparts_new; s->nr_bparts = nr_bparts_new;
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment