Commit a757d065 authored by Peter W. Draper

Some signs of life; receive needs work on the logic

parent f860ad09
Part of merge requests !11 (Draft: Fast one-sided MPI version) and !8 (Draft: RDMA version with wrapped infinity calls).
@@ -17,6 +17,17 @@
  *
  ******************************************************************************/

+// Simple approach: use the window as a message board, capable of receiving a
+// single message per rank at a time, so it needs to be larger than the
+// largest message, and we need one of these boards per rank.
+//
+// We poll all ranks, waiting for a push update to unlock a board; we then
+// check the tag and size, which need to match one of the expected messages,
+// at which point we copy the data away and release the board.
+//
+// On the send side we work synchronously, sending one message at a time and
+// waiting for our board to be unlocked by the receiver.
+
 #include <limits.h>
 #include <mpi.h>
 #include <pthread.h>
@@ -36,8 +47,16 @@ int myrank = -1;
 /* Number of ranks. */
 static int nr_ranks;

-#define READY -2
-#define DONE -10
+/* Flags for controlling access. */
+static int LOCKED = -2;
+static int UNLOCKED = -3;
+
+/* Size of a message header: the flag, size and tag. */
+static size_t HEADER_SIZE = sizeof(size_t) * 3;
+
+/* Size of a message board; we have one of these per rank per communicator
+ * (i.e. per window). */
+static size_t MESSAGE_SIZE = 0;

 /* Are we verbose. */
 static int verbose = 0;
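
For orientation, here is a minimal sketch of the board layout these constants imply. The struct and the helper are illustrative only; the patch stores the header as three raw size_t words, not a struct:

/* Illustrative view of one message board in the window memory: the flag
 * word first, then the payload size, then the tag, then the payload. */
struct board {
  size_t flag; /* LOCKED or UNLOCKED, converted to size_t. */
  size_t size; /* Payload size in bytes. */
  size_t tag;  /* Tag to match against an expected message. */
  /* Up to MESSAGE_SIZE - HEADER_SIZE payload bytes follow. */
};

/* Hypothetical helper: byte offset of the board that rank 'sender' writes
 * into; each rank owns one slot per window. */
static size_t board_offset(int sender) {
  return (size_t)sender * MESSAGE_SIZE;
}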
@@ -57,17 +76,8 @@ static MPI_Comm subtypeMPI_comms[task_subtype_count];
 static MPI_Win mpi_window[task_subtype_count];
 static char *mpi_ptr[task_subtype_count];

-// Simple approach, use the window as a message board, capable of receiving a
-// single message at a time, so needs to be larger than the largest message.
-//
-// So we poll for all possible messages, until we see our tag and size
-// at which point we copy that away and release the window.
-//
-// On the send side we work synchronously, sending a message at a time
-// (could have one send per rank underway, as a further development).
-//
-// Synchronization seems two side, with exclusive locks on both sides
-// and an atomic flag to control access.
+/* Size of a board for a rank. */
+static size_t board_size = 0;

 /* The local send queue. */
 static struct mpiuse_log_entry **volatile send_queue;
@@ -110,6 +120,8 @@ static int datacheck_test(size_t size, void *data) {
 /**
  * @brief Send thread, sends messages to other ranks one-by-one.
+ *
+ * Messages are all considered in order, regardless of the subtype.
  */
 static void *send_thread(void *arg) {
@@ -118,34 +130,65 @@ static void *send_thread(void *arg) {
   for (int k = 0; k < nr_send; k++) {
     struct mpiuse_log_entry *log = send_queue[k];
-    log->data = calloc(log->size + 1, 1);
+
+    /* Data has the actual data and room for the header. */
+    log->data = calloc(HEADER_SIZE + log->size, 1);
+    size_t *dataptr = (size_t *)log->data;

     /* Fill data with pattern. */
-    if (datacheck) datacheck_fill(log->size, log->data);
+    if (datacheck) datacheck_fill(HEADER_SIZE + log->size, dataptr);

-    /* Last char is marked as READY (to receive) */
-    ((char *)log->data)[log->size] = READY;
+    /* First element is marked as LOCKED, so only we can update. */
+    dataptr[0] = LOCKED;
+    dataptr[1] = log->size;
+    dataptr[2] = log->tag;

     /* And send data to other rank. */
-    MPI_Accumulate(log->data, log->size + 1, MPI_BYTE, log->otherrank, 0,
-                   log->size + 1, MPI_BYTE, MPI_REPLACE,
+    MPI_Accumulate(log->data, HEADER_SIZE + log->size, MPI_BYTE,
+                   log->otherrank, MESSAGE_SIZE * myrank,
+                   HEADER_SIZE + log->size, MPI_BYTE, MPI_REPLACE,
                    mpi_window[log->subtype]);

-    /* Now we change the last element to DONE so that the remote end can
+    /* Now we change the flag to UNLOCKED so that the remote end can
      * find out that the data has arrived. */
-    char newval[1];
-    char oldval[1];
-    newval[0] = DONE;
-    oldval[0] = DONE;
-    MPI_Compare_and_swap(&newval[0], &((char *)log->data)[log->size],
-                         &oldval[0], MPI_BYTE, log->otherrank, log->size,
+    size_t newval[1];
+    size_t oldval[1];
+    newval[0] = UNLOCKED;
+    oldval[0] = 0;
+    MPI_Compare_and_swap(&newval[0], dataptr, &oldval[0], MPI_AINT,
+                         log->otherrank, myrank * MESSAGE_SIZE,
                          mpi_window[log->subtype]);

     //MPI_Win_flush(log->otherrank, mpi_window[log->subtype]);
     MPI_Win_flush_all(mpi_window[log->subtype]);

-    // XXX need to make sure the data area is free to overwrite once more XXX
-    // flip the remote atomic.
+    if (oldval[0] == dataptr[0]) {
+      message("sent a message to %d/%d (%zd:%zd:%zd)", log->otherrank,
+              log->subtype, dataptr[0], oldval[0], newval[0]);
+    } else {
+      message("failed to send a message to %d/%d (%zd:%zd:%zd)",
+              log->otherrank, log->subtype, dataptr[0], oldval[0], newval[0]);
+    }
+
+    /* Wait for completion, which is when the remote end flips back to
+     * LOCKED. We poll on a get, as the local window is only used for
+     * receiving. Use an Rget so we can use MPI_Test to get some local
+     * progression. */
+    while (dataptr[0] == LOCKED) {
+      MPI_Request request;
+      MPI_Rget(dataptr, 1, MPI_AINT, log->otherrank, 0, 1, MPI_AINT,
+               mpi_window[log->subtype], &request);
+      MPI_Win_flush_all(mpi_window[log->subtype]);
+      int flag = 0;
+      while (flag == 0) {
+        MPI_Test(&request, &flag, MPI_STATUS_IGNORE);
+      }
+      message("Waiting for unlock (%zd)", dataptr[0]);
+    }
+    message("sent and received... %d/%d", k, nr_send);
+
+    /* Ready the next send. */
   }

   message("took %.3f %s.", clocks_from_ticks(getticks() - starttics),
@@ -180,17 +223,15 @@ static void *recv_thread(void *arg) {
       if (log != NULL && !log->done) {
         ticks tics = getticks();

-        int arrived = 0;
-        //message("Checking at %zd", log->size);
-        if (mpi_ptr[log->subtype][log->size] == DONE) arrived = 1;
-
-        // Need to allow for some MPI progession... Since we make no
-        // MPI calls. Should not be needed if using a progression thread?
-        int flag;
-        MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD,
-                   &flag, MPI_STATUS_IGNORE);
-        // XXX match this to the expected receive...
+        MPI_Win_flush_all(mpi_window[log->subtype]);  // XXX emergency measure
+
+        /* See if this message is ready (should really check for messages
+         * and match them to the queue, so the other way around). */
+        int ready = 0;
+        size_t lockval =
+            ((size_t *)&mpi_ptr[log->subtype][log->otherrank * MESSAGE_SIZE])[0];
+        //message("Checking %d/%d at %zd: lockval %zd", log->rank,
+        //        log->subtype, log->otherrank * MESSAGE_SIZE, lockval);
+        if (lockval == UNLOCKED) ready = 1;

         /* Increment etc. of statistics about time spent waiting. */
         ticks dt = getticks() - tics;
@@ -205,11 +246,14 @@ static void *recv_thread(void *arg) {
         if (dt < lmint) lmint = dt;
         if (dt > lmaxt) lmaxt = dt;

-        if (arrived) {
+        if (ready) {
+          message("We have a ready message %d/%d at %zd: lockval %zd",
+                  log->rank, log->subtype, log->otherrank * MESSAGE_SIZE,
+                  lockval);

           /* Check the sent data is unchanged and the received data is as
            * expected. */
-          if (datacheck && !datacheck_test(log->size, log->data)) {
-            error("Data mismatch on completion");
+          if (datacheck &&
+              !datacheck_test(log->size,
+                  &mpi_ptr[log->subtype][log->otherrank * MESSAGE_SIZE])) {
+            message("Data mismatch on completion");
           }

           /* Done, clean up. */
@@ -217,7 +261,16 @@ static void *recv_thread(void *arg) {
           log->endtic = getticks();
           free(log->data);
           atomic_dec(&todo_recv);
+
+          /* Ready for the next message. */
+          ((size_t *)&mpi_ptr[log->subtype]
+               [log->otherrank * MESSAGE_SIZE])[0] = LOCKED;
         }
+
+        /* Need to allow for some MPI progression, since we make no
+         * MPI calls. Should not be needed if using a progression thread? */
+        int flag = 0;
+        MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD,
+                   &flag, MPI_STATUS_IGNORE);
       }
     }
   }
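
The matching receive side, again as an illustrative sketch rather than the patch's code: the patch walks its queue of expected logs and checks each board (and papers over progression with the flush_all above, glossing over the memory-model details of reading window memory directly). The check-copy-relock sequence is the essential part; the name try_recv is hypothetical.

/* Sketch: poll the board that 'otherrank' writes into our window memory
 * 'win_ptr' (mpi_ptr[subtype] in the patch). Returns 1 and copies the
 * payload out when a message with the expected size and tag has arrived. */
static int try_recv(char *win_ptr, int otherrank, size_t exp_size,
                    size_t exp_tag, void *out) {
  size_t *board = (size_t *)&win_ptr[otherrank * MESSAGE_SIZE];

  /* Nothing published yet? */
  if (board[0] != (size_t)UNLOCKED) return 0;

  /* The header must match one of the messages we expect. */
  if (board[1] != exp_size || board[2] != exp_tag) return 0;

  /* Copy the payload away, then relock the board so the sender can see
   * completion and the next message can be pushed. */
  memcpy(out, &board[3], exp_size);
  board[0] = (size_t)LOCKED;
  return 1;
}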
@@ -363,11 +416,14 @@ int main(int argc, char *argv[]) {
   /* Extract the send and recv messages for our rank. */
   size_t maxsize = pick_logs();

+  /* Size of a message board. */
+  MESSAGE_SIZE = maxsize + HEADER_SIZE;
+
   /* Now for the one-sided setup... We need a buffer with space for the
-   * largest message. */
+   * largest message, plus one of these boards per rank. */
   for (int i = 0; i < task_subtype_count; i++) {
     MPI_Comm_dup(MPI_COMM_WORLD, &subtypeMPI_comms[i]);
-    MPI_Win_allocate(maxsize + sizeof(size_t), sizeof(int), MPI_INFO_NULL,
+    MPI_Win_allocate(MESSAGE_SIZE * nr_ranks, sizeof(int), MPI_INFO_NULL,
                      subtypeMPI_comms[i], &mpi_ptr[i], &mpi_window[i]);

     /* Assert a shared lock with all the other processes on this window. */
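
For completeness, a sketch of the per-subtype window setup this hunk arrives at: one board per rank per window, passively locked for the whole run. The diff is truncated after the "shared lock" comment, so the MPI_Win_lock_all call here is an assumption about what follows.

/* One window per subtype, holding one board per rank. */
MESSAGE_SIZE = maxsize + HEADER_SIZE;
for (int i = 0; i < task_subtype_count; i++) {
  MPI_Comm_dup(MPI_COMM_WORLD, &subtypeMPI_comms[i]);
  MPI_Win_allocate((MPI_Aint)(MESSAGE_SIZE * nr_ranks), sizeof(int),
                   MPI_INFO_NULL, subtypeMPI_comms[i], &mpi_ptr[i],
                   &mpi_window[i]);

  /* Assumed: assert a shared access epoch with all other processes, so
   * the one-sided puts and gets above can proceed without per-message
   * lock/unlock pairs. */
  MPI_Win_lock_all(MPI_MODE_NOCHECK, mpi_window[i]);
}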