Skip to content
Snippets Groups Projects
Commit 47807adb authored by Peter W. Draper's avatar Peter W. Draper
Browse files

Add more analysis of failure and more and less chat

parent 0342d542
No related branches found
No related tags found
No related merge requests found
......@@ -88,11 +88,40 @@ static int datacheck_test(int offset, size_t size, void *data) {
unsigned char *p = (unsigned char *)data;
for (size_t i = 0; i < size; i++) {
if (p[i] != fill) return 0;
if (p[i] != fill) {
if (verbose) {
message("%d: %d != %d", offset, p[i], fill);
fflush(stdout);
}
return 0;
}
}
return 1;
}
/**
 * @brief check a data area reporting some statistics about the content.
 *
 * Reports the sum, mean, minimum and maximum of the bytes in the area using
 * message(). Assumes datacheck_test() has already failed.
 *
 * An empty or missing data area is reported as such, rather than producing a
 * 0/0 mean and a min/max pair that no byte ever had.
 *
 * @param size size of data in bytes.
 * @param data the data to check, read as unsigned bytes and not modified.
 */
static void datacheck_fulltest(size_t size, void *data) {

  /* No bytes to look at, so there are no meaningful statistics. */
  if (data == NULL || size == 0) {
    message("empty data area, no statistics to report");
    return;
  }

  const unsigned char *p = (const unsigned char *)data;

  /* A double holds the byte sum exactly for any realistic buffer size. */
  double sum = 0.0;

  /* Start from the extremes so the first byte always replaces both. */
  unsigned char pmin = 255;
  unsigned char pmax = 0;

  for (size_t i = 0; i < size; i++) {
    sum += p[i];
    if (p[i] > pmax) pmax = p[i];
    if (p[i] < pmin) pmin = p[i];
  }

  message("sum: %.2f, mean: %.2f, min: %d, max: %d", sum, sum / (double)size,
          pmin, pmax);
}
/**
* @brief Pick out the relevant logging data for our rank, i.e. all
* activations of sends and recvs. We ignore the original completion logs,
......@@ -172,7 +201,7 @@ int main(int argc, char *argv[]) {
if (res != MPI_SUCCESS)
error("Call to MPI_Comm_rank failed with error %i.", res);
/* Handle the command-line, we expect a mpiuse data file to read and
/* Handle the command-line, we expect a mpiuse data file to read and
* various options. */
int opt;
while ((opt = getopt(argc, argv, "vn:")) != -1) {
......@@ -201,7 +230,7 @@ int main(int argc, char *argv[]) {
/* Now we read the SWIFT MPI logger output that defines the communications we
* will undertake. Note this has all ranks for a single step, SWIFT outputs
* one MPI log per rank per step, so you need to combine all ranks from a
* step. XXX also extract proxy related communications XXX */
* step. */
mpiuse_log_restore(infile);
int nranks = mpiuse_nr_ranks();
......@@ -226,27 +255,27 @@ int main(int argc, char *argv[]) {
if (myrank == 0)
message("*** Proxy simulation exchange loop: %d ***", nloop);
/* XXX note in SWIFT we use the threadpool to launch these. That may
* matter. */
/* Note in SWIFT we use the threadpool to launch these. */
for (int k = 0; k < nr_send_pcells; k++) {
struct mpiuse_log_entry *log = send_pcells[k];
/* Need to regenerate the tags for each other communication type. */
int basetag = myrank * proxy_tag_shift;
/* Start Isend counts of pcells. Really just the size of the buffer we're
* about to send, SWIFT sends the count. */
/* Start Isend counts of pcells. Really just the size of the buffer
* we're about to send, SWIFT sends the count. */
int size = log->size;
res =
MPI_Isend(&size, 1, MPI_INT, log->otherrank, basetag + proxy_tag_count,
MPI_COMM_WORLD, &req_send_counts[k]);
res = MPI_Isend(&size, 1, MPI_INT, log->otherrank,
basetag + proxy_tag_count,
MPI_COMM_WORLD, &req_send_counts[k]);
if (res != MPI_SUCCESS) error("Counts MPI_Isend failed.");
/* Start Isend of pcells. */
/* Start Isend of pcells, filling the data with a pattern for checking
* on arrival. */
log->data = calloc(log->size, 1);
/* Fill data with a pattern for checking on arrival. */
datacheck_fill(0, log->size, log->data);
res = MPI_Isend(log->data, log->size, MPI_BYTE, log->otherrank,
basetag + proxy_tag_cells, MPI_COMM_WORLD,
&req_pcells_out[k]);
......@@ -254,6 +283,7 @@ int main(int argc, char *argv[]) {
/* Start Irecv counts of pcells from other rank. */
basetag = log->otherrank * proxy_tag_shift;
res = MPI_Irecv(&pcells_size[k], 1, MPI_INT, log->otherrank,
basetag + proxy_tag_count, MPI_COMM_WORLD,
&req_recv_counts[k]);
......@@ -268,6 +298,7 @@ int main(int argc, char *argv[]) {
for (int k = 0; k < nr_send_pcells; k++) {
int pid = MPI_UNDEFINED;
MPI_Status status;
res = MPI_Waitany(nr_send_pcells, req_recv_counts, &pid, &status);
if (res != MPI_SUCCESS || pid == MPI_UNDEFINED)
error("MPI_Waitany failed.");
......@@ -280,36 +311,42 @@ int main(int argc, char *argv[]) {
/* Fill data with a pattern for checking when overwritten. */
datacheck_fill(1, pcells_size[pid], pcells_in[pid]);
res = MPI_Irecv(pcells_in[pid], pcells_size[pid], MPI_BYTE, log->otherrank,
basetag + proxy_tag_cells, MPI_COMM_WORLD,
&req_pcells_in[pid]);
res = MPI_Irecv(pcells_in[pid], pcells_size[pid], MPI_BYTE,
log->otherrank, basetag + proxy_tag_cells,
MPI_COMM_WORLD, &req_pcells_in[pid]);
if (res != MPI_SUCCESS) error("Pcell MPI_Irecv failed.");
}
message("All proxy cell counts have arrived");
message("All proxy cell counts have arrived, pcells irecvs are launched");
/* Waitall for all Isend counts to complete. */
res = MPI_Waitall(nr_send_pcells, req_send_counts, MPI_STATUSES_IGNORE);
if (res != MPI_SUCCESS) error("Waitall for counts Isend failed.");
message("All sends of counts have completed");
/* Now wait for the pcell irecvs to complete, so we receive the pcells,
* which would be unpacked in SWIFT. */
for (int k = 0; k < nr_send_pcells; k++) {
int pid = MPI_UNDEFINED;
MPI_Status status;
res = MPI_Waitany(nr_send_pcells, req_pcells_in, &pid, &status);
if (res != MPI_SUCCESS || pid == MPI_UNDEFINED)
error("MPI_Waitany failed.");
/* Check the data received is correct. */
if (!datacheck_test(0, pcells_size[pid], pcells_in[pid])) {
if (!datacheck_test(1, pcells_size[pid], pcells_in[pid])) {
error("Received data is not modified");
message("Received data is not correct");
datacheck_fulltest(pcells_size[pid], pcells_in[pid]);
if (datacheck_test(1, pcells_size[pid], pcells_in[pid])) {
error("Received data is not modified on receive");
} else {
error("Received data is corrupt");
}
} else {
message("Received data is correct");
if (verbose)
message("Received data is correct");
}
free(pcells_in[pid]);
pcells_in[pid] = NULL;
......@@ -319,14 +356,17 @@ int main(int argc, char *argv[]) {
/* Waitall for Isend of pcells to complete. */
res = MPI_Waitall(nr_send_pcells, req_pcells_out, MPI_STATUSES_IGNORE);
if (res != MPI_SUCCESS) error("Waitall for pcells Isend failed.");
message("All sends of pcells have completed");
/* Check data is unmodified. */
/* Check data is unmodified while being offloaded. */
for (int k = 0; k < nr_send_pcells; k++) {
struct mpiuse_log_entry *log = send_pcells[k];
if (!datacheck_test(0, log->size, log->data)) {
datacheck_fulltest(log->size, log->data);
error("Sent data has been corrupted");
} else {
message("Sent data is correct");
if (verbose)
message("Sent data is correct");
}
free(log->data);
log->data = NULL;
......@@ -339,7 +379,7 @@ int main(int argc, char *argv[]) {
if (res != MPI_SUCCESS)
error("call to MPI_Finalize failed with error %i.", res);
if (myrank == 0) message("Bye");
if (myrank == 0) message("All done, no errors detected");
return 0;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment