From 2709a75b028a8f203d0477274fb6f28b7184ea18 Mon Sep 17 00:00:00 2001
From: "Peter W. Draper" <p.w.draper@durham.ac.uk>
Date: Fri, 6 Mar 2020 17:19:08 +0000
Subject: [PATCH] Start a proxy cell exchange simulation

---
 Makefile                                      |   7 +-
 swiftmpiproxies.c                             | 288 ++++++++++++++++++
 ...5-mpiuse_report-step436-4ranks-proxies.dat |  96 +++++++
 3 files changed, 389 insertions(+), 2 deletions(-)
 create mode 100644 swiftmpiproxies.c
 create mode 100644 testdata/EAGLE_25-mpiuse_report-step436-4ranks-proxies.dat

diff --git a/Makefile b/Makefile
index 500e52e..f136f42 100644
--- a/Makefile
+++ b/Makefile
@@ -1,10 +1,13 @@
 CFLAGS = -g -O0 -Wall
 
-
-all: swiftmpistepsim
+all: swiftmpistepsim swiftmpiproxies
 
 swiftmpistepsim: swiftmpistepsim.c mpiuse.c mpiuse.h atomic.h cycle.h clocks.h clocks.c
 	$(CC) $(CFLAGS) -o swiftmpistepsim swiftmpistepsim.c mpiuse.c clocks.c -I/usr/include/mpi -lmpi -lpthread
 
+swiftmpiproxies: swiftmpiproxies.c mpiuse.c mpiuse.h atomic.h cycle.h clocks.h clocks.c
+	$(CC) $(CFLAGS) -o swiftmpiproxies swiftmpiproxies.c mpiuse.c clocks.c -I/usr/include/mpi -lmpi -lpthread
+
 clean:
 	rm swiftmpistepsim
+	rm swiftmpiproxies
diff --git a/swiftmpiproxies.c b/swiftmpiproxies.c
new file mode 100644
index 0000000..f9470c0
--- /dev/null
+++ b/swiftmpiproxies.c
@@ -0,0 +1,288 @@
+/*******************************************************************************
+ * This file is part of SWIFT.
+ * Copyright (c) 2020 Peter W. Draper
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+#include <limits.h>
+#include <mpi.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "atomic.h"
+#include "clocks.h"
+#include "error.h"
+#include "mpiuse.h"
+
+/* Global: Our rank for all to see. */
+int myrank = -1;
+
+/* Are we verbose. */
+static int verbose = 0;
+
+/* Integer types of send and recv tasks, must match log. */
+static const int task_type_send = 23;
+static const int task_type_recv = 24;
+
+/* Integer subtypes of the count and pcell exchanges, must match log. */
+static const int task_subtype_count = 29;
+static const int task_subtype_pcells = 35;
+
+/* Proxy tag arithmetic. From proxy.h, must match log. */
+#define proxy_tag_shift 8
+#define proxy_tag_count 0
+#define proxy_tag_cells 6
+
+/* Our queues of communications. Need two to separate out the pcell sends and
+ * recvs. */
+struct mpiuse_log_entry **send_pcells;
+int nr_send_pcells = 0;
+struct mpiuse_log_entry **recv_pcells;
+int nr_recv_pcells = 0;
+
+/**
+ * @brief Pick out the relevant logging data for our rank, i.e. all
+ * activations of pcell sends and recvs. The completion logs are ignored and
+ * the count exchanges are skipped, the counts we send are regenerated from
+ * the sizes of the pcell sends.
+ */
+static void pick_logs(void) {
+  size_t nlogs = mpiuse_nr_logs();
+
+  /* Queues of log pointers, sized for every log being selected. */
+  send_pcells = calloc(nlogs, sizeof(*send_pcells));
+  recv_pcells = calloc(nlogs, sizeof(*recv_pcells));
+  if (nlogs > 0 && (send_pcells == NULL || recv_pcells == NULL))
+    error("Failed to allocate the pcell queues.");
+  nr_send_pcells = 0;
+  nr_recv_pcells = 0;
+
+  for (size_t k = 0; k < nlogs; k++) {
+    struct mpiuse_log_entry *log = mpiuse_get_log(k);
+    if (log->rank != myrank || !log->activation) continue;
+    log->data = NULL;
+
+    if (log->type != task_type_send && log->type != task_type_recv)
+      error("task type '%d' is not a known send or recv task", log->type);
+
+    if (log->subtype == task_subtype_pcells) {
+      if (log->type == task_type_send) {
+        send_pcells[nr_send_pcells++] = log;
+      } else {
+        recv_pcells[nr_recv_pcells++] = log;
+      }
+    } else if (log->subtype != task_subtype_count) {
+      error("task subtype '%d' is not a known value", log->subtype);
+    }
+  }
+}
+
+/**
+ * @brief usage help.
+ *
+ * @param argv the command-line arguments, argv[0] names the program.
+ */
+static void usage(char *argv[]) {
+  fprintf(stderr, "Usage: %s [-v] SWIFT_mpiuse-log-file.dat\n", argv[0]);
+  fprintf(stderr, " options: -v verbose\n");
+  fflush(stderr);
+}
+
+/**
+ * @brief Replay the proxy pcell exchange of this rank.
+ *
+ * For each pcell send in our queue we send the size and then the pcells to
+ * the other rank and receive the size and pcells that it sends to us. Tags
+ * are built from the rank of the sender, so the sends use our rank and the
+ * recvs the other rank, as seen in the tags of the log.
+ */
+static void exchange_pcells(void) {
+
+  /* Nothing to exchange, also avoids zero length arrays. */
+  const int nr = nr_send_pcells;
+  if (nr == 0) return;
+
+  MPI_Request req_send_counts[nr];
+  MPI_Request req_recv_counts[nr];
+  MPI_Request req_pcells_out[nr];
+  MPI_Request req_pcells_in[nr];
+  void *pcells_in[nr];
+
+  /* The counts live here until the Isends and Irecvs complete. */
+  int size_out[nr];
+  int size_in[nr];
+  int res;
+
+  /* XXX note in SWIFT we use the threadpool to launch these. That may
+   * matter. */
+  for (int k = 0; k < nr; k++) {
+    struct mpiuse_log_entry *log = send_pcells[k];
+
+    /* Need to regenerate the tags for each other communication type. */
+    int sendtag = log->rank * proxy_tag_shift;
+    int recvtag = log->otherrank * proxy_tag_shift;
+
+    /* Start Isend counts of pcells. Really just the size of the buffer we're
+     * about to send, SWIFT sends the count. */
+    size_out[k] = log->size;
+    res = MPI_Isend(&size_out[k], 1, MPI_INT, log->otherrank,
+                    sendtag + proxy_tag_count, MPI_COMM_WORLD,
+                    &req_send_counts[k]);
+    if (res != MPI_SUCCESS) error("Counts MPI_Isend failed.");
+
+    /* Start Isend of pcells. */
+    log->data = calloc(log->size, 1);
+    if (log->data == NULL && log->size > 0)
+      error("Failed to allocate the pcells to send.");
+    res = MPI_Isend(log->data, log->size, MPI_BYTE, log->otherrank,
+                    sendtag + proxy_tag_cells, MPI_COMM_WORLD,
+                    &req_pcells_out[k]);
+    if (res != MPI_SUCCESS) error("Pcell MPI_Isend failed.");
+
+    /* Start Irecv counts of pcells from other rank. */
+    res = MPI_Irecv(&size_in[k], 1, MPI_INT, log->otherrank,
+                    recvtag + proxy_tag_count, MPI_COMM_WORLD,
+                    &req_recv_counts[k]);
+    if (res != MPI_SUCCESS) error("Counts MPI_Irecv failed.");
+  }
+
+  /* Now wait for any of the counts irecvs to complete and then create the
+   * irecv for the pcells. */
+  for (int k = 0; k < nr; k++) {
+    int pid = MPI_UNDEFINED;
+    MPI_Status status;
+    res = MPI_Waitany(nr, req_recv_counts, &pid, &status);
+    if (res != MPI_SUCCESS || pid == MPI_UNDEFINED)
+      error("MPI_Waitany failed.");
+
+    struct mpiuse_log_entry *log = send_pcells[pid];
+    int recvtag = log->otherrank * proxy_tag_shift;
+
+    pcells_in[pid] = calloc(size_in[pid], 1);
+    if (pcells_in[pid] == NULL && size_in[pid] > 0)
+      error("Failed to allocate the pcells to receive.");
+    res = MPI_Irecv(pcells_in[pid], size_in[pid], MPI_BYTE, log->otherrank,
+                    recvtag + proxy_tag_cells, MPI_COMM_WORLD,
+                    &req_pcells_in[pid]);
+    if (res != MPI_SUCCESS) error("Pcell MPI_Irecv failed.");
+  }
+
+  /* Waitall for all Isend counts to complete. */
+  res = MPI_Waitall(nr, req_send_counts, MPI_STATUSES_IGNORE);
+  if (res != MPI_SUCCESS) error("Waitall for counts Isend failed.");
+
+  /* Now wait for the pcell irecvs to complete, so we receive the pcells,
+   * which would be unpacked in SWIFT. */
+  for (int k = 0; k < nr; k++) {
+    int pid = MPI_UNDEFINED;
+    MPI_Status status;
+    res = MPI_Waitany(nr, req_pcells_in, &pid, &status);
+    if (res != MPI_SUCCESS || pid == MPI_UNDEFINED)
+      error("MPI_Waitany failed.");
+
+    /* XXX check the data received is correct? */
+  }
+
+  /* Waitall for Isend of pcells to complete. */
+  res = MPI_Waitall(nr, req_pcells_out, MPI_STATUSES_IGNORE);
+  if (res != MPI_SUCCESS) error("Waitall for pcells Isend failed.");
+
+  /* All requests are complete, so the buffers can be released. */
+  for (int k = 0; k < nr; k++) {
+    free(pcells_in[k]);
+    free(send_pcells[k]->data);
+    send_pcells[k]->data = NULL;
+  }
+}
+
+/**
+ * @brief main function. Reads the log of a step and replays the proxy pcell
+ * exchange for our rank.
+ *
+ * @param argc number of command-line arguments.
+ * @param argv the command-line arguments: [-v] and the log file name.
+ * @return 0 on success and 1 if the command-line could not be handled.
+ */
+int main(int argc, char *argv[]) {
+
+  /* Initiate MPI. */
+  int prov = 0;
+  int res = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &prov);
+  if (res != MPI_SUCCESS)
+    error("Call to MPI_Init_thread failed with error %i.", res);
+
+  int nr_nodes = 0;
+  res = MPI_Comm_size(MPI_COMM_WORLD, &nr_nodes);
+  if (res != MPI_SUCCESS) error("MPI_Comm_size failed with error %i.", res);
+
+  res = MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  if (res != MPI_SUCCESS)
+    error("Call to MPI_Comm_rank failed with error %i.", res);
+
+  /* Handle the command-line, we expect a mpiuse data file to read and various
+   * options. On failure finalize MPI before leaving, so the exit is clean. */
+  int opt;
+  while ((opt = getopt(argc, argv, "v")) != -1) {
+    switch (opt) {
+      case 'v':
+        verbose = 1;
+        break;
+      default:
+        if (myrank == 0) usage(argv);
+        MPI_Finalize();
+        return 1;
+    }
+  }
+  if (optind >= argc) {
+    if (myrank == 0) usage(argv);
+    MPI_Finalize();
+    return 1;
+  }
+  char *infile = argv[optind];
+
+  /* Start time across the ranks. */
+  MPI_Barrier(MPI_COMM_WORLD);
+  clocks_set_cpufreq(0);
+
+  /* Now we read the SWIFT MPI logger output that defines the communications
+   * we will undertake. Note this has all ranks for a single step, SWIFT
+   * outputs one MPI log per rank per step, so you need to combine all ranks
+   * from a step. XXX also extract proxy related communications XXX */
+  mpiuse_log_restore(infile);
+  int nranks = mpiuse_nr_ranks();
+
+  /* This should match the expected size. */
+  if (nr_nodes != nranks)
+    error("The number of MPI ranks %d does not match the expected value %d",
+          nranks, nr_nodes);
+
+  /* Each rank has its own queues of requests, so extract them. */
+  pick_logs();
+
+  /* And run our version of the proxy exchanges. */
+  exchange_pcells();
+
+  /* Shutdown MPI. */
+  res = MPI_Finalize();
+  if (res != MPI_SUCCESS)
+    error("call to MPI_Finalize failed with error %i.", res);
+
+  if (myrank == 0) message("Bye");
+
+  return 0;
+}
diff --git a/testdata/EAGLE_25-mpiuse_report-step436-4ranks-proxies.dat b/testdata/EAGLE_25-mpiuse_report-step436-4ranks-proxies.dat
new file mode 100644
index 0000000..6fb926e
--- /dev/null
+++ b/testdata/EAGLE_25-mpiuse_report-step436-4ranks-proxies.dat
@@ -0,0 +1,96 @@
+4850098056 694601348570 0 436 0 1 send 23 unknown 29 1 0 4 4
+4850121226 694601371740 0 436 0 3 send 23 unknown 29 1 0 4 8
+4850137570 694601388084 0 436 0 2 send 23 unknown 29 1 0 4 12
+4903711386 694654961900 0 436 0 1 send 23 unknown 35 1 6 67144000 67144012
+4903771972 694655022486 0 436 0 1 recv 24 unknown 29 1 8 4 67144016
+4934108062 694685358576 0 436 0 2 send 23 unknown 35 1 6 112140224 179284240
+4934167322 694685417836 0 436 0 2 recv 24 unknown 29 1 16 4 179284244
+4934438236 694685688750 0 436 0 3 send 23 unknown 35 1 6 112768320 292052564
+4934474960 694685725474 0 436 0 3 recv 24 unknown 29 1 24 4 292052568
+4934698274 694685948788 30926302 436 0 1 recv 24 unknown 29 0 8 -4 292052564
+4935788496 694687039010 0 436 0 1 recv 24 unknown 35 1 14 50257088 342309652
+4985676110 694736926624 51508788 436 0 2 recv 24 unknown 29 0 16 -4 342309648
+4986972416 694738222930 0 436 0 2 recv 24 unknown 35 1 22 76386688 418696336
+4986993512 694738244026 52518552 436 0 3 recv 24 unknown 29 0 24 -4 418696332
+4987726376 694738976890 0 436 0 3 recv 24 unknown 35 1 30 58033472 476729804
+4987735264 694738985778 137637208 436 0 1 send 23 unknown 29 0 0 -4 476729800
+4987735548 694738986062 137614322 436 0 3 send 23 unknown 29 0 0 -4 476729796
+4987735802 694738986316 137598232 436 0 2 send 23 unknown 29 0 0 -4 476729792
+4987749628 694739000142 51961132 436 0 1 recv 24
unknown 35 0 14 -50257088 426472704 +5223473824 694974724338 236501408 436 0 2 recv 24 unknown 35 0 22 -76386688 350086016 +5564522798 695315773312 576796422 436 0 3 recv 24 unknown 35 0 30 -58033472 292052544 +5764020152 695515270666 860308766 436 0 1 send 23 unknown 35 0 6 -67144000 224908544 +5764021262 695515271776 829583026 436 0 3 send 23 unknown 35 0 6 -112768320 112140224 +5764021514 695515272028 829913452 436 0 2 send 23 unknown 35 0 6 -112140224 0 +4849661950 694605451240 0 436 1 0 send 23 unknown 29 1 8 4 4 +4849686448 694605475738 0 436 1 2 send 23 unknown 29 1 8 4 8 +4849702938 694605492228 0 436 1 3 send 23 unknown 29 1 8 4 12 +4888801436 694644590726 0 436 1 0 send 23 unknown 35 1 14 50257088 50257100 +4888872168 694644661458 0 436 1 0 recv 24 unknown 29 1 0 4 50257104 +4916161986 694671951276 0 436 1 2 send 23 unknown 35 1 14 87092544 137349648 +4916219058 694672008348 0 436 1 2 recv 24 unknown 29 1 16 4 137349652 +4926742762 694682532052 0 436 1 3 send 23 unknown 35 1 14 102003776 239353428 +4926795992 694682585282 0 436 1 3 recv 24 unknown 29 1 24 4 239353432 +4927024564 694682813854 38152396 436 1 0 recv 24 unknown 29 0 0 -4 239353428 +4928120732 694683910022 0 436 1 0 recv 24 unknown 35 1 6 67144000 306497428 +4991934490 694747723780 75715432 436 1 2 recv 24 unknown 29 0 16 -4 306497424 +4993226760 694749016050 0 436 1 2 recv 24 unknown 35 1 22 103360320 409857744 +4993246450 694749035740 66450458 436 1 3 recv 24 unknown 29 0 24 -4 409857740 +4994297096 694750086386 0 436 1 3 recv 24 unknown 35 1 30 96912704 506770444 +4994306838 694750096128 144603900 436 1 3 send 23 unknown 29 0 8 -4 506770440 +4994307102 694750096392 144645152 436 1 0 send 23 unknown 29 0 8 -4 506770436 +4994307370 694750096660 144620922 436 1 2 send 23 unknown 29 0 8 -4 506770432 +4994319476 694750108766 66198744 436 1 0 recv 24 unknown 35 0 6 -67144000 439626432 +5435638176 695191427466 442411416 436 1 2 recv 24 unknown 35 0 22 -103360320 336266112 +5853839496 695609628786 
859542400 436 1 3 recv 24 unknown 35 0 30 -96912704 239353408 +6179956632 695935745922 1253213870 436 1 3 send 23 unknown 35 0 14 -102003776 137349632 +6179957364 695935746654 1291155928 436 1 0 send 23 unknown 35 0 14 -50257088 87092544 +6179957614 695935746904 1263795628 436 1 2 send 23 unknown 35 0 14 -87092544 0 +4865683574 694616573774 0 436 2 1 send 23 unknown 29 1 16 4 4 +4865694860 694616585060 0 436 2 0 send 23 unknown 29 1 16 4 8 +4865719040 694616609240 0 436 2 3 send 23 unknown 29 1 16 4 12 +4929671098 694680561298 0 436 2 0 send 23 unknown 35 1 22 76386688 76386700 +4929728618 694680618818 0 436 2 0 recv 24 unknown 29 1 0 4 76386704 +4946100866 694696991066 0 436 2 1 send 23 unknown 35 1 22 103360320 179747024 +4946157516 694697047716 0 436 2 1 recv 24 unknown 29 1 8 4 179747028 +4966431694 694717321894 0 436 2 3 send 23 unknown 35 1 22 131407808 311154836 +4966473568 694717363768 0 436 2 3 recv 24 unknown 29 1 24 4 311154840 +4966735668 694717625868 262100 436 2 3 recv 24 unknown 29 0 24 -4 311154836 +4967822066 694718712266 0 436 2 3 recv 24 unknown 35 1 30 77125888 388280724 +5039897426 694790787626 110168808 436 2 0 recv 24 unknown 29 0 0 -4 388280720 +5041296494 694792186694 0 436 2 0 recv 24 unknown 35 1 6 112140224 500420944 +5047476558 694798366758 101319042 436 2 1 recv 24 unknown 29 0 8 -4 500420940 +5049229182 694800119382 0 436 2 1 recv 24 unknown 35 1 14 87092544 587513484 +5049237980 694800128180 183543120 436 2 0 send 23 unknown 29 0 16 -4 587513480 +5049238262 694800128462 183554688 436 2 1 send 23 unknown 29 0 16 -4 587513476 +5049238548 694800128748 183519508 436 2 3 send 23 unknown 29 0 16 -4 587513472 +5049252308 694800142508 81430242 436 2 3 recv 24 unknown 35 0 30 -77125888 510387584 +5409565588 695160455788 368269094 436 2 0 recv 24 unknown 35 0 6 -112140224 398247360 +5767826276 695518716476 718597094 436 2 1 recv 24 unknown 35 0 14 -87092544 311154816 +6027888698 695778778898 1098217600 436 2 0 send 23 unknown 35 0 22 -76386688 
234768128 +6027889152 695778779352 1081788286 436 2 1 send 23 unknown 35 0 22 -103360320 131407808 +6027889418 695778779618 1061457724 436 2 3 send 23 unknown 35 0 22 -131407808 0 +4854204482 694605090028 0 436 3 1 send 23 unknown 29 1 24 4 4 +4854234092 694605119638 0 436 3 2 send 23 unknown 29 1 24 4 8 +4854255180 694605140726 0 436 3 0 send 23 unknown 29 1 24 4 12 +4899877272 694650762818 0 436 3 0 send 23 unknown 35 1 30 58033472 58033484 +4899949318 694650834864 0 436 3 0 recv 24 unknown 29 1 0 4 58033488 +4912874760 694663760306 0 436 3 2 send 23 unknown 35 1 30 77125888 135159376 +4912937056 694663822602 0 436 3 2 recv 24 unknown 29 1 16 4 135159380 +4928374362 694679259908 0 436 3 1 send 23 unknown 35 1 30 96912704 232072084 +4928425744 694679311290 0 436 3 1 recv 24 unknown 29 1 8 4 232072088 +4928643938 694679529484 15706882 436 3 2 recv 24 unknown 29 0 16 -4 232072084 +4930353406 694681238952 0 436 3 2 recv 24 unknown 35 1 22 131407808 363479892 +4930363348 694681248894 30414030 436 3 0 recv 24 unknown 29 0 0 -4 363479888 +4931947584 694682833130 0 436 3 0 recv 24 unknown 35 1 6 112768320 476248208 +4931953158 694682838704 3527414 436 3 1 recv 24 unknown 29 0 8 -4 476248204 +4934140284 694685025830 0 436 3 1 recv 24 unknown 35 1 14 102003776 578251980 +4934154460 694685040006 79949978 436 3 1 send 23 unknown 29 0 24 -4 578251976 +4934154790 694685040336 79899610 436 3 0 send 23 unknown 29 0 24 -4 578251972 +4934155106 694685040652 79921014 436 3 2 send 23 unknown 29 0 24 -4 578251968 +5084820146 694835705692 154466740 436 3 2 recv 24 unknown 35 0 22 -131407808 446844160 +5647753010 695398638556 715805426 436 3 0 recv 24 unknown 35 0 6 -112768320 334075840 +6028108176 695778993722 1093967892 436 3 1 recv 24 unknown 35 0 14 -102003776 232072064 +6407169688 696158055234 1478795326 436 3 1 send 23 unknown 35 0 30 -96912704 135159360 +6407171042 696158056588 1507293770 436 3 0 send 23 unknown 35 0 30 -58033472 77125888 +6407171290 696158056836 1494296530 436 
3 2 send 23 unknown 35 0 30 -77125888 0 -- GitLab