diff --git a/.gitignore b/.gitignore index afa445084e6ff202c6d10f1a42d9af832100a8b4..753ecb8407375c93ae44d9d3d0c26c836a5068e8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ swiftmpistepsim +swiftmpifakestepsim *.o *~ diff --git a/Makefile b/Makefile index 75a2e49df9978b6383cf26aa2ad88be12d501164..7756f83a1db34e27f3bf06278d902837b88d268a 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,16 @@ CFLAGS = -g -O0 -Wall -all: swiftmpistepsim +all: swiftmpistepsim swiftmpifakestepsim -swiftmpistepsim: swiftmpistepsim.c mpiuse.c mpiuse.h atomic.h cycle.h clocks.h clocks.c - mpicc $(CFLAGS) -o swiftmpistepsim swiftmpistepsim.c mpiuse.c clocks.c +SRC = mpiuse.c clocks.c histogram.c +INC = mpiuse.h atomic.h cycle.h clocks.h histogram.h + +swiftmpistepsim: swiftmpistepsim.c $(SRC) $(INC) + mpicc $(CFLAGS) -o swiftmpistepsim swiftmpistepsim.c $(SRC) -lpthread -lm + +swiftmpifakestepsim: swiftmpifakestepsim.c $(SRC) $(INC) + mpicc $(CFLAGS) -o swiftmpifakestepsim swiftmpifakestepsim.c $(SRC) -lpthread -lm clean: rm -f swiftmpistepsim + rm -f swiftmpifakestepsim diff --git a/README.fakedata.md b/README.fakedata.md new file mode 100644 index 0000000000000000000000000000000000000000..059615b4eff014910a247243e659d0a37dddb035 --- /dev/null +++ b/README.fakedata.md @@ -0,0 +1,49 @@ + +swiftmpifakestepsim +=================== + +The swiftmpifakestepsim program works much like swiftmpistepsim, except that +it generates fake logs: + +``` +Usage: ./swiftmpifakestepsim [options] nr_messages logfile.dat + options: -v verbose, -d data check, -s size (bytes/scale), + -f <1|2> randomize injection order, 1 == just sends, 2 == sends and recvs + [-r uniform random from 1 to size, | + -r -g half gaussian random from 1 with 2.5 sigma size., | + -r -c <file> use cdf from file, size is a scale factor., | + -r -o <file> use occurence sample of values in a file, size is a scale factor.,] + -x random seed +``` + +To use a fixed message size just use `-s`. + +To use a uniform distribution in the range 1 to N: +``` + -s N -r +``` + +To use a half gaussian (so biased towards smaller packets) use: +``` + -r -s <scale> -g +``` + +The cdf option reads a simple text file with a sampling of a cumulative +distribution function, where each line has three values, the minimum and +maximum range of the current bin and the value. Note that the value column +should be normalized into the range 0 to 1. +``` + -r -s <scale> -c <cdf_file> +``` + +The occurrence file has just one value per line, these should present the +sizes of the packets, this is used to form a cdf: +``` + -r -s <scale> -o <occurrence_file> +``` + +Other options are useful to make sure that the randoms are different, `-x` and +that they run in different order `-f <1|2>`. + +Peter W. Draper 24 Apr 2023 +--------------------------- diff --git a/README.md b/README.md index 2601f6ecf09cb6a2eca4b1422afd8ce5834ef078..712d27e46c35b6297be317f1cf4439f1b1c82eaf 100644 --- a/README.md +++ b/README.md @@ -64,3 +64,14 @@ eager exchanges are working and what effect the size of the packets has. --------------------------- Peter W. Draper 24 Sep 2019. + +Continuing from the notes above, there are also RDMA based versions of +swiftmpistepsim, which are found in the various branches of the +repository. That work and a re-working of SWIFT to use RDMA revealed that the +main driver for the delays wasn't the scatter in MPI completion, but the time +taken to copy data into and out of registered memory, the best solution to +which was to make better use of the memory bandwidth by copying using multiple +threads. + +----------------------------- +Peter W. Draper 24 April 2023. diff --git a/error.h b/error.h index d5e331486620f77cbf16b15ad673a9c43ad7876b..bda6a62106d387cfa62076d9e5299b8a11d44f3d 100644 --- a/error.h +++ b/error.h @@ -1,4 +1,5 @@ #include <mpi.h> + #include "clocks.h" extern int myrank; diff --git a/format.sh b/format.sh index 91346334c9b2eaf9fbb343aba44f8a02d866d1ef..db5e7ab01a1603b987c3fa7e83a8736ed7faea31 100755 --- a/format.sh +++ b/format.sh @@ -1,8 +1,8 @@ #!/bin/bash # Clang format command, can be overridden using CLANG_FORMAT_CMD. -# We currrently use version 5.0 so any overrides should provide that. -clang=${CLANG_FORMAT_CMD:="clang-format-5.0"} +# We currrently use version 13.0 so any overrides should provide that. +clang=${CLANG_FORMAT_CMD:="clang-format-13.0"} # Formatting command cmd="$clang -style=file $(git ls-files | grep '\.[ch]$')" diff --git a/histogram.c b/histogram.c new file mode 100644 index 0000000000000000000000000000000000000000..d9b1609db8e22abcbe5076e8c061538dc09d4212 --- /dev/null +++ b/histogram.c @@ -0,0 +1,138 @@ +#include "histogram.h" + +#include <float.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +/* + * Simple histogram routines. +int main(int argc, char *argv[]) { + double *values; + int nvalues; + if (histread(argv[1], &values, &nvalues)) { + printf("## Read %d values from %s\n", nvalues, argv[1]); + struct histogram *h = calloc(1, sizeof(struct histogram)); + histmake(nvalues, values, h); + printf("## Created cumulative histogram with %d values:\n", h->nvalues); + printf("# value sum\n"); + for (int k = 0; k < h->nvalues; k++) { + printf("%f %24.17g\n", h->values[k], h->sums[k]); + } + free(h); + free(values); + return 0; + } + return 1; +} +*/ + +/** + * Create a histogram by binning a number of data values. + * + * The histogram is returned in a histogram struct, which includes the basic + * histogram (with NHIST values) and a normalized cumulative version (with + * h->nvals values, as the zero bins are coalesced). + * + * @param nvalues number of values. + * @param values the data values to bin into a histogram. + * @param h the #histogram. + */ +void histmake(int nvalues, double *values, struct histogram *h) { + + /* Find the minimum and maximum values. */ + double dmin = DBL_MAX; + double dmax = -DBL_MAX; + for (int i = 0; i < nvalues; i++) { + dmin = fmin(dmin, values[i]); + dmax = fmax(dmax, values[i]); + } + + /* Form the fixed width bin histogram. */ + double scale = (double)NHIST / (dmax - dmin); + h->width = 1.0 / scale; + h->zero = dmin - h->width / 2.0; + double sum = 0.0; + for (int i = 0; i < nvalues; i++) { + int idiff = (int)round(scale * ((double)values[i] - dmin)); + h->hist[idiff] += 1; + sum++; + } + + double norm = 1.0 / sum; + + /* Form cumulative sums and count used bins. */ + sum = 0.0; + int lastbin = 0; + int usedbins = 0; + for (int i = 0; i < NHIST; i++) { + + /* Zero bins are folded into a range. */ + if (h->hist[i] > 0) { + + /* Value is mid of bin. */ + h->values[usedbins] = 0.5 * ((lastbin * h->width + h->zero) + + ((i + 1) * h->width + h->zero)); + sum += h->hist[i] * norm; + h->sums[usedbins] = sum; + usedbins++; + lastbin = i + 1; + } + } + h->nvalues = usedbins; +} + +/** + * Read in data to histogram. Assumes a simple text file with one value per + * line. + * + * @param file the name of the file to read. + * @param values the extracted values. Free when not needed. + * @param nvalues the number of values. + * + * @result 1 for successful, 0 otherwise. + */ +int histread(const char *file, double **values, int *nvalues) { + + *values = NULL; + *nvalues = 0; + + FILE *infile = fopen(file, "r"); + if (infile == NULL) { + printf("Failed to open sizes file: %s\n", file); + return 0; + } + + /* Initial space for data. */ + int nvals = 0; + int maxvals = 1024; + double *vals = malloc(maxvals * sizeof(double)); + + char line[132]; + while (!feof(infile)) { + if (fgets(line, 132, infile) != NULL) { + if (line[0] != '#') { + int n = sscanf(line, "%lf", &vals[nvals]); + if (n != 1) { + printf("Incorrect no. of values %s\n", line); + fclose(infile); + free(vals); + return 0; + } + nvals++; + + /* Make more space. */ + if (nvals > maxvals) { + maxvals += 1024; + vals = realloc(vals, maxvals * sizeof(double)); + } + } + } + } + fclose(infile); + + *values = vals; + *nvalues = nvals; + return 1; +} diff --git a/histogram.h b/histogram.h new file mode 100644 index 0000000000000000000000000000000000000000..1ddd42544eb168b51590171df46c8648ed8b6a08 --- /dev/null +++ b/histogram.h @@ -0,0 +1,44 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (c) 2021 Peter W. Draper (p.w.draper@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ +#ifndef SWIFT_HISTOGRAM_H +#define SWIFT_HISTOGRAM_H + +/* Bins in a histogram. Note this must be an even number. */ +#define NHIST 65536 + +/** Histogram structure. */ +struct histogram { + /* Raw histogram. NHIST counts in hist and the value associated with a bin + * is index*width+zero. */ + double width; + double zero; + int hist[NHIST]; + + /* Normalized cumulative histogram. Empty bins are joined into a larger one, + * so values are the bin centre, sums the sum to that bin and nvalues the + * number of bins that have been populated, can be less than NHIST. */ + double values[NHIST]; + double sums[NHIST]; + int nvalues; +}; + +int histread(const char *file, double **values, int *nvalues); +void histmake(int nvalues, double *values, struct histogram *h); + +#endif diff --git a/mpiuse.c b/mpiuse.c index 6bd7b3116e0910b152e49112be86057bad80a0f6..e25981defe39914f325f895eb382fd2d7c1e3827 100644 --- a/mpiuse.c +++ b/mpiuse.c @@ -22,6 +22,7 @@ */ /* Standard includes. */ +#include <math.h> #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -34,6 +35,7 @@ #include "clocks.h" #include "cycle.h" #include "error.h" +#include "histogram.h" /* Our rank. */ extern int myrank; @@ -257,3 +259,173 @@ struct mpiuse_log_entry *mpiuse_get_log(int ind) { if (ind < mpiuse_log_count && ind >= 0) return &mpiuse_log[ind]; return NULL; } + +/** + * @brief return random number from a upper part of gaussian distribution. + * + * @result the random. + */ +static double gauss_rand_upper(void) { + + double V1, V2, S; + do { + double U1 = drand48(); + double U2 = drand48(); + + V1 = U1 - 1.0; + V2 = U2 - 1.0; + S = V1 * V1 + V2 * V2; + } while (S >= 1.0 || S == 0.0); + + return fabs(V1 * sqrt(-2.0 * log(S) / S)); +} + +/** + * @brief generate a list of fake exchanges as mpiuse logs. + * + * @param nr_nodes the number of ranks that will be used. + * @param nr_logs the number of logs to generate per rank. + * @param size bytes per message, unless random when this is the maximum + * and the minimum is 1 for uniform, if using a gaussian + * distribution the value is a 2.5 sigma, for a CDF based + * selection this is just a scale factor of the values. + * @param random whether to use random sizes. + * @param seed the random seed, use same for fixed sequences. + * @param uniform whether to use a uniform distribution or gaussian, unless + * cdf is defined, in which case this parameter is ignored. + * @param cdf text file containing a normalized CDF to use as a basis for + * inverse transform sampling of the randoms. NULL for no file. + * @param odata text file containing a values representing occurences of the + * expected distribution -- converted into a normalised CDF to + * use as a basis for inverse transform sampling of the + * randoms. NULL for no file. Not used if cdf is not NULL. + */ +void mpiuse_log_generate(int nr_nodes, int nr_logs, int size, int random, + long int seed, int uniform, const char *cdf, + const char *odata) { + + /* Only used for CDFs, may need to increase these. */ + int nvals = 0; + double imin[NHIST], imax[NHIST], value[NHIST]; + + /* Note that each rank exchanges messages with all the others and each "log" + * has the same size. */ + /* Set seed. */ + if (random) srand48(seed); + + /* Check for CDF. This should be based on some real distribution, the format + * is same as output from TOPCAT, i.e. bin-low, bin-high, value space + * separated values. Note the value column should be normalised into the + * range 0 to 1 so that it maps into a uniform random distribution. */ + if (cdf != NULL) { + FILE *infile = fopen(cdf, "r"); + if (infile == NULL) error("Failed to open CDF file: %s", cdf); + char line[132]; + while (!feof(infile)) { + if (fgets(line, 132, infile) != NULL) { + if (line[0] != '#') { + int nread = sscanf(line, "%lf %lf %lf", &imin[nvals], &imax[nvals], + &value[nvals]); + if (nread == 3) nvals++; + } + } + } + fclose(infile); + } else if (odata != NULL) { + double *values; + int nvalues; + if (histread(odata, &values, &nvalues)) { + // printf("## Read %d occurence values from %s\n", nvalues, odata); + struct histogram *h = calloc(1, sizeof(struct histogram)); + histmake(nvalues, values, h); + // printf("## Created cumulative histogram with %d values:\n", + // h->nvalues); printf("# value sum\n"); + imin[0] = 0.0; + imax[0] = h->values[0]; + value[0] = h->sums[0]; + for (int k = 1; k < h->nvalues; k++) { + imin[k] = h->values[k - 1]; + imax[k] = h->values[k]; + value[k] = h->sums[k]; + // printf("%f %24.17g\n", h->values[k], h->sums[k]); + } + nvals = h->nvalues; + + free(h); + free(values); + } else { + error("Failed to read occurrence data from file: %s", odata); + } + } + + /* Message tags increment with across rank logs. */ + int tag = 1; + for (int k = 0; k < nr_logs; k++) { + + /* Set size for this messages. */ + double logsize = size; + if (random) { + if (cdf || odata) { + /* CDF based randoms. */ + double rand = drand48(); + + /* Binary search for containing bin for this rand. */ + unsigned int lower = 0; + unsigned int upper = nvals; + unsigned int middle = 0; + while (lower < upper) { + middle = (upper + lower) / 2; + if (rand > value[middle]) + lower = middle + 1; + else + upper = middle; + } + logsize = 0.5 * (imax[middle] + imin[middle]); + + } else if (uniform) { + /* Uniform randoms in the range 0 to 1 */ + logsize = (drand48() * (double)size) + 1; + } else { + // Gaussian randoms so no maximum, assume size is 2.5 sigma. + logsize = (gauss_rand_upper() * (double)size * 0.25) + 1; + } + } + + /* Cannot send more than 2^31-1 bytes at a time, so truncate. */ + if (logsize > 2147483647.0) { + message("CDF size too large : %f, truncating", logsize); + logsize = 2147483647.0; + } + + for (int i = 0; i < nr_nodes; i++) { + for (int j = 0; j < nr_nodes; j++) { + if (i != j) { + mpiuse_log_allocation(i, 1, k, SEND_TYPE, NO_SUBTYPE, 1, + (size_t)logsize, j, tag); + mpiuse_log_allocation(j, 1, k, RECV_TYPE, NO_SUBTYPE, 1, + (size_t)logsize, i, tag); + } + } + } + tag++; + } +} + +/** + * Shuffle log pointers randomizing the order. + * + * Note assumes dran48() has been seeded. + * + * @param logs the log pointers to shuffle. + * @param nlogs the number of logs. + */ +void mpiuse_shuffle_logs(struct mpiuse_log_entry **logs, int nlogs) { + + struct mpiuse_log_entry tmp; + for (int k = nlogs - 1; k > 0; k--) { + unsigned int j = (unsigned int)(drand48() * (k + 1)); + memcpy(&tmp, &logs[j], sizeof(struct mpiuse_log_entry *)); + memcpy(&logs[j], &logs[k], sizeof(struct mpiuse_log_entry *)); + memcpy(&logs[k], &tmp, sizeof(struct mpiuse_log_entry *)); + } +} diff --git a/mpiuse.h b/mpiuse.h index 707ab27547b14fc701fcfd53254d29d603007891..71d8982f0cbb020dd6a563d2f8cb756ccacf3dd5 100644 --- a/mpiuse.h +++ b/mpiuse.h @@ -84,6 +84,13 @@ struct mpiuse_log_entry { ticks tmin; }; +/* Flags for the types of request when generating fakes. */ +#ifndef SEND_TYPE +#define SEND_TYPE 25 +#define RECV_TYPE 26 +#define NO_SUBTYPE 0 +#endif + /* API. */ void mpiuse_log_allocation(int rank, int step, size_t tic, int type, int subtype, int activation, size_t size, @@ -94,4 +101,9 @@ int mpiuse_nr_logs(void); int mpiuse_nr_ranks(void); void mpiuse_dump_logs(int nranks, const char *logfile); +void mpiuse_log_generate(int nr_nodes, int nr_logs, int size, int random, + long int seed, int uniform, const char *cdf, + const char *odata); +void mpiuse_shuffle_logs(struct mpiuse_log_entry **logs, int nlogs); + #endif /* SWIFT_MPIUSE_H */ diff --git a/swiftmpifakestepsim.c b/swiftmpifakestepsim.c new file mode 100644 index 0000000000000000000000000000000000000000..d14d8538de4f2cd51d599881b6dcd4e9ef6678c8 --- /dev/null +++ b/swiftmpifakestepsim.c @@ -0,0 +1,572 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (c) 2021 Peter W. Draper + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ + +#include <limits.h> +#include <mpi.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include "atomic.h" +#include "clocks.h" +#include "error.h" +#include "mpiuse.h" + +/* Global: Our rank for all to see. */ +int myrank = -1; + +/* Are we verbose. */ +static int verbose = 0; + +/* Set a data pattern and check we get this back, slow... */ +static int datacheck = 0; + +/* Default seed for pseudorandoms. */ +static long int default_seed = 1987654321; + +/* MPI communicator for each rank. XXX static XXX. */ +static MPI_Comm node_comms[512]; + +/* The local queues. */ +static struct mpiuse_log_entry **volatile reqs_queue; +static int volatile ind_req = 0; +static int volatile nr_reqs = 0; +static int volatile injecting = 1; +static struct mpiuse_log_entry **volatile recvs_queue; +static int volatile nr_recvs = 0; +static int volatile ind_recv = 0; +static int volatile todo_recv = 0; +static struct mpiuse_log_entry **volatile sends_queue; +static int volatile nr_sends = 0; +static int volatile ind_send = 0; +static int volatile todo_send = 0; + +/** + * @brief fill a data area with a pattern that can be checked for changes. + * + * @param size size of data in bytes. + * @param data the data to fill. + */ +static void datacheck_fill(size_t size, void *data) { + unsigned char *p = (unsigned char *)data; + for (size_t i = 0; i < size; i++) { + p[i] = 170; /* 10101010 in bits. */ + } +} + +/** + * @brief test a filled data area for our pattern. + * + * @param size size of data in bytes. + * @param data the data to fill. + * + * @result 1 on success, 0 otherwise. + */ +static int datacheck_test(size_t size, void *data) { + unsigned char *p = (unsigned char *)data; + for (size_t i = 0; i < size; i++) { + if (p[i] != 170) return 0; + } + return 1; +} + +/** + * @brief Injection thread, initiates MPI_Isend and MPI_Irecv requests. + */ +static void *inject_thread(void *arg) { + + if (verbose) message("%d: injection thread starts", *((int *)arg)); + ticks starttics = getticks(); + + while (ind_req < nr_reqs) { + struct mpiuse_log_entry *log = reqs_queue[ind_req]; + + /* Initialise new log elements. */ + log->done = 0; + log->nr_tests = 0; + log->tsum = 0.0; + log->tmax = 0; + log->tmin = INT_MAX; + log->endtic = 0; + log->injtic = getticks(); + + /* Differences to SWIFT: MPI_BYTE not the MPI_Type. */ + int err = 0; + if (log->type == SEND_TYPE) { + log->data = calloc(log->size, 1); + + /* Fill data with pattern. */ + if (datacheck) datacheck_fill(log->size, log->data); + + /* And send. */ + err = MPI_Isend(log->data, log->size, MPI_BYTE, log->otherrank, log->tag, + node_comms[log->rank], &log->req); + + /* Add a new send request. */ + int ind = atomic_inc(&nr_sends); + sends_queue[ind] = log; + atomic_inc(&todo_send); + + } else { + + /* Ready to receive. */ + log->data = calloc(log->size, 1); + err = MPI_Irecv(log->data, log->size, MPI_BYTE, log->otherrank, log->tag, + node_comms[log->otherrank], &log->req); + + /* Add a new recv request. */ + int ind = atomic_inc(&nr_recvs); + recvs_queue[ind] = log; + atomic_inc(&todo_recv); + } + if (err != MPI_SUCCESS) error("Failed to activate send or recv"); + + ind_req++; + } + + /* All done, thread exiting. */ + if (verbose) { + message("%d injections completed, sends = %d, recvs = %d", ind_req, + nr_sends, nr_recvs); + message("remaining sends = %d, recvs = %d", todo_send, todo_recv); + } + message("took %.3f %s.", clocks_from_ticks(getticks() - starttics), + clocks_getunit()); + atomic_dec(&injecting); + return NULL; +} + +/** + * @brief main loop to run over a queue of MPI requests and test for when they + * complete. Returns the total amount of time spent in calls to MPI_Test and + * the number of times it was called. + * + * @param logs the list of logs pointing to requests. + * @param nr_logs pointer to the variable containing the current number of + * logs. + * @param todos pointer to the variable containing the number of requests that + * are still active. + * @param sum the total number of ticks spent in calls to MPI_Test. + * @param ncalls the total number of calls to MPI_Test. + * @param mint the minimum ticks an MPI_Test call took. + * @param maxt the maximum ticks an MPI_Test call took. + */ +static void queue_runner(struct mpiuse_log_entry **logs, int volatile *nr_logs, + int volatile *todos, double *sum, int *ncalls, + ticks *mint, ticks *maxt) { + + /* Global MPI_Test statistics. */ + int lncalls = 0; + double lsum = 0.0; + ticks lmint = INT_MAX; + ticks lmaxt = 0; + + /* We loop while new requests are being injected and we still have requests + * to complete. */ + while (injecting || (!injecting && *todos > 0)) { + int nlogs = *nr_logs; + for (int k = 0; k < nlogs; k++) { + struct mpiuse_log_entry *log = logs[k]; + if (log != NULL && !log->done) { + ticks tics = getticks(); + int res; + MPI_Status stat; + int err = MPI_Test(&log->req, &res, &stat); + if (err != MPI_SUCCESS) { + error("MPI_Test call failed"); + } + + /* Increment etc. of statistics about time in MPI_Test. */ + ticks dt = getticks() - tics; + log->tsum += (double)dt; + lsum += (double)dt; + + log->nr_tests++; + lncalls++; + + if (dt < log->tmin) log->tmin = dt; + if (dt > log->tmax) log->tmax = dt; + if (dt < lmint) lmint = dt; + if (dt > lmaxt) lmaxt = dt; + + if (res) { + /* Check data sent data is unchanged and received data is as + * expected. */ + if (datacheck && !datacheck_test(log->size, log->data)) { + error("Data mismatch on completion"); + } + + /* Done, clean up. */ + log->done = 1; + log->endtic = getticks(); + free(log->data); + atomic_dec(todos); + } + } + } + } + + /* All done. */ + *sum = lsum; + *ncalls = lncalls; + *mint = lmint; + *maxt = lmaxt; + return; +} + +/** + * @brief Send thread, checks if MPI_Isend requests have completed. + */ +static void *send_thread(void *arg) { + + if (verbose) message("%d: send thread starts (%d)", *((int *)arg), injecting); + ticks starttics = getticks(); + + int ncalls; + double sum; + ticks mint; + ticks maxt; + queue_runner(sends_queue, &nr_sends, &todo_send, &sum, &ncalls, &mint, &maxt); + + message( + "%d MPI_Test calls took: %.3f, mean time %.3f, min time %.3f, max time " + "%.3f (%s)", + ncalls, clocks_from_ticks(sum), clocks_from_ticks(sum / ncalls), + clocks_from_ticks(mint), clocks_from_ticks(maxt), clocks_getunit()); + if (verbose) + message("took %.3f %s.", clocks_from_ticks(getticks() - starttics), + clocks_getunit()); + + /* Thread exits. */ + return NULL; +} + +/** + * @brief Recv thread, checks if MPI_Irecv requests have completed. + */ +static void *recv_thread(void *arg) { + + if (verbose) message("%d: recv thread starts", *((int *)arg)); + ticks starttics = getticks(); + + int ncalls; + double sum; + ticks mint; + ticks maxt; + queue_runner(recvs_queue, &nr_recvs, &todo_recv, &sum, &ncalls, &mint, &maxt); + + message( + "%d MPI_Test calls took: %.3f, mean time %.3f, min time %.3f, max time " + "%.3f (%s)", + ncalls, clocks_from_ticks(sum), clocks_from_ticks(sum / ncalls), + clocks_from_ticks(mint), clocks_from_ticks(maxt), clocks_getunit()); + if (verbose) + message("took %.3f %s.", clocks_from_ticks(getticks() - starttics), + clocks_getunit()); + + /* Thread exits. */ + return NULL; +} + +/** + * @brief Comparison function for logged times. + */ +static int cmp_logs(const void *p1, const void *p2) { + struct mpiuse_log_entry *l1 = *(struct mpiuse_log_entry **)p1; + struct mpiuse_log_entry *l2 = *(struct mpiuse_log_entry **)p2; + + /* Large unsigned values, so take care. */ + if (l1->tic > l2->tic) return 1; + if (l1->tic < l2->tic) return -1; + return 0; +} + +/** + * @brief Pick out the relevant logging data for our rank, i.e. all + * activations of sends and recvs. We ignore the original completions. + * The final list is sorted into increasing time of activation if required, + * otherwise the order is randomized. + * + * @param random randomize injection order of sends or sends and recvs, + * otherwise use order of the original logs. Just sends + * are randomized when 1, 2 randomizes both, 0 neither. + */ +static void pick_logs(int random) { + size_t nlogs = mpiuse_nr_logs(); + + /* Duplicate of logs. */ + reqs_queue = (struct mpiuse_log_entry **)calloc( + nlogs, sizeof(struct mpiuse_log_entry *)); + nr_reqs = 0; + sends_queue = (struct mpiuse_log_entry **)calloc( + nlogs, sizeof(struct mpiuse_log_entry *)); + nr_sends = 0; + recvs_queue = (struct mpiuse_log_entry **)calloc( + nlogs, sizeof(struct mpiuse_log_entry *)); + nr_recvs = 0; + + if (random == 0 || random == 2) { + for (int k = 0; k < nlogs; k++) { + struct mpiuse_log_entry *log = mpiuse_get_log(k); + if (log->rank == myrank && log->activation) { + log->data = NULL; + reqs_queue[nr_reqs] = log; + nr_reqs++; + } + } + + if (random == 0) { + /* Sort into increasing time. */ + qsort(reqs_queue, nr_reqs, sizeof(struct mpiuse_log_entry *), cmp_logs); + } else { + /* Randomize the order, so ranks do not all work in sequence. */ + mpiuse_shuffle_logs(reqs_queue, nr_reqs); + } + + /* Check. */ + if (random == 0) { + for (int k = 0; k < nr_reqs - 1; k++) { + if (reqs_queue[k]->tic > reqs_queue[k + 1]->tic) + message("reqs_queue: %lld > %lld", reqs_queue[k]->tic, + reqs_queue[k + 1]->tic); + } + } + } else { + + /* Randomizing the sends, but injecting the recvs first. */ + + /* Get recvs. */ + int nrecv = 0; + for (int k = 0; k < nlogs; k++) { + struct mpiuse_log_entry *log = mpiuse_get_log(k); + if (log->rank == myrank && log->activation && log->type == RECV_TYPE) { + log->data = NULL; + reqs_queue[nr_reqs] = log; + nr_reqs++; + nrecv++; + } + } + + /* These are sorted into log time order. */ + qsort(reqs_queue, nrecv, sizeof(struct mpiuse_log_entry *), cmp_logs); + + /* Now the sends. */ + int nsend = 0; + for (int k = 0; k < nlogs; k++) { + struct mpiuse_log_entry *log = mpiuse_get_log(k); + if (log->rank == myrank && log->activation && log->type == SEND_TYPE) { + log->data = NULL; + reqs_queue[nr_reqs] = log; + nr_reqs++; + nsend++; + } + } + + /* These are randomized. */ + mpiuse_shuffle_logs(&reqs_queue[nrecv], nsend); + } +} + +/** + * @brief usage help. + */ +static void usage(char *argv[]) { + fprintf(stderr, "Usage: %s [options] nr_messages logfile.dat\n", argv[0]); + fprintf(stderr, + " options: -v verbose, -d data check, -s size (bytes/scale), \n" + "\t -f <1|2> randomize injection order, 1 == just sends, " + "2 == sends and recvs\n" + "\t[-r uniform random from 1 to size, | \n" + "\t-r -g half gaussian random from 1 with 2.5 sigma size., | \n" + "\t-r -c <file> use cdf from file, size is a scale factor., |\n" + "\t-r -o <file> use occurence sample of values in a file, size is a " + "scale factor.,] \n" + "\t-x random seed\n"); + fflush(stderr); +} + +/** + * @brief main function. + */ +int main(int argc, char *argv[]) { + + /* Initiate MPI. */ + int prov = 0; + int res = MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &prov); + if (res != MPI_SUCCESS) + error("Call to MPI_Init_thread failed with error %i.", res); + + int nr_nodes = 0; + res = MPI_Comm_size(MPI_COMM_WORLD, &nr_nodes); + if (res != MPI_SUCCESS) error("MPI_Comm_size failed with error %i.", res); + + res = MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + if (res != MPI_SUCCESS) + error("Call to MPI_Comm_rank failed with error %i.", res); + + /* Handle the command-line, we expect the number of messages to exchange per + * rank an output log and some options, the interesting ones are a size and + * whether to use a random selections of various kinds. */ + int size = 1024; + int random = 0; + int randomorder = 0; + int uniform = 1; + char *cdf = NULL; + char *odata = NULL; + int opt; + unsigned int seed = default_seed; + while ((opt = getopt(argc, argv, "vds:rgx:c:o:f:")) != -1) { + switch (opt) { + case 'd': + datacheck = 1; + break; + case 'c': + cdf = optarg; + break; + case 'f': + randomorder = atoi(optarg); + break; + case 'g': + uniform = 0; + break; + case 's': + size = atoi(optarg); + break; + case 'r': + random = 1; + break; + case 'o': + odata = optarg; + break; + case 'v': + verbose = 1; + break; + case 'x': + seed = atol(optarg); + break; + default: + if (myrank == 0) usage(argv); + return 1; + } + } + if (optind >= argc - 1) { + if (myrank == 0) usage(argv); + return 1; + } + if (cdf != NULL && odata != NULL) + error("Cannot use -c and -o options together"); + + int nr_logs = atoi(argv[optind]); + if (nr_logs == 0) + error("Expected number of messages to exchange, got: %s", argv[optind]); + char *logfile = argv[optind + 1]; + + /* Generate the fake logs for the exchanges. */ + if (myrank == 0) { + if (random) { + if (cdf != NULL) { + message( + "Generating %d fake logs for %d ranks with randoms" + " based on cdf %s scaled by factor %d", + nr_logs, nr_nodes, cdf, size); + } else if (odata != NULL) { + message( + "Generating %d fake logs for %d ranks with randoms" + " based on occurence data %s scaled by factor %d", + nr_logs, nr_nodes, cdf, size); + + } else if (uniform) { + message( + "Generating %d fake logs for %d ranks with random distribution" + " using size %d", + nr_logs, nr_nodes, size); + } else { + message( + "Generating %d fake logs for %d ranks with gaussian random " + "distribution using size %d as 2.5 sigma", + nr_logs, nr_nodes, size); + } + } else { + message("Generating %d fake logs for %d ranks of size %d", nr_logs, + nr_nodes, size); + } + } + mpiuse_log_generate(nr_nodes, nr_logs, size, random, seed, uniform, cdf, + odata); + int nranks = mpiuse_nr_ranks(); + + /* Create communicators for each MPI rank. */ + for (int i = 0; i < nr_nodes; i++) { + MPI_Comm_dup(MPI_COMM_WORLD, &node_comms[i]); + } + + /* Each rank requires its own queue, so extract them. */ + if (verbose && myrank == 0) { + if (randomorder == 0) message("Message order same as generated"); + if (randomorder == 1) message("Message send order randomized"); + if (randomorder == 2) message("Message send and recv order randomized"); + } + + pick_logs(randomorder); + + /* Time to start time. Try to make it synchronous across the ranks. */ + MPI_Barrier(MPI_COMM_WORLD); + clocks_set_cpufreq(0); + if (myrank == 0) { + message("Start of MPI tests"); + message("=================="); + if (verbose) { + if (datacheck) + message("checking data pattern on send and recv completion"); + } + } + + /* Make three threads, one for injecting tasks and two to check for + * completions of the sends and recv independently. */ + pthread_t injectthread; + if (pthread_create(&injectthread, NULL, &inject_thread, &myrank) != 0) + error("Failed to create injection thread."); + pthread_t sendthread; + if (pthread_create(&sendthread, NULL, &send_thread, &myrank) != 0) + error("Failed to create send thread."); + pthread_t recvthread; + if (pthread_create(&recvthread, NULL, &recv_thread, &myrank) != 0) + error("Failed to create recv thread."); + + /* Wait until all threads have exited and all MPI requests have completed. */ + pthread_join(injectthread, NULL); + pthread_join(sendthread, NULL); + pthread_join(recvthread, NULL); + + /* Dump the updated MPI logs. */ + MPI_Barrier(MPI_COMM_WORLD); + fflush(stdout); + if (myrank == 0) message("Dumping updated log"); + mpiuse_dump_logs(nranks, logfile); + + /* Shutdown MPI. */ + res = MPI_Finalize(); + if (res != MPI_SUCCESS) + error("call to MPI_Finalize failed with error %i.", res); + + if (myrank == 0) message("Bye"); + + return 0; +} diff --git a/swiftmpistepsim.c b/swiftmpistepsim.c index 1bbd67ad0c08bef6c82597a4290ebfeead30c362..8c5580ccb95ce6befafc88f07b82f69587f46c09 100644 --- a/swiftmpistepsim.c +++ b/swiftmpistepsim.c @@ -76,7 +76,7 @@ static double log_clocks_cpufreq = 2194844448.0; static void datacheck_fill(size_t size, void *data) { unsigned char *p = (unsigned char *)data; for (size_t i = 0; i < size; i++) { - p[i] = 170; /* 10101010 in bits. */ + p[i] = 170; /* 10101010 in bits. */ } } @@ -488,7 +488,8 @@ int main(int argc, char *argv[]) { message("=================="); if (verbose) { if (!usetics) message("using fast untimed injections"); - if (datacheck) message("checking data pattern on send and recv completion"); + if (datacheck) + message("checking data pattern on send and recv completion"); } }