swiftmpistepsim / Commit 0cd76249

Cleanup

Authored 5 years ago by Peter W. Draper
Parent: fae46c95
Merge request: !11 Draft: Fast one-sided MPI version

Showing 1 changed file: swiftmpirdmastepsim.c (63 additions, 132 deletions)
@@ -20,23 +20,22 @@
 // Fully one sided approach with passive target communication. This means only
 // the sending side updates the window buffer and since we have threaded
 // access we can only use flushes with a shared lock that is permanently open
-// to move data. The send side has no window, as it only pushes data.
+// to move data. The send side has no associated window, as it only pushes data.
 //
 // So each rank needs a receive window that has room for all the expected
-// sends, plus an additional element for controlling the readyness of the data
+// sends, plus additional elements for controlling the readiness of the data
 // (this is an atomic send that should be guaranteed to only be updated after
-// the send of the main data).
+// the send of the main data) and correctness.
 //
 // In this implementation the size of the receive buffer per rank is just the
 // sum of all the messages we know we are about to get. The order of that
-// buffer is determined by the rank and tags, which gives us a list of offsets
-// into the buffer mapped by the ranktag, which we need to share with any rank
-// that is expected to send us data. We'll send this data using normal MPI
-// (could be done either as another extension into the window, which we get,
-// but we'd need to synchronize that across all ranks, or we could use the
-// global communicator to share this in a similar fashion).
-//
-// XXX ranktag per subtype is not unique we need to include the size.
+// buffer is determined by the send and receive rank and the tag, which gives
+// us a list of offsets into the buffer mapped by the ranktag, which we need
+// to share with any rank that is expected to send us data. We'll send this
+// data using normal MPI (could be done either as another extension into the
+// window, which we get, but we'd need to synchronize that across all ranks,
+// or we could use the global communicator to share this in a similar
+// fashion).

 #include <limits.h>
 #include <mpi.h>
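These design notes are the heart of the simulator: passive-target one-sided MPI with a permanently held shared lock, data pushed by the sender, and an atomic "ready" flag written last. A minimal, self-contained sketch of that pattern follows; it is not the project's code, and the two-element window layout and names are illustrative only.

    #include <mpi.h>
    #include <stdio.h>

    /* Run with at least 2 ranks: rank 1 pushes one value plus a ready flag
     * into rank 0's window. Assumes the "unified" RMA memory model so the
     * local polling loop is strictly correct. */
    int main(int argc, char *argv[]) {
      MPI_Init(&argc, &argv);
      int rank;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);

      MPI_Aint *base = NULL;
      MPI_Win win;
      /* Two elements: [0] = ready flag, [1] = payload. */
      MPI_Win_allocate(2 * sizeof(MPI_Aint), sizeof(MPI_Aint), MPI_INFO_NULL,
                       MPI_COMM_WORLD, &base, &win);
      base[0] = 0; /* not ready */
      base[1] = 0;

      /* Shared lock on all ranks, held for the whole run, as in the code
       * above. */
      MPI_Win_lock_all(MPI_MODE_NOCHECK, win);
      MPI_Barrier(MPI_COMM_WORLD);

      if (rank == 1) {
        MPI_Aint payload = 42, ready = 1;
        /* Push the payload and complete it at the target... */
        MPI_Accumulate(&payload, 1, MPI_AINT, 0, 1, 1, MPI_AINT, MPI_REPLACE,
                       win);
        MPI_Win_flush(0, win);
        /* ...then flip the flag, so it can only be seen after the data. */
        MPI_Accumulate(&ready, 1, MPI_AINT, 0, 0, 1, MPI_AINT, MPI_REPLACE,
                       win);
        MPI_Win_flush(0, win);
      } else if (rank == 0) {
        while (*(volatile MPI_Aint *)&base[0] == 0) /* poll the flag */
          ;
        printf("got payload %ld\n", (long)base[1]);
      }

      MPI_Barrier(MPI_COMM_WORLD);
      MPI_Win_unlock_all(win);
      MPI_Win_free(&win);
      MPI_Finalize();
      return 0;
    }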
@@ -67,21 +66,24 @@ int myrank = -1;
 /* Number of ranks. */
 static int nr_ranks;

+/* Bit shift to accomodate all the bits of the maximum rank id. */
+static int rank_shift = 0;
+
 /* Maximum no. of messages (logs). */
 static size_t max_logs = 0;

-/* Flags for controlling access. */
+/* Flags for controlling access. High end of size_t. */
 static size_t UNLOCKED = (((size_t)2 << 63) - 1);

 /* Size of a block of memory. All addressible memory chunks need to be a
- * multiple of this as as we need to align sends in memory. */
+ * multiple of this as we need to align sends and receives in memory. */
 #define BLOCKTYPE size_t
 #define MPI_BLOCKTYPE MPI_AINT
 static const int BYTESINBLOCK = sizeof(BLOCKTYPE);

-/* Size of message header in blocks. The flag, size and tag. Note size and tag
- * are just for sanity checks. The flag value controls access to the main data
- * areas. */
+/* Size of message header in blocks. The unlocked flag, size and tag. Note
+ * size and tag are just for sanity checks. The flag value controls access to
+ * the main data areas. */
 static const size_t HEADER_SIZE = 3;

 /* Are we verbose. */
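Two details worth unpacking here. First, on a 64-bit size_t the expression (((size_t)2 << 63) - 1) wraps around to SIZE_MAX, hence the "high end of size_t" comment. Second, all window offsets and sizes are kept in whole BLOCKTYPE blocks to preserve alignment. The file's toblocks()/tobytes() helpers do this conversion; their bodies are not shown in this diff, so the ones below are an assumed sketch consistent with their documented behaviour (toblocks rounds up).

    #include <stdio.h>

    #define BLOCKTYPE size_t
    static const int BYTESINBLOCK = sizeof(BLOCKTYPE);

    /* Round a byte count up to whole blocks (assumed implementation). */
    static size_t toblocks(size_t nr_bytes) {
      return (nr_bytes + (BYTESINBLOCK - 1)) / BYTESINBLOCK;
    }

    /* Convert a block count back into bytes (assumed implementation). */
    static size_t tobytes(size_t nr_blocks) { return nr_blocks * BYTESINBLOCK; }

    int main(void) {
      /* 13 bytes needs 2 eight-byte blocks, i.e. 16 bytes of window space. */
      printf("13 bytes -> %zu blocks -> %zu bytes\n", toblocks(13),
             tobytes(toblocks(13)));
      return 0;
    }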
@@ -106,7 +108,6 @@ static BLOCKTYPE *mpi_ptr[task_subtype_count] = {NULL};
 static size_t ranktag_sizes[task_subtype_count] = {0};
 static size_t *ranktag_counts;
 static size_t *ranktag_offsets;
-;
 static size_t *ranktag_lists;

 /* The local send queue. */
@@ -120,7 +121,7 @@ static int volatile nr_recv = 0;
 static int volatile todo_recv = 0;

 /**
- * @brief Convert ranks and tag into a single unique value.
+ * @brief Convert two ranks and tag into a single unique value.
  *
  * Assumes there is enough space in a size_t for these values.
  *
@@ -131,21 +132,10 @@ static int volatile todo_recv = 0;
  * @result a unique value based on both values
  */
 static size_t toranktag(int sendrank, int recvrank, int tag) {
-  int shift = (sizeof(int) * 8) - __builtin_clz(nr_ranks); /* XXX could precalc. */
-  //message("nr_ranks = %d, shift = %d", nr_ranks, shift);
-  size_t result = sendrank | recvrank << shift | tag << (shift * 2);
+  size_t result = sendrank | recvrank << rank_shift | tag << (rank_shift * 2);
   return result;
 }

-static char *tokey(struct mpiuse_log_entry *log) {
-  static char buf[256];
-  sprintf(buf, "%d/%d/%d/%zd on %d ranktags %zd/%zd", log->otherrank, log->tag,
-          log->subtype, log->size, log->rank,
-          toranktag(log->rank, log->otherrank, log->tag),
-          toranktag(log->rank, log->otherrank, log->tag));
-  return buf;
-}
-
 /**
  * @brief Convert a byte count into a number of blocks, rounds up.
  *
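The change above replaces a per-call shift computation with the new file-scope rank_shift, precalculated once in main(). A small self-contained sketch of the packing: sender rank in the low bits, receiver rank shifted up by rank_shift, tag above both. The example values are illustrative.

    #include <limits.h>
    #include <stdio.h>

    int main(void) {
      int nr_ranks = 6;
      /* Index of the most significant bit of the maximum rank id (GCC
       * builtin), as now done once in main() in the real code. */
      int rank_shift = (sizeof(int) * CHAR_BIT) - __builtin_clz(nr_ranks);

      int sendrank = 3, recvrank = 5, tag = 7;
      size_t ranktag = sendrank | recvrank << rank_shift | tag << (rank_shift * 2);

      /* nr_ranks = 6 needs 3 bits, so: 3 | 5<<3 | 7<<6 = 491. */
      printf("shift = %d, ranktag = %zu\n", rank_shift, ranktag);
      return 0;
    }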
@@ -179,7 +169,8 @@ static void datacheck_fill(BLOCKTYPE size, BLOCKTYPE *data) {
 }

 /**
- * @brief test a filled data area for a value.
+ * @brief test a filled data area for a value, reports if any unexpected value
+ * is found.
  *
  * @param size size of data in bytes.
  * @param data the data to check.
@@ -229,37 +220,27 @@ static void *send_thread(void *arg) {
      * subtype, tag and rank. Need to search the ranktag_lists for our ranktag
      * value. XXX bisection search XXX */
     size_t ranktag = toranktag(log->rank, log->otherrank, log->tag);
-    //message("looking for %s", tokey(log));
     size_t counts = ranktag_counts[INDEX2(task_subtype_count, log->subtype,
                                           log->otherrank)];
     size_t offset = 0;
     int found = 0;
-    struct mpiuse_log_entry *keeplog = NULL;
-    counts = max_logs;
+    counts = max_logs;  // XXX do we still need this?
     for (size_t j = 0; j < counts; j++) {
       if (ranktag_lists[INDEX3(task_subtype_count, nr_ranks, log->subtype,
                                log->otherrank, j)] == ranktag) {
-        if (found)
-          message("duplicate %zd: %s of %s", ranktag, tokey(log), tokey(keeplog));
         offset = ranktag_offsets[INDEX3(task_subtype_count, nr_ranks,
                                         log->subtype, log->otherrank, j)];
-        keeplog = log;
-        //message("%d/%d located offset %zd one of %zd ranktag: %zd", log->otherrank,
-        //        log->subtype, offset, counts, ranktag);
         found = 1;
-        // break;
+        break;
       }
     }

     if (!found) {
       fflush(stdout);
       fflush(stderr);
-      error("Failed sending a message of size %zd to %d/%d @ %zd\n, no offset "
-            "found for ranktag %zd, counts = %zd",
-            datasize, log->otherrank, log->subtype, offset, ranktag, counts);
+      error("Failed sending a message of size %zd to %d/%d "
+            "@ %zd\n, no offset found for ranktag %zd, counts = %zd",
+            datasize, log->otherrank, log->subtype, offset, ranktag, counts);
     }

-    //message("Sending a message of size %zd to %d/%d @ %zd", datasize,
-    //        log->otherrank, log->subtype, offset);
-
     /* And send data to other rank. */
     int ret = MPI_Accumulate(dataptr, datasize, MPI_BLOCKTYPE, log->otherrank,
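The INDEX2/INDEX3 macros used above are defined earlier in the file and do not appear in this diff. The definitions below are therefore an assumption, a row-major flattening consistent with the call sites, where nx and ny are the extents of the first two axes.

    #include <stdio.h>

    /* Assumed layouts, not copied from the source file. */
    #define INDEX2(nx, x, y) ((nx) * (y) + (x))
    #define INDEX3(nx, ny, x, y, z) ((nx) * (ny) * (z) + (nx) * (y) + (x))

    int main(void) {
      /* E.g. slot j of the ranktag list for a (subtype, otherrank) pair. */
      int task_subtype_count = 30, nr_ranks = 4;
      int subtype = 2, otherrank = 3;
      size_t j = 5;
      printf("flat index = %zu\n",
             (size_t)INDEX3(task_subtype_count, nr_ranks, subtype, otherrank, j));
      return 0;
    }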
@@ -286,19 +267,6 @@ static void *send_thread(void *arg) {
     ret = MPI_Win_flush(log->otherrank, mpi_window[log->subtype]);
     if (ret != MPI_SUCCESS) mpi_error_message(ret, "MPI_Win_flush failed");

-    //if (verbose) {
-    //  if (oldval[0] == dataptr[0]) {
-    //    message("sent a message to %d/%d (%zd:%zd:%zd @ %zd %d/%d)",
-    //            log->otherrank, log->subtype, dataptr[0], oldval[0], newval[0],
-    //            offset, k, nr_send);
-    //  } else {
-    //    message("failed to send a message to %d/%d (%zd:%zd:%zd) @ %zd %d/%d",
-    //            log->otherrank, log->subtype, dataptr[0], oldval[0], newval[0],
-    //            offset, k, nr_send);
-    //  }
-    //}
-    //sleep(1);
-
   }

   message("took %.3f %s.", clocks_from_ticks(getticks() - starttics),
@@ -318,19 +286,12 @@ static void *recv_thread(void *arg) {
           *((int *)arg), nr_recv, nr_ranks, task_subtype_count);
   ticks starttics = getticks();

-  /* Global statistics. */
-  //int lncalls = 0;
-  //double lsum = 0.0;
-  //ticks lmint = INT_MAX;
-  //ticks lmaxt = 0;
-
   /* No. of receives to process. */
   todo_recv = nr_recv;

   /* We loop while new requests are being send and we still have messages
    * to receive. */
   while (todo_recv > 0) {
-    //sleep(5);
     for (int k = 0; k < nr_recv; k++) {
       struct mpiuse_log_entry *log = recv_queue[k];
       if (log != NULL && !log->done) {
@@ -341,28 +302,18 @@ static void *recv_thread(void *arg) {
         /* Check if that part of the window has been unlocked. */
         BLOCKTYPE *dataptr = &mpi_ptr[log->subtype][offset];
         BLOCKTYPE volatile lock = dataptr[0];
-        //message("Checking: %s @ %zd", (dataptr[0] == UNLOCKED?"unlocked":"locked"), offset);
-        //if (atomic_cas(&dataptr[0], UNLOCKED, UNLOCKED)) {
         if (lock == UNLOCKED) {
-          //message("Checked: %s @ %zd", (dataptr[0] == UNLOCKED?"unlocked":"locked"), offset);
           /* OK, so data should be ready for use, check the tag and size. */
           if ((size_t)log->size == dataptr[1] &&
               (size_t)log->tag == dataptr[2]) {
             if (verbose)
-              //message("receive message %d/%d from %d @ offset %zd: "
-              //        "dataptr[0] %zd, size %zd/%zd", log->rank, log->subtype,
-              //        log->otherrank, offset, dataptr[0], log->size,
-              //        toblocks(log->size) + HEADER_SIZE);

             /* Check data sent data is unchanged. */
             if (datacheck) {
               if (!datacheck_test(toblocks(log->size), &dataptr[HEADER_SIZE],
                                   log->otherrank)) {
-                fflush(stdout);
-                fflush(stderr);
                 error("Data mismatch on completion");
               }
             }

             /* Done, clean up. */
             log->done = 1;
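The receive side above never makes an MPI call in the hot path: it simply polls the first header block of each expected message and, once that flips to UNLOCKED, sanity-checks the size and tag blocks before touching the payload. A hypothetical standalone sketch of that header check, assuming the three-block {flag, size, tag} layout established earlier; none of these names come from the file.

    #include <stdio.h>

    #define HEADER_SIZE 3
    typedef unsigned long long block_t;

    /* Returns 1 when the message at dataptr is ready and matches what we
     * expect, 0 while still locked, -1 on a size/tag mismatch. */
    static int check_header(volatile block_t *dataptr, block_t unlocked,
                            block_t expected_size, block_t expected_tag) {
      if (dataptr[0] != unlocked) return 0; /* sender has not flipped the flag */
      if (dataptr[1] != expected_size || dataptr[2] != expected_tag) {
        fprintf(stderr, "Unlocked data has incorrect tag or size\n");
        return -1;
      }
      return 1; /* payload starts at dataptr[HEADER_SIZE] */
    }

    int main(void) {
      block_t unlocked = ~(block_t)0; /* high end of the type, like UNLOCKED */
      block_t msg[HEADER_SIZE + 1] = {unlocked, 8, 42, 0xdeadbeef};
      printf("ready = %d\n", check_header(msg, unlocked, 8, 42));
      return 0;
    }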
@@ -370,20 +321,10 @@ static void *recv_thread(void *arg) {
             if (todo_recv == 0) break;

           } else {
-            message("bad unlocked message %d/%d from %d @ offset %zd: "
-                    "dataptr[0] %zd, size %zd/%zd",
-                    log->rank, log->subtype, log->otherrank, offset,
-                    dataptr[0], log->size, toblocks(log->size) + HEADER_SIZE);
+            error("Unlocked data has incorrect tag or size: %zd/%zd %d/%zd",
+                  log->size, dataptr[1], log->tag, dataptr[2]);
           }
         }
-
-        //else {
-        //  message("unlocked message %d/%d from %d @ offset %zd: "
-        //          "dataptr[0] %zd, size %zd/%zd", log->rank, log->subtype,
-        //          log->otherrank, offset, dataptr[0], log->size,
-        //          toblocks(log->size) + HEADER_SIZE);
-        //}
       }

       /* Need to allow for some MPI progession. Since we make no MPI calls
        * (by intent receive is a passive target so only the sender should
@@ -448,7 +389,6 @@ static void pick_logs() {
       log->done = 0;
       log->data = NULL;
       log->ranktag = toranktag(log->otherrank, log->rank, log->tag);
-      //message("add %s", tokey(log));
       if (log->type == task_type_send) {
         send_queue[nr_send] = log;
         nr_send++;
@@ -478,20 +418,18 @@ static void pick_logs() {
    * of the windows. */
   for (int k = 0; k < nr_recv; k++) {
     struct mpiuse_log_entry *log = recv_queue[k];
     ranktag_lists[INDEX3(task_subtype_count, nr_ranks, log->subtype, myrank,
                          k)] = log->ranktag;
     ranktag_offsets[INDEX3(task_subtype_count, nr_ranks, log->subtype, myrank,
                            k)] = ranktag_sizes[log->subtype];
     log->offset = ranktag_sizes[log->subtype];

     /* Need to use a multiple of blocks to keep alignment. */
     size_t size = toblocks(log->size) + HEADER_SIZE;
-    //message("%d increment offset by %zd blocks", log->subtype, size);
-
     ranktag_sizes[log->subtype] += size;

     ranktag_counts[INDEX2(task_subtype_count, log->subtype, myrank)]++;
   }

   if (verbose)
     message("local logs picked, nr_send = %d, nr_recv = %d ", nr_send,
             nr_recv);
 }

 /**
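The loop above is effectively a running prefix sum: each expected receive is assigned the current per-subtype window size as its offset, then the size grows by the block-rounded message plus its header. A small sketch of that bookkeeping, with illustrative sizes and the same assumed toblocks() rounding as before:

    #include <stdio.h>

    #define HEADER_SIZE 3

    static size_t toblocks(size_t nr_bytes) { /* assumed rounding helper */
      return (nr_bytes + sizeof(size_t) - 1) / sizeof(size_t);
    }

    int main(void) {
      size_t sizes[] = {100, 64, 13}; /* incoming message sizes in bytes */
      size_t running = 0;
      for (int k = 0; k < 3; k++) {
        size_t offset = running; /* this message's slot in the window */
        running += toblocks(sizes[k]) + HEADER_SIZE;
        printf("msg %d: offset %zu blocks\n", k, offset);
      }
      printf("window size: %zu blocks\n", running);
      return 0;
    }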
@@ -557,39 +495,36 @@ int main(int argc, char *argv[]) {
     error("The number of MPI ranks %d does not match the expected value %d",
           nranks, nr_ranks);

+  /* Index of most significant bit in the maximum rank id. Assumes GCC
+   * intrinsic. */
+  rank_shift = (sizeof(int) * CHAR_BIT) - __builtin_clz(nr_ranks);
+
   /* We all need to agree on a maximum count of logs, so we can exchange the
-   * offset arrays (would be ragged otherwise). */
+   * offset arrays (would be ragged otherwise and difficult to exchange). */
   max_logs = mpiuse_nr_logs() / 2 + 1;
   MPI_Allreduce(MPI_IN_PLACE, &max_logs, 1, MPI_AINT, MPI_MAX, MPI_COMM_WORLD);
   if (verbose) message("maximum log count = %zd", max_logs);

-  /* Extract the send and recv messages for our rank. */
+  /* Extract the send and recv messages for our rank and populate the queues. */
   pick_logs();

   /* Now for the one-sided setup... Each rank needs a buffer per communicator
    * with space for all the expected messages. */
   for (int i = 0; i < task_subtype_count; i++) {
     MPI_Comm_dup(MPI_COMM_WORLD, &subtypeMPI_comms[i]);
     size_t size = tobytes(ranktag_sizes[i]);
     if (size == 0) size = BYTESINBLOCK;
     MPI_Win_allocate(size, BYTESINBLOCK, MPI_INFO_NULL, subtypeMPI_comms[i],
                      &mpi_ptr[i], &mpi_window[i]);
     memset(mpi_ptr[i], 170, tobytes(ranktag_sizes[i]));

-    //if (verbose)
-    //  message("Allocated window of size %zd for subtype %d",
-    //          ranktag_sizes[i], i);
-
-    /* Assert a shared lock with all the other processes on this
-     * window. Needed as we use threads, so cannot lock or unlock as a means
-     * of synchronization. */
+    /* Assert a shared lock with all the other processes on this window.
+     * Strictly needed as we use threads, so cannot lock or unlock as
+     * a means of synchronization. */
     MPI_Win_lock_all(MPI_MODE_NOCHECK, mpi_window[i]);
   }

-  //message("Windows allocated");
-
-  /* We need to share all the offsets for each communucator with all the other
+  /* We need to share all the offsets for each communicator with all the other
    * ranks so they can push data into the correct parts of our receive
    * window. */
   MPI_Allreduce(MPI_IN_PLACE, ranktag_offsets,
@@ -600,7 +535,6 @@ int main(int argc, char *argv[]) {
   MPI_Allreduce(MPI_IN_PLACE, ranktag_lists,
                 task_subtype_count * nr_ranks * max_logs, MPI_AINT, MPI_SUM,
                 MPI_COMM_WORLD);
-  //message("check: local logs picked, nr_send = %d, nr_recv = %d ", nr_send, nr_recv);

   /* Time to start time. Try to make it synchronous across the ranks. */
   MPI_Barrier(MPI_COMM_WORLD);
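The offset-sharing step works because each rank fills only its own slice of a zero-initialized global array, so an in-place MPI_SUM allreduce leaves every rank holding everyone's offsets. A minimal sketch of that idiom, with an illustrative one-slot-per-rank shape; like the real code, it uses MPI_AINT for a size_t array, which assumes the two types have the same width.

    #include <mpi.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char *argv[]) {
      MPI_Init(&argc, &argv);
      int rank, nranks;
      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
      MPI_Comm_size(MPI_COMM_WORLD, &nranks);

      /* One slot per rank; only our own slot is nonzero before the
       * reduction, so summing merges all contributions. */
      size_t *offsets = calloc(nranks, sizeof(size_t));
      offsets[rank] = (size_t)(100 + rank);

      MPI_Allreduce(MPI_IN_PLACE, offsets, nranks, MPI_AINT, MPI_SUM,
                    MPI_COMM_WORLD);
      if (rank == 0)
        for (int i = 0; i < nranks; i++)
          printf("offset from rank %d: %zu\n", i, offsets[i]);

      free(offsets);
      MPI_Finalize();
      return 0;
    }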
@@ -618,9 +552,6 @@ int main(int argc, char *argv[]) {
   pthread_t sendthread;
   if (pthread_create(&sendthread, NULL, &send_thread, &myrank) != 0)
     error("Failed to create send thread.");

-  /* XXX could have more than one of these... With a partition of the
-   * subtypes. */
   pthread_t recvthread;
   if (pthread_create(&recvthread, NULL, &recv_thread, &myrank) != 0)
     error("Failed to create recv thread.");
@@ -630,7 +561,7 @@ int main(int argc, char *argv[]) {
   pthread_join(sendthread, NULL);
   pthread_join(recvthread, NULL);

-  /* Free the window locks. Once we all arrive. */
+  /* Free the window locks. Only after we all arrive. */
   MPI_Barrier(MPI_COMM_WORLD);
   for (int i = 0; i < task_subtype_count; i++) {
     MPI_Win_unlock_all(mpi_window[i]);