diff --git a/src/memswap.h b/src/memswap.h
index 8c0e9ec2c0dc119b38358be5346174b6c1a1fe46..92c902eeb158978d4a606f5f2a9416d4113fae0b 100644
--- a/src/memswap.h
+++ b/src/memswap.h
@@ -79,4 +79,63 @@ __attribute__((always_inline)) inline void memswap(void *void_a, void *void_b,
   swap_loop(char, a, b, bytes);
 }
 
+/**
+ * @brief Swap the contents of two elements in-place.
+ *
+ * As opposed to #memswap, this function does not require the parameters
+ * to be aligned in any specific way.
+ * Note that register re-labeling only seems to work when the code is
+ * compiled with @c -funroll-loops.
+ *
+ * @param void_a Pointer to the first element.
+ * @param void_b Pointer to the second element.
+ * @param bytes Size, in bytes, of the data at @c void_a and @c void_b.
+ */
+__attribute__((always_inline)) inline void memswap_unaligned(void *void_a,
+                                                             void *void_b,
+                                                             size_t bytes) {
+  char *a = (char *)void_a, *b = (char *)void_b;
+#ifdef __AVX512F__
+  while (bytes >= sizeof(__m512i)) {  // Unaligned __m512i-sized swaps.
+    register __m512i temp;
+    temp = _mm512_loadu_si512((__m512i *)a);
+    _mm512_storeu_si512((__m512i *)a, _mm512_loadu_si512((__m512i *)b));
+    _mm512_storeu_si512((__m512i *)b, temp);
+    a += sizeof(__m512i);
+    b += sizeof(__m512i);
+    bytes -= sizeof(__m512i);
+  }
+#endif
+#ifdef __AVX__
+  while (bytes >= sizeof(__m256i)) {  // Unaligned __m256i-sized swaps.
+    register __m256i temp;
+    temp = _mm256_loadu_si256((__m256i *)a);
+    _mm256_storeu_si256((__m256i *)a, _mm256_loadu_si256((__m256i *)b));
+    _mm256_storeu_si256((__m256i *)b, temp);
+    a += sizeof(__m256i);
+    b += sizeof(__m256i);
+    bytes -= sizeof(__m256i);
+  }
+#endif
+#ifdef __SSE2__
+  while (bytes >= sizeof(__m128i)) {  // Unaligned __m128i-sized swaps.
+    register __m128i temp;
+    temp = _mm_loadu_si128((__m128i *)a);
+    _mm_storeu_si128((__m128i *)a, _mm_loadu_si128((__m128i *)b));
+    _mm_storeu_si128((__m128i *)b, temp);
+    a += sizeof(__m128i);
+    b += sizeof(__m128i);
+    bytes -= sizeof(__m128i);
+  }
+#endif
+#ifdef __ALTIVEC__
+  // Power8 supports unaligned loads/stores; TODO confirm this is safe here.
+  swap_loop(vector int, a, b, bytes);
+#endif
+  swap_loop(size_t, a, b, bytes);  // NOTE(review): a/b may be misaligned for
+  swap_loop(int, a, b, bytes);     // these wider types; confirm swap_loop
+  swap_loop(short, a, b, bytes);   // tolerates that on all targets.
+  swap_loop(char, a, b, bytes);
+}
+
 #endif /* SWIFT_MEMSWAP_H */