Commit b8683524 authored by Matthieu Schaller

Merge branch 'cell_split' into 'master'

Buffered cell_split

This "works", but my tests crash because of #248, which I get every time I run EAGLE_12 with a single thread.

@pdraper, can you give this a spin through your regular tests to check that there are no hidden bugs/problems I've missed? Thanks!

See merge request !294
parents 004b6f59 5b9ab7fb
@@ -472,8 +472,11 @@ void cell_gunlocktree(struct cell *c) {
* space's parts array, i.e. c->parts - s->parts.
* @param buff A buffer with at least max(c->count, c->gcount) entries,
* used for sorting indices.
* @param gbuff A buffer with at least max(c->count, c->gcount) entries,
* used for sorting indices for the gparts.
*/
void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
void cell_split(struct cell *c, ptrdiff_t parts_offset, struct cell_buff *buff,
struct cell_buff *gbuff) {
const int count = c->count, gcount = c->gcount;
struct part *parts = c->parts;
@@ -485,18 +488,21 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
int bucket_count[8] = {0, 0, 0, 0, 0, 0, 0, 0};
int bucket_offset[9];
/* If the buff is NULL, allocate it, and remember to free it. */
const int allocate_buffer = (buff == NULL);
if (allocate_buffer &&
(buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL)
error("Failed to allocate temporary indices.");
#ifdef SWIFT_DEBUG_CHECKS
/* Check that the buffs are OK. */
for (int k = 0; k < count; k++) {
if (buff[k].x[0] != parts[k].x[0] || buff[k].x[1] != parts[k].x[1] ||
buff[k].x[2] != parts[k].x[2])
error("Inconsistent buff contents.");
}
#endif /* SWIFT_DEBUG_CHECKS */
/* Fill the buffer with the indices. */
for (int k = 0; k < count; k++) {
const int bid = (parts[k].x[0] > pivot[0]) * 4 +
(parts[k].x[1] > pivot[1]) * 2 + (parts[k].x[2] > pivot[2]);
const int bid = (buff[k].x[0] > pivot[0]) * 4 +
(buff[k].x[1] > pivot[1]) * 2 + (buff[k].x[2] > pivot[2]);
bucket_count[bid]++;
buff[k] = bid;
buff[k].ind = bid;
}
/* Set the buffer offsets. */
@@ -510,23 +516,25 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
for (int bucket = 0; bucket < 8; bucket++) {
for (int k = bucket_offset[bucket] + bucket_count[bucket];
k < bucket_offset[bucket + 1]; k++) {
int bid = buff[k];
int bid = buff[k].ind;
if (bid != bucket) {
struct part part = parts[k];
struct xpart xpart = xparts[k];
struct cell_buff temp_buff = buff[k];
while (bid != bucket) {
int j = bucket_offset[bid] + bucket_count[bid]++;
while (buff[j] == bid) {
while (buff[j].ind == bid) {
j++;
bucket_count[bid]++;
}
memswap(&parts[j], &part, sizeof(struct part));
memswap(&xparts[j], &xpart, sizeof(struct xpart));
memswap(&buff[j], &bid, sizeof(int));
memswap(&buff[j], &temp_buff, sizeof(struct cell_buff));
bid = temp_buff.ind;
}
parts[k] = part;
xparts[k] = xpart;
buff[k] = bid;
buff[k] = temp_buff;
}
bucket_count[bid]++;
}
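For readers not familiar with this pattern: the loop above is an in-place bucket permutation (cycle chasing on top of a counting sort). The target bucket of every particle has been pre-computed into buff[k].ind, and a misplaced element is swapped along its cycle until slot k receives something that belongs to the current bucket, so parts, xparts and buff are permuted together without a second array. A minimal sketch of the same technique on a bare index array, with no particle payload; the helper name bucket_permute and its arguments are illustrative, not part of SWIFT:

/* In-place grouping of ind[] by bucket id. bucket_offset[] holds the
 * nbuckets+1 prefix sums of the bucket sizes, and bucket_count[] must be
 * zeroed on entry; on exit it holds the bucket sizes again. */
static void bucket_permute(int *ind, const int *bucket_offset,
                           int *bucket_count, int nbuckets) {
  for (int bucket = 0; bucket < nbuckets; bucket++) {
    for (int k = bucket_offset[bucket] + bucket_count[bucket];
         k < bucket_offset[bucket + 1]; k++) {
      int bid = ind[k];
      if (bid != bucket) {
        /* Chase the cycle: drop the carried id into its bucket's next free
         * slot and pick up whatever was there, until we pull back an id
         * that belongs in the current bucket. */
        while (bid != bucket) {
          int j = bucket_offset[bid] + bucket_count[bid]++;
          while (ind[j] == bid) { /* Skip entries already in place. */
            j++;
            bucket_count[bid]++;
          }
          const int temp = ind[j];
          ind[j] = bid;
          bid = temp;
        }
        ind[k] = bid;
      }
      bucket_count[bid]++;
    }
  }
}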
@@ -543,6 +551,14 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
if (count > 0 && gcount > 0) part_relink_gparts(parts, count, parts_offset);
#ifdef SWIFT_DEBUG_CHECKS
/* Check that the buffs are OK. */
for (int k = 1; k < count; k++) {
if (buff[k].ind < buff[k - 1].ind) error("Buff not sorted.");
if (buff[k].x[0] != parts[k].x[0] || buff[k].x[1] != parts[k].x[1] ||
buff[k].x[2] != parts[k].x[2])
error("Inconsistent buff contents (k=%i).", k);
}
/* Verify that _all_ the parts have been assigned to a cell. */
for (int k = 1; k < 8; k++)
if (&c->progeny[k - 1]->parts[c->progeny[k - 1]->count] !=
@@ -569,6 +585,31 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
c->progeny[2]->parts[k].x[1] <= pivot[1] ||
c->progeny[2]->parts[k].x[2] > pivot[2])
error("Sorting failed (progeny=2).");
for (int k = 0; k < c->progeny[3]->count; k++)
if (c->progeny[3]->parts[k].x[0] > pivot[0] ||
c->progeny[3]->parts[k].x[1] <= pivot[1] ||
c->progeny[3]->parts[k].x[2] <= pivot[2])
error("Sorting failed (progeny=3).");
for (int k = 0; k < c->progeny[4]->count; k++)
if (c->progeny[4]->parts[k].x[0] <= pivot[0] ||
c->progeny[4]->parts[k].x[1] > pivot[1] ||
c->progeny[4]->parts[k].x[2] > pivot[2])
error("Sorting failed (progeny=4).");
for (int k = 0; k < c->progeny[5]->count; k++)
if (c->progeny[5]->parts[k].x[0] <= pivot[0] ||
c->progeny[5]->parts[k].x[1] > pivot[1] ||
c->progeny[5]->parts[k].x[2] <= pivot[2])
error("Sorting failed (progeny=5).");
for (int k = 0; k < c->progeny[6]->count; k++)
if (c->progeny[6]->parts[k].x[0] <= pivot[0] ||
c->progeny[6]->parts[k].x[1] <= pivot[1] ||
c->progeny[6]->parts[k].x[2] > pivot[2])
error("Sorting failed (progeny=6).");
for (int k = 0; k < c->progeny[7]->count; k++)
if (c->progeny[7]->parts[k].x[0] <= pivot[0] ||
c->progeny[7]->parts[k].x[1] <= pivot[1] ||
c->progeny[7]->parts[k].x[2] <= pivot[2])
error("Sorting failed (progeny=7).");
#endif
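The eight per-progeny checks above simply spell out the bucket id used earlier in cell_split: bit 2 of bid records x[0] > pivot[0], bit 1 records x[1] > pivot[1] and bit 0 records x[2] > pivot[2], so progeny k must hold exactly the particles whose three comparisons reproduce the bits of k. A small self-contained illustration of that convention (octant_of is a hypothetical helper, not SWIFT code):

#include <stdio.h>

/* Same bit convention as cell_split: bit 2 = x, bit 1 = y, bit 0 = z. */
static int octant_of(const double x[3], const double pivot[3]) {
  return (x[0] > pivot[0]) * 4 + (x[1] > pivot[1]) * 2 + (x[2] > pivot[2]);
}

int main(void) {
  const double pivot[3] = {0.5, 0.5, 0.5};
  const double p[3] = {0.75, 0.25, 0.75}; /* x > px, y <= py, z > pz */
  printf("octant = %d\n", octant_of(p, pivot)); /* prints 5, i.e. 0b101 */
  return 0;
}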
/* Now do the same song and dance for the gparts. */
@@ -576,11 +617,10 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
/* Fill the buffer with the indices. */
for (int k = 0; k < gcount; k++) {
const int bid = (gparts[k].x[0] > pivot[0]) * 4 +
(gparts[k].x[1] > pivot[1]) * 2 +
(gparts[k].x[2] > pivot[2]);
const int bid = (gbuff[k].x[0] > pivot[0]) * 4 +
(gbuff[k].x[1] > pivot[1]) * 2 + (gbuff[k].x[2] > pivot[2]);
bucket_count[bid]++;
buff[k] = bid;
gbuff[k].ind = bid;
}
/* Set the buffer offsets. */
@@ -594,20 +634,22 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
for (int bucket = 0; bucket < 8; bucket++) {
for (int k = bucket_offset[bucket] + bucket_count[bucket];
k < bucket_offset[bucket + 1]; k++) {
int bid = buff[k];
int bid = gbuff[k].ind;
if (bid != bucket) {
struct gpart gpart = gparts[k];
struct cell_buff temp_buff = gbuff[k];
while (bid != bucket) {
int j = bucket_offset[bid] + bucket_count[bid]++;
while (buff[j] == bid) {
while (gbuff[j].ind == bid) {
j++;
bucket_count[bid]++;
}
memswap(&gparts[j], &gpart, sizeof(struct gpart));
memswap(&buff[j], &bid, sizeof(int));
memswap(&gbuff[j], &temp_buff, sizeof(struct cell_buff));
bid = temp_buff.ind;
}
gparts[k] = gpart;
buff[k] = bid;
gbuff[k] = temp_buff;
}
bucket_count[bid]++;
}
......
@@ -52,6 +52,12 @@ struct scheduler;
/* Global variables. */
extern int cell_next_tag;
/* Struct to temporarily buffer the particle locations and bin id. */
struct cell_buff {
double x[3];
int ind;
} SWIFT_STRUCT_ALIGN;
/* Mini struct to link cells to tasks. Used as a linked list. */
struct link {
@@ -275,7 +281,8 @@ struct cell {
((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i)))
/* Function prototypes. */
void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff);
void cell_split(struct cell *c, ptrdiff_t parts_offset, struct cell_buff *buff,
struct cell_buff *gbuff);
void cell_sanitize(struct cell *c);
int cell_locktree(struct cell *c);
void cell_unlocktree(struct cell *c);
......
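A note on the new struct cell_buff declared above: its payload is 28 bytes (three doubles plus an int), and the SWIFT_STRUCT_ALIGN attribute both aligns instances and pads the size to a multiple of that alignment, so every element of a posix_memalign'd buffer sits on an aligned boundary and the vectorised memswap path can be used on individual entries. A hedged stand-alone illustration, assuming the attribute expands to a 32-byte alignment (an assumption, not something stated in this diff):

#include <stdalign.h>
#include <stdio.h>

/* Stand-in for cell_buff; the aligned(32) mimics what SWIFT_STRUCT_ALIGN is
 * assumed to expand to. */
struct cell_buff_demo {
  double x[3]; /* 24 bytes */
  int ind;     /*  4 bytes, followed by padding */
} __attribute__((aligned(32)));

int main(void) {
  /* The size is rounded up to a multiple of the alignment, so consecutive
   * array elements all start on a 32-byte boundary. */
  printf("size = %zu, alignment = %zu\n", sizeof(struct cell_buff_demo),
         (size_t)alignof(struct cell_buff_demo));
  return 0;
}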
@@ -32,24 +32,27 @@
#include <altivec.h>
#endif
/* Macro for in-place swap of two values a and b of type t. */
#define swap_loop(t, a, b, c) \
while (c >= sizeof(t)) { \
register t temp = *(t *)a; \
*(t *)a = *(t *)b; \
*(t *)b = temp; \
a += sizeof(t); \
b += sizeof(t); \
bytes -= sizeof(t); \
/* Macro for in-place swap of two values a and b of type t. a and b are
assumed to be of type char* so that the pointer arithmetic works. */
#define swap_loop(type, a, b, count) \
while (count >= sizeof(type)) { \
register type temp = *(type *)a; \
*(type *)a = *(type *)b; \
*(type *)b = temp; \
a += sizeof(type); \
b += sizeof(type); \
count -= sizeof(type); \
}
/**
* @brief Swap the contents of two elements in-place.
*
* Keep in mind that this function works best when the underlying data
* Keep in mind that this function only works when the underlying data
* is aligned to the vector length, e.g. with the @c
* __attribute__((aligned(32)))
* syntax, and the code is compiled with @c -funroll-loops.
* syntax!
* Furthermore, register re-labeling only seems to work when the code is
* compiled with @c -funroll-loops.
*
* @param void_a Pointer to the first element.
* @param void_b Pointer to the second element.
@@ -76,4 +79,63 @@ __attribute__((always_inline)) inline void memswap(void *void_a, void *void_b,
swap_loop(char, a, b, bytes);
}
/**
* @brief Swap the contents of two elements in-place.
*
* As opposed to #memswap, this function does not require the parameters
* to be aligned in any specific way.
* Furthermore, register re-labeling only seems to work when the code is
* compiled with @c -funroll-loops.
*
* @param void_a Pointer to the first element.
* @param void_b Pointer to the second element.
* @param bytes Size, in bytes, of the data pointed to by @c a and @c b.
*/
__attribute__((always_inline)) inline void memswap_unaligned(void *void_a,
void *void_b,
size_t bytes) {
char *a = (char *)void_a, *b = (char *)void_b;
#ifdef __AVX512F__
while (bytes >= sizeof(__m512i)) {
register __m512i temp;
temp = _mm512_loadu_si512((__m512i *)a);
_mm512_storeu_si512((__m512i *)a, _mm512_loadu_si512((__m512i *)b));
_mm512_storeu_si512((__m512i *)b, temp);
a += sizeof(__m512i);
b += sizeof(__m512i);
bytes -= sizeof(__m512i);
}
#endif
#ifdef __AVX__
while (bytes >= sizeof(__m256i)) {
register __m256i temp;
temp = _mm256_loadu_si256((__m256i *)a);
_mm256_storeu_si256((__m256i *)a, _mm256_loadu_si256((__m256i *)b));
_mm256_storeu_si256((__m256i *)b, temp);
a += sizeof(__m256i);
b += sizeof(__m256i);
bytes -= sizeof(__m256i);
}
#endif
#ifdef __SSE2__
while (bytes >= sizeof(__m128i)) {
register __m128i temp;
temp = _mm_loadu_si128((__m128i *)a);
_mm_storeu_si128((__m128i *)a, _mm_loadu_si128((__m128i *)b));
_mm_storeu_si128((__m128i *)b, temp);
a += sizeof(__m128i);
b += sizeof(__m128i);
bytes -= sizeof(__m128i);
}
#endif
#ifdef __ALTIVEC__
// Power8 supports unaligned load/stores, but not sure what it will do here.
swap_loop(vector int, a, b, bytes);
#endif
swap_loop(size_t, a, b, bytes);
swap_loop(int, a, b, bytes);
swap_loop(short, a, b, bytes);
swap_loop(char, a, b, bytes);
}
#endif /* SWIFT_MEMSWAP_H */
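Usage-wise the two swap routines differ only in their alignment requirements: memswap assumes both pointers are aligned to the vector width (as the cell_buff buffers above are), whereas memswap_unaligned can be used on arbitrary addresses. A short, hedged usage sketch; the struct, the byte offsets and the "memswap.h" include path are illustrative assumptions:

#include <string.h>
#include "memswap.h" /* assumed include path for the header shown above */

struct record { double x[3]; int ind; };

int main(void) {
  /* Two records packed at a deliberately odd byte offset. */
  char storage[2 * sizeof(struct record) + 1];
  struct record a = {{1., 2., 3.}, 7}, b = {{4., 5., 6.}, 9};
  memcpy(storage + 1, &a, sizeof a);
  memcpy(storage + 1 + sizeof a, &b, sizeof b);

  /* memswap_unaligned is safe on these addresses; plain memswap would
   * require both pointers to be aligned to the vector width. */
  memswap_unaligned(storage + 1, storage + 1 + sizeof a, sizeof(struct record));
  return 0;
}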
@@ -1467,9 +1467,12 @@ void space_map_cells_pre(struct space *s, int full,
* @param s The #space in which the cell lives.
* @param c The #cell to split recursively.
* @param buff A buffer for particle sorting, should be of size at least
* max(c->count, c->gount) or @c NULL.
* c->count or @c NULL.
* @param gbuff A buffer for particle sorting, should be of size at least
* c->gcount or @c NULL.
*/
void space_split_recursive(struct space *s, struct cell *c, int *buff) {
void space_split_recursive(struct space *s, struct cell *c,
struct cell_buff *buff, struct cell_buff *gbuff) {
const int count = c->count;
const int gcount = c->gcount;
@@ -1482,10 +1485,29 @@ void space_split_recursive(struct space *s, struct cell *c, int *buff) {
struct xpart *xparts = c->xparts;
/* If the buff is NULL, allocate it, and remember to free it. */
const int allocate_buffer = (buff == NULL);
if (allocate_buffer &&
(buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL)
error("Failed to allocate temporary indices.");
const int allocate_buffer = (buff == NULL && gbuff == NULL);
if (allocate_buffer) {
if (count > 0) {
if (posix_memalign((void *)&buff, SWIFT_STRUCT_ALIGNMENT,
sizeof(struct cell_buff) * count) != 0)
error("Failed to allocate temporary indices.");
for (int k = 0; k < count; k++) {
buff[k].x[0] = parts[k].x[0];
buff[k].x[1] = parts[k].x[1];
buff[k].x[2] = parts[k].x[2];
}
}
if (gcount > 0) {
if (posix_memalign((void *)&gbuff, SWIFT_STRUCT_ALIGNMENT,
sizeof(struct cell_buff) * gcount) != 0)
error("Failed to allocate temporary indices.");
for (int k = 0; k < gcount; k++) {
gbuff[k].x[0] = gparts[k].x[0];
gbuff[k].x[1] = gparts[k].x[1];
gbuff[k].x[2] = gparts[k].x[2];
}
}
}
/* Check the depth. */
while (depth > (maxdepth = s->maxdepth)) {
@@ -1531,15 +1553,18 @@ void space_split_recursive(struct space *s, struct cell *c, int *buff) {
}
/* Split the cell data. */
cell_split(c, c->parts - s->parts, buff);
cell_split(c, c->parts - s->parts, buff, gbuff);
/* Remove any progeny with zero parts. */
struct cell_buff *progeny_buff = buff, *progeny_gbuff = gbuff;
for (int k = 0; k < 8; k++)
if (c->progeny[k]->count == 0 && c->progeny[k]->gcount == 0) {
space_recycle(s, c->progeny[k]);
c->progeny[k] = NULL;
} else {
space_split_recursive(s, c->progeny[k], buff);
space_split_recursive(s, c->progeny[k], progeny_buff, progeny_gbuff);
progeny_buff += c->progeny[k]->count;
progeny_gbuff += c->progeny[k]->gcount;
h_max = max(h_max, c->progeny[k]->h_max);
ti_end_min = min(ti_end_min, c->progeny[k]->ti_end_min);
ti_end_max = max(ti_end_max, c->progeny[k]->ti_end_max);
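The pointer arithmetic above (progeny_buff += count, progeny_gbuff += gcount) relies on the fact that cell_split leaves buff and gbuff grouped by progeny in the same order as the particle arrays, so each child's slice of the parent's buffer simply starts where the previous child's slice ends and no per-level allocation or copying is needed. A tiny stand-alone illustration of that prefix-sum slicing (the counts below are made up):

#include <stdio.h>

int main(void) {
  /* Child particle counts as produced by a hypothetical cell_split. */
  const int counts[8] = {3, 0, 5, 2, 0, 1, 4, 6};
  int offset = 0;
  for (int k = 0; k < 8; k++) {
    /* Child k is handed buff + offset, valid for counts[k] entries. */
    printf("progeny %d: buffer slice [%d, %d)\n", k, offset,
           offset + counts[k]);
    offset += counts[k];
  }
  /* The slices tile the parent exactly, mirroring the debug check that
   * progeny[k-1]->parts + count lines up with progeny[k]->parts. */
  printf("total = %d\n", offset);
  return 0;
}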
@@ -1598,7 +1623,10 @@ void space_split_recursive(struct space *s, struct cell *c, int *buff) {
c->owner = 0; /* Ok, there is really nothing on this rank... */
/* Clean up. */
if (allocate_buffer) free(buff);
if (allocate_buffer) {
if (buff != NULL) free(buff);
if (gbuff != NULL) free(gbuff);
}
}
/**
@@ -1617,7 +1645,7 @@ void space_split_mapper(void *map_data, int num_cells, void *extra_data) {
for (int ind = 0; ind < num_cells; ind++) {
struct cell *c = &cells_top[ind];
space_split_recursive(s, c, NULL);
space_split_recursive(s, c, NULL, NULL);
}
#ifdef SWIFT_DEBUG_CHECKS
@@ -1625,7 +1653,7 @@ void space_split_mapper(void *map_data, int num_cells, void *extra_data) {
for (int ind = 0; ind < num_cells; ind++) {
int depth = 0;
if (!checkCellhdxmax(&cells_top[ind], &depth))
message(" at cell depth %d", depth);
error(" at cell depth %d", depth);
}
#endif
}