diff --git a/src/cell.c b/src/cell.c
index d17ba527a3d9f7960a7740e7f517d37ec015d5cc..2b3f5ab2e85552416d541222c94a9eeaed056b17 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -472,8 +472,11 @@ void cell_gunlocktree(struct cell *c) {
  *        space's parts array, i.e. c->parts - s->parts.
  * @param buff A buffer with at least max(c->count, c->gcount) entries,
  *        used for sorting indices.
+ * @param gbuff A buffer with at least max(c->count, c->gcount) entries,
+ *        used for sorting indices for the gparts.
  */
-void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
+void cell_split(struct cell *c, ptrdiff_t parts_offset, struct cell_buff *buff,
+                struct cell_buff *gbuff) {
 
   const int count = c->count, gcount = c->gcount;
   struct part *parts = c->parts;
@@ -485,18 +488,21 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
   int bucket_count[8] = {0, 0, 0, 0, 0, 0, 0, 0};
   int bucket_offset[9];
 
-  /* If the buff is NULL, allocate it, and remember to free it. */
-  const int allocate_buffer = (buff == NULL);
-  if (allocate_buffer &&
-      (buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL)
-    error("Failed to allocate temporary indices.");
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Check that the buffs are OK. */
+  for (int k = 0; k < count; k++) {
+    if (buff[k].x[0] != parts[k].x[0] || buff[k].x[1] != parts[k].x[1] ||
+        buff[k].x[2] != parts[k].x[2])
+      error("Inconsistent buff contents.");
+  }
+#endif /* SWIFT_DEBUG_CHECKS */
 
   /* Fill the buffer with the indices. */
   for (int k = 0; k < count; k++) {
-    const int bid = (parts[k].x[0] > pivot[0]) * 4 +
-                    (parts[k].x[1] > pivot[1]) * 2 + (parts[k].x[2] > pivot[2]);
+    const int bid = (buff[k].x[0] > pivot[0]) * 4 +
+                    (buff[k].x[1] > pivot[1]) * 2 + (buff[k].x[2] > pivot[2]);
     bucket_count[bid]++;
-    buff[k] = bid;
+    buff[k].ind = bid;
   }
 
   /* Set the buffer offsets. */
@@ -510,23 +516,25 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
   for (int bucket = 0; bucket < 8; bucket++) {
     for (int k = bucket_offset[bucket] + bucket_count[bucket];
          k < bucket_offset[bucket + 1]; k++) {
-      int bid = buff[k];
+      int bid = buff[k].ind;
       if (bid != bucket) {
         struct part part = parts[k];
         struct xpart xpart = xparts[k];
+        struct cell_buff temp_buff = buff[k];
         while (bid != bucket) {
           int j = bucket_offset[bid] + bucket_count[bid]++;
-          while (buff[j] == bid) {
+          while (buff[j].ind == bid) {
             j++;
             bucket_count[bid]++;
           }
           memswap(&parts[j], &part, sizeof(struct part));
           memswap(&xparts[j], &xpart, sizeof(struct xpart));
-          memswap(&buff[j], &bid, sizeof(int));
+          memswap(&buff[j], &temp_buff, sizeof(struct cell_buff));
+          bid = temp_buff.ind;
         }
         parts[k] = part;
         xparts[k] = xpart;
-        buff[k] = bid;
+        buff[k] = temp_buff;
       }
       bucket_count[bid]++;
     }
@@ -543,6 +551,14 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
   if (count > 0 && gcount > 0) part_relink_gparts(parts, count, parts_offset);
 
 #ifdef SWIFT_DEBUG_CHECKS
+  /* Check that the buffs are OK. */
+  for (int k = 1; k < count; k++) {
+    if (buff[k].ind < buff[k - 1].ind) error("Buff not sorted.");
+    if (buff[k].x[0] != parts[k].x[0] || buff[k].x[1] != parts[k].x[1] ||
+        buff[k].x[2] != parts[k].x[2])
+      error("Inconsistent buff contents (k=%i).", k);
+  }
+
   /* Verify that _all_ the parts have been assigned to a cell. */
   for (int k = 1; k < 8; k++)
     if (&c->progeny[k - 1]->parts[c->progeny[k - 1]->count] !=
@@ -569,6 +585,31 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
         c->progeny[2]->parts[k].x[1] <= pivot[1] ||
         c->progeny[2]->parts[k].x[2] > pivot[2])
       error("Sorting failed (progeny=2).");
+  for (int k = 0; k < c->progeny[3]->count; k++)
+    if (c->progeny[3]->parts[k].x[0] > pivot[0] ||
+        c->progeny[3]->parts[k].x[1] <= pivot[1] ||
+        c->progeny[3]->parts[k].x[2] <= pivot[2])
+      error("Sorting failed (progeny=3).");
+  for (int k = 0; k < c->progeny[4]->count; k++)
+    if (c->progeny[4]->parts[k].x[0] <= pivot[0] ||
+        c->progeny[4]->parts[k].x[1] > pivot[1] ||
+        c->progeny[4]->parts[k].x[2] > pivot[2])
+      error("Sorting failed (progeny=4).");
+  for (int k = 0; k < c->progeny[5]->count; k++)
+    if (c->progeny[5]->parts[k].x[0] <= pivot[0] ||
+        c->progeny[5]->parts[k].x[1] > pivot[1] ||
+        c->progeny[5]->parts[k].x[2] <= pivot[2])
+      error("Sorting failed (progeny=5).");
+  for (int k = 0; k < c->progeny[6]->count; k++)
+    if (c->progeny[6]->parts[k].x[0] <= pivot[0] ||
+        c->progeny[6]->parts[k].x[1] <= pivot[1] ||
+        c->progeny[6]->parts[k].x[2] > pivot[2])
+      error("Sorting failed (progeny=6).");
+  for (int k = 0; k < c->progeny[7]->count; k++)
+    if (c->progeny[7]->parts[k].x[0] <= pivot[0] ||
+        c->progeny[7]->parts[k].x[1] <= pivot[1] ||
+        c->progeny[7]->parts[k].x[2] <= pivot[2])
+      error("Sorting failed (progeny=7).");
 #endif
 
   /* Now do the same song and dance for the gparts. */
@@ -576,11 +617,10 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
 
   /* Fill the buffer with the indices. */
   for (int k = 0; k < gcount; k++) {
-    const int bid = (gparts[k].x[0] > pivot[0]) * 4 +
-                    (gparts[k].x[1] > pivot[1]) * 2 +
-                    (gparts[k].x[2] > pivot[2]);
+    const int bid = (gbuff[k].x[0] > pivot[0]) * 4 +
+                    (gbuff[k].x[1] > pivot[1]) * 2 + (gbuff[k].x[2] > pivot[2]);
     bucket_count[bid]++;
-    buff[k] = bid;
+    gbuff[k].ind = bid;
   }
 
   /* Set the buffer offsets. */
@@ -594,20 +634,22 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
   for (int bucket = 0; bucket < 8; bucket++) {
     for (int k = bucket_offset[bucket] + bucket_count[bucket];
          k < bucket_offset[bucket + 1]; k++) {
-      int bid = buff[k];
+      int bid = gbuff[k].ind;
       if (bid != bucket) {
         struct gpart gpart = gparts[k];
+        struct cell_buff temp_buff = gbuff[k];
         while (bid != bucket) {
           int j = bucket_offset[bid] + bucket_count[bid]++;
-          while (buff[j] == bid) {
+          while (gbuff[j].ind == bid) {
             j++;
             bucket_count[bid]++;
           }
           memswap(&gparts[j], &gpart, sizeof(struct gpart));
-          memswap(&buff[j], &bid, sizeof(int));
+          memswap(&gbuff[j], &temp_buff, sizeof(struct cell_buff));
+          bid = temp_buff.ind;
         }
         gparts[k] = gpart;
-        buff[k] = bid;
+        gbuff[k] = temp_buff;
       }
       bucket_count[bid]++;
     }
diff --git a/src/cell.h b/src/cell.h
index c01594d4885067e63cab363718cc1559c3ff1034..d99019f58ea20ad83c835178ba71569192be85a1 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -52,6 +52,12 @@ struct scheduler;
 /* Global variables. */
 extern int cell_next_tag;
 
+/* Struct to temporarily buffer the particle locations and bin id. */
+struct cell_buff {
+  double x[3];
+  int ind;
+} SWIFT_STRUCT_ALIGN;
+
 /* Mini struct to link cells to tasks. Used as a linked list. */
 struct link {
 
@@ -281,7 +287,8 @@
   ((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i)))
 
 /* Function prototypes. */
-void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff);
+void cell_split(struct cell *c, ptrdiff_t parts_offset, struct cell_buff *buff,
+                struct cell_buff *gbuff);
 void cell_sanitize(struct cell *c);
 int cell_locktree(struct cell *c);
 void cell_unlocktree(struct cell *c);
diff --git a/src/memswap.h b/src/memswap.h
index 4643725535917952d12927d52187bc7306ced5ef..92c902eeb158978d4a606f5f2a9416d4113fae0b 100644
--- a/src/memswap.h
+++ b/src/memswap.h
@@ -32,24 +32,27 @@
 #include <altivec.h>
 #endif
 
-/* Macro for in-place swap of two values a and b of type t. */
-#define swap_loop(t, a, b, c)  \
-  while (c >= sizeof(t)) {     \
-    register t temp = *(t *)a; \
-    *(t *)a = *(t *)b;         \
-    *(t *)b = temp;            \
-    a += sizeof(t);            \
-    b += sizeof(t);            \
-    bytes -= sizeof(t);        \
+/* Macro for in-place swap of two values a and b of type t. a and b are
+   assumed to be of type char* so that the pointer arithmetic works. */
+#define swap_loop(type, a, b, count) \
+  while (count >= sizeof(type)) {    \
+    register type temp = *(type *)a; \
+    *(type *)a = *(type *)b;         \
+    *(type *)b = temp;               \
+    a += sizeof(type);               \
+    b += sizeof(type);               \
+    count -= sizeof(type);           \
   }
 
 /**
  * @brief Swap the contents of two elements in-place.
  *
- * Keep in mind that this function works best when the underlying data
+ * Keep in mind that this function only works when the underlying data
  * is aligned to the vector length, e.g. with the @c
  * __attribute__((aligned(32)))
- * syntax, and the code is compiled with @c -funroll-loops.
+ * syntax!
+ * Furthermore, register re-labeling only seems to work when the code is
+ * compiled with @c -funroll-loops.
  *
  * @param void_a Pointer to the first element.
  * @param void_b Pointer to the second element.
 * @param bytes Size, in bytes, of the data pointed to by @c a and @c b.
 */
@@ -76,4 +79,63 @@ __attribute__((always_inline)) inline void memswap(void *void_a, void *void_b,
   swap_loop(char, a, b, bytes);
 }
 
+/**
+ * @brief Swap the contents of two elements in-place.
+ *
+ * As opposed to #memswap, this function does not require the parameters
+ * to be aligned in any specific way.
+ * Furthermore, register re-labeling only seems to work when the code is
+ * compiled with @c -funroll-loops.
+ *
+ * @param void_a Pointer to the first element.
+ * @param void_b Pointer to the second element.
+ * @param bytes Size, in bytes, of the data pointed to by @c a and @c b.
+ */
+__attribute__((always_inline)) inline void memswap_unaligned(void *void_a,
+                                                             void *void_b,
+                                                             size_t bytes) {
+  char *a = (char *)void_a, *b = (char *)void_b;
+#ifdef __AVX512F__
+  while (bytes >= sizeof(__m512i)) {
+    register __m512i temp;
+    temp = _mm512_loadu_si512((__m512i *)a);
+    _mm512_storeu_si512((__m512i *)a, _mm512_loadu_si512((__m512i *)b));
+    _mm512_storeu_si512((__m512i *)b, temp);
+    a += sizeof(__m512i);
+    b += sizeof(__m512i);
+    bytes -= sizeof(__m512i);
+  }
+#endif
+#ifdef __AVX__
+  while (bytes >= sizeof(__m256i)) {
+    register __m256i temp;
+    temp = _mm256_loadu_si256((__m256i *)a);
+    _mm256_storeu_si256((__m256i *)a, _mm256_loadu_si256((__m256i *)b));
+    _mm256_storeu_si256((__m256i *)b, temp);
+    a += sizeof(__m256i);
+    b += sizeof(__m256i);
+    bytes -= sizeof(__m256i);
+  }
+#endif
+#ifdef __SSE2__
+  while (bytes >= sizeof(__m128i)) {
+    register __m128i temp;
+    temp = _mm_loadu_si128((__m128i *)a);
+    _mm_storeu_si128((__m128i *)a, _mm_loadu_si128((__m128i *)b));
+    _mm_storeu_si128((__m128i *)b, temp);
+    a += sizeof(__m128i);
+    b += sizeof(__m128i);
+    bytes -= sizeof(__m128i);
+  }
+#endif
+#ifdef __ALTIVEC__
+  // Power8 supports unaligned load/stores, but not sure what it will do here.
+  swap_loop(vector int, a, b, bytes);
+#endif
+  swap_loop(size_t, a, b, bytes);
+  swap_loop(int, a, b, bytes);
+  swap_loop(short, a, b, bytes);
+  swap_loop(char, a, b, bytes);
+}
+
 #endif /* SWIFT_MEMSWAP_H */
diff --git a/src/space.c b/src/space.c
index 08dd41cdea5bf598c2b17d47f35e6fc0221c2a06..24c2c413198d32e0942a0d26e5a3efbd5571c68e 100644
--- a/src/space.c
+++ b/src/space.c
@@ -1470,9 +1470,12 @@ void space_map_cells_pre(struct space *s, int full,
  * @param s The #space in which the cell lives.
  * @param c The #cell to split recursively.
  * @param buff A buffer for particle sorting, should be of size at least
- *        max(c->count, c->gount) or @c NULL.
+ *        c->count or @c NULL.
+ * @param gbuff A buffer for particle sorting, should be of size at least
+ *        c->gcount or @c NULL.
  */
-void space_split_recursive(struct space *s, struct cell *c, int *buff) {
+void space_split_recursive(struct space *s, struct cell *c,
+                           struct cell_buff *buff, struct cell_buff *gbuff) {
 
   const int count = c->count;
   const int gcount = c->gcount;
@@ -1486,10 +1489,29 @@ void space_split_recursive(struct space *s, struct cell *c, int *buff) {
   struct engine *e = s->e;
 
   /* If the buff is NULL, allocate it, and remember to free it. */
-  const int allocate_buffer = (buff == NULL);
-  if (allocate_buffer &&
-      (buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL)
-    error("Failed to allocate temporary indices.");
+  const int allocate_buffer = (buff == NULL && gbuff == NULL);
+  if (allocate_buffer) {
+    if (count > 0) {
+      if (posix_memalign((void *)&buff, SWIFT_STRUCT_ALIGNMENT,
+                         sizeof(struct cell_buff) * count) != 0)
+        error("Failed to allocate temporary indices.");
+      for (int k = 0; k < count; k++) {
+        buff[k].x[0] = parts[k].x[0];
+        buff[k].x[1] = parts[k].x[1];
+        buff[k].x[2] = parts[k].x[2];
+      }
+    }
+    if (gcount > 0) {
+      if (posix_memalign((void *)&gbuff, SWIFT_STRUCT_ALIGNMENT,
+                         sizeof(struct cell_buff) * gcount) != 0)
+        error("Failed to allocate temporary indices.");
+      for (int k = 0; k < gcount; k++) {
+        gbuff[k].x[0] = gparts[k].x[0];
+        gbuff[k].x[1] = gparts[k].x[1];
+        gbuff[k].x[2] = gparts[k].x[2];
+      }
+    }
+  }
 
   /* Check the depth. */
   while (depth > (maxdepth = s->maxdepth)) {
@@ -1535,15 +1557,18 @@ void space_split_recursive(struct space *s, struct cell *c, int *buff) {
     }
 
     /* Split the cell data. */
-    cell_split(c, c->parts - s->parts, buff);
+    cell_split(c, c->parts - s->parts, buff, gbuff);
 
     /* Remove any progeny with zero parts. */
+    struct cell_buff *progeny_buff = buff, *progeny_gbuff = gbuff;
     for (int k = 0; k < 8; k++)
       if (c->progeny[k]->count == 0 && c->progeny[k]->gcount == 0) {
         space_recycle(s, c->progeny[k]);
         c->progeny[k] = NULL;
       } else {
-        space_split_recursive(s, c->progeny[k], buff);
+        space_split_recursive(s, c->progeny[k], progeny_buff, progeny_gbuff);
+        progeny_buff += c->progeny[k]->count;
+        progeny_gbuff += c->progeny[k]->gcount;
         h_max = max(h_max, c->progeny[k]->h_max);
         ti_end_min = min(ti_end_min, c->progeny[k]->ti_end_min);
         ti_end_max = max(ti_end_max, c->progeny[k]->ti_end_max);
@@ -1604,7 +1629,10 @@ void space_split_recursive(struct space *s, struct cell *c, int *buff) {
     c->owner = 0; /* Ok, there is really nothing on this rank... */
 
   /* Clean up. */
-  if (allocate_buffer) free(buff);
+  if (allocate_buffer) {
+    if (buff != NULL) free(buff);
+    if (gbuff != NULL) free(gbuff);
+  }
 }
 
 /**
@@ -1623,7 +1651,7 @@ void space_split_mapper(void *map_data, int num_cells, void *extra_data) {
 
   for (int ind = 0; ind < num_cells; ind++) {
     struct cell *c = &cells_top[ind];
-    space_split_recursive(s, c, NULL);
+    space_split_recursive(s, c, NULL, NULL);
   }
 
 #ifdef SWIFT_DEBUG_CHECKS
@@ -1631,7 +1659,7 @@ void space_split_mapper(void *map_data, int num_cells, void *extra_data) {
   for (int ind = 0; ind < num_cells; ind++) {
     int depth = 0;
     if (!checkCellhdxmax(&cells_top[ind], &depth))
-      message(" at cell depth %d", depth);
+      error(" at cell depth %d", depth);
   }
 #endif
 }
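
Note (not part of the patch above): the new memswap_unaligned() behaves like memswap() but, as its doxygen comment states, does not require the operands to be aligned to the vector length. The snippet below is a minimal standalone sketch of how it could be exercised; it is illustrative only, assumes that memswap.h (and the config.h it includes) is on the include path, and uses a hypothetical local struct buff_entry that merely mirrors struct cell_buff without the SWIFT_STRUCT_ALIGN attribute.

#include <assert.h>
#include <stdio.h>

#include "memswap.h" /* provides memswap() and the new memswap_unaligned() */

/* Stand-in mirroring struct cell_buff: three coordinates and a bin index.
   No alignment attribute, so memswap_unaligned() is the appropriate call. */
struct buff_entry {
  double x[3];
  int ind;
};

int main(void) {
  struct buff_entry a = {{1.0, 2.0, 3.0}, 4};
  struct buff_entry b = {{5.0, 6.0, 7.0}, 0};

  /* Swap the two records in place; the vectorised paths are taken where the
     instruction sets are available, and the scalar swap_loop() calls handle
     the remaining bytes. */
  memswap_unaligned(&a, &b, sizeof(struct buff_entry));

  assert(a.ind == 0 && b.ind == 4);
  assert(a.x[0] == 5.0 && b.x[0] == 1.0);
  printf("swapped: a.ind=%d b.ind=%d\n", a.ind, b.ind);
  return 0;
}

In the patch itself, cell_split() keeps calling memswap() on struct cell_buff, which is declared with SWIFT_STRUCT_ALIGN in cell.h; the unaligned variant is provided for data without such an attribute.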