diff --git a/configure.ac b/configure.ac index be5d4652e46ce15657336219c1ba0dce2c1f25bc..9fa9a1de591d63794dde5db6a8dd733cfcaada09 100644 --- a/configure.ac +++ b/configure.ac @@ -296,7 +296,6 @@ if test "$enable_san" = "yes"; then fi fi - # Autoconf stuff. AC_PROG_INSTALL AC_PROG_MAKE_SET @@ -473,8 +472,9 @@ if test "$ac_cv_header_fftw3_h" = "yes"; then fi AC_SUBST([FFTW_LIBS]) -# Check for Intel intrinsics header optionally used by vector.h. +# Check for Intel and PowerPC intrinsics header optionally used by vector.h. AC_CHECK_HEADERS([immintrin.h]) +AC_CHECK_HEADERS([altivec.h]) # Check for timing functions needed by cycle.h. AC_HEADER_TIME diff --git a/src/Makefile.am b/src/Makefile.am index 9c841db1176df09ee5bcaa0e051f4f0ae97df518..18da59c2b04f50cac29d948bdbe502bc2f70bec2 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -44,7 +44,7 @@ include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \ common_io.h single_io.h multipole.h map.h tools.h partition.h clocks.h parser.h \ physical_constants.h physical_constants_cgs.h potential.h version.h \ hydro_properties.h riemann.h threadpool.h cooling.h cooling_struct.h sourceterms.h \ - sourceterms_struct.h statistics.h + sourceterms_struct.h statistics.h memswap.h # Common source files @@ -83,7 +83,8 @@ nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h potential/softened_isothermal/potential.h \ cooling/none/cooling.h cooling/none/cooling_struct.h \ cooling/const_du/cooling.h cooling/const_du/cooling_struct.h \ - cooling/const_lambda/cooling.h cooling/const_lambda/cooling_struct.h + cooling/const_lambda/cooling.h cooling/const_lambda/cooling_struct.h \ + memswap.h # Sources and flags for regular library diff --git a/src/cell.c b/src/cell.c index 878401d770ca4a7787056a25da6d3b60d4313f2f..e2767cdaa9e1189ec87b5ef51cc578c91f8cfe4c 100644 --- a/src/cell.c +++ b/src/cell.c @@ -53,6 +53,8 @@ #include "gravity.h" #include "hydro.h" #include "hydro_properties.h" +#include "memswap.h" +#include "minmax.h" #include "scheduler.h" #include "space.h" #include "timers.h" @@ -463,122 +465,73 @@ void cell_gunlocktree(struct cell *c) { * @param c The #cell array to be sorted. * @param parts_offset Offset of the cell parts array relative to the * space's parts array, i.e. c->parts - s->parts. + * @param buff A buffer with at least max(c->count, c->gcount) entries, + * used for sorting indices. */ -void cell_split(struct cell *c, ptrdiff_t parts_offset) { +void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) { - int i, j; const int count = c->count, gcount = c->gcount; struct part *parts = c->parts; struct xpart *xparts = c->xparts; struct gpart *gparts = c->gparts; - int left[8], right[8]; - double pivot[3]; - - /* Init the pivots. */ - for (int k = 0; k < 3; k++) pivot[k] = c->loc[k] + c->width[k] / 2; - - /* Split along the x-axis. */ - i = 0; - j = count - 1; - while (i <= j) { - while (i <= count - 1 && parts[i].x[0] <= pivot[0]) i += 1; - while (j >= 0 && parts[j].x[0] > pivot[0]) j -= 1; - if (i < j) { - struct part temp = parts[i]; - parts[i] = parts[j]; - parts[j] = temp; - struct xpart xtemp = xparts[i]; - xparts[i] = xparts[j]; - xparts[j] = xtemp; - } + const double pivot[3] = {c->loc[0] + c->width[0] / 2, + c->loc[1] + c->width[1] / 2, + c->loc[2] + c->width[2] / 2}; + int bucket_count[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + int bucket_offset[9]; + + /* If the buff is NULL, allocate it, and remember to free it. */ + const int allocate_buffer = (buff == NULL); + if (allocate_buffer && + (buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL) + error("Failed to allocate temporary indices."); + + /* Fill the buffer with the indices. */ + for (int k = 0; k < count; k++) { + const int bid = (parts[k].x[0] > pivot[0]) * 4 + + (parts[k].x[1] > pivot[1]) * 2 + (parts[k].x[2] > pivot[2]); + bucket_count[bid]++; + buff[k] = bid; } -#ifdef SWIFT_DEBUG_CHECKS - for (int k = 0; k <= j; k++) - if (parts[k].x[0] > pivot[0]) error("cell_split: sorting failed."); - for (int k = i; k < count; k++) - if (parts[k].x[0] < pivot[0]) error("cell_split: sorting failed."); -#endif - - left[1] = i; - right[1] = count - 1; - left[0] = 0; - right[0] = j; - - /* Split along the y axis, twice. */ - for (int k = 1; k >= 0; k--) { - i = left[k]; - j = right[k]; - while (i <= j) { - while (i <= right[k] && parts[i].x[1] <= pivot[1]) i += 1; - while (j >= left[k] && parts[j].x[1] > pivot[1]) j -= 1; - if (i < j) { - struct part temp = parts[i]; - parts[i] = parts[j]; - parts[j] = temp; - struct xpart xtemp = xparts[i]; - xparts[i] = xparts[j]; - xparts[j] = xtemp; - } - } - -#ifdef SWIFT_DEBUG_CHECKS - for (int kk = left[k]; kk <= j; kk++) - if (parts[kk].x[1] > pivot[1]) { - message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j); - error("sorting failed (left)."); - } - for (int kk = i; kk <= right[k]; kk++) - if (parts[kk].x[1] < pivot[1]) error("sorting failed (right)."); -#endif - - left[2 * k + 1] = i; - right[2 * k + 1] = right[k]; - left[2 * k] = left[k]; - right[2 * k] = j; + /* Set the buffer offsets. */ + bucket_offset[0] = 0; + for (int k = 1; k <= 8; k++) { + bucket_offset[k] = bucket_offset[k - 1] + bucket_count[k - 1]; + bucket_count[k - 1] = 0; } - /* Split along the z axis, four times. */ - for (int k = 3; k >= 0; k--) { - i = left[k]; - j = right[k]; - while (i <= j) { - while (i <= right[k] && parts[i].x[2] <= pivot[2]) i += 1; - while (j >= left[k] && parts[j].x[2] > pivot[2]) j -= 1; - if (i < j) { - struct part temp = parts[i]; - parts[i] = parts[j]; - parts[j] = temp; - struct xpart xtemp = xparts[i]; - xparts[i] = xparts[j]; - xparts[j] = xtemp; + /* Run through the buckets, and swap particles to their correct spot. */ + for (int bucket = 0; bucket < 8; bucket++) { + for (int k = bucket_offset[bucket] + bucket_count[bucket]; + k < bucket_offset[bucket + 1]; k++) { + int bid = buff[k]; + if (bid != bucket) { + struct part part = parts[k]; + struct xpart xpart = xparts[k]; + while (bid != bucket) { + int j = bucket_offset[bid] + bucket_count[bid]++; + while (buff[j] == bid) { + j++; + bucket_count[bid]++; + } + memswap(&parts[j], &part, sizeof(struct part)); + memswap(&xparts[j], &xpart, sizeof(struct xpart)); + memswap(&buff[j], &bid, sizeof(int)); + } + parts[k] = part; + xparts[k] = xpart; + buff[k] = bid; } + bucket_count[bid]++; } - -#ifdef SWIFT_DEBUG_CHECKS - for (int kk = left[k]; kk <= j; kk++) - if (parts[kk].x[2] > pivot[2]) { - message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j); - error("sorting failed (left)."); - } - for (int kk = i; kk <= right[k]; kk++) - if (parts[kk].x[2] < pivot[2]) { - message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j); - error("sorting failed (right)."); - } -#endif - - left[2 * k + 1] = i; - right[2 * k + 1] = right[k]; - left[2 * k] = left[k]; - right[2 * k] = j; } /* Store the counts and offsets. */ for (int k = 0; k < 8; k++) { - c->progeny[k]->count = right[k] - left[k] + 1; - c->progeny[k]->parts = &c->parts[left[k]]; - c->progeny[k]->xparts = &c->xparts[left[k]]; + c->progeny[k]->count = bucket_count[k]; + c->progeny[k]->parts = &c->parts[bucket_offset[k]]; + c->progeny[k]->xparts = &c->xparts[bucket_offset[k]]; } /* Re-link the gparts. */ @@ -614,66 +567,51 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset) { #endif /* Now do the same song and dance for the gparts. */ - - /* Split along the x-axis. */ - i = 0; - j = gcount - 1; - while (i <= j) { - while (i <= gcount - 1 && gparts[i].x[0] <= pivot[0]) i += 1; - while (j >= 0 && gparts[j].x[0] > pivot[0]) j -= 1; - if (i < j) { - struct gpart gtemp = gparts[i]; - gparts[i] = gparts[j]; - gparts[j] = gtemp; - } + for (int k = 0; k < 8; k++) bucket_count[k] = 0; + + /* Fill the buffer with the indices. */ + for (int k = 0; k < gcount; k++) { + const int bid = (gparts[k].x[0] > pivot[0]) * 4 + + (gparts[k].x[1] > pivot[1]) * 2 + + (gparts[k].x[2] > pivot[2]); + bucket_count[bid]++; + buff[k] = bid; } - left[1] = i; - right[1] = gcount - 1; - left[0] = 0; - right[0] = j; - - /* Split along the y axis, twice. */ - for (int k = 1; k >= 0; k--) { - i = left[k]; - j = right[k]; - while (i <= j) { - while (i <= right[k] && gparts[i].x[1] <= pivot[1]) i += 1; - while (j >= left[k] && gparts[j].x[1] > pivot[1]) j -= 1; - if (i < j) { - struct gpart gtemp = gparts[i]; - gparts[i] = gparts[j]; - gparts[j] = gtemp; - } - } - left[2 * k + 1] = i; - right[2 * k + 1] = right[k]; - left[2 * k] = left[k]; - right[2 * k] = j; + + /* Set the buffer offsets. */ + bucket_offset[0] = 0; + for (int k = 1; k <= 8; k++) { + bucket_offset[k] = bucket_offset[k - 1] + bucket_count[k - 1]; + bucket_count[k - 1] = 0; } - /* Split along the z axis, four times. */ - for (int k = 3; k >= 0; k--) { - i = left[k]; - j = right[k]; - while (i <= j) { - while (i <= right[k] && gparts[i].x[2] <= pivot[2]) i += 1; - while (j >= left[k] && gparts[j].x[2] > pivot[2]) j -= 1; - if (i < j) { - struct gpart gtemp = gparts[i]; - gparts[i] = gparts[j]; - gparts[j] = gtemp; + /* Run through the buckets, and swap particles to their correct spot. */ + for (int bucket = 0; bucket < 8; bucket++) { + for (int k = bucket_offset[bucket] + bucket_count[bucket]; + k < bucket_offset[bucket + 1]; k++) { + int bid = buff[k]; + if (bid != bucket) { + struct gpart gpart = gparts[k]; + while (bid != bucket) { + int j = bucket_offset[bid] + bucket_count[bid]++; + while (buff[j] == bid) { + j++; + bucket_count[bid]++; + } + memswap(&gparts[j], &gpart, sizeof(struct gpart)); + memswap(&buff[j], &bid, sizeof(int)); + } + gparts[k] = gpart; + buff[k] = bid; } + bucket_count[bid]++; } - left[2 * k + 1] = i; - right[2 * k + 1] = right[k]; - left[2 * k] = left[k]; - right[2 * k] = j; } /* Store the counts and offsets. */ for (int k = 0; k < 8; k++) { - c->progeny[k]->gcount = right[k] - left[k] + 1; - c->progeny[k]->gparts = &c->gparts[left[k]]; + c->progeny[k]->gcount = bucket_count[k]; + c->progeny[k]->gparts = &c->gparts[bucket_offset[k]]; } /* Re-link the parts. */ diff --git a/src/cell.h b/src/cell.h index eefb2114cde1705ab4e4c59f4b837bddeb093e2b..2cd13cf2ab6b934f6aab84bcbacf510270892866 100644 --- a/src/cell.h +++ b/src/cell.h @@ -272,7 +272,7 @@ struct cell { ((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i))) /* Function prototypes. */ -void cell_split(struct cell *c, ptrdiff_t parts_offset); +void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff); void cell_sanitize(struct cell *c); int cell_locktree(struct cell *c); void cell_unlocktree(struct cell *c); diff --git a/src/memswap.h b/src/memswap.h new file mode 100644 index 0000000000000000000000000000000000000000..4643725535917952d12927d52187bc7306ced5ef --- /dev/null +++ b/src/memswap.h @@ -0,0 +1,79 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (c) 2016 Pedro Gonnet (pedro.gonnet@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + ******************************************************************************/ +#ifndef SWIFT_MEMSWAP_H +#define SWIFT_MEMSWAP_H + +/* Config parameters. */ +#include "../config.h" + +#ifdef HAVE_IMMINTRIN_H +/* Include the header file with the intrinsics for Intel architecture. */ +#include <immintrin.h> +#endif + +#ifdef HAVE_ALTIVEC_H +/* Include the header file with the intrinsics for Intel architecture. */ +#include <altivec.h> +#endif + +/* Macro for in-place swap of two values a and b of type t. */ +#define swap_loop(t, a, b, c) \ + while (c >= sizeof(t)) { \ + register t temp = *(t *)a; \ + *(t *)a = *(t *)b; \ + *(t *)b = temp; \ + a += sizeof(t); \ + b += sizeof(t); \ + bytes -= sizeof(t); \ + } + +/** + * @brief Swap the contents of two elements in-place. + * + * Keep in mind that this function works best when the underlying data + * is aligned to the vector length, e.g. with the @c + * __attribute__((aligned(32))) + * syntax, and the code is compiled with @c -funroll-loops. + * + * @param void_a Pointer to the first element. + * @param void_b Pointer to the second element. + * @param bytes Size, in bytes, of the data pointed to by @c a and @c b. + */ +__attribute__((always_inline)) inline void memswap(void *void_a, void *void_b, + size_t bytes) { + char *a = (char *)void_a, *b = (char *)void_b; +#ifdef __AVX512F__ + swap_loop(__m512i, a, b, bytes); +#endif +#ifdef __AVX__ + swap_loop(__m256i, a, b, bytes); +#endif +#ifdef __SSE2__ + swap_loop(__m128i, a, b, bytes); +#endif +#ifdef __ALTIVEC__ + swap_loop(vector int, a, b, bytes); +#endif + swap_loop(size_t, a, b, bytes); + swap_loop(int, a, b, bytes); + swap_loop(short, a, b, bytes); + swap_loop(char, a, b, bytes); +} + +#endif /* SWIFT_MEMSWAP_H */ diff --git a/src/space.c b/src/space.c index e04aad2207e736ac8fe4da28b7d22b7c7999765b..935677a9ebed97acfde8341ec1545ef4f33a56c0 100644 --- a/src/space.c +++ b/src/space.c @@ -49,6 +49,7 @@ #include "hydro.h" #include "kernel_hydro.h" #include "lock.h" +#include "memswap.h" #include "minmax.h" #include "runner.h" #include "threadpool.h" @@ -382,7 +383,7 @@ void space_regrid(struct space *s, int verbose) { /* Finished with these. */ free(oldnodeIDs); } -#endif +#endif /* WITH_MPI */ // message( "rebuilding upper-level cells took %.3f %s." , // clocks_from_ticks(double)(getticks() - tic), clocks_getunit()); @@ -453,21 +454,22 @@ void space_rebuild(struct space *s, int verbose) { struct cell *restrict cells_top = s->cells_top; const int ti_current = (s->e != NULL) ? s->e->ti_current : 0; - /* Run through the particles and get their cell index. */ - const size_t ind_size = s->size_parts; + /* Run through the particles and get their cell index. Allocates + an index that is larger than the number of particles to avoid + re-allocating after shuffling. */ + const size_t ind_size = s->size_parts + 100; int *ind; if ((ind = (int *)malloc(sizeof(int) * ind_size)) == NULL) error("Failed to allocate temporary particle indices."); - if (ind_size > 0) space_parts_get_cell_index(s, ind, cells_top, verbose); - for (size_t i = 0; i < s->nr_parts; ++i) cells_top[ind[i]].count++; + if (s->size_parts > 0) space_parts_get_cell_index(s, ind, cells_top, verbose); /* Run through the gravity particles and get their cell index. */ - const size_t gind_size = s->size_gparts; + const size_t gind_size = s->size_gparts + 100; int *gind; if ((gind = (int *)malloc(sizeof(int) * gind_size)) == NULL) error("Failed to allocate temporary g-particle indices."); - if (gind_size > 0) space_gparts_get_cell_index(s, gind, cells_top, verbose); - for (size_t i = 0; i < s->nr_gparts; ++i) cells_top[gind[i]].gcount++; + if (s->size_gparts > 0) + space_gparts_get_cell_index(s, gind, cells_top, verbose); #ifdef WITH_MPI @@ -475,7 +477,6 @@ void space_rebuild(struct space *s, int verbose) { const int local_nodeID = s->e->nodeID; for (size_t k = 0; k < nr_parts;) { if (cells_top[ind[k]].nodeID != local_nodeID) { - cells_top[ind[k]].count -= 1; nr_parts -= 1; const struct part tp = s->parts[k]; s->parts[k] = s->parts[nr_parts]; @@ -515,7 +516,6 @@ void space_rebuild(struct space *s, int verbose) { /* Move non-local gparts to the end of the list. */ for (size_t k = 0; k < nr_gparts;) { if (cells_top[gind[k]].nodeID != local_nodeID) { - cells_top[gind[k]].gcount -= 1; nr_gparts -= 1; const struct gpart tp = s->gparts[k]; s->gparts[k] = s->gparts[nr_gparts]; @@ -562,9 +562,9 @@ void space_rebuild(struct space *s, int verbose) { s->nr_gparts = nr_gparts + nr_gparts_exchanged; /* Re-allocate the index array if needed.. */ - if (s->nr_parts > ind_size) { + if (s->nr_parts + 1 > ind_size) { int *ind_new; - if ((ind_new = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL) + if ((ind_new = (int *)malloc(sizeof(int) * (s->nr_parts + 1))) == NULL) error("Failed to allocate temporary particle indices."); memcpy(ind_new, ind, sizeof(int) * nr_parts); free(ind); @@ -579,7 +579,6 @@ void space_rebuild(struct space *s, int verbose) { const struct part *const p = &s->parts[k]; ind[k] = cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]); - cells_top[ind[k]].count += 1; #ifdef SWIFT_DEBUG_CHECKS if (cells_top[ind[k]].nodeID != local_nodeID) error("Received part that does not belong to me (nodeID=%i).", @@ -591,7 +590,8 @@ void space_rebuild(struct space *s, int verbose) { #endif /* WITH_MPI */ /* Sort the parts according to their cells. */ - space_parts_sort(s, ind, nr_parts, 0, s->nr_cells - 1, verbose); + if (nr_parts > 0) + space_parts_sort(s, ind, nr_parts, 0, s->nr_cells - 1, verbose); /* Re-link the gparts. */ if (nr_parts > 0 && nr_gparts > 0) part_relink_gparts(s->parts, nr_parts, 0); @@ -609,15 +609,25 @@ void space_rebuild(struct space *s, int verbose) { } #endif + /* Extract the cell counts from the sorted indices. */ + size_t last_index = 0; + ind[nr_parts] = s->nr_cells; // sentinel. + for (size_t k = 0; k < nr_parts; k++) { + if (ind[k] < ind[k + 1]) { + cells_top[ind[k]].count = k - last_index + 1; + last_index = k + 1; + } + } + /* We no longer need the indices as of here. */ free(ind); #ifdef WITH_MPI /* Re-allocate the index array if needed.. */ - if (s->nr_gparts > gind_size) { + if (s->nr_gparts + 1 > gind_size) { int *gind_new; - if ((gind_new = (int *)malloc(sizeof(int) * s->nr_gparts)) == NULL) + if ((gind_new = (int *)malloc(sizeof(int) * (s->nr_gparts + 1))) == NULL) error("Failed to allocate temporary g-particle indices."); memcpy(gind_new, gind, sizeof(int) * nr_gparts); free(gind); @@ -629,12 +639,11 @@ void space_rebuild(struct space *s, int verbose) { const struct gpart *const p = &s->gparts[k]; gind[k] = cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]); - cells_top[gind[k]].gcount += 1; #ifdef SWIFT_DEBUG_CHECKS - if (cells_top[ind[k]].nodeID != s->e->nodeID) + if (cells_top[gind[k]].nodeID != s->e->nodeID) error("Received part that does not belong to me (nodeID=%i).", - cells_top[ind[k]].nodeID); + cells_top[gind[k]].nodeID); #endif } nr_gparts = s->nr_gparts; @@ -642,12 +651,23 @@ void space_rebuild(struct space *s, int verbose) { #endif /* Sort the gparts according to their cells. */ - space_gparts_sort(s, gind, nr_gparts, 0, s->nr_cells - 1, verbose); + if (nr_gparts > 0) + space_gparts_sort(s, gind, nr_gparts, 0, s->nr_cells - 1, verbose); /* Re-link the parts. */ if (nr_parts > 0 && nr_gparts > 0) part_relink_parts(s->gparts, nr_gparts, s->parts); + /* Extract the cell counts from the sorted indices. */ + size_t last_gindex = 0; + gind[nr_gparts] = s->nr_cells; + for (size_t k = 0; k < nr_gparts; k++) { + if (gind[k] < gind[k + 1]) { + cells_top[gind[k]].gcount = k - last_gindex + 1; + last_gindex = k + 1; + } + } + /* We no longer need the indices as of here. */ free(gind); @@ -1011,15 +1031,9 @@ void space_parts_sort_mapper(void *map_data, int num_elements, while (ii <= j && ind[ii] <= pivot) ii++; while (jj >= i && ind[jj] > pivot) jj--; if (ii < jj) { - size_t temp_i = ind[ii]; - ind[ii] = ind[jj]; - ind[jj] = temp_i; - struct part temp_p = parts[ii]; - parts[ii] = parts[jj]; - parts[jj] = temp_p; - struct xpart temp_xp = xparts[ii]; - xparts[ii] = xparts[jj]; - xparts[jj] = temp_xp; + memswap(&ind[ii], &ind[jj], sizeof(int)); + memswap(&parts[ii], &parts[jj], sizeof(struct part)); + memswap(&xparts[ii], &xparts[jj], sizeof(struct xpart)); } } @@ -1194,12 +1208,8 @@ void space_gparts_sort_mapper(void *map_data, int num_elements, while (ii <= j && ind[ii] <= pivot) ii++; while (jj >= i && ind[jj] > pivot) jj--; if (ii < jj) { - size_t temp_i = ind[ii]; - ind[ii] = ind[jj]; - ind[jj] = temp_i; - struct gpart temp_p = gparts[ii]; - gparts[ii] = gparts[jj]; - gparts[jj] = temp_p; + memswap(&ind[ii], &ind[jj], sizeof(int)); + memswap(&gparts[ii], &gparts[jj], sizeof(struct gpart)); } } @@ -1425,144 +1435,164 @@ void space_map_cells_pre(struct space *s, int full, } /** - * @brief #threadpool mapper function to split cells if they contain - * too many particles. + * @brief Recursively split a cell. * - * @param map_data Pointer towards the top-cells. - * @param num_cells The number of cells to treat. - * @param extra_data Pointers to the #space. + * @param s The #space in which the cell lives. + * @param c The #cell to split recursively. + * @param buff A buffer for particle sorting, should be of size at least + * max(c->count, c->gount) or @c NULL. */ -void space_split_mapper(void *map_data, int num_cells, void *extra_data) { - - /* Unpack the inputs. */ - struct space *s = (struct space *)extra_data; - struct cell *restrict cells_top = (struct cell *)map_data; +void space_split_recursive(struct space *s, struct cell *c, int *buff) { + + const int count = c->count; + const int gcount = c->gcount; + const int depth = c->depth; + int maxdepth = 0; + float h_max = 0.0f; + int ti_end_min = max_nr_timesteps, ti_end_max = 0; + struct cell *temp; + struct part *parts = c->parts; + struct gpart *gparts = c->gparts; + struct xpart *xparts = c->xparts; struct engine *e = s->e; - for (int ind = 0; ind < num_cells; ind++) { + /* If the buff is NULL, allocate it, and remember to free it. */ + const int allocate_buffer = (buff == NULL); + if (allocate_buffer && + (buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL) + error("Failed to allocate temporary indices."); - struct cell *c = &cells_top[ind]; + /* Check the depth. */ + while (depth > (maxdepth = s->maxdepth)) { + atomic_cas(&s->maxdepth, maxdepth, depth); + } - const int count = c->count; - const int gcount = c->gcount; - const int depth = c->depth; - int maxdepth = 0; - float h_max = 0.0f; - int ti_end_min = max_nr_timesteps, ti_end_max = 0; - struct cell *temp; - struct part *parts = c->parts; - struct gpart *gparts = c->gparts; - struct xpart *xparts = c->xparts; - - /* Check the depth. */ - while (depth > (maxdepth = s->maxdepth)) { - atomic_cas(&s->maxdepth, maxdepth, depth); - } + /* If the depth is too large, we have a problem and should stop. */ + if (maxdepth > space_cell_maxdepth) { + error("Exceeded maximum depth (%d) when splitting cells, aborting", + space_cell_maxdepth); + } - /* If the depth is too large, we have a problem and should stop. */ - if (maxdepth > space_cell_maxdepth) { - error("Exceeded maximum depth (%d) when splitting cells, aborting", - space_cell_maxdepth); + /* Split or let it be? */ + if (count > space_splitsize || gcount > space_splitsize) { + + /* No longer just a leaf. */ + c->split = 1; + + /* Create the cell's progeny. */ + for (int k = 0; k < 8; k++) { + temp = space_getcell(s); + temp->count = 0; + temp->gcount = 0; + temp->ti_old = e->ti_current; + temp->loc[0] = c->loc[0]; + temp->loc[1] = c->loc[1]; + temp->loc[2] = c->loc[2]; + temp->width[0] = c->width[0] / 2; + temp->width[1] = c->width[1] / 2; + temp->width[2] = c->width[2] / 2; + temp->dmin = c->dmin / 2; + if (k & 4) temp->loc[0] += temp->width[0]; + if (k & 2) temp->loc[1] += temp->width[1]; + if (k & 1) temp->loc[2] += temp->width[2]; + temp->depth = c->depth + 1; + temp->split = 0; + temp->h_max = 0.0; + temp->dx_max = 0.f; + temp->nodeID = c->nodeID; + temp->parent = c; + temp->super = NULL; + c->progeny[k] = temp; } - /* Split or let it be? */ - if (count > space_splitsize || gcount > space_splitsize) { - - /* No longer just a leaf. */ - c->split = 1; - - /* Create the cell's progeny. */ - for (int k = 0; k < 8; k++) { - temp = space_getcell(s); - temp->count = 0; - temp->gcount = 0; - temp->ti_old = e->ti_current; - temp->loc[0] = c->loc[0]; - temp->loc[1] = c->loc[1]; - temp->loc[2] = c->loc[2]; - temp->width[0] = c->width[0] / 2; - temp->width[1] = c->width[1] / 2; - temp->width[2] = c->width[2] / 2; - temp->dmin = c->dmin / 2; - if (k & 4) temp->loc[0] += temp->width[0]; - if (k & 2) temp->loc[1] += temp->width[1]; - if (k & 1) temp->loc[2] += temp->width[2]; - temp->depth = c->depth + 1; - temp->split = 0; - temp->h_max = 0.0; - temp->dx_max = 0.f; - temp->nodeID = c->nodeID; - temp->parent = c; - temp->super = NULL; - c->progeny[k] = temp; + /* Split the cell data. */ + cell_split(c, c->parts - s->parts, buff); + + /* Remove any progeny with zero parts. */ + for (int k = 0; k < 8; k++) + if (c->progeny[k]->count == 0 && c->progeny[k]->gcount == 0) { + space_recycle(s, c->progeny[k]); + c->progeny[k] = NULL; + } else { + space_split_recursive(s, c->progeny[k], buff); + h_max = max(h_max, c->progeny[k]->h_max); + ti_end_min = min(ti_end_min, c->progeny[k]->ti_end_min); + ti_end_max = max(ti_end_max, c->progeny[k]->ti_end_max); + if (c->progeny[k]->maxdepth > maxdepth) + maxdepth = c->progeny[k]->maxdepth; } - /* Split the cell data. */ - cell_split(c, c->parts - s->parts); - - /* Remove any progeny with zero parts. */ - for (int k = 0; k < 8; k++) - if (c->progeny[k]->count == 0 && c->progeny[k]->gcount == 0) { - space_recycle(s, c->progeny[k]); - c->progeny[k] = NULL; - } else { - space_split_mapper(c->progeny[k], 1, s); - h_max = max(h_max, c->progeny[k]->h_max); - ti_end_min = min(ti_end_min, c->progeny[k]->ti_end_min); - ti_end_max = max(ti_end_max, c->progeny[k]->ti_end_max); - if (c->progeny[k]->maxdepth > maxdepth) - maxdepth = c->progeny[k]->maxdepth; - } + } + /* Otherwise, collect the data for this cell. */ + else { + + /* Clear the progeny. */ + bzero(c->progeny, sizeof(struct cell *) * 8); + c->split = 0; + maxdepth = c->depth; + + /* Get dt_min/dt_max. */ + for (int k = 0; k < count; k++) { + struct part *p = &parts[k]; + struct xpart *xp = &xparts[k]; + const float h = p->h; + const int ti_end = p->ti_end; + xp->x_diff[0] = 0.f; + xp->x_diff[1] = 0.f; + xp->x_diff[2] = 0.f; + if (h > h_max) h_max = h; + if (ti_end < ti_end_min) ti_end_min = ti_end; + if (ti_end > ti_end_max) ti_end_max = ti_end; } - - /* Otherwise, collect the data for this cell. */ - else { - - /* Clear the progeny. */ - bzero(c->progeny, sizeof(struct cell *) * 8); - c->split = 0; - maxdepth = c->depth; - - /* Get dt_min/dt_max. */ - for (int k = 0; k < count; k++) { - struct part *p = &parts[k]; - struct xpart *xp = &xparts[k]; - const float h = p->h; - const int ti_end = p->ti_end; - xp->x_diff[0] = 0.f; - xp->x_diff[1] = 0.f; - xp->x_diff[2] = 0.f; - if (h > h_max) h_max = h; - if (ti_end < ti_end_min) ti_end_min = ti_end; - if (ti_end > ti_end_max) ti_end_max = ti_end; - } - for (int k = 0; k < gcount; k++) { - struct gpart *gp = &gparts[k]; - const int ti_end = gp->ti_end; - gp->x_diff[0] = 0.f; - gp->x_diff[1] = 0.f; - gp->x_diff[2] = 0.f; - if (ti_end < ti_end_min) ti_end_min = ti_end; - if (ti_end > ti_end_max) ti_end_max = ti_end; - } + for (int k = 0; k < gcount; k++) { + struct gpart *gp = &gparts[k]; + const int ti_end = gp->ti_end; + gp->x_diff[0] = 0.f; + gp->x_diff[1] = 0.f; + gp->x_diff[2] = 0.f; + if (ti_end < ti_end_min) ti_end_min = ti_end; + if (ti_end > ti_end_max) ti_end_max = ti_end; } + } - /* Set the values for this cell. */ - c->h_max = h_max; - c->ti_end_min = ti_end_min; - c->ti_end_max = ti_end_max; - c->maxdepth = maxdepth; - - /* Set ownership according to the start of the parts array. */ - if (s->nr_parts > 0) - c->owner = - ((c->parts - s->parts) % s->nr_parts) * s->nr_queues / s->nr_parts; - else if (s->nr_gparts > 0) - c->owner = ((c->gparts - s->gparts) % s->nr_gparts) * s->nr_queues / - s->nr_gparts; - else - c->owner = 0; /* Ok, there is really nothing on this rank... */ + /* Set the values for this cell. */ + c->h_max = h_max; + c->ti_end_min = ti_end_min; + c->ti_end_max = ti_end_max; + c->maxdepth = maxdepth; + + /* Set ownership according to the start of the parts array. */ + if (s->nr_parts > 0) + c->owner = + ((c->parts - s->parts) % s->nr_parts) * s->nr_queues / s->nr_parts; + else if (s->nr_gparts > 0) + c->owner = + ((c->gparts - s->gparts) % s->nr_gparts) * s->nr_queues / s->nr_gparts; + else + c->owner = 0; /* Ok, there is really nothing on this rank... */ + + /* Clean up. */ + if (allocate_buffer) free(buff); +} + +/** + * @brief #threadpool mapper function to split cells if they contain + * too many particles. + * + * @param map_data Pointer towards the top-cells. + * @param num_cells The number of cells to treat. + * @param extra_data Pointers to the #space. + */ +void space_split_mapper(void *map_data, int num_cells, void *extra_data) { + + /* Unpack the inputs. */ + struct space *s = (struct space *)extra_data; + struct cell *restrict cells_top = (struct cell *)map_data; + + for (int ind = 0; ind < num_cells; ind++) { + struct cell *c = &cells_top[ind]; + space_split_recursive(s, c, NULL); } #ifdef SWIFT_DEBUG_CHECKS @@ -1603,7 +1633,6 @@ void space_recycle(struct space *s, struct cell *c) { /* Unlock the space. */ lock_unlock_blind(&s->lock); } - /** * @brief Get a new empty (sub-)#cell. *