Commit 6ae05a1f authored by Peter W. Draper's avatar Peter W. Draper
Browse files

Merge branch 'space_rebuild' into 'master'

Space rebuild

Implements #243. space_split with bucketsort instead of quicksort. Reduce the amount of swaps needed to sort the particles into the correct progeny, should be faster and scale better.

See merge request !284
parents 19d8695c dec3b22e
......@@ -296,7 +296,6 @@ if test "$enable_san" = "yes"; then
fi
fi
# Autoconf stuff.
AC_PROG_INSTALL
AC_PROG_MAKE_SET
......@@ -473,8 +472,9 @@ if test "$ac_cv_header_fftw3_h" = "yes"; then
fi
AC_SUBST([FFTW_LIBS])
# Check for Intel intrinsics header optionally used by vector.h.
# Check for Intel and PowerPC intrinsics header optionally used by vector.h.
AC_CHECK_HEADERS([immintrin.h])
AC_CHECK_HEADERS([altivec.h])
# Check for timing functions needed by cycle.h.
AC_HEADER_TIME
......
......@@ -44,7 +44,7 @@ include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \
common_io.h single_io.h multipole.h map.h tools.h partition.h clocks.h parser.h \
physical_constants.h physical_constants_cgs.h potential.h version.h \
hydro_properties.h riemann.h threadpool.h cooling.h cooling_struct.h sourceterms.h \
sourceterms_struct.h statistics.h
sourceterms_struct.h statistics.h memswap.h
# Common source files
......@@ -83,7 +83,8 @@ nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h
potential/softened_isothermal/potential.h \
cooling/none/cooling.h cooling/none/cooling_struct.h \
cooling/const_du/cooling.h cooling/const_du/cooling_struct.h \
cooling/const_lambda/cooling.h cooling/const_lambda/cooling_struct.h
cooling/const_lambda/cooling.h cooling/const_lambda/cooling_struct.h \
memswap.h
# Sources and flags for regular library
......
......@@ -53,6 +53,8 @@
#include "gravity.h"
#include "hydro.h"
#include "hydro_properties.h"
#include "memswap.h"
#include "minmax.h"
#include "scheduler.h"
#include "space.h"
#include "timers.h"
......@@ -463,122 +465,73 @@ void cell_gunlocktree(struct cell *c) {
* @param c The #cell array to be sorted.
* @param parts_offset Offset of the cell parts array relative to the
* space's parts array, i.e. c->parts - s->parts.
* @param buff A buffer with at least max(c->count, c->gcount) entries,
* used for sorting indices.
*/
void cell_split(struct cell *c, ptrdiff_t parts_offset) {
void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
int i, j;
const int count = c->count, gcount = c->gcount;
struct part *parts = c->parts;
struct xpart *xparts = c->xparts;
struct gpart *gparts = c->gparts;
int left[8], right[8];
double pivot[3];
/* Init the pivots. */
for (int k = 0; k < 3; k++) pivot[k] = c->loc[k] + c->width[k] / 2;
/* Split along the x-axis. */
i = 0;
j = count - 1;
while (i <= j) {
while (i <= count - 1 && parts[i].x[0] <= pivot[0]) i += 1;
while (j >= 0 && parts[j].x[0] > pivot[0]) j -= 1;
if (i < j) {
struct part temp = parts[i];
parts[i] = parts[j];
parts[j] = temp;
struct xpart xtemp = xparts[i];
xparts[i] = xparts[j];
xparts[j] = xtemp;
}
const double pivot[3] = {c->loc[0] + c->width[0] / 2,
c->loc[1] + c->width[1] / 2,
c->loc[2] + c->width[2] / 2};
int bucket_count[8] = {0, 0, 0, 0, 0, 0, 0, 0};
int bucket_offset[9];
/* If the buff is NULL, allocate it, and remember to free it. */
const int allocate_buffer = (buff == NULL);
if (allocate_buffer &&
(buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL)
error("Failed to allocate temporary indices.");
/* Fill the buffer with the indices. */
for (int k = 0; k < count; k++) {
const int bid = (parts[k].x[0] > pivot[0]) * 4 +
(parts[k].x[1] > pivot[1]) * 2 + (parts[k].x[2] > pivot[2]);
bucket_count[bid]++;
buff[k] = bid;
}
#ifdef SWIFT_DEBUG_CHECKS
for (int k = 0; k <= j; k++)
if (parts[k].x[0] > pivot[0]) error("cell_split: sorting failed.");
for (int k = i; k < count; k++)
if (parts[k].x[0] < pivot[0]) error("cell_split: sorting failed.");
#endif
left[1] = i;
right[1] = count - 1;
left[0] = 0;
right[0] = j;
/* Split along the y axis, twice. */
for (int k = 1; k >= 0; k--) {
i = left[k];
j = right[k];
while (i <= j) {
while (i <= right[k] && parts[i].x[1] <= pivot[1]) i += 1;
while (j >= left[k] && parts[j].x[1] > pivot[1]) j -= 1;
if (i < j) {
struct part temp = parts[i];
parts[i] = parts[j];
parts[j] = temp;
struct xpart xtemp = xparts[i];
xparts[i] = xparts[j];
xparts[j] = xtemp;
}
}
#ifdef SWIFT_DEBUG_CHECKS
for (int kk = left[k]; kk <= j; kk++)
if (parts[kk].x[1] > pivot[1]) {
message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j);
error("sorting failed (left).");
}
for (int kk = i; kk <= right[k]; kk++)
if (parts[kk].x[1] < pivot[1]) error("sorting failed (right).");
#endif
left[2 * k + 1] = i;
right[2 * k + 1] = right[k];
left[2 * k] = left[k];
right[2 * k] = j;
/* Set the buffer offsets. */
bucket_offset[0] = 0;
for (int k = 1; k <= 8; k++) {
bucket_offset[k] = bucket_offset[k - 1] + bucket_count[k - 1];
bucket_count[k - 1] = 0;
}
/* Split along the z axis, four times. */
for (int k = 3; k >= 0; k--) {
i = left[k];
j = right[k];
while (i <= j) {
while (i <= right[k] && parts[i].x[2] <= pivot[2]) i += 1;
while (j >= left[k] && parts[j].x[2] > pivot[2]) j -= 1;
if (i < j) {
struct part temp = parts[i];
parts[i] = parts[j];
parts[j] = temp;
struct xpart xtemp = xparts[i];
xparts[i] = xparts[j];
xparts[j] = xtemp;
/* Run through the buckets, and swap particles to their correct spot. */
for (int bucket = 0; bucket < 8; bucket++) {
for (int k = bucket_offset[bucket] + bucket_count[bucket];
k < bucket_offset[bucket + 1]; k++) {
int bid = buff[k];
if (bid != bucket) {
struct part part = parts[k];
struct xpart xpart = xparts[k];
while (bid != bucket) {
int j = bucket_offset[bid] + bucket_count[bid]++;
while (buff[j] == bid) {
j++;
bucket_count[bid]++;
}
memswap(&parts[j], &part, sizeof(struct part));
memswap(&xparts[j], &xpart, sizeof(struct xpart));
memswap(&buff[j], &bid, sizeof(int));
}
parts[k] = part;
xparts[k] = xpart;
buff[k] = bid;
}
bucket_count[bid]++;
}
#ifdef SWIFT_DEBUG_CHECKS
for (int kk = left[k]; kk <= j; kk++)
if (parts[kk].x[2] > pivot[2]) {
message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j);
error("sorting failed (left).");
}
for (int kk = i; kk <= right[k]; kk++)
if (parts[kk].x[2] < pivot[2]) {
message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j);
error("sorting failed (right).");
}
#endif
left[2 * k + 1] = i;
right[2 * k + 1] = right[k];
left[2 * k] = left[k];
right[2 * k] = j;
}
/* Store the counts and offsets. */
for (int k = 0; k < 8; k++) {
c->progeny[k]->count = right[k] - left[k] + 1;
c->progeny[k]->parts = &c->parts[left[k]];
c->progeny[k]->xparts = &c->xparts[left[k]];
c->progeny[k]->count = bucket_count[k];
c->progeny[k]->parts = &c->parts[bucket_offset[k]];
c->progeny[k]->xparts = &c->xparts[bucket_offset[k]];
}
/* Re-link the gparts. */
......@@ -614,66 +567,51 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset) {
#endif
/* Now do the same song and dance for the gparts. */
/* Split along the x-axis. */
i = 0;
j = gcount - 1;
while (i <= j) {
while (i <= gcount - 1 && gparts[i].x[0] <= pivot[0]) i += 1;
while (j >= 0 && gparts[j].x[0] > pivot[0]) j -= 1;
if (i < j) {
struct gpart gtemp = gparts[i];
gparts[i] = gparts[j];
gparts[j] = gtemp;
}
for (int k = 0; k < 8; k++) bucket_count[k] = 0;
/* Fill the buffer with the indices. */
for (int k = 0; k < gcount; k++) {
const int bid = (gparts[k].x[0] > pivot[0]) * 4 +
(gparts[k].x[1] > pivot[1]) * 2 +
(gparts[k].x[2] > pivot[2]);
bucket_count[bid]++;
buff[k] = bid;
}
left[1] = i;
right[1] = gcount - 1;
left[0] = 0;
right[0] = j;
/* Split along the y axis, twice. */
for (int k = 1; k >= 0; k--) {
i = left[k];
j = right[k];
while (i <= j) {
while (i <= right[k] && gparts[i].x[1] <= pivot[1]) i += 1;
while (j >= left[k] && gparts[j].x[1] > pivot[1]) j -= 1;
if (i < j) {
struct gpart gtemp = gparts[i];
gparts[i] = gparts[j];
gparts[j] = gtemp;
}
}
left[2 * k + 1] = i;
right[2 * k + 1] = right[k];
left[2 * k] = left[k];
right[2 * k] = j;
/* Set the buffer offsets. */
bucket_offset[0] = 0;
for (int k = 1; k <= 8; k++) {
bucket_offset[k] = bucket_offset[k - 1] + bucket_count[k - 1];
bucket_count[k - 1] = 0;
}
/* Split along the z axis, four times. */
for (int k = 3; k >= 0; k--) {
i = left[k];
j = right[k];
while (i <= j) {
while (i <= right[k] && gparts[i].x[2] <= pivot[2]) i += 1;
while (j >= left[k] && gparts[j].x[2] > pivot[2]) j -= 1;
if (i < j) {
struct gpart gtemp = gparts[i];
gparts[i] = gparts[j];
gparts[j] = gtemp;
/* Run through the buckets, and swap particles to their correct spot. */
for (int bucket = 0; bucket < 8; bucket++) {
for (int k = bucket_offset[bucket] + bucket_count[bucket];
k < bucket_offset[bucket + 1]; k++) {
int bid = buff[k];
if (bid != bucket) {
struct gpart gpart = gparts[k];
while (bid != bucket) {
int j = bucket_offset[bid] + bucket_count[bid]++;
while (buff[j] == bid) {
j++;
bucket_count[bid]++;
}
memswap(&gparts[j], &gpart, sizeof(struct gpart));
memswap(&buff[j], &bid, sizeof(int));
}
gparts[k] = gpart;
buff[k] = bid;
}
bucket_count[bid]++;
}
left[2 * k + 1] = i;
right[2 * k + 1] = right[k];
left[2 * k] = left[k];
right[2 * k] = j;
}
/* Store the counts and offsets. */
for (int k = 0; k < 8; k++) {
c->progeny[k]->gcount = right[k] - left[k] + 1;
c->progeny[k]->gparts = &c->gparts[left[k]];
c->progeny[k]->gcount = bucket_count[k];
c->progeny[k]->gparts = &c->gparts[bucket_offset[k]];
}
/* Re-link the parts. */
......
......@@ -272,7 +272,7 @@ struct cell {
((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i)))
/* Function prototypes. */
void cell_split(struct cell *c, ptrdiff_t parts_offset);
void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff);
void cell_sanitize(struct cell *c);
int cell_locktree(struct cell *c);
void cell_unlocktree(struct cell *c);
......
/*******************************************************************************
* This file is part of SWIFT.
* Copyright (c) 2016 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
******************************************************************************/
#ifndef SWIFT_MEMSWAP_H
#define SWIFT_MEMSWAP_H
/* Config parameters. */
#include "../config.h"
#ifdef HAVE_IMMINTRIN_H
/* Include the header file with the intrinsics for Intel architecture. */
#include <immintrin.h>
#endif
#ifdef HAVE_ALTIVEC_H
/* Include the header file with the intrinsics for Intel architecture. */
#include <altivec.h>
#endif
/* Macro for in-place swap of two values a and b of type t. */
#define swap_loop(t, a, b, c) \
while (c >= sizeof(t)) { \
register t temp = *(t *)a; \
*(t *)a = *(t *)b; \
*(t *)b = temp; \
a += sizeof(t); \
b += sizeof(t); \
bytes -= sizeof(t); \
}
/**
* @brief Swap the contents of two elements in-place.
*
* Keep in mind that this function works best when the underlying data
* is aligned to the vector length, e.g. with the @c
* __attribute__((aligned(32)))
* syntax, and the code is compiled with @c -funroll-loops.
*
* @param void_a Pointer to the first element.
* @param void_b Pointer to the second element.
* @param bytes Size, in bytes, of the data pointed to by @c a and @c b.
*/
__attribute__((always_inline)) inline void memswap(void *void_a, void *void_b,
size_t bytes) {
char *a = (char *)void_a, *b = (char *)void_b;
#ifdef __AVX512F__
swap_loop(__m512i, a, b, bytes);
#endif
#ifdef __AVX__
swap_loop(__m256i, a, b, bytes);
#endif
#ifdef __SSE2__
swap_loop(__m128i, a, b, bytes);
#endif
#ifdef __ALTIVEC__
swap_loop(vector int, a, b, bytes);
#endif
swap_loop(size_t, a, b, bytes);
swap_loop(int, a, b, bytes);
swap_loop(short, a, b, bytes);
swap_loop(char, a, b, bytes);
}
#endif /* SWIFT_MEMSWAP_H */
......@@ -49,6 +49,7 @@
#include "hydro.h"
#include "kernel_hydro.h"
#include "lock.h"
#include "memswap.h"
#include "minmax.h"
#include "runner.h"
#include "threadpool.h"
......@@ -382,7 +383,7 @@ void space_regrid(struct space *s, int verbose) {
/* Finished with these. */
free(oldnodeIDs);
}
#endif
#endif /* WITH_MPI */
// message( "rebuilding upper-level cells took %.3f %s." ,
// clocks_from_ticks(double)(getticks() - tic), clocks_getunit());
......@@ -453,21 +454,22 @@ void space_rebuild(struct space *s, int verbose) {
struct cell *restrict cells_top = s->cells_top;
const int ti_current = (s->e != NULL) ? s->e->ti_current : 0;
/* Run through the particles and get their cell index. */
const size_t ind_size = s->size_parts;
/* Run through the particles and get their cell index. Allocates
an index that is larger than the number of particles to avoid
re-allocating after shuffling. */
const size_t ind_size = s->size_parts + 100;
int *ind;
if ((ind = (int *)malloc(sizeof(int) * ind_size)) == NULL)
error("Failed to allocate temporary particle indices.");
if (ind_size > 0) space_parts_get_cell_index(s, ind, cells_top, verbose);
for (size_t i = 0; i < s->nr_parts; ++i) cells_top[ind[i]].count++;
if (s->size_parts > 0) space_parts_get_cell_index(s, ind, cells_top, verbose);
/* Run through the gravity particles and get their cell index. */
const size_t gind_size = s->size_gparts;
const size_t gind_size = s->size_gparts + 100;
int *gind;
if ((gind = (int *)malloc(sizeof(int) * gind_size)) == NULL)
error("Failed to allocate temporary g-particle indices.");
if (gind_size > 0) space_gparts_get_cell_index(s, gind, cells_top, verbose);
for (size_t i = 0; i < s->nr_gparts; ++i) cells_top[gind[i]].gcount++;
if (s->size_gparts > 0)
space_gparts_get_cell_index(s, gind, cells_top, verbose);
#ifdef WITH_MPI
......@@ -475,7 +477,6 @@ void space_rebuild(struct space *s, int verbose) {
const int local_nodeID = s->e->nodeID;
for (size_t k = 0; k < nr_parts;) {
if (cells_top[ind[k]].nodeID != local_nodeID) {
cells_top[ind[k]].count -= 1;
nr_parts -= 1;
const struct part tp = s->parts[k];
s->parts[k] = s->parts[nr_parts];
......@@ -515,7 +516,6 @@ void space_rebuild(struct space *s, int verbose) {
/* Move non-local gparts to the end of the list. */
for (size_t k = 0; k < nr_gparts;) {
if (cells_top[gind[k]].nodeID != local_nodeID) {
cells_top[gind[k]].gcount -= 1;
nr_gparts -= 1;
const struct gpart tp = s->gparts[k];
s->gparts[k] = s->gparts[nr_gparts];
......@@ -562,9 +562,9 @@ void space_rebuild(struct space *s, int verbose) {
s->nr_gparts = nr_gparts + nr_gparts_exchanged;
/* Re-allocate the index array if needed.. */
if (s->nr_parts > ind_size) {
if (s->nr_parts + 1 > ind_size) {
int *ind_new;
if ((ind_new = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL)
if ((ind_new = (int *)malloc(sizeof(int) * (s->nr_parts + 1))) == NULL)
error("Failed to allocate temporary particle indices.");
memcpy(ind_new, ind, sizeof(int) * nr_parts);
free(ind);
......@@ -579,7 +579,6 @@ void space_rebuild(struct space *s, int verbose) {
const struct part *const p = &s->parts[k];
ind[k] =
cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]);
cells_top[ind[k]].count += 1;
#ifdef SWIFT_DEBUG_CHECKS
if (cells_top[ind[k]].nodeID != local_nodeID)
error("Received part that does not belong to me (nodeID=%i).",
......@@ -591,7 +590,8 @@ void space_rebuild(struct space *s, int verbose) {
#endif /* WITH_MPI */
/* Sort the parts according to their cells. */
space_parts_sort(s, ind, nr_parts, 0, s->nr_cells - 1, verbose);
if (nr_parts > 0)
space_parts_sort(s, ind, nr_parts, 0, s->nr_cells - 1, verbose);
/* Re-link the gparts. */
if (nr_parts > 0 && nr_gparts > 0) part_relink_gparts(s->parts, nr_parts, 0);
......@@ -609,15 +609,25 @@ void space_rebuild(struct space *s, int verbose) {
}
#endif
/* Extract the cell counts from the sorted indices. */
size_t last_index = 0;
ind[nr_parts] = s->nr_cells; // sentinel.
for (size_t k = 0; k < nr_parts; k++) {
if (ind[k] < ind[k + 1]) {
cells_top[ind[k]].count = k - last_index + 1;
last_index = k + 1;
}
}
/* We no longer need the indices as of here. */
free(ind);
#ifdef WITH_MPI
/* Re-allocate the index array if needed.. */
if (s->nr_gparts > gind_size) {
if (s->nr_gparts + 1 > gind_size) {
int *gind_new;
if ((gind_new = (int *)malloc(sizeof(int) * s->nr_gparts)) == NULL)
if ((gind_new = (int *)malloc(sizeof(int) * (s->nr_gparts + 1))) == NULL)
error("Failed to allocate temporary g-particle indices.");
memcpy(gind_new, gind, sizeof(int) * nr_gparts);
free(gind);
......@@ -629,12 +639,11 @@ void space_rebuild(struct space *s, int verbose) {
const struct gpart *const p = &s->gparts[k];
gind[k] =
cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]);
cells_top[gind[k]].gcount += 1;
#ifdef SWIFT_DEBUG_CHECKS
if (cells_top[ind[k]].nodeID != s->e->nodeID)
if (cells_top[gind[k]].nodeID != s->e->nodeID)
error("Received part that does not belong to me (nodeID=%i).",
cells_top[ind[k]].nodeID);
cells_top[gind[k]].nodeID);
#endif
}
nr_gparts = s->nr_gparts;
......@@ -642,12 +651,23 @@ void space_rebuild(struct space *s, int verbose) {
#endif
/* Sort the gparts according to their cells. */
space_gparts_sort(s, gind, nr_gparts, 0, s->nr_cells - 1, verbose);
if (nr_gparts > 0)
space_gparts_sort(s, gind, nr_gparts, 0, s->nr_cells - 1, verbose);
/* Re-link the parts. */
if (nr_parts > 0 && nr_gparts > 0)
part_relink_parts(s->gparts, nr_gparts, s->parts);
/* Extract the cell counts from the sorted indices. */
size_t last_gindex = 0;
gind[nr_gparts] = s->nr_cells;
for (size_t k = 0; k < nr_gparts; k++) {
if (gind[k] < gind[k + 1]) {
cells_top[gind[k]].gcount = k - last_gindex + 1;
last_gindex = k + 1;
}
}
/* We no longer need the indices as of here. */
free(gind);
......@@ -1011,15 +1031,9 @@ void space_parts_sort_mapper(void *map_data, int num_elements,
while (ii <= j && ind[ii] <= pivot) ii++;
while (jj >= i && ind[jj] > pivot) jj--;
if (ii < jj) {
size_t temp_i = ind[ii];
ind[ii] = ind[jj];
ind[jj] = temp_i;
struct part temp_p = parts[ii];
parts[ii] = parts[jj];
parts[jj] = temp_p;
struct xpart temp_xp = xparts[ii];
xparts[ii] = xparts[jj];
xparts[jj] = temp_xp;
memswap(&ind[ii], &ind[jj], sizeof(int));
memswap(&parts[ii], &parts[jj], sizeof(struct part));
memswap(&xparts[ii], &xparts[jj], sizeof(struct xpart));
}
}
......@@ -1194,12 +1208,8 @@ void space_gparts_sort_mapper(void *map_data, int num_elements,
while (ii <= j && ind[ii] <= pivot) ii++;
while (jj >= i && ind[jj] > pivot) jj--;
if (ii < jj) {
size_t temp_i = ind[ii];
ind[ii] = ind[jj];
ind[jj] = temp_i;
struct gpart temp_p = gparts[ii];
gparts[ii] = gparts[jj];
gparts[jj] = temp_p;
memswap(&ind[ii], &ind[jj], sizeof(int));
memswap(&gparts[ii], &gparts[jj], sizeof(struct gpart));
}
}
......@@ -1425,144 +1435,164 @@ void space_map_cells_pre(struct space *s, int full,
}
/**