Commit 4f696e9b authored by Pedro Gonnet's avatar Pedro Gonnet
Browse files

use a one-pass bucket sort to split cells.

parent a5900a34
......@@ -52,6 +52,7 @@
#include "gravity.h"
#include "hydro.h"
#include "hydro_properties.h"
#include "minmax.h"
#include "scheduler.h"
#include "space.h"
#include "timers.h"
......@@ -462,122 +463,79 @@ void cell_gunlocktree(struct cell *c) {
* @param c The #cell array to be sorted.
* @param parts_offset Offset of the cell parts array relative to the
* space's parts array, i.e. c->parts - s->parts.
* @param buff A buffer with at least max(c->count, c->gcount) entries,
* used for sorting indices.
*/
void cell_split(struct cell *c, ptrdiff_t parts_offset) {
void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
int i, j;
const int count = c->count, gcount = c->gcount;
struct part *parts = c->parts;
struct xpart *xparts = c->xparts;
struct gpart *gparts = c->gparts;
int left[8], right[8];
double pivot[3];
/* Init the pivots. */
for (int k = 0; k < 3; k++) pivot[k] = c->loc[k] + c->width[k] / 2;
/* Split along the x-axis. */
i = 0;
j = count - 1;
while (i <= j) {
while (i <= count - 1 && parts[i].x[0] <= pivot[0]) i += 1;
while (j >= 0 && parts[j].x[0] > pivot[0]) j -= 1;
if (i < j) {
struct part temp = parts[i];
parts[i] = parts[j];
parts[j] = temp;
struct xpart xtemp = xparts[i];
xparts[i] = xparts[j];
xparts[j] = xtemp;
}
const double pivot[3] = {c->loc[0] + c->width[0] / 2,
c->loc[1] + c->width[1] / 2,
c->loc[2] + c->width[2] / 2};
int bucket_count[8] = {0, 0, 0, 0, 0, 0, 0, 0};
int bucket_offset[9];
/* If the buff is NULL, allocate it, and remember to free it. */
const int allocate_buffer = (buff == NULL);
if (allocate_buffer &&
(buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL)
error("Failed to allocate temporary indices.");
/* Fill the buffer with the indices. */
for (int k = 0; k < count; k++) {
const int bid = (parts[k].x[0] > pivot[0]) * 4 +
(parts[k].x[1] > pivot[1]) * 2 + (parts[k].x[2] > pivot[2]);
bucket_count[bid]++;
buff[k] = bid;
}
#ifdef SWIFT_DEBUG_CHECKS
for (int k = 0; k <= j; k++)
if (parts[k].x[0] > pivot[0]) error("cell_split: sorting failed.");
for (int k = i; k < count; k++)
if (parts[k].x[0] < pivot[0]) error("cell_split: sorting failed.");
#endif
left[1] = i;
right[1] = count - 1;
left[0] = 0;
right[0] = j;
/* Split along the y axis, twice. */
for (int k = 1; k >= 0; k--) {
i = left[k];
j = right[k];
while (i <= j) {
while (i <= right[k] && parts[i].x[1] <= pivot[1]) i += 1;
while (j >= left[k] && parts[j].x[1] > pivot[1]) j -= 1;
if (i < j) {
struct part temp = parts[i];
parts[i] = parts[j];
parts[j] = temp;
struct xpart xtemp = xparts[i];
xparts[i] = xparts[j];
xparts[j] = xtemp;
}
}
#ifdef SWIFT_DEBUG_CHECKS
for (int kk = left[k]; kk <= j; kk++)
if (parts[kk].x[1] > pivot[1]) {
message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j);
error("sorting failed (left).");
}
for (int kk = i; kk <= right[k]; kk++)
if (parts[kk].x[1] < pivot[1]) error("sorting failed (right).");
#endif
left[2 * k + 1] = i;
right[2 * k + 1] = right[k];
left[2 * k] = left[k];
right[2 * k] = j;
/* Set the buffer offsets. */
bucket_offset[0] = 0;
for (int k = 1; k <= 8; k++) {
bucket_offset[k] = bucket_offset[k - 1] + bucket_count[k - 1];
bucket_count[k - 1] = 0;
}
/* Split along the z axis, four times. */
for (int k = 3; k >= 0; k--) {
i = left[k];
j = right[k];
while (i <= j) {
while (i <= right[k] && parts[i].x[2] <= pivot[2]) i += 1;
while (j >= left[k] && parts[j].x[2] > pivot[2]) j -= 1;
if (i < j) {
struct part temp = parts[i];
parts[i] = parts[j];
parts[j] = temp;
struct xpart xtemp = xparts[i];
xparts[i] = xparts[j];
xparts[j] = xtemp;
/* Run through the buckets, and swap particles to their correct spot. */
for (int bucket = 0; bucket < 8; bucket++) {
for (int k = bucket_offset[bucket] + bucket_count[bucket];
k < bucket_offset[bucket + 1]; k++) {
int bid = buff[k];
if (bid != bucket) {
struct part part = parts[k];
struct xpart xpart = xparts[k];
while (bid != bucket) {
int j = bucket_offset[bid] + bucket_count[bid]++;
while (buff[j] == bid) {
j++;
bucket_count[bid]++;
}
struct part temp_part = parts[j];
struct xpart temp_xpart = xparts[j];
int temp_buff = buff[j];
parts[j] = part;
xparts[j] = xpart;
buff[j] = bid;
part = temp_part;
xpart = temp_xpart;
bid = temp_buff;
}
parts[k] = part;
xparts[k] = xpart;
buff[k] = bid;
}
bucket_count[bid]++;
}
#ifdef SWIFT_DEBUG_CHECKS
for (int kk = left[k]; kk <= j; kk++)
if (parts[kk].x[2] > pivot[2]) {
message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j);
error("sorting failed (left).");
}
for (int kk = i; kk <= right[k]; kk++)
if (parts[kk].x[2] < pivot[2]) {
message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j);
error("sorting failed (right).");
}
#endif
left[2 * k + 1] = i;
right[2 * k + 1] = right[k];
left[2 * k] = left[k];
right[2 * k] = j;
}
/* Store the counts and offsets. */
for (int k = 0; k < 8; k++) {
c->progeny[k]->count = right[k] - left[k] + 1;
c->progeny[k]->parts = &c->parts[left[k]];
c->progeny[k]->xparts = &c->xparts[left[k]];
c->progeny[k]->count = bucket_count[k];
c->progeny[k]->parts = &c->parts[bucket_offset[k]];
c->progeny[k]->xparts = &c->xparts[bucket_offset[k]];
}
/* Re-link the gparts. */
......@@ -613,66 +571,55 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset) {
#endif
/* Now do the same song and dance for the gparts. */
/* Split along the x-axis. */
i = 0;
j = gcount - 1;
while (i <= j) {
while (i <= gcount - 1 && gparts[i].x[0] <= pivot[0]) i += 1;
while (j >= 0 && gparts[j].x[0] > pivot[0]) j -= 1;
if (i < j) {
struct gpart gtemp = gparts[i];
gparts[i] = gparts[j];
gparts[j] = gtemp;
}
for (int k = 0; k < 8; k++) bucket_count[k] = 0;
/* Fill the buffer with the indices. */
for (int k = 0; k < gcount; k++) {
const int bid = (gparts[k].x[0] > pivot[0]) * 4 +
(gparts[k].x[1] > pivot[1]) * 2 +
(gparts[k].x[2] > pivot[2]);
bucket_count[bid]++;
buff[k] = bid;
}
left[1] = i;
right[1] = gcount - 1;
left[0] = 0;
right[0] = j;
/* Split along the y axis, twice. */
for (int k = 1; k >= 0; k--) {
i = left[k];
j = right[k];
while (i <= j) {
while (i <= right[k] && gparts[i].x[1] <= pivot[1]) i += 1;
while (j >= left[k] && gparts[j].x[1] > pivot[1]) j -= 1;
if (i < j) {
struct gpart gtemp = gparts[i];
gparts[i] = gparts[j];
gparts[j] = gtemp;
}
}
left[2 * k + 1] = i;
right[2 * k + 1] = right[k];
left[2 * k] = left[k];
right[2 * k] = j;
/* Set the buffer offsets. */
bucket_offset[0] = 0;
for (int k = 1; k <= 8; k++) {
bucket_offset[k] = bucket_offset[k - 1] + bucket_count[k - 1];
bucket_count[k - 1] = 0;
}
/* Split along the z axis, four times. */
for (int k = 3; k >= 0; k--) {
i = left[k];
j = right[k];
while (i <= j) {
while (i <= right[k] && gparts[i].x[2] <= pivot[2]) i += 1;
while (j >= left[k] && gparts[j].x[2] > pivot[2]) j -= 1;
if (i < j) {
struct gpart gtemp = gparts[i];
gparts[i] = gparts[j];
gparts[j] = gtemp;
/* Run through the buckets, and swap particles to their correct spot. */
for (int bucket = 0; bucket < 8; bucket++) {
for (int k = bucket_offset[bucket] + bucket_count[bucket];
k < bucket_offset[bucket + 1]; k++) {
int bid = buff[k];
if (bid != bucket) {
struct gpart gpart = gparts[k];
while (bid != bucket) {
int j = bucket_offset[bid] + bucket_count[bid]++;
while (buff[j] == bid) {
j++;
bucket_count[bid]++;
}
struct gpart temp_gpart = gparts[j];
int temp_buff = buff[j];
gparts[j] = gpart;
buff[j] = bid;
gpart = temp_gpart;
bid = temp_buff;
}
gparts[k] = gpart;
buff[k] = bid;
}
bucket_count[bid]++;
}
left[2 * k + 1] = i;
right[2 * k + 1] = right[k];
left[2 * k] = left[k];
right[2 * k] = j;
}
/* Store the counts and offsets. */
for (int k = 0; k < 8; k++) {
c->progeny[k]->gcount = right[k] - left[k] + 1;
c->progeny[k]->gparts = &c->gparts[left[k]];
c->progeny[k]->gcount = bucket_count[k];
c->progeny[k]->gparts = &c->gparts[bucket_offset[k]];
}
/* Re-link the parts. */
......
......@@ -271,7 +271,7 @@ struct cell {
((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i)))
/* Function prototypes. */
void cell_split(struct cell *c, ptrdiff_t parts_offset);
void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff);
void cell_sanitize(struct cell *c);
int cell_locktree(struct cell *c);
void cell_unlocktree(struct cell *c);
......
......@@ -602,7 +602,8 @@ void space_rebuild(struct space *s, int verbose) {
#endif /* WITH_MPI */
/* Sort the parts according to their cells. */
if(nr_parts > 0) space_parts_sort(s, ind, nr_parts, 0, s->nr_cells - 1, verbose);
if (nr_parts > 0)
space_parts_sort(s, ind, nr_parts, 0, s->nr_cells - 1, verbose);
/* Re-link the gparts. */
if (nr_parts > 0 && nr_gparts > 0) part_relink_gparts(s->parts, nr_parts, 0);
......@@ -662,7 +663,8 @@ void space_rebuild(struct space *s, int verbose) {
#endif
/* Sort the gparts according to their cells. */
if(nr_gparts > 0) space_gparts_sort(s, gind, nr_gparts, 0, s->nr_cells - 1, verbose);
if (nr_gparts > 0)
space_gparts_sort(s, gind, nr_gparts, 0, s->nr_cells - 1, verbose);
/* Re-link the parts. */
if (nr_parts > 0 && nr_gparts > 0)
......@@ -1455,144 +1457,164 @@ void space_map_cells_pre(struct space *s, int full,
}
/**
* @brief #threadpool mapper function to split cells if they contain
* too many particles.
* @brief Recursively split a cell.
*
* @param map_data Pointer towards the top-cells.
* @param num_cells The number of cells to treat.
* @param extra_data Pointers to the #space.
* @param s The #space in which the cell lives.
* @param c The #cell to split recursively.
* @param buff A buffer for particle sorting, should be of size at least
* max(c->count, c->gount) or @c NULL.
*/
void space_split_mapper(void *map_data, int num_cells, void *extra_data) {
/* Unpack the inputs. */
struct space *s = (struct space *)extra_data;
struct cell *restrict cells_top = (struct cell *)map_data;
void space_split_recursive(struct space *s, struct cell *c, int *buff) {
const int count = c->count;
const int gcount = c->gcount;
const int depth = c->depth;
int maxdepth = 0;
float h_max = 0.0f;
int ti_end_min = max_nr_timesteps, ti_end_max = 0;
struct cell *temp;
struct part *parts = c->parts;
struct gpart *gparts = c->gparts;
struct xpart *xparts = c->xparts;
struct engine *e = s->e;
for (int ind = 0; ind < num_cells; ind++) {
/* If the buff is NULL, allocate it, and remember to free it. */
const int allocate_buffer = (buff == NULL);
if (allocate_buffer &&
(buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL)
error("Failed to allocate temporary indices.");
struct cell *c = &cells_top[ind];
/* Check the depth. */
while (depth > (maxdepth = s->maxdepth)) {
atomic_cas(&s->maxdepth, maxdepth, depth);
}
const int count = c->count;
const int gcount = c->gcount;
const int depth = c->depth;
int maxdepth = 0;
float h_max = 0.0f;
int ti_end_min = max_nr_timesteps, ti_end_max = 0;
struct cell *temp;
struct part *parts = c->parts;
struct gpart *gparts = c->gparts;
struct xpart *xparts = c->xparts;
/* Check the depth. */
while (depth > (maxdepth = s->maxdepth)) {
atomic_cas(&s->maxdepth, maxdepth, depth);
}
/* If the depth is too large, we have a problem and should stop. */
if (maxdepth > space_cell_maxdepth) {
error("Exceeded maximum depth (%d) when splitting cells, aborting",
space_cell_maxdepth);
}
/* If the depth is too large, we have a problem and should stop. */
if (maxdepth > space_cell_maxdepth) {
error("Exceeded maximum depth (%d) when splitting cells, aborting",
space_cell_maxdepth);
/* Split or let it be? */
if (count > space_splitsize || gcount > space_splitsize) {
/* No longer just a leaf. */
c->split = 1;
/* Create the cell's progeny. */
for (int k = 0; k < 8; k++) {
temp = space_getcell(s);
temp->count = 0;
temp->gcount = 0;
temp->ti_old = e->ti_current;
temp->loc[0] = c->loc[0];
temp->loc[1] = c->loc[1];
temp->loc[2] = c->loc[2];
temp->width[0] = c->width[0] / 2;
temp->width[1] = c->width[1] / 2;
temp->width[2] = c->width[2] / 2;
temp->dmin = c->dmin / 2;
if (k & 4) temp->loc[0] += temp->width[0];
if (k & 2) temp->loc[1] += temp->width[1];
if (k & 1) temp->loc[2] += temp->width[2];
temp->depth = c->depth + 1;
temp->split = 0;
temp->h_max = 0.0;
temp->dx_max = 0.f;
temp->nodeID = c->nodeID;
temp->parent = c;
temp->super = NULL;
c->progeny[k] = temp;
}
/* Split or let it be? */
if (count > space_splitsize || gcount > space_splitsize) {
/* No longer just a leaf. */
c->split = 1;
/* Create the cell's progeny. */
for (int k = 0; k < 8; k++) {
temp = space_getcell(s);
temp->count = 0;
temp->gcount = 0;
temp->ti_old = e->ti_current;
temp->loc[0] = c->loc[0];
temp->loc[1] = c->loc[1];
temp->loc[2] = c->loc[2];
temp->width[0] = c->width[0] / 2;
temp->width[1] = c->width[1] / 2;
temp->width[2] = c->width[2] / 2;
temp->dmin = c->dmin / 2;
if (k & 4) temp->loc[0] += temp->width[0];
if (k & 2) temp->loc[1] += temp->width[1];
if (k & 1) temp->loc[2] += temp->width[2];
temp->depth = c->depth + 1;
temp->split = 0;
temp->h_max = 0.0;
temp->dx_max = 0.f;
temp->nodeID = c->nodeID;
temp->parent = c;
temp->super = NULL;
c->progeny[k] = temp;
/* Split the cell data. */
cell_split(c, c->parts - s->parts, buff);
/* Remove any progeny with zero parts. */
for (int k = 0; k < 8; k++)
if (c->progeny[k]->count == 0 && c->progeny[k]->gcount == 0) {
space_recycle(s, c->progeny[k]);
c->progeny[k] = NULL;
} else {
space_split_recursive(s, c->progeny[k], buff);
h_max = max(h_max, c->progeny[k]->h_max);
ti_end_min = min(ti_end_min, c->progeny[k]->ti_end_min);
ti_end_max = max(ti_end_max, c->progeny[k]->ti_end_max);
if (c->progeny[k]->maxdepth > maxdepth)
maxdepth = c->progeny[k]->maxdepth;
}
/* Split the cell data. */
cell_split(c, c->parts - s->parts, NULL);
/* Remove any progeny with zero parts. */
for (int k = 0; k < 8; k++)
if (c->progeny[k]->count == 0 && c->progeny[k]->gcount == 0) {
space_recycle(s, c->progeny[k]);
c->progeny[k] = NULL;
} else {
space_split_mapper(c->progeny[k], 1, s);
h_max = max(h_max, c->progeny[k]->h_max);
ti_end_min = min(ti_end_min, c->progeny[k]->ti_end_min);
ti_end_max = max(ti_end_max, c->progeny[k]->ti_end_max);
if (c->progeny[k]->maxdepth > maxdepth)
maxdepth = c->progeny[k]->maxdepth;
}
}
/* Otherwise, collect the data for this cell. */
else {
/* Clear the progeny. */
bzero(c->progeny, sizeof(struct cell *) * 8);
c->split = 0;
maxdepth = c->depth;
/* Get dt_min/dt_max. */
for (int k = 0; k < count; k++) {
struct part *p = &parts[k];
struct xpart *xp = &xparts[k];
const float h = p->h;
const int ti_end = p->ti_end;
xp->x_diff[0] = 0.f;
xp->x_diff[1] = 0.f;
xp->x_diff[2] = 0.f;
if (h > h_max) h_max = h;
if (ti_end < ti_end_min) ti_end_min = ti_end;
if (ti_end > ti_end_max) ti_end_max = ti_end;
}
/* Otherwise, collect the data for this cell. */
else {
/* Clear the progeny. */
bzero(c->progeny, sizeof(struct cell *) * 8);
c->split = 0;
maxdepth = c->depth;
/* Get dt_min/dt_max. */
for (int k = 0; k < count; k++) {
struct part *p = &parts[k];
struct xpart *xp = &xparts[k];
const float h = p->h;
const int ti_end = p->ti_end;
xp->x_diff[0] = 0.f;
xp->x_diff[1] = 0.f;
xp->x_diff[2] = 0.f;
if (h > h_max) h_max = h;
if (ti_end < ti_end_min) ti_end_min = ti_end;
if (ti_end > ti_end_max) ti_end_max = ti_end;
}
for (int k = 0; k < gcount; k++) {
struct gpart *gp = &gparts[k];
const int ti_end = gp->ti_end;
gp->x_diff[0] = 0.f;
gp->x_diff[1] = 0.f;
gp->x_diff[2] = 0.f;
if (ti_end < ti_end_min) ti_end_min = ti_end;
if (ti_end > ti_end_max) ti_end_max = ti_end;
}
for (int k = 0; k < gcount; k++) {
struct gpart *gp = &gparts[k];
const int ti_end = gp->ti_end;
gp->x_diff[0] = 0.f;
gp->x_diff[1] = 0.f;
gp->x_diff[2] = 0.f;
if (ti_end < ti_end_min) ti_end_min = ti_end;
if (ti_end > ti_end_max) ti_end_max = ti_end;
}
}
/* Set the values for this cell. */
c->h_max = h_max;
c->ti_end_min = ti_end_min;
c->ti_end_max = ti_end_max;
c->maxdepth = maxdepth;
/* Set ownership according to the start of the parts array. */
if (s->nr_parts > 0)
c->owner =
((c->parts - s->parts) % s->nr_parts) * s->nr_queues / s->nr_parts;
else if (s->nr_gparts > 0)
c->owner = ((c->gparts - s->gparts) % s->nr_gparts) * s->nr_queues /
s->nr_gparts;
else
c->owner = 0; /* Ok, there is really nothing on this rank... */
/* Set the values for this cell. */
c->h_max = h_max;
c->ti_end_min = ti_end_min;
c->ti_end_max = ti_end_max;
c->maxdepth = maxdepth;
/* Set ownership according to the start of the parts array. */
if (s->nr_parts > 0)
c->owner =
((c->parts - s->parts) % s->nr_parts) * s->nr_queues / s->nr_parts;
else if (s->nr_gparts > 0)
c->owner =
((c->gparts - s->gparts) % s->nr_gparts) * s->nr_queues / s->nr_gparts;
else
c->owner = 0; /* Ok, there is really nothing on this rank... */
/* Clean up. */
if (allocate_buffer) free(buff);
}
/**
* @brief #threadpool mapper function to split cells if they contain
* too many particles.
*