Commit 981be8bc authored by Peter W. Draper's avatar Peter W. Draper
Browse files

Merge branch 'master' into space_recycle

Conflicts:
  src/space.c
parents 75a9d6f7 290e85f7
......@@ -296,7 +296,6 @@ if test "$enable_san" = "yes"; then
fi
fi
# Autoconf stuff.
AC_PROG_INSTALL
AC_PROG_MAKE_SET
......@@ -473,8 +472,9 @@ if test "$ac_cv_header_fftw3_h" = "yes"; then
fi
AC_SUBST([FFTW_LIBS])
# Check for Intel intrinsics header optionally used by vector.h.
# Check for Intel and PowerPC intrinsics header optionally used by vector.h.
AC_CHECK_HEADERS([immintrin.h])
AC_CHECK_HEADERS([altivec.h])
# Check for timing functions needed by cycle.h.
AC_HEADER_TIME
......
......@@ -45,6 +45,9 @@
#define ENGINE_POLICY engine_policy_none
#endif
/* Global profiler. */
struct profiler prof;
/**
* @brief Help messages for the command line parameters.
*/
......
......@@ -44,8 +44,7 @@ include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \
common_io.h single_io.h multipole.h map.h tools.h partition.h clocks.h parser.h \
physical_constants.h physical_constants_cgs.h potential.h version.h \
hydro_properties.h riemann.h threadpool.h cooling.h cooling_struct.h sourceterms.h \
sourceterms_struct.h statistics.h
sourceterms_struct.h statistics.h memswap.h profiler.h
# Common source files
AM_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \
......@@ -54,7 +53,7 @@ AM_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \
kernel_hydro.c tools.c part.c partition.c clocks.c parser.c \
physical_constants.c potential.c hydro_properties.c \
runner_doiact_fft.c threadpool.c cooling.c sourceterms.c \
statistics.c
statistics.c profiler.c
# Include files for distribution, not installation.
nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h kernel_hydro.h kernel_gravity.h \
......@@ -83,7 +82,8 @@ nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h
potential/softened_isothermal/potential.h \
cooling/none/cooling.h cooling/none/cooling_struct.h \
cooling/const_du/cooling.h cooling/const_du/cooling_struct.h \
cooling/const_lambda/cooling.h cooling/const_lambda/cooling_struct.h
cooling/const_lambda/cooling.h cooling/const_lambda/cooling_struct.h \
memswap.h
# Sources and flags for regular library
......
......@@ -53,6 +53,8 @@
#include "gravity.h"
#include "hydro.h"
#include "hydro_properties.h"
#include "memswap.h"
#include "minmax.h"
#include "scheduler.h"
#include "space.h"
#include "timers.h"
......@@ -463,122 +465,73 @@ void cell_gunlocktree(struct cell *c) {
* @param c The #cell array to be sorted.
* @param parts_offset Offset of the cell parts array relative to the
* space's parts array, i.e. c->parts - s->parts.
* @param buff A buffer with at least max(c->count, c->gcount) entries,
* used for sorting indices.
*/
void cell_split(struct cell *c, ptrdiff_t parts_offset) {
void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
int i, j;
const int count = c->count, gcount = c->gcount;
struct part *parts = c->parts;
struct xpart *xparts = c->xparts;
struct gpart *gparts = c->gparts;
int left[8], right[8];
double pivot[3];
/* Init the pivots. */
for (int k = 0; k < 3; k++) pivot[k] = c->loc[k] + c->width[k] / 2;
/* Split along the x-axis. */
i = 0;
j = count - 1;
while (i <= j) {
while (i <= count - 1 && parts[i].x[0] <= pivot[0]) i += 1;
while (j >= 0 && parts[j].x[0] > pivot[0]) j -= 1;
if (i < j) {
struct part temp = parts[i];
parts[i] = parts[j];
parts[j] = temp;
struct xpart xtemp = xparts[i];
xparts[i] = xparts[j];
xparts[j] = xtemp;
}
const double pivot[3] = {c->loc[0] + c->width[0] / 2,
c->loc[1] + c->width[1] / 2,
c->loc[2] + c->width[2] / 2};
int bucket_count[8] = {0, 0, 0, 0, 0, 0, 0, 0};
int bucket_offset[9];
/* If the buff is NULL, allocate it, and remember to free it. */
const int allocate_buffer = (buff == NULL);
if (allocate_buffer &&
(buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL)
error("Failed to allocate temporary indices.");
/* Fill the buffer with the indices. */
for (int k = 0; k < count; k++) {
const int bid = (parts[k].x[0] > pivot[0]) * 4 +
(parts[k].x[1] > pivot[1]) * 2 + (parts[k].x[2] > pivot[2]);
bucket_count[bid]++;
buff[k] = bid;
}
#ifdef SWIFT_DEBUG_CHECKS
for (int k = 0; k <= j; k++)
if (parts[k].x[0] > pivot[0]) error("cell_split: sorting failed.");
for (int k = i; k < count; k++)
if (parts[k].x[0] < pivot[0]) error("cell_split: sorting failed.");
#endif
left[1] = i;
right[1] = count - 1;
left[0] = 0;
right[0] = j;
/* Split along the y axis, twice. */
for (int k = 1; k >= 0; k--) {
i = left[k];
j = right[k];
while (i <= j) {
while (i <= right[k] && parts[i].x[1] <= pivot[1]) i += 1;
while (j >= left[k] && parts[j].x[1] > pivot[1]) j -= 1;
if (i < j) {
struct part temp = parts[i];
parts[i] = parts[j];
parts[j] = temp;
struct xpart xtemp = xparts[i];
xparts[i] = xparts[j];
xparts[j] = xtemp;
}
}
#ifdef SWIFT_DEBUG_CHECKS
for (int kk = left[k]; kk <= j; kk++)
if (parts[kk].x[1] > pivot[1]) {
message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j);
error("sorting failed (left).");
}
for (int kk = i; kk <= right[k]; kk++)
if (parts[kk].x[1] < pivot[1]) error("sorting failed (right).");
#endif
left[2 * k + 1] = i;
right[2 * k + 1] = right[k];
left[2 * k] = left[k];
right[2 * k] = j;
/* Set the buffer offsets. */
bucket_offset[0] = 0;
for (int k = 1; k <= 8; k++) {
bucket_offset[k] = bucket_offset[k - 1] + bucket_count[k - 1];
bucket_count[k - 1] = 0;
}
/* Split along the z axis, four times. */
for (int k = 3; k >= 0; k--) {
i = left[k];
j = right[k];
while (i <= j) {
while (i <= right[k] && parts[i].x[2] <= pivot[2]) i += 1;
while (j >= left[k] && parts[j].x[2] > pivot[2]) j -= 1;
if (i < j) {
struct part temp = parts[i];
parts[i] = parts[j];
parts[j] = temp;
struct xpart xtemp = xparts[i];
xparts[i] = xparts[j];
xparts[j] = xtemp;
/* Run through the buckets, and swap particles to their correct spot. */
for (int bucket = 0; bucket < 8; bucket++) {
for (int k = bucket_offset[bucket] + bucket_count[bucket];
k < bucket_offset[bucket + 1]; k++) {
int bid = buff[k];
if (bid != bucket) {
struct part part = parts[k];
struct xpart xpart = xparts[k];
while (bid != bucket) {
int j = bucket_offset[bid] + bucket_count[bid]++;
while (buff[j] == bid) {
j++;
bucket_count[bid]++;
}
memswap(&parts[j], &part, sizeof(struct part));
memswap(&xparts[j], &xpart, sizeof(struct xpart));
memswap(&buff[j], &bid, sizeof(int));
}
parts[k] = part;
xparts[k] = xpart;
buff[k] = bid;
}
bucket_count[bid]++;
}
#ifdef SWIFT_DEBUG_CHECKS
for (int kk = left[k]; kk <= j; kk++)
if (parts[kk].x[2] > pivot[2]) {
message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j);
error("sorting failed (left).");
}
for (int kk = i; kk <= right[k]; kk++)
if (parts[kk].x[2] < pivot[2]) {
message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j);
error("sorting failed (right).");
}
#endif
left[2 * k + 1] = i;
right[2 * k + 1] = right[k];
left[2 * k] = left[k];
right[2 * k] = j;
}
/* Store the counts and offsets. */
for (int k = 0; k < 8; k++) {
c->progeny[k]->count = right[k] - left[k] + 1;
c->progeny[k]->parts = &c->parts[left[k]];
c->progeny[k]->xparts = &c->xparts[left[k]];
c->progeny[k]->count = bucket_count[k];
c->progeny[k]->parts = &c->parts[bucket_offset[k]];
c->progeny[k]->xparts = &c->xparts[bucket_offset[k]];
}
/* Re-link the gparts. */
......@@ -614,66 +567,51 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset) {
#endif
/* Now do the same song and dance for the gparts. */
/* Split along the x-axis. */
i = 0;
j = gcount - 1;
while (i <= j) {
while (i <= gcount - 1 && gparts[i].x[0] <= pivot[0]) i += 1;
while (j >= 0 && gparts[j].x[0] > pivot[0]) j -= 1;
if (i < j) {
struct gpart gtemp = gparts[i];
gparts[i] = gparts[j];
gparts[j] = gtemp;
}
for (int k = 0; k < 8; k++) bucket_count[k] = 0;
/* Fill the buffer with the indices. */
for (int k = 0; k < gcount; k++) {
const int bid = (gparts[k].x[0] > pivot[0]) * 4 +
(gparts[k].x[1] > pivot[1]) * 2 +
(gparts[k].x[2] > pivot[2]);
bucket_count[bid]++;
buff[k] = bid;
}
left[1] = i;
right[1] = gcount - 1;
left[0] = 0;
right[0] = j;
/* Split along the y axis, twice. */
for (int k = 1; k >= 0; k--) {
i = left[k];
j = right[k];
while (i <= j) {
while (i <= right[k] && gparts[i].x[1] <= pivot[1]) i += 1;
while (j >= left[k] && gparts[j].x[1] > pivot[1]) j -= 1;
if (i < j) {
struct gpart gtemp = gparts[i];
gparts[i] = gparts[j];
gparts[j] = gtemp;
}
}
left[2 * k + 1] = i;
right[2 * k + 1] = right[k];
left[2 * k] = left[k];
right[2 * k] = j;
/* Set the buffer offsets. */
bucket_offset[0] = 0;
for (int k = 1; k <= 8; k++) {
bucket_offset[k] = bucket_offset[k - 1] + bucket_count[k - 1];
bucket_count[k - 1] = 0;
}
/* Split along the z axis, four times. */
for (int k = 3; k >= 0; k--) {
i = left[k];
j = right[k];
while (i <= j) {
while (i <= right[k] && gparts[i].x[2] <= pivot[2]) i += 1;
while (j >= left[k] && gparts[j].x[2] > pivot[2]) j -= 1;
if (i < j) {
struct gpart gtemp = gparts[i];
gparts[i] = gparts[j];
gparts[j] = gtemp;
/* Run through the buckets, and swap particles to their correct spot. */
for (int bucket = 0; bucket < 8; bucket++) {
for (int k = bucket_offset[bucket] + bucket_count[bucket];
k < bucket_offset[bucket + 1]; k++) {
int bid = buff[k];
if (bid != bucket) {
struct gpart gpart = gparts[k];
while (bid != bucket) {
int j = bucket_offset[bid] + bucket_count[bid]++;
while (buff[j] == bid) {
j++;
bucket_count[bid]++;
}
memswap(&gparts[j], &gpart, sizeof(struct gpart));
memswap(&buff[j], &bid, sizeof(int));
}
gparts[k] = gpart;
buff[k] = bid;
}
bucket_count[bid]++;
}
left[2 * k + 1] = i;
right[2 * k + 1] = right[k];
left[2 * k] = left[k];
right[2 * k] = j;
}
/* Store the counts and offsets. */
for (int k = 0; k < 8; k++) {
c->progeny[k]->gcount = right[k] - left[k] + 1;
c->progeny[k]->gparts = &c->gparts[left[k]];
c->progeny[k]->gcount = bucket_count[k];
c->progeny[k]->gparts = &c->gparts[bucket_offset[k]];
}
/* Re-link the parts. */
......
......@@ -272,7 +272,7 @@ struct cell {
((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i)))
/* Function prototypes. */
void cell_split(struct cell *c, ptrdiff_t parts_offset);
void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff);
void cell_sanitize(struct cell *c);
int cell_locktree(struct cell *c);
void cell_unlocktree(struct cell *c);
......
......@@ -59,6 +59,7 @@
#include "parallel_io.h"
#include "part.h"
#include "partition.h"
#include "profiler.h"
#include "proxy.h"
#include "runner.h"
#include "serial_io.h"
......@@ -317,6 +318,23 @@ void engine_redistribute(struct engine *e) {
MPI_COMM_WORLD) != MPI_SUCCESS)
error("Failed to allreduce particle transfer counts.");
/* Report how many particles will be moved. */
if (e->verbose) {
if (e->nodeID == 0) {
size_t total = 0;
size_t unmoved = 0;
for (int p = 0, r = 0; p < nr_nodes; p++) {
for (int s = 0; s < nr_nodes; s++) {
total += counts[r];
if (p == s) unmoved += counts[r];
r++;
}
}
message("%ld of %ld (%.2f%%) of particles moved", total - unmoved,
total, 100.0 * (double)(total - unmoved) / (double)total);
}
}
/* Get all the g_counts from all the nodes. */
if (MPI_Allreduce(MPI_IN_PLACE, g_counts, nr_nodes * nr_nodes, MPI_INT,
MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS)
......
/*******************************************************************************
* This file is part of SWIFT.
* Copyright (c) 2016 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
******************************************************************************/
#ifndef SWIFT_MEMSWAP_H
#define SWIFT_MEMSWAP_H
/* Config parameters. */
#include "../config.h"
#ifdef HAVE_IMMINTRIN_H
/* Include the header file with the intrinsics for Intel architecture. */
#include <immintrin.h>
#endif
#ifdef HAVE_ALTIVEC_H
/* Include the header file with the intrinsics for Intel architecture. */
#include <altivec.h>
#endif
/* Macro for in-place swap of two values a and b of type t. */
#define swap_loop(t, a, b, c) \
while (c >= sizeof(t)) { \
register t temp = *(t *)a; \
*(t *)a = *(t *)b; \
*(t *)b = temp; \
a += sizeof(t); \
b += sizeof(t); \
bytes -= sizeof(t); \
}
/**
* @brief Swap the contents of two elements in-place.
*
* Keep in mind that this function works best when the underlying data
* is aligned to the vector length, e.g. with the @c
* __attribute__((aligned(32)))
* syntax, and the code is compiled with @c -funroll-loops.
*
* @param void_a Pointer to the first element.
* @param void_b Pointer to the second element.
* @param bytes Size, in bytes, of the data pointed to by @c a and @c b.
*/
__attribute__((always_inline)) inline void memswap(void *void_a, void *void_b,
size_t bytes) {
char *a = (char *)void_a, *b = (char *)void_b;
#ifdef __AVX512F__
swap_loop(__m512i, a, b, bytes);
#endif
#ifdef __AVX__
swap_loop(__m256i, a, b, bytes);
#endif
#ifdef __SSE2__
swap_loop(__m128i, a, b, bytes);
#endif
#ifdef __ALTIVEC__
swap_loop(vector int, a, b, bytes);
#endif
swap_loop(size_t, a, b, bytes);
swap_loop(int, a, b, bytes);
swap_loop(short, a, b, bytes);
swap_loop(char, a, b, bytes);
}
#endif /* SWIFT_MEMSWAP_H */
......@@ -278,6 +278,18 @@ static void split_metis(struct space *s, int nregions, int *celllist) {
#endif
#if defined(WITH_MPI) && defined(HAVE_METIS)
/* qsort support. */
struct indexval {
int index;
int count;
};
static int indexvalcmp(const void *p1, const void *p2) {
const struct indexval *iv1 = (const struct indexval *)p1;
const struct indexval *iv2 = (const struct indexval *)p2;
return iv2->count - iv1->count;
}
/**
* @brief Partition the given space into a number of connected regions.
*
......@@ -382,14 +394,70 @@ static void pick_metis(struct space *s, int nregions, int *vertexw, int *edgew,
if (regionid[k] < 0 || regionid[k] >= nregions)
error("Got bad nodeID %" PRIDX " for cell %i.", regionid[k], k);
/* We want a solution in which the current regions of the space are
* preserved when possible, to avoid unneccesary particle movement.
* So create a 2d-array of cells counts that are common to all pairs
* of old and new ranks. Each element of the array has a cell count and
* an unique index so we can sort into decreasing counts. */
int indmax = nregions * nregions;
struct indexval *ivs = malloc(sizeof(struct indexval) * indmax);
bzero(ivs, sizeof(struct indexval) * indmax);
for (int k = 0; k < ncells; k++) {
int index = regionid[k] + nregions * s->cells_top[k].nodeID;
ivs[index].count++;
ivs[index].index = index;
}
qsort(ivs, indmax, sizeof(struct indexval), indexvalcmp);
/* Go through the ivs using the largest counts first, these are the
* regions with the most cells in common, old partition to new. */
int *oldmap = malloc(sizeof(int) * nregions);
int *newmap = malloc(sizeof(int) * nregions);
for (int k = 0; k < nregions; k++) {
oldmap[k] = -1;
newmap[k] = -1;
}
for (int k = 0; k < indmax; k++) {
/* Stop when all regions with common cells have been considered. */
if (ivs[k].count == 0) break;
/* Store old and new IDs, if not already used. */
int oldregion = ivs[k].index / nregions;
int newregion = ivs[k].index - oldregion * nregions;
if (newmap[newregion] == -1 && oldmap[oldregion] == -1) {
newmap[newregion] = oldregion;
oldmap[oldregion] = newregion;
}
}
/* Handle any regions that did not get selected by picking an unused rank
* from oldmap and assigning to newmap. */
int spare = 0;
for (int k = 0; k < nregions; k++) {
if (newmap[k] == -1) {
for (int j = spare; j < nregions; j++) {
if (oldmap[j] == -1) {
newmap[k] = j;
oldmap[j] = j;
spare = j;
break;
}
}
}
}
/* Set the cell list to the region index. */
for (int k = 0; k < ncells; k++) {
celllist[k] = regionid[k];
celllist[k] = newmap[regionid[k]];
}
/* Clean up. */
if (weights_v != NULL) free(weights_v);
if (weights_e != NULL) free(weights_e);
free(ivs);
free(oldmap);
free(newmap);
free(xadj);
free(adjncy);
free(regionid);
......
/*******************************************************************************
* This file is part of SWIFT.
* Copyright (c) 2016 James S. Willis (james.s.willis@durham.ac.uk)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
******************************************************************************/
/* Config parameters. */
#include "../config.h"
/* Some standard headers. */
#include <string.h>
/* This object's header. */
#include "profiler.h"
/* Local includes */
#include "clocks.h"
#include "hydro.h"
#include "version.h"
/**
* @brief Resets all timers.
*
* @param profiler #profiler object that holds file pointers and
* function timers.
*/
void profiler_reset_timers(struct profiler *profiler) {
profiler->collect_timesteps_time = 0;
profiler->drift_time = 0;
profiler->rebuild_time = 0;
profiler->reweight_time = 0;
profiler->clear_waits_time = 0;
profiler->re_wait_time = 0;
profiler->enqueue_time = 0;
profiler->stats_time = 0;
profiler->launch_time = 0;
profiler->space_rebuild_time = 0;
profiler->engine_maketasks_time = 0;
profiler->engine_marktasks_time = 0;
profiler->space_regrid_time = 0;