Commit d6590e40 authored by James Willis

Merge branch 'master' into AVX512-Fixes

Conflicts:
	tests/testPair.c
parents 75d4e1e9 23710c75
......@@ -26,7 +26,7 @@ Statistics:
# Parameters for the self-gravity scheme
Gravity:
eta: 0.025 # Constant dimensionless multiplier for time integration.
epsilon: 0.0001 # Softening length (in internal units).
epsilon: 0.001 # Softening length (in internal units).
theta: 0.7 # Opening angle (Multipole acceptance criterion)
# Parameters for the hydrodynamics scheme
......
......@@ -12,6 +12,9 @@ TimeIntegration:
time_end: 1e-2 # The end time of the simulation (in internal units).
dt_min: 1e-10 # The minimal time-step size of the simulation (in internal units).
dt_max: 1e-4 # The maximal time-step size of the simulation (in internal units).
Scheduler:
cell_split_size: 64
# Parameters governing the snapshots
Snapshots:
......
......@@ -9,7 +9,7 @@ InternalUnitSystem:
# Parameters governing the time integration
TimeIntegration:
time_begin: 0. # The starting time of the simulation (in internal units).
time_end: 10. # The end time of the simulation (in internal units).
time_end: 1000. # The end time of the simulation (in internal units).
dt_min: 1e-6 # The minimal time-step size of the simulation (in internal units).
dt_max: 1e-2 # The maximal time-step size of the simulation (in internal units).
......@@ -21,12 +21,11 @@ Snapshots:
# Parameters governing the conserved quantities statistics
Statistics:
delta_time: 1e-3 # Time between statistics output
delta_time: 1. # Time between statistics output
# Parameters for the hydrodynamics scheme
SPH:
resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48 Ngbs with the cubic spline kernel).
delta_neighbours: 0.1 # The tolerance for the targeted number of neighbours.
CFL_condition: 0.1 # Courant-Friedrichs-Lewy condition for time integration.
# Parameters related to the initial conditions
......
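Editorial aside (not part of the commit) on the resolution_eta comment above: with the smoothing length defined as h = eta (m/rho)^(1/3) and a kernel support radius of gamma_K * h, the target neighbour count follows from the volume of the support sphere,

\[ N_{\rm ngb} = \frac{4\pi}{3}\,(\gamma_K\,\eta)^3 \approx 48 \quad \text{for } \eta = 1.2348,\ \gamma_K \simeq 1.826 \ \text{(3D cubic-spline kernel)}, \]

which is consistent with the "48 Ngbs" quoted in the parameter files.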
......@@ -9,9 +9,9 @@ InternalUnitSystem:
# Parameters governing the time integration
TimeIntegration:
time_begin: 0. # The starting time of the simulation (in internal units).
time_end: 1. # The end time of the simulation (in internal units).
time_end: 1000 # The end time of the simulation (in internal units).
dt_min: 1e-6 # The minimal time-step size of the simulation (in internal units).
dt_max: 1e-3 # The maximal time-step size of the simulation (in internal units).
dt_max: 1e-2 # The maximal time-step size of the simulation (in internal units).
# Parameters governing the snapshots
Snapshots:
......@@ -21,12 +21,11 @@ Snapshots:
# Parameters governing the conserved quantities statistics
Statistics:
delta_time: 1e-3 # Time between statistics output
delta_time: 1. # Time between statistics output
# Parameters for the hydrodynamics scheme
SPH:
resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48 Ngbs with the cubic spline kernel).
delta_neighbours: 0.1 # The tolerance for the targeted number of neighbours.
CFL_condition: 0.1 # Courant-Friedrichs-Lewy condition for time integration.
# Parameters related to the initial conditions
......
......@@ -190,7 +190,11 @@ int main(int argc, char *argv[]) {
while ((c = getopt(argc, argv, "acCdDef:FgGhMn:P:sSt:Tv:y:Y:")) != -1)
switch (c) {
case 'a':
#if defined(HAVE_SETAFFINITY) && defined(HAVE_LIBNUMA)
with_aff = 1;
#else
error("Need NUMA support for thread affinity");
#endif
break;
case 'c':
with_cosmology = 1;
......@@ -392,8 +396,12 @@ int main(int argc, char *argv[]) {
parser_read_file(paramFileName, params);
/* Handle any command-line overrides. */
if (nparams > 0)
if (nparams > 0) {
message(
"Overwriting values read from the YAML file with command-line "
"values.");
for (int k = 0; k < nparams; k++) parser_set_param(params, cmdparams[k]);
}
/* And dump the parameters as used. */
// parser_print_params(&params);
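As a hedged illustration of the override loop above (the Section:name:value string format is an assumption made for this example and is not shown in the hunk), a single "-P" option collected in cmdparams[] would be applied as:

  /* Hypothetical example, not part of the commit. */
  parser_set_param(params, "Gravity:epsilon:0.001");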
......@@ -565,6 +573,11 @@ int main(int argc, char *argv[]) {
message("nr of cells at depth %i is %i.", data[0], data[1]);
}
/* Initialise the table of Ewald corrections for the gravity checks */
#ifdef SWIFT_GRAVITY_FORCE_CHECKS
if (periodic) gravity_exact_force_ewald_init(dim[0]);
#endif
/* Initialise the external potential properties */
struct external_potential potential;
if (with_external_gravity)
......
......@@ -64,7 +64,7 @@ nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h
kernel_long_gravity.h vector.h cache.h runner_doiact.h runner_doiact_vec.h runner_doiact_grav.h runner_doiact_fft.h \
runner_doiact_nosort.h units.h intrinsics.h minmax.h kick.h timestep.h drift.h adiabatic_index.h io_properties.h \
dimension.h equation_of_state.h part_type.h periodic.h \
gravity.h gravity_io.h \
gravity.h gravity_io.h gravity_cache.h \
gravity/Default/gravity.h gravity/Default/gravity_iact.h gravity/Default/gravity_io.h \
gravity/Default/gravity_debug.h gravity/Default/gravity_part.h \
sourceterms.h \
......
......@@ -23,9 +23,55 @@
* @brief The default struct alignment in SWIFT.
*/
#define SWIFT_STRUCT_ALIGNMENT 32
/**
* @brief Defines alignment of structures
*/
#define SWIFT_STRUCT_ALIGN __attribute__((aligned(SWIFT_STRUCT_ALIGNMENT)))
/**
* @brief The default cache alignment in SWIFT.
*/
#define SWIFT_CACHE_ALIGNMENT 64
/**
* @brief Defines alignment of caches
*/
#define SWIFT_CACHE_ALIGN __attribute__((aligned(SWIFT_CACHE_ALIGNMENT)))
/**
* @brief Macro to tell the compiler that a given array has the specified
* alignment.
*
* Note that this turns into a no-op but gives information to the compiler.
*
* @param array The array.
* @param alignment The alignment in bytes of the array.
*/
#if defined(__ICC)
#define swift_align_information(array, alignment) \
__assume_aligned(array, alignment);
#elif defined(__GNUC__)
#define swift_align_information(array, alignment) \
array = __builtin_assume_aligned(array, alignment);
#else
#define swift_align_information(array, alignment) ;
#endif
/**
* @brief Macro to tell the compiler that a given number is 0 modulo a given
* size.
*
* Note that this turns into a no-op but gives information to the compiler.
* GCC does not have the equivalent built-in so defaults to nothing.
*
* @param var The variable
* @param size The modulo of interest.
*/
#if defined(__ICC)
#define swift_assume_size(var, size) __assume(var % size == 0);
#else
#define swift_assume_size(var, size) ;
#endif
#endif /* SWIFT_ALIGN_H */
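A minimal, hypothetical sketch (not part of the commit) of how the new alignment macros are meant to be used together; scale_masses, xyz_buffer and the vector length of 8 are invented for the example:

#include "align.h"

/* Hypothetical cache-aligned scratch buffer using the new attribute macro. */
struct xyz_buffer {
  float x[256], y[256], z[256];
} SWIFT_CACHE_ALIGN;

/* Hypothetical hot loop: the hints let the compiler assume aligned
 * accesses and a trip count with no scalar remainder. */
void scale_masses(float *restrict m, int count, const float factor) {
  /* Promise that m points to SWIFT_CACHE_ALIGNMENT-aligned memory... */
  swift_align_information(m, SWIFT_CACHE_ALIGNMENT);
  /* ...and that count is a multiple of the assumed vector length. */
  swift_assume_size(count, 8);
  for (int i = 0; i < count; i++) m[i] *= factor;
}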
......@@ -26,6 +26,7 @@
/* Some standard headers. */
#include <float.h>
#include <stdio.h>
#include <unistd.h>
/* This object's header. */
#include "debug.h"
......@@ -450,3 +451,69 @@ void dumpCellRanks(const char *prefix, struct cell *cells_top, int nr_cells) {
}
#endif /* HAVE_MPI */
/**
* @brief Parse the /proc/self/statm file to get the process memory use
* (in KB). The matching field reported by top(1) is given in parentheses.
*
* @param size total virtual memory (VIRT)
* @param resident resident non-swapped memory (RES)
* @param share shared (mmap'd) memory (SHR)
* @param trs text (exe) resident set (CODE)
* @param lrs library resident set
* @param drs data+stack resident set (DATA)
* @param dt dirty pages (nDRT)
*/
void getProcMemUse(long *size, long *resident, long *share, long *trs,
long *lrs, long *drs, long *dt) {
/* Open the file. */
FILE *file = fopen("/proc/self/statm", "r");
if (file != NULL) {
int nscan = fscanf(file, "%ld %ld %ld %ld %ld %ld %ld", size, resident,
share, trs, lrs, drs, dt);
if (nscan == 7) {
/* Convert pages into bytes. Usually 4096, but could be 512 on some
* systems so take care in conversion to KB. */
long sz = sysconf(_SC_PAGESIZE);
*size *= sz;
*resident *= sz;
*share *= sz;
*trs *= sz;
*lrs *= sz;
*drs *= sz;
*dt *= sz;
*size /= 1024;
*resident /= 1024;
*share /= 1024;
*trs /= 1024;
*lrs /= 1024;
*drs /= 1024;
*dt /= 1024;
} else {
error("Failed to read sufficient fields from /proc/self/statm");
}
fclose(file);
} else {
error("Failed to open /proc/self/statm");
}
}
/**
* @brief Print the current memory use of the process. A la "top".
*/
void printProcMemUse() {
long size;
long resident;
long share;
long trs;
long lrs;
long drs;
long dt;
getProcMemUse(&size, &resident, &share, &trs, &lrs, &drs, &dt);
printf("## VIRT = %ld , RES = %ld , SHR = %ld , CODE = %ld, DATA = %ld\n",
size, resident, share, trs, drs);
fflush(stdout);
}
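A hedged usage sketch (not part of the commit; the intermediate step is a placeholder invented for the example) showing how the new helper can bracket a memory-hungry phase:

  printProcMemUse();        /* "top"-style memory use (KB) before the step */
  do_memory_hungry_step();  /* hypothetical placeholder, e.g. a space rebuild */
  printProcMemUse();        /* memory use after, for comparison */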
......@@ -44,4 +44,7 @@ void dumpMETISGraph(const char *prefix, idx_t nvtxs, idx_t ncon, idx_t *xadj,
void dumpCellRanks(const char *prefix, struct cell *cells_top, int nr_cells);
#endif
void getProcMemUse(long *size, long *resident, long *share, long *trs,
long *lrs, long *drs, long *dt);
void printProcMemUse();
#endif /* SWIFT_DEBUG_H */
......@@ -57,6 +57,7 @@
#include "error.h"
#include "gravity.h"
#include "hydro.h"
#include "map.h"
#include "minmax.h"
#include "parallel_io.h"
#include "part.h"
......@@ -153,7 +154,7 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {
struct scheduler *s = &e->sched;
const int periodic = e->s->periodic;
const int is_hydro = (e->policy & engine_policy_hydro);
const int is_with_hydro = (e->policy & engine_policy_hydro);
const int is_self_gravity = (e->policy & engine_policy_self_gravity);
const int is_with_cooling = (e->policy & engine_policy_cooling);
const int is_with_sourceterms = (e->policy & engine_policy_sourceterms);
......@@ -162,15 +163,19 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {
if (c->super == c) {
/* Add the sort task. */
c->sorts =
scheduler_addtask(s, task_type_sort, task_subtype_none, 0, 0, c, NULL);
if (is_with_hydro) {
c->sorts = scheduler_addtask(s, task_type_sort, task_subtype_none, 0, 0,
c, NULL);
}
/* Local tasks only... */
if (c->nodeID == e->nodeID) {
/* Add the drift task. */
c->drift_part = scheduler_addtask(s, task_type_drift_part,
task_subtype_none, 0, 0, c, NULL);
if (is_with_hydro) {
c->drift_part = scheduler_addtask(s, task_type_drift_part,
task_subtype_none, 0, 0, c, NULL);
}
/* Add the two half kicks */
c->kick1 = scheduler_addtask(s, task_type_kick1, task_subtype_none, 0, 0,
......@@ -207,7 +212,7 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {
}
/* Generate the ghost tasks. */
if (is_hydro) {
if (is_with_hydro) {
c->ghost_in =
scheduler_addtask(s, task_type_ghost, task_subtype_none, 0,
/* implicit = */ 1, c, NULL);
......@@ -215,14 +220,13 @@ void engine_make_hierarchical_tasks(struct engine *e, struct cell *c) {
scheduler_addtask(s, task_type_ghost, task_subtype_none, 0,
/* implicit = */ 1, c, NULL);
engine_add_ghosts(e, c, c->ghost_in, c->ghost_out);
}
#ifdef EXTRA_HYDRO_LOOP
/* Generate the extra ghost task. */
if (is_hydro)
/* Generate the extra ghost task. */
c->extra_ghost = scheduler_addtask(s, task_type_extra_ghost,
task_subtype_none, 0, 0, c, NULL);
#endif
}
/* Cooling task */
if (is_with_cooling) {
......@@ -898,6 +902,16 @@ void engine_repartition(struct engine *e) {
partition_repartition(e->reparttype, e->nodeID, e->nr_nodes, e->s,
e->sched.tasks, e->sched.nr_tasks);
/* Partitioning requires copies of the particles, so we need to reduce the
* memory in use to the minimum, we can free the sorting indices and the
* tasks as these will be regenerated at the next rebuild. */
/* Sorting indices. */
if (e->s->cells_top != NULL) space_free_cells(e->s);
/* Task arrays. */
scheduler_free_tasks(&e->sched);
/* Now comes the tricky part: Exchange particles between all nodes.
This is done in two steps, first allreducing a matrix of
how many particles go from where to where, then re-allocating
......@@ -1731,7 +1745,7 @@ void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements,
/* If the cell is local, build a self-interaction */
scheduler_addtask(sched, task_type_self, task_subtype_grav, 0, 0, ci, NULL);
/* Deal with periodicity dependencies */
/* Deal with periodicity FFT task dependencies */
const int ghost_id = cell_getid(cdim_ghost, i / 4, j / 4, k / 4);
if (ghost_id > n_ghosts) error("Invalid ghost_id");
if (periodic) {
......@@ -1739,6 +1753,10 @@ void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements,
ci->grav_ghost[1] = ghosts[2 * ghost_id + 1];
}
/* Recover the multipole information */
struct gravity_tensors *const multi_i = ci->multipole;
const double CoM_i[3] = {multi_i->CoM[0], multi_i->CoM[1], multi_i->CoM[2]};
/* Loop over every other cell */
for (int ii = 0; ii < cdim[0]; ii++) {
for (int jj = 0; jj < cdim[1]; jj++) {
......@@ -1757,11 +1775,27 @@ void engine_make_self_gravity_tasks_mapper(void *map_data, int num_elements,
/* Is that neighbour local ? */
if (cj->nodeID != nodeID) continue; // MATTHIEU
/* Are the cells to close for a MM interaction ? */
if (!gravity_multipole_accept_rebuild(ci->multipole, cj->multipole,
theta_crit_inv, periodic,
dim)) {
/* Recover the multipole information */
struct gravity_tensors *const multi_j = cj->multipole;
/* Get the distance between the CoMs */
double dx = CoM_i[0] - multi_j->CoM[0];
double dy = CoM_i[1] - multi_j->CoM[1];
double dz = CoM_i[2] - multi_j->CoM[2];
/* Apply BC */
if (periodic) {
dx = nearest(dx, dim[0]);
dy = nearest(dy, dim[1]);
dz = nearest(dz, dim[2]);
}
const double r2 = dx * dx + dy * dy + dz * dz;
/* Are the cells too close for a MM interaction ? */
if (!gravity_multipole_accept_rebuild(multi_i, multi_j,
theta_crit_inv, r2)) {
/* Ok, we need to add a direct pair calculation */
scheduler_addtask(sched, task_type_pair, task_subtype_grav, 0, 0,
ci, cj);
}
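For context, a minimal sketch (with invented names, not the actual gravity_multipole_accept_rebuild() implementation) of the kind of geometric opening-angle test that consumes the squared CoM distance computed above:

/* Hypothetical sketch: accept a multipole-multipole interaction only if
 * the combined cell extents, inflated by 1/theta, fit within the
 * (periodically wrapped) CoM separation. */
static int mac_accept_sketch(double r_max_i, double r_max_j,
                             double theta_crit_inv, double r2) {
  const double size = (r_max_i + r_max_j) * theta_crit_inv;
  return size * size < r2; /* far enough apart: M-M is acceptable */
}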
......@@ -2796,8 +2830,7 @@ void engine_marktasks_mapper(void *map_data, int num_elements,
/* Kick/Drift/init ? */
if (t->type == task_type_kick1 || t->type == task_type_kick2 ||
t->type == task_type_drift_part || t->type == task_type_drift_gpart ||
t->type == task_type_init_grav) {
t->type == task_type_drift_gpart || t->type == task_type_init_grav) {
if (cell_is_active(t->ci, e)) scheduler_activate(s, t);
}
......@@ -4517,8 +4550,12 @@ void engine_init(struct engine *e, struct space *s,
e->runners[k].qid = k * nr_queues / e->nr_threads;
}
#ifdef WITH_VECTORIZATION
/* Allocate particle caches. */
e->runners[k].ci_gravity_cache.count = 0;
e->runners[k].cj_gravity_cache.count = 0;
gravity_cache_init(&e->runners[k].ci_gravity_cache, space_splitsize);
gravity_cache_init(&e->runners[k].cj_gravity_cache, space_splitsize);
#ifdef WITH_VECTORIZATION
e->runners[k].ci_cache.count = 0;
e->runners[k].cj_cache.count = 0;
cache_init(&e->runners[k].ci_cache, CACHE_SIZE);
......@@ -4609,8 +4646,12 @@ void engine_compute_next_snapshot_time(struct engine *e) {
void engine_clean(struct engine *e) {
#ifdef WITH_VECTORIZATION
for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].ci_cache);
for (int i = 0; i < e->nr_threads; ++i) cache_clean(&e->runners[i].cj_cache);
for (int i = 0; i < e->nr_threads; ++i) {
cache_clean(&e->runners[i].ci_cache);
cache_clean(&e->runners[i].cj_cache);
gravity_cache_clean(&e->runners[i].ci_gravity_cache);
gravity_cache_clean(&e->runners[i].cj_gravity_cache);
}
#endif
free(e->runners);
free(e->snapshotUnits);
......
......@@ -21,9 +21,15 @@
#include "../config.h"
/* Some standard headers. */
#include <float.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#ifdef HAVE_HDF5
#include <hdf5.h>
#endif
/* This object's header. */
#include "gravity.h"
......@@ -39,6 +45,256 @@ struct exact_force_data {
double const_G;
};
#ifdef SWIFT_GRAVITY_FORCE_CHECKS
/* Size of the Ewald table */
#define Newald 64
/* Components of the Ewald correction */
static float fewald_x[Newald + 1][Newald + 1][Newald + 1];
static float fewald_y[Newald + 1][Newald + 1][Newald + 1];
static float fewald_z[Newald + 1][Newald + 1][Newald + 1];
/* Factor used to normalize the access to the Ewald table */
float ewald_fac;
#endif
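Editorial note (not part of the commit): in units of the box size L = 1, the correction tabulated by the function below can be written, following the nested loops further down, as

\[
\mathbf{f}(\mathbf{r}) = \frac{\mathbf{r}}{|\mathbf{r}|^{3}}
 - \sum_{\mathbf{n}} \frac{\mathbf{r}-\mathbf{n}}{|\mathbf{r}-\mathbf{n}|^{3}}
   \left[\operatorname{erfc}\!\left(\alpha |\mathbf{r}-\mathbf{n}|\right)
   + \frac{2\alpha}{\sqrt{\pi}}\,|\mathbf{r}-\mathbf{n}|\,
     e^{-\alpha^{2}|\mathbf{r}-\mathbf{n}|^{2}}\right]
 - \sum_{\mathbf{h}\neq 0} \frac{2\,\mathbf{h}}{|\mathbf{h}|^{2}}\,
     e^{-\pi^{2}|\mathbf{h}|^{2}/\alpha^{2}}\,
     \sin\!\left(2\pi\,\mathbf{h}\cdot\mathbf{r}\right),
\]

with alpha = 2 and both sums running over the integer cubes [-4, 4]^3 used in the loops; the table entries are rescaled by 1/L^2 at the end to restore physical units.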
/**
* @brief Allocates the memory and computes one octant of the
* Ewald correction table.
*
* We follow Hernquist, Bouchet & Suto, 1991, ApJS, Volume 75, p.231-240,
* equations (2.14a) and (2.14b) with alpha = 2. We consider all terms with
* |x - nL| < 4L and |h|^2 < 16.
*
* @param boxSize The side-length (L) of the volume.
*/
void gravity_exact_force_ewald_init(double boxSize) {
#ifdef SWIFT_GRAVITY_FORCE_CHECKS
const ticks tic = getticks();
message("Computing Ewald correction table...");
/* Level of correction (Hernquist et al. 1991)*/
const float alpha = 2.f;
/* some useful constants */
const float alpha2 = alpha * alpha;
const float factor_exp1 = 2.f * alpha / sqrt(M_PI);
const float factor_exp2 = -M_PI * M_PI / alpha2;
const float factor_sin = 2.f * M_PI;
const float boxSize_inv2 = 1.f / (boxSize * boxSize);
/* Ewald factor to access the table */
ewald_fac = (double)(2 * Newald) / boxSize;
/* Zero everything */
bzero(fewald_x, (Newald + 1) * (Newald + 1) * (Newald + 1) * sizeof(float));
bzero(fewald_y, (Newald + 1) * (Newald + 1) * (Newald + 1) * sizeof(float));
bzero(fewald_z, (Newald + 1) * (Newald + 1) * (Newald + 1) * sizeof(float));
/* Compute the values in one of the octants */
for (int i = 0; i <= Newald; ++i) {
for (int j = 0; j <= Newald; ++j) {
for (int k = 0; k <= Newald; ++k) {
if (i == 0 && j == 0 && k == 0) continue;
/* Distance vector */
const float r_x = 0.5f * ((float)i) / Newald;
const float r_y = 0.5f * ((float)j) / Newald;
const float r_z = 0.5f * ((float)k) / Newald;
/* Norm of distance vector */
const float r2 = r_x * r_x + r_y * r_y + r_z * r_z;
const float r_inv = 1.f / sqrtf(r2);
const float r_inv3 = r_inv * r_inv * r_inv;
/* Normal gravity potential term */
float f_x = r_x * r_inv3;
float f_y = r_y * r_inv3;
float f_z = r_z * r_inv3;
for (int n_i = -4; n_i <= 4; ++n_i) {
for (int n_j = -4; n_j <= 4; ++n_j) {
for (int n_k = -4; n_k <= 4; ++n_k) {
const float d_x = r_x - n_i;
const float d_y = r_y - n_j;
const float d_z = r_z - n_k;
/* Discretised distance */
const float r_tilde2 = d_x * d_x + d_y * d_y + d_z * d_z;
const float r_tilde_inv = 1.f / sqrtf(r_tilde2);
const float r_tilde = r_tilde_inv * r_tilde2;
const float r_tilde_inv3 =
r_tilde_inv * r_tilde_inv * r_tilde_inv;
const float val =
erfcf(alpha * r_tilde) +
factor_exp1 * r_tilde * expf(-alpha2 * r_tilde2);
/* First correction term */
const float f = val * r_tilde_inv3;
f_x -= f * d_x;
f_y -= f * d_y;
f_z -= f * d_z;
}
}
}
for (int h_i = -4; h_i <= 4; ++h_i) {
for (int h_j = -4; h_j <= 4; ++h_j) {
for (int h_k = -4; h_k <= 4; ++h_k) {
const float h2 = h_i * h_i + h_j * h_j + h_k * h_k;
const float h2_inv = 1.f / (h2 + FLT_MIN);
const float h_dot_x = h_i * r_x + h_j * r_y + h_k * r_z;
const float val = 2.f * h2_inv * expf(h2 * factor_exp2) *
sinf(factor_sin * h_dot_x);
/* Second correction term */
f_x -= val * h_i;
f_y -= val * h_j;
f_z -= val * h_k;
}
}
}
/* Save back to memory */
fewald_x[i][j][k] = f_x;
fewald_y[i][j][k] = f_y;
fewald_z[i][j][k] = f_z;
}
}
}
/* Dump the Ewald table to a file */
#ifdef HAVE_HDF5
hid_t h_file =
H5Fcreate("Ewald.hdf5", H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
if (h_file < 0) error("Error while opening file for Ewald dump.");
/* Create dataspace */
hsize_t dim[3] = {Newald + 1, Newald + 1, Newald + 1};
hid_t h_space = H5Screate_simple(3, dim, NULL);
hid_t h_data;
h_data = H5Dcreate(h_file, "Ewald_x", H5T_NATIVE_FLOAT, h_space, H5P_DEFAULT,
H5P_DEFAULT, H5P_DEFAULT);
H5Dwrite(h_data, H5T_NATIVE_FLOAT, h_space, H5S_ALL, H5P_DEFAULT,
&(fewald_x[0][0][0]));
H5Dclose(h_data);
h_data = H5Dcreate(h_file, "Ewald_y", H5T_NATIVE_FLOAT, h_space, H5P_DEFAULT,
H5P_DEFAULT, H5P_DEFAULT);
H5Dwrite(h_data, H5T_NATIVE_FLOAT, h_space, H5S_ALL, H5P_DEFAULT,
&(fewald_y[0][0][0]));
H5Dclose(h_data);
h_data = H5Dcreate(h_file, "Ewald_z", H5T_NATIVE_FLOAT, h_space, H5P_DEFAULT,
H5P_DEFAULT, H5P_DEFAULT);
H5Dwrite(h_data, H5T_NATIVE_FLOAT, h_space, H5S_ALL, H5P_DEFAULT,
&(fewald_z[0][0][0]));
H5Dclose(h_data);
H5Sclose(h_space);
H5Fclose(h_file);
#endif
/* Apply the box-size correction */
for (int i = 0; i <= Newald; ++i) {
for (int j = 0; j <= Newald; ++j) {
for (int k = 0; k <= Newald; ++k) {
fewald_x[i][j][k] *= boxSize_inv2;
fewald_y[i][j][k] *= boxSize_inv2;
fewald_z[i][j][k] *= boxSize_inv2;
}
}
}
/* Say goodbye */
message("Ewald correction table computed (took %.3f %s). ",
clocks_from_ticks(getticks() - tic), clocks_getunit());
#else
error("Gravity checking function called without the corresponding flag.");