Commit cac5c3f5 authored by James Willis's avatar James Willis

Merge branch 'master' into doself2-vectorisation

Conflicts:
	src/runner_doiact_vec.c
parents 67df6740 c9d46f8e
@@ -28,7 +28,7 @@ SPH:
resolution_eta: 1.2348 # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
delta_neighbours: 0.1 # The tolerance for the targeted number of neighbours.
CFL_condition: 0.1 # Courant-Friedrichs-Lewy condition for time integration.
# Parameters related to the initial conditions
InitialConditions:
file_name: ./multiTypes.hdf5 # The file to read
@@ -8,12 +8,13 @@ InternalUnitSystem:
# Parameters for the task scheduling
Scheduler:
nr_queues: 0 # (Optional) The number of task queues to use. Use 0 to let the system decide.
cell_max_size: 8000000 # (Optional) Maximal number of interactions per task if we force the split (this is the default value).
cell_sub_size_pair: 256000000 # (Optional) Maximal number of interactions per sub-pair task (this is the default value).
cell_sub_size_self: 32000 # (Optional) Maximal number of interactions per sub-self task (this is the default value).
cell_split_size: 400 # (Optional) Maximal number of particles per cell (this is the default value).
max_top_level_cells: 12 # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value).
tasks_per_cell: 0 # (Optional) The average number of tasks per cell. If not large enough the simulation will fail (means guess...).
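As a rough illustration of the new tasks_per_cell knob (hypothetical numbers; the real sizing is done by engine_estimate_nr_tasks further down in this commit), a non-zero value simply multiplies the total cell count when sizing the task array:
/* Hypothetical run: 12^3 = 1728 top-level cells, ~20000 cells in total
   after splitting at cell_split_size = 400 particles per cell. */
const int tot_cells = 20000;
const int tasks_per_cell = 9;                    /* user-set Scheduler:tasks_per_cell */
const int nr_tasks = tot_cells * tasks_per_cell; /* 180000 task slots reserved */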
# Parameters governing the time integration (Set dt_min and dt_max to the same value for a fixed time-step run.)
TimeIntegration:
@@ -108,6 +108,7 @@ case $host_cpu in
*3?6[[ae]]?:*:*:*) ax_gcc_arch="ivybridge core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
*3?6[[cf]]?:*:*:*|*4?6[[56]]?:*:*:*) ax_gcc_arch="haswell core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
*3?6d?:*:*:*|*4?6f?:*:*:*) ax_gcc_arch="broadwell core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
*9?6[[de]]?:*:*:*) ax_gcc_arch="kabylake core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
*1?6c?:*:*:*|*2?6[[67]]?:*:*:*|*3?6[[56]]?:*:*:*) ax_gcc_arch="bonnell atom core2 pentium-m pentium3 pentiumpro" ;;
*3?67?:*:*:*|*[[45]]?6[[ad]]?:*:*:*) ax_gcc_arch="silvermont atom core2 pentium-m pentium3 pentiumpro" ;;
*000?f[[012]]?:*:*:*|?f[[012]]?:*:*:*|f[[012]]?:*:*:*) ax_gcc_arch="pentium4 pentiumpro" ;;
@@ -32,6 +32,7 @@
#include "debug.h"
/* Local includes. */
#include "active.h"
#include "cell.h"
#include "engine.h"
#include "hydro.h"
@@ -269,6 +270,79 @@ int checkCellhdxmax(const struct cell *c, int *depth) {
return result;
}
/**
* @brief Map function for dumping cells. In MPI mode, only locally active
* cells are dumped.
*/
static void dumpCells_map(struct cell *c, void *data) {
uintptr_t *ldata = (uintptr_t *)data;
FILE *file = (FILE *)ldata[0];
struct engine *e = (struct engine *)ldata[1];
float ntasks = c->nr_tasks;
#if SWIFT_DEBUG_CHECKS
/* The c->nr_tasks field does not include all the tasks. So let's check this
* the hard way. Note pairs share the task 50/50 with the other cell. */
ntasks = 0.0f;
struct task *tasks = e->sched.tasks;
int nr_tasks = e->sched.nr_tasks;
for (int k = 0; k < nr_tasks; k++) {
if (tasks[k].cj == NULL) {
if (c == tasks[k].ci) {
ntasks = ntasks + 1.0f;
}
} else {
if (c == tasks[k].ci || c == tasks[k].cj) {
ntasks = ntasks + 0.5f;
}
}
}
#endif
/* Only locally active cells are dumped. */
if (c->count > 0 || c->gcount > 0 || c->scount > 0)
fprintf(file,
" %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %6d %6d %6d %6d "
"%6.1f %20lld %6d %6d %6d %6d\n",
c->loc[0], c->loc[1], c->loc[2], c->width[0], c->width[1],
c->width[2], c->count, c->gcount, c->scount, c->depth, ntasks,
c->ti_end_min, get_time_bin(c->ti_end_min), (c->super == c),
cell_is_active(c, e), c->nodeID);
}
/**
* @brief Dump the location, depth, task counts, timebins and active state
* of all cells to a simple text file.
*
* @param prefix base output filename
* @param s the space holding the cells to dump.
*/
void dumpCells(const char *prefix, struct space *s) {
FILE *file = NULL;
/* Name of output file. */
static int nseq = 0;
char fname[200];
int uniq = atomic_inc(&nseq);
sprintf(fname, "%s_%03d.dat", prefix, uniq);
file = fopen(fname, "w");
/* Header. */
fprintf(file,
"# %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s "
"%20s %6s %6s %6s\n",
"x", "y", "z", "xw", "yw", "zw", "count", "gcount", "scount", "depth",
"tasks", "ti_end_min", "timebin", "issuper", "active", "rank");
uintptr_t data[2];
data[0] = (size_t)file;
data[1] = (size_t)s->e;
space_map_cells_pre(s, 1, dumpCells_map, &data);
fclose(file);
}
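A usage sketch (hypothetical call site, e.g. dropped into an engine step while debugging; the prefix string is made up):
/* Writes one file per call, named <prefix>_NNN.dat, with a line per
   non-empty cell: location, width, counts, depth, tasks, time-bin, etc. */
dumpCells("cells_dump", e->s);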
#ifdef HAVE_METIS
/**
@@ -33,6 +33,7 @@ void printgParticle_single(struct gpart *gp);
int checkSpacehmax(struct space *s);
int checkCellhdxmax(const struct cell *c, int *depth);
void dumpCells(const char *prefix, struct space *s);
#ifdef HAVE_METIS
#include "metis.h"
@@ -2507,7 +2507,7 @@ void engine_maketasks(struct engine *e) {
const ticks tic = getticks();
/* Re-set the scheduler. */
scheduler_reset(sched, s->tot_cells * engine_maxtaskspercell);
scheduler_reset(sched, engine_estimate_nr_tasks(e));
/* Construct the first hydro loop over neighbours */
if (e->policy & engine_policy_hydro) {
@@ -2902,7 +2902,8 @@ void engine_print_task_counts(struct engine *e) {
else
counts[(int)tasks[k].type] += 1;
}
message("Total = %d", nr_tasks);
message("Total = %d (per cell = %d)", nr_tasks,
(int)ceil((double)nr_tasks / e->s->tot_cells));
#ifdef WITH_MPI
printf("[%04i] %s engine_print_task_counts: task counts are [ %s=%i",
e->nodeID, clocks_get_timesincestart(), taskID_names[0], counts[0]);
@@ -2923,6 +2924,117 @@
clocks_getunit());
}
/**
* @brief If necessary, estimate the number of tasks required given
* the current tasks in use and the number of cells.
*
* If e->tasks_per_cell is set greater than 0 then that value is used
* as the estimate of the average number of tasks per cell,
* otherwise we attempt an estimate.
*
* @param e the #engine
*
* @return the estimated total number of tasks
*/
int engine_estimate_nr_tasks(struct engine *e) {
int tasks_per_cell = e->tasks_per_cell;
if (tasks_per_cell > 0) return e->s->tot_cells * tasks_per_cell;
/* Our guess differs depending on the types of tasks we are using, but we
* basically use a formula <n1>*ntopcells + <n2>*(totcells - ntopcells),
* where <n1> is the expected maximum number of tasks per top-level/super
* cell and <n2> the expected maximum for all other cells. These should
* give a safe upper limit.
*/
int n1 = 0;
int n2 = 0;
if (e->policy & engine_policy_hydro) {
n1 += 36;
n2 += 2;
#ifdef WITH_MPI
n1 += 6;
#endif
#ifdef EXTRA_HYDRO_LOOP
n1 += 15;
#ifdef WITH_MPI
n1 += 2;
#endif
#endif
}
if (e->policy & engine_policy_self_gravity) {
n1 += 24;
n2 += 1;
#ifdef WITH_MPI
n2 += 2;
#endif
}
if (e->policy & engine_policy_external_gravity) {
n1 += 2;
}
if (e->policy & engine_policy_cosmology) {
n1 += 2;
}
if (e->policy & engine_policy_cooling) {
n1 += 2;
}
if (e->policy & engine_policy_sourceterms) {
n1 += 2;
}
if (e->policy & engine_policy_stars) {
n1 += 2;
}
#ifdef WITH_MPI
/* We need fewer tasks per rank when using MPI, but we could have
* imbalances, so we need to work from the locally active cells, not just
* some equipartition amongst the nodes. We don't want to recurse the whole
* cell tree, so we just guess the maximum possible total number of cells. */
int ntop = 0;
int ncells = 0;
for (int k = 0; k < e->s->nr_cells; k++) {
struct cell *c = &e->s->cells_top[k];
/* Any cells with particles will have tasks (local & foreign). */
int nparts = c->count + c->gcount + c->scount;
if (nparts > 0) {
ntop++;
ncells++;
/* Count cell depth until we get below the parts per cell threshold. */
int depth = 0;
while (nparts > space_splitsize) {
depth++;
nparts /= 8;
ncells += (1 << (depth * 3));
}
}
}
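/* (Worked example of the loop above: a top-level cell holding 3200
 * particles with space_splitsize = 400 goes through the while loop once,
 * giving depth = 1 and adding 8 sub-cells, i.e. 9 cells counted in total
 * for that tree.) */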
/* If no local cells, we are probably still initialising, so just keep
* room for the top-level. */
if (ncells == 0) {
ntop = e->s->nr_cells;
ncells = ntop;
}
#else
int ntop = e->s->nr_cells;
int ncells = e->s->tot_cells;
#endif
double ntasks = n1 * ntop + n2 * (ncells - ntop);
tasks_per_cell = ceil(ntasks / ncells);
if (tasks_per_cell < 1.0) tasks_per_cell = 1.0;
if (e->verbose)
message("tasks per cell estimated as: %d, maximum tasks: %d",
tasks_per_cell, ncells * tasks_per_cell);
return ncells * tasks_per_cell;
}
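A short worked example of the estimate above, assuming a hydro-only, non-MPI run with hypothetical cell counts:
const int n1 = 36, n2 = 2;   /* hydro only, no MPI, no extra hydro loop */
const int ntop = 1728;       /* e.g. 12^3 top-level cells */
const int ncells = 20000;    /* total number of cells (hypothetical) */
const double ntasks = n1 * ntop + n2 * (ncells - ntop); /* 98752 */
const int tasks_per_cell = ceil(ntasks / ncells);       /* ceil(4.94) = 5 */
/* The scheduler is then sized for ncells * tasks_per_cell = 100000 tasks. */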
/**
* @brief Rebuild the space and tasks.
*
@@ -4503,9 +4615,14 @@ void engine_init(struct engine *e, struct space *s,
pthread_barrier_init(&e->run_barrier, NULL, e->nr_threads + 1) != 0)
error("Failed to initialize barrier.");
/* Init the scheduler with enough tasks for the initial sorting tasks. */
const int nr_tasks = 2 * s->tot_cells + 2 * e->nr_threads;
scheduler_init(&e->sched, e->s, nr_tasks, nr_queues,
/* Expected average number of tasks per cell. If set to zero we use a heuristic
* guess based on the number of cells and how many tasks per cell we expect.
*/
e->tasks_per_cell =
parser_get_opt_param_int(params, "Scheduler:tasks_per_cell", 0);
/* Init the scheduler. */
scheduler_init(&e->sched, e->s, engine_estimate_nr_tasks(e), nr_queues,
(policy & scheduler_flag_steal), e->nodeID, &e->threadpool);
/* Allocate and init the threads. */
@@ -75,7 +75,6 @@ enum engine_policy {
extern const char *engine_policy_names[];
#define engine_queue_scale 1.2
#define engine_maxtaskspercell 96
#define engine_maxproxies 64
#define engine_tasksreweight 1
#define engine_parts_size_grow 1.05
@@ -222,6 +221,10 @@ struct engine {
struct link *links;
int nr_links, size_links;
/* Average number of tasks per cell. Used to estimate the sizes
* of the various task arrays. */
int tasks_per_cell;
/* Are we talkative ? */
int verbose;
@@ -292,5 +295,6 @@ int engine_is_done(struct engine *e);
void engine_pin();
void engine_unpin();
void engine_clean(struct engine *e);
int engine_estimate_nr_tasks(struct engine *e);
#endif /* SWIFT_ENGINE_H */
@@ -131,7 +131,7 @@ static INLINE void gravity_cache_init(struct gravity_cache *c, int count) {
*/
__attribute__((always_inline)) INLINE void gravity_cache_populate(
struct gravity_cache *c, const struct gpart *restrict gparts, int gcount,
int gcount_padded, const double shift[3]) {
int gcount_padded, const double shift[3], const struct cell *cell) {
/* Make the compiler understand we are in happy vectorization land */
float *restrict x = c->x;
@@ -161,9 +161,9 @@ __attribute__((always_inline)) INLINE void gravity_cache_populate(
/* Pad the caches */
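/* (Presumably the padded entries are pushed well outside the cell so that,
 * even with zero mass, they can never coincide with a real particle and
 * produce a zero separation in the vectorised interaction loop.) */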
for (int i = gcount; i < gcount_padded; ++i) {
x[i] = 0.f;
y[i] = 0.f;
z[i] = 0.f;
x[i] = -3.f * cell->width[0];
y[i] = -3.f * cell->width[0];
z[i] = -3.f * cell->width[0];
epsilon[i] = 0.f;
m[i] = 0.f;
}
@@ -1749,7 +1749,8 @@ void *runner_main(void *data) {
struct runner *r = (struct runner *)data;
struct engine *e = r->e;
struct scheduler *sched = &e->sched;
unsigned int seed = r->id;
pthread_setspecific(sched->local_seed_pointer, &seed);
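/* (The per-thread seed is presumably consumed elsewhere, e.g. via rand_r(),
 * to give each runner its own random stream; the consumer is not part of
 * this hunk.) */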
/* Main loop. */
while (1) {
@@ -1039,7 +1039,7 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid,
const float hj = pj->h;
const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max + rshift;
if (dj - rshift > di_max) continue;
double pjx[3];
for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
const float hjg2 = hj * hj * kernel_gamma2;
@@ -1451,7 +1451,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
const float hj = pj->h;
const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max + rshift;
if (dj - rshift > di_max) continue;
double pjx[3];
for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
const float hjg2 = hj * hj * kernel_gamma2;
@@ -194,9 +194,9 @@ void runner_dopair_grav_pp_full(struct runner *r, struct cell *ci,
/* Fill the caches */
gravity_cache_populate(ci_cache, gparts_i, gcount_i, gcount_padded_i,
loc_mean);
loc_mean, ci);
gravity_cache_populate(cj_cache, gparts_j, gcount_j, gcount_padded_j,
loc_mean);
loc_mean, cj);
/* Ok... Here we go ! */
@@ -542,9 +542,9 @@ void runner_dopair_grav_pp_truncated(struct runner *r, struct cell *ci,
/* Fill the caches */
gravity_cache_populate(ci_cache, gparts_i, gcount_i, gcount_padded_i,
loc_mean);
loc_mean, ci);
gravity_cache_populate(cj_cache, gparts_j, gcount_j, gcount_padded_j,
loc_mean);
loc_mean, cj);
/* Ok... Here we go ! */
@@ -941,7 +941,7 @@ void runner_doself_grav_pp_full(struct runner *r, struct cell *c) {
/* Compute the padded counts */
const int gcount_padded = gcount - (gcount % VEC_SIZE) + VEC_SIZE;
gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc);
gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc, c);
/* Ok... Here we go ! */
@@ -1155,7 +1155,7 @@ void runner_doself_grav_pp_truncated(struct runner *r, struct cell *c) {
/* Compute the padded counts */
const int gcount_padded = gcount - (gcount % VEC_SIZE) + VEC_SIZE;
gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc);
gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc, c);
/* Ok... Here we go ! */
@@ -772,7 +772,11 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type,
const int ind = atomic_inc(&s->tasks_next);
/* Overflow? */
if (ind >= s->size) error("Task list overflow.");
if (ind >= s->size)
error(
"Task list overflow (%d). Need to increase "
"Scheduler:tasks_per_cell.",
ind);
/* Get a pointer to the new task. */
struct task *t = &s->tasks[ind];
@@ -1377,7 +1381,8 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
if (qid >= s->nr_queues) error("Bad computed qid.");
/* If no previous owner, pick a random queue. */
if (qid < 0) qid = rand() % s->nr_queues;
/* Note that getticks() is random enough */
if (qid < 0) qid = getticks() % s->nr_queues;
/* Increase the waiting counter. */
atomic_inc(&s->waiting);
@@ -1609,6 +1614,7 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
s->size = 0;
s->tasks = NULL;
s->tasks_ind = NULL;
pthread_key_create(&s->local_seed_pointer, NULL);
scheduler_reset(s, nr_tasks);
}
@@ -102,6 +102,9 @@ struct scheduler {
/* The node we are working on. */
int nodeID;
/* 'Pointer' to the seed for the random number generator */
pthread_key_t local_seed_pointer;
};
/* Inlined functions (for speed). */