Commit 5451b571 authored by Matthieu Schaller

Merge branch 'master' into gravity_speedup

parents 9dad548c be949fab
@@ -28,7 +28,7 @@ SPH:
  resolution_eta:    1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
  delta_neighbours:  0.1      # The tolerance for the targeted number of neighbours.
  CFL_condition:     0.1      # Courant-Friedrichs-Lewy condition for time integration.

# Parameters related to the initial conditions
InitialConditions:
  file_name:  ./multiTypes.hdf5   # The file to read
...
@@ -8,12 +8,13 @@ InternalUnitSystem:

# Parameters for the task scheduling
Scheduler:
  nr_queues:           0         # (Optional) The number of task queues to use. Use 0 to let the system decide.
  cell_max_size:       8000000   # (Optional) Maximal number of interactions per task if we force the split (this is the default value).
  cell_sub_size_pair:  256000000 # (Optional) Maximal number of interactions per sub-pair task (this is the default value).
  cell_sub_size_self:  32000     # (Optional) Maximal number of interactions per sub-self task (this is the default value).
  cell_split_size:     400       # (Optional) Maximal number of particles per cell (this is the default value).
  max_top_level_cells: 12        # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value).
  tasks_per_cell:      0         # (Optional) The average number of tasks per cell. If not large enough the simulation will fail (0 means guess).
# Parameters governing the time integration (Set dt_min and dt_max to the same value for a fixed time-step run.)
TimeIntegration:
...
@@ -108,6 +108,7 @@ case $host_cpu in
      *3?6[[ae]]?:*:*:*) ax_gcc_arch="ivybridge core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
      *3?6[[cf]]?:*:*:*|*4?6[[56]]?:*:*:*) ax_gcc_arch="haswell core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
      *3?6d?:*:*:*|*4?6f?:*:*:*) ax_gcc_arch="broadwell core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
*9?6[[de]]?:*:*:*) ax_gcc_arch="kabylake core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
      *1?6c?:*:*:*|*2?6[[67]]?:*:*:*|*3?6[[56]]?:*:*:*) ax_gcc_arch="bonnell atom core2 pentium-m pentium3 pentiumpro" ;;
      *3?67?:*:*:*|*[[45]]?6[[ad]]?:*:*:*) ax_gcc_arch="silvermont atom core2 pentium-m pentium3 pentiumpro" ;;
      *000?f[[012]]?:*:*:*|?f[[012]]?:*:*:*|f[[012]]?:*:*:*) ax_gcc_arch="pentium4 pentiumpro" ;;
...
@@ -32,6 +32,7 @@
#include "debug.h"

/* Local includes. */
#include "active.h"
#include "cell.h" #include "cell.h"
#include "engine.h" #include "engine.h"
#include "hydro.h" #include "hydro.h"
...@@ -269,6 +270,79 @@ int checkCellhdxmax(const struct cell *c, int *depth) { ...@@ -269,6 +270,79 @@ int checkCellhdxmax(const struct cell *c, int *depth) {
return result; return result;
} }
/**
 * @brief Map function for dumping cells. In MPI mode only locally
 * active cells are dumped.
 */
static void dumpCells_map(struct cell *c, void *data) {
uintptr_t *ldata = (uintptr_t *)data;
FILE *file = (FILE *)ldata[0];
struct engine *e = (struct engine *)ldata[1];
float ntasks = c->nr_tasks;
#ifdef SWIFT_DEBUG_CHECKS
/* The c->nr_tasks field does not include all the tasks. So let's check this
* the hard way. Note pairs share the task 50/50 with the other cell. */
ntasks = 0.0f;
struct task *tasks = e->sched.tasks;
int nr_tasks = e->sched.nr_tasks;
for (int k = 0; k < nr_tasks; k++) {
if (tasks[k].cj == NULL) {
if (c == tasks[k].ci) {
ntasks = ntasks + 1.0f;
}
} else {
if (c == tasks[k].ci || c == tasks[k].cj) {
ntasks = ntasks + 0.5f;
}
}
}
#endif
  /* Only cells with something in them are dumped. */
if (c->count > 0 || c->gcount > 0 || c->scount > 0)
fprintf(file,
" %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %6d %6d %6d %6d "
"%6.1f %20lld %6d %6d %6d %6d\n",
c->loc[0], c->loc[1], c->loc[2], c->width[0], c->width[1],
c->width[2], c->count, c->gcount, c->scount, c->depth, ntasks,
c->ti_end_min, get_time_bin(c->ti_end_min), (c->super == c),
cell_is_active(c, e), c->nodeID);
}
/**
 * @brief Dump the location, depth, task counts, timebins and active state
 * of all cells to a simple text file.
 *
 * @param prefix base name of the output file
 * @param s the space holding the cells to dump.
 */
void dumpCells(const char *prefix, struct space *s) {
FILE *file = NULL;
/* Name of output file. */
static int nseq = 0;
char fname[200];
int uniq = atomic_inc(&nseq);
sprintf(fname, "%s_%03d.dat", prefix, uniq);
  file = fopen(fname, "w");
  if (file == NULL) error("Could not create file '%s'.", fname);
/* Header. */
fprintf(file,
"# %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s "
"%20s %6s %6s %6s\n",
"x", "y", "z", "xw", "yw", "zw", "count", "gcount", "scount", "depth",
"tasks", "ti_end_min", "timebin", "issuper", "active", "rank");
  uintptr_t data[2];
  data[0] = (uintptr_t)file;
  data[1] = (uintptr_t)s->e;
space_map_cells_pre(s, 1, dumpCells_map, &data);
fclose(file);
}
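For reference, a minimal sketch of a call site for the new dumper; this hook is hypothetical and not part of this commit, and only assumes access to an engine e:

/* Hypothetical debugging hook: dump all non-empty cells to a text file.
 * Successive calls write cells_000.dat, cells_001.dat, ... thanks to the
 * static counter inside dumpCells(). */
dumpCells("cells", e->s);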
#ifdef HAVE_METIS

/**
...
@@ -33,6 +33,7 @@ void printgParticle_single(struct gpart *gp);
int checkSpacehmax(struct space *s);
int checkCellhdxmax(const struct cell *c, int *depth);
void dumpCells(const char *prefix, struct space *s);
#ifdef HAVE_METIS
#include "metis.h"
...
@@ -2514,7 +2514,7 @@ void engine_maketasks(struct engine *e) {
  const ticks tic = getticks();

  /* Re-set the scheduler. */
-  scheduler_reset(sched, s->tot_cells * engine_maxtaskspercell);
+  scheduler_reset(sched, engine_estimate_nr_tasks(e));

  /* Construct the first hydro loop over neighbours */
  if (e->policy & engine_policy_hydro) {
@@ -2943,7 +2943,8 @@ void engine_print_task_counts(struct engine *e) {
    else
      counts[(int)tasks[k].type] += 1;
  }
message("Total = %d", nr_tasks); message("Total = %d (per cell = %d)", nr_tasks,
(int)ceil((double)nr_tasks / e->s->tot_cells));
#ifdef WITH_MPI
  printf("[%04i] %s engine_print_task_counts: task counts are [ %s=%i",
         e->nodeID, clocks_get_timesincestart(), taskID_names[0], counts[0]);
...
@@ -2964,6 +2965,117 @@ void engine_print_task_counts(struct engine *e) {
         clocks_getunit());
}
/**
 * @brief If necessary, estimate the number of tasks required given
 * the current types of tasks in use and the number of cells.
*
* If e->tasks_per_cell is set greater than 0 then that value is used
* as the estimate of the average number of tasks per cell,
* otherwise we attempt an estimate.
*
* @param e the #engine
*
* @return the estimated total number of tasks
*/
int engine_estimate_nr_tasks(struct engine *e) {
int tasks_per_cell = e->tasks_per_cell;
if (tasks_per_cell > 0) return e->s->tot_cells * tasks_per_cell;
  /* Our guess differs depending on the types of tasks we are using, but we
   * basically use a formula <n1>*ntopcells + <n2>*(totcells - ntopcells),
   * where <n1> is the expected maximum number of tasks per top-level/super
   * cell and <n2> the expected maximum for all other cells. These should
   * give a safe upper limit. */
int n1 = 0;
int n2 = 0;
if (e->policy & engine_policy_hydro) {
n1 += 36;
n2 += 2;
#ifdef WITH_MPI
n1 += 6;
#endif
#ifdef EXTRA_HYDRO_LOOP
n1 += 15;
#ifdef WITH_MPI
n1 += 2;
#endif
#endif
}
if (e->policy & engine_policy_self_gravity) {
n1 += 24;
n2 += 1;
#ifdef WITH_MPI
n2 += 2;
#endif
}
if (e->policy & engine_policy_external_gravity) {
n1 += 2;
}
if (e->policy & engine_policy_cosmology) {
n1 += 2;
}
if (e->policy & engine_policy_cooling) {
n1 += 2;
}
if (e->policy & engine_policy_sourceterms) {
n1 += 2;
}
if (e->policy & engine_policy_stars) {
n1 += 2;
}
#ifdef WITH_MPI
/* We need fewer tasks per rank when using MPI, but we could have
* imbalances, so we need to work using the locally active cells, not just
* some equipartition amongst the nodes. Don't want to recurse the whole
* cell tree, so just make a guess of the maximum possible total cells. */
int ntop = 0;
int ncells = 0;
for (int k = 0; k < e->s->nr_cells; k++) {
struct cell *c = &e->s->cells_top[k];
/* Any cells with particles will have tasks (local & foreign). */
int nparts = c->count + c->gcount + c->scount;
if (nparts > 0) {
ntop++;
ncells++;
/* Count cell depth until we get below the parts per cell threshold. */
int depth = 0;
while (nparts > space_splitsize) {
depth++;
nparts /= 8;
ncells += (1 << (depth * 3));
}
}
}
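  /* Example of the count above, with illustrative numbers only: for
   * space_splitsize = 400, a top-level cell holding 10000 particles
   * passes the while loop twice (10000 -> 1250 -> 156) and contributes
   * 1 + 8 + 64 = 73 cells to ncells. */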
/* If no local cells, we are probably still initialising, so just keep
* room for the top-level. */
if (ncells == 0) {
ntop = e->s->nr_cells;
ncells = ntop;
}
#else
int ntop = e->s->nr_cells;
int ncells = e->s->tot_cells;
#endif
double ntasks = n1 * ntop + n2 * (ncells - ntop);
  tasks_per_cell = ceil(ntasks / ncells);
  if (tasks_per_cell < 1) tasks_per_cell = 1;
if (e->verbose)
message("tasks per cell estimated as: %d, maximum tasks: %d",
tasks_per_cell, ncells * tasks_per_cell);
return ncells * tasks_per_cell;
}
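To make the heuristic concrete, a worked example with assumed cell counts (the numbers are illustrative, not taken from this commit):

/* Hydro-only policy, no MPI, no extra hydro loop: n1 = 36, n2 = 2.
 * With ntop = 1728 top-level cells and ncells = 50000 cells in total:
 *   ntasks         = 36 * 1728 + 2 * (50000 - 1728) = 158752
 *   tasks_per_cell = ceil(158752 / 50000)           = 4
 * so the function returns 4 * 50000 = 200000 task slots. */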
/**
 * @brief Rebuild the space and tasks.
 *
...
@@ -4545,9 +4657,14 @@ void engine_init(struct engine *e, struct space *s,
      pthread_barrier_init(&e->run_barrier, NULL, e->nr_threads + 1) != 0)
    error("Failed to initialize barrier.");
-  /* Init the scheduler with enough tasks for the initial sorting tasks. */
-  const int nr_tasks = 2 * s->tot_cells + 2 * e->nr_threads;
-  scheduler_init(&e->sched, e->s, nr_tasks, nr_queues,
+  /* Expected average for tasks per cell. If set to zero we use a heuristic
+   * guess based on the numbers of cells and how many tasks per cell we
+   * expect. */
+  e->tasks_per_cell =
+      parser_get_opt_param_int(params, "Scheduler:tasks_per_cell", 0);
+
+  /* Init the scheduler. */
+  scheduler_init(&e->sched, e->s, engine_estimate_nr_tasks(e), nr_queues,
                  (policy & scheduler_flag_steal), e->nodeID, &e->threadpool);
  /* Allocate and init the threads. */
...
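As a usage note, the new option is read from the parameter file; a hypothetical override looks like this (the value 100 is purely illustrative, and the default of 0 selects the heuristic estimate):

Scheduler:
  tasks_per_cell: 100   # Force room for 100 tasks per cell instead of estimating.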
@@ -75,7 +75,6 @@ enum engine_policy {
extern const char *engine_policy_names[];

#define engine_queue_scale 1.2
-#define engine_maxtaskspercell 96
#define engine_maxproxies 64
#define engine_tasksreweight 1
#define engine_parts_size_grow 1.05
...
@@ -222,6 +221,10 @@ struct engine {
  struct link *links;
  int nr_links, size_links;
/* Average number of tasks per cell. Used to estimate the sizes
* of the various task arrays. */
int tasks_per_cell;
  /* Are we talkative ? */
  int verbose;
...
@@ -292,5 +295,6 @@ int engine_is_done(struct engine *e);
void engine_pin();
void engine_unpin();
void engine_clean(struct engine *e);
int engine_estimate_nr_tasks(struct engine *e);
#endif /* SWIFT_ENGINE_H */
...
@@ -1751,7 +1751,8 @@ void *runner_main(void *data) {
  struct runner *r = (struct runner *)data;
  struct engine *e = r->e;
  struct scheduler *sched = &e->sched;
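  /* The runner threads live for the whole simulation, so the seed below
   * stays valid on this stack; registering its address under the
   * scheduler's pthread key gives each thread a private, reproducible
   * RNG seed. */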
unsigned int seed = r->id;
pthread_setspecific(sched->local_seed_pointer, &seed);
  /* Main loop. */
  while (1) {
...
@@ -740,7 +740,11 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type,
  const int ind = atomic_inc(&s->tasks_next);

  /* Overflow? */
-  if (ind >= s->size) error("Task list overflow.");
+  if (ind >= s->size)
+    error(
+        "Task list overflow (%d). Need to increase "
+        "Scheduler:tasks_per_cell.",
+        ind);
  /* Get a pointer to the new task. */
  struct task *t = &s->tasks[ind];
...
@@ -1345,7 +1349,8 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
    if (qid >= s->nr_queues) error("Bad computed qid.");

    /* If no previous owner, pick a random queue. */
-    if (qid < 0) qid = rand() % s->nr_queues;
+    /* Note that getticks() is random enough */
+    if (qid < 0) qid = getticks() % s->nr_queues;
    /* Increase the waiting counter. */
    atomic_inc(&s->waiting);
@@ -1577,6 +1582,7 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
  s->size = 0;
  s->tasks = NULL;
  s->tasks_ind = NULL;
pthread_key_create(&s->local_seed_pointer, NULL);
  scheduler_reset(s, nr_tasks);
}
...
@@ -102,6 +102,9 @@ struct scheduler {
  /* The node we are working on. */
  int nodeID;
/* 'Pointer' to the seed for the random number generator */
pthread_key_t local_seed_pointer;
};

/* Inlined functions (for speed). */
...
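To make the purpose of local_seed_pointer concrete, here is a minimal sketch of how scheduler code could draw reproducible per-thread random numbers from the seeds the runners register; scheduler_thread_random() is a hypothetical helper, not part of this commit, and assumes <pthread.h> and <stdlib.h> are included:

/* Hypothetical helper: draw a thread-local random number from the seed
 * each runner stored with pthread_setspecific() in runner_main(). */
static int scheduler_thread_random(struct scheduler *s) {
  unsigned int *seed = (unsigned int *)pthread_getspecific(s->local_seed_pointer);
  return rand_r(seed); /* rand_r() is reentrant, unlike rand(). */
}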