diff --git a/examples/MultiTypes/multiTypes.yml b/examples/MultiTypes/multiTypes.yml
index 05d45595e9ec43b0849ed574f6b9922c19021a33..c1b37ab02698c71f75e548a6801e059c3e1741a9 100644
--- a/examples/MultiTypes/multiTypes.yml
+++ b/examples/MultiTypes/multiTypes.yml
@@ -28,7 +28,7 @@ SPH:
   resolution_eta:    1.2348   # Target smoothing length in units of the mean inter-particle separation (1.2348 == 48Ngbs with the cubic spline kernel).
   delta_neighbours:  0.1      # The tolerance for the targetted number of neighbours.
   CFL_condition:     0.1      # Courant-Friedrich-Levy condition for time integration.
-  
+
 # Parameters related to the initial conditions
 InitialConditions:
   file_name:  ./multiTypes.hdf5  # The file to read
diff --git a/examples/parameter_example.yml b/examples/parameter_example.yml
index 9c3cee7630edf1be1e161a3e70547f06e6108ebd..6327540c27753ea61a79d7fb4c16d60c5f00635d 100644
--- a/examples/parameter_example.yml
+++ b/examples/parameter_example.yml
@@ -8,12 +8,13 @@ InternalUnitSystem:
 
 # Parameters for the task scheduling
 Scheduler:
-  nr_queues:           0         # (Optional) The number of task queues to use. Use 0 to let the system decide.
-  cell_max_size:       8000000   # (Optional) Maximal number of interactions per task if we force the split (this is the default value).
-  cell_sub_size_pair:  256000000 # (Optional) Maximal number of interactions per sub-pair task (this is the default value).
-  cell_sub_size_self:  32000     # (Optional) Maximal number of interactions per sub-self task (this is the default value).
-  cell_split_size:     400       # (Optional) Maximal number of particles per cell (this is the default value).
-  max_top_level_cells: 12        # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value).
+  nr_queues:           0         # (Optional) The number of task queues to use. Use 0 to let the system decide.
+  cell_max_size:       8000000   # (Optional) Maximal number of interactions per task if we force the split (this is the default value).
+  cell_sub_size_pair:  256000000 # (Optional) Maximal number of interactions per sub-pair task (this is the default value).
+  cell_sub_size_self:  32000     # (Optional) Maximal number of interactions per sub-self task (this is the default value).
+  cell_split_size:     400       # (Optional) Maximal number of particles per cell (this is the default value).
+  max_top_level_cells: 12        # (Optional) Maximal number of top-level cells in any dimension. The number of top-level cells will be the cube of this (this is the default value).
+  tasks_per_cell:      0         # (Optional) The average number of tasks per cell. If not large enough the simulation will fail (means guess...).
 
 # Parameters governing the time integration (Set dt_min and dt_max to the same value for a fixed time-step run.)
 TimeIntegration:
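The new Scheduler:tasks_per_cell entry above fixes the size of the scheduler's task array up front: a non-zero value reserves tot_cells * tasks_per_cell task slots, a value of 0 defers to the heuristic added to engine.c later in this patch, and a value that turns out too small aborts the run with the "Task list overflow" error extended in scheduler.c below. A standalone sketch of the sizing arithmetic, not part of the patch (the cell count and the value 40 are illustrative only, not defaults):

#include <stdio.h>

/* Illustrative only: how a non-zero tasks_per_cell translates into the
 * number of task slots reserved by the scheduler. */
static int task_slots(int tot_cells, int tasks_per_cell) {
  if (tasks_per_cell > 0) return tot_cells * tasks_per_cell;
  return 0; /* 0 means "guess": see engine_estimate_nr_tasks() below. */
}

int main(void) {
  const int tot_cells = 12 * 12 * 12; /* e.g. max_top_level_cells: 12 -> 1728 */
  printf("tasks_per_cell = 40 -> %d task slots\n", task_slots(tot_cells, 40));
  return 0;
}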
diff --git a/m4/ax_gcc_archflag.m4 b/m4/ax_gcc_archflag.m4
index bba53a4c8a8cb363a017c55c4e4ebbb4c6528dae..298db809e82130b77758f5e66c29c276ceb8266a 100644
--- a/m4/ax_gcc_archflag.m4
+++ b/m4/ax_gcc_archflag.m4
@@ -108,6 +108,7 @@ case $host_cpu in
        *3?6[[ae]]?:*:*:*) ax_gcc_arch="ivybridge core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
        *3?6[[cf]]?:*:*:*|*4?6[[56]]?:*:*:*) ax_gcc_arch="haswell core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
        *3?6d?:*:*:*|*4?6f?:*:*:*) ax_gcc_arch="broadwell core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
+       *9?6[[de]]?:*:*:*) ax_gcc_arch="kabylake core-avx2 core-avx-i corei7-avx corei7 core2 pentium-m pentium3 pentiumpro" ;;
        *1?6c?:*:*:*|*2?6[[67]]?:*:*:*|*3?6[[56]]?:*:*:*) ax_gcc_arch="bonnell atom core2 pentium-m pentium3 pentiumpro" ;;
        *3?67?:*:*:*|*[[45]]?6[[ad]]?:*:*:*) ax_gcc_arch="silvermont atom core2 pentium-m pentium3 pentiumpro" ;;
        *000?f[[012]]?:*:*:*|?f[[012]]?:*:*:*|f[[012]]?:*:*:*) ax_gcc_arch="pentium4 pentiumpro" ;;
diff --git a/src/debug.c b/src/debug.c
index 903d7e5a2e30bca8980078991c5155830f5e4c43..c9f12370f0e59f812afc72eb55b0bdaa032edfe1 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -32,6 +32,7 @@
 #include "debug.h"
 
 /* Local includes. */
+#include "active.h"
 #include "cell.h"
 #include "engine.h"
 #include "hydro.h"
@@ -269,6 +270,79 @@ int checkCellhdxmax(const struct cell *c, int *depth) {
   return result;
 }
 
+/**
+ * @brief map function for dumping cells. In MPI mode locally active cells
+ * only.
+ */
+static void dumpCells_map(struct cell *c, void *data) {
+  uintptr_t *ldata = (uintptr_t *)data;
+  FILE *file = (FILE *)ldata[0];
+  struct engine *e = (struct engine *)ldata[1];
+  float ntasks = c->nr_tasks;
+
+#if SWIFT_DEBUG_CHECKS
+  /* The c->nr_tasks field does not include all the tasks. So let's check this
+   * the hard way. Note pairs share the task 50/50 with the other cell. */
+  ntasks = 0.0f;
+  struct task *tasks = e->sched.tasks;
+  int nr_tasks = e->sched.nr_tasks;
+  for (int k = 0; k < nr_tasks; k++) {
+    if (tasks[k].cj == NULL) {
+      if (c == tasks[k].ci) {
+        ntasks = ntasks + 1.0f;
+      }
+    } else {
+      if (c == tasks[k].ci || c == tasks[k].cj) {
+        ntasks = ntasks + 0.5f;
+      }
+    }
+  }
+#endif
+
+  /* Only locally active cells are dumped. */
+  if (c->count > 0 || c->gcount > 0 || c->scount > 0)
+    fprintf(file,
+            "  %6.3f %6.3f %6.3f %6.3f %6.3f %6.3f %6d %6d %6d %6d "
+            "%6.1f %20lld %6d %6d %6d %6d\n",
+            c->loc[0], c->loc[1], c->loc[2], c->width[0], c->width[1],
+            c->width[2], c->count, c->gcount, c->scount, c->depth, ntasks,
+            c->ti_end_min, get_time_bin(c->ti_end_min), (c->super == c),
+            cell_is_active(c, e), c->nodeID);
+}
+
+/**
+ * @brief Dump the location, depth, task counts and timebins and active state,
+ * for all cells to a simple text file.
+ *
+ * @param prefix base output filename
+ * @param s the space holding the cells to dump.
+ */
+void dumpCells(const char *prefix, struct space *s) {
+
+  FILE *file = NULL;
+
+  /* Name of output file. */
+  static int nseq = 0;
+  char fname[200];
+  int uniq = atomic_inc(&nseq);
+  sprintf(fname, "%s_%03d.dat", prefix, uniq);
+
+  file = fopen(fname, "w");
+
+  /* Header. */
+  fprintf(file,
+          "# %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s "
+          "%20s %6s %6s %6s\n",
+          "x", "y", "z", "xw", "yw", "zw", "count", "gcount", "scount",
+          "depth", "tasks", "ti_end_min", "timebin", "issuper", "active",
+          "rank");
+
+  uintptr_t data[2];
+  data[0] = (size_t)file;
+  data[1] = (size_t)s->e;
+  space_map_cells_pre(s, 1, dumpCells_map, &data);
+  fclose(file);
+}
+
 #ifdef HAVE_METIS
 
 /**
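dumpCells_map() above receives only a single void * of user data from space_map_cells_pre(), so dumpCells() packs both the output FILE * and the engine pointer into a two-element uintptr_t array. A minimal standalone sketch of that pack/unpack pattern, with toy stand-ins for SWIFT's cell and engine types (nothing here is SWIFT API, only the idea):

#include <stdint.h>
#include <stdio.h>

/* Toy stand-ins for SWIFT's cell/engine, just to show the pattern. */
struct toy_cell { int count; };
struct toy_engine { int step; };

/* The callback gets one opaque pointer; unpack both values from it. */
static void dump_one(struct toy_cell *c, void *data) {
  uintptr_t *ldata = (uintptr_t *)data;
  FILE *file = (FILE *)ldata[0];
  struct toy_engine *e = (struct toy_engine *)ldata[1];
  fprintf(file, "step=%d count=%d\n", e->step, c->count);
}

int main(void) {
  struct toy_cell c = {42};
  struct toy_engine e = {7};

  /* Pack the two pointers the same way dumpCells() does. */
  uintptr_t data[2];
  data[0] = (uintptr_t)stdout;
  data[1] = (uintptr_t)&e;
  dump_one(&c, data);
  return 0;
}

Called as dumpCells("cells", s), the real function writes one file per invocation, named cells_000.dat, cells_001.dat and so on, thanks to the atomically incremented sequence number.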
diff --git a/src/debug.h b/src/debug.h
index 7dca848b6bf4e44de5f40fa8e1c0849e8ee3d0b4..7bea11de6ac3fe82ae9b3f1be84249f75742ecaf 100644
--- a/src/debug.h
+++ b/src/debug.h
@@ -33,6 +33,7 @@ void printgParticle_single(struct gpart *gp);
 
 int checkSpacehmax(struct space *s);
 int checkCellhdxmax(const struct cell *c, int *depth);
+void dumpCells(const char *prefix, struct space *s);
 
 #ifdef HAVE_METIS
 #include "metis.h"
diff --git a/src/engine.c b/src/engine.c
index 93481aca3d25fd9755b7c7f69ef25ddb4d9d9d06..8f2ce85a970233178eebaa9ffd0f6598f7c2b467 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -2507,7 +2507,7 @@ void engine_maketasks(struct engine *e) {
   const ticks tic = getticks();
 
   /* Re-set the scheduler. */
-  scheduler_reset(sched, s->tot_cells * engine_maxtaskspercell);
+  scheduler_reset(sched, engine_estimate_nr_tasks(e));
 
   /* Construct the firt hydro loop over neighbours */
   if (e->policy & engine_policy_hydro) {
@@ -2902,7 +2902,8 @@ void engine_print_task_counts(struct engine *e) {
     else
       counts[(int)tasks[k].type] += 1;
   }
-  message("Total = %d", nr_tasks);
+  message("Total = %d (per cell = %d)", nr_tasks,
+          (int)ceil((double)nr_tasks / e->s->tot_cells));
 #ifdef WITH_MPI
   printf("[%04i] %s engine_print_task_counts: task counts are [ %s=%i",
          e->nodeID, clocks_get_timesincestart(), taskID_names[0], counts[0]);
@@ -2923,6 +2924,117 @@ void engine_print_task_counts(struct engine *e) {
       clocks_getunit());
 }
 
+/**
+ * @brief if necessary, estimate the number of tasks required given
+ * the current tasks in use and the numbers of cells.
+ *
+ * If e->tasks_per_cell is set greater than 0 then that value is used
+ * as the estimate of the average number of tasks per cell,
+ * otherwise we attempt an estimate.
+ *
+ * @param e the #engine
+ *
+ * @return the estimated total number of tasks
+ */
+int engine_estimate_nr_tasks(struct engine *e) {
+
+  int tasks_per_cell = e->tasks_per_cell;
+  if (tasks_per_cell > 0) return e->s->tot_cells * tasks_per_cell;
+
+  /* Our guess differs depending on the types of tasks we are using, but we
+   * basically use a formula <n1>*ntopcells + <n2>*(totcells - ntopcells).
+   * Where <n1> is the expected maximum tasks per top-level/super cell, and
+   * <n2> the expected maximum tasks for all other cells. These should give
+   * a safe upper limit.
+   */
+  int n1 = 0;
+  int n2 = 0;
+  if (e->policy & engine_policy_hydro) {
+    n1 += 36;
+    n2 += 2;
+#ifdef WITH_MPI
+    n1 += 6;
+#endif
+
+#ifdef EXTRA_HYDRO_LOOP
+    n1 += 15;
+#ifdef WITH_MPI
+    n1 += 2;
+#endif
+#endif
+  }
+  if (e->policy & engine_policy_self_gravity) {
+    n1 += 24;
+    n2 += 1;
+#ifdef WITH_MPI
+    n2 += 2;
+#endif
+  }
+  if (e->policy & engine_policy_external_gravity) {
+    n1 += 2;
+  }
+  if (e->policy & engine_policy_cosmology) {
+    n1 += 2;
+  }
+  if (e->policy & engine_policy_cooling) {
+    n1 += 2;
+  }
+  if (e->policy & engine_policy_sourceterms) {
+    n1 += 2;
+  }
+  if (e->policy & engine_policy_stars) {
+    n1 += 2;
+  }
+
+#ifdef WITH_MPI
+
+  /* We need fewer tasks per rank when using MPI, but we could have
+   * imbalances, so we need to work using the locally active cells, not just
+   * some equipartition amongst the nodes. Don't want to recurse the whole
+   * cell tree, so just make a guess of the maximum possible total cells. */
+  int ntop = 0;
+  int ncells = 0;
+  for (int k = 0; k < e->s->nr_cells; k++) {
+    struct cell *c = &e->s->cells_top[k];
+
+    /* Any cells with particles will have tasks (local & foreign). */
+    int nparts = c->count + c->gcount + c->scount;
+    if (nparts > 0) {
+      ntop++;
+      ncells++;
+
+      /* Count cell depth until we get below the parts per cell threshold. */
+      int depth = 0;
+      while (nparts > space_splitsize) {
+        depth++;
+        nparts /= 8;
+        ncells += (1 << (depth * 3));
+      }
+    }
+  }
+
+  /* If no local cells, we are probably still initialising, so just keep
+   * room for the top-level. */
+  if (ncells == 0) {
+    ntop = e->s->nr_cells;
+    ncells = ntop;
+  }
+#else
+  int ntop = e->s->nr_cells;
+  int ncells = e->s->tot_cells;
+#endif
+
+  double ntasks = n1 * ntop + n2 * (ncells - ntop);
+  tasks_per_cell = ceil(ntasks / ncells);
+
+  if (tasks_per_cell < 1.0) tasks_per_cell = 1.0;
+  if (e->verbose)
+    message("tasks per cell estimated as: %d, maximum tasks: %d",
+            tasks_per_cell, ncells * tasks_per_cell);
+
+  return ncells * tasks_per_cell;
+}
+
 /**
  * @brief Rebuild the space and tasks.
  *
@@ -4503,9 +4615,14 @@ void engine_init(struct engine *e, struct space *s,
       pthread_barrier_init(&e->run_barrier, NULL, e->nr_threads + 1) != 0)
     error("Failed to initialize barrier.");
 
-  /* Init the scheduler with enough tasks for the initial sorting tasks. */
-  const int nr_tasks = 2 * s->tot_cells + 2 * e->nr_threads;
-  scheduler_init(&e->sched, e->s, nr_tasks, nr_queues,
+  /* Expected average for tasks per cell. If set to zero we use a heuristic
+   * guess based on the numbers of cells and how many tasks per cell we expect.
+   */
+  e->tasks_per_cell =
+      parser_get_opt_param_int(params, "Scheduler:tasks_per_cell", 0);
+
+  /* Init the scheduler. */
+  scheduler_init(&e->sched, e->s, engine_estimate_nr_tasks(e), nr_queues,
                  (policy & scheduler_flag_steal), e->nodeID, &e->threadpool);
 
   /* Allocate and init the threads. */
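When tasks_per_cell is left at zero, the heuristic above reduces to ntasks = n1 * ntop + n2 * (ncells - ntop), rounded up to a whole number of tasks per cell. The standalone program below, not part of the patch, replays that arithmetic for a non-MPI run with hydro and self-gravity; the per-policy increments (36/2 and 24/1) come from the patch, while the two cell counts are made-up example inputs:

#include <math.h>
#include <stdio.h>

/* Rough replay of engine_estimate_nr_tasks() for a non-MPI run with
 * hydro and self-gravity enabled. ntop/ncells are example numbers only. */
int main(void) {
  const int ntop = 1728;    /* top-level cells with particles, e.g. 12^3 */
  const int ncells = 20000; /* total cells after splitting (made up) */

  int n1 = 0, n2 = 0;
  n1 += 36; n2 += 2; /* engine_policy_hydro */
  n1 += 24; n2 += 1; /* engine_policy_self_gravity */

  const double ntasks = (double)n1 * ntop + (double)n2 * (ncells - ntop);
  int tasks_per_cell = (int)ceil(ntasks / ncells);
  if (tasks_per_cell < 1) tasks_per_cell = 1;

  printf("tasks per cell estimated as: %d, maximum tasks: %d\n",
         tasks_per_cell, ncells * tasks_per_cell);
  return 0;
}

With these inputs the estimate works out to 8 tasks per cell, i.e. 160000 task slots.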
diff --git a/src/engine.h b/src/engine.h
index 47a30a99b696304365a0ddf31d4499628a649a37..90645b308529e49a370bf91f2b3d05195b2b08d8 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -75,7 +75,6 @@ enum engine_policy {
 extern const char *engine_policy_names[];
 
 #define engine_queue_scale 1.2
-#define engine_maxtaskspercell 96
 #define engine_maxproxies 64
 #define engine_tasksreweight 1
 #define engine_parts_size_grow 1.05
@@ -222,6 +221,10 @@ struct engine {
   struct link *links;
   int nr_links, size_links;
 
+  /* Average number of tasks per cell. Used to estimate the sizes
+   * of the various task arrays. */
+  int tasks_per_cell;
+
   /* Are we talkative ? */
   int verbose;
 
@@ -292,5 +295,6 @@ int engine_is_done(struct engine *e);
 void engine_pin();
 void engine_unpin();
 void engine_clean(struct engine *e);
+int engine_estimate_nr_tasks(struct engine *e);
 
 #endif /* SWIFT_ENGINE_H */
diff --git a/src/gravity_cache.h b/src/gravity_cache.h
index 14b672233aa9958ec39af32a87baead98c0bae04..b9fde71dd887d704fac5c730aee33767de8a28f7 100644
--- a/src/gravity_cache.h
+++ b/src/gravity_cache.h
@@ -131,7 +131,7 @@ static INLINE void gravity_cache_init(struct gravity_cache *c, int count) {
  */
 __attribute__((always_inline)) INLINE void gravity_cache_populate(
     struct gravity_cache *c, const struct gpart *restrict gparts, int gcount,
-    int gcount_padded, const double shift[3]) {
+    int gcount_padded, const double shift[3], const struct cell *cell) {
 
   /* Make the compiler understand we are in happy vectorization land */
   float *restrict x = c->x;
@@ -161,9 +161,9 @@ __attribute__((always_inline)) INLINE void gravity_cache_populate(
 
   /* Pad the caches */
   for (int i = gcount; i < gcount_padded; ++i) {
-    x[i] = 0.f;
-    y[i] = 0.f;
-    z[i] = 0.f;
+    x[i] = -3.f * cell->width[0];
+    y[i] = -3.f * cell->width[0];
+    z[i] = -3.f * cell->width[0];
     epsilon[i] = 0.f;
     m[i] = 0.f;
   }
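The patch does not say why the padding entries move from the origin to -3 times the cell width; a plausible reading (an assumption here, not stated in the diff) is that a dummy entry at (0,0,0) of the cell-centred frame can coincide with a real particle, and a zero separation turns the 1/r factor into infinity, which the zero mass then converts into NaN (0 * inf) that poisons the whole vector lane. Placing the dummies well outside the cell keeps every separation finite, while the zero mass keeps their contribution at exactly zero. A tiny standalone illustration of the failure mode (plain IEEE arithmetic; compiling with -ffast-math would hide it):

#include <math.h>
#include <stdio.h>

int main(void) {
  /* A zero-mass padding entry sitting exactly on top of a real particle. */
  const float dx = 0.f, dy = 0.f, dz = 0.f, mass = 0.f;

  const float r2 = dx * dx + dy * dy + dz * dz;
  const float r_inv = 1.f / sqrtf(r2);    /* 1/0 -> inf */
  const float acc = mass * r_inv * r_inv; /* 0 * inf -> NaN */

  printf("r_inv = %f  acc = %f\n", r_inv, acc);

  /* With the padding moved to -3.f * cell->width[0], r2 stays strictly
   * positive and the zero mass makes the contribution exactly 0. */
  return 0;
}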
diff --git a/src/runner.c b/src/runner.c
index ec08b743452508364a7f1900963aae73061a944d..dd3f3c8e4e59af15485ece16665ffdae85703117 100644
--- a/src/runner.c
+++ b/src/runner.c
@@ -1749,7 +1749,8 @@ void *runner_main(void *data) {
   struct runner *r = (struct runner *)data;
   struct engine *e = r->e;
   struct scheduler *sched = &e->sched;
-
+  unsigned int seed = r->id;
+  pthread_setspecific(sched->local_seed_pointer, &seed);
   /* Main loop. */
   while (1) {
 
diff --git a/src/runner_doiact.h b/src/runner_doiact.h
index c07d70f3e48bb6f1c9e7e343a50cdbba71da0785..14eecaa2c0c126b721a400e63298e1d55a9c98da 100644
--- a/src/runner_doiact.h
+++ b/src/runner_doiact.h
@@ -1039,7 +1039,7 @@ void DOPAIR1(struct runner *r, struct cell *ci, struct cell *cj, const int sid,
       const float hj = pj->h;
       const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max + rshift;
       if (dj - rshift > di_max) continue;
-
+
       double pjx[3];
       for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
       const float hjg2 = hj * hj * kernel_gamma2;
@@ -1451,7 +1451,7 @@ void DOPAIR2(struct runner *r, struct cell *ci, struct cell *cj) {
       const float hj = pj->h;
       const double dj = sort_j[pjd].d - hj * kernel_gamma - dx_max + rshift;
       if (dj - rshift > di_max) continue;
-
+
       double pjx[3];
       for (int k = 0; k < 3; k++) pjx[k] = pj->x[k] + shift[k];
       const float hjg2 = hj * hj * kernel_gamma2;
diff --git a/src/runner_doiact_grav.h b/src/runner_doiact_grav.h
index 01ea6a073211a08430e77721f4c2e60ef7adfd04..69f821f0a991cb797a6ae6b2002ed83986759d86 100644
--- a/src/runner_doiact_grav.h
+++ b/src/runner_doiact_grav.h
@@ -194,9 +194,9 @@ void runner_dopair_grav_pp_full(struct runner *r, struct cell *ci,
 
   /* Fill the caches */
   gravity_cache_populate(ci_cache, gparts_i, gcount_i, gcount_padded_i,
-                         loc_mean);
+                         loc_mean, ci);
   gravity_cache_populate(cj_cache, gparts_j, gcount_j, gcount_padded_j,
-                         loc_mean);
+                         loc_mean, cj);
 
   /* Ok... Here we go ! */
 
@@ -542,9 +542,9 @@ void runner_dopair_grav_pp_truncated(struct runner *r, struct cell *ci,
 
   /* Fill the caches */
   gravity_cache_populate(ci_cache, gparts_i, gcount_i, gcount_padded_i,
-                         loc_mean);
+                         loc_mean, ci);
   gravity_cache_populate(cj_cache, gparts_j, gcount_j, gcount_padded_j,
-                         loc_mean);
+                         loc_mean, cj);
 
   /* Ok... Here we go ! */
 
@@ -941,7 +941,7 @@ void runner_doself_grav_pp_full(struct runner *r, struct cell *c) {
   /* Computed the padded counts */
   const int gcount_padded = gcount - (gcount % VEC_SIZE) + VEC_SIZE;
 
-  gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc);
+  gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc, c);
 
   /* Ok... Here we go ! */
 
@@ -1155,7 +1155,7 @@ void runner_doself_grav_pp_truncated(struct runner *r, struct cell *c) {
   /* Computed the padded counts */
   const int gcount_padded = gcount - (gcount % VEC_SIZE) + VEC_SIZE;
 
-  gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc);
+  gravity_cache_populate(ci_cache, gparts, gcount, gcount_padded, loc, c);
 
   /* Ok... Here we go ! */
 
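runner_main() now registers a per-thread seed under the scheduler's new pthread key (created in scheduler_init() in the next file). The hunk that consumes the seed is not part of this diff, so the rand_r() call in the standalone sketch below is an assumption about how such a thread-local seed is typically used; the point is simply that every runner owns reentrant random state instead of sharing the global rand() state:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_key_t local_seed_pointer;

static void *worker(void *arg) {
  /* Each thread keeps its seed on its own stack and registers it,
   * just as runner_main() does with r->id. */
  unsigned int seed = (unsigned int)(uintptr_t)arg;
  pthread_setspecific(local_seed_pointer, &seed);

  /* Later, code on this thread can draw numbers without any locking. */
  unsigned int *my_seed = (unsigned int *)pthread_getspecific(local_seed_pointer);
  printf("thread %u drew queue %d\n", (unsigned)(uintptr_t)arg,
         rand_r(my_seed) % 8);
  return NULL;
}

int main(void) {
  pthread_key_create(&local_seed_pointer, NULL);

  pthread_t threads[4];
  for (uintptr_t id = 0; id < 4; id++)
    pthread_create(&threads[id], NULL, worker, (void *)id);
  for (int id = 0; id < 4; id++) pthread_join(threads[id], NULL);

  pthread_key_delete(local_seed_pointer);
  return 0;
}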
diff --git a/src/scheduler.c b/src/scheduler.c
index 4081cde0489b1b439ceb46fc9b4e191541f15bef..d0eeb8cb726cf53321d1b4e6a028f2914246cbf2 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -772,7 +772,11 @@ struct task *scheduler_addtask(struct scheduler *s, enum task_types type,
   const int ind = atomic_inc(&s->tasks_next);
 
   /* Overflow? */
-  if (ind >= s->size) error("Task list overflow.");
+  if (ind >= s->size)
+    error(
+        "Task list overflow (%d). Need to increase "
+        "Scheduler:tasks_per_cell.",
+        ind);
 
   /* Get a pointer to the new task. */
   struct task *t = &s->tasks[ind];
@@ -1377,7 +1381,8 @@ void scheduler_enqueue(struct scheduler *s, struct task *t) {
     if (qid >= s->nr_queues) error("Bad computed qid.");
 
     /* If no previous owner, pick a random queue. */
-    if (qid < 0) qid = rand() % s->nr_queues;
+    /* Note that getticks() is random enough */
+    if (qid < 0) qid = getticks() % s->nr_queues;
 
     /* Increase the waiting counter. */
     atomic_inc(&s->waiting);
@@ -1609,6 +1614,7 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
   s->size = 0;
   s->tasks = NULL;
   s->tasks_ind = NULL;
+  pthread_key_create(&s->local_seed_pointer, NULL);
   scheduler_reset(s, nr_tasks);
 }
 
diff --git a/src/scheduler.h b/src/scheduler.h
index ac654580b2af2ffb506dc3fd9f0b988b89effbd0..0ef9426ed3248742de619d64f53bdd6b9954ed2c 100644
--- a/src/scheduler.h
+++ b/src/scheduler.h
@@ -102,6 +102,9 @@ struct scheduler {
 
   /* The node we are working on. */
   int nodeID;
+
+  /* 'Pointer' to the seed for the random number generator */
+  pthread_key_t local_seed_pointer;
 };
 
 /* Inlined functions (for speed). */
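The other scheduler change drops rand() in favour of getticks() when a task has no preferred queue. The patch only remarks that getticks() is "random enough"; a likely additional benefit (an assumption, not stated in the diff) is that reading a timer avoids touching the C library's shared random state from many runners at once. A standalone sketch of the idea, using clock_gettime() nanoseconds as a portable stand-in for SWIFT's cycle-counter-based getticks():

#include <stdio.h>
#include <time.h>

/* Portable stand-in for getticks(): a fine-grained monotonic counter whose
 * low bits are spread out enough to act as a cheap queue selector. */
static unsigned long long ticks(void) {
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);
  return (unsigned long long)ts.tv_sec * 1000000000ull +
         (unsigned long long)ts.tv_nsec;
}

int main(void) {
  const int nr_queues = 8;

  /* Pick a queue the way scheduler_enqueue() now does when qid < 0. */
  for (int i = 0; i < 5; i++) {
    const int qid = (int)(ticks() % nr_queues);
    printf("picked queue %d\n", qid);
  }
  return 0;
}

The distribution only needs to be rough here, since the choice merely spreads unowned tasks across the available queues.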