diff --git a/examples/main.c b/examples/main.c index fa0a7764a3af91581c367f8618d3932bdeb3b8f5..69b0fcd55c4508af1825954b7a3088c258f9353a 100644 --- a/examples/main.c +++ b/examples/main.c @@ -1089,6 +1089,9 @@ int main(int argc, char *argv[]) { /* Dump the task data using the given frequency. */ if (dump_tasks && (dump_tasks == 1 || j % dump_tasks == 1)) { task_dump_all(&e, j + 1); + + /* Generate the task statistics. */ + task_dump_stats(&e, j + 1); } #endif diff --git a/src/partition.c b/src/partition.c index bbd7454dd63be6ab5192558fb4a2e3399ea03cfc..80855be2dbf72b5fde426e8861a7a19a6b5235e4 100644 --- a/src/partition.c +++ b/src/partition.c @@ -77,6 +77,15 @@ const char *repartition_name[] = { /* Local functions, if needed. */ static int check_complete(struct space *s, int verbose, int nregions); +#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) +/* + * Repartition fixed costs per type/subtype. These are determined from the + * statistics output produced when running with task debugging enabled. + */ +static double repartition_costs[task_type_count][task_subtype_count]; +static void repart_init_fixed_costs(int policy); +#endif + /* Vectorisation support */ /* ===================== */ @@ -1195,10 +1204,11 @@ void partition_gather_weights(void *map_data, int num_elements, struct task *t = &tasks[i]; /* Skip un-interesting tasks. */ - if (t->cost == 0.f) continue; + if (t->type == task_type_send || t->type == task_type_recv || + t->type == task_type_logger || t->implicit) continue; - /* Get the task weight based on costs. */ - double w = (double)t->cost; + /* Get the task weight based on fixed cost for this task type. */ + double w = repartition_costs[t->type][t->subtype]; /* Get the top-level cells involved. 
*/ struct cell *ci, *cj; @@ -1527,6 +1537,7 @@ void partition_repartition(struct repartition *reparttype, int nodeID, #if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) ticks tic = getticks(); + repart_init_fixed_costs(s->e->policy); if (reparttype->type == REPART_METIS_VERTEX_EDGE_COSTS) { repart_edge_metis(1, 1, 0, reparttype, nodeID, nr_nodes, s, tasks, @@ -1700,7 +1711,7 @@ void partition_initial_partition(struct partition *initial_partition, /** * @brief Initialises the partition and re-partition scheme from the parameter - * file + * file. * * @param partition The #partition scheme to initialise. * @param repartition The #repartition scheme to initialise. @@ -1843,6 +1854,95 @@ void partition_init(struct partition *partition, #endif } +#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) +/** + * @brief Set the fixed costs for repartition using METIS. + * + * These are determined using a run with task debugging enabled which gives a + * statistical analysis condensed into a .h file. Note that some tasks have + * different costs depending on the engine policies, for instance the kicks + * do work with self gravity and hydro, we attempt to allow for that. Finally + * note this is a statistical solution, so requires that there are sufficient + * tasks on each rank so that the fixed costs do the right thing on average, + * you may like to use task ticks as weights if this isn't working. + * + * @param policy the #engine policy. + */ +static void repart_init_fixed_costs(int policy) { + + /* Set the default fixed cost. */ + for (int j = 0; j < task_type_count; j++) { + for (int k = 0; k < task_subtype_count; k++) { + repartition_costs[j][k] = 1.0; + } + } + + /* TODO: these may be separable, so we could have costs for each policy + * addition, if separated need to take care with the relative scaling. 
*/ + if ((policy & engine_policy_hydro) && (policy & engine_policy_self_gravity)) { + + /* EAGLE_50 -s -G -S 8 nodes 16 cores */ + repartition_costs[1][0] = 45842; /* sort/none */ + repartition_costs[2][1] = 15609; /* self/density */ + repartition_costs[2][3] = 18830; /* self/force */ + repartition_costs[2][4] = 202588; /* self/grav */ + repartition_costs[3][1] = 67; /* pair/density */ + repartition_costs[3][3] = 207; /* pair/force */ + repartition_costs[3][4] = 1507; /* pair/grav */ + repartition_costs[4][1] = 36268; /* sub_self/density */ + repartition_costs[4][3] = 52252; /* sub_self/force */ + repartition_costs[5][1] = 709; /* sub_pair/density */ + repartition_costs[5][3] = 1258; /* sub_pair/force */ + repartition_costs[6][0] = 1135; /* init_grav/none */ + repartition_costs[9][0] = 160; /* ghost/none */ + repartition_costs[12][0] = 7176; /* drift_part/none */ + repartition_costs[13][0] = 4502; /* drift_gpart/none */ + repartition_costs[15][0] = 8234; /* end_force/none */ + repartition_costs[16][0] = 15508; /* kick1/none */ + repartition_costs[17][0] = 15780; /* kick2/none */ + repartition_costs[18][0] = 19848; /* timestep/none */ + repartition_costs[21][0] = 4105; /* grav_long_range/none */ + repartition_costs[22][0] = 68; /* grav_mm/none */ + repartition_costs[24][0] = 16785; /* grav_down/none */ + repartition_costs[25][0] = 70632; /* grav_mesh/none */ + + } else if (policy & engine_policy_self_gravity) { + + /* EAGLE_50 -G 8 nodes 16 cores, scaled to match self/grav. 
*/ + repartition_costs[2][4] = 202588; /* self/grav */ + repartition_costs[3][4] = 1760; /* pair/grav */ + repartition_costs[6][0] = 1610; /* init_grav/none */ + repartition_costs[13][0] = 999; /* drift_gpart/none */ + repartition_costs[15][0] = 3481; /* end_force/none */ + repartition_costs[16][0] = 6336; /* kick1/none */ + repartition_costs[17][0] = 6343; /* kick2/none */ + repartition_costs[18][0] = 13864; /* timestep/none */ + repartition_costs[21][0] = 1422; /* grav_long_range/none */ + repartition_costs[22][0] = 71; /* grav_mm/none */ + repartition_costs[24][0] = 16011; /* grav_down/none */ + repartition_costs[25][0] = 60414; /* grav_mesh/none */ + } else if (policy & engine_policy_hydro) { + + /* EAGLE_50 -s 8 nodes 16 cores, not scaled, but similar. */ + repartition_costs[1][0] = 52733; /* sort/none */ + repartition_costs[2][1] = 15458; /* self/density */ + repartition_costs[2][3] = 19212; /* self/force */ + repartition_costs[3][1] = 74; /* pair/density */ + repartition_costs[3][3] = 242; /* pair/force */ + repartition_costs[4][1] = 42895; /* sub_self/density */ + repartition_costs[4][3] = 64254; /* sub_self/force */ + repartition_costs[5][1] = 818; /* sub_pair/density */ + repartition_costs[5][3] = 1443; /* sub_pair/force */ + repartition_costs[9][0] = 159; /* ghost/none */ + repartition_costs[12][0] = 6708; /* drift_part/none */ + repartition_costs[15][0] = 6479; /* end_force/none */ + repartition_costs[16][0] = 6609; /* kick1/none */ + repartition_costs[17][0] = 6975; /* kick2/none */ + repartition_costs[18][0] = 5229; /* timestep/none */ + } +} +#endif + /* General support */ /* =============== */ diff --git a/src/scheduler.c b/src/scheduler.c index 2ae8f6785434af021b52dd2d6586b4e2dc5d68bb..d40971482a273fd3b3d2d65a166d1340e7cd4c9d 100644 --- a/src/scheduler.c +++ b/src/scheduler.c @@ -1817,16 +1817,11 @@ void scheduler_reweight(struct scheduler *s, int verbose) { for (int k = nr_tasks - 1; k >= 0; k--) { struct task *t = &tasks[tid[k]]; t->weight = 0.f; 
-#if defined(WITH_MPI) && (defined(HAVE_PARMETIS) || defined(HAVE_METIS)) - t->cost = 0.f; -#endif + for (int j = 0; j < t->nr_unlock_tasks; j++) if (t->unlock_tasks[j]->weight > t->weight) t->weight = t->unlock_tasks[j]->weight; float cost = 0.f; -#if defined(WITH_MPI) && (defined(HAVE_PARMETIS) || defined(HAVE_METIS)) - int partcost = 1; -#endif const float count_i = (t->ci != NULL) ? t->ci->hydro.count : 0.f; const float count_j = (t->cj != NULL) ? t->cj->hydro.count : 0.f; @@ -1847,9 +1842,9 @@ void scheduler_reweight(struct scheduler *s, int verbose) { break; case task_type_self: - if (t->subtype == task_subtype_grav) + if (t->subtype == task_subtype_grav) { cost = 1.f * (wscale * gcount_i) * gcount_i; - else if (t->subtype == task_subtype_external_grav) + } else if (t->subtype == task_subtype_external_grav) cost = 1.f * wscale * gcount_i; else if (t->subtype == task_subtype_stars_density) cost = 1.f * wscale * scount_i * count_i; @@ -1949,18 +1944,12 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = wscale * count_i + wscale * gcount_i; break; case task_type_send: -#if defined(WITH_MPI) && (defined(HAVE_PARMETIS) || defined(HAVE_METIS)) - partcost = 0; -#endif if (count_i < 1e5) cost = 10.f * (wscale * count_i) * count_i; else cost = 2e9; break; case task_type_recv: -#if defined(WITH_MPI) && (defined(HAVE_PARMETIS) || defined(HAVE_METIS)) - partcost = 0; -#endif if (count_i < 1e5) cost = 5.f * (wscale * count_i) * count_i; else @@ -1970,10 +1959,6 @@ void scheduler_reweight(struct scheduler *s, int verbose) { cost = 0; break; } - -#if defined(WITH_MPI) && (defined(HAVE_PARMETIS) || defined(HAVE_METIS)) - if (partcost) t->cost = cost; -#endif t->weight += cost; } diff --git a/src/task.c b/src/task.c index b78ae5c465416c68021c2e49e8b25454615d9050..62e00d976582766e8ff9b1b7ebb408ca11c373a4 100644 --- a/src/task.c +++ b/src/task.c @@ -579,7 +579,8 @@ void task_create_mpi_comms(void) { #endif /** - * @brief dump all the tasks of all the known 
engines into a file for postprocessing. + * @brief dump all the tasks of all the known engines into a file for + * postprocessing. * * Dumps the information to a file "thread_info-stepn.dat" where n is the * given step value, or "thread_info_MPI-stepn.dat", if we are running @@ -681,5 +682,129 @@ void task_dump_all(struct engine *e, int step) { } fclose(file_thread); #endif // WITH_MPI -#endif // SWIFT_DEBUG_TASKS +#endif // SWIFT_DEBUG_TASKS +} + +/** + * @brief Generate simple statistics about the times used by the tasks of + * all the engines and write these into two files, a human readable + * version and one intended for inclusion as the fixed costs for + * repartitioning. + * + * Dumps the human readable information to a file "thread_stats-stepn.dat" + * where n is the given step value. When running under MPI all the tasks are + * summed into this single file. + * + * The fixed costs file will be called "thread_stats-stepn.h". + * + * @param e the #engine + * @param step the current step. + */ +void task_dump_stats(struct engine *e, int step) { + +#ifdef SWIFT_DEBUG_TASKS + char dumpfile[40]; + snprintf(dumpfile, 40, "thread_stats-step%d.dat", step); + + char costsfile[40]; + snprintf(costsfile, 40, "thread_stats-step%d.h", step); + + /* Need arrays for sum, min and max across all types and subtypes. */ + double sum[task_type_count][task_subtype_count]; + double min[task_type_count][task_subtype_count]; + double max[task_type_count][task_subtype_count]; + int count[task_type_count][task_subtype_count]; + + for (int j = 0; j < task_type_count; j++) { + for (int k = 0; k < task_subtype_count; k++) { + sum[j][k] = 0.0; + count[j][k] = 0; + min[j][k] = DBL_MAX; + max[j][k] = 0.0; + } + } + + double total[1] = {0.0}; + double minmin[1] = {DBL_MAX}; + for (int l = 0; l < e->sched.nr_tasks; l++) { + int type = e->sched.tasks[l].type; + + /* Skip implicit tasks, tasks that didn't run and MPI send/recv as these + * are not interesting (or meaningfully measured). 
*/ + if (!e->sched.tasks[l].implicit && e->sched.tasks[l].toc != 0 && + type != task_type_send && type != task_type_recv) { + int subtype = e->sched.tasks[l].subtype; + + double dt = e->sched.tasks[l].toc - e->sched.tasks[l].tic; + sum[type][subtype] += dt; + count[type][subtype] += 1; + if (dt < min[type][subtype]) { + min[type][subtype] = dt; + } + if (dt > max[type][subtype]) { + max[type][subtype] = dt; + } + total[0] += dt; + if (dt < minmin[0]) minmin[0] = dt; + } + } + +#ifdef WITH_MPI + /* Get these from all ranks for output from rank 0. Could wrap these into a + * single operation. */ + size_t size = task_type_count * task_subtype_count; + int res = MPI_Reduce((engine_rank == 0 ? MPI_IN_PLACE : sum), sum, size, + MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + if (res != MPI_SUCCESS) mpi_error(res, "Failed to reduce task sums"); + + res = MPI_Reduce((engine_rank == 0 ? MPI_IN_PLACE : count), count, size, + MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); + if (res != MPI_SUCCESS) mpi_error(res, "Failed to reduce task counts"); + + res = MPI_Reduce((engine_rank == 0 ? MPI_IN_PLACE : min), min, size, + MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); + if (res != MPI_SUCCESS) mpi_error(res, "Failed to reduce task minima"); + + res = MPI_Reduce((engine_rank == 0 ? MPI_IN_PLACE : max), max, size, + MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + if (res != MPI_SUCCESS) mpi_error(res, "Failed to reduce task maxima"); + + res = MPI_Reduce((engine_rank == 0 ? MPI_IN_PLACE : total), total, 1, + MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + if (res != MPI_SUCCESS) mpi_error(res, "Failed to reduce task total time"); + + res = MPI_Reduce((engine_rank == 0 ? 
MPI_IN_PLACE : minmin), minmin, 1, + MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); + if (res != MPI_SUCCESS) mpi_error(res, "Failed to reduce task global min"); + + if (engine_rank == 0) { +#endif + + FILE *dfile = fopen(dumpfile, "w"); + FILE *cfile = fopen(costsfile, "w"); + + fprintf(dfile, "# task ntasks min max sum mean percent fixed_cost\n"); + for (int j = 0; j < task_type_count; j++) { + const char *taskID = taskID_names[j]; + for (int k = 0; k < task_subtype_count; k++) { + if (sum[j][k] > 0.0) { + double mean = sum[j][k] / (double)count[j][k]; + double perc = 100.0 * sum[j][k] / total[0]; + int fixed_cost = (int)(mean / minmin[0]); + fprintf(dfile, + "%15s/%-10s %10d %14.2f %14.2f %14.2f %14.2f %14.2f %10d\n", + taskID, subtaskID_names[k], count[j][k], min[j][k], max[j][k], + sum[j][k], mean, perc, fixed_cost); + fprintf(cfile, "repartition_costs[%d][%d] = %10d; /* %s/%s */\n", j, + k, fixed_cost, taskID, subtaskID_names[k]); + } + } + } + fclose(dfile); + fclose(cfile); +#ifdef WITH_MPI + } +#endif + +#endif // SWIFT_DEBUG_TASKS } diff --git a/src/task.h b/src/task.h index eda3c82fd316223459f696a066d6b37603202e93..8203c2484345017fc34dd58a78882fc069fb8e4a 100644 --- a/src/task.h +++ b/src/task.h @@ -156,11 +156,6 @@ struct task { /*! Weight of the task */ float weight; -#if defined(WITH_MPI) && (defined(HAVE_METIS) || defined(HAVE_PARMETIS)) - /*! Individual cost estimate for this task. */ - float cost; -#endif - /*! Number of tasks unlocked by this one */ short int nr_unlock_tasks; @@ -204,6 +199,7 @@ int task_lock(struct task *t); void task_do_rewait(struct task *t); void task_print(const struct task *t); void task_dump_all(struct engine *e, int step); +void task_dump_stats(struct engine *e, int step); #ifdef WITH_MPI void task_create_mpi_comms(void); #endif