diff --git a/doc/RTD/source/ParameterFiles/parameter_description.rst b/doc/RTD/source/ParameterFiles/parameter_description.rst index ebfd3f3faab167006410ed2fe18ef43aa4f7f8c6..ab939cfcb297c439784eff1b0cb05813b1167358 100644 --- a/doc/RTD/source/ParameterFiles/parameter_description.rst +++ b/doc/RTD/source/ParameterFiles/parameter_description.rst @@ -862,9 +862,9 @@ On Lustre filesystems [#f4]_ it is important to properly stripe files to achieve a good writing speed. If the parameter ``lustre_OST_count`` is set to the number of OSTs present on the system, then SWIFT will set the `stripe count` of each distributed file to `1` and set each file's `stripe index` to the MPI rank -generating it modulo the OST count. If the parameter is not set then the files -will be created with the default system policy (or whatever was set for the -directory where the files are written). This parameter has no effect on +generating it modulo the OST count [#f5]_. If the parameter is not set then the +files will be created with the default system policy (or whatever was set for +the directory where the files are written). This parameter has no effect on non-Lustre file systems and no effect if distributed snapshots are not used. * The number of Lustre OSTs to distribute the single-striped distributed @@ -1394,9 +1394,9 @@ On Lustre filesystems [#f4]_ it is important to properly stripe files to achieve a good writing and reading speed. If the parameter ``lustre_OST_count`` is set to the number of OSTs present on the system, then SWIFT will set the `stripe count` of each restart file to `1` and set each file's `stripe index` to the MPI -rank generating it modulo the OST count. If the parameter is not set then the -files will be created with the default system policy (or whatever was set for -the directory where the files are written). This parameter has no effect on +rank generating it modulo the OST count [#f5]_. 
If the parameter is not set then +the files will be created with the default system policy (or whatever was set +for the directory where the files are written). This parameter has no effect on non-Lustre file systems. * The number of Lustre OSTs to distribute the single-striped restart files over: @@ -1914,3 +1914,8 @@ A complete specification of the model looks like matter, 3 --> sinks, 4 --> stars, 5 --> black holes, 6 --> neutrinos. .. [#f4] https://wiki.lustre.org/Main_Page + +.. [#f5] We add a per-output random integer to the OST value so that we don't + generate a bias towards low-numbered OSTs. This averages the load over all OSTs + over the course of a run even if the number of OSTs does not divide the + number of files or vice versa. diff --git a/src/restart.c b/src/restart.c index 9b166a5670f0048761226eace6d947dda5a495de..ed9d346457c60bbec81e85d66e800018d16b0e7f 100644 --- a/src/restart.c +++ b/src/restart.c @@ -135,9 +135,17 @@ void restart_write(struct engine *e, const char *filename) { /* Use a single Lustre stripe with a rank-based OST offset? */ if (e->restart_lustre_OST_count != 0) { + + /* Use a random offset to avoid placing things in the same OSTs. We do + * this to keep the use of OSTs balanced, much like using -1 for the + * stripe index. */ + int offset = rand() % e->restart_lustre_OST_count; +#ifdef WITH_MPI + MPI_Bcast(&offset, 1, MPI_INT, 0, MPI_COMM_WORLD); +#endif char string[1200]; sprintf(string, "lfs setstripe -c 1 -i %d %s", - (e->nodeID % e->restart_lustre_OST_count), filename); + ((e->nodeID + offset) % e->restart_lustre_OST_count), filename); const int result = system(string); if (result != 0) { message("lfs setstripe command returned error code %d", result);