/******************************************************************************* * This file is part of SWIFT. * Copyright (c) 2018 Peter W. Draper (p.w.draper@durham.ac.uk) * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . * ******************************************************************************/ /** * @file restart.c * @brief support for SWIFT restarts */ /* Config parameters. */ #include /* Standard headers. */ #include "engine.h" #include "error.h" #include "restart.h" #include "version.h" #include #include #include #include #include #include #include /* The signature for restart files. */ #define SWIFT_RESTART_SIGNATURE "SWIFT-restart-file" #define SWIFT_RESTART_END_SIGNATURE "SWIFT-restart-file:end" #define FNAMELEN 200 #define LABLEN 20 /* Structure for a dumped header. */ struct header { size_t len; /* Total length of data in bytes. */ char label[LABLEN + 1]; /* A label for data */ }; /** * @brief generate a name for a restart file. * * @param dir the directory of restart files. * @param basename the basename of the restart files. * @param nodeID a unique integer, usually the nodeID of the engine. * @param name pointer to a string to hold the result. * @param size length of name. * * @result 0 if the string was large enough. 
*/ int restart_genname(const char *dir, const char *basename, int nodeID, char *name, int size) { int n = snprintf(name, size, "%s/%s_%06d.rst", dir, basename, nodeID); return (n >= size); } /** * @brief locate all the restart files in the given directory with the given * basename. * @param dir the directory of restart files. * @param basename the basename of the restart files. * @param nfiles the number of restart files located. * * @result pointer to an array of strings with all the filenames found, * these should be collated using the current locale, i.e. sorted * alphabetically (so make sure the filenames are zero padded to get * numeric ordering). Release by calling restart_locate_free(). */ char **restart_locate(const char *dir, const char *basename, int *nfiles) { *nfiles = 0; /* Construct the glob pattern for locating files. */ char pattern[FNAMELEN]; if (snprintf(pattern, FNAMELEN, "%s/%s_[0-9]*.rst", dir, basename) < FNAMELEN) { glob_t globbuf; char **files = NULL; if (glob(pattern, 0, NULL, &globbuf) == 0) { *nfiles = globbuf.gl_pathc; files = (char **)malloc(sizeof(char *) * *nfiles); for (int i = 0; i < *nfiles; i++) { files[i] = strdup(globbuf.gl_pathv[i]); } } globfree(&globbuf); return files; } error("Failed to construct pattern to locate restart files"); return NULL; } /** * @brief Release the memory allocated to hold the restart file names. * * @param nfiles the number of restart files located. * @param files the list of filenames found in call to restart_locate(). */ void restart_locate_free(int nfiles, char **files) { for (int i = 0; i < nfiles; i++) { free(files[i]); } free(files); } /** * @brief Write a restart file for the state of the given engine struct. * * @param e the engine with our state information. * @param filename name of the file to write the restart data to. */ void restart_write(struct engine *e, const char *filename) { ticks tic = getticks(); /* Save a backup the existing restart file, if requested. 
*/ if (e->restart_save) restart_save_previous(filename); /* Use a single Lustre stripe with a rank-based OST offset? */ if (e->restart_lustre_OST_count != 0) { /* Use a random offset to avoid placing things in the same OSTs. We do * this to keep the use of OSTs balanced, much like using -1 for the * stripe. */ int offset = rand() % e->restart_lustre_OST_count; #ifdef WITH_MPI MPI_Bcast(&offset, 1, MPI_INT, 0, MPI_COMM_WORLD); #endif char string[1200]; sprintf(string, "lfs setstripe -c 1 -i %d %s", ((e->nodeID + offset) % e->restart_lustre_OST_count), filename); const int result = system(string); if (result != 0) { message("lfs setstripe command returned error code %d", result); } } FILE *stream = fopen(filename, "w"); if (stream == NULL) error("Failed to open restart file: %s (%s)", filename, strerror(errno)); /* Dump our signature and version. */ restart_write_blocks((void *)SWIFT_RESTART_SIGNATURE, strlen(SWIFT_RESTART_SIGNATURE), 1, stream, "signature", "SWIFT signature"); restart_write_blocks((void *)package_version(), strlen(package_version()), 1, stream, "version", "SWIFT version"); engine_struct_dump(e, stream); /* Just an END statement to spot truncated files. */ restart_write_blocks((void *)SWIFT_RESTART_END_SIGNATURE, strlen(SWIFT_RESTART_END_SIGNATURE), 1, stream, "endsignature", "SWIFT end signature"); fclose(stream); if (e->verbose) message("took %.3f %s.", clocks_from_ticks(getticks() - tic), clocks_getunit()); } /** * @brief Read a restart file to construct a saved engine struct state. * * @param e the engine to recover from the saved state. * @param filename name of the file containing the staved state. */ void restart_read(struct engine *e, const char *filename) { const ticks tic = getticks(); FILE *stream = fopen(filename, "r"); if (stream == NULL) error("Failed to open restart file: %s (%s)", filename, strerror(errno)); /* Get our version and signature back. These should match. 
* Use static int here to avoid compiler warnings about gnu-extensions * of folding a variable length array to constant array. */ const int sig_len = strlen(SWIFT_RESTART_SIGNATURE); char signature[sig_len + 1]; restart_read_blocks(signature, sig_len, 1, stream, NULL, "SWIFT signature"); signature[sig_len] = '\0'; if (strncmp(signature, SWIFT_RESTART_SIGNATURE, sig_len) != 0) error( "Do not recognise this as a SWIFT restart file, found '%s' " "expected '%s'", signature, SWIFT_RESTART_SIGNATURE); char version[FNAMELEN]; int len = strlen(package_version()); restart_read_blocks(version, len, 1, stream, NULL, "SWIFT version"); version[len] = '\0'; /* It might work! */ if (strncmp(version, package_version(), len) != 0) message( "WARNING: restoring from a different version of SWIFT.\n You have:" " '%s' and the restarts files are from: '%s'. This may fail" " badly.", package_version(), version); engine_struct_restore(e, stream); fclose(stream); if (e->verbose) message("took %.3f %s.", clocks_from_ticks(getticks() - tic), clocks_getunit()); } /** * @brief Read blocks of memory from a file stream into a memory location. * Exits the application if the read fails and does nothing if the * size is zero. * * @param ptr pointer to the memory * @param size size of a block * @param nblocks number of blocks to read * @param stream the file stream * @param label the label recovered for the block, needs to be at least 20 * characters, set to NULL if not required * @param errstr a context string to qualify any errors. */ void restart_read_blocks(void *ptr, size_t size, size_t nblocks, FILE *stream, char *label, const char *errstr) { if (size > 0) { struct header head; size_t nread = fread(&head, sizeof(struct header), 1, stream); if (nread != 1) error("Failed to read the %s header from restart file (%s)", errstr, strerror(errno)); /* Check that the stored length is the same as the expected one. 
*/ if (head.len != nblocks * size) error("Mismatched data length in restart file for %s (%zu != %zu)", errstr, head.len, nblocks * size); /* Return label, if required. */ if (label != NULL) { head.label[LABLEN] = '\0'; strncpy(label, head.label, LABLEN + 1); } nread = fread(ptr, size, nblocks, stream); if (nread != nblocks) error("Failed to restore %s from restart file (%s)", errstr, ferror(stream) ? strerror(errno) : "unexpected end of file"); } } /** * @brief Write blocks of memory to a file stream from a memory location. * Exits the application if the write fails and does nothing * if the size is zero. * * @param ptr pointer to the memory * @param size the blocks * @param nblocks number of blocks to write * @param stream the file stream * @param label a label for the content, can only be 20 characters. * @param errstr a context string to qualify any errors. */ void restart_write_blocks(void *ptr, size_t size, size_t nblocks, FILE *stream, const char *label, const char *errstr) { if (size > 0) { /* Add a preamble header. */ struct header head; head.len = nblocks * size; strncpy(head.label, label, LABLEN); head.label[LABLEN] = '\0'; /* Now dump it and the data. */ size_t nwrite = fwrite(&head, sizeof(struct header), 1, stream); if (nwrite != 1) error("Failed to save %s header to restart file (%s)", errstr, strerror(errno)); nwrite = fwrite(ptr, size, nblocks, stream); if (nwrite != nblocks) error("Failed to save %s to restart file (%s)", errstr, strerror(errno)); } } /** * @brief check if the stop file exists in the given directory and optionally * remove it if found. * * @param dir the directory of restart files. * @param cleanup remove the file if found. Should only do this from one rank * once all ranks have tested this file. * * @result 1 if the file was found. 
*/
int restart_stop_now(const char *dir, int cleanup) {
  struct stat buf;

  /* Build "<dir>/stop" with a bounded write, the directory name can be
   * arbitrarily long. */
  char filename[FNAMELEN];
  const int n = snprintf(filename, sizeof(filename), "%s/stop", dir);
  if (n < 0 || n >= (int)sizeof(filename)) {
    /* We cannot name the file, so we cannot see it either. */
    message("Restart directory name is too long to check for a stop file");
    return 0;
  }

  if (stat(filename, &buf) != 0) return 0;

  if (cleanup && unlink(filename) != 0) {
    /* May not be fatal, so press on. */
    message("Failed to delete restart stop file (%s)", strerror(errno));
  }
  return 1;
}

/**
 * @brief check if a file with the given name exists and rename to
 *        {filename}.prev. Used to move old restart files before overwriting.
 *
 * Does nothing if the file does not exist, or if the new name does not fit
 * in the internal name buffer (a message is issued in that case).
 *
 * @param filename the name of the file to check.
 */
void restart_save_previous(const char *filename) {
  struct stat buf;
  if (stat(filename, &buf) != 0) return;

  char newname[FNAMELEN];
  const int n = snprintf(newname, sizeof(newname), "%s.prev", filename);
  if (n < 0 || n >= (int)sizeof(newname)) {
    message("Name of file '%s' is too long to create a .prev copy", filename);
    return;
  }

  if (rename(filename, newname) != 0) {
    /* Worth a complaint, this should not happen. */
    message("Failed to rename file '%s' to '%s' (%s)", filename, newname,
            strerror(errno));
  }
}

/**
 * @brief check if a saved file with the given prefix name exists and remove
 *        it. Used to remove old restart files before a save sequence
 *        so that old saved files are not mixed up with new ones.
 *
 * Does nothing if a saved file does not exist, or if the saved name does not
 * fit in the internal name buffer (a message is issued in that case).
 *
 * @param filename the prefix used when the saved file was created.
 */
void restart_remove_previous(const char *filename) {
  struct stat buf;

  char newname[FNAMELEN];
  const int n = snprintf(newname, sizeof(newname), "%s.prev", filename);
  if (n < 0 || n >= (int)sizeof(newname)) {
    message("Name of file '%s' is too long to locate a .prev copy", filename);
    return;
  }

  if (stat(newname, &buf) != 0) return;

  if (unlink(newname) != 0) {
    /* Worth a complaint, this should not happen. */
    message("Failed to unlink file '%s' (%s)", newname, strerror(errno));
  }
}

/**
 * @brief Run a given command, usually to resubmit a job.
 *
 * No check is done on the command being run, only a non-zero status from
 * system() is reported.
 *
 * @param command The command to run in the system's shell.
 */
void restart_resubmit(const char *command) {

  /* Let's trust the user's command... */
  const int result = system(command);
  if (result != 0) {
    message("Restart resubmit command returned error code %d", result);
  }
}