/*******************************************************************************
 * This file is part of SWIFT.
 * Copyright (c) 2018 Peter W. Draper (p.w.draper@durham.ac.uk)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 ******************************************************************************/

/**
 *  @file restart.c
 *  @brief support for SWIFT restarts
 */

/* Config parameters. */
#include "../config.h"

/* Local headers. */
#include "engine.h"
#include "error.h"
#include "restart.h"
#include "version.h"

/* Standard headers. */
#include <errno.h>
#include <glob.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <unistd.h>

/* The signatures written at the start and at the end of a restart file. */
#define SWIFT_RESTART_SIGNATURE "SWIFT-restart-file"
#define SWIFT_RESTART_END_SIGNATURE "SWIFT-restart-file:end"

/* Size of the buffers used to build file names in this file. */
#define FNAMELEN 200

/* Maximum number of characters in a block label (terminator excluded). */
#define LABLEN 20

/* Structure for a dumped header, one precedes each block of data. */
struct header {
  size_t len;             /* Total length of data in bytes. */
  char label[LABLEN + 1]; /* A label for data, always NUL terminated. */
};

/**
 * @brief generate a name for a restart file.
 *
 * The name has the form "<dir>/<basename>_<nodeID>.rst", with the nodeID
 * zero padded to at least six digits.
 *
 * @param dir the directory of restart files.
 * @param basename the basename of the restart files.
 * @param nodeID a unique integer, usually the nodeID of the engine.
 * @param name pointer to a string to hold the result.
 * @param size length of name.
 *
 * @result 0 if the string was large enough, 1 if the name was truncated or
 *         could not be formatted.
 */
int restart_genname(const char *dir, const char *basename, int nodeID,
                    char *name, int size) {

  /* A non-positive size cannot hold anything, and a negative one would be
   * converted to a huge unsigned length by snprintf(). */
  if (size <= 0) return 1;

  const int n =
      snprintf(name, (size_t)size, "%s/%s_%06d.rst", dir, basename, nodeID);

  /* A negative result is an output error, not a success. */
  return (n < 0 || n >= size);
}

/**
 * @brief locate all the restart files in the given directory with the given
 *        basename.
 * @param dir the directory of restart files.
 * @param basename the basename of the restart files.
 * @param nfiles the number of restart files located.
 *
 * @result pointer to an array of strings with all the filenames found,
80 81 82
 *         these should be collated using the current locale, i.e. sorted
 *         alphabetically (so make sure the filenames are zero padded to get
 *         numeric ordering). Release by calling restart_locate_free().
83
 */
Peter W. Draper's avatar
Peter W. Draper committed
84
char **restart_locate(const char *dir, const char *basename, int *nfiles) {
85 86 87
  *nfiles = 0;

  /* Construct the glob pattern for locating files. */
88
  char pattern[FNAMELEN];
Peter W. Draper's avatar
Peter W. Draper committed
89 90
  if (snprintf(pattern, FNAMELEN, "%s/%s_[0-9]*.rst", dir, basename) <
      FNAMELEN) {
91 92 93 94 95

    glob_t globbuf;
    char **files = NULL;
    if (glob(pattern, 0, NULL, &globbuf) == 0) {
      *nfiles = globbuf.gl_pathc;
96
      files = (char **)malloc(sizeof(char *) * *nfiles);
97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
      for (int i = 0; i < *nfiles; i++) {
        files[i] = strdup(globbuf.gl_pathv[i]);
      }
    }

    globfree(&globbuf);
    return files;
  }
  error("Failed to construct pattern to locate restart files");

  return NULL;
}

/**
 * @brief Release the memory allocated to hold the restart file names.
 *
 * Does nothing if the list is NULL.
 *
 * @param nfiles the number of restart files located.
 * @param files the list of filenames found in call to restart_locate().
 */
void restart_locate_free(int nfiles, char **files) {

  /* restart_locate() returns NULL when nothing was found. */
  if (files == NULL) return;

  for (int i = 0; i < nfiles; i++) {
    free(files[i]);
  }
  free(files);
}

/**
124 125 126 127
 * @brief Write a restart file for the state of the given engine struct.
 *
 * @param e the engine with our state information.
 * @param filename name of the file to write the restart data to.
128 129 130
 */
void restart_write(struct engine *e, const char *filename) {

131 132
  ticks tic = getticks();

133 134
  /* Save a backup the existing restart file, if requested. */
  if (e->restart_save) restart_save_previous(filename);
135

136 137 138 139
  FILE *stream = fopen(filename, "w");
  if (stream == NULL)
    error("Failed to open restart file: %s (%s)", filename, strerror(errno));

140
  /* Dump our signature and version. */
Matthieu Schaller's avatar
Matthieu Schaller committed
141 142 143
  restart_write_blocks((void *)SWIFT_RESTART_SIGNATURE,
                       strlen(SWIFT_RESTART_SIGNATURE), 1, stream, "signature",
                       "SWIFT signature");
144
  restart_write_blocks((void *)package_version(), strlen(package_version()), 1,
145
                       stream, "version", "SWIFT version");
146

147
  engine_struct_dump(e, stream);
148 149

  /* Just an END statement to spot truncated files. */
150
  restart_write_blocks((void *)SWIFT_RESTART_END_SIGNATURE,
Matthieu Schaller's avatar
Matthieu Schaller committed
151 152
                       strlen(SWIFT_RESTART_END_SIGNATURE), 1, stream,
                       "endsignature", "SWIFT end signature");
153

154
  fclose(stream);
155 156 157 158

  if (e->verbose)
    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
            clocks_getunit());
159 160 161
}

/**
162 163 164 165
 * @brief Read a restart file to construct a saved engine struct state.
 *
 * @param e the engine to recover from the saved state.
 * @param filename name of the file containing the staved state.
166 167 168
 */
void restart_read(struct engine *e, const char *filename) {

169 170
  const ticks tic = getticks();

171 172 173 174
  FILE *stream = fopen(filename, "r");
  if (stream == NULL)
    error("Failed to open restart file: %s (%s)", filename, strerror(errno));

175 176 177
  /* Get our version and signature back. These should match. */
  char signature[strlen(SWIFT_RESTART_SIGNATURE) + 1];
  int len = strlen(SWIFT_RESTART_SIGNATURE);
178
  restart_read_blocks(signature, len, 1, stream, NULL, "SWIFT signature");
179 180
  signature[len] = '\0';
  if (strncmp(signature, SWIFT_RESTART_SIGNATURE, len) != 0)
181
    error(
182 183
        "Do not recognise this as a SWIFT restart file, found '%s' "
        "expected '%s'",
184
        signature, SWIFT_RESTART_SIGNATURE);
185

186
  char version[FNAMELEN];
187
  len = strlen(package_version());
188
  restart_read_blocks(version, len, 1, stream, NULL, "SWIFT version");
189 190
  version[len] = '\0';

191
  /* It might work! */
192 193 194
  if (strncmp(version, package_version(), len) != 0)
    message(
        "WARNING: restoring from a different version of SWIFT.\n You have:"
195
        " '%s' and the restarts files are from: '%s'. This may fail"
196 197
        " badly.",
        package_version(), version);
198

199 200
  engine_struct_restore(e, stream);
  fclose(stream);
201 202 203 204

  if (e->verbose)
    message("took %.3f %s.", clocks_from_ticks(getticks() - tic),
            clocks_getunit());
205 206
}

207 208
/**
 * @brief Read blocks of memory from a file stream into a memory location.
209 210
 *        Exits the application if the read fails and does nothing if the
 *        size is zero.
211 212
 *
 * @param ptr pointer to the memory
213 214
 * @param size size of a block
 * @param nblocks number of blocks to read
215
 * @param stream the file stream
216 217
 * @param label the label recovered for the block, needs to be at least 20
 *              characters, set to NULL if not required
218 219
 * @param errstr a context string to qualify any errors.
 */
220
void restart_read_blocks(void *ptr, size_t size, size_t nblocks, FILE *stream,
221
                         char *label, const char *errstr) {
222
  if (size > 0) {
Peter W. Draper's avatar
Peter W. Draper committed
223 224 225 226 227 228 229 230 231 232 233 234
    struct header head;
    size_t nread = fread(&head, sizeof(struct header), 1, stream);
    if (nread != 1)
      error("Failed to read the %s header from restart file (%s)", errstr,
            strerror(errno));

    /* Check that the stored length is the same as the expected one. */
    if (head.len != nblocks * size)
      error("Mismatched data length in restart file for %s (%zu != %zu)",
            errstr, head.len, nblocks * size);

    /* Return label, if required. */
Loic Hausammann's avatar
Loic Hausammann committed
235 236
    if (label != NULL) {
      head.label[LABLEN] = '\0';
237
      strncpy(label, head.label, LABLEN + 1);
Loic Hausammann's avatar
Loic Hausammann committed
238
    }
Peter W. Draper's avatar
Peter W. Draper committed
239 240 241 242 243

    nread = fread(ptr, size, nblocks, stream);
    if (nread != nblocks)
      error("Failed to restore %s from restart file (%s)", errstr,
            ferror(stream) ? strerror(errno) : "unexpected end of file");
244
  }
245 246
}

247 248
/**
 * @brief Write blocks of memory to a file stream from a memory location.
249 250
 *        Exits the application if the write fails and does nothing
 *        if the size is zero.
251 252
 *
 * @param ptr pointer to the memory
253 254
 * @param size the blocks
 * @param nblocks number of blocks to write
255
 * @param stream the file stream
256
 * @param label a label for the content, can only be 20 characters.
257 258
 * @param errstr a context string to qualify any errors.
 */
259
void restart_write_blocks(void *ptr, size_t size, size_t nblocks, FILE *stream,
260
                          const char *label, const char *errstr) {
Peter W. Draper's avatar
Peter W. Draper committed
261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278
  if (size > 0) {

    /* Add a preamble header. */
    struct header head;
    head.len = nblocks * size;
    strncpy(head.label, label, LABLEN);
    head.label[LABLEN] = '\0';

    /* Now dump it and the data. */
    size_t nwrite = fwrite(&head, sizeof(struct header), 1, stream);
    if (nwrite != 1)
      error("Failed to save %s header to restart file (%s)", errstr,
            strerror(errno));

    nwrite = fwrite(ptr, size, nblocks, stream);
    if (nwrite != nblocks)
      error("Failed to save %s to restart file (%s)", errstr, strerror(errno));
  }
279
}
280 281 282 283 284 285 286 287 288 289 290 291

/**
 * @brief check if the stop file exists in the given directory and optionally
 *        remove it if found.
 *
 * @param dir the directory of restart files.
 * @param cleanup remove the file if found. Should only do this from one rank
 *                once all ranks have tested this file.
 *
 * @result 1 if the file was found.
 */
int restart_stop_now(const char *dir, int cleanup) {
292
  struct stat buf;
293 294 295 296
  char filename[FNAMELEN];
  strcpy(filename, dir);
  strcat(filename, "/stop");
  if (stat(filename, &buf) == 0) {
Peter W. Draper's avatar
Peter W. Draper committed
297 298 299 300 301
    if (cleanup && unlink(filename) != 0) {
      /* May not be fatal, so press on. */
      message("Failed to delete restart stop file (%s)", strerror(errno));
    }
    return 1;
302 303 304
  }
  return 0;
}
305 306 307

/**
 * @brief check if a file with the given name exists and rename to
308
 *        {filename}.prev. Used to move old restart files before overwriting.
309 310 311 312 313 314
 *
 *        Does nothing if the file does not exist.
 *
 * @param filename the name of the file to check.
 */
void restart_save_previous(const char *filename) {
315
  struct stat buf;
316 317 318 319 320 321 322 323 324 325 326
  if (stat(filename, &buf) == 0) {
    char newname[FNAMELEN];
    strcpy(newname, filename);
    strcat(newname, ".prev");
    if (rename(filename, newname) != 0) {
      /* Worth a complaint, this should not happen. */
      message("Failed to rename file '%s' to '%s' (%s)", filename, newname,
              strerror(errno));
    }
  }
}
327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348

/**
 * @brief check if a saved file with the given prefix name exists and remove
 *        it. Used to remove old restart files before a save sequence
 *        so that old saved files are not mixed up with new ones.
 *
 *        Does nothing if a saved file does not exist.
 *
 * @param filename the prefix used when the saved file was created.
 */
void restart_remove_previous(const char *filename) {
  struct stat buf;
  char newname[FNAMELEN];
  strcpy(newname, filename);
  strcat(newname, ".prev");
  if (stat(newname, &buf) == 0) {
    if (unlink(newname) != 0) {
      /* Worth a complaint, this should not happen. */
      message("Failed to unlink file '%s' (%s)", newname, strerror(errno));
    }
  }
}
349 350 351 352 353 354 355 356 357 358 359 360 361 362

/**
 * @brief Run a given command, usually to resubmit a job.
 *
 * No check is done on the command being run. A message is issued if the
 * command could not be started, was terminated by a signal or exited with
 * a non-zero code.
 *
 * @param command The command to run in the system's shell.
 */
void restart_resubmit(const char *command) {

  /* system(NULL) only queries for a shell, it does not run anything. */
  if (command == NULL) {
    message("No command given to resubmit");
    return;
  }

  /* Let's trust the user's command... */
  const int status = system(command);

  /* The result is a wait status, so decode it to report the real exit code
   * rather than the raw value. */
  if (status == -1) {
    message("Failed to run command '%s' (%s)", command, strerror(errno));
  } else if (WIFSIGNALED(status)) {
    message("Command was terminated by signal %d", WTERMSIG(status));
  } else if (WIFEXITED(status) && WEXITSTATUS(status) != 0) {
    message("Command returned error code %d", WEXITSTATUS(status));
  }
}