/******************************************************************************* * This file is part of SWIFT. * Copyright (c) 2025 Peter W. Draper (p.w.draper@durham.ac.uk) * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program. If not, see . * ******************************************************************************/ /* Config parameters. */ #include /* Standard includes. */ #include #include #include #include #include #include #include /* Local includes. */ #include "error.h" #include "swift_lustre_api.h" /* Lustre API */ #ifdef HAVE_LUSTREAPI #include #include #endif /* Number of OSTs to pre-allocate space for. */ #define PREALLOC (100) /* Bytes in a TiB */ #define TiB (1024.0 * 1024.0 * 1024.0) /* Bytes in a MiB */ #define MiB (1024.0 * 1024.0) /** * @brief Allocate storage for a number of OSTs in a OST scan storage struct. * * Note does not reset count or fullcount. Zero these if you want an * empty struct. * * @param ost_infos pointer to the storage structure. * @param size number of OSTs to make space for. */ void swift_ost_store_alloc(struct swift_ost_store *ost_infos, int size) { #ifdef HAVE_LUSTREAPI ost_infos->size = size; ost_infos->infos = (struct swift_ost_info *)malloc(sizeof(struct swift_ost_info) * size); if (ost_infos->infos == NULL) error("Failed to allocate space for an OST scan"); memset(ost_infos->infos, 0, sizeof(struct swift_ost_info) * size); #endif } /** * @brief Create a copy of an OST scan storage struct. * * @param ost_infos_src pointer to the storage structure to copy * @param ost_infos_dst pointer to a storage structure to populate with * the copy. Assumed to have no OST space so not * used or initialized. */ void swift_ost_store_copy(struct swift_ost_store *ost_infos_src, struct swift_ost_store *ost_infos_dst) { #ifdef HAVE_LUSTREAPI ost_infos_dst->size = ost_infos_src->fullcount; /* Used size. */ ost_infos_dst->count = ost_infos_src->count; ost_infos_dst->fullcount = ost_infos_src->fullcount; ost_infos_dst->infos = (struct swift_ost_info *)malloc( sizeof(struct swift_ost_info) * ost_infos_dst->size); if (ost_infos_dst->infos == NULL) error("Failed to allocate space for an OST scan copy"); memcpy(ost_infos_dst->infos, ost_infos_src->infos, sizeof(struct swift_ost_info) * ost_infos_dst->size); #endif } /** * @brief Initialize an OST scan storage structure. * * @param ost_infos pointer to the storage structure. */ void swift_ost_store_init(struct swift_ost_store *ost_infos) { #ifdef HAVE_LUSTREAPI swift_ost_store_alloc(ost_infos, PREALLOC); ost_infos->count = 0; ost_infos->fullcount = 0; #endif } /** * @brief Release any storage associated with an OST scan storage structure. * * @param ost_infos pointer to the storage structure. */ void swift_ost_store_free(struct swift_ost_store *ost_infos) { #ifdef HAVE_LUSTREAPI free(ost_infos->infos); ost_infos->infos = NULL; ost_infos->count = 0; ost_infos->fullcount = 0; ost_infos->size = 0; #endif } /** * @brief Write about an OST storage structure to a given FILE. * * @param file FILE stream to write output to. * @param ost_infos pointer to the storage structure. */ void swift_ost_store_write(FILE *file, struct swift_ost_store *ost_infos) { #ifdef HAVE_LUSTREAPI fprintf(file, "# %5s %21s %21s %21s\n", "Index", "Size (MiB)", "Used (MiB)", "Free (MiB)"); size_t ssum = 0; size_t usum = 0; size_t smin = ost_infos->infos[0].size; size_t smax = 0; size_t umin = ost_infos->infos[0].used; size_t umax = 0; for (int i = 0; i < ost_infos->count; i++) { int msize = (int)(ost_infos->infos[i].size / MiB); int mused = (int)(ost_infos->infos[i].used / MiB); fprintf(file, "# %5d %21d %21d %21d\n", ost_infos->infos[i].index, msize, mused, msize - mused); ssum += ost_infos->infos[i].size; usum += ost_infos->infos[i].used; if (ost_infos->infos[i].size > smax) smax = ost_infos->infos[i].size; if (ost_infos->infos[i].size < smin) smin = ost_infos->infos[i].size; if (ost_infos->infos[i].used > umax) umax = ost_infos->infos[i].used; if (ost_infos->infos[i].used < umin) umin = ost_infos->infos[i].used; } if (ost_infos->count == ost_infos->fullcount) { /* Size is for used OSTs not all, so don't report as misleading. */ fprintf(file, "# Filesystem size:%.2f TiB used:%.2f TiB free:%.2f TiB %.2f%%\n", ssum / TiB, usum / TiB, (ssum - usum) / TiB, 100.0 * (double)(ssum - usum) / (double)ssum); fprintf(file, "# Min/max size: %.2f/%.2f TiB Min/max used: %.2f/%.2f TiB\n", smin / TiB, smax / TiB, umin / TiB, umax / TiB); } else { fprintf(file, "#\n"); } #endif } /** * @brief Print information about OST storage structure * * @param ost_infos pointer to the storage structure. * @param verbose if non zero additional information will be written * to stdout. */ void swift_ost_store_print(struct swift_ost_store *ost_infos, int verbose) { #ifdef HAVE_LUSTREAPI message("# OSTs, using %d of %d", ost_infos->count, ost_infos->fullcount); if (verbose) swift_ost_store_write(stdout, ost_infos); #endif } #ifdef HAVE_LUSTREAPI /** * @brief Store information about an OST. * * @param ost_infos pointer to the storage structure. * @param index the index, zero based. * @param size the total size in bytes. * @param used the number of bytes used. */ static void swift_ost_store(struct swift_ost_store *ost_infos, int index, size_t size, size_t used) { /* Add extra space if needed. Note not thread safe. */ if (ost_infos->fullcount == ost_infos->size - 1) { size_t newsize = ost_infos->size + PREALLOC; struct swift_ost_info *newinfos = (struct swift_ost_info *)malloc( sizeof(struct swift_ost_info) * newsize); if (newinfos == NULL) error("Failed to allocate space for OST information"); memset(newinfos, 0, sizeof(struct swift_ost_info) * newsize); memcpy(newinfos, ost_infos->infos, sizeof(struct swift_ost_info) * ost_infos->size); free(ost_infos->infos); ost_infos->infos = newinfos; ost_infos->size = newsize; } int count = ost_infos->count++; ost_infos->infos[count].index = index; ost_infos->infos[count].size = size; ost_infos->infos[count].used = used; ost_infos->fullcount = ost_infos->count; } #endif /** * @brief Scan the OSTs associated with a lustre file system given a path. * * On exit the ost_infos struct will be populated with the * the number of OSTs found and details of the size and used bytes in each * OST. * * @param path a directory on the lustre file system, ideally the mount point. * @param ost_infos pointer to the storage structure. * * @return 0 on success, otherwise an error will have been reported to stdout. * If an error occurs the store will never be changed. */ int swift_ost_scan(const char *path, struct swift_ost_store *ost_infos) { int rc = 0; #ifdef HAVE_LUSTREAPI char mntdir[PATH_MAX] = {0}; char fsname[PATH_MAX] = {0}; char cpath[PATH_MAX] = {0}; /* Check this path exists. */ if (!realpath(path, cpath)) { rc = errno; message("Not a filesystem path '%s': %s", path, strerror(rc)); } else { /* Parse the path into the mount point and file system name. */ if (llapi_search_mounts(cpath, 0, mntdir, fsname) == 0) { if (mntdir[0] != '\0') { struct obd_statfs stat_buf; struct obd_uuid uuid_buf; /* Loop while OSTs are located. */ for (int index = 0;; index++) { memset(&stat_buf, 0, sizeof(struct obd_statfs)); memset(&uuid_buf, 0, sizeof(struct obd_uuid)); rc = llapi_obd_statfs(mntdir, LL_STATFS_LOV, index, &stat_buf, &uuid_buf); rc = -rc; if (rc == ENODEV || rc == EAGAIN || rc == EINVAL || rc == EFAULT) { /* Nothing we can query here, so time to stop search. */ break; } /* Inactive devices are empty. */ if (rc == ENODATA) { swift_ost_store(ost_infos, index, 0, 0); } else { size_t used = (stat_buf.os_blocks - stat_buf.os_bfree) * stat_buf.os_bsize; size_t total = stat_buf.os_blocks * stat_buf.os_bsize; swift_ost_store(ost_infos, index, total, used); } } rc = 0; } else { message("No lustre mount point found for path: %s", path); rc = 1; } } else { message("Failed to locate a lustre mount point using path: %s", path); rc = 1; } } #endif return rc; } #ifdef HAVE_LUSTREAPI /** Comparison function for OST free space. */ static int ostcmp(const void *p1, const void *p2) { const struct swift_ost_info *i1 = (const struct swift_ost_info *)p1; const struct swift_ost_info *i2 = (const struct swift_ost_info *)p2; /* size_t ints so some care is needed to return an int. */ size_t f1 = i1->size - i1->used; size_t f2 = i2->size - i2->used; if (f1 < f2) return 1; if (f1 > f2) return -1; return 0; } #endif /** * @brief Sort the OSTs into decreasing free space culling those that do not * meet a free space threshold. * * @param ost_infos pointer to populated storage structure. * @param minfree the number of MiB that the OST should be capable of * storing. Zero for no effect. */ void swift_ost_cull(struct swift_ost_store *ost_infos, int minfree) { #ifdef HAVE_LUSTREAPI /* Sort by free space. */ qsort(ost_infos->infos, ost_infos->count, sizeof(struct swift_ost_info), ostcmp); /* And cull if needed. */ if (minfree > 0) { size_t bytesfree = minfree * (size_t)MiB; /* Always keep at least one! */ for (int i = 1; i < ost_infos->count; i++) { struct swift_ost_info *curr = &ost_infos->infos[i]; if ((curr->size - curr->used) < bytesfree) { /* Throw the rest away. Note fullcount now decoupled. */ ost_infos->count = i; } } } #endif } /** * @brief Get the next OST in an incrementing sequence. * * @param ost_infos pointer to populated storage structure. * @param arrayindex the last used array index, start with 0. * This will be wrapped as needed use as input for next * call. * @param count number of OSTs that will be used to stripe, that is the * increment, usually 1. Only makes sense if the OST list is not * culled as this implicitly assumes OSTs are in index order. * @return the selected OST index. */ int swift_ost_next(struct swift_ost_store *ost_infos, int *arrayindex, int count) { #ifdef HAVE_LUSTREAPI int index = (*arrayindex % ost_infos->count); *arrayindex = index + count; return ost_infos->infos[index].index; #else return 0; #endif } /** * @brief Remove an OST by index from the store. * * @param ost_infos pointer to populated storage structure. * @param index index of the OST to remove. */ void swift_ost_remove(struct swift_ost_store *ost_infos, int index) { #ifdef HAVE_LUSTREAPI /* Find the array index. */ int arrayindex = -1; for (int i = 0; i < ost_infos->fullcount; i++) { if (ost_infos->infos[i].index == index) { arrayindex = i; break; } } /* Do nothing if not found or we have the end array index. */ if ((arrayindex != -1) && arrayindex != (ost_infos->fullcount - 1)) { /* Copy remaining infos down one place. Overlapping.. */ memmove(&ost_infos->infos[arrayindex], &ost_infos->infos[arrayindex + 1], (ost_infos->fullcount - arrayindex - 1) * sizeof(struct swift_ost_info)); if (arrayindex < ost_infos->count) ost_infos->count = ost_infos->count - 1; ost_infos->fullcount = ost_infos->fullcount - 1; } else if (arrayindex == ost_infos->fullcount - 1) { /* End array index, just adjust counts. */ if (arrayindex < ost_infos->count) ost_infos->count = ost_infos->count - 1; ost_infos->fullcount = ost_infos->fullcount - 1; } #endif } /** * @brief Create a file with a given OST index and number of OSTs to stripe. * * @param filename name of the file to create. * @param offset index of the first OST used with this file. * @param count number of OSTs to stripe this file over. * @param usedoffset the offset actually used by file. * * @return non-zero if there are problems creating the file. */ int swift_create_striped_file(const char *filename, int offset, int count, int *usedoffset) { int rc = 0; #ifdef HAVE_LUSTREAPI *usedoffset = offset; rc = llapi_file_create(filename, 0 /* Default block size */, offset, count, LLAPI_LAYOUT_RAID0 /* Pattern default */); if (rc != 0) { rc = -rc; message("Cannot create file %s : %s", filename, strerror(rc)); } else { /* Recover the file offset of first OST in case it is changed from * operational reasons. */ /* Yuk, needs extra space for array os lov_user_ost_data. */ size_t sizelum = sizeof(struct lov_user_md) + LOV_MAX_STRIPE_COUNT * sizeof(struct lov_user_ost_data); struct lov_user_md *lum = (struct lov_user_md *)malloc(sizelum); rc = llapi_file_get_stripe(filename, lum); rc = -rc; if (rc == 0) { *usedoffset = lum->lmm_objects[0].l_ost_idx; } else { /* Shouldn't be fatal. */ *usedoffset = offset; } free(lum); } #endif return rc; } /** * @brief Scan for the available OSTs for a given file path. * * The OSTs will be sorted by free space on exit and may be further selected * to remove OSTs that are too full for use or cannot be written to. If too * many OSTs are rejected it will be considered to be a parameter error and * all OSTs, sorted by free space, will be returned. * We don't want to flood the OSTs with RPC calls so only one MPI rank * should make this call. * * @param ost_infos pointer to empty OST storage struct. Will contain the * selected free-space ordered OSTs found on exit. The OST * count will remain at zero if anything fails. Note this * will need to be freed as usual regardless. * @param filepath path to a file on the lustre file system. Must not exist. * The containing directory must exist and be part of the * lustre file system. * @param minfree minimum free space to allow in MiB. -1 for use a guess based * on size of the current process, 0 to disable selection. * @param writetest whether to check if the OSTs are writable. If used * the path must be that of a non-existent file on the * file system that is writable by the process. * @param verbose if true information about the OSTs and the selections made * will be output. * */ void swift_ost_select(struct swift_ost_store *ost_infos, const char *filepath, int minfree, int writetest, int verbose) { /* Initialise the struct. */ swift_ost_store_init(ost_infos); /* Get directory of filepath. */ char *filepathc = strdup(filepath); char *dirp = dirname(filepathc); /* Scan for all OSTs. */ int rc = swift_ost_scan(dirp, ost_infos); free(dirp); /* If does not succeed we do nothing, probably not a lustre mount. */ if (rc == 0) { if (verbose) swift_ost_store_print(ost_infos, 1); /* Make a copy so we can undo any changes. */ struct swift_ost_store ost_infos_full; swift_ost_store_copy(ost_infos, &ost_infos_full); /* Cull these so we do not use OSTs with too little free space. Also sorts * into most free space order. If given a value use that, otherwise we use * the resident set size of the process, dumps and restarts are always * smaller than that. */ if (minfree != 0) { if (minfree < 0) { /* No guarantee this will work, hopefully will return 0 in those cases * and we do nothing. */ long size, resident, shared, text, library, data, dirty; memuse_use(&size, &resident, &shared, &text, &data, &library, &dirty); /* KiB into MiB. */ minfree = (int)(resident / 1024.0); } /* And cull and sort. */ swift_ost_cull(ost_infos, minfree); if (verbose) message("Rejected %d OSTs using free space threshold %d (MiB)", ost_infos->fullcount - ost_infos->count, minfree); } if (writetest != 0) { /* Test writing to all OSTs and remove any that are not writable. We do * this by creating our file on every OST and checking it was created on * it. */ int usedindex = 0; int removed = 0; for (int i = ost_infos->count - 1; i >= 0; i--) { usedindex = ost_infos->infos[i].index; rc = swift_create_striped_file(filepath, ost_infos->infos[i].index, 1, &usedindex); if (rc != 0) { /* Failed so not likely to succeed next time. Probably file * exists, there is nothing we should do about that, the existing * stripe will be reused, along with the space of the existing file. */ message("Failed testing file creation on OSTs, aborting test"); break; } if (usedindex != ost_infos->infos[i].index) { /* Differing OST indices, so not what we asked for, bye. */ swift_ost_remove(ost_infos, ost_infos->infos[i].index); removed++; } unlink(filepath); } if (verbose) message("Rejected %d OSTs as readonly", removed); } /* Safety first. If we have too few OSTs left after the above we will * make the choice to do nothing. */ if ((ost_infos->fullcount * 0.25 > ost_infos->count) || ost_infos->count < 2) { message("Too many OSTs have been rejected (%d of %d).", ost_infos->fullcount - ost_infos->count, ost_infos->fullcount); message("Assuming OST rejection is flawed and skipping."); swift_ost_store_copy(&ost_infos_full, ost_infos); /* Still good to use a sorted list. */ swift_ost_cull(ost_infos, 0); } swift_ost_store_free(&ost_infos_full); if (verbose) swift_ost_store_print(ost_infos, 1); } else { /* If the scan failed we do nothing, this is probably not a lustre mount. */ message("Lustre OST scan failed, is this a lustre mount?"); } }