diff --git a/configure.ac b/configure.ac
index 9fa9a1de591d63794dde5db6a8dd733cfcaada09..a02dcc57c720f1a9a792b160485caee728a91b98 100644
--- a/configure.ac
+++ b/configure.ac
@@ -351,7 +351,7 @@ AC_ARG_WITH([tcmalloc],
    [with_tcmalloc="no"]
 )
 if test "x$with_tcmalloc" != "xno"; then
-   if test "x$with_tcmalloc" != "xyes" && test "x$with_tcmalloc" != "x"; then
+   if test "x$with_tcmalloc" != "xyes" -a "x$with_tcmalloc" != "x"; then
       tclibs="-L$with_tcmalloc -ltcmalloc"
    else
       tclibs="-ltcmalloc"
@@ -361,7 +361,7 @@ if test "x$with_tcmalloc" != "xno"; then

    # Could just have the minimal version.
    if test "$have_tcmalloc" = "no"; then
-      if test "x$with_tcmalloc" != "xyes" && test "x$with_tcmalloc" != "x"; then
+      if test "x$with_tcmalloc" != "xyes" -a "x$with_tcmalloc" != "x"; then
          tclibs="-L$with_tcmalloc -ltcmalloc_minimal"
       else
          tclibs="-ltcmalloc_minimal"
@@ -394,7 +394,7 @@ AC_ARG_WITH([profiler],
    [with_profiler="yes"]
 )
 if test "x$with_profiler" != "xno"; then
-   if test "x$with_profiler" != "xyes" && test "x$with_profiler" != "x"; then
+   if test "x$with_profiler" != "xyes" -a "x$with_profiler" != "x"; then
       proflibs="-L$with_profiler -lprofiler"
    else
       proflibs="-lprofiler"
@@ -411,6 +411,38 @@ fi
 AC_SUBST([PROFILER_LIBS])
 AM_CONDITIONAL([HAVEPROFILER],[test -n "$PROFILER_LIBS"])

+# Check for jemalloc, another fast malloc that is good with contention.
+have_jemalloc="no"
+AC_ARG_WITH([jemalloc],
+   [AS_HELP_STRING([--with-jemalloc],
+      [use jemalloc library or specify the directory with lib @<:@yes/no@:>@]
+   )],
+   [with_jemalloc="$withval"],
+   [with_jemalloc="no"]
+)
+if test "x$with_jemalloc" != "xno"; then
+   if test "x$with_jemalloc" != "xyes" -a "x$with_jemalloc" != "x"; then
+      jelibs="-L$with_jemalloc -ljemalloc"
+   else
+      jelibs="-ljemalloc"
+   fi
+   AC_CHECK_LIB([jemalloc],[malloc_usable_size],[have_jemalloc="yes"],[have_jemalloc="no"],
+                $jelibs)
+
+   if test "$have_jemalloc" = "yes"; then
+      JEMALLOC_LIBS="$jelibs"
+   else
+      JEMALLOC_LIBS=""
+   fi
+fi
+AC_SUBST([JEMALLOC_LIBS])
+AM_CONDITIONAL([HAVEJEMALLOC],[test -n "$JEMALLOC_LIBS"])
+
+# Don't allow both tcmalloc and jemalloc.
+if test "x$have_tcmalloc" != "xno" -a "x$have_jemalloc" != "xno"; then
+   AC_MSG_ERROR([Cannot use tcmalloc at the same time as jemalloc])
+fi
+
 # Check for HDF5. This is required.
 AX_LIB_HDF5

@@ -781,6 +813,7 @@ AC_MSG_RESULT([
    FFTW3 enabled        : $have_fftw3
    libNUMA enabled      : $have_numa
    Using tcmalloc       : $have_tcmalloc
+   Using jemalloc       : $have_jemalloc
    CPU profiler         : $have_profiler

    Hydro scheme       : $with_hydro
diff --git a/examples/Makefile.am b/examples/Makefile.am
index 4da84788a485dacd2103fe85ad3e729ade6b582a..28a4629bdb401c0736379a2fe14a3a5f19caf650 100644
--- a/examples/Makefile.am
+++ b/examples/Makefile.am
@@ -24,7 +24,7 @@ AM_CFLAGS = -I$(top_srcdir)/src $(HDF5_CPPFLAGS)
 AM_LDFLAGS = $(HDF5_LDFLAGS)

 # Extra libraries.
-EXTRA_LIBS = $(HDF5_LIBS) $(FFTW_LIBS) $(PROFILER_LIBS) $(TCMALLOC_LIBS)
+EXTRA_LIBS = $(HDF5_LIBS) $(FFTW_LIBS) $(PROFILER_LIBS) $(TCMALLOC_LIBS) $(JEMALLOC_LIBS)

 # MPI libraries.
 MPI_LIBS = $(METIS_LIBS) $(MPI_THREAD_LIBS)
diff --git a/src/Makefile.am b/src/Makefile.am
index a336c635091694916955d89d9327d5c5c672ec63..1b9d3b4ae9770b9081ba384851fe6cbf1f2ad688 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -25,7 +25,7 @@ AM_LDFLAGS = $(HDF5_LDFLAGS) $(FFTW_LIBS) -version-info 0:0:0
 GIT_CMD = @GIT_CMD@

 # Additional dependencies for shared libraries.
-EXTRA_LIBS = $(HDF5_LIBS) $(PROFILER_LIBS) $(TCMALLOC_LIBS)
+EXTRA_LIBS = $(HDF5_LIBS) $(PROFILER_LIBS) $(TCMALLOC_LIBS) $(JEMALLOC_LIBS)

 # MPI libraries.
 MPI_LIBS = $(METIS_LIBS) $(MPI_THREAD_LIBS)
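For reference, the AC_CHECK_LIB probe above succeeds if a call to malloc_usable_size() can be linked against -ljemalloc. A minimal stand-alone sketch of an equivalent check, built with "cc probe.c -ljemalloc" (the header choice and the printed size class are illustrative assumptions, not part of the patch):

#include <malloc.h> /* declares malloc_usable_size() on glibc systems */
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  void *p = malloc(100);
  if (p == NULL) return 1;
  /* jemalloc rounds requests up to a size class, so this typically
     prints a value slightly larger than 100. */
  printf("usable size: %zu\n", malloc_usable_size(p));
  free(p);
  return 0;
}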
diff --git a/src/align.h b/src/align.h
index 84e2909c0866c18f0f8378df9d0efc8d0f6545b5..915af33e6e2ba59be1a0849c4de0e2f1bd5b0d96 100644
--- a/src/align.h
+++ b/src/align.h
@@ -19,9 +19,13 @@
 #ifndef SWIFT_ALIGN_H
 #define SWIFT_ALIGN_H

+/**
+ * @brief The default struct alignment in SWIFT.
+ */
+#define SWIFT_STRUCT_ALIGNMENT 32
 /**
  * @brief Defines alignment of structures
  */
-#define SWIFT_STRUCT_ALIGN __attribute__((aligned(32)))
+#define SWIFT_STRUCT_ALIGN __attribute__((aligned(SWIFT_STRUCT_ALIGNMENT)))

 #endif /* SWIFT_ALIGN_H */
diff --git a/src/cell.c b/src/cell.c
index e2767cdaa9e1189ec87b5ef51cc578c91f8cfe4c..b08d9ce8b10b01e2f107ecaae5fb56936a4688e6 100644
--- a/src/cell.c
+++ b/src/cell.c
@@ -467,8 +467,11 @@ void cell_gunlocktree(struct cell *c) {
  *        space's parts array, i.e. c->parts - s->parts.
  * @param buff A buffer with at least max(c->count, c->gcount) entries,
  *        used for sorting indices.
+ * @param gbuff A buffer with at least c->gcount entries,
+ *        used for sorting indices of the gparts.
  */
-void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
+void cell_split(struct cell *c, ptrdiff_t parts_offset, struct cell_buff *buff,
+                struct cell_buff *gbuff) {

   const int count = c->count, gcount = c->gcount;
   struct part *parts = c->parts;
@@ -480,18 +483,21 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
   int bucket_count[8] = {0, 0, 0, 0, 0, 0, 0, 0};
   int bucket_offset[9];

-  /* If the buff is NULL, allocate it, and remember to free it. */
-  const int allocate_buffer = (buff == NULL);
-  if (allocate_buffer &&
-      (buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL)
-    error("Failed to allocate temporary indices.");
+#ifdef SWIFT_DEBUG_CHECKS
+  /* Check that the buffs are OK. */
+  for (int k = 0; k < count; k++) {
+    if (buff[k].x[0] != parts[k].x[0] || buff[k].x[1] != parts[k].x[1] ||
+        buff[k].x[2] != parts[k].x[2])
+      error("Inconsistent buff contents.");
+  }
+#endif /* SWIFT_DEBUG_CHECKS */

   /* Fill the buffer with the indices. */
   for (int k = 0; k < count; k++) {
-    const int bid = (parts[k].x[0] > pivot[0]) * 4 +
-                    (parts[k].x[1] > pivot[1]) * 2 + (parts[k].x[2] > pivot[2]);
+    const int bid = (buff[k].x[0] > pivot[0]) * 4 +
+                    (buff[k].x[1] > pivot[1]) * 2 + (buff[k].x[2] > pivot[2]);
     bucket_count[bid]++;
-    buff[k] = bid;
+    buff[k].ind = bid;
   }

   /* Set the buffer offsets. */
@@ -505,23 +511,25 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
   for (int bucket = 0; bucket < 8; bucket++) {
     for (int k = bucket_offset[bucket] + bucket_count[bucket];
          k < bucket_offset[bucket + 1]; k++) {
-      int bid = buff[k];
+      int bid = buff[k].ind;
       if (bid != bucket) {
         struct part part = parts[k];
         struct xpart xpart = xparts[k];
+        struct cell_buff temp_buff = buff[k];
         while (bid != bucket) {
           int j = bucket_offset[bid] + bucket_count[bid]++;
-          while (buff[j] == bid) {
+          while (buff[j].ind == bid) {
             j++;
             bucket_count[bid]++;
           }
           memswap(&parts[j], &part, sizeof(struct part));
           memswap(&xparts[j], &xpart, sizeof(struct xpart));
-          memswap(&buff[j], &bid, sizeof(int));
+          memswap(&buff[j], &temp_buff, sizeof(struct cell_buff));
+          bid = temp_buff.ind;
         }
         parts[k] = part;
         xparts[k] = xpart;
-        buff[k] = bid;
+        buff[k] = temp_buff;
       }
       bucket_count[bid]++;
     }
   }
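The two hunks above implement an in-place 8-way bucket sort: each particle's bucket is the octant of its position relative to the cell's pivot, encoded in three bits. A stand-alone sketch of the indexing and offset scheme (it computes only the bucket ids and offsets, not the particle permutation):

#include <stdio.h>

/* Octant id: bit 2 = x above pivot, bit 1 = y, bit 0 = z. This is the
   same (x > px)*4 + (y > py)*2 + (z > pz) expression used in cell_split. */
static int octant(const double x[3], const double pivot[3]) {
  return (x[0] > pivot[0]) * 4 + (x[1] > pivot[1]) * 2 + (x[2] > pivot[2]);
}

int main(void) {
  const double pivot[3] = {0.5, 0.5, 0.5};
  const double pos[4][3] = {
      {0.1, 0.2, 0.3}, {0.9, 0.2, 0.3}, {0.1, 0.8, 0.9}, {0.9, 0.8, 0.9}};

  /* Count particles per bucket, then turn counts into offsets, exactly
     as the bucket_count/bucket_offset arrays are used above. */
  int bucket_count[8] = {0}, bucket_offset[9] = {0};
  for (int k = 0; k < 4; k++) bucket_count[octant(pos[k], pivot)]++;
  for (int b = 0; b < 8; b++)
    bucket_offset[b + 1] = bucket_offset[b] + bucket_count[b];

  for (int k = 0; k < 4; k++)
    printf("particle %d -> octant %d\n", k, octant(pos[k], pivot));
  for (int b = 0; b < 8; b++)
    printf("octant %d starts at offset %d\n", b, bucket_offset[b]);
  return 0;
}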
@@ -538,6 +546,14 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
   if (count > 0 && gcount > 0) part_relink_gparts(parts, count, parts_offset);

 #ifdef SWIFT_DEBUG_CHECKS
+  /* Check that the buffs are OK. */
+  for (int k = 1; k < count; k++) {
+    if (buff[k].ind < buff[k - 1].ind) error("Buff not sorted.");
+    if (buff[k].x[0] != parts[k].x[0] || buff[k].x[1] != parts[k].x[1] ||
+        buff[k].x[2] != parts[k].x[2])
+      error("Inconsistent buff contents (k=%i).", k);
+  }
+
   /* Verify that _all_ the parts have been assigned to a cell. */
   for (int k = 1; k < 8; k++)
     if (&c->progeny[k - 1]->parts[c->progeny[k - 1]->count] !=
@@ -564,6 +580,31 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
         c->progeny[2]->parts[k].x[1] <= pivot[1] ||
         c->progeny[2]->parts[k].x[2] > pivot[2])
       error("Sorting failed (progeny=2).");
+  for (int k = 0; k < c->progeny[3]->count; k++)
+    if (c->progeny[3]->parts[k].x[0] > pivot[0] ||
+        c->progeny[3]->parts[k].x[1] <= pivot[1] ||
+        c->progeny[3]->parts[k].x[2] <= pivot[2])
+      error("Sorting failed (progeny=3).");
+  for (int k = 0; k < c->progeny[4]->count; k++)
+    if (c->progeny[4]->parts[k].x[0] <= pivot[0] ||
+        c->progeny[4]->parts[k].x[1] > pivot[1] ||
+        c->progeny[4]->parts[k].x[2] > pivot[2])
+      error("Sorting failed (progeny=4).");
+  for (int k = 0; k < c->progeny[5]->count; k++)
+    if (c->progeny[5]->parts[k].x[0] <= pivot[0] ||
+        c->progeny[5]->parts[k].x[1] > pivot[1] ||
+        c->progeny[5]->parts[k].x[2] <= pivot[2])
+      error("Sorting failed (progeny=5).");
+  for (int k = 0; k < c->progeny[6]->count; k++)
+    if (c->progeny[6]->parts[k].x[0] <= pivot[0] ||
+        c->progeny[6]->parts[k].x[1] <= pivot[1] ||
+        c->progeny[6]->parts[k].x[2] > pivot[2])
+      error("Sorting failed (progeny=6).");
+  for (int k = 0; k < c->progeny[7]->count; k++)
+    if (c->progeny[7]->parts[k].x[0] <= pivot[0] ||
+        c->progeny[7]->parts[k].x[1] <= pivot[1] ||
+        c->progeny[7]->parts[k].x[2] <= pivot[2])
+      error("Sorting failed (progeny=7).");
 #endif

   /* Now do the same song and dance for the gparts. */
@@ -571,11 +612,10 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {

     /* Fill the buffer with the indices. */
     for (int k = 0; k < gcount; k++) {
-      const int bid = (gparts[k].x[0] > pivot[0]) * 4 +
-                      (gparts[k].x[1] > pivot[1]) * 2 +
-                      (gparts[k].x[2] > pivot[2]);
+      const int bid = (gbuff[k].x[0] > pivot[0]) * 4 +
+                      (gbuff[k].x[1] > pivot[1]) * 2 + (gbuff[k].x[2] > pivot[2]);
       bucket_count[bid]++;
-      buff[k] = bid;
+      gbuff[k].ind = bid;
     }

     /* Set the buffer offsets. */
@@ -589,20 +629,22 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) {
     for (int bucket = 0; bucket < 8; bucket++) {
       for (int k = bucket_offset[bucket] + bucket_count[bucket];
            k < bucket_offset[bucket + 1]; k++) {
-        int bid = buff[k];
+        int bid = gbuff[k].ind;
         if (bid != bucket) {
           struct gpart gpart = gparts[k];
+          struct cell_buff temp_buff = gbuff[k];
           while (bid != bucket) {
             int j = bucket_offset[bid] + bucket_count[bid]++;
-            while (buff[j] == bid) {
+            while (gbuff[j].ind == bid) {
               j++;
               bucket_count[bid]++;
             }
             memswap(&gparts[j], &gpart, sizeof(struct gpart));
-            memswap(&buff[j], &bid, sizeof(int));
+            memswap(&gbuff[j], &temp_buff, sizeof(struct cell_buff));
+            bid = temp_buff.ind;
           }
           gparts[k] = gpart;
-          buff[k] = bid;
+          gbuff[k] = temp_buff;
         }
         bucket_count[bid]++;
       }
diff --git a/src/cell.h b/src/cell.h
index 2cd13cf2ab6b934f6aab84bcbacf510270892866..984ef4bc20cf07285b136461815cd0851c12291a 100644
--- a/src/cell.h
+++ b/src/cell.h
@@ -52,6 +52,12 @@ struct scheduler;
 /* Global variables. */
 extern int cell_next_tag;

+/* Struct to temporarily buffer the particle locations and bin id. */
+struct cell_buff {
+  double x[3];
+  int ind;
+} SWIFT_STRUCT_ALIGN;
+
 /* Mini struct to link cells to tasks. Used as a linked list. */
 struct link {
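The new cell_buff carries 28 bytes of payload (three doubles and an int), which SWIFT_STRUCT_ALIGN pads out to a single 32-byte block, so whole structs can be moved with the aligned memswap(). A quick stand-alone check of that layout, assuming a GCC/Clang LP64 target:

#include <stdio.h>

#define SWIFT_STRUCT_ALIGNMENT 32
#define SWIFT_STRUCT_ALIGN __attribute__((aligned(SWIFT_STRUCT_ALIGNMENT)))

struct cell_buff {
  double x[3];
  int ind;
} SWIFT_STRUCT_ALIGN;

/* 3 * 8 + 4 = 28 bytes of payload, padded to one 32-byte block. */
_Static_assert(sizeof(struct cell_buff) == 32, "cell_buff not 32 bytes");
_Static_assert(_Alignof(struct cell_buff) == 32, "cell_buff not 32-aligned");

int main(void) {
  printf("sizeof = %zu, alignof = %zu\n", sizeof(struct cell_buff),
         _Alignof(struct cell_buff));
  return 0;
}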
@@ -272,7 +278,8 @@ struct cell {
   ((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i)))

 /* Function prototypes. */
-void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff);
+void cell_split(struct cell *c, ptrdiff_t parts_offset, struct cell_buff *buff,
+                struct cell_buff *gbuff);
 void cell_sanitize(struct cell *c);
 int cell_locktree(struct cell *c);
 void cell_unlocktree(struct cell *c);
diff --git a/src/debug.c b/src/debug.c
index 48572df7f046944613d2598b0d340e949ad3ab7e..21f539b62eef3b0b36f52520486f1e725abc0cda 100644
--- a/src/debug.c
+++ b/src/debug.c
@@ -227,21 +227,21 @@ int checkCellhdxmax(const struct cell *c, int *depth) {
   if (c->h_max != h_max) {
     message("%d Inconsistent h_max: cell %f != parts %f", *depth, c->h_max,
             h_max);
-    message("location: %f %f %f", c->loc[0], c->loc[1], c->loc[2]);
+    error("location: %f %f %f", c->loc[0], c->loc[1], c->loc[2]);
     result = 0;
   }
   if (c->dx_max != dx_max) {
     message("%d Inconsistent dx_max: %f != %f", *depth, c->dx_max, dx_max);
-    message("location: %f %f %f", c->loc[0], c->loc[1], c->loc[2]);
+    error("location: %f %f %f", c->loc[0], c->loc[1], c->loc[2]);
     result = 0;
   }

   /* Check rebuild criterion. */
-  if (h_max > c->dmin) {
+  /* if (h_max > c->dmin) {
     message("%d Inconsistent c->dmin: %f > %f", *depth, h_max, c->dmin);
-    message("location: %f %f %f", c->loc[0], c->loc[1], c->loc[2]);
+    error("location: %f %f %f", c->loc[0], c->loc[1], c->loc[2]);
     result = 0;
-  }
+  } */

   return result;
 }
diff --git a/src/engine.c b/src/engine.c
index d604042fffd2f9b8f5677cba72d61e9d75b6a311..37b6c45598a0f56ccf4ec10fe7717ea357e90cfd 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -318,6 +318,23 @@ void engine_redistribute(struct engine *e) {
                     MPI_COMM_WORLD) != MPI_SUCCESS)
     error("Failed to allreduce particle transfer counts.");

+  /* Report how many particles will be moved. */
+  if (e->verbose) {
+    if (e->nodeID == 0) {
+      size_t total = 0;
+      size_t unmoved = 0;
+      for (int p = 0, r = 0; p < nr_nodes; p++) {
+        for (int s = 0; s < nr_nodes; s++) {
+          total += counts[r];
+          if (p == s) unmoved += counts[r];
+          r++;
+        }
+      }
+      message("%zu of %zu particles (%.2f%%) moved", total - unmoved, total,
+              100.0 * (double)(total - unmoved) / (double)total);
+    }
+  }
+
   /* Get all the g_counts from all the nodes. */
   if (MPI_Allreduce(MPI_IN_PLACE, g_counts, nr_nodes * nr_nodes, MPI_INT,
                     MPI_SUM, MPI_COMM_WORLD) != MPI_SUCCESS)
diff --git a/src/hydro/Minimal/hydro.h b/src/hydro/Minimal/hydro.h
index 3b3454f1bb348b178ac57899da4f7611802a69cd..beb6f98b8c0d781aa709fb6ee3ca564a52704db2 100644
--- a/src/hydro/Minimal/hydro.h
+++ b/src/hydro/Minimal/hydro.h
@@ -66,7 +66,9 @@ __attribute__((always_inline)) INLINE static float hydro_get_internal_energy(
 __attribute__((always_inline)) INLINE static float hydro_get_pressure(
     const struct part *restrict p, float dt) {

-  return p->force.pressure;
+  const float u = p->u + p->u_dt * dt;
+
+  return gas_pressure_from_internal_energy(p->rho, u);
 }

 /**
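The Minimal-hydro change recomputes the pressure from the internal energy drifted forward by dt instead of returning a cached value. As a reference for the shape of gas_pressure_from_internal_energy(), here is an ideal-gas sketch; the gamma = 5/3 below is an illustrative assumption, since SWIFT fixes its adiabatic index at configure time:

#include <stdio.h>

/* Illustrative ideal-gas equation of state: P = (gamma - 1) * rho * u.
   gamma = 5/3 is an assumption for a monatomic gas, not SWIFT's setting. */
#define GAMMA (5.f / 3.f)

static float gas_pressure_from_internal_energy(float rho, float u) {
  return (GAMMA - 1.f) * rho * u;
}

/* Drift the internal energy to the requested time, then apply the
   equation of state, mirroring hydro_get_pressure() above. */
static float pressure_at(float u, float u_dt, float rho, float dt) {
  return gas_pressure_from_internal_energy(rho, u + u_dt * dt);
}

int main(void) {
  const float rho = 1.f, u = 1.5f, u_dt = 0.1f, dt = 0.01f;
  printf("P(t + dt) = %g\n", pressure_at(u, u_dt, rho, dt));
  return 0;
}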
diff --git a/src/memswap.h b/src/memswap.h
index 4643725535917952d12927d52187bc7306ced5ef..92c902eeb158978d4a606f5f2a9416d4113fae0b 100644
--- a/src/memswap.h
+++ b/src/memswap.h
@@ -32,24 +32,27 @@
 #include <altivec.h>
 #endif

-/* Macro for in-place swap of two values a and b of type t. */
-#define swap_loop(t, a, b, c)  \
-  while (c >= sizeof(t)) {     \
-    register t temp = *(t *)a; \
-    *(t *)a = *(t *)b;         \
-    *(t *)b = temp;            \
-    a += sizeof(t);            \
-    b += sizeof(t);            \
-    bytes -= sizeof(t);        \
+/* Macro for in-place swap of two values a and b of type t. a and b are
+   assumed to be of type char* so that the pointer arithmetic works. */
+#define swap_loop(type, a, b, count) \
+  while (count >= sizeof(type)) {    \
+    register type temp = *(type *)a; \
+    *(type *)a = *(type *)b;         \
+    *(type *)b = temp;               \
+    a += sizeof(type);               \
+    b += sizeof(type);               \
+    count -= sizeof(type);           \
   }

 /**
  * @brief Swap the contents of two elements in-place.
  *
- * Keep in mind that this function works best when the underlying data
+ * Keep in mind that this function only works when the underlying data
  * is aligned to the vector length, e.g. with the @c
  * __attribute__((aligned(32)))
- * syntax, and the code is compiled with @c -funroll-loops.
+ * syntax!
+ * Furthermore, register re-labeling only seems to work when the code is
+ * compiled with @c -funroll-loops.
  *
  * @param void_a Pointer to the first element.
  * @param void_b Pointer to the second element.
@@ -76,4 +79,63 @@ __attribute__((always_inline)) inline void memswap(void *void_a, void *void_b,
   swap_loop(char, a, b, bytes);
 }

+/**
+ * @brief Swap the contents of two elements in-place.
+ *
+ * As opposed to #memswap, this function does not require the parameters
+ * to be aligned in any specific way.
+ * Furthermore, register re-labeling only seems to work when the code is
+ * compiled with @c -funroll-loops.
+ *
+ * @param void_a Pointer to the first element.
+ * @param void_b Pointer to the second element.
+ * @param bytes Size, in bytes, of the data pointed to by @c a and @c b.
+ */
+__attribute__((always_inline)) inline void memswap_unaligned(void *void_a,
+                                                             void *void_b,
+                                                             size_t bytes) {
+  char *a = (char *)void_a, *b = (char *)void_b;
+#ifdef __AVX512F__
+  while (bytes >= sizeof(__m512i)) {
+    register __m512i temp;
+    temp = _mm512_loadu_si512((__m512i *)a);
+    _mm512_storeu_si512((__m512i *)a, _mm512_loadu_si512((__m512i *)b));
+    _mm512_storeu_si512((__m512i *)b, temp);
+    a += sizeof(__m512i);
+    b += sizeof(__m512i);
+    bytes -= sizeof(__m512i);
+  }
+#endif
+#ifdef __AVX__
+  while (bytes >= sizeof(__m256i)) {
+    register __m256i temp;
+    temp = _mm256_loadu_si256((__m256i *)a);
+    _mm256_storeu_si256((__m256i *)a, _mm256_loadu_si256((__m256i *)b));
+    _mm256_storeu_si256((__m256i *)b, temp);
+    a += sizeof(__m256i);
+    b += sizeof(__m256i);
+    bytes -= sizeof(__m256i);
+  }
+#endif
+#ifdef __SSE2__
+  while (bytes >= sizeof(__m128i)) {
+    register __m128i temp;
+    temp = _mm_loadu_si128((__m128i *)a);
+    _mm_storeu_si128((__m128i *)a, _mm_loadu_si128((__m128i *)b));
+    _mm_storeu_si128((__m128i *)b, temp);
+    a += sizeof(__m128i);
+    b += sizeof(__m128i);
+    bytes -= sizeof(__m128i);
+  }
+#endif
+#ifdef __ALTIVEC__
+  // Power8 supports unaligned load/stores, but not sure what it will do here.
+  swap_loop(vector int, a, b, bytes);
+#endif
+  swap_loop(size_t, a, b, bytes);
+  swap_loop(int, a, b, bytes);
+  swap_loop(short, a, b, bytes);
+  swap_loop(char, a, b, bytes);
+}
+
 #endif /* SWIFT_MEMSWAP_H */
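A stand-alone sketch of the intended division of labour: memswap() for data carrying SWIFT_STRUCT_ALIGN (such as cell_buff), memswap_unaligned() for arbitrary byte ranges. The portable bounce-buffer swap below stands in for the intrinsics-based loops in the patch:

#include <stdio.h>
#include <string.h>

/* 32-byte aligned payload, mirroring SWIFT_STRUCT_ALIGN on cell_buff. */
struct buf {
  double x[3];
  int ind;
} __attribute__((aligned(32)));

/* Portable stand-in for memswap()/memswap_unaligned(): swap two buffers
   through a small bounce buffer. The real versions use SSE/AVX registers
   instead, which is why memswap() needs its inputs vector-aligned. */
static void memswap_fallback(void *void_a, void *void_b, size_t bytes) {
  unsigned char tmp[64];
  unsigned char *a = void_a, *b = void_b;
  while (bytes > 0) {
    const size_t n = bytes < sizeof(tmp) ? bytes : sizeof(tmp);
    memcpy(tmp, a, n);
    memcpy(a, b, n);
    memcpy(b, tmp, n);
    a += n;
    b += n;
    bytes -= n;
  }
}

int main(void) {
  struct buf p = {{1., 2., 3.}, 0}, q = {{4., 5., 6.}, 1};
  memswap_fallback(&p, &q, sizeof(struct buf));
  printf("p.x[0] = %g, q.ind = %d\n", p.x[0], q.ind); /* 4 and 0 */
  return 0;
}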
diff --git a/src/partition.c b/src/partition.c
index 89ba3f2835cb78e07ec2bc5cb04c3f8f71751563..85745d880e3ab0f6beaf918a5c226730b6b82a7c 100644
--- a/src/partition.c
+++ b/src/partition.c
@@ -278,6 +278,18 @@ static void split_metis(struct space *s, int nregions, int *celllist) {
 #endif

 #if defined(WITH_MPI) && defined(HAVE_METIS)
+
+/* qsort support. */
+struct indexval {
+  int index;
+  int count;
+};
+static int indexvalcmp(const void *p1, const void *p2) {
+  const struct indexval *iv1 = (const struct indexval *)p1;
+  const struct indexval *iv2 = (const struct indexval *)p2;
+  return iv2->count - iv1->count;
+}
+
 /**
  * @brief Partition the given space into a number of connected regions.
  *
@@ -382,14 +394,70 @@ static void pick_metis(struct space *s, int nregions, int *vertexw, int *edgew,
     if (regionid[k] < 0 || regionid[k] >= nregions)
       error("Got bad nodeID %" PRIDX " for cell %i.", regionid[k], k);

+  /* We want a solution in which the current regions of the space are
+   * preserved when possible, to avoid unnecessary particle movement.
+   * So create a 2d-array of cell counts that are common to all pairs
+   * of old and new ranks. Each element of the array has a cell count and
+   * a unique index so we can sort into decreasing counts. */
+  int indmax = nregions * nregions;
+  struct indexval *ivs = malloc(sizeof(struct indexval) * indmax);
+  bzero(ivs, sizeof(struct indexval) * indmax);
+  for (int k = 0; k < ncells; k++) {
+    int index = regionid[k] + nregions * s->cells_top[k].nodeID;
+    ivs[index].count++;
+    ivs[index].index = index;
+  }
+  qsort(ivs, indmax, sizeof(struct indexval), indexvalcmp);
+
+  /* Go through the ivs using the largest counts first, these are the
+   * regions with the most cells in common, old partition to new. */
+  int *oldmap = malloc(sizeof(int) * nregions);
+  int *newmap = malloc(sizeof(int) * nregions);
+  for (int k = 0; k < nregions; k++) {
+    oldmap[k] = -1;
+    newmap[k] = -1;
+  }
+  for (int k = 0; k < indmax; k++) {
+
+    /* Stop when all regions with common cells have been considered. */
+    if (ivs[k].count == 0) break;
+
+    /* Store old and new IDs, if not already used. */
+    int oldregion = ivs[k].index / nregions;
+    int newregion = ivs[k].index - oldregion * nregions;
+    if (newmap[newregion] == -1 && oldmap[oldregion] == -1) {
+      newmap[newregion] = oldregion;
+      oldmap[oldregion] = newregion;
+    }
+  }
+
+  /* Handle any regions that did not get selected by picking an unused rank
+   * from oldmap and assigning to newmap. */
+  int spare = 0;
+  for (int k = 0; k < nregions; k++) {
+    if (newmap[k] == -1) {
+      for (int j = spare; j < nregions; j++) {
+        if (oldmap[j] == -1) {
+          newmap[k] = j;
+          oldmap[j] = j;
+          spare = j;
+          break;
+        }
+      }
+    }
+  }
+
   /* Set the cell list to the region index. */
   for (int k = 0; k < ncells; k++) {
-    celllist[k] = regionid[k];
+    celllist[k] = newmap[regionid[k]];
   }

   /* Clean up. */
   if (weights_v != NULL) free(weights_v);
   if (weights_e != NULL) free(weights_e);
+  free(ivs);
+  free(oldmap);
+  free(newmap);
   free(xadj);
   free(adjncy);
   free(regionid);
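The mapping above is a greedy assignment: sort the (old rank, new region) overlap counts in decreasing order, then pair off the largest overlaps first so most cells keep their current rank. A stand-alone sketch with a hypothetical 3-region overlap matrix:

#include <stdio.h>
#include <stdlib.h>

struct indexval {
  int index;
  int count;
};

/* Sort by decreasing count, as in partition.c. */
static int indexvalcmp(const void *p1, const void *p2) {
  const struct indexval *iv1 = p1, *iv2 = p2;
  return iv2->count - iv1->count;
}

int main(void) {
  /* Hypothetical counts[old][new]: the number of cells that rank `old`
     currently owns and that METIS now assigns to region `new`.
     Flattened as index = new + nregions * old, like the patch. */
  const int nregions = 3;
  const int counts[3][3] = {{1, 8, 0}, {7, 1, 1}, {0, 2, 9}};

  struct indexval ivs[9];
  for (int old = 0, i = 0; old < nregions; old++)
    for (int new = 0; new < nregions; new++, i++) {
      ivs[i].index = new + nregions * old;
      ivs[i].count = counts[old][new];
    }
  qsort(ivs, 9, sizeof(struct indexval), indexvalcmp);

  /* Greedily map each new region to an old rank, largest overlap first. */
  int newmap[3] = {-1, -1, -1}, oldmap[3] = {-1, -1, -1};
  for (int k = 0; k < 9 && ivs[k].count > 0; k++) {
    const int old = ivs[k].index / nregions;
    const int new = ivs[k].index - old * nregions;
    if (newmap[new] == -1 && oldmap[old] == -1) {
      newmap[new] = old;
      oldmap[old] = new;
    }
  }

  /* Expect: region 0 -> rank 1, region 1 -> rank 0, region 2 -> rank 2. */
  for (int new = 0; new < nregions; new++)
    printf("METIS region %d relabelled to rank %d\n", new, newmap[new]);
  return 0;
}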
diff --git a/src/proxy.c b/src/proxy.c
index efe3a3eec108d44d5b9bf8b4718dc025464f8762..e68fe9cb3ad23956d3d3c012573b0af6047c1904 100644
--- a/src/proxy.c
+++ b/src/proxy.c
@@ -65,8 +65,8 @@ void proxy_cells_exch1(struct proxy *p) {

   /* Allocate and fill the pcell buffer. */
   if (p->pcells_out != NULL) free(p->pcells_out);
-  if ((p->pcells_out = malloc(sizeof(struct pcell) * p->size_pcells_out)) ==
-      NULL)
+  if (posix_memalign((void **)&p->pcells_out, SWIFT_STRUCT_ALIGNMENT,
+                     sizeof(struct pcell) * p->size_pcells_out) != 0)
     error("Failed to allocate pcell_out buffer.");
   for (int ind = 0, k = 0; k < p->nr_cells_out; k++) {
     memcpy(&p->pcells_out[ind], p->cells_out[k]->pcell,
@@ -102,8 +102,8 @@ void proxy_cells_exch2(struct proxy *p) {

   /* Re-allocate the pcell_in buffer. */
   if (p->pcells_in != NULL) free(p->pcells_in);
-  if ((p->pcells_in = (struct pcell *)malloc(sizeof(struct pcell) *
-                                             p->size_pcells_in)) == NULL)
+  if (posix_memalign((void **)&p->pcells_in, SWIFT_STRUCT_ALIGNMENT,
+                     sizeof(struct pcell) * p->size_pcells_in) != 0)
     error("Failed to allocate pcell_in buffer.");

   /* Receive the particle buffers. */
diff --git a/src/queue.h b/src/queue.h
index c0a2fb1da6e6e3cbea813a0ef53841084ab0f933..951a3e5a056d7ad0c3935f98341a0d93c805e3ad 100644
--- a/src/queue.h
+++ b/src/queue.h
@@ -30,6 +30,7 @@
 #define queue_sizegrow 2
 #define queue_search_window 8
 #define queue_incoming_size 1024
+#define queue_struct_align 64

 /* Counters. */
 enum {
@@ -57,7 +58,7 @@ struct queue {
   int *tid_incoming;
   volatile unsigned int first_incoming, last_incoming, count_incoming;

-} __attribute__((aligned(64)));
+} __attribute__((aligned(queue_struct_align)));

 /* Function prototypes. */
 struct task *queue_gettask(struct queue *q, const struct task *prev,
diff --git a/src/scheduler.c b/src/scheduler.c
index 0d7c8c4754bac931c7886200176e3e9441c63c53..6dc7d67a1467333ea08ed2c72795c172d9a942e1 100644
--- a/src/scheduler.c
+++ b/src/scheduler.c
@@ -1408,8 +1408,8 @@ void scheduler_init(struct scheduler *s, struct space *space, int nr_tasks,
   lock_init(&s->lock);

   /* Allocate the queues. */
-  if ((s->queues = (struct queue *)malloc(sizeof(struct queue) * nr_queues)) ==
-      NULL)
+  if (posix_memalign((void **)&s->queues, queue_struct_align,
+                     sizeof(struct queue) * nr_queues) != 0)
     error("Failed to allocate queues.");

   /* Initialize each queue. */
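The malloc-to-posix_memalign conversions here and in scheduler.c all follow one pattern; note that posix_memalign() reports failure through its return value, not by returning NULL. A minimal sketch of the idiom (the 64-byte alignment matches queue_struct_align; the struct is illustrative):

#define _POSIX_C_SOURCE 200112L
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define ALIGNMENT 64 /* e.g. queue_struct_align: one cache line */

struct payload {
  double data[8];
};

int main(void) {
  struct payload *buf = NULL;

  /* posix_memalign() returns 0 on success and an errno value on failure;
     unlike malloc(), the result is not compared against NULL. The
     alignment must be a power of two multiple of sizeof(void *). */
  if (posix_memalign((void **)&buf, ALIGNMENT,
                     sizeof(struct payload) * 16) != 0) {
    fprintf(stderr, "Failed to allocate aligned buffer.\n");
    return 1;
  }

  printf("buffer at %p, 64-byte aligned: %s\n", (void *)buf,
         ((uintptr_t)buf % ALIGNMENT == 0) ? "yes" : "no");
  free(buf); /* memory from posix_memalign() is released with free() */
  return 0;
}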
diff --git a/src/space.c b/src/space.c
index a9d50dc3e94b5b320817f8ada5f81410fefdb93e..84af59d30fdebe839212ea9119aec92d2384802f 100644
--- a/src/space.c
+++ b/src/space.c
@@ -174,18 +174,68 @@ int space_getsid(struct space *s, struct cell **ci, struct cell **cj,
  *
  * @param s The #space.
  * @param c The #cell to recycle.
+ * @param rec_begin Pointer to the start of the list of cells to recycle.
+ * @param rec_end Pointer to the end of the list of cells to recycle.
  */
-void space_rebuild_recycle(struct space *s, struct cell *c) {
-
+void space_rebuild_recycle_rec(struct space *s, struct cell *c,
+                               struct cell **rec_begin, struct cell **rec_end) {
   if (c->split)
     for (int k = 0; k < 8; k++)
       if (c->progeny[k] != NULL) {
-        space_rebuild_recycle(s, c->progeny[k]);
-        space_recycle(s, c->progeny[k]);
+        space_rebuild_recycle_rec(s, c->progeny[k], rec_begin, rec_end);
+        c->progeny[k]->next = *rec_begin;
+        *rec_begin = c->progeny[k];
+        if (*rec_end == NULL) *rec_end = *rec_begin;
         c->progeny[k] = NULL;
       }
 }

+void space_rebuild_recycle_mapper(void *map_data, int num_elements,
+                                  void *extra_data) {
+
+  struct space *s = (struct space *)extra_data;
+  struct cell *cells = (struct cell *)map_data;
+
+  for (int k = 0; k < num_elements; k++) {
+    struct cell *c = &cells[k];
+    struct cell *rec_begin = NULL, *rec_end = NULL;
+    space_rebuild_recycle_rec(s, c, &rec_begin, &rec_end);
+    if (rec_begin != NULL) space_recycle_list(s, rec_begin, rec_end);
+    c->sorts = NULL;
+    c->nr_tasks = 0;
+    c->density = NULL;
+    c->gradient = NULL;
+    c->force = NULL;
+    c->grav = NULL;
+    c->dx_max = 0.0f;
+    c->sorted = 0;
+    c->count = 0;
+    c->gcount = 0;
+    c->init = NULL;
+    c->extra_ghost = NULL;
+    c->ghost = NULL;
+    c->kick = NULL;
+    c->cooling = NULL;
+    c->sourceterms = NULL;
+    c->super = c;
+    if (c->sort != NULL) {
+      free(c->sort);
+      c->sort = NULL;
+    }
+#if WITH_MPI
+    c->recv_xv = NULL;
+    c->recv_rho = NULL;
+    c->recv_gradient = NULL;
+    c->recv_ti = NULL;
+
+    c->send_xv = NULL;
+    c->send_rho = NULL;
+    c->send_gradient = NULL;
+    c->send_ti = NULL;
+#endif
+  }
+}
+
 /**
  * @brief Re-build the top-level cell grid.
  *
@@ -298,10 +348,8 @@ void space_regrid(struct space *s, int verbose) {

     /* Free the old cells, if they were allocated. */
     if (s->cells_top != NULL) {
-      for (int k = 0; k < s->nr_cells; k++) {
-        space_rebuild_recycle(s, &s->cells_top[k]);
-        if (s->cells_top[k].sort != NULL) free(s->cells_top[k].sort);
-      }
+      threadpool_map(&s->e->threadpool, space_rebuild_recycle_mapper,
+                     s->cells_top, s->nr_cells, sizeof(struct cell), 100, s);
       free(s->cells_top);
       s->maxdepth = 0;
     }
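space_rebuild_recycle_mapper follows the threadpool_map() contract: each invocation receives a contiguous slice of the input array (num_elements items of the given stride) plus the shared extra_data pointer. A serial stand-in that mimics the chunked dispatch, with a trivial mapper (threadpool internals elided):

#include <stdio.h>

/* The threadpool_map contract: a mapper receives a contiguous slice of
   the input array plus a shared extra_data pointer. */
typedef void (*map_function)(void *map_data, int num_elements,
                             void *extra_data);

static void sum_mapper(void *map_data, int num_elements, void *extra_data) {
  const int *values = (const int *)map_data;
  int *total = (int *)extra_data; /* no lock needed in this serial sketch */
  for (int k = 0; k < num_elements; k++) *total += values[k];
}

/* Serial stand-in for threadpool_map(): walk the array in chunks, as the
   pool's worker threads would. chunk plays the role of the `100` passed
   for the cell-recycling map above. */
static void fake_threadpool_map(map_function f, void *data, int N, int stride,
                                int chunk, void *extra_data) {
  char *base = (char *)data;
  for (int first = 0; first < N; first += chunk) {
    const int n = (N - first < chunk) ? N - first : chunk;
    f(base + (size_t)first * stride, n, extra_data);
  }
}

int main(void) {
  int values[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
  int total = 0;
  fake_threadpool_map(sum_mapper, values, 10, sizeof(int), 4, &total);
  printf("total = %d\n", total); /* 45 */
  return 0;
}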
@@ -389,42 +437,12 @@ void space_regrid(struct space *s, int verbose) {
     // message( "rebuilding upper-level cells took %.3f %s." ,
     // clocks_from_ticks(double)(getticks() - tic), clocks_getunit());

-  } /* re-build upper-level cells? */
-
+  } /* re-build upper-level cells? */
   else { /* Otherwise, just clean up the cells. */

     /* Free the old cells, if they were allocated. */
-    for (int k = 0; k < s->nr_cells; k++) {
-      space_rebuild_recycle(s, &s->cells_top[k]);
-      s->cells_top[k].sorts = NULL;
-      s->cells_top[k].nr_tasks = 0;
-      s->cells_top[k].density = NULL;
-      s->cells_top[k].gradient = NULL;
-      s->cells_top[k].force = NULL;
-      s->cells_top[k].grav = NULL;
-      s->cells_top[k].dx_max = 0.0f;
-      s->cells_top[k].sorted = 0;
-      s->cells_top[k].count = 0;
-      s->cells_top[k].gcount = 0;
-      s->cells_top[k].init = NULL;
-      s->cells_top[k].extra_ghost = NULL;
-      s->cells_top[k].ghost = NULL;
-      s->cells_top[k].kick = NULL;
-      s->cells_top[k].cooling = NULL;
-      s->cells_top[k].sourceterms = NULL;
-      s->cells_top[k].super = &s->cells_top[k];
-#if WITH_MPI
-      s->cells_top[k].recv_xv = NULL;
-      s->cells_top[k].recv_rho = NULL;
-      s->cells_top[k].recv_gradient = NULL;
-      s->cells_top[k].recv_ti = NULL;
-
-      s->cells_top[k].send_xv = NULL;
-      s->cells_top[k].send_rho = NULL;
-      s->cells_top[k].send_gradient = NULL;
-      s->cells_top[k].send_ti = NULL;
-#endif
-    }
+    threadpool_map(&s->e->threadpool, space_rebuild_recycle_mapper,
+                   s->cells_top, s->nr_cells, sizeof(struct cell), 100, s);
     s->maxdepth = 0;
   }
@@ -1441,9 +1459,12 @@ void space_map_cells_pre(struct space *s, int full,
  * @param s The #space in which the cell lives.
  * @param c The #cell to split recursively.
  * @param buff A buffer for particle sorting, should be of size at least
- *        max(c->count, c->gount) or @c NULL.
+ *        c->count or @c NULL.
+ * @param gbuff A buffer for particle sorting, should be of size at least
+ *        c->gcount or @c NULL.
  */
-void space_split_recursive(struct space *s, struct cell *c, int *buff) {
+void space_split_recursive(struct space *s, struct cell *c,
+                           struct cell_buff *buff, struct cell_buff *gbuff) {

   const int count = c->count;
   const int gcount = c->gcount;
@@ -1458,10 +1479,29 @@ void space_split_recursive(struct space *s, struct cell *c, int *buff) {
   struct engine *e = s->e;

   /* If the buff is NULL, allocate it, and remember to free it. */
-  const int allocate_buffer = (buff == NULL);
-  if (allocate_buffer &&
-      (buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL)
-    error("Failed to allocate temporary indices.");
+  const int allocate_buffer = (buff == NULL && gbuff == NULL);
+  if (allocate_buffer) {
+    if (count > 0) {
+      if (posix_memalign((void *)&buff, SWIFT_STRUCT_ALIGNMENT,
+                         sizeof(struct cell_buff) * count) != 0)
+        error("Failed to allocate temporary indices.");
+      for (int k = 0; k < count; k++) {
+        buff[k].x[0] = parts[k].x[0];
+        buff[k].x[1] = parts[k].x[1];
+        buff[k].x[2] = parts[k].x[2];
+      }
+    }
+    if (gcount > 0) {
+      if (posix_memalign((void *)&gbuff, SWIFT_STRUCT_ALIGNMENT,
+                         sizeof(struct cell_buff) * gcount) != 0)
+        error("Failed to allocate temporary indices.");
+      for (int k = 0; k < gcount; k++) {
+        gbuff[k].x[0] = gparts[k].x[0];
+        gbuff[k].x[1] = gparts[k].x[1];
+        gbuff[k].x[2] = gparts[k].x[2];
+      }
+    }
+  }

   /* Check the depth. */
   while (depth > (maxdepth = s->maxdepth)) {
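Because cell_split() leaves the buffers bucket-sorted, each progeny's entries occupy a contiguous sub-range, so the recursion in the next hunk can advance a pointer by each progeny's count instead of allocating per level. A sketch of that offset walk over hypothetical octant counts:

#include <stdio.h>

struct cell_buff {
  double x[3];
  int ind;
};

int main(void) {
  /* Hypothetical per-octant particle counts after cell_split() has
     bucket-sorted the buffer. */
  const int counts[8] = {3, 0, 2, 1, 0, 4, 0, 2};
  struct cell_buff buff[12];

  /* Each progeny k receives the contiguous slice starting at the running
     offset, exactly what `progeny_buff += c->progeny[k]->count` does. */
  struct cell_buff *progeny_buff = buff;
  for (int k = 0; k < 8; k++) {
    printf("progeny %d: %d entries at offset %td\n", k, counts[k],
           progeny_buff - buff);
    progeny_buff += counts[k];
  }
  return 0;
}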
@@ -1507,15 +1547,18 @@ void space_split_recursive(struct space *s, struct cell *c, int *buff) {
     }

     /* Split the cell data. */
-    cell_split(c, c->parts - s->parts, buff);
+    cell_split(c, c->parts - s->parts, buff, gbuff);

     /* Remove any progeny with zero parts. */
+    struct cell_buff *progeny_buff = buff, *progeny_gbuff = gbuff;
     for (int k = 0; k < 8; k++)
       if (c->progeny[k]->count == 0 && c->progeny[k]->gcount == 0) {
         space_recycle(s, c->progeny[k]);
         c->progeny[k] = NULL;
       } else {
-        space_split_recursive(s, c->progeny[k], buff);
+        space_split_recursive(s, c->progeny[k], progeny_buff, progeny_gbuff);
+        progeny_buff += c->progeny[k]->count;
+        progeny_gbuff += c->progeny[k]->gcount;
         h_max = max(h_max, c->progeny[k]->h_max);
         ti_end_min = min(ti_end_min, c->progeny[k]->ti_end_min);
         ti_end_max = max(ti_end_max, c->progeny[k]->ti_end_max);
@@ -1574,7 +1617,10 @@ void space_split_recursive(struct space *s, struct cell *c, int *buff) {
   c->owner = 0; /* Ok, there is really nothing on this rank... */

   /* Clean up. */
-  if (allocate_buffer) free(buff);
+  if (allocate_buffer) {
+    if (buff != NULL) free(buff);
+    if (gbuff != NULL) free(gbuff);
+  }
 }

 /**
@@ -1593,7 +1639,7 @@ void space_split_mapper(void *map_data, int num_cells, void *extra_data) {

   for (int ind = 0; ind < num_cells; ind++) {
     struct cell *c = &cells_top[ind];
-    space_split_recursive(s, c, NULL);
+    space_split_recursive(s, c, NULL, NULL);
   }

 #ifdef SWIFT_DEBUG_CHECKS
@@ -1601,30 +1647,28 @@ void space_split_mapper(void *map_data, int num_cells, void *extra_data) {
   for (int ind = 0; ind < num_cells; ind++) {
     int depth = 0;
     if (!checkCellhdxmax(&cells_top[ind], &depth))
-      message(" at cell depth %d", depth);
+      error(" at cell depth %d", depth);
   }
 #endif
 }

 /**
- * @brief Return a used cell to the buffer od unused sub-cells.
+ * @brief Return a used cell to the buffer of unused sub-cells.
  *
  * @param s The #space.
  * @param c The #cell.
  */
 void space_recycle(struct space *s, struct cell *c) {

-  /* Lock the space. */
-  lock_lock(&s->lock);
-
   /* Clear the cell. */
-  if (lock_destroy(&c->lock) != 0) error("Failed to destroy spinlock.");
+  if (lock_destroy(&c->lock) != 0 || lock_destroy(&c->glock) != 0)
+    error("Failed to destroy spinlock.");

   /* Clear this cell's sort arrays. */
   if (c->sort != NULL) free(c->sort);

-  /* Clear the cell data. */
-  bzero(c, sizeof(struct cell));
+  /* Lock the space. */
+  lock_lock(&s->lock);

   /* Hook this cell into the buffer. */
   c->next = s->cells_sub;
@@ -1634,6 +1678,47 @@ void space_recycle(struct space *s, struct cell *c) {
   /* Unlock the space. */
   lock_unlock_blind(&s->lock);
 }
+
+/**
+ * @brief Return a list of used cells to the buffer of unused sub-cells.
+ *
+ * @param s The #space.
+ * @param list_begin Pointer to the first #cell in the linked list of
+ *        cells joined by their @c next pointers.
+ * @param list_end Pointer to the last #cell in the linked list of
+ *        cells joined by their @c next pointers. It is assumed that this
+ *        cell's @c next pointer is @c NULL.
+ */
+void space_recycle_list(struct space *s, struct cell *list_begin,
+                        struct cell *list_end) {
+
+  int count = 0;
+
+  /* Clean up the list of cells. */
+  for (struct cell *c = list_begin; c != NULL; c = c->next) {
+    /* Clear the cell. */
+    if (lock_destroy(&c->lock) != 0 || lock_destroy(&c->glock) != 0)
+      error("Failed to destroy spinlock.");
+
+    /* Clear this cell's sort arrays. */
+    if (c->sort != NULL) free(c->sort);
+
+    /* Count this cell. */
+    count += 1;
+  }
+
+  /* Lock the space. */
+  lock_lock(&s->lock);
+
+  /* Hook the cells into the buffer. */
+  list_end->next = s->cells_sub;
+  s->cells_sub = list_begin;
+  s->tot_cells -= count;
+
+  /* Unlock the space. */
+  lock_unlock_blind(&s->lock);
+}
+
 /**
  * @brief Get a new empty (sub-)#cell.
  *
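space_recycle_list() exists so that the per-cell cleanup runs outside the space lock and the pre-linked chain is then spliced onto s->cells_sub with a single pointer update. A stand-alone sketch of the O(1) splice (the node struct and free list are illustrative):

#include <stdio.h>

struct node {
  int id;
  struct node *next;
};

/* Splice a pre-linked chain [begin..end] onto the head of a free list in
   O(1): one pointer update, regardless of chain length. This is the body
   of space_recycle_list() once the per-cell cleanup is done. */
static void splice_list(struct node **free_list, struct node *begin,
                        struct node *end) {
  end->next = *free_list;
  *free_list = begin;
}

int main(void) {
  struct node n2 = {2, NULL}, n1 = {1, &n2}, n0 = {0, &n1};
  struct node *free_list = NULL;

  splice_list(&free_list, &n0, &n2);
  for (struct node *n = free_list; n != NULL; n = n->next)
    printf("recycled node %d\n", n->id);
  return 0;
}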
@@ -1653,9 +1738,6 @@ struct cell *space_getcell(struct space *s) {
                        space_cellallocchunk * sizeof(struct cell)) != 0)
       error("Failed to allocate more cells.");

-    /* Zero everything for good measure */
-    bzero(s->cells_sub, space_cellallocchunk * sizeof(struct cell));
-
     /* Constructed a linked list */
     for (int k = 0; k < space_cellallocchunk - 1; k++)
       s->cells_sub[k].next = &s->cells_sub[k + 1];
diff --git a/src/space.h b/src/space.h
index 36699d1d5091cac4dbd98e3ceeae7547c1012aed..1103cc4867c0ff7ded700331a1a4ff540233e8a7 100644
--- a/src/space.h
+++ b/src/space.h
@@ -178,6 +178,8 @@ void space_gparts_sort_mapper(void *map_data, int num_elements,
                               void *extra_data);
 void space_rebuild(struct space *s, int verbose);
 void space_recycle(struct space *s, struct cell *c);
+void space_recycle_list(struct space *s, struct cell *list_begin,
+                        struct cell *list_end);
 void space_split(struct space *s, struct cell *cells, int nr_cells,
                  int verbose);
 void space_split_mapper(void *map_data, int num_elements, void *extra_data);
diff --git a/src/threadpool.c b/src/threadpool.c
index 6bc887d96cb72804f0fbc8e2801a6522bf27f947..c11fd8121bb02f36fce1796d79a7eb55a38102c4 100644
--- a/src/threadpool.c
+++ b/src/threadpool.c
@@ -91,6 +91,10 @@ void threadpool_init(struct threadpool *tp, int num_threads) {
   tp->num_threads = num_threads;
   tp->num_threads_waiting = 0;

+  /* If there is only a single thread, stop here: the work will simply be
+     done in the (blocking) calling thread. */
+  if (num_threads == 1) return;
+
   /* Init the threadpool mutexes. */
   if (pthread_mutex_init(&tp->thread_mutex, NULL) != 0)
     error("Failed to initialize mutexex.");
@@ -144,6 +148,12 @@ void threadpool_map(struct threadpool *tp, threadpool_map_function map_function,
                     void *map_data, size_t N, int stride, int chunk,
                     void *extra_data) {

+  /* If we just have a single thread, call the map function directly. */
+  if (tp->num_threads == 1) {
+    map_function(map_data, N, extra_data);
+    return;
+  }
+
   /* Set the map data and signal the threads. */
   pthread_mutex_lock(&tp->thread_mutex);
   tp->map_data_stride = stride;
diff --git a/tests/testInteractions.c b/tests/testInteractions.c
index d14c840ec77819bbef5750b897c72139f4d7b2b4..4faebbbdb35c4bc36cca16cb207517f05629f8c1 100644
--- a/tests/testInteractions.c
+++ b/tests/testInteractions.c
@@ -150,6 +150,9 @@ void write_header(char *fileName) {
  *
  * @param parts Particle array to be interacted
  * @param count No. of particles to be interacted
+ * @param serial_inter_func Function pointer to serial interaction function
+ * @param vec_inter_func Function pointer to vectorised interaction function
+ * @param filePrefix Prefix of output files
  *
  */
 void test_interactions(struct part *parts, int count,
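The new test_interactions() parameters use a common testing pattern: pass the serial and vectorised kernels as function pointers so one driver can run and compare both. A stand-alone sketch of that pattern; the kernel below is a placeholder, not SWIFT's SPH kernel:

#include <stdio.h>

/* Both kernels share one signature so the test driver can swap them. */
typedef float (*inter_func)(float r2);

/* Placeholder "serial" kernel, a stand-in for the real interaction. */
static float serial_inter(float r2) { return 1.f / (r2 + 1.f); }

/* Placeholder "vectorised" kernel; a real one would process particles in
   SIMD batches and should agree with the serial result to roundoff. */
static float vec_inter(float r2) { return 1.f / (r2 + 1.f); }

/* Driver comparing the two implementations, in the spirit of
   test_interactions(parts, count, serial_inter_func, vec_inter_func, ...). */
static int compare_kernels(inter_func serial, inter_func vec, int count) {
  for (int k = 0; k < count; k++) {
    const float r2 = 0.1f * (float)k;
    const float ds = serial(r2), dv = vec(r2);
    if (ds != dv) {
      printf("mismatch at k=%d: %g vs %g\n", k, ds, dv);
      return 1;
    }
  }
  printf("serial and vectorised kernels agree on %d inputs\n", count);
  return 0;
}

int main(void) { return compare_kernels(serial_inter, vec_inter, 10); }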