From 6d749362921bec449d891a8e00456521d1f21279 Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Sun, 6 Nov 2016 22:03:56 +0100 Subject: [PATCH 01/23] only count the parts per cell after the particles have been sorted. --- src/space.c | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/src/space.c b/src/space.c index 18f0e21e8..4578006ae 100644 --- a/src/space.c +++ b/src/space.c @@ -453,20 +453,18 @@ void space_rebuild(struct space *s, int verbose) { const int ti_current = (s->e != NULL) ? s->e->ti_current : 0; /* Run through the particles and get their cell index. */ - const size_t ind_size = s->size_parts; + const size_t ind_size = s->size_parts + 1; int *ind; if ((ind = (int *)malloc(sizeof(int) * ind_size)) == NULL) error("Failed to allocate temporary particle indices."); if (ind_size > 0) space_parts_get_cell_index(s, ind, cells_top, verbose); - for (size_t i = 0; i < s->nr_parts; ++i) cells_top[ind[i]].count++; /* Run through the gravity particles and get their cell index. */ - const size_t gind_size = s->size_gparts; + const size_t gind_size = s->size_gparts + 1; int *gind; if ((gind = (int *)malloc(sizeof(int) * gind_size)) == NULL) error("Failed to allocate temporary g-particle indices."); if (gind_size > 0) space_gparts_get_cell_index(s, gind, cells_top, verbose); - for (size_t i = 0; i < s->nr_gparts; ++i) cells_top[gind[i]].gcount++; #ifdef WITH_MPI @@ -474,7 +472,6 @@ void space_rebuild(struct space *s, int verbose) { const int local_nodeID = s->e->nodeID; for (size_t k = 0; k < nr_parts;) { if (cells_top[ind[k]].nodeID != local_nodeID) { - cells_top[ind[k]].count -= 1; nr_parts -= 1; const struct part tp = s->parts[k]; s->parts[k] = s->parts[nr_parts]; @@ -514,7 +511,6 @@ void space_rebuild(struct space *s, int verbose) { /* Move non-local gparts to the end of the list. */ for (size_t k = 0; k < nr_gparts;) { if (cells_top[gind[k]].nodeID != local_nodeID) { - cells_top[gind[k]].gcount -= 1; nr_gparts -= 1; const struct gpart tp = s->gparts[k]; s->gparts[k] = s->gparts[nr_gparts]; @@ -561,9 +557,9 @@ void space_rebuild(struct space *s, int verbose) { s->nr_gparts = nr_gparts + nr_gparts_exchanged; /* Re-allocate the index array if needed.. */ - if (s->nr_parts > ind_size) { + if (s->nr_parts + 1 > ind_size) { int *ind_new; - if ((ind_new = (int *)malloc(sizeof(int) * s->nr_parts)) == NULL) + if ((ind_new = (int *)malloc(sizeof(int) * (s->nr_parts + 1))) == NULL) error("Failed to allocate temporary particle indices."); memcpy(ind_new, ind, sizeof(int) * nr_parts); free(ind); @@ -578,7 +574,6 @@ void space_rebuild(struct space *s, int verbose) { const struct part *const p = &s->parts[k]; ind[k] = cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]); - cells_top[ind[k]].count += 1; #ifdef SWIFT_DEBUG_CHECKS if (cells_top[ind[k]].nodeID != local_nodeID) error("Received part that does not belong to me (nodeID=%i).", @@ -608,15 +603,25 @@ void space_rebuild(struct space *s, int verbose) { } #endif + /* Extract the cell counts from the sorted indices. */ + size_t last_index = 0; + ind[nr_parts] = s->nr_cells; // sentinel. + for (size_t k = 0; k < nr_parts; k++) { + if (ind[k] < ind[k + 1]) { + cells_top[ind[k]].count = k - last_index + 1; + last_index = k + 1; + } + } + /* We no longer need the indices as of here. */ free(ind); #ifdef WITH_MPI /* Re-allocate the index array if needed.. */ - if (s->nr_gparts > gind_size) { + if (s->nr_gparts + 1 > gind_size) { int *gind_new; - if ((gind_new = (int *)malloc(sizeof(int) * s->nr_gparts)) == NULL) + if ((gind_new = (int *)malloc(sizeof(int) * (s->nr_gparts + 1))) == NULL) error("Failed to allocate temporary g-particle indices."); memcpy(gind_new, gind, sizeof(int) * nr_gparts); free(gind); @@ -628,7 +633,6 @@ void space_rebuild(struct space *s, int verbose) { const struct gpart *const p = &s->gparts[k]; gind[k] = cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]); - cells_top[gind[k]].gcount += 1; #ifdef SWIFT_DEBUG_CHECKS if (cells_top[ind[k]].nodeID != s->e->nodeID) @@ -647,6 +651,16 @@ void space_rebuild(struct space *s, int verbose) { if (nr_parts > 0 && nr_gparts > 0) part_relink_parts(s->gparts, nr_gparts, s->parts); + /* Extract the cell counts from the sorted indices. */ + size_t last_gindex = 0; + gind[nr_parts] = s->nr_cells; + for (size_t k = 0; k < nr_gparts; k++) { + if (gind[k] < gind[k + 1]) { + cells_top[ind[k]].gcount = k - last_gindex + 1; + last_gindex = k + 1; + } + } + /* We no longer need the indices as of here. */ free(gind); -- GitLab From c5918242bc47add9c9a484cb20c2e45a08390d00 Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Sun, 6 Nov 2016 22:09:52 +0100 Subject: [PATCH 02/23] leave some buffer in the ind and gind arrays in case we get more strays than we lose. --- src/space.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/space.c b/src/space.c index 4578006ae..e9611548b 100644 --- a/src/space.c +++ b/src/space.c @@ -453,14 +453,14 @@ void space_rebuild(struct space *s, int verbose) { const int ti_current = (s->e != NULL) ? s->e->ti_current : 0; /* Run through the particles and get their cell index. */ - const size_t ind_size = s->size_parts + 1; + const size_t ind_size = s->size_parts + 100; int *ind; if ((ind = (int *)malloc(sizeof(int) * ind_size)) == NULL) error("Failed to allocate temporary particle indices."); if (ind_size > 0) space_parts_get_cell_index(s, ind, cells_top, verbose); /* Run through the gravity particles and get their cell index. */ - const size_t gind_size = s->size_gparts + 1; + const size_t gind_size = s->size_gparts + 100; int *gind; if ((gind = (int *)malloc(sizeof(int) * gind_size)) == NULL) error("Failed to allocate temporary g-particle indices."); -- GitLab From 3bdd372aa0a340b614d85064251dea59269227e1 Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Mon, 7 Nov 2016 22:35:15 +0100 Subject: [PATCH 03/23] put clearing the cells and sub-cells into a mapper. --- src/space.c | 83 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 34 deletions(-) diff --git a/src/space.c b/src/space.c index e9611548b..630c576ce 100644 --- a/src/space.c +++ b/src/space.c @@ -184,6 +184,49 @@ void space_rebuild_recycle(struct space *s, struct cell *c) { } } +/** + * @brief Mapper function to clean out the cell hierarchies for a regrid. + */ + +void space_clear_cells_mapper(void *map_data, int num_elements, void *extra_data) { + + struct cell *cells = (struct cell *)map_data; + struct space *s = (struct space *)extra_data; + + for (int k = 0; k < num_elements; k++) { + struct cell *c = &cells[k]; + space_rebuild_recycle(s, c); + c->sorts = NULL; + c->nr_tasks = 0; + c->density = NULL; + c->gradient = NULL; + c->force = NULL; + c->grav = NULL; + c->dx_max = 0.0f; + c->sorted = 0; + c->count = 0; + c->gcount = 0; + c->init = NULL; + c->extra_ghost = NULL; + c->ghost = NULL; + c->kick = NULL; + c->cooling = NULL; + c->sourceterms = NULL; + c->super = c; +#if WITH_MPI + c->recv_xv = NULL; + c->recv_rho = NULL; + c->recv_gradient = NULL; + c->recv_ti = NULL; + + c->send_xv = NULL; + c->send_rho = NULL; + c->send_gradient = NULL; + c->send_ti = NULL; +#endif + } +} + /** * @brief Re-build the top-level cell grid. * @@ -381,7 +424,7 @@ void space_regrid(struct space *s, int verbose) { /* Finished with these. */ free(oldnodeIDs); } -#endif +#endif /* WITH_MPI */ // message( "rebuilding upper-level cells took %.3f %s." , // clocks_from_ticks(double)(getticks() - tic), clocks_getunit()); @@ -391,37 +434,9 @@ void space_regrid(struct space *s, int verbose) { else { /* Otherwise, just clean up the cells. */ /* Free the old cells, if they were allocated. */ - for (int k = 0; k < s->nr_cells; k++) { - space_rebuild_recycle(s, &s->cells_top[k]); - s->cells_top[k].sorts = NULL; - s->cells_top[k].nr_tasks = 0; - s->cells_top[k].density = NULL; - s->cells_top[k].gradient = NULL; - s->cells_top[k].force = NULL; - s->cells_top[k].grav = NULL; - s->cells_top[k].dx_max = 0.0f; - s->cells_top[k].sorted = 0; - s->cells_top[k].count = 0; - s->cells_top[k].gcount = 0; - s->cells_top[k].init = NULL; - s->cells_top[k].extra_ghost = NULL; - s->cells_top[k].ghost = NULL; - s->cells_top[k].kick = NULL; - s->cells_top[k].cooling = NULL; - s->cells_top[k].sourceterms = NULL; - s->cells_top[k].super = &s->cells_top[k]; -#if WITH_MPI - s->cells_top[k].recv_xv = NULL; - s->cells_top[k].recv_rho = NULL; - s->cells_top[k].recv_gradient = NULL; - s->cells_top[k].recv_ti = NULL; - - s->cells_top[k].send_xv = NULL; - s->cells_top[k].send_rho = NULL; - s->cells_top[k].send_gradient = NULL; - s->cells_top[k].send_ti = NULL; -#endif - } + threadpool_map(&s->e->threadpool, space_clear_cells_mapper, + s->cells_top, s->nr_cells, sizeof(struct cell), 100, s); + // space_clear_cells_mapper(s->cells_top, s->nr_cells, s); s->maxdepth = 0; } @@ -457,14 +472,14 @@ void space_rebuild(struct space *s, int verbose) { int *ind; if ((ind = (int *)malloc(sizeof(int) * ind_size)) == NULL) error("Failed to allocate temporary particle indices."); - if (ind_size > 0) space_parts_get_cell_index(s, ind, cells_top, verbose); + if (s->size_parts > 0) space_parts_get_cell_index(s, ind, cells_top, verbose); /* Run through the gravity particles and get their cell index. */ const size_t gind_size = s->size_gparts + 100; int *gind; if ((gind = (int *)malloc(sizeof(int) * gind_size)) == NULL) error("Failed to allocate temporary g-particle indices."); - if (gind_size > 0) space_gparts_get_cell_index(s, gind, cells_top, verbose); + if (s->size_gparts > 0) space_gparts_get_cell_index(s, gind, cells_top, verbose); #ifdef WITH_MPI -- GitLab From fcbab9ea15f3da4a9786a0e737e21697222d9d2e Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Mon, 7 Nov 2016 23:06:37 +0100 Subject: [PATCH 04/23] make space_recycle non-blocking and don't clear the cells before returning them. --- src/space.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/space.c b/src/space.c index 630c576ce..645dda184 100644 --- a/src/space.c +++ b/src/space.c @@ -1602,25 +1602,15 @@ void space_split_mapper(void *map_data, int num_cells, void *extra_data) { */ void space_recycle(struct space *s, struct cell *c) { - /* Lock the space. */ - lock_lock(&s->lock); - /* Clear the cell. */ if (lock_destroy(&c->lock) != 0) error("Failed to destroy spinlock."); /* Clear this cell's sort arrays. */ if (c->sort != NULL) free(c->sort); - /* Clear the cell data. */ - bzero(c, sizeof(struct cell)); - /* Hook this cell into the buffer. */ - c->next = s->cells_sub; - s->cells_sub = c; - s->tot_cells -= 1; - - /* Unlock the space. */ - lock_unlock_blind(&s->lock); + c->next = atomic_swap(&s->cells_sub, c); + atomic_dec(&s->tot_cells); } /** -- GitLab From 4debd3c46939741cec6e66410b504d8c975fa31d Mon Sep 17 00:00:00 2001 From: Matthieu Schaller Date: Tue, 8 Nov 2016 12:09:39 +0000 Subject: [PATCH 05/23] Only sort particles in top-level cells if you have particles --- src/space.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/space.c b/src/space.c index 645dda184..f88e2450d 100644 --- a/src/space.c +++ b/src/space.c @@ -600,7 +600,7 @@ void space_rebuild(struct space *s, int verbose) { #endif /* WITH_MPI */ /* Sort the parts according to their cells. */ - space_parts_sort(s, ind, nr_parts, 0, s->nr_cells - 1, verbose); + if(nr_parts > 0) space_parts_sort(s, ind, nr_parts, 0, s->nr_cells - 1, verbose); /* Re-link the gparts. */ if (nr_parts > 0 && nr_gparts > 0) part_relink_gparts(s->parts, nr_parts, 0); @@ -660,7 +660,7 @@ void space_rebuild(struct space *s, int verbose) { #endif /* Sort the gparts according to their cells. */ - space_gparts_sort(s, gind, nr_gparts, 0, s->nr_cells - 1, verbose); + if(nr_gparts > 0) space_gparts_sort(s, gind, nr_gparts, 0, s->nr_cells - 1, verbose); /* Re-link the parts. */ if (nr_parts > 0 && nr_gparts > 0) -- GitLab From 61295a905c16902f9e06f24762312a8691aadc21 Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Tue, 8 Nov 2016 22:25:55 +0100 Subject: [PATCH 06/23] formatting. --- src/space.c | 74 +++++++++++++++++++++++++++-------------------------- 1 file changed, 38 insertions(+), 36 deletions(-) diff --git a/src/space.c b/src/space.c index 645dda184..598e8b11b 100644 --- a/src/space.c +++ b/src/space.c @@ -187,42 +187,43 @@ void space_rebuild_recycle(struct space *s, struct cell *c) { /** * @brief Mapper function to clean out the cell hierarchies for a regrid. */ - -void space_clear_cells_mapper(void *map_data, int num_elements, void *extra_data) { - + +void space_clear_cells_mapper(void *map_data, int num_elements, + void *extra_data) { + struct cell *cells = (struct cell *)map_data; struct space *s = (struct space *)extra_data; - + for (int k = 0; k < num_elements; k++) { struct cell *c = &cells[k]; - space_rebuild_recycle(s, c); - c->sorts = NULL; - c->nr_tasks = 0; - c->density = NULL; - c->gradient = NULL; - c->force = NULL; - c->grav = NULL; - c->dx_max = 0.0f; - c->sorted = 0; - c->count = 0; - c->gcount = 0; - c->init = NULL; - c->extra_ghost = NULL; - c->ghost = NULL; - c->kick = NULL; - c->cooling = NULL; - c->sourceterms = NULL; - c->super = c; + space_rebuild_recycle(s, c); + c->sorts = NULL; + c->nr_tasks = 0; + c->density = NULL; + c->gradient = NULL; + c->force = NULL; + c->grav = NULL; + c->dx_max = 0.0f; + c->sorted = 0; + c->count = 0; + c->gcount = 0; + c->init = NULL; + c->extra_ghost = NULL; + c->ghost = NULL; + c->kick = NULL; + c->cooling = NULL; + c->sourceterms = NULL; + c->super = c; #if WITH_MPI - c->recv_xv = NULL; - c->recv_rho = NULL; - c->recv_gradient = NULL; - c->recv_ti = NULL; - - c->send_xv = NULL; - c->send_rho = NULL; - c->send_gradient = NULL; - c->send_ti = NULL; + c->recv_xv = NULL; + c->recv_rho = NULL; + c->recv_gradient = NULL; + c->recv_ti = NULL; + + c->send_xv = NULL; + c->send_rho = NULL; + c->send_gradient = NULL; + c->send_ti = NULL; #endif } } @@ -424,7 +425,7 @@ void space_regrid(struct space *s, int verbose) { /* Finished with these. */ free(oldnodeIDs); } -#endif /* WITH_MPI */ +#endif /* WITH_MPI */ // message( "rebuilding upper-level cells took %.3f %s." , // clocks_from_ticks(double)(getticks() - tic), clocks_getunit()); @@ -434,8 +435,8 @@ void space_regrid(struct space *s, int verbose) { else { /* Otherwise, just clean up the cells. */ /* Free the old cells, if they were allocated. */ - threadpool_map(&s->e->threadpool, space_clear_cells_mapper, - s->cells_top, s->nr_cells, sizeof(struct cell), 100, s); + threadpool_map(&s->e->threadpool, space_clear_cells_mapper, s->cells_top, + s->nr_cells, sizeof(struct cell), 100, s); // space_clear_cells_mapper(s->cells_top, s->nr_cells, s); s->maxdepth = 0; } @@ -479,7 +480,8 @@ void space_rebuild(struct space *s, int verbose) { int *gind; if ((gind = (int *)malloc(sizeof(int) * gind_size)) == NULL) error("Failed to allocate temporary g-particle indices."); - if (s->size_gparts > 0) space_gparts_get_cell_index(s, gind, cells_top, verbose); + if (s->size_gparts > 0) + space_gparts_get_cell_index(s, gind, cells_top, verbose); #ifdef WITH_MPI @@ -1526,7 +1528,7 @@ void space_split_mapper(void *map_data, int num_cells, void *extra_data) { } /* Split the cell data. */ - cell_split(c, c->parts - s->parts); + cell_split(c, c->parts - s->parts, NULL); /* Remove any progeny with zero parts. */ for (int k = 0; k < 8; k++) -- GitLab From 4f696e9bc2412d4a07c39527121f62481a640245 Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Tue, 8 Nov 2016 22:46:12 +0100 Subject: [PATCH 07/23] use a one-pass bucket sort to split cells. --- src/cell.c | 249 +++++++++++++++++++---------------------------- src/cell.h | 2 +- src/space.c | 272 ++++++++++++++++++++++++++++------------------------ 3 files changed, 246 insertions(+), 277 deletions(-) diff --git a/src/cell.c b/src/cell.c index 573272d05..60bd3e799 100644 --- a/src/cell.c +++ b/src/cell.c @@ -52,6 +52,7 @@ #include "gravity.h" #include "hydro.h" #include "hydro_properties.h" +#include "minmax.h" #include "scheduler.h" #include "space.h" #include "timers.h" @@ -462,122 +463,79 @@ void cell_gunlocktree(struct cell *c) { * @param c The #cell array to be sorted. * @param parts_offset Offset of the cell parts array relative to the * space's parts array, i.e. c->parts - s->parts. + * @param buff A buffer with at least max(c->count, c->gcount) entries, + * used for sorting indices. */ -void cell_split(struct cell *c, ptrdiff_t parts_offset) { +void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) { - int i, j; const int count = c->count, gcount = c->gcount; struct part *parts = c->parts; struct xpart *xparts = c->xparts; struct gpart *gparts = c->gparts; - int left[8], right[8]; - double pivot[3]; - - /* Init the pivots. */ - for (int k = 0; k < 3; k++) pivot[k] = c->loc[k] + c->width[k] / 2; - - /* Split along the x-axis. */ - i = 0; - j = count - 1; - while (i <= j) { - while (i <= count - 1 && parts[i].x[0] <= pivot[0]) i += 1; - while (j >= 0 && parts[j].x[0] > pivot[0]) j -= 1; - if (i < j) { - struct part temp = parts[i]; - parts[i] = parts[j]; - parts[j] = temp; - struct xpart xtemp = xparts[i]; - xparts[i] = xparts[j]; - xparts[j] = xtemp; - } + const double pivot[3] = {c->loc[0] + c->width[0] / 2, + c->loc[1] + c->width[1] / 2, + c->loc[2] + c->width[2] / 2}; + int bucket_count[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + int bucket_offset[9]; + + /* If the buff is NULL, allocate it, and remember to free it. */ + const int allocate_buffer = (buff == NULL); + if (allocate_buffer && + (buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL) + error("Failed to allocate temporary indices."); + + /* Fill the buffer with the indices. */ + for (int k = 0; k < count; k++) { + const int bid = (parts[k].x[0] > pivot[0]) * 4 + + (parts[k].x[1] > pivot[1]) * 2 + (parts[k].x[2] > pivot[2]); + bucket_count[bid]++; + buff[k] = bid; } -#ifdef SWIFT_DEBUG_CHECKS - for (int k = 0; k <= j; k++) - if (parts[k].x[0] > pivot[0]) error("cell_split: sorting failed."); - for (int k = i; k < count; k++) - if (parts[k].x[0] < pivot[0]) error("cell_split: sorting failed."); -#endif - - left[1] = i; - right[1] = count - 1; - left[0] = 0; - right[0] = j; - - /* Split along the y axis, twice. */ - for (int k = 1; k >= 0; k--) { - i = left[k]; - j = right[k]; - while (i <= j) { - while (i <= right[k] && parts[i].x[1] <= pivot[1]) i += 1; - while (j >= left[k] && parts[j].x[1] > pivot[1]) j -= 1; - if (i < j) { - struct part temp = parts[i]; - parts[i] = parts[j]; - parts[j] = temp; - struct xpart xtemp = xparts[i]; - xparts[i] = xparts[j]; - xparts[j] = xtemp; - } - } - -#ifdef SWIFT_DEBUG_CHECKS - for (int kk = left[k]; kk <= j; kk++) - if (parts[kk].x[1] > pivot[1]) { - message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j); - error("sorting failed (left)."); - } - for (int kk = i; kk <= right[k]; kk++) - if (parts[kk].x[1] < pivot[1]) error("sorting failed (right)."); -#endif - - left[2 * k + 1] = i; - right[2 * k + 1] = right[k]; - left[2 * k] = left[k]; - right[2 * k] = j; + /* Set the buffer offsets. */ + bucket_offset[0] = 0; + for (int k = 1; k <= 8; k++) { + bucket_offset[k] = bucket_offset[k - 1] + bucket_count[k - 1]; + bucket_count[k - 1] = 0; } - /* Split along the z axis, four times. */ - for (int k = 3; k >= 0; k--) { - i = left[k]; - j = right[k]; - while (i <= j) { - while (i <= right[k] && parts[i].x[2] <= pivot[2]) i += 1; - while (j >= left[k] && parts[j].x[2] > pivot[2]) j -= 1; - if (i < j) { - struct part temp = parts[i]; - parts[i] = parts[j]; - parts[j] = temp; - struct xpart xtemp = xparts[i]; - xparts[i] = xparts[j]; - xparts[j] = xtemp; + /* Run through the buckets, and swap particles to their correct spot. */ + for (int bucket = 0; bucket < 8; bucket++) { + for (int k = bucket_offset[bucket] + bucket_count[bucket]; + k < bucket_offset[bucket + 1]; k++) { + int bid = buff[k]; + if (bid != bucket) { + struct part part = parts[k]; + struct xpart xpart = xparts[k]; + while (bid != bucket) { + int j = bucket_offset[bid] + bucket_count[bid]++; + while (buff[j] == bid) { + j++; + bucket_count[bid]++; + } + struct part temp_part = parts[j]; + struct xpart temp_xpart = xparts[j]; + int temp_buff = buff[j]; + parts[j] = part; + xparts[j] = xpart; + buff[j] = bid; + part = temp_part; + xpart = temp_xpart; + bid = temp_buff; + } + parts[k] = part; + xparts[k] = xpart; + buff[k] = bid; } + bucket_count[bid]++; } - -#ifdef SWIFT_DEBUG_CHECKS - for (int kk = left[k]; kk <= j; kk++) - if (parts[kk].x[2] > pivot[2]) { - message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j); - error("sorting failed (left)."); - } - for (int kk = i; kk <= right[k]; kk++) - if (parts[kk].x[2] < pivot[2]) { - message("ival=[%i,%i], i=%i, j=%i.", left[k], right[k], i, j); - error("sorting failed (right)."); - } -#endif - - left[2 * k + 1] = i; - right[2 * k + 1] = right[k]; - left[2 * k] = left[k]; - right[2 * k] = j; } /* Store the counts and offsets. */ for (int k = 0; k < 8; k++) { - c->progeny[k]->count = right[k] - left[k] + 1; - c->progeny[k]->parts = &c->parts[left[k]]; - c->progeny[k]->xparts = &c->xparts[left[k]]; + c->progeny[k]->count = bucket_count[k]; + c->progeny[k]->parts = &c->parts[bucket_offset[k]]; + c->progeny[k]->xparts = &c->xparts[bucket_offset[k]]; } /* Re-link the gparts. */ @@ -613,66 +571,55 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset) { #endif /* Now do the same song and dance for the gparts. */ - - /* Split along the x-axis. */ - i = 0; - j = gcount - 1; - while (i <= j) { - while (i <= gcount - 1 && gparts[i].x[0] <= pivot[0]) i += 1; - while (j >= 0 && gparts[j].x[0] > pivot[0]) j -= 1; - if (i < j) { - struct gpart gtemp = gparts[i]; - gparts[i] = gparts[j]; - gparts[j] = gtemp; - } + for (int k = 0; k < 8; k++) bucket_count[k] = 0; + + /* Fill the buffer with the indices. */ + for (int k = 0; k < gcount; k++) { + const int bid = (gparts[k].x[0] > pivot[0]) * 4 + + (gparts[k].x[1] > pivot[1]) * 2 + + (gparts[k].x[2] > pivot[2]); + bucket_count[bid]++; + buff[k] = bid; } - left[1] = i; - right[1] = gcount - 1; - left[0] = 0; - right[0] = j; - - /* Split along the y axis, twice. */ - for (int k = 1; k >= 0; k--) { - i = left[k]; - j = right[k]; - while (i <= j) { - while (i <= right[k] && gparts[i].x[1] <= pivot[1]) i += 1; - while (j >= left[k] && gparts[j].x[1] > pivot[1]) j -= 1; - if (i < j) { - struct gpart gtemp = gparts[i]; - gparts[i] = gparts[j]; - gparts[j] = gtemp; - } - } - left[2 * k + 1] = i; - right[2 * k + 1] = right[k]; - left[2 * k] = left[k]; - right[2 * k] = j; + + /* Set the buffer offsets. */ + bucket_offset[0] = 0; + for (int k = 1; k <= 8; k++) { + bucket_offset[k] = bucket_offset[k - 1] + bucket_count[k - 1]; + bucket_count[k - 1] = 0; } - /* Split along the z axis, four times. */ - for (int k = 3; k >= 0; k--) { - i = left[k]; - j = right[k]; - while (i <= j) { - while (i <= right[k] && gparts[i].x[2] <= pivot[2]) i += 1; - while (j >= left[k] && gparts[j].x[2] > pivot[2]) j -= 1; - if (i < j) { - struct gpart gtemp = gparts[i]; - gparts[i] = gparts[j]; - gparts[j] = gtemp; + /* Run through the buckets, and swap particles to their correct spot. */ + for (int bucket = 0; bucket < 8; bucket++) { + for (int k = bucket_offset[bucket] + bucket_count[bucket]; + k < bucket_offset[bucket + 1]; k++) { + int bid = buff[k]; + if (bid != bucket) { + struct gpart gpart = gparts[k]; + while (bid != bucket) { + int j = bucket_offset[bid] + bucket_count[bid]++; + while (buff[j] == bid) { + j++; + bucket_count[bid]++; + } + struct gpart temp_gpart = gparts[j]; + int temp_buff = buff[j]; + gparts[j] = gpart; + buff[j] = bid; + gpart = temp_gpart; + bid = temp_buff; + } + gparts[k] = gpart; + buff[k] = bid; } + bucket_count[bid]++; } - left[2 * k + 1] = i; - right[2 * k + 1] = right[k]; - left[2 * k] = left[k]; - right[2 * k] = j; } /* Store the counts and offsets. */ for (int k = 0; k < 8; k++) { - c->progeny[k]->gcount = right[k] - left[k] + 1; - c->progeny[k]->gparts = &c->gparts[left[k]]; + c->progeny[k]->gcount = bucket_count[k]; + c->progeny[k]->gparts = &c->gparts[bucket_offset[k]]; } /* Re-link the parts. */ diff --git a/src/cell.h b/src/cell.h index 9e5bed091..d3df097f2 100644 --- a/src/cell.h +++ b/src/cell.h @@ -271,7 +271,7 @@ struct cell { ((int)(k) + (cdim)[2] * ((int)(j) + (cdim)[1] * (int)(i))) /* Function prototypes. */ -void cell_split(struct cell *c, ptrdiff_t parts_offset); +void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff); void cell_sanitize(struct cell *c); int cell_locktree(struct cell *c); void cell_unlocktree(struct cell *c); diff --git a/src/space.c b/src/space.c index 8fc2e3004..2f53223a6 100644 --- a/src/space.c +++ b/src/space.c @@ -602,7 +602,8 @@ void space_rebuild(struct space *s, int verbose) { #endif /* WITH_MPI */ /* Sort the parts according to their cells. */ - if(nr_parts > 0) space_parts_sort(s, ind, nr_parts, 0, s->nr_cells - 1, verbose); + if (nr_parts > 0) + space_parts_sort(s, ind, nr_parts, 0, s->nr_cells - 1, verbose); /* Re-link the gparts. */ if (nr_parts > 0 && nr_gparts > 0) part_relink_gparts(s->parts, nr_parts, 0); @@ -662,7 +663,8 @@ void space_rebuild(struct space *s, int verbose) { #endif /* Sort the gparts according to their cells. */ - if(nr_gparts > 0) space_gparts_sort(s, gind, nr_gparts, 0, s->nr_cells - 1, verbose); + if (nr_gparts > 0) + space_gparts_sort(s, gind, nr_gparts, 0, s->nr_cells - 1, verbose); /* Re-link the parts. */ if (nr_parts > 0 && nr_gparts > 0) @@ -1455,144 +1457,164 @@ void space_map_cells_pre(struct space *s, int full, } /** - * @brief #threadpool mapper function to split cells if they contain - * too many particles. + * @brief Recursively split a cell. * - * @param map_data Pointer towards the top-cells. - * @param num_cells The number of cells to treat. - * @param extra_data Pointers to the #space. + * @param s The #space in which the cell lives. + * @param c The #cell to split recursively. + * @param buff A buffer for particle sorting, should be of size at least + * max(c->count, c->gount) or @c NULL. */ -void space_split_mapper(void *map_data, int num_cells, void *extra_data) { - - /* Unpack the inputs. */ - struct space *s = (struct space *)extra_data; - struct cell *restrict cells_top = (struct cell *)map_data; +void space_split_recursive(struct space *s, struct cell *c, int *buff) { + + const int count = c->count; + const int gcount = c->gcount; + const int depth = c->depth; + int maxdepth = 0; + float h_max = 0.0f; + int ti_end_min = max_nr_timesteps, ti_end_max = 0; + struct cell *temp; + struct part *parts = c->parts; + struct gpart *gparts = c->gparts; + struct xpart *xparts = c->xparts; struct engine *e = s->e; - for (int ind = 0; ind < num_cells; ind++) { + /* If the buff is NULL, allocate it, and remember to free it. */ + const int allocate_buffer = (buff == NULL); + if (allocate_buffer && + (buff = (int *)malloc(sizeof(int) * max(count, gcount))) == NULL) + error("Failed to allocate temporary indices."); - struct cell *c = &cells_top[ind]; + /* Check the depth. */ + while (depth > (maxdepth = s->maxdepth)) { + atomic_cas(&s->maxdepth, maxdepth, depth); + } - const int count = c->count; - const int gcount = c->gcount; - const int depth = c->depth; - int maxdepth = 0; - float h_max = 0.0f; - int ti_end_min = max_nr_timesteps, ti_end_max = 0; - struct cell *temp; - struct part *parts = c->parts; - struct gpart *gparts = c->gparts; - struct xpart *xparts = c->xparts; - - /* Check the depth. */ - while (depth > (maxdepth = s->maxdepth)) { - atomic_cas(&s->maxdepth, maxdepth, depth); - } + /* If the depth is too large, we have a problem and should stop. */ + if (maxdepth > space_cell_maxdepth) { + error("Exceeded maximum depth (%d) when splitting cells, aborting", + space_cell_maxdepth); + } - /* If the depth is too large, we have a problem and should stop. */ - if (maxdepth > space_cell_maxdepth) { - error("Exceeded maximum depth (%d) when splitting cells, aborting", - space_cell_maxdepth); + /* Split or let it be? */ + if (count > space_splitsize || gcount > space_splitsize) { + + /* No longer just a leaf. */ + c->split = 1; + + /* Create the cell's progeny. */ + for (int k = 0; k < 8; k++) { + temp = space_getcell(s); + temp->count = 0; + temp->gcount = 0; + temp->ti_old = e->ti_current; + temp->loc[0] = c->loc[0]; + temp->loc[1] = c->loc[1]; + temp->loc[2] = c->loc[2]; + temp->width[0] = c->width[0] / 2; + temp->width[1] = c->width[1] / 2; + temp->width[2] = c->width[2] / 2; + temp->dmin = c->dmin / 2; + if (k & 4) temp->loc[0] += temp->width[0]; + if (k & 2) temp->loc[1] += temp->width[1]; + if (k & 1) temp->loc[2] += temp->width[2]; + temp->depth = c->depth + 1; + temp->split = 0; + temp->h_max = 0.0; + temp->dx_max = 0.f; + temp->nodeID = c->nodeID; + temp->parent = c; + temp->super = NULL; + c->progeny[k] = temp; } - /* Split or let it be? */ - if (count > space_splitsize || gcount > space_splitsize) { - - /* No longer just a leaf. */ - c->split = 1; - - /* Create the cell's progeny. */ - for (int k = 0; k < 8; k++) { - temp = space_getcell(s); - temp->count = 0; - temp->gcount = 0; - temp->ti_old = e->ti_current; - temp->loc[0] = c->loc[0]; - temp->loc[1] = c->loc[1]; - temp->loc[2] = c->loc[2]; - temp->width[0] = c->width[0] / 2; - temp->width[1] = c->width[1] / 2; - temp->width[2] = c->width[2] / 2; - temp->dmin = c->dmin / 2; - if (k & 4) temp->loc[0] += temp->width[0]; - if (k & 2) temp->loc[1] += temp->width[1]; - if (k & 1) temp->loc[2] += temp->width[2]; - temp->depth = c->depth + 1; - temp->split = 0; - temp->h_max = 0.0; - temp->dx_max = 0.f; - temp->nodeID = c->nodeID; - temp->parent = c; - temp->super = NULL; - c->progeny[k] = temp; + /* Split the cell data. */ + cell_split(c, c->parts - s->parts, buff); + + /* Remove any progeny with zero parts. */ + for (int k = 0; k < 8; k++) + if (c->progeny[k]->count == 0 && c->progeny[k]->gcount == 0) { + space_recycle(s, c->progeny[k]); + c->progeny[k] = NULL; + } else { + space_split_recursive(s, c->progeny[k], buff); + h_max = max(h_max, c->progeny[k]->h_max); + ti_end_min = min(ti_end_min, c->progeny[k]->ti_end_min); + ti_end_max = max(ti_end_max, c->progeny[k]->ti_end_max); + if (c->progeny[k]->maxdepth > maxdepth) + maxdepth = c->progeny[k]->maxdepth; } - /* Split the cell data. */ - cell_split(c, c->parts - s->parts, NULL); - - /* Remove any progeny with zero parts. */ - for (int k = 0; k < 8; k++) - if (c->progeny[k]->count == 0 && c->progeny[k]->gcount == 0) { - space_recycle(s, c->progeny[k]); - c->progeny[k] = NULL; - } else { - space_split_mapper(c->progeny[k], 1, s); - h_max = max(h_max, c->progeny[k]->h_max); - ti_end_min = min(ti_end_min, c->progeny[k]->ti_end_min); - ti_end_max = max(ti_end_max, c->progeny[k]->ti_end_max); - if (c->progeny[k]->maxdepth > maxdepth) - maxdepth = c->progeny[k]->maxdepth; - } + } + /* Otherwise, collect the data for this cell. */ + else { + + /* Clear the progeny. */ + bzero(c->progeny, sizeof(struct cell *) * 8); + c->split = 0; + maxdepth = c->depth; + + /* Get dt_min/dt_max. */ + for (int k = 0; k < count; k++) { + struct part *p = &parts[k]; + struct xpart *xp = &xparts[k]; + const float h = p->h; + const int ti_end = p->ti_end; + xp->x_diff[0] = 0.f; + xp->x_diff[1] = 0.f; + xp->x_diff[2] = 0.f; + if (h > h_max) h_max = h; + if (ti_end < ti_end_min) ti_end_min = ti_end; + if (ti_end > ti_end_max) ti_end_max = ti_end; } - - /* Otherwise, collect the data for this cell. */ - else { - - /* Clear the progeny. */ - bzero(c->progeny, sizeof(struct cell *) * 8); - c->split = 0; - maxdepth = c->depth; - - /* Get dt_min/dt_max. */ - for (int k = 0; k < count; k++) { - struct part *p = &parts[k]; - struct xpart *xp = &xparts[k]; - const float h = p->h; - const int ti_end = p->ti_end; - xp->x_diff[0] = 0.f; - xp->x_diff[1] = 0.f; - xp->x_diff[2] = 0.f; - if (h > h_max) h_max = h; - if (ti_end < ti_end_min) ti_end_min = ti_end; - if (ti_end > ti_end_max) ti_end_max = ti_end; - } - for (int k = 0; k < gcount; k++) { - struct gpart *gp = &gparts[k]; - const int ti_end = gp->ti_end; - gp->x_diff[0] = 0.f; - gp->x_diff[1] = 0.f; - gp->x_diff[2] = 0.f; - if (ti_end < ti_end_min) ti_end_min = ti_end; - if (ti_end > ti_end_max) ti_end_max = ti_end; - } + for (int k = 0; k < gcount; k++) { + struct gpart *gp = &gparts[k]; + const int ti_end = gp->ti_end; + gp->x_diff[0] = 0.f; + gp->x_diff[1] = 0.f; + gp->x_diff[2] = 0.f; + if (ti_end < ti_end_min) ti_end_min = ti_end; + if (ti_end > ti_end_max) ti_end_max = ti_end; } + } - /* Set the values for this cell. */ - c->h_max = h_max; - c->ti_end_min = ti_end_min; - c->ti_end_max = ti_end_max; - c->maxdepth = maxdepth; - - /* Set ownership according to the start of the parts array. */ - if (s->nr_parts > 0) - c->owner = - ((c->parts - s->parts) % s->nr_parts) * s->nr_queues / s->nr_parts; - else if (s->nr_gparts > 0) - c->owner = ((c->gparts - s->gparts) % s->nr_gparts) * s->nr_queues / - s->nr_gparts; - else - c->owner = 0; /* Ok, there is really nothing on this rank... */ + /* Set the values for this cell. */ + c->h_max = h_max; + c->ti_end_min = ti_end_min; + c->ti_end_max = ti_end_max; + c->maxdepth = maxdepth; + + /* Set ownership according to the start of the parts array. */ + if (s->nr_parts > 0) + c->owner = + ((c->parts - s->parts) % s->nr_parts) * s->nr_queues / s->nr_parts; + else if (s->nr_gparts > 0) + c->owner = + ((c->gparts - s->gparts) % s->nr_gparts) * s->nr_queues / s->nr_gparts; + else + c->owner = 0; /* Ok, there is really nothing on this rank... */ + + /* Clean up. */ + if (allocate_buffer) free(buff); +} + +/** + * @brief #threadpool mapper function to split cells if they contain + * too many particles. + * + * @param map_data Pointer towards the top-cells. + * @param num_cells The number of cells to treat. + * @param extra_data Pointers to the #space. + */ +void space_split_mapper(void *map_data, int num_cells, void *extra_data) { + + /* Unpack the inputs. */ + struct space *s = (struct space *)extra_data; + struct cell *restrict cells_top = (struct cell *)map_data; + + for (int ind = 0; ind < num_cells; ind++) { + struct cell *c = &cells_top[ind]; + space_split_recursive(s, c, NULL); } } -- GitLab From c103a79f90fb8b722244b0de50adb5b6dc61e774 Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Wed, 9 Nov 2016 21:27:12 +0100 Subject: [PATCH 08/23] add memswap.h, which provides in-place swapping. --- src/Makefile.am | 5 +-- src/memswap.h | 82 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 2 deletions(-) create mode 100644 src/memswap.h diff --git a/src/Makefile.am b/src/Makefile.am index 30afe6cf1..bf296a066 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -44,7 +44,7 @@ include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \ common_io.h single_io.h multipole.h map.h tools.h partition.h clocks.h parser.h \ physical_constants.h physical_constants_cgs.h potential.h version.h \ hydro_properties.h riemann.h threadpool.h cooling.h cooling_struct.h sourceterms.h \ - sourceterms_struct.h statistics.h + sourceterms_struct.h statistics.h memswap.h # Common source files @@ -83,7 +83,8 @@ nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h potential/softened_isothermal/potential.h \ cooling/none/cooling.h cooling/none/cooling_struct.h \ cooling/const_du/cooling.h cooling/const_du/cooling_struct.h \ - cooling/const_lambda/cooling.h cooling/const_lambda/cooling_struct.h + cooling/const_lambda/cooling.h cooling/const_lambda/cooling_struct.h \ + memswap.h # Sources and flags for regular library diff --git a/src/memswap.h b/src/memswap.h new file mode 100644 index 000000000..2ac189925 --- /dev/null +++ b/src/memswap.h @@ -0,0 +1,82 @@ +/******************************************************************************* + * This file is part of SWIFT. + * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) + * Matthieu Schaller (matthieu.schaller@durham.ac.uk) + * 2015 Peter W. Draper (p.w.draper@durham.ac.uk) + * 2016 John A. Regan (john.a.regan@durham.ac.uk) + * Tom Theuns (tom.theuns@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see . + * + ******************************************************************************/ +#ifndef SWIFT_MEMSWAP_H +#define SWIFT_MEMSWAP_H + +/* Config parameters. */ +#include "../config.h" + +#ifdef HAVE_IMMINTRIN_H +/* Include the header file with the intrinsics for Intel architecture. */ +#include +#endif + +#ifdef HAVE_ALTIVEC_H +/* Include the header file with the intrinsics for Intel architecture. */ +#include +#endif + +/* Macro for in-place swap of two values a and b of type t. */ +#define swap_loop(t, a, b, c) \ + while (c >= sizeof(t)) { \ + register t temp = *(t *)a; \ + *(t *)a = *(t *)b; \ + *(t *)b = temp; \ + a += sizeof(t); \ + b += sizeof(t); \ + bytes -= sizeof(t); \ + } + +/** + * @brief Swap the contents of two elements in-place. + * + * Keep in mind that this function works best when the underlying data + * is aligned to the vector length, e.g. with the @c __attribute__((aligned(32))) + * syntax, and the code is compiled with @c -funroll-loops. + * + * @param a Pointer to the first element. + * @param b Pointer to the second element. + * @param bytes Size, in bytes, of the data pointed to by @c a and @c b. + */ +__attribute__((always_inline)) inline void memswap(void *void_a, void *void_b, + size_t bytes) { + char *a = (char *)void_a, *b = (char *)void_b; +#ifdef __AVX512F__ + swap_loop(__m512i, a, b, bytes) +#endif +#ifdef __AVX__ + swap_loop(__m256i, a, b, bytes) +#endif +#ifdef __SSE2__ + swap_loop(__m128i, a, b, bytes) +#endif +#ifdef __ALTIVEC__ + swap_loop(vector int, a, b, bytes) +#endif + swap_loop(size_t, a, b, bytes) + swap_loop(int, a, b, bytes) + swap_loop(short, a, b, bytes) + swap_loop(char, a, b, bytes) +} + +#endif /* SWIFT_MEMSWAP_H */ -- GitLab From db88daf8818facabe7b13cf426cc3b209e3a2bc4 Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Wed, 9 Nov 2016 21:28:55 +0100 Subject: [PATCH 09/23] add a test for altivec.h. --- configure.ac | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 8cb7fd1b3..c7cc7e359 100644 --- a/configure.ac +++ b/configure.ac @@ -455,8 +455,9 @@ if test "$ac_cv_header_fftw3_h" = "yes"; then fi AC_SUBST([FFTW_LIBS]) -# Check for Intel intrinsics header optionally used by vector.h. +# Check for Intel and PowerPC intrinsics header optionally used by vector.h. AC_CHECK_HEADERS([immintrin.h]) +AC_CHECK_HEADERS([altivec.h]) # Check for timing functions needed by cycle.h. AC_HEADER_TIME -- GitLab From 95ef2ec0a41bb32ae1d48a951265a99714096a45 Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Wed, 9 Nov 2016 21:34:53 +0100 Subject: [PATCH 10/23] add a flag for loop unrolling, defaults to whatever has been set for --enble-optimization. --- configure.ac | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/configure.ac b/configure.ac index c7cc7e359..f413a99e7 100644 --- a/configure.ac +++ b/configure.ac @@ -278,6 +278,18 @@ if test "$enable_san" = "yes"; then fi fi +# Add loop unrolling to the compiler flags, if requested. +AC_ARG_ENABLE([loop-unrolling], + [AS_HELP_STRING([--enable-loop-unrolling], + [Tell the compiler explicitly to unroll loops @<:@no/yes@:>@] + )], + [enable_loop_unrolling="$enableval"], + [enable_loop_unrolling="yes"] +) +if test "$enable_loop_unrolling" = "yes"; then + CFLAGS="$CFLAGS -funroll-loops" +fi + # Autoconf stuff. AC_PROG_INSTALL -- GitLab From fd51df93d87c59446b97a1d7c11e5885760f40d2 Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Wed, 9 Nov 2016 21:52:30 +0100 Subject: [PATCH 11/23] use memswap in cell_split. --- src/cell.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/src/cell.c b/src/cell.c index 60bd3e799..fed6cc22b 100644 --- a/src/cell.c +++ b/src/cell.c @@ -52,6 +52,7 @@ #include "gravity.h" #include "hydro.h" #include "hydro_properties.h" +#include "memswap.h" #include "minmax.h" #include "scheduler.h" #include "space.h" @@ -513,15 +514,9 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) { j++; bucket_count[bid]++; } - struct part temp_part = parts[j]; - struct xpart temp_xpart = xparts[j]; - int temp_buff = buff[j]; - parts[j] = part; - xparts[j] = xpart; - buff[j] = bid; - part = temp_part; - xpart = temp_xpart; - bid = temp_buff; + memswap(&parts[j], &part, sizeof(struct part)); + memswap(&xparts[j], &xpart, sizeof(struct xpart)); + memswap(&buff[j], &bid, sizeof(int)); } parts[k] = part; xparts[k] = xpart; @@ -602,12 +597,8 @@ void cell_split(struct cell *c, ptrdiff_t parts_offset, int *buff) { j++; bucket_count[bid]++; } - struct gpart temp_gpart = gparts[j]; - int temp_buff = buff[j]; - gparts[j] = gpart; - buff[j] = bid; - gpart = temp_gpart; - bid = temp_buff; + memswap(&gparts[j], &gpart, sizeof(struct gpart)); + memswap(&buff[j], &bid, sizeof(int)); } gparts[k] = gpart; buff[k] = bid; -- GitLab From 6f60bbf9789d98d66b60e368b5d1556cbb9ade02 Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Wed, 9 Nov 2016 21:55:44 +0100 Subject: [PATCH 12/23] use memswap in the sorting routines. --- src/space.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/src/space.c b/src/space.c index 2f53223a6..a95548992 100644 --- a/src/space.c +++ b/src/space.c @@ -49,6 +49,7 @@ #include "hydro.h" #include "kernel_hydro.h" #include "lock.h" +#include "memswap.h" #include "minmax.h" #include "runner.h" #include "threadpool.h" @@ -1043,15 +1044,9 @@ void space_parts_sort_mapper(void *map_data, int num_elements, while (ii <= j && ind[ii] <= pivot) ii++; while (jj >= i && ind[jj] > pivot) jj--; if (ii < jj) { - size_t temp_i = ind[ii]; - ind[ii] = ind[jj]; - ind[jj] = temp_i; - struct part temp_p = parts[ii]; - parts[ii] = parts[jj]; - parts[jj] = temp_p; - struct xpart temp_xp = xparts[ii]; - xparts[ii] = xparts[jj]; - xparts[jj] = temp_xp; + memswap(&ind[ii], &ind[jj], sizeof(size_t)); + memswap(&parts[ii], &parts[jj], sizeof(struct part)); + memswap(&xparts[ii], &xparts[jj], sizeof(struct xpart)); } } @@ -1226,12 +1221,8 @@ void space_gparts_sort_mapper(void *map_data, int num_elements, while (ii <= j && ind[ii] <= pivot) ii++; while (jj >= i && ind[jj] > pivot) jj--; if (ii < jj) { - size_t temp_i = ind[ii]; - ind[ii] = ind[jj]; - ind[jj] = temp_i; - struct gpart temp_p = gparts[ii]; - gparts[ii] = gparts[jj]; - gparts[jj] = temp_p; + memswap(&ind[ii], &ind[jj], sizeof(size_t)); + memswap(&gparts[ii], &gparts[jj], sizeof(size_t)); } } -- GitLab From 93bc328ab6d78bd1ff9f38ad1ed4a0e5143f7e48 Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Wed, 9 Nov 2016 21:58:15 +0100 Subject: [PATCH 13/23] silly typo. --- src/space.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/space.c b/src/space.c index a95548992..07440a0fe 100644 --- a/src/space.c +++ b/src/space.c @@ -1222,7 +1222,7 @@ void space_gparts_sort_mapper(void *map_data, int num_elements, while (jj >= i && ind[jj] > pivot) jj--; if (ii < jj) { memswap(&ind[ii], &ind[jj], sizeof(size_t)); - memswap(&gparts[ii], &gparts[jj], sizeof(size_t)); + memswap(&gparts[ii], &gparts[jj], sizeof(struct gpart)); } } -- GitLab From 8850cba7fc1ab63b431d4841eb645e7e7519429b Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Wed, 9 Nov 2016 22:04:21 +0100 Subject: [PATCH 14/23] another stupid mistake, inds are int, not size_t. --- src/space.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/space.c b/src/space.c index 07440a0fe..6e3d09f74 100644 --- a/src/space.c +++ b/src/space.c @@ -1044,7 +1044,7 @@ void space_parts_sort_mapper(void *map_data, int num_elements, while (ii <= j && ind[ii] <= pivot) ii++; while (jj >= i && ind[jj] > pivot) jj--; if (ii < jj) { - memswap(&ind[ii], &ind[jj], sizeof(size_t)); + memswap(&ind[ii], &ind[jj], sizeof(int)); memswap(&parts[ii], &parts[jj], sizeof(struct part)); memswap(&xparts[ii], &xparts[jj], sizeof(struct xpart)); } @@ -1221,7 +1221,7 @@ void space_gparts_sort_mapper(void *map_data, int num_elements, while (ii <= j && ind[ii] <= pivot) ii++; while (jj >= i && ind[jj] > pivot) jj--; if (ii < jj) { - memswap(&ind[ii], &ind[jj], sizeof(size_t)); + memswap(&ind[ii], &ind[jj], sizeof(int)); memswap(&gparts[ii], &gparts[jj], sizeof(struct gpart)); } } -- GitLab From 0f1d06bb591bde85c8ed23716be226abd4cbb855 Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Wed, 9 Nov 2016 22:24:34 +0100 Subject: [PATCH 15/23] formatting. --- src/memswap.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/memswap.h b/src/memswap.h index 2ac189925..f3e402d13 100644 --- a/src/memswap.h +++ b/src/memswap.h @@ -51,7 +51,8 @@ * @brief Swap the contents of two elements in-place. * * Keep in mind that this function works best when the underlying data - * is aligned to the vector length, e.g. with the @c __attribute__((aligned(32))) + * is aligned to the vector length, e.g. with the @c + * __attribute__((aligned(32))) * syntax, and the code is compiled with @c -funroll-loops. * * @param a Pointer to the first element. @@ -62,21 +63,21 @@ __attribute__((always_inline)) inline void memswap(void *void_a, void *void_b, size_t bytes) { char *a = (char *)void_a, *b = (char *)void_b; #ifdef __AVX512F__ - swap_loop(__m512i, a, b, bytes) + swap_loop(__m512i, a, b, bytes); #endif #ifdef __AVX__ - swap_loop(__m256i, a, b, bytes) + swap_loop(__m256i, a, b, bytes); #endif #ifdef __SSE2__ - swap_loop(__m128i, a, b, bytes) + swap_loop(__m128i, a, b, bytes); #endif #ifdef __ALTIVEC__ - swap_loop(vector int, a, b, bytes) + swap_loop(vector int, a, b, bytes); #endif - swap_loop(size_t, a, b, bytes) - swap_loop(int, a, b, bytes) - swap_loop(short, a, b, bytes) - swap_loop(char, a, b, bytes) + swap_loop(size_t, a, b, bytes); + swap_loop(int, a, b, bytes); + swap_loop(short, a, b, bytes); + swap_loop(char, a, b, bytes); } #endif /* SWIFT_MEMSWAP_H */ -- GitLab From 8371776af5ab9e6ce27a9b791de672a7fc31a5dc Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Wed, 9 Nov 2016 22:24:56 +0100 Subject: [PATCH 16/23] forgot to commit this. --- configure.ac | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index f413a99e7..6113d080f 100644 --- a/configure.ac +++ b/configure.ac @@ -284,7 +284,7 @@ AC_ARG_ENABLE([loop-unrolling], [Tell the compiler explicitly to unroll loops @<:@no/yes@:>@] )], [enable_loop_unrolling="$enableval"], - [enable_loop_unrolling="yes"] + [enable_loop_unrolling="$enable_opt"] ) if test "$enable_loop_unrolling" = "yes"; then CFLAGS="$CFLAGS -funroll-loops" -- GitLab From 1fbd0435036dd2b71af00f7a025b9f1ca9939d0a Mon Sep 17 00:00:00 2001 From: James Willis Date: Fri, 25 Nov 2016 11:57:52 +0000 Subject: [PATCH 17/23] Reverted to old space_recycle and space_regrid versions and bug fix in space_rebuild to use nr_gparts instead of nr_parts when extracting cell counts from sorted indices --- src/space.c | 51 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/src/space.c b/src/space.c index 6e3d09f74..a4b834b7e 100644 --- a/src/space.c +++ b/src/space.c @@ -436,9 +436,37 @@ void space_regrid(struct space *s, int verbose) { else { /* Otherwise, just clean up the cells. */ /* Free the old cells, if they were allocated. */ - threadpool_map(&s->e->threadpool, space_clear_cells_mapper, s->cells_top, - s->nr_cells, sizeof(struct cell), 100, s); - // space_clear_cells_mapper(s->cells_top, s->nr_cells, s); + for (int k = 0; k < s->nr_cells; k++) { + space_rebuild_recycle(s, &s->cells_top[k]); + s->cells_top[k].sorts = NULL; + s->cells_top[k].nr_tasks = 0; + s->cells_top[k].density = NULL; + s->cells_top[k].gradient = NULL; + s->cells_top[k].force = NULL; + s->cells_top[k].grav = NULL; + s->cells_top[k].dx_max = 0.0f; + s->cells_top[k].sorted = 0; + s->cells_top[k].count = 0; + s->cells_top[k].gcount = 0; + s->cells_top[k].init = NULL; + s->cells_top[k].extra_ghost = NULL; + s->cells_top[k].ghost = NULL; + s->cells_top[k].kick = NULL; + s->cells_top[k].cooling = NULL; + s->cells_top[k].sourceterms = NULL; + s->cells_top[k].super = &s->cells_top[k]; +#if WITH_MPI + s->cells_top[k].recv_xv = NULL; + s->cells_top[k].recv_rho = NULL; + s->cells_top[k].recv_gradient = NULL; + s->cells_top[k].recv_ti = NULL; + + s->cells_top[k].send_xv = NULL; + s->cells_top[k].send_rho = NULL; + s->cells_top[k].send_gradient = NULL; + s->cells_top[k].send_ti = NULL; +#endif + } s->maxdepth = 0; } @@ -673,7 +701,7 @@ void space_rebuild(struct space *s, int verbose) { /* Extract the cell counts from the sorted indices. */ size_t last_gindex = 0; - gind[nr_parts] = s->nr_cells; + gind[nr_gparts] = s->nr_cells; for (size_t k = 0; k < nr_gparts; k++) { if (gind[k] < gind[k + 1]) { cells_top[ind[k]].gcount = k - last_gindex + 1; @@ -1617,17 +1645,26 @@ void space_split_mapper(void *map_data, int num_cells, void *extra_data) { */ void space_recycle(struct space *s, struct cell *c) { + /* Lock the space. */ + lock_lock(&s->lock); + /* Clear the cell. */ if (lock_destroy(&c->lock) != 0) error("Failed to destroy spinlock."); /* Clear this cell's sort arrays. */ if (c->sort != NULL) free(c->sort); + /* Clear the cell data. */ + bzero(c, sizeof(struct cell)); + /* Hook this cell into the buffer. */ - c->next = atomic_swap(&s->cells_sub, c); - atomic_dec(&s->tot_cells); -} + c->next = s->cells_sub; + s->cells_sub = c; + s->tot_cells -= 1; + /* Unlock the space. */ + lock_unlock_blind(&s->lock); +} /** * @brief Get a new empty (sub-)#cell. * -- GitLab From 38b8edd8284b45dc71c70129e6b3f28e3777d0c4 Mon Sep 17 00:00:00 2001 From: "Peter W. Draper" Date: Fri, 25 Nov 2016 18:11:32 +0000 Subject: [PATCH 18/23] Fix up documentation issues --- src/memswap.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/memswap.h b/src/memswap.h index f3e402d13..464372553 100644 --- a/src/memswap.h +++ b/src/memswap.h @@ -1,10 +1,6 @@ /******************************************************************************* * This file is part of SWIFT. - * Copyright (c) 2012 Pedro Gonnet (pedro.gonnet@durham.ac.uk) - * Matthieu Schaller (matthieu.schaller@durham.ac.uk) - * 2015 Peter W. Draper (p.w.draper@durham.ac.uk) - * 2016 John A. Regan (john.a.regan@durham.ac.uk) - * Tom Theuns (tom.theuns@durham.ac.uk) + * Copyright (c) 2016 Pedro Gonnet (pedro.gonnet@durham.ac.uk) * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Lesser General Public License as published @@ -55,8 +51,8 @@ * __attribute__((aligned(32))) * syntax, and the code is compiled with @c -funroll-loops. * - * @param a Pointer to the first element. - * @param b Pointer to the second element. + * @param void_a Pointer to the first element. + * @param void_b Pointer to the second element. * @param bytes Size, in bytes, of the data pointed to by @c a and @c b. */ __attribute__((always_inline)) inline void memswap(void *void_a, void *void_b, -- GitLab From 0cd36f25eae2ca750c6e453765129d866dc77ba6 Mon Sep 17 00:00:00 2001 From: "Peter W. Draper" Date: Fri, 25 Nov 2016 18:12:03 +0000 Subject: [PATCH 19/23] Loop unrolling is now a standard optimization --- configure.ac | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/configure.ac b/configure.ac index 6113d080f..19daf84f0 100644 --- a/configure.ac +++ b/configure.ac @@ -278,19 +278,6 @@ if test "$enable_san" = "yes"; then fi fi -# Add loop unrolling to the compiler flags, if requested. -AC_ARG_ENABLE([loop-unrolling], - [AS_HELP_STRING([--enable-loop-unrolling], - [Tell the compiler explicitly to unroll loops @<:@no/yes@:>@] - )], - [enable_loop_unrolling="$enableval"], - [enable_loop_unrolling="$enable_opt"] -) -if test "$enable_loop_unrolling" = "yes"; then - CFLAGS="$CFLAGS -funroll-loops" -fi - - # Autoconf stuff. AC_PROG_INSTALL AC_PROG_MAKE_SET -- GitLab From 78a746bc7421c833d7eedf6884c777900aa086d0 Mon Sep 17 00:00:00 2001 From: "Peter W. Draper" Date: Fri, 25 Nov 2016 18:14:23 +0000 Subject: [PATCH 20/23] Formatting --- src/Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Makefile.am b/src/Makefile.am index f7fd2bc39..4080b37ab 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -84,7 +84,7 @@ nobase_noinst_HEADERS = align.h approx_math.h atomic.h cycle.h error.h inline.h cooling/none/cooling.h cooling/none/cooling_struct.h \ cooling/const_du/cooling.h cooling/const_du/cooling_struct.h \ cooling/const_lambda/cooling.h cooling/const_lambda/cooling_struct.h \ - memswap.h + memswap.h # Sources and flags for regular library -- GitLab From 2d959882ec4d569d2a7eb610f0a3af74a5167871 Mon Sep 17 00:00:00 2001 From: "Peter W. Draper" Date: Fri, 2 Dec 2016 16:05:01 +0000 Subject: [PATCH 21/23] Remove unused function space_clear_cells_mapper() --- src/space.c | 44 -------------------------------------------- 1 file changed, 44 deletions(-) diff --git a/src/space.c b/src/space.c index 5626a5213..4110446e4 100644 --- a/src/space.c +++ b/src/space.c @@ -185,50 +185,6 @@ void space_rebuild_recycle(struct space *s, struct cell *c) { } } -/** - * @brief Mapper function to clean out the cell hierarchies for a regrid. - */ - -void space_clear_cells_mapper(void *map_data, int num_elements, - void *extra_data) { - - struct cell *cells = (struct cell *)map_data; - struct space *s = (struct space *)extra_data; - - for (int k = 0; k < num_elements; k++) { - struct cell *c = &cells[k]; - space_rebuild_recycle(s, c); - c->sorts = NULL; - c->nr_tasks = 0; - c->density = NULL; - c->gradient = NULL; - c->force = NULL; - c->grav = NULL; - c->dx_max = 0.0f; - c->sorted = 0; - c->count = 0; - c->gcount = 0; - c->init = NULL; - c->extra_ghost = NULL; - c->ghost = NULL; - c->kick = NULL; - c->cooling = NULL; - c->sourceterms = NULL; - c->super = c; -#if WITH_MPI - c->recv_xv = NULL; - c->recv_rho = NULL; - c->recv_gradient = NULL; - c->recv_ti = NULL; - - c->send_xv = NULL; - c->send_rho = NULL; - c->send_gradient = NULL; - c->send_ti = NULL; -#endif - } -} - /** * @brief Re-build the top-level cell grid. * -- GitLab From d806d1193ff28e9be5807321dcd99e92bdf3f646 Mon Sep 17 00:00:00 2001 From: "Peter W. Draper" Date: Fri, 2 Dec 2016 18:14:33 +0000 Subject: [PATCH 22/23] Fixed incorrect uses of ind instead of gind Gravity only tests --- src/space.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/space.c b/src/space.c index 4110446e4..e9cfe1cf4 100644 --- a/src/space.c +++ b/src/space.c @@ -639,9 +639,9 @@ void space_rebuild(struct space *s, int verbose) { cell_getid(cdim, p->x[0] * ih[0], p->x[1] * ih[1], p->x[2] * ih[2]); #ifdef SWIFT_DEBUG_CHECKS - if (cells_top[ind[k]].nodeID != s->e->nodeID) + if (cells_top[gind[k]].nodeID != s->e->nodeID) error("Received part that does not belong to me (nodeID=%i).", - cells_top[ind[k]].nodeID); + cells_top[gind[k]].nodeID); #endif } nr_gparts = s->nr_gparts; @@ -661,7 +661,7 @@ void space_rebuild(struct space *s, int verbose) { gind[nr_gparts] = s->nr_cells; for (size_t k = 0; k < nr_gparts; k++) { if (gind[k] < gind[k + 1]) { - cells_top[ind[k]].gcount = k - last_gindex + 1; + cells_top[gind[k]].gcount = k - last_gindex + 1; last_gindex = k + 1; } } -- GitLab From dec3b22e8b977f8dcd39793bbac40c9c1a0d1e91 Mon Sep 17 00:00:00 2001 From: Pedro Gonnet Date: Sun, 4 Dec 2016 22:51:36 +0100 Subject: [PATCH 23/23] add comment on the +100. --- src/space.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/space.c b/src/space.c index e9cfe1cf4..935677a9e 100644 --- a/src/space.c +++ b/src/space.c @@ -454,7 +454,9 @@ void space_rebuild(struct space *s, int verbose) { struct cell *restrict cells_top = s->cells_top; const int ti_current = (s->e != NULL) ? s->e->ti_current : 0; - /* Run through the particles and get their cell index. */ + /* Run through the particles and get their cell index. Allocates + an index that is larger than the number of particles to avoid + re-allocating after shuffling. */ const size_t ind_size = s->size_parts + 100; int *ind; if ((ind = (int *)malloc(sizeof(int) * ind_size)) == NULL) -- GitLab