diff --git a/src/engine.c b/src/engine.c index c4f5dccad5c829efd3b1b50211c4f6a06a28e198..80c122b0a911582ee4fef393c549519d8e048066 100644 --- a/src/engine.c +++ b/src/engine.c @@ -172,7 +172,7 @@ void engine_redistribute(struct engine *e) { dest[k] = cells[cid].nodeID; counts[nodeID * nr_nodes + dest[k]] += 1; } - parts_sort(s->parts, s->xparts, dest, s->nr_parts, 0, nr_nodes - 1); + space_parts_sort(s, dest, s->nr_parts, 0, nr_nodes - 1); /* Get all the counts from all the nodes. */ if (MPI_Allreduce(MPI_IN_PLACE, counts, nr_nodes * nr_nodes, MPI_INT, MPI_SUM, @@ -990,6 +990,11 @@ void engine_maketasks(struct engine *e) { /* Re-set the scheduler. */ scheduler_reset(sched, s->tot_cells * engine_maxtaskspercell); + /* Add the space sorting tasks. */ + for (i = 0; i < e->nr_threads; i++) + scheduler_addtask(sched, task_type_psort, task_subtype_none, i, 0, NULL, + NULL, 0); + /* Run through the highest level of cells and add pairs. */ for (i = 0; i < cdim[0]; i++) for (j = 0; j < cdim[1]; j++) @@ -1706,15 +1711,6 @@ void engine_launch(struct engine *e, int nr_runners, unsigned int mask) { error("Error while waiting for barrier."); } -void hassorted(struct cell *c) { - - if (c->sorted) error("Suprious sorted flags."); - - if (c->split) - for (int k = 0; k < 8; k++) - if (c->progeny[k] != NULL) hassorted(c->progeny[k]); -} - /** * @brief Let the #engine loose to compute the forces. * @@ -2168,13 +2164,18 @@ void engine_init(struct engine *e, struct space *s, float dt, int nr_threads, s->nr_queues = nr_queues; /* Append a kick1 task to each cell. */ - scheduler_reset(&e->sched, s->tot_cells); + scheduler_reset(&e->sched, s->tot_cells + e->nr_threads); for (k = 0; k < s->nr_cells; k++) s->cells[k].kick1 = scheduler_addtask(&e->sched, task_type_kick1, task_subtype_none, 0, 0, &s->cells[k], NULL, 0); scheduler_ranktasks(&e->sched); + /* Create the sorting tasks. */ + for (i = 0; i < e->nr_threads; i++) + scheduler_addtask(&e->sched, task_type_psort, task_subtype_none, i, 0, NULL, + NULL, 0); + /* Allocate and init the threads. */ if ((e->runners = (struct runner *)malloc(sizeof(struct runner) * nr_threads)) == NULL) diff --git a/src/engine.h b/src/engine.h index 65aa028401344f953d7a23a5c83c030278bbdcb7..dcf0d5cc0912aca42782d09df01fae6edfa45d81 100644 --- a/src/engine.h +++ b/src/engine.h @@ -129,6 +129,7 @@ struct engine { void engine_barrier(struct engine *e, int tid); void engine_init(struct engine *e, struct space *s, float dt, int nr_threads, int nr_queues, int nr_nodes, int nodeID, int policy); +void engine_launch(struct engine *e, int nr_runners, unsigned int mask); void engine_prepare(struct engine *e); void engine_step(struct engine *e); void engine_maketasks(struct engine *e); diff --git a/src/runner.c b/src/runner.c index e1956ebe9a8125c95e1109dfa9ee0211fbecdea4..b8bef2100b540e1b81370fdc73d44441777fece8 100644 --- a/src/runner.c +++ b/src/runner.c @@ -1214,12 +1214,12 @@ void *runner_main(void *data) { t->rid = r->cpuid; /* Set super to the first cell that I own. */ - if (ci->super != NULL && ci->super->owner == r->qid) + if (ci != NULL && ci->super != NULL && ci->super->owner == r->qid) super = ci->super; else if (cj != NULL && cj->super != NULL && cj->super->owner == r->qid) super = cj->super; - /* else - super = NULL; */ + else + super = NULL; /* Different types of tasks... */ switch (t->type) { @@ -1287,6 +1287,9 @@ void *runner_main(void *data) { case task_type_grav_down: runner_dograv_down(r, t->ci); break; + case task_type_psort: + space_do_parts_sort(); + break; default: error("Unknown task type."); } diff --git a/src/space.c b/src/space.c index f4d9cda9d367332ea179a5b144fbc9d169c0dd99..f53a94bdeb44c3d08fcaf4efb2cdbcbada47ca54 100644 --- a/src/space.c +++ b/src/space.c @@ -43,6 +43,9 @@ #include "lock.h" #include "runner.h" +/* Shared sort structure. */ +struct parallel_sort space_sort_struct; + /* Split size. */ int space_splitsize = space_splitsize_default; int space_subsize = space_subsize_default; @@ -389,7 +392,7 @@ void space_rebuild(struct space *s, double cell_max, int verbose) { /* Sort the parts according to their cells. */ // tic = getticks(); - parts_sort(s->parts, s->xparts, ind, nr_parts, 0, s->nr_cells - 1); + space_parts_sort(s, ind, nr_parts, 0, s->nr_cells - 1); // message( "parts_sort took %.3f ms." , (double)(getticks() - tic) / CPU_TPS // * 1000 ); @@ -397,7 +400,7 @@ void space_rebuild(struct space *s, double cell_max, int verbose) { for (k = 0; k < nr_parts; k++) if (s->parts[k].gpart != NULL) s->parts[k].gpart->part = &s->parts[k]; - /* Verify sort. */ + /* Verify space_sort_struct. */ /* for ( k = 1 ; k < nr_parts ; k++ ) { if ( ind[k-1] > ind[k] ) { error( "Sort failed!" ); @@ -473,88 +476,97 @@ void space_rebuild(struct space *s, double cell_max, int verbose) { * @brief Sort the particles and condensed particles according to the given *indices. * - * @param parts The list of #part - * @param xparts The list of reduced particles * @param ind The indices with respect to which the parts are sorted. * @param N The number of parts * @param min Lowest index. * @param max highest index. */ -void parts_sort(struct part *parts, struct xpart *xparts, int *ind, int N, - int min, int max) { - - struct qstack { - volatile int i, j, min, max; - volatile int ready; - }; - struct qstack *qstack; - unsigned int qstack_size = 2 * (max - min) + 10; - volatile unsigned int first, last, waiting; +void space_parts_sort(struct space *s, int *ind, int N, int min, int max) { + // Populate a parallel_sort structure with the input data. + struct parallel_sort sort; + space_sort_struct.parts = s->parts; + space_sort_struct.xparts = s->xparts; + space_sort_struct.ind = ind; + space_sort_struct.stack_size = 2 * (max - min) + 10; + if ((space_sort_struct.stack = malloc(sizeof(struct qstack) * + space_sort_struct.stack_size)) == NULL) + error("Failed to allocate sorting stack."); + for (int i = 0; i < space_sort_struct.stack_size; i++) + space_sort_struct.stack[i].ready = 0; + + // Add the first interval. + space_sort_struct.stack[0].i = 0; + space_sort_struct.stack[0].j = N - 1; + space_sort_struct.stack[0].min = min; + space_sort_struct.stack[0].max = max; + space_sort_struct.stack[0].ready = 1; + space_sort_struct.first = 0; + space_sort_struct.last = 1; + space_sort_struct.waiting = 1; + + // Launch the sorting tasks. + engine_launch(s->e, s->e->nr_threads, (1 << task_type_psort)); + + /* Verify space_sort_struct. */ + /* for (int i = 1; i < N; i++) + if (ind[i - 1] > ind[i]) + error("Sorting failed (ind[%i]=%i,ind[%i]=%i).", i - 1, ind[i - 1], i, + ind[i]); */ + + // Clean up. + free(space_sort_struct.stack); +} - int pivot; - int i, ii, j, jj, temp_i, qid; - struct part temp_p; - struct xpart temp_xp; +void space_do_parts_sort() { - /* for ( int k = 0 ; k < N ; k++ ) - if ( ind[k] > max || ind[k] < min ) - error( "ind[%i]=%i is not in [%i,%i]." , k , ind[k] , min , max ); */ - - /* Allocate the stack. */ - if ((qstack = malloc(sizeof(struct qstack) * qstack_size)) == NULL) - error("Failed to allocate qstack."); - - /* Init the interval stack. */ - qstack[0].i = 0; - qstack[0].j = N - 1; - qstack[0].min = min; - qstack[0].max = max; - qstack[0].ready = 1; - for (i = 1; i < qstack_size; i++) qstack[i].ready = 0; - first = 0; - last = 1; - waiting = 1; + /* Pointers to the sorting data. */ + int *ind = space_sort_struct.ind; + struct part *parts = space_sort_struct.parts; + struct xpart *xparts = space_sort_struct.xparts; /* Main loop. */ - while (waiting > 0) { + while (space_sort_struct.waiting > 0) { /* Grab an interval off the queue. */ - qid = (first++) % qstack_size; + int qid = + atomic_inc(&space_sort_struct.first) % space_sort_struct.stack_size; /* Get the stack entry. */ - i = qstack[qid].i; - j = qstack[qid].j; - min = qstack[qid].min; - max = qstack[qid].max; - qstack[qid].ready = 0; + while (!space_sort_struct.stack[qid].ready) + if (!space_sort_struct.waiting) return; + int i = space_sort_struct.stack[qid].i; + int j = space_sort_struct.stack[qid].j; + int min = space_sort_struct.stack[qid].min; + int max = space_sort_struct.stack[qid].max; + space_sort_struct.stack[qid].ready = 0; /* Loop over sub-intervals. */ while (1) { /* Bring beer. */ - pivot = (min + max) / 2; + const int pivot = (min + max) / 2; /* One pass of QuickSort's partitioning. */ - ii = i; - jj = j; + int ii = i; + int jj = j; while (ii < jj) { while (ii <= j && ind[ii] <= pivot) ii++; while (jj >= i && ind[jj] > pivot) jj--; if (ii < jj) { - temp_i = ind[ii]; + int temp_i = ind[ii]; ind[ii] = ind[jj]; ind[jj] = temp_i; - temp_p = parts[ii]; + struct part temp_p = parts[ii]; parts[ii] = parts[jj]; parts[jj] = temp_p; - temp_xp = xparts[ii]; + struct xpart temp_xp = xparts[ii]; xparts[ii] = xparts[jj]; xparts[jj] = temp_xp; } } - /* Verify sort. */ + /* Verify space_sort_struct. */ /* for ( int k = i ; k <= jj ; k++ ) if ( ind[k] > pivot ) { message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, @@ -573,13 +585,16 @@ void parts_sort(struct part *parts, struct xpart *xparts, int *ind, int N, /* Recurse on the left? */ if (jj > i && pivot > min) { - qid = (last++) % qstack_size; - qstack[qid].i = i; - qstack[qid].j = jj; - qstack[qid].min = min; - qstack[qid].max = pivot; - qstack[qid].ready = 1; - if (waiting++ >= qstack_size) error("Qstack overflow."); + qid = atomic_inc(&space_sort_struct.last) % + space_sort_struct.stack_size; + space_sort_struct.stack[qid].i = i; + space_sort_struct.stack[qid].j = jj; + space_sort_struct.stack[qid].min = min; + space_sort_struct.stack[qid].max = pivot; + space_sort_struct.stack[qid].ready = 1; + if (atomic_inc(&space_sort_struct.waiting) >= + space_sort_struct.stack_size) + error("Qstack overflow."); } /* Recurse on the right? */ @@ -593,13 +608,16 @@ void parts_sort(struct part *parts, struct xpart *xparts, int *ind, int N, /* Recurse on the right? */ if (jj + 1 < j && pivot + 1 < max) { - qid = (last++) % qstack_size; - qstack[qid].i = jj + 1; - qstack[qid].j = j; - qstack[qid].min = pivot + 1; - qstack[qid].max = max; - qstack[qid].ready = 1; - if ((waiting++) >= qstack_size) error("Qstack overflow."); + qid = atomic_inc(&space_sort_struct.last) % + space_sort_struct.stack_size; + space_sort_struct.stack[qid].i = jj + 1; + space_sort_struct.stack[qid].j = j; + space_sort_struct.stack[qid].min = pivot + 1; + space_sort_struct.stack[qid].max = max; + space_sort_struct.stack[qid].ready = 1; + if (atomic_inc(&space_sort_struct.waiting) >= + space_sort_struct.stack_size) + error("Qstack overflow."); } /* Recurse on the left? */ @@ -612,18 +630,9 @@ void parts_sort(struct part *parts, struct xpart *xparts, int *ind, int N, } /* loop over sub-intervals. */ - waiting--; + atomic_dec(&space_sort_struct.waiting); } /* main loop. */ - - /* Verify sort. */ - /* for ( i = 1 ; i < N ; i++ ) - if ( ind[i-1] > ind[i] ) - error( "Sorting failed (ind[%i]=%i,ind[%i]=%i)." , i-1 , ind[i-1] , i - , ind[i] ); */ - - /* Clean up. */ - free(qstack); } void gparts_sort(struct gpart *gparts, int *ind, int N, int min, int max) { @@ -694,7 +703,7 @@ void gparts_sort(struct gpart *gparts, int *ind, int N, int min, int max) { } } - /* Verify sort. */ + /* Verify space_sort_struct. */ /* for ( int k = i ; k <= jj ; k++ ) if ( ind[k] > pivot ) { message( "sorting failed at k=%i, ind[k]=%i, pivot=%i, i=%i, j=%i, @@ -756,7 +765,7 @@ void gparts_sort(struct gpart *gparts, int *ind, int N, int min, int max) { } /* main loop. */ - /* Verify sort. */ + /* Verify space_sort_struct. */ /* for ( i = 1 ; i < N ; i++ ) if ( ind[i-1] > ind[i] ) error( "Sorting failed (ind[%i]=%i,ind[%i]=%i)." , i-1 , ind[i-1] , i diff --git a/src/space.h b/src/space.h index c12ec46be968d713618c41db5ab1385ed147d33e..4f37275b716d37644d7c6bd648a58cc70a3d76e2 100644 --- a/src/space.h +++ b/src/space.h @@ -111,9 +111,23 @@ struct space { int nr_parts_foreign, size_parts_foreign; }; +/* Interval stack necessary for parallel particle sorting. */ +struct qstack { + volatile int i, j, min, max; + volatile int ready; +}; +struct parallel_sort { + struct part *parts; + struct xpart *xparts; + int *ind; + struct qstack *stack; + unsigned int stack_size; + volatile unsigned int first, last, waiting; +}; +extern struct parallel_sort space_sort_struct; + /* function prototypes. */ -void parts_sort(struct part *parts, struct xpart *xparts, int *ind, int N, - int min, int max); +void space_parts_sort(struct space *s, int *ind, int N, int min, int max); void gparts_sort(struct gpart *gparts, int *ind, int N, int min, int max); struct cell *space_getcell(struct space *s); int space_getsid(struct space *s, struct cell **ci, struct cell **cj, @@ -130,5 +144,6 @@ void space_map_cells_post(struct space *s, int full, void space_rebuild(struct space *s, double h_max, int verbose); void space_recycle(struct space *s, struct cell *c); void space_split(struct space *s, struct cell *c); +void space_do_parts_sort(); #endif /* SWIFT_SPACE_H */ diff --git a/src/task.h b/src/task.h index 0d3a68e1e8a892d554f8fb83f8f16d7030d5a54c..474fcc3592c7e4e85ff0f6fab71f99041c07bb65 100644 --- a/src/task.h +++ b/src/task.h @@ -44,6 +44,7 @@ enum task_types { task_type_grav_mm, task_type_grav_up, task_type_grav_down, + task_type_psort, task_type_count };