diff --git a/examples/Makefile.am b/examples/Makefile.am index e9a7a59112fa3cea392ef5c7b035454b00aa4884..179db883582780dad2158d39d52f87b354f06724 100644 --- a/examples/Makefile.am +++ b/examples/Makefile.am @@ -31,7 +31,7 @@ AM_CFLAGS = -g -O3 -Wall -Werror -I../src -ffast-math -fstrict-aliasing \ AM_LDFLAGS = -lm # -fsanitize=address # Set-up the library -bin_PROGRAMS = test test_qr test_bh test_bh_sorted +bin_PROGRAMS = test test_qr test_bh test_bh_sorted test_fmm_sorted # Sources for test test_SOURCES = test.c @@ -53,3 +53,8 @@ test_bh_sorted_SOURCES = test_bh_sorted.c test_bh_sorted_CFLAGS = $(AM_CFLAGS) test_bh_sorted_LDADD = ../src/.libs/libquicksched.a +# Sources for test_fmm_sorted +test_fmm_sorted_SOURCES = test_fmm_sorted.c +test_fmm_sorted_CFLAGS = $(AM_CFLAGS) +test_fmm_sorted_LDADD = ../src/.libs/libquicksched.a + diff --git a/examples/test_fmm_sorted.c b/examples/test_fmm_sorted.c new file mode 100644 index 0000000000000000000000000000000000000000..53ce537d9d672b0fba11fd4924e3a26921e71edc --- /dev/null +++ b/examples/test_fmm_sorted.c @@ -0,0 +1,2534 @@ +/******************************************************************************* + * This file is part of QuickSched. + * Coypright (c) 2014 Pedro Gonnet (pedro.gonnet@durham.ac.uk), + * Matthieu Schaller (matthieu.schaller@durham.ac.uk) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * +* *****************************************************************************/ + +/* Config parameters. */ +#include "../config.h" + +/* Standard includes. */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <math.h> +#include <float.h> +#include <limits.h> +#include <omp.h> +#include <fenv.h> + +/* Local includes. */ +#include "quicksched.h" + +/* Some local constants. */ +#define cell_pool_grow 1000 +#define cell_maxparts 1 +#define task_limit 1e8 +#define const_G 1 // 6.6738e-8 +#define dist_min 0.5 /* Used for legacy walk only */ +#define dist_cutoff_ratio 1.5 +#define iact_pair_direct iact_pair_direct_sorted_multipole +#define ICHECK -1 +#define NO_SANITY_CHECKS +#define NO_COM_AS_TASK +//#define COUNTERS + + +/** Data structure for the particles. */ +struct part { + double x[3]; + // union { + float a[3]; + float a_legacy[3]; + float a_exact[3]; + // }; + float mass; + int id; +}; // __attribute__((aligned(64))); + +/** Data structure for the sorted particle positions. */ +struct index { + int ind; + float d; +}; + +struct monopole { + double com[3]; + float mass; +}; + +/** Data structure for the BH tree cell. */ +struct cell { + double loc[3]; + double h; + + int count; + unsigned short int split, sorted; + struct part *parts; + + struct cell *firstchild; /* Next node if opening */ + struct cell *sibling; /* Next node */ + + /* We keep both CoMs and masses to make sure the comp_com calculation is + * correct (use an anonymous union to keep variable names compact). */ + union { + + /* Information for the legacy walk */ + struct monopole legacy; + + /* Information for the QuickShed walk */ + struct monopole new; + }; + + int res, com_tid; + struct index *indices; + +} __attribute__((aligned(128))); + +/** Task types. */ +enum task_type { + task_type_self = 0, + task_type_pair, + task_type_self_pc, + task_type_pair_direct, + task_type_com, + task_type_count +}; + + + + +/** Multipole stuff. */ +struct multipole { + float mass; + float mu[3]; /* Centre of Mass */ + float sigma_00, sigma_11, sigma_22, sigma_01, sigma_02, sigma_12; +}; + +static inline void multipole_reset(struct multipole *d) { + d->mu[0] = 0.0; + d->mu[1] = 0.0; + d->mu[2] = 0.0; + + d->sigma_00 = 0.0; + d->sigma_11 = 0.0; + d->sigma_22 = 0.0; + d->sigma_01 = 0.0; + d->sigma_02 = 0.0; + d->sigma_12 = 0.0; + + d->mass = 0.0; +} + +static inline void multipole_add(struct multipole *d, const float *x, float mass) { + d->mu[0] += x[0] * mass; + d->mu[1] += x[1] * mass; + d->mu[2] += x[2] * mass; + + d->sigma_00 += mass * x[0] * x[0]; + d->sigma_11 += mass * x[1] * x[1]; + d->sigma_22 += mass * x[2] * x[2]; + d->sigma_01 += mass * x[0] * x[1]; + d->sigma_02 += mass * x[0] * x[2]; + d->sigma_12 += mass * x[1] * x[2]; + + d->mass += mass; +} + + +static inline void multipole_iact(struct multipole *d, const float *x, float* a) { + + int k; + float r2,ir,w, dx[3]; + + /* early abort */ + if (d->mass == 0.) return; + + float inv_mass = 1. / d->mass; + + /* Temporary quantity */ + dx[0] = d->sigma_00 - inv_mass*d->mu[0]*d->mu[0]; /* Abusing a so far unused */ + dx[1] = d->sigma_11 - inv_mass*d->mu[1]*d->mu[1]; /* variable temporarily... */ + dx[2] = d->sigma_22 - inv_mass*d->mu[2]*d->mu[2]; + + /* Construct quadrupole matrix */ + float Q_00 = 2.0f*dx[0] - dx[1] - dx[2]; + float Q_11 = 2.0f*dx[1] - dx[0] - dx[2]; + float Q_22 = Q_00 + Q_11; /* Q_ij is traceless */ + Q_22 *= -1.0f; + + float Q_01 = d->sigma_01 - inv_mass*d->mu[0]*d->mu[1]; + float Q_02 = d->sigma_02 - inv_mass*d->mu[0]*d->mu[2]; + float Q_12 = d->sigma_12 - inv_mass*d->mu[1]*d->mu[2]; + Q_01 *= 3.0f; + Q_02 *= 3.0f; + Q_12 *= 3.0f; + + /* Compute the distance to the CoM. */ + for (r2 = 0.0f, k = 0; k < 3; k++) { + dx[k] = x[k] - d->mu[k] * inv_mass; + r2 += dx[k] * dx[k]; + } + ir = 1.0f / sqrtf(r2); + + /* Compute the acceleration from the monopole */ + w = const_G * ir * ir * ir * d->mass; + for (k = 0; k < 3; k++) a[k] -= w * dx[k]; + + /* Compute some helpful terms */ + float w1 = const_G * ir * ir * ir * ir * ir; + float w2 = -2.5f * const_G * ir * ir * ir * ir * ir * ir * ir; + float xi = 0.; + + xi += dx[0] * dx[0] * Q_00; + xi += dx[1] * dx[1] * Q_11; + xi += dx[2] * dx[2] * Q_22; + xi += 2.f * dx[0] * dx[1] * Q_01; + xi += 2.f * dx[0] * dx[2] * Q_02; + xi += 2.f * dx[1] * dx[2] * Q_12; + + /* Compute the acceleration from the quadrupole */ + a[0] += w2 * xi * dx[0] + w1 * (dx[0]*Q_00 + dx[1]*Q_01 + dx[2]*Q_02); + a[1] += w2 * xi * dx[1] + w1 * (dx[0]*Q_01 + dx[1]*Q_11 + dx[2]*Q_12); + a[2] += w2 * xi * dx[2] + w1 * (dx[0]*Q_02 + dx[1]*Q_12 + dx[2]*Q_22); + +} + + + + + + +#ifdef COUNTERS +int count_direct_unsorted; +int count_direct_sorted_pp; +int count_direct_sorted_pm_i; +int count_direct_sorted_pm_j; +#endif + +/** Per-type timers. */ +ticks task_timers[task_type_count]; + +/** Global variable for the pool of allocated cells. */ +struct cell *cell_pool = NULL; + +/** Constants for the sorting axes. */ +const float axis_shift[13 * 3] = { + 5.773502691896258e-01, 5.773502691896258e-01, 5.773502691896258e-01, + 7.071067811865475e-01, 7.071067811865475e-01, 0.0, + 5.773502691896258e-01, 5.773502691896258e-01, -5.773502691896258e-01, + 7.071067811865475e-01, 0.0, 7.071067811865475e-01, + 1.0, 0.0, 0.0, + 7.071067811865475e-01, 0.0, -7.071067811865475e-01, + 5.773502691896258e-01, -5.773502691896258e-01, 5.773502691896258e-01, + 7.071067811865475e-01, -7.071067811865475e-01, 0.0, + 5.773502691896258e-01, -5.773502691896258e-01, -5.773502691896258e-01, + 0.0, 7.071067811865475e-01, 7.071067811865475e-01, + 0.0, 1.0, 0.0, + 0.0, 7.071067811865475e-01, -7.071067811865475e-01, + 0.0, 0.0, 1.0, +}; + +const char axis_flip[27] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + +/* Map shift vector to sortlist. */ +const int axis_sid[27] = { + /* ( -1 , -1 , -1 ) */ 0, + /* ( -1 , -1 , 0 ) */ 1, + /* ( -1 , -1 , 1 ) */ 2, + /* ( -1 , 0 , -1 ) */ 3, + /* ( -1 , 0 , 0 ) */ 4, + /* ( -1 , 0 , 1 ) */ 5, + /* ( -1 , 1 , -1 ) */ 6, + /* ( -1 , 1 , 0 ) */ 7, + /* ( -1 , 1 , 1 ) */ 8, + /* ( 0 , -1 , -1 ) */ 9, + /* ( 0 , -1 , 0 ) */ 10, + /* ( 0 , -1 , 1 ) */ 11, + /* ( 0 , 0 , -1 ) */ 12, + /* ( 0 , 0 , 0 ) */ 0, + /* ( 0 , 0 , 1 ) */ 12, + /* ( 0 , 1 , -1 ) */ 11, + /* ( 0 , 1 , 0 ) */ 10, + /* ( 0 , 1 , 1 ) */ 9, + /* ( 1 , -1 , -1 ) */ 8, + /* ( 1 , -1 , 0 ) */ 7, + /* ( 1 , -1 , 1 ) */ 6, + /* ( 1 , 0 , -1 ) */ 5, + /* ( 1 , 0 , 0 ) */ 4, + /* ( 1 , 0 , 1 ) */ 3, + /* ( 1 , 1 , -1 ) */ 2, + /* ( 1 , 1 , 0 ) */ 1, + /* ( 1 , 1 , 1 ) */ 0 +}; + +/* Some forward declarations. */ +void comp_com(struct cell *c); + +/** + * @brief Sort the entries in ascending order using QuickSort. + * + * @param sort The indices + * @param N The number of entries. + */ +void indices_sort(struct index *sort, int N) { + + struct { + short int lo, hi; + } qstack[10]; + int qpos, i, j, lo, hi, imin; + struct index temp; + float pivot; + + /* Sort parts in cell_i in decreasing order with quicksort */ + qstack[0].lo = 0; + qstack[0].hi = N - 1; + qpos = 0; + while (qpos >= 0) { + lo = qstack[qpos].lo; + hi = qstack[qpos].hi; + qpos -= 1; + if (hi - lo < 15) { + for (i = lo; i < hi; i++) { + imin = i; + for (j = i + 1; j <= hi; j++) + if (sort[j].d < sort[imin].d) imin = j; + if (imin != i) { + temp = sort[imin]; + sort[imin] = sort[i]; + sort[i] = temp; + } + } + } else { + pivot = sort[(lo + hi) / 2].d; + i = lo; + j = hi; + while (i <= j) { + while (sort[i].d < pivot) i++; + while (sort[j].d > pivot) j--; + if (i <= j) { + if (i < j) { + temp = sort[i]; + sort[i] = sort[j]; + sort[j] = temp; + } + i += 1; + j -= 1; + } + } + if (j > (lo + hi) / 2) { + if (lo < j) { + qpos += 1; + qstack[qpos].lo = lo; + qstack[qpos].hi = j; + } + if (i < hi) { + qpos += 1; + qstack[qpos].lo = i; + qstack[qpos].hi = hi; + } + } else { + if (i < hi) { + qpos += 1; + qstack[qpos].lo = i; + qstack[qpos].hi = hi; + } + if (lo < j) { + qpos += 1; + qstack[qpos].lo = lo; + qstack[qpos].hi = j; + } + } + } + } +} + +/** + * @brief Sort the particles in the given cell along the given axis. + * + * @param c The #cell. + * @param axis The normalized axis along which to sort. + * @param aid The axis ID at which to store the indices. + */ +void cell_sort(struct cell *c, const float *axis, int aid) { + + /* Has the indices array even been allocated? */ + if (c->indices == NULL) { + if ((c->indices = (struct index *)malloc((c->count + 1) * 13 * + sizeof(struct index))) == + NULL) + error("Failed to allocate cell sorting indices."); + } else if (c->sorted & (1 << aid)) { + return; + } + + /* If this cell has been split, merge from the progeny. */ + if (c->split) { + + /* Heap of pointers to the progeny's indices. */ + struct { + struct index *index; + int offset; + } temp, pindex[8]; + int k = 0; + + /* First, make sure all the progeny have been sorted, and get the pointers + to their first entries. */ + for (struct cell *cp = c->firstchild; cp != c->sibling; cp = cp->sibling) { + if (!(cp->sorted & (1 << aid))) cell_sort(cp, axis, aid); + pindex[k].index = &cp->indices[(cp->count + 1) * aid]; + pindex[k].offset = cp->parts - c->parts; + k++; + } + + /* Fill any remaining gaps with the sentinel. */ + c->indices[(c->count + 1) * aid + c->count].d = FLT_MAX; + for (; k < 8; k++) + pindex[k].index = &c->indices[(c->count + 1) * aid + c->count]; + + /* Heapify the pindices. */ + for (k = 3; k >= 0; k--) { + int j = k; + while (j < 4) { + int jj = 2 * j + 1; + if (jj + 1 < 8 && pindex[jj + 1].index->d < pindex[jj].index->d) + jj = jj + 1; + if (pindex[jj].index->d < pindex[j].index->d) { + temp = pindex[jj]; + pindex[jj] = pindex[j]; + pindex[j] = temp; + j = jj; + } else + break; + } + } + + /* Copy the indices into the local index in increasing order. */ + struct index *dest = &c->indices[(c->count + 1) * aid]; + for (int k = 0; k < c->count; k++) { + + /* Copy the top of the heap to the destination. */ + *dest = *pindex[0].index; + dest->ind += pindex[0].offset; + + /* Increase the pointers. */ + dest++; + pindex[0].index++; + + /* Fix the heap. */ + int j = 0; + while (j < 4) { + int jj = 2 * j + 1; + if (jj + 1 < 8 && pindex[jj + 1].index->d < pindex[jj].index->d) + jj = jj + 1; + if (pindex[jj].index->d < pindex[j].index->d) { + temp = pindex[jj]; + pindex[jj] = pindex[j]; + pindex[j] = temp; + j = jj; + } else + break; + } + } + + /* Add a sentinel at the end. */ + dest->ind = -1; + dest->d = FLT_MAX; + + /* Otherwise, just sort the entries. */ + } else { + + /* Fill the indices. */ + struct index *dest = &c->indices[(c->count + 1) * aid]; + for (int k = 0; k < c->count; k++) { + dest[k].ind = k; + dest[k].d = c->parts[k].x[0] * axis[0] + c->parts[k].x[1] * axis[1] + + c->parts[k].x[2] * axis[2]; + } + + /* Sort the indices. */ + indices_sort(&c->indices[(c->count + 1) * aid], c->count); + + /* Set the sentinel on the last entry. */ + dest[c->count].ind = -1; + dest[c->count].d = FLT_MAX; + } + + /* Mark this cell as sorted in the given direction. */ + c->sorted |= (1 << aid); + + /* Verify the sort. */ + /* int offset = (c->count + 1) * aid; + for (int k = 0; k < c->count; k++) + if (c->indices[offset + k].d > c->indices[offset + k + 1].d) + error( "Sorting failed." ); */ +} /* */ + +/** + * @brief Get all the data needed for computing a sorted pair. + * + * @param ci Pointer to a pointer to the first cell. + * @param cj Pointer to a pointer to the second cell. + * @param ind_i Sorted indices of the cell @c ci. + * @param ind_j Sorted indices of the cell @c cj. + */ +void get_axis(struct cell **ci, struct cell **cj, struct index **ind_i, + struct index **ind_j) { + + float axis[3]; + float dx[3]; + int aid = 0; + + /* Get the cell pair separation and the axis index. */ + for (int k = 0; k < 3; k++) { + dx[k] = (*cj)->loc[k] - (*ci)->loc[k]; + aid = 3 * aid + ((dx[k] < 0) ? 0 : (dx[k] > 0) ? 2 : 1); + } + + /* Flip the cells? */ + if (axis_flip[aid]) { + struct cell *temp = *ci; + *ci = *cj; + *cj = temp; + } + aid = axis_sid[aid]; + + /* Copy the shift vector to the output parameter. */ + axis[0] = axis_shift[aid * 3 + 0]; + axis[1] = axis_shift[aid * 3 + 1]; + axis[2] = axis_shift[aid * 3 + 2]; + + /* Make sure the cells are sorted. */ + if (!((*ci)->sorted & (1 << aid))) cell_sort(*ci, axis, aid); + if (!((*cj)->sorted & (1 << aid))) cell_sort(*cj, axis, aid); + + /* Set the indices. */ + *ind_i = &(*ci)->indices[aid * ((*ci)->count + 1)]; + *ind_j = &(*cj)->indices[aid * ((*cj)->count + 1)]; + + + /* Make sure the sorts are ok. */ + /* for (int k = 1; k < (*ci)->count; k++) + if ((*ind_i)[k].d < (*ind_i)[k-1].d) + error("Sorting failed."); + for (int k = 1; k < (*cj)->count; k++) + if ((*ind_j)[k].d < (*ind_j)[k-1].d) + error("Sorting failed."); */ +} + +/** + * @brief Get a #cell from the pool. + */ +struct cell *cell_get() { + + struct cell *res; + int k; + + /* Allocate a new batch? */ + if (cell_pool == NULL) { + + /* Allocate the cell array. */ + if ((cell_pool = + (struct cell *)calloc(cell_pool_grow, sizeof(struct cell))) == + NULL) + error("Failed to allocate fresh batch of cells."); + + /* Link them up via their progeny pointers. */ + for (k = 1; k < cell_pool_grow; k++) + cell_pool[k - 1].firstchild = &cell_pool[k]; + } + + /* Pick a cell off the pool. */ + res = cell_pool; + cell_pool = cell_pool->firstchild; + + /* Clean up a few things. */ + res->res = qsched_res_none; + res->firstchild = 0; + + /* Return the cell. */ + return res; +} + +/** + * @brief Sort the parts into eight bins along the given pivots and + * fill the multipoles. Also adds the hierarchical resources + * to the sched. + * + * @param c The #cell to be split. + * @param N The total number of parts. + * @param s The #sched to store the resources. + */ +void cell_split(struct cell *c, struct qsched *s) { + + int i, j, k, kk, count = c->count; + struct part temp, *parts = c->parts; + struct cell *cp; + int left[8], right[8]; + double pivot[3]; + static struct cell *root = NULL; + struct cell *progenitors[8]; + + /* Set the root cell. */ + if (root == NULL) { + root = c; + c->sibling = 0; + } + + /* Add a resource for this cell if it doesn't have one yet. */ + if (c->res == qsched_res_none) + c->res = qsched_addres(s, qsched_owner_none, qsched_res_none); + +/* Attach a center-of-mass task to the cell. */ +#ifdef COM_AS_TASK + if (count > 0) + c->com_tid = qsched_addtask(s, task_type_com, task_flag_none, &c, + sizeof(struct cell *), 1); +#endif + + /* Does this cell need to be split? */ + if (count > cell_maxparts) { + + /* Mark this cell as split. */ + c->split = 1; + + /* Create the progeny. */ + for (k = 0; k < 8; k++) { + progenitors[k] = cp = cell_get(); + cp->loc[0] = c->loc[0]; + cp->loc[1] = c->loc[1]; + cp->loc[2] = c->loc[2]; + cp->h = c->h * 0.5; + cp->res = qsched_addres(s, qsched_owner_none, c->res); + if (k & 4) cp->loc[0] += cp->h; + if (k & 2) cp->loc[1] += cp->h; + if (k & 1) cp->loc[2] += cp->h; + } + + /* Init the pivots. */ + for (k = 0; k < 3; k++) pivot[k] = c->loc[k] + c->h * 0.5; + + /* Split along the x-axis. */ + i = 0; + j = count - 1; + while (i <= j) { + while (i <= count - 1 && parts[i].x[0] < pivot[0]) i += 1; + while (j >= 0 && parts[j].x[0] >= pivot[0]) j -= 1; + if (i < j) { + temp = parts[i]; + parts[i] = parts[j]; + parts[j] = temp; + } + } + left[1] = i; + right[1] = count - 1; + left[0] = 0; + right[0] = j; + + /* Split along the y axis, twice. */ + for (k = 1; k >= 0; k--) { + i = left[k]; + j = right[k]; + while (i <= j) { + while (i <= right[k] && parts[i].x[1] < pivot[1]) i += 1; + while (j >= left[k] && parts[j].x[1] >= pivot[1]) j -= 1; + if (i < j) { + temp = parts[i]; + parts[i] = parts[j]; + parts[j] = temp; + } + } + left[2 * k + 1] = i; + right[2 * k + 1] = right[k]; + left[2 * k] = left[k]; + right[2 * k] = j; + } + + /* Split along the z axis, four times. */ + for (k = 3; k >= 0; k--) { + i = left[k]; + j = right[k]; + while (i <= j) { + while (i <= right[k] && parts[i].x[2] < pivot[2]) i += 1; + while (j >= left[k] && parts[j].x[2] >= pivot[2]) j -= 1; + if (i < j) { + temp = parts[i]; + parts[i] = parts[j]; + parts[j] = temp; + } + } + left[2 * k + 1] = i; + right[2 * k + 1] = right[k]; + left[2 * k] = left[k]; + right[2 * k] = j; + } + + /* Store the counts and offsets. */ + for (k = 0; k < 8; k++) { + progenitors[k]->count = right[k] - left[k] + 1; + progenitors[k]->parts = &c->parts[left[k]]; + } + + /* Find the first non-empty progenitor */ + for (k = 0; k < 8; k++) + if (progenitors[k]->count > 0) { + c->firstchild = progenitors[k]; + break; + } + +#ifdef SANITY_CHECKS + if (c->firstchild == NULL) + error("Cell has been split but all progenitors have 0 particles"); +#endif + + /* Prepare the pointers. */ + for (k = 0; k < 8; k++) { + + /* Find the next non-empty sibling */ + for (kk = k + 1; kk < 8; ++kk) { + if (progenitors[kk]->count > 0) { + progenitors[k]->sibling = progenitors[kk]; + break; + } + } + + /* No non-empty sibling ? Go back a level */ + if (kk == 8) progenitors[k]->sibling = c->sibling; + } + + /* Recurse. */ + for (k = 0; k < 8; k++) + if (progenitors[k]->count > 0) cell_split(progenitors[k], s); + +/* Link the COM tasks. */ +#ifdef COM_AS_TASK + for (k = 0; k < 8; k++) + if (progenitors[k]->count > 0) + qsched_addunlock(s, progenitors[k]->com_tid, c->com_tid); +#endif + + /* Otherwise, we're at a leaf, so create the cell's particle-cell task. */ + } else { + struct cell *data[2] = {root, c}; + int tid = qsched_addtask(s, task_type_self_pc, task_flag_none, data, + 2 * sizeof(struct cell *), 1); + qsched_addlock(s, tid, c->res); +#ifdef COM_AS_TASK + qsched_addunlock(s, root->com_tid, tid); +#endif + } /* does the cell need to be split? */ + +/* Compute the cell's center of mass. */ +#ifndef COM_AS_TASK + comp_com(c); +#endif + + /* Set this cell's resources ownership. */ + qsched_res_own(s, c->res, + s->nr_queues * (c->parts - root->parts) / root->count); +} + +/* -------------------------------------------------------------------------- */ +/* New tree walk */ +/* -------------------------------------------------------------------------- */ + +/** + * @brief Compute the center of mass of a given cell. + * + * @param c The #cell. + */ +void comp_com(struct cell *c) { + + int k, count = c->count; + struct cell *cp; + struct part *parts = c->parts; + double com[3] = {0.0, 0.0, 0.0}, mass = 0.0; + + if (c->split) { + + /* Loop over the projenitors and collect the multipole information. */ + for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) { + float cp_mass = cp->new.mass; + com[0] += cp->new.com[0] * cp_mass; + com[1] += cp->new.com[1] * cp_mass; + com[2] += cp->new.com[2] * cp_mass; + mass += cp_mass; + } + + /* Otherwise, collect the multipole from the particles. */ + } else { + + for (k = 0; k < count; k++) { + float p_mass = parts[k].mass; + com[0] += parts[k].x[0] * p_mass; + com[1] += parts[k].x[1] * p_mass; + com[2] += parts[k].x[2] * p_mass; + mass += p_mass; + } + } + + /* Store the COM data, if any was collected. */ + if (mass > 0.0) { + float imass = 1.0f / mass; + c->new.com[0] = com[0] * imass; + c->new.com[1] = com[1] * imass; + c->new.com[2] = com[2] * imass; + c->new.mass = mass; + } else { + c->new.com[0] = 0.0; + c->new.com[1] = 0.0; + c->new.com[2] = 0.0; + c->new.mass = 0.0; + } +} + +/** + * @brief Interacts all particles in ci with the monopole in cj + */ +static inline void make_interact_pc(struct cell *leaf, struct cell *cj) { + + int j, k; + double com[3] = {0.0, 0.0, 0.0}; + float mcom, dx[3] = {0.0, 0.0, 0.0}, r2, ir, w; + +#ifdef SANITY_CHECKS + + /* Sanity checks */ + if (leaf->count == 0) error("Empty cell!"); + + /* Sanity check. */ + if (cj->new.mass == 0.0) { + message("%e %e %e %d %p %d %p", cj->new.com[0], cj->new.com[1], + cj->new.com[2], cj->count, cj, cj->split, cj->sibling); + + for (j = 0; j < cj->count; ++j) + message("part %d mass=%e id=%d", j, cj->parts[j].mass, cj->parts[j].id); + + error("com does not seem to have been set."); + } + +#endif + + /* Init the com's data. */ + for (k = 0; k < 3; k++) com[k] = cj->new.com[k]; + mcom = cj->new.mass; + + /* Loop over every particle in leaf. */ + for (j = 0; j < leaf->count; j++) { + + /* Compute the pairwise distance. */ + for (r2 = 0.0, k = 0; k < 3; k++) { + dx[k] = com[k] - leaf->parts[j].x[k]; + r2 += dx[k] * dx[k]; + } + + /* Apply the gravitational acceleration. */ + ir = 1.0f / sqrtf(r2); + w = mcom * const_G * ir * ir * ir; + for (k = 0; k < 3; k++) leaf->parts[j].a[k] += w * dx[k]; + +#if ICHECK >= 0 + if (leaf->parts[j].id == ICHECK) + printf("[DEBUG] cell [%f,%f,%f] interacting with cj->loc=[%f,%f,%f] " + "m=%f h=%f\n", + leaf->loc[0], leaf->loc[1], leaf->loc[2], cj->loc[0], cj->loc[1], + cj->loc[2], mcom, cj->h); + + if (leaf->parts[j].id == ICHECK) + printf("[NEW] Interaction with monopole a=( %e %e %e ) h= %f Nj= %d m_j= " + "%f\n", + w * dx[0], w * dx[1], w * dx[2], cj->h, cj->count, mcom); +#endif + + } /* loop over every particle. */ +} + +/** + * @brief Checks whether the cell leaf is a subvolume of the cell c + */ +static inline int is_inside(struct cell *leaf, struct cell *c) { + return (leaf->parts >= c->parts) && (leaf->parts < c->parts + c->count); +} + +/** + * @brief Checks whether the cells are direct neighbours ot not + */ +static inline int are_neighbours_different_size(struct cell *ci, + struct cell *cj) { + + int k; + float dx[3]; + double cih = ci->h, cjh = cj->h; + + /* Maximum allowed distance */ + float min_dist = 0.5 * (cih + cjh); + + /* (Manhattan) Distance between the cells */ + for (k = 0; k < 3; k++) { + float center_i = ci->loc[k] + 0.5 * cih; + float center_j = cj->loc[k] + 0.5 * cjh; + dx[k] = fabs(center_i - center_j); + } + + return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); +} + +/** + * @brief Checks whether the cells are direct neighbours ot not. Both cells have + * to be of the same size + */ +static inline int are_neighbours(struct cell *ci, struct cell *cj) { + + int k; + float dx[3]; + +#ifdef SANITY_CHECKS + if (ci->h != cj->h) + error(" Cells of different size in distance calculation."); +#endif + + /* Maximum allowed distance */ + float min_dist = ci->h; + + /* (Manhattan) Distance between the cells */ + for (k = 0; k < 3; k++) { + float center_i = ci->loc[k]; + float center_j = cj->loc[k]; + dx[k] = fabs(center_i - center_j); + } + + return (dx[0] <= min_dist) && (dx[1] <= min_dist) && (dx[2] <= min_dist); +} + +/** + * @brief Compute the interactions between all particles in a cell leaf + * and the center of mass of all the cells in a part of the tree +* described by ci and cj + * + * @param ci The #cell containing the particle + * @param cj The #cell containing the center of mass. + */ +static inline void iact_pair_pc(struct cell *ci, struct cell *cj, + struct cell *leaf) { + + struct cell *cp, *cps; + +#ifdef SANITY_CHECKS + + /* Early abort? */ + if (ci->count == 0 || cj->count == 0) error("Empty cell !"); + + /* Sanity check */ + if (ci == cj) + error("The impossible has happened: pair interaction between a cell and " + "itself."); + + /* Sanity check */ + if (!is_inside(leaf, ci)) + error("The impossible has happened: The leaf is not within ci"); + + /* Are the cells direct neighbours? */ + if (!are_neighbours(ci, cj)) error("Cells are not neighours"); + + /* Are both cells split ? */ + if (!ci->split || !cj->split) error("One of the cells is not split !"); +#endif + + /* Let's find in which subcell of ci the leaf is */ + for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) { + + if (is_inside(leaf, cp)) break; + } + + if (are_neighbours_different_size(cp, cj)) { + + /* Now interact this subcell with all subcells of cj */ + for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) { + + /* Check whether we have to recurse or can directly jump to the multipole + * calculation */ + if (are_neighbours(cp, cps)) { + + /* We only recurse if the children are split */ + if (cp->split && cps->split) { + iact_pair_pc(cp, cps, leaf); + } + + } else { + make_interact_pc(leaf, cps); + } + } + } else { + + /* If cp is not a neoghbour of cj, we can directly interact with the + * multipoles */ + for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) { + + make_interact_pc(leaf, cps); + } + } +} + +/** + * @brief Compute the interactions between all particles in a leaf and + * and all the monopoles in the cell c + * + * @param c The #cell containing the monopoles + * @param leaf The #cell containing the particles + */ +static inline void iact_self_pc(struct cell *c, struct cell *leaf) { + + struct cell *cp, *cps; + +#ifdef SANITY_CHECKS + + /* Early abort? */ + if (c->count == 0) error("Empty cell !"); + + if (!c->split) error("Cell is not split !"); + +#endif + + /* Find in which subcell of c the leaf is */ + for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) { + + /* Only recurse if the leaf is in this part of the tree */ + if (is_inside(leaf, cp)) break; + } + + if (cp->split) { + + /* Recurse if the cell can be split */ + iact_self_pc(cp, leaf); + + /* Now, interact with every other subcell */ + for (cps = c->firstchild; cps != c->sibling; cps = cps->sibling) { + + /* Since cp and cps will be direct neighbours it is only worth recursing + */ + /* if the cells can both be split */ + if (cp != cps && cps->split) iact_pair_pc(cp, cps, leaf); + } + } +} + +/** + * @brief Starts the recursive tree walk of a given leaf + */ +void init_multipole_walk(struct cell *root, struct cell *leaf) { + + /* Pre-fetch the leaf's particles */ + __builtin_prefetch(leaf->parts, 1, 3); + + /* Start the recursion */ + iact_self_pc(root, leaf); +} + +/** + * @brief Compute the interactions between all particles in a cell + * and all particles in the other cell (N^2 algorithm) + * + * @param ci The #cell containing the particles. + * @param cj The #cell containing the other particles + */ +static inline void iact_pair_direct_unsorted(struct cell *ci, struct cell *cj) { + + int i, j, k; + int count_i = ci->count, count_j = cj->count; + struct part *parts_i = ci->parts, *parts_j = cj->parts; + double xi[3]; + float dx[3], ai[3], mi, mj, r2, w, ir; + +#ifdef SANITY_CHECKS + + /* Bad stuff will happen if cell sizes are different */ + if (ci->h != cj->h) + error("Non matching cell sizes !! h_i=%f h_j=%f\n", ci->h, cj->h); + +#endif + + /* Loop over all particles in ci... */ + for (i = 0; i < count_i; i++) { + + /* Init the ith particle's data. */ + for (k = 0; k < 3; k++) { + xi[k] = parts_i[i].x[k]; + ai[k] = 0.0f; + } + mi = parts_i[i].mass; + + /* Loop over every particle in the other cell. */ + for (j = 0; j < count_j; j++) { + + /* Compute the pairwise distance. */ + for (r2 = 0.0, k = 0; k < 3; k++) { + dx[k] = xi[k] - parts_j[j].x[k]; + r2 += dx[k] * dx[k]; + } + + /* Apply the gravitational acceleration. */ + ir = 1.0f / sqrtf(r2); + w = const_G * ir * ir * ir; + mj = parts_j[j].mass; + for (k = 0; k < 3; k++) { + float wdx = w * dx[k]; + parts_j[j].a[k] += wdx * mi; + ai[k] -= wdx * mj; + } + +#ifdef COUNTERS + ++count_direct_unsorted; +#endif + +#if ICHECK >= 0 && 0 + if (parts_i[i].id == ICHECK) + printf( + "[NEW] Interaction with particle id= %d (pair i) a=( %e %e %e )\n", + parts_j[j].id, -mi * w * dx[0], -mi * w * dx[1], -mi * w * dx[2]); + + if (parts_j[j].id == ICHECK) + printf( + "[NEW] Interaction with particle id= %d (pair j) a=( %e %e %e )\n", + parts_i[i].id, mj * w * dx[0], mj * w * dx[1], mj * w * dx[2]); +#endif + + } /* loop over every other particle. */ + + /* Store the accumulated acceleration on the ith part. */ + for (k = 0; k < 3; k++) parts_i[i].a[k] += ai[k]; + + } /* loop over all particles. */ +} + + + + + + +static inline void iact_pair_direct_sorted_multipole(struct cell *ci, + struct cell *cj) { + + struct part_local { + float x[3]; + float a[3]; + float mass, d; + }; + + int i, j, k; + int count_i, count_j; + struct part_local *parts_i, *parts_j; + float cjh = cj->h; + float xi[3]; + float dx[3], ai[3], mi, mj, r2, w, ir; + +#ifdef SANITY_CHECKS + + /* Bad stuff will happen if cell sizes are different */ + if (ci->h != cj->h) + error("Non matching cell sizes !! h_i=%f h_j=%f\n", ci->h, cj->h); + +#endif + + /* Get the sorted indices and stuff. */ + struct index *ind_i, *ind_j; + struct multipole multi; + get_axis(&ci, &cj, &ind_i, &ind_j); + multipole_reset(&multi); + cjh = cj->h; + + /* Allocate and fill-in the local parts. */ + count_i = ci->count; + count_j = cj->count; + if ((parts_i = + (struct part_local *)alloca(sizeof(struct part_local) *count_i)) == + NULL || + (parts_j = + (struct part_local *)alloca(sizeof(struct part_local) * count_j)) == + NULL) + error("Failed to allocate local part arrays."); + for (i = 0; i < count_i; i++) { + int pid = ind_i[i].ind; + parts_i[i].d = ind_i[i].d; + for (k = 0; k < 3; k++) { + parts_i[i].x[k] = ci->parts[pid].x[k] - cj->loc[k]; + parts_i[i].a[k] = 0.0f; + } + parts_i[i].mass = ci->parts[pid].mass; + } + for (j = 0; j < count_j; j++) { + int pjd = ind_j[j].ind; + parts_j[j].d = ind_j[j].d; + for (k = 0; k < 3; k++) { + parts_j[j].x[k] = cj->parts[pjd].x[k] - cj->loc[k]; + parts_j[j].a[k] = 0.0f; + } + parts_j[j].mass = cj->parts[pjd].mass; + } + +#if ICHECK >= 0 + for (k = 0; k < count_i; k++) + if (parts_i[k].id == ICHECK) + message("[DEBUG] interacting cells loc=[%f,%f,%f], h=%f and " + "loc=[%f,%f,%f], h=%f.", + ci->loc[0], ci->loc[1], ci->loc[2], ci->h, cj->loc[0], cj->loc[1], + cj->loc[2], cj->h); + for (k = 0; k < count_j; k++) + if (parts_j[k].id == ICHECK) + message("[DEBUG] interacting cells loc=[%f,%f,%f], h=%f and " + "loc=[%f,%f,%f], h=%f.", + cj->loc[0], cj->loc[1], cj->loc[2], cj->h, ci->loc[0], ci->loc[1], + ci->loc[2], ci->h); +#endif + + /* Distance along the axis as of which we will use a multipole. */ + float d_max = dist_cutoff_ratio * cjh; + + /* Loop over all particles in ci... */ + for (i = count_i - 1; i >= 0; i--) { + + /* Get a local copy of the distance along the axis. */ + float di = parts_i[i].d; + + /* Init the ith particle's data. */ + for (k = 0; k < 3; k++) { + xi[k] = parts_i[i].x[k]; + ai[k] = 0.0; + } + mi = parts_i[i].mass; + + /* Loop over every following particle within d_max along the axis. */ + for (j = 0; j < count_j && (parts_j[j].d - di) < d_max; j++) { + + /* Compute the pairwise distance. */ + for (r2 = 0.0, k = 0; k < 3; k++) { + dx[k] = xi[k] - parts_j[j].x[k]; + r2 += dx[k] * dx[k]; + } + + /* Apply the gravitational acceleration. */ + ir = 1.0f / sqrtf(r2); + w = const_G * ir * ir * ir; + mj = parts_j[j].mass; + for (k = 0; k < 3; k++) { + float wdx = w * dx[k]; + parts_j[j].a[k] += wdx * mi; + ai[k] -= wdx * mj; + } + +#if ICHECK >= 0 + if (parts_i[i].id == ICHECK) + message("Interaction with part %d - d= %f", parts_j[j].id, sqrt(r2)); +#endif + +#ifdef COUNTERS + ++count_direct_sorted_pp; +#endif + +#if ICHECK >= 0 && 0 + if (parts_i[i].id == ICHECK) + printf("[NEW] Interaction with particle id= %d (pair i)\n", + parts_j[pjd].id); + + if (parts_j[j].id == ICHECK) + printf("[NEW] Interaction with particle id= %d (pair j) h_i= %f h_j= " + "%f ci= %p cj= %p count_i= %d count_j= %d d_i= %d d_j= %d\n", + parts_i[pid].id, ci->h, cj->h, ci, cj, count_i, count_j, ci->res, + cj->res); +#endif + + } /* loop over every other particle. */ + + /* Add any remaining particles to the COM. */ + for (int jj = j; jj < count_j; jj++) { + multipole_add(&multi, parts_j[jj].x, parts_j[jj].mass); + } + + /* Shrink count_j to the latest valid particle. */ + count_j = j; + + /* Interact part_i with the center of mass. */ + multipole_iact(&multi, xi, ai); + +#if ICHECK >= 0 + if (parts_i[i].id == ICHECK) + message("Interaction with multipole"); //, parts_j[j].id ); +#endif + +#ifdef COUNTERS + ++count_direct_sorted_pm_i; +#endif + + + /* Store the accumulated acceleration on the ith part. */ + for (k = 0; k < 3; k++) parts_i[i].a[k] += ai[k]; + + } /* loop over all particles in ci. */ + + + + /* Re-init some values. */ + count_j = cj->count; + int last_i = 0; + multipole_reset(&multi); + + /* Loop over the particles in cj, catch the COM interactions. */ + for (j = 0; j < count_j; j++) { + + /* Get the sorted index. */ + float dj = parts_j[j].d; + + /* Fill the COM with any new particles. */ + for (i = last_i; i < count_i && (dj - parts_i[i].d) > d_max; i++) { + multipole_add(&multi, parts_i[i].x, parts_i[i].mass); + } + + /* Set the new last_i to the last particle checked. */ + last_i = i; + + /* Interact part_j with the COM. */ + multipole_iact(&multi, parts_j[j].x, parts_j[j].a); + +#ifdef COUNTERS + ++count_direct_sorted_pm_j; +#endif + + } + + /* Copy the accelerations back to the original particles. */ + for (i = 0; i < count_i; i++) { + int pid = ind_i[i].ind; + for (k = 0; k < 3; k++) ci->parts[pid].a[k] += parts_i[i].a[k]; + } + for (j = 0; j < count_j; j++) { + int pjd = ind_j[j].ind; + for (k = 0; k < 3; k++) cj->parts[pjd].a[k] += parts_j[j].a[k]; + } +} + + + + + +/** + * @brief Decides whether two cells use the direct summation interaction or the +* multipole interactions + * + * @param ci The #cell. + * @param cj The other #cell. + */ +void iact_pair(struct cell *ci, struct cell *cj) { + + struct cell *cp, *cps; + +#ifdef SANITY_CHECKS + + /* Early abort? */ + if (ci->count == 0 || cj->count == 0) error("Empty cell !"); + + /* Bad stuff will happen if cell sizes are different */ + if (ci->h != cj->h) + error("Non matching cell sizes !! h_i=%f h_j=%f\n", ci->h, cj->h); + + /* Sanity check */ + if (ci == cj) + error("The impossible has happened: pair interaction between a cell and " + "itself."); + +#endif + + /* Are the cells direct neighbours? */ + if (are_neighbours(ci, cj)) { + + /* Are both cells split ? */ + if (ci->split && cj->split) { + + /* Let's split both cells and build all possible pairs */ + for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) { + for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) { + + /* If the cells are neighbours, recurse. */ + if (are_neighbours(cp, cps)) { + iact_pair(cp, cps); + } + } + } + } else {/* Otherwise, compute the interactions at this level directly. */ + iact_pair_direct(ci, cj); + } + } +} + +/** + * @brief Compute the interactions between all particles in a cell. + * + * @param c The #cell. + */ +void iact_self_direct(struct cell *c) { + int i, j, k, count = c->count; + double xi[3] = {0.0, 0.0, 0.0}; + float ai[3] = {0.0, 0.0, 0.0}, mi, mj, dx[3] = {0.0, 0.0, 0.0}, r2, ir, w; + struct part *parts = c->parts; + struct cell *cp, *cps; + +#ifdef SANITY_CHECKS + + /* Early abort? */ + if (count == 0) error("Empty cell !"); + +#endif + + /* If the cell is split, interact each progeny with itself, and with + each of its siblings. */ + if (c->split) { + for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) { + iact_self_direct(cp); + for (cps = cp->sibling; cps != c->sibling; cps = cps->sibling) + iact_pair(cp, cps); + } + + /* Otherwise, compute the interactions directly. */ + } else { + + /* Loop over all particles... */ + for (i = 0; i < count; i++) { + + /* Init the ith particle's data. */ + for (k = 0; k < 3; k++) { + xi[k] = parts[i].x[k]; + ai[k] = 0.0; + } + mi = parts[i].mass; + + /* Loop over every following particle. */ + for (j = i + 1; j < count; j++) { + + /* Compute the pairwise distance. */ + for (r2 = 0.0, k = 0; k < 3; k++) { + dx[k] = xi[k] - parts[j].x[k]; + r2 += dx[k] * dx[k]; + } + + /* Apply the gravitational acceleration. */ + ir = 1.0f / sqrtf(r2); + w = const_G * ir * ir * ir; + mj = parts[j].mass; + for (k = 0; k < 3; k++) { + float wdx = w * dx[k]; + parts[j].a[k] += wdx * mi; + ai[k] -= wdx * mj; + } + +#if ICHECK >= 0 + if (parts[i].id == ICHECK) + message("[NEW] Interaction with particle id= %d (self i)", + parts[j].id); + + if (parts[j].id == ICHECK) + message("[NEW] Interaction with particle id= %d (self j)", + parts[i].id); +#endif + + } /* loop over every other particle. */ + + /* Store the accumulated acceleration on the ith part. */ + for (k = 0; k < 3; k++) parts[i].a[k] += ai[k]; + + } /* loop over all particles. */ + + } /* otherwise, compute interactions directly. */ +} + +/** + * @brief Create the tasks for the cell pair/self. + * + * @param s The #sched in which to create the tasks. + * @param ci The first #cell. + * @param cj The second #cell. + */ +void create_tasks(struct qsched *s, struct cell *ci, struct cell *cj) { + + qsched_task_t tid; + struct cell *data[2], *cp, *cps; + +#ifdef SANITY_CHECKS + + /* If either cell is empty, stop. */ + if (ci->count == 0 || (cj != NULL && cj->count == 0)) error("Empty cell !"); + +#endif + + /* Single cell? */ + if (cj == NULL) { + + /* Is this cell split and above the task limit ? */ + if (ci->split && ci->count > task_limit / ci->count) { + + /* Loop over each of this cell's progeny. */ + for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) { + + /* Make self-interaction task. */ + create_tasks(s, cp, NULL); + + /* Make all pair-interaction tasks. */ + for (cps = cp->sibling; cps != ci->sibling; cps = cps->sibling) + create_tasks(s, cp, cps); + } + + /* Otherwise, add a self-interaction task. */ + } else { + + /* Set the data. */ + data[0] = ci; + data[1] = NULL; + + /* Create the task. */ + tid = + qsched_addtask(s, task_type_self, task_flag_none, data, + sizeof(struct cell *) * 2, ci->count * ci->count / 2); + + /* Add the resource (i.e. the cell) to the new task. */ + qsched_addlock(s, tid, ci->res); + +/* If this call might recurse, add a dependency on the cell's COM + task. */ +#ifdef COM_AS_TASK + if (ci->split) qsched_addunlock(s, ci->com_tid, tid); +#endif + } + + /* Otherwise, it's a pair. */ + } else { + + /* Are the cells NOT neighbours ? */ + if (!are_neighbours(ci, cj)) { + + } else {/* Cells are direct neighbours */ + + /* Are both cells split ? */ + if (ci->split && cj->split && ci->count > task_limit / cj->count) { + + /* Let's split both cells and build all possible pairs */ + for (cp = ci->firstchild; cp != ci->sibling; cp = cp->sibling) { + for (cps = cj->firstchild; cps != cj->sibling; cps = cps->sibling) { + /* Recurse */ + create_tasks(s, cp, cps); + } + } + /* Otherwise, at least one of the cells is not split, build a direct + * interaction. */ + } else { + + /* Set the data. */ + data[0] = ci; + data[1] = cj; + + /* Create the task. */ + tid = qsched_addtask(s, task_type_pair, task_flag_none, data, + sizeof(struct cell *) * 2, ci->count * cj->count); + + /* Add the resources. */ + qsched_addlock(s, tid, ci->res); + qsched_addlock(s, tid, cj->res); + } + } /* Cells are direct neighbours */ + } /* Otherwise it's a pair */ +} + +/* -------------------------------------------------------------------------- */ +/* Legacy tree walk */ +/* -------------------------------------------------------------------------- */ + +/** + * @brief Compute the center of mass of a given cell recursively. + * + * @param c The #cell. + */ +void legacy_comp_com(struct cell *c, int *countCoMs) { + + int k, count = c->count; + struct cell *cp; + struct part *p, *parts = c->parts; + double com[3] = {0.0, 0.0, 0.0}, mass = 0.0; + + ++(*countCoMs); + + /* Is the cell split? */ + if (c->split) { + + /* Loop over the progeny. */ + for (cp = c->firstchild; cp != c->sibling; cp = cp->sibling) { + /* Recurse */ + legacy_comp_com(cp, countCoMs); + + /* Collect multipole information */ + float cp_mass = cp->legacy.mass; + com[0] += cp->legacy.com[0] * cp_mass; + com[1] += cp->legacy.com[1] * cp_mass; + com[2] += cp->legacy.com[2] * cp_mass; + mass += cp_mass; + } + + /* Otherwise, collect the multipole from local data. */ + } else { + + for (k = 0; k < count; k++) { + p = &parts[k]; + float p_mass = p->mass; + com[0] += p->x[0] * p_mass; + com[1] += p->x[1] * p_mass; + com[2] += p->x[2] * p_mass; + mass += p_mass; + } + } + + /* Finish multipole calculation */ + if (mass > 0.0) { + float imass = 1.0f / mass; + c->legacy.com[0] = com[0] * imass; + c->legacy.com[1] = com[1] * imass; + c->legacy.com[2] = com[2] * imass; + c->legacy.mass = mass; + } else { + c->legacy.com[0] = 0.0; + c->legacy.com[1] = 0.0; + c->legacy.com[2] = 0.0; + c->legacy.mass = 0.0; + } +} + +/** + * @brief Interacts a particle with a cell recursively using the original B-H + * tree walk procedure + * + * @param parts The array of particles + * @param i The particle of interest + * @param root The root of the tree under which we will search. + * @param monitor If set to @c parts[i].id, will produce debug output when + * ICHECK is set. + * @param cell The cell the particle interacts with + */ +void legacy_interact(struct part *parts, int i, struct cell *root, int monitor, + int *countMultipoles, int *countPairs) { + + int j, k; + float r2, dx[3], ir, w; + float a[3] = {0.0, 0.0, 0.0}; + double pix[3] = {parts[i].x[0], parts[i].x[1], parts[i].x[2]}; + int pid = parts[i].id; + struct cell *cell = root; + + /* Traverse the cells of the tree. */ + while (cell != NULL) { + + /* Are we in a leaf ? */ + if (!cell->split) { + + /* Interact the particle with the particles in the leaf */ + for (j = 0; j < cell->count; ++j) { + if (cell->parts[j].id == pid) continue; + + /* Compute the pairwise distance. */ + for (r2 = 0.0, k = 0; k < 3; k++) { + dx[k] = cell->parts[j].x[k] - pix[k]; + r2 += dx[k] * dx[k]; + } + + /* Apply the gravitational acceleration. */ + ir = 1.0f / sqrtf(r2); + w = cell->parts[j].mass * const_G * ir * ir * ir; + for (k = 0; k < 3; k++) a[k] += w * dx[k]; + + (*countPairs)++; + +#if ICHECK >= 0 + if (pid == monitor) + message("[BH_] Interaction with particle id= %d a=( %e %e %e ) h= %f " + "loc=( %e %e %e )\n", + cell->parts[j].id, w * dx[0], w * dx[1], w * dx[2], cell->h, + cell->loc[0], cell->loc[1], cell->loc[2]); +#endif + } + + cell = cell->sibling; + } else { + + /* We are in a node */ + for (r2 = 0.0, k = 0; k < 3; k++) { + dx[k] = cell->legacy.com[k] - pix[k]; + r2 += dx[k] * dx[k]; + } + +#if ICHECK >= 0 + if (pid == monitor) + message("This is a node with %d particles h= %f. r= %f theta= %f", + cell->count, cell->h, sqrt(r2), dist_min); +#endif + + /* Is the cell far enough ? */ + if (dist_min * dist_min * r2 < cell->h * cell->h) { + +#if ICHECK >= 0 + if (pid == monitor) printf("Recursing...\n"); +#endif + cell = cell->firstchild; + continue; + } + +#if ICHECK >= 0 + if (pid == monitor) + message("[BH_] Can interact with the monopole. x= %f %f %f m= %f h= %f", + cell->legacy.com[0], cell->legacy.com[1], cell->legacy.com[2], + cell->legacy.mass, cell->h); +#endif + + /* Apply the gravitational acceleration. */ + ir = 1.0f / sqrtf(r2); + w = cell->legacy.mass * const_G * ir * ir * ir; + for (k = 0; k < 3; k++) a[k] += w * dx[k]; + + (*countMultipoles)++; + + /* Move to the next node */ + cell = cell->sibling; + } + } + + /* Store the locally computed acceleration back to the particle. */ + for (k = 0; k < 3; k++) parts[i].a_legacy[k] += a[k]; +} + +/** + * @brief Does a tree walk as in the B-H original work for all particles + * + * @param N The number of particles + * @param parts The array of particles + * @param root The root cell of the tree + * @param monitor ID of the particle to monitor and output interactions to + * stdout + */ +void legacy_tree_walk(int N, struct part *parts, struct cell *root, int monitor, + int *countMultipoles, int *countPairs, int *countCoMs) { + + int i; + + /* Compute multipoles (recursively) */ + legacy_comp_com(root, countCoMs); + + //#pragma omp parallel for + for (i = 0; i < N; ++i) { + if (parts[i].id == monitor) + message("tree walk for particle %d x= %f %f %f", parts[i].id, + parts[i].x[0], parts[i].x[1], parts[i].x[2]); + + legacy_interact(parts, i, root, monitor, countMultipoles, countPairs); + + if (parts[i].id == monitor) + message("[check] legacy acceleration for particle %d a= %.3e %.3e %.3e", + parts[i].id, parts[i].a_legacy[0], parts[i].a_legacy[1], + parts[i].a_legacy[2]); + } +} + +/* -------------------------------------------------------------------------- */ +/* Exact interaction */ +/* -------------------------------------------------------------------------- */ + +/** + * @brief Solve the particle interactions using the stupid N^2 algorithm + * + * @param N The number of particles + * @param parts The array of particles + */ +void interact_exact(int N, struct part *parts, int monitor) { + + int i, j, k; + float ir, w, r2, dx[3]; + + /* Loop over all particles. */ + for (i = 0; i < N; ++i) { + + /* Some things to store locally. */ + double pix[3] = {parts[i].x[0], parts[i].x[1], parts[i].x[2]}; + float mi = parts[i].mass; + + /* Loop over every other particle. */ + for (j = i + 1; j < N; ++j) { + + /* Compute the pairwise distance. */ + for (r2 = 0.0, k = 0; k < 3; k++) { + dx[k] = parts[j].x[k] - pix[k]; + r2 += dx[k] * dx[k]; + } + + /* Apply the gravitational acceleration. */ + ir = 1.0f / sqrtf(r2); + w = const_G * ir * ir * ir; + + for (k = 0; k < 3; k++) { + float wdx = w * dx[k]; + parts[j].a_exact[k] -= wdx * mi; + parts[i].a_exact[k] += wdx * parts[j].mass; + } + } + } + +#if ICHECK >= 0 + for (i = 0; i < N; ++i) + if (parts[i].id == monitor) + message("[check] exact acceleration for particle %d a= %.3e %.3e %.3e\n", + parts[i].id, parts[i].a_exact[0], parts[i].a_exact[1], + parts[i].a_exact[2]); +#endif +} + +/** + * @brief Set up and run a task-based Barnes-Hutt N-body solver. + * + * @param N The number of random particles to use. + * @param nr_threads Number of threads to use. + * @param runs Number of force evaluations to use as a benchmark. + * @param fileName Input file name. If @c NULL or an empty string, random + * particle positions will be used. + */ +void test_bh(int N, int nr_threads, int runs, char *fileName) { + + int i, k; + struct cell *root; + struct part *parts; + FILE *file; + struct qsched s; + ticks tic, toc_run, tot_setup = 0, tot_run = 0; + int countMultipoles = 0, countPairs = 0, countCoMs = 0; + + /* Runner function. */ + void runner(int type, void * data) { + + ticks tic = getticks(); + + /* Decode the data. */ + struct cell **d = (struct cell **)data; + + /* Decode and execute the task. */ + switch (type) { + case task_type_self: + iact_self_direct(d[0]); + break; + case task_type_pair: + iact_pair(d[0], d[1]); + break; + case task_type_pair_direct: + iact_pair_direct(d[0], d[1]); + break; + case task_type_self_pc: + init_multipole_walk(d[0], d[1]); + break; + case task_type_com: + comp_com(d[0]); + break; + default: + error("Unknown task type."); + } + + atomic_add(&task_timers[type], getticks() - tic); + } + + /* Initialize the per-task type timers. */ + for (k = 0; k < task_type_count; k++) task_timers[k] = 0; + + /* Initialize the scheduler. */ + qsched_init(&s, nr_threads, qsched_flag_noreown); + + /* Init and fill the particle array. */ + if ((parts = (struct part *)malloc(sizeof(struct part) * N)) == NULL) + error("Failed to allocate particle buffer."); + + /* If no input file was specified, generate random particle positions. */ + if (fileName == NULL || fileName[0] == 0) { + for (k = 0; k < N; k++) { + parts[k].id = k; + parts[k].x[0] = ((double)rand()) / RAND_MAX; + parts[k].x[1] = ((double)rand()) / RAND_MAX; + parts[k].x[2] = ((double)rand()) / RAND_MAX; + parts[k].mass = ((double)rand()) / RAND_MAX; + parts[k].a_legacy[0] = 0.0; + parts[k].a_legacy[1] = 0.0; + parts[k].a_legacy[2] = 0.0; + } + + /* Otherwise, read them from a file. */ + } else { + file = fopen(fileName, "r"); + if (file) { + for (k = 0; k < N; k++) { + if (fscanf(file, "%d", &parts[k].id) != 1) + error("Failed to read ID of part %i.", k); + if (fscanf(file, "%lf%lf%lf", &parts[k].x[0], &parts[k].x[1], + &parts[k].x[2]) != + 3) + error("Failed to read position of part %i.", k); + if (fscanf(file, "%f", &parts[k].mass) != 1) + error("Failed to read mass of part %i.", k); + } + fclose(file); + } + } + + /* Init the cells. */ + root = cell_get(); + root->loc[0] = 0.0; + root->loc[1] = 0.0; + root->loc[2] = 0.0; + root->h = 1.0; + root->count = N; + root->parts = parts; + cell_split(root, &s); + + /* Iterate over the cells and get the average number of particles + per leaf. */ + struct cell *c = root; + int nr_leaves = 0; + while (c != NULL) { + if (!c->split) { + nr_leaves++; + c = c->sibling; + } else { + c = c->firstchild; + } + } + message("Average number of parts per leaf is %f.", ((double)N) / nr_leaves); + +#if ICHECK > 0 + printf("----------------------------------------------------------\n"); + + /* Do a N^2 interactions calculation */ + + ticks tic_exact = getticks(); + interact_exact(N, parts, ICHECK); + ticks toc_exact = getticks(); + + printf("Exact calculation (1 thread) took %lli (= %e) ticks\n", + toc_exact - tic_exact, (float)(toc_exact - tic_exact)); + + printf("----------------------------------------------------------\n"); +#endif + + /* Create the tasks. */ + tic = getticks(); + create_tasks(&s, root, NULL); + tot_setup += getticks() - tic; + + /* Dump the number of tasks. */ + message("total nr of tasks: %i.", s.count); + message("total nr of deps: %i.", s.count_deps); + message("total nr of res: %i.", s.count_res); + message("total nr of locks: %i.", s.count_locks); + message("total nr of uses: %i.", s.count_uses); + int counts[task_type_count]; + for (k = 0; k < task_type_count; k++) counts[k] = 0; + for (k = 0; k < s.count; k++) counts[s.tasks[k].type] += 1; + + // char buffer[200]; + // sprintf(buffer, "timings_legacy_%d_%d.dat", cell_maxparts, nr_threads); + // FILE *fileTime = fopen(buffer, "w"); + + /* Loop over the number of runs. */ + for (k = 0; k < 0 /* runs */; k++) { + + countMultipoles = 0; + countPairs = 0; + countCoMs = 0; + + /* Execute the legacy walk. */ + tic = getticks(); + legacy_tree_walk(N, parts, root, ICHECK, &countMultipoles, &countPairs, + &countCoMs); + toc_run = getticks(); + + /* Dump some timings. */ + message("%ith legacy run took %lli (= %e) ticks...", k, toc_run - tic, + (float)(toc_run - tic)); + tot_run += toc_run - tic; + // fprintf(fileTime, "%lli %e\n", toc_run - tic, (float)(toc_run - tic)); + } + +// fclose(fileTime); + +#if ICHECK >= 0 + message("[check] accel of part %i is [%.3e,%.3e,%.3e]", ICHECK, + root->parts[ICHECK].a[0], root->parts[ICHECK].a[1], + root->parts[ICHECK].a[2]); +#endif + printf("task counts: [ %8s %8s %8s %8s %8s ]\n", "self", "pair", "m-poles", + "direct", "CoMs"); + printf("task counts: [ %8i %8i %8i %8i %8i ] (legacy).\n", 0, 0, + countMultipoles, countPairs, countCoMs); + printf("task counts: [ "); + for (k = 0; k < task_type_count; k++) printf("%8i ", counts[k]); + printf("] (new).\n"); + + /* Loop over the number of runs. */ + for (k = 0; k < runs; k++) { + + for (i = 0; i < N; ++i) { + parts[i].a[0] = 0.0; + parts[i].a[1] = 0.0; + parts[i].a[2] = 0.0; + } + + struct cell *finger = root; + while (finger != NULL) { + finger->sorted = 0; + if (finger->split) + finger = finger->firstchild; + else + finger = finger->sibling; + } + + /* Execute the tasks. */ + tic = getticks(); + qsched_run(&s, nr_threads, runner); + toc_run = getticks(); + message("%ith run took %lli (= %e) ticks...", k, toc_run - tic, + (float)(toc_run - tic)); + tot_run += toc_run - tic; + } + + message("[check] root mass= %f %f", root->legacy.mass, root->new.mass); + message("[check] root CoMx= %f %f", root->legacy.com[0], root->new.com[0]); + message("[check] root CoMy= %f %f", root->legacy.com[1], root->new.com[1]); + message("[check] root CoMz= %f %f", root->legacy.com[2], root->new.com[2]); +#if ICHECK >= 0 + for (i = 0; i < N; ++i) + if (root->parts[i].id == ICHECK) + message("[check] accel of part %i is [%.3e,%.3e,%.3e]", ICHECK, + root->parts[i].a[0], root->parts[i].a[1], root->parts[i].a[2]); +#endif + + /* Dump the tasks. */ + /* for ( k = 0 ; k < s.count ; k++ ) + printf( " %i %i %lli %lli\n" , s.tasks[k].type , s.tasks[k].qid , + s.tasks[k].tic , s.tasks[k].toc ); */ + + /* Dump the costs. */ + message("costs: setup=%lli ticks, run=%lli ticks.", tot_setup, + tot_run / runs); + + /* Dump the timers. */ + for (k = 0; k < qsched_timer_count; k++) + message("timer %s is %lli ticks.", qsched_timer_names[k], + s.timers[k] / runs); + + /* Dump the per-task type timers. */ + printf("task timers: [ "); + for (k = 0; k < task_type_count; k++) printf("%lli ", task_timers[k] / runs); + printf("] ticks.\n"); + + /* Dump the particles to a file */ + file = fopen("particle_dump.dat", "w"); + fprintf(file, + "# ID m x y z a_exact.x a_exact.y a_exact.z a_legacy.x " + "a_legacy.y a_legacy.z a_new.x a_new.y a_new.z\n"); + for (k = 0; k < N; ++k) + fprintf(file, "%d %e %e %e %e %e %e %e %e %e %e %e %e %e\n", parts[k].id, + parts[k].mass, parts[k].x[0], parts[k].x[1], parts[k].x[2], + parts[k].a_exact[0], parts[k].a_exact[1], parts[k].a_exact[2], + parts[k].a_legacy[0], parts[k].a_legacy[1], parts[k].a_legacy[2], + parts[k].a[0], parts[k].a[1], parts[k].a[2]); + fclose(file); + + /* Clean up. */ + qsched_free(&s); + free(parts); +} + +/* All 26 configurations for the cell tests */ +const float cell_shift[27 * 3] = {1.0, 1.0, 1.0, /* 0 */ + 1.0, 1.0, 0.0, /* 1 */ + 1.0, 1.0, -1.0, /* 2 */ + 1.0, 0.0, 1.0, /* 3 */ + 1.0, 0.0, 0.0, /* 4 */ + 1.0, 0.0, -1.0, /* 5 */ + 1.0, -1.0, 1.0, /* 6 */ + 1.0, -1.0, 0.0, /* 7 */ + 1.0, -1.0, -1.0, /* 8 */ + 0.0, 1.0, 1.0, /* 9 */ + 0.0, 1.0, 0.0, /* 10 */ + 0.0, 1.0, -1.0, /* 11 */ + 0.0, 0.0, 1.0, /* 12 */ + -1.0, -1.0, -1.0, /* 13 */ + -1.0, -1.0, 0.0, /* 14 */ + -1.0, -1.0, 1.0, /* 15 */ + -1.0, 0.0, -1.0, /* 16 */ + -1.0, 0.0, 0.0, /* 17 */ + -1.0, 0.0, 1.0, /* 18 */ + -1.0, 1.0, -1.0, /* 19 */ + -1.0, 1.0, 0.0, /* 20 */ + -1.0, 1.0, 1.0, /* 21 */ + 0.0, -1.0, -1.0, /* 22 */ + 0.0, -1.0, 0.0, /* 23 */ + 0.0, -1.0, 1.0, /* 24 */ + 0.0, 0.0, -1.0, /* 25 */ + 0.0, 0.0, + 0.0 /* 26 */ /* <-- The cell itself */ +}; + +/** + * @brief Creates two neighbouring cells wiht N_parts per celland makes them + * interact using both the + * sorted and unsortde interactions. Outputs then the tow sets of accelerations + * for accuracy tests. + * @param N_parts Number of particles in each cell + * @param orientation Orientation of the cells ( 0 <= orientation < 26 ) + */ +void test_direct_neighbour(int N_parts, int orientation) { + + int k; + struct part *parts; + struct cell left, right; + + if (orientation >= 26) error("Wrong orientation !"); + + /* Select configuration */ + float shift[3]; + for (k = 0; k < 3; ++k) shift[k] = cell_shift[3 * orientation + k]; + + /* Init and fill the particle array. */ + if ((parts = (struct part *)malloc(sizeof(struct part) * N_parts * 2)) == + NULL) + error("Failed to allocate particle buffer."); + + /* Create random set of particles in both cells */ + for (k = 0; k < 2 * N_parts; k++) { + parts[k].id = k; + + parts[k].x[0] = ((double)rand()) / RAND_MAX; + parts[k].x[1] = ((double)rand()) / RAND_MAX; + parts[k].x[2] = ((double)rand()) / RAND_MAX; + + /* Shift the second cell */ + if (k >= N_parts) { + parts[k].x[0] += shift[0]; + parts[k].x[1] += shift[1]; + parts[k].x[2] += shift[2]; + } + + parts[k].mass = ((double)rand()) / RAND_MAX; + parts[k].a[0] = 0.0; + parts[k].a[1] = 0.0; + parts[k].a[2] = 0.0; + } + + /* Get the cell geometry right */ + left.loc[0] = 0.; + left.loc[1] = 0.; + left.loc[2] = 0.; + left.h = 1.; + + right.loc[0] = shift[0]; + right.loc[1] = shift[1]; + right.loc[2] = shift[2]; + right.h = 1.; + + /* Put the particles in the cell */ + left.parts = parts; + left.count = N_parts; + right.parts = parts + N_parts; + right.count = N_parts; + + /* Get the linked list right (functions should not recurse but hey...) */ + left.firstchild = NULL; + left.sibling = NULL; + left.split = 0; + right.firstchild = NULL; + right.sibling = NULL; + right.split = 0; + + /* Get the multipoles right (should also be useless) */ + comp_com(&left); + comp_com(&right); + + /* Nothing has been sorted yet */ + left.indices = NULL; + left.sorted = 0; + right.indices = NULL; + right.sorted = 0; + +#ifdef COUNTERS + count_direct_unsorted = 0; + count_direct_sorted_pp = 0; + count_direct_sorted_pm_i = 0; + count_direct_sorted_pm_j = 0; +#endif + + /* Do the interactions without sorting */ + iact_pair_direct_unsorted(&left, &right); + + message("Unsorted interactions done "); + + /* Store accelerations */ + for (k = 0; k < 2 * N_parts; k++) { + parts[k].a_exact[0] = parts[k].a[0]; + parts[k].a_exact[1] = parts[k].a[1]; + parts[k].a_exact[2] = parts[k].a[2]; + parts[k].a[0] = 0.0; + parts[k].a[1] = 0.0; + parts[k].a[2] = 0.0; + } + + /* Do the interactions with sorting */ + iact_pair_direct(&left, &right); + + message("Sorted interactions done "); + + for (k = 0; k < 2 * N_parts; ++k) { + if (parts[k].id == ICHECK) { + + message("Accel exact : [ %e %e %e ]", parts[k].a_exact[0], + parts[k].a_exact[1], parts[k].a_exact[2]); + message("Accel sorted: [ %e %e %e ]", parts[k].a[0], parts[k].a[1], + parts[k].a[2]); + } + } + + /* Position along the axis */ + double *position, *ortho_dist; + if ((position = (double *)malloc(sizeof(double) * N_parts * 2)) == NULL) + error("Failed to allocate position buffer."); + if ((ortho_dist = (double *)malloc(sizeof(double) * N_parts * 2)) == NULL) + error("Failed to allocate orthogonal distance buffer."); + + float w = 1.0f / sqrtf(shift[0]*shift[0] + shift[1]*shift[1] + shift[2]*shift[2]); + shift[0] *= w; + shift[1] *= w; + shift[2] *= w; + + for (k = 0; k < 2 * N_parts; ++k) { + float dx = parts[k].x[0] * shift[0]; + float dy = parts[k].x[1] * shift[1]; + float dz = parts[k].x[2] * shift[2]; + + position[k] = dx + dy + dz; + + dx = (parts[k].x[1] - 0.5f)*shift[2] - (parts[k].x[2] - 0.5f)*shift[1]; + dy = (parts[k].x[2] - 0.5f)*shift[0] - (parts[k].x[0] - 0.5f)*shift[2]; + dz = (parts[k].x[0] - 0.5f)*shift[1] - (parts[k].x[1] - 0.5f)*shift[0]; + + ortho_dist[k] = sqrtf(dx*dx + dy*dy + dz*dz); + } + + /* Now, output everything */ + char fileName[100]; + sprintf(fileName, "interaction_dump_%d.dat", orientation); + message("Writing file '%s'", fileName); + FILE *file = fopen(fileName, "w"); + fprintf(file, + "# ID m r x y z a_u.x a_u.y a_u.z a_s.x a_s.y a_s.z ortho\n"); + for (k = 0; k < 2 * N_parts; k++) { + fprintf(file, "%d %e %e %e %e %e %e %e %e %e %e %e %e\n", parts[k].id, + parts[k].mass, position[k], parts[k].x[0], parts[k].x[1], + parts[k].x[2], parts[k].a_exact[0], parts[k].a_exact[1], + parts[k].a_exact[2], parts[k].a[0], parts[k].a[1], parts[k].a[2], + ortho_dist[k]); + } + fclose(file); + +#ifdef COUNTERS + message("Unsorted intereactions: %d", count_direct_unsorted); + message("Sorted intereactions PP: %d", count_direct_sorted_pp); + message("Sorted intereactions PM (part i): %d", count_direct_sorted_pm_i); + message("Sorted intereactions PM (part j): %d", count_direct_sorted_pm_j); + message("Sorted intereactions total: %d", + count_direct_sorted_pm_j + count_direct_sorted_pm_i + + count_direct_sorted_pp); +// message( "%f %d %d %d %d\n", dist_cutoff_ratio, count_direct_unsorted, +// count_direct_sorted_pp, count_direct_sorted_pm_i, count_direct_sorted_pm_j ); +#endif + + /* Clean up */ + free(parts); +} + +/** + * @brief Creates 27 cells and makes the particles in the central one + * interact with the other cells + * using both the sorted and unsorted interactions + * Outputs then the two sets of accelerations for accuracy tests. + * @param N_parts Number of particles in each cell + * @param N_test Number of times the test has to be run + */ +void test_all_direct_neighbours(int N_parts, int N_test) { + + int i, k, j; + struct part *parts; + struct cell cells[27]; + + /* Init and fill the particle array. */ + if ((parts = (struct part *)malloc(sizeof(struct part) * N_parts * 27)) == + NULL) + error("Failed to allocate particle buffer."); + +#ifdef COUNTERS + count_direct_unsorted = 0; + count_direct_sorted_pp = 0; + count_direct_sorted_pm_i = 0; + count_direct_sorted_pm_j = 0; +#endif + int countMultipoles = 0; + int countPairs = 0; + int countCoMs = 0; + + /* Loop over the number of tests */ + for (i = 0; i < N_test; ++i) { + + /* Loop over the 27 cells */ + for (j = 0; j < 27; ++j) { + + float shift[3]; + for (k = 0; k < 3; ++k) shift[k] = cell_shift[3 * j + k]; + + /* Create random set of particles in all cells */ + for (k = 0; k < N_parts; k++) { + parts[j * N_parts + k].id = j * N_parts + k; + + parts[j * N_parts + k].x[0] = ((double)rand()) / RAND_MAX + shift[0]; + parts[j * N_parts + k].x[1] = ((double)rand()) / RAND_MAX + shift[1]; + parts[j * N_parts + k].x[2] = ((double)rand()) / RAND_MAX + shift[2]; + + parts[j * N_parts + k].mass = ((double)rand()) / RAND_MAX; + parts[j * N_parts + k].a[0] = 0.0; + parts[j * N_parts + k].a[1] = 0.0; + parts[j * N_parts + k].a[2] = 0.0; + parts[j * N_parts + k].a_exact[0] = 0.0; + parts[j * N_parts + k].a_exact[1] = 0.0; + parts[j * N_parts + k].a_exact[2] = 0.0; + parts[j * N_parts + k].a_legacy[0] = 0.0; + parts[j * N_parts + k].a_legacy[1] = 0.0; + parts[j * N_parts + k].a_legacy[2] = 0.0; + } + + /* Get the cell geometry right */ + cells[j].loc[0] = shift[0]; + cells[j].loc[1] = shift[1]; + cells[j].loc[2] = shift[2]; + cells[j].h = 1.; + + /* Put the particles in the cell */ + cells[j].parts = parts + N_parts * j; + cells[j].count = N_parts; + + /* Get the linked list right (functions should not recurse but hey...) */ + cells[j].firstchild = NULL; + cells[j].sibling = NULL; + cells[j].split = 0; + + /* Get the multipoles right (should also be useless) */ + comp_com(&cells[j]); + + /* Nothing has been sorted yet */ + cells[j].indices = NULL; + cells[j].sorted = 0; + } + + /* Do the interactions without sorting */ + for (j = 0; j < 26; ++j) iact_pair_direct_unsorted(&cells[26], &cells[j]); + iact_self_direct(&cells[26]); + + // message("Unsorted interactions done "); + + /* Store accelerations */ + for (k = 0; k < 27 * N_parts; k++) { + parts[k].a_exact[0] = parts[k].a[0]; + parts[k].a_exact[1] = parts[k].a[1]; + parts[k].a_exact[2] = parts[k].a[2]; + parts[k].a[0] = 0.0; + parts[k].a[1] = 0.0; + parts[k].a[2] = 0.0; + } + + /* Do the interactions with sorting */ + for (j = 0; j < 26; ++j) iact_pair_direct(&cells[26], &cells[j]); + iact_self_direct(&cells[26]); + + // message("Sorted interactions done "); + + /* Now, do a B-H calculation on the same set of particles */ + struct cell *root = cell_get(); + root->loc[0] = -1.0; + root->loc[1] = -1.0; + root->loc[2] = -1.0; + root->h = 3.0; + root->count = 27 * N_parts; + root->parts = parts; + struct qsched s; + qsched_init(&s, 1, qsched_flag_noreown); + cell_split(root, &s); + legacy_tree_walk(27 * N_parts, parts, root, ICHECK, &countMultipoles, + &countPairs, &countCoMs); + + // free(root); + //++countCoMs; + + /* Now, output everything */ + char fileName[100]; + sprintf(fileName, "interaction_dump_all.dat"); + // message("Writing file '%s'", fileName); + FILE *file = fopen(fileName, (i == 0 ? "w" : "a")); + if (i == 0) + fprintf(file, + "# ID m x y z a_u.x a_u.y a_u.z a_s.x a_s.y a_s.z" + " a_bh.x a_bh.y a_bh.z\n"); + for (k = 0; k < 27 * N_parts; k++) { + if (parts[k].id >= 26 * N_parts) + fprintf(file, "%d %e %e %e %e %e %e %e %e %e %e %e %e %e\n", + parts[k].id, parts[k].mass, parts[k].x[0], parts[k].x[1], + parts[k].x[2], parts[k].a_exact[0], parts[k].a_exact[1], + parts[k].a_exact[2], parts[k].a[0], parts[k].a[1], + parts[k].a[2], parts[k].a_legacy[0], parts[k].a_legacy[1], + parts[k].a_legacy[2]); + } + fclose(file); + } + +#ifdef COUNTERS + message("Unsorted intereactions: %d", count_direct_unsorted); + message("B-H intereactions: PP %d", countPairs); + message("B-H intereactions: PM %d", countMultipoles); + message("B-H intereactions: total %d", countPairs + countMultipoles); + message("Sorted intereactions PP: %d", count_direct_sorted_pp); + message("Sorted intereactions PM (part i): %d", count_direct_sorted_pm_i); + message("Sorted intereactions PM (part j): %d", count_direct_sorted_pm_j); + message("Sorted intereactions total: %d", + count_direct_sorted_pm_j + count_direct_sorted_pm_i + + count_direct_sorted_pp); +// message( "%f %d %d %d %d\n", dist_cutoff_ratio, count_direct_unsorted, +// count_direct_sorted_pp, count_direct_sorted_pm_i, count_direct_sorted_pm_j ); +#endif + + /* Clean up */ + free(parts); +} + +/** + * @brief Main function. + */ + +int main(int argc, char *argv[]) { + + int c, k, nr_threads; + int N = 1000, runs = 1; + char fileName[100] = {0}; + int N_parts = 0; + int N_iterations = 0; + + /* Die on FP-exceptions. */ + feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); + + srand(0); + +/* Get the number of threads. */ +#pragma omp parallel shared(nr_threads) + { + if (omp_get_thread_num() == 0) nr_threads = omp_get_num_threads(); + } + + /* Parse the options */ + while ((c = getopt(argc, argv, "n:r:t:f:c:i:")) != -1) switch (c) { + case 'n': + if (sscanf(optarg, "%d", &N) != 1) + error("Error parsing number of particles."); + break; + case 'r': + if (sscanf(optarg, "%d", &runs) != 1) + error("Error parsing number of runs."); + break; + case 't': + if (sscanf(optarg, "%d", &nr_threads) != 1) + error("Error parsing number of threads."); + omp_set_num_threads(nr_threads); + break; + case 'f': + if (sscanf(optarg, "%s", &fileName[0]) != 1) + error("Error parsing file name."); + break; + case 'c': + if (sscanf(optarg, "%d", &N_parts) != 1) + error( + "Error parsing number of particles in neighbouring cells test."); + break; + case 'i': + if (sscanf(optarg, "%d", &N_iterations) != 1) + error( + "Error parsing number of particles in neighbouring cells test."); + break; + case '?': + fprintf(stderr, "Usage: %s [-t nr_threads] [-n N] [-r runs] [-f file] " + "[-c Nparts] [-i Niterations] \n", + argv[0]); + fprintf(stderr, "Solves the N-body problem using a Barnes-Hut\n" + "tree code with N random particles read from a file in " + "[0,1]^3 using" + "nr_threads threads.\n" + "A test of the neighbouring cells interaction with " + "Nparts per cell is also run Niterations times.\n"); + exit(EXIT_FAILURE); + } + + /* Tree node information */ + printf("Size of cell: %zu bytes.\n", sizeof(struct cell)); + + /* Part information */ + printf("Size of part: %zu bytes.\n", sizeof(struct part)); + + /* Run the neighbour direct integration test */ + if (N_parts > 0) { + + /* Dump arguments */ + message("Interacting 2 neighbouring cells with %d particles per cell", + N_parts); + + /* Run the test */ + for (k = 0; k < 26; ++k) test_direct_neighbour(N_parts, k); + + /* Dump arguments */ + message("Interacting one cell with %d particles with its 27 neighbours" + " %d times to get the statistics.", + N_parts, N_iterations); + + test_all_direct_neighbours(N_parts, N_iterations); + + } else { + + /* Dump arguments. */ + if (fileName[0] == 0) { + message("Computing the N-body problem over %i random particles using %i " + "threads (%i runs).", + N, nr_threads, runs); + } else { + message("Computing the N-body problem over %i particles read from '%s' " + "using %i threads (%i runs).", + N, fileName, nr_threads, runs); + } + + /* Run the BH test. */ + test_bh(N, nr_threads, runs, fileName); + } + + return 0; +} +