diff --git a/examples/PMillennium-384/p-mill-384.yml b/examples/PMillennium-384/p-mill-384.yml new file mode 100644 index 0000000000000000000000000000000000000000..b7fc6f762205a0a493b13392ff3001c653445d8d --- /dev/null +++ b/examples/PMillennium-384/p-mill-384.yml @@ -0,0 +1,50 @@ +# Define the system of units to use internally. +InternalUnitSystem: + UnitMass_in_cgs: 1.98848e43 # 10^10 M_sun + UnitLength_in_cgs: 3.08567758e24 # 1 Mpc + UnitVelocity_in_cgs: 1e5 # 1 km/s + UnitCurrent_in_cgs: 1 # Amperes + UnitTemp_in_cgs: 1 # Kelvin + +# Planck-13 cosmology +Cosmology: + h: 0.6777 + a_begin: 0.02 # z_ini = 49 + a_end: 1.0 # z_end = 0 + Omega_m: 0.307 + Omega_lambda: 0.693 + Omega_b: 0.0455 + +# Parameters governing the time integration +TimeIntegration: + dt_min: 1e-6 + dt_max: 1e-2 + +Scheduler: + max_top_level_cells: 16 + cell_split_size: 100 + +# Parameters governing the snapshots +Snapshots: + basename: PMill + delta_time: 1.02 + scale_factor_first: 0.02 + +# Parameters governing the conserved quantities statistics +Statistics: + delta_time: 1.02 + scale_factor_first: 0.02 + +# Parameters for the self-gravity scheme +Gravity: + eta: 0.025 + theta: 0.5 + comoving_softening: 0.08333 # 83.333 kpc = 1/25 mean inter-particle separation + max_physical_softening: 0.08333 # 83.333 kpc = 1/25 mean inter-particle separation + mesh_side_length: 128 + +# Parameters related to the initial conditions +InitialConditions: + file_name: ics.hdf5 + cleanup_h_factors: 1 + cleanup_velocity_factors: 1 diff --git a/examples/PMillennium-768/p-mill-768.yml b/examples/PMillennium-768/p-mill-768.yml new file mode 100644 index 0000000000000000000000000000000000000000..8b93a7752d09545c08b98af87db1b1d46eb15189 --- /dev/null +++ b/examples/PMillennium-768/p-mill-768.yml @@ -0,0 +1,50 @@ +# Define the system of units to use internally. +InternalUnitSystem: + UnitMass_in_cgs: 1.98848e43 # 10^10 M_sun + UnitLength_in_cgs: 3.08567758e24 # 1 Mpc + UnitVelocity_in_cgs: 1e5 # 1 km/s + UnitCurrent_in_cgs: 1 # Amperes + UnitTemp_in_cgs: 1 # Kelvin + +# Planck-13 cosmology +Cosmology: + h: 0.6777 + a_begin: 0.02 # z_ini = 49 + a_end: 1.0 # z_end = 0 + Omega_m: 0.307 + Omega_lambda: 0.693 + Omega_b: 0.0455 + +# Parameters governing the time integration +TimeIntegration: + dt_min: 1e-6 + dt_max: 1e-2 + +Scheduler: + max_top_level_cells: 16 + cell_split_size: 100 + +# Parameters governing the snapshots +Snapshots: + basename: PMill + delta_time: 1.02 + scale_factor_first: 0.02 + +# Parameters governing the conserved quantities statistics +Statistics: + delta_time: 1.02 + scale_factor_first: 0.02 + +# Parameters for the self-gravity scheme +Gravity: + eta: 0.025 + theta: 0.5 + comoving_softening: 0.041666 # 41.6666 kpc = 1/25 mean inter-particle separation + max_physical_softening: 0.041666 # 41.6666 kpc = 1/25 mean inter-particle separation + mesh_side_length: 256 + +# Parameters related to the initial conditions +InitialConditions: + file_name: ics.hdf5 + cleanup_h_factors: 1 + cleanup_velocity_factors: 1 diff --git a/src/approx_math.h b/src/approx_math.h index 90ea4eb997c71311e0c1ce854bbdd0a0ba7396ce..f347bab44790d1e3120675bcbd6e7a457ca09821 100644 --- a/src/approx_math.h +++ b/src/approx_math.h @@ -21,6 +21,31 @@ #include "inline.h" +/** + * @brief Approximate version of the complementay error function erfcf(x). + * + * This is based on eq. 7.1.27 of Abramowitz & Stegun, 1972. + * The absolute error is < 4.7*10^-4 over the range 0 < x < infinity. + * + * Returns garbage for x < 0. + * @param x The number to compute erfc for. + */ +__attribute__((always_inline, const)) INLINE static float approx_erfcf( + float x) { + + /* 1 + 0.278393*x + 0.230389*x^2 + 0.000972*x^3 + 0.078108*x^4 */ + float arg = 0.078108f; + arg = x * arg + 0.000972f; + arg = x * arg + 0.230389f; + arg = x * arg + 0.278393f; + arg = x * arg + 1.f; + + /* 1 / arg^4 */ + const float arg2 = arg * arg; + const float arg4 = arg2 * arg2; + return 1.f / arg4; +} + /** * @brief Approximate version of expf(x) using a 4th order Taylor expansion * diff --git a/src/atomic.h b/src/atomic.h index 69df59e9fba965422eaf9a3b3de9d28ab9f09dad..10548c6a20249b4b0c362c5e6ab78ea5d85b2091 100644 --- a/src/atomic.h +++ b/src/atomic.h @@ -127,4 +127,36 @@ __attribute__((always_inline)) INLINE static void atomic_add_f( } while (test_val.as_int != old_val.as_int); } +/** + * @brief Atomic add operation on doubles. + * + * This is a text-book implementation based on an atomic CAS. + * + * We create a temporary union to cope with the int-only atomic CAS + * and the floating-point add that we want. + * + * @param address The address to update. + * @param y The value to update the address with. + */ +__attribute__((always_inline)) INLINE static void atomic_add_d( + volatile double *const address, const double y) { + + long long *const long_long_ptr = (long long *)address; + + typedef union { + double as_double; + long long as_long_long; + } cast_type; + + cast_type test_val, old_val, new_val; + old_val.as_double = *address; + + do { + test_val.as_long_long = old_val.as_long_long; + new_val.as_double = old_val.as_double + y; + old_val.as_long_long = + atomic_cas(long_long_ptr, test_val.as_long_long, new_val.as_long_long); + } while (test_val.as_long_long != old_val.as_long_long); +} + #endif /* SWIFT_ATOMIC_H */ diff --git a/src/engine.c b/src/engine.c index 8b8a107df60938bc598f13b24a53338e105dc77b..ba0192b40f20bc6e05a70266d7f750b05b2911b0 100644 --- a/src/engine.c +++ b/src/engine.c @@ -3943,9 +3943,12 @@ void engine_rebuild(struct engine *e, int clean_smoothing_length_values) { /* Re-build the space. */ space_rebuild(e->s, e->verbose); + /* Construct the list of purely local cells */ + space_list_local_cells(e->s); + /* Re-compute the mesh forces */ if ((e->policy & engine_policy_self_gravity) && e->s->periodic) - pm_mesh_compute_potential(e->mesh, e->s, e->verbose); + pm_mesh_compute_potential(e->mesh, e->s, &e->threadpool, e->verbose); /* Re-compute the maximal RMS displacement constraint */ if (e->policy & engine_policy_cosmology) @@ -4262,7 +4265,8 @@ void engine_collect_end_of_step(struct engine *e, int apply) { /* Collect information from the local top-level cells */ threadpool_map(&e->threadpool, engine_collect_end_of_step_mapper, - s->local_cells_top, s->nr_local_cells, sizeof(int), 0, &data); + s->local_cells_with_tasks_top, s->nr_local_cells_with_tasks, + sizeof(int), 0, &data); /* Store these in the temporary collection group. */ collectgroup1_init(&e->collect_group1, data.updates, data.g_updates, @@ -5132,14 +5136,17 @@ void engine_unskip(struct engine *e) { #endif // WITH_PROFILER /* Move the active local cells to the top of the list. */ - int *local_cells = e->s->local_cells_top; + int *local_cells = e->s->local_cells_with_tasks_top; int num_active_cells = 0; - for (int k = 0; k < s->nr_local_cells; k++) { + for (int k = 0; k < s->nr_local_cells_with_tasks; k++) { struct cell *c = &s->cells_top[local_cells[k]]; + if ((e->policy & engine_policy_hydro && cell_is_active_hydro(c, e)) || - (e->policy & - (engine_policy_self_gravity | engine_policy_external_gravity) && + (e->policy & engine_policy_self_gravity && + cell_is_active_gravity(c, e)) || + (e->policy & engine_policy_external_gravity && cell_is_active_gravity(c, e))) { + if (num_active_cells != k) memswap(&local_cells[k], &local_cells[num_active_cells], sizeof(int)); num_active_cells += 1; diff --git a/src/gravity/Default/gravity_iact.h b/src/gravity/Default/gravity_iact.h index 71e5007a49bda25a8b65d4a5d3733d0027aa2682..6fce3ddd512018e9ea4be21111c75904c77cb925 100644 --- a/src/gravity/Default/gravity_iact.h +++ b/src/gravity/Default/gravity_iact.h @@ -166,7 +166,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_grav_pm_full( /* Compute the derivatives of the potential */ struct potential_derivatives_M2P d; - compute_potential_derivatives_M2P(r_x, r_y, r_z, r2, r_inv, h, h_inv, 0, 0.f, + potential_derivatives_compute_M2P(r_x, r_y, r_z, r2, r_inv, h, h_inv, 0, 0.f, &d); /* 0th order contributions */ @@ -271,7 +271,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_grav_pm_truncated( /* Compute the derivatives of the potential */ struct potential_derivatives_M2P d; - compute_potential_derivatives_M2P(r_x, r_y, r_z, r2, r_inv, h, h_inv, 1, + potential_derivatives_compute_M2P(r_x, r_y, r_z, r2, r_inv, h, h_inv, 1, r_s_inv, &d); /* 0th order contributions */ diff --git a/src/gravity/Potential/gravity_iact.h b/src/gravity/Potential/gravity_iact.h index fdc8c17da1576b85026c3e551dd70d27bc186612..f2094f6ecd5b31b94ebfe7a64f42fbd289a0c81c 100644 --- a/src/gravity/Potential/gravity_iact.h +++ b/src/gravity/Potential/gravity_iact.h @@ -169,7 +169,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_grav_pm_full( /* Compute the derivatives of the potential */ struct potential_derivatives_M2P d; - compute_potential_derivatives_M2P(r_x, r_y, r_z, r2, r_inv, h, h_inv, 0, 0.f, + potential_derivatives_compute_M2P(r_x, r_y, r_z, r2, r_inv, h, h_inv, 0, 0.f, &d); /* 0th order contributions */ @@ -281,7 +281,7 @@ __attribute__((always_inline)) INLINE static void runner_iact_grav_pm_truncated( /* Compute the derivatives of the potential */ struct potential_derivatives_M2P d; - compute_potential_derivatives_M2P(r_x, r_y, r_z, r2, r_inv, h, h_inv, 1, + potential_derivatives_compute_M2P(r_x, r_y, r_z, r2, r_inv, h, h_inv, 1, r_s_inv, &d); /* 0th order contributions */ diff --git a/src/gravity_derivatives.h b/src/gravity_derivatives.h index 756fb7af66d4cb695ba014452e424843b1c7c25b..3dcffe1cc04c5e10d3b2353b7e21c532747c1475 100644 --- a/src/gravity_derivatives.h +++ b/src/gravity_derivatives.h @@ -125,6 +125,65 @@ struct potential_derivatives_M2P { #endif }; +/** + * @brief Converts the derivatives from a distance vector to its opposite. + * + * From a series of tensors D_xxx(r), compute D_xxx(-r). + * This can be computed efficiently by flipping the sign of all the odd + * derivative terms. + * + * @param pot The derivatives of the potential. + */ +__attribute__((always_inline)) INLINE static void +potential_derivatives_flip_signs(struct potential_derivatives_M2L *pot) { + +#if SELF_GRAVITY_MULTIPOLE_ORDER > 0 + /* 1st order terms */ + pot->D_100 = -pot->D_100; + pot->D_010 = -pot->D_010; + pot->D_001 = -pot->D_001; +#endif + +#if SELF_GRAVITY_MULTIPOLE_ORDER > 2 + /* 3rd order terms */ + pot->D_300 = -pot->D_300; + pot->D_030 = -pot->D_030; + pot->D_003 = -pot->D_003; + pot->D_210 = -pot->D_210; + pot->D_201 = -pot->D_201; + pot->D_021 = -pot->D_021; + pot->D_120 = -pot->D_120; + pot->D_012 = -pot->D_012; + pot->D_102 = -pot->D_102; + pot->D_111 = -pot->D_111; +#endif + +#if SELF_GRAVITY_MULTIPOLE_ORDER > 4 + /* 5th order terms */ + pot->D_500 = -pot->D_500; + pot->D_050 = -pot->D_050; + pot->D_005 = -pot->D_005; + pot->D_410 = -pot->D_410; + pot->D_401 = -pot->D_401; + pot->D_041 = -pot->D_041; + pot->D_140 = -pot->D_140; + pot->D_014 = -pot->D_014; + pot->D_104 = -pot->D_104; + pot->D_320 = -pot->D_320; + pot->D_302 = -pot->D_302; + pot->D_032 = -pot->D_032; + pot->D_230 = -pot->D_230; + pot->D_023 = -pot->D_023; + pot->D_203 = -pot->D_203; + pot->D_311 = -pot->D_311; + pot->D_131 = -pot->D_131; + pot->D_113 = -pot->D_113; + pot->D_122 = -pot->D_122; + pot->D_212 = -pot->D_212; + pot->D_221 = -pot->D_221; +#endif +} + /** * @brief Compute all the relevent derivatives of the softened and truncated * gravitational potential for the M2L kernel. @@ -141,7 +200,7 @@ struct potential_derivatives_M2P { * @param pot (return) The structure containing all the derivatives. */ __attribute__((always_inline)) INLINE static void -compute_potential_derivatives_M2L(const float r_x, const float r_y, +potential_derivatives_compute_M2L(const float r_x, const float r_y, const float r_z, const float r2, const float r_inv, const float eps, const float eps_inv, const int periodic, @@ -397,7 +456,7 @@ compute_potential_derivatives_M2L(const float r_x, const float r_y, * @param pot (return) The structure containing all the derivatives. */ __attribute__((always_inline)) INLINE static void -compute_potential_derivatives_M2P(const float r_x, const float r_y, +potential_derivatives_compute_M2P(const float r_x, const float r_y, const float r_z, const float r2, const float r_inv, const float eps, const float eps_inv, const int periodic, diff --git a/src/gravity_properties.c b/src/gravity_properties.c index fc1ce1d62e02c32d44667d602448fc4eb3a65344..40cd7cecb10eab7ebbe7b59d34fb15dccac8b620 100644 --- a/src/gravity_properties.c +++ b/src/gravity_properties.c @@ -58,12 +58,17 @@ void gravity_props_init(struct gravity_props *p, struct swift_params *params, p->r_cut_min_ratio = parser_get_opt_param_float( params, "Gravity:r_cut_min", gravity_props_default_r_cut_min); + /* Some basic checks */ if (p->mesh_size % 2 != 0) error("The mesh side-length must be an even number."); if (p->a_smooth <= 0.) error("The mesh smoothing scale 'a_smooth' must be > 0."); + if (2. * p->a_smooth * p->r_cut_max_ratio > p->mesh_size) + error("Mesh too small given r_cut_max. Should be at least %d cells wide.", + (int)(2. * p->a_smooth * p->r_cut_max_ratio) + 1); + /* Time integration */ p->eta = parser_get_param_float(params, "Gravity:eta"); diff --git a/src/kernel_long_gravity.h b/src/kernel_long_gravity.h index 1744f2cd046a90499563a182ca68212e43f4a252..f6580f8f72b9eb6a2b49a4d2d54a0e4d0593fcbf 100644 --- a/src/kernel_long_gravity.h +++ b/src/kernel_long_gravity.h @@ -90,7 +90,7 @@ __attribute__((always_inline)) INLINE static void kernel_long_grav_derivatives( const float r_s_inv5 = r_s_inv4 * r_s_inv; /* Derivatives of \chi */ - derivs->chi_0 = erfcf(u); + derivs->chi_0 = approx_erfcf(u); derivs->chi_1 = -r_s_inv; derivs->chi_2 = r_s_inv2 * u; derivs->chi_3 = -r_s_inv3 * (u2 - 0.5f); @@ -158,7 +158,7 @@ __attribute__((always_inline)) INLINE static void kernel_long_grav_pot_eval( #ifdef GADGET2_LONG_RANGE_CORRECTION const float arg1 = u * 0.5f; - const float term1 = erfcf(arg1); + const float term1 = approx_erfcf(arg1); *W = term1; #else @@ -190,7 +190,7 @@ __attribute__((always_inline)) INLINE static void kernel_long_grav_force_eval( const float arg1 = u * 0.5f; const float arg2 = -arg1 * arg1; - const float term1 = erfcf(arg1); + const float term1 = approx_erfcf(arg1); const float term2 = u * one_over_sqrt_pi * expf(arg2); *W = term1 + term2; diff --git a/src/mesh_gravity.c b/src/mesh_gravity.c index d193f19082338d6cd793272df80fdd80780ad144..8cad7b621a58cd211a7e84d083894c73eb8c641a 100644 --- a/src/mesh_gravity.c +++ b/src/mesh_gravity.c @@ -110,14 +110,22 @@ __attribute__((always_inline)) INLINE static void CIC_set( double dx, double dy, double dz, double value) { /* Classic CIC interpolation */ - mesh[row_major_id_periodic(i + 0, j + 0, k + 0, N)] += value * tx * ty * tz; - mesh[row_major_id_periodic(i + 0, j + 0, k + 1, N)] += value * tx * ty * dz; - mesh[row_major_id_periodic(i + 0, j + 1, k + 0, N)] += value * tx * dy * tz; - mesh[row_major_id_periodic(i + 0, j + 1, k + 1, N)] += value * tx * dy * dz; - mesh[row_major_id_periodic(i + 1, j + 0, k + 0, N)] += value * dx * ty * tz; - mesh[row_major_id_periodic(i + 1, j + 0, k + 1, N)] += value * dx * ty * dz; - mesh[row_major_id_periodic(i + 1, j + 1, k + 0, N)] += value * dx * dy * tz; - mesh[row_major_id_periodic(i + 1, j + 1, k + 1, N)] += value * dx * dy * dz; + atomic_add_d(&mesh[row_major_id_periodic(i + 0, j + 0, k + 0, N)], + value * tx * ty * tz); + atomic_add_d(&mesh[row_major_id_periodic(i + 0, j + 0, k + 1, N)], + value * tx * ty * dz); + atomic_add_d(&mesh[row_major_id_periodic(i + 0, j + 1, k + 0, N)], + value * tx * dy * tz); + atomic_add_d(&mesh[row_major_id_periodic(i + 0, j + 1, k + 1, N)], + value * tx * dy * dz); + atomic_add_d(&mesh[row_major_id_periodic(i + 1, j + 0, k + 0, N)], + value * dx * ty * tz); + atomic_add_d(&mesh[row_major_id_periodic(i + 1, j + 0, k + 1, N)], + value * dx * ty * dz); + atomic_add_d(&mesh[row_major_id_periodic(i + 1, j + 1, k + 0, N)], + value * dx * dy * tz); + atomic_add_d(&mesh[row_major_id_periodic(i + 1, j + 1, k + 1, N)], + value * dx * dy * dz); } /** @@ -165,6 +173,74 @@ INLINE static void gpart_to_mesh_CIC(const struct gpart* gp, double* rho, int N, CIC_set(rho, N, i, j, k, tx, ty, tz, dx, dy, dz, mass); } +/** + * @brief Assigns all the #gpart of a #cell to a density mesh using the CIC + * method. + * + * @param c The #cell. + * @param rho The density mesh. + * @param N the size of the mesh along one axis. + * @param fac The width of a mesh cell. + * @param dim The dimensions of the simulation box. + */ +void cell_gpart_to_mesh_CIC(const struct cell* c, double* rho, int N, + double fac, const double dim[3]) { + const int gcount = c->gcount; + const struct gpart* gparts = c->gparts; + + /* Assign all the gpart of that cell to the mesh */ + for (int i = 0; i < gcount; ++i) + gpart_to_mesh_CIC(&gparts[i], rho, N, fac, dim); +} + +/** + * @brief Shared information about the mesh to be used by all the threads in the + * pool. + */ +struct cic_mapper_data { + const struct cell* cells; + double* rho; + int N; + double fac; + double dim[3]; +}; + +/** + * @brief Threadpool mapper function for the mesh CIC assignment of a cell. + * + * @param map_data A chunk of the list of local cells. + * @param num The number of cells in the chunk. + * @param extra The information about the mesh and cells. + */ +void cell_gpart_to_mesh_CIC_mapper(void* map_data, int num, void* extra) { + + /* Unpack the shared information */ + const struct cic_mapper_data* data = (struct cic_mapper_data*)extra; + const struct cell* cells = data->cells; + double* rho = data->rho; + const int N = data->N; + const double fac = data->fac; + const double dim[3] = {data->dim[0], data->dim[1], data->dim[2]}; + + /* Pointer to the chunk to be processed */ + int* local_cells = (int*)map_data; + + // MATTHIEU: This could in principle be improved by creating a local mesh + // with just the extent required for the cell. Assignment can + // then be done without atomics. That local mesh is then added + // atomically to the global one. + + /* Loop over the elements assigned to this thread */ + for (int i = 0; i < num; ++i) { + + /* Pointer to local cell */ + const struct cell* c = &cells[local_cells[i]]; + + /* Assign this cell's content to the mesh */ + cell_gpart_to_mesh_CIC(c, rho, N, fac, dim); + } +} + /** * @brief Computes the potential on a gpart from a given mesh using the CIC * method. @@ -279,16 +355,19 @@ void mesh_to_gparts_CIC(struct gpart* gp, const double* pot, int N, double fac, * * @param mesh The #pm_mesh used to store the potential. * @param s The #space containing the particles. + * @param tp The #threadpool object used for parallelisation. * @param verbose Are we talkative? */ void pm_mesh_compute_potential(struct pm_mesh* mesh, const struct space* s, - int verbose) { + struct threadpool* tp, int verbose) { #ifdef HAVE_FFTW const double r_s = mesh->r_s; const double box_size = s->dim[0]; const double dim[3] = {s->dim[0], s->dim[1], s->dim[2]}; + const int* local_cells = s->local_cells_top; + const int nr_local_cells = s->nr_local_cells; if (r_s <= 0.) error("Invalid value of a_smooth"); if (mesh->dim[0] != dim[0] || mesh->dim[1] != dim[1] || @@ -322,9 +401,20 @@ void pm_mesh_compute_potential(struct pm_mesh* mesh, const struct space* s, /* Zero everything */ bzero(rho, N * N * N * sizeof(double)); - /* Do a CIC mesh assignment of the gparts */ - for (size_t i = 0; i < s->nr_gparts; ++i) - gpart_to_mesh_CIC(&s->gparts[i], rho, N, cell_fac, dim); + /* Gather the mesh shared information to be used by the threads */ + struct cic_mapper_data data; + data.cells = s->cells_top; + data.rho = rho; + data.N = N; + data.fac = cell_fac; + data.dim[0] = dim[0]; + data.dim[1] = dim[1]; + data.dim[2] = dim[2]; + + /* Do a parallel CIC mesh assignment of the gparts but only using + the local top-level cells */ + threadpool_map(tp, cell_gpart_to_mesh_CIC_mapper, (void*)local_cells, + nr_local_cells, sizeof(int), 0, (void*)&data); if (verbose) message("Gpart assignment took %.3f %s.", @@ -499,7 +589,7 @@ void pm_mesh_init(struct pm_mesh* mesh, const struct gravity_props* props, #ifdef HAVE_FFTW - if (dim[0] != dim[1] || dim[0] != dim[1]) + if (dim[0] != dim[1] || dim[0] != dim[2]) error("Doing mesh-gravity on a non-cubic domain"); const int N = props->mesh_size; @@ -516,6 +606,9 @@ void pm_mesh_init(struct pm_mesh* mesh, const struct gravity_props* props, mesh->r_cut_max = mesh->r_s * props->r_cut_max_ratio; mesh->r_cut_min = mesh->r_s * props->r_cut_min_ratio; + if (2. * mesh->r_cut_max > box_size) + error("Mesh too small or r_cut_max too big for this box size"); + /* Allocate the memory for the combined density and potential array */ mesh->potential = (double*)fftw_malloc(sizeof(double) * N * N * N); if (mesh->potential == NULL) diff --git a/src/mesh_gravity.h b/src/mesh_gravity.h index c512a53ca349816caf4c666c6f504dd4b717bcb7..abff77afbd9c29547a781cdfbf6fed745e1ca37f 100644 --- a/src/mesh_gravity.h +++ b/src/mesh_gravity.h @@ -29,6 +29,7 @@ /* Forward declarations */ struct space; struct gpart; +struct threadpool; /** * @brief Data structure for the long-range periodic forces using a mesh @@ -67,7 +68,7 @@ void pm_mesh_init(struct pm_mesh *mesh, const struct gravity_props *props, double dim[3]); void pm_mesh_init_no_mesh(struct pm_mesh *mesh, double dim[3]); void pm_mesh_compute_potential(struct pm_mesh *mesh, const struct space *s, - int verbose); + struct threadpool *tp, int verbose); void pm_mesh_interpolate_forces(const struct pm_mesh *mesh, const struct engine *e, struct gpart *gparts, int gcount); diff --git a/src/multipole.h b/src/multipole.h index 2ada835e32565dc4075dd1352ba3959b2ba4766b..1525ee57a9b3373c892cc603c793c44a40675a8f 100644 --- a/src/multipole.h +++ b/src/multipole.h @@ -293,8 +293,8 @@ INLINE static void gravity_field_tensors_init(struct grav_tensor *l, * @param la The gravity tensors to add to. * @param lb The gravity tensors to add. */ -INLINE static void gravity_field_tensors_add(struct grav_tensor *la, - const struct grav_tensor *lb) { +INLINE static void gravity_field_tensors_add( + struct grav_tensor *restrict la, const struct grav_tensor *restrict lb) { #ifdef SWIFT_DEBUG_CHECKS if (lb->num_interacted == 0) error("Adding tensors that did not interact"); la->num_interacted += lb->num_interacted; @@ -502,8 +502,8 @@ INLINE static void gravity_multipole_print(const struct multipole *m) { * @param ma The multipole to add to. * @param mb The multipole to add. */ -INLINE static void gravity_multipole_add(struct multipole *ma, - const struct multipole *mb) { +INLINE static void gravity_multipole_add(struct multipole *restrict ma, + const struct multipole *restrict mb) { /* Add 0th order term */ ma->M_000 += mb->M_000; @@ -1307,8 +1307,8 @@ INLINE static void gravity_P2M(struct gravity_tensors *multi, * @param pos_a The position to which m_b will be shifted. * @param pos_b The current postion of the multipole to shift. */ -INLINE static void gravity_M2M(struct multipole *m_a, - const struct multipole *m_b, +INLINE static void gravity_M2M(struct multipole *restrict m_a, + const struct multipole *restrict m_b, const double pos_a[3], const double pos_b[3]) { /* Shift 0th order term */ @@ -1558,43 +1558,11 @@ INLINE static void gravity_M2M(struct multipole *m_a, * * @param l_b The field tensor to compute. * @param m_a The multipole creating the field. - * @param pos_b The position of the field tensor. - * @param pos_a The position of the multipole. - * @param props The #gravity_props of this calculation. - * @param periodic Is the calculation periodic ? - * @param dim The size of the simulation box. - * @param rs_inv The inverse of the gravity mesh-smoothing scale. + * @param pot The derivatives of the potential. */ -INLINE static void gravity_M2L(struct grav_tensor *l_b, - const struct multipole *m_a, - const double pos_b[3], const double pos_a[3], - const struct gravity_props *props, int periodic, - const double dim[3], float rs_inv) { - - /* Recover some constants */ - const float eps = props->epsilon_cur; - const float eps_inv = props->epsilon_cur_inv; - - /* Compute distance vector */ - float dx = (float)(pos_b[0] - pos_a[0]); - float dy = (float)(pos_b[1] - pos_a[1]); - float dz = (float)(pos_b[2] - pos_a[2]); - - /* Apply BC */ - if (periodic) { - dx = nearest(dx, dim[0]); - dy = nearest(dy, dim[1]); - dz = nearest(dz, dim[2]); - } - - /* Compute distance */ - const float r2 = dx * dx + dy * dy + dz * dz; - const float r_inv = 1. / sqrtf(r2); - - /* Compute all derivatives */ - struct potential_derivatives_M2L pot; - compute_potential_derivatives_M2L(dx, dy, dz, r2, r_inv, eps, eps_inv, - periodic, rs_inv, &pot); +INLINE static void gravity_M2L_apply( + struct grav_tensor *restrict l_b, const struct multipole *restrict m_a, + const struct potential_derivatives_M2L *pot) { #ifdef SWIFT_DEBUG_CHECKS /* Count interactions */ @@ -1604,330 +1572,382 @@ INLINE static void gravity_M2L(struct grav_tensor *l_b, /* Record that this tensor has received contributions */ l_b->interacted = 1; + const float M_000 = m_a->M_000; + const float D_000 = pot->D_000; + /* 0th order term */ - l_b->F_000 += m_a->M_000 * pot.D_000; + l_b->F_000 += M_000 * D_000; #if SELF_GRAVITY_MULTIPOLE_ORDER > 0 + + /* The dipole term is zero when using the CoM */ + /* The compiler will optimize out the terms in the equations */ + /* below. We keep them written to maintain the logical structure. */ + const float M_100 = 0.f; + const float M_010 = 0.f; + const float M_001 = 0.f; + + const float D_100 = pot->D_100; + const float D_010 = pot->D_010; + const float D_001 = pot->D_001; + /* 1st order multipole term (addition to rank 0)*/ - l_b->F_000 += - m_a->M_100 * pot.D_100 + m_a->M_010 * pot.D_010 + m_a->M_001 * pot.D_001; + l_b->F_000 += M_100 * D_100 + M_010 * D_010 + M_001 * D_001; /* 1st order multipole term (addition to rank 1)*/ - l_b->F_100 += m_a->M_000 * pot.D_100; - l_b->F_010 += m_a->M_000 * pot.D_010; - l_b->F_001 += m_a->M_000 * pot.D_001; + l_b->F_100 += M_000 * D_100; + l_b->F_010 += M_000 * D_010; + l_b->F_001 += M_000 * D_001; #endif #if SELF_GRAVITY_MULTIPOLE_ORDER > 1 + const float M_200 = m_a->M_200; + const float M_020 = m_a->M_020; + const float M_002 = m_a->M_002; + const float M_110 = m_a->M_110; + const float M_101 = m_a->M_101; + const float M_011 = m_a->M_011; + + const float D_200 = pot->D_200; + const float D_020 = pot->D_020; + const float D_002 = pot->D_002; + const float D_110 = pot->D_110; + const float D_101 = pot->D_101; + const float D_011 = pot->D_011; + /* 2nd order multipole term (addition to rank 0)*/ - l_b->F_000 += - m_a->M_200 * pot.D_200 + m_a->M_020 * pot.D_020 + m_a->M_002 * pot.D_002; - l_b->F_000 += - m_a->M_110 * pot.D_110 + m_a->M_101 * pot.D_101 + m_a->M_011 * pot.D_011; + l_b->F_000 += M_200 * D_200 + M_020 * D_020 + M_002 * D_002; + l_b->F_000 += M_110 * D_110 + M_101 * D_101 + M_011 * D_011; /* 2nd order multipole term (addition to rank 1)*/ - l_b->F_100 += - m_a->M_100 * pot.D_200 + m_a->M_010 * pot.D_110 + m_a->M_001 * pot.D_101; - l_b->F_010 += - m_a->M_100 * pot.D_110 + m_a->M_010 * pot.D_020 + m_a->M_001 * pot.D_011; - l_b->F_001 += - m_a->M_100 * pot.D_101 + m_a->M_010 * pot.D_011 + m_a->M_001 * pot.D_002; + l_b->F_100 += M_100 * D_200 + M_010 * D_110 + M_001 * D_101; + l_b->F_010 += M_100 * D_110 + M_010 * D_020 + M_001 * D_011; + l_b->F_001 += M_100 * D_101 + M_010 * D_011 + M_001 * D_002; /* 2nd order multipole term (addition to rank 2)*/ - l_b->F_200 += m_a->M_000 * pot.D_200; - l_b->F_020 += m_a->M_000 * pot.D_020; - l_b->F_002 += m_a->M_000 * pot.D_002; - l_b->F_110 += m_a->M_000 * pot.D_110; - l_b->F_101 += m_a->M_000 * pot.D_101; - l_b->F_011 += m_a->M_000 * pot.D_011; + l_b->F_200 += M_000 * D_200; + l_b->F_020 += M_000 * D_020; + l_b->F_002 += M_000 * D_002; + l_b->F_110 += M_000 * D_110; + l_b->F_101 += M_000 * D_101; + l_b->F_011 += M_000 * D_011; #endif #if SELF_GRAVITY_MULTIPOLE_ORDER > 2 + const float M_300 = m_a->M_300; + const float M_030 = m_a->M_030; + const float M_003 = m_a->M_003; + const float M_210 = m_a->M_210; + const float M_201 = m_a->M_201; + const float M_021 = m_a->M_021; + const float M_120 = m_a->M_120; + const float M_012 = m_a->M_012; + const float M_102 = m_a->M_102; + const float M_111 = m_a->M_111; + + const float D_300 = pot->D_300; + const float D_030 = pot->D_030; + const float D_003 = pot->D_003; + const float D_210 = pot->D_210; + const float D_201 = pot->D_201; + const float D_021 = pot->D_021; + const float D_120 = pot->D_120; + const float D_012 = pot->D_012; + const float D_102 = pot->D_102; + const float D_111 = pot->D_111; + /* 3rd order multipole term (addition to rank 0)*/ - l_b->F_000 += - m_a->M_300 * pot.D_300 + m_a->M_030 * pot.D_030 + m_a->M_003 * pot.D_003; - l_b->F_000 += - m_a->M_210 * pot.D_210 + m_a->M_201 * pot.D_201 + m_a->M_120 * pot.D_120; - l_b->F_000 += - m_a->M_021 * pot.D_021 + m_a->M_102 * pot.D_102 + m_a->M_012 * pot.D_012; - l_b->F_000 += m_a->M_111 * pot.D_111; + l_b->F_000 += M_300 * D_300 + M_030 * D_030 + M_003 * D_003; + l_b->F_000 += M_210 * D_210 + M_201 * D_201 + M_120 * D_120; + l_b->F_000 += M_021 * D_021 + M_102 * D_102 + M_012 * D_012; + l_b->F_000 += M_111 * D_111; /* 3rd order multipole term (addition to rank 1)*/ - l_b->F_100 += - m_a->M_200 * pot.D_300 + m_a->M_020 * pot.D_120 + m_a->M_002 * pot.D_102; - l_b->F_100 += - m_a->M_110 * pot.D_210 + m_a->M_101 * pot.D_201 + m_a->M_011 * pot.D_111; - l_b->F_010 += - m_a->M_200 * pot.D_210 + m_a->M_020 * pot.D_030 + m_a->M_002 * pot.D_012; - l_b->F_010 += - m_a->M_110 * pot.D_120 + m_a->M_101 * pot.D_111 + m_a->M_011 * pot.D_021; - l_b->F_001 += - m_a->M_200 * pot.D_201 + m_a->M_020 * pot.D_021 + m_a->M_002 * pot.D_003; - l_b->F_001 += - m_a->M_110 * pot.D_111 + m_a->M_101 * pot.D_102 + m_a->M_011 * pot.D_012; + l_b->F_100 += M_200 * D_300 + M_020 * D_120 + M_002 * D_102; + l_b->F_100 += M_110 * D_210 + M_101 * D_201 + M_011 * D_111; + l_b->F_010 += M_200 * D_210 + M_020 * D_030 + M_002 * D_012; + l_b->F_010 += M_110 * D_120 + M_101 * D_111 + M_011 * D_021; + l_b->F_001 += M_200 * D_201 + M_020 * D_021 + M_002 * D_003; + l_b->F_001 += M_110 * D_111 + M_101 * D_102 + M_011 * D_012; /* 3rd order multipole term (addition to rank 2)*/ - l_b->F_200 += - m_a->M_100 * pot.D_300 + m_a->M_010 * pot.D_210 + m_a->M_001 * pot.D_201; - l_b->F_020 += - m_a->M_100 * pot.D_120 + m_a->M_010 * pot.D_030 + m_a->M_001 * pot.D_021; - l_b->F_002 += - m_a->M_100 * pot.D_102 + m_a->M_010 * pot.D_012 + m_a->M_001 * pot.D_003; - l_b->F_110 += - m_a->M_100 * pot.D_210 + m_a->M_010 * pot.D_120 + m_a->M_001 * pot.D_111; - l_b->F_101 += - m_a->M_100 * pot.D_201 + m_a->M_010 * pot.D_111 + m_a->M_001 * pot.D_102; - l_b->F_011 += - m_a->M_100 * pot.D_111 + m_a->M_010 * pot.D_021 + m_a->M_001 * pot.D_012; + l_b->F_200 += M_100 * D_300 + M_010 * D_210 + M_001 * D_201; + l_b->F_020 += M_100 * D_120 + M_010 * D_030 + M_001 * D_021; + l_b->F_002 += M_100 * D_102 + M_010 * D_012 + M_001 * D_003; + l_b->F_110 += M_100 * D_210 + M_010 * D_120 + M_001 * D_111; + l_b->F_101 += M_100 * D_201 + M_010 * D_111 + M_001 * D_102; + l_b->F_011 += M_100 * D_111 + M_010 * D_021 + M_001 * D_012; /* 3rd order multipole term (addition to rank 3)*/ - l_b->F_300 += m_a->M_000 * pot.D_300; - l_b->F_030 += m_a->M_000 * pot.D_030; - l_b->F_003 += m_a->M_000 * pot.D_003; - l_b->F_210 += m_a->M_000 * pot.D_210; - l_b->F_201 += m_a->M_000 * pot.D_201; - l_b->F_120 += m_a->M_000 * pot.D_120; - l_b->F_021 += m_a->M_000 * pot.D_021; - l_b->F_102 += m_a->M_000 * pot.D_102; - l_b->F_012 += m_a->M_000 * pot.D_012; - l_b->F_111 += m_a->M_000 * pot.D_111; + l_b->F_300 += M_000 * D_300; + l_b->F_030 += M_000 * D_030; + l_b->F_003 += M_000 * D_003; + l_b->F_210 += M_000 * D_210; + l_b->F_201 += M_000 * D_201; + l_b->F_120 += M_000 * D_120; + l_b->F_021 += M_000 * D_021; + l_b->F_102 += M_000 * D_102; + l_b->F_012 += M_000 * D_012; + l_b->F_111 += M_000 * D_111; #endif #if SELF_GRAVITY_MULTIPOLE_ORDER > 3 + + const float M_400 = m_a->M_400; + const float M_040 = m_a->M_040; + const float M_004 = m_a->M_004; + const float M_310 = m_a->M_310; + const float M_301 = m_a->M_301; + const float M_031 = m_a->M_031; + const float M_130 = m_a->M_130; + const float M_013 = m_a->M_013; + const float M_103 = m_a->M_103; + const float M_220 = m_a->M_220; + const float M_202 = m_a->M_202; + const float M_022 = m_a->M_022; + const float M_211 = m_a->M_211; + const float M_121 = m_a->M_121; + const float M_112 = m_a->M_112; + + const float D_400 = pot->D_400; + const float D_040 = pot->D_040; + const float D_004 = pot->D_004; + const float D_310 = pot->D_310; + const float D_301 = pot->D_301; + const float D_031 = pot->D_031; + const float D_130 = pot->D_130; + const float D_013 = pot->D_013; + const float D_103 = pot->D_103; + const float D_220 = pot->D_220; + const float D_202 = pot->D_202; + const float D_022 = pot->D_022; + const float D_211 = pot->D_211; + const float D_121 = pot->D_121; + const float D_112 = pot->D_112; + /* Compute 4th order field tensor terms (addition to rank 0) */ - l_b->F_000 += - m_a->M_004 * pot.D_004 + m_a->M_013 * pot.D_013 + m_a->M_022 * pot.D_022 + - m_a->M_031 * pot.D_031 + m_a->M_040 * pot.D_040 + m_a->M_103 * pot.D_103 + - m_a->M_112 * pot.D_112 + m_a->M_121 * pot.D_121 + m_a->M_130 * pot.D_130 + - m_a->M_202 * pot.D_202 + m_a->M_211 * pot.D_211 + m_a->M_220 * pot.D_220 + - m_a->M_301 * pot.D_301 + m_a->M_310 * pot.D_310 + m_a->M_400 * pot.D_400; + l_b->F_000 += M_004 * D_004 + M_013 * D_013 + M_022 * D_022 + + M_031 * D_031 + M_040 * D_040 + M_103 * D_103 + + M_112 * D_112 + M_121 * D_121 + M_130 * D_130 + + M_202 * D_202 + M_211 * D_211 + M_220 * D_220 + + M_301 * D_301 + M_310 * D_310 + M_400 * D_400; /* Compute 4th order field tensor terms (addition to rank 1) */ - l_b->F_001 += m_a->M_003 * pot.D_004 + m_a->M_012 * pot.D_013 + - m_a->M_021 * pot.D_022 + m_a->M_030 * pot.D_031 + - m_a->M_102 * pot.D_103 + m_a->M_111 * pot.D_112 + - m_a->M_120 * pot.D_121 + m_a->M_201 * pot.D_202 + - m_a->M_210 * pot.D_211 + m_a->M_300 * pot.D_301; - l_b->F_010 += m_a->M_003 * pot.D_013 + m_a->M_012 * pot.D_022 + - m_a->M_021 * pot.D_031 + m_a->M_030 * pot.D_040 + - m_a->M_102 * pot.D_112 + m_a->M_111 * pot.D_121 + - m_a->M_120 * pot.D_130 + m_a->M_201 * pot.D_211 + - m_a->M_210 * pot.D_220 + m_a->M_300 * pot.D_310; - l_b->F_100 += m_a->M_003 * pot.D_103 + m_a->M_012 * pot.D_112 + - m_a->M_021 * pot.D_121 + m_a->M_030 * pot.D_130 + - m_a->M_102 * pot.D_202 + m_a->M_111 * pot.D_211 + - m_a->M_120 * pot.D_220 + m_a->M_201 * pot.D_301 + - m_a->M_210 * pot.D_310 + m_a->M_300 * pot.D_400; + l_b->F_001 += M_003 * D_004 + M_012 * D_013 + M_021 * D_022 + + M_030 * D_031 + M_102 * D_103 + M_111 * D_112 + + M_120 * D_121 + M_201 * D_202 + M_210 * D_211 + + M_300 * D_301; + l_b->F_010 += M_003 * D_013 + M_012 * D_022 + M_021 * D_031 + + M_030 * D_040 + M_102 * D_112 + M_111 * D_121 + + M_120 * D_130 + M_201 * D_211 + M_210 * D_220 + + M_300 * D_310; + l_b->F_100 += M_003 * D_103 + M_012 * D_112 + M_021 * D_121 + + M_030 * D_130 + M_102 * D_202 + M_111 * D_211 + + M_120 * D_220 + M_201 * D_301 + M_210 * D_310 + + M_300 * D_400; /* Compute 4th order field tensor terms (addition to rank 2) */ - l_b->F_002 += m_a->M_002 * pot.D_004 + m_a->M_011 * pot.D_013 + - m_a->M_020 * pot.D_022 + m_a->M_101 * pot.D_103 + - m_a->M_110 * pot.D_112 + m_a->M_200 * pot.D_202; - l_b->F_011 += m_a->M_002 * pot.D_013 + m_a->M_011 * pot.D_022 + - m_a->M_020 * pot.D_031 + m_a->M_101 * pot.D_112 + - m_a->M_110 * pot.D_121 + m_a->M_200 * pot.D_211; - l_b->F_020 += m_a->M_002 * pot.D_022 + m_a->M_011 * pot.D_031 + - m_a->M_020 * pot.D_040 + m_a->M_101 * pot.D_121 + - m_a->M_110 * pot.D_130 + m_a->M_200 * pot.D_220; - l_b->F_101 += m_a->M_002 * pot.D_103 + m_a->M_011 * pot.D_112 + - m_a->M_020 * pot.D_121 + m_a->M_101 * pot.D_202 + - m_a->M_110 * pot.D_211 + m_a->M_200 * pot.D_301; - l_b->F_110 += m_a->M_002 * pot.D_112 + m_a->M_011 * pot.D_121 + - m_a->M_020 * pot.D_130 + m_a->M_101 * pot.D_211 + - m_a->M_110 * pot.D_220 + m_a->M_200 * pot.D_310; - l_b->F_200 += m_a->M_002 * pot.D_202 + m_a->M_011 * pot.D_211 + - m_a->M_020 * pot.D_220 + m_a->M_101 * pot.D_301 + - m_a->M_110 * pot.D_310 + m_a->M_200 * pot.D_400; + l_b->F_002 += M_002 * D_004 + M_011 * D_013 + M_020 * D_022 + + M_101 * D_103 + M_110 * D_112 + M_200 * D_202; + l_b->F_011 += M_002 * D_013 + M_011 * D_022 + M_020 * D_031 + + M_101 * D_112 + M_110 * D_121 + M_200 * D_211; + l_b->F_020 += M_002 * D_022 + M_011 * D_031 + M_020 * D_040 + + M_101 * D_121 + M_110 * D_130 + M_200 * D_220; + l_b->F_101 += M_002 * D_103 + M_011 * D_112 + M_020 * D_121 + + M_101 * D_202 + M_110 * D_211 + M_200 * D_301; + l_b->F_110 += M_002 * D_112 + M_011 * D_121 + M_020 * D_130 + + M_101 * D_211 + M_110 * D_220 + M_200 * D_310; + l_b->F_200 += M_002 * D_202 + M_011 * D_211 + M_020 * D_220 + + M_101 * D_301 + M_110 * D_310 + M_200 * D_400; /* Compute 4th order field tensor terms (addition to rank 3) */ - l_b->F_003 += - m_a->M_001 * pot.D_004 + m_a->M_010 * pot.D_013 + m_a->M_100 * pot.D_103; - l_b->F_012 += - m_a->M_001 * pot.D_013 + m_a->M_010 * pot.D_022 + m_a->M_100 * pot.D_112; - l_b->F_021 += - m_a->M_001 * pot.D_022 + m_a->M_010 * pot.D_031 + m_a->M_100 * pot.D_121; - l_b->F_030 += - m_a->M_001 * pot.D_031 + m_a->M_010 * pot.D_040 + m_a->M_100 * pot.D_130; - l_b->F_102 += - m_a->M_001 * pot.D_103 + m_a->M_010 * pot.D_112 + m_a->M_100 * pot.D_202; - l_b->F_111 += - m_a->M_001 * pot.D_112 + m_a->M_010 * pot.D_121 + m_a->M_100 * pot.D_211; - l_b->F_120 += - m_a->M_001 * pot.D_121 + m_a->M_010 * pot.D_130 + m_a->M_100 * pot.D_220; - l_b->F_201 += - m_a->M_001 * pot.D_202 + m_a->M_010 * pot.D_211 + m_a->M_100 * pot.D_301; - l_b->F_210 += - m_a->M_001 * pot.D_211 + m_a->M_010 * pot.D_220 + m_a->M_100 * pot.D_310; - l_b->F_300 += - m_a->M_001 * pot.D_301 + m_a->M_010 * pot.D_310 + m_a->M_100 * pot.D_400; + l_b->F_003 += M_001 * D_004 + M_010 * D_013 + M_100 * D_103; + l_b->F_012 += M_001 * D_013 + M_010 * D_022 + M_100 * D_112; + l_b->F_021 += M_001 * D_022 + M_010 * D_031 + M_100 * D_121; + l_b->F_030 += M_001 * D_031 + M_010 * D_040 + M_100 * D_130; + l_b->F_102 += M_001 * D_103 + M_010 * D_112 + M_100 * D_202; + l_b->F_111 += M_001 * D_112 + M_010 * D_121 + M_100 * D_211; + l_b->F_120 += M_001 * D_121 + M_010 * D_130 + M_100 * D_220; + l_b->F_201 += M_001 * D_202 + M_010 * D_211 + M_100 * D_301; + l_b->F_210 += M_001 * D_211 + M_010 * D_220 + M_100 * D_310; + l_b->F_300 += M_001 * D_301 + M_010 * D_310 + M_100 * D_400; /* Compute 4th order field tensor terms (addition to rank 4) */ - l_b->F_004 += m_a->M_000 * pot.D_004; - l_b->F_013 += m_a->M_000 * pot.D_013; - l_b->F_022 += m_a->M_000 * pot.D_022; - l_b->F_031 += m_a->M_000 * pot.D_031; - l_b->F_040 += m_a->M_000 * pot.D_040; - l_b->F_103 += m_a->M_000 * pot.D_103; - l_b->F_112 += m_a->M_000 * pot.D_112; - l_b->F_121 += m_a->M_000 * pot.D_121; - l_b->F_130 += m_a->M_000 * pot.D_130; - l_b->F_202 += m_a->M_000 * pot.D_202; - l_b->F_211 += m_a->M_000 * pot.D_211; - l_b->F_220 += m_a->M_000 * pot.D_220; - l_b->F_301 += m_a->M_000 * pot.D_301; - l_b->F_310 += m_a->M_000 * pot.D_310; - l_b->F_400 += m_a->M_000 * pot.D_400; + l_b->F_004 += M_000 * D_004; + l_b->F_013 += M_000 * D_013; + l_b->F_022 += M_000 * D_022; + l_b->F_031 += M_000 * D_031; + l_b->F_040 += M_000 * D_040; + l_b->F_103 += M_000 * D_103; + l_b->F_112 += M_000 * D_112; + l_b->F_121 += M_000 * D_121; + l_b->F_130 += M_000 * D_130; + l_b->F_202 += M_000 * D_202; + l_b->F_211 += M_000 * D_211; + l_b->F_220 += M_000 * D_220; + l_b->F_301 += M_000 * D_301; + l_b->F_310 += M_000 * D_310; + l_b->F_400 += M_000 * D_400; #endif #if SELF_GRAVITY_MULTIPOLE_ORDER > 4 + const float M_500 = m_a->M_500; + const float M_050 = m_a->M_050; + const float M_005 = m_a->M_005; + const float M_410 = m_a->M_410; + const float M_401 = m_a->M_401; + const float M_041 = m_a->M_041; + const float M_140 = m_a->M_140; + const float M_014 = m_a->M_014; + const float M_104 = m_a->M_104; + const float M_320 = m_a->M_320; + const float M_302 = m_a->M_302; + const float M_230 = m_a->M_230; + const float M_032 = m_a->M_032; + const float M_203 = m_a->M_203; + const float M_023 = m_a->M_023; + const float M_122 = m_a->M_122; + const float M_212 = m_a->M_212; + const float M_221 = m_a->M_221; + const float M_311 = m_a->M_311; + const float M_131 = m_a->M_131; + const float M_113 = m_a->M_113; + + const float D_500 = pot->D_500; + const float D_050 = pot->D_050; + const float D_005 = pot->D_005; + const float D_410 = pot->D_410; + const float D_401 = pot->D_401; + const float D_041 = pot->D_041; + const float D_140 = pot->D_140; + const float D_014 = pot->D_014; + const float D_104 = pot->D_104; + const float D_320 = pot->D_320; + const float D_302 = pot->D_302; + const float D_230 = pot->D_230; + const float D_032 = pot->D_032; + const float D_203 = pot->D_203; + const float D_023 = pot->D_023; + const float D_122 = pot->D_122; + const float D_212 = pot->D_212; + const float D_221 = pot->D_221; + const float D_311 = pot->D_311; + const float D_131 = pot->D_131; + const float D_113 = pot->D_113; + /* Compute 5th order field tensor terms (addition to rank 0) */ - l_b->F_000 += - m_a->M_005 * pot.D_005 + m_a->M_014 * pot.D_014 + m_a->M_023 * pot.D_023 + - m_a->M_032 * pot.D_032 + m_a->M_041 * pot.D_041 + m_a->M_050 * pot.D_050 + - m_a->M_104 * pot.D_104 + m_a->M_113 * pot.D_113 + m_a->M_122 * pot.D_122 + - m_a->M_131 * pot.D_131 + m_a->M_140 * pot.D_140 + m_a->M_203 * pot.D_203 + - m_a->M_212 * pot.D_212 + m_a->M_221 * pot.D_221 + m_a->M_230 * pot.D_230 + - m_a->M_302 * pot.D_302 + m_a->M_311 * pot.D_311 + m_a->M_320 * pot.D_320 + - m_a->M_401 * pot.D_401 + m_a->M_410 * pot.D_410 + m_a->M_500 * pot.D_500; + l_b->F_000 += M_005 * D_005 + M_014 * D_014 + M_023 * D_023 + + M_032 * D_032 + M_041 * D_041 + M_050 * D_050 + + M_104 * D_104 + M_113 * D_113 + M_122 * D_122 + + M_131 * D_131 + M_140 * D_140 + M_203 * D_203 + + M_212 * D_212 + M_221 * D_221 + M_230 * D_230 + + M_302 * D_302 + M_311 * D_311 + M_320 * D_320 + + M_401 * D_401 + M_410 * D_410 + M_500 * D_500; /* Compute 5th order field tensor terms (addition to rank 1) */ - l_b->F_001 += - m_a->M_004 * pot.D_005 + m_a->M_013 * pot.D_014 + m_a->M_022 * pot.D_023 + - m_a->M_031 * pot.D_032 + m_a->M_040 * pot.D_041 + m_a->M_103 * pot.D_104 + - m_a->M_112 * pot.D_113 + m_a->M_121 * pot.D_122 + m_a->M_130 * pot.D_131 + - m_a->M_202 * pot.D_203 + m_a->M_211 * pot.D_212 + m_a->M_220 * pot.D_221 + - m_a->M_301 * pot.D_302 + m_a->M_310 * pot.D_311 + m_a->M_400 * pot.D_401; - l_b->F_010 += - m_a->M_004 * pot.D_014 + m_a->M_013 * pot.D_023 + m_a->M_022 * pot.D_032 + - m_a->M_031 * pot.D_041 + m_a->M_040 * pot.D_050 + m_a->M_103 * pot.D_113 + - m_a->M_112 * pot.D_122 + m_a->M_121 * pot.D_131 + m_a->M_130 * pot.D_140 + - m_a->M_202 * pot.D_212 + m_a->M_211 * pot.D_221 + m_a->M_220 * pot.D_230 + - m_a->M_301 * pot.D_311 + m_a->M_310 * pot.D_320 + m_a->M_400 * pot.D_410; - l_b->F_100 += - m_a->M_004 * pot.D_104 + m_a->M_013 * pot.D_113 + m_a->M_022 * pot.D_122 + - m_a->M_031 * pot.D_131 + m_a->M_040 * pot.D_140 + m_a->M_103 * pot.D_203 + - m_a->M_112 * pot.D_212 + m_a->M_121 * pot.D_221 + m_a->M_130 * pot.D_230 + - m_a->M_202 * pot.D_302 + m_a->M_211 * pot.D_311 + m_a->M_220 * pot.D_320 + - m_a->M_301 * pot.D_401 + m_a->M_310 * pot.D_410 + m_a->M_400 * pot.D_500; + l_b->F_001 += M_004 * D_005 + M_013 * D_014 + M_022 * D_023 + + M_031 * D_032 + M_040 * D_041 + M_103 * D_104 + + M_112 * D_113 + M_121 * D_122 + M_130 * D_131 + + M_202 * D_203 + M_211 * D_212 + M_220 * D_221 + + M_301 * D_302 + M_310 * D_311 + M_400 * D_401; + l_b->F_010 += M_004 * D_014 + M_013 * D_023 + M_022 * D_032 + + M_031 * D_041 + M_040 * D_050 + M_103 * D_113 + + M_112 * D_122 + M_121 * D_131 + M_130 * D_140 + + M_202 * D_212 + M_211 * D_221 + M_220 * D_230 + + M_301 * D_311 + M_310 * D_320 + M_400 * D_410; + l_b->F_100 += M_004 * D_104 + M_013 * D_113 + M_022 * D_122 + + M_031 * D_131 + M_040 * D_140 + M_103 * D_203 + + M_112 * D_212 + M_121 * D_221 + M_130 * D_230 + + M_202 * D_302 + M_211 * D_311 + M_220 * D_320 + + M_301 * D_401 + M_310 * D_410 + M_400 * D_500; /* Compute 5th order field tensor terms (addition to rank 2) */ - l_b->F_002 += m_a->M_003 * pot.D_005 + m_a->M_012 * pot.D_014 + - m_a->M_021 * pot.D_023 + m_a->M_030 * pot.D_032 + - m_a->M_102 * pot.D_104 + m_a->M_111 * pot.D_113 + - m_a->M_120 * pot.D_122 + m_a->M_201 * pot.D_203 + - m_a->M_210 * pot.D_212 + m_a->M_300 * pot.D_302; - l_b->F_011 += m_a->M_003 * pot.D_014 + m_a->M_012 * pot.D_023 + - m_a->M_021 * pot.D_032 + m_a->M_030 * pot.D_041 + - m_a->M_102 * pot.D_113 + m_a->M_111 * pot.D_122 + - m_a->M_120 * pot.D_131 + m_a->M_201 * pot.D_212 + - m_a->M_210 * pot.D_221 + m_a->M_300 * pot.D_311; - l_b->F_020 += m_a->M_003 * pot.D_023 + m_a->M_012 * pot.D_032 + - m_a->M_021 * pot.D_041 + m_a->M_030 * pot.D_050 + - m_a->M_102 * pot.D_122 + m_a->M_111 * pot.D_131 + - m_a->M_120 * pot.D_140 + m_a->M_201 * pot.D_221 + - m_a->M_210 * pot.D_230 + m_a->M_300 * pot.D_320; - l_b->F_101 += m_a->M_003 * pot.D_104 + m_a->M_012 * pot.D_113 + - m_a->M_021 * pot.D_122 + m_a->M_030 * pot.D_131 + - m_a->M_102 * pot.D_203 + m_a->M_111 * pot.D_212 + - m_a->M_120 * pot.D_221 + m_a->M_201 * pot.D_302 + - m_a->M_210 * pot.D_311 + m_a->M_300 * pot.D_401; - l_b->F_110 += m_a->M_003 * pot.D_113 + m_a->M_012 * pot.D_122 + - m_a->M_021 * pot.D_131 + m_a->M_030 * pot.D_140 + - m_a->M_102 * pot.D_212 + m_a->M_111 * pot.D_221 + - m_a->M_120 * pot.D_230 + m_a->M_201 * pot.D_311 + - m_a->M_210 * pot.D_320 + m_a->M_300 * pot.D_410; - l_b->F_200 += m_a->M_003 * pot.D_203 + m_a->M_012 * pot.D_212 + - m_a->M_021 * pot.D_221 + m_a->M_030 * pot.D_230 + - m_a->M_102 * pot.D_302 + m_a->M_111 * pot.D_311 + - m_a->M_120 * pot.D_320 + m_a->M_201 * pot.D_401 + - m_a->M_210 * pot.D_410 + m_a->M_300 * pot.D_500; + l_b->F_002 += M_003 * D_005 + M_012 * D_014 + M_021 * D_023 + + M_030 * D_032 + M_102 * D_104 + M_111 * D_113 + + M_120 * D_122 + M_201 * D_203 + M_210 * D_212 + + M_300 * D_302; + l_b->F_011 += M_003 * D_014 + M_012 * D_023 + M_021 * D_032 + + M_030 * D_041 + M_102 * D_113 + M_111 * D_122 + + M_120 * D_131 + M_201 * D_212 + M_210 * D_221 + + M_300 * D_311; + l_b->F_020 += M_003 * D_023 + M_012 * D_032 + M_021 * D_041 + + M_030 * D_050 + M_102 * D_122 + M_111 * D_131 + + M_120 * D_140 + M_201 * D_221 + M_210 * D_230 + + M_300 * D_320; + l_b->F_101 += M_003 * D_104 + M_012 * D_113 + M_021 * D_122 + + M_030 * D_131 + M_102 * D_203 + M_111 * D_212 + + M_120 * D_221 + M_201 * D_302 + M_210 * D_311 + + M_300 * D_401; + l_b->F_110 += M_003 * D_113 + M_012 * D_122 + M_021 * D_131 + + M_030 * D_140 + M_102 * D_212 + M_111 * D_221 + + M_120 * D_230 + M_201 * D_311 + M_210 * D_320 + + M_300 * D_410; + l_b->F_200 += M_003 * D_203 + M_012 * D_212 + M_021 * D_221 + + M_030 * D_230 + M_102 * D_302 + M_111 * D_311 + + M_120 * D_320 + M_201 * D_401 + M_210 * D_410 + + M_300 * D_500; /* Compute 5th order field tensor terms (addition to rank 3) */ - l_b->F_003 += m_a->M_002 * pot.D_005 + m_a->M_011 * pot.D_014 + - m_a->M_020 * pot.D_023 + m_a->M_101 * pot.D_104 + - m_a->M_110 * pot.D_113 + m_a->M_200 * pot.D_203; - l_b->F_012 += m_a->M_002 * pot.D_014 + m_a->M_011 * pot.D_023 + - m_a->M_020 * pot.D_032 + m_a->M_101 * pot.D_113 + - m_a->M_110 * pot.D_122 + m_a->M_200 * pot.D_212; - l_b->F_021 += m_a->M_002 * pot.D_023 + m_a->M_011 * pot.D_032 + - m_a->M_020 * pot.D_041 + m_a->M_101 * pot.D_122 + - m_a->M_110 * pot.D_131 + m_a->M_200 * pot.D_221; - l_b->F_030 += m_a->M_002 * pot.D_032 + m_a->M_011 * pot.D_041 + - m_a->M_020 * pot.D_050 + m_a->M_101 * pot.D_131 + - m_a->M_110 * pot.D_140 + m_a->M_200 * pot.D_230; - l_b->F_102 += m_a->M_002 * pot.D_104 + m_a->M_011 * pot.D_113 + - m_a->M_020 * pot.D_122 + m_a->M_101 * pot.D_203 + - m_a->M_110 * pot.D_212 + m_a->M_200 * pot.D_302; - l_b->F_111 += m_a->M_002 * pot.D_113 + m_a->M_011 * pot.D_122 + - m_a->M_020 * pot.D_131 + m_a->M_101 * pot.D_212 + - m_a->M_110 * pot.D_221 + m_a->M_200 * pot.D_311; - l_b->F_120 += m_a->M_002 * pot.D_122 + m_a->M_011 * pot.D_131 + - m_a->M_020 * pot.D_140 + m_a->M_101 * pot.D_221 + - m_a->M_110 * pot.D_230 + m_a->M_200 * pot.D_320; - l_b->F_201 += m_a->M_002 * pot.D_203 + m_a->M_011 * pot.D_212 + - m_a->M_020 * pot.D_221 + m_a->M_101 * pot.D_302 + - m_a->M_110 * pot.D_311 + m_a->M_200 * pot.D_401; - l_b->F_210 += m_a->M_002 * pot.D_212 + m_a->M_011 * pot.D_221 + - m_a->M_020 * pot.D_230 + m_a->M_101 * pot.D_311 + - m_a->M_110 * pot.D_320 + m_a->M_200 * pot.D_410; - l_b->F_300 += m_a->M_002 * pot.D_302 + m_a->M_011 * pot.D_311 + - m_a->M_020 * pot.D_320 + m_a->M_101 * pot.D_401 + - m_a->M_110 * pot.D_410 + m_a->M_200 * pot.D_500; + l_b->F_003 += M_002 * D_005 + M_011 * D_014 + M_020 * D_023 + + M_101 * D_104 + M_110 * D_113 + M_200 * D_203; + l_b->F_012 += M_002 * D_014 + M_011 * D_023 + M_020 * D_032 + + M_101 * D_113 + M_110 * D_122 + M_200 * D_212; + l_b->F_021 += M_002 * D_023 + M_011 * D_032 + M_020 * D_041 + + M_101 * D_122 + M_110 * D_131 + M_200 * D_221; + l_b->F_030 += M_002 * D_032 + M_011 * D_041 + M_020 * D_050 + + M_101 * D_131 + M_110 * D_140 + M_200 * D_230; + l_b->F_102 += M_002 * D_104 + M_011 * D_113 + M_020 * D_122 + + M_101 * D_203 + M_110 * D_212 + M_200 * D_302; + l_b->F_111 += M_002 * D_113 + M_011 * D_122 + M_020 * D_131 + + M_101 * D_212 + M_110 * D_221 + M_200 * D_311; + l_b->F_120 += M_002 * D_122 + M_011 * D_131 + M_020 * D_140 + + M_101 * D_221 + M_110 * D_230 + M_200 * D_320; + l_b->F_201 += M_002 * D_203 + M_011 * D_212 + M_020 * D_221 + + M_101 * D_302 + M_110 * D_311 + M_200 * D_401; + l_b->F_210 += M_002 * D_212 + M_011 * D_221 + M_020 * D_230 + + M_101 * D_311 + M_110 * D_320 + M_200 * D_410; + l_b->F_300 += M_002 * D_302 + M_011 * D_311 + M_020 * D_320 + + M_101 * D_401 + M_110 * D_410 + M_200 * D_500; /* Compute 5th order field tensor terms (addition to rank 4) */ - l_b->F_004 += - m_a->M_001 * pot.D_005 + m_a->M_010 * pot.D_014 + m_a->M_100 * pot.D_104; - l_b->F_013 += - m_a->M_001 * pot.D_014 + m_a->M_010 * pot.D_023 + m_a->M_100 * pot.D_113; - l_b->F_022 += - m_a->M_001 * pot.D_023 + m_a->M_010 * pot.D_032 + m_a->M_100 * pot.D_122; - l_b->F_031 += - m_a->M_001 * pot.D_032 + m_a->M_010 * pot.D_041 + m_a->M_100 * pot.D_131; - l_b->F_040 += - m_a->M_001 * pot.D_041 + m_a->M_010 * pot.D_050 + m_a->M_100 * pot.D_140; - l_b->F_103 += - m_a->M_001 * pot.D_104 + m_a->M_010 * pot.D_113 + m_a->M_100 * pot.D_203; - l_b->F_112 += - m_a->M_001 * pot.D_113 + m_a->M_010 * pot.D_122 + m_a->M_100 * pot.D_212; - l_b->F_121 += - m_a->M_001 * pot.D_122 + m_a->M_010 * pot.D_131 + m_a->M_100 * pot.D_221; - l_b->F_130 += - m_a->M_001 * pot.D_131 + m_a->M_010 * pot.D_140 + m_a->M_100 * pot.D_230; - l_b->F_202 += - m_a->M_001 * pot.D_203 + m_a->M_010 * pot.D_212 + m_a->M_100 * pot.D_302; - l_b->F_211 += - m_a->M_001 * pot.D_212 + m_a->M_010 * pot.D_221 + m_a->M_100 * pot.D_311; - l_b->F_220 += - m_a->M_001 * pot.D_221 + m_a->M_010 * pot.D_230 + m_a->M_100 * pot.D_320; - l_b->F_301 += - m_a->M_001 * pot.D_302 + m_a->M_010 * pot.D_311 + m_a->M_100 * pot.D_401; - l_b->F_310 += - m_a->M_001 * pot.D_311 + m_a->M_010 * pot.D_320 + m_a->M_100 * pot.D_410; - l_b->F_400 += - m_a->M_001 * pot.D_401 + m_a->M_010 * pot.D_410 + m_a->M_100 * pot.D_500; + l_b->F_004 += M_001 * D_005 + M_010 * D_014 + M_100 * D_104; + l_b->F_013 += M_001 * D_014 + M_010 * D_023 + M_100 * D_113; + l_b->F_022 += M_001 * D_023 + M_010 * D_032 + M_100 * D_122; + l_b->F_031 += M_001 * D_032 + M_010 * D_041 + M_100 * D_131; + l_b->F_040 += M_001 * D_041 + M_010 * D_050 + M_100 * D_140; + l_b->F_103 += M_001 * D_104 + M_010 * D_113 + M_100 * D_203; + l_b->F_112 += M_001 * D_113 + M_010 * D_122 + M_100 * D_212; + l_b->F_121 += M_001 * D_122 + M_010 * D_131 + M_100 * D_221; + l_b->F_130 += M_001 * D_131 + M_010 * D_140 + M_100 * D_230; + l_b->F_202 += M_001 * D_203 + M_010 * D_212 + M_100 * D_302; + l_b->F_211 += M_001 * D_212 + M_010 * D_221 + M_100 * D_311; + l_b->F_220 += M_001 * D_221 + M_010 * D_230 + M_100 * D_320; + l_b->F_301 += M_001 * D_302 + M_010 * D_311 + M_100 * D_401; + l_b->F_310 += M_001 * D_311 + M_010 * D_320 + M_100 * D_410; + l_b->F_400 += M_001 * D_401 + M_010 * D_410 + M_100 * D_500; /* Compute 5th order field tensor terms (addition to rank 5) */ - l_b->F_005 += m_a->M_000 * pot.D_005; - l_b->F_014 += m_a->M_000 * pot.D_014; - l_b->F_023 += m_a->M_000 * pot.D_023; - l_b->F_032 += m_a->M_000 * pot.D_032; - l_b->F_041 += m_a->M_000 * pot.D_041; - l_b->F_050 += m_a->M_000 * pot.D_050; - l_b->F_104 += m_a->M_000 * pot.D_104; - l_b->F_113 += m_a->M_000 * pot.D_113; - l_b->F_122 += m_a->M_000 * pot.D_122; - l_b->F_131 += m_a->M_000 * pot.D_131; - l_b->F_140 += m_a->M_000 * pot.D_140; - l_b->F_203 += m_a->M_000 * pot.D_203; - l_b->F_212 += m_a->M_000 * pot.D_212; - l_b->F_221 += m_a->M_000 * pot.D_221; - l_b->F_230 += m_a->M_000 * pot.D_230; - l_b->F_302 += m_a->M_000 * pot.D_302; - l_b->F_311 += m_a->M_000 * pot.D_311; - l_b->F_320 += m_a->M_000 * pot.D_320; - l_b->F_401 += m_a->M_000 * pot.D_401; - l_b->F_410 += m_a->M_000 * pot.D_410; - l_b->F_500 += m_a->M_000 * pot.D_500; + l_b->F_005 += M_000 * D_005; + l_b->F_014 += M_000 * D_014; + l_b->F_023 += M_000 * D_023; + l_b->F_032 += M_000 * D_032; + l_b->F_041 += M_000 * D_041; + l_b->F_050 += M_000 * D_050; + l_b->F_104 += M_000 * D_104; + l_b->F_113 += M_000 * D_113; + l_b->F_122 += M_000 * D_122; + l_b->F_131 += M_000 * D_131; + l_b->F_140 += M_000 * D_140; + l_b->F_203 += M_000 * D_203; + l_b->F_212 += M_000 * D_212; + l_b->F_221 += M_000 * D_221; + l_b->F_230 += M_000 * D_230; + l_b->F_302 += M_000 * D_302; + l_b->F_311 += M_000 * D_311; + l_b->F_320 += M_000 * D_320; + l_b->F_401 += M_000 * D_401; + l_b->F_410 += M_000 * D_410; + l_b->F_500 += M_000 * D_500; #endif #if SELF_GRAVITY_MULTIPOLE_ORDER > 5 @@ -1935,6 +1955,109 @@ INLINE static void gravity_M2L(struct grav_tensor *l_b, #endif } +/** + * @brief Compute the field tensor due to a multipole. + * + * @param l_b The field tensor to compute. + * @param m_a The multipole. + * @param pos_b The position of the field tensor. + * @param pos_a The position of the multipole. + * @param props The #gravity_props of this calculation. + * @param periodic Is the calculation periodic ? + * @param dim The size of the simulation box. + * @param rs_inv The inverse of the gravity mesh-smoothing scale. + */ +INLINE static void gravity_M2L_nonsym( + struct grav_tensor *l_b, const struct multipole *m_a, const double pos_b[3], + const double pos_a[3], const struct gravity_props *props, + const int periodic, const double dim[3], const float rs_inv) { + + /* Recover some constants */ + const float eps = props->epsilon_cur; + const float eps_inv = props->epsilon_cur_inv; + + /* Compute distance vector */ + float dx = (float)(pos_b[0] - pos_a[0]); + float dy = (float)(pos_b[1] - pos_a[1]); + float dz = (float)(pos_b[2] - pos_a[2]); + + /* Apply BC */ + if (periodic) { + dx = nearest(dx, dim[0]); + dy = nearest(dy, dim[1]); + dz = nearest(dz, dim[2]); + } + + /* Compute distance */ + const float r2 = dx * dx + dy * dy + dz * dz; + const float r_inv = 1. / sqrtf(r2); + + /* Compute all derivatives */ + struct potential_derivatives_M2L pot; + potential_derivatives_compute_M2L(dx, dy, dz, r2, r_inv, eps, eps_inv, + periodic, rs_inv, &pot); + + /* Do the M2L tensor multiplication */ + gravity_M2L_apply(l_b, m_a, &pot); +} + +/** + * @brief Compute the field tensor due to a multipole and the symmetric + * equivalent. + * + * @param l_a The first field tensor to compute. + * @param l_b The second field tensor to compute. + * @param m_a The first multipole. + * @param m_b The second multipole. + * @param pos_a The position of the first m-pole and field tensor. + * @param pos_b The position of the second m-pole and field tensor. + * @param props The #gravity_props of this calculation. + * @param periodic Is the calculation periodic ? + * @param dim The size of the simulation box. + * @param rs_inv The inverse of the gravity mesh-smoothing scale. + */ +INLINE static void gravity_M2L_symmetric( + struct grav_tensor *restrict l_a, struct grav_tensor *restrict l_b, + const struct multipole *restrict m_a, const struct multipole *restrict m_b, + const double pos_a[3], const double pos_b[3], + const struct gravity_props *props, const int periodic, const double dim[3], + const float rs_inv) { + + /* Recover some constants */ + const float eps = props->epsilon_cur; + const float eps_inv = props->epsilon_cur_inv; + + /* Compute distance vector */ + float dx = (float)(pos_b[0] - pos_a[0]); + float dy = (float)(pos_b[1] - pos_a[1]); + float dz = (float)(pos_b[2] - pos_a[2]); + + /* Apply BC */ + if (periodic) { + dx = nearest(dx, dim[0]); + dy = nearest(dy, dim[1]); + dz = nearest(dz, dim[2]); + } + + /* Compute distance */ + const float r2 = dx * dx + dy * dy + dz * dz; + const float r_inv = 1. / sqrtf(r2); + + /* Compute all derivatives */ + struct potential_derivatives_M2L pot; + potential_derivatives_compute_M2L(dx, dy, dz, r2, r_inv, eps, eps_inv, + periodic, rs_inv, &pot); + + /* Do the first M2L tensor multiplication */ + gravity_M2L_apply(l_b, m_a, &pot); + + /* Flip the signs of odd derivatives */ + potential_derivatives_flip_signs(&pot); + + /* Do the second M2L tensor multiplication */ + gravity_M2L_apply(l_a, m_b, &pot); +} + /** * @brief Creates a copy of #grav_tensor shifted to a new location. * @@ -1945,8 +2068,8 @@ INLINE static void gravity_M2L(struct grav_tensor *l_b, * @param pos_a The position to which m_b will be shifted. * @param pos_b The current postion of the multipole to shift. */ -INLINE static void gravity_L2L(struct grav_tensor *la, - const struct grav_tensor *lb, +INLINE static void gravity_L2L(struct grav_tensor *restrict la, + const struct grav_tensor *restrict lb, const double pos_a[3], const double pos_b[3]) { /* Initialise everything to zero */ diff --git a/src/runner.c b/src/runner.c index 56b9a85f639f4a45aa4255f6cd3087e8e7e4fdf9..0be6288cbcc0b77fbe9ba3128d820b82a8943480 100644 --- a/src/runner.c +++ b/src/runner.c @@ -2273,7 +2273,7 @@ void *runner_main(void *data) { runner_do_grav_long_range(r, t->ci, 1); break; case task_type_grav_mm: - runner_dopair_grav_mm_symmetric(r, t->ci, t->cj); + runner_dopair_grav_mm(r, t->ci, t->cj); break; case task_type_cooling: runner_do_cooling(r, t->ci, 1); diff --git a/src/runner_doiact_grav.h b/src/runner_doiact_grav.h index 961e64006f6d732e501473caaaf1b6c2f75aab31..f9d1f833e65f7e6dde6085696dfbcd0719ae2a87 100644 --- a/src/runner_doiact_grav.h +++ b/src/runner_doiact_grav.h @@ -670,8 +670,10 @@ static INLINE void runner_dopair_grav_pp(struct runner *r, struct cell *ci, TIMER_TIC; /* Record activity status */ - const int ci_active = cell_is_active_gravity(ci, e); - const int cj_active = cell_is_active_gravity(cj, e); + const int ci_active = + cell_is_active_gravity(ci, e) && (ci->nodeID == e->nodeID); + const int cj_active = + cell_is_active_gravity(cj, e) && (cj->nodeID == e->nodeID); /* Anything to do here? */ if (!ci_active && !cj_active) return; @@ -1158,6 +1160,72 @@ static INLINE void runner_doself_grav_pp(struct runner *r, struct cell *c) { TIMER_TOC(timer_doself_grav_pp); } +/** + * @brief Computes the interaction of the field tensor and multipole + * of two cells symmetrically. + * + * @param r The #runner. + * @param ci The first #cell. + * @param cj The second #cell. + */ +static INLINE void runner_dopair_grav_mm_symmetric(struct runner *r, + struct cell *restrict ci, + struct cell *restrict cj) { + + /* Some constants */ + const struct engine *e = r->e; + const struct gravity_props *props = e->gravity_properties; + const int periodic = e->mesh->periodic; + const double dim[3] = {e->mesh->dim[0], e->mesh->dim[1], e->mesh->dim[2]}; + const float r_s_inv = e->mesh->r_s_inv; + + TIMER_TIC; + + /* Anything to do here? */ + if ((!cell_is_active_gravity_mm(ci, e) || ci->nodeID != engine_rank) || + (!cell_is_active_gravity_mm(cj, e) || cj->nodeID != engine_rank)) + error("Invalid state in symmetric M-M calculation!"); + + /* Short-cut to the multipole */ + const struct multipole *multi_i = &ci->multipole->m_pole; + const struct multipole *multi_j = &cj->multipole->m_pole; + +#ifdef SWIFT_DEBUG_CHECKS + if (ci == cj) error("Interacting a cell with itself using M2L"); + + if (multi_i->num_gpart == 0) + error("Multipole i does not seem to have been set."); + + if (multi_j->num_gpart == 0) + error("Multipole j does not seem to have been set."); + + if (ci->multipole->pot.ti_init != e->ti_current) + error("ci->grav tensor not initialised."); + + if (ci->multipole->pot.ti_init != e->ti_current) + error("cj->grav tensor not initialised."); + + if (ci->ti_old_multipole != e->ti_current) + error( + "Undrifted multipole ci->ti_old_multipole=%lld ci->nodeID=%d " + "cj->nodeID=%d e->ti_current=%lld", + ci->ti_old_multipole, ci->nodeID, cj->nodeID, e->ti_current); + + if (cj->ti_old_multipole != e->ti_current) + error( + "Undrifted multipole cj->ti_old_multipole=%lld cj->nodeID=%d " + "ci->nodeID=%d e->ti_current=%lld", + cj->ti_old_multipole, cj->nodeID, ci->nodeID, e->ti_current); +#endif + + /* Let's interact at this level */ + gravity_M2L_symmetric(&ci->multipole->pot, &cj->multipole->pot, multi_i, + multi_j, ci->multipole->CoM, cj->multipole->CoM, props, + periodic, dim, r_s_inv); + + TIMER_TOC(timer_dopair_grav_mm); +} + /** * @brief Computes the interaction of the field tensor in a cell with the * multipole of another cell. @@ -1166,9 +1234,9 @@ static INLINE void runner_doself_grav_pp(struct runner *r, struct cell *c) { * @param ci The #cell with field tensor to interact. * @param cj The #cell with the multipole. */ -static INLINE void runner_dopair_grav_mm(struct runner *r, - struct cell *restrict ci, - struct cell *restrict cj) { +static INLINE void runner_dopair_grav_mm_nonsym( + struct runner *r, struct cell *restrict ci, + const struct cell *restrict cj) { /* Some constants */ const struct engine *e = r->e; @@ -1193,22 +1261,49 @@ static INLINE void runner_dopair_grav_mm(struct runner *r, if (ci->multipole->pot.ti_init != e->ti_current) error("ci->grav tensor not initialised."); -#endif - /* Do we need to drift the multipole ? */ if (cj->ti_old_multipole != e->ti_current) error( "Undrifted multipole cj->ti_old_multipole=%lld cj->nodeID=%d " "ci->nodeID=%d e->ti_current=%lld", cj->ti_old_multipole, cj->nodeID, ci->nodeID, e->ti_current); +#endif /* Let's interact at this level */ - gravity_M2L(&ci->multipole->pot, multi_j, ci->multipole->CoM, - cj->multipole->CoM, props, periodic, dim, r_s_inv); + gravity_M2L_nonsym(&ci->multipole->pot, multi_j, ci->multipole->CoM, + cj->multipole->CoM, props, periodic, dim, r_s_inv); TIMER_TOC(timer_dopair_grav_mm); } +/** + * @brief Call the M-M calculation on two cells if active. + * + * @param r The #runner object. + * @param ci The first #cell. + * @param cj The second #cell. + */ +static INLINE void runner_dopair_grav_mm(struct runner *r, + struct cell *restrict ci, + struct cell *restrict cj) { + + const struct engine *e = r->e; + + const int do_i = + cell_is_active_gravity_mm(ci, e) && (ci->nodeID == e->nodeID); + const int do_j = + cell_is_active_gravity_mm(cj, e) && (cj->nodeID == e->nodeID); + + if (do_i && do_j) + runner_dopair_grav_mm_symmetric(r, ci, cj); + else if (do_i) + runner_dopair_grav_mm_nonsym(r, ci, cj); + else if (do_j) + runner_dopair_grav_mm_nonsym(r, cj, ci); + else + error("Running M-M task with two inactive cells."); +} + static INLINE void runner_dopair_recursive_grav_pm(struct runner *r, struct cell *ci, const struct cell *cj) { @@ -1381,9 +1476,8 @@ static INLINE void runner_dopair_recursive_grav(struct runner *r, /* Can we use M-M interactions ? */ if (gravity_M2L_accept(multi_i->r_max, multi_j->r_max, theta_crit2, r2)) { - /* MATTHIEU: make a symmetric M-M interaction function ! */ + /* Go M-M */ runner_dopair_grav_mm(r, ci, cj); - runner_dopair_grav_mm(r, cj, ci); } else if (!ci->split && !cj->split) { @@ -1503,28 +1597,6 @@ static INLINE void runner_doself_recursive_grav(struct runner *r, if (gettimer) TIMER_TOC(timer_dosub_self_grav); } -/** - * @brief Call the non-symmetric M-M calculation on two cells if active. - * - * @param r The #runner object. - * @param ci The first #cell. - * @param cj The second #cell. - */ -static INLINE void runner_dopair_grav_mm_symmetric(struct runner *r, - struct cell *ci, - struct cell *cj) { - - const struct engine *e = r->e; - -#ifdef SWIFT_DEBUG_CHECKS - if (!cell_is_active_gravity(ci, e) && !cell_is_active_gravity(cj, e)) - error("Running M-M task with two inactive cells."); -#endif - - if (cell_is_active_gravity_mm(ci, e)) runner_dopair_grav_mm(r, ci, cj); - if (cell_is_active_gravity_mm(cj, e)) runner_dopair_grav_mm(r, cj, ci); -} - /** * @brief Performs all M-M interactions between a given top-level cell and all * the other top-levels that are far enough. @@ -1622,7 +1694,7 @@ static INLINE void runner_do_grav_long_range(struct runner *r, struct cell *ci, theta_crit2, r2_rebuild)) { /* Call the PM interaction fucntion on the active sub-cells of ci */ - runner_dopair_grav_mm(r, ci, cj); + runner_dopair_grav_mm_nonsym(r, ci, cj); // runner_dopair_recursive_grav_pm(r, ci, cj); /* Record that this multipole received a contribution */ diff --git a/src/space.c b/src/space.c index 11fc5c9422103895d5f0cbf7799864a981547090..281c452a9d75b1ca7d214f117364d96fc4515df7 100644 --- a/src/space.c +++ b/src/space.c @@ -360,6 +360,7 @@ void space_regrid(struct space *s, int verbose) { /* Free the old cells, if they were allocated. */ if (s->cells_top != NULL) { space_free_cells(s); + free(s->local_cells_with_tasks_top); free(s->local_cells_top); free(s->cells_top); free(s->multipoles_top); @@ -398,6 +399,12 @@ void space_regrid(struct space *s, int verbose) { error("Failed to allocate indices of local top-level cells."); bzero(s->local_cells_top, s->nr_cells * sizeof(int)); + /* Allocate the indices of local cells with tasks */ + if (posix_memalign((void **)&s->local_cells_with_tasks_top, + SWIFT_STRUCT_ALIGNMENT, s->nr_cells * sizeof(int)) != 0) + error("Failed to allocate indices of local top-level cells."); + bzero(s->local_cells_with_tasks_top, s->nr_cells * sizeof(int)); + /* Set the cells' locks */ for (int k = 0; k < s->nr_cells; k++) { if (lock_init(&s->cells_top[k].lock) != 0) @@ -2337,7 +2344,7 @@ void space_free_buff_sort_indices(struct space *s) { /** * @brief Construct the list of top-level cells that have any tasks in - * their hierarchy. + * their hierarchy on this MPI rank. * * This assumes the list has been pre-allocated at a regrid. * @@ -2345,15 +2352,38 @@ void space_free_buff_sort_indices(struct space *s) { */ void space_list_cells_with_tasks(struct space *s) { - /* Let's rebuild the list of local top-level cells */ - s->nr_local_cells = 0; + s->nr_local_cells_with_tasks = 0; + for (int i = 0; i < s->nr_cells; ++i) if (cell_has_tasks(&s->cells_top[i])) { + s->local_cells_with_tasks_top[s->nr_local_cells_with_tasks] = i; + s->nr_local_cells_with_tasks++; + } + if (s->e->verbose) + message("Have %d local top-level cells with tasks (total=%d)", + s->nr_local_cells_with_tasks, s->nr_cells); +} + +/** + * @brief Construct the list of local top-level cells. + * + * This assumes the list has been pre-allocated at a regrid. + * + * @param s The #space. + */ +void space_list_local_cells(struct space *s) { + + s->nr_local_cells = 0; + + for (int i = 0; i < s->nr_cells; ++i) + if (s->cells_top[i].nodeID == engine_rank) { s->local_cells_top[s->nr_local_cells] = i; s->nr_local_cells++; } + if (s->e->verbose) - message("Have %d local cells (total=%d)", s->nr_local_cells, s->nr_cells); + message("Have %d local top-level cells (total=%d)", s->nr_local_cells, + s->nr_cells); } void space_synchronize_particle_positions_mapper(void *map_data, int nr_gparts, @@ -3287,6 +3317,7 @@ void space_clean(struct space *s) { free(s->cells_top); free(s->multipoles_top); free(s->local_cells_top); + free(s->local_cells_with_tasks_top); free(s->parts); free(s->xparts); free(s->gparts); @@ -3338,6 +3369,7 @@ void space_struct_restore(struct space *s, FILE *stream) { s->multipoles_top = NULL; s->multipoles_sub = NULL; s->local_cells_top = NULL; + s->local_cells_with_tasks_top = NULL; s->grav_top_level = NULL; #ifdef WITH_MPI s->parts_foreign = NULL; diff --git a/src/space.h b/src/space.h index e3173ece1e2749a3afb8072b179150587a100a82..af6d58c6c75033662f8aedbda8d9086a4e2d0950 100644 --- a/src/space.h +++ b/src/space.h @@ -106,9 +106,12 @@ struct space { /*! Total number of cells (top- and sub-) */ int tot_cells; - /*! Number of *local* top-level cells with tasks */ + /*! Number of *local* top-level cells */ int nr_local_cells; + /*! Number of *local* top-level cells with tasks */ + int nr_local_cells_with_tasks; + /*! The (level 0) cells themselves. */ struct cell *cells_top; @@ -121,9 +124,12 @@ struct space { /*! Buffer of unused multipoles for the sub-cells. */ struct gravity_tensors *multipoles_sub; - /*! The indices of the *local* top-level cells with tasks */ + /*! The indices of the *local* top-level cells */ int *local_cells_top; + /*! The indices of the *local* top-level cells with tasks */ + int *local_cells_with_tasks_top; + /*! The total number of parts in the space. */ size_t nr_parts, size_parts; @@ -228,6 +234,7 @@ void space_recycle_list(struct space *s, struct cell *cell_list_begin, void space_split(struct space *s, struct cell *cells, int nr_cells, int verbose); void space_split_mapper(void *map_data, int num_elements, void *extra_data); +void space_list_local_cells(struct space *s); void space_list_cells_with_tasks(struct space *s); void space_parts_get_cell_index(struct space *s, int *ind, int *cell_counts, struct cell *cells, int verbose); diff --git a/tests/testGravityDerivatives.c b/tests/testGravityDerivatives.c index 184d66db623f34963dc91915c12fc58fbaa4ec4d..125cbc70815ae14056604e79a5616ac20393b650 100644 --- a/tests/testGravityDerivatives.c +++ b/tests/testGravityDerivatives.c @@ -943,6 +943,13 @@ int main(int argc, char* argv[]) { message("Testing gravity for r=(%e %e %e)", dx, dy, dz); + const double r_s = 100. * ((double)rand() / (RAND_MAX)); + const double r_s_inv = 1. / r_s; + + const int periodic = 0; + + message("Mesh scale r_s=%e periodic=%d", r_s, periodic); + /* Compute distance */ const double r2 = dx * dx + dy * dy + dz * dz; const double r_inv = 1. / sqrt(r2); @@ -953,7 +960,7 @@ int main(int argc, char* argv[]) { /* Compute all derivatives */ struct potential_derivatives_M2L pot; compute_potential_derivatives_M2L(dx, dy, dz, r2, r_inv, eps, eps_inv, - /*periodic*/ 0, /* 1/r_s */ 0., &pot); + periodic, r_s_inv, &pot); /* Minimal value we care about */ const double min = 1e-9;