Commit b2308cc0 authored by Peter W. Draper's avatar Peter W. Draper
Browse files

Merge branch 'cbrt' into 'master'

Cube root

See merge request !266
parents 820fe548 21af411d
......@@ -273,6 +273,18 @@ if test "$enable_debugging_checks" = "yes"; then
AC_DEFINE([SWIFT_DEBUG_CHECKS],1,[Enable expensive debugging])
fi
# Check if using our custom icbrtf is enalbled.
AC_ARG_ENABLE([custom-icbrtf],
[AS_HELP_STRING([--enable-custom-icbrtf],
[Use SWIFT's custom icbrtf function instead of the system cbrtf @<:@yes/no@:>@]
)],
[enable_custom_icbrtf="$enableval"],
[enable_custom_icbrtf="no"]
)
if test "$enable_custom_icbrtf" = "yes"; then
AC_DEFINE([WITH_ICBRTF],1,[Enable custom icbrtf])
fi
# Check whether we want to default to naive cell interactions
AC_ARG_ENABLE([naive-interactions],
[AS_HELP_STRING([--enable-naive-interactions],
......@@ -1354,5 +1366,6 @@ AC_MSG_RESULT([
Interaction debugging : $enable_debug_interactions
Naive interactions : $enable_naive_interactions
Gravity checks : $gravity_force_checks
Custom icbrtf : $enable_custom_icbrtf
------------------------])
......@@ -47,7 +47,8 @@ include_HEADERS = space.h runner.h queue.h task.h lock.h cell.h part.h const.h \
sourceterms.h sourceterms_struct.h statistics.h memswap.h cache.h runner_doiact_vec.h profiler.h \
dump.h logger.h active.h timeline.h xmf.h gravity_properties.h gravity_derivatives.h \
gravity_softened_derivatives.h vector_power.h collectgroup.h hydro_space.h sort_part.h \
chemistry.h chemistry_io.h chemistry_struct.h cosmology.h restart.h space_getsid.h utilities.h
chemistry.h chemistry_io.h chemistry_struct.h cosmology.h restart.h space_getsid.h utilities.h \
cbrt.h
# Common source files
AM_SOURCES = space.c runner.c queue.c task.c cell.c engine.c \
......
......@@ -33,6 +33,7 @@
#include <math.h>
/* Local headers. */
#include "cbrt.h"
#include "error.h"
#include "inline.h"
......@@ -112,8 +113,13 @@ __attribute__((always_inline)) INLINE static float pow_gamma(float x) {
#if defined(HYDRO_GAMMA_5_3)
#ifdef WITH_ICBRTF
const float icbrt = icbrtf(x); /* x^(-1/3) */
return icbrt * x * x; /* x^(5/3) */
#else
const float cbrt = cbrtf(x); /* x^(1/3) */
return cbrt * cbrt * x; /* x^(5/3) */
#endif // WITH_ICBRTF
#elif defined(HYDRO_GAMMA_7_5)
......@@ -121,7 +127,12 @@ __attribute__((always_inline)) INLINE static float pow_gamma(float x) {
#elif defined(HYDRO_GAMMA_4_3)
#ifdef WITH_ICBRTF
const float icbrt = icbrtf(x); /* x^(-1/3) */
return icbrt * icbrt * x * x; /* x^(4/3) */
#else
return cbrtf(x) * x; /* x^(4/3) */
#endif // WITH_ICBRTF
#elif defined(HYDRO_GAMMA_2_1)
......@@ -146,8 +157,13 @@ __attribute__((always_inline)) INLINE static float pow_gamma_minus_one(
#if defined(HYDRO_GAMMA_5_3)
#ifdef WITH_ICBRTF
const float icbrt = icbrtf(x); /* x^(-1/3) */
return x * icbrt; /* x^(2/3) */
#else
const float cbrt = cbrtf(x); /* x^(1/3) */
return cbrt * cbrt; /* x^(2/3) */
#endif // WITH_ICBRTF
#elif defined(HYDRO_GAMMA_7_5)
......@@ -155,7 +171,12 @@ __attribute__((always_inline)) INLINE static float pow_gamma_minus_one(
#elif defined(HYDRO_GAMMA_4_3)
#ifdef WITH_ICBRTF
const float icbrt = icbrtf(x); /* x^(-1/3) */
return x * icbrt * icbrt; /* x^(1/3) */
#else
return cbrtf(x); /* x^(1/3) */
#endif // WITH_ICBRTF
#elif defined(HYDRO_GAMMA_2_1)
......@@ -180,8 +201,13 @@ __attribute__((always_inline)) INLINE static float pow_minus_gamma_minus_one(
#if defined(HYDRO_GAMMA_5_3)
#ifdef WITH_ICBRTF
const float icbrt = icbrtf(x); /* x^(-1/3) */
return icbrt * icbrt; /* x^(-2/3) */
#else
const float cbrt_inv = 1.f / cbrtf(x); /* x^(-1/3) */
return cbrt_inv * cbrt_inv; /* x^(-2/3) */
#endif // WITH_ICBRTF
#elif defined(HYDRO_GAMMA_7_5)
......@@ -189,7 +215,11 @@ __attribute__((always_inline)) INLINE static float pow_minus_gamma_minus_one(
#elif defined(HYDRO_GAMMA_4_3)
#ifdef WITH_ICBRTF
return icbrtf(x); /* x^(-1/3) */
#else
return 1.f / cbrtf(x); /* x^(-1/3) */
#endif // WITH_ICBRTF
#elif defined(HYDRO_GAMMA_2_1)
......@@ -216,9 +246,15 @@ __attribute__((always_inline)) INLINE static float pow_minus_gamma(float x) {
#if defined(HYDRO_GAMMA_5_3)
#ifdef WITH_ICBRTF
const float icbrt = icbrtf(x); /* x^(-1/3) */
const float icbrt2 = icbrt * icbrt; /* x^(-2/3) */
return icbrt * icbrt2 * icbrt2; /* x^(-5/3) */
#else
const float cbrt_inv = 1.f / cbrtf(x); /* x^(-1/3) */
const float cbrt_inv2 = cbrt_inv * cbrt_inv; /* x^(-2/3) */
return cbrt_inv * cbrt_inv2 * cbrt_inv2; /* x^(-5/3) */
#endif // WITH_ICBRTF
#elif defined(HYDRO_GAMMA_7_5)
......@@ -226,7 +262,11 @@ __attribute__((always_inline)) INLINE static float pow_minus_gamma(float x) {
#elif defined(HYDRO_GAMMA_4_3)
#ifdef WITH_ICBRTF
const float cbrt_inv = icbrtf(x); /* x^(-1/3) */
#else
const float cbrt_inv = 1.f / cbrtf(x); /* x^(-1/3) */
#endif // WITH_ICBRTF
const float cbrt_inv2 = cbrt_inv * cbrt_inv; /* x^(-2/3) */
return cbrt_inv2 * cbrt_inv2; /* x^(-4/3) */
......
/*******************************************************************************
* This file is part of SWIFT.
* Copyright (c) 2016 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
* Matthieu Schaller (matthieu.schaller@durham.ac.uk)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
******************************************************************************/
#ifndef SWIFT_CBRT_H
#define SWIFT_CBRT_H
/* Config parameters. */
#include "../config.h"
/* Some standard headers. */
#include <math.h>
/* Local headers. */
#include "inline.h"
/**
* @brief Compute the inverse cube root of a single-precision floating-point
* number.
*
* This function does not care about non-finite inputs.
*
* @warning This function is faster than both gcc and Intel's `cbrtf()`
* functions on x86 systems. However, Other compilers or other architectures
* may have faster implementations of the standard function `cbrtf()` that
* will potentionally outperform this function.
*
* @param x_in The input value.
*
* @return The inverse cubic root of @c x_in (i.e. \f$x_{in}^{-1/3} \f$) .
*/
__attribute__((always_inline)) INLINE static float icbrtf(float x_in) {
union {
float as_float;
unsigned int as_uint;
int as_int;
} cast;
/* Extract the exponent. */
cast.as_float = x_in;
const int exponent = ((cast.as_int & 0x7f800000) >> 23) - 127;
/* Clear the exponent and sign to get the mantissa. */
cast.as_uint = (cast.as_uint & ~0xff800000) | 0x3f800000;
const float x_norm = cast.as_float;
/* Multiply by sqrt(1/2) and subtract one, should then be in the
range [sqrt(1/2) - 1, sqrt(2) - 1). */
const float x = x_norm * (float)M_SQRT1_2 - 1.0f;
/* Compute the polynomial interpolant. */
float res =
9.99976591940035e-01f +
x * (-3.32901212909283e-01f +
x * (2.24361110929912e-01f +
x * (-1.88913279594895e-01f + x * 1.28384036492344e-01f)));
/* Compute the new exponent and the correction factor. */
int exponent_new = exponent;
if (exponent_new < 0) exponent_new -= 2;
exponent_new = -exponent_new / 3;
const int exponent_rem = exponent + 3 * exponent_new;
cast.as_uint = (exponent_new + 127) << 23;
static const float scale[3] = {8.90898718140339e-01f, 7.07106781186548e-01f,
5.61231024154687e-01f};
const float exponent_scale = cast.as_float * scale[exponent_rem];
/* Scale the result and set the correct sign. */
res = copysignf(res * exponent_scale, x_in);
/* One step of Newton iteration to refine the result. */
res *= (1.0f / 3.0f) * (4.0f - x_in * res * res * res);
/* We're done. */
return res;
}
#endif /* SWIFT_CBRT_H */
......@@ -27,7 +27,8 @@ TESTS = testGreetings testMaths testReading.sh testSingle testKernel testSymmetr
testMatrixInversion testThreadpool testDump testLogger testInteractions.sh \
testVoronoi1D testVoronoi2D testVoronoi3D testGravityDerivatives \
testPeriodicBC.sh testPeriodicBCPerturbed.sh testPotentialSelf \
testPotentialPair testEOS testUtilities testSelectOutput.sh
testPotentialPair testEOS testUtilities testSelectOutput.sh \
testCbrt
# List of test programs to compile
check_PROGRAMS = testGreetings testReading testSingle testTimeIntegration \
......@@ -38,7 +39,7 @@ check_PROGRAMS = testGreetings testReading testSingle testTimeIntegration \
testRiemannHLLC testMatrixInversion testDump testLogger \
testVoronoi1D testVoronoi2D testVoronoi3D testPeriodicBC \
testGravityDerivatives testPotentialSelf testPotentialPair testEOS testUtilities \
testSelectOutput
testSelectOutput testCbrt
# Rebuild tests when SWIFT is updated.
$(check_PROGRAMS): ../src/.libs/libswiftsim.a
......
/*******************************************************************************
* This file is part of SWIFT.
* Copyright (C) 2016 Pedro Gonnet (pedro.gonnet@durham.ac.uk)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published
* by the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
******************************************************************************/
/* Config parameters. */
#include "../config.h"
/* Standard includes. */
#include <fenv.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
/* Local includes. */
#include "cbrt.h"
#include "clocks.h"
#include "error.h"
int main(int argc, char *argv[]) {
/* Initialize CPU frequency, this also starts time. */
unsigned long long cpufreq = 0;
clocks_set_cpufreq(cpufreq);
/* Choke on FP-exceptions */
feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
/* Some constants for this test. */
const int num_vals = 200000000;
const float range_min = -1e6f;
const float range_max = 1e6f;
const float err_rel_tol = 1e-7;
message("executing %i runs of each command.", num_vals);
/* Create and fill an array of floats. */
float *data = (float *)malloc(sizeof(float) * num_vals);
for (int k = 0; k < num_vals; k++) {
data[k] = (float)rand() / RAND_MAX;
data[k] = (1.0f - data[k]) * range_min + data[k] * range_max;
if (data[k] == 0.f) k--; /* Skip 0 to avoid spurious mistakes */
}
/* First run just checks for correctness. */
for (int k = 0; k < num_vals; k++) {
const double exact = cbrt(data[k]); // computed in double just to be sure.
const float ours = 1.0f / icbrtf(data[k]);
const float err_abs = fabsf(exact - ours);
const float err_rel = 0.5f * fabsf(exact - ours) / (exact + ours);
if (err_rel > err_rel_tol && data[k] != 0.f)
error(
"failed for x = %.8e, exact = %.8e, ours = %.8e, err = %.3e, rel = "
"%.3e",
data[k], exact, ours, err_abs, err_rel);
}
/* Second run to check the speed of the inverse cube root. */
double acc_exact = 0.0f;
ticks tic_exact = getticks();
for (int k = 0; k < num_vals; k++) {
acc_exact += 1.0f / cbrtf(data[k]);
}
message("1.0f / cbrtf took %9.3f %s (acc = %18.11e).",
clocks_from_ticks(getticks() - tic_exact), clocks_getunit(),
acc_exact);
double acc_ours = 0.0;
ticks tic_ours = getticks();
for (int k = 0; k < num_vals; k++) {
acc_ours += icbrtf(data[k]);
}
message("icbrtf took %9.3f %s (acc = %18.11e).",
clocks_from_ticks(getticks() - tic_ours), clocks_getunit(), acc_ours);
/* Third run to check the speed of the cube root. */
acc_exact = 0.0f;
tic_exact = getticks();
for (int k = 0; k < num_vals; k++) {
acc_exact += cbrtf(data[k]);
}
message("cbrtf took %9.3f %s (acc = %18.11e).",
clocks_from_ticks(getticks() - tic_exact), clocks_getunit(),
acc_exact);
acc_ours = 0.0f;
tic_ours = getticks();
for (int k = 0; k < num_vals; k++) {
const float temp = icbrtf(data[k]);
acc_ours += data[k] * temp * temp;
}
message("x * icbrtf^2 took %9.3f %s (acc = %18.11e).",
clocks_from_ticks(getticks() - tic_ours), clocks_getunit(), acc_ours);
/* Fourth run to check the speed of (.)^(2/3). */
acc_exact = 0.0f;
tic_exact = getticks();
for (int k = 0; k < num_vals; k++) {
const float temp = cbrtf(data[k]);
acc_exact += temp * temp;
}
message("cbrtf^2 took %9.3f %s (acc = %18.11e).",
clocks_from_ticks(getticks() - tic_exact), clocks_getunit(),
acc_exact);
acc_ours = 0.0f;
tic_ours = getticks();
for (int k = 0; k < num_vals; k++) {
acc_ours += data[k] * icbrtf(data[k]);
}
message("x * icbrtf took %9.3f %s (acc = %18.11e).",
clocks_from_ticks(getticks() - tic_ours), clocks_getunit(), acc_ours);
return 0;
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment