cbrt
Just dumping this here since I haven't had the time to add a cbrt.h
file write a proper unit test:
float __attribute__((always_inline)) my_inv_cbrt(float x_in) {
// Extract the exponent.
const unsigned int x_as_uint = *((int *)&x_in);
const int exponent = ((x_as_uint & 0x7f800000) >> 23) - 127;
// Clear the exponent and sign to get the mantissa.
const unsigned int x_norm_as_int = (x_as_uint & ~0xff800000) | 0x3f800000;
const float x_norm = *((float *)&x_norm_as_int);
// Multiply by sqrt(1/2) and subtract one, should then be in the
// range [sqrt(1/2) - 1, sqrt(2) - 1).
const float x = x_norm * (float)M_SQRT1_2 - 1.0f;
// Compute the polynomial interpolant.
float res = 9.99976591940035e-01f +
x * (-3.32901212909283e-01f +
x * (2.24361110929912e-01f +
x * (-1.88913279594895e-01f +
x * 1.28384036492344e-01f)));
// Compute the new exponent and the correction factor.
int exponent_new = exponent;
if (exponent_new < 0) exponent_new -= 2;
exponent_new = -exponent_new / 3;
const int exponent_rem = exponent + 3 * exponent_new;
const unsigned int exponent_scale_as_int = (exponent_new + 127) << 23;
float exponent_scale = *((float *)&exponent_scale_as_int);
exponent_scale *= exponent_rem > 0 ? (exponent_rem > 1 ? 5.61231024154687e-01f : 7.07106781186548e-01f) : 8.90898718140339e-01f;
res *= exponent_scale;
// One step of Newton iteration to refine the result.
res *= (1.0f / 3.0f) * (4.0f - x_in * res * res * res);
// We're done.
return copysignf(res, x_in);
}
With -O3 -ffast-math
this should be ~65 lines of assembler. Note that this computes the inverse cube root, but in half our use-cases, that's just what we want :)