#include #include #include uint32_t float_as_uint32 (float a) { uint32_t r; memcpy (&r, &a, sizeof r); return r; } float uint32_as_float (uint32_t a) { float r; memcpy (&r, &a, sizeof r); return r; } void logf_ext (float a, float *loghi, float *loglo) { const float LOG2_HI = 6.93147182e-1f; const float LOG2_LO = -1.90465421e-9f; const float SQRT_HALF = 0.70710678f; float m, r, i, s, t, p, qhi, qlo; int e; const float POW_TWO_M23 = 1.19209290e-7f; const float POW_TWO_P23 = 8388608.0f; const float FP32_MIN_NORM = 1.175494351e-38f; i = 0.0f; if (a < FP32_MIN_NORM) { a = a * POW_TWO_P23; i = -23.0f; } e = (float_as_uint32 (a) - float_as_uint32 (SQRT_HALF)) & 0xff800000; m = uint32_as_float (float_as_uint32 (a) - e); i = fmaf ((float)e, POW_TWO_M23, i); p = m + 1.0f; m = m - 1.0f; r = 1.0f / p; qhi = r * m; qlo = r * fmaf (qhi, -m, fmaf (qhi, -2.0f, m)); s = qhi * qhi; r = 0.1293334961f; r = fmaf (r, s, 0.1419928074f); r = fmaf (r, s, 0.2000148296f); r = fmaf (r, s, 0.3333332539f); t = fmaf (qhi, qlo + qlo, fmaf (qhi, qhi, -s)); p = s * qhi; t = fmaf (s, qlo, fmaf (t, qhi, fmaf (s, qhi, -p))); s = fmaf (r, p, fmaf (r, t, qlo)); r = 2 * qhi; t = fmaf ( LOG2_HI, i, r); p = fmaf (-LOG2_HI, i, t); s = fmaf ( LOG2_LO, i, fmaf (2.f, s, r - p)); *loghi = p = t + s; *loglo = (t - p) + s; } float expf_unchecked (float a) { float f, j, r; int i; j = fmaf (1.442695f, a, 12582912.f) - 12582912.f; f = fmaf (j, -6.93145752e-1f, a); f = fmaf (j, -1.42860677e-6f, f); i = (int)j; r = 1.37805939e-3f; r = fmaf (r, f, 8.37312452e-3f); r = fmaf (r, f, 4.16695364e-2f); r = fmaf (r, f, 1.66664720e-1f); r = fmaf (r, f, 4.99999851e-1f); r = fmaf (r, f, 1.00000000e+0f); r = fmaf (r, f, 1.00000000e+0f); float s, t; uint32_t ia = (i > 0) ? 0u : 0x83000000u; s = uint32_as_float (0x7f000000u + ia); t = uint32_as_float (((uint32_t)i << 23) - ia); r = r * s; r = r * t; return r; } float powf_core (float a, float b) { const float MAX_IEEE754_FLT = uint32_as_float (0x7f7fffff); const float EXP_OVFL_BOUND = 88.7228394f; // 0x1.62e430p+6f; const float EXP_OVFL_UNFL_F = 104.0f; const float MY_INF_F = uint32_as_float (0x7f800000); float lhi, llo, thi, tlo, phi, plo, r; logf_ext (a, &lhi, &llo); thi = lhi * b; if (fabsf (thi) > EXP_OVFL_UNFL_F) { r = (thi < 0.0f) ? 0.0f : MY_INF_F; } else { tlo = fmaf (lhi, b, -thi); tlo = fmaf (llo, b, +tlo); phi = thi + tlo; if (phi == EXP_OVFL_BOUND) phi = uint32_as_float (float_as_uint32 (phi) - 1); plo = (thi - phi) + tlo; r = expf_unchecked (phi); if (fabsf (r) <= MAX_IEEE754_FLT) { r = fmaf (plo, r, r); } } return r; } float powf (float a, float b) { const float MY_INF_F = uint32_as_float (0x7f800000); const float MY_NAN_F = uint32_as_float (0xffc00000); int expo_odd_int; float r; expo_odd_int = fmaf (-2.0f, floorf (0.5f * b), b) == 1.0f; if ((a == 1.0f) || (b == 0.0f)) { r = 1.0f; } else if (isnan (a) || isnan (b)) { r = a + b; } else if (isinf (b)) { r = ((fabsf (a) < 1.0f) != (b < 0.0f)) ? 0.0f : MY_INF_F; if (a == -1.0f) r = 1.0f; } else if (isinf (a)) { r = (b < 0.0f) ? 0.0f : MY_INF_F; if ((a < 0.0f) && expo_odd_int) r = -r; } else if (a == 0.0f) { r = (expo_odd_int) ? (a + a) : 0.0f; if (b < 0.0f) r = copysignf (MY_INF_F, r); } else if ((a < 0.0f) && (b != floorf (b))) { r = MY_NAN_F; } else { r = powf_core (fabsf (a), b); if ((a < 0.0f) && expo_odd_int) { r = -r; } } return r; } double pow (double a, double b) { return powf(a, b); // who cares about precision in a kernel :3 }