| Elliott Hughes | a0ee078 | 2013-01-30 19:06:37 -0800 | [diff] [blame] | 1 | /*- | 
| Elliott Hughes | c1e46b6 | 2023-07-19 14:06:31 -0700 | [diff] [blame] | 2 | * SPDX-License-Identifier: BSD-2-Clause | 
| Elliott Hughes | 8da8ca4 | 2018-05-08 13:35:33 -0700 | [diff] [blame] | 3 | * | 
| Elliott Hughes | a0ee078 | 2013-01-30 19:06:37 -0800 | [diff] [blame] | 4 | * Copyright (c) 2005-2011 David Schultz <das@FreeBSD.ORG> | 
|  | 5 | * All rights reserved. | 
|  | 6 | * | 
|  | 7 | * Redistribution and use in source and binary forms, with or without | 
|  | 8 | * modification, are permitted provided that the following conditions | 
|  | 9 | * are met: | 
|  | 10 | * 1. Redistributions of source code must retain the above copyright | 
|  | 11 | *    notice, this list of conditions and the following disclaimer. | 
|  | 12 | * 2. Redistributions in binary form must reproduce the above copyright | 
|  | 13 | *    notice, this list of conditions and the following disclaimer in the | 
|  | 14 | *    documentation and/or other materials provided with the distribution. | 
|  | 15 | * | 
|  | 16 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | 
|  | 17 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
|  | 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | 
|  | 19 | * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | 
|  | 20 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | 
|  | 21 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | 
|  | 22 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | 
|  | 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | 
|  | 24 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | 
|  | 25 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | 
|  | 26 | * SUCH DAMAGE. | 
|  | 27 | */ | 
|  | 28 |  | 
| Elliott Hughes | a0ee078 | 2013-01-30 19:06:37 -0800 | [diff] [blame] | 29 | #include <fenv.h> | 
|  | 30 | #include <float.h> | 
|  | 31 | #include <math.h> | 
|  | 32 |  | 
|  | 33 | #include "math_private.h" | 
|  | 34 |  | 
| Elliott Hughes | 99ef447 | 2022-01-12 17:51:20 -0800 | [diff] [blame] | 35 | #ifdef USE_BUILTIN_FMA | 
|  | 36 | double | 
|  | 37 | fma(double x, double y, double z) | 
|  | 38 | { | 
|  | 39 | return (__builtin_fma(x, y, z)); | 
|  | 40 | } | 
|  | 41 | #else | 
| Elliott Hughes | a0ee078 | 2013-01-30 19:06:37 -0800 | [diff] [blame] | 42 | /* | 
|  | 43 | * A struct dd represents a floating-point number with twice the precision | 
|  | 44 | * of a double.  We maintain the invariant that "hi" stores the 53 high-order | 
|  | 45 | * bits of the result. | 
|  | 46 | */ | 
|  | 47 | struct dd { | 
|  | 48 | double hi; | 
|  | 49 | double lo; | 
|  | 50 | }; | 
|  | 51 |  | 
|  | 52 | /* | 
|  | 53 | * Compute a+b exactly, returning the exact result in a struct dd.  We assume | 
|  | 54 | * that both a and b are finite, but make no assumptions about their relative | 
|  | 55 | * magnitudes. | 
|  | 56 | */ | 
|  | 57 | static inline struct dd | 
|  | 58 | dd_add(double a, double b) | 
|  | 59 | { | 
|  | 60 | struct dd ret; | 
|  | 61 | double s; | 
|  | 62 |  | 
|  | 63 | ret.hi = a + b; | 
|  | 64 | s = ret.hi - a; | 
|  | 65 | ret.lo = (a - (ret.hi - s)) + (b - s); | 
|  | 66 | return (ret); | 
|  | 67 | } | 
|  | 68 |  | 
|  | 69 | /* | 
|  | 70 | * Compute a+b, with a small tweak:  The least significant bit of the | 
|  | 71 | * result is adjusted into a sticky bit summarizing all the bits that | 
|  | 72 | * were lost to rounding.  This adjustment negates the effects of double | 
|  | 73 | * rounding when the result is added to another number with a higher | 
|  | 74 | * exponent.  For an explanation of round and sticky bits, see any reference | 
|  | 75 | * on FPU design, e.g., | 
|  | 76 | * | 
|  | 77 | *     J. Coonen.  An Implementation Guide to a Proposed Standard for | 
|  | 78 | *     Floating-Point Arithmetic.  Computer, vol. 13, no. 1, Jan 1980. | 
|  | 79 | */ | 
|  | 80 | static inline double | 
|  | 81 | add_adjusted(double a, double b) | 
|  | 82 | { | 
|  | 83 | struct dd sum; | 
|  | 84 | uint64_t hibits, lobits; | 
|  | 85 |  | 
|  | 86 | sum = dd_add(a, b); | 
|  | 87 | if (sum.lo != 0) { | 
|  | 88 | EXTRACT_WORD64(hibits, sum.hi); | 
|  | 89 | if ((hibits & 1) == 0) { | 
|  | 90 | /* hibits += (int)copysign(1.0, sum.hi * sum.lo) */ | 
|  | 91 | EXTRACT_WORD64(lobits, sum.lo); | 
|  | 92 | hibits += 1 - ((hibits ^ lobits) >> 62); | 
|  | 93 | INSERT_WORD64(sum.hi, hibits); | 
|  | 94 | } | 
|  | 95 | } | 
|  | 96 | return (sum.hi); | 
|  | 97 | } | 
|  | 98 |  | 
|  | 99 | /* | 
|  | 100 | * Compute ldexp(a+b, scale) with a single rounding error. It is assumed | 
|  | 101 | * that the result will be subnormal, and care is taken to ensure that | 
|  | 102 | * double rounding does not occur. | 
|  | 103 | */ | 
|  | 104 | static inline double | 
|  | 105 | add_and_denormalize(double a, double b, int scale) | 
|  | 106 | { | 
|  | 107 | struct dd sum; | 
|  | 108 | uint64_t hibits, lobits; | 
|  | 109 | int bits_lost; | 
|  | 110 |  | 
|  | 111 | sum = dd_add(a, b); | 
|  | 112 |  | 
|  | 113 | /* | 
|  | 114 | * If we are losing at least two bits of accuracy to denormalization, | 
|  | 115 | * then the first lost bit becomes a round bit, and we adjust the | 
|  | 116 | * lowest bit of sum.hi to make it a sticky bit summarizing all the | 
|  | 117 | * bits in sum.lo. With the sticky bit adjusted, the hardware will | 
|  | 118 | * break any ties in the correct direction. | 
|  | 119 | * | 
|  | 120 | * If we are losing only one bit to denormalization, however, we must | 
|  | 121 | * break the ties manually. | 
|  | 122 | */ | 
|  | 123 | if (sum.lo != 0) { | 
|  | 124 | EXTRACT_WORD64(hibits, sum.hi); | 
|  | 125 | bits_lost = -((int)(hibits >> 52) & 0x7ff) - scale + 1; | 
| Calin Juravle | bd3155d | 2014-03-13 16:20:36 +0000 | [diff] [blame] | 126 | if ((bits_lost != 1) ^ (int)(hibits & 1)) { | 
| Elliott Hughes | a0ee078 | 2013-01-30 19:06:37 -0800 | [diff] [blame] | 127 | /* hibits += (int)copysign(1.0, sum.hi * sum.lo) */ | 
|  | 128 | EXTRACT_WORD64(lobits, sum.lo); | 
|  | 129 | hibits += 1 - (((hibits ^ lobits) >> 62) & 2); | 
|  | 130 | INSERT_WORD64(sum.hi, hibits); | 
|  | 131 | } | 
|  | 132 | } | 
|  | 133 | return (ldexp(sum.hi, scale)); | 
|  | 134 | } | 
|  | 135 |  | 
|  | 136 | /* | 
|  | 137 | * Compute a*b exactly, returning the exact result in a struct dd.  We assume | 
|  | 138 | * that both a and b are normalized, so no underflow or overflow will occur. | 
|  | 139 | * The current rounding mode must be round-to-nearest. | 
|  | 140 | */ | 
|  | 141 | static inline struct dd | 
|  | 142 | dd_mul(double a, double b) | 
|  | 143 | { | 
|  | 144 | static const double split = 0x1p27 + 1.0; | 
|  | 145 | struct dd ret; | 
|  | 146 | double ha, hb, la, lb, p, q; | 
|  | 147 |  | 
|  | 148 | p = a * split; | 
|  | 149 | ha = a - p; | 
|  | 150 | ha += p; | 
|  | 151 | la = a - ha; | 
|  | 152 |  | 
|  | 153 | p = b * split; | 
|  | 154 | hb = b - p; | 
|  | 155 | hb += p; | 
|  | 156 | lb = b - hb; | 
|  | 157 |  | 
|  | 158 | p = ha * hb; | 
|  | 159 | q = ha * lb + la * hb; | 
|  | 160 |  | 
|  | 161 | ret.hi = p + q; | 
|  | 162 | ret.lo = p - ret.hi + q + la * lb; | 
|  | 163 | return (ret); | 
|  | 164 | } | 
|  | 165 |  | 
|  | 166 | /* | 
|  | 167 | * Fused multiply-add: Compute x * y + z with a single rounding error. | 
|  | 168 | * | 
|  | 169 | * We use scaling to avoid overflow/underflow, along with the | 
|  | 170 | * canonical precision-doubling technique adapted from: | 
|  | 171 | * | 
|  | 172 | *	Dekker, T.  A Floating-Point Technique for Extending the | 
|  | 173 | *	Available Precision.  Numer. Math. 18, 224-242 (1971). | 
|  | 174 | * | 
|  | 175 | * This algorithm is sensitive to the rounding precision.  FPUs such | 
|  | 176 | * as the i387 must be set in double-precision mode if variables are | 
|  | 177 | * to be stored in FP registers in order to avoid incorrect results. | 
|  | 178 | * This is the default on FreeBSD, but not on many other systems. | 
|  | 179 | * | 
|  | 180 | * Hardware instructions should be used on architectures that support it, | 
|  | 181 | * since this implementation will likely be several times slower. | 
|  | 182 | */ | 
|  | 183 | double | 
|  | 184 | fma(double x, double y, double z) | 
|  | 185 | { | 
|  | 186 | double xs, ys, zs, adj; | 
|  | 187 | struct dd xy, r; | 
|  | 188 | int oround; | 
|  | 189 | int ex, ey, ez; | 
|  | 190 | int spread; | 
|  | 191 |  | 
|  | 192 | /* | 
|  | 193 | * Handle special cases. The order of operations and the particular | 
|  | 194 | * return values here are crucial in handling special cases involving | 
|  | 195 | * infinities, NaNs, overflows, and signed zeroes correctly. | 
|  | 196 | */ | 
|  | 197 | if (x == 0.0 || y == 0.0) | 
|  | 198 | return (x * y + z); | 
|  | 199 | if (z == 0.0) | 
|  | 200 | return (x * y); | 
|  | 201 | if (!isfinite(x) || !isfinite(y)) | 
|  | 202 | return (x * y + z); | 
|  | 203 | if (!isfinite(z)) | 
|  | 204 | return (z); | 
|  | 205 |  | 
|  | 206 | xs = frexp(x, &ex); | 
|  | 207 | ys = frexp(y, &ey); | 
|  | 208 | zs = frexp(z, &ez); | 
|  | 209 | oround = fegetround(); | 
|  | 210 | spread = ex + ey - ez; | 
|  | 211 |  | 
|  | 212 | /* | 
|  | 213 | * If x * y and z are many orders of magnitude apart, the scaling | 
|  | 214 | * will overflow, so we handle these cases specially.  Rounding | 
|  | 215 | * modes other than FE_TONEAREST are painful. | 
|  | 216 | */ | 
|  | 217 | if (spread < -DBL_MANT_DIG) { | 
|  | 218 | feraiseexcept(FE_INEXACT); | 
|  | 219 | if (!isnormal(z)) | 
|  | 220 | feraiseexcept(FE_UNDERFLOW); | 
|  | 221 | switch (oround) { | 
|  | 222 | case FE_TONEAREST: | 
|  | 223 | return (z); | 
|  | 224 | case FE_TOWARDZERO: | 
| Elliott Hughes | ce6ce4f | 2024-07-18 17:19:31 +0000 | [diff] [blame] | 225 | if ((x > 0.0) ^ (y < 0.0) ^ (z < 0.0)) | 
| Elliott Hughes | a0ee078 | 2013-01-30 19:06:37 -0800 | [diff] [blame] | 226 | return (z); | 
|  | 227 | else | 
|  | 228 | return (nextafter(z, 0)); | 
|  | 229 | case FE_DOWNWARD: | 
| Elliott Hughes | ce6ce4f | 2024-07-18 17:19:31 +0000 | [diff] [blame] | 230 | if ((x > 0.0) ^ (y < 0.0)) | 
| Elliott Hughes | a0ee078 | 2013-01-30 19:06:37 -0800 | [diff] [blame] | 231 | return (z); | 
|  | 232 | else | 
|  | 233 | return (nextafter(z, -INFINITY)); | 
|  | 234 | default:	/* FE_UPWARD */ | 
| Elliott Hughes | ce6ce4f | 2024-07-18 17:19:31 +0000 | [diff] [blame] | 235 | if ((x > 0.0) ^ (y < 0.0)) | 
| Elliott Hughes | a0ee078 | 2013-01-30 19:06:37 -0800 | [diff] [blame] | 236 | return (nextafter(z, INFINITY)); | 
|  | 237 | else | 
|  | 238 | return (z); | 
|  | 239 | } | 
|  | 240 | } | 
|  | 241 | if (spread <= DBL_MANT_DIG * 2) | 
|  | 242 | zs = ldexp(zs, -spread); | 
|  | 243 | else | 
|  | 244 | zs = copysign(DBL_MIN, zs); | 
|  | 245 |  | 
|  | 246 | fesetround(FE_TONEAREST); | 
| Elliott Hughes | ce6ce4f | 2024-07-18 17:19:31 +0000 | [diff] [blame] | 247 | /* work around clang issue #8472 */ | 
| Elliott Hughes | 7841946 | 2013-06-12 16:37:58 -0700 | [diff] [blame] | 248 | volatile double vxs = xs; | 
| Elliott Hughes | a0ee078 | 2013-01-30 19:06:37 -0800 | [diff] [blame] | 249 |  | 
|  | 250 | /* | 
|  | 251 | * Basic approach for round-to-nearest: | 
|  | 252 | * | 
|  | 253 | *     (xy.hi, xy.lo) = x * y		(exact) | 
|  | 254 | *     (r.hi, r.lo)   = xy.hi + z	(exact) | 
|  | 255 | *     adj = xy.lo + r.lo		(inexact; low bit is sticky) | 
|  | 256 | *     result = r.hi + adj		(correctly rounded) | 
|  | 257 | */ | 
| Elliott Hughes | 7841946 | 2013-06-12 16:37:58 -0700 | [diff] [blame] | 258 | xy = dd_mul(vxs, ys); | 
| Elliott Hughes | a0ee078 | 2013-01-30 19:06:37 -0800 | [diff] [blame] | 259 | r = dd_add(xy.hi, zs); | 
|  | 260 |  | 
|  | 261 | spread = ex + ey; | 
|  | 262 |  | 
| Elliott Hughes | 8cca5d4 | 2024-08-29 20:28:45 +0000 | [diff] [blame] | 263 | if (r.hi == 0.0 && xy.lo == 0) { | 
| Elliott Hughes | a0ee078 | 2013-01-30 19:06:37 -0800 | [diff] [blame] | 264 | /* | 
|  | 265 | * When the addends cancel to 0, ensure that the result has | 
|  | 266 | * the correct sign. | 
|  | 267 | */ | 
|  | 268 | fesetround(oround); | 
|  | 269 | volatile double vzs = zs; /* XXX gcc CSE bug workaround */ | 
| Elliott Hughes | 8cca5d4 | 2024-08-29 20:28:45 +0000 | [diff] [blame] | 270 | return (xy.hi + vzs); | 
| Elliott Hughes | a0ee078 | 2013-01-30 19:06:37 -0800 | [diff] [blame] | 271 | } | 
|  | 272 |  | 
|  | 273 | if (oround != FE_TONEAREST) { | 
|  | 274 | /* | 
|  | 275 | * There is no need to worry about double rounding in directed | 
|  | 276 | * rounding modes. | 
|  | 277 | */ | 
|  | 278 | fesetround(oround); | 
| Elliott Hughes | ce6ce4f | 2024-07-18 17:19:31 +0000 | [diff] [blame] | 279 | /* work around clang issue #8472 */ | 
| Elliott Hughes | 7841946 | 2013-06-12 16:37:58 -0700 | [diff] [blame] | 280 | volatile double vrlo = r.lo; | 
|  | 281 | adj = vrlo + xy.lo; | 
| Elliott Hughes | a0ee078 | 2013-01-30 19:06:37 -0800 | [diff] [blame] | 282 | return (ldexp(r.hi + adj, spread)); | 
|  | 283 | } | 
|  | 284 |  | 
|  | 285 | adj = add_adjusted(r.lo, xy.lo); | 
|  | 286 | if (spread + ilogb(r.hi) > -1023) | 
|  | 287 | return (ldexp(r.hi + adj, spread)); | 
|  | 288 | else | 
|  | 289 | return (add_and_denormalize(r.hi, adj, spread)); | 
|  | 290 | } | 
| Elliott Hughes | 99ef447 | 2022-01-12 17:51:20 -0800 | [diff] [blame] | 291 | #endif /* !USE_BUILTIN_FMA */ | 
| Elliott Hughes | a0ee078 | 2013-01-30 19:06:37 -0800 | [diff] [blame] | 292 |  | 
|  | 293 | #if (LDBL_MANT_DIG == 53) | 
|  | 294 | __weak_reference(fma, fmal); | 
|  | 295 | #endif |