Update to FreeBSD libm r336665.

This reverts commit 253a8306316cedfd6fd3e3a169fbffe4cac04035 and moves
us forward to a revision that contains fixes for the problem with the
previous attempt.

This also makes sincos(3)/sincosf(3)/sincosl(3) available to `_BSD_SOURCE`
as well as `_GNU_SOURCE`.
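
For reference, a minimal caller looks like this (the main()/printf
scaffolding is only illustration; the point is that either feature-test
macro now exposes the declaration in <math.h>):

    #define _GNU_SOURCE 1   /* _BSD_SOURCE now works equally well */
    #include <math.h>
    #include <stdio.h>

    int main(void) {
      double s, c;
      sincos(M_PI / 4, &s, &c);   /* computes sin and cos in one call */
      printf("sin=%g cos=%g\n", s, c);
      return 0;
    }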

The new FreeBSD libm code requires the FreeBSD `__CONCAT` macro, and all
our existing `__CONCAT` callers are FreeBSD code too, so update our
definition to match.
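
Roughly what that requirement boils down to (a standalone sketch, not the
real <sys/cdefs.h> definition; CONCAT/CONCAT1 here are stand-ins for the
double-underscore names): the FreeBSD form is a two-level paste, so an
argument like LDBL_MANT_DIG is macro-expanded before ## glues the tokens
together, which is what the new rnintl() magic constant relies on.

    #include <float.h>
    #include <stdio.h>

    #define CONCAT1(x, y) x ## y
    #define CONCAT(x, y)  CONCAT1(x, y)   /* expand the args, then paste */

    int main(void) {
      /* Pastes to e.g. 0x1.8p64 for 80-bit long double, the same way
       * rnintl() in the diff below builds its rounding constant. */
      long double magic = CONCAT(0x1.8p, LDBL_MANT_DIG) / 2;
      printf("%Lg\n", magic);
      return 0;
    }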

There's also an assumption that <complex.h> drags in <math.h>, which isn't
true for us, so work around that with `-include` in the makefile. This
then causes clang to recognize a bug -- returning a value from a void
function -- in our fake (LP32) sincosl(3), so fix that too.
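
For illustration, the shape of the fixed shim (a sketch only: it assumes
the LP32 fake sincosl(3) simply forwards to sincos(3) because long double
and double are the same type there; the real file and exact code may
differ):

    #define _GNU_SOURCE 1
    #include <math.h>

    /* The bug was a `return` statement in this void function (presumably
     * something like `return sincos(...)`), which clang flags once the
     * right declarations are in scope; calling sincos() and falling off
     * the end is the fix. */
    void sincosl(long double x, long double* s, long double* c) {
      sincos(x, (double*) s, (double*) c);   /* LP32: long double == double */
    }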

Bug: http://b/111710419
Change-Id: I84703ad844f8afde6ec6b11604ab3c096ccb62c3
Test: ran tests
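
(Not part of the change, just a worked note on the magic-number rounding
that the imported rnint() helpers below rely on; my_rnint and the printf
scaffolding are made-up names for illustration.)

    #include <stdio.h>

    /* Adding 0x1.8p52 (1.5 * 2^52) pushes any |x| < ~2^51 into a binade
     * where the ulp is 1, so the FPU's round-to-nearest does the rounding;
     * subtracting the constant again recovers the nearest integer. */
    static double my_rnint(double x) {
      return (double)(x + 0x1.8p52) - 0x1.8p52;
    }

    int main(void) {
      printf("%g %g %g\n", my_rnint(2.3), my_rnint(2.5), my_rnint(-2.5));
      /* Prints "2 2 -2" under the default FE_TONEAREST mode: halfway
       * cases round to even, matching rint(3). */
      return 0;
    }
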
diff --git a/libm/upstream-freebsd/lib/msun/src/math_private.h b/libm/upstream-freebsd/lib/msun/src/math_private.h
index 637a09a..bc3d516 100644
--- a/libm/upstream-freebsd/lib/msun/src/math_private.h
+++ b/libm/upstream-freebsd/lib/msun/src/math_private.h
@@ -11,7 +11,7 @@
 
 /*
  * from: @(#)fdlibm.h 5.1 93/09/24
- * $FreeBSD: head/lib/msun/src/math_private.h 319047 2017-05-28 06:13:38Z mmel $
+ * $FreeBSD: head/lib/msun/src/math_private.h 336663 2018-07-24 10:10:16Z bde $
  */
 
 #ifndef _MATH_PRIVATE_H_
@@ -48,6 +48,47 @@
 #define	IEEE_WORD_ORDER	BYTE_ORDER
 #endif
 
+/* A union which permits us to convert between a long double and
+   four 32 bit ints.  */
+
+#if IEEE_WORD_ORDER == BIG_ENDIAN
+
+typedef union
+{
+  long double value;
+  struct {
+    u_int32_t mswhi;
+    u_int32_t mswlo;
+    u_int32_t lswhi;
+    u_int32_t lswlo;
+  } parts32;
+  struct {
+    u_int64_t msw;
+    u_int64_t lsw;
+  } parts64;
+} ieee_quad_shape_type;
+
+#endif
+
+#if IEEE_WORD_ORDER == LITTLE_ENDIAN
+
+typedef union
+{
+  long double value;
+  struct {
+    u_int32_t lswlo;
+    u_int32_t lswhi;
+    u_int32_t mswlo;
+    u_int32_t mswhi;
+  } parts32;
+  struct {
+    u_int64_t lsw;
+    u_int64_t msw;
+  } parts64;
+} ieee_quad_shape_type;
+
+#endif
+
 #if IEEE_WORD_ORDER == BIG_ENDIAN
 
 typedef union
@@ -294,8 +335,9 @@
 
 /* Support switching the mode to FP_PE if necessary. */
 #if defined(__i386__) && !defined(NO_FPSETPREC)
-#define	ENTERI()				\
-	long double __retval;			\
+#define	ENTERI() ENTERIT(long double)
+#define	ENTERIT(returntype)			\
+	returntype __retval;			\
 	fp_prec_t __oprec;			\
 						\
 	if ((__oprec = fpgetprec()) != FP_PE)	\
@@ -318,6 +360,7 @@
 } while (0)
 #else
 #define	ENTERI()
+#define	ENTERIT(x)
 #define	RETURNI(x)	RETURNF(x)
 #define	ENTERV()
 #define	RETURNV()	return
@@ -435,6 +478,31 @@
  */
 void _scan_nan(uint32_t *__words, int __num_words, const char *__s);
 
+/*
+ * Mix 0, 1 or 2 NaNs.  First add 0 to each arg.  This normally just turns
+ * signaling NaNs into quiet NaNs by setting a quiet bit.  We do this
+ * because we want to never return a signaling NaN, and also because we
+ * don't want the quiet bit to affect the result.  Then mix the converted
+ * args using the specified operation.
+ *
+ * When one arg is NaN, the result is typically that arg quieted.  When both
+ * args are NaNs, the result is typically the quietening of the arg whose
+ * mantissa is largest after quietening.  When neither arg is NaN, the
+ * result may be NaN because it is indeterminate, or finite for subsequent
+ * construction of a NaN as the indeterminate 0.0L/0.0L.
+ *
+ * Technical complications: the result in bits after rounding to the final
+ * precision might depend on the runtime precision and/or on compiler
+ * optimizations, especially when different register sets are used for
+ * different precisions.  Try to make the result not depend on at least the
+ * runtime precision by always doing the main mixing step in long double
+ * precision.  Try to reduce dependencies on optimizations by adding
+ * the 0's in different precisions (unless everything is in long double
+ * precision).
+ */
+#define	nan_mix(x, y)		(nan_mix_op((x), (y), +))
+#define	nan_mix_op(x, y, op)	(((x) + 0.0L) op ((y) + 0))
+
 #ifdef _COMPLEX_H
 
 /*
@@ -510,48 +578,116 @@
 
 #endif /* _COMPLEX_H */
  
-#ifdef __GNUCLIKE_ASM
+/*
+ * The rnint() family rounds to the nearest integer for a restricted range
+ * range of args (up to about 2**MANT_DIG).  We assume that the current
+ * rounding mode is FE_TONEAREST so that this can be done efficiently.
+ * Extra precision causes more problems in practice, and we only centralize
+ * this here to reduce those problems, and have not solved the efficiency
+ * problems.  The exp2() family uses a more delicate version of this that
+ * requires extracting bits from the intermediate value, so it is not
+ * centralized here and should copy any solution of the efficiency problems.
+ */
 
-/* Asm versions of some functions. */
+static inline double
+rnint(__double_t x)
+{
+	/*
+	 * This casts to double to kill any extra precision.  This depends
+	 * on the cast being applied to a double_t to avoid compiler bugs
+	 * (this is a cleaner version of STRICT_ASSIGN()).  This is
+	 * inefficient if there actually is extra precision, but is hard
+	 * to improve on.  We use double_t in the API to minimise conversions
+	 * for just calling here.  Note that we cannot easily change the
+	 * magic number to the one that works directly with double_t, since
+	 * the rounding precision is variable at runtime on x86 so the
+	 * magic number would need to be variable.  Assuming that the
+	 * rounding precision is always the default is too fragile.  This
+	 * and many other complications will move when the default is
+	 * changed to FP_PE.
+	 */
+	return ((double)(x + 0x1.8p52) - 0x1.8p52);
+}
 
-#ifdef __amd64__
+static inline float
+rnintf(__float_t x)
+{
+	/*
+	 * As for rnint(), except we could just call that to handle the
+	 * extra precision case, usually without losing efficiency.
+	 */
+	return ((float)(x + 0x1.8p23F) - 0x1.8p23F);
+}
+
+#ifdef LDBL_MANT_DIG
+/*
+ * The complications for extra precision are smaller for rnintl() since it
+ * can safely assume that the rounding precision has been increased from
+ * its default to FP_PE on x86.  We don't exploit that here to get small
+ * optimizations from limiting the rangle to double.  We just need it for
+ * the magic number to work with long doubles.  ld128 callers should use
+ * rnint() instead of this if possible.  ld80 callers should prefer
+ * rnintl() since for amd64 this avoids swapping the register set, while
+ * for i386 it makes no difference (assuming FP_PE), and for other arches
+ * it makes little difference.
+ */
+static inline long double
+rnintl(long double x)
+{
+	return (x + __CONCAT(0x1.8p, LDBL_MANT_DIG) / 2 -
+	    __CONCAT(0x1.8p, LDBL_MANT_DIG) / 2);
+}
+#endif /* LDBL_MANT_DIG */
+
+/*
+ * irint() and i64rint() give the same result as casting to their integer
+ * return type provided their arg is a floating point integer.  They can
+ * sometimes be more efficient because no rounding is required.
+ */
+#if (defined(amd64) || defined(__i386__)) && defined(__GNUCLIKE_ASM)
+#define	irint(x)						\
+    (sizeof(x) == sizeof(float) &&				\
+    sizeof(__float_t) == sizeof(long double) ? irintf(x) :	\
+    sizeof(x) == sizeof(double) &&				\
+    sizeof(__double_t) == sizeof(long double) ? irintd(x) :	\
+    sizeof(x) == sizeof(long double) ? irintl(x) : (int)(x))
+#else
+#define	irint(x)	((int)(x))
+#endif
+
+#define	i64rint(x)	((int64_t)(x))	/* only needed for ld128 so not opt. */
+
+#if defined(__i386__) && defined(__GNUCLIKE_ASM)
 static __inline int
-irint(double x)
+irintf(float x)
 {
 	int n;
 
-	asm("cvtsd2si %1,%0" : "=r" (n) : "x" (x));
+	__asm("fistl %0" : "=m" (n) : "t" (x));
 	return (n);
 }
-#define	HAVE_EFFICIENT_IRINT
-#endif
 
-#ifdef __i386__
 static __inline int
-irint(double x)
+irintd(double x)
 {
 	int n;
 
-	asm("fistl %0" : "=m" (n) : "t" (x));
+	__asm("fistl %0" : "=m" (n) : "t" (x));
 	return (n);
 }
-#define	HAVE_EFFICIENT_IRINT
 #endif
 
-#if defined(__amd64__) || defined(__i386__)
+#if (defined(__amd64__) || defined(__i386__)) && defined(__GNUCLIKE_ASM)
 static __inline int
 irintl(long double x)
 {
 	int n;
 
-	asm("fistl %0" : "=m" (n) : "t" (x));
+	__asm("fistl %0" : "=m" (n) : "t" (x));
 	return (n);
 }
-#define	HAVE_EFFICIENT_IRINTL
 #endif
 
-#endif /* __GNUCLIKE_ASM */
-
 #ifdef DEBUG
 #if defined(__amd64__) || defined(__i386__)
 #define	breakpoint()	asm("int $3")