Add the optimized implementation of 18 math functions for x86 and x86_64 respectively

Change-Id: I31bf601448a9427f825517f3a0ff24de47f49bfa
Signed-off-by: Jingwei Zhang <jingwei.zhang@intel.com>
Signed-off-by: Mingwei Shi <mingwei.shi@intel.com>
diff --git a/libm/x86_64/e_hypot.S b/libm/x86_64/e_hypot.S
new file mode 100644
index 0000000..089b2b4
--- /dev/null
+++ b/libm/x86_64/e_hypot.S
@@ -0,0 +1,210 @@
+/*
+Copyright (c) 2014, Intel Corporation
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above copyright notice,
+    * this list of conditions and the following disclaimer in the documentation
+    * and/or other materials provided with the distribution.
+
+    * Neither the name of Intel Corporation nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/******************************************************************************/
+//                     ALGORITHM DESCRIPTION
+//                     ---------------------
+//
+// X87 version:
+// Use 80-bit FPU precision fmul, fsqrt to compute square and sqrt.
+//
+// SSE version:
+// Swap x, y if |x|<|y|
+// For x=2^k*x, get y=y*2^(-k)
+// Get S ~ sqrt(x^2+y^2)  (leading 1 + leading 25 mantissa bits)
+//
+// Get D = ( RN(x^2+y^2) - S^2 ) + ( x^2 - RN(x^2) ) +
+//                               + ( y^2 - ((RN(x^2+y^2)-RN(x^2)) )
+//
+// Result is 2^k*(S + Se),  where Se = S*e
+//        S*e is approximated as (D/2S)*( 1 - (D/2S)^2*1.0/S )
+//
+// Return 2^k*(S+Se)
+//
+// For |y/x|<2^(-64), return x
+//
+// For cases where maximum biased exponent is either greater than 7fdh or
+// below 32, take a special path to check for special cases (0, NaN, Inf),
+// possible overflow, and more accurate computation for denormal results
+//
+// Special cases:
+//  hypot(x,y), hypot(y,x), and hypot(x,-y) are equivalent
+//  hypot(x,+-0) is equivalent to fabs(x)
+//  hypot(x,y) = y if (x==NaN or x==INF) and y==INF
+//  hypot(x,y) = x if (x==NaN or x==INF) and y!=INF (even if y==NaN!)
+//  hypot(x,y) = y if (x!=NaN and x!=INF) and (y==NaN or y==INF)
+//
+/******************************************************************************/
+
+#include <private/bionic_asm.h>
+# -- Begin  hypot
+ENTRY(hypot)
+# parameter 1: %xmm0
+# parameter 2: %xmm1
+..B1.1:
+..___tag_value_hypot.1:
+..___tag_value_hypot.3:
+..B1.2:
+        subq      $64, %rsp
+        movapd    static_const_table(%rip), %xmm3
+        movsd     %xmm0, 48(%rsp)
+        movsd     %xmm1, 56(%rsp)
+        andpd     %xmm3, %xmm0
+        andpd     %xmm3, %xmm1
+        pextrw    $3, %xmm0, %eax
+        pextrw    $3, %xmm1, %edx
+        cmpl      $24528, %eax
+        ja        .L_2TAG_PACKET_0.0.1
+        cmpl      $24528, %edx
+        ja        .L_2TAG_PACKET_0.0.1
+.L_2TAG_PACKET_1.0.1:
+        fldl      48(%rsp)
+        fldl      56(%rsp)
+        fxch      %st(1)
+        fmul      %st(0), %st
+        fxch      %st(1)
+        nop       
+        fmul      %st(0), %st
+        faddp     %st, %st(1)
+        fsqrt     
+        jmp       .L_2TAG_PACKET_2.0.1
+.L_2TAG_PACKET_0.0.1:
+        cmpl      $32752, %eax
+        movl      %eax, %ecx
+        jae       .L_2TAG_PACKET_3.0.1
+        subl      %edx, %ecx
+        cmpl      $32752, %edx
+        jae       .L_2TAG_PACKET_3.0.1
+        addl      $928, %ecx
+        addl      %edx, %eax
+        cmpl      $1856, %ecx
+        ja        .L_2TAG_PACKET_4.0.1
+        cmpl      $49056, %eax
+        jb        .L_2TAG_PACKET_1.0.1
+        fldl      48(%rsp)
+        fldl      56(%rsp)
+        fxch      %st(1)
+        fmul      %st(0), %st
+        fxch      %st(1)
+        nop       
+        fmul      %st(0), %st
+        faddp     %st, %st(1)
+        fsqrt     
+.L_2TAG_PACKET_5.0.1:
+        fstl      (%rsp)
+        fstpt     16(%rsp)
+        xorl      %eax, %eax
+        movw      24(%rsp), %ax
+        cmpl      $17407, %eax
+        jae       .L_2TAG_PACKET_6.0.1
+        fldl      (%rsp)
+        jmp       .L_2TAG_PACKET_7.0.1
+.L_2TAG_PACKET_4.0.1:
+        movsd     %xmm0, 32(%rsp)
+        movsd     %xmm1, 40(%rsp)
+        fldl      32(%rsp)
+        faddl     40(%rsp)
+        jmp       .L_2TAG_PACKET_5.0.1
+.L_2TAG_PACKET_6.0.1:
+        fldl      (%rsp)
+        jmp       .L_2TAG_PACKET_7.0.1
+.L_2TAG_PACKET_3.0.1:
+        shufpd    $0, %xmm1, %xmm0
+        movdqa    %xmm0, %xmm2
+        movdqa    16+static_const_table(%rip), %xmm3
+        movsd     %xmm0, 32(%rsp)
+        movsd     %xmm1, 40(%rsp)
+        cmppd     $3, %xmm0, %xmm2
+        cmppd     $0, %xmm0, %xmm3
+        movmskpd  %xmm2, %edx
+        movmskpd  %xmm3, %rax
+        testl     %edx, %edx
+        je        .L_2TAG_PACKET_8.0.1
+        fldl      32(%rsp)
+        fmull     40(%rsp)
+        testq     $1, %rax
+        jne       .L_2TAG_PACKET_9.0.1
+        testq     $2, %rax
+        jne       .L_2TAG_PACKET_10.0.1
+        jmp       .L_2TAG_PACKET_2.0.1
+.L_2TAG_PACKET_8.0.1:
+        fldl      32(%rsp)
+        faddl     40(%rsp)
+        jmp       .L_2TAG_PACKET_2.0.1
+.L_2TAG_PACKET_9.0.1:
+        fstpl     40(%rsp)
+        fldl      32(%rsp)
+        jmp       .L_2TAG_PACKET_7.0.1
+.L_2TAG_PACKET_10.0.1:
+        fstpl     32(%rsp)
+        fldl      40(%rsp)
+        jmp       .L_2TAG_PACKET_7.0.1
+.L_2TAG_PACKET_2.0.1:
+.L_2TAG_PACKET_7.0.1:
+        fstpl     16(%rsp)
+        movq      16(%rsp), %xmm0
+        addq      $64, %rsp
+        ret       
+..B1.3:
+..___tag_value_hypot.4:
+END(hypot)
+# -- End  hypot
+	.section .rodata, "a"
+	.align 16
+	.align 16
+static_const_table:
+	.long	4294967295
+	.long	2147483647
+	.long	4294967295
+	.long	2147483647
+	.long	0
+	.long	2146435072
+	.long	0
+	.long	2146435072
+	.type	static_const_table,@object
+	.size	static_const_table,32
+	.data
+	.section .note.GNU-stack, ""
+// -- Begin DWARF2 SEGMENT .eh_frame
+	.section .eh_frame,"a",@progbits
+.eh_frame_seg:
+	.align 1
+	.4byte 0x00000014
+	.8byte 0x00527a0100000000
+	.8byte 0x08070c1b01107801
+	.4byte 0x00000190
+	.4byte 0x00000014
+	.4byte 0x0000001c
+	.4byte ..___tag_value_hypot.1-.
+	.4byte ..___tag_value_hypot.4-..___tag_value_hypot.1
+	.2byte 0x0400
+	.4byte ..___tag_value_hypot.3-..___tag_value_hypot.1
+	.2byte 0x100e
+# End