Supported FMA Intrinsics in Audio Resampler FIR processing module

Test: make vts -j
      run vts run vts-hal run vts-kernel
      tests/build_and_run_all_unit_tests.sh

Change-Id: I997e0bb3dd1282a3c9ca5a270b57898558162b30
Signed-off-by: jaishank <jaishankar.rajendran@intel.com>
diff --git a/media/libaudioprocessing/Android.bp b/media/libaudioprocessing/Android.bp
index cb78063..9f190b6 100644
--- a/media/libaudioprocessing/Android.bp
+++ b/media/libaudioprocessing/Android.bp
@@ -26,6 +26,25 @@
         // uncomment to disable NEON on architectures that actually do support NEON, for benchmarking
         // "-DUSE_NEON=false",
     ],
+
+    arch: {
+        x86: {
+            avx2: {
+                cflags: [
+                    "-mavx2",
+                    "-mfma",
+                ],
+            },
+        },
+        x86_64: {
+            avx2: {
+                cflags: [
+                    "-mavx2",
+                    "-mfma",
+                ],
+            },
+        },
+    },
 }
 
 cc_library_shared {
diff --git a/media/libaudioprocessing/AudioResamplerFirOps.h b/media/libaudioprocessing/AudioResamplerFirOps.h
index 2e4cee3..a3f5ff5 100644
--- a/media/libaudioprocessing/AudioResamplerFirOps.h
+++ b/media/libaudioprocessing/AudioResamplerFirOps.h
@@ -36,13 +36,20 @@
 #include <arm_neon.h>
 #endif
 
-#if defined(__SSSE3__)  // Should be supported in x86 ABI for both 32 & 64-bit.
+#if defined(__AVX2__)  // Should be supported in x86 ABI for both 32 & 64-bit.
+#define USE_AVX2 (true)  // Inference AVX2/FMA Intrinsics
 #define USE_SSE (true)
+#include <immintrin.h>
+#elif defined(__SSSE3__)  // Should be supported in x86 ABI for both 32 & 64-bit.
+#define USE_SSE (true)  // Inference SSE Intrinsics
+#define USE_AVX2 (false)
 #include <tmmintrin.h>
 #else
 #define USE_SSE (false)
+#define USE_AVX2(false)
 #endif
 
+
 template<typename T, typename U>
 struct is_same
 {
diff --git a/media/libaudioprocessing/AudioResamplerFirProcessSSE.h b/media/libaudioprocessing/AudioResamplerFirProcessSSE.h
index 30233b5..1c16bc4 100644
--- a/media/libaudioprocessing/AudioResamplerFirProcessSSE.h
+++ b/media/libaudioprocessing/AudioResamplerFirProcessSSE.h
@@ -80,11 +80,16 @@
             posCoef1 = _mm_sub_ps(posCoef1, posCoef);
             negCoef = _mm_sub_ps(negCoef, negCoef1);
 
+
+            #if USE_AVX2
+            posCoef = _mm_fmadd_ps(posCoef1, interp, posCoef);
+            negCoef = _mm_fmadd_ps(negCoef, interp, negCoef1);
+            #else
             posCoef1 = _mm_mul_ps(posCoef1, interp);
             negCoef = _mm_mul_ps(negCoef, interp);
-
             posCoef = _mm_add_ps(posCoef1, posCoef);
             negCoef = _mm_add_ps(negCoef, negCoef1);
+            #endif //USE_AVX2
         }
         switch (CHANNELS) {
         case 1: {
@@ -94,11 +99,17 @@
             sN += 4;
 
             posSamp = _mm_shuffle_ps(posSamp, posSamp, 0x1B);
+
+            #if USE_AVX2
+            accL = _mm_fmadd_ps(posSamp, posCoef, accL);
+            accL = _mm_fmadd_ps(negSamp, negCoef, accL);
+            #else
             posSamp = _mm_mul_ps(posSamp, posCoef);
             negSamp = _mm_mul_ps(negSamp, negCoef);
-
             accL = _mm_add_ps(accL, posSamp);
             accL = _mm_add_ps(accL, negSamp);
+            #endif
+
         } break;
         case 2: {
             __m128 posSamp0 = _mm_loadu_ps(sP);
@@ -114,15 +125,23 @@
             __m128 negSampL = _mm_shuffle_ps(negSamp0, negSamp1, 0x88);
             __m128 negSampR = _mm_shuffle_ps(negSamp0, negSamp1, 0xDD);
 
-            posSampL = _mm_mul_ps(posSampL, posCoef);
-            posSampR = _mm_mul_ps(posSampR, posCoef);
-            negSampL = _mm_mul_ps(negSampL, negCoef);
-            negSampR = _mm_mul_ps(negSampR, negCoef);
+           #if USE_AVX2
+           accL = _mm_fmadd_ps(posSampL, posCoef, accL);
+           accR = _mm_fmadd_ps(posSampR, posCoef, accR);
+           accL = _mm_fmadd_ps(negSampL, negCoef, accL);
+           accR = _mm_fmadd_ps(negSampR, negCoef, accR);
+           #else
+           posSampL = _mm_mul_ps(posSampL, posCoef);
+           posSampR = _mm_mul_ps(posSampR, posCoef);
+           negSampL = _mm_mul_ps(negSampL, negCoef);
+           negSampR = _mm_mul_ps(negSampR, negCoef);
 
-            accL = _mm_add_ps(accL, posSampL);
-            accR = _mm_add_ps(accR, posSampR);
-            accL = _mm_add_ps(accL, negSampL);
-            accR = _mm_add_ps(accR, negSampR);
+           accL = _mm_add_ps(accL, posSampL);
+           accR = _mm_add_ps(accR, posSampR);
+           accL = _mm_add_ps(accL, negSampL);
+           accR = _mm_add_ps(accR, negSampR);
+           #endif
+
         } break;
         }
     } while (count -= 4);
@@ -144,9 +163,13 @@
         outAccum = _mm_hadd_ps(accL, accR);
         outAccum = _mm_hadd_ps(outAccum, outAccum);
     }
-
+    #if USE_AVX2
+    outSamp = _mm_fmadd_ps(outAccum, vLR,outSamp);
+    #else
     outAccum = _mm_mul_ps(outAccum, vLR);
     outSamp = _mm_add_ps(outSamp, outAccum);
+    #endif
+
     _mm_storel_pi(reinterpret_cast<__m64*>(out), outSamp);
 }