Supported FMA Intrinsics in Audio Resampler FIR processing module
Test: make vts -j
run vts run vts-hal run vts-kernel
tests/build_and_run_all_unit_tests.sh
Change-Id: I997e0bb3dd1282a3c9ca5a270b57898558162b30
Signed-off-by: jaishank <jaishankar.rajendran@intel.com>
diff --git a/media/libaudioprocessing/Android.bp b/media/libaudioprocessing/Android.bp
index cb78063..9f190b6 100644
--- a/media/libaudioprocessing/Android.bp
+++ b/media/libaudioprocessing/Android.bp
@@ -26,6 +26,25 @@
// uncomment to disable NEON on architectures that actually do support NEON, for benchmarking
// "-DUSE_NEON=false",
],
+
+ arch: {
+ x86: {
+ avx2: {
+ cflags: [
+ "-mavx2",
+ "-mfma",
+ ],
+ },
+ },
+ x86_64: {
+ avx2: {
+ cflags: [
+ "-mavx2",
+ "-mfma",
+ ],
+ },
+ },
+ },
}
cc_library_shared {
diff --git a/media/libaudioprocessing/AudioResamplerFirOps.h b/media/libaudioprocessing/AudioResamplerFirOps.h
index 2e4cee3..a3f5ff5 100644
--- a/media/libaudioprocessing/AudioResamplerFirOps.h
+++ b/media/libaudioprocessing/AudioResamplerFirOps.h
@@ -36,13 +36,20 @@
#include <arm_neon.h>
#endif
-#if defined(__SSSE3__) // Should be supported in x86 ABI for both 32 & 64-bit.
+#if defined(__AVX2__) // Should be supported in x86 ABI for both 32 & 64-bit.
+#define USE_AVX2 (true) // Inference AVX2/FMA Intrinsics
#define USE_SSE (true)
+#include <immintrin.h>
+#elif defined(__SSSE3__) // Should be supported in x86 ABI for both 32 & 64-bit.
+#define USE_SSE (true) // Inference SSE Intrinsics
+#define USE_AVX2 (false)
#include <tmmintrin.h>
#else
#define USE_SSE (false)
+#define USE_AVX2(false)
#endif
+
template<typename T, typename U>
struct is_same
{
diff --git a/media/libaudioprocessing/AudioResamplerFirProcessSSE.h b/media/libaudioprocessing/AudioResamplerFirProcessSSE.h
index 30233b5..1c16bc4 100644
--- a/media/libaudioprocessing/AudioResamplerFirProcessSSE.h
+++ b/media/libaudioprocessing/AudioResamplerFirProcessSSE.h
@@ -80,11 +80,16 @@
posCoef1 = _mm_sub_ps(posCoef1, posCoef);
negCoef = _mm_sub_ps(negCoef, negCoef1);
+
+ #if USE_AVX2
+ posCoef = _mm_fmadd_ps(posCoef1, interp, posCoef);
+ negCoef = _mm_fmadd_ps(negCoef, interp, negCoef1);
+ #else
posCoef1 = _mm_mul_ps(posCoef1, interp);
negCoef = _mm_mul_ps(negCoef, interp);
-
posCoef = _mm_add_ps(posCoef1, posCoef);
negCoef = _mm_add_ps(negCoef, negCoef1);
+ #endif //USE_AVX2
}
switch (CHANNELS) {
case 1: {
@@ -94,11 +99,17 @@
sN += 4;
posSamp = _mm_shuffle_ps(posSamp, posSamp, 0x1B);
+
+ #if USE_AVX2
+ accL = _mm_fmadd_ps(posSamp, posCoef, accL);
+ accL = _mm_fmadd_ps(negSamp, negCoef, accL);
+ #else
posSamp = _mm_mul_ps(posSamp, posCoef);
negSamp = _mm_mul_ps(negSamp, negCoef);
-
accL = _mm_add_ps(accL, posSamp);
accL = _mm_add_ps(accL, negSamp);
+ #endif
+
} break;
case 2: {
__m128 posSamp0 = _mm_loadu_ps(sP);
@@ -114,15 +125,23 @@
__m128 negSampL = _mm_shuffle_ps(negSamp0, negSamp1, 0x88);
__m128 negSampR = _mm_shuffle_ps(negSamp0, negSamp1, 0xDD);
- posSampL = _mm_mul_ps(posSampL, posCoef);
- posSampR = _mm_mul_ps(posSampR, posCoef);
- negSampL = _mm_mul_ps(negSampL, negCoef);
- negSampR = _mm_mul_ps(negSampR, negCoef);
+ #if USE_AVX2
+ accL = _mm_fmadd_ps(posSampL, posCoef, accL);
+ accR = _mm_fmadd_ps(posSampR, posCoef, accR);
+ accL = _mm_fmadd_ps(negSampL, negCoef, accL);
+ accR = _mm_fmadd_ps(negSampR, negCoef, accR);
+ #else
+ posSampL = _mm_mul_ps(posSampL, posCoef);
+ posSampR = _mm_mul_ps(posSampR, posCoef);
+ negSampL = _mm_mul_ps(negSampL, negCoef);
+ negSampR = _mm_mul_ps(negSampR, negCoef);
- accL = _mm_add_ps(accL, posSampL);
- accR = _mm_add_ps(accR, posSampR);
- accL = _mm_add_ps(accL, negSampL);
- accR = _mm_add_ps(accR, negSampR);
+ accL = _mm_add_ps(accL, posSampL);
+ accR = _mm_add_ps(accR, posSampR);
+ accL = _mm_add_ps(accL, negSampL);
+ accR = _mm_add_ps(accR, negSampR);
+ #endif
+
} break;
}
} while (count -= 4);
@@ -144,9 +163,13 @@
outAccum = _mm_hadd_ps(accL, accR);
outAccum = _mm_hadd_ps(outAccum, outAccum);
}
-
+ #if USE_AVX2
+ outSamp = _mm_fmadd_ps(outAccum, vLR,outSamp);
+ #else
outAccum = _mm_mul_ps(outAccum, vLR);
outSamp = _mm_add_ps(outSamp, outAccum);
+ #endif
+
_mm_storel_pi(reinterpret_cast<__m64*>(out), outSamp);
}