Merge "Make metadata field mandatory for program info struct."
diff --git a/camera/ndk/Android.bp b/camera/ndk/Android.bp
index 6f2351f..c5fc646 100644
--- a/camera/ndk/Android.bp
+++ b/camera/ndk/Android.bp
@@ -20,4 +20,5 @@
     name: "libcamera2ndk.ndk",
     symbol_file: "libcamera2ndk.map.txt",
     first_version: "24",
+    unversioned_until: "current",
 }
diff --git a/include/media/nbaio/NBLog.h b/include/media/nbaio/NBLog.h
index 1297b51..acf2d31 100644
--- a/include/media/nbaio/NBLog.h
+++ b/include/media/nbaio/NBLog.h
@@ -21,7 +21,7 @@
 
 #include <binder/IMemory.h>
 #include <utils/Mutex.h>
-#include <audio_utils/roundup.h>
+#include <audio_utils/fifo.h>
 
 namespace android {
 
@@ -55,8 +55,11 @@
 private:
     friend class Writer;
     Event       mEvent;     // event type
-    size_t      mLength;    // length of additional data, 0 <= mLength <= 255
+    uint8_t     mLength;    // length of additional data, 0 <= mLength <= kMaxLength
     const void *mData;      // event type-specific data
+    static const size_t kMaxLength = 255;
+public:
+    static const size_t kOverhead = 3;  // mEvent, mLength, mData[...], duplicate mLength
 };
 
 // representation of a single log entry in shared memory
@@ -70,13 +73,17 @@
 //  byte[2+mLength]     duplicate copy of mLength to permit reverse scan
 //  byte[3+mLength]     start of next log entry
 
-// located in shared memory
+public:
+
+// Located in shared memory, must be POD.
+// Exactly one process must explicitly call the constructor or use placement new.
+// Since this is a POD, the destructor is empty and unnecessary to call it explicitly.
 struct Shared {
-    Shared() : mRear(0) { }
+    Shared() /* mRear initialized via default constructor */ { }
     /*virtual*/ ~Shared() { }
 
-    volatile int32_t mRear;     // index one byte past the end of most recent Entry
-    char    mBuffer[0];         // circular buffer for entries
+    audio_utils_fifo_index  mRear;  // index one byte past the end of most recent Entry
+    char    mBuffer[0];             // circular buffer for entries
 };
 
 public:
@@ -117,10 +124,10 @@
 
     // Input parameter 'size' is the desired size of the timeline in byte units.
     // The size of the shared memory must be at least Timeline::sharedSize(size).
-    Writer(size_t size, void *shared);
-    Writer(size_t size, const sp<IMemory>& iMemory);
+    Writer(void *shared, size_t size);
+    Writer(const sp<IMemory>& iMemory, size_t size);
 
-    virtual ~Writer() { }
+    virtual ~Writer();
 
     virtual void    log(const char *string);
     virtual void    logf(const char *fmt, ...) __attribute__ ((format (printf, 2, 3)));
@@ -138,13 +145,16 @@
     sp<IMemory>     getIMemory() const  { return mIMemory; }
 
 private:
+    // 0 <= length <= kMaxLength
     void    log(Event event, const void *data, size_t length);
     void    log(const Entry *entry, bool trusted = false);
 
-    const size_t    mSize;      // circular buffer size in bytes, must be a power of 2
     Shared* const   mShared;    // raw pointer to shared memory
-    const sp<IMemory> mIMemory; // ref-counted version
-    int32_t         mRear;      // my private copy of mShared->mRear
+    sp<IMemory>     mIMemory;   // ref-counted version, initialized in constructor and then const
+    audio_utils_fifo * const mFifo;                 // FIFO itself,
+                                                    // non-NULL unless constructor fails
+    audio_utils_fifo_writer * const mFifoWriter;    // used to write to FIFO,
+                                                    // non-NULL unless dummy constructor used
     bool            mEnabled;   // whether to actually log
 };
 
@@ -154,7 +164,7 @@
 class LockedWriter : public Writer {
 public:
     LockedWriter();
-    LockedWriter(size_t size, void *shared);
+    LockedWriter(void *shared, size_t size);
 
     virtual void    log(const char *string);
     virtual void    logf(const char *fmt, ...) __attribute__ ((format (printf, 2, 3)));
@@ -176,21 +186,24 @@
 
     // Input parameter 'size' is the desired size of the timeline in byte units.
     // The size of the shared memory must be at least Timeline::sharedSize(size).
-    Reader(size_t size, const void *shared);
-    Reader(size_t size, const sp<IMemory>& iMemory);
+    Reader(const void *shared, size_t size);
+    Reader(const sp<IMemory>& iMemory, size_t size);
 
-    virtual ~Reader() { }
+    virtual ~Reader();
 
     void    dump(int fd, size_t indent = 0);
     bool    isIMemory(const sp<IMemory>& iMemory) const;
 
 private:
-    const size_t    mSize;      // circular buffer size in bytes, must be a power of 2
-    const Shared* const mShared; // raw pointer to shared memory
-    const sp<IMemory> mIMemory; // ref-counted version
-    int32_t     mFront;         // index of oldest acknowledged Entry
+    /*const*/ Shared* const mShared;    // raw pointer to shared memory, actually const but not
+                                        // declared as const because audio_utils_fifo() constructor
+    sp<IMemory> mIMemory;       // ref-counted version, assigned only in constructor
     int     mFd;                // file descriptor
     int     mIndent;            // indentation level
+    audio_utils_fifo * const mFifo;                 // FIFO itself,
+                                                    // non-NULL unless constructor fails
+    audio_utils_fifo_reader * const mFifoReader;    // used to read from FIFO,
+                                                    // non-NULL unless constructor fails
 
     void    dumpLine(const String8& timestamp, String8& body);
 
diff --git a/media/libnbaio/NBLog.cpp b/media/libnbaio/NBLog.cpp
index c728e3e..f019df5 100644
--- a/media/libnbaio/NBLog.cpp
+++ b/media/libnbaio/NBLog.cpp
@@ -23,7 +23,7 @@
 #include <string.h>
 #include <time.h>
 #include <new>
-#include <cutils/atomic.h>
+#include <audio_utils/roundup.h>
 #include <media/nbaio/NBLog.h>
 #include <utils/Log.h>
 #include <utils/String8.h>
@@ -74,19 +74,30 @@
 // ---------------------------------------------------------------------------
 
 NBLog::Writer::Writer()
-    : mSize(0), mShared(NULL), mRear(0), mEnabled(false)
+    : mShared(NULL), mFifo(NULL), mFifoWriter(NULL), mEnabled(false)
 {
 }
 
-NBLog::Writer::Writer(size_t size, void *shared)
-    : mSize(roundup(size)), mShared((Shared *) shared), mRear(0), mEnabled(mShared != NULL)
+NBLog::Writer::Writer(void *shared, size_t size)
+    : mShared((Shared *) shared),
+      mFifo(mShared != NULL ?
+        new audio_utils_fifo(size, sizeof(uint8_t),
+            mShared->mBuffer, mShared->mRear, NULL /*throttlesFront*/) : NULL),
+      mFifoWriter(mFifo != NULL ? new audio_utils_fifo_writer(*mFifo) : NULL),
+      mEnabled(mFifoWriter != NULL)
 {
 }
 
-NBLog::Writer::Writer(size_t size, const sp<IMemory>& iMemory)
-    : mSize(roundup(size)), mShared(iMemory != 0 ? (Shared *) iMemory->pointer() : NULL),
-      mIMemory(iMemory), mRear(0), mEnabled(mShared != NULL)
+NBLog::Writer::Writer(const sp<IMemory>& iMemory, size_t size)
+    : Writer(iMemory != 0 ? (Shared *) iMemory->pointer() : NULL, size)
 {
+    mIMemory = iMemory;
+}
+
+NBLog::Writer::~Writer()
+{
+    delete mFifoWriter;
+    delete mFifo;
 }
 
 void NBLog::Writer::log(const char *string)
@@ -95,8 +106,8 @@
         return;
     }
     size_t length = strlen(string);
-    if (length > 255) {
-        length = 255;
+    if (length > Entry::kMaxLength) {
+        length = Entry::kMaxLength;
     }
     log(EVENT_STRING, string, length);
 }
@@ -117,7 +128,7 @@
     if (!mEnabled) {
         return;
     }
-    char buffer[256];
+    char buffer[Entry::kMaxLength + 1 /*NUL*/];
     int length = vsnprintf(buffer, sizeof(buffer), fmt, ap);
     if (length >= (int) sizeof(buffer)) {
         length = sizeof(buffer) - 1;
@@ -153,7 +164,10 @@
     if (!mEnabled) {
         return;
     }
-    if (data == NULL || length > 255) {
+    if (data == NULL || length > Entry::kMaxLength) {
+        // TODO Perhaps it makes sense to display truncated data or at least a
+        //      message that the data is too long?  The current behavior can create
+        //      a confusion for a programmer debugging their code.
         return;
     }
     switch (event) {
@@ -177,26 +191,16 @@
         log(entry->mEvent, entry->mData, entry->mLength);
         return;
     }
-    size_t rear = mRear & (mSize - 1);
-    size_t written = mSize - rear;      // written = number of bytes that have been written so far
-    size_t need = entry->mLength + 3;   // mEvent, mLength, data[length], mLength
-                                        // need = number of bytes remaining to write
-    if (written > need) {
-        written = need;
-    }
-    size_t i;
+    size_t need = entry->mLength + Entry::kOverhead;    // mEvent, mLength, data[length], mLength
+                                                        // need = number of bytes remaining to write
+
     // FIXME optimize this using memcpy for the data part of the Entry.
     // The Entry could have a method copyTo(ptr, offset, size) to optimize the copy.
-    for (i = 0; i < written; ++i) {
-        mShared->mBuffer[rear + i] = entry->readAt(i);
+    uint8_t temp[Entry::kMaxLength + Entry::kOverhead];
+    for (size_t i = 0; i < need; i++) {
+        temp[i] = entry->readAt(i);
     }
-    if (rear + written == mSize && (need -= written) > 0)  {
-        for (i = 0; i < need; ++i) {
-            mShared->mBuffer[i] = entry->readAt(written + i);
-        }
-        written += need;
-    }
-    android_atomic_release_store(mRear += written, &mShared->mRear);
+    mFifoWriter->write(temp, need);
 }
 
 bool NBLog::Writer::isEnabled() const
@@ -218,8 +222,8 @@
 {
 }
 
-NBLog::LockedWriter::LockedWriter(size_t size, void *shared)
-    : Writer(size, shared)
+NBLog::LockedWriter::LockedWriter(void *shared, size_t size)
+    : Writer(shared, size)
 {
 }
 
@@ -273,60 +277,59 @@
 
 // ---------------------------------------------------------------------------
 
-NBLog::Reader::Reader(size_t size, const void *shared)
-    : mSize(roundup(size)), mShared((const Shared *) shared), mFront(0)
+NBLog::Reader::Reader(const void *shared, size_t size)
+    : mShared((/*const*/ Shared *) shared), /*mIMemory*/
+      mFd(-1), mIndent(0),
+      mFifo(mShared != NULL ?
+        new audio_utils_fifo(size, sizeof(uint8_t),
+            mShared->mBuffer, mShared->mRear, NULL /*throttlesFront*/) : NULL),
+      mFifoReader(mFifo != NULL ? new audio_utils_fifo_reader(*mFifo) : NULL)
 {
 }
 
-NBLog::Reader::Reader(size_t size, const sp<IMemory>& iMemory)
-    : mSize(roundup(size)), mShared(iMemory != 0 ? (const Shared *) iMemory->pointer() : NULL),
-      mIMemory(iMemory), mFront(0)
+NBLog::Reader::Reader(const sp<IMemory>& iMemory, size_t size)
+    : Reader(iMemory != 0 ? (Shared *) iMemory->pointer() : NULL, size)
 {
+    mIMemory = iMemory;
+}
+
+NBLog::Reader::~Reader()
+{
+    delete mFifoReader;
+    delete mFifo;
 }
 
 void NBLog::Reader::dump(int fd, size_t indent)
 {
-    int32_t rear = android_atomic_acquire_load(&mShared->mRear);
-    size_t avail = rear - mFront;
-    if (avail == 0) {
+    if (mFifoReader == NULL) {
         return;
     }
-    size_t lost = 0;
-    if (avail > mSize) {
-        lost = avail - mSize;
-        mFront += lost;
-        avail = mSize;
-    }
-    size_t remaining = avail;       // remaining = number of bytes left to read
-    size_t front = mFront & (mSize - 1);
-    size_t read = mSize - front;    // read = number of bytes that have been read so far
-    if (read > remaining) {
-        read = remaining;
-    }
     // make a copy to avoid race condition with writer
-    uint8_t *copy = new uint8_t[avail];
-    // copy first part of circular buffer up until the wraparound point
-    memcpy(copy, &mShared->mBuffer[front], read);
-    if (front + read == mSize) {
-        if ((remaining -= read) > 0) {
-            // copy second part of circular buffer starting at beginning
-            memcpy(&copy[read], mShared->mBuffer, remaining);
-            read += remaining;
-            // remaining = 0 but not necessary
-        }
-    }
-    mFront += read;
+    size_t capacity = mFifo->capacity();
+
+    // TODO Stack-based allocation of large objects may fail.
+    //      Currently the log buffers are a page or two, which should be safe.
+    //      But if the log buffers ever get a lot larger,
+    //      then change this to allocate from heap when necessary.
+    static size_t kReasonableStackObjectSize = 32768;
+    ALOGW_IF(capacity > kReasonableStackObjectSize, "Stack-based allocation of object may fail");
+    uint8_t copy[capacity];
+
+    size_t lost;
+    ssize_t actual = mFifoReader->read(copy, capacity, NULL /*timeout*/, &lost);
+    ALOG_ASSERT(actual <= capacity);
+    size_t avail = actual > 0 ? (size_t) actual : 0;
     size_t i = avail;
     Event event;
     size_t length;
     struct timespec ts;
     time_t maxSec = -1;
-    while (i >= 3) {
+    while (i >= Entry::kOverhead) {
         length = copy[i - 1];
-        if (length + 3 > i || copy[i - length - 2] != length) {
+        if (length + Entry::kOverhead > i || copy[i - length - 2] != length) {
             break;
         }
-        event = (Event) copy[i - length - 3];
+        event = (Event) copy[i - length - Entry::kOverhead];
         if (event == EVENT_TIMESTAMP) {
             if (length != sizeof(struct timespec)) {
                 // corrupt
@@ -337,7 +340,7 @@
                 maxSec = ts.tv_sec;
             }
         }
-        i -= length + 3;
+        i -= length + Entry::kOverhead;
     }
     mFd = fd;
     mIndent = indent;
@@ -362,7 +365,7 @@
         event = (Event) copy[i];
         length = copy[i + 1];
         const void *data = &copy[i + 2];
-        size_t advance = length + 3;
+        size_t advance = length + Entry::kOverhead;
         switch (event) {
         case EVENT_STRING:
             body.appendFormat("%.*s", (int) length, (const char *) data);
@@ -376,7 +379,7 @@
             long deltaTotal = 0;
             size_t j = i;
             for (;;) {
-                j += sizeof(struct timespec) + 3;
+                j += sizeof(struct timespec) + 3 /*Entry::kOverhead?*/;
                 if (j >= avail || (Event) copy[j] != EVENT_TIMESTAMP) {
                     break;
                 }
@@ -398,7 +401,7 @@
                 deltaTotal += delta;
                 prevNsec = tsNext.tv_nsec;
             }
-            size_t n = (j - i) / (sizeof(struct timespec) + 3);
+            size_t n = (j - i) / (sizeof(struct timespec) + 3 /*Entry::kOverhead?*/);
             if (deferredTimestamp) {
                 dumpLine(timestamp, body);
                 deferredTimestamp = false;
@@ -432,8 +435,6 @@
     if (deferredTimestamp) {
         dumpLine(timestamp, body);
     }
-    // FIXME it would be more efficient to put a char mCopy[256] as a member variable of the dumper
-    delete[] copy;
 }
 
 void NBLog::Reader::dumpLine(const String8& timestamp, String8& body)
diff --git a/media/liboboe/Android.bp b/media/liboboe/Android.bp
index 0d22e65..bfcc049 100644
--- a/media/liboboe/Android.bp
+++ b/media/liboboe/Android.bp
@@ -24,4 +24,5 @@
     name: "liboboe.ndk",
     symbol_file: "liboboe.map.txt",
     first_version: "26",
+    unversioned_until: "current",
 }
diff --git a/media/ndk/Android.bp b/media/ndk/Android.bp
index 1ac1eeb..e4e3d8f 100644
--- a/media/ndk/Android.bp
+++ b/media/ndk/Android.bp
@@ -20,4 +20,5 @@
     name: "libmediandk.ndk",
     symbol_file: "libmediandk.map.txt",
     first_version: "21",
+    unversioned_until: "current",
 }
diff --git a/services/audioflinger/AudioFlinger.cpp b/services/audioflinger/AudioFlinger.cpp
index d08309b..e4b73c6 100644
--- a/services/audioflinger/AudioFlinger.cpp
+++ b/services/audioflinger/AudioFlinger.cpp
@@ -512,8 +512,11 @@
         return new NBLog::Writer();
     }
 success:
+    NBLog::Shared *sharedRawPtr = (NBLog::Shared *) shared->pointer();
+    new((void *) sharedRawPtr) NBLog::Shared(); // placement new here, but the corresponding
+                                                // explicit destructor not needed since it is POD
     mediaLogService->registerWriter(shared, size, name);
-    return new NBLog::Writer(size, shared);
+    return new NBLog::Writer(shared, size);
 }
 
 void AudioFlinger::unregisterWriter(const sp<NBLog::Writer>& writer)
diff --git a/services/audioflinger/AudioResamplerDyn.cpp b/services/audioflinger/AudioResamplerDyn.cpp
index 21914b9..8f7b982 100644
--- a/services/audioflinger/AudioResamplerDyn.cpp
+++ b/services/audioflinger/AudioResamplerDyn.cpp
@@ -29,9 +29,10 @@
 #include <utils/Log.h>
 #include <audio_utils/primitives.h>
 
-#include "AudioResamplerFirOps.h" // USE_NEON and USE_INLINE_ASSEMBLY defined here
+#include "AudioResamplerFirOps.h" // USE_NEON, USE_SSE and USE_INLINE_ASSEMBLY defined here
 #include "AudioResamplerFirProcess.h"
 #include "AudioResamplerFirProcessNeon.h"
+#include "AudioResamplerFirProcessSSE.h"
 #include "AudioResamplerFirGen.h" // requires math.h
 #include "AudioResamplerDyn.h"
 
diff --git a/services/audioflinger/AudioResamplerFirOps.h b/services/audioflinger/AudioResamplerFirOps.h
index 2a26496..2e4cee3 100644
--- a/services/audioflinger/AudioResamplerFirOps.h
+++ b/services/audioflinger/AudioResamplerFirOps.h
@@ -36,6 +36,13 @@
 #include <arm_neon.h>
 #endif
 
+#if defined(__SSSE3__)  // Should be supported in x86 ABI for both 32 & 64-bit.
+#define USE_SSE (true)
+#include <tmmintrin.h>
+#else
+#define USE_SSE (false)
+#endif
+
 template<typename T, typename U>
 struct is_same
 {
@@ -119,7 +126,7 @@
 static inline
 int32_t mulAddRL(int left, uint32_t inRL, int16_t v, int32_t a)
 {
-#if USE_INLINE_ASSEMBLY
+#if 0 // USE_INLINE_ASSEMBLY Seems to fail with Clang b/34110890
     int32_t out;
     if (left) {
         asm( "smlabb %[out], %[v], %[inRL], %[a] \n"
@@ -142,7 +149,7 @@
 static inline
 int32_t mulAddRL(int left, uint32_t inRL, int32_t v, int32_t a)
 {
-#if USE_INLINE_ASSEMBLY
+#if 0 // USE_INLINE_ASSEMBLY Seems to fail with Clang b/34110890
     int32_t out;
     if (left) {
         asm( "smlawb %[out], %[v], %[inRL], %[a] \n"
diff --git a/services/audioflinger/AudioResamplerFirProcessNeon.h b/services/audioflinger/AudioResamplerFirProcessNeon.h
index 3de9edd..1ce76a8 100644
--- a/services/audioflinger/AudioResamplerFirProcessNeon.h
+++ b/services/audioflinger/AudioResamplerFirProcessNeon.h
@@ -155,8 +155,8 @@
             accum2 = vmlal_s16(accum2, vget_low_s16(negSamp.val[1]), vget_low_s16(negCoef));
             accum2 = vmlal_s16(accum2, vget_high_s16(negSamp.val[1]), vget_high_s16(negCoef));
             sP -= 16;
-        }
         } break;
+        }
     } while (count -= 8);
 
     // multiply by volume and save
diff --git a/services/audioflinger/AudioResamplerFirProcessSSE.h b/services/audioflinger/AudioResamplerFirProcessSSE.h
new file mode 100644
index 0000000..63ed052
--- /dev/null
+++ b/services/audioflinger/AudioResamplerFirProcessSSE.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
+#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H
+
+namespace android {
+
+// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h
+
+#if USE_SSE
+
+#define TO_STRING2(x) #x
+#define TO_STRING(x) TO_STRING2(x)
+// uncomment to print GCC version, may be relevant for intrinsic optimizations
+/* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
+        "." TO_STRING(__GNUC_MINOR__) \
+        "." TO_STRING(__GNUC_PATCHLEVEL__)) */
+
+//
+// SSEx specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
+//
+
+template <int CHANNELS, int STRIDE, bool FIXED>
+static inline void ProcessSSEIntrinsic(float* out,
+        int count,
+        const float* coefsP,
+        const float* coefsN,
+        const float* sP,
+        const float* sN,
+        const float* volumeLR,
+        float lerpP,
+        const float* coefsP1,
+        const float* coefsN1)
+{
+    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
+    COMPILE_TIME_ASSERT_FUNCTION_SCOPE(CHANNELS == 1 || CHANNELS == 2);
+
+    sP -= CHANNELS*(4-1);   // adjust sP for a loop iteration of four
+
+    __m128 interp;
+    if (!FIXED) {
+        interp = _mm_set1_ps(lerpP);
+    }
+
+    __m128 accL, accR;
+    accL = _mm_setzero_ps();
+    if (CHANNELS == 2) {
+        accR = _mm_setzero_ps();
+    }
+
+    do {
+        __m128 posCoef = _mm_load_ps(coefsP);
+        __m128 negCoef = _mm_load_ps(coefsN);
+        coefsP += 4;
+        coefsN += 4;
+
+        if (!FIXED) { // interpolate
+            __m128 posCoef1 = _mm_load_ps(coefsP1);
+            __m128 negCoef1 = _mm_load_ps(coefsN1);
+            coefsP1 += 4;
+            coefsN1 += 4;
+
+            // Calculate the final coefficient for interpolation
+            // posCoef = interp * (posCoef1 - posCoef) + posCoef
+            // negCoef = interp * (negCoef - negCoef1) + negCoef1
+            posCoef1 = _mm_sub_ps(posCoef1, posCoef);
+            negCoef = _mm_sub_ps(negCoef, negCoef1);
+
+            posCoef1 = _mm_mul_ps(posCoef1, interp);
+            negCoef = _mm_mul_ps(negCoef, interp);
+
+            posCoef = _mm_add_ps(posCoef1, posCoef);
+            negCoef = _mm_add_ps(negCoef, negCoef1);
+        }
+        switch (CHANNELS) {
+        case 1: {
+            __m128 posSamp = _mm_loadu_ps(sP);
+            __m128 negSamp = _mm_loadu_ps(sN);
+            sP -= 4;
+            sN += 4;
+
+            posSamp = _mm_shuffle_ps(posSamp, posSamp, 0x1B);
+            posSamp = _mm_mul_ps(posSamp, posCoef);
+            negSamp = _mm_mul_ps(negSamp, negCoef);
+
+            accL = _mm_add_ps(accL, posSamp);
+            accL = _mm_add_ps(accL, negSamp);
+        } break;
+        case 2: {
+            __m128 posSamp0 = _mm_loadu_ps(sP);
+            __m128 posSamp1 = _mm_loadu_ps(sP+4);
+            __m128 negSamp0 = _mm_loadu_ps(sN);
+            __m128 negSamp1 = _mm_loadu_ps(sN+4);
+            sP -= 8;
+            sN += 8;
+
+            // deinterleave everything and reverse the positives
+            __m128 posSampL = _mm_shuffle_ps(posSamp1, posSamp0, 0x22);
+            __m128 posSampR = _mm_shuffle_ps(posSamp1, posSamp0, 0x77);
+            __m128 negSampL = _mm_shuffle_ps(negSamp0, negSamp1, 0x88);
+            __m128 negSampR = _mm_shuffle_ps(negSamp0, negSamp1, 0xDD);
+
+            posSampL = _mm_mul_ps(posSampL, posCoef);
+            posSampR = _mm_mul_ps(posSampR, posCoef);
+            negSampL = _mm_mul_ps(negSampL, negCoef);
+            negSampR = _mm_mul_ps(negSampR, negCoef);
+
+            accL = _mm_add_ps(accL, posSampL);
+            accR = _mm_add_ps(accR, posSampR);
+            accL = _mm_add_ps(accL, negSampL);
+            accR = _mm_add_ps(accR, negSampR);
+        } break;
+        }
+    } while (count -= 4);
+
+    // multiply by volume and save
+    __m128 vLR = _mm_setzero_ps();
+    __m128 outSamp;
+    vLR = _mm_loadl_pi(vLR, reinterpret_cast<const __m64*>(volumeLR));
+    outSamp = _mm_loadl_pi(vLR, reinterpret_cast<__m64*>(out));
+
+    // combine and funnel down accumulator
+    __m128 outAccum = _mm_setzero_ps();
+    if (CHANNELS == 1) {
+        // duplicate accL to both L and R
+        outAccum = _mm_add_ps(accL, _mm_movehl_ps(accL, accL));
+        outAccum = _mm_add_ps(outAccum, _mm_shuffle_ps(outAccum, outAccum, 0x11));
+    } else if (CHANNELS == 2) {
+        // accR contains R, fold in
+        outAccum = _mm_hadd_ps(accL, accR);
+        outAccum = _mm_hadd_ps(outAccum, outAccum);
+    }
+
+    outAccum = _mm_mul_ps(outAccum, vLR);
+    outSamp = _mm_add_ps(outSamp, outAccum);
+    _mm_storel_pi(reinterpret_cast<__m64*>(out), outSamp);
+}
+
+template<>
+inline void ProcessL<1, 16>(float* const out,
+        int count,
+        const float* coefsP,
+        const float* coefsN,
+        const float* sP,
+        const float* sN,
+        const float* const volumeLR)
+{
+    ProcessSSEIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
+            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
+}
+
+template<>
+inline void ProcessL<2, 16>(float* const out,
+        int count,
+        const float* coefsP,
+        const float* coefsN,
+        const float* sP,
+        const float* sN,
+        const float* const volumeLR)
+{
+    ProcessSSEIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
+            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
+}
+
+template<>
+inline void Process<1, 16>(float* const out,
+        int count,
+        const float* coefsP,
+        const float* coefsN,
+        const float* coefsP1,
+        const float* coefsN1,
+        const float* sP,
+        const float* sN,
+        float lerpP,
+        const float* const volumeLR)
+{
+    ProcessSSEIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
+            lerpP, coefsP1, coefsN1);
+}
+
+template<>
+inline void Process<2, 16>(float* const out,
+        int count,
+        const float* coefsP,
+        const float* coefsN,
+        const float* coefsP1,
+        const float* coefsN1,
+        const float* sP,
+        const float* sN,
+        float lerpP,
+        const float* const volumeLR)
+{
+    ProcessSSEIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
+            lerpP, coefsP1, coefsN1);
+}
+
+#endif //USE_SSE
+
+} // namespace android
+
+#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_SSE_H*/
diff --git a/services/audioflinger/FastThreadDumpState.cpp b/services/audioflinger/FastThreadDumpState.cpp
index 9df5c4c..964a725 100644
--- a/services/audioflinger/FastThreadDumpState.cpp
+++ b/services/audioflinger/FastThreadDumpState.cpp
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include <audio_utils/roundup.h>
 #include "FastThreadDumpState.h"
 
 namespace android {
diff --git a/services/audioflinger/tests/build_and_run_all_unit_tests.sh b/services/audioflinger/tests/build_and_run_all_unit_tests.sh
index 7f4d456..9a71096 100755
--- a/services/audioflinger/tests/build_and_run_all_unit_tests.sh
+++ b/services/audioflinger/tests/build_and_run_all_unit_tests.sh
@@ -15,7 +15,9 @@
 echo "waiting for device"
 adb root && adb wait-for-device remount
 adb push $OUT/system/lib/libaudioresampler.so /system/lib
-adb push $OUT/data/nativetest/resampler_tests /system/bin
+adb push $OUT/system/lib64/libaudioresampler.so /system/lib64
+adb push $OUT/data/nativetest/resampler_tests/resampler_tests /data/nativetest/resampler_tests/resampler_tests
+adb push $OUT/data/nativetest64/resampler_tests/resampler_tests /data/nativetest64/resampler_tests/resampler_tests
 
 sh $ANDROID_BUILD_TOP/frameworks/av/services/audioflinger/tests/run_all_unit_tests.sh
 
diff --git a/services/audioflinger/tests/resampler_tests.cpp b/services/audioflinger/tests/resampler_tests.cpp
index 406b960..77a265f 100644
--- a/services/audioflinger/tests/resampler_tests.cpp
+++ b/services/audioflinger/tests/resampler_tests.cpp
@@ -39,6 +39,17 @@
 #include "AudioResampler.h"
 #include "test_utils.h"
 
+template <typename T>
+static void printData(T *data, size_t size) {
+    const size_t stride = 8;
+    for (size_t i = 0; i < size; ) {
+        for (size_t j = 0; j < stride && i < size; ++j) {
+            std::cout << data[i++] << ' ';  // extra space before newline
+        }
+        std::cout << '\n'; // or endl
+    }
+}
+
 void resample(int channels, void *output,
         size_t outputFrames, const std::vector<size_t> &outputIncr,
         android::AudioBufferProvider *provider, android::AudioResampler *resampler)
@@ -91,7 +102,7 @@
 
     // calculate the output size
     size_t outputFrames = ((int64_t) provider.getNumFrames() * outputFreq) / inputFreq;
-    size_t outputFrameSize = channels * (useFloat ? sizeof(float) : sizeof(int32_t));
+    size_t outputFrameSize = (channels == 1 ? 2 : channels) * (useFloat ? sizeof(float) : sizeof(int32_t));
     size_t outputSize = outputFrameSize * outputFrames;
     outputSize &= ~7;
 
@@ -106,7 +117,7 @@
     // set up the reference run
     std::vector<size_t> refIncr;
     refIncr.push_back(outputFrames);
-    void* reference = malloc(outputSize);
+    void* reference = calloc(outputFrames, outputFrameSize);
     resample(channels, reference, outputFrames, refIncr, &provider, resampler);
 
     provider.reset();
@@ -127,7 +138,7 @@
     outIncr.push_back(1);
     outIncr.push_back(2);
     outIncr.push_back(3);
-    void* test = malloc(outputSize);
+    void* test = calloc(outputFrames, outputFrameSize);
     inputIncr.push_back(1);
     inputIncr.push_back(3);
     provider.setIncr(inputIncr);
@@ -177,7 +188,7 @@
 
     // calculate the output size
     size_t outputFrames = ((int64_t) provider.getNumFrames() * outputFreq) / inputFreq;
-    size_t outputFrameSize = channels * sizeof(TO);
+    size_t outputFrameSize = (channels == 1 ? 2 : channels) * sizeof(TO);
     size_t outputSize = outputFrameSize * outputFrames;
     outputSize &= ~7;
 
@@ -194,7 +205,7 @@
     // set up the reference run
     std::vector<size_t> refIncr;
     refIncr.push_back(outputFrames);
-    void* reference = malloc(outputSize);
+    void* reference = calloc(outputFrames, outputFrameSize);
     resample(channels, reference, outputFrames, refIncr, &provider, resampler);
 
     TO *out = reinterpret_cast<TO *>(reference);
@@ -204,6 +215,8 @@
     const unsigned stopbandFrame = stopband * outputFreq / 1000.;
 
     // check each channel separately
+    if (channels == 1) channels = 2; // workaround (mono duplicates output channel)
+
     for (size_t i = 0; i < channels; ++i) {
         double passbandEnergy = signalEnergy(out, out + passbandFrame * channels, channels);
         double stopbandEnergy = signalEnergy(out + stopbandFrame * channels,
@@ -331,6 +344,34 @@
     }
 }
 
+TEST(audioflinger_resampler, stopbandresponse_integer_mono) {
+    // not all of these may work (old resamplers fail on downsampling)
+    static const enum android::AudioResampler::src_quality kQualityArray[] = {
+            //android::AudioResampler::LOW_QUALITY,
+            //android::AudioResampler::MED_QUALITY,
+            //android::AudioResampler::HIGH_QUALITY,
+            //android::AudioResampler::VERY_HIGH_QUALITY,
+            android::AudioResampler::DYN_LOW_QUALITY,
+            android::AudioResampler::DYN_MED_QUALITY,
+            android::AudioResampler::DYN_HIGH_QUALITY,
+    };
+
+    // in this test we assume a maximum transition band between 12kHz and 20kHz.
+    // there must be at least 60dB relative attenuation between stopband and passband.
+    for (size_t i = 0; i < ARRAY_SIZE(kQualityArray); ++i) {
+        testStopbandDownconversion<int16_t, int32_t>(
+                1, 48000, 32000, 12000, 20000, kQualityArray[i]);
+    }
+
+    // in this test we assume a maximum transition band between 7kHz and 15kHz.
+    // there must be at least 60dB relative attenuation between stopband and passband.
+    // (the weird ratio triggers interpolative resampling)
+    for (size_t i = 0; i < ARRAY_SIZE(kQualityArray); ++i) {
+        testStopbandDownconversion<int16_t, int32_t>(
+                1, 48000, 22101, 7000, 15000, kQualityArray[i]);
+    }
+}
+
 TEST(audioflinger_resampler, stopbandresponse_integer_multichannel) {
     // not all of these may work (old resamplers fail on downsampling)
     static const enum android::AudioResampler::src_quality kQualityArray[] = {
@@ -387,6 +428,34 @@
     }
 }
 
+TEST(audioflinger_resampler, stopbandresponse_float_mono) {
+    // not all of these may work (old resamplers fail on downsampling)
+    static const enum android::AudioResampler::src_quality kQualityArray[] = {
+            //android::AudioResampler::LOW_QUALITY,
+            //android::AudioResampler::MED_QUALITY,
+            //android::AudioResampler::HIGH_QUALITY,
+            //android::AudioResampler::VERY_HIGH_QUALITY,
+            android::AudioResampler::DYN_LOW_QUALITY,
+            android::AudioResampler::DYN_MED_QUALITY,
+            android::AudioResampler::DYN_HIGH_QUALITY,
+    };
+
+    // in this test we assume a maximum transition band between 12kHz and 20kHz.
+    // there must be at least 60dB relative attenuation between stopband and passband.
+    for (size_t i = 0; i < ARRAY_SIZE(kQualityArray); ++i) {
+        testStopbandDownconversion<float, float>(
+                1, 48000, 32000, 12000, 20000, kQualityArray[i]);
+    }
+
+    // in this test we assume a maximum transition band between 7kHz and 15kHz.
+    // there must be at least 60dB relative attenuation between stopband and passband.
+    // (the weird ratio triggers interpolative resampling)
+    for (size_t i = 0; i < ARRAY_SIZE(kQualityArray); ++i) {
+        testStopbandDownconversion<float, float>(
+                1, 48000, 22101, 7000, 15000, kQualityArray[i]);
+    }
+}
+
 TEST(audioflinger_resampler, stopbandresponse_float_multichannel) {
     // not all of these may work (old resamplers fail on downsampling)
     static const enum android::AudioResampler::src_quality kQualityArray[] = {
diff --git a/services/audioflinger/tests/run_all_unit_tests.sh b/services/audioflinger/tests/run_all_unit_tests.sh
index 113f39e..15a94c2 100755
--- a/services/audioflinger/tests/run_all_unit_tests.sh
+++ b/services/audioflinger/tests/run_all_unit_tests.sh
@@ -8,5 +8,5 @@
 echo "waiting for device"
 adb root && adb wait-for-device remount
 
-#adb shell /system/bin/resampler_tests
 adb shell /data/nativetest/resampler_tests/resampler_tests
+adb shell /data/nativetest64/resampler_tests/resampler_tests
diff --git a/services/medialog/Android.mk b/services/medialog/Android.mk
index a1da63d..423b186 100644
--- a/services/medialog/Android.mk
+++ b/services/medialog/Android.mk
@@ -4,7 +4,7 @@
 
 LOCAL_SRC_FILES := MediaLogService.cpp IMediaLogService.cpp
 
-LOCAL_SHARED_LIBRARIES := libbinder libutils liblog libnbaio
+LOCAL_SHARED_LIBRARIES := libbinder libutils liblog libnbaio libaudioutils
 
 LOCAL_MULTILIB := $(AUDIOSERVER_MULTILIB)
 
diff --git a/services/medialog/MediaLogService.cpp b/services/medialog/MediaLogService.cpp
index f85aa13..ab2f925 100644
--- a/services/medialog/MediaLogService.cpp
+++ b/services/medialog/MediaLogService.cpp
@@ -35,7 +35,7 @@
             shared->size() < NBLog::Timeline::sharedSize(size)) {
         return;
     }
-    sp<NBLog::Reader> reader(new NBLog::Reader(size, shared));
+    sp<NBLog::Reader> reader(new NBLog::Reader(shared, size));
     NamedReader namedReader(reader, name);
     Mutex::Autolock _l(mLock);
     mNamedReaders.add(namedReader);