snapuserd: Implement snapshot merge Implement snapshot merge in the daemon. Following are the important changes: 1: Spin up merge thread which does merging in user-space. 2: For ordered ops (COPY and XOR), read-ahead thread is used. 3: Read-ahead thread will read a fixed set of COW ops and cache them in memory. Furthermore data is saved in scratch space buffer in COW device. 4: No change in scratch space buffer - default 2MB buffer is allocated. 5: Merge thread and Read-ahead thread will work in lock step for merging ordered ops. 6: Once the ordered ops are merged, REPLACE and ZERO operations are merged. 7: If there is a crash during merge, COW header tracks the number of operations merged. No change in this code path. 8: Merge thread requires Base device for merging as opposed to using the source device. Hence, while initializing the merge threads, libsnapshot will have to pass the "Base" device to the daemon. This is the same "Base" device which is passed to dm-snapshot during snapshot creation. Patch does not handle any communication with dm-user yet. Bug: 193863397 Bug: 193863280 Bug: 193862712 Test: snapuserd_test on CF Signed-off-by: Akilesh Kailash <akailash@google.com> Change-Id: I14aab6eaa07ac68f2a3a23516ed9ba6567a35734

commit: 228f6a099cea9e3030cedbace42772680f455ab1 [log] [tgz]
author: Akilesh Kailash <akailash@google.com> Tue Aug 17 07:19:54 2021 +0000
committer: Akilesh Kailash <akailash@google.com> Thu Oct 07 07:09:28 2021 +0000
tree: 71cbad46e9ce8efca60d355cb4d100a0a3a9f475
parent: 46e15bd18b737df683d315aea6a3fc0af48d49b1 [diff]
diff --git a/fs_mgr/libsnapshot/include/libsnapshot/cow_reader.h b/fs_mgr/libsnapshot/include/libsnapshot/cow_reader.h
index 143f73c..63a9e68 100644
--- a/fs_mgr/libsnapshot/include/libsnapshot/cow_reader.h
+++ b/fs_mgr/libsnapshot/include/libsnapshot/cow_reader.h

@@ -145,6 +145,8 @@
     // Creates a clone of the current CowReader without the file handlers
     std::unique_ptr<CowReader> CloneCowReader();
 
+    // Adds num_merge_ops to the merged-op counter kept in this reader's copy of the COW header.
+    void UpdateMergeOpsCompleted(int num_merge_ops) { header_.num_merge_ops += num_merge_ops; }
+
   private:
     bool ParseOps(std::optional<uint64_t> label);
     bool PrepMergeOps();

diff --git a/fs_mgr/libsnapshot/snapuserd/Android.bp b/fs_mgr/libsnapshot/snapuserd/Android.bp
index af49a39..5865507 100644
--- a/fs_mgr/libsnapshot/snapuserd/Android.bp
+++ b/fs_mgr/libsnapshot/snapuserd/Android.bp

@@ -61,6 +61,11 @@
         "dm-snapshot-merge/snapuserd_worker.cpp",
         "dm-snapshot-merge/snapuserd_readahead.cpp",
         "snapuserd_daemon.cpp",
+	"user-space-merge/snapuserd_core.cpp",
+	"user-space-merge/snapuserd_dm_user.cpp",
+	"user-space-merge/snapuserd_merge.cpp",
+	"user-space-merge/snapuserd_readahead.cpp",
+	"user-space-merge/snapuserd_transitions.cpp",
     ],
 
     cflags: [

diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.cpp b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.cpp
new file mode 100644
index 0000000..a2538d2
--- /dev/null
+++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.cpp

@@ -0,0 +1,464 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "snapuserd_core.h"
+
+#include <android-base/strings.h>
+
+namespace android {
+namespace snapshot {
+
+using namespace android;
+using namespace android::dm;
+using android::base::unique_fd;
+
+// Stores the device paths for this snapshot and derives the dm-user control
+// node path ("/dev/dm-user/<misc_name>") from the misc name.
+SnapshotHandler::SnapshotHandler(std::string misc_name, std::string cow_device,
+                                 std::string backing_device, std::string base_path_merge) {
+    // Build the control path from the argument before the argument is moved away.
+    control_device_ = "/dev/dm-user/" + misc_name;
+    misc_name_ = std::move(misc_name);
+
+    cow_device_ = std::move(cow_device);
+    backing_store_device_ = std::move(backing_device);
+    base_path_merge_ = std::move(base_path_merge);
+}
+
+// Creates the I/O workers, the merge worker and the read-ahead object for this
+// snapshot. Returns false as soon as one I/O worker fails to initialize.
+bool SnapshotHandler::InitializeWorkers() {
+    for (int idx = 0; idx < NUM_THREADS_PER_PARTITION; ++idx) {
+        auto io_worker =
+                std::make_unique<Worker>(cow_device_, backing_store_device_, control_device_,
+                                         misc_name_, base_path_merge_, GetSharedPtr());
+        // Only workers that initialized successfully are kept.
+        if (!io_worker->Init()) {
+            SNAP_LOG(ERROR) << "Thread initialization failed";
+            return false;
+        }
+
+        worker_threads_.push_back(std::move(io_worker));
+    }
+
+    // The merge worker and the read-ahead object are only constructed here.
+    merge_thread_ = std::make_unique<Worker>(cow_device_, backing_store_device_, control_device_,
+                                             misc_name_, base_path_merge_, GetSharedPtr());
+    read_ahead_thread_ = std::make_unique<ReadAhead>(cow_device_, backing_store_device_, misc_name_,
+                                                     GetSharedPtr());
+
+    return true;
+}
+
+// Hands out a clone of the handler's CowReader for use by a worker.
+std::unique_ptr<CowReader> SnapshotHandler::CloneReaderForWorker() {
+    std::unique_ptr<CowReader> cloned_reader = reader_->CloneCowReader();
+    return cloned_reader;
+}
+
+// Adds num_merge_ops to the merged-op count in the mapped header and persists
+// the new count. Returns false if the header could not be synced or written.
+bool SnapshotHandler::CommitMerge(int num_merge_ops) {
+    auto* mapped_header = reinterpret_cast<CowHeader*>(mapped_addr_);
+    mapped_header->num_merge_ops += num_merge_ops;
+
+    if (!scratch_space_) {
+        // Without scratch space the header is rewritten through the COW fd:
+        // update the reader's copy, then write it at offset 0 and fsync.
+        reader_->UpdateMergeOpsCompleted(num_merge_ops);
+        CowHeader updated_header;
+        reader_->GetHeader(&updated_header);
+
+        if (lseek(cow_fd_.get(), 0, SEEK_SET) < 0) {
+            SNAP_PLOG(ERROR) << "lseek failed";
+            return false;
+        }
+
+        if (!android::base::WriteFully(cow_fd_, &updated_header, sizeof(CowHeader))) {
+            SNAP_PLOG(ERROR) << "Write to header failed";
+            return false;
+        }
+
+        if (fsync(cow_fd_.get()) < 0) {
+            SNAP_PLOG(ERROR) << "fsync failed";
+            return false;
+        }
+
+        return true;
+    }
+
+    // Scratch space present: flag the read-ahead state (when a read-ahead
+    // thread is in use) and flush the first block of the mapping.
+    if (ra_thread_) {
+        GetBufferState()->read_ahead_state = kCowReadAheadInProgress;
+    }
+
+    const int rc = msync(mapped_addr_, BLOCK_SZ, MS_SYNC);
+    if (rc < 0) {
+        SNAP_PLOG(ERROR) << "msync header failed: " << rc;
+        return false;
+    }
+
+    return true;
+}
+
+// Records whether read-ahead data has to be rebuilt from the COW device and
+// then calls NotifyRAForMergeReady().
+void SnapshotHandler::PrepareReadAhead() {
+    const struct BufferState* state = GetBufferState();
+
+    // Data is re-constructed from the COW device only when the stored state
+    // says a read-ahead had completed.
+    populate_data_from_cow_ = (state->read_ahead_state == kCowReadAheadDone);
+
+    NotifyRAForMergeReady();
+}
+
+// Logs how far the merge got: merged-op count from the mapped header versus the
+// reader's total data ops, or a note that merge never started.
+void SnapshotHandler::CheckMergeCompletionStatus() {
+    if (merge_initiated_) {
+        const auto* mapped_header = reinterpret_cast<const CowHeader*>(mapped_addr_);
+
+        SNAP_LOG(INFO) << "Merge-status: Total-Merged-ops: " << mapped_header->num_merge_ops
+                       << " Total-data-ops: " << reader_->get_num_total_data_ops();
+        return;
+    }
+
+    SNAP_LOG(INFO) << "Merge was not initiated. Total-data-ops: "
+                   << reader_->get_num_total_data_ops();
+}
+
+// Parses the COW file, validates its header, maps the header/buffer region and
+// fills chunk_vec_ with (sector, COW op) pairs sorted by sector.
+// Returns false if parsing, header validation or the mapping fails.
+bool SnapshotHandler::ReadMetadata() {
+    reader_ = std::make_unique<CowReader>();
+    CowHeader header;
+
+    SNAP_LOG(DEBUG) << "ReadMetadata: Parsing cow file";
+
+    if (!reader_->Parse(cow_fd_)) {
+        SNAP_LOG(ERROR) << "Failed to parse";
+        return false;
+    }
+
+    if (!reader_->GetHeader(&header)) {
+        SNAP_LOG(ERROR) << "Failed to get header";
+        return false;
+    }
+
+    if (header.block_size != BLOCK_SZ) {
+        SNAP_LOG(ERROR) << "Invalid header block size found: " << header.block_size;
+        return false;
+    }
+
+    SNAP_LOG(INFO) << "Merge-ops: " << header.num_merge_ops;
+
+    if (!MmapMetadata()) {
+        SNAP_LOG(ERROR) << "mmap failed";
+        return false;
+    }
+
+    // Walk the ops in merge order and record the sector each one maps to.
+    for (std::unique_ptr<ICowOpIter> cowop_iter = reader_->GetMergeOpIter(); !cowop_iter->Done();
+         cowop_iter->Next()) {
+        const CowOperation* cow_op = &cowop_iter->Get();
+
+        chunk_vec_.emplace_back(ChunkToSector(cow_op->new_block), cow_op);
+
+        // A single ordered op is enough to require the read-ahead thread.
+        if (!ra_thread_ && IsOrderedOp(*cow_op)) {
+            ra_thread_ = true;
+        }
+    }
+
+    chunk_vec_.shrink_to_fit();
+
+    // Sort the vector based on sectors as we need this during un-aligned access
+    std::sort(chunk_vec_.begin(), chunk_vec_.end(), compare);
+
+    PrepareReadAhead();
+
+    return true;
+}
+
+// Maps the COW header plus the default-sized buffer region.
+//
+// When the COW has scratch space (major version >= 2 and a non-zero
+// buffer_size) the COW file itself is mapped. Otherwise an anonymous mapping is
+// used and its header is seeded with the merged-op count from the COW header.
+// Returns false if mmap fails.
+bool SnapshotHandler::MmapMetadata() {
+    CowHeader header;
+    reader_->GetHeader(&header);
+
+    total_mapped_addr_length_ = header.header_size + BUFFER_REGION_DEFAULT_SIZE;
+
+    if (header.major_version >= 2 && header.buffer_size > 0) {
+        scratch_space_ = true;
+    }
+
+    if (scratch_space_) {
+        mapped_addr_ = mmap(nullptr, total_mapped_addr_length_, PROT_READ | PROT_WRITE, MAP_SHARED,
+                            cow_fd_.get(), 0);
+    } else {
+        mapped_addr_ = mmap(nullptr, total_mapped_addr_length_, PROT_READ | PROT_WRITE,
+                            MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+    }
+
+    // Check for failure before touching the mapping; writing through
+    // MAP_FAILED would crash instead of reporting the error.
+    if (mapped_addr_ == MAP_FAILED) {
+        SNAP_PLOG(ERROR) << "mmap metadata failed";
+        return false;
+    }
+
+    if (!scratch_space_) {
+        // The anonymous mapping starts zero-filled, so carry over the
+        // merged-op count that was read from the COW header.
+        struct CowHeader* ch = reinterpret_cast<struct CowHeader*>(mapped_addr_);
+        ch->num_merge_ops = header.num_merge_ops;
+    }
+
+    return true;
+}
+
+// Releases the mapping created by MmapMetadata(); a failure is only logged.
+void SnapshotHandler::UnmapBufferRegion() {
+    if (munmap(mapped_addr_, total_mapped_addr_length_) < 0) {
+        SNAP_PLOG(ERROR) << "munmap failed";
+    }
+}
+
+// Opens the COW device, derives num_sectors_ from the size of the base device
+// and then reads the COW metadata. Returns false on any failure.
+bool SnapshotHandler::InitCowDevice() {
+    cow_fd_.reset(open(cow_device_.c_str(), O_RDWR));
+    if (cow_fd_.get() < 0) {
+        SNAP_PLOG(ERROR) << "Open Failed: " << cow_device_;
+        return false;
+    }
+
+    // The base device is opened only to query its size.
+    unique_fd base_fd(TEMP_FAILURE_RETRY(open(base_path_merge_.c_str(), O_RDONLY | O_CLOEXEC)));
+    if (base_fd.get() < 0) {
+        SNAP_LOG(ERROR) << "Cannot open block device";
+        return false;
+    }
+
+    const uint64_t device_bytes = get_block_device_size(base_fd.get());
+    if (device_bytes == 0) {
+        SNAP_LOG(ERROR) << "Failed to find block device size: " << base_path_merge_;
+        return false;
+    }
+
+    num_sectors_ = device_bytes >> SECTOR_SHIFT;
+
+    return ReadMetadata();
+}
+
+// Reads `size` bytes of dm_block_device starting at `offset` and discards the
+// data. Any open or read error is logged and ends the read early.
+void SnapshotHandler::ReadBlocksToCache(const std::string& dm_block_device,
+                                        const std::string& partition_name, off_t offset,
+                                        size_t size) {
+    android::base::unique_fd fd(TEMP_FAILURE_RETRY(open(dm_block_device.c_str(), O_RDONLY)));
+    if (fd.get() == -1) {
+        SNAP_PLOG(ERROR) << "Error reading " << dm_block_device
+                         << " partition-name: " << partition_name;
+        return;
+    }
+
+    // We pick 4M I/O size based on the fact that the current
+    // update_verifier has a similar I/O size.
+    const size_t chunk_sz = 1024 * BLOCK_SZ;
+    std::vector<uint8_t> scratch(chunk_sz);
+
+    off_t cursor = offset;
+    size_t bytes_left = size;
+    while (bytes_left > 0) {
+        const size_t step = std::min(bytes_left, chunk_sz);
+
+        if (!android::base::ReadFullyAtOffset(fd.get(), scratch.data(), step, cursor)) {
+            SNAP_PLOG(ERROR) << "Failed to read block from block device: " << dm_block_device
+                             << " at offset: " << cursor
+                             << " partition-name: " << partition_name << " total-size: " << size
+                             << " remain_size: " << bytes_left;
+            return;
+        }
+
+        cursor += step;
+        bytes_left -= step;
+    }
+
+    SNAP_LOG(INFO) << "Finished reading block-device: " << dm_block_device
+                   << " partition: " << partition_name << " size: " << size
+                   << " offset: " << offset;
+}
+
+// Reads every whole block of dm_block_device by splitting the device into
+// equal ranges and reading them on concurrent tasks (see ReadBlocksToCache).
+// Returns once all tasks have finished; errors are only logged.
+void SnapshotHandler::ReadBlocks(const std::string partition_name,
+                                 const std::string& dm_block_device) {
+    SNAP_LOG(DEBUG) << "Reading partition: " << partition_name
+                    << " Block-Device: " << dm_block_device;
+
+    unique_fd fd(TEMP_FAILURE_RETRY(open(dm_block_device.c_str(), O_RDONLY | O_CLOEXEC)));
+    if (fd < 0) {
+        SNAP_LOG(ERROR) << "Cannot open block device";
+        return;
+    }
+
+    const uint64_t dev_sz = get_block_device_size(fd.get());
+    if (!dev_sz) {
+        SNAP_PLOG(ERROR) << "Could not determine block device size: " << dm_block_device;
+        return;
+    }
+
+    constexpr int kNumThreads = 2;
+    const size_t num_blocks = dev_sz >> BLOCK_SHIFT;
+    const size_t total_sz = num_blocks << BLOCK_SHIFT;
+    const size_t read_sz_per_thread = (num_blocks / kNumThreads) << BLOCK_SHIFT;
+
+    // Keep the futures alive: a discarded std::async future blocks in its
+    // destructor, which would run the reads one after the other.
+    std::vector<std::future<void>> readers;
+    readers.reserve(kNumThreads);
+
+    size_t offset = 0;
+    for (int i = 0; i < kNumThreads; i++) {
+        // The last task also picks up the blocks lost to integer division.
+        const bool is_last = (i == kNumThreads - 1);
+        const size_t read_sz = is_last ? (total_sz - offset) : read_sz_per_thread;
+
+        readers.emplace_back(std::async(std::launch::async, &SnapshotHandler::ReadBlocksToCache,
+                                        this, dm_block_device, partition_name,
+                                        static_cast<off_t>(offset), read_sz));
+
+        offset += read_sz;
+    }
+
+    // The tasks use `this`, so wait for them before returning.
+    for (auto& reader : readers) {
+        reader.wait();
+    }
+}
+
+/*
+ * Entry point to launch threads.
+ *
+ * Starts the read-ahead thread (only when ordered ops were found), the I/O
+ * worker threads and the merge thread, then blocks until all of them have
+ * finished. Outside of first-stage init the partition's dm block device is
+ * read once before the merge thread is launched.
+ *
+ * Returns the combined result of the I/O worker threads; the merge and
+ * read-ahead results are only logged.
+ */
+bool SnapshotHandler::Start() {
+    std::vector<std::future<bool>> threads;
+    std::future<bool> ra_thread_status;
+
+    if (ra_thread_) {
+        ra_thread_status =
+                std::async(std::launch::async, &ReadAhead::RunThread, read_ahead_thread_.get());
+
+        SNAP_LOG(INFO) << "Read-ahead thread started...";
+    }
+
+    // Launch worker threads
+    threads.reserve(worker_threads_.size());
+    for (const auto& worker : worker_threads_) {
+        threads.emplace_back(std::async(std::launch::async, &Worker::RunThread, worker.get()));
+    }
+
+    bool second_stage_init = true;
+
+    // We don't want to read the blocks during first stage init.
+    if (android::base::EndsWith(misc_name_, "-init") || is_socket_present_) {
+        second_stage_init = false;
+    }
+
+    if (second_stage_init) {
+        SNAP_LOG(INFO) << "Reading blocks to cache....";
+        auto& dm = DeviceMapper::Instance();
+        auto dm_block_devices = dm.FindDmPartitions();
+        if (dm_block_devices.empty()) {
+            SNAP_LOG(ERROR) << "No dm-enabled block device is found.";
+        } else {
+            auto parts = android::base::Split(misc_name_, "-");
+            std::string partition_name = parts[0];
+
+            // Drop exactly one trailing slot suffix. find_last_not_of() would
+            // treat "_a"/"_b" as a character set and also eat trailing 'a',
+            // 'b' or '_' characters that belong to the partition name.
+            if (android::base::EndsWith(partition_name, "_a") ||
+                android::base::EndsWith(partition_name, "_b")) {
+                partition_name.erase(partition_name.size() - 2);
+            }
+
+            if (dm_block_devices.find(partition_name) == dm_block_devices.end()) {
+                SNAP_LOG(ERROR) << "Failed to find dm block device for " << partition_name;
+            } else {
+                ReadBlocks(partition_name, dm_block_devices.at(partition_name));
+            }
+        }
+    } else {
+        SNAP_LOG(INFO) << "Not reading block device into cache";
+    }
+
+    std::future<bool> merge_thread =
+            std::async(std::launch::async, &Worker::RunMergeThread, merge_thread_.get());
+
+    bool ret = true;
+    for (auto& t : threads) {
+        ret = t.get() && ret;
+    }
+
+    // Worker threads are terminated by this point - this can only happen:
+    //
+    // 1: If dm-user device is destroyed
+    // 2: We had an I/O failure when reading root partitions
+    //
+    // In case (1), this would be a graceful shutdown. In this case, merge
+    // thread and RA thread should have already terminated by this point. We will be
+    // destroying the dm-user device only _after_ merge is completed.
+    //
+    // In case (2), if merge thread had started, then it will be
+    // continuing to merge; however, since we had an I/O failure and the
+    // I/O on root partitions are no longer served, we will terminate the
+    // merge
+
+    NotifyIOTerminated();
+
+    bool read_ahead_retval = false;
+
+    SNAP_LOG(INFO) << "Snapshot I/O terminated. Waiting for merge thread....";
+    bool merge_thread_status = merge_thread.get();
+
+    if (ra_thread_) {
+        read_ahead_retval = ra_thread_status.get();
+    }
+
+    SNAP_LOG(INFO) << "Worker threads terminated with ret: " << ret
+                   << " Merge-thread with ret: " << merge_thread_status
+                   << " RA-thread with ret: " << read_ahead_retval;
+    return ret;
+}
+
+// Offset of the read-ahead metadata: it follows the COW header and the
+// BufferState structure.
+uint64_t SnapshotHandler::GetBufferMetadataOffset() {
+    CowHeader cow_header;
+    reader_->GetHeader(&cow_header);
+
+    const auto metadata_offset = cow_header.header_size + sizeof(BufferState);
+    return metadata_offset;
+}
+
+/*
+ * Size of the metadata part of the buffer region: one ScratchMetadata entry
+ * per BLOCK_SZ bytes of buffer.
+ *
+ * Metadata for read-ahead is 16 bytes. For a 2 MB region this comes to
+ * 8k (2 PAGE) of metadata, so a 2MB buffer region is split into:
+ *
+ * 1: 8k metadata
+ * 2: Scratch space
+ *
+ */
+size_t SnapshotHandler::GetBufferMetadataSize() {
+    CowHeader cow_header;
+    reader_->GetHeader(&cow_header);
+
+    // A zero buffer_size means there is no scratch space; the anonymous
+    // memory region of default size is used instead.
+    const size_t from_header = cow_header.buffer_size;
+    const size_t region_size = (from_header == 0) ? BUFFER_REGION_DEFAULT_SIZE : from_header;
+
+    return ((region_size * sizeof(struct ScratchMetadata)) / BLOCK_SZ);
+}
+
+// Offset of the scratch data: COW header size plus the metadata size.
+size_t SnapshotHandler::GetBufferDataOffset() {
+    CowHeader cow_header;
+    reader_->GetHeader(&cow_header);
+
+    const size_t metadata_size = GetBufferMetadataSize();
+    return (cow_header.header_size + metadata_size);
+}
+
+/*
+ * Size of the data part of the buffer region, i.e. the buffer size minus its
+ * metadata. For the default region: (2MB - 8K = 2088960 bytes).
+ */
+size_t SnapshotHandler::GetBufferDataSize() {
+    CowHeader cow_header;
+    reader_->GetHeader(&cow_header);
+
+    // A zero buffer_size means there is no scratch space; the anonymous
+    // memory region of default size is used instead.
+    const size_t from_header = cow_header.buffer_size;
+    const size_t region_size = (from_header == 0) ? BUFFER_REGION_DEFAULT_SIZE : from_header;
+
+    return (region_size - GetBufferMetadataSize());
+}
+
+// Returns the BufferState stored in the mapping directly after the COW header.
+struct BufferState* SnapshotHandler::GetBufferState() {
+    CowHeader cow_header;
+    reader_->GetHeader(&cow_header);
+
+    char* mapping_base = (char*)mapped_addr_;
+    return reinterpret_cast<struct BufferState*>(mapping_base + cow_header.header_size);
+}
+
+}  // namespace snapshot
+}  // namespace android

diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.h b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.h
new file mode 100644
index 0000000..e37be7b
--- /dev/null
+++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.h

@@ -0,0 +1,328 @@
+// Copyright (C) 2021 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <linux/types.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+
+#include <condition_variable>
+#include <cstring>
+#include <future>
+#include <iostream>
+#include <limits>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include <android-base/file.h>
+#include <android-base/logging.h>
+#include <android-base/stringprintf.h>
+#include <android-base/unique_fd.h>
+#include <ext4_utils/ext4_utils.h>
+#include <libdm/dm.h>
+#include <libsnapshot/cow_reader.h>
+#include <libsnapshot/cow_writer.h>
+#include <snapuserd/snapuserd_kernel.h>
+
+namespace android {
+namespace snapshot {
+
+using android::base::unique_fd;
+using namespace std::chrono_literals;
+
+// Payload size constant: 1 MiB. Must be able to hold at least one block.
+static constexpr size_t PAYLOAD_SIZE = (1UL << 20);
+static_assert(PAYLOAD_SIZE >= BLOCK_SZ);
+
+// Number of I/O Worker objects SnapshotHandler::InitializeWorkers() creates.
+static constexpr int NUM_THREADS_PER_PARTITION = 1;
+
+// Logging helpers that prefix every message with misc_name_; they can only be
+// used where a misc_name_ is in scope (e.g. inside the classes declared below).
+#define SNAP_LOG(level) LOG(level) << misc_name_ << ": "
+#define SNAP_PLOG(level) PLOG(level) << misc_name_ << ": "
+
+// States stored in SnapshotHandler::io_state_.
+// NOTE(review): presumably driven by the "State transitions for merge"
+// methods of SnapshotHandler; their definitions are not in this file - confirm.
+enum class MERGE_IO_TRANSITION {
+    MERGE_READY,
+    MERGE_BEGIN,
+    MERGE_FAILED,
+    MERGE_COMPLETE,
+    IO_TERMINATED,
+    READ_AHEAD_FAILURE,
+};
+
+// IByteSink backed by one heap buffer that is laid out as a dm_user_message
+// (header followed by payload). Initialize() must be called before the buffer
+// is used; until then the sink holds no buffer and reports a size of 0.
+class Bufsink : public IByteSink {
+  public:
+    // Allocates a zero-filled buffer of `size` bytes and resets the offset.
+    void Initialize(size_t size);
+    // Start of the whole buffer (header included); nullptr before Initialize().
+    void* GetBufPtr() { return buffer_.get(); }
+    // Zero-fills the whole buffer.
+    void Clear() { memset(GetBufPtr(), 0, buffer_size_); }
+    // Payload pointer at the current offset, or nullptr if `size` bytes do not fit.
+    void* GetPayloadBuffer(size_t size);
+    void* GetBuffer(size_t requested, size_t* actual) override;
+    // Advances the payload offset by `size` bytes.
+    void UpdateBufferOffset(size_t size) { buffer_offset_ += size; }
+    struct dm_user_header* GetHeaderPtr();
+    // Data is already in place in the buffer, so there is nothing to do.
+    bool ReturnData(void*, size_t) override { return true; }
+    void ResetBufferOffset() { buffer_offset_ = 0; }
+    // Start of the payload area, ignoring the current offset.
+    void* GetPayloadBufPtr();
+
+  private:
+    std::unique_ptr<uint8_t[]> buffer_;
+    // Zero-initialized so that use before Initialize() never reads garbage.
+    loff_t buffer_offset_ = 0;
+    size_t buffer_size_ = 0;
+};
+
+// IByteSink with its own buffer that is attached to a Bufsink through
+// Initialize().
+// NOTE(review): presumably XORs returned data into the attached Bufsink; the
+// method definitions are not in this file - confirm.
+class XorBufSink : public IByteSink {
+  public:
+    void Initialize(Bufsink* sink, size_t size);
+    void Reset();
+    void* GetBuffer(size_t requested, size_t* actual) override;
+    bool ReturnData(void* buffer, size_t len) override;
+
+  private:
+    // Defaults keep the object in a defined state until Initialize() runs.
+    Bufsink* bufsink_ = nullptr;
+    std::unique_ptr<uint8_t[]> buffer_;
+    size_t buffer_size_ = 0;
+    size_t returned_ = 0;
+};
+
+class SnapshotHandler;
+
+// Read-ahead thread object. SnapshotHandler::Start() runs RunThread() on its
+// own thread, and only when the COW contains ordered ops.
+class ReadAhead {
+  public:
+    ReadAhead(const std::string& cow_device, const std::string& backing_device,
+              const std::string& misc_name, std::shared_ptr<SnapshotHandler> snapuserd);
+    // Thread entry point; the returned status is logged by SnapshotHandler::Start().
+    bool RunThread();
+
+  private:
+    // Iteration over COW ops
+    void InitializeRAIter();
+    bool RAIterDone();
+    void RAIterNext();
+    const CowOperation* GetRAOpIter();
+
+    // Initialization
+    void InitializeBuffer();
+    bool InitReader();
+    bool InitializeFds();
+
+    void CloseFds() { backing_store_fd_ = {}; }
+
+    bool ReadAheadIOStart();
+    int PrepareNextReadAhead(uint64_t* source_offset, int* pending_ops,
+                             std::vector<uint64_t>& blocks,
+                             std::vector<const CowOperation*>& xor_op_vec);
+    bool ReconstructDataFromCow();
+    void CheckOverlap(const CowOperation* cow_op);
+
+    // Raw pointers start out null so they are never read uninitialized.
+    void* read_ahead_buffer_ = nullptr;
+    void* metadata_buffer_ = nullptr;
+
+    std::unique_ptr<ICowOpIter> cowop_iter_;
+
+    std::string cow_device_;
+    std::string backing_store_device_;
+    std::string misc_name_;
+
+    unique_fd cow_fd_;
+    unique_fd backing_store_fd_;
+
+    std::shared_ptr<SnapshotHandler> snapuserd_;
+    std::unique_ptr<CowReader> reader_;
+
+    std::unordered_set<uint64_t> dest_blocks_;
+    std::unordered_set<uint64_t> source_blocks_;
+    bool overlap_ = false;
+    Bufsink bufsink_;
+};
+
+// Worker object. SnapshotHandler creates several of these: the I/O workers are
+// run through RunThread() and one dedicated instance is run through
+// RunMergeThread().
+class Worker {
+  public:
+    Worker(const std::string& cow_device, const std::string& backing_device,
+           const std::string& control_device, const std::string& misc_name,
+           const std::string& base_path_merge, std::shared_ptr<SnapshotHandler> snapuserd);
+    // Thread entry point for an I/O worker.
+    bool RunThread();
+    // Thread entry point for the merge worker.
+    bool RunMergeThread();
+    // Called by SnapshotHandler::InitializeWorkers() on each I/O worker; a
+    // false return aborts worker setup.
+    bool Init();
+
+  private:
+    // Initialization
+    void InitializeBufsink();
+    bool InitializeFds();
+    bool InitReader();
+    // Resets the control, backing-store and base-device descriptors.
+    void CloseFds() {
+        ctrl_fd_ = {};
+        backing_store_fd_ = {};
+        base_path_merge_fd_ = {};
+    }
+
+    // IO Path
+    bool ProcessIORequest();
+
+    // Processing COW operations
+    bool ProcessReplaceOp(const CowOperation* cow_op);
+    bool ProcessZeroOp();
+
+    // Handles Copy and Xor
+    bool ProcessCopyOp(const CowOperation* cow_op);
+    bool ProcessXorOp(const CowOperation* cow_op);
+
+    // Merge related ops
+    bool Merge();
+    bool MergeOrderedOps(const std::unique_ptr<ICowOpIter>& cowop_iter);
+    bool MergeReplaceZeroOps(const std::unique_ptr<ICowOpIter>& cowop_iter);
+    int PrepareMerge(uint64_t* source_offset, int* pending_ops,
+                     const std::unique_ptr<ICowOpIter>& cowop_iter,
+                     std::vector<const CowOperation*>* replace_zero_vec = nullptr);
+
+    std::unique_ptr<CowReader> reader_;
+    Bufsink bufsink_;
+    XorBufSink xorsink_;
+
+    // Device paths handed in through the constructor.
+    std::string cow_device_;
+    std::string backing_store_device_;
+    std::string control_device_;
+    std::string misc_name_;
+    std::string base_path_merge_;
+
+    unique_fd cow_fd_;
+    unique_fd backing_store_fd_;
+    unique_fd base_path_merge_fd_;
+    unique_fd ctrl_fd_;
+
+    // Owning handler, shared with the other worker objects.
+    std::shared_ptr<SnapshotHandler> snapuserd_;
+};
+
+// Per-snapshot state of the daemon: owns the COW reader, the mapped
+// header/buffer region and the worker, merge and read-ahead objects.
+// Instances must be owned by a std::shared_ptr because GetSharedPtr() relies
+// on shared_from_this().
+class SnapshotHandler : public std::enable_shared_from_this<SnapshotHandler> {
+  public:
+    SnapshotHandler(std::string misc_name, std::string cow_device, std::string backing_device,
+                    std::string base_path_merge);
+    // Opens the COW device, computes the sector count and reads the metadata.
+    bool InitCowDevice();
+    // Launches all threads and blocks until they have finished.
+    bool Start();
+
+    const std::string& GetControlDevicePath() { return control_device_; }
+    const std::string& GetMiscName() { return misc_name_; }
+    const uint64_t& GetNumSectors() { return num_sectors_; }
+    const bool& IsAttached() const { return attached_; }
+    void AttachControlDevice() { attached_ = true; }
+
+    void CheckMergeCompletionStatus();
+    // Persists `num_merge_ops` additional merged ops in the COW header.
+    bool CommitMerge(int num_merge_ops);
+
+    void CloseFds() { cow_fd_ = {}; }
+    void FreeResources() {
+        worker_threads_.clear();
+        read_ahead_thread_ = nullptr;
+        merge_thread_ = nullptr;
+    }
+
+    bool InitializeWorkers();
+    std::unique_ptr<CowReader> CloneReaderForWorker();
+    std::shared_ptr<SnapshotHandler> GetSharedPtr() { return shared_from_this(); }
+
+    std::vector<std::pair<sector_t, const CowOperation*>>& GetChunkVec() { return chunk_vec_; }
+
+    // Orders chunk_vec_ entries by sector.
+    static bool compare(std::pair<sector_t, const CowOperation*> p1,
+                        std::pair<sector_t, const CowOperation*> p2) {
+        return p1.first < p2.first;
+    }
+
+    void UnmapBufferRegion();
+    bool MmapMetadata();
+
+    // Read-ahead related functions
+    void* GetMappedAddr() { return mapped_addr_; }
+    void PrepareReadAhead();
+
+    // State transitions for merge
+    void InitiateMerge();
+    void WaitForMergeComplete();
+    bool WaitForMergeBegin();
+    void NotifyRAForMergeReady();
+    bool WaitForMergeReady();
+    void MergeFailed();
+    bool IsIOTerminated();
+    void MergeCompleted();
+    void NotifyIOTerminated();
+    bool ReadAheadIOCompleted(bool sync);
+    void ReadAheadIOFailed();
+
+    bool ShouldReconstructDataFromCow() { return populate_data_from_cow_; }
+    void FinishReconstructDataFromCow() { populate_data_from_cow_ = false; }
+
+    // RA related functions
+    uint64_t GetBufferMetadataOffset();
+    size_t GetBufferMetadataSize();
+    size_t GetBufferDataOffset();
+    size_t GetBufferDataSize();
+
+    // Total number of blocks to be merged in a given read-ahead buffer region
+    void SetMergedBlockCountForNextCommit(int x) { total_ra_blocks_merged_ = x; }
+    int GetTotalBlocksToMerge() { return total_ra_blocks_merged_; }
+    void SetSocketPresent(bool socket) { is_socket_present_ = socket; }
+    bool MergeInitiated() { return merge_initiated_; }
+
+  private:
+    bool ReadMetadata();
+    sector_t ChunkToSector(chunk_t chunk) { return chunk << CHUNK_SHIFT; }
+    chunk_t SectorToChunk(sector_t sector) { return sector >> CHUNK_SHIFT; }
+    struct BufferState* GetBufferState();
+
+    void ReadBlocks(const std::string partition_name, const std::string& dm_block_device);
+    void ReadBlocksToCache(const std::string& dm_block_device, const std::string& partition_name,
+                           off_t offset, size_t size);
+
+    // COW device
+    std::string cow_device_;
+    // Source device
+    std::string backing_store_device_;
+    // dm-user control device
+    std::string control_device_;
+    std::string misc_name_;
+    // Base device for merging
+    std::string base_path_merge_;
+
+    unique_fd cow_fd_;
+
+    // Number of sectors required when initializing dm-user
+    uint64_t num_sectors_ = 0;
+
+    std::unique_ptr<CowReader> reader_;
+
+    // chunk_vec stores the pseudo mapping of sector
+    // to COW operations.
+    std::vector<std::pair<sector_t, const CowOperation*>> chunk_vec_;
+
+    std::mutex lock_;
+    std::condition_variable cv;
+
+    // Header/buffer mapping created by MmapMetadata(); null until then.
+    void* mapped_addr_ = nullptr;
+    size_t total_mapped_addr_length_ = 0;
+
+    std::vector<std::unique_ptr<Worker>> worker_threads_;
+    // Read-ahead related
+    bool populate_data_from_cow_ = false;
+    bool ra_thread_ = false;
+    int total_ra_blocks_merged_ = 0;
+    // NOTE(review): still has no default; presumably set by the merge state
+    // transition code, which is not in this file - confirm.
+    MERGE_IO_TRANSITION io_state_;
+    std::unique_ptr<ReadAhead> read_ahead_thread_;
+
+    std::unique_ptr<Worker> merge_thread_;
+
+    bool merge_initiated_ = false;
+    bool attached_ = false;
+    // Read by Start(); defaults to false in case SetSocketPresent() is never called.
+    bool is_socket_present_ = false;
+    bool scratch_space_ = false;
+};
+
+}  // namespace snapshot
+}  // namespace android

diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_dm_user.cpp b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_dm_user.cpp
new file mode 100644
index 0000000..8036c81
--- /dev/null
+++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_dm_user.cpp

@@ -0,0 +1,224 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "snapuserd_core.h"
+
+namespace android {
+namespace snapshot {
+
+using namespace android;
+using namespace android::dm;
+using android::base::unique_fd;
+
+// Allocates a fresh, value-initialized buffer of |size| bytes and rewinds the
+// payload offset to zero.
+void Bufsink::Initialize(size_t size) {
+    buffer_ = std::make_unique<uint8_t[]>(size);
+    buffer_offset_ = 0;
+    buffer_size_ = size;
+}
+
+// Returns a pointer into the payload area (the bytes that follow the
+// dm_user_header at the start of the buffer), advanced by the current buffer
+// offset. Returns nullptr when fewer than |size| payload bytes remain.
+void* Bufsink::GetPayloadBuffer(size_t size) {
+    // The payload begins after the header, so the header bytes must not be
+    // counted as space available for payload data.
+    constexpr size_t kHeaderSize = sizeof(struct dm_user_header);
+    if (buffer_size_ < kHeaderSize) return nullptr;
+
+    const size_t payload_capacity = buffer_size_ - kHeaderSize;
+    const size_t offset = static_cast<size_t>(buffer_offset_);
+    // Reject an out-of-range offset before subtracting; otherwise the
+    // unsigned difference would wrap around and defeat the bounds check.
+    if (offset > payload_capacity) return nullptr;
+    if ((payload_capacity - offset) < size) return nullptr;
+
+    char* buffer = reinterpret_cast<char*>(GetBufPtr());
+    struct dm_user_message* msg = reinterpret_cast<struct dm_user_message*>(&(buffer[0]));
+    return reinterpret_cast<char*>(msg->payload.buf) + buffer_offset_;
+}
+
+// Hands out |requested| bytes of payload space. On success |*actual| equals
+// |requested|; when the space is not available |*actual| is 0 and nullptr is
+// returned.
+void* Bufsink::GetBuffer(size_t requested, size_t* actual) {
+    void* payload = GetPayloadBuffer(requested);
+    *actual = (payload != nullptr) ? requested : 0;
+    return payload;
+}
+
+// Returns the dm_user_header located at the very start of the buffer, or
+// nullptr when the buffer is too small to hold one.
+struct dm_user_header* Bufsink::GetHeaderPtr() {
+    if (buffer_size_ < sizeof(struct dm_user_header)) {
+        return nullptr;
+    }
+    char* raw = reinterpret_cast<char*>(GetBufPtr());
+    return reinterpret_cast<struct dm_user_header*>(raw);
+}
+
+// Returns the start of the payload area, ignoring the current buffer offset.
+void* Bufsink::GetPayloadBufPtr() {
+    char* raw = reinterpret_cast<char*>(GetBufPtr());
+    auto* message = reinterpret_cast<struct dm_user_message*>(raw);
+    return message->payload.buf;
+}
+
+// Binds this sink to |sink| (the buffer that ReturnData() XORs into) and
+// allocates a private staging buffer of |size| bytes.
+void XorBufSink::Initialize(Bufsink* sink, size_t size) {
+    buffer_ = std::make_unique<uint8_t[]>(size);
+    buffer_size_ = size;
+    bufsink_ = sink;
+    returned_ = 0;
+}
+
+// Clears the count of bytes already folded into the target sink so that the
+// next ReturnData() call starts again at the beginning of the payload.
+void XorBufSink::Reset() {
+    returned_ = 0;
+}
+
+// Returns the private staging buffer. |*actual| is |requested| clamped to the
+// size of that buffer.
+void* XorBufSink::GetBuffer(size_t requested, size_t* actual) {
+    *actual = (requested > buffer_size_) ? buffer_size_ : requested;
+    return buffer_.get();
+}
+
+// XORs |len| bytes from |buffer| into the bound sink's payload, continuing
+// after the bytes consumed by earlier calls. Returns false when the sink
+// cannot provide |len| + already-returned bytes of payload space.
+bool XorBufSink::ReturnData(void* buffer, size_t len) {
+    uint8_t* src = reinterpret_cast<uint8_t*>(buffer);
+    uint8_t* payload = reinterpret_cast<uint8_t*>(bufsink_->GetPayloadBuffer(len + returned_));
+    if (payload == nullptr) {
+        return false;
+    }
+
+    uint8_t* dst = payload + returned_;
+    for (size_t idx = 0; idx < len; ++idx) {
+        dst[idx] ^= src[idx];
+    }
+
+    returned_ += len;
+    return true;
+}
+
+// Records the device paths and the owning handler. No file is opened here;
+// descriptors are acquired later by InitializeFds().
+Worker::Worker(const std::string& cow_device, const std::string& backing_device,
+               const std::string& control_device, const std::string& misc_name,
+               const std::string& base_path_merge, std::shared_ptr<SnapshotHandler> snapuserd) {
+    // Owning handler and the name used as log prefix
+    snapuserd_ = snapuserd;
+    misc_name_ = misc_name;
+
+    // Device paths
+    cow_device_ = cow_device;
+    backing_store_device_ = backing_device;
+    control_device_ = control_device;
+    base_path_merge_ = base_path_merge;
+}
+
+// Opens, in order, the backing store (read-only), the COW device, the
+// dm-user control device and the base device used for merging (all three
+// read-write). Stops at the first failure, logs it and returns false.
+bool Worker::InitializeFds() {
+    // Source device
+    backing_store_fd_.reset(open(backing_store_device_.c_str(), O_RDONLY));
+    if (backing_store_fd_.get() < 0) {
+        SNAP_PLOG(ERROR) << "Open Failed: " << backing_store_device_;
+        return false;
+    }
+
+    // COW device
+    cow_fd_.reset(open(cow_device_.c_str(), O_RDWR));
+    if (cow_fd_.get() < 0) {
+        SNAP_PLOG(ERROR) << "Open Failed: " << cow_device_;
+        return false;
+    }
+
+    // dm-user control device
+    ctrl_fd_.reset(open(control_device_.c_str(), O_RDWR));
+    if (ctrl_fd_.get() < 0) {
+        SNAP_PLOG(ERROR) << "Unable to open " << control_device_;
+        return false;
+    }
+
+    // Base device - this is what the merge thread writes to
+    base_path_merge_fd_.reset(open(base_path_merge_.c_str(), O_RDWR));
+    if (base_path_merge_fd_.get() < 0) {
+        SNAP_PLOG(ERROR) << "Open Failed: " << base_path_merge_;
+        return false;
+    }
+
+    return true;
+}
+
+// Obtains a private clone of the COW reader from the handler and initializes
+// it for merge, handing over ownership of cow_fd_.
+bool Worker::InitReader() {
+    reader_ = snapuserd_->CloneReaderForWorker();
+    return reader_->InitForMerge(std::move(cow_fd_));
+}
+
+// Handles a REPLACE op: the block's data is read from the internal COW
+// format into bufsink_, and de-compressed if the block was stored
+// compressed.
+bool Worker::ProcessReplaceOp(const CowOperation* cow_op) {
+    if (reader_->ReadData(*cow_op, &bufsink_)) {
+        return true;
+    }
+
+    SNAP_LOG(ERROR) << "ProcessReplaceOp failed for block " << cow_op->new_block;
+    return false;
+}
+
+// Handles a ZERO op by filling one block (BLOCK_SZ bytes) of the payload
+// buffer, at the current offset, with zeroes.
+bool Worker::ProcessZeroOp() {
+    void* block = bufsink_.GetPayloadBuffer(BLOCK_SZ);
+    if (block != nullptr) {
+        // Zero out the entire block
+        memset(block, 0, BLOCK_SZ);
+        return true;
+    }
+
+    SNAP_LOG(ERROR) << "ProcessZeroOp: Failed to get payload buffer";
+    return false;
+}
+
+// Placeholder: COPY ops are not processed by the worker yet. The operation
+// is ignored and success is reported unconditionally.
+bool Worker::ProcessCopyOp(const CowOperation*) {
+    return true;
+}
+
+// Placeholder: XOR ops are not processed by the worker yet. The operation
+// is ignored and success is reported unconditionally.
+bool Worker::ProcessXorOp(const CowOperation*) {
+    return true;
+}
+
+// Sets up the buffer shared between the daemon and dm-user.
+void Worker::InitializeBufsink() {
+    // Layout: one dm_user_header followed by a fixed payload of PAYLOAD_SIZE
+    // bytes. An IO request from dm-user that is larger than the payload is
+    // broken into chunks of PAYLOAD_SIZE.
+    bufsink_.Initialize(sizeof(struct dm_user_header) + PAYLOAD_SIZE);
+}
+
+// Prepares the worker: allocates the sinks, opens the devices and then
+// initializes the COW reader. The reader is only set up when every
+// descriptor was opened successfully.
+bool Worker::Init() {
+    InitializeBufsink();
+    xorsink_.Initialize(&bufsink_, BLOCK_SZ);
+
+    return InitializeFds() && InitReader();
+}
+
+// Worker thread body: serves IO requests until ProcessIORequest() reports
+// failure, then releases the descriptors. Always returns true.
+bool Worker::RunThread() {
+    SNAP_LOG(DEBUG) << "Processing snapshot I/O requests...";
+
+    // Start serving IO
+    while (ProcessIORequest()) {
+    }
+
+    CloseFds();
+    reader_->CloseCowFd();
+
+    return true;
+}
+
+// Placeholder for handling a single dm-user IO request. Nothing is read or
+// written yet; the function reports success unconditionally.
+bool Worker::ProcessIORequest() {
+    // No communication with dm-user yet
+    return true;
+}
+
+}  // namespace snapshot
+}  // namespace android

diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_merge.cpp b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_merge.cpp
new file mode 100644
index 0000000..696ede7
--- /dev/null
+++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_merge.cpp

@@ -0,0 +1,300 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "snapuserd_core.h"
+
+namespace android {
+namespace snapshot {
+
+using namespace android;
+using namespace android::dm;
+using android::base::unique_fd;
+
+// Consumes from |cowop_iter| the longest run of ops whose destination blocks
+// (new_block) are consecutive, limited to |*pending_ops| ops.
+//
+// |source_offset| receives the byte offset of the first destination block of
+// the run. |pending_ops| is only read; the caller does its own accounting.
+// When |replace_zero_vec| is null only ordered ops are accepted and the run
+// stops at the first non-ordered op; otherwise every consumed op is appended
+// to the vector.
+//
+// Returns the number of ops in the run, 0 when nothing could be consumed.
+int Worker::PrepareMerge(uint64_t* source_offset, int* pending_ops,
+                         const std::unique_ptr<ICowOpIter>& cowop_iter,
+                         std::vector<const CowOperation*>* replace_zero_vec) {
+    const bool ordered_only = (replace_zero_vec == nullptr);
+    int budget = *pending_ops;
+
+    if (cowop_iter->Done() || !budget) {
+        return 0;
+    }
+
+    // First op of the run fixes the starting offset
+    const CowOperation* first_op = &cowop_iter->Get();
+    if (ordered_only && !IsOrderedOp(*first_op)) {
+        return 0;
+    }
+
+    *source_offset = first_op->new_block * BLOCK_SZ;
+    if (!ordered_only) {
+        replace_zero_vec->push_back(first_op);
+    }
+
+    cowop_iter->Next();
+    budget -= 1;
+    int run_length = 1;
+
+    // Extend the run while the following ops land on consecutive blocks
+    for (; !cowop_iter->Done() && budget; cowop_iter->Next()) {
+        const CowOperation* candidate = &cowop_iter->Get();
+        if (ordered_only && !IsOrderedOp(*candidate)) {
+            break;
+        }
+
+        uint64_t candidate_offset = candidate->new_block * BLOCK_SZ;
+        if (candidate_offset != (*source_offset + run_length * BLOCK_SZ)) {
+            break;
+        }
+
+        if (!ordered_only) {
+            replace_zero_vec->push_back(candidate);
+        }
+
+        run_length += 1;
+        budget -= 1;
+    }
+
+    return run_length;
+}
+
+// Merges the REPLACE and ZERO ops that remain in |cowop_iter| onto the base
+// device.
+//
+// Ops are taken in runs of consecutive destination blocks (at most
+// PAYLOAD_SIZE / BLOCK_SZ ops per run), staged in bufsink_ and written with a
+// single pwrite(). Returns false on an I/O or commit failure, or when the
+// handler reports that IO has been terminated.
+bool Worker::MergeReplaceZeroOps(const std::unique_ptr<ICowOpIter>& cowop_iter) {
+    // Flush every 2048 ops. Since all ops are independent and there is no
+    // dependency between COW ops, we will flush the data and the number
+    // of ops merged in COW file for every 2048 ops. If there is a crash,
+    // we will end up replaying some of the COW ops which were already merged.
+    // That is ok.
+    //
+    // Why 2048 ops ? We can probably increase this to bigger value but just
+    // need to ensure that merge makes forward progress if there are
+    // crashes repeatedly which is highly unlikely.
+    int total_ops_merged_per_commit = (PAYLOAD_SIZE / BLOCK_SZ) * 8;
+    int num_ops_merged = 0;
+
+    // Flushes the merged data to the base device, commits the number of ops
+    // merged since the last commit and restarts the count.
+    auto flush_and_commit = [&]() -> bool {
+        // Flush the data
+        if (fsync(base_path_merge_fd_.get()) < 0) {
+            SNAP_LOG(ERROR) << "Merge: ReplaceZeroOps: Failed to fsync merged data";
+            return false;
+        }
+
+        // Track the merge completion
+        if (!snapuserd_->CommitMerge(num_ops_merged)) {
+            SNAP_LOG(ERROR) << " Failed to commit the merged block in the header";
+            return false;
+        }
+
+        num_ops_merged = 0;
+        return true;
+    };
+
+    while (!cowop_iter->Done()) {
+        int num_ops = PAYLOAD_SIZE / BLOCK_SZ;
+        std::vector<const CowOperation*> replace_zero_vec;
+        uint64_t source_offset = 0;
+
+        int linear_blocks = PrepareMerge(&source_offset, &num_ops, cowop_iter, &replace_zero_vec);
+        if (linear_blocks == 0) {
+            // Merge complete
+            CHECK(cowop_iter->Done());
+            break;
+        }
+
+        // Stage the data of every op of the run back to back in the payload
+        for (const CowOperation* cow_op : replace_zero_vec) {
+            if (cow_op->type == kCowReplaceOp) {
+                if (!ProcessReplaceOp(cow_op)) {
+                    SNAP_LOG(ERROR) << "Merge - ReplaceOp failed for block: " << cow_op->new_block;
+                    return false;
+                }
+            } else {
+                CHECK(cow_op->type == kCowZeroOp);
+                if (!ProcessZeroOp()) {
+                    SNAP_LOG(ERROR) << "Merge ZeroOp failed.";
+                    return false;
+                }
+            }
+
+            bufsink_.UpdateBufferOffset(BLOCK_SZ);
+        }
+
+        size_t io_size = linear_blocks * BLOCK_SZ;
+
+        // Merge - Write the contents back to base device. pwrite() returns
+        // ssize_t; keep the full width and treat a short write as a failure.
+        ssize_t ret = pwrite(base_path_merge_fd_.get(), bufsink_.GetPayloadBufPtr(), io_size,
+                             source_offset);
+        if (ret < 0 || static_cast<size_t>(ret) != io_size) {
+            SNAP_LOG(ERROR)
+                    << "Merge: ReplaceZeroOps: Failed to write to backing device while merging "
+                    << " at offset: " << source_offset << " io_size: " << io_size;
+            return false;
+        }
+
+        num_ops_merged += linear_blocks;
+
+        // The count grows by a variable run length and may step over the
+        // threshold, so compare with >= rather than ==; with == a skipped
+        // threshold would disable the periodic commit for the rest of the
+        // merge.
+        if (num_ops_merged >= total_ops_merged_per_commit) {
+            if (!flush_and_commit()) {
+                return false;
+            }
+        }
+
+        bufsink_.ResetBufferOffset();
+
+        if (snapuserd_->IsIOTerminated()) {
+            SNAP_LOG(ERROR)
+                    << "MergeReplaceZeroOps: Worker threads terminated - shutting down merge";
+            return false;
+        }
+    }
+
+    // Any left over ops not flushed yet.
+    if (num_ops_merged) {
+        if (!flush_and_commit()) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+// Merges the ordered ops at the front of |cowop_iter| in lock step with the
+// read-ahead (RA) thread.
+//
+// For each window the function waits until the RA thread signals that the
+// window is ready, writes the window from the read-ahead buffer to the base
+// device, flushes it, commits the merged block count and finally notifies the
+// RA thread. The loop ends at the first op that is not an ordered op.
+// Returns false when waiting, writing, flushing or committing fails.
+bool Worker::MergeOrderedOps(const std::unique_ptr<ICowOpIter>& cowop_iter) {
+    void* mapped_addr = snapuserd_->GetMappedAddr();
+    // Start of the read-ahead data inside the mapped region
+    void* read_ahead_buffer =
+            static_cast<void*>((char*)mapped_addr + snapuserd_->GetBufferDataOffset());
+
+    SNAP_LOG(INFO) << "MergeOrderedOps started....";
+
+    while (!cowop_iter->Done()) {
+        const CowOperation* cow_op = &cowop_iter->Get();
+        if (!IsOrderedOp(*cow_op)) {
+            break;
+        }
+
+        SNAP_LOG(DEBUG) << "Waiting for merge begin...";
+        // Wait for RA thread to notify that the merge window
+        // is ready for merging.
+        if (!snapuserd_->WaitForMergeBegin()) {
+            return false;
+        }
+
+        // Read position inside the read-ahead buffer for this window
+        loff_t offset = 0;
+        int num_ops = snapuserd_->GetTotalBlocksToMerge();
+        SNAP_LOG(DEBUG) << "Merging copy-ops of size: " << num_ops;
+        while (num_ops) {
+            uint64_t source_offset;
+
+            // Next run of ops with consecutive destination blocks; written
+            // out below with a single pwrite()
+            int linear_blocks = PrepareMerge(&source_offset, &num_ops, cowop_iter);
+            if (linear_blocks == 0) {
+                break;
+            }
+
+            size_t io_size = (linear_blocks * BLOCK_SZ);
+            // Write to the base device. Data is already in the RA buffer. Note
+            // that XOR ops is already handled by the RA thread. We just write
+            // the contents out.
+            int ret = pwrite(base_path_merge_fd_.get(), (char*)read_ahead_buffer + offset, io_size,
+                             source_offset);
+            if (ret < 0 || ret != io_size) {
+                SNAP_LOG(ERROR) << "Failed to write to backing device while merging "
+                                << " at offset: " << source_offset << " io_size: " << io_size;
+                return false;
+            }
+
+            offset += io_size;
+            // PrepareMerge() does not update num_ops; account for the run here
+            num_ops -= linear_blocks;
+        }
+
+        // Verify all ops are merged
+        CHECK(num_ops == 0);
+
+        // Flush the data
+        if (fsync(base_path_merge_fd_.get()) < 0) {
+            SNAP_LOG(ERROR) << " Failed to fsync merged data";
+            return false;
+        }
+
+        // Merge is done and data is on disk. Update the COW Header about
+        // the merge completion
+        if (!snapuserd_->CommitMerge(snapuserd_->GetTotalBlocksToMerge())) {
+            SNAP_LOG(ERROR) << " Failed to commit the merged block in the header";
+            return false;
+        }
+
+        SNAP_LOG(DEBUG) << "Block commit of size: " << snapuserd_->GetTotalBlocksToMerge();
+
+        // Notify RA thread that the merge thread is ready to merge the next
+        // window
+        snapuserd_->NotifyRAForMergeReady();
+    }
+
+    return true;
+}
+
+// Runs both merge phases over a single merge-op iterator: ordered ops first,
+// then replace/zero ops. The handler is told about failure as soon as a phase
+// fails, and about completion once both phases succeeded.
+bool Worker::Merge() {
+    std::unique_ptr<ICowOpIter> merge_iter = reader_->GetMergeOpIter();
+
+    // Phase 1: Copy and Xor ops
+    const bool ordered_ok = MergeOrderedOps(merge_iter);
+    if (!ordered_ok) {
+        SNAP_LOG(ERROR) << "Merge failed for ordered ops";
+        snapuserd_->MergeFailed();
+        return false;
+    }
+
+    SNAP_LOG(INFO) << "MergeOrderedOps completed...";
+
+    // Phase 2: Replace and Zero ops
+    const bool replace_zero_ok = MergeReplaceZeroOps(merge_iter);
+    if (!replace_zero_ok) {
+        SNAP_LOG(ERROR) << "Merge failed for replace/zero ops";
+        snapuserd_->MergeFailed();
+        return false;
+    }
+
+    snapuserd_->MergeCompleted();
+    return true;
+}
+
+// Merge thread body. Blocks until the merge is allowed to begin, initializes
+// the worker and runs the merge. Returns true when the wait is abandoned
+// early or the merge finished; false when initialization or the merge failed.
+bool Worker::RunMergeThread() {
+    SNAP_LOG(DEBUG) << "Waiting for merge begin...";
+    const bool merge_may_begin = snapuserd_->WaitForMergeBegin();
+    if (!merge_may_begin) {
+        SNAP_LOG(ERROR) << "Merge terminated early...";
+        return true;
+    }
+
+    SNAP_LOG(INFO) << "Merge starting..";
+
+    const bool initialized = Init();
+    if (!initialized) {
+        SNAP_LOG(ERROR) << "Merge thread initialization failed...";
+        return false;
+    }
+
+    const bool merged = Merge();
+    if (!merged) {
+        return false;
+    }
+
+    // Release the descriptors once the merge is done
+    CloseFds();
+    reader_->CloseCowFd();
+
+    SNAP_LOG(INFO) << "Merge finish";
+    return true;
+}
+
+}  // namespace snapshot
+}  // namespace android

diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_readahead.cpp b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_readahead.cpp
new file mode 100644
index 0000000..319755b
--- /dev/null
+++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_readahead.cpp

@@ -0,0 +1,424 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "snapuserd_core.h"
+
+namespace android {
+namespace snapshot {
+
+using namespace android;
+using namespace android::dm;
+using android::base::unique_fd;
+
+// Records the device paths and the owning handler. No file is opened here;
+// descriptors are acquired later by InitializeFds().
+ReadAhead::ReadAhead(const std::string& cow_device, const std::string& backing_device,
+                     const std::string& misc_name, std::shared_ptr<SnapshotHandler> snapuserd) {
+    // Owning handler and the name used as log prefix
+    snapuserd_ = snapuserd;
+    misc_name_ = misc_name;
+
+    // Device paths
+    cow_device_ = cow_device;
+    backing_store_device_ = backing_device;
+}
+
+// Sets overlap_ when |cow_op| conflicts with an op seen earlier in the
+// current window: its destination block was read by an earlier op, or one of
+// the blocks it reads was written by an earlier op.
+//
+// Note on the bookkeeping sets: dest_blocks_ collects the blocks that ops
+// read from and source_blocks_ collects the blocks that ops write to.
+void ReadAhead::CheckOverlap(const CowOperation* cow_op) {
+    // For XOR ops |source| is a byte offset and may start in the middle of a
+    // block, in which case the read also touches the following block.
+    const bool is_xor = (cow_op->type == kCowXorOp);
+    const uint64_t source_block = is_xor ? (cow_op->source / BLOCK_SZ) : cow_op->source;
+    const uint64_t source_offset = is_xor ? (cow_op->source % BLOCK_SZ) : 0;
+    const bool spans_next_block = (source_offset > 0);
+
+    const bool dest_was_read = dest_blocks_.count(cow_op->new_block) != 0;
+    const bool source_was_written =
+            source_blocks_.count(source_block) != 0 ||
+            (spans_next_block && source_blocks_.count(source_block + 1) != 0);
+    if (dest_was_read || source_was_written) {
+        overlap_ = true;
+    }
+
+    dest_blocks_.insert(source_block);
+    if (spans_next_block) {
+        dest_blocks_.insert(source_block + 1);
+    }
+    source_blocks_.insert(cow_op->new_block);
+}
+
+// Consumes from the read-ahead iterator the longest run of ordered ops whose
+// source data is contiguous on the backing device, limited to |*pending_ops|
+// ops.
+//
+// |source_offset| receives the byte offset at which the run starts.
+// |pending_ops| is only read; the caller does its own accounting.
+// |blocks| gets the destination block (new_block) of every consumed op, in
+// order. |xor_op_vec| gets every consumed XOR op, in order.
+//
+// Returns the number of ops in the run, 0 when there is nothing to read.
+int ReadAhead::PrepareNextReadAhead(uint64_t* source_offset, int* pending_ops,
+                                    std::vector<uint64_t>& blocks,
+                                    std::vector<const CowOperation*>& xor_op_vec) {
+    int num_ops = *pending_ops;
+    int nr_consecutive = 0;
+
+    bool is_ops_present = (!RAIterDone() && num_ops);
+
+    if (!is_ops_present) {
+        return nr_consecutive;
+    }
+
+    // Get the first block with offset. A COPY op stores its source as a
+    // block number, which is converted to a byte offset here; the source of
+    // an XOR op is used as a byte offset as-is.
+    const CowOperation* cow_op = GetRAOpIter();
+    *source_offset = cow_op->source;
+
+    if (cow_op->type == kCowCopyOp) {
+        *source_offset *= BLOCK_SZ;
+    } else if (cow_op->type == kCowXorOp) {
+        xor_op_vec.push_back(cow_op);
+    }
+
+    RAIterNext();
+    num_ops -= 1;
+    nr_consecutive = 1;
+    blocks.push_back(cow_op->new_block);
+
+    if (!overlap_) {
+        CheckOverlap(cow_op);
+    }
+
+    /*
+     * Find number of consecutive blocks
+     */
+    while (!RAIterDone() && num_ops) {
+        const CowOperation* op = GetRAOpIter();
+        uint64_t next_offset = op->source;
+
+        // The unit of |source| depends on the type of *this* op, not on the
+        // type of the first op of the run. Testing the first op's type would
+        // compute a wrong offset whenever COPY and XOR ops are mixed.
+        if (op->type == kCowCopyOp) {
+            next_offset *= BLOCK_SZ;
+        }
+
+        // Check for consecutive blocks
+        if (next_offset != (*source_offset + nr_consecutive * BLOCK_SZ)) {
+            break;
+        }
+
+        if (op->type == kCowXorOp) {
+            xor_op_vec.push_back(op);
+        }
+
+        nr_consecutive += 1;
+        num_ops -= 1;
+        blocks.push_back(op->new_block);
+        RAIterNext();
+
+        if (!overlap_) {
+            CheckOverlap(op);
+        }
+    }
+
+    return nr_consecutive;
+}
+
+// Rebuilds the read-ahead state from the scratch-space metadata instead of
+// reading from the backing device.
+//
+// The metadata entries are walked up to the all-zero terminator entry, the
+// read-ahead iterator is advanced past every op whose destination block is
+// covered by that metadata, and the handler is told how many blocks the
+// window holds. Returns false when an op that is not covered is reached
+// before all recorded entries were matched, or when the handler rejects the
+// completion.
+bool ReadAhead::ReconstructDataFromCow() {
+    std::unordered_map<uint64_t, void*> read_ahead_buffer_map;
+    loff_t metadata_offset = 0;
+    loff_t start_data_offset = snapuserd_->GetBufferDataOffset();
+    int num_ops = 0;
+    int total_blocks_merged = 0;
+
+    // NOTE(review): this loop relies on the terminator entry being present;
+    // there is no bound on metadata_offset.
+    while (true) {
+        struct ScratchMetadata* bm = reinterpret_cast<struct ScratchMetadata*>(
+                (char*)metadata_buffer_ + metadata_offset);
+
+        // Done reading metadata
+        if (bm->new_block == 0 && bm->file_offset == 0) {
+            break;
+        }
+
+        // file_offset is relative to the COW file; convert it to an offset
+        // inside the read-ahead data buffer
+        loff_t buffer_offset = bm->file_offset - start_data_offset;
+        void* bufptr = static_cast<void*>((char*)read_ahead_buffer_ + buffer_offset);
+        read_ahead_buffer_map[bm->new_block] = bufptr;
+        num_ops += 1;
+        total_blocks_merged += 1;
+
+        metadata_offset += sizeof(struct ScratchMetadata);
+    }
+
+    // We are done re-constructing the mapping; however, we need to make sure
+    // all the COW operations to-be merged are present in the re-constructed
+    // mapping.
+    while (!RAIterDone()) {
+        const CowOperation* op = GetRAOpIter();
+        if (read_ahead_buffer_map.find(op->new_block) != read_ahead_buffer_map.end()) {
+            num_ops -= 1;
+            RAIterNext();
+            continue;
+        }
+
+        // Verify that we have covered all the ops which were re-constructed
+        // from COW device - These are the ops which are being
+        // re-constructed after crash.
+        if (!(num_ops == 0)) {
+            SNAP_LOG(ERROR) << "ReconstructDataFromCow failed. Not all ops recoverd "
+                            << " Pending ops: " << num_ops;
+            snapuserd_->ReadAheadIOFailed();
+            return false;
+        }
+
+        break;
+    }
+
+    snapuserd_->SetMergedBlockCountForNextCommit(total_blocks_merged);
+
+    snapuserd_->FinishReconstructDataFromCow();
+
+    if (!snapuserd_->ReadAheadIOCompleted(true)) {
+        SNAP_LOG(ERROR) << "ReadAheadIOCompleted failed...";
+        snapuserd_->ReadAheadIOFailed();
+        return false;
+    }
+
+    SNAP_LOG(INFO) << "ReconstructDataFromCow success";
+    return true;
+}
+
+// Prepares one read-ahead window of ordered ops for the merge thread.
+//
+// Steps:
+//   1. If the handler asks for it, rebuild the window from the scratch space
+//      (ReconstructDataFromCow) and return.
+//   2. Read the source data of as many ordered ops as fit in the data buffer
+//      from the backing device into a temporary buffer.
+//   3. For XOR ops, combine the data read from the backing device with the
+//      XOR data from the COW device, and build the metadata entries
+//      (destination block, file offset) in a temporary metadata buffer.
+//   4. Wait until the merge of the previous window is done, then copy both
+//      temporary buffers into the scratch space and hand the window over.
+//
+// Returns false on any read failure, when waiting is aborted or when the
+// handler rejects the completion.
+bool ReadAhead::ReadAheadIOStart() {
+    // Check if the data has to be constructed from the COW file.
+    // This will be true only once during boot up after a crash
+    // during merge.
+    if (snapuserd_->ShouldReconstructDataFromCow()) {
+        return ReconstructDataFromCow();
+    }
+
+    // Destination block of every op of this window, in iteration order
+    std::vector<uint64_t> blocks;
+
+    int num_ops = (snapuserd_->GetBufferDataSize()) / BLOCK_SZ;
+    loff_t buffer_offset = 0;
+    int total_blocks_merged = 0;
+    // Overlap tracking is per window
+    overlap_ = false;
+    dest_blocks_.clear();
+    source_blocks_.clear();
+    std::vector<const CowOperation*> xor_op_vec;
+
+    // Data is staged here first; the scratch space is only touched after the
+    // previous window has been merged (see WaitForMergeReady() below).
+    auto ra_temp_buffer = std::make_unique<uint8_t[]>(snapuserd_->GetBufferDataSize());
+
+    // Number of ops to be merged in this window. This is a fixed size
+    // except for the last window wherein the number of ops can be less
+    // than the size of the RA window.
+    while (num_ops) {
+        uint64_t source_offset;
+
+        int linear_blocks = PrepareNextReadAhead(&source_offset, &num_ops, blocks, xor_op_vec);
+        if (linear_blocks == 0) {
+            // No more blocks to read
+            SNAP_LOG(DEBUG) << " Read-ahead completed....";
+            break;
+        }
+
+        size_t io_size = (linear_blocks * BLOCK_SZ);
+
+        // Read from the base device consecutive set of blocks in one shot
+        if (!android::base::ReadFullyAtOffset(backing_store_fd_,
+                                              (char*)ra_temp_buffer.get() + buffer_offset, io_size,
+                                              source_offset)) {
+            SNAP_PLOG(ERROR) << "Ordered-op failed. Read from backing store: "
+                             << backing_store_device_ << "at block :" << source_offset / BLOCK_SZ
+                             << " offset :" << source_offset % BLOCK_SZ
+                             << " buffer_offset : " << buffer_offset << " io_size : " << io_size
+                             << " buf-addr : " << read_ahead_buffer_;
+
+            snapuserd_->ReadAheadIOFailed();
+            return false;
+        }
+
+        buffer_offset += io_size;
+        total_blocks_merged += linear_blocks;
+        // PrepareNextReadAhead() does not update num_ops; account for the
+        // run here
+        num_ops -= linear_blocks;
+    }
+
+    // Done with merging ordered ops
+    if (RAIterDone() && total_blocks_merged == 0) {
+        return true;
+    }
+
+    loff_t metadata_offset = 0;
+
+    auto ra_temp_meta_buffer = std::make_unique<uint8_t[]>(snapuserd_->GetBufferMetadataSize());
+
+    struct ScratchMetadata* bm = reinterpret_cast<struct ScratchMetadata*>(
+            (char*)ra_temp_meta_buffer.get() + metadata_offset);
+
+    bm->new_block = 0;
+    bm->file_offset = 0;
+
+    // Offset in the COW file at which the first data block of the window
+    // is stored
+    loff_t file_offset = snapuserd_->GetBufferDataOffset();
+
+    loff_t offset = 0;
+    CHECK(blocks.size() == total_blocks_merged);
+
+    // xor_op_vec is in the same order as blocks, so a single cursor is
+    // enough to match XOR ops against the block list
+    size_t xor_index = 0;
+    for (size_t block_index = 0; block_index < blocks.size(); block_index++) {
+        void* bufptr = static_cast<void*>((char*)ra_temp_buffer.get() + offset);
+        uint64_t new_block = blocks[block_index];
+
+        if (xor_index < xor_op_vec.size()) {
+            const CowOperation* xor_op = xor_op_vec[xor_index];
+
+            // Check if this block is an XOR op
+            if (xor_op->new_block == new_block) {
+                // Read the xor'ed data from COW
+                if (!reader_->ReadData(*xor_op, &bufsink_)) {
+                    SNAP_LOG(ERROR)
+                            << " ReadAhead - XorOp Read failed for block: " << xor_op->new_block;
+                    snapuserd_->ReadAheadIOFailed();
+                    return false;
+                }
+
+                // Pointer to the data read from base device
+                uint8_t* buffer = reinterpret_cast<uint8_t*>(bufptr);
+                // Get the xor'ed data read from COW device
+                uint8_t* xor_data = reinterpret_cast<uint8_t*>(bufsink_.GetPayloadBufPtr());
+
+                // Retrieve the original data
+                for (size_t byte_offset = 0; byte_offset < BLOCK_SZ; byte_offset++) {
+                    buffer[byte_offset] ^= xor_data[byte_offset];
+                }
+
+                // Move to next XOR op
+                xor_index += 1;
+            }
+        }
+
+        offset += BLOCK_SZ;
+        // Track the metadata blocks which are stored in scratch space
+        bm = reinterpret_cast<struct ScratchMetadata*>((char*)ra_temp_meta_buffer.get() +
+                                                       metadata_offset);
+
+        bm->new_block = new_block;
+        bm->file_offset = file_offset;
+
+        metadata_offset += sizeof(struct ScratchMetadata);
+        file_offset += BLOCK_SZ;
+    }
+
+    // Verify if all the xor blocks were scanned to retrieve the original data
+    CHECK(xor_index == xor_op_vec.size());
+
+    // This is important - explicitly set the contents to zero. This is used
+    // when re-constructing the data after crash. This indicates end of
+    // reading metadata contents when re-constructing the data
+    bm = reinterpret_cast<struct ScratchMetadata*>((char*)ra_temp_meta_buffer.get() +
+                                                   metadata_offset);
+    bm->new_block = 0;
+    bm->file_offset = 0;
+
+    // Wait for the merge to finish for the previous RA window. We shouldn't
+    // be touching the scratch space until merge is complete of previous RA
+    // window. If there is a crash during this time frame, merge should resume
+    // based on the contents of the scratch space.
+    if (!snapuserd_->WaitForMergeReady()) {
+        return false;
+    }
+
+    // Copy the data to scratch space
+    memcpy(metadata_buffer_, ra_temp_meta_buffer.get(), snapuserd_->GetBufferMetadataSize());
+    memcpy(read_ahead_buffer_, ra_temp_buffer.get(), total_blocks_merged * BLOCK_SZ);
+
+    snapuserd_->SetMergedBlockCountForNextCommit(total_blocks_merged);
+
+    // Flush the data only if we have a overlapping blocks in the region
+    // Notify the Merge thread to resume merging this window
+    if (!snapuserd_->ReadAheadIOCompleted(overlap_)) {
+        SNAP_LOG(ERROR) << "ReadAheadIOCompleted failed...";
+        snapuserd_->ReadAheadIOFailed();
+        return false;
+    }
+
+    return true;
+}
+
+// Read-ahead thread body. Opens the devices, sets up the buffers, the COW
+// reader and the op iterator, then produces read-ahead windows until the
+// ordered ops are exhausted or a window fails. Returns false only when the
+// devices cannot be opened or the reader cannot be initialized.
+bool ReadAhead::RunThread() {
+    const bool fds_ready = InitializeFds();
+    if (!fds_ready) {
+        return false;
+    }
+
+    InitializeBuffer();
+
+    const bool reader_ready = InitReader();
+    if (!reader_ready) {
+        return false;
+    }
+
+    InitializeRAIter();
+
+    // One iteration per read-ahead window
+    while (!RAIterDone() && ReadAheadIOStart()) {
+    }
+
+    CloseFds();
+    reader_->CloseCowFd();
+    SNAP_LOG(INFO) << " ReadAhead thread terminating....";
+    return true;
+}
+
+// Initialization: opens the backing store read-only and the COW device
+// read-write. Stops at the first failure, logs it and returns false.
+bool ReadAhead::InitializeFds() {
+    // Source device
+    backing_store_fd_.reset(open(backing_store_device_.c_str(), O_RDONLY));
+    if (backing_store_fd_.get() < 0) {
+        SNAP_PLOG(ERROR) << "Open Failed: " << backing_store_device_;
+        return false;
+    }
+
+    // COW device
+    cow_fd_.reset(open(cow_device_.c_str(), O_RDWR));
+    if (cow_fd_.get() < 0) {
+        SNAP_PLOG(ERROR) << "Open Failed: " << cow_device_;
+        return false;
+    }
+
+    return true;
+}
+
+// Obtains a private clone of the COW reader from the handler and initializes
+// it for merge, handing over ownership of cow_fd_.
+bool ReadAhead::InitReader() {
+    reader_ = snapuserd_->CloneReaderForWorker();
+    return reader_->InitForMerge(std::move(cow_fd_));
+}
+
+// Obtains the merge-op iterator from the reader; all RAIter* helpers operate
+// on this iterator.
+void ReadAhead::InitializeRAIter() {
+    cowop_iter_ = reader_->GetMergeOpIter();
+}
+
+// Read-ahead is finished when the iterator is exhausted or when the current
+// op is no longer an ordered op.
+bool ReadAhead::RAIterDone() {
+    if (cowop_iter_->Done()) {
+        return true;
+    }
+
+    const CowOperation* current = GetRAOpIter();
+    return !IsOrderedOp(*current);
+}
+
+// Advances the read-ahead iterator to the next COW operation.
+void ReadAhead::RAIterNext() {
+    cowop_iter_->Next();
+}
+
+// Returns the COW operation the read-ahead iterator currently points at.
+const CowOperation* ReadAhead::GetRAOpIter() {
+    return &cowop_iter_->Get();
+}
+
+// Sets up the pointers into the mapped scratch space (metadata region and
+// read-ahead data region) and allocates the sink used for XOR ops.
+void ReadAhead::InitializeBuffer() {
+    void* mapped_addr = snapuserd_->GetMappedAddr();
+    char* scratch_base = (char*)mapped_addr;
+
+    // Locate both regions inside the mapped scratch space
+    metadata_buffer_ = static_cast<void*>(scratch_base + snapuserd_->GetBufferMetadataOffset());
+    read_ahead_buffer_ = static_cast<void*>(scratch_base + snapuserd_->GetBufferDataOffset());
+
+    // For xor ops
+    bufsink_.Initialize(PAYLOAD_SIZE);
+}
+
+}  // namespace snapshot
+}  // namespace android

diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_transitions.cpp b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_transitions.cpp
new file mode 100644
index 0000000..97418bd
--- /dev/null
+++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_transitions.cpp

@@ -0,0 +1,363 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "snapuserd_core.h"
+
+/*
+ * Readahead is used to optimize the merge of COPY and XOR Ops.
+ *
+ * We create a scratch space of 2MB to store the read-ahead data in the COW
+ * device.
+ *
+ *      +-----------------------+
+ *      |     Header (fixed)    |
+ *      +-----------------------+
+ *      |    Scratch space      |  <-- 2MB
+ *      +-----------------------+
+ *
+ *      Scratch space is as follows:
+ *
+ *      +-----------------------+
+ *      |       Metadata        | <- 4k page
+ *      +-----------------------+
+ *      |       Metadata        | <- 4k page
+ *      +-----------------------+
+ *      |                       |
+ *      |    Read-ahead data    |
+ *      |                       |
+ *      +-----------------------+
+ *
+ *
+ * * ===================================================================
+ *
+ * Example:
+ *
+ * We have 6 copy operations to be executed in OTA. Update-engine
+ * will write to COW file as follows:
+ *
+ * Op-1: 20 -> 23
+ * Op-2: 19 -> 22
+ * Op-3: 18 -> 21
+ * Op-4: 17 -> 20
+ * Op-5: 16 -> 19
+ * Op-6: 15 -> 18
+ *
+ * Read-ahead thread will read all the 6 source blocks and store the data in the
+ * scratch space. Metadata will contain the destination block numbers. Thus,
+ * scratch space will look something like this:
+ *
+ * +--------------+
+ * | Block   23   |
+ * | offset - 1   |
+ * +--------------+
+ * | Block   22   |
+ * | offset - 2   |
+ * +--------------+
+ * | Block   21   |
+ * | offset - 3   |
+ * +--------------+
+ *    ...
+ *    ...
+ * +--------------+
+ * | Data-Block 20| <-- offset - 1
+ * +--------------+
+ * | Data-Block 19| <-- offset - 2
+ * +--------------+
+ * | Data-Block 18| <-- offset - 3
+ * +--------------+
+ *     ...
+ *     ...
+ *
+ * ====================================================================
+ *
+ *
+ *  Read-ahead thread will process the COW Ops in fixed set. Consider
+ *  the following example:
+ *
+ *  +--------------------------+
+ *  |op-1|op-2|op-3|....|op-510|
+ *  +--------------------------+
+ *
+ *  <------ One RA Block ------>
+ *
+ *  RA thread will read 510 ordered COW ops at a time and will store
+ *  the data in the scratch space.
+ *
+ *  RA thread and Merge thread will go lock-step wherein RA thread
+ *  will make sure that 510 COW operation data are read upfront
+ *  and is in memory. Thus, the merge thread will pick up the data
+ *  directly from memory and write it back to the base device.
+ *
+ *
+ *  +--------------------------+------------------------------------+
+ *  |op-1|op-2|op-3|....|op-510|op-511|op-512|op-513........|op-1020|
+ *  +--------------------------+------------------------------------+
+ *
+ *  <------Merge 510 Blocks----><-Prepare 510 blocks for merge by RA->
+ *           ^                                  ^
+ *           |                                  |
+ *      Merge thread                        RA thread
+ *
+ * Both Merge and RA thread will strive to work in parallel.
+ *
+ * ===========================================================================
+ *
+ * State transitions and communication between RA thread and Merge thread:
+ *
+ *  Merge Thread                                      RA Thread
+ *  ----------------------------------------------------------------------------
+ *
+ *          |                                         |
+ *    WAIT for RA Block N                     READ one RA Block (N)
+ *        for merge                                   |
+ *          |                                         |
+ *          |                                         |
+ *          <--------------MERGE BEGIN--------READ Block N done(copy to scratch)
+ *          |                                         |
+ *          |                                         |
+ *    Merge Begin Block N                     READ one RA BLock (N+1)
+ *          |                                         |
+ *          |                                         |
+ *          |                                  READ done. Wait for merge complete
+ *          |                                         |
+ *          |                                        WAIT
+ *          |                                         |
+ *    Merge done Block N                              |
+ *          ----------------MERGE READY-------------->|
+ *    WAIT for RA Block N+1                     Copy RA Block (N+1)
+ *        for merge                              to scratch space
+ *          |                                         |
+ *          <---------------MERGE BEGIN---------BLOCK N+1 Done
+ *          |                                         |
+ *          |                                         |
+ *    Merge Begin Block N+1                   READ one RA BLock (N+2)
+ *          |                                         |
+ *          |                                         |
+ *          |                                  READ done. Wait for merge complete
+ *          |                                         |
+ *          |                                        WAIT
+ *          |                                         |
+ *    Merge done Block N+1                            |
+ *          ----------------MERGE READY-------------->|
+ *    WAIT for RA Block N+2                     Copy RA Block (N+2)
+ *        for merge                              to scratch space
+ *          |                                         |
+ *          <---------------MERGE BEGIN---------BLOCK N+2 Done
+ */
+
+namespace android {
+namespace snapshot {
+
+using namespace android;
+using namespace android::dm;
+using android::base::unique_fd;
+
+// Invoked once, primarily by update-engine, to initiate the merge: records
+// that the merge has been initiated and wakes every thread waiting on cv.
+void SnapshotHandler::InitiateMerge() {
+    std::unique_lock<std::mutex> guard(lock_);
+    merge_initiated_ = true;
+
+    // With only REPLACE ops to merge there is no read-ahead thread, so
+    // nobody else would move the state to MERGE_BEGIN; set it explicitly.
+    if (!ra_thread_) {
+        io_state_ = MERGE_IO_TRANSITION::MERGE_BEGIN;
+    }
+
+    // Release the lock before notifying the waiters.
+    guard.unlock();
+    cv.notify_all();
+}
+
+// Invoked by the Merge thread - blocks until the merge has been initiated
+// and the state reaches MERGE_BEGIN. The thread is woken up by the RA thread
+// (ReadAheadIOCompleted()), or by InitiateMerge() when there is no RA thread.
+//
+// Returns true when merging may start, false if the state is
+// READ_AHEAD_FAILURE or IO_TERMINATED.
+bool SnapshotHandler::WaitForMergeBegin() {
+    std::unique_lock<std::mutex> lock(lock_);
+
+    const auto aborted = [this]() {
+        return io_state_ == MERGE_IO_TRANSITION::READ_AHEAD_FAILURE ||
+               io_state_ == MERGE_IO_TRANSITION::IO_TERMINATED;
+    };
+
+    // The abort states are part of the predicate so that they are examined
+    // _before_ the first wait. If the failure was signalled before this
+    // thread got here, that notification is already gone; waiting for it
+    // unconditionally would block forever when the merge is never initiated.
+    cv.wait(lock, [&]() { return MergeInitiated() || aborted(); });
+    if (aborted()) {
+        return false;
+    }
+
+    // Merge has been initiated; now wait for the go-ahead (or an abort).
+    cv.wait(lock, [&]() {
+        return io_state_ == MERGE_IO_TRANSITION::MERGE_BEGIN || aborted();
+    });
+
+    return !aborted();
+}
+
+// Invoked by the RA thread once an RA block is ready. When |sync| is set the
+// block is first flushed to the scratch space; afterwards the merge thread
+// is notified to resume merging.
+//
+// Returns false if either msync() fails (no notification is sent in that
+// case), true otherwise.
+bool SnapshotHandler::ReadAheadIOCompleted(bool sync) {
+    if (sync) {
+        // Step 1: flush the entire buffer region.
+        const int flush_rc = msync(mapped_addr_, total_mapped_addr_length_, MS_SYNC);
+        if (flush_rc < 0) {
+            PLOG(ERROR) << "msync failed after ReadAheadIOCompleted: " << flush_rc;
+            return false;
+        }
+
+        // Step 2: metadata and data are synced, so the checkpoint can be
+        // recorded. The order matters: if there is a crash while read-ahead
+        // IO is still in progress, the state of the data in the COW file is
+        // unknown. kCowReadAheadDone marks the scratch space data as good,
+        // so that during the next reboot the read-ahead thread can safely
+        // re-construct the data.
+        GetBufferState()->read_ahead_state = kCowReadAheadDone;
+
+        // Step 3: flush the first BLOCK_SZ bytes of the mapping to persist
+        // the state update (presumably where BufferState lives - confirm).
+        if (msync(mapped_addr_, BLOCK_SZ, MS_SYNC) < 0) {
+            PLOG(ERROR) << "msync failed to flush Readahead completion state...";
+            return false;
+        }
+    }
+
+    // Hand over to the merge thread, unless the state already records
+    // termination or a merge failure; those states are left untouched.
+    {
+        std::lock_guard<std::mutex> lock(lock_);
+        const bool stopped = io_state_ == MERGE_IO_TRANSITION::IO_TERMINATED ||
+                             io_state_ == MERGE_IO_TRANSITION::MERGE_FAILED;
+        if (!stopped) {
+            io_state_ = MERGE_IO_TRANSITION::MERGE_BEGIN;
+        }
+    }
+
+    cv.notify_all();
+    return true;
+}
+
+// Invoked by the RA thread - waits for the merge thread to finish merging
+// RA Block N. The RA thread may already have Block N+1 ready, but it has to
+// wait until Block N is merged; the merge thread then wakes it up so that it
+// can flush the data of Block N+1 to scratch space.
+//
+// Returns true only if the wait ended in MERGE_READY; false for
+// MERGE_FAILED, MERGE_COMPLETE or IO_TERMINATED.
+bool SnapshotHandler::WaitForMergeReady() {
+    std::unique_lock<std::mutex> lock(lock_);
+
+    cv.wait(lock, [this]() {
+        return io_state_ == MERGE_IO_TRANSITION::MERGE_READY ||
+               io_state_ == MERGE_IO_TRANSITION::MERGE_FAILED ||
+               io_state_ == MERGE_IO_TRANSITION::MERGE_COMPLETE ||
+               io_state_ == MERGE_IO_TRANSITION::IO_TERMINATED;
+    });
+
+    // Of the four states that end the wait, MERGE_READY is the only one
+    // that reports success.
+    return io_state_ == MERGE_IO_TRANSITION::MERGE_READY;
+}
+
+// Invoked by the Merge thread - announces that the merge of Block N is done
+// and wakes up the RA thread.
+void SnapshotHandler::NotifyRAForMergeReady() {
+    {
+        std::lock_guard<std::mutex> lock(lock_);
+
+        // A terminated or read-ahead-failed state is left untouched.
+        const bool stopped = io_state_ == MERGE_IO_TRANSITION::IO_TERMINATED ||
+                             io_state_ == MERGE_IO_TRANSITION::READ_AHEAD_FAILURE;
+        if (!stopped) {
+            io_state_ = MERGE_IO_TRANSITION::MERGE_READY;
+        }
+    }
+
+    cv.notify_all();
+}
+
+// The following transitions are mostly in the failure paths
+// Unconditionally sets the state to MERGE_FAILED and wakes all threads
+// waiting on cv.
+void SnapshotHandler::MergeFailed() {
+    std::unique_lock<std::mutex> guard(lock_);
+    io_state_ = MERGE_IO_TRANSITION::MERGE_FAILED;
+    guard.unlock();
+
+    // Notify after the lock has been released.
+    cv.notify_all();
+}
+
+// Unconditionally sets the state to MERGE_COMPLETE and wakes all threads
+// waiting on cv.
+void SnapshotHandler::MergeCompleted() {
+    std::unique_lock<std::mutex> guard(lock_);
+    io_state_ = MERGE_IO_TRANSITION::MERGE_COMPLETE;
+    guard.unlock();
+
+    // Notify after the lock has been released.
+    cv.notify_all();
+}
+
+// This is invoked by worker threads.
+//
+// Worker threads are terminated in one of two scenarios:
+//
+// 1: If dm-user device is destroyed
+// 2: We had an I/O failure when reading root partitions
+//
+// In case (1), this would be a graceful shutdown. In this case, merge
+// thread and RA thread should have _already_ terminated by this point. We will be
+// destroying the dm-user device only _after_ merge is completed.
+//
+// In case (2), if merge thread had started, then it will be
+// continuing to merge; however, since we had an I/O failure and the
+// I/O on root partitions are no longer served, we will terminate the
+// merge.
+//
+// This function handles case (2).
+void SnapshotHandler::NotifyIOTerminated() {
+    // Record the termination under the lock ...
+    std::unique_lock<std::mutex> guard(lock_);
+    io_state_ = MERGE_IO_TRANSITION::IO_TERMINATED;
+    guard.unlock();
+
+    // ... then wake every thread blocked on cv so it can observe the new
+    // state.
+    cv.notify_all();
+}
+
+// Returns true if the current state is IO_TERMINATED. The state is read
+// while holding lock_.
+bool SnapshotHandler::IsIOTerminated() {
+    std::lock_guard<std::mutex> lock(lock_);
+    const bool terminated = (io_state_ == MERGE_IO_TRANSITION::IO_TERMINATED);
+    return terminated;
+}
+
+// Invoked by the RA thread - unconditionally sets the state to
+// READ_AHEAD_FAILURE and wakes all threads waiting on cv.
+void SnapshotHandler::ReadAheadIOFailed() {
+    std::unique_lock<std::mutex> guard(lock_);
+    io_state_ = MERGE_IO_TRANSITION::READ_AHEAD_FAILURE;
+    guard.unlock();
+
+    // Notify after the lock has been released.
+    cv.notify_all();
+}
+
+// Blocks the caller until the state is MERGE_COMPLETE, MERGE_FAILED or
+// IO_TERMINATED. Returns immediately if one of them is already set; the
+// caller is not told which one ended the wait.
+void SnapshotHandler::WaitForMergeComplete() {
+    std::unique_lock<std::mutex> lock(lock_);
+    cv.wait(lock, [this]() {
+        return io_state_ == MERGE_IO_TRANSITION::MERGE_COMPLETE ||
+               io_state_ == MERGE_IO_TRANSITION::MERGE_FAILED ||
+               io_state_ == MERGE_IO_TRANSITION::IO_TERMINATED;
+    });
+}
+
+}  // namespace snapshot
+}  // namespace android
commit	228f6a099cea9e3030cedbace42772680f455ab1	[log] [tgz]
author	Akilesh Kailash <akailash@google.com>	Tue Aug 17 07:19:54 2021 +0000
committer	Akilesh Kailash <akailash@google.com>	Thu Oct 07 07:09:28 2021 +0000
tree	71cbad46e9ce8efca60d355cb4d100a0a3a9f475
parent	46e15bd18b737df683d315aea6a3fc0af48d49b1 [diff]