Merge "Add checking for sparse file format"
diff --git a/debuggerd/libdebuggerd/gwp_asan.cpp b/debuggerd/libdebuggerd/gwp_asan.cpp
index 3ee309f..ed2b974 100644
--- a/debuggerd/libdebuggerd/gwp_asan.cpp
+++ b/debuggerd/libdebuggerd/gwp_asan.cpp
@@ -147,7 +147,7 @@
for (size_t i = 0; i != num_frames; ++i) {
unwindstack::FrameData frame_data = unwinder->BuildFrameFromPcOnly(frames[i]);
BacktraceFrame* f = heap_object->add_allocation_backtrace();
- fill_in_backtrace_frame(f, frame_data, unwinder->GetMaps());
+ fill_in_backtrace_frame(f, frame_data);
}
heap_object->set_deallocation_tid(__gwp_asan_get_deallocation_thread_id(responsible_allocation_));
@@ -156,7 +156,7 @@
for (size_t i = 0; i != num_frames; ++i) {
unwindstack::FrameData frame_data = unwinder->BuildFrameFromPcOnly(frames[i]);
BacktraceFrame* f = heap_object->add_deallocation_backtrace();
- fill_in_backtrace_frame(f, frame_data, unwinder->GetMaps());
+ fill_in_backtrace_frame(f, frame_data);
}
set_human_readable_cause(cause, crash_address_);
diff --git a/debuggerd/libdebuggerd/include/libdebuggerd/tombstone.h b/debuggerd/libdebuggerd/include/libdebuggerd/tombstone.h
index 2331f1e..7bf1688 100644
--- a/debuggerd/libdebuggerd/include/libdebuggerd/tombstone.h
+++ b/debuggerd/libdebuggerd/include/libdebuggerd/tombstone.h
@@ -37,7 +37,6 @@
namespace unwindstack {
struct FrameData;
-class Maps;
class Unwinder;
}
@@ -68,8 +67,7 @@
const Tombstone& tombstone,
std::function<void(const std::string& line, bool should_log)> callback);
-void fill_in_backtrace_frame(BacktraceFrame* f, const unwindstack::FrameData& frame,
- unwindstack::Maps* maps);
+void fill_in_backtrace_frame(BacktraceFrame* f, const unwindstack::FrameData& frame);
void set_human_readable_cause(Cause* cause, uint64_t fault_addr);
#endif // _DEBUGGERD_TOMBSTONE_H
diff --git a/debuggerd/libdebuggerd/scudo.cpp b/debuggerd/libdebuggerd/scudo.cpp
index a89f385..a2933f2 100644
--- a/debuggerd/libdebuggerd/scudo.cpp
+++ b/debuggerd/libdebuggerd/scudo.cpp
@@ -108,7 +108,7 @@
for (size_t i = 0; i < arraysize(report->allocation_trace) && report->allocation_trace[i]; ++i) {
unwindstack::FrameData frame_data = unwinder->BuildFrameFromPcOnly(report->allocation_trace[i]);
BacktraceFrame* f = heap_object->add_allocation_backtrace();
- fill_in_backtrace_frame(f, frame_data, unwinder->GetMaps());
+ fill_in_backtrace_frame(f, frame_data);
}
heap_object->set_deallocation_tid(report->deallocation_tid);
@@ -117,7 +117,7 @@
unwindstack::FrameData frame_data =
unwinder->BuildFrameFromPcOnly(report->deallocation_trace[i]);
BacktraceFrame* f = heap_object->add_deallocation_backtrace();
- fill_in_backtrace_frame(f, frame_data, unwinder->GetMaps());
+ fill_in_backtrace_frame(f, frame_data);
}
set_human_readable_cause(cause, untagged_fault_addr_);
diff --git a/debuggerd/libdebuggerd/tombstone_proto.cpp b/debuggerd/libdebuggerd/tombstone_proto.cpp
index b1c4ef3..714d6b2 100644
--- a/debuggerd/libdebuggerd/tombstone_proto.cpp
+++ b/debuggerd/libdebuggerd/tombstone_proto.cpp
@@ -312,8 +312,7 @@
}
}
-void fill_in_backtrace_frame(BacktraceFrame* f, const unwindstack::FrameData& frame,
- unwindstack::Maps* maps) {
+void fill_in_backtrace_frame(BacktraceFrame* f, const unwindstack::FrameData& frame) {
f->set_rel_pc(frame.rel_pc);
f->set_pc(frame.pc);
f->set_sp(frame.sp);
@@ -331,21 +330,20 @@
f->set_function_offset(frame.function_offset);
- if (frame.map_start == frame.map_end) {
+ if (frame.map_info == nullptr) {
// No valid map associated with this frame.
f->set_file_name("<unknown>");
- } else if (!frame.map_name.empty()) {
- f->set_file_name(frame.map_name);
+ return;
+ }
+
+ if (!frame.map_info->name().empty()) {
+ f->set_file_name(frame.map_info->GetFullName());
} else {
- f->set_file_name(StringPrintf("<anonymous:%" PRIx64 ">", frame.map_start));
+ f->set_file_name(StringPrintf("<anonymous:%" PRIx64 ">", frame.map_info->start()));
}
+ f->set_file_map_offset(frame.map_info->elf_start_offset());
- f->set_file_map_offset(frame.map_elf_start_offset);
-
- auto map_info = maps->Find(frame.map_start);
- if (map_info.get() != nullptr) {
- f->set_build_id(map_info->GetPrintableBuildID());
- }
+ f->set_build_id(frame.map_info->GetPrintableBuildID());
}
static void dump_thread(Tombstone* tombstone, unwindstack::Unwinder* unwinder,
@@ -434,7 +432,7 @@
unwinder->SetDisplayBuildID(true);
for (const auto& frame : unwinder->frames()) {
BacktraceFrame* f = thread.add_current_backtrace();
- fill_in_backtrace_frame(f, frame, maps);
+ fill_in_backtrace_frame(f, frame);
}
}
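
With this change, the per-frame map is taken directly from unwindstack::FrameData::map_info instead of a separate lookup through unwindstack::Maps, which is why the maps parameter disappears from fill_in_backtrace_frame() and from its callers in gwp_asan.cpp and scudo.cpp. A minimal sketch of the new pattern — the helper name, output format, and header paths are illustrative assumptions, not part of the patch:

```cpp
#include <cinttypes>
#include <string>

#include <android-base/stringprintf.h>
#include <unwindstack/MapInfo.h>
#include <unwindstack/Unwinder.h>  // Assumed location of FrameData.

// Illustrative only: format a frame using the map info carried on FrameData,
// mirroring the logic of the updated fill_in_backtrace_frame().
std::string FrameToString(const unwindstack::FrameData& frame) {
  std::string file_name;
  if (frame.map_info == nullptr) {
    // No valid map associated with this frame.
    file_name = "<unknown>";
  } else if (!frame.map_info->name().empty()) {
    file_name = frame.map_info->GetFullName();
  } else {
    file_name = android::base::StringPrintf("<anonymous:%" PRIx64 ">", frame.map_info->start());
  }
  return android::base::StringPrintf("pc %" PRIx64 "  %s", frame.rel_pc, file_name.c_str());
}
```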
diff --git a/fs_mgr/libsnapshot/snapshot.cpp b/fs_mgr/libsnapshot/snapshot.cpp
index 18a9d22..f3de2b4 100644
--- a/fs_mgr/libsnapshot/snapshot.cpp
+++ b/fs_mgr/libsnapshot/snapshot.cpp
@@ -1467,6 +1467,14 @@
}
RemoveAllUpdateState(lock);
+
+ if (UpdateUsesUserSnapshots(lock) && !device()->IsTestDevice()) {
+ if (snapuserd_client_) {
+ snapuserd_client_->DetachSnapuserd();
+ snapuserd_client_->CloseConnection();
+ snapuserd_client_ = nullptr;
+ }
+ }
}
void SnapshotManager::AcknowledgeMergeFailure(MergeFailureCode failure_code) {
@@ -3200,7 +3208,7 @@
// Terminate stale daemon if any
std::unique_ptr<SnapuserdClient> snapuserd_client =
- SnapuserdClient::Connect(kSnapuserdSocket, 10s);
+ SnapuserdClient::Connect(kSnapuserdSocket, 5s);
if (snapuserd_client) {
snapuserd_client->DetachSnapuserd();
snapuserd_client->CloseConnection();
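
The new block above tears down the snapuserd connection once user-space snapshots finish merging, and the second hunk shortens the connect timeout used when terminating a stale daemon. A hedged sketch of that connect/detach/close sequence — the include path and helper name are assumptions; only the call sequence and the 5s timeout come from the patch:

```cpp
#include <chrono>
#include <memory>
#include <string>

// Include path assumed; adjust to wherever SnapuserdClient is declared.
#include <libsnapshot/snapuserd_client.h>

using namespace std::chrono_literals;

// Illustrative only: terminate a stale snapuserd daemon if one is listening.
void TerminateStaleSnapuserd(const std::string& socket_name) {
    std::unique_ptr<android::snapshot::SnapuserdClient> client =
            android::snapshot::SnapuserdClient::Connect(socket_name, 5s);
    if (client) {
        client->DetachSnapuserd();
        client->CloseConnection();
    }
}
```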
diff --git a/fs_mgr/libsnapshot/snapshot_test.cpp b/fs_mgr/libsnapshot/snapshot_test.cpp
index 11cebe1..d76558b 100644
--- a/fs_mgr/libsnapshot/snapshot_test.cpp
+++ b/fs_mgr/libsnapshot/snapshot_test.cpp
@@ -54,6 +54,8 @@
#include <libsnapshot/mock_snapshot.h>
DEFINE_string(force_config, "", "Force testing mode (dmsnap, vab, vabc) ignoring device config.");
+DEFINE_string(force_iouring_disable, "",
+ "Force testing mode (iouring_disabled) - disable io_uring");
namespace android {
namespace snapshot {
@@ -2769,10 +2771,22 @@
}
}
+ if (FLAGS_force_iouring_disable == "iouring_disabled") {
+ if (!android::base::SetProperty("snapuserd.test.io_uring.force_disable", "1")) {
+ return testing::AssertionFailure()
+ << "Failed to disable property: snapuserd.test.io_uring.disabled";
+ }
+ }
+
int ret = RUN_ALL_TESTS();
if (FLAGS_force_config == "dmsnap") {
android::base::SetProperty("snapuserd.test.dm.snapshots", "0");
}
+
+ if (FLAGS_force_iouring_disable == "iouring_disabled") {
+ android::base::SetProperty("snapuserd.test.io_uring.force_disable", "0");
+ }
+
return ret;
}
diff --git a/fs_mgr/libsnapshot/snapuserd/Android.bp b/fs_mgr/libsnapshot/snapuserd/Android.bp
index 84bcb94..bc2bceb 100644
--- a/fs_mgr/libsnapshot/snapuserd/Android.bp
+++ b/fs_mgr/libsnapshot/snapuserd/Android.bp
@@ -86,7 +86,9 @@
"libsnapshot_cow",
"libz",
"libext4_utils",
+ "liburing",
],
+ include_dirs: ["bionic/libc/kernel"],
}
cc_binary {
@@ -182,7 +184,10 @@
"libfs_mgr",
"libdm",
"libext4_utils",
+ "liburing",
+ "libgflags",
],
+ include_dirs: ["bionic/libc/kernel"],
header_libs: [
"libstorage_literals_headers",
"libfiemap_headers",
diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.cpp b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.cpp
index 95d95cd..5109d82 100644
--- a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.cpp
+++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.cpp
@@ -16,6 +16,10 @@
#include "snapuserd_core.h"
+#include <sys/utsname.h>
+
+#include <android-base/properties.h>
+#include <android-base/scopeguard.h>
#include <android-base/strings.h>
namespace android {
@@ -288,6 +292,136 @@
return ReadMetadata();
}
+void SnapshotHandler::FinalizeIouring() {
+ io_uring_queue_exit(ring_.get());
+}
+
+bool SnapshotHandler::InitializeIouring(int io_depth) {
+ ring_ = std::make_unique<struct io_uring>();
+
+ int ret = io_uring_queue_init(io_depth, ring_.get(), 0);
+ if (ret) {
+ LOG(ERROR) << "io_uring_queue_init failed with ret: " << ret;
+ return false;
+ }
+
+ LOG(INFO) << "io_uring_queue_init success with io_depth: " << io_depth;
+ return true;
+}
+
+bool SnapshotHandler::ReadBlocksAsync(const std::string& dm_block_device,
+ const std::string& partition_name, size_t size) {
+ // 64k block size with io_depth of 64 is optimal
+ // for a single thread. We just need a single thread
+ // to read all the blocks from all dynamic partitions.
+ size_t io_depth = 64;
+ size_t bs = (64 * 1024);
+
+ if (!InitializeIouring(io_depth)) {
+ return false;
+ }
+
+ LOG(INFO) << "ReadBlockAsync start "
+ << " Block-device: " << dm_block_device << " Partition-name: " << partition_name
+ << " Size: " << size;
+
+ auto scope_guard = android::base::make_scope_guard([this]() -> void { FinalizeIouring(); });
+
+ std::vector<std::unique_ptr<struct iovec>> vecs;
+ using AlignedBuf = std::unique_ptr<void, decltype(free)*>;
+ std::vector<AlignedBuf> alignedBufVector;
+
+ /*
+ * TODO: We need aligned memory for DIRECT-IO. However, if we do
+ * a DIRECT-IO and verify the blocks then we need to inform
+ * update-verifier that block verification has been done and
+ * there is no need to repeat the same. We are not there yet
+ * as we need to see if there are any boot time improvements doing
+ * a DIRECT-IO.
+ *
+ * Also, we could use the same function post-merge for block verification;
+ * again, we can do a DIRECT-IO instead of thrashing page-cache and
+ * hurting other applications.
+ *
+ * For now, we will just create aligned buffers but rely on buffered
+ * I/O until we have perf numbers to justify DIRECT-IO.
+ */
+ for (int i = 0; i < io_depth; i++) {
+ auto iovec = std::make_unique<struct iovec>();
+ vecs.push_back(std::move(iovec));
+
+ struct iovec* iovec_ptr = vecs[i].get();
+
+ if (posix_memalign(&iovec_ptr->iov_base, BLOCK_SZ, bs)) {
+ LOG(ERROR) << "posix_memalign failed";
+ return false;
+ }
+
+ iovec_ptr->iov_len = bs;
+ alignedBufVector.push_back(
+ std::unique_ptr<void, decltype(free)*>(iovec_ptr->iov_base, free));
+ }
+
+ android::base::unique_fd fd(TEMP_FAILURE_RETRY(open(dm_block_device.c_str(), O_RDONLY)));
+ if (fd.get() == -1) {
+ SNAP_PLOG(ERROR) << "File open failed - block-device " << dm_block_device
+ << " partition-name: " << partition_name;
+ return false;
+ }
+
+ loff_t offset = 0;
+ size_t remain = size;
+ size_t read_sz = io_depth * bs;
+
+ while (remain > 0) {
+ size_t to_read = std::min(remain, read_sz);
+ size_t queue_size = to_read / bs;
+
+ for (int i = 0; i < queue_size; i++) {
+ struct io_uring_sqe* sqe = io_uring_get_sqe(ring_.get());
+ if (!sqe) {
+ SNAP_LOG(ERROR) << "io_uring_get_sqe() failed";
+ return false;
+ }
+
+ struct iovec* iovec_ptr = vecs[i].get();
+
+ io_uring_prep_read(sqe, fd.get(), iovec_ptr->iov_base, iovec_ptr->iov_len, offset);
+ sqe->flags |= IOSQE_ASYNC;
+ offset += bs;
+ }
+
+ int ret = io_uring_submit(ring_.get());
+ if (ret != queue_size) {
+ SNAP_LOG(ERROR) << "submit got: " << ret << " wanted: " << queue_size;
+ return false;
+ }
+
+ for (int i = 0; i < queue_size; i++) {
+ struct io_uring_cqe* cqe;
+
+ int ret = io_uring_wait_cqe(ring_.get(), &cqe);
+ if (ret) {
+ SNAP_PLOG(ERROR) << "wait_cqe failed" << ret;
+ return false;
+ }
+
+ if (cqe->res < 0) {
+ SNAP_LOG(ERROR) << "io failed with res: " << cqe->res;
+ return false;
+ }
+ io_uring_cqe_seen(ring_.get(), cqe);
+ }
+
+ remain -= to_read;
+ }
+
+ LOG(INFO) << "ReadBlockAsync complete: "
+ << " Block-device: " << dm_block_device << " Partition-name: " << partition_name
+ << " Size: " << size;
+ return true;
+}
+
void SnapshotHandler::ReadBlocksToCache(const std::string& dm_block_device,
const std::string& partition_name, off_t offset,
size_t size) {
@@ -344,17 +478,22 @@
return;
}
- int num_threads = 2;
- size_t num_blocks = dev_sz >> BLOCK_SHIFT;
- size_t num_blocks_per_thread = num_blocks / num_threads;
- size_t read_sz_per_thread = num_blocks_per_thread << BLOCK_SHIFT;
- off_t offset = 0;
+ if (IsIouringSupported()) {
+ std::async(std::launch::async, &SnapshotHandler::ReadBlocksAsync, this, dm_block_device,
+ partition_name, dev_sz);
+ } else {
+ int num_threads = 2;
+ size_t num_blocks = dev_sz >> BLOCK_SHIFT;
+ size_t num_blocks_per_thread = num_blocks / num_threads;
+ size_t read_sz_per_thread = num_blocks_per_thread << BLOCK_SHIFT;
+ off_t offset = 0;
- for (int i = 0; i < num_threads; i++) {
- std::async(std::launch::async, &SnapshotHandler::ReadBlocksToCache, this, dm_block_device,
- partition_name, offset, read_sz_per_thread);
+ for (int i = 0; i < num_threads; i++) {
+ std::async(std::launch::async, &SnapshotHandler::ReadBlocksToCache, this,
+ dm_block_device, partition_name, offset, read_sz_per_thread);
- offset += read_sz_per_thread;
+ offset += read_sz_per_thread;
+ }
}
}
@@ -513,5 +652,33 @@
return ra_state;
}
+bool SnapshotHandler::IsIouringSupported() {
+ struct utsname uts;
+ unsigned int major, minor;
+
+ if (android::base::GetBoolProperty("snapuserd.test.io_uring.force_disable", false)) {
+ SNAP_LOG(INFO) << "io_uring disabled for testing";
+ return false;
+ }
+
+ if ((uname(&uts) != 0) || (sscanf(uts.release, "%u.%u", &major, &minor) != 2)) {
+ SNAP_LOG(ERROR) << "Could not parse the kernel version from uname. "
+ << " io_uring not supported";
+ return false;
+ }
+
+ // We only support kernels from 5.6 onwards, as the IOSQE_ASYNC flag and the
+ // IORING_OP_READ/WRITE opcodes were introduced only in the 5.6 kernel.
+ if (major >= 5) {
+ if (major == 5 && minor < 6) {
+ return false;
+ }
+ } else {
+ return false;
+ }
+
+ return android::base::GetBoolProperty("ro.virtual_ab.io_uring.enabled", false);
+}
+
} // namespace snapshot
} // namespace android
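
ReadBlocksAsync() above follows the standard liburing pattern: initialize a queue, fill SQEs with io_uring_prep_read(), submit the whole batch with a single io_uring_submit() call, then reap CQEs with io_uring_wait_cqe()/io_uring_cqe_seen(). A minimal, self-contained sketch of that pattern — file name, buffer sizing, and error handling are simplified and not part of the patch:

```cpp
#include <fcntl.h>
#include <liburing.h>
#include <unistd.h>

#include <vector>

// Illustrative only: read queue_depth blocks from a file using io_uring,
// submitting all reads in one syscall and then reaping the completions.
bool ReadWithIoUring(const char* path, size_t block_size, unsigned queue_depth) {
    struct io_uring ring;
    if (io_uring_queue_init(queue_depth, &ring, 0) != 0) {
        return false;
    }

    int fd = open(path, O_RDONLY);
    if (fd < 0) {
        io_uring_queue_exit(&ring);
        return false;
    }

    std::vector<std::vector<char>> bufs(queue_depth, std::vector<char>(block_size));

    // Queue one read per buffer; nothing is submitted to the kernel yet.
    unsigned prepped = 0;
    off_t offset = 0;
    for (unsigned i = 0; i < queue_depth; i++) {
        struct io_uring_sqe* sqe = io_uring_get_sqe(&ring);
        if (!sqe) break;
        io_uring_prep_read(sqe, fd, bufs[i].data(), block_size, offset);
        sqe->flags |= IOSQE_ASYNC;  // Punt to kernel worker threads, as in the patch.
        offset += block_size;
        prepped++;
    }

    // One syscall submits every queued read.
    bool ok = (io_uring_submit(&ring) == static_cast<int>(prepped));

    // Reap one completion per submitted read.
    for (unsigned i = 0; ok && i < prepped; i++) {
        struct io_uring_cqe* cqe = nullptr;
        if (io_uring_wait_cqe(&ring, &cqe) != 0) {
            ok = false;
            break;
        }
        if (cqe->res < 0) ok = false;  // Failed read.
        io_uring_cqe_seen(&ring, cqe);
    }

    close(fd);
    io_uring_queue_exit(&ring);
    return ok;
}
```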
diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.h b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.h
index 1953316..b0f2d65 100644
--- a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.h
+++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.h
@@ -39,6 +39,7 @@
#include <libdm/dm.h>
#include <libsnapshot/cow_reader.h>
#include <libsnapshot/cow_writer.h>
+#include <liburing.h>
#include <snapuserd/snapuserd_buffer.h>
#include <snapuserd/snapuserd_kernel.h>
@@ -113,6 +114,19 @@
bool ReconstructDataFromCow();
void CheckOverlap(const CowOperation* cow_op);
+ bool ReadAheadAsyncIO();
+ bool ReapIoCompletions(int pending_ios_to_complete);
+ bool ReadXorData(size_t block_index, size_t xor_op_index,
+ std::vector<const CowOperation*>& xor_op_vec);
+ void ProcessXorData(size_t& block_xor_index, size_t& xor_index,
+ std::vector<const CowOperation*>& xor_op_vec, void* buffer,
+ loff_t& buffer_offset);
+ void UpdateScratchMetadata();
+
+ bool ReadAheadSyncIO();
+ bool InitializeIouring();
+ void FinalizeIouring();
+
void* read_ahead_buffer_;
void* metadata_buffer_;
@@ -131,7 +145,19 @@
std::unordered_set<uint64_t> dest_blocks_;
std::unordered_set<uint64_t> source_blocks_;
bool overlap_;
+ std::vector<uint64_t> blocks_;
+ int total_blocks_merged_ = 0;
+ std::unique_ptr<uint8_t[]> ra_temp_buffer_;
+ std::unique_ptr<uint8_t[]> ra_temp_meta_buffer_;
BufferSink bufsink_;
+
+ bool read_ahead_async_ = false;
+ // Queue depth of 32 seems optimal. We don't want
+ // to have a huge depth as it may put more memory pressure
+ // on the kernel worker threads given that we use
+ // IOSQE_ASYNC flag.
+ int queue_depth_ = 32;
+ std::unique_ptr<struct io_uring> ring_;
};
class Worker {
@@ -185,6 +211,7 @@
// Merge related ops
bool Merge();
bool MergeOrderedOps(const std::unique_ptr<ICowOpIter>& cowop_iter);
+ bool MergeOrderedOpsAsync(const std::unique_ptr<ICowOpIter>& cowop_iter);
bool MergeReplaceZeroOps(const std::unique_ptr<ICowOpIter>& cowop_iter);
int PrepareMerge(uint64_t* source_offset, int* pending_ops,
const std::unique_ptr<ICowOpIter>& cowop_iter,
@@ -193,6 +220,9 @@
sector_t ChunkToSector(chunk_t chunk) { return chunk << CHUNK_SHIFT; }
chunk_t SectorToChunk(sector_t sector) { return sector >> CHUNK_SHIFT; }
+ bool InitializeIouring();
+ void FinalizeIouring();
+
std::unique_ptr<CowReader> reader_;
BufferSink bufsink_;
XorSink xorsink_;
@@ -208,6 +238,14 @@
unique_fd base_path_merge_fd_;
unique_fd ctrl_fd_;
+ bool merge_async_ = false;
+ // Queue depth of 32 seems optimal. We don't want
+ // to have a huge depth as it may put more memory pressure
+ // on the kernel worker threads given that we use
+ // IOSQE_ASYNC flag.
+ int queue_depth_ = 32;
+ std::unique_ptr<struct io_uring> ring_;
+
std::shared_ptr<SnapshotHandler> snapuserd_;
};
@@ -292,6 +330,8 @@
bool GetRABuffer(std::unique_lock<std::mutex>* lock, uint64_t block, void* buffer);
MERGE_GROUP_STATE ProcessMergingBlock(uint64_t new_block, void* buffer);
+ bool IsIouringSupported();
+
private:
bool ReadMetadata();
sector_t ChunkToSector(chunk_t chunk) { return chunk << CHUNK_SHIFT; }
@@ -304,6 +344,11 @@
void ReadBlocksToCache(const std::string& dm_block_device, const std::string& partition_name,
off_t offset, size_t size);
+ bool InitializeIouring(int io_depth);
+ void FinalizeIouring();
+ bool ReadBlocksAsync(const std::string& dm_block_device, const std::string& partition_name,
+ size_t size);
+
// COW device
std::string cow_device_;
// Source device
@@ -352,6 +397,8 @@
bool attached_ = false;
bool is_socket_present_;
bool scratch_space_ = false;
+
+ std::unique_ptr<struct io_uring> ring_;
};
} // namespace snapshot
diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_merge.cpp b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_merge.cpp
index fa055b7..d4d4efe 100644
--- a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_merge.cpp
+++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_merge.cpp
@@ -72,16 +72,16 @@
}
bool Worker::MergeReplaceZeroOps(const std::unique_ptr<ICowOpIter>& cowop_iter) {
- // Flush every 2048 ops. Since all ops are independent and there is no
+ // Flush every 8192 ops. Since all ops are independent and there is no
// dependency between COW ops, we will flush the data and the number
- // of ops merged in COW file for every 2048 ops. If there is a crash,
+ // of ops merged in COW file for every 8192 ops. If there is a crash,
// we will end up replaying some of the COW ops which were already merged.
// That is ok.
//
- // Why 2048 ops ? We can probably increase this to bigger value but just
- // need to ensure that merge makes forward progress if there are
- // crashes repeatedly which is highly unlikely.
- int total_ops_merged_per_commit = (PAYLOAD_BUFFER_SZ / BLOCK_SZ) * 8;
+ // Why 8192 ops? Increasing this may improve merge time by 3-4 seconds, but
+ // we need to make sure that we checkpoint regularly; 8k ops seems optimal.
+ // In case of a crash, merge should always make forward progress.
+ int total_ops_merged_per_commit = (PAYLOAD_BUFFER_SZ / BLOCK_SZ) * 32;
int num_ops_merged = 0;
while (!cowop_iter->Done()) {
@@ -128,7 +128,7 @@
num_ops_merged += linear_blocks;
- if (num_ops_merged == total_ops_merged_per_commit) {
+ if (num_ops_merged >= total_ops_merged_per_commit) {
// Flush the data
if (fsync(base_path_merge_fd_.get()) < 0) {
SNAP_LOG(ERROR) << "Merge: ReplaceZeroOps: Failed to fsync merged data";
@@ -172,6 +172,173 @@
return true;
}
+bool Worker::MergeOrderedOpsAsync(const std::unique_ptr<ICowOpIter>& cowop_iter) {
+ void* mapped_addr = snapuserd_->GetMappedAddr();
+ void* read_ahead_buffer =
+ static_cast<void*>((char*)mapped_addr + snapuserd_->GetBufferDataOffset());
+ size_t block_index = 0;
+
+ SNAP_LOG(INFO) << "MergeOrderedOpsAsync started....";
+
+ while (!cowop_iter->Done()) {
+ const CowOperation* cow_op = &cowop_iter->Get();
+ if (!IsOrderedOp(*cow_op)) {
+ break;
+ }
+
+ SNAP_LOG(DEBUG) << "Waiting for merge begin...";
+ // Wait for RA thread to notify that the merge window
+ // is ready for merging.
+ if (!snapuserd_->WaitForMergeBegin()) {
+ snapuserd_->SetMergeFailed(block_index);
+ return false;
+ }
+
+ snapuserd_->SetMergeInProgress(block_index);
+
+ loff_t offset = 0;
+ int num_ops = snapuserd_->GetTotalBlocksToMerge();
+
+ int pending_sqe = queue_depth_;
+ int pending_ios_to_submit = 0;
+ bool flush_required = false;
+
+ SNAP_LOG(DEBUG) << "Merging copy-ops of size: " << num_ops;
+ while (num_ops) {
+ uint64_t source_offset;
+
+ int linear_blocks = PrepareMerge(&source_offset, &num_ops, cowop_iter);
+
+ if (linear_blocks != 0) {
+ size_t io_size = (linear_blocks * BLOCK_SZ);
+
+ // Get an SQE entry from the ring and populate the I/O variables
+ struct io_uring_sqe* sqe = io_uring_get_sqe(ring_.get());
+ if (!sqe) {
+ SNAP_PLOG(ERROR) << "io_uring_get_sqe failed during merge-ordered ops";
+ snapuserd_->SetMergeFailed(block_index);
+ return false;
+ }
+
+ io_uring_prep_write(sqe, base_path_merge_fd_.get(),
+ (char*)read_ahead_buffer + offset, io_size, source_offset);
+
+ offset += io_size;
+ num_ops -= linear_blocks;
+
+ pending_sqe -= 1;
+ pending_ios_to_submit += 1;
+ sqe->flags |= IOSQE_ASYNC;
+ }
+
+ // Ring is full or no more COW ops to be merged in this batch
+ if (pending_sqe == 0 || num_ops == 0 || (linear_blocks == 0 && pending_ios_to_submit)) {
+ // If this is a last set of COW ops to be merged in this batch, we need
+ // to sync the merged data. We will try to grab an SQE entry
+ // and set the FSYNC command; additionally, make sure that
+ // the fsync is done after all the I/O operations queued
+ // in the ring is completed by setting IOSQE_IO_DRAIN.
+ //
+ // If there is no space in the ring, we will flush it later
+ // by explicitly calling fsync() system call.
+ if (num_ops == 0 || (linear_blocks == 0 && pending_ios_to_submit)) {
+ if (pending_sqe != 0) {
+ struct io_uring_sqe* sqe = io_uring_get_sqe(ring_.get());
+ if (!sqe) {
+ // very unlikely but let's continue and not fail the
+ // merge - we will flush it later
+ SNAP_PLOG(ERROR) << "io_uring_get_sqe failed during merge-ordered ops";
+ flush_required = true;
+ } else {
+ io_uring_prep_fsync(sqe, base_path_merge_fd_.get(), 0);
+ // Drain the queue before fsync
+ io_uring_sqe_set_flags(sqe, IOSQE_IO_DRAIN);
+ pending_sqe -= 1;
+ flush_required = false;
+ pending_ios_to_submit += 1;
+ sqe->flags |= IOSQE_ASYNC;
+ }
+ } else {
+ flush_required = true;
+ }
+ }
+
+ // Submit the IO for all the COW ops in a single syscall
+ int ret = io_uring_submit(ring_.get());
+ if (ret != pending_ios_to_submit) {
+ SNAP_PLOG(ERROR)
+ << "io_uring_submit failed for read-ahead: "
+ << " io submit: " << ret << " expected: " << pending_ios_to_submit;
+ snapuserd_->SetMergeFailed(block_index);
+ return false;
+ }
+
+ int pending_ios_to_complete = pending_ios_to_submit;
+ pending_ios_to_submit = 0;
+
+ // Reap I/O completions
+ while (pending_ios_to_complete) {
+ struct io_uring_cqe* cqe;
+
+ ret = io_uring_wait_cqe(ring_.get(), &cqe);
+ if (ret) {
+ SNAP_LOG(ERROR) << "Read-ahead - io_uring_wait_cqe failed: " << ret;
+ snapuserd_->SetMergeFailed(block_index);
+ return false;
+ }
+
+ if (cqe->res < 0) {
+ SNAP_LOG(ERROR)
+ << "Read-ahead - io_uring_Wait_cqe failed with res: " << cqe->res;
+ snapuserd_->SetMergeFailed(block_index);
+ return false;
+ }
+
+ io_uring_cqe_seen(ring_.get(), cqe);
+ pending_ios_to_complete -= 1;
+ }
+
+ pending_sqe = queue_depth_;
+ }
+
+ if (linear_blocks == 0) {
+ break;
+ }
+ }
+
+ // Verify all ops are merged
+ CHECK(num_ops == 0);
+
+ // Flush the data
+ if (flush_required && (fsync(base_path_merge_fd_.get()) < 0)) {
+ SNAP_LOG(ERROR) << " Failed to fsync merged data";
+ snapuserd_->SetMergeFailed(block_index);
+ return false;
+ }
+
+ // Merge is done and data is on disk. Update the COW Header about
+ // the merge completion
+ if (!snapuserd_->CommitMerge(snapuserd_->GetTotalBlocksToMerge())) {
+ SNAP_LOG(ERROR) << " Failed to commit the merged block in the header";
+ snapuserd_->SetMergeFailed(block_index);
+ return false;
+ }
+
+ SNAP_LOG(DEBUG) << "Block commit of size: " << snapuserd_->GetTotalBlocksToMerge();
+ // Mark the block as merge complete
+ snapuserd_->SetMergeCompleted(block_index);
+
+ // Notify RA thread that the merge thread is ready to merge the next
+ // window
+ snapuserd_->NotifyRAForMergeReady();
+
+ // Get the next block
+ block_index += 1;
+ }
+
+ return true;
+}
+
bool Worker::MergeOrderedOps(const std::unique_ptr<ICowOpIter>& cowop_iter) {
void* mapped_addr = snapuserd_->GetMappedAddr();
void* read_ahead_buffer =
@@ -260,15 +427,23 @@
bool Worker::Merge() {
std::unique_ptr<ICowOpIter> cowop_iter = reader_->GetMergeOpIter();
- // Start with Copy and Xor ops
- if (!MergeOrderedOps(cowop_iter)) {
- SNAP_LOG(ERROR) << "Merge failed for ordered ops";
- snapuserd_->MergeFailed();
- return false;
+ if (merge_async_) {
+ if (!MergeOrderedOpsAsync(cowop_iter)) {
+ SNAP_LOG(ERROR) << "Merge failed for ordered ops";
+ snapuserd_->MergeFailed();
+ return false;
+ }
+ SNAP_LOG(INFO) << "MergeOrderedOpsAsync completed.....";
+ } else {
+ // Start with Copy and Xor ops
+ if (!MergeOrderedOps(cowop_iter)) {
+ SNAP_LOG(ERROR) << "Merge failed for ordered ops";
+ snapuserd_->MergeFailed();
+ return false;
+ }
+ SNAP_LOG(INFO) << "MergeOrderedOps completed.....";
}
- SNAP_LOG(INFO) << "MergeOrderedOps completed...";
-
// Replace and Zero ops
if (!MergeReplaceZeroOps(cowop_iter)) {
SNAP_LOG(ERROR) << "Merge failed for replace/zero ops";
@@ -281,6 +456,31 @@
return true;
}
+bool Worker::InitializeIouring() {
+ if (!snapuserd_->IsIouringSupported()) {
+ return false;
+ }
+
+ ring_ = std::make_unique<struct io_uring>();
+
+ int ret = io_uring_queue_init(queue_depth_, ring_.get(), 0);
+ if (ret) {
+ LOG(ERROR) << "Merge: io_uring_queue_init failed with ret: " << ret;
+ return false;
+ }
+
+ merge_async_ = true;
+
+ LOG(INFO) << "Merge: io_uring initialized with queue depth: " << queue_depth_;
+ return true;
+}
+
+void Worker::FinalizeIouring() {
+ if (merge_async_) {
+ io_uring_queue_exit(ring_.get());
+ }
+}
+
bool Worker::RunMergeThread() {
SNAP_LOG(DEBUG) << "Waiting for merge begin...";
if (!snapuserd_->WaitForMergeBegin()) {
@@ -296,10 +496,13 @@
return false;
}
+ InitializeIouring();
+
if (!Merge()) {
return false;
}
+ FinalizeIouring();
CloseFds();
reader_->CloseCowFd();
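
The raised checkpoint interval above (8192 ops instead of 2048) trades a little crash-replay work for fewer fsync/commit round trips during merge. A hedged sketch of the checkpointing pattern it relies on — the helper and its commit callback are illustrative, not the real merge code:

```cpp
#include <unistd.h>

// Illustrative only: flush and commit progress every ops_per_commit operations
// so a crash only replays the ops merged since the last checkpoint.
bool MergeWithCheckpoints(int fd, int total_ops, int ops_per_commit,
                          bool (*commit)(int ops_merged)) {
    int pending = 0;
    for (int op = 0; op < total_ops; op++) {
        // ... merge one op (elided) ...
        pending++;
        if (pending >= ops_per_commit) {
            if (fsync(fd) < 0) return false;     // Merged data must be durable first.
            if (!commit(pending)) return false;  // Then record forward progress.
            pending = 0;
        }
    }
    if (pending > 0) {
        if (fsync(fd) < 0) return false;
        if (!commit(pending)) return false;
    }
    return true;
}
```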
diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_readahead.cpp b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_readahead.cpp
index 9e8ccfb..26c5f19 100644
--- a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_readahead.cpp
+++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_readahead.cpp
@@ -183,25 +183,311 @@
return true;
}
-bool ReadAhead::ReadAheadIOStart() {
- // Check if the data has to be constructed from the COW file.
- // This will be true only once during boot up after a crash
- // during merge.
- if (snapuserd_->ShouldReconstructDataFromCow()) {
- return ReconstructDataFromCow();
- }
+/*
+ * With io_uring, the data flow is slightly different.
+ *
+ * The data flow is as follows:
+ *
+ * 1: Queue the I/O requests to be read from backing source device.
+ * This is done by retrieving the SQE entry from ring and populating
+ * the SQE entry. Note that the I/O is not submitted yet.
+ *
+ * 2: Once the ring is full (aka queue_depth), we will submit all
+ * the queued I/O request with a single system call. This essentially
+ * cuts down "queue_depth" number of system calls to a single system call.
+ *
+ * 3: Once the I/O is submitted, user-space thread will now work
+ * on processing the XOR Operations. This happens in parallel when
+ * I/O requests are submitted to the kernel. This is ok because, for XOR
+ * operations, we first need to retrieve the compressed data from the COW block
+ * device. Thus, we have offloaded the backing source I/O to the kernel
+ * and user-space works in parallel on fetching the data for XOR operations.
+ *
+ * 4: After the XOR operations are read from COW device, poll the completion
+ * queue for all the I/O submitted. If the I/O's were already completed,
+ * then user-space thread will just read the CQE requests from the ring
+ * without doing any system call. If none of the I/O were completed yet,
+ * user-space thread will do a system call and wait for I/O completions.
+ *
+ * Flow diagram:
+ * SQ-RING
+ * SQE1 <----------- Fetch SQE1 Entry ---------- |SQE1||SQE2|SQE3|
+ *
+ * SQE1 ------------ Populate SQE1 Entry ------> |SQE1-X||SQE2|SQE3|
+ *
+ * SQE2 <----------- Fetch SQE2 Entry ---------- |SQE1-X||SQE2|SQE3|
+ *
+ * SQE2 ------------ Populate SQE2 Entry ------> |SQE1-X||SQE2-X|SQE3|
+ *
+ * SQE3 <----------- Fetch SQE3 Entry ---------- |SQE1-X||SQE2-X|SQE3|
+ *
+ * SQE3 ------------ Populate SQE3 Entry ------> |SQE1-X||SQE2-X|SQE3-X|
+ *
+ * Submit-IO ---------------------------------> |SQE1-X||SQE2-X|SQE3-X|
+ * | |
+ * | Process I/O entries in kernel
+ * | |
+ * Retrieve XOR |
+ * data from COW |
+ * | |
+ * | |
+ * Fetch CQ completions
+ * | CQ-RING
+ * |CQE1-X||CQE2-X|CQE3-X|
+ * |
+ * CQE1 <------------Fetch CQE1 Entry |CQE1||CQE2-X|CQE3-X|
+ * CQE2 <------------Fetch CQE2 Entry |CQE1||CQE2-|CQE3-X|
+ * CQE3 <------------Fetch CQE3 Entry |CQE1||CQE2-|CQE3-|
+ * |
+ * |
+ * Continue Next set of operations in the RING
+ */
- std::vector<uint64_t> blocks;
-
+bool ReadAhead::ReadAheadAsyncIO() {
int num_ops = (snapuserd_->GetBufferDataSize()) / BLOCK_SZ;
loff_t buffer_offset = 0;
- int total_blocks_merged = 0;
+ total_blocks_merged_ = 0;
overlap_ = false;
dest_blocks_.clear();
source_blocks_.clear();
+ blocks_.clear();
std::vector<const CowOperation*> xor_op_vec;
- auto ra_temp_buffer = std::make_unique<uint8_t[]>(snapuserd_->GetBufferDataSize());
+ int pending_sqe = queue_depth_;
+ int pending_ios_to_submit = 0;
+
+ size_t xor_op_index = 0;
+ size_t block_index = 0;
+
+ loff_t offset = 0;
+
+ bufsink_.ResetBufferOffset();
+
+ // Number of ops to be merged in this window. This is a fixed size
+ // except for the last window wherein the number of ops can be less
+ // than the size of the RA window.
+ while (num_ops) {
+ uint64_t source_offset;
+ struct io_uring_sqe* sqe;
+
+ int linear_blocks = PrepareNextReadAhead(&source_offset, &num_ops, blocks_, xor_op_vec);
+
+ if (linear_blocks != 0) {
+ size_t io_size = (linear_blocks * BLOCK_SZ);
+
+ // Get an SQE entry from the ring and populate the I/O variables
+ sqe = io_uring_get_sqe(ring_.get());
+ if (!sqe) {
+ SNAP_PLOG(ERROR) << "io_uring_get_sqe failed during read-ahead";
+ snapuserd_->ReadAheadIOFailed();
+ return false;
+ }
+
+ io_uring_prep_read(sqe, backing_store_fd_.get(),
+ (char*)ra_temp_buffer_.get() + buffer_offset, io_size,
+ source_offset);
+
+ buffer_offset += io_size;
+ num_ops -= linear_blocks;
+ total_blocks_merged_ += linear_blocks;
+
+ pending_sqe -= 1;
+ pending_ios_to_submit += 1;
+ sqe->flags |= IOSQE_ASYNC;
+ }
+
+ // pending_sqe == 0 : Ring is full
+ //
+ // num_ops == 0 : All the COW ops in this batch are processed - Submit
+ // pending I/O requests in the ring
+ //
+ // linear_blocks == 0 : All the COW ops processing is done. Submit
+ // pending I/O requests in the ring
+ if (pending_sqe == 0 || num_ops == 0 || (linear_blocks == 0 && pending_ios_to_submit)) {
+ // Submit the IO for all the COW ops in a single syscall
+ int ret = io_uring_submit(ring_.get());
+ if (ret != pending_ios_to_submit) {
+ SNAP_PLOG(ERROR) << "io_uring_submit failed for read-ahead: "
+ << " io submit: " << ret << " expected: " << pending_ios_to_submit;
+ snapuserd_->ReadAheadIOFailed();
+ return false;
+ }
+
+ int pending_ios_to_complete = pending_ios_to_submit;
+ pending_ios_to_submit = 0;
+
+ bool xor_processing_required = (xor_op_vec.size() > 0);
+
+ // Read XOR data from COW file in parallel when I/O's are in-flight
+ if (xor_processing_required && !ReadXorData(block_index, xor_op_index, xor_op_vec)) {
+ SNAP_LOG(ERROR) << "ReadXorData failed";
+ snapuserd_->ReadAheadIOFailed();
+ return false;
+ }
+
+ // Fetch I/O completions
+ if (!ReapIoCompletions(pending_ios_to_complete)) {
+ SNAP_LOG(ERROR) << "ReapIoCompletions failed";
+ snapuserd_->ReadAheadIOFailed();
+ return false;
+ }
+
+ // Retrieve XOR'ed data
+ if (xor_processing_required) {
+ ProcessXorData(block_index, xor_op_index, xor_op_vec, ra_temp_buffer_.get(),
+ offset);
+ }
+
+ // All the I/O in the ring is processed.
+ pending_sqe = queue_depth_;
+ }
+
+ if (linear_blocks == 0) {
+ break;
+ }
+ }
+
+ // Done with merging ordered ops
+ if (RAIterDone() && total_blocks_merged_ == 0) {
+ return true;
+ }
+
+ CHECK(blocks_.size() == total_blocks_merged_);
+
+ UpdateScratchMetadata();
+
+ return true;
+}
+
+void ReadAhead::UpdateScratchMetadata() {
+ loff_t metadata_offset = 0;
+
+ struct ScratchMetadata* bm = reinterpret_cast<struct ScratchMetadata*>(
+ (char*)ra_temp_meta_buffer_.get() + metadata_offset);
+
+ bm->new_block = 0;
+ bm->file_offset = 0;
+
+ loff_t file_offset = snapuserd_->GetBufferDataOffset();
+
+ for (size_t block_index = 0; block_index < blocks_.size(); block_index++) {
+ uint64_t new_block = blocks_[block_index];
+ // Track the metadata blocks which are stored in scratch space
+ bm = reinterpret_cast<struct ScratchMetadata*>((char*)ra_temp_meta_buffer_.get() +
+ metadata_offset);
+
+ bm->new_block = new_block;
+ bm->file_offset = file_offset;
+
+ metadata_offset += sizeof(struct ScratchMetadata);
+ file_offset += BLOCK_SZ;
+ }
+
+ // This is important - explicitly set the contents to zero. This is used
+ // when re-constructing the data after crash. This indicates end of
+ // reading metadata contents when re-constructing the data
+ bm = reinterpret_cast<struct ScratchMetadata*>((char*)ra_temp_meta_buffer_.get() +
+ metadata_offset);
+ bm->new_block = 0;
+ bm->file_offset = 0;
+}
+
+bool ReadAhead::ReapIoCompletions(int pending_ios_to_complete) {
+ // Reap I/O completions
+ while (pending_ios_to_complete) {
+ struct io_uring_cqe* cqe;
+
+ int ret = io_uring_wait_cqe(ring_.get(), &cqe);
+ if (ret) {
+ SNAP_LOG(ERROR) << "Read-ahead - io_uring_wait_cqe failed: " << ret;
+ return false;
+ }
+
+ if (cqe->res < 0) {
+ SNAP_LOG(ERROR) << "Read-ahead - io_uring_Wait_cqe failed with res: " << cqe->res;
+ return false;
+ }
+
+ io_uring_cqe_seen(ring_.get(), cqe);
+ pending_ios_to_complete -= 1;
+ }
+
+ return true;
+}
+
+void ReadAhead::ProcessXorData(size_t& block_xor_index, size_t& xor_index,
+ std::vector<const CowOperation*>& xor_op_vec, void* buffer,
+ loff_t& buffer_offset) {
+ loff_t xor_buf_offset = 0;
+
+ while (block_xor_index < blocks_.size()) {
+ void* bufptr = static_cast<void*>((char*)buffer + buffer_offset);
+ uint64_t new_block = blocks_[block_xor_index];
+
+ if (xor_index < xor_op_vec.size()) {
+ const CowOperation* xor_op = xor_op_vec[xor_index];
+
+ // Check if this block is an XOR op
+ if (xor_op->new_block == new_block) {
+ // Pointer to the data read from base device
+ uint8_t* buffer = reinterpret_cast<uint8_t*>(bufptr);
+ // Get the xor'ed data read from COW device
+ uint8_t* xor_data = reinterpret_cast<uint8_t*>((char*)bufsink_.GetPayloadBufPtr() +
+ xor_buf_offset);
+
+ for (size_t byte_offset = 0; byte_offset < BLOCK_SZ; byte_offset++) {
+ buffer[byte_offset] ^= xor_data[byte_offset];
+ }
+
+ // Move to next XOR op
+ xor_index += 1;
+ xor_buf_offset += BLOCK_SZ;
+ }
+ }
+
+ buffer_offset += BLOCK_SZ;
+ block_xor_index += 1;
+ }
+
+ bufsink_.ResetBufferOffset();
+}
+
+bool ReadAhead::ReadXorData(size_t block_index, size_t xor_op_index,
+ std::vector<const CowOperation*>& xor_op_vec) {
+ // Process the XOR ops in parallel - We will be reading data
+ // from COW file for XOR ops processing.
+ while (block_index < blocks_.size()) {
+ uint64_t new_block = blocks_[block_index];
+
+ if (xor_op_index < xor_op_vec.size()) {
+ const CowOperation* xor_op = xor_op_vec[xor_op_index];
+ if (xor_op->new_block == new_block) {
+ if (!reader_->ReadData(*xor_op, &bufsink_)) {
+ SNAP_LOG(ERROR)
+ << " ReadAhead - XorOp Read failed for block: " << xor_op->new_block;
+ return false;
+ }
+
+ xor_op_index += 1;
+ bufsink_.UpdateBufferOffset(BLOCK_SZ);
+ }
+ }
+ block_index += 1;
+ }
+ return true;
+}
+
+bool ReadAhead::ReadAheadSyncIO() {
+ int num_ops = (snapuserd_->GetBufferDataSize()) / BLOCK_SZ;
+ loff_t buffer_offset = 0;
+ total_blocks_merged_ = 0;
+ overlap_ = false;
+ dest_blocks_.clear();
+ source_blocks_.clear();
+ blocks_.clear();
+ std::vector<const CowOperation*> xor_op_vec;
+
+ bufsink_.ResetBufferOffset();
// Number of ops to be merged in this window. This is a fixed size
// except for the last window wherein the number of ops can be less
@@ -209,7 +495,7 @@
while (num_ops) {
uint64_t source_offset;
- int linear_blocks = PrepareNextReadAhead(&source_offset, &num_ops, blocks, xor_op_vec);
+ int linear_blocks = PrepareNextReadAhead(&source_offset, &num_ops, blocks_, xor_op_vec);
if (linear_blocks == 0) {
// No more blocks to read
SNAP_LOG(DEBUG) << " Read-ahead completed....";
@@ -220,7 +506,7 @@
// Read from the base device consecutive set of blocks in one shot
if (!android::base::ReadFullyAtOffset(backing_store_fd_,
- (char*)ra_temp_buffer.get() + buffer_offset, io_size,
+ (char*)ra_temp_buffer_.get() + buffer_offset, io_size,
source_offset)) {
SNAP_PLOG(ERROR) << "Ordered-op failed. Read from backing store: "
<< backing_store_device_ << "at block :" << source_offset / BLOCK_SZ
@@ -233,21 +519,19 @@
}
buffer_offset += io_size;
- total_blocks_merged += linear_blocks;
+ total_blocks_merged_ += linear_blocks;
num_ops -= linear_blocks;
}
// Done with merging ordered ops
- if (RAIterDone() && total_blocks_merged == 0) {
+ if (RAIterDone() && total_blocks_merged_ == 0) {
return true;
}
loff_t metadata_offset = 0;
- auto ra_temp_meta_buffer = std::make_unique<uint8_t[]>(snapuserd_->GetBufferMetadataSize());
-
struct ScratchMetadata* bm = reinterpret_cast<struct ScratchMetadata*>(
- (char*)ra_temp_meta_buffer.get() + metadata_offset);
+ (char*)ra_temp_meta_buffer_.get() + metadata_offset);
bm->new_block = 0;
bm->file_offset = 0;
@@ -255,12 +539,15 @@
loff_t file_offset = snapuserd_->GetBufferDataOffset();
loff_t offset = 0;
- CHECK(blocks.size() == total_blocks_merged);
+ CHECK(blocks_.size() == total_blocks_merged_);
size_t xor_index = 0;
- for (size_t block_index = 0; block_index < blocks.size(); block_index++) {
- void* bufptr = static_cast<void*>((char*)ra_temp_buffer.get() + offset);
- uint64_t new_block = blocks[block_index];
+ BufferSink bufsink;
+ bufsink.Initialize(BLOCK_SZ * 2);
+
+ for (size_t block_index = 0; block_index < blocks_.size(); block_index++) {
+ void* bufptr = static_cast<void*>((char*)ra_temp_buffer_.get() + offset);
+ uint64_t new_block = blocks_[block_index];
if (xor_index < xor_op_vec.size()) {
const CowOperation* xor_op = xor_op_vec[xor_index];
@@ -268,17 +555,16 @@
// Check if this block is an XOR op
if (xor_op->new_block == new_block) {
// Read the xor'ed data from COW
- if (!reader_->ReadData(*xor_op, &bufsink_)) {
+ if (!reader_->ReadData(*xor_op, &bufsink)) {
SNAP_LOG(ERROR)
<< " ReadAhead - XorOp Read failed for block: " << xor_op->new_block;
snapuserd_->ReadAheadIOFailed();
return false;
}
-
// Pointer to the data read from base device
uint8_t* buffer = reinterpret_cast<uint8_t*>(bufptr);
// Get the xor'ed data read from COW device
- uint8_t* xor_data = reinterpret_cast<uint8_t*>(bufsink_.GetPayloadBufPtr());
+ uint8_t* xor_data = reinterpret_cast<uint8_t*>(bufsink.GetPayloadBufPtr());
// Retrieve the original data
for (size_t byte_offset = 0; byte_offset < BLOCK_SZ; byte_offset++) {
@@ -292,7 +578,7 @@
offset += BLOCK_SZ;
// Track the metadata blocks which are stored in scratch space
- bm = reinterpret_cast<struct ScratchMetadata*>((char*)ra_temp_meta_buffer.get() +
+ bm = reinterpret_cast<struct ScratchMetadata*>((char*)ra_temp_meta_buffer_.get() +
metadata_offset);
bm->new_block = new_block;
@@ -308,11 +594,34 @@
// This is important - explicitly set the contents to zero. This is used
// when re-constructing the data after crash. This indicates end of
// reading metadata contents when re-constructing the data
- bm = reinterpret_cast<struct ScratchMetadata*>((char*)ra_temp_meta_buffer.get() +
+ bm = reinterpret_cast<struct ScratchMetadata*>((char*)ra_temp_meta_buffer_.get() +
metadata_offset);
bm->new_block = 0;
bm->file_offset = 0;
+ return true;
+}
+
+bool ReadAhead::ReadAheadIOStart() {
+ // Check if the data has to be constructed from the COW file.
+ // This will be true only once during boot up after a crash
+ // during merge.
+ if (snapuserd_->ShouldReconstructDataFromCow()) {
+ return ReconstructDataFromCow();
+ }
+
+ if (read_ahead_async_) {
+ if (!ReadAheadAsyncIO()) {
+ SNAP_LOG(ERROR) << "ReadAheadAsyncIO failed - io_uring processing failure.";
+ return false;
+ }
+ } else {
+ if (!ReadAheadSyncIO()) {
+ SNAP_LOG(ERROR) << "ReadAheadSyncIO failed";
+ return false;
+ }
+ }
+
// Wait for the merge to finish for the previous RA window. We shouldn't
// be touching the scratch space until merge is complete of previous RA
// window. If there is a crash during this time frame, merge should resume
@@ -322,22 +631,22 @@
}
// Copy the data to scratch space
- memcpy(metadata_buffer_, ra_temp_meta_buffer.get(), snapuserd_->GetBufferMetadataSize());
- memcpy(read_ahead_buffer_, ra_temp_buffer.get(), total_blocks_merged * BLOCK_SZ);
+ memcpy(metadata_buffer_, ra_temp_meta_buffer_.get(), snapuserd_->GetBufferMetadataSize());
+ memcpy(read_ahead_buffer_, ra_temp_buffer_.get(), total_blocks_merged_ * BLOCK_SZ);
- offset = 0;
+ loff_t offset = 0;
std::unordered_map<uint64_t, void*>& read_ahead_buffer_map = snapuserd_->GetReadAheadMap();
read_ahead_buffer_map.clear();
- for (size_t block_index = 0; block_index < blocks.size(); block_index++) {
+ for (size_t block_index = 0; block_index < blocks_.size(); block_index++) {
void* bufptr = static_cast<void*>((char*)read_ahead_buffer_ + offset);
- uint64_t new_block = blocks[block_index];
+ uint64_t new_block = blocks_[block_index];
read_ahead_buffer_map[new_block] = bufptr;
offset += BLOCK_SZ;
}
- snapuserd_->SetMergedBlockCountForNextCommit(total_blocks_merged);
+ snapuserd_->SetMergedBlockCountForNextCommit(total_blocks_merged_);
// Flush the data only if we have a overlapping blocks in the region
// Notify the Merge thread to resume merging this window
@@ -350,6 +659,33 @@
return true;
}
+bool ReadAhead::InitializeIouring() {
+ if (!snapuserd_->IsIouringSupported()) {
+ return false;
+ }
+
+ ring_ = std::make_unique<struct io_uring>();
+
+ int ret = io_uring_queue_init(queue_depth_, ring_.get(), 0);
+ if (ret) {
+ SNAP_LOG(ERROR) << "io_uring_queue_init failed with ret: " << ret;
+ return false;
+ }
+
+ // For xor ops processing
+ bufsink_.Initialize(PAYLOAD_BUFFER_SZ * 2);
+ read_ahead_async_ = true;
+
+ SNAP_LOG(INFO) << "Read-ahead: io_uring initialized with queue depth: " << queue_depth_;
+ return true;
+}
+
+void ReadAhead::FinalizeIouring() {
+ if (read_ahead_async_) {
+ io_uring_queue_exit(ring_.get());
+ }
+}
+
bool ReadAhead::RunThread() {
if (!InitializeFds()) {
return false;
@@ -363,14 +699,18 @@
InitializeRAIter();
+ InitializeIouring();
+
while (!RAIterDone()) {
if (!ReadAheadIOStart()) {
break;
}
}
+ FinalizeIouring();
CloseFds();
reader_->CloseCowFd();
+
SNAP_LOG(INFO) << " ReadAhead thread terminating....";
return true;
}
@@ -434,8 +774,9 @@
metadata_buffer_ =
static_cast<void*>((char*)mapped_addr + snapuserd_->GetBufferMetadataOffset());
read_ahead_buffer_ = static_cast<void*>((char*)mapped_addr + snapuserd_->GetBufferDataOffset());
- // For xor ops
- bufsink_.Initialize(PAYLOAD_BUFFER_SZ);
+
+ ra_temp_buffer_ = std::make_unique<uint8_t[]>(snapuserd_->GetBufferDataSize());
+ ra_temp_meta_buffer_ = std::make_unique<uint8_t[]>(snapuserd_->GetBufferMetadataSize());
}
} // namespace snapshot
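
In both the async and sync read-ahead paths above, XOR ops are resolved by reading the xor-compressed block from the COW device and XOR-ing it byte-by-byte into the block read from the base device. A minimal sketch of that reconstruction step — the block size constant is assumed to match BLOCK_SZ:

```cpp
#include <cstddef>
#include <cstdint>

constexpr size_t kBlockSz = 4096;  // Assumed to match BLOCK_SZ used above.

// Illustrative only: recover the original block by XOR-ing the base-device
// data with the xor data read from the COW device, as ProcessXorData() does.
void ApplyXorBlock(uint8_t* base_data, const uint8_t* xor_data) {
    for (size_t byte_offset = 0; byte_offset < kBlockSz; byte_offset++) {
        base_data[byte_offset] ^= xor_data[byte_offset];
    }
}
```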
diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_test.cpp b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_test.cpp
index 1c3e04b..d670f1e 100644
--- a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_test.cpp
+++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_test.cpp
@@ -12,6 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include <android-base/strings.h>
+#include <gflags/gflags.h>
+
#include <fcntl.h>
#include <linux/fs.h>
#include <linux/memfd.h>
@@ -27,6 +30,7 @@
#include <string_view>
#include <android-base/file.h>
+#include <android-base/properties.h>
#include <android-base/unique_fd.h>
#include <fs_mgr/file_wait.h>
#include <gtest/gtest.h>
@@ -38,6 +42,8 @@
#include "snapuserd_core.h"
+DEFINE_string(force_config, "", "Force testing mode with iouring disabled");
+
namespace android {
namespace snapshot {
@@ -857,5 +863,23 @@
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
- return RUN_ALL_TESTS();
+
+ gflags::ParseCommandLineFlags(&argc, &argv, false);
+
+ android::base::SetProperty("ctl.stop", "snapuserd");
+
+ if (FLAGS_force_config == "iouring_disabled") {
+ if (!android::base::SetProperty("snapuserd.test.io_uring.force_disable", "1")) {
+ return testing::AssertionFailure()
+ << "Failed to disable property: snapuserd.test.io_uring.disabled";
+ }
+ }
+
+ int ret = RUN_ALL_TESTS();
+
+ if (FLAGS_force_config == "iouring_disabled") {
+ android::base::SetProperty("snapuserd.test.io_uring.force_disable", "0");
+ }
+
+ return ret;
}
diff --git a/libutils/Unicode_test.cpp b/libutils/Unicode_test.cpp
index b92eef8..8b994d9 100644
--- a/libutils/Unicode_test.cpp
+++ b/libutils/Unicode_test.cpp
@@ -100,7 +100,7 @@
0xF0, 0x90, 0x80, 0x80, // U+10000, 2 UTF-16 character
};
- char16_t output[1 + 1 + 1 + 2 + 1]; // Room for NULL
+ char16_t output[1 + 1 + 1 + 2 + 1]; // Room for null
utf8_to_utf16(str, sizeof(str), output, sizeof(output) / sizeof(output[0]));
@@ -114,8 +114,7 @@
<< "should be first half of surrogate U+10000";
EXPECT_EQ(0xDC00, output[4])
<< "should be second half of surrogate U+10000";
- EXPECT_EQ(NULL, output[5])
- << "should be NULL terminated";
+ EXPECT_EQ(0, output[5]) << "should be null terminated";
}
TEST_F(UnicodeTest, strstr16EmptyTarget) {
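
The Unicode_test.cpp hunk replaces the pointer-typed NULL macro with 0 when checking the char16_t terminator written by utf8_to_utf16(). A small illustrative sketch of the conversion call it exercises — the buffer size and helper name are assumptions, not part of the patch:

```cpp
#include <cstddef>
#include <cstdint>

#include <utils/Unicode.h>

// Illustrative only: size the UTF-16 output with room for the terminator and
// treat the terminator as the char16_t value 0, not the NULL macro.
void ConvertAndCheck(const uint8_t* utf8, size_t utf8_len) {
    char16_t out[16];  // Assumed large enough for this example, plus the terminator.
    utf8_to_utf16(utf8, utf8_len, out, sizeof(out) / sizeof(out[0]));
    // The converted string ends at the first element equal to 0.
}
```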