Merge "Verify partitions using registered buffers" into main am: 95b890708d

Original change: https://android-review.googlesource.com/c/platform/system/core/+/3527289

Change-Id: Ie6b9b01c11615492f154417bee0105a93cddbab6
Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
diff --git a/fs_mgr/libsnapshot/snapuserd/Android.bp b/fs_mgr/libsnapshot/snapuserd/Android.bp
index 639116e..9972bc7 100644
--- a/fs_mgr/libsnapshot/snapuserd/Android.bp
+++ b/fs_mgr/libsnapshot/snapuserd/Android.bp
@@ -88,6 +88,7 @@
         "libprocessgroup",
         "libprocessgroup_util",
         "libjsoncpp",
+        "liburing_cpp",
     ],
     export_include_dirs: ["include"],
     header_libs: [
@@ -136,6 +137,7 @@
         "libext4_utils",
         "liburing",
         "libzstd",
+        "liburing_cpp",
     ],
 
     header_libs: [
@@ -222,6 +224,7 @@
         "libjsoncpp",
         "liburing",
         "libz",
+        "liburing_cpp",
     ],
     include_dirs: [
         ".",
@@ -319,6 +322,7 @@
         "libjsoncpp",
         "liburing",
         "libz",
+        "liburing_cpp",
     ],
     include_dirs: [
         ".",
diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_verify.cpp b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_verify.cpp
index 957c6a8..97f8df4 100644
--- a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_verify.cpp
+++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_verify.cpp
@@ -22,6 +22,7 @@
 
 #include "android-base/properties.h"
 #include "snapuserd_core.h"
+#include "utility.h"
 
 namespace android {
 namespace snapshot {
@@ -104,43 +105,108 @@
         return false;
     }
 
-    loff_t file_offset = offset;
-    auto verify_block_size = android::base::GetUintProperty<uint>("ro.virtual_ab.verify_block_size",
-                                                                  kBlockSizeVerify);
-    const uint64_t read_sz = verify_block_size;
+    int queue_depth = std::max(queue_depth_, 1);
+    int verify_block_size = verify_block_size_;
 
-    void* addr;
-    ssize_t page_size = getpagesize();
-    if (posix_memalign(&addr, page_size, read_sz) < 0) {
-        SNAP_PLOG(ERROR) << "posix_memalign failed "
-                         << " page_size: " << page_size << " read_sz: " << read_sz;
+    // Partitions below the threshold size get a smaller queue depth and
+    // read size, reducing buffer memory on low-memory devices.
+    if (dev_sz < threshold_size_) {
+        queue_depth = std::max(queue_depth / 2, 1);
+        verify_block_size >>= 2;
+    }
+
+    if (!IsBlockAligned(verify_block_size)) {
+        verify_block_size = EXT4_ALIGN(verify_block_size, BLOCK_SZ);
+    }
+
+    std::unique_ptr<io_uring_cpp::IoUringInterface> ring =
+            io_uring_cpp::IoUringInterface::CreateLinuxIoUring(queue_depth, 0);
+    if (ring.get() == nullptr) {
+        PLOG(ERROR) << "Verify: io_uring_queue_init failed for queue_depth: " << queue_depth;
         return false;
     }
 
-    std::unique_ptr<void, decltype(&::free)> buffer(addr, ::free);
-
-    uint64_t bytes_read = 0;
-
-    while (true) {
-        size_t to_read = std::min((dev_sz - file_offset), read_sz);
-
-        if (!android::base::ReadFullyAtOffset(fd.get(), buffer.get(), to_read, file_offset)) {
-            SNAP_PLOG(ERROR) << "Failed to read block from block device: " << dm_block_device
-                             << " partition-name: " << partition_name
-                             << " at offset: " << file_offset << " read-size: " << to_read
-                             << " block-size: " << dev_sz;
+    std::unique_ptr<struct iovec[]> vecs = std::make_unique<struct iovec[]>(queue_depth);
+    std::vector<std::unique_ptr<void, decltype(&::free)>> buffers;
+    for (int i = 0; i < queue_depth; i++) {
+        void* addr;
+        ssize_t page_size = getpagesize();
+        if (posix_memalign(&addr, page_size, verify_block_size) < 0) {
+            LOG(ERROR) << "posix_memalign failed";
             return false;
         }
 
-        bytes_read += to_read;
-        file_offset += (skip_blocks * verify_block_size);
-        if (file_offset >= dev_sz) {
+        buffers.emplace_back(addr, ::free);
+        vecs[i].iov_base = addr;
+        vecs[i].iov_len = verify_block_size;
+    }
+
+    auto ret = ring->RegisterBuffers(vecs.get(), queue_depth);
+    if (!ret.IsOk()) {
+        SNAP_LOG(ERROR) << "io_uring_register_buffers failed: " << ret.ErrCode();
+        return false;
+    }
+
+    loff_t file_offset = offset;
+    const uint64_t read_sz = verify_block_size;
+    uint64_t total_read = 0;
+    int num_submitted = 0;
+
+    SNAP_LOG(DEBUG) << "VerifyBlocks: queue_depth: " << queue_depth
+                    << " verify_block_size: " << verify_block_size << " dev_sz: " << dev_sz
+                    << " file_offset: " << file_offset << " skip_blocks: " << skip_blocks;
+
+    while (file_offset < dev_sz) {
+        for (size_t i = 0; i < queue_depth; i++) {
+            uint64_t to_read = std::min((dev_sz - file_offset), read_sz);
+            if (to_read <= 0) break;
+
+            const auto sqe =
+                    ring->PrepReadFixed(fd.get(), vecs[i].iov_base, to_read, file_offset, i);
+            if (!sqe.IsOk()) {
+                SNAP_PLOG(ERROR) << "PrepReadFixed failed";
+                return false;
+            }
+            file_offset += (skip_blocks * to_read);
+            total_read += to_read;
+            num_submitted += 1;
+            if (file_offset >= dev_sz) {
+                break;
+            }
+        }
+
+        if (num_submitted == 0) {
             break;
         }
+
+        const auto io_submit = ring->SubmitAndWait(num_submitted);
+        if (!io_submit.IsOk()) {
+            SNAP_LOG(ERROR) << "SubmitAndWait failed: " << io_submit.ErrMsg()
+                            << " for: " << num_submitted << " entries.";
+            return false;
+        }
+
+        SNAP_LOG(DEBUG) << "io_uring_submit: " << total_read << "num_submitted: " << num_submitted
+                        << "ret: " << ret;
+
+        const auto cqes = ring->PopCQE(num_submitted);
+        if (cqes.IsErr()) {
+            SNAP_LOG(ERROR) << "PopCqe failed for: " << num_submitted
+                            << " error: " << cqes.GetError().ErrMsg();
+            return false;
+        }
+        for (const auto& cqe : cqes.GetResult()) {
+            if (cqe.res < 0) {
+                SNAP_LOG(ERROR) << "I/O failed: cqe->res: " << cqe.res;
+                return false;
+            }
+            num_submitted -= 1;
+        }
     }
 
-    SNAP_LOG(DEBUG) << "Verification success with bytes-read: " << bytes_read
-                    << " dev_sz: " << dev_sz << " partition_name: " << partition_name;
+    SNAP_LOG(DEBUG) << "Verification success with io_uring: "
+                    << " dev_sz: " << dev_sz << " partition_name: " << partition_name
+                    << " total_read: " << total_read;
 
     return true;
 }
@@ -175,21 +241,14 @@
         return false;
     }
 
-    /*
-     * Not all partitions are of same size. Some partitions are as small as
-     * 100Mb. We can just finish them in a single thread. For bigger partitions
-     * such as product, 4 threads are sufficient enough.
-     *
-     * TODO: With io_uring SQ_POLL support, we can completely cut this
-     * down to just single thread for all partitions and potentially verify all
-     * the partitions with zero syscalls. Additionally, since block layer
-     * supports polling, IO_POLL could be used which will further cut down
-     * latency.
-     */
+    if (!KernelSupportsIoUring()) {
+        SNAP_LOG(INFO) << "Kernel does not support io_uring. Skipping verification.\n";
+        // This will fall back to update_verifier to do the verification.
+        return false;
+    }
+
     int num_threads = kMinThreadsToVerify;
-    auto verify_threshold_size = android::base::GetUintProperty<uint>(
-            "ro.virtual_ab.verify_threshold_size", kThresholdSize);
-    if (dev_sz > verify_threshold_size) {
+    if (dev_sz > threshold_size_) {
         num_threads = kMaxThreadsToVerify;
     }
 
@@ -197,13 +256,11 @@
     off_t start_offset = 0;
     const int skip_blocks = num_threads;
 
-    auto verify_block_size =
-            android::base::GetUintProperty("ro.virtual_ab.verify_block_size", kBlockSizeVerify);
     while (num_threads) {
         threads.emplace_back(std::async(std::launch::async, &UpdateVerify::VerifyBlocks, this,
                                         partition_name, dm_block_device, start_offset, skip_blocks,
                                         dev_sz));
-        start_offset += verify_block_size;
+        start_offset += verify_block_size_;
         num_threads -= 1;
         if (start_offset >= dev_sz) {
             break;
@@ -218,9 +275,9 @@
     if (ret) {
         succeeded = true;
         UpdatePartitionVerificationState(UpdateVerifyState::VERIFY_SUCCESS);
-        SNAP_LOG(INFO) << "Partition: " << partition_name << " Block-device: " << dm_block_device
-                       << " Size: " << dev_sz
-                       << " verification success. Duration : " << timer.duration().count() << " ms";
+        SNAP_LOG(INFO) << "Partition verification success: " << partition_name
+                       << " Block-device: " << dm_block_device << " Size: " << dev_sz
+                       << " Duration : " << timer.duration().count() << " ms";
         return true;
     }
 
diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_verify.h b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_verify.h
index b300a70..69a334b 100644
--- a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_verify.h
+++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_verify.h
@@ -15,6 +15,7 @@
 
 #pragma once
 
+#include <liburing.h>
 #include <stdint.h>
 #include <sys/types.h>
 
@@ -22,6 +23,7 @@
 #include <mutex>
 #include <string>
 
+#include <liburing_cpp/IoUring.h>
 #include <snapuserd/snapuserd_kernel.h>
 #include <storage_literals/storage_literals.h>
 
@@ -48,27 +50,23 @@
     std::mutex m_lock_;
     std::condition_variable m_cv_;
 
+    int kMinThreadsToVerify = 1;
+    int kMaxThreadsToVerify = 3;
+
     /*
-     * Scanning of partitions is an expensive operation both in terms of memory
-     * and CPU usage. The goal here is to scan the partitions fast enough without
-     * significant increase in the boot time.
-     *
-     * Partitions such as system, product which may be huge and may need multiple
-     * threads to speed up the verification process. Using multiple threads for
-     * all partitions may increase CPU usage significantly. Hence, limit that to
-     * 1 thread per partition.
+     * To optimize partition scanning speed without significantly impacting boot time,
+     * we employ O_DIRECT, bypassing the page-cache. However, O_DIRECT's memory
+     * allocation from CMA can be problematic on devices with restricted CMA space.
+     * To address this, io_uring_register_buffers() pre-registers I/O buffers,
+     * preventing CMA usage. See b/401952955 for more details.
      *
      * These numbers were derived by monitoring the memory and CPU pressure
      * (/proc/pressure/{cpu,memory}; and monitoring the Inactive(file) and
      * Active(file) pages from /proc/meminfo.
-     *
-     * Additionally, for low memory devices, it is advisable to use O_DIRECT
-     * functionality for source block device.
      */
-    int kMinThreadsToVerify = 1;
-    int kMaxThreadsToVerify = 3;
-    uint64_t kThresholdSize = 750_MiB;
-    uint64_t kBlockSizeVerify = 2_MiB;
+    uint64_t verify_block_size_ = 1_MiB;
+    uint64_t threshold_size_ = 2_GiB;
+    int queue_depth_ = 4;
 
     bool IsBlockAligned(uint64_t read_size) { return ((read_size & (BLOCK_SZ - 1)) == 0); }
     void UpdatePartitionVerificationState(UpdateVerifyState state);