snapuserd: Async I/O for block verification

Boot time improvements:

==================================

Incremental OTA of 300M between two git_master branches on Pixel 6:

Android S (with dm-snapshot):

BootComplete                  : 39.379 seconds

Android T (with io_uring):

BootComplete                  : 32.001 seconds

Time taken for each partition:

snapuserd: ReadBlockAsync complete: 2055 ms Block-device: /dev/block/dm-21 Partition-name: system_ext Size: 399302656
snapuserd: ReadBlockAsync complete: 2956 ms Block-device: /dev/block/dm-23 Partition-name: vendor Size: 650084352
snapuserd: ReadBlockAsync complete: 3534 ms Block-device: /dev/block/dm-20 Partition-name: system Size: 859746304
snapuserd: ReadBlockAsync complete: 7808 ms Block-device: /dev/block/dm-22 Partition-name: product Size: 3030687744

====================================

Bug: 202784286
Test: Full/Incremental OTA
Signed-off-by: Akilesh Kailash <akailash@google.com>
Change-Id: I615f9f4fde4e565aa1d611a2d6bbf6a6f62fa3f1
diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.cpp b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.cpp
index 2c84ff9..e48a1be 100644
--- a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.cpp
+++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.cpp
@@ -19,6 +19,7 @@
 #include <sys/utsname.h>
 
 #include <android-base/properties.h>
+#include <android-base/scopeguard.h>
 #include <android-base/strings.h>
 
 namespace android {
@@ -291,6 +292,136 @@
     return ReadMetadata();
 }
 
+void SnapshotHandler::FinalizeIouring() {  // Tears down the io_uring instance held by ring_.
+    io_uring_queue_exit(ring_.get());  // NOTE(review): assumes ring_ is set; no null check here.
+}
+
+// Allocates ring_ and initializes it with io_uring_queue_init(io_depth); false on failure.
+bool SnapshotHandler::InitializeIouring(int io_depth) {
+    ring_ = std::make_unique<struct io_uring>();
+    // Any non-zero return value from io_uring_queue_init() is treated as a failure.
+    const int rc = io_uring_queue_init(io_depth, ring_.get(), 0);
+    if (rc == 0) {
+        LOG(INFO) << "io_uring_queue_init success with io_depth: " << io_depth;
+        return true;
+    }
+    LOG(ERROR) << "io_uring_queue_init failed with ret: " << rc;
+    return false;
+}
+
+// Reads the first |size| bytes of |dm_block_device| via io_uring and discards the data.
+// Returns true once every read completed, false on any setup, submit or I/O error.
+bool SnapshotHandler::ReadBlocksAsync(const std::string& dm_block_device,
+                                      const std::string& partition_name, size_t size) {
+    // 64k block size with io_depth of 64 is optimal for a single thread. We just need a
+    // single thread to read all the blocks from all dynamic partitions.
+    const size_t io_depth = 64;
+    const size_t bs = (64 * 1024);
+
+    if (!InitializeIouring(io_depth)) {
+        return false;
+    }
+
+    LOG(INFO) << "ReadBlockAsync start "
+              << " Block-device: " << dm_block_device << " Partition-name: " << partition_name
+              << " Size: " << size;
+
+    auto scope_guard = android::base::make_scope_guard([this]() -> void { FinalizeIouring(); });
+
+    std::vector<std::unique_ptr<struct iovec>> vecs;
+    using AlignedBuf = std::unique_ptr<void, decltype(free)*>;
+    std::vector<AlignedBuf> alignedBufVector;
+
+    /*
+     * TODO: We need aligned memory for DIRECT-IO. However, if we do
+     * a DIRECT-IO and verify the blocks then we need to inform
+     * update-verifier that block verification has been done and
+     * there is no need to repeat the same. We are not there yet
+     * as we need to see if there are any boot time improvements doing
+     * a DIRECT-IO.
+     *
+     * Also, we could use the same function post merge for block verification;
+     * again, we can do a DIRECT-IO instead of thrashing page-cache and
+     * hurting other applications.
+     *
+     * For now, we will just create aligned buffers but rely on buffered
+     * I/O until we have perf numbers to justify DIRECT-IO.
+     */
+    for (size_t i = 0; i < io_depth; i++) {
+        auto vec = std::make_unique<struct iovec>();
+        if (posix_memalign(&vec->iov_base, BLOCK_SZ, bs)) {
+            LOG(ERROR) << "posix_memalign failed";
+            return false;
+        }
+        vec->iov_len = bs;
+        alignedBufVector.push_back(AlignedBuf(vec->iov_base, free));
+        vecs.push_back(std::move(vec));
+    }
+
+    android::base::unique_fd fd(TEMP_FAILURE_RETRY(open(dm_block_device.c_str(), O_RDONLY)));
+    if (fd.get() == -1) {
+        SNAP_PLOG(ERROR) << "File open failed - block-device " << dm_block_device
+                         << " partition-name: " << partition_name;
+        return false;
+    }
+
+    loff_t offset = 0;
+    size_t remain = size;
+    const size_t read_sz = io_depth * bs;
+
+    while (remain > 0) {
+        const size_t to_read = std::min(remain, read_sz);
+        // Round up so that a trailing chunk smaller than bs is read too, not skipped.
+        const size_t queue_size = (to_read + bs - 1) / bs;
+        size_t batch_left = to_read;
+
+        for (size_t i = 0; i < queue_size; i++) {
+            struct io_uring_sqe* sqe = io_uring_get_sqe(ring_.get());
+            if (!sqe) {
+                SNAP_LOG(ERROR) << "io_uring_get_sqe() failed";
+                return false;
+            }
+            struct iovec* iovec_ptr = vecs[i].get();
+            // Clamp the final request so that nothing past |size| is read.
+            const size_t len = std::min(batch_left, iovec_ptr->iov_len);
+            io_uring_prep_read(sqe, fd.get(), iovec_ptr->iov_base, len, offset);
+            sqe->flags |= IOSQE_ASYNC;
+            offset += len;
+            batch_left -= len;
+        }
+
+        // io_uring_submit() returns the number of submitted SQEs, or -errno.
+        int ret = io_uring_submit(ring_.get());
+        if (ret < 0 || static_cast<size_t>(ret) != queue_size) {
+            SNAP_LOG(ERROR) << "submit got: " << ret << " wanted: " << queue_size;
+            return false;
+        }
+
+        for (size_t i = 0; i < queue_size; i++) {
+            struct io_uring_cqe* cqe;
+            // liburing reports errors as -errno in ret; errno is not set, so no PLOG.
+            ret = io_uring_wait_cqe(ring_.get(), &cqe);
+            if (ret) {
+                SNAP_LOG(ERROR) << "wait_cqe failed with ret: " << ret;
+                return false;
+            }
+
+            if (cqe->res < 0) {
+                SNAP_LOG(ERROR) << "io failed with res: " << cqe->res;
+                return false;
+            }
+            io_uring_cqe_seen(ring_.get(), cqe);
+        }
+
+        remain -= to_read;
+    }
+
+    LOG(INFO) << "ReadBlockAsync complete: "
+              << " Block-device: " << dm_block_device << " Partition-name: " << partition_name
+              << " Size: " << size;
+    return true;
+}
+
 void SnapshotHandler::ReadBlocksToCache(const std::string& dm_block_device,
                                         const std::string& partition_name, off_t offset,
                                         size_t size) {
@@ -347,17 +478,22 @@
         return;
     }
 
-    int num_threads = 2;
-    size_t num_blocks = dev_sz >> BLOCK_SHIFT;
-    size_t num_blocks_per_thread = num_blocks / num_threads;
-    size_t read_sz_per_thread = num_blocks_per_thread << BLOCK_SHIFT;
-    off_t offset = 0;
+    if (IsIouringSupported()) {
+        std::async(std::launch::async, &SnapshotHandler::ReadBlocksAsync, this, dm_block_device,
+                   partition_name, dev_sz);
+    } else {
+        int num_threads = 2;
+        size_t num_blocks = dev_sz >> BLOCK_SHIFT;
+        size_t num_blocks_per_thread = num_blocks / num_threads;
+        size_t read_sz_per_thread = num_blocks_per_thread << BLOCK_SHIFT;
+        off_t offset = 0;
 
-    for (int i = 0; i < num_threads; i++) {
-        std::async(std::launch::async, &SnapshotHandler::ReadBlocksToCache, this, dm_block_device,
-                   partition_name, offset, read_sz_per_thread);
+        for (int i = 0; i < num_threads; i++) {
+            std::async(std::launch::async, &SnapshotHandler::ReadBlocksToCache, this,
+                       dm_block_device, partition_name, offset, read_sz_per_thread);
 
-        offset += read_sz_per_thread;
+            offset += read_sz_per_thread;
+        }
     }
 }
 
diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.h b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.h
index f36866a..b0f2d65 100644
--- a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.h
+++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.h
@@ -344,6 +344,11 @@
     void ReadBlocksToCache(const std::string& dm_block_device, const std::string& partition_name,
                            off_t offset, size_t size);
 
+    bool InitializeIouring(int io_depth);
+    void FinalizeIouring();
+    bool ReadBlocksAsync(const std::string& dm_block_device, const std::string& partition_name,
+                         size_t size);
+
     // COW device
     std::string cow_device_;
     // Source device
@@ -392,6 +397,8 @@
     bool attached_ = false;
     bool is_socket_present_;
     bool scratch_space_ = false;
+
+    std::unique_ptr<struct io_uring> ring_;
 };
 
 }  // namespace snapshot