libsnapshot: Flush data to scratch space only for overlapping regions

When read-ahead thread caches the data from base device, flush the data
only if there are overlapping regions. If there is a crash, the
subsequent reboot will not recover the data from the scratch space.
Rather, the data will be re-constructed from the base device.

Additionally, allow batch merging of blocks by the kernel even for
overlapping regions, given that we have the read-ahead thread
taking care of the overlapping blocks.

Bug: 183863613
Test: 1: Incremental OTA from build 7284758 to 7288239. Merge time
         reduces from ~6 minutes to ~2.5 minutes
      2: Reboot and crash kernel multiple times when merge was in
         progress
      3: Verify read-ahead thread re-constructs the data for overlapping
         regions.
Signed-off-by: Akilesh Kailash <akailash@google.com>
Change-Id: I50e0d828f4fb36a23f0ca13b07a73229ba68874d
diff --git a/fs_mgr/libsnapshot/snapuserd.cpp b/fs_mgr/libsnapshot/snapuserd.cpp
index e6af777..2ccc750 100644
--- a/fs_mgr/libsnapshot/snapuserd.cpp
+++ b/fs_mgr/libsnapshot/snapuserd.cpp
@@ -197,27 +197,29 @@
     cv.notify_one();
 }
 
-bool Snapuserd::ReadAheadIOCompleted() {
-    // Flush the entire buffer region
-    int ret = msync(mapped_addr_, total_mapped_addr_length_, MS_SYNC);
-    if (ret < 0) {
-        PLOG(ERROR) << "msync failed after ReadAheadIOCompleted: " << ret;
-        return false;
-    }
+bool Snapuserd::ReadAheadIOCompleted(bool sync) {
+    if (sync) {
+        // Flush the entire buffer region
+        int ret = msync(mapped_addr_, total_mapped_addr_length_, MS_SYNC);
+        if (ret < 0) {
+            PLOG(ERROR) << "msync failed after ReadAheadIOCompleted: " << ret;
+            return false;
+        }
 
-    // Metadata and data are synced. Now, update the state.
-    // We need to update the state after flushing data; if there is a crash
-    // when read-ahead IO is in progress, the state of data in the COW file
-    // is unknown. kCowReadAheadDone acts as a checkpoint wherein the data
-    // in the scratch space is good and during next reboot, read-ahead thread
-    // can safely re-construct the data.
-    struct BufferState* ra_state = GetBufferState();
-    ra_state->read_ahead_state = kCowReadAheadDone;
+        // Metadata and data are synced. Now, update the state.
+        // We need to update the state after flushing data; if there is a crash
+        // when read-ahead IO is in progress, the state of data in the COW file
+        // is unknown. kCowReadAheadDone acts as a checkpoint wherein the data
+        // in the scratch space is good and during next reboot, read-ahead thread
+        // can safely re-construct the data.
+        struct BufferState* ra_state = GetBufferState();
+        ra_state->read_ahead_state = kCowReadAheadDone;
 
-    ret = msync(mapped_addr_, BLOCK_SZ, MS_SYNC);
-    if (ret < 0) {
-        PLOG(ERROR) << "msync failed to flush Readahead completion state...";
-        return false;
+        ret = msync(mapped_addr_, BLOCK_SZ, MS_SYNC);
+        if (ret < 0) {
+            PLOG(ERROR) << "msync failed to flush Readahead completion state...";
+            return false;
+        }
     }
 
     // Notify the worker threads
@@ -435,7 +437,6 @@
     int num_ra_ops_per_iter = ((GetBufferDataSize()) / BLOCK_SZ);
     std::optional<chunk_t> prev_id = {};
     std::map<uint64_t, const CowOperation*> map;
-    std::set<uint64_t> dest_blocks;
     size_t pending_copy_ops = exceptions_per_area_ - num_ops;
     uint64_t total_copy_ops = reader_->total_copy_ops();
 
@@ -477,41 +478,20 @@
             // Op-6: 15 -> 18
             //
             // Note that the blocks numbers are contiguous. Hence, all 6 copy
-            // operations can potentially be batch merged. However, that will be
+            // operations can be batch merged. However, that will be
             // problematic if we have a crash as block 20, 19, 18 would have
             // been overwritten and hence subsequent recovery may end up with
             // a silent data corruption when op-1, op-2 and op-3 are
             // re-executed.
             //
-            // We will split these 6 operations into two batches viz:
-            //
-            // Batch-1:
-            // ===================
-            // Op-1: 20 -> 23
-            // Op-2: 19 -> 22
-            // Op-3: 18 -> 21
-            // ===================
-            //
-            // Batch-2:
-            // ==================
-            // Op-4: 17 -> 20
-            // Op-5: 16 -> 19
-            // Op-6: 15 -> 18
-            // ==================
-            //
-            // Now, merge sequence will look like:
-            //
-            // 1: Merge Batch-1 { op-1, op-2, op-3 }
-            // 2: Update Metadata in COW File that op-1, op-2, op-3 merge is
-            // done.
-            // 3: Merge Batch-2
-            // 4: Update Metadata in COW File that op-4, op-5, op-6 merge is
-            // done.
-            //
-            // Note, that the order of block operations are still the same.
-            // However, we have two batch merge operations. Any crash between
-            // either of this sequence should be safe as each of these
-            // batches are self-contained.
+            // To address the above problem, read-ahead thread will
+            // read all the 6 source blocks, cache them in the scratch
+            // space of the COW file. During merge, read-ahead
+            // thread will serve the blocks from the read-ahead cache.
+            // If there is a crash during merge, then on subsequent
+            // reboot, the read-ahead thread will recover the data from
+            // the scratch space and re-construct it, thereby ensuring
+            // that there is no loss of data.
             //
             //===========================================================
             //
@@ -575,14 +555,10 @@
                 if (diff != 1) {
                     break;
                 }
-                if (dest_blocks.count(cow_op->new_block) || map.count(cow_op->source) > 0) {
-                    break;
-                }
             }
             metadata_found = true;
             pending_copy_ops -= 1;
             map[cow_op->new_block] = cow_op;
-            dest_blocks.insert(cow_op->source);
             prev_id = cow_op->new_block;
             cowop_riter_->Next();
         } while (!cowop_riter_->Done() && pending_copy_ops);
@@ -644,7 +620,6 @@
             }
         }
         map.clear();
-        dest_blocks.clear();
         prev_id.reset();
     }
 
@@ -746,6 +721,8 @@
             SNAP_LOG(ERROR) << "Failed to start Read-ahead thread...";
             return false;
         }
+
+        SNAP_LOG(INFO) << "Read-ahead thread started...";
     }
 
     // Launch worker threads
diff --git a/fs_mgr/libsnapshot/snapuserd.h b/fs_mgr/libsnapshot/snapuserd.h
index cd9d82a..0a5ab50 100644
--- a/fs_mgr/libsnapshot/snapuserd.h
+++ b/fs_mgr/libsnapshot/snapuserd.h
@@ -31,6 +31,7 @@
 #include <string>
 #include <thread>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
 
 #include <android-base/file.h>
@@ -129,6 +130,7 @@
     bool ReadAheadIOStart();
     void PrepareReadAhead(uint64_t* source_block, int* pending_ops, std::vector<uint64_t>& blocks);
     bool ReconstructDataFromCow();
+    void CheckOverlap(const CowOperation* cow_op);
 
     void* read_ahead_buffer_;
     void* metadata_buffer_;
@@ -141,6 +143,10 @@
     unique_fd backing_store_fd_;
 
     std::shared_ptr<Snapuserd> snapuserd_;
+
+    std::unordered_set<uint64_t> dest_blocks_;
+    std::unordered_set<uint64_t> source_blocks_;
+    bool overlap_;
 };
 
 class WorkerThread {
@@ -258,7 +264,7 @@
     void PrepareReadAhead();
     void StartReadAhead();
     void MergeCompleted();
-    bool ReadAheadIOCompleted();
+    bool ReadAheadIOCompleted(bool sync);
     void ReadAheadIOFailed();
     bool WaitForMergeToComplete();
     bool GetReadAheadPopulatedBuffer(uint64_t block, void* buffer);
diff --git a/fs_mgr/libsnapshot/snapuserd_readahead.cpp b/fs_mgr/libsnapshot/snapuserd_readahead.cpp
index d60a353..09ee2f2 100644
--- a/fs_mgr/libsnapshot/snapuserd_readahead.cpp
+++ b/fs_mgr/libsnapshot/snapuserd_readahead.cpp
@@ -171,6 +171,15 @@
     snapuserd_ = snapuserd;
 }
 
+void ReadAheadThread::CheckOverlap(const CowOperation* cow_op) {
+    if (dest_blocks_.count(cow_op->new_block) || source_blocks_.count(cow_op->source)) {
+        overlap_ = true;
+    }
+
+    dest_blocks_.insert(cow_op->source);
+    source_blocks_.insert(cow_op->new_block);
+}
+
 void ReadAheadThread::PrepareReadAhead(uint64_t* source_block, int* pending_ops,
                                        std::vector<uint64_t>& blocks) {
     int num_ops = *pending_ops;
@@ -185,6 +194,10 @@
         nr_consecutive = 1;
         blocks.push_back(cow_op->new_block);
 
+        if (!overlap_) {
+            CheckOverlap(cow_op);
+        }
+
         /*
          * Find number of consecutive blocks working backwards.
          */
@@ -197,6 +210,10 @@
             num_ops -= 1;
             blocks.push_back(op->new_block);
             IterNext();
+
+            if (!overlap_) {
+                CheckOverlap(op);
+            }
         }
     }
 }
@@ -249,7 +266,7 @@
 
     snapuserd_->ReconstructDataFromCowFinish();
 
-    if (!snapuserd_->ReadAheadIOCompleted()) {
+    if (!snapuserd_->ReadAheadIOCompleted(true)) {
         SNAP_LOG(ERROR) << "ReadAheadIOCompleted failed...";
         snapuserd_->ReadAheadIOFailed();
         return false;
@@ -285,6 +302,9 @@
     loff_t offset = 0;
     loff_t file_offset = snapuserd_->GetBufferDataOffset();
     int total_blocks_merged = 0;
+    overlap_ = false;
+    dest_blocks_.clear();
+    source_blocks_.clear();
 
     while (true) {
         uint64_t source_block;
@@ -356,7 +376,8 @@
 
     snapuserd_->SetTotalRaBlocksMerged(total_blocks_merged);
 
-    if (!snapuserd_->ReadAheadIOCompleted()) {
+    // Flush the data only if we have overlapping blocks in the region
+    if (!snapuserd_->ReadAheadIOCompleted(overlap_)) {
         SNAP_LOG(ERROR) << "ReadAheadIOCompleted failed...";
         snapuserd_->ReadAheadIOFailed();
         return false;