libsnapshot: Flush data to scratch space only for overlapping regions

When read-ahead thread caches the data from base device, flush the data
only if there are overlapping regions. If there is a crash, the
subsequent reboot will not recover the data from the scratch space.
Rather, the data will be re-constructed from the base device.

Additionally, allow batch merging of blocks by the kernel even for
overlapping regions, given that we have the read-ahead thread
taking care of the overlapping blocks.

Bug: 183863613
Test: 1: Incremental OTA from build 7284758 to 7288239. Merge time
         reduces from ~6 minutes to ~2.5 minutes
      2: Reboot and crash kernel multiple times when merge was in
         progress
      3: Verify read-ahead thread re-constructs the data for overlapping
         regions.
Signed-off-by: Akilesh Kailash <akailash@google.com>
Change-Id: I50e0d828f4fb36a23f0ca13b07a73229ba68874d
diff --git a/fs_mgr/libsnapshot/snapuserd.cpp b/fs_mgr/libsnapshot/snapuserd.cpp
index e6af777..2ccc750 100644
--- a/fs_mgr/libsnapshot/snapuserd.cpp
+++ b/fs_mgr/libsnapshot/snapuserd.cpp
@@ -197,27 +197,29 @@
     cv.notify_one();
 }
 
-bool Snapuserd::ReadAheadIOCompleted() {
-    // Flush the entire buffer region
-    int ret = msync(mapped_addr_, total_mapped_addr_length_, MS_SYNC);
-    if (ret < 0) {
-        PLOG(ERROR) << "msync failed after ReadAheadIOCompleted: " << ret;
-        return false;
-    }
+bool Snapuserd::ReadAheadIOCompleted(bool sync) {
+    if (sync) {
+        // Flush the entire buffer region
+        int ret = msync(mapped_addr_, total_mapped_addr_length_, MS_SYNC);
+        if (ret < 0) {
+            PLOG(ERROR) << "msync failed after ReadAheadIOCompleted: " << ret;
+            return false;
+        }
 
-    // Metadata and data are synced. Now, update the state.
-    // We need to update the state after flushing data; if there is a crash
-    // when read-ahead IO is in progress, the state of data in the COW file
-    // is unknown. kCowReadAheadDone acts as a checkpoint wherein the data
-    // in the scratch space is good and during next reboot, read-ahead thread
-    // can safely re-construct the data.
-    struct BufferState* ra_state = GetBufferState();
-    ra_state->read_ahead_state = kCowReadAheadDone;
+        // Metadata and data are synced. Now, update the state.
+        // We need to update the state after flushing data; if there is a crash
+        // when read-ahead IO is in progress, the state of data in the COW file
+        // is unknown. kCowReadAheadDone acts as a checkpoint wherein the data
+        // in the scratch space is good and during next reboot, read-ahead thread
+        // can safely re-construct the data.
+        struct BufferState* ra_state = GetBufferState();
+        ra_state->read_ahead_state = kCowReadAheadDone;
 
-    ret = msync(mapped_addr_, BLOCK_SZ, MS_SYNC);
-    if (ret < 0) {
-        PLOG(ERROR) << "msync failed to flush Readahead completion state...";
-        return false;
+        ret = msync(mapped_addr_, BLOCK_SZ, MS_SYNC);
+        if (ret < 0) {
+            PLOG(ERROR) << "msync failed to flush Readahead completion state...";
+            return false;
+        }
     }
 
     // Notify the worker threads
@@ -435,7 +437,6 @@
     int num_ra_ops_per_iter = ((GetBufferDataSize()) / BLOCK_SZ);
     std::optional<chunk_t> prev_id = {};
     std::map<uint64_t, const CowOperation*> map;
-    std::set<uint64_t> dest_blocks;
     size_t pending_copy_ops = exceptions_per_area_ - num_ops;
     uint64_t total_copy_ops = reader_->total_copy_ops();
 
@@ -477,41 +478,20 @@
             // Op-6: 15 -> 18
             //
             // Note that the blocks numbers are contiguous. Hence, all 6 copy
-            // operations can potentially be batch merged. However, that will be
+            // operations can be batch merged. However, that will be
             // problematic if we have a crash as block 20, 19, 18 would have
             // been overwritten and hence subsequent recovery may end up with
             // a silent data corruption when op-1, op-2 and op-3 are
             // re-executed.
             //
-            // We will split these 6 operations into two batches viz:
-            //
-            // Batch-1:
-            // ===================
-            // Op-1: 20 -> 23
-            // Op-2: 19 -> 22
-            // Op-3: 18 -> 21
-            // ===================
-            //
-            // Batch-2:
-            // ==================
-            // Op-4: 17 -> 20
-            // Op-5: 16 -> 19
-            // Op-6: 15 -> 18
-            // ==================
-            //
-            // Now, merge sequence will look like:
-            //
-            // 1: Merge Batch-1 { op-1, op-2, op-3 }
-            // 2: Update Metadata in COW File that op-1, op-2, op-3 merge is
-            // done.
-            // 3: Merge Batch-2
-            // 4: Update Metadata in COW File that op-4, op-5, op-6 merge is
-            // done.
-            //
-            // Note, that the order of block operations are still the same.
-            // However, we have two batch merge operations. Any crash between
-            // either of this sequence should be safe as each of these
-            // batches are self-contained.
+            // To address the above problem, read-ahead thread will
+            // read all the 6 source blocks, cache them in the scratch
+            // space of the COW file. During merge, read-ahead
+            // thread will serve the blocks from the read-ahead cache.
+            // If there is a crash during merge, then on subsequent
+            // reboot, the read-ahead thread will recover the data from
+            // the scratch space and re-construct it, thereby ensuring
+            // that there is no loss of data.
             //
             //===========================================================
             //
@@ -575,14 +555,10 @@
                 if (diff != 1) {
                     break;
                 }
-                if (dest_blocks.count(cow_op->new_block) || map.count(cow_op->source) > 0) {
-                    break;
-                }
             }
             metadata_found = true;
             pending_copy_ops -= 1;
             map[cow_op->new_block] = cow_op;
-            dest_blocks.insert(cow_op->source);
             prev_id = cow_op->new_block;
             cowop_riter_->Next();
         } while (!cowop_riter_->Done() && pending_copy_ops);
@@ -644,7 +620,6 @@
             }
         }
         map.clear();
-        dest_blocks.clear();
         prev_id.reset();
     }
 
@@ -746,6 +721,8 @@
             SNAP_LOG(ERROR) << "Failed to start Read-ahead thread...";
             return false;
         }
+
+        SNAP_LOG(INFO) << "Read-ahead thread started...";
     }
 
     // Launch worker threads
diff --git a/fs_mgr/libsnapshot/snapuserd.h b/fs_mgr/libsnapshot/snapuserd.h
index cd9d82a..0a5ab50 100644
--- a/fs_mgr/libsnapshot/snapuserd.h
+++ b/fs_mgr/libsnapshot/snapuserd.h
@@ -31,6 +31,7 @@
 #include <string>
 #include <thread>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
 
 #include <android-base/file.h>
@@ -129,6 +130,7 @@
     bool ReadAheadIOStart();
     void PrepareReadAhead(uint64_t* source_block, int* pending_ops, std::vector<uint64_t>& blocks);
     bool ReconstructDataFromCow();
+    void CheckOverlap(const CowOperation* cow_op);
 
     void* read_ahead_buffer_;
     void* metadata_buffer_;
@@ -141,6 +143,10 @@
     unique_fd backing_store_fd_;
 
     std::shared_ptr<Snapuserd> snapuserd_;
+
+    std::unordered_set<uint64_t> dest_blocks_;
+    std::unordered_set<uint64_t> source_blocks_;
+    bool overlap_;
 };
 
 class WorkerThread {
@@ -258,7 +264,7 @@
     void PrepareReadAhead();
     void StartReadAhead();
     void MergeCompleted();
-    bool ReadAheadIOCompleted();
+    bool ReadAheadIOCompleted(bool sync);
     void ReadAheadIOFailed();
     bool WaitForMergeToComplete();
     bool GetReadAheadPopulatedBuffer(uint64_t block, void* buffer);
diff --git a/fs_mgr/libsnapshot/snapuserd_readahead.cpp b/fs_mgr/libsnapshot/snapuserd_readahead.cpp
index d60a353..09ee2f2 100644
--- a/fs_mgr/libsnapshot/snapuserd_readahead.cpp
+++ b/fs_mgr/libsnapshot/snapuserd_readahead.cpp
@@ -171,6 +171,15 @@
     snapuserd_ = snapuserd;
 }
 
+void ReadAheadThread::CheckOverlap(const CowOperation* cow_op) {
+    if (dest_blocks_.count(cow_op->new_block) || source_blocks_.count(cow_op->source)) {
+        overlap_ = true;
+    }
+
+    dest_blocks_.insert(cow_op->source);
+    source_blocks_.insert(cow_op->new_block);
+}
+
 void ReadAheadThread::PrepareReadAhead(uint64_t* source_block, int* pending_ops,
                                        std::vector<uint64_t>& blocks) {
     int num_ops = *pending_ops;
@@ -185,6 +194,10 @@
         nr_consecutive = 1;
         blocks.push_back(cow_op->new_block);
 
+        if (!overlap_) {
+            CheckOverlap(cow_op);
+        }
+
         /*
          * Find number of consecutive blocks working backwards.
          */
@@ -197,6 +210,10 @@
             num_ops -= 1;
             blocks.push_back(op->new_block);
             IterNext();
+
+            if (!overlap_) {
+                CheckOverlap(op);
+            }
         }
     }
 }
@@ -249,7 +266,7 @@
 
     snapuserd_->ReconstructDataFromCowFinish();
 
-    if (!snapuserd_->ReadAheadIOCompleted()) {
+    if (!snapuserd_->ReadAheadIOCompleted(true)) {
         SNAP_LOG(ERROR) << "ReadAheadIOCompleted failed...";
         snapuserd_->ReadAheadIOFailed();
         return false;
@@ -285,6 +302,9 @@
     loff_t offset = 0;
     loff_t file_offset = snapuserd_->GetBufferDataOffset();
     int total_blocks_merged = 0;
+    overlap_ = false;
+    dest_blocks_.clear();
+    source_blocks_.clear();
 
     while (true) {
         uint64_t source_block;
@@ -356,7 +376,8 @@
 
     snapuserd_->SetTotalRaBlocksMerged(total_blocks_merged);
 
-    if (!snapuserd_->ReadAheadIOCompleted()) {
+    // Flush the data only if we have overlapping blocks in the region
+    if (!snapuserd_->ReadAheadIOCompleted(overlap_)) {
         SNAP_LOG(ERROR) << "ReadAheadIOCompleted failed...";
         snapuserd_->ReadAheadIOFailed();
         return false;