Merge "libsnapshot:snapuserd: Batch merge copy operation"

commit: 7cd1c031783dcc23e03593ee13eedddc07ef5c62 [log] [tgz]
author: Akilesh Kailash <akailash@google.com> Wed Feb 10 05:08:48 2021 +0000
committer: Gerrit Code Review <noreply-gerritcodereview@google.com> Wed Feb 10 05:08:48 2021 +0000
tree: e64303bee81b3b9a3718171e6dd9f0347f0444c8
parent: 089383938531c3cfb8c0b3400c3e6bba617e6c5c [diff]
parent: 66914e2b2e457b439b61bc0ef2e8823707b0299a [diff]
diff --git a/fs_mgr/libsnapshot/Android.bp b/fs_mgr/libsnapshot/Android.bp
index d36a7f0..678adf8 100644
--- a/fs_mgr/libsnapshot/Android.bp
+++ b/fs_mgr/libsnapshot/Android.bp

@@ -549,6 +549,7 @@
     ],
     srcs: [
         "cow_snapuserd_test.cpp",
+        "snapuserd.cpp",
     ],
     cflags: [
         "-Wall",

diff --git a/fs_mgr/libsnapshot/cow_reader.cpp b/fs_mgr/libsnapshot/cow_reader.cpp
index c15a05b..75c05d1 100644
--- a/fs_mgr/libsnapshot/cow_reader.cpp
+++ b/fs_mgr/libsnapshot/cow_reader.cpp

@@ -181,6 +181,7 @@
         ops_buffer->resize(current_op_num);
     }
 
+    LOG(DEBUG) << "COW file read complete. Total ops: " << ops_buffer->size();
     // To successfully parse a COW file, we need either:
     //  (1) a label to read up to, and for that label to be found, or
     //  (2) a valid footer.
@@ -298,10 +299,9 @@
     // are contiguous. These are monotonically increasing numbers.
     //
     // When both (1) and (2) are true, kernel will batch merge the operations.
-    // However, we do not want copy operations to be batch merged as
-    // a crash or system reboot during an overlapping copy can drive the device
-    // to a corrupted state. Hence, merging of copy operations should always be
-    // done as a individual 4k block. In the above case, since the
+    // In the above case, we have to ensure that the copy operations
+    // are merged first before replace operations are done. Hence,
+    // we will not change the order of copy operations. Since,
     // cow_op->new_block numbers are contiguous, we will ensure that the
     // cow block numbers assigned in ReadMetadata() for these respective copy
     // operations are not contiguous forcing kernel to issue merge for each
@@ -328,10 +328,8 @@
     //
     // Merge sequence will look like:
     //
-    // Merge-1 - Copy-op-1
-    // Merge-2 - Copy-op-2
-    // Merge-3 - Copy-op-3
-    // Merge-4 - Batch-merge {Replace-op-7, Replace-op-6, Zero-op-8,
+    // Merge-1 - Batch-merge { Copy-op-1, Copy-op-2, Copy-op-3 }
+    // Merge-2 - Batch-merge {Replace-op-7, Replace-op-6, Zero-op-8,
     //                        Replace-op-4, Zero-op-9, Replace-op-5 }
     //==============================================================
 

diff --git a/fs_mgr/libsnapshot/cow_snapuserd_test.cpp b/fs_mgr/libsnapshot/cow_snapuserd_test.cpp
index 7fa23db..045d9db 100644
--- a/fs_mgr/libsnapshot/cow_snapuserd_test.cpp
+++ b/fs_mgr/libsnapshot/cow_snapuserd_test.cpp

@@ -36,6 +36,8 @@
 #include <libsnapshot/snapuserd_client.h>
 #include <storage_literals/storage_literals.h>
 
+#include "snapuserd.h"
+
 namespace android {
 namespace snapshot {
 
@@ -119,7 +121,6 @@
     void CreateDmUserDevice();
     void StartSnapuserdDaemon();
     void CreateSnapshotDevice();
-    unique_fd CreateTempFile(const std::string& name, size_t size);
 
     unique_ptr<LoopDevice> base_loop_;
     unique_ptr<TempDevice> dmuser_dev_;
@@ -140,7 +141,24 @@
     int total_base_size_;
 };
 
-unique_fd CowSnapuserdTest::CreateTempFile(const std::string& name, size_t size) {
+class CowSnapuserdMetadataTest final {
+  public:
+    void Setup();
+    void SetupPartialArea();
+    void ValidateMetadata();
+    void ValidatePartialFilledArea();
+
+  private:
+    void InitMetadata();
+    void CreateCowDevice();
+    void CreateCowPartialFilledArea();
+
+    std::unique_ptr<Snapuserd> snapuserd_;
+    std::unique_ptr<TemporaryFile> cow_system_;
+    size_t size_ = 1_MiB;
+};
+
+static unique_fd CreateTempFile(const std::string& name, size_t size) {
     unique_fd fd(syscall(__NR_memfd_create, name.c_str(), MFD_ALLOW_SEALING));
     if (fd < 0) {
         return {};
@@ -430,25 +448,299 @@
 }
 
 void CowSnapuserdTest::MergeInterrupt() {
+    // Interrupt merge at various intervals
     StartMerge();
-    std::this_thread::sleep_for(4s);
+    std::this_thread::sleep_for(250ms);
     SimulateDaemonRestart();
 
     StartMerge();
-    std::this_thread::sleep_for(3s);
+    std::this_thread::sleep_for(250ms);
     SimulateDaemonRestart();
 
     StartMerge();
-    std::this_thread::sleep_for(3s);
+    std::this_thread::sleep_for(150ms);
     SimulateDaemonRestart();
 
     StartMerge();
-    std::this_thread::sleep_for(1s);
+    std::this_thread::sleep_for(100ms);
+    SimulateDaemonRestart();
+
+    StartMerge();
+    std::this_thread::sleep_for(800ms);
+    SimulateDaemonRestart();
+
+    StartMerge();
+    std::this_thread::sleep_for(600ms);
     SimulateDaemonRestart();
 
     ASSERT_TRUE(Merge());
 }
 
+void CowSnapuserdMetadataTest::CreateCowPartialFilledArea() {
+    std::string path = android::base::GetExecutableDirectory();
+    cow_system_ = std::make_unique<TemporaryFile>(path);
+
+    CowOptions options;
+    options.compression = "gz";
+    CowWriter writer(options);
+
+    ASSERT_TRUE(writer.Initialize(cow_system_->fd));
+
+    // Area 0 is completely filled with 256 exceptions
+    for (int i = 0; i < 256; i++) {
+        ASSERT_TRUE(writer.AddCopy(i, 256 + i));
+    }
+
+    // Area 1 is partially filled with 2 copy ops and 10 zero ops
+    ASSERT_TRUE(writer.AddCopy(500, 1000));
+    ASSERT_TRUE(writer.AddCopy(501, 1001));
+
+    ASSERT_TRUE(writer.AddZeroBlocks(300, 10));
+
+    // Flush operations
+    ASSERT_TRUE(writer.Finalize());
+}
+
+void CowSnapuserdMetadataTest::ValidatePartialFilledArea() {
+    int area_sz = snapuserd_->GetMetadataAreaSize();
+
+    ASSERT_EQ(area_sz, 2);
+
+    size_t new_chunk = 263;
+    // Verify the partially filled area
+    void* buffer = snapuserd_->GetExceptionBuffer(1);
+    loff_t offset = 0;
+    struct disk_exception* de;
+    for (int i = 0; i < 12; i++) {
+        de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
+        ASSERT_EQ(de->old_chunk, i);
+        ASSERT_EQ(de->new_chunk, new_chunk);
+        offset += sizeof(struct disk_exception);
+        new_chunk += 1;
+    }
+
+    de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
+    ASSERT_EQ(de->old_chunk, 0);
+    ASSERT_EQ(de->new_chunk, 0);
+}
+
+void CowSnapuserdMetadataTest::SetupPartialArea() {
+    CreateCowPartialFilledArea();
+    InitMetadata();
+}
+
+void CowSnapuserdMetadataTest::CreateCowDevice() {
+    unique_fd rnd_fd;
+    loff_t offset = 0;
+
+    std::string path = android::base::GetExecutableDirectory();
+    cow_system_ = std::make_unique<TemporaryFile>(path);
+
+    rnd_fd.reset(open("/dev/random", O_RDONLY));
+    ASSERT_TRUE(rnd_fd > 0);
+
+    std::unique_ptr<uint8_t[]> random_buffer_1_ = std::make_unique<uint8_t[]>(size_);
+
+    // Fill random data
+    for (size_t j = 0; j < (size_ / 1_MiB); j++) {
+        ASSERT_EQ(ReadFullyAtOffset(rnd_fd, (char*)random_buffer_1_.get() + offset, 1_MiB, 0),
+                  true);
+
+        offset += 1_MiB;
+    }
+
+    CowOptions options;
+    options.compression = "gz";
+    CowWriter writer(options);
+
+    ASSERT_TRUE(writer.Initialize(cow_system_->fd));
+
+    size_t num_blocks = size_ / options.block_size;
+
+    // Overlapping region. This has to be split
+    // into two batch operations
+    ASSERT_TRUE(writer.AddCopy(23, 20));
+    ASSERT_TRUE(writer.AddCopy(22, 19));
+    ASSERT_TRUE(writer.AddCopy(21, 18));
+    ASSERT_TRUE(writer.AddCopy(20, 17));
+    ASSERT_TRUE(writer.AddCopy(19, 16));
+    ASSERT_TRUE(writer.AddCopy(18, 15));
+
+    // Contiguous region but blocks in ascending order
+    // Daemon has to ensure that these blocks are merged
+    // in a batch
+    ASSERT_TRUE(writer.AddCopy(50, 75));
+    ASSERT_TRUE(writer.AddCopy(51, 76));
+    ASSERT_TRUE(writer.AddCopy(52, 77));
+    ASSERT_TRUE(writer.AddCopy(53, 78));
+
+    // Dis-contiguous region
+    ASSERT_TRUE(writer.AddCopy(110, 130));
+    ASSERT_TRUE(writer.AddCopy(105, 125));
+    ASSERT_TRUE(writer.AddCopy(100, 120));
+
+    // Overlap
+    ASSERT_TRUE(writer.AddCopy(25, 30));
+    ASSERT_TRUE(writer.AddCopy(30, 31));
+
+    size_t source_blk = num_blocks;
+
+    ASSERT_TRUE(writer.AddRawBlocks(source_blk, random_buffer_1_.get(), size_));
+
+    size_t blk_zero_copy_start = source_blk + num_blocks;
+
+    ASSERT_TRUE(writer.AddZeroBlocks(blk_zero_copy_start, num_blocks));
+
+    // Flush operations
+    ASSERT_TRUE(writer.Finalize());
+}
+
+void CowSnapuserdMetadataTest::InitMetadata() {
+    snapuserd_ = std::make_unique<Snapuserd>("", cow_system_->path, "");
+    ASSERT_TRUE(snapuserd_->InitCowDevice());
+}
+
+void CowSnapuserdMetadataTest::Setup() {
+    CreateCowDevice();
+    InitMetadata();
+}
+
+void CowSnapuserdMetadataTest::ValidateMetadata() {
+    int area_sz = snapuserd_->GetMetadataAreaSize();
+    ASSERT_EQ(area_sz, 3);
+
+    size_t old_chunk;
+    size_t new_chunk;
+
+    for (int i = 0; i < area_sz; i++) {
+        void* buffer = snapuserd_->GetExceptionBuffer(i);
+        loff_t offset = 0;
+        if (i == 0) {
+            old_chunk = 256;
+            new_chunk = 2;
+        } else if (i == 1) {
+            old_chunk = 512;
+            new_chunk = 259;
+        }
+        for (int j = 0; j < 256; j++) {
+            struct disk_exception* de =
+                    reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
+
+            if (i != 2) {
+                ASSERT_EQ(de->old_chunk, old_chunk);
+                ASSERT_EQ(de->new_chunk, new_chunk);
+                old_chunk += 1;
+                new_chunk += 1;
+            } else {
+                break;
+            }
+            offset += sizeof(struct disk_exception);
+        }
+
+        if (i == 2) {
+            // The first 5 copy operation is not batch merged
+            // as the sequence is discontiguous
+            struct disk_exception* de =
+                    reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
+            ASSERT_EQ(de->old_chunk, 30);
+            ASSERT_EQ(de->new_chunk, 518);
+            offset += sizeof(struct disk_exception);
+
+            de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
+            ASSERT_EQ(de->old_chunk, 25);
+            ASSERT_EQ(de->new_chunk, 520);
+            offset += sizeof(struct disk_exception);
+
+            de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
+            ASSERT_EQ(de->old_chunk, 100);
+            ASSERT_EQ(de->new_chunk, 522);
+            offset += sizeof(struct disk_exception);
+
+            de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
+            ASSERT_EQ(de->old_chunk, 105);
+            ASSERT_EQ(de->new_chunk, 524);
+            offset += sizeof(struct disk_exception);
+
+            de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
+            ASSERT_EQ(de->old_chunk, 110);
+            ASSERT_EQ(de->new_chunk, 526);
+            offset += sizeof(struct disk_exception);
+
+            // The next 4 operations are batch merged as
+            // both old and new chunk are contiguous
+            de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
+            ASSERT_EQ(de->old_chunk, 50);
+            ASSERT_EQ(de->new_chunk, 528);
+            offset += sizeof(struct disk_exception);
+
+            de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
+            ASSERT_EQ(de->old_chunk, 51);
+            ASSERT_EQ(de->new_chunk, 529);
+            offset += sizeof(struct disk_exception);
+
+            de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
+            ASSERT_EQ(de->old_chunk, 52);
+            ASSERT_EQ(de->new_chunk, 530);
+            offset += sizeof(struct disk_exception);
+
+            de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
+            ASSERT_EQ(de->old_chunk, 53);
+            ASSERT_EQ(de->new_chunk, 531);
+            offset += sizeof(struct disk_exception);
+
+            // This is handling overlap operation with
+            // two batch merge operations.
+            de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
+            ASSERT_EQ(de->old_chunk, 18);
+            ASSERT_EQ(de->new_chunk, 533);
+            offset += sizeof(struct disk_exception);
+
+            de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
+            ASSERT_EQ(de->old_chunk, 19);
+            ASSERT_EQ(de->new_chunk, 534);
+            offset += sizeof(struct disk_exception);
+
+            de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
+            ASSERT_EQ(de->old_chunk, 20);
+            ASSERT_EQ(de->new_chunk, 535);
+            offset += sizeof(struct disk_exception);
+
+            de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
+            ASSERT_EQ(de->old_chunk, 21);
+            ASSERT_EQ(de->new_chunk, 537);
+            offset += sizeof(struct disk_exception);
+
+            de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
+            ASSERT_EQ(de->old_chunk, 22);
+            ASSERT_EQ(de->new_chunk, 538);
+            offset += sizeof(struct disk_exception);
+
+            de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
+            ASSERT_EQ(de->old_chunk, 23);
+            ASSERT_EQ(de->new_chunk, 539);
+            offset += sizeof(struct disk_exception);
+
+            // End of metadata
+            de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
+            ASSERT_EQ(de->old_chunk, 0);
+            ASSERT_EQ(de->new_chunk, 0);
+            offset += sizeof(struct disk_exception);
+        }
+    }
+}
+
+TEST(Snapuserd_Test, Snapshot_Metadata) {
+    CowSnapuserdMetadataTest harness;
+    harness.Setup();
+    harness.ValidateMetadata();
+}
+
+TEST(Snapuserd_Test, Snapshot_Metadata_Overlap) {
+    CowSnapuserdMetadataTest harness;
+    harness.SetupPartialArea();
+    harness.ValidatePartialFilledArea();
+}
+
 TEST(Snapuserd_Test, Snapshot_Merge_Resume) {
     CowSnapuserdTest harness;
     ASSERT_TRUE(harness.Setup());
@@ -457,7 +749,7 @@
     harness.Shutdown();
 }
 
-TEST(Snapuserd_Test, Snapshot) {
+TEST(Snapuserd_Test, Snapshot_IO_TEST) {
     CowSnapuserdTest harness;
     ASSERT_TRUE(harness.Setup());
     harness.ReadSnapshotDeviceAndValidate();
@@ -465,7 +757,6 @@
     harness.ValidateMerge();
     harness.Shutdown();
 }
-
 }  // namespace snapshot
 }  // namespace android
 

diff --git a/fs_mgr/libsnapshot/cow_writer.cpp b/fs_mgr/libsnapshot/cow_writer.cpp
index c1a5f32..81edc79 100644
--- a/fs_mgr/libsnapshot/cow_writer.cpp
+++ b/fs_mgr/libsnapshot/cow_writer.cpp

@@ -491,7 +491,7 @@
     return true;
 }
 
-bool CowWriter::CommitMerge(int merged_ops, bool sync) {
+bool CowWriter::CommitMerge(int merged_ops) {
     CHECK(merge_in_progress_);
     header_.num_merge_ops += merged_ops;
 
@@ -506,11 +506,7 @@
         return false;
     }
 
-    // Sync only for merging of copy operations.
-    if (sync) {
-        return Sync();
-    }
-    return true;
+    return Sync();
 }
 
 bool CowWriter::Truncate(off_t length) {

diff --git a/fs_mgr/libsnapshot/include/libsnapshot/cow_writer.h b/fs_mgr/libsnapshot/include/libsnapshot/cow_writer.h
index 22ddfa6..6ffd5d8 100644
--- a/fs_mgr/libsnapshot/include/libsnapshot/cow_writer.h
+++ b/fs_mgr/libsnapshot/include/libsnapshot/cow_writer.h

@@ -101,7 +101,7 @@
     bool InitializeAppend(android::base::borrowed_fd fd, uint64_t label);
 
     void InitializeMerge(android::base::borrowed_fd fd, CowHeader* header);
-    bool CommitMerge(int merged_ops, bool sync);
+    bool CommitMerge(int merged_ops);
 
     bool Finalize() override;
 

diff --git a/fs_mgr/libsnapshot/include/libsnapshot/snapuserd_kernel.h b/fs_mgr/libsnapshot/include/libsnapshot/snapuserd_kernel.h
index 7941e68..2b6c8ef 100644
--- a/fs_mgr/libsnapshot/include/libsnapshot/snapuserd_kernel.h
+++ b/fs_mgr/libsnapshot/include/libsnapshot/snapuserd_kernel.h

@@ -47,8 +47,8 @@
 static constexpr uint32_t CHUNK_SIZE = 8;
 static constexpr uint32_t CHUNK_SHIFT = (__builtin_ffs(CHUNK_SIZE) - 1);
 
-static constexpr uint32_t BLOCK_SIZE = 4096;
-static constexpr uint32_t BLOCK_SHIFT = (__builtin_ffs(BLOCK_SIZE) - 1);
+static constexpr uint32_t BLOCK_SZ = 4096;
+static constexpr uint32_t BLOCK_SHIFT = (__builtin_ffs(BLOCK_SZ) - 1);
 
 #define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
 

diff --git a/fs_mgr/libsnapshot/snapshot.cpp b/fs_mgr/libsnapshot/snapshot.cpp
index 3485474..eb3a501 100644
--- a/fs_mgr/libsnapshot/snapshot.cpp
+++ b/fs_mgr/libsnapshot/snapshot.cpp

@@ -1439,6 +1439,7 @@
         // Wait for ueventd to acknowledge and create the control device node.
         std::string control_device = "/dev/dm-user/" + misc_name;
         if (!WaitForDevice(control_device, 10s)) {
+            LOG(ERROR) << "dm-user control device no found:  " << misc_name;
             continue;
         }
 

diff --git a/fs_mgr/libsnapshot/snapuserd.cpp b/fs_mgr/libsnapshot/snapuserd.cpp
index 057acad..0a02bd0 100644
--- a/fs_mgr/libsnapshot/snapuserd.cpp
+++ b/fs_mgr/libsnapshot/snapuserd.cpp

@@ -17,6 +17,8 @@
 #include "snapuserd.h"
 
 #include <csignal>
+#include <optional>
+#include <set>
 
 #include <libsnapshot/snapuserd_client.h>
 
@@ -32,7 +34,7 @@
 
 static constexpr size_t PAYLOAD_SIZE = (1UL << 20);
 
-static_assert(PAYLOAD_SIZE >= BLOCK_SIZE);
+static_assert(PAYLOAD_SIZE >= BLOCK_SZ);
 
 void BufferSink::Initialize(size_t size) {
     buffer_size_ = size;
@@ -78,10 +80,10 @@
 // request will always be 4k. After constructing
 // the header, zero out the remaining block.
 void Snapuserd::ConstructKernelCowHeader() {
-    void* buffer = bufsink_.GetPayloadBuffer(BLOCK_SIZE);
+    void* buffer = bufsink_.GetPayloadBuffer(BLOCK_SZ);
     CHECK(buffer != nullptr);
 
-    memset(buffer, 0, BLOCK_SIZE);
+    memset(buffer, 0, BLOCK_SZ);
 
     struct disk_header* dh = reinterpret_cast<struct disk_header*>(buffer);
 
@@ -106,13 +108,13 @@
 // Start the copy operation. This will read the backing
 // block device which is represented by cow_op->source.
 bool Snapuserd::ProcessCopyOp(const CowOperation* cow_op) {
-    void* buffer = bufsink_.GetPayloadBuffer(BLOCK_SIZE);
+    void* buffer = bufsink_.GetPayloadBuffer(BLOCK_SZ);
     CHECK(buffer != nullptr);
 
     // Issue a single 4K IO. However, this can be optimized
     // if the successive blocks are contiguous.
-    if (!android::base::ReadFullyAtOffset(backing_store_fd_, buffer, BLOCK_SIZE,
-                                          cow_op->source * BLOCK_SIZE)) {
+    if (!android::base::ReadFullyAtOffset(backing_store_fd_, buffer, BLOCK_SZ,
+                                          cow_op->source * BLOCK_SZ)) {
         SNAP_PLOG(ERROR) << "Copy-op failed. Read from backing store: " << backing_store_device_
                          << "at block :" << cow_op->source;
         return false;
@@ -123,10 +125,10 @@
 
 bool Snapuserd::ProcessZeroOp() {
     // Zero out the entire block
-    void* buffer = bufsink_.GetPayloadBuffer(BLOCK_SIZE);
+    void* buffer = bufsink_.GetPayloadBuffer(BLOCK_SZ);
     CHECK(buffer != nullptr);
 
-    memset(buffer, 0, BLOCK_SIZE);
+    memset(buffer, 0, BLOCK_SZ);
     return true;
 }
 
@@ -173,11 +175,11 @@
         struct dm_user_message* msg = (struct dm_user_message*)(&(buffer[0]));
 
         memmove(msg->payload.buf, (char*)msg->payload.buf + skip_sector_size,
-                (BLOCK_SIZE - skip_sector_size));
+                (BLOCK_SZ - skip_sector_size));
     }
 
     bufsink_.ResetBufferOffset();
-    return std::min(size, (BLOCK_SIZE - skip_sector_size));
+    return std::min(size, (BLOCK_SZ - skip_sector_size));
 }
 
 /*
@@ -234,7 +236,7 @@
         return ReadUnalignedSector(sector, size, it);
     }
 
-    int num_ops = DIV_ROUND_UP(size, BLOCK_SIZE);
+    int num_ops = DIV_ROUND_UP(size, BLOCK_SZ);
     while (num_ops) {
         if (!ProcessCowOp(it->second)) {
             return -1;
@@ -242,7 +244,7 @@
         num_ops -= 1;
         it++;
         // Update the buffer offset
-        bufsink_.UpdateBufferOffset(BLOCK_SIZE);
+        bufsink_.UpdateBufferOffset(BLOCK_SZ);
 
         SNAP_LOG(DEBUG) << "ReadData at sector: " << sector << " size: " << size;
     }
@@ -344,7 +346,7 @@
 }
 
 int Snapuserd::GetNumberOfMergedOps(void* merged_buffer, void* unmerged_buffer, loff_t offset,
-                                    int unmerged_exceptions, bool* copy_op) {
+                                    int unmerged_exceptions) {
     int merged_ops_cur_iter = 0;
 
     // Find the operations which are merged in this cycle.
@@ -362,10 +364,8 @@
             offset += sizeof(struct disk_exception);
             const CowOperation* cow_op = chunk_map_[ChunkToSector(cow_de->new_chunk)];
             CHECK(cow_op != nullptr);
+
             CHECK(cow_op->new_block == cow_de->old_chunk);
-            if (cow_op->type == kCowCopyOp) {
-                *copy_op = true;
-            }
             // zero out to indicate that operation is merged.
             cow_de->old_chunk = 0;
             cow_de->new_chunk = 0;
@@ -389,10 +389,6 @@
             return -1;
         }
     }
-
-    if (*copy_op) {
-        CHECK(merged_ops_cur_iter == 1);
-    }
     return merged_ops_cur_iter;
 }
 
@@ -414,42 +410,15 @@
     int unmerged_exceptions = 0;
     loff_t offset = GetMergeStartOffset(buffer, vec_[divresult.quot].get(), &unmerged_exceptions);
 
-    bool copy_op = false;
-    // Check if the merged operation is a copy operation. If so, then we need
-    // to explicitly sync the metadata before initiating the next merge.
-    // For ex: Consider a following sequence of copy operations in the COW file:
-    //
-    // Op-1: Copy 2 -> 3
-    // Op-2: Copy 1 -> 2
-    // Op-3: Copy 5 -> 10
-    //
-    // Op-1 and Op-2 are overlapping copy operations. The merge sequence will
-    // look like:
-    //
-    // Merge op-1: Copy 2 -> 3
-    // Merge op-2: Copy 1 -> 2
-    // Merge op-3: Copy 5 -> 10
-    //
-    // Now, let's say we have a crash _after_ Merge op-2; Block 2 contents would
-    // have been over-written by Block-1 after merge op-2. During next reboot,
-    // kernel will request the metadata for all the un-merged blocks. If we had
-    // not sync the metadata after Merge-op 1 and Merge op-2, snapuser daemon
-    // will think that these merge operations are still pending and hence will
-    // inform the kernel that Op-1 and Op-2 are un-merged blocks. When kernel
-    // resumes back the merging process, it will attempt to redo the Merge op-1
-    // once again. However, block 2 contents are wrong as it has the contents
-    // of block 1 from previous merge cycle. Although, merge will silently succeed,
-    // this will lead to silent data corruption.
-    //
-    int merged_ops_cur_iter = GetNumberOfMergedOps(buffer, vec_[divresult.quot].get(), offset,
-                                                   unmerged_exceptions, &copy_op);
+    int merged_ops_cur_iter =
+            GetNumberOfMergedOps(buffer, vec_[divresult.quot].get(), offset, unmerged_exceptions);
 
     // There should be at least one operation merged in this cycle
     CHECK(merged_ops_cur_iter > 0);
 
     header.num_merge_ops += merged_ops_cur_iter;
     reader_->UpdateMergeProgress(merged_ops_cur_iter);
-    if (!writer_->CommitMerge(merged_ops_cur_iter, copy_op)) {
+    if (!writer_->CommitMerge(merged_ops_cur_iter)) {
         SNAP_LOG(ERROR) << "CommitMerge failed... merged_ops_cur_iter: " << merged_ops_cur_iter;
         return false;
     }
@@ -506,12 +475,13 @@
  *      during merge; specifically when the merge operation has dependency.
  *      These dependencies can only happen during copy operations.
  *
- *      To avoid this problem, we make sure that no two copy-operations
- *      do not have contiguous chunk IDs. Additionally, we make sure
- *      that each copy operation is merged individually.
+ *      To avoid this problem, we make sure overlap copy operations
+ *      are not batch merged.
  * 6: Use a monotonically increasing chunk number to assign the
  *    new_chunk
- * 7: Each chunk-id represents either a: Metadata page or b: Data page
+ * 7: Each chunk-id represents either
+ *        a: Metadata page or
+ *        b: Data page
  * 8: Chunk-id representing a data page is stored in a map.
  * 9: Chunk-id representing a metadata page is converted into a vector
  *    index. We store this in vector as kernel requests metadata during
@@ -531,8 +501,8 @@
     reader_ = std::make_unique<CowReader>();
     CowHeader header;
     CowOptions options;
-    bool prev_copy_op = false;
     bool metadata_found = false;
+    int replace_ops = 0, zero_ops = 0, copy_ops = 0;
 
     SNAP_LOG(DEBUG) << "ReadMetadata: Parsing cow file";
 
@@ -546,10 +516,10 @@
         return false;
     }
 
-    CHECK(header.block_size == BLOCK_SIZE);
+    CHECK(header.block_size == BLOCK_SZ);
 
-    SNAP_LOG(DEBUG) << "Merge-ops: " << header.num_merge_ops;
     reader_->InitializeMerge();
+    SNAP_LOG(DEBUG) << "Merge-ops: " << header.num_merge_ops;
 
     writer_ = std::make_unique<CowWriter>(options);
     writer_->InitializeMerge(cow_fd_.get(), &header);
@@ -584,17 +554,26 @@
         }
 
         metadata_found = true;
-        if ((cow_op->type == kCowCopyOp || prev_copy_op)) {
+        // This loop will handle all the replace and zero ops.
+        // We will handle the copy ops later as it requires special
+        // handling of assigning chunk-id's. Furthermore, we make
+        // sure that replace/zero and copy ops are not batch merged; hence,
+        // the bump in the chunk_id before break of this loop
+        if (cow_op->type == kCowCopyOp) {
             data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
+            break;
         }
 
-        prev_copy_op = (cow_op->type == kCowCopyOp);
+        if (cow_op->type == kCowReplaceOp) {
+            replace_ops++;
+        } else if (cow_op->type == kCowZeroOp) {
+            zero_ops++;
+        }
 
         // Construct the disk-exception
         de->old_chunk = cow_op->new_block;
         de->new_chunk = data_chunk_id;
 
-        SNAP_LOG(DEBUG) << "Old-chunk: " << de->old_chunk << "New-chunk: " << de->new_chunk;
 
         // Store operation pointer.
         chunk_map_[ChunkToSector(data_chunk_id)] = cow_op;
@@ -602,6 +581,9 @@
         offset += sizeof(struct disk_exception);
         cowop_riter_->Next();
 
+        SNAP_LOG(DEBUG) << num_ops << ":"
+                        << " Old-chunk: " << de->old_chunk << " New-chunk: " << de->new_chunk;
+
         if (num_ops == exceptions_per_area_) {
             // Store it in vector at the right index. This maps the chunk-id to
             // vector index.
@@ -616,13 +598,213 @@
 
             if (cowop_riter_->Done()) {
                 vec_.push_back(std::move(de_ptr));
-                SNAP_LOG(DEBUG) << "ReadMetadata() completed; Number of Areas: " << vec_.size();
             }
         }
 
         data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
     }
 
+    std::optional<chunk_t> prev_id = {};
+    std::map<uint64_t, const CowOperation*> map;
+    std::set<uint64_t> dest_blocks;
+    size_t pending_copy_ops = exceptions_per_area_ - num_ops;
+    SNAP_LOG(INFO) << " Processing copy-ops at Area: " << vec_.size()
+                   << " Number of replace/zero ops completed in this area: " << num_ops
+                   << " Pending copy ops for this area: " << pending_copy_ops;
+    while (!cowop_riter_->Done()) {
+        do {
+            const CowOperation* cow_op = &cowop_riter_->Get();
+            if (IsMetadataOp(*cow_op)) {
+                cowop_riter_->Next();
+                continue;
+            }
+
+            // We have two cases specific cases:
+            //
+            // =====================================================
+            // Case 1: Overlapping copy regions
+            //
+            // Ex:
+            //
+            // Source -> Destination
+            //
+            // 1: 15 -> 18
+            // 2: 16 -> 19
+            // 3: 17 -> 20
+            // 4: 18 -> 21
+            // 5: 19 -> 22
+            // 6: 20 -> 23
+            //
+            // We have 6 copy operations to be executed in OTA and there is a overlap. Update-engine
+            // will write to COW file as follows:
+            //
+            // Op-1: 20 -> 23
+            // Op-2: 19 -> 22
+            // Op-3: 18 -> 21
+            // Op-4: 17 -> 20
+            // Op-5: 16 -> 19
+            // Op-6: 15 -> 18
+            //
+            // Note that the blocks numbers are contiguous. Hence, all 6 copy
+            // operations can potentially be batch merged. However, that will be
+            // problematic if we have a crash as block 20, 19, 18 would have
+            // been overwritten and hence subsequent recovery may end up with
+            // a silent data corruption when op-1, op-2 and op-3 are
+            // re-executed.
+            //
+            // We will split these 6 operations into two batches viz:
+            //
+            // Batch-1:
+            // ===================
+            // Op-1: 20 -> 23
+            // Op-2: 19 -> 22
+            // Op-3: 18 -> 21
+            // ===================
+            //
+            // Batch-2:
+            // ==================
+            // Op-4: 17 -> 20
+            // Op-5: 16 -> 19
+            // Op-6: 15 -> 18
+            // ==================
+            //
+            // Now, merge sequence will look like:
+            //
+            // 1: Merge Batch-1 { op-1, op-2, op-3 }
+            // 2: Update Metadata in COW File that op-1, op-2, op-3 merge is
+            // done.
+            // 3: Merge Batch-2
+            // 4: Update Metadata in COW File that op-4, op-5, op-6 merge is
+            // done.
+            //
+            // Note, that the order of block operations are still the same.
+            // However, we have two batch merge operations. Any crash between
+            // either of this sequence should be safe as each of these
+            // batches are self-contained.
+            //
+            //===========================================================
+            //
+            // Case 2:
+            //
+            // Let's say we have three copy operations written to COW file
+            // in the following order:
+            //
+            // op-1: 15 -> 18
+            // op-2: 16 -> 19
+            // op-3: 17 -> 20
+            //
+            // As aforementioned, kernel will initiate merge in reverse order.
+            // Hence, we will read these ops in reverse order so that all these
+            // ops are exectued in the same order as requested. Thus, we will
+            // read the metadata in reverse order and for the kernel it will
+            // look like:
+            //
+            // op-3: 17 -> 20
+            // op-2: 16 -> 19
+            // op-1: 15 -> 18   <-- Merge starts here in the kernel
+            //
+            // Now, this is problematic as kernel cannot batch merge them.
+            //
+            // Merge sequence will look like:
+            //
+            // Merge-1: op-1: 15 -> 18
+            // Merge-2: op-2: 16 -> 19
+            // Merge-3: op-3: 17 -> 20
+            //
+            // We have three merge operations.
+            //
+            // Even though the blocks are contiguous, kernel can batch merge
+            // them if the blocks are in descending order. Update engine
+            // addresses this issue partially for overlapping operations as
+            // we see that op-1 to op-3 and op-4 to op-6 operatiosn are in
+            // descending order. However, if the copy operations are not
+            // overlapping, update engine cannot write these blocks
+            // in descending order. Hence, we will try to address it.
+            // Thus, we will send these blocks to the kernel and it will
+            // look like:
+            //
+            // op-3: 15 -> 18
+            // op-2: 16 -> 19
+            // op-1: 17 -> 20  <-- Merge starts here in the kernel
+            //
+            // Now with this change, we can batch merge all these three
+            // operations. Merge sequence will look like:
+            //
+            // Merge-1: {op-1: 17 -> 20, op-2: 16 -> 19, op-3: 15 -> 18}
+            //
+            // Note that we have changed the ordering of merge; However, this
+            // is ok as each of these copy operations are independent and there
+            // is no overlap.
+            //
+            //===================================================================
+            if (prev_id.has_value()) {
+                chunk_t diff = (cow_op->new_block > prev_id.value())
+                                       ? (cow_op->new_block - prev_id.value())
+                                       : (prev_id.value() - cow_op->new_block);
+                if (diff != 1) {
+                    break;
+                }
+                if (dest_blocks.count(cow_op->new_block) || map.count(cow_op->source) > 0) {
+                    break;
+                }
+            }
+            metadata_found = true;
+            pending_copy_ops -= 1;
+            map[cow_op->new_block] = cow_op;
+            dest_blocks.insert(cow_op->source);
+            prev_id = cow_op->new_block;
+            cowop_riter_->Next();
+        } while (!cowop_riter_->Done() && pending_copy_ops);
+
+        data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
+        SNAP_LOG(DEBUG) << "Batch Merge copy-ops of size: " << map.size()
+                        << " Area: " << vec_.size() << " Area offset: " << offset
+                        << " Pending-copy-ops in this area: " << pending_copy_ops;
+
+        for (auto it = map.begin(); it != map.end(); it++) {
+            struct disk_exception* de =
+                    reinterpret_cast<struct disk_exception*>((char*)de_ptr.get() + offset);
+            de->old_chunk = it->first;
+            de->new_chunk = data_chunk_id;
+
+            // Store operation pointer.
+            chunk_map_[ChunkToSector(data_chunk_id)] = it->second;
+            offset += sizeof(struct disk_exception);
+            num_ops += 1;
+            copy_ops++;
+
+            SNAP_LOG(DEBUG) << num_ops << ":"
+                            << " Copy-op: "
+                            << " Old-chunk: " << de->old_chunk << " New-chunk: " << de->new_chunk;
+
+            if (num_ops == exceptions_per_area_) {
+                // Store it in vector at the right index. This maps the chunk-id to
+                // vector index.
+                vec_.push_back(std::move(de_ptr));
+                num_ops = 0;
+                offset = 0;
+
+                // Create buffer for next area
+                de_ptr = std::make_unique<uint8_t[]>(exceptions_per_area_ *
+                                                     sizeof(struct disk_exception));
+                memset(de_ptr.get(), 0, (exceptions_per_area_ * sizeof(struct disk_exception)));
+
+                if (cowop_riter_->Done()) {
+                    vec_.push_back(std::move(de_ptr));
+                    SNAP_LOG(DEBUG) << "ReadMetadata() completed; Number of Areas: " << vec_.size();
+                }
+
+                CHECK(pending_copy_ops == 0);
+                pending_copy_ops = exceptions_per_area_;
+            }
+
+            data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
+        }
+        map.clear();
+        dest_blocks.clear();
+        prev_id.reset();
+    }
+
     // Partially filled area or there is no metadata
     // If there is no metadata, fill with zero so that kernel
     // is aware that merge is completed.
@@ -632,8 +814,11 @@
                         << "Areas : " << vec_.size();
     }
 
-    SNAP_LOG(DEBUG) << "ReadMetadata() completed. Final_chunk_id: " << data_chunk_id
-                    << "Num Sector: " << ChunkToSector(data_chunk_id);
+    SNAP_LOG(INFO) << "ReadMetadata completed. Final-chunk-id: " << data_chunk_id
+                   << " Num Sector: " << ChunkToSector(data_chunk_id)
+                   << " Replace-ops: " << replace_ops << " Zero-ops: " << zero_ops
+                   << " Copy-ops: " << copy_ops << " Areas: " << vec_.size()
+                   << " Num-ops-merged: " << header.num_merge_ops;
 
     // Total number of sectors required for creating dm-user device
     num_sectors_ = ChunkToSector(data_chunk_id);
@@ -742,7 +927,7 @@
 
     size_t remaining_size = header->len;
     size_t read_size = std::min(PAYLOAD_SIZE, remaining_size);
-    CHECK(read_size == BLOCK_SIZE);
+    CHECK(read_size == BLOCK_SZ);
 
     CHECK(header->sector > 0);
     chunk_t chunk = SectorToChunk(header->sector);
@@ -793,11 +978,11 @@
         // will always be a single 4k.
         if (header->sector == 0) {
             CHECK(metadata_read_done_ == true);
-            CHECK(read_size == BLOCK_SIZE);
+            CHECK(read_size == BLOCK_SZ);
             ConstructKernelCowHeader();
             SNAP_LOG(DEBUG) << "Kernel header constructed";
         } else {
-            if (!offset && (read_size == BLOCK_SIZE) &&
+            if (!offset && (read_size == BLOCK_SZ) &&
                 chunk_map_.find(header->sector) == chunk_map_.end()) {
                 if (!ReadDiskExceptions(chunk, read_size)) {
                     SNAP_LOG(ERROR) << "ReadDiskExceptions failed for chunk id: " << chunk

diff --git a/fs_mgr/libsnapshot/snapuserd.h b/fs_mgr/libsnapshot/snapuserd.h
index 4587881..dba3186 100644
--- a/fs_mgr/libsnapshot/snapuserd.h
+++ b/fs_mgr/libsnapshot/snapuserd.h

@@ -75,6 +75,8 @@
         cow_fd_ = {};
         backing_store_fd_ = {};
     }
+    size_t GetMetadataAreaSize() { return vec_.size(); }
+    void* GetExceptionBuffer(size_t i) { return vec_[i].get(); }
 
   private:
     bool DmuserReadRequest();
@@ -101,11 +103,11 @@
     loff_t GetMergeStartOffset(void* merged_buffer, void* unmerged_buffer,
                                int* unmerged_exceptions);
     int GetNumberOfMergedOps(void* merged_buffer, void* unmerged_buffer, loff_t offset,
-                             int unmerged_exceptions, bool* copy_op);
+                             int unmerged_exceptions);
     bool ProcessMergeComplete(chunk_t chunk, void* buffer);
     sector_t ChunkToSector(chunk_t chunk) { return chunk << CHUNK_SHIFT; }
     chunk_t SectorToChunk(sector_t sector) { return sector >> CHUNK_SHIFT; }
-    bool IsBlockAligned(int read_size) { return ((read_size & (BLOCK_SIZE - 1)) == 0); }
+    bool IsBlockAligned(int read_size) { return ((read_size & (BLOCK_SZ - 1)) == 0); }
 
     std::string cow_device_;
     std::string backing_store_device_;
commit	7cd1c031783dcc23e03593ee13eedddc07ef5c62	[log] [tgz]
author	Akilesh Kailash <akailash@google.com>	Wed Feb 10 05:08:48 2021 +0000
committer	Gerrit Code Review <noreply-gerritcodereview@google.com>	Wed Feb 10 05:08:48 2021 +0000
tree	e64303bee81b3b9a3718171e6dd9f0347f0444c8
parent	089383938531c3cfb8c0b3400c3e6bba617e6c5c [diff]
parent	66914e2b2e457b439b61bc0ef2e8823707b0299a [diff]