Merge "Enhance checkpoint=disable GC threshold fallback mechanism"
diff --git a/fs_mgr/libsnapshot/cow_api_test.cpp b/fs_mgr/libsnapshot/cow_api_test.cpp
index 7f7e40a..ecfdefe 100644
--- a/fs_mgr/libsnapshot/cow_api_test.cpp
+++ b/fs_mgr/libsnapshot/cow_api_test.cpp
@@ -1013,6 +1013,29 @@
     ASSERT_TRUE(iter->Done());
 }
 
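+// A sequence op that references a block with no corresponding COW op must
+// cause Parse() to fail rather than yield a partial merge sequence.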
+TEST_F(CowTest, MissingSeqOp) {
+    CowOptions options;
+    CowWriter writer(options);
+    const int seq_len = 10;
+    uint32_t sequence[seq_len];
+    for (int i = 0; i < seq_len; i++) {
+        sequence[i] = i + 1;
+    }
+
+    ASSERT_TRUE(writer.Initialize(cow_->fd));
+
+    ASSERT_TRUE(writer.AddSequenceData(seq_len, sequence));
+    ASSERT_TRUE(writer.AddZeroBlocks(1, seq_len - 1));
+    ASSERT_TRUE(writer.Finalize());
+
+    ASSERT_EQ(lseek(cow_->fd, 0, SEEK_SET), 0);
+
+    CowReader reader;
+    ASSERT_FALSE(reader.Parse(cow_->fd));
+}
+
 TEST_F(CowTest, RevMergeOpItrTest) {
     CowOptions options;
     options.cluster_ops = 5;
diff --git a/fs_mgr/libsnapshot/cow_reader.cpp b/fs_mgr/libsnapshot/cow_reader.cpp
index af49c7d..ace6f59 100644
--- a/fs_mgr/libsnapshot/cow_reader.cpp
+++ b/fs_mgr/libsnapshot/cow_reader.cpp
@@ -413,6 +413,15 @@
         }
         block_map->insert({current_op.new_block, i});
     }
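+    // Validate the sequence data: every block it references must have a
+    // corresponding COW op, otherwise the COW file is malformed.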
+    for (auto block : *merge_op_blocks) {
+        if (block_map->count(block) == 0) {
+            LOG(ERROR) << "Invalid Sequence Ops. Could not find Cow Op for new block " << block;
+            return false;
+        }
+    }
+
     if (merge_op_blocks->size() > header_.num_merge_ops) {
         num_ordered_ops_to_merge_ = merge_op_blocks->size() - header_.num_merge_ops;
     } else {
diff --git a/fs_mgr/libsnapshot/snapuserd/cow_snapuserd_test.cpp b/fs_mgr/libsnapshot/snapuserd/cow_snapuserd_test.cpp
index fc7153a..a718328 100644
--- a/fs_mgr/libsnapshot/snapuserd/cow_snapuserd_test.cpp
+++ b/fs_mgr/libsnapshot/snapuserd/cow_snapuserd_test.cpp
@@ -96,6 +96,8 @@
 class CowSnapuserdTest final {
   public:
     bool Setup();
+    bool SetupOrderedOps();
+    bool SetupOrderedOpsInverted();
     bool SetupCopyOverlap_1();
     bool SetupCopyOverlap_2();
     bool Merge();
@@ -103,6 +105,8 @@
     void ReadSnapshotDeviceAndValidate();
     void Shutdown();
     void MergeInterrupt();
+    void MergeInterruptFixed(int duration);
+    void MergeInterruptRandomly(int max_duration);
     void ReadDmUserBlockWithoutDaemon();
 
     std::string snapshot_dev() const { return snapshot_dev_->path(); }
@@ -117,6 +121,8 @@
     void StartMerge();
 
     void CreateCowDevice();
+    void CreateCowDeviceOrderedOps();
+    void CreateCowDeviceOrderedOpsInverted();
     void CreateCowDeviceWithCopyOverlap_1();
     void CreateCowDeviceWithCopyOverlap_2();
     bool SetupDaemon();
@@ -197,6 +203,18 @@
     return setup_ok_;
 }
 
+bool CowSnapuserdTest::SetupOrderedOps() {
+    CreateBaseDevice();
+    CreateCowDeviceOrderedOps();
+    return SetupDaemon();
+}
+
+bool CowSnapuserdTest::SetupOrderedOpsInverted() {
+    CreateBaseDevice();
+    CreateCowDeviceOrderedOpsInverted();
+    return SetupDaemon();
+}
+
 bool CowSnapuserdTest::SetupCopyOverlap_1() {
     CreateBaseDevice();
     CreateCowDeviceWithCopyOverlap_1();
@@ -383,6 +401,116 @@
               true);
 }
 
+void CowSnapuserdTest::CreateCowDeviceOrderedOpsInverted() {
+    unique_fd rnd_fd;
+    loff_t offset = 0;
+
+    std::string path = android::base::GetExecutableDirectory();
+    cow_system_ = std::make_unique<TemporaryFile>(path);
+
+    rnd_fd.reset(open("/dev/random", O_RDONLY));
+    ASSERT_TRUE(rnd_fd > 0);
+
+    std::unique_ptr<uint8_t[]> random_buffer_1_ = std::make_unique<uint8_t[]>(size_);
+
+    // Fill random data
+    for (size_t j = 0; j < (size_ / 1_MiB); j++) {
+        ASSERT_EQ(ReadFullyAtOffset(rnd_fd, (char*)random_buffer_1_.get() + offset, 1_MiB, 0),
+                  true);
+
+        offset += 1_MiB;
+    }
+
+    CowOptions options;
+    options.compression = "gz";
+    CowWriter writer(options);
+
+    ASSERT_TRUE(writer.Initialize(cow_system_->fd));
+
+    size_t num_blocks = size_ / options.block_size;
+    size_t blk_end_copy = num_blocks * 2;
+    size_t source_blk = num_blocks - 1;
+    size_t blk_src_copy = blk_end_copy - 1;
+
+    size_t x = num_blocks;
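+    // Write the copy ops with destination (new) blocks in descending order,
+    // from (num_blocks - 1) <- (2 * num_blocks - 1) down to 0 <- num_blocks.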
+    while (1) {
+        ASSERT_TRUE(writer.AddCopy(source_blk, blk_src_copy));
+        x -= 1;
+        if (x == 0) {
+            break;
+        }
+        source_blk -= 1;
+        blk_src_copy -= 1;
+    }
+
+    // Flush operations
+    ASSERT_TRUE(writer.Finalize());
+    // Construct the buffer required for validation
+    orig_buffer_ = std::make_unique<uint8_t[]>(total_base_size_);
+    // Read the entire base device
+    ASSERT_EQ(android::base::ReadFullyAtOffset(base_fd_, orig_buffer_.get(), total_base_size_, 0),
+              true);
+    // Expected post-merge buffer: the second half of the base copied over the first half.
+    memmove(orig_buffer_.get(), (char*)orig_buffer_.get() + size_, size_);
+}
+
+void CowSnapuserdTest::CreateCowDeviceOrderedOps() {
+    unique_fd rnd_fd;
+    loff_t offset = 0;
+
+    std::string path = android::base::GetExecutableDirectory();
+    cow_system_ = std::make_unique<TemporaryFile>(path);
+
+    rnd_fd.reset(open("/dev/random", O_RDONLY));
+    ASSERT_TRUE(rnd_fd > 0);
+
+    std::unique_ptr<uint8_t[]> random_buffer_1_ = std::make_unique<uint8_t[]>(size_);
+
+    // Fill random data
+    for (size_t j = 0; j < (size_ / 1_MiB); j++) {
+        ASSERT_EQ(ReadFullyAtOffset(rnd_fd, (char*)random_buffer_1_.get() + offset, 1_MiB, 0),
+                  true);
+
+        offset += 1_MiB;
+    }
+
+    CowOptions options;
+    options.compression = "gz";
+    CowWriter writer(options);
+
+    ASSERT_TRUE(writer.Initialize(cow_system_->fd));
+
+    size_t num_blocks = size_ / options.block_size;
+    size_t x = num_blocks;
+    size_t source_blk = 0;
+    size_t blk_src_copy = num_blocks;
+
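+    // Write the copy ops with destination (new) blocks in ascending order,
+    // from 0 <- num_blocks up to (num_blocks - 1) <- (2 * num_blocks - 1).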
+    while (1) {
+        ASSERT_TRUE(writer.AddCopy(source_blk, blk_src_copy));
+
+        x -= 1;
+        if (x == 0) {
+            break;
+        }
+        source_blk += 1;
+        blk_src_copy += 1;
+    }
+
+    // Flush operations
+    ASSERT_TRUE(writer.Finalize());
+    // Construct the buffer required for validation
+    orig_buffer_ = std::make_unique<uint8_t[]>(total_base_size_);
+    // Read the entire base device
+    ASSERT_EQ(android::base::ReadFullyAtOffset(base_fd_, orig_buffer_.get(), total_base_size_, 0),
+              true);
+    // Expected post-merge buffer: the second half of the base copied over the first half.
+    memmove(orig_buffer_.get(), (char*)orig_buffer_.get() + size_, size_);
+}
+
 void CowSnapuserdTest::CreateCowDevice() {
     unique_fd rnd_fd;
     loff_t offset = 0;
@@ -597,6 +721,9 @@
 
 void CowSnapuserdTest::SimulateDaemonRestart() {
     Shutdown();
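+    // Give the shutdown time to settle so that the restart below does not
+    // race the daemon teardown.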
+    std::this_thread::sleep_for(500ms);
     SetDeviceControlName();
     StartSnapuserdDaemon();
     InitCowDevice();
@@ -605,6 +730,38 @@
     CreateSnapshotDevice();
 }
 
+void CowSnapuserdTest::MergeInterruptRandomly(int max_duration) {
+    std::srand(std::time(nullptr));
+    StartMerge();
+
+    for (int i = 0; i < 20; i++) {
+        int duration = std::rand() % max_duration;
+        std::this_thread::sleep_for(std::chrono::milliseconds(duration));
+        SimulateDaemonRestart();
+        StartMerge();
+    }
+
+    SimulateDaemonRestart();
+    ASSERT_TRUE(Merge());
+}
+
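+// Interrupt an in-flight merge 25 times at a fixed interval, then verify
+// that the resumed merge completes.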
+void CowSnapuserdTest::MergeInterruptFixed(int duration) {
+    StartMerge();
+
+    for (int i = 0; i < 25; i++) {
+        std::this_thread::sleep_for(std::chrono::milliseconds(duration));
+        SimulateDaemonRestart();
+        StartMerge();
+    }
+
+    SimulateDaemonRestart();
+    ASSERT_TRUE(Merge());
+}
+
 void CowSnapuserdTest::MergeInterrupt() {
     // Interrupt merge at various intervals
     StartMerge();
@@ -669,10 +822,11 @@
     void* buffer = snapuserd_->GetExceptionBuffer(1);
     loff_t offset = 0;
     struct disk_exception* de;
-    for (int i = 0; i < 12; i++) {
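+    // The daemon now preserves the COW-file order of copy ops, so the
+    // exception entries carry old_chunk in descending order.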
+    for (int i = 11; i >= 0; i--) {
         de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
         ASSERT_EQ(de->old_chunk, i);
-        ASSERT_EQ(de->new_chunk, new_chunk);
         offset += sizeof(struct disk_exception);
         new_chunk += 1;
     }
@@ -811,71 +963,71 @@
 
             de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
             ASSERT_EQ(de->old_chunk, 100);
-            ASSERT_EQ(de->new_chunk, 522);
+            ASSERT_EQ(de->new_chunk, 521);
             offset += sizeof(struct disk_exception);
 
             de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
             ASSERT_EQ(de->old_chunk, 105);
-            ASSERT_EQ(de->new_chunk, 524);
+            ASSERT_EQ(de->new_chunk, 522);
             offset += sizeof(struct disk_exception);
 
             de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
             ASSERT_EQ(de->old_chunk, 110);
-            ASSERT_EQ(de->new_chunk, 526);
+            ASSERT_EQ(de->new_chunk, 523);
             offset += sizeof(struct disk_exception);
 
             // The next 4 operations are batch merged as
             // both old and new chunk are contiguous
             de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
-            ASSERT_EQ(de->old_chunk, 50);
-            ASSERT_EQ(de->new_chunk, 528);
-            offset += sizeof(struct disk_exception);
-
-            de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
-            ASSERT_EQ(de->old_chunk, 51);
-            ASSERT_EQ(de->new_chunk, 529);
+            ASSERT_EQ(de->old_chunk, 53);
+            ASSERT_EQ(de->new_chunk, 524);
             offset += sizeof(struct disk_exception);
 
             de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
             ASSERT_EQ(de->old_chunk, 52);
-            ASSERT_EQ(de->new_chunk, 530);
+            ASSERT_EQ(de->new_chunk, 525);
             offset += sizeof(struct disk_exception);
 
             de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
-            ASSERT_EQ(de->old_chunk, 53);
-            ASSERT_EQ(de->new_chunk, 531);
+            ASSERT_EQ(de->old_chunk, 51);
+            ASSERT_EQ(de->new_chunk, 526);
+            offset += sizeof(struct disk_exception);
+
+            de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
+            ASSERT_EQ(de->old_chunk, 50);
+            ASSERT_EQ(de->new_chunk, 527);
             offset += sizeof(struct disk_exception);
 
             // This is handling overlap operation with
             // two batch merge operations.
             de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
             ASSERT_EQ(de->old_chunk, 18);
-            ASSERT_EQ(de->new_chunk, 533);
+            ASSERT_EQ(de->new_chunk, 528);
             offset += sizeof(struct disk_exception);
 
             de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
             ASSERT_EQ(de->old_chunk, 19);
-            ASSERT_EQ(de->new_chunk, 534);
+            ASSERT_EQ(de->new_chunk, 529);
             offset += sizeof(struct disk_exception);
 
             de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
             ASSERT_EQ(de->old_chunk, 20);
-            ASSERT_EQ(de->new_chunk, 535);
+            ASSERT_EQ(de->new_chunk, 530);
             offset += sizeof(struct disk_exception);
 
             de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
             ASSERT_EQ(de->old_chunk, 21);
-            ASSERT_EQ(de->new_chunk, 537);
+            ASSERT_EQ(de->new_chunk, 532);
             offset += sizeof(struct disk_exception);
 
             de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
             ASSERT_EQ(de->old_chunk, 22);
-            ASSERT_EQ(de->new_chunk, 538);
+            ASSERT_EQ(de->new_chunk, 533);
             offset += sizeof(struct disk_exception);
 
             de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
             ASSERT_EQ(de->old_chunk, 23);
-            ASSERT_EQ(de->new_chunk, 539);
+            ASSERT_EQ(de->new_chunk, 534);
             offset += sizeof(struct disk_exception);
 
             // End of metadata
@@ -945,6 +1097,38 @@
     harness.ReadDmUserBlockWithoutDaemon();
 }
 
+TEST(Snapuserd_Test, Snapshot_Merge_Crash_Fixed_Ordered) {
+    CowSnapuserdTest harness;
+    ASSERT_TRUE(harness.SetupOrderedOps());
+    harness.MergeInterruptFixed(300);
+    harness.ValidateMerge();
+    harness.Shutdown();
+}
+
+TEST(Snapuserd_Test, Snapshot_Merge_Crash_Random_Ordered) {
+    CowSnapuserdTest harness;
+    ASSERT_TRUE(harness.SetupOrderedOps());
+    harness.MergeInterruptRandomly(500);
+    harness.ValidateMerge();
+    harness.Shutdown();
+}
+
+TEST(Snapuserd_Test, Snapshot_Merge_Crash_Fixed_Inverted) {
+    CowSnapuserdTest harness;
+    ASSERT_TRUE(harness.SetupOrderedOpsInverted());
+    harness.MergeInterruptFixed(50);
+    harness.ValidateMerge();
+    harness.Shutdown();
+}
+
+TEST(Snapuserd_Test, Snapshot_Merge_Crash_Random_Inverted) {
+    CowSnapuserdTest harness;
+    ASSERT_TRUE(harness.SetupOrderedOpsInverted());
+    harness.MergeInterruptRandomly(50);
+    harness.ValidateMerge();
+    harness.Shutdown();
+}
+
 }  // namespace snapshot
 }  // namespace android
 
diff --git a/fs_mgr/libsnapshot/snapuserd/snapuserd.cpp b/fs_mgr/libsnapshot/snapuserd/snapuserd.cpp
index ad89b24..31d0221 100644
--- a/fs_mgr/libsnapshot/snapuserd/snapuserd.cpp
+++ b/fs_mgr/libsnapshot/snapuserd/snapuserd.cpp
@@ -436,8 +436,11 @@
 
     int num_ra_ops_per_iter = ((GetBufferDataSize()) / BLOCK_SZ);
     std::optional<chunk_t> prev_id = {};
-    std::map<uint64_t, const CowOperation*> map;
+    std::vector<const CowOperation*> vec;
     std::set<uint64_t> dest_blocks;
+    std::set<uint64_t> source_blocks;
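+    // vec preserves the COW-file order of the batched copy ops; the two sets
+    // track blocks already seen in the current batch for overlap detection.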
     size_t pending_copy_ops = exceptions_per_area_ - num_ops;
     uint64_t total_copy_ops = reader_->get_num_ordered_ops_to_merge();
 
@@ -491,99 +492,48 @@
             // scratch space and re-construct it thereby there
             // is no loss of data.
             //
+            // Note that we follow the same order of COW operations
+            // as present in the COW file. This ensures that the
+            // operations are merged in the same order in which
+            // they appear in the file.
             //===========================================================
-            //
-            // Case 2:
-            //
-            // Let's say we have three copy operations written to COW file
-            // in the following order:
-            //
-            // op-1: 15 -> 18
-            // op-2: 16 -> 19
-            // op-3: 17 -> 20
-            //
-            // As aforementioned, kernel will initiate merge in reverse order.
-            // Hence, we will read these ops in reverse order so that all these
-            // ops are exectued in the same order as requested. Thus, we will
-            // read the metadata in reverse order and for the kernel it will
-            // look like:
-            //
-            // op-3: 17 -> 20
-            // op-2: 16 -> 19
-            // op-1: 15 -> 18   <-- Merge starts here in the kernel
-            //
-            // Now, this is problematic as kernel cannot batch merge them.
-            //
-            // Merge sequence will look like:
-            //
-            // Merge-1: op-1: 15 -> 18
-            // Merge-2: op-2: 16 -> 19
-            // Merge-3: op-3: 17 -> 20
-            //
-            // We have three merge operations.
-            //
-            // Even though the blocks are contiguous, kernel can batch merge
-            // them if the blocks are in descending order. Update engine
-            // addresses this issue partially for overlapping operations as
-            // we see that op-1 to op-3 and op-4 to op-6 operatiosn are in
-            // descending order. However, if the copy operations are not
-            // overlapping, update engine cannot write these blocks
-            // in descending order. Hence, we will try to address it.
-            // Thus, we will send these blocks to the kernel and it will
-            // look like:
-            //
-            // op-3: 15 -> 18
-            // op-2: 16 -> 19
-            // op-1: 17 -> 20  <-- Merge starts here in the kernel
-            //
-            // Now with this change, we can batch merge all these three
-            // operations. Merge sequence will look like:
-            //
-            // Merge-1: {op-1: 17 -> 20, op-2: 16 -> 19, op-3: 15 -> 18}
-            //
-            // Note that we have changed the ordering of merge; However, this
-            // is ok as each of these copy operations are independent and there
-            // is no overlap.
-            //
-            //===================================================================
             if (prev_id.has_value()) {
-                chunk_t diff = (cow_op->new_block > prev_id.value())
-                                       ? (cow_op->new_block - prev_id.value())
-                                       : (prev_id.value() - cow_op->new_block);
-                if (diff != 1) {
-                    break;
-                }
-
-                if (dest_blocks.count(cow_op->new_block) || map.count(cow_op->source) > 0) {
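+                // Stop batching when this op overlaps an earlier op in the
+                // batch: its destination matches a prior source, or its
+                // source matches a prior destination.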
+                if (dest_blocks.count(cow_op->new_block) || source_blocks.count(cow_op->source)) {
                     break;
                 }
             }
             metadata_found = true;
             pending_copy_ops -= 1;
-            map[cow_op->new_block] = cow_op;
+            vec.push_back(cow_op);
             dest_blocks.insert(cow_op->source);
+            source_blocks.insert(cow_op->new_block);
             prev_id = cow_op->new_block;
             cowop_rm_iter->Next();
         } while (!cowop_rm_iter->Done() && pending_copy_ops);
 
         data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
-        SNAP_LOG(DEBUG) << "Batch Merge copy-ops of size: " << map.size()
+        SNAP_LOG(DEBUG) << "Batch Merge copy-ops of size: " << vec.size()
                         << " Area: " << vec_.size() << " Area offset: " << offset
                         << " Pending-copy-ops in this area: " << pending_copy_ops;
 
-        for (auto it = map.begin(); it != map.end(); it++) {
+        for (size_t i = 0; i < vec.size(); i++) {
             struct disk_exception* de =
                     reinterpret_cast<struct disk_exception*>((char*)de_ptr.get() + offset);
-            de->old_chunk = it->first;
+            const CowOperation* cow_op = vec[i];
+
+            de->old_chunk = cow_op->new_block;
             de->new_chunk = data_chunk_id;
 
             // Store operation pointer.
-            chunk_vec_.push_back(std::make_pair(ChunkToSector(data_chunk_id), it->second));
+            chunk_vec_.push_back(std::make_pair(ChunkToSector(data_chunk_id), cow_op));
             offset += sizeof(struct disk_exception);
             num_ops += 1;
             copy_ops++;
             if (read_ahead_feature_) {
-                read_ahead_ops_.push_back(it->second);
+                read_ahead_ops_.push_back(cow_op);
             }
 
             SNAP_LOG(DEBUG) << num_ops << ":"
@@ -626,8 +573,9 @@
                 data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
             }
         }
-        map.clear();
+        vec.clear();
         dest_blocks.clear();
+        source_blocks.clear();
         prev_id.reset();
     }
 
diff --git a/init/Android.bp b/init/Android.bp
index 3e8d4e3..a04d2db 100644
--- a/init/Android.bp
+++ b/init/Android.bp
@@ -228,17 +228,19 @@
     stem: "init",
     defaults: ["init_defaults"],
     static_libs: ["libinit"],
-    required: [
-        "e2fsdroid",
-        "init.rc",
-        "mke2fs",
-        "sload_f2fs",
-        "make_f2fs",
-        "ueventd.rc",
-    ],
     srcs: ["main.cpp"],
     symlinks: ["ueventd"],
     target: {
+        platform: {
+            required: [
+                "init.rc",
+                "ueventd.rc",
+                "e2fsdroid",
+                "make_f2fs",
+                "mke2fs",
+                "sload_f2fs",
+            ],
+        },
         recovery: {
             cflags: ["-DRECOVERY"],
             exclude_static_libs: [
@@ -248,6 +250,14 @@
                 "libbinder",
                 "libutils",
             ],
+            required: [
+                "init_recovery.rc",
+                "ueventd.rc.recovery",
+                "e2fsdroid.recovery",
+                "make_f2fs.recovery",
+                "mke2fs.recovery",
+                "sload_f2fs.recovery",
+            ],
         },
     },
     visibility: ["//packages/modules/Virtualization/microdroid"],
diff --git a/init/mount_namespace.cpp b/init/mount_namespace.cpp
index 2a57808..575cae9 100644
--- a/init/mount_namespace.cpp
+++ b/init/mount_namespace.cpp
@@ -82,6 +82,22 @@
     return updatable;
 }
 
+static bool IsMicrodroid() {
+    static bool is_microdroid = android::base::GetProperty("ro.hardware", "") == "microdroid";
+    return is_microdroid;
+}
+
+// In case we have two sets of APEXes (non-updatable, updatable), we need two separate mount
+// namespaces.
+static bool NeedsTwoMountNamespaces() {
+    if (!IsApexUpdatable()) return false;
+    if (IsRecoveryMode()) return false;
+    // In microdroid, there's only one set of APEXes in built-in directories,
+    // including block devices.
+    if (IsMicrodroid()) return false;
+    return true;
+}
+
 #ifdef ACTIVATE_FLATTENED_APEX
 
 static Result<void> MountDir(const std::string& path, const std::string& mount_path) {
@@ -260,7 +275,7 @@
     // number of essential APEXes (e.g. com.android.runtime) are activated.
     // In the namespace for post-apexd processes, all APEXes are activated.
     bool success = true;
-    if (IsApexUpdatable() && !IsRecoveryMode()) {
+    if (NeedsTwoMountNamespaces()) {
         // Creating a new namespace by cloning, saving, and switching back to
         // the original namespace.
         if (unshare(CLONE_NEWNS) == -1) {