libsnapshot: Resume snapshot merge if snapshots are in second
phase
If the device reboots when SnapshotUpdateStatus switches from
first phase to second phase, then track the transition
and resume the merge.
Bug: 374225913
Test: OTA on Pixel - Verify merge resumes when device reboots just after
first phase merge
Change-Id: I5f62a03852a4b012850b11d0c1e6b96ec0556278
Signed-off-by: Akilesh Kailash <akailash@google.com>
diff --git a/fs_mgr/libsnapshot/include/libsnapshot/snapshot.h b/fs_mgr/libsnapshot/include/libsnapshot/snapshot.h
index 8ff41db..de20526 100644
--- a/fs_mgr/libsnapshot/include/libsnapshot/snapshot.h
+++ b/fs_mgr/libsnapshot/include/libsnapshot/snapshot.h
@@ -442,6 +442,7 @@
FRIEND_TEST(SnapshotUpdateTest, QueryStatusError);
FRIEND_TEST(SnapshotUpdateTest, SnapshotStatusFileWithoutCow);
FRIEND_TEST(SnapshotUpdateTest, SpaceSwapUpdate);
+ FRIEND_TEST(SnapshotUpdateTest, InterruptMergeDuringPhaseUpdate);
FRIEND_TEST(SnapshotUpdateTest, MapAllSnapshotsWithoutSlotSwitch);
friend class SnapshotTest;
friend class SnapshotUpdateTest;
diff --git a/fs_mgr/libsnapshot/snapshot.cpp b/fs_mgr/libsnapshot/snapshot.cpp
index 05dec68..acabd67 100644
--- a/fs_mgr/libsnapshot/snapshot.cpp
+++ b/fs_mgr/libsnapshot/snapshot.cpp
@@ -1343,10 +1343,25 @@
}
if (merge_status == "snapshot" &&
- DecideMergePhase(snapshot_status) == MergePhase::SECOND_PHASE &&
- update_status.merge_phase() == MergePhase::FIRST_PHASE) {
- // The snapshot is not being merged because it's in the wrong phase.
- return MergeResult(UpdateState::None);
+ DecideMergePhase(snapshot_status) == MergePhase::SECOND_PHASE) {
+ if (update_status.merge_phase() == MergePhase::FIRST_PHASE) {
+ // The snapshot is not being merged because it's in the wrong phase.
+ return MergeResult(UpdateState::None);
+ } else {
+ // update_status is already in second phase but the
+ // snapshot_status is still not set to SnapshotState::MERGING.
+ //
+ // Resume the merge at this point. see b/374225913
+ LOG(INFO) << "SwitchSnapshotToMerge: " << name << " after resuming merge";
+ auto code = SwitchSnapshotToMerge(lock, name);
+ if (code != MergeFailureCode::Ok) {
+ LOG(ERROR) << "Failed to switch snapshot: " << name
+ << " to merge during second phase";
+ return MergeResult(UpdateState::MergeFailed,
+ MergeFailureCode::UnknownTargetType);
+ }
+ return MergeResult(UpdateState::Merging);
+ }
}
if (merge_status == "snapshot-merge") {
@@ -1442,8 +1457,14 @@
return MergeFailureCode::WriteStatus;
}
+ auto current_slot_suffix = device_->GetSlotSuffix();
MergeFailureCode result = MergeFailureCode::Ok;
for (const auto& snapshot : snapshots) {
+ if (!android::base::EndsWith(snapshot, current_slot_suffix)) {
+ LOG(ERROR) << "Skipping invalid snapshot: " << snapshot
+ << " during MergeSecondPhaseSnapshots";
+ continue;
+ }
SnapshotStatus snapshot_status;
if (!ReadSnapshotStatus(lock, snapshot, &snapshot_status)) {
return MergeFailureCode::ReadStatus;
diff --git a/fs_mgr/libsnapshot/snapshot_test.cpp b/fs_mgr/libsnapshot/snapshot_test.cpp
index 46c3a35..3e5ae2a 100644
--- a/fs_mgr/libsnapshot/snapshot_test.cpp
+++ b/fs_mgr/libsnapshot/snapshot_test.cpp
@@ -1607,6 +1607,146 @@
}
}
+// Test that shrinking and growing partitions at the same time is handled
+// correctly in VABC.
+TEST_F(SnapshotUpdateTest, InterruptMergeDuringPhaseUpdate) {
+ if (!snapuserd_required_) {
+ // b/179111359
+ GTEST_SKIP() << "Skipping snapuserd test";
+ }
+
+ auto old_sys_size = GetSize(sys_);
+ auto old_prd_size = GetSize(prd_);
+
+ // Grow |sys| but shrink |prd|.
+ SetSize(sys_, old_sys_size * 2);
+ sys_->set_estimate_cow_size(8_MiB);
+ SetSize(prd_, old_prd_size / 2);
+ prd_->set_estimate_cow_size(1_MiB);
+
+ AddOperationForPartitions();
+
+ ASSERT_TRUE(sm->BeginUpdate());
+ ASSERT_TRUE(sm->CreateUpdateSnapshots(manifest_));
+
+ // Check that the old partition sizes were saved correctly.
+ {
+ ASSERT_TRUE(AcquireLock());
+ auto local_lock = std::move(lock_);
+
+ SnapshotStatus status;
+ ASSERT_TRUE(sm->ReadSnapshotStatus(local_lock.get(), "prd_b", &status));
+ ASSERT_EQ(status.old_partition_size(), 3145728);
+ ASSERT_TRUE(sm->ReadSnapshotStatus(local_lock.get(), "sys_b", &status));
+ ASSERT_EQ(status.old_partition_size(), 3145728);
+ }
+
+ ASSERT_TRUE(WriteSnapshotAndHash(sys_));
+ ASSERT_TRUE(WriteSnapshotAndHash(vnd_));
+ ASSERT_TRUE(ShiftAllSnapshotBlocks("prd_b", old_prd_size));
+
+ sync();
+
+ // Assert that source partitions aren't affected.
+ for (const auto& name : {"sys_a", "vnd_a", "prd_a"}) {
+ ASSERT_TRUE(IsPartitionUnchanged(name));
+ }
+
+ ASSERT_TRUE(sm->FinishedSnapshotWrites(false));
+
+ // Simulate shutting down the device.
+ ASSERT_TRUE(UnmapAll());
+
+ // After reboot, init does first stage mount.
+ auto init = NewManagerForFirstStageMount("_b");
+ ASSERT_NE(init, nullptr);
+ ASSERT_TRUE(init->NeedSnapshotsInFirstStageMount());
+ ASSERT_TRUE(init->CreateLogicalAndSnapshotPartitions("super", snapshot_timeout_));
+
+ // Check that the target partitions have the same content.
+ for (const auto& name : {"sys_b", "vnd_b", "prd_b"}) {
+ ASSERT_TRUE(IsPartitionUnchanged(name));
+ }
+
+ // Initiate the merge and wait for it to be completed.
+ if (ShouldSkipLegacyMerging()) {
+ LOG(INFO) << "Skipping legacy merge in test";
+ return;
+ }
+ ASSERT_TRUE(init->InitiateMerge());
+ ASSERT_EQ(init->IsSnapuserdRequired(), snapuserd_required_);
+ {
+ // Check that the merge phase is FIRST_PHASE until at least one call
+ // to ProcessUpdateState() occurs.
+ ASSERT_TRUE(AcquireLock());
+ auto local_lock = std::move(lock_);
+ auto status = init->ReadSnapshotUpdateStatus(local_lock.get());
+ ASSERT_EQ(status.merge_phase(), MergePhase::FIRST_PHASE);
+ }
+
+ // Wait until prd_b merge is completed which is part of first phase
+ std::chrono::milliseconds timeout(6000);
+ auto start = std::chrono::steady_clock::now();
+ // Keep polling until the merge is complete or timeout is reached
+ while (true) {
+ // Query the merge status
+ const auto merge_status = init->snapuserd_client()->QuerySnapshotStatus("prd_b");
+ if (merge_status == "snapshot-merge-complete") {
+ break;
+ }
+
+ auto now = std::chrono::steady_clock::now();
+ auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(now - start);
+
+ ASSERT_TRUE(elapsed < timeout);
+ // sleep for a second and allow merge to complete
+ std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+ }
+
+ // Now, forcefully update the snapshot-update status to SECOND PHASE
+ // This will not update the snapshot status of sys_b to MERGING
+ if (init->UpdateUsesUserSnapshots()) {
+ ASSERT_TRUE(AcquireLock());
+ auto local_lock = std::move(lock_);
+ auto status = init->ReadSnapshotUpdateStatus(local_lock.get());
+ status.set_merge_phase(MergePhase::SECOND_PHASE);
+ ASSERT_TRUE(init->WriteSnapshotUpdateStatus(local_lock.get(), status));
+ }
+
+ // Simulate shutting down the device and creating partitions again.
+ ASSERT_TRUE(UnmapAll());
+ ASSERT_TRUE(init->CreateLogicalAndSnapshotPartitions("super", snapshot_timeout_));
+
+ DeviceMapper::TargetInfo target;
+ ASSERT_TRUE(init->IsSnapshotDevice("prd_b", &target));
+
+ ASSERT_EQ(DeviceMapper::GetTargetType(target.spec), "user");
+ ASSERT_TRUE(init->IsSnapshotDevice("sys_b", &target));
+ ASSERT_EQ(DeviceMapper::GetTargetType(target.spec), "user");
+ ASSERT_TRUE(init->IsSnapshotDevice("vnd_b", &target));
+ ASSERT_EQ(DeviceMapper::GetTargetType(target.spec), "user");
+
+ // Complete the merge; "sys" and "vnd" should resume the merge
+ // even though merge was interrupted after update_status was updated to
+ // SECOND_PHASE
+ ASSERT_EQ(UpdateState::MergeCompleted, init->ProcessUpdateState());
+
+ // Make sure the second phase ran and deleted snapshots.
+ {
+ ASSERT_TRUE(AcquireLock());
+ auto local_lock = std::move(lock_);
+ std::vector<std::string> snapshots;
+ ASSERT_TRUE(init->ListSnapshots(local_lock.get(), &snapshots));
+ ASSERT_TRUE(snapshots.empty());
+ }
+
+ // Check that the target partitions have the same content after the merge.
+ for (const auto& name : {"sys_b", "vnd_b", "prd_b"}) {
+ ASSERT_TRUE(IsPartitionUnchanged(name))
+ << "Content of " << name << " changes after the merge";
+ }
+}
+
// Test that if new system partitions uses empty space in super, that region is not snapshotted.
TEST_F(SnapshotUpdateTest, DirectWriteEmptySpace) {
GTEST_SKIP() << "b/141889746";