Perform a consistency check before deleting snapshots.
If for some reason the COW state is not fully synced to disk, but
dm-snapshot has flushed its pending merges, we do not want to delete
snapshots. Doing so could potentially leave blocks unmerged.
This situation is quite unexpected, so we label it as a merge failure.
The device can recover by completely syncing the COW state, and then
rebooting, which will attempt to make forward progress on the merge.
Bug: 190582627
Test: vts_libsnapshot_test
full OTA on bramble
incremental OTA on bramble
Change-Id: Ib887f1d9e4397a712ed2f800cc1222cf9305a039
diff --git a/fs_mgr/libsnapshot/android/snapshot/snapshot.proto b/fs_mgr/libsnapshot/android/snapshot/snapshot.proto
index 77ed92c..c0649ca 100644
--- a/fs_mgr/libsnapshot/android/snapshot/snapshot.proto
+++ b/fs_mgr/libsnapshot/android/snapshot/snapshot.proto
@@ -158,6 +158,13 @@
ExpectedMergeTarget = 11;
UnmergedSectorsAfterCompletion = 12;
UnexpectedMergeState = 13;
+ GetCowPathConsistencyCheck = 14;
+ OpenCowConsistencyCheck = 15;
+ ParseCowConsistencyCheck = 16;
+ OpenCowDirectConsistencyCheck = 17;
+ MemAlignConsistencyCheck = 18;
+ DirectReadConsistencyCheck = 19;
+ WrongMergeCountConsistencyCheck = 20;
};
// Next: 8
diff --git a/fs_mgr/libsnapshot/include/libsnapshot/cow_reader.h b/fs_mgr/libsnapshot/include/libsnapshot/cow_reader.h
index 9ebcfd9..669e58a 100644
--- a/fs_mgr/libsnapshot/include/libsnapshot/cow_reader.h
+++ b/fs_mgr/libsnapshot/include/libsnapshot/cow_reader.h
@@ -143,12 +143,11 @@
void InitializeMerge();
+ // Number of copy, replace, and zero ops. Set if InitializeMerge is called.
void set_total_data_ops(uint64_t size) { total_data_ops_ = size; }
-
uint64_t total_data_ops() { return total_data_ops_; }
-
+ // Number of copy ops. Set if InitializeMerge is called.
void set_copy_ops(uint64_t size) { copy_ops_ = size; }
-
uint64_t total_copy_ops() { return copy_ops_; }
void CloseCowFd() { owned_fd_ = {}; }
diff --git a/fs_mgr/libsnapshot/include/libsnapshot/snapshot.h b/fs_mgr/libsnapshot/include/libsnapshot/snapshot.h
index 603e896..65034f7 100644
--- a/fs_mgr/libsnapshot/include/libsnapshot/snapshot.h
+++ b/fs_mgr/libsnapshot/include/libsnapshot/snapshot.h
@@ -603,6 +603,8 @@
MergeResult CheckMergeState(LockedFile* lock, const std::function<bool()>& before_cancel);
MergeResult CheckTargetMergeState(LockedFile* lock, const std::string& name,
const SnapshotUpdateStatus& update_status);
+    // Checks, before a merged snapshot is deleted, that the merge count
+    // recorded in the snapshot's COW header matches the number of
+    // non-metadata ops in that COW. Returns MergeFailureCode::Ok when the
+    // counts agree (or when the snapshot does not use compression), and a
+    // *ConsistencyCheck failure code otherwise. |lock| must be non-null.
+    MergeFailureCode CheckMergeConsistency(LockedFile* lock, const std::string& name,
+                                           const SnapshotStatus& status);
// Interact with status files under /metadata/ota/snapshots.
bool WriteSnapshotStatus(LockedFile* lock, const SnapshotStatus& status);
diff --git a/fs_mgr/libsnapshot/snapshot.cpp b/fs_mgr/libsnapshot/snapshot.cpp
index e2c03ae..be732ec 100644
--- a/fs_mgr/libsnapshot/snapshot.cpp
+++ b/fs_mgr/libsnapshot/snapshot.cpp
@@ -1126,6 +1126,11 @@
return MergeResult(UpdateState::Merging);
}
+ auto code = CheckMergeConsistency(lock, name, snapshot_status);
+ if (code != MergeFailureCode::Ok) {
+ return MergeResult(UpdateState::MergeFailed, code);
+ }
+
// Merging is done. First, update the status file to indicate the merge
// is complete. We do this before calling OnSnapshotMergeComplete, even
// though this means the write is potentially wasted work (since in the
@@ -1144,6 +1149,91 @@
return MergeResult(UpdateState::MergeCompleted, MergeFailureCode::Ok);
}
+// Picks the device-mapper name of the device backing the snapshot's COW.
+// This is the backing device itself, not the dm-user layer.
+static std::string GetMappedCowDeviceName(const std::string& snapshot,
+                                          const SnapshotStatus& status) {
+    // A COW partition was created, so the COW device carries the COW name.
+    if (status.cow_partition_size() != 0) {
+        return GetCowName(snapshot);
+    }
+    // No partition was created (the COW exists entirely on /data); the
+    // device-mapper layering is different and the COW image device is used.
+    return GetCowImageDeviceName(snapshot);
+}
+
+// Verifies that the number of merged ops recorded in the COW header matches
+// the number of non-metadata ops present in the COW, so that snapshots are
+// not deleted while blocks could still be unmerged.
+//
+// |lock| must be non-null. |name| is the snapshot name and |status| is its
+// status record. Returns MergeFailureCode::Ok if the counts match or if the
+// snapshot does not have compression enabled; otherwise returns the
+// *ConsistencyCheck code identifying the step that failed.
+MergeFailureCode SnapshotManager::CheckMergeConsistency(LockedFile* lock, const std::string& name,
+                                                        const SnapshotStatus& status) {
+    CHECK(lock);
+
+    if (!status.compression_enabled()) {
+        // Do not try to verify old-style COWs yet.
+        return MergeFailureCode::Ok;
+    }
+
+    auto& dm = DeviceMapper::Instance();
+
+    std::string cow_image_name = GetMappedCowDeviceName(name, status);
+    std::string cow_image_path;
+    if (!dm.GetDmDevicePathByName(cow_image_name, &cow_image_path)) {
+        LOG(ERROR) << "Failed to get path for cow device: " << cow_image_name;
+        return MergeFailureCode::GetCowPathConsistencyCheck;
+    }
+
+    // First pass, count # of ops.
+    size_t num_ops = 0;
+    {
+        unique_fd fd(open(cow_image_path.c_str(), O_RDONLY | O_CLOEXEC));
+        if (fd < 0) {
+            PLOG(ERROR) << "Failed to open " << cow_image_name;
+            return MergeFailureCode::OpenCowConsistencyCheck;
+        }
+
+        CowReader reader;
+        if (!reader.Parse(std::move(fd))) {
+            LOG(ERROR) << "Failed to parse cow " << cow_image_path;
+            return MergeFailureCode::ParseCowConsistencyCheck;
+        }
+
+        // Only non-metadata ops count towards the expected merge total.
+        for (auto iter = reader.GetOpIter(); !iter->Done(); iter->Next()) {
+            if (!IsMetadataOp(iter->Get())) {
+                num_ops++;
+            }
+        }
+    }
+
+    // Second pass, try as hard as we can to get the actual number of blocks
+    // the system thinks is merged.
+    unique_fd fd(open(cow_image_path.c_str(), O_RDONLY | O_DIRECT | O_SYNC | O_CLOEXEC));
+    if (fd < 0) {
+        PLOG(ERROR) << "Failed to open direct " << cow_image_name;
+        return MergeFailureCode::OpenCowDirectConsistencyCheck;
+    }
+
+    // The descriptor is opened with O_DIRECT, so read into a page-aligned,
+    // page-sized buffer.
+    void* addr = nullptr;
+    size_t page_size = getpagesize();
+    // posix_memalign reports failure through a non-zero (positive) return
+    // value and does not set errno, so test against zero and log the
+    // returned code rather than relying on PLOG.
+    int memalign_rc = posix_memalign(&addr, page_size, page_size);
+    if (memalign_rc != 0) {
+        LOG(ERROR) << "posix_memalign with page size " << page_size
+                   << " failed with error " << memalign_rc;
+        return MergeFailureCode::MemAlignConsistencyCheck;
+    }
+    std::unique_ptr<void, decltype(&::free)> buffer(addr, ::free);
+
+    // COWs are always at least 2MB; this is guaranteed in snapshot creation.
+    if (!android::base::ReadFully(fd, buffer.get(), page_size)) {
+        PLOG(ERROR) << "Direct read failed " << cow_image_name;
+        return MergeFailureCode::DirectReadConsistencyCheck;
+    }
+
+    // The COW header sits at the start of the device; compare its recorded
+    // merge count against the op count from the first pass.
+    auto header = reinterpret_cast<CowHeader*>(buffer.get());
+    if (header->num_merge_ops != num_ops) {
+        LOG(ERROR) << "COW consistency check failed, expected " << num_ops << " to be merged, "
+                   << "but " << header->num_merge_ops << " were actually recorded.";
+        LOG(ERROR) << "Aborting merge progress for snapshot " << name
+                   << ", will try again next boot";
+        return MergeFailureCode::WrongMergeCountConsistencyCheck;
+    }
+
+    return MergeFailureCode::Ok;
+}
+
MergeFailureCode SnapshotManager::MergeSecondPhaseSnapshots(LockedFile* lock) {
std::vector<std::string> snapshots;
if (!ListSnapshots(lock, &snapshots)) {
@@ -1429,14 +1519,7 @@
continue;
}
- // If no partition was created (the COW exists entirely on /data), the
- // device-mapper layering is different than if we had a partition.
- std::string cow_image_name;
- if (snapshot_status.cow_partition_size() == 0) {
- cow_image_name = GetCowImageDeviceName(snapshot);
- } else {
- cow_image_name = GetCowName(snapshot);
- }
+ std::string cow_image_name = GetMappedCowDeviceName(snapshot, snapshot_status);
std::string cow_image_device;
if (!dm.GetDmDevicePathByName(cow_image_name, &cow_image_device)) {