snapuserd: I/O request on overlapping blocks during snapshot-merge.
This fixes the case when all the following conditions are true:
1: Incremental OTA
2: When there are sequence of overlapping COPY operations within one merge-window
(510 blocks)
3: Device is rebooted when snapshot-merge is in-progress of this
merge-window. When device reboots, the state of merge-window (of 510 blocks) was
merge-in-progress (aka - only partial set of blocks were merged in
this window thereby the state of the base device is in-complete for
this window)
4: During the next boot, if there any I/O request from the filesystem
which maps to the merge-window in (3):
a: The data has to be retrieved from the scratch space of the
COW until the snapshot-merge for that window is completed.
b: Once the snapshot-merge is complete for that window, data
has to be retrieved from base device.
The bug was in step 4(a) wherein I/O request was getting routed to base
device.
This patch addresses the above flow by fixing step 4(a).
A new vts test has been added to explicitly track this issue.
Additionally, there is no need to re-scan the partition if partition is in merge resume path. This should cut down the overhead of the scan.
Bug: 275296365
Test: 1: 100 iterations of ./vts_snapuserd_test --gtest_filter=SnapuserdTest.Snapshot_COPY_Overlap_Merge_Resume_IO_Validate_TEST
2: Incremental OTA on Pixel 6 Pro with multiple iterations of device
reboot when merge is in progress
Change-Id: Ib53be7f07ff192a84ec7f7049b2c6be01dad1041
Signed-off-by: Akilesh Kailash <akailash@google.com>
diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.cpp b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.cpp
index c295851..e886ec3 100644
--- a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.cpp
+++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.cpp
@@ -173,6 +173,10 @@
}
SNAP_LOG(INFO) << "Merge-ops: " << header.num_merge_ops;
+ if (header.num_merge_ops) {
+ resume_merge_ = true;
+ SNAP_LOG(INFO) << "Resume Snapshot-merge";
+ }
if (!MmapMetadata()) {
SNAP_LOG(ERROR) << "mmap failed";
@@ -295,6 +299,11 @@
if (ra_thread_) {
ra_thread_status =
std::async(std::launch::async, &ReadAhead::RunThread, read_ahead_thread_.get());
+ // If this is a merge-resume path, wait until RA thread is fully up as
+ // the data has to be re-constructed from the scratch space.
+ if (resume_merge_ && ShouldReconstructDataFromCow()) {
+ WaitForRaThreadToStart();
+ }
}
// Launch worker threads
@@ -307,7 +316,9 @@
std::async(std::launch::async, &MergeWorker::Run, merge_thread_.get());
// Now that the worker threads are up, scan the partitions.
- if (perform_verification_) {
+ // If the snapshot-merge is being resumed, there is no need to scan as the
+ // current slot is already marked as boot complete.
+ if (perform_verification_ && !resume_merge_) {
update_verify_->VerifyUpdatePartition();
}
diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.h b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.h
index e401c11..f88406d 100644
--- a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.h
+++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_core.h
@@ -147,6 +147,8 @@
void WakeupMonitorMergeThread();
void WaitForMergeComplete();
bool WaitForMergeBegin();
+ void RaThreadStarted();
+ void WaitForRaThreadToStart();
void NotifyRAForMergeReady();
bool WaitForMergeReady();
void MergeFailed();
@@ -221,6 +223,7 @@
// Read-ahead related
bool populate_data_from_cow_ = false;
bool ra_thread_ = false;
+ bool ra_thread_started_ = false;
int total_ra_blocks_merged_ = 0;
MERGE_IO_TRANSITION io_state_ = MERGE_IO_TRANSITION::INVALID;
std::unique_ptr<ReadAhead> read_ahead_thread_;
@@ -242,6 +245,7 @@
bool scratch_space_ = false;
int num_worker_threads_ = kNumWorkerThreads;
bool perform_verification_ = true;
+ bool resume_merge_ = false;
std::unique_ptr<struct io_uring> ring_;
std::unique_ptr<UpdateVerify> update_verify_;
diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_readahead.cpp b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_readahead.cpp
index d2128c5..998d233 100644
--- a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_readahead.cpp
+++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_readahead.cpp
@@ -206,6 +206,7 @@
return false;
}
+ snapuserd_->RaThreadStarted();
SNAP_LOG(INFO) << "ReconstructDataFromCow success";
notify_read_ahead_failed.Cancel();
return true;
@@ -716,9 +717,13 @@
total_ra_blocks_completed_ += total_blocks_merged_;
snapuserd_->SetMergedBlockCountForNextCommit(total_blocks_merged_);
- // Flush the data only if we have a overlapping blocks in the region
+ // Flush the scratch data - Technically, we should flush only for overlapping
+ // blocks; However, since this region is mmap'ed, the dirty pages can still
+ // get flushed to disk at any random point in time. Instead, make sure
+ // the data in scratch is in the correct state before merge thread resumes.
+ //
// Notify the Merge thread to resume merging this window
- if (!snapuserd_->ReadAheadIOCompleted(overlap_)) {
+ if (!snapuserd_->ReadAheadIOCompleted(true)) {
SNAP_LOG(ERROR) << "ReadAheadIOCompleted failed...";
snapuserd_->ReadAheadIOFailed();
return false;
diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_test.cpp b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_test.cpp
index 620ecbd..65f31cf 100644
--- a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_test.cpp
+++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_test.cpp
@@ -235,9 +235,11 @@
bool Merge();
void ValidateMerge();
void ReadSnapshotDeviceAndValidate();
+ void ReadSnapshotAndValidateOverlappingBlocks();
void Shutdown();
void MergeInterrupt();
void MergeInterruptFixed(int duration);
+ void MergeInterruptAndValidate(int duration);
void MergeInterruptRandomly(int max_duration);
bool StartMerge();
void CheckMergeCompletion();
@@ -358,6 +360,76 @@
ASSERT_EQ(memcmp(snapuserd_buffer.get(), (char*)orig_buffer_.get() + (size_ * 4), size_), 0);
}
+void SnapuserdTest::ReadSnapshotAndValidateOverlappingBlocks() {
+ // Open COW device
+ unique_fd fd(open(cow_system_->path, O_RDONLY));
+ ASSERT_GE(fd, 0);
+
+ CowReader reader;
+ ASSERT_TRUE(reader.Parse(fd));
+
+ const auto& header = reader.GetHeader();
+ size_t total_mapped_addr_length = header.prefix.header_size + BUFFER_REGION_DEFAULT_SIZE;
+
+ ASSERT_GE(header.prefix.major_version, 2);
+
+ void* mapped_addr = mmap(NULL, total_mapped_addr_length, PROT_READ, MAP_SHARED, fd.get(), 0);
+ ASSERT_NE(mapped_addr, MAP_FAILED);
+
+ bool populate_data_from_scratch = false;
+ struct BufferState* ra_state =
+ reinterpret_cast<struct BufferState*>((char*)mapped_addr + header.prefix.header_size);
+ if (ra_state->read_ahead_state == kCowReadAheadDone) {
+ populate_data_from_scratch = true;
+ }
+
+ size_t num_merge_ops = header.num_merge_ops;
+ // We have some partial merge operations completed.
+ // To test the merge-resume path, forcefully corrupt the data of the base
+ // device for the offsets where the merge is still pending.
+ if (num_merge_ops && populate_data_from_scratch) {
+ std::string corrupt_buffer(4096, 0);
+ // Corrupt two blocks from the point where the merge has to be resumed by
+ // writing down zeroe's.
+ //
+ // Now, since this is a merge-resume path, the "correct" data should be
+ // in the scratch space of the COW device. When there is an I/O request
+ // from the snapshot device, the data has to be retrieved from the
+ // scratch space. If not and I/O is routed to the base device, we
+ // may end up with corruption.
+ off_t corrupt_offset = (num_merge_ops + 2) * 4096;
+
+ if (corrupt_offset < size_) {
+ ASSERT_EQ(android::base::WriteFullyAtOffset(base_fd_, (void*)corrupt_buffer.c_str(),
+ 4096, corrupt_offset),
+ true);
+ corrupt_offset -= 4096;
+ ASSERT_EQ(android::base::WriteFullyAtOffset(base_fd_, (void*)corrupt_buffer.c_str(),
+ 4096, corrupt_offset),
+ true);
+ fsync(base_fd_.get());
+ }
+ }
+
+ // Time to read the snapshot device.
+ unique_fd snapshot_fd(open(dmuser_dev_->GetPath().c_str(), O_RDONLY | O_DIRECT | O_SYNC));
+ ASSERT_GE(snapshot_fd, 0);
+
+ void* buff_addr;
+ ASSERT_EQ(posix_memalign(&buff_addr, 4096, size_), 0);
+
+ std::unique_ptr<void, decltype(&::free)> snapshot_buffer(buff_addr, ::free);
+
+ // Scan the entire snapshot device and read the data and verify data
+ // integrity. Since the base device was forcefully corrupted, the data from
+ // this scan should be retrieved from scratch space of the COW partition.
+ //
+ // Furthermore, after the merge is complete, base device data is again
+ // verified as the aforementioned corrupted blocks aren't persisted.
+ ASSERT_EQ(ReadFullyAtOffset(snapshot_fd, snapshot_buffer.get(), size_, 0), true);
+ ASSERT_EQ(memcmp(snapshot_buffer.get(), orig_buffer_.get(), size_), 0);
+}
+
void SnapuserdTest::CreateCowDeviceWithCopyOverlap_2() {
auto writer = CreateCowDeviceInternal();
ASSERT_NE(writer, nullptr);
@@ -665,6 +737,20 @@
ASSERT_TRUE(Merge());
}
+void SnapuserdTest::MergeInterruptAndValidate(int duration) {
+ ASSERT_TRUE(StartMerge());
+
+ for (int i = 0; i < 15; i++) {
+ std::this_thread::sleep_for(std::chrono::milliseconds(duration));
+ ASSERT_NO_FATAL_FAILURE(SimulateDaemonRestart());
+ ReadSnapshotAndValidateOverlappingBlocks();
+ ASSERT_TRUE(StartMerge());
+ }
+
+ ASSERT_NO_FATAL_FAILURE(SimulateDaemonRestart());
+ ASSERT_TRUE(Merge());
+}
+
void SnapuserdTest::MergeInterrupt() {
// Interrupt merge at various intervals
ASSERT_TRUE(StartMerge());
@@ -761,6 +847,15 @@
ValidateMerge();
}
+TEST_F(SnapuserdTest, Snapshot_COPY_Overlap_Merge_Resume_IO_Validate_TEST) {
+ if (!harness_->HasUserDevice()) {
+ GTEST_SKIP() << "Skipping snapshot read; not supported";
+ }
+ ASSERT_NO_FATAL_FAILURE(SetupCopyOverlap_2());
+ ASSERT_NO_FATAL_FAILURE(MergeInterruptAndValidate(2));
+ ValidateMerge();
+}
+
TEST_F(SnapuserdTest, Snapshot_Merge_Crash_Fixed_Ordered) {
ASSERT_NO_FATAL_FAILURE(SetupOrderedOps());
ASSERT_NO_FATAL_FAILURE(MergeInterruptFixed(300));
diff --git a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_transitions.cpp b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_transitions.cpp
index f3e0019..8d090bf 100644
--- a/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_transitions.cpp
+++ b/fs_mgr/libsnapshot/snapuserd/user-space-merge/snapuserd_transitions.cpp
@@ -366,6 +366,26 @@
}
}
+void SnapshotHandler::RaThreadStarted() {
+ std::unique_lock<std::mutex> lock(lock_);
+ ra_thread_started_ = true;
+}
+
+void SnapshotHandler::WaitForRaThreadToStart() {
+ auto now = std::chrono::system_clock::now();
+ auto deadline = now + 3s;
+ {
+ std::unique_lock<std::mutex> lock(lock_);
+ while (!ra_thread_started_) {
+ auto status = cv.wait_until(lock, deadline);
+ if (status == std::cv_status::timeout) {
+ SNAP_LOG(ERROR) << "Read-ahead thread did not start";
+ return;
+ }
+ }
+ }
+}
+
std::string SnapshotHandler::GetMergeStatus() {
bool merge_not_initiated = false;
bool merge_monitored = false;
@@ -618,7 +638,6 @@
std::unordered_map<uint64_t, void*>::iterator it = read_ahead_buffer_map_.find(block);
if (it == read_ahead_buffer_map_.end()) {
- SNAP_LOG(ERROR) << "Block: " << block << " not found in RA buffer";
return false;
}
@@ -642,6 +661,13 @@
MERGE_GROUP_STATE state = blk_state->merge_state_;
switch (state) {
case MERGE_GROUP_STATE::GROUP_MERGE_PENDING: {
+ // If this is a merge-resume path, check if the data is
+ // available from scratch space. Data from scratch space takes
+ // higher precedence than from source device for overlapping
+ // blocks.
+ if (resume_merge_ && GetRABuffer(&lock, new_block, buffer)) {
+ return (MERGE_GROUP_STATE::GROUP_MERGE_IN_PROGRESS);
+ }
blk_state->num_ios_in_progress += 1; // ref count
[[fallthrough]];
}