Merge changes from topic "snapuserd-read-ahead"
* changes:
libsnapshot:snapuserd:Add unit test for read-ahead code path.
libsnapshot: Flush data to scratch space only for overlapping regions
libsnapshot:snapuserd: read-ahead COW copy ops
libsnapshot: Retrieve COW version from update engine manifest
libsnapshot:snapuserd: Add 2MB scratch space in COW file
libsnapshot:snapuserd: mmap + msync header after merge
diff --git a/fs_mgr/libsnapshot/Android.bp b/fs_mgr/libsnapshot/Android.bp
index 3cb4123..c97dca0 100644
--- a/fs_mgr/libsnapshot/Android.bp
+++ b/fs_mgr/libsnapshot/Android.bp
@@ -420,7 +420,8 @@
"snapuserd_server.cpp",
"snapuserd.cpp",
"snapuserd_daemon.cpp",
- "snapuserd_worker.cpp",
+ "snapuserd_worker.cpp",
+ "snapuserd_readahead.cpp",
],
cflags: [
diff --git a/fs_mgr/libsnapshot/cow_reader.cpp b/fs_mgr/libsnapshot/cow_reader.cpp
index 7199b38..35a02e6 100644
--- a/fs_mgr/libsnapshot/cow_reader.cpp
+++ b/fs_mgr/libsnapshot/cow_reader.cpp
@@ -94,11 +94,6 @@
<< "Expected: " << kCowMagicNumber;
return false;
}
- if (header_.header_size != sizeof(CowHeader)) {
- LOG(ERROR) << "Header size unknown, read " << header_.header_size << ", expected "
- << sizeof(CowHeader);
- return false;
- }
if (header_.footer_size != sizeof(CowFooter)) {
LOG(ERROR) << "Footer size unknown, read " << header_.footer_size << ", expected "
<< sizeof(CowFooter);
@@ -123,8 +118,7 @@
return false;
}
- if ((header_.major_version != kCowVersionMajor) ||
- (header_.minor_version != kCowVersionMinor)) {
+ if ((header_.major_version > kCowVersionMajor) || (header_.minor_version != kCowVersionMinor)) {
LOG(ERROR) << "Header version mismatch";
LOG(ERROR) << "Major version: " << header_.major_version
<< "Expected: " << kCowVersionMajor;
@@ -137,10 +131,25 @@
}
bool CowReader::ParseOps(std::optional<uint64_t> label) {
- uint64_t pos = lseek(fd_.get(), sizeof(header_), SEEK_SET);
- if (pos != sizeof(header_)) {
- PLOG(ERROR) << "lseek ops failed";
- return false;
+ uint64_t pos;
+
+ // Skip the scratch space
+ if (header_.major_version >= 2 && (header_.buffer_size > 0)) {
+ LOG(DEBUG) << " Scratch space found of size: " << header_.buffer_size;
+ size_t init_offset = header_.header_size + header_.buffer_size;
+ pos = lseek(fd_.get(), init_offset, SEEK_SET);
+ if (pos != init_offset) {
+ PLOG(ERROR) << "lseek ops failed";
+ return false;
+ }
+ } else {
+ pos = lseek(fd_.get(), header_.header_size, SEEK_SET);
+ if (pos != header_.header_size) {
+ PLOG(ERROR) << "lseek ops failed";
+ return false;
+ }
+ // Reading a v1 version of COW which doesn't have buffer_size.
+ header_.buffer_size = 0;
}
auto ops_buffer = std::make_shared<std::vector<CowOperation>>();
@@ -360,13 +369,7 @@
// Replace-op-4, Zero-op-9, Replace-op-5 }
//==============================================================
- for (uint64_t i = 0; i < ops_->size(); i++) {
- auto& current_op = ops_->data()[i];
- if (current_op.type != kCowCopyOp) {
- break;
- }
- num_copy_ops += 1;
- }
+ num_copy_ops = FindNumCopyops();
std::sort(ops_.get()->begin() + num_copy_ops, ops_.get()->end(),
[](CowOperation& op1, CowOperation& op2) -> bool {
@@ -377,6 +380,23 @@
CHECK(ops_->size() >= header_.num_merge_ops);
ops_->erase(ops_.get()->begin(), ops_.get()->begin() + header_.num_merge_ops);
}
+
+ num_copy_ops = FindNumCopyops();
+ set_copy_ops(num_copy_ops);
+}
+
+uint64_t CowReader::FindNumCopyops() {
+ uint64_t num_copy_ops = 0;
+
+ for (uint64_t i = 0; i < ops_->size(); i++) {
+ auto& current_op = ops_->data()[i];
+ if (current_op.type != kCowCopyOp) {
+ break;
+ }
+ num_copy_ops += 1;
+ }
+
+ return num_copy_ops;
}
bool CowReader::GetHeader(CowHeader* header) {
@@ -470,7 +490,7 @@
bool CowReader::GetRawBytes(uint64_t offset, void* buffer, size_t len, size_t* read) {
// Validate the offset, taking care to acknowledge possible overflow of offset+len.
- if (offset < sizeof(header_) || offset >= fd_size_ - sizeof(CowFooter) || len >= fd_size_ ||
+ if (offset < header_.header_size || offset >= fd_size_ - sizeof(CowFooter) || len >= fd_size_ ||
offset + len > fd_size_ - sizeof(CowFooter)) {
LOG(ERROR) << "invalid data offset: " << offset << ", " << len << " bytes";
return false;
diff --git a/fs_mgr/libsnapshot/cow_snapuserd_test.cpp b/fs_mgr/libsnapshot/cow_snapuserd_test.cpp
index 045d9db..313eb64 100644
--- a/fs_mgr/libsnapshot/cow_snapuserd_test.cpp
+++ b/fs_mgr/libsnapshot/cow_snapuserd_test.cpp
@@ -96,6 +96,7 @@
class CowSnapuserdTest final {
public:
bool Setup();
+ bool SetupCopyOverlap();
bool Merge();
void ValidateMerge();
void ReadSnapshotDeviceAndValidate();
@@ -114,6 +115,7 @@
void StartMerge();
void CreateCowDevice();
+ void CreateCowDeviceWithCopyOverlap();
void CreateBaseDevice();
void InitCowDevice();
void SetDeviceControlName();
@@ -191,6 +193,24 @@
return setup_ok_;
}
+bool CowSnapuserdTest::SetupCopyOverlap() {
+ CreateBaseDevice();
+ CreateCowDeviceWithCopyOverlap();
+
+ SetDeviceControlName();
+
+ StartSnapuserdDaemon();
+ InitCowDevice();
+
+ CreateDmUserDevice();
+ InitDaemon();
+
+ CreateSnapshotDevice();
+ setup_ok_ = true;
+
+ return setup_ok_;
+}
+
void CowSnapuserdTest::StartSnapuserdDaemon() {
pid_t pid = fork();
ASSERT_GE(pid, 0);
@@ -255,6 +275,49 @@
ASSERT_EQ(memcmp(snapuserd_buffer.get(), (char*)orig_buffer_.get() + (size_ * 3), size_), 0);
}
+void CowSnapuserdTest::CreateCowDeviceWithCopyOverlap() {
+ std::string path = android::base::GetExecutableDirectory();
+ cow_system_ = std::make_unique<TemporaryFile>(path);
+
+ CowOptions options;
+ options.compression = "gz";
+ CowWriter writer(options);
+
+ ASSERT_TRUE(writer.Initialize(cow_system_->fd));
+
+ size_t num_blocks = size_ / options.block_size;
+ size_t x = num_blocks;
+ size_t blk_src_copy = num_blocks - 1;
+
+ // Create overlapping copy operations
+ while (1) {
+ ASSERT_TRUE(writer.AddCopy(blk_src_copy + 1, blk_src_copy));
+ x -= 1;
+ if (x == 0) {
+ ASSERT_EQ(blk_src_copy, 0);
+ break;
+ }
+ blk_src_copy -= 1;
+ }
+
+ // Flush operations
+ ASSERT_TRUE(writer.Finalize());
+
+ // Construct the buffer required for validation
+ orig_buffer_ = std::make_unique<uint8_t[]>(total_base_size_);
+
+ // Read the entire base device
+ ASSERT_EQ(android::base::ReadFullyAtOffset(base_fd_, orig_buffer_.get(), total_base_size_, 0),
+ true);
+
+ // Merged operations
+ ASSERT_EQ(android::base::ReadFullyAtOffset(base_fd_, orig_buffer_.get(), options.block_size, 0),
+ true);
+ ASSERT_EQ(android::base::ReadFullyAtOffset(
+ base_fd_, (char*)orig_buffer_.get() + options.block_size, size_, 0),
+ true);
+}
+
void CowSnapuserdTest::CreateCowDevice() {
unique_fd rnd_fd;
loff_t offset = 0;
@@ -707,17 +770,17 @@
de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
ASSERT_EQ(de->old_chunk, 21);
- ASSERT_EQ(de->new_chunk, 537);
+ ASSERT_EQ(de->new_chunk, 536);
offset += sizeof(struct disk_exception);
de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
ASSERT_EQ(de->old_chunk, 22);
- ASSERT_EQ(de->new_chunk, 538);
+ ASSERT_EQ(de->new_chunk, 537);
offset += sizeof(struct disk_exception);
de = reinterpret_cast<struct disk_exception*>((char*)buffer + offset);
ASSERT_EQ(de->old_chunk, 23);
- ASSERT_EQ(de->new_chunk, 539);
+ ASSERT_EQ(de->new_chunk, 538);
offset += sizeof(struct disk_exception);
// End of metadata
@@ -757,6 +820,23 @@
harness.ValidateMerge();
harness.Shutdown();
}
+
+TEST(Snapuserd_Test, Snapshot_COPY_Overlap_TEST) {
+ CowSnapuserdTest harness;
+ ASSERT_TRUE(harness.SetupCopyOverlap());
+ ASSERT_TRUE(harness.Merge());
+ harness.ValidateMerge();
+ harness.Shutdown();
+}
+
+TEST(Snapuserd_Test, Snapshot_COPY_Overlap_Merge_Resume_TEST) {
+ CowSnapuserdTest harness;
+ ASSERT_TRUE(harness.SetupCopyOverlap());
+ harness.MergeInterrupt();
+ harness.ValidateMerge();
+ harness.Shutdown();
+}
+
} // namespace snapshot
} // namespace android
diff --git a/fs_mgr/libsnapshot/cow_writer.cpp b/fs_mgr/libsnapshot/cow_writer.cpp
index 645ae9d..51c00a9 100644
--- a/fs_mgr/libsnapshot/cow_writer.cpp
+++ b/fs_mgr/libsnapshot/cow_writer.cpp
@@ -94,6 +94,7 @@
header_.block_size = options_.block_size;
header_.num_merge_ops = 0;
header_.cluster_ops = options_.cluster_ops;
+ header_.buffer_size = 0;
footer_ = {};
footer_.op.data_length = 64;
footer_.op.type = kCowFooterOp;
@@ -139,12 +140,6 @@
return true;
}
-void CowWriter::InitializeMerge(borrowed_fd fd, CowHeader* header) {
- fd_ = fd;
- memcpy(&header_, header, sizeof(CowHeader));
- merge_in_progress_ = true;
-}
-
bool CowWriter::Initialize(unique_fd&& fd) {
owned_fd_ = std::move(fd);
return Initialize(borrowed_fd{owned_fd_});
@@ -172,7 +167,7 @@
}
void CowWriter::InitPos() {
- next_op_pos_ = sizeof(header_);
+ next_op_pos_ = sizeof(header_) + header_.buffer_size;
cluster_size_ = header_.cluster_ops * sizeof(CowOperation);
if (header_.cluster_ops) {
next_data_pos_ = next_op_pos_ + cluster_size_;
@@ -196,6 +191,10 @@
return false;
}
+ if (options_.scratch_space) {
+ header_.buffer_size = BUFFER_REGION_DEFAULT_SIZE;
+ }
+
// Headers are not complete, but this ensures the file is at the right
// position.
if (!android::base::WriteFully(fd_, &header_, sizeof(header_))) {
@@ -203,7 +202,27 @@
return false;
}
+ if (options_.scratch_space) {
+ // Initialize the scratch space
+ std::string data(header_.buffer_size, 0);
+ if (!android::base::WriteFully(fd_, data.data(), header_.buffer_size)) {
+ PLOG(ERROR) << "writing scratch space failed";
+ return false;
+ }
+ }
+
+ if (!Sync()) {
+ LOG(ERROR) << "Header sync failed";
+ return false;
+ }
+
+ if (lseek(fd_.get(), sizeof(header_) + header_.buffer_size, SEEK_SET) < 0) {
+ PLOG(ERROR) << "lseek failed";
+ return false;
+ }
+
InitPos();
+
return true;
}
@@ -517,24 +536,6 @@
return true;
}
-bool CowWriter::CommitMerge(int merged_ops) {
- CHECK(merge_in_progress_);
- header_.num_merge_ops += merged_ops;
-
- if (lseek(fd_.get(), 0, SEEK_SET) < 0) {
- PLOG(ERROR) << "lseek failed";
- return false;
- }
-
- if (!android::base::WriteFully(fd_, reinterpret_cast<const uint8_t*>(&header_),
- sizeof(header_))) {
- PLOG(ERROR) << "WriteFully failed";
- return false;
- }
-
- return Sync();
-}
-
bool CowWriter::Truncate(off_t length) {
if (is_dev_null_ || is_block_device_) {
return true;
diff --git a/fs_mgr/libsnapshot/include/libsnapshot/cow_format.h b/fs_mgr/libsnapshot/include/libsnapshot/cow_format.h
index e93254e..c05b7ef 100644
--- a/fs_mgr/libsnapshot/include/libsnapshot/cow_format.h
+++ b/fs_mgr/libsnapshot/include/libsnapshot/cow_format.h
@@ -21,17 +21,22 @@
namespace snapshot {
static constexpr uint64_t kCowMagicNumber = 0x436f77634f572121ULL;
-static constexpr uint32_t kCowVersionMajor = 1;
+static constexpr uint32_t kCowVersionMajor = 2;
static constexpr uint32_t kCowVersionMinor = 0;
static constexpr uint32_t kCowVersionManifest = 1;
+static constexpr uint32_t BLOCK_SZ = 4096;
+static constexpr uint32_t BLOCK_SHIFT = (__builtin_ffs(BLOCK_SZ) - 1);
+
// This header appears as the first sequence of bytes in the COW. All fields
// in the layout are little-endian encoded. The on-disk layout is:
//
// +-----------------------+
// | Header (fixed) |
// +-----------------------+
+// | Scratch space |
+// +-----------------------+
// | Operation (variable) |
// | Data (variable) |
// +-----------------------+
@@ -70,6 +75,9 @@
// Tracks merge operations completed
uint64_t num_merge_ops;
+
+ // Scratch space used during merge
+ uint32_t buffer_size;
} __attribute__((packed));
// This structure is the same size of a normal Operation, but is repurposed for the footer.
@@ -146,11 +154,31 @@
static constexpr uint8_t kCowCompressGz = 1;
static constexpr uint8_t kCowCompressBrotli = 2;
+static constexpr uint8_t kCowReadAheadNotStarted = 0;
+static constexpr uint8_t kCowReadAheadInProgress = 1;
+static constexpr uint8_t kCowReadAheadDone = 2;
+
struct CowFooter {
CowFooterOperation op;
CowFooterData data;
} __attribute__((packed));
+struct ScratchMetadata {
+ // Block of data in the image that operation modifies
+ // and read-ahead thread stores the modified data
+ // in the scratch space
+ uint64_t new_block;
+ // Offset within the file to read the data
+ uint64_t file_offset;
+} __attribute__((packed));
+
+struct BufferState {
+ uint8_t read_ahead_state;
+} __attribute__((packed));
+
+// 2MB Scratch space used for read-ahead
+static constexpr uint64_t BUFFER_REGION_DEFAULT_SIZE = (1ULL << 21);
+
std::ostream& operator<<(std::ostream& os, CowOperation const& arg);
int64_t GetNextOpOffset(const CowOperation& op, uint32_t cluster_size);
diff --git a/fs_mgr/libsnapshot/include/libsnapshot/cow_reader.h b/fs_mgr/libsnapshot/include/libsnapshot/cow_reader.h
index 552fd96..9ebcfd9 100644
--- a/fs_mgr/libsnapshot/include/libsnapshot/cow_reader.h
+++ b/fs_mgr/libsnapshot/include/libsnapshot/cow_reader.h
@@ -141,18 +141,21 @@
bool GetRawBytes(uint64_t offset, void* buffer, size_t len, size_t* read);
- void UpdateMergeProgress(uint64_t merge_ops) { header_.num_merge_ops += merge_ops; }
-
void InitializeMerge();
void set_total_data_ops(uint64_t size) { total_data_ops_ = size; }
uint64_t total_data_ops() { return total_data_ops_; }
+ void set_copy_ops(uint64_t size) { copy_ops_ = size; }
+
+ uint64_t total_copy_ops() { return copy_ops_; }
+
void CloseCowFd() { owned_fd_ = {}; }
private:
bool ParseOps(std::optional<uint64_t> label);
+ uint64_t FindNumCopyops();
android::base::unique_fd owned_fd_;
android::base::borrowed_fd fd_;
@@ -162,6 +165,7 @@
std::optional<uint64_t> last_label_;
std::shared_ptr<std::vector<CowOperation>> ops_;
uint64_t total_data_ops_;
+ uint64_t copy_ops_;
};
} // namespace snapshot
diff --git a/fs_mgr/libsnapshot/include/libsnapshot/cow_writer.h b/fs_mgr/libsnapshot/include/libsnapshot/cow_writer.h
index a9efad8..1192e7d 100644
--- a/fs_mgr/libsnapshot/include/libsnapshot/cow_writer.h
+++ b/fs_mgr/libsnapshot/include/libsnapshot/cow_writer.h
@@ -36,6 +36,8 @@
// Number of CowOperations in a cluster. 0 for no clustering. Cannot be 1.
uint32_t cluster_ops = 200;
+
+ bool scratch_space = false;
};
// Interface for writing to a snapuserd COW. All operations are ordered; merges
@@ -100,13 +102,12 @@
bool InitializeAppend(android::base::unique_fd&&, uint64_t label);
bool InitializeAppend(android::base::borrowed_fd fd, uint64_t label);
- void InitializeMerge(android::base::borrowed_fd fd, CowHeader* header);
- bool CommitMerge(int merged_ops);
-
bool Finalize() override;
uint64_t GetCowSize() override;
+ uint32_t GetCowVersion() { return header_.major_version; }
+
protected:
virtual bool EmitCopy(uint64_t new_block, uint64_t old_block) override;
virtual bool EmitRawBlocks(uint64_t new_block_start, const void* data, size_t size) override;
diff --git a/fs_mgr/libsnapshot/include/libsnapshot/snapuserd_kernel.h b/fs_mgr/libsnapshot/include/libsnapshot/snapuserd_kernel.h
index 2b6c8ef..6bb7a39 100644
--- a/fs_mgr/libsnapshot/include/libsnapshot/snapuserd_kernel.h
+++ b/fs_mgr/libsnapshot/include/libsnapshot/snapuserd_kernel.h
@@ -47,9 +47,6 @@
static constexpr uint32_t CHUNK_SIZE = 8;
static constexpr uint32_t CHUNK_SHIFT = (__builtin_ffs(CHUNK_SIZE) - 1);
-static constexpr uint32_t BLOCK_SZ = 4096;
-static constexpr uint32_t BLOCK_SHIFT = (__builtin_ffs(BLOCK_SZ) - 1);
-
#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
// This structure represents the kernel COW header.
diff --git a/fs_mgr/libsnapshot/snapshot.cpp b/fs_mgr/libsnapshot/snapshot.cpp
index 567eb26..8f3926a 100644
--- a/fs_mgr/libsnapshot/snapshot.cpp
+++ b/fs_mgr/libsnapshot/snapshot.cpp
@@ -2692,8 +2692,18 @@
AutoDeviceList created_devices;
const auto& dap_metadata = manifest.dynamic_partition_metadata();
- bool use_compression =
- IsCompressionEnabled() && dap_metadata.vabc_enabled() && !device_->IsRecovery();
+ CowOptions options;
+ CowWriter writer(options);
+ bool cow_format_support = true;
+ if (dap_metadata.cow_version() < writer.GetCowVersion()) {
+ cow_format_support = false;
+ }
+
+ LOG(INFO) << " dap_metadata.cow_version(): " << dap_metadata.cow_version()
+ << " writer.GetCowVersion(): " << writer.GetCowVersion();
+
+ bool use_compression = IsCompressionEnabled() && dap_metadata.vabc_enabled() &&
+ !device_->IsRecovery() && cow_format_support;
std::string compression_algorithm;
if (use_compression) {
@@ -2960,7 +2970,13 @@
return Return::Error();
}
- CowWriter writer(CowOptions{.compression = it->second.compression_algorithm()});
+ CowOptions options;
+ if (device()->IsTestDevice()) {
+ options.scratch_space = false;
+ }
+ options.compression = it->second.compression_algorithm();
+
+ CowWriter writer(options);
if (!writer.Initialize(fd) || !writer.Finalize()) {
LOG(ERROR) << "Could not initialize COW device for " << target_partition->name();
return Return::Error();
@@ -3069,6 +3085,10 @@
CowOptions cow_options;
cow_options.compression = status.compression_algorithm();
cow_options.max_blocks = {status.device_size() / cow_options.block_size};
+ // Disable scratch space for vts tests
+ if (device()->IsTestDevice()) {
+ cow_options.scratch_space = false;
+ }
// Currently we don't support partial snapshots, since partition_cow_creator
// never creates this scenario.
diff --git a/fs_mgr/libsnapshot/snapshot_reader_test.cpp b/fs_mgr/libsnapshot/snapshot_reader_test.cpp
index 4202d22..9373059 100644
--- a/fs_mgr/libsnapshot/snapshot_reader_test.cpp
+++ b/fs_mgr/libsnapshot/snapshot_reader_test.cpp
@@ -150,6 +150,7 @@
CowOptions options;
options.compression = "gz";
options.max_blocks = {kBlockCount};
+ options.scratch_space = false;
unique_fd cow_fd(dup(cow_->fd));
ASSERT_GE(cow_fd, 0);
diff --git a/fs_mgr/libsnapshot/snapshot_test.cpp b/fs_mgr/libsnapshot/snapshot_test.cpp
index 3554dce..45db7a4 100644
--- a/fs_mgr/libsnapshot/snapshot_test.cpp
+++ b/fs_mgr/libsnapshot/snapshot_test.cpp
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include <libsnapshot/cow_format.h>
#include <libsnapshot/snapshot.h>
#include <fcntl.h>
@@ -327,6 +328,7 @@
auto dynamic_partition_metadata = manifest.mutable_dynamic_partition_metadata();
dynamic_partition_metadata->set_vabc_enabled(IsCompressionEnabled());
+ dynamic_partition_metadata->set_cow_version(android::snapshot::kCowVersionMajor);
auto group = dynamic_partition_metadata->add_groups();
group->set_name("group");
@@ -853,6 +855,7 @@
auto dynamic_partition_metadata = manifest_.mutable_dynamic_partition_metadata();
dynamic_partition_metadata->set_vabc_enabled(IsCompressionEnabled());
+ dynamic_partition_metadata->set_cow_version(android::snapshot::kCowVersionMajor);
// Create a fake update package metadata.
// Not using full name "system", "vendor", "product" because these names collide with the
diff --git a/fs_mgr/libsnapshot/snapuserd.cpp b/fs_mgr/libsnapshot/snapuserd.cpp
index 5ef9e29..2ccc750 100644
--- a/fs_mgr/libsnapshot/snapuserd.cpp
+++ b/fs_mgr/libsnapshot/snapuserd.cpp
@@ -47,28 +47,202 @@
worker_threads_.push_back(std::move(wt));
}
+
+ read_ahead_thread_ = std::make_unique<ReadAheadThread>(cow_device_, backing_store_device_,
+ misc_name_, GetSharedPtr());
return true;
}
bool Snapuserd::CommitMerge(int num_merge_ops) {
- {
- std::lock_guard<std::mutex> lock(lock_);
- CowHeader header;
+ struct CowHeader* ch = reinterpret_cast<struct CowHeader*>(mapped_addr_);
+ ch->num_merge_ops += num_merge_ops;
- reader_->GetHeader(&header);
- header.num_merge_ops += num_merge_ops;
- reader_->UpdateMergeProgress(num_merge_ops);
- if (!writer_->CommitMerge(num_merge_ops)) {
- SNAP_LOG(ERROR) << "CommitMerge failed... merged_ops_cur_iter: " << num_merge_ops
- << " Total-merged-ops: " << header.num_merge_ops;
- return false;
- }
- merge_initiated_ = true;
+ if (read_ahead_feature_ && read_ahead_ops_.size() > 0) {
+ struct BufferState* ra_state = GetBufferState();
+ ra_state->read_ahead_state = kCowReadAheadInProgress;
}
+ int ret = msync(mapped_addr_, BLOCK_SZ, MS_SYNC);
+ if (ret < 0) {
+ PLOG(ERROR) << "msync header failed: " << ret;
+ return false;
+ }
+
+ merge_initiated_ = true;
+
return true;
}
+void Snapuserd::PrepareReadAhead() {
+ if (!read_ahead_feature_) {
+ return;
+ }
+
+ struct BufferState* ra_state = GetBufferState();
+ // Check if the data has to be re-constructed from COW device
+ if (ra_state->read_ahead_state == kCowReadAheadDone) {
+ populate_data_from_cow_ = true;
+ } else {
+ populate_data_from_cow_ = false;
+ }
+
+ StartReadAhead();
+}
+
+bool Snapuserd::GetRABuffer(std::unique_lock<std::mutex>* lock, uint64_t block, void* buffer) {
+ CHECK(lock->owns_lock());
+ std::unordered_map<uint64_t, void*>::iterator it = read_ahead_buffer_map_.find(block);
+
+ // This will be true only for IO's generated as part of reading a root
+ // filesystem. IO's related to merge should always be in read-ahead cache.
+ if (it == read_ahead_buffer_map_.end()) {
+ return false;
+ }
+
+ // Theoretically, we can send the data back from the read-ahead buffer
+ // all the way to the kernel without memcpy. However, if the IO is
+ // un-aligned, the wrapper function will need to touch the read-ahead
+ // buffers and transitions will be bit more complicated.
+ memcpy(buffer, it->second, BLOCK_SZ);
+ return true;
+}
+
+// ========== State transition functions for read-ahead operations ===========
+
+bool Snapuserd::GetReadAheadPopulatedBuffer(uint64_t block, void* buffer) {
+ if (!read_ahead_feature_) {
+ return false;
+ }
+
+ {
+ std::unique_lock<std::mutex> lock(lock_);
+ if (io_state_ == READ_AHEAD_IO_TRANSITION::READ_AHEAD_FAILURE) {
+ return false;
+ }
+
+ if (io_state_ == READ_AHEAD_IO_TRANSITION::IO_IN_PROGRESS) {
+ return GetRABuffer(&lock, block, buffer);
+ }
+ }
+
+ {
+ // Read-ahead thread IO is in-progress. Wait for it to complete
+ std::unique_lock<std::mutex> lock(lock_);
+ while (!(io_state_ == READ_AHEAD_IO_TRANSITION::READ_AHEAD_FAILURE ||
+ io_state_ == READ_AHEAD_IO_TRANSITION::IO_IN_PROGRESS)) {
+ cv.wait(lock);
+ }
+
+ return GetRABuffer(&lock, block, buffer);
+ }
+}
+
+// This is invoked by read-ahead thread waiting for merge IO's
+// to complete
+bool Snapuserd::WaitForMergeToComplete() {
+ {
+ std::unique_lock<std::mutex> lock(lock_);
+ while (!(io_state_ == READ_AHEAD_IO_TRANSITION::READ_AHEAD_BEGIN ||
+ io_state_ == READ_AHEAD_IO_TRANSITION::IO_TERMINATED)) {
+ cv.wait(lock);
+ }
+
+ if (io_state_ == READ_AHEAD_IO_TRANSITION::IO_TERMINATED) {
+ return false;
+ }
+
+ io_state_ = READ_AHEAD_IO_TRANSITION::READ_AHEAD_IN_PROGRESS;
+ return true;
+ }
+}
+
+// This is invoked during the launch of worker threads. We wait
+// for read-ahead thread to be fully up before worker threads
+// are launched; else we will have a race between worker threads
+// and read-ahead thread specifically during re-construction.
+bool Snapuserd::WaitForReadAheadToStart() {
+ {
+ std::unique_lock<std::mutex> lock(lock_);
+ while (!(io_state_ == READ_AHEAD_IO_TRANSITION::IO_IN_PROGRESS ||
+ io_state_ == READ_AHEAD_IO_TRANSITION::READ_AHEAD_FAILURE)) {
+ cv.wait(lock);
+ }
+
+ if (io_state_ == READ_AHEAD_IO_TRANSITION::READ_AHEAD_FAILURE) {
+ return false;
+ }
+
+ return true;
+ }
+}
+
+// Invoked by worker threads when a sequence of merge operation
+// is complete notifying read-ahead thread to make forward
+// progress.
+void Snapuserd::StartReadAhead() {
+ {
+ std::lock_guard<std::mutex> lock(lock_);
+ io_state_ = READ_AHEAD_IO_TRANSITION::READ_AHEAD_BEGIN;
+ }
+
+ cv.notify_one();
+}
+
+void Snapuserd::MergeCompleted() {
+ {
+ std::lock_guard<std::mutex> lock(lock_);
+ io_state_ = READ_AHEAD_IO_TRANSITION::IO_TERMINATED;
+ }
+
+ cv.notify_one();
+}
+
+bool Snapuserd::ReadAheadIOCompleted(bool sync) {
+ if (sync) {
+ // Flush the entire buffer region
+ int ret = msync(mapped_addr_, total_mapped_addr_length_, MS_SYNC);
+ if (ret < 0) {
+ PLOG(ERROR) << "msync failed after ReadAheadIOCompleted: " << ret;
+ return false;
+ }
+
+ // Metadata and data are synced. Now, update the state.
+ // We need to update the state after flushing data; if there is a crash
+ // when read-ahead IO is in progress, the state of data in the COW file
+ // is unknown. kCowReadAheadDone acts as a checkpoint wherein the data
+ // in the scratch space is good and during next reboot, read-ahead thread
+ // can safely re-construct the data.
+ struct BufferState* ra_state = GetBufferState();
+ ra_state->read_ahead_state = kCowReadAheadDone;
+
+ ret = msync(mapped_addr_, BLOCK_SZ, MS_SYNC);
+ if (ret < 0) {
+ PLOG(ERROR) << "msync failed to flush Readahead completion state...";
+ return false;
+ }
+ }
+
+ // Notify the worker threads
+ {
+ std::lock_guard<std::mutex> lock(lock_);
+ io_state_ = READ_AHEAD_IO_TRANSITION::IO_IN_PROGRESS;
+ }
+
+ cv.notify_all();
+ return true;
+}
+
+void Snapuserd::ReadAheadIOFailed() {
+ {
+ std::lock_guard<std::mutex> lock(lock_);
+ io_state_ = READ_AHEAD_IO_TRANSITION::READ_AHEAD_FAILURE;
+ }
+
+ cv.notify_all();
+}
+
+//========== End of state transition functions ====================
+
bool Snapuserd::IsChunkIdMetadata(chunk_t chunk) {
uint32_t stride = exceptions_per_area_ + 1;
lldiv_t divresult = lldiv(chunk, stride);
@@ -93,9 +267,9 @@
return;
}
- CowHeader header;
- reader_->GetHeader(&header);
- SNAP_LOG(INFO) << "Merge-status: Total-Merged-ops: " << header.num_merge_ops
+ struct CowHeader* ch = reinterpret_cast<struct CowHeader*>(mapped_addr_);
+
+ SNAP_LOG(INFO) << "Merge-status: Total-Merged-ops: " << ch->num_merge_ops
<< " Total-data-ops: " << reader_->total_data_ops();
}
@@ -175,8 +349,10 @@
reader_->InitializeMerge();
SNAP_LOG(DEBUG) << "Merge-ops: " << header.num_merge_ops;
- writer_ = std::make_unique<CowWriter>(options);
- writer_->InitializeMerge(cow_fd_.get(), &header);
+ if (!MmapMetadata()) {
+ SNAP_LOG(ERROR) << "mmap failed";
+ return false;
+ }
// Initialize the iterator for reading metadata
cowop_riter_ = reader_->GetRevOpIter();
@@ -258,13 +434,15 @@
data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
}
+ int num_ra_ops_per_iter = ((GetBufferDataSize()) / BLOCK_SZ);
std::optional<chunk_t> prev_id = {};
std::map<uint64_t, const CowOperation*> map;
- std::set<uint64_t> dest_blocks;
size_t pending_copy_ops = exceptions_per_area_ - num_ops;
- SNAP_LOG(INFO) << " Processing copy-ops at Area: " << vec_.size()
- << " Number of replace/zero ops completed in this area: " << num_ops
- << " Pending copy ops for this area: " << pending_copy_ops;
+ uint64_t total_copy_ops = reader_->total_copy_ops();
+
+ SNAP_LOG(DEBUG) << " Processing copy-ops at Area: " << vec_.size()
+ << " Number of replace/zero ops completed in this area: " << num_ops
+ << " Pending copy ops for this area: " << pending_copy_ops;
while (!cowop_riter_->Done()) {
do {
const CowOperation* cow_op = &cowop_riter_->Get();
@@ -300,41 +478,20 @@
// Op-6: 15 -> 18
//
// Note that the blocks numbers are contiguous. Hence, all 6 copy
- // operations can potentially be batch merged. However, that will be
+ // operations can be batch merged. However, that will be
// problematic if we have a crash as block 20, 19, 18 would have
// been overwritten and hence subsequent recovery may end up with
// a silent data corruption when op-1, op-2 and op-3 are
// re-executed.
//
- // We will split these 6 operations into two batches viz:
- //
- // Batch-1:
- // ===================
- // Op-1: 20 -> 23
- // Op-2: 19 -> 22
- // Op-3: 18 -> 21
- // ===================
- //
- // Batch-2:
- // ==================
- // Op-4: 17 -> 20
- // Op-5: 16 -> 19
- // Op-6: 15 -> 18
- // ==================
- //
- // Now, merge sequence will look like:
- //
- // 1: Merge Batch-1 { op-1, op-2, op-3 }
- // 2: Update Metadata in COW File that op-1, op-2, op-3 merge is
- // done.
- // 3: Merge Batch-2
- // 4: Update Metadata in COW File that op-4, op-5, op-6 merge is
- // done.
- //
- // Note, that the order of block operations are still the same.
- // However, we have two batch merge operations. Any crash between
- // either of this sequence should be safe as each of these
- // batches are self-contained.
+ // To address the above problem, read-ahead thread will
+ // read all the 6 source blocks, cache them in the scratch
+ // space of the COW file. During merge, read-ahead
+ // thread will serve the blocks from the read-ahead cache.
+ // If there is a crash during merge, then on subsequent reboot,
+ // read-ahead thread will recover the data from the
+ // scratch space and re-construct it, thereby ensuring
+ // no loss of data.
//
//===========================================================
//
@@ -398,14 +555,10 @@
if (diff != 1) {
break;
}
- if (dest_blocks.count(cow_op->new_block) || map.count(cow_op->source) > 0) {
- break;
- }
}
metadata_found = true;
pending_copy_ops -= 1;
map[cow_op->new_block] = cow_op;
- dest_blocks.insert(cow_op->source);
prev_id = cow_op->new_block;
cowop_riter_->Next();
} while (!cowop_riter_->Done() && pending_copy_ops);
@@ -426,6 +579,9 @@
offset += sizeof(struct disk_exception);
num_ops += 1;
copy_ops++;
+ if (read_ahead_feature_) {
+ read_ahead_ops_.push_back(it->second);
+ }
SNAP_LOG(DEBUG) << num_ops << ":"
<< " Copy-op: "
@@ -453,9 +609,17 @@
}
data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
+ total_copy_ops -= 1;
+ /*
+ * Split the number of ops based on the size of read-ahead buffer
+ * region. We need to ensure that kernel doesn't issue IO on blocks
+ * which are not read by the read-ahead thread.
+ */
+ if (read_ahead_feature_ && (total_copy_ops % num_ra_ops_per_iter == 0)) {
+ data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
+ }
}
map.clear();
- dest_blocks.clear();
prev_id.reset();
}
@@ -470,6 +634,7 @@
chunk_vec_.shrink_to_fit();
vec_.shrink_to_fit();
+ read_ahead_ops_.shrink_to_fit();
// Sort the vector based on sectors as we need this during un-aligned access
std::sort(chunk_vec_.begin(), chunk_vec_.end(), compare);
@@ -484,9 +649,41 @@
// Total number of sectors required for creating dm-user device
num_sectors_ = ChunkToSector(data_chunk_id);
merge_initiated_ = false;
+ PrepareReadAhead();
+
return true;
}
+bool Snapuserd::MmapMetadata() {
+ CowHeader header;
+ reader_->GetHeader(&header);
+
+ if (header.major_version >= 2 && header.buffer_size > 0) {
+ total_mapped_addr_length_ = header.header_size + BUFFER_REGION_DEFAULT_SIZE;
+ read_ahead_feature_ = true;
+ } else {
+ // mmap the first 4k page - older COW format
+ total_mapped_addr_length_ = BLOCK_SZ;
+ read_ahead_feature_ = false;
+ }
+
+ mapped_addr_ = mmap(NULL, total_mapped_addr_length_, PROT_READ | PROT_WRITE, MAP_SHARED,
+ cow_fd_.get(), 0);
+ if (mapped_addr_ == MAP_FAILED) {
+ SNAP_LOG(ERROR) << "mmap metadata failed";
+ return false;
+ }
+
+ return true;
+}
+
+void Snapuserd::UnmapBufferRegion() {
+ int ret = munmap(mapped_addr_, total_mapped_addr_length_);
+ if (ret < 0) {
+ SNAP_PLOG(ERROR) << "munmap failed";
+ }
+}
+
void MyLogger(android::base::LogId, android::base::LogSeverity severity, const char*, const char*,
unsigned int, const char* message) {
if (severity == android::base::ERROR) {
@@ -507,11 +704,28 @@
}
/*
- * Entry point to launch worker threads
+ * Entry point to launch threads
*/
bool Snapuserd::Start() {
std::vector<std::future<bool>> threads;
+ std::future<bool> ra_thread;
+ bool rathread = (read_ahead_feature_ && (read_ahead_ops_.size() > 0));
+ // Start the read-ahead thread and wait
+ // for it as the data has to be re-constructed
+ // from COW device.
+ if (rathread) {
+ ra_thread = std::async(std::launch::async, &ReadAheadThread::RunThread,
+ read_ahead_thread_.get());
+ if (!WaitForReadAheadToStart()) {
+ SNAP_LOG(ERROR) << "Failed to start Read-ahead thread...";
+ return false;
+ }
+
+ SNAP_LOG(INFO) << "Read-ahead thread started...";
+ }
+
+ // Launch worker threads
for (int i = 0; i < worker_threads_.size(); i++) {
threads.emplace_back(
std::async(std::launch::async, &WorkerThread::RunThread, worker_threads_[i].get()));
@@ -522,8 +736,69 @@
ret = t.get() && ret;
}
+ if (rathread) {
+ // Notify the read-ahead thread that all worker threads
+ // are done. We need this explicit notification when
+ // there is an IO failure or there was a switch
+ // of dm-user table; thus, forcing the read-ahead
+ // thread to wake up.
+ MergeCompleted();
+ ret = ret && ra_thread.get();
+ }
+
return ret;
}
+uint64_t Snapuserd::GetBufferMetadataOffset() {
+ CowHeader header;
+ reader_->GetHeader(&header);
+
+ size_t size = header.header_size + sizeof(BufferState);
+ return size;
+}
+
+/*
+ * Metadata for read-ahead is 16 bytes. For a 2 MB region, we will
+ * end up with 8k (2 PAGE) worth of metadata. Thus, a 2MB buffer
+ * region is split into:
+ *
+ * 1: 8k metadata and 2: (2MB - 8k) of read-ahead data
+ *
+ */
+size_t Snapuserd::GetBufferMetadataSize() {
+ CowHeader header;
+ reader_->GetHeader(&header);
+
+ size_t metadata_bytes = (header.buffer_size * sizeof(struct ScratchMetadata)) / BLOCK_SZ;
+ return metadata_bytes;
+}
+
+size_t Snapuserd::GetBufferDataOffset() {
+ CowHeader header;
+ reader_->GetHeader(&header);
+
+ return (header.header_size + GetBufferMetadataSize());
+}
+
+/*
+ * (2MB - 8K = 2088960 bytes) will be the buffer region to hold the data.
+ */
+size_t Snapuserd::GetBufferDataSize() {
+ CowHeader header;
+ reader_->GetHeader(&header);
+
+ size_t size = header.buffer_size - GetBufferMetadataSize();
+ return size;
+}
+
+struct BufferState* Snapuserd::GetBufferState() {
+ CowHeader header;
+ reader_->GetHeader(&header);
+
+ struct BufferState* ra_state =
+ reinterpret_cast<struct BufferState*>((char*)mapped_addr_ + header.header_size);
+ return ra_state;
+}
+
} // namespace snapshot
} // namespace android
diff --git a/fs_mgr/libsnapshot/snapuserd.h b/fs_mgr/libsnapshot/snapuserd.h
index 87c5528..0a5ab50 100644
--- a/fs_mgr/libsnapshot/snapuserd.h
+++ b/fs_mgr/libsnapshot/snapuserd.h
@@ -17,8 +17,10 @@
#include <linux/types.h>
#include <stdint.h>
#include <stdlib.h>
+#include <sys/mman.h>
#include <bitset>
+#include <condition_variable>
#include <csignal>
#include <cstring>
#include <future>
@@ -29,6 +31,7 @@
#include <string>
#include <thread>
#include <unordered_map>
+#include <unordered_set>
#include <vector>
#include <android-base/file.h>
@@ -56,6 +59,35 @@
*/
static constexpr int NUM_THREADS_PER_PARTITION = 4;
+/*
+ * State transitions between worker threads and read-ahead
+ * threads.
+ *
+ * READ_AHEAD_BEGIN: Worker threads initiates the read-ahead
+ * thread to begin reading the copy operations
+ * for each bounded region.
+ *
+ * READ_AHEAD_IN_PROGRESS: When read ahead thread is in-flight
+ * and reading the copy operations.
+ *
+ * IO_IN_PROGRESS: Merge operation is in-progress by worker threads.
+ *
+ * IO_TERMINATED: When all the worker threads are done, request the
+ * read-ahead thread to terminate
+ *
+ * READ_AHEAD_FAILURE: If there are any IO failures when read-ahead
+ * thread is reading from COW device.
+ *
+ * The transitions between these states are described in snapuserd_readahead.cpp
+ */
+enum class READ_AHEAD_IO_TRANSITION {
+ READ_AHEAD_BEGIN,
+ READ_AHEAD_IN_PROGRESS,
+ IO_IN_PROGRESS,
+ IO_TERMINATED,
+ READ_AHEAD_FAILURE,
+};
+
class BufferSink : public IByteSink {
public:
void Initialize(size_t size);
@@ -76,6 +108,47 @@
class Snapuserd;
+class ReadAheadThread {
+ public:
+ ReadAheadThread(const std::string& cow_device, const std::string& backing_device,
+ const std::string& misc_name, std::shared_ptr<Snapuserd> snapuserd);
+ bool RunThread();
+
+ private:
+ void InitializeIter();
+ bool IterDone();
+ void IterNext();
+ const CowOperation* GetIterOp();
+ void InitializeBuffer();
+
+ bool InitializeFds();
+ void CloseFds() {
+ cow_fd_ = {};
+ backing_store_fd_ = {};
+ }
+
+ bool ReadAheadIOStart();
+ void PrepareReadAhead(uint64_t* source_block, int* pending_ops, std::vector<uint64_t>& blocks);
+ bool ReconstructDataFromCow();
+ void CheckOverlap(const CowOperation* cow_op);
+
+ void* read_ahead_buffer_;
+ void* metadata_buffer_;
+ std::vector<const CowOperation*>::reverse_iterator read_ahead_iter_;
+ std::string cow_device_;
+ std::string backing_store_device_;
+ std::string misc_name_;
+
+ unique_fd cow_fd_;
+ unique_fd backing_store_fd_;
+
+ std::shared_ptr<Snapuserd> snapuserd_;
+
+ std::unordered_set<uint64_t> dest_blocks_;
+ std::unordered_set<uint64_t> source_blocks_;
+ bool overlap_;
+};
+
class WorkerThread {
public:
WorkerThread(const std::string& cow_device, const std::string& backing_device,
@@ -116,12 +189,16 @@
bool ProcessCopyOp(const CowOperation* cow_op);
bool ProcessZeroOp();
+ bool ReadFromBaseDevice(const CowOperation* cow_op);
+ bool GetReadAheadPopulatedBuffer(const CowOperation* cow_op);
+
// Merge related functions
bool ProcessMergeComplete(chunk_t chunk, void* buffer);
loff_t GetMergeStartOffset(void* merged_buffer, void* unmerged_buffer,
int* unmerged_exceptions);
+
int GetNumberOfMergedOps(void* merged_buffer, void* unmerged_buffer, loff_t offset,
- int unmerged_exceptions);
+ int unmerged_exceptions, bool* copy_op, bool* commit);
sector_t ChunkToSector(chunk_t chunk) { return chunk << CHUNK_SHIFT; }
chunk_t SectorToChunk(sector_t sector) { return sector >> CHUNK_SHIFT; }
@@ -158,7 +235,10 @@
bool CommitMerge(int num_merge_ops);
void CloseFds() { cow_fd_ = {}; }
- void FreeResources() { worker_threads_.clear(); }
+ void FreeResources() {
+ worker_threads_.clear();
+ read_ahead_thread_ = nullptr;
+ }
size_t GetMetadataAreaSize() { return vec_.size(); }
void* GetExceptionBuffer(size_t i) { return vec_[i].get(); }
@@ -173,16 +253,47 @@
return p1.first < p2.first;
}
- private:
- std::vector<std::unique_ptr<WorkerThread>> worker_threads_;
+ void UnmapBufferRegion();
+ bool MmapMetadata();
+ // Read-ahead related functions
+ std::vector<const CowOperation*>& GetReadAheadOpsVec() { return read_ahead_ops_; }
+ std::unordered_map<uint64_t, void*>& GetReadAheadMap() { return read_ahead_buffer_map_; }
+ void* GetMappedAddr() { return mapped_addr_; }
+ bool IsReadAheadFeaturePresent() { return read_ahead_feature_; }
+ void PrepareReadAhead();
+ void StartReadAhead();
+ void MergeCompleted();
+ bool ReadAheadIOCompleted(bool sync);
+ void ReadAheadIOFailed();
+ bool WaitForMergeToComplete();
+ bool GetReadAheadPopulatedBuffer(uint64_t block, void* buffer);
+ bool ReconstructDataFromCow() { return populate_data_from_cow_; }
+ void ReconstructDataFromCowFinish() { populate_data_from_cow_ = false; }
+ bool WaitForReadAheadToStart();
+
+ uint64_t GetBufferMetadataOffset();
+ size_t GetBufferMetadataSize();
+ size_t GetBufferDataOffset();
+ size_t GetBufferDataSize();
+
+ // Final block to be merged in a given read-ahead buffer region
+ void SetFinalBlockMerged(uint64_t x) { final_block_merged_ = x; }
+ uint64_t GetFinalBlockMerged() { return final_block_merged_; }
+ // Total number of blocks to be merged in a given read-ahead buffer region
+ void SetTotalRaBlocksMerged(int x) { total_ra_blocks_merged_ = x; }
+ int GetTotalRaBlocksMerged() { return total_ra_blocks_merged_; }
+
+ private:
bool IsChunkIdMetadata(chunk_t chunk);
chunk_t GetNextAllocatableChunkId(chunk_t chunk_id);
+ bool GetRABuffer(std::unique_lock<std::mutex>* lock, uint64_t block, void* buffer);
bool ReadMetadata();
sector_t ChunkToSector(chunk_t chunk) { return chunk << CHUNK_SHIFT; }
chunk_t SectorToChunk(sector_t sector) { return sector >> CHUNK_SHIFT; }
bool IsBlockAligned(int read_size) { return ((read_size & (BLOCK_SZ - 1)) == 0); }
+ struct BufferState* GetBufferState();
std::string cow_device_;
std::string backing_store_device_;
@@ -197,7 +308,6 @@
std::unique_ptr<ICowOpIter> cowop_iter_;
std::unique_ptr<ICowOpReverseIter> cowop_riter_;
std::unique_ptr<CowReader> reader_;
- std::unique_ptr<CowWriter> writer_;
// Vector of disk exception which is a
// mapping of old-chunk to new-chunk
@@ -208,6 +318,21 @@
std::vector<std::pair<sector_t, const CowOperation*>> chunk_vec_;
std::mutex lock_;
+ std::condition_variable cv;
+
+ void* mapped_addr_;
+ size_t total_mapped_addr_length_;
+
+ std::vector<std::unique_ptr<WorkerThread>> worker_threads_;
+ // Read-ahead related
+ std::unordered_map<uint64_t, void*> read_ahead_buffer_map_;
+ std::vector<const CowOperation*> read_ahead_ops_;
+ bool populate_data_from_cow_ = false;
+ bool read_ahead_feature_;
+ uint64_t final_block_merged_;
+ int total_ra_blocks_merged_ = 0;
+ READ_AHEAD_IO_TRANSITION io_state_;
+ std::unique_ptr<ReadAheadThread> read_ahead_thread_;
bool merge_initiated_ = false;
bool attached_ = false;
diff --git a/fs_mgr/libsnapshot/snapuserd_readahead.cpp b/fs_mgr/libsnapshot/snapuserd_readahead.cpp
new file mode 100644
index 0000000..09ee2f2
--- /dev/null
+++ b/fs_mgr/libsnapshot/snapuserd_readahead.cpp
@@ -0,0 +1,460 @@
+/*
+ * Copyright (C) 2021 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "snapuserd.h"
+
+#include <csignal>
+#include <optional>
+#include <set>
+
+#include <libsnapshot/snapuserd_client.h>
+
+namespace android {
+namespace snapshot {
+
+using namespace android;
+using namespace android::dm;
+using android::base::unique_fd;
+
+#define SNAP_LOG(level) LOG(level) << misc_name_ << ": "
+#define SNAP_PLOG(level) PLOG(level) << misc_name_ << ": "
+
+/*
+ * Merging a copy operation involves the following flow:
+ *
+ * 1: dm-snapshot layer requests merge for a 4k block. dm-user sends the request
+ * to the daemon
+ * 2: daemon reads the source block
+ * 3: daemon copies the source data
+ * 4: IO completion sent back to dm-user (a switch from user space to kernel)
+ * 5: dm-snapshot merges the data to base device
+ * 6: dm-snapshot sends the merge-completion IO to dm-user
+ * 7: dm-user re-directs the merge completion IO to daemon (one more switch)
+ * 8: daemon updates the COW file about the completed merge request (a write syscall) followed
+ * by an fsync. 9: Send the IO completion back to dm-user
+ *
+ * The above sequence is a significant overhead especially when merging one 4k
+ * block at a time.
+ *
+ * Read-ahead layer will optimize the above path by reading the data from base
+ * device in the background so that merging thread can retrieve the data from
+ * the read-ahead cache. Additionally, syncing of merged data is deferred to
+ * the read-ahead thread, thereby the IO path is not bottlenecked.
+ *
+ * We create a scratch space of 2MB to store the read-ahead data in the COW
+ * device.
+ *
+ * +-----------------------+
+ * | Header (fixed) |
+ * +-----------------------+
+ * | Scratch space | <-- 2MB
+ * +-----------------------+
+ *
+ * Scratch space is as follows:
+ *
+ * +-----------------------+
+ * | Metadata | <- 4k page
+ * +-----------------------+
+ * | Metadata | <- 4k page
+ * +-----------------------+
+ * | |
+ * | Read-ahead data |
+ * | |
+ * +-----------------------+
+ *
+ * State transitions and communication between read-ahead thread and worker
+ * thread during merge:
+ * =====================================================================
+ *
+ * Worker Threads Read-Ahead thread
+ * ------------------------------------------------------------------
+ *
+ * |
+ * |
+ * --> -----------------READ_AHEAD_BEGIN------------->|
+ * | | | READ_AHEAD_IN_PROGRESS
+ * | WAIT |
+ * | | |
+ * | |<-----------------IO_IN_PROGRESS---------------
+ * | | |
+ * | | IO_IN_PROGRESS WAIT
+ * | | |
+ * |<--| |
+ * | |
+ * ------------------IO_TERMINATED--------------->|
+ * END
+ *
+ *
+ * ===================================================================
+ *
+ * Example:
+ *
+ * We have 6 copy operations to be executed in OTA and there is an overlap. Update-engine
+ * will write to COW file as follows:
+ *
+ * Op-1: 20 -> 23
+ * Op-2: 19 -> 22
+ * Op-3: 18 -> 21
+ * Op-4: 17 -> 20
+ * Op-5: 16 -> 19
+ * Op-6: 15 -> 18
+ *
+ * Read-ahead thread will read all the 6 source blocks and store the data in the
+ * scratch space. Metadata will contain the destination block numbers. Thus,
+ * scratch space will look something like this:
+ *
+ * +--------------+
+ * | Block 23 |
+ * | offset - 1 |
+ * +--------------+
+ * | Block 22 |
+ * | offset - 2 |
+ * +--------------+
+ * | Block 21 |
+ * | offset - 3 |
+ * +--------------+
+ * ...
+ * ...
+ * +--------------+
+ * | Data-Block 20| <-- offset - 1
+ * +--------------+
+ * | Data-Block 19| <-- offset - 2
+ * +--------------+
+ * | Data-Block 18| <-- offset - 3
+ * +--------------+
+ * ...
+ * ...
+ *
+ * ====================================================================
+ * IO Path:
+ *
+ * Read-ahead will serve the data to worker threads during merge only
+ * after metadata and data are persisted to the scratch space. Worker
+ * threads during merge will always retrieve the data from cache; if the
+ * cache is not populated, it will wait for the read-ahead thread to finish.
+ * Furthermore, the number of operations merged will be synced to the header
+ * only when all the blocks in the read-ahead cache are merged. In the above
+ * case, when all 6 operations are merged, COW Header is updated with
+ * num_merge_ops = 6.
+ *
+ * Merge resume after crash:
+ *
+ * Let's say we have a crash after 5 operations are merged. i.e. after
+ * Op-5: 16->19 is completed but before the Op-6 is merged. Thus, COW Header
+ * num_merge_ops will be 0 as not all the ops were merged yet. During the next
+ * reboot, read-ahead thread will re-construct the data in-memory from the
+ * scratch space; when merge resumes, Op-1 will be re-executed. However,
+ * data will be served from read-ahead cache safely even though block 20
+ * was over-written by Op-4.
+ *
+ */
+
+ReadAheadThread::ReadAheadThread(const std::string& cow_device, const std::string& backing_device,
+ const std::string& misc_name,
+ std::shared_ptr<Snapuserd> snapuserd) {
+ cow_device_ = cow_device;
+ backing_store_device_ = backing_device;
+ misc_name_ = misc_name;
+ snapuserd_ = snapuserd;
+}
+
+void ReadAheadThread::CheckOverlap(const CowOperation* cow_op) {
+ if (dest_blocks_.count(cow_op->new_block) || source_blocks_.count(cow_op->source)) {
+ overlap_ = true;
+ }
+
+ dest_blocks_.insert(cow_op->source);
+ source_blocks_.insert(cow_op->new_block);
+}
+
+void ReadAheadThread::PrepareReadAhead(uint64_t* source_block, int* pending_ops,
+ std::vector<uint64_t>& blocks) {
+ int num_ops = *pending_ops;
+ int nr_consecutive = 0;
+
+ if (!IterDone() && num_ops) {
+ // Get the first block
+ const CowOperation* cow_op = GetIterOp();
+ *source_block = cow_op->source;
+ IterNext();
+ num_ops -= 1;
+ nr_consecutive = 1;
+ blocks.push_back(cow_op->new_block);
+
+ if (!overlap_) {
+ CheckOverlap(cow_op);
+ }
+
+ /*
+ * Find number of consecutive blocks working backwards.
+ */
+ while (!IterDone() && num_ops) {
+ const CowOperation* op = GetIterOp();
+ if (op->source != (*source_block - nr_consecutive)) {
+ break;
+ }
+ nr_consecutive += 1;
+ num_ops -= 1;
+ blocks.push_back(op->new_block);
+ IterNext();
+
+ if (!overlap_) {
+ CheckOverlap(op);
+ }
+ }
+ }
+}
+
+bool ReadAheadThread::ReconstructDataFromCow() {
+ std::unordered_map<uint64_t, void*>& read_ahead_buffer_map = snapuserd_->GetReadAheadMap();
+ read_ahead_buffer_map.clear();
+ loff_t metadata_offset = 0;
+ loff_t start_data_offset = snapuserd_->GetBufferDataOffset();
+ int num_ops = 0;
+ int total_blocks_merged = 0;
+
+ while (true) {
+ struct ScratchMetadata* bm = reinterpret_cast<struct ScratchMetadata*>(
+ (char*)metadata_buffer_ + metadata_offset);
+
+ // Done reading metadata
+ if (bm->new_block == 0 && bm->file_offset == 0) {
+ break;
+ }
+
+ loff_t buffer_offset = bm->file_offset - start_data_offset;
+ void* bufptr = static_cast<void*>((char*)read_ahead_buffer_ + buffer_offset);
+ read_ahead_buffer_map[bm->new_block] = bufptr;
+ num_ops += 1;
+ total_blocks_merged += 1;
+
+ metadata_offset += sizeof(struct ScratchMetadata);
+ }
+
+ // We are done re-constructing the mapping; however, we need to make sure
+ // all the COW operations to-be merged are present in the re-constructed
+ // mapping.
+ while (!IterDone()) {
+ const CowOperation* op = GetIterOp();
+ if (read_ahead_buffer_map.find(op->new_block) != read_ahead_buffer_map.end()) {
+ num_ops -= 1;
+ snapuserd_->SetFinalBlockMerged(op->new_block);
+ IterNext();
+ } else {
+ // Verify that we have covered all the ops which were re-constructed
+ // from COW device - These are the ops which are being
+ // re-constructed after crash.
+ CHECK(num_ops == 0);
+ break;
+ }
+ }
+
+ snapuserd_->SetTotalRaBlocksMerged(total_blocks_merged);
+
+ snapuserd_->ReconstructDataFromCowFinish();
+
+ if (!snapuserd_->ReadAheadIOCompleted(true)) {
+ SNAP_LOG(ERROR) << "ReadAheadIOCompleted failed...";
+ snapuserd_->ReadAheadIOFailed();
+ return false;
+ }
+
+ SNAP_LOG(INFO) << "ReconstructDataFromCow success";
+ return true;
+}
+
+bool ReadAheadThread::ReadAheadIOStart() {
+ // Check if the data has to be constructed from the COW file.
+ // This will be true only once during boot up after a crash
+ // during merge.
+ if (snapuserd_->ReconstructDataFromCow()) {
+ return ReconstructDataFromCow();
+ }
+
+ std::unordered_map<uint64_t, void*>& read_ahead_buffer_map = snapuserd_->GetReadAheadMap();
+ read_ahead_buffer_map.clear();
+
+ int num_ops = (snapuserd_->GetBufferDataSize()) / BLOCK_SZ;
+ loff_t metadata_offset = 0;
+
+ struct ScratchMetadata* bm =
+ reinterpret_cast<struct ScratchMetadata*>((char*)metadata_buffer_ + metadata_offset);
+
+ bm->new_block = 0;
+ bm->file_offset = 0;
+
+ std::vector<uint64_t> blocks;
+
+ loff_t buffer_offset = 0;
+ loff_t offset = 0;
+ loff_t file_offset = snapuserd_->GetBufferDataOffset();
+ int total_blocks_merged = 0;
+ overlap_ = false;
+ dest_blocks_.clear();
+ source_blocks_.clear();
+
+ while (true) {
+ uint64_t source_block;
+ int linear_blocks;
+
+ PrepareReadAhead(&source_block, &num_ops, blocks);
+ linear_blocks = blocks.size();
+ if (linear_blocks == 0) {
+ // No more blocks to read
+ SNAP_LOG(DEBUG) << " Read-ahead completed....";
+ break;
+ }
+
+ // Get the first block in the consecutive set of blocks
+ source_block = source_block + 1 - linear_blocks;
+ size_t io_size = (linear_blocks * BLOCK_SZ);
+ num_ops -= linear_blocks;
+ total_blocks_merged += linear_blocks;
+
+ // Mark the block number as the one which will
+ // be the final block to be merged in this entire region.
+ // Read-ahead thread will get
+ // notified when this block is merged to make
+ // forward progress
+ snapuserd_->SetFinalBlockMerged(blocks.back());
+
+ while (linear_blocks) {
+ uint64_t new_block = blocks.back();
+ blocks.pop_back();
+ // Assign the mapping
+ void* bufptr = static_cast<void*>((char*)read_ahead_buffer_ + offset);
+ read_ahead_buffer_map[new_block] = bufptr;
+ offset += BLOCK_SZ;
+
+ bm = reinterpret_cast<struct ScratchMetadata*>((char*)metadata_buffer_ +
+ metadata_offset);
+ bm->new_block = new_block;
+ bm->file_offset = file_offset;
+
+ metadata_offset += sizeof(struct ScratchMetadata);
+ file_offset += BLOCK_SZ;
+
+ linear_blocks -= 1;
+ }
+
+ // Read from the base device consecutive set of blocks in one shot
+ if (!android::base::ReadFullyAtOffset(backing_store_fd_,
+ (char*)read_ahead_buffer_ + buffer_offset, io_size,
+ source_block * BLOCK_SZ)) {
+ SNAP_PLOG(ERROR) << "Copy-op failed. Read from backing store: " << backing_store_device_
+ << "at block :" << source_block << " buffer_offset : " << buffer_offset
+ << " io_size : " << io_size << " buf-addr : " << read_ahead_buffer_;
+
+ snapuserd_->ReadAheadIOFailed();
+ return false;
+ }
+
+ // This is important - explicitly set the contents to zero. This is used
+ // when re-constructing the data after crash. This indicates end of
+ // reading metadata contents when re-constructing the data
+ bm = reinterpret_cast<struct ScratchMetadata*>((char*)metadata_buffer_ + metadata_offset);
+ bm->new_block = 0;
+ bm->file_offset = 0;
+
+ buffer_offset += io_size;
+ CHECK(offset == buffer_offset);
+ CHECK((file_offset - snapuserd_->GetBufferDataOffset()) == offset);
+ }
+
+ snapuserd_->SetTotalRaBlocksMerged(total_blocks_merged);
+
+ // Flush the data only if we have a overlapping blocks in the region
+ if (!snapuserd_->ReadAheadIOCompleted(overlap_)) {
+ SNAP_LOG(ERROR) << "ReadAheadIOCompleted failed...";
+ snapuserd_->ReadAheadIOFailed();
+ return false;
+ }
+
+ return true;
+}
+
+bool ReadAheadThread::RunThread() {
+ if (!InitializeFds()) {
+ return false;
+ }
+
+ InitializeIter();
+ InitializeBuffer();
+
+ while (!IterDone()) {
+ if (!ReadAheadIOStart()) {
+ return false;
+ }
+
+ bool status = snapuserd_->WaitForMergeToComplete();
+
+ if (status && !snapuserd_->CommitMerge(snapuserd_->GetTotalRaBlocksMerged())) {
+ return false;
+ }
+
+ if (!status) break;
+ }
+
+ CloseFds();
+ SNAP_LOG(INFO) << " ReadAhead thread terminating....";
+ return true;
+}
+
+// Initialization
+bool ReadAheadThread::InitializeFds() {
+ backing_store_fd_.reset(open(backing_store_device_.c_str(), O_RDONLY));
+ if (backing_store_fd_ < 0) {
+ SNAP_PLOG(ERROR) << "Open Failed: " << backing_store_device_;
+ return false;
+ }
+
+ cow_fd_.reset(open(cow_device_.c_str(), O_RDWR));
+ if (cow_fd_ < 0) {
+ SNAP_PLOG(ERROR) << "Open Failed: " << cow_device_;
+ return false;
+ }
+
+ return true;
+}
+
+void ReadAheadThread::InitializeIter() {
+ std::vector<const CowOperation*>& read_ahead_ops = snapuserd_->GetReadAheadOpsVec();
+ read_ahead_iter_ = read_ahead_ops.rbegin();
+}
+
+bool ReadAheadThread::IterDone() {
+ std::vector<const CowOperation*>& read_ahead_ops = snapuserd_->GetReadAheadOpsVec();
+ return read_ahead_iter_ == read_ahead_ops.rend();
+}
+
+void ReadAheadThread::IterNext() {
+ read_ahead_iter_++;
+}
+
+const CowOperation* ReadAheadThread::GetIterOp() {
+ return *read_ahead_iter_;
+}
+
+void ReadAheadThread::InitializeBuffer() {
+ void* mapped_addr = snapuserd_->GetMappedAddr();
+ // Map the scratch space region into memory
+ metadata_buffer_ =
+ static_cast<void*>((char*)mapped_addr + snapuserd_->GetBufferMetadataOffset());
+ read_ahead_buffer_ = static_cast<void*>((char*)mapped_addr + snapuserd_->GetBufferDataOffset());
+}
+
+} // namespace snapshot
+} // namespace android
diff --git a/fs_mgr/libsnapshot/snapuserd_server.cpp b/fs_mgr/libsnapshot/snapuserd_server.cpp
index 5bbcc66..ff8a259 100644
--- a/fs_mgr/libsnapshot/snapuserd_server.cpp
+++ b/fs_mgr/libsnapshot/snapuserd_server.cpp
@@ -209,10 +209,11 @@
}
handler->snapuserd()->CloseFds();
+ handler->snapuserd()->CheckMergeCompletionStatus();
+ handler->snapuserd()->UnmapBufferRegion();
auto misc_name = handler->misc_name();
LOG(INFO) << "Handler thread about to exit: " << misc_name;
- handler->snapuserd()->CheckMergeCompletionStatus();
{
std::lock_guard<std::mutex> lock(lock_);
diff --git a/fs_mgr/libsnapshot/snapuserd_worker.cpp b/fs_mgr/libsnapshot/snapuserd_worker.cpp
index 1002569..9f42ab8 100644
--- a/fs_mgr/libsnapshot/snapuserd_worker.cpp
+++ b/fs_mgr/libsnapshot/snapuserd_worker.cpp
@@ -135,14 +135,11 @@
return true;
}
-// Start the copy operation. This will read the backing
-// block device which is represented by cow_op->source.
-bool WorkerThread::ProcessCopyOp(const CowOperation* cow_op) {
+bool WorkerThread::ReadFromBaseDevice(const CowOperation* cow_op) {
void* buffer = bufsink_.GetPayloadBuffer(BLOCK_SZ);
CHECK(buffer != nullptr);
-
- // Issue a single 4K IO. However, this can be optimized
- // if the successive blocks are contiguous.
+ SNAP_LOG(DEBUG) << " ReadFromBaseDevice...: new-block: " << cow_op->new_block
+ << " Source: " << cow_op->source;
if (!android::base::ReadFullyAtOffset(backing_store_fd_, buffer, BLOCK_SZ,
cow_op->source * BLOCK_SZ)) {
SNAP_PLOG(ERROR) << "Copy-op failed. Read from backing store: " << backing_store_device_
@@ -153,6 +150,31 @@
return true;
}
+bool WorkerThread::GetReadAheadPopulatedBuffer(const CowOperation* cow_op) {
+ void* buffer = bufsink_.GetPayloadBuffer(BLOCK_SZ);
+ CHECK(buffer != nullptr);
+
+ if (!snapuserd_->GetReadAheadPopulatedBuffer(cow_op->new_block, buffer)) {
+ return false;
+ }
+
+ return true;
+}
+
+// Start the copy operation. This will read the backing
+// block device which is represented by cow_op->source.
+bool WorkerThread::ProcessCopyOp(const CowOperation* cow_op) {
+ if (!GetReadAheadPopulatedBuffer(cow_op)) {
+ SNAP_LOG(DEBUG) << " GetReadAheadPopulatedBuffer failed..."
+ << " new_block: " << cow_op->new_block;
+ if (!ReadFromBaseDevice(cow_op)) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
bool WorkerThread::ProcessZeroOp() {
// Zero out the entire block
void* buffer = bufsink_.GetPayloadBuffer(BLOCK_SZ);
@@ -386,8 +408,10 @@
}
int WorkerThread::GetNumberOfMergedOps(void* merged_buffer, void* unmerged_buffer, loff_t offset,
- int unmerged_exceptions) {
+ int unmerged_exceptions, bool* copy_op, bool* commit) {
int merged_ops_cur_iter = 0;
+ std::unordered_map<uint64_t, void*>& read_ahead_buffer_map = snapuserd_->GetReadAheadMap();
+ *copy_op = false;
std::vector<std::pair<sector_t, const CowOperation*>>& chunk_vec = snapuserd_->GetChunkVec();
// Find the operations which are merged in this cycle.
@@ -411,6 +435,23 @@
const CowOperation* cow_op = it->second;
CHECK(cow_op != nullptr);
+ if (snapuserd_->IsReadAheadFeaturePresent() && cow_op->type == kCowCopyOp) {
+ *copy_op = true;
+ // Every single copy operation has to come from read-ahead
+ // cache.
+ if (read_ahead_buffer_map.find(cow_op->new_block) == read_ahead_buffer_map.end()) {
+ SNAP_LOG(ERROR)
+ << " Block: " << cow_op->new_block << " not found in read-ahead cache"
+ << " Source: " << cow_op->source;
+ return -1;
+ }
+ // If this is a final block merged in the read-ahead buffer
+ // region, notify the read-ahead thread to make forward
+ // progress
+ if (cow_op->new_block == snapuserd_->GetFinalBlockMerged()) {
+ *commit = true;
+ }
+ }
CHECK(cow_op->new_block == cow_de->old_chunk);
// zero out to indicate that operation is merged.
@@ -442,6 +483,8 @@
bool WorkerThread::ProcessMergeComplete(chunk_t chunk, void* buffer) {
uint32_t stride = exceptions_per_area_ + 1;
const std::vector<std::unique_ptr<uint8_t[]>>& vec = snapuserd_->GetMetadataVec();
+ bool copy_op = false;
+ bool commit = false;
// ChunkID to vector index
lldiv_t divresult = lldiv(chunk, stride);
@@ -452,13 +495,24 @@
int unmerged_exceptions = 0;
loff_t offset = GetMergeStartOffset(buffer, vec[divresult.quot].get(), &unmerged_exceptions);
- int merged_ops_cur_iter =
- GetNumberOfMergedOps(buffer, vec[divresult.quot].get(), offset, unmerged_exceptions);
+ int merged_ops_cur_iter = GetNumberOfMergedOps(buffer, vec[divresult.quot].get(), offset,
+ unmerged_exceptions, &copy_op, &commit);
// There should be at least one operation merged in this cycle
CHECK(merged_ops_cur_iter > 0);
- if (!snapuserd_->CommitMerge(merged_ops_cur_iter)) {
- return false;
+
+ if (copy_op) {
+ if (commit) {
+ // Push the flushing logic to read-ahead thread so that merge thread
+ // can make forward progress. Sync will happen in the background
+ snapuserd_->StartReadAhead();
+ }
+ } else {
+ // Non-copy ops and all ops in older COW format
+ if (!snapuserd_->CommitMerge(merged_ops_cur_iter)) {
+ SNAP_LOG(ERROR) << "CommitMerge failed...";
+ return false;
+ }
}
SNAP_LOG(DEBUG) << "Merge success: " << merged_ops_cur_iter << "chunk: " << chunk;
@@ -613,12 +667,21 @@
}
}
+ // Just return the header if it is an error
+ if (header->type == DM_USER_RESP_ERROR) {
+ ret = 0;
+ }
+
// Daemon will not be terminated if there is any error. We will
// just send the error back to dm-user.
if (!WriteDmUserPayload(ret)) {
return false;
}
+ if (header->type == DM_USER_RESP_ERROR) {
+ break;
+ }
+
remaining_size -= ret;
offset += ret;
} while (remaining_size > 0);
diff --git a/fs_mgr/libsnapshot/update_engine/update_metadata.proto b/fs_mgr/libsnapshot/update_engine/update_metadata.proto
index f31ee31..69d72e1 100644
--- a/fs_mgr/libsnapshot/update_engine/update_metadata.proto
+++ b/fs_mgr/libsnapshot/update_engine/update_metadata.proto
@@ -75,6 +75,7 @@
repeated DynamicPartitionGroup groups = 1;
optional bool vabc_enabled = 3;
optional string vabc_compression_param = 4;
+ optional uint32 cow_version = 5;
}
message DeltaArchiveManifest {