Merge "libsnapshot:Snapuserd: IO path support with dm-snapshot target"
diff --git a/fs_mgr/libsnapshot/Android.bp b/fs_mgr/libsnapshot/Android.bp
index be6db04..d11d3e4 100644
--- a/fs_mgr/libsnapshot/Android.bp
+++ b/fs_mgr/libsnapshot/Android.bp
@@ -148,12 +148,12 @@
     recovery_available: true,
     shared_libs: [
         "libbase",
-        "libcrypto",
         "liblog",
     ],
     static_libs: [
         "libz",
     ],
+    ramdisk_available: true,
 }
 
 cc_library_static {
@@ -347,6 +347,9 @@
 
 cc_defaults {
     name: "snapuserd_defaults",
+    defaults: [
+        "fs_mgr_defaults",
+    ],
     srcs: [
         "snapuserd.cpp",
     ],
@@ -360,6 +363,8 @@
         "libbase",
         "liblog",
         "libdm",
+	"libz",
+	"libsnapshot_cow",
     ],
 }
 
@@ -375,7 +380,6 @@
 
     ramdisk: true,
     static_executable: true,
-    system_shared_libs: [],
 }
 
 cc_test {
@@ -473,3 +477,32 @@
         },
     },
 }
+
+cc_test {
+    name: "cow_snapuserd_test",
+    defaults: [
+        "fs_mgr_defaults",
+    ],
+    srcs: [
+        "cow_snapuserd_test.cpp",
+    ],
+    cflags: [
+        "-Wall",
+        "-Werror",
+    ],
+    shared_libs: [
+        "libbase",
+        "liblog",
+        "libz",
+    ],
+    static_libs: [
+        "libgtest",
+        "libsnapshot_cow",
+    ],
+    header_libs: [
+        "libstorage_literals_headers",
+    ],
+    test_min_api_level: 30,
+    auto_gen_config: true,
+    require_root: false,
+}
diff --git a/fs_mgr/libsnapshot/cow_reader.cpp b/fs_mgr/libsnapshot/cow_reader.cpp
index 86565c4..7f77aec 100644
--- a/fs_mgr/libsnapshot/cow_reader.cpp
+++ b/fs_mgr/libsnapshot/cow_reader.cpp
@@ -20,7 +20,6 @@
 #include <android-base/file.h>
 #include <android-base/logging.h>
 #include <libsnapshot/cow_reader.h>
-#include <openssl/sha.h>
 #include <zlib.h>
 
 namespace android {
@@ -28,11 +27,13 @@
 
 CowReader::CowReader() : fd_(-1), header_(), fd_size_(0) {}
 
-static void SHA256(const void* data, size_t length, uint8_t out[32]) {
+static void SHA256(const void*, size_t, uint8_t[]) {
+#if 0
     SHA256_CTX c;
     SHA256_Init(&c);
     SHA256_Update(&c, data, length);
     SHA256_Final(out, &c);
+#endif
 }
 
 bool CowReader::Parse(android::base::unique_fd&& fd) {
@@ -69,16 +70,35 @@
         return false;
     }
 
+    if (header_.magic != kCowMagicNumber) {
+        LOG(ERROR) << "Header Magic corrupted. Magic: " << header_.magic
+                   << "Expected: " << kCowMagicNumber;
+        return false;
+    }
+
+    if ((header_.major_version != kCowVersionMajor) ||
+        (header_.minor_version != kCowVersionMinor)) {
+        LOG(ERROR) << "Header version mismatch";
+        LOG(ERROR) << "Major version: " << header_.major_version
+                   << "Expected: " << kCowVersionMajor;
+        LOG(ERROR) << "Minor version: " << header_.minor_version
+                   << "Expected: " << kCowVersionMinor;
+        return false;
+    }
+
     uint8_t header_csum[32];
     {
         CowHeader tmp = header_;
         memset(&tmp.header_checksum, 0, sizeof(tmp.header_checksum));
+        memset(header_csum, 0, sizeof(uint8_t) * 32);
+
         SHA256(&tmp, sizeof(tmp), header_csum);
     }
     if (memcmp(header_csum, header_.header_checksum, sizeof(header_csum)) != 0) {
         LOG(ERROR) << "header checksum is invalid";
         return false;
     }
+
     return true;
 }
 
@@ -140,6 +160,8 @@
     }
 
     uint8_t csum[32];
+    memset(csum, 0, sizeof(uint8_t) * 32);
+
     SHA256(ops_buffer.get(), header_.ops_size, csum);
     if (memcmp(csum, header_.ops_checksum, sizeof(csum)) != 0) {
         LOG(ERROR) << "ops checksum does not match";
diff --git a/fs_mgr/libsnapshot/cow_snapuserd_test.cpp b/fs_mgr/libsnapshot/cow_snapuserd_test.cpp
new file mode 100644
index 0000000..d767022
--- /dev/null
+++ b/fs_mgr/libsnapshot/cow_snapuserd_test.cpp
@@ -0,0 +1,255 @@
+// Copyright (C) 2018 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <fcntl.h>
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <iostream>
+#include <memory>
+#include <string_view>
+
+#include <android-base/file.h>
+#include <android-base/unique_fd.h>
+#include <gtest/gtest.h>
+#include <libsnapshot/cow_writer.h>
+#include <storage_literals/storage_literals.h>
+
+namespace android {
+namespace snapshot {
+
+using namespace android::storage_literals;
+using android::base::unique_fd;
+
+class SnapuserdTest : public ::testing::Test {
+  protected:
+    void SetUp() override {
+        cow_ = std::make_unique<TemporaryFile>();
+        ASSERT_GE(cow_->fd, 0) << strerror(errno);
+    }
+
+    void TearDown() override { cow_ = nullptr; }
+
+    std::unique_ptr<TemporaryFile> cow_;
+};
+
+TEST_F(SnapuserdTest, ReadWrite) {
+    // End-to-end check of the snapuserd IO path: build a gz-compressed COW with
+    // replace, copy and zero operations, expose it through dm-user and
+    // dm-snapshot, then read the snapshot device back and compare every 100MB
+    // region with the data it is expected to hold. The test shells out to
+    // dmctl, starts /system/bin/snapuserd and reads
+    // /dev/block/mapper/system_a, so it needs a device that provides them.
+    loff_t offset = 0;
+    size_t size = 100_MiB;
+    unique_fd rnd_fd;
+    unique_fd sys_fd;
+    unique_fd snapshot_fd;
+    unique_fd system_a_fd;
+    std::string cmd;
+
+    rnd_fd.reset(open("/dev/random", O_RDONLY));
+    ASSERT_GE(rnd_fd.get(), 0) << strerror(errno);
+
+    auto random_buffer_1 = std::make_unique<uint8_t[]>(size);
+    auto random_buffer_2 = std::make_unique<uint8_t[]>(size);
+    auto system_buffer = std::make_unique<uint8_t[]>(size);
+
+    // Fill random data
+    for (size_t j = 0; j < (size / 1_MiB); j++) {
+        ASSERT_EQ(ReadFullyAtOffset(rnd_fd, (char*)random_buffer_1.get() + offset, 1_MiB, 0), true);
+
+        ASSERT_EQ(ReadFullyAtOffset(rnd_fd, (char*)random_buffer_2.get() + offset, 1_MiB, 0), true);
+
+        offset += 1_MiB;
+    }
+
+    sys_fd.reset(open("/dev/block/mapper/system_a", O_RDONLY));
+    ASSERT_GE(sys_fd.get(), 0) << strerror(errno);
+
+    // Read from system partition from offset 0 of size 100MB
+    ASSERT_EQ(ReadFullyAtOffset(sys_fd, system_buffer.get(), size, 0), true);
+
+    //================Create a COW file with the following operations===========
+    //
+    // Create COW file which is gz compressed
+    //
+    // 0-100 MB of replace operation with random data
+    // 100-200 MB of copy operation
+    // 200-300 MB of zero operation
+    // 300-400 MB of replace operation with random data
+
+    CowOptions options;
+    options.compression = "gz";
+    CowWriter writer(options);
+
+    ASSERT_TRUE(writer.Initialize(cow_->fd));
+
+    // Write 100MB random data to COW file which is gz compressed from block 0
+    ASSERT_TRUE(writer.AddRawBlocks(0, random_buffer_1.get(), size));
+
+    size_t num_blocks = size / options.block_size;
+    size_t blk_start_copy = num_blocks;
+    size_t blk_end_copy = blk_start_copy + num_blocks;
+    size_t source_blk = 0;
+
+    // Copy blocks - source_blk starts from 0 as snapuserd
+    // has to read from block 0 in system_a partition
+    //
+    // This initializes copy operation from block 0 of size 100 MB from
+    // /dev/block/mapper/system_a
+    for (size_t i = blk_start_copy; i < blk_end_copy; i++) {
+        ASSERT_TRUE(writer.AddCopy(i, source_blk));
+        source_blk += 1;
+    }
+
+    size_t blk_zero_copy_start = blk_end_copy;
+    size_t blk_zero_copy_end = blk_zero_copy_start + num_blocks;
+
+    // 100 MB filled with zeroes
+    ASSERT_TRUE(writer.AddZeroBlocks(blk_zero_copy_start, num_blocks));
+
+    // Final 100MB filled with random data which is gz compressed
+    size_t blk_random2_replace_start = blk_zero_copy_end;
+
+    ASSERT_TRUE(writer.AddRawBlocks(blk_random2_replace_start, random_buffer_2.get(), size));
+
+    // Flush operations
+    ASSERT_TRUE(writer.Finalize());
+
+    ASSERT_EQ(lseek(cow_->fd, 0, SEEK_SET), 0);
+
+    //================Setup dm-snapshot and start snapuserd daemon===========
+
+    // Create the dm-user backed COW device. Its length in sectors is read from
+    // system_a, which is expected to hold at least 400MB of data
+
+    system_a_fd.reset(open("/dev/block/mapper/system_a", O_RDONLY));
+    ASSERT_GE(system_a_fd.get(), 0) << strerror(errno);
+
+    // BLKGETSIZE reports the device size in 512-byte sectors and writes it
+    // through an unsigned long*, so the destination must not be an int:
+    // that would overrun the variable on LP64 targets.
+    unsigned long num_sectors = 0;
+    ASSERT_GE(ioctl(system_a_fd.get(), BLKGETSIZE, &num_sectors), 0) << strerror(errno);
+
+    cmd = "dmctl create system_cow user 0 " + std::to_string(num_sectors);
+    system(cmd.c_str());
+
+    // Start the snapuserd daemon
+    if (fork() == 0) {
+        const char* argv[] = {"/system/bin/snapuserd", cow_->path, "/dev/block/mapper/system_a",
+                              nullptr};
+        execv(argv[0], const_cast<char**>(argv));
+        // execv() returned, so it failed: exit without running the parent's test code.
+        _exit(1);
+    }
+
+    cmd.clear();
+
+    cmd = "dmctl create system-snapshot -ro snapshot 0 " + std::to_string(num_sectors);
+    cmd += " /dev/block/mapper/system_a /dev/block/mapper/system_cow ";
+    cmd += "P 8";
+    system(cmd.c_str());
+
+    // Wait so that snapshot device is created
+    sleep(5);
+    std::unique_ptr<uint8_t[]> snapuserd_buffer = std::make_unique<uint8_t[]>(size);
+
+    offset = 0;
+
+    snapshot_fd.reset(open("/dev/block/mapper/system-snapshot", O_RDONLY));
+    ASSERT_GE(snapshot_fd.get(), 0) << strerror(errno);
+
+    //================Start IO operation on dm-snapshot device=================
+    // This will test the following paths:
+    //
+    // 1: IO path for all three operations and interleaving of operations.
+    // 2: Merging of blocks in kernel during metadata read
+    // 3: Bulk IO issued by kernel during merge operation
+
+    // Read from snapshot device of size 100MB from offset 0. This tests the
+    // 1st replace operation.
+    //
+    // IO path:
+    //
+    // dm-snap->dm-snap-persistent->dm-user->snapuserd->read_compressed_cow (replace
+    // op)->decompress_cow->return
+
+    ASSERT_EQ(ReadFullyAtOffset(snapshot_fd, snapuserd_buffer.get(), size, offset), true);
+
+    // Update the offset
+    offset += size;
+
+    // Compare data with random_buffer_1.
+    ASSERT_EQ(memcmp(snapuserd_buffer.get(), random_buffer_1.get(), size), 0);
+
+    // Clear the buffer
+    memset(snapuserd_buffer.get(), 0, size);
+
+    // Read from snapshot device of size 100MB from offset 100MB. This tests the
+    // copy operation.
+    //
+    // IO path:
+    //
+    // dm-snap->dm-snap-persistent->dm-user->snapuserd->read_from_system_a_partition
+    // (copy op) -> return
+    ASSERT_EQ(ReadFullyAtOffset(snapshot_fd, snapuserd_buffer.get(), size, offset), true);
+
+    // Update the offset
+    offset += size;
+
+    // Compare data with system_buffer.
+    ASSERT_EQ(memcmp(snapuserd_buffer.get(), system_buffer.get(), size), 0);
+
+    // Read from snapshot device of size 100MB from offset 200MB. This tests the
+    // zero operation.
+    //
+    // IO path:
+    //
+    // dm-snap->dm-snap-persistent->dm-user->snapuserd->fill_memory_with_zero
+    // (zero op) -> return
+    ASSERT_EQ(ReadFullyAtOffset(snapshot_fd, snapuserd_buffer.get(), size, offset), true);
+
+    // Fill the random_buffer_1 with zero as we no longer need it
+    memset(random_buffer_1.get(), 0, size);
+
+    // Compare data with zero filled buffer
+    ASSERT_EQ(memcmp(snapuserd_buffer.get(), random_buffer_1.get(), size), 0);
+
+    // Update the offset
+    offset += size;
+
+    // Read from snapshot device of size 100MB from offset 300MB. This tests the
+    // final replace operation.
+    //
+    // IO path:
+    //
+    // dm-snap->dm-snap-persistent->dm-user->snapuserd->read_compressed_cow (replace
+    // op)->decompress_cow->return
+    ASSERT_EQ(ReadFullyAtOffset(snapshot_fd, snapuserd_buffer.get(), size, offset), true);
+
+    // Compare data with random_buffer_2.
+    ASSERT_EQ(memcmp(snapuserd_buffer.get(), random_buffer_2.get(), size), 0);
+}
+
+}  // namespace snapshot
+}  // namespace android
+
+int main(int argc, char** argv) {
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
diff --git a/fs_mgr/libsnapshot/cow_writer.cpp b/fs_mgr/libsnapshot/cow_writer.cpp
index ff43997..76238c2 100644
--- a/fs_mgr/libsnapshot/cow_writer.cpp
+++ b/fs_mgr/libsnapshot/cow_writer.cpp
@@ -23,7 +23,6 @@
 #include <android-base/logging.h>
 #include <android-base/unique_fd.h>
 #include <libsnapshot/cow_writer.h>
-#include <openssl/sha.h>
 #include <zlib.h>
 
 namespace android {
@@ -179,11 +178,15 @@
     return {};
 }
 
-static void SHA256(const void* data, size_t length, uint8_t out[32]) {
+// TODO: Fix compilation issues when linking libcrypto library
+// when snapuserd is compiled as part of ramdisk.
+static void SHA256(const void*, size_t, uint8_t[]) {
+#if 0
     SHA256_CTX c;
     SHA256_Init(&c);
     SHA256_Update(&c, data, length);
     SHA256_Final(out, &c);
+#endif
 }
 
 bool CowWriter::Finalize() {
@@ -199,6 +202,9 @@
     header_.ops_offset = offs;
     header_.ops_size = ops_.size();
 
+    memset(header_.ops_checksum, 0, sizeof(uint8_t) * 32);
+    memset(header_.header_checksum, 0, sizeof(uint8_t) * 32);
+
     SHA256(ops_.data(), ops_.size(), header_.ops_checksum);
     SHA256(&header_, sizeof(header_), header_.header_checksum);
 
diff --git a/fs_mgr/libsnapshot/include/libsnapshot/cow_reader.h b/fs_mgr/libsnapshot/include/libsnapshot/cow_reader.h
index a3b1291..9e9f9b8 100644
--- a/fs_mgr/libsnapshot/include/libsnapshot/cow_reader.h
+++ b/fs_mgr/libsnapshot/include/libsnapshot/cow_reader.h
@@ -92,6 +92,10 @@
     bool Parse(android::base::borrowed_fd fd);
 
     bool GetHeader(CowHeader* header) override;
+
+    // Create a CowOpIter object which contains header_.num_ops
+    // CowOperation objects. Get() returns a unique CowOperation object
+    // whose lifetime depends on the CowOpIter object
     std::unique_ptr<ICowOpIter> GetOpIter() override;
     bool GetRawBytes(uint64_t offset, void* buffer, size_t len) override;
     bool ReadData(const CowOperation& op, IByteSink* sink) override;
diff --git a/fs_mgr/libsnapshot/include/libsnapshot/snapuserd.h b/fs_mgr/libsnapshot/include/libsnapshot/snapuserd.h
new file mode 100644
index 0000000..e757579
--- /dev/null
+++ b/fs_mgr/libsnapshot/include/libsnapshot/snapuserd.h
@@ -0,0 +1,99 @@
+// Copyright (C) 2020 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <stdint.h>
+
+namespace android {
+namespace snapshot {
+
+// Kernel COW header fields
+static constexpr uint32_t SNAP_MAGIC = 0x70416e53;
+
+static constexpr uint32_t SNAPSHOT_DISK_VERSION = 1;
+
+static constexpr uint32_t NUM_SNAPSHOT_HDR_CHUNKS = 1;
+
+static constexpr uint32_t SNAPSHOT_VALID = 1;
+
+/*
+ * The basic unit of block I/O is a sector. It is used in a number of contexts
+ * in Linux (blk, bio, genhd). The size of one sector is 512 = 2**9
+ * bytes. Variables of type sector_t represent an offset or size that is a
+ * multiple of 512 bytes. Hence the shift constant below.
+ */
+static constexpr uint32_t SECTOR_SHIFT = 9;
+
+typedef __u64 sector_t;
+typedef sector_t chunk_t;
+
+static constexpr uint32_t CHUNK_SIZE = 8;
+static constexpr uint32_t CHUNK_SHIFT = (__builtin_ffs(CHUNK_SIZE) - 1);
+
+static constexpr uint32_t BLOCK_SIZE = 4096;
+static constexpr uint32_t BLOCK_SHIFT = (__builtin_ffs(BLOCK_SIZE) - 1);
+
+// This structure represents the kernel COW header.
+// All the below fields should be in Little Endian format.
+struct disk_header {
+    uint32_t magic;
+
+    /*
+     * Is this snapshot valid.  There is no way of recovering
+     * an invalid snapshot.
+     */
+    uint32_t valid;
+
+    /*
+     * Simple, incrementing version. no backward
+     * compatibility.
+     */
+    uint32_t version;
+
+    /* In sectors */
+    uint32_t chunk_size;
+} __attribute__((packed));
+
+// A disk exception is a mapping of old_chunk to new_chunk
+// old_chunk is the chunk ID of a dm-snapshot device.
+// new_chunk is the chunk ID of the COW device.
+struct disk_exception {
+    uint64_t old_chunk;
+    uint64_t new_chunk;
+} __attribute__((packed));
+
+// Control structures to communicate with dm-user
+// It consists of a header and a payload
+struct dm_user_header {
+    __u64 seq;
+    __u64 type;
+    __u64 flags;
+    __u64 sector;
+    __u64 len;
+    __u64 io_in_progress;
+} __attribute__((packed));
+
+struct dm_user_payload {
+    __u8 buf[];
+};
+
+// Message comprising both header and payload
+struct dm_user_message {
+    struct dm_user_header header;
+    struct dm_user_payload payload;
+};
+
+}  // namespace snapshot
+}  // namespace android
diff --git a/fs_mgr/libsnapshot/snapuserd.cpp b/fs_mgr/libsnapshot/snapuserd.cpp
index a6ff4fd..605af9b 100644
--- a/fs_mgr/libsnapshot/snapuserd.cpp
+++ b/fs_mgr/libsnapshot/snapuserd.cpp
@@ -15,102 +15,662 @@
  */
 
 #include <linux/types.h>
+#include <stdlib.h>
+
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <string>
+#include <vector>
 
 #include <android-base/file.h>
 #include <android-base/logging.h>
 #include <android-base/stringprintf.h>
 #include <android-base/unique_fd.h>
 #include <libdm/dm.h>
+#include <libsnapshot/cow_reader.h>
+#include <libsnapshot/cow_writer.h>
+#include <libsnapshot/snapuserd.h>
 
+namespace android {
+namespace snapshot {
+
+using namespace android::dm;
 using android::base::unique_fd;
 
 #define DM_USER_MAP_READ 0
 #define DM_USER_MAP_WRITE 1
 
-struct dm_user_message {
-    __u64 seq;
-    __u64 type;
-    __u64 flags;
-    __u64 sector;
-    __u64 len;
-    __u8 buf[];
+static constexpr size_t PAYLOAD_SIZE = (1UL << 16);
+
+static_assert(PAYLOAD_SIZE >= BLOCK_SIZE);
+
+class BufferSink : public IByteSink {
+  public:
+    void Initialize(size_t size) {
+        buffer_size_ = size;
+        buffer_offset_ = 0;
+        buffer_ = std::make_unique<uint8_t[]>(size);
+    }
+
+    void* GetBufPtr() { return buffer_.get(); }
+
+    void Clear() { memset(GetBufPtr(), 0, buffer_size_); }
+
+    void* GetPayloadBuffer(size_t size) {
+        if ((buffer_size_ - buffer_offset_) < size) return nullptr;
+
+        char* buffer = reinterpret_cast<char*>(GetBufPtr());
+        struct dm_user_message* msg = (struct dm_user_message*)(&(buffer[0]));
+        return (char*)msg->payload.buf + buffer_offset_;
+    }
+
+    void* GetBuffer(size_t requested, size_t* actual) override {
+        void* buf = GetPayloadBuffer(requested);
+        if (!buf) {
+            *actual = 0;
+            return nullptr;
+        }
+        *actual = requested;
+        return buf;
+    }
+
+    void UpdateBufferOffset(size_t size) { buffer_offset_ += size; }
+
+    struct dm_user_header* GetHeaderPtr() {
+        CHECK(sizeof(struct dm_user_header) <= buffer_size_);
+        char* buf = reinterpret_cast<char*>(GetBufPtr());
+        struct dm_user_header* header = (struct dm_user_header*)(&(buf[0]));
+        return header;
+    }
+
+    bool ReturnData(void*, size_t) override { return true; }
+    void ResetBufferOffset() { buffer_offset_ = 0; }
+
+  private:
+    std::unique_ptr<uint8_t[]> buffer_;
+    loff_t buffer_offset_;
+    size_t buffer_size_;
 };
 
-using namespace android::dm;
+class Snapuserd final {
+  public:
+    Snapuserd(const std::string& in_cow_device, const std::string& in_backing_store_device)
+        : in_cow_device_(in_cow_device),
+          in_backing_store_device_(in_backing_store_device),
+          metadata_read_done_(false) {}
 
-static int daemon_main(const std::string& device) {
-    unique_fd block_fd(open(device.c_str(), O_RDWR));
-    if (block_fd < 0) {
-        PLOG(ERROR) << "Unable to open " << device;
-        return 1;
+    int Run();
+    int ReadDmUserHeader();
+    int WriteDmUserPayload(size_t size);
+    int ConstructKernelCowHeader();
+    int ReadMetadata();
+    int ZerofillDiskExceptions(size_t read_size);
+    int ReadDiskExceptions(chunk_t chunk, size_t size);
+    int ReadData(chunk_t chunk, size_t size);
+
+  private:
+    int ProcessReplaceOp(const CowOperation* cow_op);
+    int ProcessCopyOp(const CowOperation* cow_op);
+    int ProcessZeroOp();
+
+    std::string in_cow_device_;
+    std::string in_backing_store_device_;
+
+    unique_fd cow_fd_;
+    unique_fd backing_store_fd_;
+    unique_fd ctrl_fd_;
+
+    uint32_t exceptions_per_area_;
+
+    std::unique_ptr<ICowOpIter> cowop_iter_;
+    std::unique_ptr<CowReader> reader_;
+
+    // Vector of disk exception which is a
+    // mapping of old-chunk to new-chunk
+    std::vector<std::unique_ptr<uint8_t[]>> vec_;
+
+    // Index - Chunk ID
+    // Value - cow operation
+    std::vector<const CowOperation*> chunk_vec_;
+
+    bool metadata_read_done_;
+    BufferSink bufsink_;
+};
+
+// Construct the kernel COW header in memory.
+// This header will be in sector 0. The IO
+// request will always be 4k, so the block is zeroed
+// first and the header fields are then filled in.
+int Snapuserd::ConstructKernelCowHeader() {
+    void* buffer = bufsink_.GetPayloadBuffer(BLOCK_SIZE);
+    CHECK(buffer != nullptr);
+
+    memset(buffer, 0, BLOCK_SIZE);
+
+    struct disk_header* dh = reinterpret_cast<struct disk_header*>(buffer);
+
+    dh->magic = SNAP_MAGIC;
+    dh->valid = SNAPSHOT_VALID;
+    dh->version = SNAPSHOT_DISK_VERSION;
+    dh->chunk_size = CHUNK_SIZE;
+
+    return BLOCK_SIZE;
+}
+
+// Start the replace operation. This will read the
+// internal COW format and if the block is compressed,
+// it will be de-compressed.
+int Snapuserd::ProcessReplaceOp(const CowOperation* cow_op) {
+    if (!reader_->ReadData(*cow_op, &bufsink_)) {
+        LOG(ERROR) << "ReadData failed for chunk: " << cow_op->new_block;
+        return -EIO;
     }
 
-    unique_fd ctrl_fd(open("/dev/dm-user", O_RDWR));
-    if (ctrl_fd < 0) {
-        PLOG(ERROR) << "Unable to open /dev/dm-user";
-        return 1;
+    return BLOCK_SIZE;
+}
+
+// Start the copy operation. This will read the backing
+// block device which is represented by cow_op->source.
+int Snapuserd::ProcessCopyOp(const CowOperation* cow_op) {
+    void* buffer = bufsink_.GetPayloadBuffer(BLOCK_SIZE);
+    CHECK(buffer != nullptr);
+
+    // Issue a single 4K IO. However, this can be optimized
+    // if the successive blocks are contiguous.
+    if (!android::base::ReadFullyAtOffset(backing_store_fd_, buffer, BLOCK_SIZE,
+                                          cow_op->source * BLOCK_SIZE)) {
+        LOG(ERROR) << "Copy-op failed. Read from backing store at: " << cow_op->source;
+        return -1;
     }
 
-    size_t buf_size = 1UL << 16;
-    auto buf = std::make_unique<char>(buf_size);
+    return BLOCK_SIZE;
+}
 
-    /* Just keeps pumping messages between userspace and the kernel.  We won't
-     * actually be doing anything, but the sequence numbers line up so it'll at
-     * least make forward progress. */
-    while (true) {
-        struct dm_user_message* msg = (struct dm_user_message*)buf.get();
+int Snapuserd::ProcessZeroOp() {
+    // Zero out the entire block
+    void* buffer = bufsink_.GetPayloadBuffer(BLOCK_SIZE);
+    CHECK(buffer != nullptr);
 
-        memset(buf.get(), 0, buf_size);
+    memset(buffer, 0, BLOCK_SIZE);
+    return BLOCK_SIZE;
+}
 
-        ssize_t readed = read(ctrl_fd.get(), buf.get(), buf_size);
-        if (readed < 0) {
-            PLOG(ERROR) << "Control read failed, trying with more space";
-            buf_size *= 2;
-            buf = std::make_unique<char>(buf_size);
-            continue;
-        }
+/*
+ * Read the data of size bytes from a given chunk.
+ *
+ * Kernel can potentially merge the blocks if the
+ * successive chunks are contiguous. For chunk size of 8,
+ * there can be 256 disk exceptions; and if
+ * all 256 disk exceptions are contiguous, kernel can merge
+ * them into a single IO.
+ *
+ * Since each chunk in the disk exception
+ * mapping represents a 4k block, kernel can potentially
+ * issue 256*4k = 1M IO in one shot.
+ *
+ * Even though kernel assumes that the blocks are
+ * contiguous, we need to split the 1M IO into 4k chunks
+ * as each operation represents 4k and it can either be:
+ *
+ * 1: Replace operation
+ * 2: Copy operation
+ * 3: Zero operation
+ *
+ */
+int Snapuserd::ReadData(chunk_t chunk, size_t size) {
+    int ret = 0;
 
-        LOG(DEBUG) << android::base::StringPrintf("read() from dm-user returned %d bytes:",
-                                                  (int)readed);
-        LOG(DEBUG) << android::base::StringPrintf("    msg->seq:    0x%016llx", msg->seq);
-        LOG(DEBUG) << android::base::StringPrintf("    msg->type:   0x%016llx", msg->type);
-        LOG(DEBUG) << android::base::StringPrintf("    msg->flags:  0x%016llx", msg->flags);
-        LOG(DEBUG) << android::base::StringPrintf("    msg->sector: 0x%016llx", msg->sector);
-        LOG(DEBUG) << android::base::StringPrintf("    msg->len:    0x%016llx", msg->len);
+    size_t read_size = size;
 
-        switch (msg->type) {
-            case DM_USER_MAP_READ: {
-                LOG(DEBUG) << android::base::StringPrintf(
-                        "Responding to read of sector %lld with %lld bytes data", msg->sector,
-                        msg->len);
+    chunk_t chunk_key = chunk;
+    uint32_t stride;
+    lldiv_t divresult;
 
-                if ((sizeof(*msg) + msg->len) > buf_size) {
-                    auto old_buf = std::move(buf);
-                    buf_size = sizeof(*msg) + msg->len;
-                    buf = std::make_unique<char>(buf_size);
-                    memcpy(buf.get(), old_buf.get(), sizeof(*msg));
-                    msg = (struct dm_user_message*)buf.get();
-                }
+    // Size should always be aligned
+    CHECK((read_size & (BLOCK_SIZE - 1)) == 0);
 
-                if (lseek(block_fd.get(), msg->sector * 512, SEEK_SET) < 0) {
-                    PLOG(ERROR) << "lseek failed: " << device;
-                    return 7;
-                }
-                if (!android::base::ReadFully(block_fd.get(), msg->buf, msg->len)) {
-                    PLOG(ERROR) << "read failed: " << device;
-                    return 7;
-                }
+    while (read_size > 0) {
+        const CowOperation* cow_op = chunk_vec_[chunk_key];
+        CHECK(cow_op != nullptr);
+        int result;
 
-                if (!android::base::WriteFully(ctrl_fd.get(), buf.get(), sizeof(*msg) + msg->len)) {
-                    PLOG(ERROR) << "write control failed";
-                    return 3;
-                }
+        switch (cow_op->type) {
+            case kCowReplaceOp: {
+                result = ProcessReplaceOp(cow_op);
                 break;
             }
 
-            case DM_USER_MAP_WRITE:
+            case kCowZeroOp: {
+                result = ProcessZeroOp();
+                break;
+            }
+
+            case kCowCopyOp: {
+                result = ProcessCopyOp(cow_op);
+                break;
+            }
+
+            default: {
+                LOG(ERROR) << "Unknown operation-type found: " << cow_op->type;
+                ret = -EIO;
+                goto done;
+            }
+        }
+
+        if (result < 0) {
+            ret = result;
+            goto done;
+        }
+
+        // Update the buffer offset
+        bufsink_.UpdateBufferOffset(BLOCK_SIZE);
+
+        read_size -= BLOCK_SIZE;
+        ret += BLOCK_SIZE;
+
+        // Move on to the next chunk ID. The chunk IDs were
+        // assigned contiguously while the metadata was being
+        // constructed.
+        chunk_key += 1;
+
+        // This is similar to the way when chunk IDs were assigned
+        // in ReadMetadata().
+        //
+        // Skip if the chunk id represents a metadata chunk.
+        stride = exceptions_per_area_ + 1;
+        divresult = lldiv(chunk_key, stride);
+        if (divresult.rem == NUM_SNAPSHOT_HDR_CHUNKS) {
+            // Crossing exception boundary. Kernel will never
+            // issue IO which is spanning between a data chunk
+            // and a metadata chunk. This should be perfectly aligned.
+            //
+            // Since the input read_size is 4k aligned, we will
+            // always end up reading all 256 data chunks in one area.
+            // Thus, every multiple of 4K IO represents 256 data chunks
+            CHECK(read_size == 0);
+            break;
+        }
+    }
+
+done:
+
+    // Reset the buffer offset
+    bufsink_.ResetBufferOffset();
+    return ret;
+}
+
+/*
+ * dm-snap does prefetch reads while reading disk-exceptions.
+ * By default, prefetch value is set to 12; this means that
+ * dm-snap will issue 12 areas wherein each area is a 4k page
+ * of disk-exceptions.
+ *
+ * If, during prefetch, the chunk-id seen is beyond the
+ * actual number of metadata pages, fill the buffer with zero.
+ * When dm-snap starts parsing the buffer, it will stop
+ * reading metadata pages once the buffer content is zero.
+ */
+int Snapuserd::ZerofillDiskExceptions(size_t read_size) {
+    size_t size = exceptions_per_area_ * sizeof(struct disk_exception);
+
+    if (read_size > size) return -EINVAL;
+
+    void* buffer = bufsink_.GetPayloadBuffer(size);
+    CHECK(buffer != nullptr);
+
+    memset(buffer, 0, size);
+    return size;
+}
+
+/*
+ * A disk exception is a simple mapping of old_chunk to new_chunk.
+ * When dm-snapshot device is created, kernel requests these mappings.
+ *
+ * Each disk exception is of size 16 bytes. Thus a single 4k page can
+ * have:
+ *
+ * exceptions_per_area_ = 4096/16 = 256. This entire 4k page
+ * is considered a metadata page and it is represented by chunk ID.
+ *
+ * Convert the chunk ID to index into the vector which gives us
+ * the metadata page.
+ */
+int Snapuserd::ReadDiskExceptions(chunk_t chunk, size_t read_size) {
+    uint32_t stride = exceptions_per_area_ + 1;
+    size_t size;
+
+    // ChunkID to vector index
+    lldiv_t divresult = lldiv(chunk, stride);
+
+    if (divresult.quot < vec_.size()) {
+        size = exceptions_per_area_ * sizeof(struct disk_exception);
+
+        if (read_size > size) return -EINVAL;
+
+        void* buffer = bufsink_.GetPayloadBuffer(size);
+        CHECK(buffer != nullptr);
+
+        memcpy(buffer, vec_[divresult.quot].get(), size);
+    } else {
+        size = ZerofillDiskExceptions(read_size);
+    }
+
+    return size;
+}
+
+/*
+ * Read the metadata from COW device and
+ * construct the metadata as required by the kernel.
+ *
+ * Please see design on kernel COW format
+ *
+ * 1: Read the metadata from internal COW device
+ * 2: There are 3 COW operations:
+ *     a: Replace op
+ *     b: Copy op
+ *     c: Zero op
+ * 3: For each of the 3 operations, op->new_block
+ *    represents the block number in the base device
+ *    for which one of the 3 operations have to be applied.
+ *    This represents the old_chunk in the kernel COW format
+ * 4: We need to assign new_chunk for a corresponding old_chunk
+ * 5: The algorithm is similar to how kernel assigns chunk number
+ *    while creating exceptions.
+ * 6: Use a monotonically increasing chunk number to assign the
+ *    new_chunk
+ * 7: Each chunk-id represents either a: Metadata page or b: Data page
+ * 8: Chunk-id representing a data page is stored in a vector. Index is the
+ *    chunk-id and value is the pointer to the CowOperation
+ * 9: Chunk-id representing a metadata page is converted into a vector
+ *    index. We store this in a vector as the kernel requests metadata
+ *    during two stages:
+ *       a: When initial dm-snapshot device is created, kernel requests
+ *          all the metadata and stores it in its internal data-structures.
+ *       b: During merge, kernel requests the same metadata
+ *          once again.
+ *    In both these cases, a quick lookup based on chunk-id is done.
+ * 10: When chunk number is incremented, we need to check whether
+ *    the chunk represents a metadata page and, if so, skip it.
+ * 11: Each 4k page will contain 256 disk exceptions. We call this
+ *    exceptions_per_area_
+ * 12: Kernel will stop issuing metadata IO request when new-chunk ID is 0.
+ */
+int Snapuserd::ReadMetadata() {
+    reader_ = std::make_unique<CowReader>();
+    CowHeader header;
+
+    if (!reader_->Parse(cow_fd_)) {
+        LOG(ERROR) << "Failed to parse";
+        return 1;
+    }
+
+    if (!reader_->GetHeader(&header)) {
+        LOG(ERROR) << "Failed to get header";
+        return 1;
+    }
+
+    CHECK(header.block_size == BLOCK_SIZE);
+
+    LOG(DEBUG) << "Num-ops: " << std::hex << header.num_ops;
+    LOG(DEBUG) << "ops-offset: " << std::hex << header.ops_offset;
+    LOG(DEBUG) << "ops-size: " << std::hex << header.ops_size;
+
+    cowop_iter_ = reader_->GetOpIter();
+
+    if (cowop_iter_ == nullptr) {
+        LOG(ERROR) << "Failed to get cowop_iter";
+        return 1;
+    }
+
+    exceptions_per_area_ = (CHUNK_SIZE << SECTOR_SHIFT) / sizeof(struct disk_exception);
+    const size_t area_bytes = exceptions_per_area_ * sizeof(struct disk_exception);
+    const uint32_t stride = exceptions_per_area_ + 1;
+
+    // Start from chunk number 2. Chunk 0 represents header and chunk 1
+    // represents first metadata page.
+    chunk_t next_free = NUM_SNAPSHOT_HDR_CHUNKS + 1;
+    chunk_vec_.push_back(nullptr);
+    chunk_vec_.push_back(nullptr);
+
+    loff_t offset = 0;
+    std::unique_ptr<uint8_t[]> de_ptr = std::make_unique<uint8_t[]>(area_bytes);
+
+    // This memset is important. Kernel will stop issuing IO when new-chunk ID
+    // is 0. When an area is not filled completely with all 256 exceptions,
+    // the zeroed tail ensures that the metadata read is completed.
+    memset(de_ptr.get(), 0, area_bytes);
+    size_t num_ops = 0;
+
+    while (!cowop_iter_->Done()) {
+        const CowOperation* cow_op = &cowop_iter_->Get();
+        struct disk_exception* de =
+                reinterpret_cast<struct disk_exception*>(de_ptr.get() + offset);
+
+        if (!(cow_op->type == kCowReplaceOp || cow_op->type == kCowZeroOp ||
+              cow_op->type == kCowCopyOp)) {
+            LOG(ERROR) << "Unknown operation-type found: " << cow_op->type;
+            return 1;
+        }
+
+        // Construct the disk-exception
+        de->old_chunk = cow_op->new_block;
+        de->new_chunk = next_free;
+
+        LOG(DEBUG) << "Old-chunk: " << de->old_chunk << " New-chunk: " << de->new_chunk;
+
+        // Store operation pointer. Note, new-chunk ID is the index
+        chunk_vec_.push_back(cow_op);
+        CHECK(next_free == (chunk_vec_.size() - 1));
+
+        offset += sizeof(struct disk_exception);
+
+        cowop_iter_->Next();
+
+        // Find the next free chunk-id to be assigned. Check if the next free
+        // chunk-id represents a metadata page. If so, skip it.
+        next_free += 1;
+        lldiv_t divresult = lldiv(next_free, stride);
+        num_ops += 1;
+
+        if (divresult.rem == NUM_SNAPSHOT_HDR_CHUNKS) {
+            CHECK(num_ops == exceptions_per_area_);
+            // Store it in vector at the right index. This maps the chunk-id to
+            // vector index.
+            vec_.push_back(std::move(de_ptr));
+            offset = 0;
+            num_ops = 0;
+
+            chunk_t metadata_chunk = (next_free - exceptions_per_area_ - NUM_SNAPSHOT_HDR_CHUNKS);
+
+            LOG(DEBUG) << "Area: " << vec_.size() - 1;
+            LOG(DEBUG) << "Metadata-chunk: " << metadata_chunk;
+            LOG(DEBUG) << "Sector number of Metadata-chunk: " << (metadata_chunk << CHUNK_SHIFT);
+
+            // Create a zeroed buffer for the next area
+            de_ptr = std::make_unique<uint8_t[]>(area_bytes);
+            memset(de_ptr.get(), 0, area_bytes);
+
+            // Since this chunk-id is a metadata page, store nullptr at this index
+            chunk_vec_.push_back(nullptr);
+
+            // Find the next free chunk-id
+            next_free += 1;
+            // Ops ended exactly on an area boundary: publish the all-zero area as terminator.
+            if (cowop_iter_->Done()) {
+                vec_.push_back(std::move(de_ptr));
+            }
+        }
+    }
+
+    // Partially filled area
+    if (num_ops) {
+        LOG(DEBUG) << "Partially filled area num_ops: " << num_ops;
+        vec_.push_back(std::move(de_ptr));
+    }
+
+    return 0;
+}
+
+// Logger callback that prints each message as one line of plain text.
+// Only the severity and the message are used; the unnamed arguments are
+// ignored. ERROR messages go to stderr, every other severity to stdout.
+void MyLogger(android::base::LogId, android::base::LogSeverity severity, const char*, const char*,
+              unsigned int, const char* message) {
+    FILE* const out = (severity == android::base::ERROR) ? stderr : stdout;
+    fprintf(out, "%s\n", message);
+}
+
+// Read the next request header from the dm-user misc device; it carries the
+// sector for which dm-snapshot issued the IO. Returns the header size or -1.
+int Snapuserd::ReadDmUserHeader() {
+    const size_t header_size = sizeof(struct dm_user_header);
+    if (android::base::ReadFully(ctrl_fd_, bufsink_.GetBufPtr(), header_size)) {
+        return header_size;
+    }
+    PLOG(ERROR) << "Control read failed";
+    return -1;
+}
+
+// Send the header plus `size` bytes of payload back to the dm-user misc
+// device. Returns the total number of bytes written, or -1 on failure.
+int Snapuserd::WriteDmUserPayload(size_t size) {
+    const size_t total = sizeof(struct dm_user_header) + size;
+    if (android::base::WriteFully(ctrl_fd_, bufsink_.GetBufPtr(), total)) {
+        return total;
+    }
+    PLOG(ERROR) << "Write to dm-user failed";
+    return -1;
+}
+
+// Start the daemon.
+// TODO: Handle signals
+int Snapuserd::Run() {
+    backing_store_fd_.reset(open(in_backing_store_device_.c_str(), O_RDONLY));
+    if (backing_store_fd_ < 0) {
+        LOG(ERROR) << "Open Failed: " << in_backing_store_device_;
+        return 1;
+    }
+
+    cow_fd_.reset(open(in_cow_device_.c_str(), O_RDWR));
+    if (cow_fd_ < 0) {
+        LOG(ERROR) << "Open Failed: " << in_cow_device_;
+        return 1;
+    }
+
+    // TODO: use UUID to support multiple partitions
+    ctrl_fd_.reset(open("/dev/dm-user", O_RDWR));
+    if (ctrl_fd_ < 0) {
+        LOG(ERROR) << "Unable to open /dev/dm-user";
+        return 1;
+    }
+
+    int ret = 0;
+
+    // Allocate the buffer which is used to communicate between the
+    // daemon and dm-user. The buffer consists of a header and a fixed-size
+    // payload. If dm-user requests a big IO, the IO will be broken into
+    // chunks of PAYLOAD_SIZE.
+    size_t buf_size = sizeof(struct dm_user_header) + PAYLOAD_SIZE;
+    bufsink_.Initialize(buf_size);
+
+    while (true) {
+        struct dm_user_header* header = bufsink_.GetHeaderPtr();
+
+        bufsink_.Clear();
+
+        ret = ReadDmUserHeader();
+        if (ret < 0) return ret;
+
+        LOG(DEBUG) << "dm-user returned " << ret << " bytes";
+
+        LOG(DEBUG) << "msg->seq: " << std::hex << header->seq;
+        LOG(DEBUG) << "msg->type: " << std::hex << header->type;
+        LOG(DEBUG) << "msg->flags: " << std::hex << header->flags;
+        LOG(DEBUG) << "msg->sector: " << std::hex << header->sector;
+        LOG(DEBUG) << "msg->len: " << std::hex << header->len;
+
+        switch (header->type) {
+            case DM_USER_MAP_READ: {
+                size_t remaining_size = header->len;
+                loff_t offset = 0;
+                header->io_in_progress = 0;
+                ret = 0;
+                do {
+                    size_t read_size = std::min(PAYLOAD_SIZE, remaining_size);
+
+                    // A request to sector 0 is always for the kernel
+                    // representation of the COW header. This IO should happen
+                    // only once, during dm-snapshot device creation; we should
+                    // never see multiple such requests. Additionally, this IO
+                    // will always be a single 4k block.
+                    if (header->sector == 0) {
+                        // Read the metadata from internal COW device
+                        // and build the in-memory data structures
+                        // for all the operations in the internal COW.
+                        if (!metadata_read_done_ && ReadMetadata()) {
+                            LOG(ERROR) << "Metadata read failed";
+                            return 1;
+                        }
+                        metadata_read_done_ = true;
+
+                        CHECK(read_size == BLOCK_SIZE);
+                        ret = ConstructKernelCowHeader();
+                        if (ret < 0) return ret;
+                    } else {
+                        // Convert the sector number to a chunk ID.
+                        //
+                        // Check if the chunk ID represents a metadata
+                        // page. If the chunk ID is not found in the
+                        // vector, then it points to a metadata page.
+                        chunk_t chunk = (header->sector >> CHUNK_SHIFT);
+
+                        if (chunk >= chunk_vec_.size()) {
+                            ret = ZerofillDiskExceptions(read_size);
+                            if (ret < 0) {
+                                LOG(ERROR) << "ZerofillDiskExceptions failed";
+                                return ret;
+                            }
+                        } else if (chunk_vec_[chunk] == nullptr) {
+                            ret = ReadDiskExceptions(chunk, read_size);
+                            if (ret < 0) {
+                                LOG(ERROR) << "ReadDiskExceptions failed";
+                                return ret;
+                            }
+                        } else {
+                            chunk_t num_chunks_read = (offset >> BLOCK_SHIFT);
+                            ret = ReadData(chunk + num_chunks_read, read_size);
+                            if (ret < 0) {
+                                LOG(ERROR) << "ReadData failed";
+                                return ret;
+                            }
+                        }
+                    }
+
+                    ssize_t written = WriteDmUserPayload(ret);
+                    if (written < 0) return written;
+
+                    remaining_size -= ret;
+                    offset += ret;
+                    if (remaining_size) {
+                        LOG(DEBUG) << "Write done ret: " << ret
+                                   << " remaining size: " << remaining_size;
+                        bufsink_.GetHeaderPtr()->io_in_progress = 1;
+                    }
+                } while (remaining_size);
+
+                break;
+            }
+
+            case DM_USER_MAP_WRITE: {
+                // TODO: After merge operation is completed, kernel issues write
+                // to flush all the exception mappings where the merge is
+                // completed. If dm-user routes the WRITE IO, we need to clear
+                // in-memory data structures representing those exception
+                // mappings.
                 abort();
                 break;
+            }
         }
 
         LOG(DEBUG) << "read() finished, next message";
@@ -119,8 +679,12 @@
     return 0;
 }
 
+}  // namespace snapshot
+}  // namespace android
+
 int main([[maybe_unused]] int argc, char** argv) {
     android::base::InitLogging(argv, &android::base::KernelLogger);
-    daemon_main(argv[1]);
-    return 0;
+    if (argc < 3) return 1;  // argv[1] and argv[2] are both required below
+    android::snapshot::Snapuserd snapd(argv[1], argv[2]);
+    return snapd.Run();
 }