Merge "libsnapshot:snapuserd: Handle un-aligned IO request"
diff --git a/fs_mgr/libsnapshot/cow_snapuserd_test.cpp b/fs_mgr/libsnapshot/cow_snapuserd_test.cpp
index ed67a1c..4cc0fd3 100644
--- a/fs_mgr/libsnapshot/cow_snapuserd_test.cpp
+++ b/fs_mgr/libsnapshot/cow_snapuserd_test.cpp
@@ -239,16 +239,12 @@
     ASSERT_TRUE(rnd_fd > 0);
 
     std::unique_ptr<uint8_t[]> random_buffer_1_ = std::make_unique<uint8_t[]>(size_);
-    std::unique_ptr<uint8_t[]> random_buffer_2_ = std::make_unique<uint8_t[]>(size_);
 
     // Fill random data
     for (size_t j = 0; j < (size_ / 1_MiB); j++) {
         ASSERT_EQ(ReadFullyAtOffset(rnd_fd, (char*)random_buffer_1_.get() + offset, 1_MiB, 0),
                   true);
 
-        ASSERT_EQ(ReadFullyAtOffset(rnd_fd, (char*)random_buffer_2_.get() + offset, 1_MiB, 0),
-                  true);
-
         offset += 1_MiB;
     }
 
@@ -280,7 +276,7 @@
 
     size_t blk_random2_replace_start = blk_zero_copy_end;
 
-    ASSERT_TRUE(writer.AddRawBlocks(blk_random2_replace_start, random_buffer_2_.get(), size_));
+    ASSERT_TRUE(writer.AddRawBlocks(blk_random2_replace_start, random_buffer_1_.get(), size_));
 
     // Flush operations
     ASSERT_TRUE(writer.Finalize());
@@ -292,7 +288,7 @@
     ASSERT_EQ(android::base::ReadFullyAtOffset(base_fd_, orig_buffer_.get(), size_, size_), true);
     memcpy((char*)orig_buffer_.get() + size_, random_buffer_1_.get(), size_);
     memcpy((char*)orig_buffer_.get() + (size_ * 2), (void*)zero_buffer.c_str(), size_);
-    memcpy((char*)orig_buffer_.get() + (size_ * 3), random_buffer_2_.get(), size_);
+    memcpy((char*)orig_buffer_.get() + (size_ * 3), random_buffer_1_.get(), size_);
 }
 
 void CowSnapuserdTest::InitCowDevice() {
diff --git a/fs_mgr/libsnapshot/include/libsnapshot/snapuserd_kernel.h b/fs_mgr/libsnapshot/include/libsnapshot/snapuserd_kernel.h
index e8dbe6e..7941e68 100644
--- a/fs_mgr/libsnapshot/include/libsnapshot/snapuserd_kernel.h
+++ b/fs_mgr/libsnapshot/include/libsnapshot/snapuserd_kernel.h
@@ -50,6 +50,8 @@
 static constexpr uint32_t BLOCK_SIZE = 4096;
 static constexpr uint32_t BLOCK_SHIFT = (__builtin_ffs(BLOCK_SIZE) - 1);
 
+#define DIV_ROUND_UP(n, d) (((n) + (d)-1) / (d))
+
 // This structure represents the kernel COW header.
 // All the below fields should be in Little Endian format.
 struct disk_header {
diff --git a/fs_mgr/libsnapshot/snapuserd.cpp b/fs_mgr/libsnapshot/snapuserd.cpp
index 4f94806..573b6a5 100644
--- a/fs_mgr/libsnapshot/snapuserd.cpp
+++ b/fs_mgr/libsnapshot/snapuserd.cpp
@@ -129,88 +129,126 @@
     return true;
 }
 
-/*
- * Read the data of size bytes from a given chunk.
- *
- * Kernel can potentially merge the blocks if the
- * successive chunks are contiguous. For chunk size of 8,
- * there can be 256 disk exceptions; and if
- * all 256 disk exceptions are contiguous, kernel can merge
- * them into a single IO.
- *
- * Since each chunk in the disk exception
- * mapping represents a 4k block, kernel can potentially
- * issue 256*4k = 1M IO in one shot.
- *
- * Even though kernel assumes that the blocks are
- * contiguous, we need to split the 1M IO into 4k chunks
- * as each operation represents 4k and it can either be:
- *
- * 1: Replace operation
- * 2: Copy operation
- * 3: Zero operation
- *
- */
-bool Snapuserd::ReadData(chunk_t chunk, size_t size) {
-    size_t read_size = size;
-    bool ret = true;
-    chunk_t chunk_key = chunk;
+bool Snapuserd::ProcessCowOp(const CowOperation* cow_op) {
+    CHECK(cow_op != nullptr);
 
-    if (!((read_size & (BLOCK_SIZE - 1)) == 0)) {
-        SNAP_LOG(ERROR) << "ReadData - unaligned read_size: " << read_size;
-        return false;
+    switch (cow_op->type) {
+        case kCowReplaceOp: {
+            return ProcessReplaceOp(cow_op);
+        }
+
+        case kCowZeroOp: {
+            return ProcessZeroOp();
+        }
+
+        case kCowCopyOp: {
+            return ProcessCopyOp(cow_op);
+        }
+
+        default: {
+            SNAP_LOG(ERROR) << "Unknown operation-type found: " << cow_op->type;
+        }
+    }
+    return false;
+}
+
+int Snapuserd::ReadUnalignedSector(sector_t sector, size_t size,
+                                   std::map<sector_t, const CowOperation*>::iterator& it) {
+    size_t skip_sector_size = 0;
+
+    SNAP_LOG(DEBUG) << "ReadUnalignedSector: sector " << sector << " size: " << size
+                    << " Aligned sector: " << it->second;
+
+    if (!ProcessCowOp(it->second)) {
+        SNAP_LOG(ERROR) << "ReadUnalignedSector: " << sector << " failed";
+        return -1;
     }
 
-    while (read_size > 0) {
-        const CowOperation* cow_op = chunk_map_[chunk_key];
-        CHECK(cow_op != nullptr);
+    int num_sectors_skip = sector - it->first;
 
-        switch (cow_op->type) {
-            case kCowReplaceOp: {
-                ret = ProcessReplaceOp(cow_op);
-                break;
-            }
+    if (num_sectors_skip > 0) {
+        skip_sector_size = num_sectors_skip << SECTOR_SHIFT;
+        char* buffer = reinterpret_cast<char*>(bufsink_.GetBufPtr());
+        struct dm_user_message* msg = (struct dm_user_message*)(&(buffer[0]));
 
-            case kCowZeroOp: {
-                ret = ProcessZeroOp();
-                break;
-            }
+        memmove(msg->payload.buf, (char*)msg->payload.buf + skip_sector_size,
+                (BLOCK_SIZE - skip_sector_size));
+    }
 
-            case kCowCopyOp: {
-                ret = ProcessCopyOp(cow_op);
-                break;
-            }
+    bufsink_.ResetBufferOffset();
+    return std::min(size, (BLOCK_SIZE - skip_sector_size));
+}
 
-            default: {
-                SNAP_LOG(ERROR) << "Unknown operation-type found: " << cow_op->type;
-                ret = false;
-                break;
-            }
+/*
+ * Read the data for a given COW Operation.
+ *
+ * Kernel can issue IO at a sector granularity.
+ * Hence, an IO may end up with reading partial
+ * data from a COW operation or we may also
+ * end up with interspersed request between
+ * two COW operations.
+ *
+ */
+int Snapuserd::ReadData(sector_t sector, size_t size) {
+    /*
+     * chunk_map stores COW operation at 4k granularity.
+     * If the requested IO with the sector falls on the 4k
+     * boundary, then we can read the COW op directly without
+     * any issue.
+     *
+     * However, if the requested sector is not 4K aligned,
+     * then we will have the find the nearest COW operation
+     * and chop the 4K block to fetch the requested sector.
+     */
+    std::map<sector_t, const CowOperation*>::iterator it = chunk_map_.find(sector);
+    if (it == chunk_map_.end()) {
+        it = chunk_map_.lower_bound(sector);
+        if (it != chunk_map_.begin()) {
+            --it;
         }
 
-        if (!ret) {
-            SNAP_LOG(ERROR) << "ReadData failed for operation: " << cow_op->type;
-            return false;
-        }
+        /*
+         * If the IO is spanned between two COW operations,
+         * split the IO into two parts:
+         *
+         * 1: Read the first part from the single COW op
+         * 2: Read the second part from the next COW op.
+         *
+         * Ex: Let's say we have a 1024 Bytes IO request.
+         *
+         * 0       COW OP-1  4096     COW OP-2  8192
+         * |******************|*******************|
+         *              |*****|*****|
+         *           3584           4608
+         *              <- 1024B - >
+         *
+         * We have two COW operations which are 4k blocks.
+         * The IO is requested for 1024 Bytes which are spanned
+         * between two COW operations. We will split this IO
+         * into two parts:
+         *
+         * 1: IO of size 512B from offset 3584 bytes (COW OP-1)
+         * 2: IO of size 512B from offset 4096 bytes (COW OP-2)
+         */
+        return ReadUnalignedSector(sector, size, it);
+    }
 
+    int num_ops = DIV_ROUND_UP(size, BLOCK_SIZE);
+    while (num_ops) {
+        if (!ProcessCowOp(it->second)) {
+            return -1;
+        }
+        num_ops -= 1;
+        it++;
         // Update the buffer offset
         bufsink_.UpdateBufferOffset(BLOCK_SIZE);
 
-        read_size -= BLOCK_SIZE;
-
-        // Start iterating the chunk incrementally; Since while
-        // constructing the metadata, we know that the chunk IDs
-        // are contiguous
-        chunk_key += 1;
-
-        if (cow_op->type == kCowCopyOp) {
-            CHECK(read_size == 0);
-        }
+        SNAP_LOG(DEBUG) << "ReadData at sector: " << sector << " size: " << size;
     }
 
     // Reset the buffer offset
     bufsink_.ResetBufferOffset();
-    return ret;
+    return size;
 }
 
 /*
@@ -261,9 +299,7 @@
     if (divresult.quot < vec_.size()) {
         size = exceptions_per_area_ * sizeof(struct disk_exception);
 
-        if (read_size > size) {
-            return false;
-        }
+        CHECK(read_size == size);
 
         void* buffer = bufsink_.GetPayloadBuffer(size);
         CHECK(buffer != nullptr);
@@ -329,7 +365,7 @@
         if (cow_de->new_chunk != 0) {
             merged_ops_cur_iter += 1;
             offset += sizeof(struct disk_exception);
-            const CowOperation* cow_op = chunk_map_[cow_de->new_chunk];
+            const CowOperation* cow_op = chunk_map_[ChunkToSector(cow_de->new_chunk)];
             CHECK(cow_op != nullptr);
             CHECK(cow_op->new_block == cow_de->old_chunk);
             if (cow_op->type == kCowCopyOp) {
@@ -563,7 +599,7 @@
         SNAP_LOG(DEBUG) << "Old-chunk: " << de->old_chunk << "New-chunk: " << de->new_chunk;
 
         // Store operation pointer.
-        chunk_map_[data_chunk_id] = cow_op;
+        chunk_map_[ChunkToSector(data_chunk_id)] = cow_op;
         num_ops += 1;
         offset += sizeof(struct disk_exception);
         cowop_riter_->Next();
@@ -680,6 +716,126 @@
     return true;
 }
 
+bool Snapuserd::DmuserWriteRequest() {
+    struct dm_user_header* header = bufsink_.GetHeaderPtr();
+
+    // device mapper has the capability to allow
+    // targets to flush the cache when writes are completed. This
+    // is controlled by each target by a flag "flush_supported".
+    // This flag is set by dm-user. When flush is supported,
+    // a number of zero-length bio's will be submitted to
+    // the target for the purpose of flushing cache. It is the
+    // responsibility of the target driver - which is dm-user in this
+    // case, to remap these bio's to the underlying device. Since,
+    // there is no underlying device for dm-user, this zero length
+    // bio's gets routed to daemon.
+    //
+    // Flush operations are generated post merge by dm-snap by having
+    // REQ_PREFLUSH flag set. Snapuser daemon doesn't have anything
+    // to flush per se; hence, just respond back with a success message.
+    if (header->sector == 0) {
+        CHECK(header->len == 0);
+        header->type = DM_USER_RESP_SUCCESS;
+        if (!WriteDmUserPayload(0)) {
+            return false;
+        }
+        return true;
+    }
+
+    size_t remaining_size = header->len;
+    size_t read_size = std::min(PAYLOAD_SIZE, remaining_size);
+    CHECK(read_size == BLOCK_SIZE);
+
+    CHECK(header->sector > 0);
+    chunk_t chunk = SectorToChunk(header->sector);
+    CHECK(chunk_map_.find(header->sector) == chunk_map_.end());
+
+    void* buffer = bufsink_.GetPayloadBuffer(read_size);
+    CHECK(buffer != nullptr);
+    header->type = DM_USER_RESP_SUCCESS;
+
+    if (!ReadDmUserPayload(buffer, read_size)) {
+        SNAP_LOG(ERROR) << "ReadDmUserPayload failed for chunk id: " << chunk
+                        << "Sector: " << header->sector;
+        header->type = DM_USER_RESP_ERROR;
+    }
+
+    if (header->type == DM_USER_RESP_SUCCESS && !ProcessMergeComplete(chunk, buffer)) {
+        SNAP_LOG(ERROR) << "ProcessMergeComplete failed for chunk id: " << chunk
+                        << "Sector: " << header->sector;
+        header->type = DM_USER_RESP_ERROR;
+    } else {
+        SNAP_LOG(DEBUG) << "ProcessMergeComplete success for chunk id: " << chunk
+                        << "Sector: " << header->sector;
+    }
+
+    if (!WriteDmUserPayload(0)) {
+        return false;
+    }
+
+    return true;
+}
+
+bool Snapuserd::DmuserReadRequest() {
+    struct dm_user_header* header = bufsink_.GetHeaderPtr();
+    size_t remaining_size = header->len;
+    loff_t offset = 0;
+    sector_t sector = header->sector;
+    do {
+        size_t read_size = std::min(PAYLOAD_SIZE, remaining_size);
+
+        int ret = read_size;
+        header->type = DM_USER_RESP_SUCCESS;
+        chunk_t chunk = SectorToChunk(header->sector);
+
+        // Request to sector 0 is always for kernel
+        // representation of COW header. This IO should be only
+        // once during dm-snapshot device creation. We should
+        // never see multiple IO requests. Additionally this IO
+        // will always be a single 4k.
+        if (header->sector == 0) {
+            CHECK(metadata_read_done_ == true);
+            CHECK(read_size == BLOCK_SIZE);
+            ConstructKernelCowHeader();
+            SNAP_LOG(DEBUG) << "Kernel header constructed";
+        } else {
+            if (!offset && (read_size == BLOCK_SIZE) &&
+                chunk_map_.find(header->sector) == chunk_map_.end()) {
+                if (!ReadDiskExceptions(chunk, read_size)) {
+                    SNAP_LOG(ERROR) << "ReadDiskExceptions failed for chunk id: " << chunk
+                                    << "Sector: " << header->sector;
+                    header->type = DM_USER_RESP_ERROR;
+                } else {
+                    SNAP_LOG(DEBUG) << "ReadDiskExceptions success for chunk id: " << chunk
+                                    << "Sector: " << header->sector;
+                }
+            } else {
+                chunk_t num_sectors_read = (offset >> SECTOR_SHIFT);
+                ret = ReadData(sector + num_sectors_read, read_size);
+                if (ret < 0) {
+                    SNAP_LOG(ERROR) << "ReadData failed for chunk id: " << chunk
+                                    << "Sector: " << header->sector;
+                    header->type = DM_USER_RESP_ERROR;
+                } else {
+                    SNAP_LOG(DEBUG) << "ReadData success for chunk id: " << chunk
+                                    << "Sector: " << header->sector;
+                }
+            }
+        }
+
+        // Daemon will not be terminated if there is any error. We will
+        // just send the error back to dm-user.
+        if (!WriteDmUserPayload(ret)) {
+            return false;
+        }
+
+        remaining_size -= ret;
+        offset += ret;
+    } while (remaining_size > 0);
+
+    return true;
+}
+
 bool Snapuserd::Run() {
     struct dm_user_header* header = bufsink_.GetHeaderPtr();
 
@@ -698,122 +854,16 @@
 
     switch (header->type) {
         case DM_USER_REQ_MAP_READ: {
-            size_t remaining_size = header->len;
-            loff_t offset = 0;
-            do {
-                size_t read_size = std::min(PAYLOAD_SIZE, remaining_size);
-                header->type = DM_USER_RESP_SUCCESS;
-
-                // Request to sector 0 is always for kernel
-                // representation of COW header. This IO should be only
-                // once during dm-snapshot device creation. We should
-                // never see multiple IO requests. Additionally this IO
-                // will always be a single 4k.
-                if (header->sector == 0) {
-                    CHECK(metadata_read_done_ == true);
-                    CHECK(read_size == BLOCK_SIZE);
-                    ConstructKernelCowHeader();
-                    SNAP_LOG(DEBUG) << "Kernel header constructed";
-                } else {
-                    // Convert the sector number to a chunk ID.
-                    //
-                    // Check if the chunk ID represents a metadata
-                    // page. If the chunk ID is not found in the
-                    // vector, then it points to a metadata page.
-                    chunk_t chunk = SectorToChunk(header->sector);
-
-                    if (chunk_map_.find(chunk) == chunk_map_.end()) {
-                        if (!ReadDiskExceptions(chunk, read_size)) {
-                            SNAP_LOG(ERROR) << "ReadDiskExceptions failed for chunk id: " << chunk
-                                            << "Sector: " << header->sector;
-                            header->type = DM_USER_RESP_ERROR;
-                        } else {
-                            SNAP_LOG(DEBUG) << "ReadDiskExceptions success for chunk id: " << chunk
-                                            << "Sector: " << header->sector;
-                        }
-                    } else {
-                        SNAP_LOG(DEBUG) << "ReadData: chunk: " << chunk << " len: " << header->len
-                                        << " read_size: " << read_size << " offset: " << offset;
-                        chunk_t num_chunks_read = (offset >> BLOCK_SHIFT);
-                        if (!ReadData(chunk + num_chunks_read, read_size)) {
-                            SNAP_LOG(ERROR) << "ReadData failed for chunk id: " << chunk
-                                            << "Sector: " << header->sector;
-                            header->type = DM_USER_RESP_ERROR;
-                        } else {
-                            SNAP_LOG(DEBUG) << "ReadData success for chunk id: " << chunk
-                                            << "Sector: " << header->sector;
-                        }
-                    }
-                }
-
-                // Daemon will not be terminated if there is any error. We will
-                // just send the error back to dm-user.
-                if (!WriteDmUserPayload(read_size)) {
-                    return false;
-                }
-
-                remaining_size -= read_size;
-                offset += read_size;
-            } while (remaining_size);
-
+            if (!DmuserReadRequest()) {
+                return false;
+            }
             break;
         }
 
         case DM_USER_REQ_MAP_WRITE: {
-            // device mapper has the capability to allow
-            // targets to flush the cache when writes are completed. This
-            // is controlled by each target by a flag "flush_supported".
-            // This flag is set by dm-user. When flush is supported,
-            // a number of zero-length bio's will be submitted to
-            // the target for the purpose of flushing cache. It is the
-            // responsibility of the target driver - which is dm-user in this
-            // case, to remap these bio's to the underlying device. Since,
-            // there is no underlying device for dm-user, this zero length
-            // bio's gets routed to daemon.
-            //
-            // Flush operations are generated post merge by dm-snap by having
-            // REQ_PREFLUSH flag set. Snapuser daemon doesn't have anything
-            // to flush per se; hence, just respond back with a success message.
-            if (header->sector == 0) {
-                CHECK(header->len == 0);
-                header->type = DM_USER_RESP_SUCCESS;
-                if (!WriteDmUserPayload(0)) {
-                    return false;
-                }
-                break;
-            }
-
-            size_t remaining_size = header->len;
-            size_t read_size = std::min(PAYLOAD_SIZE, remaining_size);
-            CHECK(read_size == BLOCK_SIZE);
-
-            CHECK(header->sector > 0);
-            chunk_t chunk = SectorToChunk(header->sector);
-            CHECK(chunk_map_.find(chunk) == chunk_map_.end());
-
-            void* buffer = bufsink_.GetPayloadBuffer(read_size);
-            CHECK(buffer != nullptr);
-            header->type = DM_USER_RESP_SUCCESS;
-
-            if (!ReadDmUserPayload(buffer, read_size)) {
-                SNAP_LOG(ERROR) << "ReadDmUserPayload failed for chunk id: " << chunk
-                                << "Sector: " << header->sector;
-                header->type = DM_USER_RESP_ERROR;
-            }
-
-            if (header->type == DM_USER_RESP_SUCCESS && !ProcessMergeComplete(chunk, buffer)) {
-                SNAP_LOG(ERROR) << "ProcessMergeComplete failed for chunk id: " << chunk
-                                << "Sector: " << header->sector;
-                header->type = DM_USER_RESP_ERROR;
-            } else {
-                SNAP_LOG(DEBUG) << "ProcessMergeComplete success for chunk id: " << chunk
-                                << "Sector: " << header->sector;
-            }
-
-            if (!WriteDmUserPayload(0)) {
+            if (!DmuserWriteRequest()) {
                 return false;
             }
-
             break;
         }
     }
diff --git a/fs_mgr/libsnapshot/snapuserd.h b/fs_mgr/libsnapshot/snapuserd.h
index eec6d45..c01fee3 100644
--- a/fs_mgr/libsnapshot/snapuserd.h
+++ b/fs_mgr/libsnapshot/snapuserd.h
@@ -22,9 +22,9 @@
 #include <cstring>
 #include <iostream>
 #include <limits>
+#include <map>
 #include <string>
 #include <thread>
-#include <unordered_map>
 #include <vector>
 
 #include <android-base/file.h>
@@ -72,6 +72,9 @@
     bool IsAttached() const { return ctrl_fd_ >= 0; }
 
   private:
+    bool DmuserReadRequest();
+    bool DmuserWriteRequest();
+
     bool ReadDmUserHeader();
     bool ReadDmUserPayload(void* buffer, size_t size);
     bool WriteDmUserPayload(size_t size);
@@ -79,10 +82,13 @@
     bool ReadMetadata();
     bool ZerofillDiskExceptions(size_t read_size);
     bool ReadDiskExceptions(chunk_t chunk, size_t size);
-    bool ReadData(chunk_t chunk, size_t size);
+    int ReadUnalignedSector(sector_t sector, size_t size,
+                            std::map<sector_t, const CowOperation*>::iterator& it);
+    int ReadData(sector_t sector, size_t size);
     bool IsChunkIdMetadata(chunk_t chunk);
     chunk_t GetNextAllocatableChunkId(chunk_t chunk_id);
 
+    bool ProcessCowOp(const CowOperation* cow_op);
     bool ProcessReplaceOp(const CowOperation* cow_op);
     bool ProcessCopyOp(const CowOperation* cow_op);
     bool ProcessZeroOp();
@@ -94,6 +100,7 @@
     bool ProcessMergeComplete(chunk_t chunk, void* buffer);
     sector_t ChunkToSector(chunk_t chunk) { return chunk << CHUNK_SHIFT; }
     chunk_t SectorToChunk(sector_t sector) { return sector >> CHUNK_SHIFT; }
+    bool IsBlockAligned(int read_size) { return ((read_size & (BLOCK_SIZE - 1)) == 0); }
 
     std::string cow_device_;
     std::string backing_store_device_;
@@ -116,9 +123,15 @@
     // mapping of old-chunk to new-chunk
     std::vector<std::unique_ptr<uint8_t[]>> vec_;
 
-    // Key - Chunk ID
+    // Key - Sector
     // Value - cow operation
-    std::unordered_map<chunk_t, const CowOperation*> chunk_map_;
+    //
+    // chunk_map stores the pseudo mapping of sector
+    // to COW operations. Each COW op is 4k; however,
+    // we can get a read request which are as small
+    // as 512 bytes. Hence, we need to binary search
+    // in the chunk_map to find the nearest COW op.
+    std::map<sector_t, const CowOperation*> chunk_map_;
 
     bool metadata_read_done_ = false;
     BufferSink bufsink_;