AU: remove identical src/dst blocks when generating MOVE operations

This eliminates a scenario where some (or all) blocks in a file are
being moved to the same location from which they were read. It is
achieved by post-processing the sequence of source/target extents of
a MOVE operation, removing the i-th block if it is identical in both.
In practice, this removal is done rather efficiently as we identify the
largest contiguous blocks that can be removed within extents. Note that
there are four different cases to handle when removing blocks from
extents (head, tail, middle, or all).

In turn, it opens up the possibility that a delta operation (MOVE)
becomes a no-op, for example, if a file (or a kernel partition) remains
the same and does not move at all; therefore, we also care to identify
such empty operations and avoid adding them entirely.

NOTE: the correctness of this change depends on the assumption that not
all blocks in the target partition must be written during an update! It
could be that some further logic (such as cycle breaking) assumes the
opposite; that said, we now have good confidence in the robustness of
our update payloads, so we should be fine enabling this feature.

NOTE 2: once this is merged and the minimum delta generator version used
in paygen is bumped, we should re-enable the test for identical src/dst
blocks when verifying payloads.

BUG=chromium:263550
TEST=New unit test covering all cases; all unit tests pass.

Change-Id: If3bd1bf79720e076120074ad913d8bbe6c098710
Reviewed-on: https://chromium-review.googlesource.com/181515
Reviewed-by: Gilad Arnold <garnold@chromium.org>
Commit-Queue: Gilad Arnold <garnold@chromium.org>
Tested-by: Gilad Arnold <garnold@chromium.org>
diff --git a/delta_diff_generator.cc b/delta_diff_generator.cc
index 8d70749..f69a1fe 100644
--- a/delta_diff_generator.cc
+++ b/delta_diff_generator.cc
@@ -64,6 +64,13 @@
 const uint64_t kVersionNumber = 1;
 const uint64_t kFullUpdateChunkSize = 1024 * 1024;  // bytes
 
+// Needed for testing purposes, in case we can't use actual filesystem objects.
+// TODO(garnold)(chromium:331965) Replace this hack with a properly injected
+// parameter in form of a mockable abstract class.
+bool (*get_extents_with_chunk_func)(const std::string&, off_t, off_t,
+                                    std::vector<Extent>*) =
+    extent_mapper::ExtentsForFileChunkFibmap;
+
 namespace {
 const size_t kBlockSize = 4096;  // bytes
 const string kNonexistentPath = "";
@@ -76,16 +83,15 @@
   "BSDIFF"
 };
 
-// Stores all Extents for a file into 'out'. Returns true on success.
+// Stores all the extents of |path| into |extents|. Returns true on success.
 bool GatherExtents(const string& path,
                    off_t chunk_offset,
                    off_t chunk_size,
-                   google::protobuf::RepeatedPtrField<Extent>* out) {
-  vector<Extent> extents;
+                   vector<Extent>* extents) {
+  extents->clear();
   TEST_AND_RETURN_FALSE(
-      extent_mapper::ExtentsForFileChunkFibmap(
-          path, chunk_offset, chunk_size, &extents));
-  DeltaDiffGenerator::StoreExtents(extents, out);
+      get_extents_with_chunk_func(
+          path, chunk_offset, chunk_size, extents));
   return true;
 }
 
@@ -131,6 +137,17 @@
                                                            &operation,
                                                            true));
 
+  // Check if the operation writes nothing.
+  if (operation.dst_extents_size() == 0) {
+    if (operation.type() == DeltaArchiveManifest_InstallOperation_Type_MOVE) {
+      LOG(INFO) << "Empty MOVE operation (" << new_root + path << "), skipping";
+      return true;
+    } else {
+      LOG(ERROR) << "Empty non-MOVE operation";
+      return false;
+    }
+  }
+
   // Write the data
   if (operation.type() != DeltaArchiveManifest_InstallOperation_Type_MOVE) {
     operation.set_data_offset(*data_file_size);
@@ -440,10 +457,7 @@
   LOG(INFO) << "Delta compressing kernel partition...";
   LOG_IF(INFO, old_kernel_part.empty()) << "Generating full kernel update...";
 
-  // Add a new install operation
-  ops->resize(1);
-  DeltaArchiveManifest_InstallOperation* op = &(*ops)[0];
-
+  DeltaArchiveManifest_InstallOperation op;
   vector<char> data;
   TEST_AND_RETURN_FALSE(
       DeltaDiffGenerator::ReadFileToDiff(old_kernel_part,
@@ -452,20 +466,35 @@
                                          -1,  // chunk_size
                                          true, // bsdiff_allowed
                                          &data,
-                                         op,
+                                         &op,
                                          false));
 
-  // Write the data
-  if (op->type() != DeltaArchiveManifest_InstallOperation_Type_MOVE) {
-    op->set_data_offset(*blobs_length);
-    op->set_data_length(data.size());
+  // Check if the operation writes nothing.
+  if (op.dst_extents_size() == 0) {
+    if (op.type() == DeltaArchiveManifest_InstallOperation_Type_MOVE) {
+      LOG(INFO) << "Empty MOVE operation, nothing to do.";
+      return true;
+    } else {
+      LOG(ERROR) << "Empty non-MOVE operation";
+      return false;
+    }
   }
 
+  // Write the data.
+  if (op.type() != DeltaArchiveManifest_InstallOperation_Type_MOVE) {
+    op.set_data_offset(*blobs_length);
+    op.set_data_length(data.size());
+  }
+
+  // Add the new install operation.
+  ops->clear();
+  ops->push_back(op);
+
   TEST_AND_RETURN_FALSE(utils::WriteAll(blobs_fd, &data[0], data.size()));
   *blobs_length += data.size();
 
   LOG(INFO) << "Done delta compressing kernel partition: "
-            << kInstallOperationTypes[op->type()];
+            << kInstallOperationTypes[op.type()];
   return true;
 }
 
@@ -528,6 +557,89 @@
   fprintf(stderr, kFormatString, 100.0, total_size, "", "<total>");
 }
 
+// Process a range of blocks from |range_start| to |range_end| in the extent at
+// position |*idx_p| of |extents|. If |do_remove| is true, this range will be
+// removed, which may cause the extent to be trimmed, split or removed entirely.
+// The value of |*idx_p| is updated to point to the next extent to be processed.
+// Returns true iff the next extent to process is a new or updated one.
+bool ProcessExtentBlockRange(vector<Extent>* extents, size_t* idx_p,
+                             const bool do_remove, uint64_t range_start,
+                             uint64_t range_end) {
+  size_t idx = *idx_p;
+  uint64_t start_block = (*extents)[idx].start_block();
+  uint64_t num_blocks = (*extents)[idx].num_blocks();
+  uint64_t range_size = range_end - range_start;
+
+  if (do_remove) {
+    if (range_size == num_blocks) {
+      // Remove the entire extent.
+      extents->erase(extents->begin() + idx);
+    } else if (range_end == num_blocks) {
+      // Trim the end of the extent.
+      (*extents)[idx].set_num_blocks(num_blocks - range_size);
+      idx++;
+    } else if (range_start == 0) {
+      // Trim the head of the extent.
+      (*extents)[idx].set_start_block(start_block + range_size);
+      (*extents)[idx].set_num_blocks(num_blocks - range_size);
+    } else {
+      // Trim the middle, splitting the remainder into two parts.
+      (*extents)[idx].set_num_blocks(range_start);
+      Extent e;
+      e.set_start_block(start_block + range_end);
+      e.set_num_blocks(num_blocks - range_end);
+      idx++;
+      extents->insert(extents->begin() + idx, e);
+    }
+  } else if (range_end == num_blocks) {
+    // Done with this extent.
+    idx++;
+  } else {
+    return false;
+  }
+
+  *idx_p = idx;
+  return true;
+}
+
+// Remove identical corresponding block ranges in |src_extents| and
+// |dst_extents|. Used for preventing moving of blocks onto themselves during
+// MOVE operations.
+void RemoveIdenticalBlockRanges(vector<Extent>* src_extents,
+                                vector<Extent>* dst_extents) {
+  size_t src_idx = 0;
+  size_t dst_idx = 0;
+  uint64_t src_offset = 0, dst_offset = 0;
+  bool new_src = true, new_dst = true;
+  while (src_idx < src_extents->size() && dst_idx < dst_extents->size()) {
+    if (new_src) {
+      src_offset = 0;
+      new_src = false;
+    }
+    if (new_dst) {
+      dst_offset = 0;
+      new_dst = false;
+    }
+
+    bool do_remove = ((*src_extents)[src_idx].start_block() + src_offset ==
+                      (*dst_extents)[dst_idx].start_block() + dst_offset);
+
+    uint64_t src_num_blocks = (*src_extents)[src_idx].num_blocks();
+    uint64_t dst_num_blocks = (*dst_extents)[dst_idx].num_blocks();
+    uint64_t min_num_blocks = min(src_num_blocks - src_offset,
+                                  dst_num_blocks - dst_offset);
+    uint64_t prev_src_offset = src_offset;
+    uint64_t prev_dst_offset = dst_offset;
+    src_offset += min_num_blocks;
+    dst_offset += min_num_blocks;
+
+    new_src = ProcessExtentBlockRange(src_extents, &src_idx, do_remove,
+                                      prev_src_offset, src_offset);
+    new_dst = ProcessExtentBlockRange(dst_extents, &dst_idx, do_remove,
+                                      prev_dst_offset, dst_offset);
+  }
+}
+
 }  // namespace {}
 
 bool DeltaDiffGenerator::ReadFileToDiff(
@@ -617,6 +729,7 @@
   // Set parameters of the operations
   CHECK_EQ(data.size(), current_best_size);
 
+  vector<Extent> src_extents, dst_extents;
   if (operation.type() == DeltaArchiveManifest_InstallOperation_Type_MOVE ||
       operation.type() == DeltaArchiveManifest_InstallOperation_Type_BSDIFF) {
     if (gather_extents) {
@@ -624,7 +737,7 @@
           GatherExtents(old_filename,
                         chunk_offset,
                         chunk_size,
-                        operation.mutable_src_extents()));
+                        &src_extents));
     } else {
       Extent* src_extent = operation.add_src_extents();
       src_extent->set_start_block(0);
@@ -639,7 +752,7 @@
         GatherExtents(new_filename,
                       chunk_offset,
                       chunk_size,
-                      operation.mutable_dst_extents()));
+                      &dst_extents));
   } else {
     Extent* dst_extent = operation.add_dst_extents();
     dst_extent->set_start_block(0);
@@ -647,6 +760,18 @@
   }
   operation.set_dst_length(new_data.size());
 
+  if (gather_extents) {
+    // Remove identical src/dst block ranges in MOVE operations.
+    if (operation.type() == DeltaArchiveManifest_InstallOperation_Type_MOVE)
+      RemoveIdenticalBlockRanges(&src_extents, &dst_extents);
+
+    // Embed extents in the operation.
+    DeltaDiffGenerator::StoreExtents(src_extents,
+                                     operation.mutable_src_extents());
+    DeltaDiffGenerator::StoreExtents(dst_extents,
+                                     operation.mutable_dst_extents());
+  }
+
   out_data->swap(data);
   *out_op = operation;
 
diff --git a/delta_diff_generator_unittest.cc b/delta_diff_generator_unittest.cc
index f4570f9..510d1ed 100644
--- a/delta_diff_generator_unittest.cc
+++ b/delta_diff_generator_unittest.cc
@@ -7,6 +7,7 @@
 #include <fcntl.h>
 #include <unistd.h>
 
+#include <map>
 #include <set>
 #include <sstream>
 #include <string>
@@ -20,6 +21,7 @@
 #include "update_engine/cycle_breaker.h"
 #include "update_engine/delta_diff_generator.h"
 #include "update_engine/delta_performer.h"
+#include "update_engine/extent_mapper.h"
 #include "update_engine/extent_ranges.h"
 #include "update_engine/graph_types.h"
 #include "update_engine/graph_utils.h"
@@ -38,6 +40,10 @@
 
 typedef DeltaDiffGenerator::Block Block;
 
+typedef bool (*GetExtentsWithChunk)(const std::string&, off_t, off_t,
+                                    std::vector<Extent>*);
+extern GetExtentsWithChunk get_extents_with_chunk_func;
+
 namespace {
 int64_t BlocksInExtents(
     const google::protobuf::RepeatedPtrField<Extent>& extents) {
@@ -103,6 +109,121 @@
   EXPECT_EQ(1, BlocksInExtents(op.dst_extents()));
 }
 
+std::map<string, vector<Extent>*> fake_file_extents;
+
+bool FakeGetExtents(const string& path, off_t chunk_offset, off_t chunk_size,
+                    vector<Extent>* out) {
+  if (fake_file_extents.count(path) == 1) {
+    *out = *fake_file_extents[path];
+    return true;
+  } else {
+    return false;
+  }
+}
+
+uint64_t AddExtent(uint64_t start_block, uint64_t num_blocks,
+                   vector<Extent>* extents) {
+  Extent e;
+  e.set_start_block(start_block);
+  e.set_num_blocks(num_blocks);
+  extents->push_back(e);
+  return num_blocks;
+}
+
+TEST_F(DeltaDiffGeneratorTest, RunAsRootMoveWithSameBlock) {
+  // Mock out the extent gathering function.
+  GetExtentsWithChunk orig_get_extents_with_chunk_func =
+      get_extents_with_chunk_func;
+  get_extents_with_chunk_func = FakeGetExtents;
+
+  // Setup the old/new files so that it has immobile chunks; we make sure to
+  // utilize all sub-cases of such chunks: blocks 21--22 induce a split (src)
+  // and complete removal (dst), whereas blocks 24--25 induce trimming of the
+  // tail (src) and head (dst) of extents. The detailed configuration:
+  //
+  // Old:  [ 20     21 22     23     24 25 ] [ 28 ]
+  // New:  [ 18 ] [ 21 22 ] [ 20 ] [ 24 25     26 ]
+  // Same:          ^^ ^^            ^^ ^^
+  vector<Extent> old_extents;
+  uint64_t num_extents = 0;
+  num_extents += AddExtent(20, 6, &old_extents);
+  num_extents += AddExtent(28, 1, &old_extents);
+  fake_file_extents[old_path()] = &old_extents;
+  vector<Extent> new_extents;
+  AddExtent(18, 1, &new_extents);
+  AddExtent(21, 2, &new_extents);
+  AddExtent(20, 1, &new_extents);
+  AddExtent(24, 3, &new_extents);
+  fake_file_extents[new_path()] = &new_extents;
+
+  // The size of the data should match the total number of blocks.
+  const size_t file_len = 5 * 4096;
+  const string random_string(reinterpret_cast<const char*>(kRandomString),
+                             sizeof(kRandomString));
+  string random_data;
+  while (random_data.size() < file_len)
+    random_data += random_string;
+  if (random_data.size() > file_len)
+    random_data.erase(file_len);
+  EXPECT_TRUE(utils::WriteFile(old_path().c_str(),
+                               random_data.c_str(), file_len));
+  EXPECT_TRUE(utils::WriteFile(new_path().c_str(),
+                               random_data.c_str(), file_len));
+
+  vector<char> data;
+  DeltaArchiveManifest_InstallOperation op;
+  EXPECT_TRUE(DeltaDiffGenerator::ReadFileToDiff(old_path(),
+                                                 new_path(),
+                                                 0,  // chunk_offset
+                                                 -1,  // chunk_size
+                                                 true, // bsdiff_allowed
+                                                 &data,
+                                                 &op,
+                                                 true));
+
+  // Adjust the old/new extents to remove duplicates.
+  old_extents[0].set_num_blocks(1);
+  Extent e;
+  e.set_start_block(23);
+  e.set_num_blocks(1);
+  old_extents.insert(old_extents.begin() + 1, e);
+  new_extents.erase(new_extents.begin() + 1);
+  new_extents[2].set_start_block(26);
+  new_extents[2].set_num_blocks(1);
+  num_extents -= 4;
+
+  EXPECT_TRUE(data.empty());
+
+  EXPECT_TRUE(op.has_type());
+  EXPECT_EQ(DeltaArchiveManifest_InstallOperation_Type_MOVE, op.type());
+  EXPECT_FALSE(op.has_data_offset());
+  EXPECT_FALSE(op.has_data_length());
+
+  EXPECT_EQ(file_len, op.src_length());
+  EXPECT_EQ(num_extents, BlocksInExtents(op.src_extents()));
+  EXPECT_EQ(old_extents.size(), op.src_extents_size());
+  for (int i = 0; i < op.src_extents_size(); i++) {
+    EXPECT_EQ(old_extents[i].start_block(), op.src_extents(i).start_block())
+        << "i == " << i;
+    EXPECT_EQ(old_extents[i].num_blocks(), op.src_extents(i).num_blocks())
+        << "i == " << i;
+  }
+
+  EXPECT_EQ(file_len, op.dst_length());
+  EXPECT_EQ(num_extents, BlocksInExtents(op.dst_extents()));
+  EXPECT_EQ(new_extents.size(), op.dst_extents_size());
+  for (int i = 0; i < op.dst_extents_size(); i++) {
+    EXPECT_EQ(new_extents[i].start_block(), op.dst_extents(i).start_block())
+        << "i == " << i;
+    EXPECT_EQ(new_extents[i].num_blocks(), op.dst_extents(i).num_blocks())
+        << "i == " << i;
+  }
+
+  // Clean up fake extents and restore the module's extent gathering logic.
+  fake_file_extents.clear();
+  get_extents_with_chunk_func = orig_get_extents_with_chunk_func;
+}
+
 TEST_F(DeltaDiffGeneratorTest, RunAsRootBsdiffSmallTest) {
   EXPECT_TRUE(utils::WriteFile(old_path().c_str(),
                                reinterpret_cast<const char*>(kRandomString),