Use a similar file if no old files matches new file name.

Find the old file with the shortest levenshtein distance to the new file
name, and use that for diff operations.

This works great if the file name has version number in it, but even for
a completely new file, using a similar file could still help.

Test: generated a diff
Change-Id: I45062f81772220a5d5872a98a7af01ab69837dcc
diff --git a/payload_generator/delta_diff_utils.cc b/payload_generator/delta_diff_utils.cc
index 4ac299d..bd1c891 100644
--- a/payload_generator/delta_diff_utils.cc
+++ b/payload_generator/delta_diff_utils.cc
@@ -33,6 +33,7 @@
 #include <list>
 #include <map>
 #include <memory>
+#include <numeric>
 #include <utility>
 
 #include <base/files/file_util.h>
@@ -205,6 +206,26 @@
          old_blob_size;
 }
 
+// Returns the levenshtein distance between string |a| and |b|.
+// https://en.wikipedia.org/wiki/Levenshtein_distance
+int LevenshteinDistance(const string& a, const string& b) {
+  vector<int> distances(a.size() + 1);
+  std::iota(distances.begin(), distances.end(), 0);
+
+  for (size_t i = 1; i <= b.size(); i++) {
+    distances[0] = i;
+    int previous_distance = i - 1;
+    for (size_t j = 1; j <= a.size(); j++) {
+      int new_distance =
+          std::min({distances[j] + 1,
+                    distances[j - 1] + 1,
+                    previous_distance + (a[j - 1] == b[i - 1] ? 0 : 1)});
+      previous_distance = distances[j];
+      distances[j] = new_distance;
+    }
+  }
+  return distances.back();
+}
 }  // namespace
 
 namespace diff_utils {
@@ -317,6 +338,33 @@
   return true;
 }
 
+FilesystemInterface::File GetOldFile(
+    const map<string, FilesystemInterface::File>& old_files_map,
+    const string& new_file_name) {
+  if (old_files_map.empty())
+    return {};
+
+  auto old_file_iter = old_files_map.find(new_file_name);
+  if (old_file_iter != old_files_map.end())
+    return old_file_iter->second;
+
+  // No old file match for the new file name, use a similar file with the
+  // shortest levenshtein distance.
+  // This works great if the file has version number in it, but even for
+  // a completely new file, using a similar file can still help.
+  int min_distance = new_file_name.size();
+  const FilesystemInterface::File* old_file;
+  for (const auto& pair : old_files_map) {
+    int distance = LevenshteinDistance(new_file_name, pair.first);
+    if (distance < min_distance) {
+      min_distance = distance;
+      old_file = &pair.second;
+    }
+  }
+  LOG(INFO) << "Using " << old_file->name << " as source for " << new_file_name;
+  return *old_file;
+}
+
 bool DeltaReadPartition(vector<AnnotatedOperation>* aops,
                         const PartitionConfig& old_part,
                         const PartitionConfig& new_part,
@@ -395,7 +443,8 @@
     // from using a graph/cycle detection/etc to generate diffs, and at that
     // time, it will be easy (non-complex) to have many operations read
     // from the same source blocks. At that time, this code can die. -adlr
-    auto old_file = old_files_map[new_file.name];
+    FilesystemInterface::File old_file =
+        GetOldFile(old_files_map, new_file.name);
     vector<Extent> old_file_extents;
     if (version.InplaceUpdate())
       old_file_extents =
diff --git a/payload_generator/delta_diff_utils.h b/payload_generator/delta_diff_utils.h
index dea8535..c1f62fc 100644
--- a/payload_generator/delta_diff_utils.h
+++ b/payload_generator/delta_diff_utils.h
@@ -17,6 +17,7 @@
 #ifndef UPDATE_ENGINE_PAYLOAD_GENERATOR_DELTA_DIFF_UTILS_H_
 #define UPDATE_ENGINE_PAYLOAD_GENERATOR_DELTA_DIFF_UTILS_H_
 
+#include <map>
 #include <string>
 #include <vector>
 
@@ -149,6 +150,12 @@
 // Returns the max number of threads to process the files(chunks) in parallel.
 size_t GetMaxThreads();
 
+// Returns the old file which file name has the shortest levenshtein distance to
+// |new_file_name|.
+FilesystemInterface::File GetOldFile(
+    const std::map<std::string, FilesystemInterface::File>& old_files_map,
+    const std::string& new_file_name);
+
 }  // namespace diff_utils
 
 }  // namespace chromeos_update_engine
diff --git a/payload_generator/delta_diff_utils_unittest.cc b/payload_generator/delta_diff_utils_unittest.cc
index 62cdfe9..f6677f1 100644
--- a/payload_generator/delta_diff_utils_unittest.cc
+++ b/payload_generator/delta_diff_utils_unittest.cc
@@ -779,4 +779,45 @@
       test_utils::GetBuildArtifactsPath("gen/disk_ext2_4k.img")));
 }
 
+TEST_F(DeltaDiffUtilsTest, GetOldFileEmptyTest) {
+  EXPECT_TRUE(diff_utils::GetOldFile({}, "filename").name.empty());
+}
+
+TEST_F(DeltaDiffUtilsTest, GetOldFileTest) {
+  std::map<string, FilesystemInterface::File> old_files_map;
+  auto file_list = {
+      "filename",
+      "filename.zip",
+      "version1.1",
+      "version2.0",
+      "version",
+      "update_engine",
+      "delta_generator",
+  };
+  for (const auto& name : file_list) {
+    FilesystemInterface::File file;
+    file.name = name;
+    old_files_map.emplace(name, file);
+  }
+
+  // Always return exact match if possible.
+  for (const auto& name : file_list)
+    EXPECT_EQ(diff_utils::GetOldFile(old_files_map, name).name, name);
+
+  EXPECT_EQ(diff_utils::GetOldFile(old_files_map, "file_name").name,
+            "filename");
+  EXPECT_EQ(diff_utils::GetOldFile(old_files_map, "filename_new.zip").name,
+            "filename.zip");
+  EXPECT_EQ(diff_utils::GetOldFile(old_files_map, "version1.2").name,
+            "version1.1");
+  EXPECT_EQ(diff_utils::GetOldFile(old_files_map, "version3.0").name,
+            "version2.0");
+  EXPECT_EQ(diff_utils::GetOldFile(old_files_map, "_version").name, "version");
+  EXPECT_EQ(
+      diff_utils::GetOldFile(old_files_map, "update_engine_unittest").name,
+      "update_engine");
+  EXPECT_EQ(diff_utils::GetOldFile(old_files_map, "bin/delta_generator").name,
+            "delta_generator");
+}
+
 }  // namespace chromeos_update_engine