Add lz4 decompress/compress routines

During OTA generation, we decompress lz4-compressed blobs on disk and
perform diffing on the decompressed blobs, which is known to reduce OTA
size significantly. This CL adds the decompress/compress routines;
follow-up CLs will start calling them.

Test: th
Bug: 206729162

Change-Id: Ifee87220e95740cb73a68ef84935c1cbb6a78666
diff --git a/lz4diff/lz4diff_compress.cc b/lz4diff/lz4diff_compress.cc
new file mode 100644
index 0000000..333148a
--- /dev/null
+++ b/lz4diff/lz4diff_compress.cc
@@ -0,0 +1,243 @@
+//
+// Copyright (C) 2021 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "lz4diff_compress.h"
+
+#include "update_engine/common/utils.h"
+#include "update_engine/common/hash_calculator.h"
+#include "update_engine/payload_generator/delta_diff_generator.h"
+#include "update_engine/payload_generator/payload_generation_config.h"
+
+#include <base/logging.h>
+#include <lz4.h>
+#include <lz4hc.h>
+
+namespace chromeos_update_engine {
+
+Blob TryCompressBlob(std::string_view blob,
+                     const std::vector<CompressedBlock>& block_info,
+                     const bool zero_padding_enabled,
+                     const CompressionAlgorithm compression_algo) {
+  size_t uncompressed_size = 0;
+  size_t compressed_size = 0;
+  for (const auto& block : block_info) {
+    CHECK_EQ(uncompressed_size, block.uncompressed_offset)
+        << "Compressed block info is expected to be sorted.";
+    uncompressed_size += block.uncompressed_length;
+    compressed_size += block.compressed_length;
+  }
+  CHECK_EQ(uncompressed_size, blob.size());
+  Blob output;
+  output.resize(utils::RoundUp(compressed_size, kBlockSize));
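+  // A single HC compression state is allocated up front, reused for every
+  // LZ4HC block below, and released when the function returns.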
+  auto hc = LZ4_createStreamHC();
+  DEFER {
+    if (hc) {
+      LZ4_freeStreamHC(hc);
+      hc = nullptr;
+    }
+  };
+  size_t compressed_offset = 0;
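+  // Each block is re-compressed into the compressed offset it occupies in the
+  // layout described by |block_info|.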
+  for (const auto& block : block_info) {
+    // Advance the compressed offset at the end of each loop iteration.
+    DEFER { compressed_offset += block.compressed_length; };
+    CHECK_LE(compressed_offset + block.compressed_length, output.size());
+
+    if (!block.IsCompressed()) {
+      std::memcpy(output.data() + compressed_offset,
+                  blob.data() + block.uncompressed_offset,
+                  block.compressed_length);
+      continue;
+    }
+    // The LZ4 spec requires the last sequence of a compressed block to end
+    // with a run of at least 5 literal bytes. Compressors adjust their output
+    // near the end of the input to satisfy that requirement, which we don't
+    // want here, so always hand the compressor more data than the block needs
+    // and let the *_destSize variants stop at the target output size.
+    int src_size = uncompressed_size - block.uncompressed_offset;
+    uint64_t bytes_written = 0;
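+    // The *_destSize variants compress as much of the remaining input as fits
+    // in |block.compressed_length| bytes and report the number of source
+    // bytes actually consumed back through |src_size|.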
+    switch (compression_algo.type()) {
+      case CompressionAlgorithm::LZ4HC:
+        bytes_written = LZ4_compress_HC_destSize(
+            hc,
+            blob.data() + block.uncompressed_offset,
+            reinterpret_cast<char*>(output.data()) + compressed_offset,
+            &src_size,
+            block.compressed_length,
+            compression_algo.level());
+        break;
+      case CompressionAlgorithm::LZ4:
+        bytes_written = LZ4_compress_destSize(
+            blob.data() + block.uncompressed_offset,
+            reinterpret_cast<char*>(output.data()) + compressed_offset,
+            &src_size,
+            block.compressed_length);
+        break;
+      default:
+        CHECK(false) << "Unrecognized compression algorithm: "
+                     << compression_algo.type();
+        break;
+    }
+    // Last block may have trailing zeros
+    CHECK_LE(bytes_written, block.compressed_length);
+    if (bytes_written < block.compressed_length) {
+      if (zero_padding_enabled) {
+        const auto padding = block.compressed_length - bytes_written;
+        // LOG(INFO) << "Padding: " << padding;
+        CHECK_LE(compressed_offset + padding + bytes_written, output.size());
+        std::memmove(output.data() + compressed_offset + padding,
+                     output.data() + compressed_offset,
+                     bytes_written);
+        CHECK_LE(compressed_offset + padding, output.size());
+        std::fill(output.data() + compressed_offset,
+                  output.data() + compressed_offset + padding,
+                  0);
+
+      } else {
+        std::fill(output.data() + compressed_offset + bytes_written,
+                  output.data() + compressed_offset + block.compressed_length,
+                  0);
+      }
+    }
+    if (static_cast<uint64_t>(src_size) != block.uncompressed_length) {
+      LOG(WARNING) << "Recompress size mismatch: " << src_size << ", "
+                   << block.uncompressed_length;
+    }
+  }
+  // Any trailing data will be copied to the output buffer.
+  output.insert(output.end(), blob.begin() + uncompressed_size, blob.end());
+  return output;
+}
+
+Blob TryDecompressBlob(std::string_view blob,
+                       const std::vector<CompressedBlock>& block_info,
+                       const bool zero_padding_enabled) {
+  if (block_info.empty()) {
+    return {};
+  }
+  size_t uncompressed_size = 0;
+  size_t compressed_size = 0;
+  for (const auto& block : block_info) {
+    CHECK_EQ(uncompressed_size, block.uncompressed_offset)
+        << " Compressed block info is expected to be sorted, expected offset "
+        << uncompressed_size << ", actual block " << block;
+    uncompressed_size += block.uncompressed_length;
+    compressed_size += block.compressed_length;
+  }
+  if (blob.size() < compressed_size) {
+    LOG(INFO) << "File is chunked. Skip lz4 decompress.Expected size : "
+              << compressed_size << ", actual size: " << blob.size();
+    return {};
+  }
+  Blob output;
+  output.reserve(uncompressed_size);
+  size_t compressed_offset = 0;
+  for (const auto& block : block_info) {
+    std::string_view cluster =
+        blob.substr(compressed_offset, block.compressed_length);
+    if (!block.IsCompressed()) {
+      CHECK_NE(cluster.size(), 0UL);
+      output.insert(output.end(), cluster.begin(), cluster.end());
+      compressed_offset += cluster.size();
+      continue;
+    }
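+    // When zero padding is enabled, a compressed cluster may be preceded by
+    // zero bytes; skip them (at most one block's worth) before decompressing.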
+    size_t inputmargin = 0;
+    if (zero_padding_enabled) {
+      while (inputmargin < std::min(kBlockSize, cluster.size()) &&
+             cluster[inputmargin] == 0) {
+        inputmargin++;
+      }
+    }
+    output.resize(output.size() + block.uncompressed_length);
+
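+    // Decompress directly into the tail of |output|;
+    // LZ4_decompress_safe_partial returns the number of bytes written, or a
+    // negative value on error.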
+    const auto bytes_decompressed = LZ4_decompress_safe_partial(
+        cluster.data() + inputmargin,
+        reinterpret_cast<char*>(output.data()) + output.size() -
+            block.uncompressed_length,
+        cluster.size() - inputmargin,
+        block.uncompressed_length,
+        block.uncompressed_length);
+    if (bytes_decompressed < 0) {
+      Blob cluster_hash;
+      HashCalculator::RawHashOfBytes(
+          cluster.data(), cluster.size(), &cluster_hash);
+      Blob blob_hash;
+      HashCalculator::RawHashOfBytes(blob.data(), blob.size(), &blob_hash);
+      LOG(FATAL) << "Failed to decompress, " << bytes_decompressed
+                 << ", output_cursor = "
+                 << output.size() - block.uncompressed_length
+                 << ", input_cursor = " << compressed_offset
+                 << ", blob.size() = " << blob.size()
+                 << ", cluster_size = " << block.compressed_length
+                 << ", dest capacity = " << block.uncompressed_length
+                 << ", input margin = " << inputmargin << " "
+                 << HexEncode(cluster_hash) << " " << HexEncode(blob_hash);
+      return {};
+    }
+    compressed_offset += block.compressed_length;
+    CHECK_EQ(static_cast<uint64_t>(bytes_decompressed),
+             block.uncompressed_length);
+  }
+  CHECK_EQ(output.size(), uncompressed_size);
+
+  // Trailing data not recorded by compressed block info will be treated as
+  // uncompressed; most of the time these are xattrs or trailing zeros.
+  CHECK_LE(compressed_offset, blob.size())
+      << " Compressed blocks run past the end of the blob";
+  if (compressed_offset < blob.size()) {
+    output.insert(output.end(), blob.begin() + compressed_offset, blob.end());
+  }
+
+  return output;
+}
+
+[[nodiscard]] std::string_view ToStringView(const Blob& blob) noexcept {
+  return std::string_view{reinterpret_cast<const char*>(blob.data()),
+                          blob.size()};
+}
+
+Blob TryDecompressBlob(const Blob& blob,
+                       const std::vector<CompressedBlock>& block_info,
+                       const bool zero_padding_enabled) {
+  return TryDecompressBlob(
+      ToStringView(blob), block_info, zero_padding_enabled);
+}
+
+std::ostream& operator<<(std::ostream& out, const CompressedBlock& block) {
+  out << "CompressedBlock{.uncompressed_offset = " << block.uncompressed_offset
+      << ", .compressed_length = " << block.compressed_length
+      << ", .uncompressed_length = " << block.uncompressed_length << "}";
+  return out;
+}
+
+[[nodiscard]] std::string_view ToStringView(const void* data,
+                                            size_t size) noexcept {
+  return std::string_view(reinterpret_cast<const char*>(data), size);
+}
+
+std::ostream& operator<<(std::ostream& out, const CompressedBlockInfo& info) {
+  out << "BlockInfo { compressed_length: " << info.compressed_length()
+      << ", uncompressed_length: " << info.uncompressed_length()
+      << ", uncompressed_offset: " << info.uncompressed_offset();
+  if (!info.sha256_hash().empty()) {
+    out << ", sha256_hash: " << HexEncode(info.sha256_hash());
+  }
+  if (!info.postfix_bspatch().empty()) {
+    out << ", postfix_bspatch: " << info.postfix_bspatch().size();
+  }
+  out << "}";
+  return out;
+}
+
+}  // namespace chromeos_update_engine
diff --git a/lz4diff/lz4diff_compress.h b/lz4diff/lz4diff_compress.h
new file mode 100644
index 0000000..7cbb9ac
--- /dev/null
+++ b/lz4diff/lz4diff_compress.h
@@ -0,0 +1,58 @@
+//
+// Copyright (C) 2021 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#ifndef UPDATE_ENGINE_LZ4DIFF_LZ4DIFF_COMPRESS_H_
+#define UPDATE_ENGINE_LZ4DIFF_LZ4DIFF_COMPRESS_H_
+
+#include "lz4diff_format.h"
+
+#include <string_view>
+
+namespace chromeos_update_engine {
+
+// |TryCompressBlob| and |TryDecompressBlob| are inverse functions of each
+// other: one compresses data into fixed-size output chunks, the other
+// decompresses such fixed-size blocks.
+// |TryCompressBlob| is supposed to mimic how EROFS compresses input files when
+// creating an EROFS image. After calling |TryCompressBlob|, LZ4DIFF compares
+// the re-compressed blob with EROFS's ground-truth blob and generates a BSDIFF
+// patch between them if there is a mismatch. Therefore it is OK for
+// |TryCompressBlob| to produce slightly different output than mkfs.erofs, as
+// long as it exhibits consistent behavior across platforms.
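+//
+// A rough usage sketch (the inputs here are illustrative; |block_info| would
+// normally come from parsing an EROFS image):
+//   Blob raw = TryDecompressBlob(compressed_blob, block_info,
+//                                /*zero_padding_enabled=*/true);
+//   Blob recompressed = TryCompressBlob(ToStringView(raw), block_info,
+//                                       /*zero_padding_enabled=*/true, algo);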
+Blob TryCompressBlob(std::string_view blob,
+                     const std::vector<CompressedBlock>& block_info,
+                     const bool zero_padding_enabled,
+                     const CompressionAlgorithm compression_algo);
+
+Blob TryDecompressBlob(std::string_view blob,
+                       const std::vector<CompressedBlock>& block_info,
+                       const bool zero_padding_enabled);
+Blob TryDecompressBlob(const Blob& blob,
+                       const std::vector<CompressedBlock>& block_info,
+                       const bool zero_padding_enabled);
+
+[[nodiscard]] std::string_view ToStringView(const Blob& blob) noexcept;
+
+[[nodiscard]] std::string_view ToStringView(const void* data,
+                                            size_t size) noexcept;
+
+std::ostream& operator<<(std::ostream& out, const CompressedBlockInfo& info);
+
+std::ostream& operator<<(std::ostream& out, const CompressedBlock& block);
+
+}  // namespace chromeos_update_engine
+
+#endif  // UPDATE_ENGINE_LZ4DIFF_LZ4DIFF_COMPRESS_H_
diff --git a/lz4diff/lz4diff_compress_unittest.cc b/lz4diff/lz4diff_compress_unittest.cc
new file mode 100644
index 0000000..b4b56d2
--- /dev/null
+++ b/lz4diff/lz4diff_compress_unittest.cc
@@ -0,0 +1,117 @@
+//
+// Copyright (C) 2021 The Android Open Source Project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include <unistd.h>
+
+#include <algorithm>
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include <base/format_macros.h>
+#include <base/logging.h>
+#include <base/strings/string_number_conversions.h>
+#include <base/strings/string_util.h>
+#include <base/strings/stringprintf.h>
+#include <gtest/gtest.h>
+#include <erofs/internal.h>
+#include <erofs/io.h>
+
+#include "update_engine/common/test_utils.h"
+#include "update_engine/common/utils.h"
+#include "update_engine/lz4diff/lz4diff_compress.h"
+#include "update_engine/payload_generator/delta_diff_generator.h"
+#include "update_engine/payload_generator/erofs_filesystem.h"
+#include "update_engine/payload_generator/extent_utils.h"
+
+using std::string;
+using std::vector;
+
+namespace chromeos_update_engine {
+
+namespace {
+
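+// Reads the file at |inode_path| out of |erofs_image| with erofs-utils,
+// giving the test a ground-truth copy of the file contents that does not go
+// through update_engine's own EROFS parsing code.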
+static void ExtractErofsImage(const char* erofs_image,
+                              const char* inode_path,
+                              Blob* output) {
+  // EROFS makes heavy use of global variables, so protect calls to EROFS APIs
+  // with a global mutex.
+  // TODO(b/202784930) Replace erofs-utils with a cleaner and more C++ friendly
+  // library. (Or turn erofs-utils into one)
+  static std::mutex mutex;
+  std::lock_guard lock(mutex);
+  auto err = dev_open_ro(erofs_image);
+  ASSERT_EQ(err, 0);
+  DEFER { dev_close(); };
+
+  err = erofs_read_superblock();
+  ASSERT_EQ(err, 0);
+  struct erofs_inode inode;
+  err = erofs_ilookup(inode_path, &inode);
+  ASSERT_EQ(err, 0);
+  output->resize(inode.i_size);
+  err = erofs_pread(&inode,
+                    reinterpret_cast<char*>(output->data()),
+                    output->size(),
+                    0 /* offset */);
+  ASSERT_EQ(err, 0);
+}
+
+class Lz4diffCompressTest : public ::testing::Test {};
+
+using test_utils::GetBuildArtifactsPath;
+
+// This test parses the sample EROFS image generated at build time by the
+// "generate_test_erofs_images.sh" script, extracts /delta_generator from it
+// with erofs-utils as ground truth, and verifies that TryDecompressBlob
+// reconstructs the same content from the raw compressed extents.
+TEST_F(Lz4diffCompressTest, ExtractElfBinary) {
+  const auto build_path = GetBuildArtifactsPath("gen/erofs.img");
+  auto fs = ErofsFilesystem::CreateFromFile(build_path);
+  ASSERT_NE(fs, nullptr);
+  ASSERT_EQ(kBlockSize, fs->GetBlockSize());
+
+  vector<ErofsFilesystem::File> files;
+  ASSERT_TRUE(fs->GetFiles(&files));
+
+  const auto it =
+      std::find_if(files.begin(), files.end(), [](const auto& file) {
+        return file.name == "/delta_generator";
+      });
+  ASSERT_NE(it, files.end())
+      << "There should be a delta_generator entry in gen/erofs.img. Is the "
+         "generate_test_erofs_imgages.sh script implemented wrong?";
+
+  const auto delta_generator = *it;
+  Blob expected_blob;
+  ASSERT_NO_FATAL_FAILURE(ExtractErofsImage(
+      build_path.c_str(), "/delta_generator", &expected_blob));
+  Blob compressed_blob;
+  ASSERT_TRUE(utils::ReadExtents(
+      build_path, delta_generator.extents, &compressed_blob, kBlockSize));
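+  // |compressed_blob| holds the raw on-disk extents of the file; decompressing
+  // them should reproduce the extracted file, modulo block-size padding that
+  // is trimmed off below.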
+  auto decompressed_blob = TryDecompressBlob(
+      compressed_blob,
+      delta_generator.compressed_file_info.blocks,
+      delta_generator.compressed_file_info.zero_padding_enabled);
+  ASSERT_GT(decompressed_blob.size(), 0UL);
+  ASSERT_GE(decompressed_blob.size(),
+            static_cast<size_t>(delta_generator.file_stat.st_size));
+  decompressed_blob.resize(delta_generator.file_stat.st_size);
+  ASSERT_EQ(decompressed_blob, expected_blob);
+}
+
+}  // namespace
+
+}  // namespace chromeos_update_engine