Support parsing of data descriptor

The size fields in the data descriptor can be either 4 bytes or 8 bytes.
This depends on if the size are read from the zip64 extended field in
the local file header. This cl adds support to parse these cases.

Also fix a misconception in that the uncompressed and compressed size
doesn't need to exist together in the zip64 fields of the central
directory. But they still need to co-exist in the fields of the local
file header.

Bug: 150900468
Test: unit tests pass, python tests pass
Change-Id: Ia54f9bf56c85ff456ead90a136f7fddc5be5220c
diff --git a/libziparchive/include/ziparchive/zip_archive.h b/libziparchive/include/ziparchive/zip_archive.h
index 435bfb6..4697bb7 100644
--- a/libziparchive/include/ziparchive/zip_archive.h
+++ b/libziparchive/include/ziparchive/zip_archive.h
@@ -77,6 +77,10 @@
   // footer.
   uint32_t uncompressed_length;
 
+  // If the value of uncompressed length and compressed length are stored in
+  // the zip64 extended info of the extra field.
+  bool zip64_format_size{false};
+
   // The offset to the start of data for this ZipEntry.
   off64_t offset;
 
diff --git a/libziparchive/test_ziparchive_large.py b/libziparchive/test_ziparchive_large.py
index c29c37e..6b82f63 100644
--- a/libziparchive/test_ziparchive_large.py
+++ b/libziparchive/test_ziparchive_large.py
@@ -26,13 +26,17 @@
 
 class Zip64Test(unittest.TestCase):
   @staticmethod
+  def _WriteFile(path, size_in_kib):
+    contents = os.path.basename(path)[0] * 1024
+    with open(path, 'w') as f:
+      for it in range(0, size_in_kib):
+        f.write(contents)
+
+  @staticmethod
   def _AddEntriesToZip(output_zip, entries_dict=None):
     for name, size in entries_dict.items():
-      contents = name[0] * 1024
       file_path = tempfile.NamedTemporaryFile()
-      with open(file_path.name, 'w') as f:
-        for it in range(0, size):
-          f.write(contents)
+      Zip64Test._WriteFile(file_path.name, size)
       output_zip.write(file_path.name, arcname = name)
 
   def _getEntryNames(self, zip_name):
@@ -93,6 +97,22 @@
     self._ExtractEntries(zip_path.name)
 
 
+  def test_forceDataDescriptor(self):
+    file_path = tempfile.NamedTemporaryFile(suffix='.txt')
+    # TODO create the entry > 4GiB.
+    self._WriteFile(file_path.name, 1024)
+
+    zip_path = tempfile.NamedTemporaryFile(suffix='.zip')
+    with zipfile.ZipFile(zip_path, 'w', allowZip64=True) as output_zip:
+      pass
+    # The fd option force writes a data descriptor
+    cmd = ['zip', '-fd', zip_path.name, file_path.name]
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    proc.communicate()
+    read_names = self._getEntryNames(zip_path.name)
+    self.assertEquals([file_path.name[1:]], read_names)
+    self._ExtractEntries(zip_path.name)
+
 if __name__ == '__main__':
   testsuite = unittest.TestLoader().discover(
       os.path.dirname(os.path.realpath(__file__)))
diff --git a/libziparchive/testdata/zip64.zip b/libziparchive/testdata/zip64.zip
new file mode 100644
index 0000000..3f25a4c
--- /dev/null
+++ b/libziparchive/testdata/zip64.zip
Binary files differ
diff --git a/libziparchive/zip_archive.cc b/libziparchive/zip_archive.cc
index 849b68c..031d43a 100644
--- a/libziparchive/zip_archive.cc
+++ b/libziparchive/zip_archive.cc
@@ -57,8 +57,6 @@
 #include "zip_archive_common.h"
 #include "zip_archive_private.h"
 
-using android::base::get_unaligned;
-
 // Used to turn on crc checks - verify that the content CRC matches the values
 // specified in the local file header and the central directory.
 static const bool kCrcChecksEnabled = false;
@@ -221,7 +219,7 @@
   for (; i >= 0; i--) {
     if (scan_buffer[i] == 0x50) {
       uint32_t* sig_addr = reinterpret_cast<uint32_t*>(&scan_buffer[i]);
-      if (get_unaligned<uint32_t>(sig_addr) == EocdRecord::kSignature) {
+      if (android::base::get_unaligned<uint32_t>(sig_addr) == EocdRecord::kSignature) {
         ALOGV("+++ Found EOCD at buf+%d", i);
         break;
       }
@@ -360,8 +358,9 @@
   // Data Size - 2 bytes
   uint16_t offset = 0;
   while (offset < extraFieldLength - 4) {
-    auto headerId = get_unaligned<uint16_t>(extraFieldStart + offset);
-    auto dataSize = get_unaligned<uint16_t>(extraFieldStart + offset + 2);
+    auto readPtr = const_cast<uint8_t*>(extraFieldStart + offset);
+    auto headerId = ConsumeUnaligned<uint16_t>(&readPtr);
+    auto dataSize = ConsumeUnaligned<uint16_t>(&readPtr);
 
     offset += 4;
     if (dataSize > extraFieldLength - offset) {
@@ -376,54 +375,44 @@
       continue;
     }
 
-    uint16_t expectedDataSize = 0;
-    // We expect the extended field to include both uncompressed and compressed size.
-    if (zip32UncompressedSize == UINT32_MAX || zip32CompressedSize == UINT32_MAX) {
-      expectedDataSize += 16;
+    std::optional<uint64_t> uncompressedFileSize;
+    std::optional<uint64_t> compressedFileSize;
+    std::optional<uint64_t> localHeaderOffset;
+    if (zip32UncompressedSize == UINT32_MAX) {
+      uncompressedFileSize = ConsumeUnaligned<uint64_t>(&readPtr);
+    }
+    if (zip32CompressedSize == UINT32_MAX) {
+      compressedFileSize = ConsumeUnaligned<uint64_t>(&readPtr);
     }
     if (zip32LocalFileHeaderOffset == UINT32_MAX) {
-      expectedDataSize += 8;
+      localHeaderOffset = ConsumeUnaligned<uint64_t>(&readPtr);
     }
 
-    if (expectedDataSize == 0) {
+    // calculate how many bytes we read after the data size field.
+    size_t bytesRead = readPtr - (extraFieldStart + offset);
+    if (bytesRead == 0) {
       ALOGW("Zip: Data size should not be 0 in zip64 extended field");
       return kInvalidFile;
     }
 
-    if (dataSize != expectedDataSize) {
+    if (dataSize != bytesRead) {
       auto localOffsetString = zip32LocalFileHeaderOffset.has_value()
                                    ? std::to_string(zip32LocalFileHeaderOffset.value())
                                    : "missing";
-      ALOGW("Zip: Invalid data size in zip64 extended field, expect %" PRIu16 ", get %" PRIu16
+      ALOGW("Zip: Invalid data size in zip64 extended field, expect %zu , get %" PRIu16
             ", uncompressed size %" PRIu32 ", compressed size %" PRIu32 ", local header offset %s",
-            expectedDataSize, dataSize, zip32UncompressedSize, zip32CompressedSize,
+            bytesRead, dataSize, zip32UncompressedSize, zip32CompressedSize,
             localOffsetString.c_str());
       return kInvalidFile;
     }
 
-    std::optional<uint64_t> uncompressedFileSize;
-    std::optional<uint64_t> compressedFileSize;
-    std::optional<uint64_t> localHeaderOffset;
-    if (zip32UncompressedSize == UINT32_MAX || zip32CompressedSize == UINT32_MAX) {
-      uncompressedFileSize = get_unaligned<uint64_t>(extraFieldStart + offset);
-      compressedFileSize = get_unaligned<uint64_t>(extraFieldStart + offset + 8);
-      offset += 16;
-
-      // TODO(xunchang) Support handling file large than UINT32_MAX. It's theoretically possible
-      // for libz to (de)compressing file larger than UINT32_MAX. But we should use our own
-      // bytes counter to replace stream.total_out.
-      if (uncompressedFileSize.value() >= UINT32_MAX || compressedFileSize.value() >= UINT32_MAX) {
-        ALOGW(
-            "Zip: File size larger than UINT32_MAX isn't supported yet. uncompressed size %" PRIu64
-            ", compressed size %" PRIu64,
-            uncompressedFileSize.value(), compressedFileSize.value());
-        return kInvalidFile;
-      }
-    }
-
-    if (zip32LocalFileHeaderOffset == UINT32_MAX) {
-      localHeaderOffset = get_unaligned<uint64_t>(extraFieldStart + offset);
-      offset += 8;
+    // TODO(xunchang) Support handling file large than UINT32_MAX. It's theoretically possible
+    // for libz to (de)compressing file larger than UINT32_MAX. But we should use our own
+    // bytes counter to replace stream.total_out.
+    if ((uncompressedFileSize.has_value() && uncompressedFileSize.value() > UINT32_MAX) ||
+        (compressedFileSize.has_value() && compressedFileSize.value() > UINT32_MAX)) {
+      ALOGW("Zip: File size larger than UINT32_MAX isn't supported yet");
+      return kInvalidFile;
     }
 
     zip64Info->uncompressed_file_size = uncompressedFileSize;
@@ -625,7 +614,8 @@
 }
 
 static int32_t ValidateDataDescriptor(MappedZipFile& mapped_zip, ZipEntry* entry) {
-  uint8_t ddBuf[sizeof(DataDescriptor) + sizeof(DataDescriptor::kOptSignature)];
+  // Maximum possible size for data descriptor: 2 * 4 + 2 * 8 = 24 bytes
+  uint8_t ddBuf[24];
   off64_t offset = entry->offset;
   if (entry->method != kCompressStored) {
     offset += entry->compressed_length;
@@ -638,18 +628,26 @@
   }
 
   const uint32_t ddSignature = *(reinterpret_cast<const uint32_t*>(ddBuf));
-  const uint16_t ddOffset = (ddSignature == DataDescriptor::kOptSignature) ? 4 : 0;
-  const DataDescriptor* descriptor = reinterpret_cast<const DataDescriptor*>(ddBuf + ddOffset);
+  uint8_t* ddReadPtr = (ddSignature == DataDescriptor::kOptSignature) ? ddBuf + 4 : ddBuf;
+  DataDescriptor descriptor{};
+  descriptor.crc32 = ConsumeUnaligned<uint32_t>(&ddReadPtr);
+  if (entry->zip64_format_size) {
+    descriptor.compressed_size = ConsumeUnaligned<uint64_t>(&ddReadPtr);
+    descriptor.uncompressed_size = ConsumeUnaligned<uint64_t>(&ddReadPtr);
+  } else {
+    descriptor.compressed_size = ConsumeUnaligned<uint32_t>(&ddReadPtr);
+    descriptor.uncompressed_size = ConsumeUnaligned<uint32_t>(&ddReadPtr);
+  }
 
   // Validate that the values in the data descriptor match those in the central
   // directory.
-  if (entry->compressed_length != descriptor->compressed_size ||
-      entry->uncompressed_length != descriptor->uncompressed_size ||
-      entry->crc32 != descriptor->crc32) {
+  if (entry->compressed_length != descriptor.compressed_size ||
+      entry->uncompressed_length != descriptor.uncompressed_size ||
+      entry->crc32 != descriptor.crc32) {
     ALOGW("Zip: size/crc32 mismatch. expected {%" PRIu32 ", %" PRIu32 ", %" PRIx32
-          "}, was {%" PRIu32 ", %" PRIu32 ", %" PRIx32 "}",
+          "}, was {%" PRIu64 ", %" PRIu64 ", %" PRIx32 "}",
           entry->compressed_length, entry->uncompressed_length, entry->crc32,
-          descriptor->compressed_size, descriptor->uncompressed_size, descriptor->crc32);
+          descriptor.compressed_size, descriptor.uncompressed_size, descriptor.crc32);
     return kInconsistentInformation;
   }
 
@@ -706,18 +704,14 @@
       return status;
     }
 
-    if (cdr->uncompressed_size == UINT32_MAX || cdr->compressed_size == UINT32_MAX) {
-      CHECK(zip64_info.uncompressed_file_size.has_value());
-      CHECK(zip64_info.compressed_file_size.has_value());
-      // TODO(xunchang) remove the size limit and support entry length > UINT32_MAX.
-      data->uncompressed_length = static_cast<uint32_t>(zip64_info.uncompressed_file_size.value());
-      data->compressed_length = static_cast<uint32_t>(zip64_info.compressed_file_size.value());
-    }
-
-    if (local_header_offset == UINT32_MAX) {
-      CHECK(zip64_info.local_header_offset.has_value());
-      local_header_offset = zip64_info.local_header_offset.value();
-    }
+    // TODO(xunchang) remove the size limit and support entry length > UINT32_MAX.
+    data->uncompressed_length =
+        static_cast<uint32_t>(zip64_info.uncompressed_file_size.value_or(cdr->uncompressed_size));
+    data->compressed_length =
+        static_cast<uint32_t>(zip64_info.compressed_file_size.value_or(cdr->compressed_size));
+    local_header_offset = zip64_info.local_header_offset.value_or(local_header_offset);
+    data->zip64_format_size =
+        cdr->uncompressed_size == UINT32_MAX || cdr->compressed_size == UINT32_MAX;
   }
 
   if (local_header_offset + static_cast<off64_t>(sizeof(LocalFileHeader)) >= cd_offset) {
@@ -766,6 +760,13 @@
   uint64_t lfh_uncompressed_size = lfh->uncompressed_size;
   uint64_t lfh_compressed_size = lfh->compressed_size;
   if (lfh_uncompressed_size == UINT32_MAX || lfh_compressed_size == UINT32_MAX) {
+    if (lfh_uncompressed_size != UINT32_MAX || lfh_compressed_size != UINT32_MAX) {
+      ALOGW(
+          "Zip: The zip64 extended field in the local header MUST include BOTH original and "
+          "compressed file size fields.");
+      return kInvalidFile;
+    }
+
     const off64_t lfh_extra_field_offset = name_offset + lfh->file_name_length;
     const uint16_t lfh_extra_field_size = lfh->extra_field_length;
     if (lfh_extra_field_offset > cd_offset - lfh_extra_field_size) {
diff --git a/libziparchive/zip_archive_common.h b/libziparchive/zip_archive_common.h
index a92d4d2..d461856 100644
--- a/libziparchive/zip_archive_common.h
+++ b/libziparchive/zip_archive_common.h
@@ -165,15 +165,24 @@
 
   // CRC-32 checksum of the entry.
   uint32_t crc32;
-  // Compressed size of the entry.
-  uint32_t compressed_size;
-  // Uncompressed size of the entry.
-  uint32_t uncompressed_size;
+
+  // For ZIP64 format archives, the compressed and uncompressed sizes are 8
+  // bytes each. Also, the ZIP64 format MAY be used regardless of the size
+  // of a file.  When extracting, if the zip64 extended information extra field
+  // is present for the file the compressed and uncompressed sizes will be 8
+  // byte values.
+
+  // Compressed size of the entry, the field can be either 4 bytes or 8 bytes
+  // in the zip file.
+  uint64_t compressed_size;
+  // Uncompressed size of the entry, the field can be either 4 bytes or 8 bytes
+  // in the zip file.
+  uint64_t uncompressed_size;
 
  private:
   DataDescriptor() = default;
   DISALLOW_COPY_AND_ASSIGN(DataDescriptor);
-} __attribute__((packed));
+};
 
 // The zip64 end of central directory locator helps to find the zip64 EOCD.
 struct Zip64EocdLocator {
diff --git a/libziparchive/zip_archive_private.h b/libziparchive/zip_archive_private.h
index 5f5232f..4ed07aa 100644
--- a/libziparchive/zip_archive_private.h
+++ b/libziparchive/zip_archive_private.h
@@ -28,6 +28,7 @@
 
 #include "android-base/macros.h"
 #include "android-base/mapped_file.h"
+#include "android-base/memory.h"
 #include "zip_cd_entry_map.h"
 #include "zip_error.h"
 
@@ -104,3 +105,20 @@
 
   bool InitializeCentralDirectory(off64_t cd_start_offset, size_t cd_size);
 };
+
+int32_t ExtractToWriter(ZipArchiveHandle handle, ZipEntry* entry, zip_archive::Writer* writer);
+
+// Reads the unaligned data of type |T| and auto increment the offset.
+template <typename T>
+static T ConsumeUnaligned(uint8_t** address) {
+  auto ret = android::base::get_unaligned<T>(*address);
+  *address += sizeof(T);
+  return ret;
+}
+
+// Writes the unaligned data of type |T| and auto increment the offset.
+template <typename T>
+void EmitUnaligned(uint8_t** address, T data) {
+  android::base::put_unaligned<T>(*address, data);
+  *address += sizeof(T);
+}
diff --git a/libziparchive/zip_archive_test.cc b/libziparchive/zip_archive_test.cc
index 69be3df..3563340 100644
--- a/libziparchive/zip_archive_test.cc
+++ b/libziparchive/zip_archive_test.cc
@@ -979,10 +979,11 @@
     std::vector<uint8_t> extended_field;
     // Fake data to mimic the compressed bytes in the zipfile.
     std::vector<uint8_t> compressed_bytes;
+    std::vector<uint8_t> data_descriptor;
 
     size_t GetSize() const {
       return local_file_header.size() + file_name.size() + extended_field.size() +
-             compressed_bytes.size();
+             compressed_bytes.size() + data_descriptor.size();
     }
 
     void CopyToOutput(std::vector<uint8_t>* output) const {
@@ -990,6 +991,7 @@
       std::copy(file_name.begin(), file_name.end(), std::back_inserter(*output));
       std::copy(extended_field.begin(), extended_field.end(), std::back_inserter(*output));
       std::copy(compressed_bytes.begin(), compressed_bytes.end(), std::back_inserter(*output));
+      std::copy(data_descriptor.begin(), data_descriptor.end(), std::back_inserter(*output));
     }
   };
 
@@ -1057,7 +1059,7 @@
   // Add an entry to the zipfile, construct the corresponding local header and cd entry.
   void AddEntry(const std::string& name, const std::vector<uint8_t>& content,
                 bool uncompressed_size_in_extended, bool compressed_size_in_extended,
-                bool local_offset_in_extended) {
+                bool local_offset_in_extended, bool include_data_descriptor = false) {
     auto uncompressed_size = static_cast<uint32_t>(content.size());
     auto compressed_size = static_cast<uint32_t>(content.size());
     uint32_t local_file_header_offset = 0;
@@ -1084,6 +1086,21 @@
     ConstructLocalFileHeader(name, &local_entry.local_file_header, uncompressed_size,
                              compressed_size);
     ConstructExtendedField(zip64_fields, &local_entry.extended_field);
+    if (include_data_descriptor) {
+      size_t descriptor_size = compressed_size_in_extended ? 24 : 16;
+      local_entry.data_descriptor.resize(descriptor_size);
+      uint8_t* write_ptr = local_entry.data_descriptor.data();
+      EmitUnaligned<uint32_t>(&write_ptr, DataDescriptor::kOptSignature);
+      EmitUnaligned<uint32_t>(&write_ptr, 0 /* crc */);
+      if (compressed_size_in_extended) {
+        EmitUnaligned<uint64_t>(&write_ptr, compressed_size_in_extended);
+        EmitUnaligned<uint64_t>(&write_ptr, uncompressed_size_in_extended);
+      } else {
+        EmitUnaligned<uint32_t>(&write_ptr, compressed_size_in_extended);
+        EmitUnaligned<uint32_t>(&write_ptr, uncompressed_size_in_extended);
+      }
+    }
+
     file_entries_.push_back(std::move(local_entry));
 
     if (local_offset_in_extended) {
@@ -1270,3 +1287,38 @@
       0, OpenArchiveFromMemory(zip_content_.data(), zip_content_.size(), "debug_zip64", &handle));
   CloseArchive(handle);
 }
+
+TEST_F(Zip64ParseTest, extract) {
+  std::vector<uint8_t> content(200, 'a');
+  AddEntry("a.txt", content, true, true, true);
+  ConstructEocd();
+  ConstructZipFile();
+
+  ZipArchiveHandle handle;
+  ASSERT_EQ(
+      0, OpenArchiveFromMemory(zip_content_.data(), zip_content_.size(), "debug_zip64", &handle));
+  ZipEntry entry;
+  ASSERT_EQ(0, FindEntry(handle, "a.txt", &entry));
+
+  VectorWriter writer;
+  ASSERT_EQ(0, ExtractToWriter(handle, &entry, &writer));
+  ASSERT_EQ(content, writer.GetOutput());
+}
+
+TEST_F(Zip64ParseTest, extractWithDataDescriptor) {
+  std::vector<uint8_t> content(300, 'b');
+  AddEntry("a.txt", std::vector<uint8_t>(200, 'a'), true, true, true);
+  AddEntry("b.txt", content, true, true, true, true /* data descriptor */);
+  ConstructEocd();
+  ConstructZipFile();
+
+  ZipArchiveHandle handle;
+  ASSERT_EQ(
+      0, OpenArchiveFromMemory(zip_content_.data(), zip_content_.size(), "debug_zip64", &handle));
+  ZipEntry entry;
+  ASSERT_EQ(0, FindEntry(handle, "b.txt", &entry));
+
+  VectorWriter writer;
+  ASSERT_EQ(0, ExtractToWriter(handle, &entry, &writer));
+  ASSERT_EQ(content, writer.GetOutput());
+}
diff --git a/libziparchive/zip_writer.cc b/libziparchive/zip_writer.cc
index 67279a6..25b1da4 100644
--- a/libziparchive/zip_writer.cc
+++ b/libziparchive/zip_writer.cc
@@ -475,19 +475,16 @@
   if (ShouldUseDataDescriptor()) {
     // Some versions of ZIP don't allow STORED data to have a trailing DataDescriptor.
     // If this file is not seekable, or if the data is compressed, write a DataDescriptor.
-    const uint32_t sig = DataDescriptor::kOptSignature;
-    if (fwrite(&sig, sizeof(sig), 1, file_) != 1) {
+    // We haven't supported zip64 format yet. Write both uncompressed size and compressed
+    // size as uint32_t.
+    std::vector<uint32_t> dataDescriptor = {
+        DataDescriptor::kOptSignature, current_file_entry_.crc32,
+        current_file_entry_.compressed_size, current_file_entry_.uncompressed_size};
+    if (fwrite(dataDescriptor.data(), dataDescriptor.size() * sizeof(uint32_t), 1, file_) != 1) {
       return HandleError(kIoError);
     }
 
-    DataDescriptor dd = {};
-    dd.crc32 = current_file_entry_.crc32;
-    dd.compressed_size = current_file_entry_.compressed_size;
-    dd.uncompressed_size = current_file_entry_.uncompressed_size;
-    if (fwrite(&dd, sizeof(dd), 1, file_) != 1) {
-      return HandleError(kIoError);
-    }
-    current_offset_ += sizeof(DataDescriptor::kOptSignature) + sizeof(dd);
+    current_offset_ += sizeof(uint32_t) * dataDescriptor.size();
   } else {
     // Seek back to the header and rewrite to include the size.
     if (fseeko(file_, current_file_entry_.local_file_header_offset, SEEK_SET) != 0) {