Support storing SHA256 checksum for files in soong_zip

Add a -sha256 argument to soong_zip that computes the SHA256 checksum of
each file and stores it in the extra field of the file's header. The
checksum can then be used by downstream systems that use content
addressing.

Bug: 259513199
Test: zip_test.go
Test: soong_zip -o test.zip -D test_dir -sha256
Change-Id: I20e9f424bd0a4604f0dc7cc77bd65f10eb49a163
diff --git a/zip/cmd/main.go b/zip/cmd/main.go
index cbc73ed..def76aa 100644
--- a/zip/cmd/main.go
+++ b/zip/cmd/main.go
@@ -163,6 +163,7 @@
 	parallelJobs := flags.Int("parallel", runtime.NumCPU(), "number of parallel threads to use")
 	cpuProfile := flags.String("cpuprofile", "", "write cpu profile to file")
 	traceFile := flags.String("trace", "", "write trace to file")
+	sha256Checksum := flags.Bool("sha256", false, "add a zip header to each file containing its SHA256 digest")
 
 	flags.Var(&rootPrefix{}, "P", "path prefix within the zip at which to place files")
 	flags.Var(&listFiles{}, "l", "file containing list of files to zip")
@@ -224,6 +225,7 @@
 		WriteIfChanged:           *writeIfChanged,
 		StoreSymlinks:            *symlinks,
 		IgnoreMissingFiles:       *ignoreMissingFiles,
+		Sha256Checksum:           *sha256Checksum,
 	})
 	if err != nil {
 		fmt.Fprintln(os.Stderr, "error:", err.Error())
diff --git a/zip/zip.go b/zip/zip.go
index 955fe68..6f1a8ad 100644
--- a/zip/zip.go
+++ b/zip/zip.go
@@ -17,8 +17,11 @@
 import (
 	"bytes"
 	"compress/flate"
+	"crypto/sha256"
+	"encoding/binary"
 	"errors"
 	"fmt"
+	"hash"
 	"hash/crc32"
 	"io"
 	"io/ioutil"
@@ -38,6 +41,14 @@
 	"android/soong/third_party/zip"
 )
 
+// Sha256HeaderID is a custom Header ID for the `extra` field in
+// the file header to store the SHA checksum.
+const Sha256HeaderID = 0x4967
+
+// Sha256HeaderSignature is the signature to verify that the extra
+// data block is used to store the SHA checksum.
+const Sha256HeaderSignature = 0x9514
+
 // Block size used during parallel compression of a single file.
 const parallelBlockSize = 1 * 1024 * 1024 // 1MB
 
@@ -231,6 +242,8 @@
 
 	stderr io.Writer
 	fs     pathtools.FileSystem
+
+	sha256Checksum bool
 }
 
 type zipEntry struct {
@@ -257,6 +270,7 @@
 	WriteIfChanged           bool
 	StoreSymlinks            bool
 	IgnoreMissingFiles       bool
+	Sha256Checksum           bool
 
 	Stderr     io.Writer
 	Filesystem pathtools.FileSystem
@@ -280,6 +294,7 @@
 		ignoreMissingFiles: args.IgnoreMissingFiles,
 		stderr:             args.Stderr,
 		fs:                 args.Filesystem,
+		sha256Checksum:     args.Sha256Checksum,
 	}
 
 	if z.fs == nil {
@@ -782,15 +797,17 @@
 		// this based on actual buffer sizes in RateLimit.
 		ze.futureReaders = make(chan chan io.Reader, (fileSize/parallelBlockSize)+1)
 
-		// Calculate the CRC in the background, since reading the entire
-		// file could take a while.
+		// Calculate the CRC and SHA256 in the background, since reading
+		// the entire file could take a while.
 		//
 		// We could split this up into chunks as well, but it's faster
 		// than the compression. Due to the Go Zip API, we also need to
 		// know the result before we can begin writing the compressed
 		// data out to the zipfile.
+		//
+		// We calculate SHA256 only if `-sha256` is set.
 		wg.Add(1)
-		go z.crcFile(r, ze, compressChan, wg)
+		go z.checksumFileAsync(r, ze, compressChan, wg)
 
 		for start := int64(0); start < fileSize; start += parallelBlockSize {
 			sr := io.NewSectionReader(r, start, parallelBlockSize)
@@ -829,20 +846,53 @@
 	return nil
 }
 
-func (z *ZipWriter) crcFile(r io.Reader, ze *zipEntry, resultChan chan *zipEntry, wg *sync.WaitGroup) {
+func (z *ZipWriter) checksumFileAsync(r io.ReadSeeker, ze *zipEntry, resultChan chan *zipEntry, wg *sync.WaitGroup) {
 	defer wg.Done()
 	defer z.cpuRateLimiter.Finish()
 
+	z.checksumFile(r, ze)
+
+	resultChan <- ze
+	close(resultChan)
+}
+
+func (z *ZipWriter) checksumFile(r io.ReadSeeker, ze *zipEntry) {
 	crc := crc32.NewIEEE()
-	_, err := io.Copy(crc, r)
+	writers := []io.Writer{crc}
+
+	var shaHasher hash.Hash
+	if z.sha256Checksum && !ze.fh.Mode().IsDir() {
+		shaHasher = sha256.New()
+		writers = append(writers, shaHasher)
+	}
+
+	w := io.MultiWriter(writers...)
+
+	_, err := io.Copy(w, r)
 	if err != nil {
 		z.errors <- err
 		return
 	}
 
 	ze.fh.CRC32 = crc.Sum32()
-	resultChan <- ze
-	close(resultChan)
+	if shaHasher != nil {
+		z.appendSHAToExtra(ze, shaHasher.Sum(nil))
+	}
+}
+
+func (z *ZipWriter) appendSHAToExtra(ze *zipEntry, checksum []byte) {
+	// Append a little-endian extra-field block to ze.fh.Extra, consisting of:
+	// - Header ID, equal to Sha256HeaderID (2 bytes)
+	// - Data size, the length of the data block below (2 bytes)
+	// - Data block:
+	//   - Signature, equal to Sha256HeaderSignature (2 bytes)
+	//   - Data, the SHA checksum value
+	var buf []byte
+	buf = binary.LittleEndian.AppendUint16(buf, Sha256HeaderID)
+	buf = binary.LittleEndian.AppendUint16(buf, uint16(len(checksum)+2))
+	buf = binary.LittleEndian.AppendUint16(buf, Sha256HeaderSignature)
+	buf = append(buf, checksum...)
+	ze.fh.Extra = append(ze.fh.Extra, buf...)
 }
 
 func (z *ZipWriter) compressPartialFile(r io.Reader, dict []byte, last bool, resultChan chan io.Reader, wg *sync.WaitGroup) {
@@ -894,17 +944,9 @@
 }
 
 func (z *ZipWriter) compressWholeFile(ze *zipEntry, r io.ReadSeeker, compressChan chan *zipEntry) {
+	z.checksumFile(r, ze)
 
-	crc := crc32.NewIEEE()
-	_, err := io.Copy(crc, r)
-	if err != nil {
-		z.errors <- err
-		return
-	}
-
-	ze.fh.CRC32 = crc.Sum32()
-
-	_, err = r.Seek(0, 0)
+	_, err := r.Seek(0, 0)
 	if err != nil {
 		z.errors <- err
 		return
diff --git a/zip/zip_test.go b/zip/zip_test.go
index c4832dc..e7fdea8 100644
--- a/zip/zip_test.go
+++ b/zip/zip_test.go
@@ -16,6 +16,7 @@
 
 import (
 	"bytes"
+	"encoding/hex"
 	"hash/crc32"
 	"io"
 	"os"
@@ -35,6 +36,10 @@
 	fileEmpty    = []byte("")
 	fileManifest = []byte("Manifest-Version: 1.0\nCreated-By: soong_zip\n\n")
 
+	sha256FileA = "d53eda7a637c99cc7fb566d96e9fa109bf15c478410a3f5eb4d4c4e26cd081f6"
+	sha256FileB = "430c56c5818e62bcb6d478901ef86284e97714c138f3c86aa14fd6a84b7ce5d3"
+	sha256FileC = "31c5ab6111f1d6aa13c2c4e92bb3c0f7c76b61b42d141af1e846eb7f6586a51c"
+
 	fileCustomManifest  = []byte("Custom manifest: true\n")
 	customManifestAfter = []byte("Manifest-Version: 1.0\nCreated-By: soong_zip\nCustom manifest: true\n\n")
 )
@@ -67,6 +72,20 @@
 	}
 }
 
+func fhWithSHA256(name string, contents []byte, method uint16, sha256 string) zip.FileHeader {
+	h := fh(name, contents, method)
+	// The extra field contains 38 bytes: 2 bytes of header ID, 2 bytes of
+	// size, 2 bytes of signature, and 32 bytes of checksum data.
+	var extra [38]byte
+	// The first 6 bytes contain Sha256HeaderID (0x4967), size (uint16(34)) and
+	// Sha256HeaderSignature (0x9514), each in little-endian byte order.
+	copy(extra[0:], []byte{103, 73, 34, 0, 20, 149})
+	sha256Bytes, _ := hex.DecodeString(sha256)
+	copy(extra[6:], sha256Bytes)
+	h.Extra = append(h.Extra, extra[:]...)
+	return h
+}
+
 func fhManifest(contents []byte) zip.FileHeader {
 	return zip.FileHeader{
 		Name:               "META-INF/MANIFEST.MF",
@@ -87,13 +106,18 @@
 	}
 }
 
-func fhDir(name string) zip.FileHeader {
+type fhDirOptions struct {
+	extra []byte
+}
+
+func fhDir(name string, opts fhDirOptions) zip.FileHeader {
 	return zip.FileHeader{
 		Name:               name,
 		Method:             zip.Store,
 		CRC32:              crc32.ChecksumIEEE(nil),
 		UncompressedSize64: 0,
 		ExternalAttrs:      (syscall.S_IFDIR|0755)<<16 | 0x10,
+		Extra:              opts.extra,
 	}
 }
 
@@ -114,6 +138,7 @@
 		manifest           string
 		storeSymlinks      bool
 		ignoreMissingFiles bool
+		sha256Checksum     bool
 
 		files []zip.FileHeader
 		err   error
@@ -320,10 +345,10 @@
 			emulateJar:       true,
 
 			files: []zip.FileHeader{
-				fhDir("META-INF/"),
+				fhDir("META-INF/", fhDirOptions{extra: []byte{254, 202, 0, 0}}),
 				fhManifest(fileManifest),
-				fhDir("a/"),
-				fhDir("a/a/"),
+				fhDir("a/", fhDirOptions{}),
+				fhDir("a/a/", fhDirOptions{}),
 				fh("a/a/a", fileA, zip.Deflate),
 				fh("a/a/b", fileB, zip.Deflate),
 			},
@@ -338,10 +363,10 @@
 			manifest:         "manifest.txt",
 
 			files: []zip.FileHeader{
-				fhDir("META-INF/"),
+				fhDir("META-INF/", fhDirOptions{extra: []byte{254, 202, 0, 0}}),
 				fhManifest(customManifestAfter),
-				fhDir("a/"),
-				fhDir("a/a/"),
+				fhDir("a/", fhDirOptions{}),
+				fhDir("a/a/", fhDirOptions{}),
 				fh("a/a/a", fileA, zip.Deflate),
 				fh("a/a/b", fileB, zip.Deflate),
 			},
@@ -355,8 +380,8 @@
 			dirEntries:       true,
 
 			files: []zip.FileHeader{
-				fhDir("a/"),
-				fhDir("a/a/"),
+				fhDir("a/", fhDirOptions{}),
+				fhDir("a/a/", fhDirOptions{}),
 				fh("a/a/a", fileA, zip.Deflate),
 				fh("a/a/b", fileB, zip.Deflate),
 			},
@@ -412,6 +437,23 @@
 				fh("a/a/a", fileA, zip.Deflate),
 			},
 		},
+		{
+			name: "generate SHA256 checksum",
+			args: fileArgsBuilder().
+				File("a/a/a").
+				File("a/a/b").
+				File("a/a/c").
+				File("c"),
+			compressionLevel: 9,
+			sha256Checksum:   true,
+
+			files: []zip.FileHeader{
+				fhWithSHA256("a/a/a", fileA, zip.Deflate, sha256FileA),
+				fhWithSHA256("a/a/b", fileB, zip.Deflate, sha256FileB),
+				fhWithSHA256("a/a/c", fileC, zip.Deflate, sha256FileC),
+				fhWithSHA256("c", fileC, zip.Deflate, sha256FileC),
+			},
+		},
 
 		// errors
 		{
@@ -465,6 +507,7 @@
 			args.ManifestSourcePath = test.manifest
 			args.StoreSymlinks = test.storeSymlinks
 			args.IgnoreMissingFiles = test.ignoreMissingFiles
+			args.Sha256Checksum = test.sha256Checksum
 			args.Filesystem = mockFs
 			args.Stderr = &bytes.Buffer{}
 
@@ -555,6 +598,11 @@
 					t.Errorf("incorrect file %s method want %v got %v", want.Name,
 						want.Method, got.Method)
 				}
+
+				if !bytes.Equal(want.Extra, got.Extra) {
+					t.Errorf("incorrect file %s extra want %v got %v", want.Name,
+						want.Extra, got.Extra)
+				}
 			}
 		})
 	}