license metadata text notice files

Introduce the below command-line tool:

textnotice outputs a NOTICE text file constructed from the license
texts of the transitive closure of dependencies.

Bug: 68860345
Bug: 151177513
Bug: 151953481
Bug: 213388645
Bug: 210912771

Test: m all
Test: m systemlicense
Test: m textnotice; out/soong/host/linux-x85/textnotice ...

where ... is the path to the .meta_lic file for the system image. In my
case if

$ export PRODUCT=$(realpath $ANDROID_PRODUCT_OUT --relative-to=$PWD)

... can be expressed as:

${PRODUCT}/gen/META/lic_intermediates/${PRODUCT}/system.img.meta_lic

Change-Id: Ia691869fd8e58ef008024f48c23b1a4b4435677a
diff --git a/tools/compliance/noticeindex.go b/tools/compliance/noticeindex.go
new file mode 100644
index 0000000..06c2627
--- /dev/null
+++ b/tools/compliance/noticeindex.go
@@ -0,0 +1,527 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package compliance
+
+import (
+	"bufio"
+	"crypto/md5"
+	"fmt"
+	"io"
+	"io/fs"
+	"path/filepath"
+	"regexp"
+	"sort"
+	"strings"
+)
+
+const (
+	noProjectName = "\u2205"
+)
+
+var (
+	nameRegexp         = regexp.MustCompile(`^\s*name\s*:\s*"(.*)"\s*$`)
+	descRegexp         = regexp.MustCompile(`^\s*description\s*:\s*"(.*)"\s*$`)
+	versionRegexp      = regexp.MustCompile(`^\s*version\s*:\s*"(.*)"\s*$`)
+	licensesPathRegexp = regexp.MustCompile(`licen[cs]es?/`)
+)
+
+// NoticeIndex transforms license metadata into license text hashes, library
+// names, and install paths indexing them for fast lookup/iteration.
+type NoticeIndex struct {
+	// lg identifies the license graph to which the index applies.
+	lg *LicenseGraph
+	// rs identifies the set of resolutions upon which the index is based.
+	rs ResolutionSet
+	// shipped identifies the set of target nodes shipped directly or as derivative works.
+	shipped *TargetNodeSet
+	// rootFS locates the root of the file system from which to read the files.
+	rootFS fs.FS
+	// hash maps license text filenames to content hashes
+	hash map[string]hash
+	// text maps content hashes to content
+	text map[hash][]byte
+	// hashLibInstall maps hashes to libraries to install paths.
+	hashLibInstall map[hash]map[string]map[string]struct{}
+	// installLibHash maps install paths to libraries to hashes.
+	installLibHash map[string]map[string]map[hash]struct{}
+	// libHash maps libraries to hashes.
+	libHash map[string]map[hash]struct{}
+	// targetHash maps target nodes to hashes.
+	targetHashes map[*TargetNode]map[hash]struct{}
+	// projectName maps project directory names to project name text.
+	projectName map[string]string
+}
+
+// IndexLicenseTexts creates a hashed index of license texts for `lg` and `rs`
+// using the files rooted at `rootFS`.
+func IndexLicenseTexts(rootFS fs.FS, lg *LicenseGraph, rs ResolutionSet) (*NoticeIndex, error) {
+	if rs == nil {
+		rs = ResolveNotices(lg)
+	}
+	ni := &NoticeIndex{
+		lg, rs, ShippedNodes(lg), rootFS,
+		make(map[string]hash),
+		make(map[hash][]byte),
+		make(map[hash]map[string]map[string]struct{}),
+		make(map[string]map[string]map[hash]struct{}),
+		make(map[string]map[hash]struct{}),
+		make(map[*TargetNode]map[hash]struct{}),
+		make(map[string]string),
+	}
+
+	// index adds all license texts for `tn` to the index.
+	index := func(tn *TargetNode) (map[hash]struct{}, error) {
+		if hashes, ok := ni.targetHashes[tn]; ok {
+			return hashes, nil
+		}
+		hashes := make(map[hash]struct{})
+		for _, text := range tn.LicenseTexts() {
+			if _, ok := ni.hash[text]; !ok {
+				err := ni.addText(text)
+				if err != nil {
+					return nil, err
+				}
+			}
+			hash := ni.hash[text]
+			if _, ok := hashes[hash]; !ok {
+				hashes[hash] = struct{}{}
+			}
+		}
+		ni.targetHashes[tn] = hashes
+		return hashes, nil
+	}
+
+	link := func(libName string, hashes map[hash]struct{}, installPaths []string) {
+		if _, ok := ni.libHash[libName]; !ok {
+			ni.libHash[libName] = make(map[hash]struct{})
+		}
+		for h := range hashes {
+			if _, ok := ni.hashLibInstall[h]; !ok {
+				ni.hashLibInstall[h] = make(map[string]map[string]struct{})
+			}
+			if _, ok := ni.libHash[libName][h]; !ok {
+				ni.libHash[libName][h] = struct{}{}
+			}
+			for _, installPath := range installPaths {
+				if _, ok := ni.installLibHash[installPath]; !ok {
+					ni.installLibHash[installPath] = make(map[string]map[hash]struct{})
+					ni.installLibHash[installPath][libName] = make(map[hash]struct{})
+					ni.installLibHash[installPath][libName][h] = struct{}{}
+				} else if _, ok = ni.installLibHash[installPath][libName]; !ok {
+					ni.installLibHash[installPath][libName] = make(map[hash]struct{})
+					ni.installLibHash[installPath][libName][h] = struct{}{}
+				} else if _, ok = ni.installLibHash[installPath][libName][h]; !ok {
+					ni.installLibHash[installPath][libName][h] = struct{}{}
+				}
+				if _, ok := ni.hashLibInstall[h]; !ok {
+					ni.hashLibInstall[h] = make(map[string]map[string]struct{})
+					ni.hashLibInstall[h][libName] = make(map[string]struct{})
+					ni.hashLibInstall[h][libName][installPath] = struct{}{}
+				} else if _, ok = ni.hashLibInstall[h][libName]; !ok {
+					ni.hashLibInstall[h][libName] = make(map[string]struct{})
+					ni.hashLibInstall[h][libName][installPath] = struct{}{}
+				} else if _, ok = ni.hashLibInstall[h][libName][installPath]; !ok {
+					ni.hashLibInstall[h][libName][installPath] = struct{}{}
+				}
+			}
+		}
+	}
+
+	// returns error from walk below.
+	var err error
+
+	WalkTopDown(NoEdgeContext{}, lg, func(lg *LicenseGraph, tn *TargetNode, path TargetEdgePath) bool {
+		if err != nil {
+			return false
+		}
+		if !ni.shipped.Contains(tn) {
+			return false
+		}
+		installPaths := getInstallPaths(tn, path)
+		var hashes map[hash]struct{}
+		hashes, err = index(tn)
+		if err != nil {
+			return false
+		}
+		link(ni.getLibName(tn), hashes, installPaths)
+		if tn.IsContainer() {
+			return true
+		}
+
+		for _, r := range rs.Resolutions(tn) {
+			hashes, err = index(r.actsOn)
+			if err != nil {
+				return false
+			}
+			link(ni.getLibName(r.actsOn), hashes, installPaths)
+		}
+		return false
+	})
+
+	if err != nil {
+		return nil, err
+	}
+
+	return ni, nil
+}
+
+// Hashes returns an ordered channel of the hashed license texts.
+func (ni *NoticeIndex) Hashes() chan hash {
+	c := make(chan hash)
+	go func() {
+		libs := make([]string, 0, len(ni.libHash))
+		for libName := range ni.libHash {
+			libs = append(libs, libName)
+		}
+		sort.Strings(libs)
+		hashes := make(map[hash]struct{})
+		for _, libName := range libs {
+			hl := make([]hash, 0, len(ni.libHash[libName]))
+			for h := range ni.libHash[libName] {
+				if _, ok := hashes[h]; ok {
+					continue
+				}
+				hashes[h] = struct{}{}
+				hl = append(hl, h)
+			}
+			if len(hl) > 0 {
+				sort.Sort(hashList{ni, libName, &hl})
+				for _, h := range hl {
+					c <- h
+				}
+			}
+		}
+		close(c)
+	}()
+	return c
+}
+
+// HashLibs returns the ordered array of library names using the license text
+// hashed as `h`.
+func (ni *NoticeIndex) HashLibs(h hash) []string {
+	libs := make([]string, 0, len(ni.hashLibInstall[h]))
+	for libName := range ni.hashLibInstall[h] {
+		libs = append(libs, libName)
+	}
+	sort.Strings(libs)
+	return libs
+}
+
+// HashLibInstalls returns the ordered array of install paths referencing
+// library `libName` using the license text hashed as `h`.
+func (ni *NoticeIndex) HashLibInstalls(h hash, libName string) []string {
+	installs := make([]string, 0, len(ni.hashLibInstall[h][libName]))
+	for installPath := range ni.hashLibInstall[h][libName] {
+		installs = append(installs, installPath)
+	}
+	sort.Strings(installs)
+	return installs
+}
+
+// HashText returns the file content of the license text hashed as `h`.
+func (ni *NoticeIndex) HashText(h hash) []byte {
+	return ni.text[h]
+}
+
+// getLibName returns the name of the library associated with `noticeFor`.
+func (ni *NoticeIndex) getLibName(noticeFor *TargetNode) string {
+	// use name from METADATA if available
+	ln := ni.checkMetadata(noticeFor)
+	if len(ln) > 0 {
+		return ln
+	}
+	// use package_name: from license{} module if available
+	pn := noticeFor.PackageName()
+	if len(pn) > 0 {
+		return pn
+	}
+	for _, p := range noticeFor.Projects() {
+		if strings.HasPrefix(p, "prebuilts/") {
+			for _, licenseText := range noticeFor.LicenseTexts() {
+				if !strings.HasPrefix(licenseText, "prebuilts/") {
+					continue
+				}
+				for r, prefix := range SafePrebuiltPrefixes {
+					match := r.FindString(licenseText)
+					if len(match) == 0 {
+						continue
+					}
+					strip := SafePathPrefixes[prefix]
+					if strip {
+						// strip entire prefix
+						match = licenseText[len(match):]
+					} else {
+						// strip from prebuilts/ until safe prefix
+						match = licenseText[len(match)-len(prefix):]
+					}
+					// remove LICENSE or NOTICE or other filename
+					li := strings.LastIndex(match, "/")
+					if 0 < li {
+						match = match[:li]
+					}
+					// remove *licenses/ path segment and subdirectory if in path
+					if offsets := licensesPathRegexp.FindAllStringIndex(match, -1); offsets != nil && 0 < offsets[len(offsets)-1][0] {
+						match = match[:offsets[len(offsets)-1][0]]
+						li = strings.LastIndex(match, "/")
+						if 0 < li {
+							match = match[:li]
+						}
+					}
+					return match
+				}
+				break
+			}
+		}
+		for prefix, strip := range SafePathPrefixes {
+			if strings.HasPrefix(p, prefix) {
+				if strip {
+					return p[len(prefix):]
+				} else {
+					return p
+				}
+			}
+		}
+	}
+	// strip off [./]meta_lic from license metadata path and extract base name
+	n := noticeFor.name[:len(noticeFor.name)-9]
+	li := strings.LastIndex(n, "/")
+	if 0 < li {
+		n = n[li+1:]
+	}
+	return n
+}
+
+// checkMetadata tries to look up a library name from a METADATA file associated with `noticeFor`.
+func (ni *NoticeIndex) checkMetadata(noticeFor *TargetNode) string {
+	for _, p := range noticeFor.Projects() {
+		if name, ok := ni.projectName[p]; ok {
+			if name == noProjectName {
+				continue
+			}
+			return name
+		}
+		f, err := ni.rootFS.Open(filepath.Join(p, "METADATA"))
+		if err != nil {
+			ni.projectName[p] = noProjectName
+			continue
+		}
+		name := ""
+		description := ""
+		version := ""
+		s := bufio.NewScanner(f)
+		for s.Scan() {
+			line := s.Text()
+			m := nameRegexp.FindStringSubmatch(line)
+			if m != nil {
+				if 1 < len(m) && m[1] != "" {
+					name = m[1]
+				}
+				if version != "" {
+					break
+				}
+				continue
+			}
+			m = versionRegexp.FindStringSubmatch(line)
+			if m != nil {
+				if 1 < len(m) && m[1] != "" {
+					version = m[1]
+				}
+				if name != "" {
+					break
+				}
+				continue
+			}
+			m = descRegexp.FindStringSubmatch(line)
+			if m != nil {
+				if 1 < len(m) && m[1] != "" {
+					description = m[1]
+				}
+			}
+		}
+		_ = s.Err()
+		_ = f.Close()
+		if name != "" {
+			if version != "" {
+				if version[0] == 'v' || version[0] == 'V' {
+					ni.projectName[p] = name + "_" + version
+				} else {
+					ni.projectName[p] = name + "_v_" + version
+				}
+			} else {
+				ni.projectName[p] = name
+			}
+			return ni.projectName[p]
+		}
+		if description != "" {
+			ni.projectName[p] = description
+			return ni.projectName[p]
+		}
+		ni.projectName[p] = noProjectName
+	}
+	return ""
+}
+
+// addText reads and indexes the content of a license text file.
+func (ni *NoticeIndex) addText(file string) error {
+	f, err := ni.rootFS.Open(filepath.Clean(file))
+	if err != nil {
+		return fmt.Errorf("error opening license text file %q: %w", file, err)
+	}
+
+	// read the file
+	text, err := io.ReadAll(f)
+	if err != nil {
+		return fmt.Errorf("error reading license text file %q: %w", file, err)
+	}
+
+	hash := hash{fmt.Sprintf("%x", md5.Sum(text))}
+	ni.hash[file] = hash
+	if _, alreadyPresent := ni.text[hash]; !alreadyPresent {
+		ni.text[hash] = text
+	}
+
+	return nil
+}
+
+// getInstallPaths returns the names of the used dependencies mapped to their
+// installed locations.
+func getInstallPaths(attachesTo *TargetNode, path TargetEdgePath) []string {
+	if len(path) == 0 {
+		installs := attachesTo.Installed()
+		if 0 == len(installs) {
+			installs = attachesTo.Built()
+		}
+		return installs
+	}
+
+	var getInstalls func(path TargetEdgePath) []string
+
+	getInstalls = func(path TargetEdgePath) []string {
+		// deps contains the output targets from the dependencies in the path
+		var deps []string
+		if len(path) > 1 {
+			// recursively get the targets from the sub-path skipping 1 path segment
+			deps = getInstalls(path[1:])
+		} else {
+			// stop recursion at 1 path segment
+			deps = path[0].Dependency().TargetFiles()
+		}
+		size := 0
+		prefixes := path[0].Target().TargetFiles()
+		installMap := path[0].Target().InstallMap()
+		sources := path[0].Target().Sources()
+		for _, dep := range deps {
+			found := false
+			for _, source := range sources {
+				if strings.HasPrefix(dep, source) {
+					found = true
+					break
+				}
+			}
+			if !found {
+				continue
+			}
+			for _, im := range installMap {
+				if strings.HasPrefix(dep, im.FromPath) {
+					size += len(prefixes)
+					break
+				}
+			}
+		}
+
+		installs := make([]string, 0, size)
+		for _, dep := range deps {
+			found := false
+			for _, source := range sources {
+				if strings.HasPrefix(dep, source) {
+					found = true
+					break
+				}
+			}
+			if !found {
+				continue
+			}
+			for _, im := range installMap {
+				if strings.HasPrefix(dep, im.FromPath) {
+					for _, prefix := range prefixes {
+						installs = append(installs, prefix+im.ContainerPath+dep[len(im.FromPath):])
+					}
+					break
+				}
+			}
+		}
+		return installs
+	}
+	allInstalls := getInstalls(path)
+	installs := path[0].Target().Installed()
+	if len(installs) == 0 {
+		return allInstalls
+	}
+	result := make([]string, 0, len(allInstalls))
+	for _, install := range allInstalls {
+		for _, prefix := range installs {
+			if strings.HasPrefix(install, prefix) {
+				result = append(result, install)
+			}
+		}
+	}
+	return result
+}
+
+// hash is an opaque string derived from md5sum.
+type hash struct {
+	key string
+}
+
+// String returns the hexadecimal representation of the hash.
+func (h hash) String() string {
+	return h.key
+}
+
+// hashList orders an array of hashes
+type hashList struct {
+	ni      *NoticeIndex
+	libName string
+	hashes  *[]hash
+}
+
+// Len returns the count of elements in the slice.
+func (l hashList) Len() int { return len(*l.hashes) }
+
+// Swap rearranges 2 elements of the slice so that each occupies the other's
+// former position.
+func (l hashList) Swap(i, j int) { (*l.hashes)[i], (*l.hashes)[j] = (*l.hashes)[j], (*l.hashes)[i] }
+
+// Less returns true when the `i`th element is lexicographically less than
+// the `j`th element.
+func (l hashList) Less(i, j int) bool {
+	var insti, instj int
+	if 0 < len(l.libName) {
+		insti = len(l.ni.hashLibInstall[(*l.hashes)[i]][l.libName])
+		instj = len(l.ni.hashLibInstall[(*l.hashes)[j]][l.libName])
+	}
+	if insti == instj {
+		leni := len(l.ni.text[(*l.hashes)[i]])
+		lenj := len(l.ni.text[(*l.hashes)[j]])
+		if leni == lenj {
+			// all else equal, just order by hash value
+			return (*l.hashes)[i].key < (*l.hashes)[j].key
+		}
+		// put shortest texts first within same # of installs
+		return leni < lenj
+	}
+	// reverse order of # installs so that most popular appears first
+	return instj < insti
+}