compliance package structures for license metadata

Add a package to read, consume, and analyze license metadata and the
dependency graph.

Bug: 68860345
Bug: 151177513
Bug: 151953481
Change-Id: I3ebf44e4d5195b9851fd076161049bf82ed76dd2
diff --git a/tools/compliance/readgraph.go b/tools/compliance/readgraph.go
new file mode 100644
index 0000000..0b5ebfe
--- /dev/null
+++ b/tools/compliance/readgraph.go
@@ -0,0 +1,259 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package compliance
+
+import (
+	"fmt"
+	"io"
+	"io/fs"
+	"strings"
+	"sync"
+
+	"android/soong/compliance/license_metadata_proto"
+
+	"google.golang.org/protobuf/encoding/prototext"
+)
+
+var (
+	// ConcurrentReaders is the size of the task pool that limits resource usage, e.g. open files; ReadLicenseGraph rejects values below 1.
+	ConcurrentReaders = 5
+)
+
+// result is the outcome of reading and parsing one license metadata file (field order matters: readFile uses unkeyed literals).
+type result struct {
+	// file identifies the path to the license metadata file
+	file string
+
+	// target contains the parsed metadata, or nil when err is set
+	target *TargetNode
+
+	// edges contains the parsed dependencies of file, each as an edge starting at file; nil when err is set
+	edges []*dependencyEdge
+
+	// err is nil unless opening, reading or parsing the file failed
+	err error
+}
+
+// receiver coordinates the tasks for reading and parsing license metadata files.
+type receiver struct {
+	// lg accumulates the read metadata and becomes the final resulting LicenseGraph.
+	lg *LicenseGraph
+
+	// rootFS is the file system from which the license metadata files are opened.
+	rootFS fs.FS
+
+	// stderr identifies the error output writer.
+	stderr io.Writer
+
+	// task is a fixed-size pool of tokens limiting concurrent open files etc.; a reading task takes one token to start.
+	task chan bool
+
+	// results carries one license metadata file result at a time back to ReadLicenseGraph.
+	results chan *result
+
+	// wg counts the scheduled reading tasks so that completion of all of them can be detected.
+	wg sync.WaitGroup
+}
+
+// ReadLicenseGraph reads and parses `files` and their dependencies into a LicenseGraph.
+//
+// `files` become the root files of the graph for top-down walks of the graph.
+func ReadLicenseGraph(rootFS fs.FS, stderr io.Writer, files []string) (*LicenseGraph, error) {
+	if len(files) == 0 {
+		return nil, fmt.Errorf("no license metadata to analyze")
+	}
+	if ConcurrentReaders < 1 {
+		return nil, fmt.Errorf("need at least one task in pool")
+	}
+
+	lg := newLicenseGraph()
+	for _, f := range files {
+		if strings.HasSuffix(f, ".meta_lic") {
+			lg.rootFiles = append(lg.rootFiles, f)
+		} else {
+			lg.rootFiles = append(lg.rootFiles, f+".meta_lic")
+		}
+	}
+
+	recv := &receiver{
+		lg:      lg,
+		rootFS:  rootFS,
+		stderr:  stderr,
+		task:    make(chan bool, ConcurrentReaders),
+		results: make(chan *result, ConcurrentReaders),
+		wg:      sync.WaitGroup{},
+	}
+	for i := 0; i < ConcurrentReaders; i++ {
+		recv.task <- true
+	}
+
+	readFiles := func() {
+		lg.mu.Lock()
+		// identify the metadata files to schedule reading tasks for
+		for _, f := range lg.rootFiles {
+			lg.targets[f] = nil
+		}
+		lg.mu.Unlock()
+
+		// schedule tasks to read the files
+		for _, f := range lg.rootFiles {
+			readFile(recv, f)
+		}
+
+		// schedule a task to wait until finished and close the channel.
+		go func() {
+			recv.wg.Wait()
+			close(recv.task)
+			close(recv.results)
+		}()
+	}
+	go readFiles()
+
+	// tasks to read license metadata files are scheduled; read and process results from channel
+	var err error
+	for recv.results != nil {
+		select {
+		case r, ok := <-recv.results:
+			if ok {
+				// handle errors by nil'ing ls, setting err, and clobbering results channel
+				if r.err != nil {
+					err = r.err
+					fmt.Fprintf(recv.stderr, "%s\n", err.Error())
+					lg = nil
+					recv.results = nil
+					continue
+				}
+
+				// record the parsed metadata (guarded by mutex)
+				recv.lg.mu.Lock()
+				recv.lg.targets[r.file] = r.target
+				if len(r.edges) > 0 {
+					recv.lg.edges = append(recv.lg.edges, r.edges...)
+				}
+				recv.lg.mu.Unlock()
+			} else {
+				// finished -- nil the results channel
+				recv.results = nil
+			}
+		}
+	}
+
+	return lg, err
+
+}
+
+// targetNode contains the license metadata for a node in the license graph. NOTE(review): readFile builds TargetNode, not this type -- confirm it is still used.
+type targetNode struct {
+	proto license_metadata_proto.LicenseMetadata
+
+	// name is the path to the metadata file
+	name string
+}
+
+// dependencyEdge describes a single edge in the license graph, leading from `target` to `dependency`.
+type dependencyEdge struct {
+	// target identifies the target node being built and/or installed.
+	target string
+
+	// dependency identifies the target node being depended on, by the file name recorded in the metadata.
+	//
+	// i.e. `dependency` is necessary to build `target`.
+	dependency string
+
+	// annotations is the set of non-empty text attributes attached to the edge.
+	//
+	// Policy prescribes meaning to a limited set of annotations; others
+	// are preserved and ignored.
+	annotations TargetEdgeAnnotations
+}
+
+// addDependencies appends to `edges` one dependencyEdge from `target` to each file
+// named in the proto `dependencies`; it fails on an entry with no file name.
+func addDependencies(edges *[]*dependencyEdge, target string, dependencies []*license_metadata_proto.AnnotatedDependency) error {
+	for _, dep := range dependencies {
+		depFile := dep.GetFile()
+		if depFile == "" {
+			return fmt.Errorf("missing dependency name")
+		}
+		edgeAnnotations := newEdgeAnnotations()
+		for _, text := range dep.Annotations {
+			if text != "" { // empty annotations are dropped
+				edgeAnnotations.annotations[text] = true
+			}
+		}
+		*edges = append(*edges, &dependencyEdge{target: target, dependency: depFile, annotations: edgeAnnotations})
+	}
+	return nil
+}
+
+// readFile is a task to read and parse a single license metadata file, and to schedule
+// additional tasks for reading and parsing dependencies as necessary.
+//
+// The call blocks until the pool has a free task; the work itself runs in a new goroutine.
+// NOTE(review): error paths return without releasing the task or calling wg.Done -- confirm intended.
+func readFile(recv *receiver, file string) {
+	recv.wg.Add(1)
+	<-recv.task
+	go func() {
+		f, err := recv.rootFS.Open(file)
+		if err != nil {
+			recv.results <- &result{file, nil, nil, fmt.Errorf("error opening license metadata %q: %w", file, err)}
+			return
+		}
+		// read the file, then close it at once so it is never held open while blocked on a channel
+		data, err := io.ReadAll(f)
+		f.Close() // opened for reading only: a failed close loses no data, so its error is ignored
+		if err != nil {
+			recv.results <- &result{file, nil, nil, fmt.Errorf("error reading license metadata %q: %w", file, err)}
+			return
+		}
+
+		tn := &TargetNode{name: file}
+		err = prototext.Unmarshal(data, &tn.proto)
+		if err != nil {
+			recv.results <- &result{file, nil, nil, fmt.Errorf("error license metadata %q: %w", file, err)}
+			return
+		}
+		edges := []*dependencyEdge{}
+		err = addDependencies(&edges, file, tn.proto.Deps)
+		if err != nil {
+			recv.results <- &result{file, nil, nil, fmt.Errorf("error license metadata dependency %q: %w", file, err)}
+			return
+		}
+		tn.proto.Deps = []*license_metadata_proto.AnnotatedDependency{}
+
+		// send result for this file and release task before scheduling dependencies,
+		// but do not signal done to WaitGroup until dependencies are scheduled.
+		recv.results <- &result{file, tn, edges, nil}
+		recv.task <- true
+
+		// schedule tasks as necessary to read dependencies
+		for _, e := range edges {
+			// decide, signal and record whether to schedule task in critical section
+			recv.lg.mu.Lock()
+			_, alreadyScheduled := recv.lg.targets[e.dependency]
+			if !alreadyScheduled {
+				recv.lg.targets[e.dependency] = nil
+			}
+			recv.lg.mu.Unlock()
+			// schedule task to read dependency file outside critical section
+			if !alreadyScheduled {
+				readFile(recv, e.dependency)
+			}
+		}
+		// signal task done after scheduling dependencies
+		recv.wg.Done()
+	}()
+}