blob: 72f04ff879118b931dfdb98ee71695dddeccd5e4 [file] [log] [blame]
Bob Badoure6fdd142021-12-09 22:10:43 -08001// Copyright 2021 Google LLC
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package compliance
16
17import (
18 "bufio"
19 "crypto/md5"
20 "fmt"
21 "io"
22 "io/fs"
Bob Badour5028abc2022-02-09 11:56:58 -080023 "net/url"
Bob Badoure6fdd142021-12-09 22:10:43 -080024 "path/filepath"
25 "regexp"
26 "sort"
27 "strings"
28)
29
30const (
31 noProjectName = "\u2205"
32)
33
34var (
35 nameRegexp = regexp.MustCompile(`^\s*name\s*:\s*"(.*)"\s*$`)
36 descRegexp = regexp.MustCompile(`^\s*description\s*:\s*"(.*)"\s*$`)
37 versionRegexp = regexp.MustCompile(`^\s*version\s*:\s*"(.*)"\s*$`)
38 licensesPathRegexp = regexp.MustCompile(`licen[cs]es?/`)
39)
40
41// NoticeIndex transforms license metadata into license text hashes, library
42// names, and install paths indexing them for fast lookup/iteration.
43type NoticeIndex struct {
44 // lg identifies the license graph to which the index applies.
45 lg *LicenseGraph
46 // rs identifies the set of resolutions upon which the index is based.
47 rs ResolutionSet
48 // shipped identifies the set of target nodes shipped directly or as derivative works.
49 shipped *TargetNodeSet
50 // rootFS locates the root of the file system from which to read the files.
51 rootFS fs.FS
52 // hash maps license text filenames to content hashes
53 hash map[string]hash
54 // text maps content hashes to content
55 text map[hash][]byte
56 // hashLibInstall maps hashes to libraries to install paths.
57 hashLibInstall map[hash]map[string]map[string]struct{}
Bob Badour6ea14572022-01-23 17:15:46 -080058 // installHashLib maps install paths to libraries to hashes.
59 installHashLib map[string]map[hash]map[string]struct{}
Bob Badoure6fdd142021-12-09 22:10:43 -080060 // libHash maps libraries to hashes.
61 libHash map[string]map[hash]struct{}
62 // targetHash maps target nodes to hashes.
63 targetHashes map[*TargetNode]map[hash]struct{}
64 // projectName maps project directory names to project name text.
65 projectName map[string]string
Colin Crossbb45f8c2022-01-28 15:18:19 -080066 // files lists all the files accessed during indexing
67 files []string
Bob Badoure6fdd142021-12-09 22:10:43 -080068}
69
70// IndexLicenseTexts creates a hashed index of license texts for `lg` and `rs`
71// using the files rooted at `rootFS`.
72func IndexLicenseTexts(rootFS fs.FS, lg *LicenseGraph, rs ResolutionSet) (*NoticeIndex, error) {
73 if rs == nil {
74 rs = ResolveNotices(lg)
75 }
76 ni := &NoticeIndex{
Colin Crossbb45f8c2022-01-28 15:18:19 -080077 lg: lg,
78 rs: rs,
79 shipped: ShippedNodes(lg),
80 rootFS: rootFS,
81 hash: make(map[string]hash),
82 text: make(map[hash][]byte),
83 hashLibInstall: make(map[hash]map[string]map[string]struct{}),
84 installHashLib: make(map[string]map[hash]map[string]struct{}),
85 libHash: make(map[string]map[hash]struct{}),
86 targetHashes: make(map[*TargetNode]map[hash]struct{}),
87 projectName: make(map[string]string),
Bob Badoure6fdd142021-12-09 22:10:43 -080088 }
89
90 // index adds all license texts for `tn` to the index.
91 index := func(tn *TargetNode) (map[hash]struct{}, error) {
92 if hashes, ok := ni.targetHashes[tn]; ok {
93 return hashes, nil
94 }
95 hashes := make(map[hash]struct{})
96 for _, text := range tn.LicenseTexts() {
Bob Badour5028abc2022-02-09 11:56:58 -080097 fname := strings.SplitN(text, ":", 2)[0]
98 if _, ok := ni.hash[fname]; !ok {
99 err := ni.addText(fname)
Bob Badoure6fdd142021-12-09 22:10:43 -0800100 if err != nil {
101 return nil, err
102 }
103 }
Bob Badour5028abc2022-02-09 11:56:58 -0800104 hash := ni.hash[fname]
Bob Badoure6fdd142021-12-09 22:10:43 -0800105 if _, ok := hashes[hash]; !ok {
106 hashes[hash] = struct{}{}
107 }
108 }
109 ni.targetHashes[tn] = hashes
110 return hashes, nil
111 }
112
Bob Badour5028abc2022-02-09 11:56:58 -0800113 link := func(tn *TargetNode, hashes map[hash]struct{}, installPaths []string) {
Bob Badoure6fdd142021-12-09 22:10:43 -0800114 for h := range hashes {
Bob Badour5028abc2022-02-09 11:56:58 -0800115 libName := ni.getLibName(tn, h)
116 if _, ok := ni.libHash[libName]; !ok {
117 ni.libHash[libName] = make(map[hash]struct{})
118 }
Bob Badoure6fdd142021-12-09 22:10:43 -0800119 if _, ok := ni.hashLibInstall[h]; !ok {
120 ni.hashLibInstall[h] = make(map[string]map[string]struct{})
121 }
122 if _, ok := ni.libHash[libName][h]; !ok {
123 ni.libHash[libName][h] = struct{}{}
124 }
125 for _, installPath := range installPaths {
Bob Badour6ea14572022-01-23 17:15:46 -0800126 if _, ok := ni.installHashLib[installPath]; !ok {
127 ni.installHashLib[installPath] = make(map[hash]map[string]struct{})
128 ni.installHashLib[installPath][h] = make(map[string]struct{})
129 ni.installHashLib[installPath][h][libName] = struct{}{}
130 } else if _, ok = ni.installHashLib[installPath][h]; !ok {
131 ni.installHashLib[installPath][h] = make(map[string]struct{})
132 ni.installHashLib[installPath][h][libName] = struct{}{}
133 } else if _, ok = ni.installHashLib[installPath][h][libName]; !ok {
134 ni.installHashLib[installPath][h][libName] = struct{}{}
Bob Badoure6fdd142021-12-09 22:10:43 -0800135 }
136 if _, ok := ni.hashLibInstall[h]; !ok {
137 ni.hashLibInstall[h] = make(map[string]map[string]struct{})
138 ni.hashLibInstall[h][libName] = make(map[string]struct{})
139 ni.hashLibInstall[h][libName][installPath] = struct{}{}
140 } else if _, ok = ni.hashLibInstall[h][libName]; !ok {
141 ni.hashLibInstall[h][libName] = make(map[string]struct{})
142 ni.hashLibInstall[h][libName][installPath] = struct{}{}
143 } else if _, ok = ni.hashLibInstall[h][libName][installPath]; !ok {
144 ni.hashLibInstall[h][libName][installPath] = struct{}{}
145 }
146 }
147 }
148 }
149
150 // returns error from walk below.
151 var err error
152
153 WalkTopDown(NoEdgeContext{}, lg, func(lg *LicenseGraph, tn *TargetNode, path TargetEdgePath) bool {
154 if err != nil {
155 return false
156 }
157 if !ni.shipped.Contains(tn) {
158 return false
159 }
160 installPaths := getInstallPaths(tn, path)
161 var hashes map[hash]struct{}
162 hashes, err = index(tn)
163 if err != nil {
164 return false
165 }
Bob Badour5028abc2022-02-09 11:56:58 -0800166 link(tn, hashes, installPaths)
Bob Badoure6fdd142021-12-09 22:10:43 -0800167 if tn.IsContainer() {
168 return true
169 }
170
171 for _, r := range rs.Resolutions(tn) {
172 hashes, err = index(r.actsOn)
173 if err != nil {
174 return false
175 }
Bob Badour5028abc2022-02-09 11:56:58 -0800176 link(r.actsOn, hashes, installPaths)
Bob Badoure6fdd142021-12-09 22:10:43 -0800177 }
178 return false
179 })
180
181 if err != nil {
182 return nil, err
183 }
184
185 return ni, nil
186}
187
188// Hashes returns an ordered channel of the hashed license texts.
189func (ni *NoticeIndex) Hashes() chan hash {
190 c := make(chan hash)
191 go func() {
192 libs := make([]string, 0, len(ni.libHash))
193 for libName := range ni.libHash {
194 libs = append(libs, libName)
195 }
196 sort.Strings(libs)
197 hashes := make(map[hash]struct{})
198 for _, libName := range libs {
199 hl := make([]hash, 0, len(ni.libHash[libName]))
200 for h := range ni.libHash[libName] {
201 if _, ok := hashes[h]; ok {
202 continue
203 }
204 hashes[h] = struct{}{}
205 hl = append(hl, h)
206 }
207 if len(hl) > 0 {
Bob Badour6ea14572022-01-23 17:15:46 -0800208 sort.Sort(hashList{ni, libName, "", &hl})
Bob Badoure6fdd142021-12-09 22:10:43 -0800209 for _, h := range hl {
210 c <- h
211 }
212 }
213 }
214 close(c)
215 }()
216 return c
217}
218
Colin Crossbb45f8c2022-01-28 15:18:19 -0800219// InputNoticeFiles returns the list of files that were hashed during IndexLicenseTexts.
220func (ni *NoticeIndex) InputNoticeFiles() []string {
221 files := append([]string(nil), ni.files...)
222 sort.Strings(files)
223 return files
224}
225
Bob Badoure6fdd142021-12-09 22:10:43 -0800226// HashLibs returns the ordered array of library names using the license text
227// hashed as `h`.
228func (ni *NoticeIndex) HashLibs(h hash) []string {
229 libs := make([]string, 0, len(ni.hashLibInstall[h]))
230 for libName := range ni.hashLibInstall[h] {
231 libs = append(libs, libName)
232 }
233 sort.Strings(libs)
234 return libs
235}
236
237// HashLibInstalls returns the ordered array of install paths referencing
238// library `libName` using the license text hashed as `h`.
239func (ni *NoticeIndex) HashLibInstalls(h hash, libName string) []string {
240 installs := make([]string, 0, len(ni.hashLibInstall[h][libName]))
241 for installPath := range ni.hashLibInstall[h][libName] {
242 installs = append(installs, installPath)
243 }
244 sort.Strings(installs)
245 return installs
246}
247
Bob Badour6ea14572022-01-23 17:15:46 -0800248// InstallPaths returns the ordered channel of indexed install paths.
249func (ni *NoticeIndex) InstallPaths() chan string {
250 c := make(chan string)
251 go func() {
252 paths := make([]string, 0, len(ni.installHashLib))
253 for path := range ni.installHashLib {
254 paths = append(paths, path)
255 }
256 sort.Strings(paths)
257 for _, installPath := range paths {
258 c <- installPath
259 }
260 close(c)
261 }()
262 return c
263}
264
265// InstallHashes returns the ordered array of hashes attached to `installPath`.
266func (ni *NoticeIndex) InstallHashes(installPath string) []hash {
267 result := make([]hash, 0, len(ni.installHashLib[installPath]))
268 for h := range ni.installHashLib[installPath] {
269 result = append(result, h)
270 }
271 if len(result) > 0 {
272 sort.Sort(hashList{ni, "", installPath, &result})
273 }
274 return result
275}
276
277// InstallHashLibs returns the ordered array of library names attached to
278// `installPath` as hash `h`.
279func (ni *NoticeIndex) InstallHashLibs(installPath string, h hash) []string {
280 result := make([]string, 0, len(ni.installHashLib[installPath][h]))
281 for libName := range ni.installHashLib[installPath][h] {
282 result = append(result, libName)
283 }
284 sort.Strings(result)
285 return result
286}
287
Bob Badour00c8a382022-01-26 17:21:39 -0800288// Libraries returns the ordered channel of indexed library names.
289func (ni *NoticeIndex) Libraries() chan string {
290 c := make(chan string)
291 go func() {
292 libs := make([]string, 0, len(ni.libHash))
293 for lib := range ni.libHash {
294 libs = append(libs, lib)
295 }
296 sort.Strings(libs)
297 for _, lib := range libs {
298 c <- lib
299 }
300 close(c)
301 }()
302 return c
303}
304
Bob Badoure6fdd142021-12-09 22:10:43 -0800305// HashText returns the file content of the license text hashed as `h`.
306func (ni *NoticeIndex) HashText(h hash) []byte {
307 return ni.text[h]
308}
309
310// getLibName returns the name of the library associated with `noticeFor`.
Bob Badour5028abc2022-02-09 11:56:58 -0800311func (ni *NoticeIndex) getLibName(noticeFor *TargetNode, h hash) string {
312 for _, text := range noticeFor.LicenseTexts() {
313 if !strings.Contains(text, ":") {
314 continue
315 }
316
317 fields := strings.SplitN(text, ":", 2)
318 fname, pname := fields[0], fields[1]
319 if ni.hash[fname].key != h.key {
320 continue
321 }
322
323 ln, err := url.QueryUnescape(pname)
324 if err != nil {
325 continue
326 }
327 return ln
328 }
Bob Badoure6fdd142021-12-09 22:10:43 -0800329 // use name from METADATA if available
330 ln := ni.checkMetadata(noticeFor)
331 if len(ln) > 0 {
332 return ln
333 }
334 // use package_name: from license{} module if available
335 pn := noticeFor.PackageName()
336 if len(pn) > 0 {
337 return pn
338 }
339 for _, p := range noticeFor.Projects() {
340 if strings.HasPrefix(p, "prebuilts/") {
341 for _, licenseText := range noticeFor.LicenseTexts() {
342 if !strings.HasPrefix(licenseText, "prebuilts/") {
343 continue
344 }
345 for r, prefix := range SafePrebuiltPrefixes {
346 match := r.FindString(licenseText)
347 if len(match) == 0 {
348 continue
349 }
350 strip := SafePathPrefixes[prefix]
351 if strip {
352 // strip entire prefix
353 match = licenseText[len(match):]
354 } else {
355 // strip from prebuilts/ until safe prefix
356 match = licenseText[len(match)-len(prefix):]
357 }
358 // remove LICENSE or NOTICE or other filename
359 li := strings.LastIndex(match, "/")
360 if 0 < li {
361 match = match[:li]
362 }
363 // remove *licenses/ path segment and subdirectory if in path
364 if offsets := licensesPathRegexp.FindAllStringIndex(match, -1); offsets != nil && 0 < offsets[len(offsets)-1][0] {
365 match = match[:offsets[len(offsets)-1][0]]
366 li = strings.LastIndex(match, "/")
367 if 0 < li {
368 match = match[:li]
369 }
370 }
371 return match
372 }
373 break
374 }
375 }
376 for prefix, strip := range SafePathPrefixes {
377 if strings.HasPrefix(p, prefix) {
378 if strip {
379 return p[len(prefix):]
380 } else {
381 return p
382 }
383 }
384 }
385 }
386 // strip off [./]meta_lic from license metadata path and extract base name
387 n := noticeFor.name[:len(noticeFor.name)-9]
388 li := strings.LastIndex(n, "/")
389 if 0 < li {
390 n = n[li+1:]
391 }
392 return n
393}
394
395// checkMetadata tries to look up a library name from a METADATA file associated with `noticeFor`.
396func (ni *NoticeIndex) checkMetadata(noticeFor *TargetNode) string {
397 for _, p := range noticeFor.Projects() {
398 if name, ok := ni.projectName[p]; ok {
399 if name == noProjectName {
400 continue
401 }
402 return name
403 }
404 f, err := ni.rootFS.Open(filepath.Join(p, "METADATA"))
405 if err != nil {
406 ni.projectName[p] = noProjectName
407 continue
408 }
409 name := ""
410 description := ""
411 version := ""
412 s := bufio.NewScanner(f)
413 for s.Scan() {
414 line := s.Text()
415 m := nameRegexp.FindStringSubmatch(line)
416 if m != nil {
417 if 1 < len(m) && m[1] != "" {
418 name = m[1]
419 }
420 if version != "" {
421 break
422 }
423 continue
424 }
425 m = versionRegexp.FindStringSubmatch(line)
426 if m != nil {
427 if 1 < len(m) && m[1] != "" {
428 version = m[1]
429 }
430 if name != "" {
431 break
432 }
433 continue
434 }
435 m = descRegexp.FindStringSubmatch(line)
436 if m != nil {
437 if 1 < len(m) && m[1] != "" {
438 description = m[1]
439 }
440 }
441 }
442 _ = s.Err()
443 _ = f.Close()
444 if name != "" {
445 if version != "" {
446 if version[0] == 'v' || version[0] == 'V' {
447 ni.projectName[p] = name + "_" + version
448 } else {
449 ni.projectName[p] = name + "_v_" + version
450 }
451 } else {
452 ni.projectName[p] = name
453 }
454 return ni.projectName[p]
455 }
456 if description != "" {
457 ni.projectName[p] = description
458 return ni.projectName[p]
459 }
460 ni.projectName[p] = noProjectName
461 }
462 return ""
463}
464
465// addText reads and indexes the content of a license text file.
466func (ni *NoticeIndex) addText(file string) error {
467 f, err := ni.rootFS.Open(filepath.Clean(file))
468 if err != nil {
469 return fmt.Errorf("error opening license text file %q: %w", file, err)
470 }
471
472 // read the file
473 text, err := io.ReadAll(f)
474 if err != nil {
475 return fmt.Errorf("error reading license text file %q: %w", file, err)
476 }
477
478 hash := hash{fmt.Sprintf("%x", md5.Sum(text))}
479 ni.hash[file] = hash
480 if _, alreadyPresent := ni.text[hash]; !alreadyPresent {
481 ni.text[hash] = text
482 }
483
Colin Crossbb45f8c2022-01-28 15:18:19 -0800484 ni.files = append(ni.files, file)
485
Bob Badoure6fdd142021-12-09 22:10:43 -0800486 return nil
487}
488
489// getInstallPaths returns the names of the used dependencies mapped to their
490// installed locations.
491func getInstallPaths(attachesTo *TargetNode, path TargetEdgePath) []string {
492 if len(path) == 0 {
493 installs := attachesTo.Installed()
494 if 0 == len(installs) {
495 installs = attachesTo.Built()
496 }
497 return installs
498 }
499
500 var getInstalls func(path TargetEdgePath) []string
501
502 getInstalls = func(path TargetEdgePath) []string {
503 // deps contains the output targets from the dependencies in the path
504 var deps []string
505 if len(path) > 1 {
506 // recursively get the targets from the sub-path skipping 1 path segment
507 deps = getInstalls(path[1:])
508 } else {
509 // stop recursion at 1 path segment
510 deps = path[0].Dependency().TargetFiles()
511 }
512 size := 0
513 prefixes := path[0].Target().TargetFiles()
514 installMap := path[0].Target().InstallMap()
515 sources := path[0].Target().Sources()
516 for _, dep := range deps {
517 found := false
518 for _, source := range sources {
519 if strings.HasPrefix(dep, source) {
520 found = true
521 break
522 }
523 }
524 if !found {
525 continue
526 }
527 for _, im := range installMap {
528 if strings.HasPrefix(dep, im.FromPath) {
529 size += len(prefixes)
530 break
531 }
532 }
533 }
534
535 installs := make([]string, 0, size)
536 for _, dep := range deps {
537 found := false
538 for _, source := range sources {
539 if strings.HasPrefix(dep, source) {
540 found = true
541 break
542 }
543 }
544 if !found {
545 continue
546 }
547 for _, im := range installMap {
548 if strings.HasPrefix(dep, im.FromPath) {
549 for _, prefix := range prefixes {
550 installs = append(installs, prefix+im.ContainerPath+dep[len(im.FromPath):])
551 }
552 break
553 }
554 }
555 }
556 return installs
557 }
558 allInstalls := getInstalls(path)
559 installs := path[0].Target().Installed()
560 if len(installs) == 0 {
561 return allInstalls
562 }
563 result := make([]string, 0, len(allInstalls))
564 for _, install := range allInstalls {
565 for _, prefix := range installs {
566 if strings.HasPrefix(install, prefix) {
567 result = append(result, install)
568 }
569 }
570 }
571 return result
572}
573
574// hash is an opaque string derived from md5sum.
575type hash struct {
576 key string
577}
578
579// String returns the hexadecimal representation of the hash.
580func (h hash) String() string {
581 return h.key
582}
583
584// hashList orders an array of hashes
585type hashList struct {
Bob Badour6ea14572022-01-23 17:15:46 -0800586 ni *NoticeIndex
587 libName string
588 installPath string
589 hashes *[]hash
Bob Badoure6fdd142021-12-09 22:10:43 -0800590}
591
592// Len returns the count of elements in the slice.
593func (l hashList) Len() int { return len(*l.hashes) }
594
595// Swap rearranges 2 elements of the slice so that each occupies the other's
596// former position.
597func (l hashList) Swap(i, j int) { (*l.hashes)[i], (*l.hashes)[j] = (*l.hashes)[j], (*l.hashes)[i] }
598
599// Less returns true when the `i`th element is lexicographically less than
600// the `j`th element.
601func (l hashList) Less(i, j int) bool {
602 var insti, instj int
603 if 0 < len(l.libName) {
604 insti = len(l.ni.hashLibInstall[(*l.hashes)[i]][l.libName])
605 instj = len(l.ni.hashLibInstall[(*l.hashes)[j]][l.libName])
Bob Badour6ea14572022-01-23 17:15:46 -0800606 } else {
607 libsi := l.ni.InstallHashLibs(l.installPath, (*l.hashes)[i])
608 libsj := l.ni.InstallHashLibs(l.installPath, (*l.hashes)[j])
609 libsis := strings.Join(libsi, " ")
610 libsjs := strings.Join(libsj, " ")
611 if libsis != libsjs {
612 return libsis < libsjs
613 }
Bob Badoure6fdd142021-12-09 22:10:43 -0800614 }
615 if insti == instj {
616 leni := len(l.ni.text[(*l.hashes)[i]])
617 lenj := len(l.ni.text[(*l.hashes)[j]])
618 if leni == lenj {
619 // all else equal, just order by hash value
620 return (*l.hashes)[i].key < (*l.hashes)[j].key
621 }
622 // put shortest texts first within same # of installs
623 return leni < lenj
624 }
625 // reverse order of # installs so that most popular appears first
626 return instj < insti
627}