blob: ad85ee9aff39bdeebd1ad68a2e96cfcd5e724783 [file] [log] [blame]
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001// Copyright 2017 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package finder
16
import (
	"bufio"
	"bytes"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"runtime"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"android/soong/fs"
)
35
36// This file provides a Finder struct that can quickly search for files satisfying
37// certain criteria.
38// This Finder gets its speed partially from parallelism and partially from caching.
39// If a Stat call returns the same result as last time, then it means Finder
40// can skip the ReadDir call for that dir.
41
42// The primary data structure used by the finder is the field Finder.nodes ,
43// which is a tree of nodes of type *pathMap .
44// Each node represents a directory on disk, along with its stats, subdirectories,
45// and contained files.
46
47// The common use case for the Finder is that the caller creates a Finder and gives
48// it the same query that was given to it in the previous execution.
49// In this situation, the major events that take place are:
50// 1. The Finder begins to load its db
51// 2. The Finder begins to stat the directories mentioned in its db (using multiple threads)
52// Calling Stat on each of these directories is generally a large fraction of the total time
53// 3. The Finder begins to construct a separate tree of nodes in each of its threads
54// 4. The Finder merges the individual node trees into the main node tree
55// 5. The Finder may call ReadDir a few times if there are a few directories that are out-of-date
56// These ReadDir calls might prompt additional Stat calls, etc
57// 6. The Finder waits for all loading to complete
58// 7. The Finder searches the cache for files matching the user's query (using multiple threads)
59
60// These are the invariants regarding concurrency:
61// 1. The public methods of Finder are threadsafe.
62// The public methods are only performance-optimized for one caller at a time, however.
63// For the moment, multiple concurrent callers shouldn't expect any better performance than
64// multiple serial callers.
65// 2. While building the node tree, only one thread may ever access the <children> collection of a
66// *pathMap at once.
67// a) The thread that accesses the <children> collection is the thread that discovers the
68// children (by reading them from the cache or by having received a response to ReadDir).
69// 1) Consequently, the thread that discovers the children also spawns requests to stat
70// subdirs.
71// b) Consequently, while building the node tree, no thread may do a lookup of its
72// *pathMap via filepath because another thread may be adding children to the
73// <children> collection of an ancestor node. Additionally, in rare cases, another thread
74// may be removing children from an ancestor node if the children were only discovered to
75// be irrelevant after calling ReadDir (which happens if a prune-file was just added).
76// 3. No query will begin to be serviced until all loading (both reading the db
77// and scanning the filesystem) is complete.
78// Tests indicate that it only takes about 10% as long to search the in-memory cache as to
79// generate it, making this not a huge loss in performance.
80// 4. The parsing of the db and the initial setup of the pathMap tree must complete before
81// beginning to call listDirSync (because listDirSync can create new entries in the pathMap)
82
83// see cmd/finder.go or finder_test.go for usage examples
84
// Update versionString whenever making a backwards-incompatible change to the cache file format.
// validateCacheHeader rejects any db whose first line does not exactly match this string.
const versionString = "Android finder version 1"
87
// a CacheParams specifies which files and directories the user wishes be scanned and
// potentially added to the cache
type CacheParams struct {
	// WorkingDirectory is used as a base for any relative file paths given to the Finder
	// (see FindMatching and startFind, which join relative paths onto it)
	WorkingDirectory string

	// RootDirs are the root directories used to initiate the search
	RootDirs []string

	// ExcludeDirs are directory names that if encountered are removed from the search
	ExcludeDirs []string

	// PruneFiles are file names that if encountered prune their entire directory
	// (including siblings)
	PruneFiles []string

	// IncludeFiles are file names to include as matches
	IncludeFiles []string
}
107
// a cacheConfig stores the inputs that determine what should be included in the cache.
// It is serialized into the cache header (see Dump) so a saved db can be checked for
// compatibility before being reused.
type cacheConfig struct {
	CacheParams

	// FilesystemView is a unique identifier telling which parts of which file systems
	// are readable by the Finder. In practice its value is essentially username@hostname.
	// FilesystemView is set to ensure that a cache file copied to another host or
	// found by another user doesn't inadvertently get reused.
	FilesystemView string
}
118
119func (p *cacheConfig) Dump() ([]byte, error) {
120 bytes, err := json.Marshal(p)
121 return bytes, err
122}
123
// a cacheMetadata stores version information about the cache
type cacheMetadata struct {
	// The Version enables the Finder to determine whether it can even parse the file
	// If the version changes, the entire cache file must be regenerated
	Version string

	// The CacheParams enables the Finder to determine whether the parameters match
	// If the CacheParams change, the Finder can choose how much of the cache file to reuse
	// (although in practice, the Finder will probably choose to ignore the entire file anyway)
	Config cacheConfig
}
135
// Logger is the minimal logging interface the Finder needs; *log.Logger satisfies it.
// calldepth is used to attribute the log line to the appropriate caller.
type Logger interface {
	Output(calldepth int, s string) error
}
139
// the Finder is the main struct that callers will want to use
type Finder struct {
	// configuration
	DbPath              string        // location of the cache db to load and save
	numDbLoadingThreads int           // parallelism used while loading the db
	numSearchingThreads int           // parallelism used while servicing queries
	cacheMetadata       cacheMetadata // version + params used to validate the db header
	logger              Logger
	filesystem          fs.FileSystem

	// temporary state
	threadPool *threadPool // non-nil only while loading (see loadFromFilesystem)
	mutex      sync.Mutex  // serializes the public query methods (see concurrency notes above)

	// non-temporary state
	modifiedFlag int32   // accessed atomically; nonzero once the cache diverges from the saved db
	nodes        pathMap // root of the in-memory directory tree
}
158
159// New creates a new Finder for use
160func New(cacheParams CacheParams, filesystem fs.FileSystem,
161 logger Logger, dbPath string) *Finder {
162
163 numThreads := runtime.NumCPU() * 2
164 numDbLoadingThreads := numThreads
165 numSearchingThreads := numThreads
166
167 metadata := cacheMetadata{
168 Version: versionString,
169 Config: cacheConfig{
170 CacheParams: cacheParams,
171 FilesystemView: filesystem.ViewId(),
172 },
173 }
174
175 finder := &Finder{
176 numDbLoadingThreads: numDbLoadingThreads,
177 numSearchingThreads: numSearchingThreads,
178 cacheMetadata: metadata,
179 logger: logger,
180 filesystem: filesystem,
181
182 nodes: *newPathMap("/"),
183 DbPath: dbPath,
184 }
185
186 finder.loadFromFilesystem()
187
188 finder.verbosef("Done parsing db\n")
189 return finder
190}
191
// FindAll returns every file in the cache
func (f *Finder) FindAll() []string {
	return f.FindAt("/")
}
196
197// FindNamed searches for every cached file under <rootDir>
198func (f *Finder) FindAt(rootDir string) []string {
199 filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
200 return entries.DirNames, entries.FileNames
201 }
202 return f.FindMatching(rootDir, filter)
203}
204
// FindNamed searches the entire cache for every file named <fileName>
func (f *Finder) FindNamed(fileName string) []string {
	return f.FindNamedAt("/", fileName)
}
209
210// FindNamedAt searches under <rootPath> for every file named <fileName>
211// The reason a caller might use FindNamedAt instead of FindNamed is if they want
212// to limit their search to a subset of the cache
213func (f *Finder) FindNamedAt(rootPath string, fileName string) []string {
214 filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
215 matches := []string{}
216 for _, foundName := range entries.FileNames {
217 if foundName == fileName {
218 matches = append(matches, foundName)
219 }
220 }
221 return entries.DirNames, matches
222
223 }
224 return f.FindMatching(rootPath, filter)
225}
226
// FindFirstNamed searches for every file named <fileName>
// Whenever it finds a match, it stops searching that directory's subdirectories
func (f *Finder) FindFirstNamed(fileName string) []string {
	return f.FindFirstNamedAt("/", fileName)
}
232
233// FindFirstNamedAt searches for every file named <fileName>
234// Whenever it finds a match, it stops search subdirectories
235func (f *Finder) FindFirstNamedAt(rootPath string, fileName string) []string {
236 filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
237 matches := []string{}
238 for _, foundName := range entries.FileNames {
239 if foundName == fileName {
240 matches = append(matches, foundName)
241 }
242 }
243
244 if len(matches) > 0 {
245 return []string{}, matches
246 }
247 return entries.DirNames, matches
248 }
249 return f.FindMatching(rootPath, filter)
250}
251
252// FindMatching is the most general exported function for searching for files in the cache
253// The WalkFunc will be invoked repeatedly and is expected to modify the provided DirEntries
254// in place, removing file paths and directories as desired.
255// WalkFunc will be invoked potentially many times in parallel, and must be threadsafe.
256func (f *Finder) FindMatching(rootPath string, filter WalkFunc) []string {
257 // set up some parameters
258 scanStart := time.Now()
259 var isRel bool
260 workingDir := f.cacheMetadata.Config.WorkingDirectory
261
262 isRel = !filepath.IsAbs(rootPath)
263 if isRel {
264 rootPath = filepath.Join(workingDir, rootPath)
265 }
266
267 rootPath = filepath.Clean(rootPath)
268
269 // ensure nothing else is using the Finder
270 f.verbosef("FindMatching waiting for finder to be idle\n")
271 f.lock()
272 defer f.unlock()
273
274 node := f.nodes.GetNode(rootPath, false)
275 if node == nil {
276 f.verbosef("No data for path %v ; apparently not included in cache params: %v\n",
277 rootPath, f.cacheMetadata.Config.CacheParams)
278 // path is not found; don't do a search
279 return []string{}
280 }
281
282 // search for matching files
283 f.verbosef("Finder finding %v using cache\n", rootPath)
284 results := f.findInCacheMultithreaded(node, filter, f.numSearchingThreads)
285
286 // format and return results
287 if isRel {
288 for i := 0; i < len(results); i++ {
289 results[i] = strings.Replace(results[i], workingDir+"/", "", 1)
290 }
291 }
292 sort.Strings(results)
293 f.verbosef("Found %v files under %v in %v using cache\n",
294 len(results), rootPath, time.Since(scanStart))
295 return results
296}
297
298// Shutdown saves the contents of the Finder to its database file
299func (f *Finder) Shutdown() {
300 f.verbosef("Shutting down\n")
301 if f.wasModified() {
302 err := f.dumpDb()
303 if err != nil {
304 f.verbosef("%v\n", err)
305 }
306 } else {
307 f.verbosef("Skipping dumping unmodified db\n")
308 }
309}
310
311// End of public api
312
// joinCleanPaths is like filepath.Join but is faster because
// joinCleanPaths doesn't have to support paths ending in "/" or containing ".."
func joinCleanPaths(base string, leaf string) string {
	switch {
	case base == "":
		return leaf
	case base == "/":
		// avoid producing a double slash for children of the root
		return "/" + leaf
	case leaf == "":
		return base
	default:
		return base + "/" + leaf
	}
}
327
// verbosef formats a message and sends it to the Finder's Logger.
// calldepth 2 attributes the log line to verbosef's caller.
func (f *Finder) verbosef(format string, args ...interface{}) {
	f.logger.Output(2, fmt.Sprintf(format, args...))
}
331
332// loadFromFilesystem populates the in-memory cache based on the contents of the filesystem
333func (f *Finder) loadFromFilesystem() {
334 f.threadPool = newThreadPool(f.numDbLoadingThreads)
335
336 err := f.startFromExternalCache()
337 if err != nil {
338 f.startWithoutExternalCache()
339 }
340
341 startTime := time.Now()
342 f.verbosef("Waiting for pending requests to complete\n")
343 f.threadPool.Wait()
344 f.verbosef("Is idle after %v\n", time.Now().Sub(startTime))
345 f.threadPool = nil
346}
347
348func (f *Finder) startFind(path string) {
349 if !filepath.IsAbs(path) {
350 path = filepath.Join(f.cacheMetadata.Config.WorkingDirectory, path)
351 }
352 node := f.nodes.GetNode(path, true)
353 f.statDirAsync(node)
354}
355
// lock acquires the Finder-wide mutex that serializes the public query methods
func (f *Finder) lock() {
	f.mutex.Lock()
}
359
// unlock releases the mutex acquired by lock
func (f *Finder) unlock() {
	f.mutex.Unlock()
}
363
// a statResponse is the relevant portion of the response from the filesystem to a Stat call
type statResponse struct {
	// ModTime is the modification time; elsewhere in this file a ModTime of 0 is
	// treated as "no valid stats" (see MergeIn, loadBytes, sortedDirEntries)
	ModTime int64
	// Inode is the inode number
	Inode uint64
	// Device identifies the filesystem/device containing the path
	Device uint64
}

// a pathAndStats stores a path and its stats
type pathAndStats struct {
	statResponse

	Path string
}

// a dirFullInfo stores all of the relevant information we know about a directory
type dirFullInfo struct {
	pathAndStats

	// FileNames are the relevant files contained directly in this directory
	FileNames []string
}
384
// a PersistedDirInfo is the information about a dir that we save to our cache on disk
type PersistedDirInfo struct {
	// These field names are short because they are repeated many times in the output json file
	P string   // path (relative to the enclosing PersistedDirs.Root)
	T int64    // modification time
	I uint64   // inode number
	F []string // relevant filenames contained
}

// a PersistedDirs is the information that we persist for a group of dirs
type PersistedDirs struct {
	// the device on which each directory is stored
	Device uint64
	// the common root path to which all contained dirs are relative
	Root string
	// the directories themselves
	Dirs []PersistedDirInfo
}

// a CacheEntry is the smallest unit that can be read and parsed from the cache (on disk) at a time
type CacheEntry []PersistedDirs
406
// a DirEntries lists the files and directories contained directly within a specific directory
type DirEntries struct {
	Path string

	// elements of DirNames are just the dir names; they don't include any '/' character
	DirNames []string
	// elements of FileNames are just the file names; they don't include '/' character
	FileNames []string
}

// a WalkFunc is the type that is passed into various Find functions for determining which
// directories the caller wishes be walked. The WalkFunc is expected to decide which
// directories to walk and which files to consider as matches to the original query.
type WalkFunc func(DirEntries) (dirs []string, files []string)
421
// a mapNode stores the relevant stats about a directory to be stored in a pathMap
type mapNode struct {
	statResponse
	// FileNames are the relevant files directly contained in this directory
	FileNames []string
}

// a pathMap implements the directory tree structure of nodes
type pathMap struct {
	mapNode

	// path is the full path of this node from the root
	path string

	// children is keyed by the child's single path component (no '/' characters)
	children map[string]*pathMap

	// number of descendent nodes, including self; kept up to date via
	// UpdateNumDescendents and used to balance search parallelism
	approximateNumDescendents int
}
439
440func newPathMap(path string) *pathMap {
441 result := &pathMap{path: path, children: make(map[string]*pathMap, 4),
442 approximateNumDescendents: 1}
443 return result
444}
445
446// GetNode returns the node at <path>
447func (m *pathMap) GetNode(path string, createIfNotFound bool) *pathMap {
448 if len(path) > 0 && path[0] == '/' {
449 path = path[1:]
450 }
451
452 node := m
453 for {
454 if path == "" {
455 return node
456 }
457
458 index := strings.Index(path, "/")
459 var firstComponent string
460 if index >= 0 {
461 firstComponent = path[:index]
462 path = path[index+1:]
463 } else {
464 firstComponent = path
465 path = ""
466 }
467
468 child, found := node.children[firstComponent]
469
470 if !found {
471 if createIfNotFound {
472 child = node.newChild(firstComponent)
473 } else {
474 return nil
475 }
476 }
477
478 node = child
479 }
480}
481
482func (m *pathMap) newChild(name string) (child *pathMap) {
483 path := joinCleanPaths(m.path, name)
484 newChild := newPathMap(path)
485 m.children[name] = newChild
486
487 return m.children[name]
488}
489
490func (m *pathMap) UpdateNumDescendents() int {
491 count := 1
492 for _, child := range m.children {
493 count += child.approximateNumDescendents
494 }
495 m.approximateNumDescendents = count
496 return count
497}
498
499func (m *pathMap) UpdateNumDescendentsRecursive() {
500 for _, child := range m.children {
501 child.UpdateNumDescendentsRecursive()
502 }
503 m.UpdateNumDescendents()
504}
505
506func (m *pathMap) MergeIn(other *pathMap) {
507 for key, theirs := range other.children {
508 ours, found := m.children[key]
509 if found {
510 ours.MergeIn(theirs)
511 } else {
512 m.children[key] = theirs
513 }
514 }
515 if other.ModTime != 0 {
516 m.mapNode = other.mapNode
517 }
518 m.UpdateNumDescendents()
519}
520
521func (m *pathMap) DumpAll() []dirFullInfo {
522 results := []dirFullInfo{}
523 m.dumpInto("", &results)
524 return results
525}
526
527func (m *pathMap) dumpInto(path string, results *[]dirFullInfo) {
528 *results = append(*results,
529 dirFullInfo{
530 pathAndStats{statResponse: m.statResponse, Path: path},
531 m.FileNames},
532 )
533 for key, child := range m.children {
534 childPath := joinCleanPaths(path, key)
535 if len(childPath) == 0 || childPath[0] != '/' {
536 childPath = "/" + childPath
537 }
538 child.dumpInto(childPath, results)
539 }
540}
541
// a semaphore can be locked by up to <capacity> callers at once
type semaphore struct {
	// pool holds one token per current holder; the channel's capacity
	// is what bounds the number of concurrent holders
	pool chan bool
}

func newSemaphore(capacity int) *semaphore {
	return &semaphore{pool: make(chan bool, capacity)}
}

// Lock blocks until one of the <capacity> slots is free, then claims it
func (l *semaphore) Lock() {
	l.pool <- true
}

// Unlock releases one previously claimed slot
func (l *semaphore) Unlock() {
	<-l.pool
}
558
// A threadPool runs goroutines and supports throttling and waiting.
// Without throttling, Go may exhaust the maximum number of various resources, such as
// threads or file descriptors, and crash the program.
type threadPool struct {
	// receivedRequests counts every Run() call so Wait() can block until all complete
	receivedRequests sync.WaitGroup
	// activeRequests bounds how many submitted functions may execute at once
	activeRequests semaphore
}

// newThreadPool returns a pool that runs at most <maxNumConcurrentThreads>
// submitted functions concurrently.
func newThreadPool(maxNumConcurrentThreads int) *threadPool {
	return &threadPool{
		receivedRequests: sync.WaitGroup{},
		activeRequests:   *newSemaphore(maxNumConcurrentThreads),
	}
}
573
// Run requests to run the given function in its own goroutine
func (p *threadPool) Run(function func()) {
	p.receivedRequests.Add(1)
	// If Run() was called from within a goroutine spawned by this threadPool,
	// then we may need to return from Run() before having capacity to actually
	// run <function>.
	//
	// It's possible that the body of <function> contains a statement (such as a syscall)
	// that will cause Go to pin it to a thread, or will contain a statement that uses
	// another resource that is in short supply (such as a file descriptor), so we can't
	// actually run <function> until we have capacity.
	//
	// However, the semaphore used for synchronization is implemented via a channel and
	// shouldn't require a new thread for each access.
	go func() {
		// block until a concurrency slot is available, then run
		p.activeRequests.Lock()
		function()
		p.activeRequests.Unlock()
		// signal completion only after releasing the slot
		p.receivedRequests.Done()
	}()
}
595
// Wait waits until all goroutines are done, just like sync.WaitGroup's Wait
func (p *threadPool) Wait() {
	p.receivedRequests.Wait()
}
600
601func (f *Finder) serializeCacheEntry(dirInfos []dirFullInfo) ([]byte, error) {
602 // group each dirFullInfo by its Device, to avoid having to repeat it in the output
603 dirsByDevice := map[uint64][]PersistedDirInfo{}
604 for _, entry := range dirInfos {
605 _, found := dirsByDevice[entry.Device]
606 if !found {
607 dirsByDevice[entry.Device] = []PersistedDirInfo{}
608 }
609 dirsByDevice[entry.Device] = append(dirsByDevice[entry.Device],
610 PersistedDirInfo{P: entry.Path, T: entry.ModTime, I: entry.Inode, F: entry.FileNames})
611 }
612
613 cacheEntry := CacheEntry{}
614
615 for device, infos := range dirsByDevice {
616 // find common prefix
617 prefix := ""
618 if len(infos) > 0 {
619 prefix = infos[0].P
620 }
621 for _, info := range infos {
622 for !strings.HasPrefix(info.P+"/", prefix+"/") {
623 prefix = filepath.Dir(prefix)
624 }
625 }
626 // remove common prefix
627 for i := range infos {
628 suffix := strings.Replace(infos[i].P, prefix, "", 1)
629 if len(suffix) > 0 && suffix[0] == '/' {
630 suffix = suffix[1:]
631 }
632 infos[i].P = suffix
633 }
634
635 // turn the map (keyed by device) into a list of structs with labeled fields
636 // this is to improve readability of the output
637 cacheEntry = append(cacheEntry, PersistedDirs{Device: device, Root: prefix, Dirs: infos})
638 }
639
640 // convert to json.
641 // it would save some space to use a different format than json for the db file,
642 // but the space and time savings are small, and json is easy for humans to read
643 bytes, err := json.Marshal(cacheEntry)
644 return bytes, err
645}
646
647func (f *Finder) parseCacheEntry(bytes []byte) ([]dirFullInfo, error) {
648 var cacheEntry CacheEntry
649 err := json.Unmarshal(bytes, &cacheEntry)
650 if err != nil {
651 return nil, err
652 }
653
654 // convert from a CacheEntry to a []dirFullInfo (by copying a few fields)
655 capacity := 0
656 for _, element := range cacheEntry {
657 capacity += len(element.Dirs)
658 }
659 nodes := make([]dirFullInfo, capacity)
660 count := 0
661 for _, element := range cacheEntry {
662 for _, dir := range element.Dirs {
663 path := joinCleanPaths(element.Root, dir.P)
664
665 nodes[count] = dirFullInfo{
666 pathAndStats: pathAndStats{
667 statResponse: statResponse{
668 ModTime: dir.T, Inode: dir.I, Device: element.Device,
669 },
670 Path: path},
671 FileNames: dir.F}
672 count++
673 }
674 }
675 return nodes, nil
676}
677
// We use the following separator byte to distinguish individually parseable blocks of json
// because we know this separator won't appear in the json that we're parsing.
//
// The newline byte can only appear in a UTF-8 stream if the newline character appears, because:
// - The newline character is encoded as "0000 1010" in binary ("0a" in hex)
// - UTF-8 dictates that bytes beginning with a "0" bit are never emitted as part of a multibyte
//   character.
//
// We know that the newline character will never appear in our json string, because:
// - If a newline character appears as part of a data string, then json encoding will
//   emit two characters instead: '\' and 'n'.
// - The json encoder that we use doesn't emit the optional newlines between any of its
//   other outputs.
const lineSeparator = byte('\n')

// readLine returns the bytes up to and including the next lineSeparator;
// at end of input it returns the remaining bytes together with io.EOF.
func (f *Finder) readLine(reader *bufio.Reader) ([]byte, error) {
	return reader.ReadBytes(lineSeparator)
}
696
697// validateCacheHeader reads the cache header from cacheReader and tells whether the cache is compatible with this Finder
698func (f *Finder) validateCacheHeader(cacheReader *bufio.Reader) bool {
699 cacheVersionBytes, err := f.readLine(cacheReader)
700 if err != nil {
701 f.verbosef("Failed to read database header; database is invalid\n")
702 return false
703 }
704 if len(cacheVersionBytes) > 0 && cacheVersionBytes[len(cacheVersionBytes)-1] == lineSeparator {
705 cacheVersionBytes = cacheVersionBytes[:len(cacheVersionBytes)-1]
706 }
707 cacheVersionString := string(cacheVersionBytes)
708 currentVersion := f.cacheMetadata.Version
709 if cacheVersionString != currentVersion {
710 f.verbosef("Version changed from %q to %q, database is not applicable\n", cacheVersionString, currentVersion)
711 return false
712 }
713
714 cacheParamBytes, err := f.readLine(cacheReader)
715 if err != nil {
716 f.verbosef("Failed to read database search params; database is invalid\n")
717 return false
718 }
719
720 if len(cacheParamBytes) > 0 && cacheParamBytes[len(cacheParamBytes)-1] == lineSeparator {
721 cacheParamBytes = cacheParamBytes[:len(cacheParamBytes)-1]
722 }
723
724 currentParamBytes, err := f.cacheMetadata.Config.Dump()
725 if err != nil {
726 panic("Finder failed to serialize its parameters")
727 }
728 cacheParamString := string(cacheParamBytes)
729 currentParamString := string(currentParamBytes)
730 if cacheParamString != currentParamString {
731 f.verbosef("Params changed from %q to %q, database is not applicable\n", cacheParamString, currentParamString)
732 return false
733 }
734 return true
735}
736
737// loadBytes compares the cache info in <data> to the state of the filesystem
738// loadBytes returns a map representing <data> and also a slice of dirs that need to be re-walked
739func (f *Finder) loadBytes(id int, data []byte) (m *pathMap, dirsToWalk []string, err error) {
740
741 helperStartTime := time.Now()
742
743 cachedNodes, err := f.parseCacheEntry(data)
744 if err != nil {
745 return nil, nil, fmt.Errorf("Failed to parse block %v: %v\n", id, err.Error())
746 }
747
748 unmarshalDate := time.Now()
749 f.verbosef("Unmarshaled %v objects for %v in %v\n",
750 len(cachedNodes), id, unmarshalDate.Sub(helperStartTime))
751
752 tempMap := newPathMap("/")
753 stats := make([]statResponse, len(cachedNodes))
754
755 for i, node := range cachedNodes {
756 // check the file system for an updated timestamp
757 stats[i] = f.statDirSync(node.Path)
758 }
759
760 dirsToWalk = []string{}
761 for i, cachedNode := range cachedNodes {
762 updated := stats[i]
763 // save the cached value
764 container := tempMap.GetNode(cachedNode.Path, true)
765 container.mapNode = mapNode{statResponse: updated}
766
767 // if the metadata changed and the directory still exists, then
768 // make a note to walk it later
769 if !f.isInfoUpToDate(cachedNode.statResponse, updated) && updated.ModTime != 0 {
770 f.setModified()
771 // make a note that the directory needs to be walked
772 dirsToWalk = append(dirsToWalk, cachedNode.Path)
773 } else {
774 container.mapNode.FileNames = cachedNode.FileNames
775 }
776 }
777 // count the number of nodes to improve our understanding of the shape of the tree,
778 // thereby improving parallelism of subsequent searches
779 tempMap.UpdateNumDescendentsRecursive()
780
781 f.verbosef("Statted inodes of block %v in %v\n", id, time.Now().Sub(unmarshalDate))
782 return tempMap, dirsToWalk, nil
783}
784
// startFromExternalCache loads the cache database from disk
// startFromExternalCache waits to return until the load of the cache db is complete, but
// startFromExternalCache does not wait for all every listDir() or statDir() request to complete
//
// The load runs as a three-stage pipeline:
//   readBlocks (goroutine)  -> blockChannel  -> processBlocks (goroutine) ->
//   resultChannel -> combineResults (this goroutine)
// Each stage closes its output channel when done, which is what terminates the next stage.
func (f *Finder) startFromExternalCache() (err error) {
	startTime := time.Now()
	dbPath := f.DbPath

	// open cache file and validate its header
	reader, err := f.filesystem.Open(dbPath)
	if err != nil {
		return errors.New("No data to load from database\n")
	}
	bufferedReader := bufio.NewReader(reader)
	if !f.validateCacheHeader(bufferedReader) {
		return errors.New("Cache header does not match")
	}
	f.verbosef("Database header matches, will attempt to use database %v\n", f.DbPath)

	// read the file and spawn threads to process it
	nodesToWalk := [][]*pathMap{}
	mainTree := newPathMap("/")

	// read the blocks and stream them into <blockChannel>
	type dataBlock struct {
		id   int
		err  error
		data []byte
	}
	blockChannel := make(chan dataBlock, f.numDbLoadingThreads)
	readBlocks := func() {
		index := 0
		for {
			// It takes some time to unmarshal the input from json, so we want
			// to unmarshal it in parallel. In order to find valid places to
			// break the input, we scan for the line separators that we inserted
			// (for this purpose) when we dumped the database.
			data, err := f.readLine(bufferedReader)
			var response dataBlock
			done := false
			if err != nil && err != io.EOF {
				// a real read error terminates the pipeline after being forwarded
				response = dataBlock{id: index, err: err, data: nil}
				done = true
			} else {
				// io.EOF still carries the final chunk of data, so forward it
				done = (err == io.EOF)
				response = dataBlock{id: index, err: nil, data: data}
			}
			blockChannel <- response
			index++
			duration := time.Since(startTime)
			f.verbosef("Read block %v after %v\n", index, duration)
			if done {
				f.verbosef("Read %v blocks in %v\n", index, duration)
				// closing blockChannel is what ends processBlocks' receive loop
				close(blockChannel)
				return
			}
		}
	}
	go readBlocks()

	// Read from <blockChannel> and stream the responses into <resultChannel>.
	type workResponse struct {
		id          int
		err         error
		tree        *pathMap
		updatedDirs []string
	}
	resultChannel := make(chan workResponse)
	processBlocks := func() {
		numProcessed := 0
		threadPool := newThreadPool(f.numDbLoadingThreads)
		for {
			// get a block to process
			block, received := <-blockChannel
			if !received {
				break
			}

			if block.err != nil {
				resultChannel <- workResponse{err: block.err}
				break
			}
			numProcessed++
			// wait until there is CPU available to process it
			// (note that <block> is re-declared by := each iteration,
			// so each closure below captures its own copy)
			threadPool.Run(
				func() {
					processStartTime := time.Now()
					f.verbosef("Starting to process block %v after %v\n",
						block.id, processStartTime.Sub(startTime))
					tempMap, updatedDirs, err := f.loadBytes(block.id, block.data)
					var response workResponse
					if err != nil {
						f.verbosef(
							"Block %v failed to parse with error %v\n",
							block.id, err)
						response = workResponse{err: err}
					} else {
						response = workResponse{
							id:          block.id,
							err:         nil,
							tree:        tempMap,
							updatedDirs: updatedDirs,
						}
					}
					f.verbosef("Processed block %v in %v\n",
						block.id, time.Since(processStartTime),
					)
					resultChannel <- response
				},
			)
		}
		// wait for all workers before closing resultChannel, so no worker
		// sends on a closed channel
		threadPool.Wait()
		f.verbosef("Finished processing %v blocks in %v\n",
			numProcessed, time.Since(startTime))
		close(resultChannel)
	}
	go processBlocks()

	// Read from <resultChannel> and use the results
	combineResults := func() (err error) {
		for {
			result, received := <-resultChannel
			if !received {
				break
			}
			if err != nil {
				// In case of an error, wait for work to complete before
				// returning the error. This ensures that any subsequent
				// work doesn't need to compete for resources (and possibly
				// fail due to, for example, a filesystem limit on the number of
				// concurrently open files) with past work.
				continue
			}
			if result.err != nil {
				// remember the first error but keep draining the channel
				err = result.err
				continue
			}
			// update main tree
			mainTree.MergeIn(result.tree)
			// record any new directories that we will need to Stat()
			updatedNodes := make([]*pathMap, len(result.updatedDirs))
			for j, dir := range result.updatedDirs {
				node := mainTree.GetNode(dir, false)
				updatedNodes[j] = node
			}
			nodesToWalk = append(nodesToWalk, updatedNodes)
		}
		return err
	}
	err = combineResults()
	if err != nil {
		return err
	}

	f.nodes = *mainTree

	// after having loaded the entire db and therefore created entries for
	// the directories we know of, now it's safe to start calling ReadDir on
	// any updated directories
	for i := range nodesToWalk {
		f.listDirsAsync(nodesToWalk[i])
	}
	f.verbosef("Loaded db and statted its contents in %v\n", time.Since(startTime))
	return err
}
949
950// startWithoutExternalCache starts scanning the filesystem according to the cache config
951// startWithoutExternalCache should be called if startFromExternalCache is not applicable
952func (f *Finder) startWithoutExternalCache() {
953 configDirs := f.cacheMetadata.Config.RootDirs
954
955 // clean paths
956 candidates := make([]string, len(configDirs))
957 for i, dir := range configDirs {
958 candidates[i] = filepath.Clean(dir)
959 }
960 // remove duplicates
961 dirsToScan := make([]string, 0, len(configDirs))
962 for _, candidate := range candidates {
963 include := true
964 for _, included := range dirsToScan {
965 if included == "/" || strings.HasPrefix(candidate+"/", included+"/") {
966 include = false
967 break
968 }
969 }
970 if include {
971 dirsToScan = append(dirsToScan, candidate)
972 }
973 }
974
975 // start searching finally
976 for _, path := range dirsToScan {
977 f.verbosef("Starting find of %v\n", path)
978 f.startFind(path)
979 }
980}
981
982// isInfoUpToDate tells whether <new> can confirm that results computed at <old> are still valid
983func (f *Finder) isInfoUpToDate(old statResponse, new statResponse) (equal bool) {
984 if old.Inode != new.Inode {
985 return false
986 }
987 if old.ModTime != new.ModTime {
988 return false
989 }
990 if old.Device != new.Device {
991 return false
992 }
993 return true
994}
995
996func (f *Finder) wasModified() bool {
997 return atomic.LoadInt32(&f.modifiedFlag) > 0
998}
999
1000func (f *Finder) setModified() {
1001 var newVal int32
1002 newVal = 1
1003 atomic.StoreInt32(&f.modifiedFlag, newVal)
1004}
1005
1006// sortedDirEntries exports directory entries to facilitate dumping them to the external cache
1007func (f *Finder) sortedDirEntries() []dirFullInfo {
1008 startTime := time.Now()
1009 nodes := make([]dirFullInfo, 0)
1010 for _, node := range f.nodes.DumpAll() {
1011 if node.ModTime != 0 {
1012 nodes = append(nodes, node)
1013 }
1014 }
1015 discoveryDate := time.Now()
1016 f.verbosef("Generated %v cache entries in %v\n", len(nodes), discoveryDate.Sub(startTime))
1017 less := func(i int, j int) bool {
1018 return nodes[i].Path < nodes[j].Path
1019 }
1020 sort.Slice(nodes, less)
1021 sortDate := time.Now()
1022 f.verbosef("Sorted %v cache entries in %v\n", len(nodes), sortDate.Sub(discoveryDate))
1023
1024 return nodes
1025}
1026
// serializeDb converts the cache database into a form to save to disk.
// The output is a header (version + config) followed by up to
// f.numDbLoadingThreads blocks of serialized entries, joined by lineSeparator,
// so the next execution can deserialize the blocks with the same parallelism.
func (f *Finder) serializeDb() ([]byte, error) {
	// sort dir entries
	var entryList = f.sortedDirEntries()

	// Generate an output file that can be conveniently loaded using the same number of threads
	// as were used in this execution (because presumably that will be the number of threads
	// used in the next execution too)

	// generate header
	header := []byte{}
	header = append(header, []byte(f.cacheMetadata.Version)...)
	header = append(header, lineSeparator)
	configDump, err := f.cacheMetadata.Config.Dump()
	if err != nil {
		return nil, err
	}
	header = append(header, configDump...)

	// serialize individual blocks in parallel
	// (capped at one block per entry so no block is empty)
	numBlocks := f.numDbLoadingThreads
	if numBlocks > len(entryList) {
		numBlocks = len(entryList)
	}
	blocks := make([][]byte, 1+numBlocks)
	blocks[0] = header
	blockMin := 0
	wg := sync.WaitGroup{}
	// errLock guards writes to the shared err variable from the worker
	// goroutines below; err is only read again after wg.Wait(), which
	// establishes the necessary happens-before ordering.
	var errLock sync.Mutex

	for i := 1; i <= numBlocks; i++ {
		// identify next block; len(entryList)*i/numBlocks partitions the
		// entries into numBlocks contiguous, roughly equal ranges
		blockMax := len(entryList) * i / numBlocks
		block := entryList[blockMin:blockMax]

		// process block
		// (index and block are passed as arguments so each goroutine gets
		// this iteration's values)
		wg.Add(1)
		go func(index int, block []dirFullInfo) {
			byteBlock, subErr := f.serializeCacheEntry(block)
			f.verbosef("Serialized block %v into %v bytes\n", index, len(byteBlock))
			if subErr != nil {
				f.verbosef("%v\n", subErr.Error())
				errLock.Lock()
				// if several blocks fail, the last writer wins; any one
				// error is enough to abort the dump
				err = subErr
				errLock.Unlock()
			} else {
				// each goroutine writes a distinct slot, so no lock is
				// needed for blocks
				blocks[index] = byteBlock
			}
			wg.Done()
		}(i, block)

		blockMin = blockMax
	}

	wg.Wait()

	if err != nil {
		return nil, err
	}

	content := bytes.Join(blocks, []byte{lineSeparator})

	return content, nil
}
1091
1092// dumpDb saves the cache database to disk
1093func (f *Finder) dumpDb() error {
1094 startTime := time.Now()
1095 f.verbosef("Dumping db\n")
1096
1097 tempPath := f.DbPath + ".tmp"
1098
1099 bytes, err := f.serializeDb()
1100 if err != nil {
1101 return err
1102 }
1103 serializeDate := time.Now()
1104 f.verbosef("Serialized db in %v\n", serializeDate.Sub(startTime))
1105 // dump file and atomically move
1106 err = f.filesystem.WriteFile(tempPath, bytes, 0777)
1107 if err != nil {
1108 return err
1109 }
1110 err = f.filesystem.Rename(tempPath, f.DbPath)
1111 if err != nil {
1112 return err
1113 }
1114
1115 f.verbosef("Wrote db in %v\n", time.Now().Sub(serializeDate))
1116 return nil
1117}
1118
1119func (f *Finder) statDirAsync(dir *pathMap) {
1120 node := dir
1121 path := dir.path
1122 f.threadPool.Run(
1123 func() {
1124 updatedStats := f.statDirSync(path)
1125
1126 if !f.isInfoUpToDate(node.statResponse, updatedStats) {
1127 node.mapNode = mapNode{
1128 statResponse: updatedStats,
1129 FileNames: []string{},
1130 }
1131 f.setModified()
1132 if node.statResponse.ModTime != 0 {
1133 // modification time was updated, so re-scan for
1134 // child directories
1135 f.listDirAsync(dir)
1136 }
1137 }
1138 },
1139 )
1140}
1141
1142func (f *Finder) statDirSync(path string) statResponse {
1143
1144 fileInfo, err := f.filesystem.Lstat(path)
1145
1146 var stats statResponse
1147 if err != nil {
1148 // in case of a failure to stat the directory, treat the directory as missing (modTime = 0)
1149 return stats
1150 }
1151 modTime := fileInfo.ModTime()
1152 stats = statResponse{}
1153 inode, err := f.filesystem.InodeNumber(fileInfo)
1154 if err != nil {
1155 panic(fmt.Sprintf("Could not get inode number of %v: %v\n", path, err.Error()))
1156 }
1157 stats.Inode = inode
1158 device, err := f.filesystem.DeviceNumber(fileInfo)
1159 if err != nil {
1160 panic(fmt.Sprintf("Could not get device number of %v: %v\n", path, err.Error()))
1161 }
1162 stats.Device = device
1163 permissionsChangeTime, err := f.filesystem.PermTime(fileInfo)
1164
1165 if err != nil {
1166 panic(fmt.Sprintf("Could not get permissions modification time (CTime) of %v: %v\n", path, err.Error()))
1167 }
1168 // We're only interested in knowing whether anything about the directory
1169 // has changed since last check, so we use the latest of the two
1170 // modification times (content modification (mtime) and
1171 // permission modification (ctime))
1172 if permissionsChangeTime.After(modTime) {
1173 modTime = permissionsChangeTime
1174 }
1175 stats.ModTime = modTime.UnixNano()
1176
1177 return stats
1178}
1179
1180// pruneCacheCandidates removes the items that we don't want to include in our persistent cache
1181func (f *Finder) pruneCacheCandidates(items *DirEntries) {
1182
1183 for _, fileName := range items.FileNames {
1184 for _, abortedName := range f.cacheMetadata.Config.PruneFiles {
1185 if fileName == abortedName {
1186 items.FileNames = []string{}
1187 items.DirNames = []string{}
1188 return
1189 }
1190 }
1191 }
1192
1193 // remove any files that aren't the ones we want to include
1194 writeIndex := 0
1195 for _, fileName := range items.FileNames {
1196 // include only these files
1197 for _, includedName := range f.cacheMetadata.Config.IncludeFiles {
1198 if fileName == includedName {
1199 items.FileNames[writeIndex] = fileName
1200 writeIndex++
1201 break
1202 }
1203 }
1204 }
1205 // resize
1206 items.FileNames = items.FileNames[:writeIndex]
1207
1208 writeIndex = 0
1209 for _, dirName := range items.DirNames {
1210 items.DirNames[writeIndex] = dirName
1211 // ignore other dirs that are known to not be inputs to the build process
1212 include := true
1213 for _, excludedName := range f.cacheMetadata.Config.ExcludeDirs {
1214 if dirName == excludedName {
1215 // don't include
1216 include = false
1217 break
1218 }
1219 }
1220 if include {
1221 writeIndex++
1222 }
1223 }
1224 // resize
1225 items.DirNames = items.DirNames[:writeIndex]
1226}
1227
1228func (f *Finder) listDirsAsync(nodes []*pathMap) {
1229 f.threadPool.Run(
1230 func() {
1231 for i := range nodes {
1232 f.listDirSync(nodes[i])
1233 }
1234 },
1235 )
1236}
1237
1238func (f *Finder) listDirAsync(node *pathMap) {
1239 f.threadPool.Run(
1240 func() {
1241 f.listDirSync(node)
1242 },
1243 )
1244}
1245
1246func (f *Finder) listDirSync(dir *pathMap) {
1247 path := dir.path
1248 children, err := f.filesystem.ReadDir(path)
1249
1250 if err != nil {
1251 // if listing the contents of the directory fails (presumably due to
1252 // permission denied), then treat the directory as empty
1253 children = []os.FileInfo{}
1254 }
1255
1256 var subdirs []string
1257 var subfiles []string
1258
1259 for _, child := range children {
1260 linkBits := child.Mode() & os.ModeSymlink
1261 isLink := linkBits != 0
1262 if child.IsDir() {
1263 if !isLink {
1264 // Skip symlink dirs.
1265 // We don't have to support symlink dirs because
1266 // that would cause duplicates.
1267 subdirs = append(subdirs, child.Name())
1268 }
1269 } else {
1270 // We do have to support symlink files because the link name might be
1271 // different than the target name
1272 // (for example, Android.bp -> build/soong/root.bp)
1273 subfiles = append(subfiles, child.Name())
1274 }
1275
1276 }
1277 parentNode := dir
1278
1279 entry := &DirEntries{Path: path, DirNames: subdirs, FileNames: subfiles}
1280 f.pruneCacheCandidates(entry)
1281
1282 // create a pathMap node for each relevant subdirectory
1283 relevantChildren := map[string]*pathMap{}
1284 for _, subdirName := range entry.DirNames {
1285 childNode, found := parentNode.children[subdirName]
1286 // if we already knew of this directory, then we already have a request pending to Stat it
1287 // if we didn't already know of this directory, then we must Stat it now
1288 if !found {
1289 childNode = parentNode.newChild(subdirName)
1290 f.statDirAsync(childNode)
1291 }
1292 relevantChildren[subdirName] = childNode
1293 }
1294 // Note that in rare cases, it's possible that we're reducing the set of
1295 // children via this statement, if these are all true:
1296 // 1. we previously had a cache that knew about subdirectories of parentNode
1297 // 2. the user created a prune-file (described in pruneCacheCandidates)
1298 // inside <parentNode>, which specifies that the contents of parentNode
1299 // are to be ignored.
1300 // The fact that it's possible to remove children here means that *pathMap structs
1301 // must not be looked up from f.nodes by filepath (and instead must be accessed by
1302 // direct pointer) until after every listDirSync completes
1303 parentNode.FileNames = entry.FileNames
1304 parentNode.children = relevantChildren
1305
1306}
1307
1308// listMatches takes a node and a function that specifies which subdirectories and
1309// files to include, and listMatches returns the matches
1310func (f *Finder) listMatches(node *pathMap,
1311 filter WalkFunc) (subDirs []*pathMap, filePaths []string) {
1312 entries := DirEntries{
1313 FileNames: node.FileNames,
1314 }
1315 entries.DirNames = make([]string, 0, len(node.children))
1316 for childName := range node.children {
1317 entries.DirNames = append(entries.DirNames, childName)
1318 }
1319
1320 dirNames, fileNames := filter(entries)
1321
1322 subDirs = []*pathMap{}
1323 filePaths = make([]string, 0, len(fileNames))
1324 for _, fileName := range fileNames {
1325 filePaths = append(filePaths, joinCleanPaths(node.path, fileName))
1326 }
1327 subDirs = make([]*pathMap, 0, len(dirNames))
1328 for _, childName := range dirNames {
1329 child, ok := node.children[childName]
1330 if ok {
1331 subDirs = append(subDirs, child)
1332 }
1333 }
1334
1335 return subDirs, filePaths
1336}
1337
1338// findInCacheMultithreaded spawns potentially multiple goroutines with which to search the cache.
1339func (f *Finder) findInCacheMultithreaded(node *pathMap, filter WalkFunc,
1340 approxNumThreads int) []string {
1341
1342 if approxNumThreads < 2 {
1343 // Done spawning threads; process remaining directories
1344 return f.findInCacheSinglethreaded(node, filter)
1345 }
1346
1347 totalWork := 0
1348 for _, child := range node.children {
1349 totalWork += child.approximateNumDescendents
1350 }
1351 childrenResults := make(chan []string, len(node.children))
1352
1353 subDirs, filePaths := f.listMatches(node, filter)
1354
1355 // process child directories
1356 for _, child := range subDirs {
1357 numChildThreads := approxNumThreads * child.approximateNumDescendents / totalWork
1358 childProcessor := func(child *pathMap) {
1359 childResults := f.findInCacheMultithreaded(child, filter, numChildThreads)
1360 childrenResults <- childResults
1361 }
1362 // If we're allowed to use more than 1 thread to process this directory,
1363 // then instead we use 1 thread for each subdirectory.
1364 // It would be strange to spawn threads for only some subdirectories.
1365 go childProcessor(child)
1366 }
1367
1368 // collect results
1369 for i := 0; i < len(subDirs); i++ {
1370 childResults := <-childrenResults
1371 filePaths = append(filePaths, childResults...)
1372 }
1373 close(childrenResults)
1374
1375 return filePaths
1376}
1377
1378// findInCacheSinglethreaded synchronously searches the cache for all matching file paths
1379// note findInCacheSinglethreaded runs 2X to 4X as fast by being iterative rather than recursive
1380func (f *Finder) findInCacheSinglethreaded(node *pathMap, filter WalkFunc) []string {
1381 if node == nil {
1382 return []string{}
1383 }
1384
1385 nodes := []*pathMap{node}
1386 matches := []string{}
1387
1388 for len(nodes) > 0 {
1389 currentNode := nodes[0]
1390 nodes = nodes[1:]
1391
1392 subDirs, filePaths := f.listMatches(currentNode, filter)
1393
1394 nodes = append(nodes, subDirs...)
1395
1396 matches = append(matches, filePaths...)
1397 }
1398 return matches
1399}