blob: f15c8c13a0309861d34472d5ad217144fc920bd2 [file] [log] [blame]
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001// Copyright 2017 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package finder
16
17import (
18 "bufio"
19 "bytes"
20 "encoding/json"
21 "fmt"
22 "io"
23 "os"
24 "path/filepath"
25 "runtime"
26 "sort"
27 "strings"
28 "sync"
29 "sync/atomic"
30 "time"
31
32 "android/soong/fs"
33 "errors"
34)
35
36// This file provides a Finder struct that can quickly search for files satisfying
37// certain criteria.
38// This Finder gets its speed partially from parallelism and partially from caching.
39// If a Stat call returns the same result as last time, then it means Finder
40// can skip the ReadDir call for that dir.
41
42// The primary data structure used by the finder is the field Finder.nodes ,
43// which is a tree of nodes of type *pathMap .
44// Each node represents a directory on disk, along with its stats, subdirectories,
45// and contained files.
46
47// The common use case for the Finder is that the caller creates a Finder and gives
48// it the same query that was given to it in the previous execution.
49// In this situation, the major events that take place are:
50// 1. The Finder begins to load its db
51// 2. The Finder begins to stat the directories mentioned in its db (using multiple threads)
52// Calling Stat on each of these directories is generally a large fraction of the total time
53// 3. The Finder begins to construct a separate tree of nodes in each of its threads
54// 4. The Finder merges the individual node trees into the main node tree
55// 5. The Finder may call ReadDir a few times if there are a few directories that are out-of-date
56// These ReadDir calls might prompt additional Stat calls, etc
57// 6. The Finder waits for all loading to complete
58// 7. The Finder searches the cache for files matching the user's query (using multiple threads)
59
60// These are the invariants regarding concurrency:
61// 1. The public methods of Finder are threadsafe.
62// The public methods are only performance-optimized for one caller at a time, however.
63// For the moment, multiple concurrent callers shouldn't expect any better performance than
64// multiple serial callers.
65// 2. While building the node tree, only one thread may ever access the <children> collection of a
66// *pathMap at once.
67// a) The thread that accesses the <children> collection is the thread that discovers the
68// children (by reading them from the cache or by having received a response to ReadDir).
69// 1) Consequently, the thread that discovers the children also spawns requests to stat
70// subdirs.
71// b) Consequently, while building the node tree, no thread may do a lookup of its
72// *pathMap via filepath because another thread may be adding children to the
73// <children> collection of an ancestor node. Additionally, in rare cases, another thread
74// may be removing children from an ancestor node if the children were only discovered to
75// be irrelevant after calling ReadDir (which happens if a prune-file was just added).
76// 3. No query will begin to be serviced until all loading (both reading the db
77// and scanning the filesystem) is complete.
78// Tests indicate that it only takes about 10% as long to search the in-memory cache as to
79// generate it, making this not a huge loss in performance.
80// 4. The parsing of the db and the initial setup of the pathMap tree must complete before
81// beginning to call listDirSync (because listDirSync can create new entries in the pathMap)
82
83// see cmd/finder.go or finder_test.go for usage examples
84
// Update versionString whenever making a backwards-incompatible change to the cache file format
// (the version is written as the first header line of the db file and checked
// by validateCacheHeader before any parsing is attempted)
const versionString = "Android finder version 1"
87
// a CacheParams specifies which files and directories the user wishes be scanned and
// potentially added to the cache
type CacheParams struct {
	// WorkingDirectory is used as a base for any relative file paths given to the Finder
	WorkingDirectory string

	// RootDirs are the root directories used to initiate the search
	RootDirs []string

	// ExcludeDirs are directory names that if encountered are removed from the search
	ExcludeDirs []string

	// PruneFiles are file names that if encountered prune their entire directory
	// (including siblings)
	PruneFiles []string

	// IncludeFiles are file names to include as matches
	IncludeFiles []string
}
107
// a cacheConfig stores the inputs that determine what should be included in the cache.
// It is serialized into the second header line of the cache db; if it does not
// match on reload, the cache is discarded.
type cacheConfig struct {
	CacheParams

	// FilesystemView is a unique identifier telling which parts of which file systems
	// are readable by the Finder. In practice its value is essentially username@hostname.
	// FilesystemView is set to ensure that a cache file copied to another host or
	// found by another user doesn't inadvertently get reused.
	FilesystemView string
}
118
119func (p *cacheConfig) Dump() ([]byte, error) {
120 bytes, err := json.Marshal(p)
121 return bytes, err
122}
123
// a cacheMetadata stores version information about the cache
type cacheMetadata struct {
	// The Version enables the Finder to determine whether it can even parse the file
	// If the version changes, the entire cache file must be regenerated
	Version string

	// The CacheParams enables the Finder to determine whether the parameters match
	// If the CacheParams change, the Finder can choose how much of the cache file to reuse
	// (although in practice, the Finder will probably choose to ignore the entire file anyway)
	Config cacheConfig
}
135
// Logger is the destination for the Finder's verbose output; log.Logger
// satisfies it. Output receives a pre-formatted line; calldepth attributes
// the line to a caller further up the stack.
type Logger interface {
	Output(calldepth int, s string) error
}
139
// the Finder is the main struct that callers will want to use
type Finder struct {
	// configuration

	// DbPath is the location of the on-disk cache database
	DbPath              string
	numDbLoadingThreads int
	numSearchingThreads int
	cacheMetadata       cacheMetadata
	logger              Logger
	filesystem          fs.FileSystem

	// temporary state

	// threadPool throttles concurrent filesystem work; it is created at the
	// start of loadFromFilesystem and released when loading finishes
	threadPool *threadPool
	// mutex serializes the public query methods (see lock/unlock)
	mutex sync.Mutex
	// fsErrs collects errors reported by the filesystem during scanning;
	// errlock guards fsErrs
	fsErrs  []fsErr
	errlock sync.Mutex

	// non-temporary state

	// modifiedFlag records that the cache differs from the saved db, so
	// Shutdown knows whether it must rewrite the file
	// (int32 — presumably accessed via sync/atomic; confirm in setModified/wasModified)
	modifiedFlag int32
	nodes        pathMap
}
160
161// New creates a new Finder for use
162func New(cacheParams CacheParams, filesystem fs.FileSystem,
Jeff Gastonb629e182017-08-14 16:49:18 -0700163 logger Logger, dbPath string) (f *Finder, err error) {
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700164
165 numThreads := runtime.NumCPU() * 2
166 numDbLoadingThreads := numThreads
167 numSearchingThreads := numThreads
168
169 metadata := cacheMetadata{
170 Version: versionString,
171 Config: cacheConfig{
172 CacheParams: cacheParams,
173 FilesystemView: filesystem.ViewId(),
174 },
175 }
176
Jeff Gastonb629e182017-08-14 16:49:18 -0700177 f = &Finder{
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700178 numDbLoadingThreads: numDbLoadingThreads,
179 numSearchingThreads: numSearchingThreads,
180 cacheMetadata: metadata,
181 logger: logger,
182 filesystem: filesystem,
183
184 nodes: *newPathMap("/"),
185 DbPath: dbPath,
186 }
187
Jeff Gastonb629e182017-08-14 16:49:18 -0700188 f.loadFromFilesystem()
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700189
Jeff Gastonb629e182017-08-14 16:49:18 -0700190 // check for any filesystem errors
191 err = f.getErr()
192 if err != nil {
193 return nil, err
194 }
195
196 // confirm that every path mentioned in the CacheConfig exists
197 for _, path := range cacheParams.RootDirs {
198 node := f.nodes.GetNode(filepath.Clean(path), false)
199 if node == nil || node.ModTime == 0 {
200 return nil, fmt.Errorf("%v does not exist\n", path)
201 }
202 }
203
204 return f, nil
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700205}
206
// FindAll returns every file in the cache
func (f *Finder) FindAll() []string {
	return f.FindAt("/")
}
211
// FindAt returns every cached file under <rootDir>
func (f *Finder) FindAt(rootDir string) []string {
	// accept every directory and every file
	filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
		return entries.DirNames, entries.FileNames
	}
	return f.FindMatching(rootDir, filter)
}
219
// FindNamed searches for every cached file named <fileName>;
// it is equivalent to FindNamedAt rooted at "/"
func (f *Finder) FindNamed(fileName string) []string {
	return f.FindNamedAt("/", fileName)
}
224
225// FindNamedAt searches under <rootPath> for every file named <fileName>
226// The reason a caller might use FindNamedAt instead of FindNamed is if they want
227// to limit their search to a subset of the cache
228func (f *Finder) FindNamedAt(rootPath string, fileName string) []string {
229 filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
230 matches := []string{}
231 for _, foundName := range entries.FileNames {
232 if foundName == fileName {
233 matches = append(matches, foundName)
234 }
235 }
236 return entries.DirNames, matches
237
238 }
239 return f.FindMatching(rootPath, filter)
240}
241
// FindFirstNamed searches for every file named <fileName>
// Whenever it finds a match, it stops searching subdirectories of that directory
func (f *Finder) FindFirstNamed(fileName string) []string {
	return f.FindFirstNamedAt("/", fileName)
}
247
248// FindFirstNamedAt searches for every file named <fileName>
249// Whenever it finds a match, it stops search subdirectories
250func (f *Finder) FindFirstNamedAt(rootPath string, fileName string) []string {
251 filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
252 matches := []string{}
253 for _, foundName := range entries.FileNames {
254 if foundName == fileName {
255 matches = append(matches, foundName)
256 }
257 }
258
259 if len(matches) > 0 {
260 return []string{}, matches
261 }
262 return entries.DirNames, matches
263 }
264 return f.FindMatching(rootPath, filter)
265}
266
267// FindMatching is the most general exported function for searching for files in the cache
268// The WalkFunc will be invoked repeatedly and is expected to modify the provided DirEntries
269// in place, removing file paths and directories as desired.
270// WalkFunc will be invoked potentially many times in parallel, and must be threadsafe.
271func (f *Finder) FindMatching(rootPath string, filter WalkFunc) []string {
272 // set up some parameters
273 scanStart := time.Now()
274 var isRel bool
275 workingDir := f.cacheMetadata.Config.WorkingDirectory
276
277 isRel = !filepath.IsAbs(rootPath)
278 if isRel {
279 rootPath = filepath.Join(workingDir, rootPath)
280 }
281
282 rootPath = filepath.Clean(rootPath)
283
284 // ensure nothing else is using the Finder
285 f.verbosef("FindMatching waiting for finder to be idle\n")
286 f.lock()
287 defer f.unlock()
288
289 node := f.nodes.GetNode(rootPath, false)
290 if node == nil {
291 f.verbosef("No data for path %v ; apparently not included in cache params: %v\n",
292 rootPath, f.cacheMetadata.Config.CacheParams)
293 // path is not found; don't do a search
294 return []string{}
295 }
296
297 // search for matching files
298 f.verbosef("Finder finding %v using cache\n", rootPath)
299 results := f.findInCacheMultithreaded(node, filter, f.numSearchingThreads)
300
301 // format and return results
302 if isRel {
303 for i := 0; i < len(results); i++ {
304 results[i] = strings.Replace(results[i], workingDir+"/", "", 1)
305 }
306 }
307 sort.Strings(results)
308 f.verbosef("Found %v files under %v in %v using cache\n",
309 len(results), rootPath, time.Since(scanStart))
310 return results
311}
312
313// Shutdown saves the contents of the Finder to its database file
314func (f *Finder) Shutdown() {
315 f.verbosef("Shutting down\n")
316 if f.wasModified() {
317 err := f.dumpDb()
318 if err != nil {
319 f.verbosef("%v\n", err)
320 }
321 } else {
322 f.verbosef("Skipping dumping unmodified db\n")
323 }
324}
325
326// End of public api
327
// joinCleanPaths concatenates two path components with a "/" between them.
// It is like filepath.Join but faster because joinCleanPaths doesn't have to
// support paths ending in "/" or containing "..".
func joinCleanPaths(base string, leaf string) string {
	switch {
	case base == "":
		return leaf
	case leaf == "":
		return base
	case base == "/":
		return base + leaf
	default:
		return base + "/" + leaf
	}
}
342
// verbosef formats its arguments and forwards the result to the configured Logger.
// The calldepth of 2 attributes the log line to verbosef's caller.
func (f *Finder) verbosef(format string, args ...interface{}) {
	f.logger.Output(2, fmt.Sprintf(format, args...))
}
346
347// loadFromFilesystem populates the in-memory cache based on the contents of the filesystem
348func (f *Finder) loadFromFilesystem() {
349 f.threadPool = newThreadPool(f.numDbLoadingThreads)
350
351 err := f.startFromExternalCache()
352 if err != nil {
353 f.startWithoutExternalCache()
354 }
355
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700356 f.threadPool = nil
357}
358
359func (f *Finder) startFind(path string) {
360 if !filepath.IsAbs(path) {
361 path = filepath.Join(f.cacheMetadata.Config.WorkingDirectory, path)
362 }
363 node := f.nodes.GetNode(path, true)
364 f.statDirAsync(node)
365}
366
// lock acquires the Finder's mutex, serializing the public query methods
func (f *Finder) lock() {
	f.mutex.Lock()
}
370
// unlock releases the mutex acquired by lock
func (f *Finder) unlock() {
	f.mutex.Unlock()
}
374
// a statResponse is the relevant portion of the response from the filesystem to a Stat call
type statResponse struct {
	// ModTime is the modification time; a value of 0 is treated elsewhere in
	// this file as "directory does not exist"
	ModTime int64
	Inode   uint64
	Device  uint64
}
381
// a pathAndStats stores a path and its stats
type pathAndStats struct {
	statResponse

	Path string
}
388
// a dirFullInfo stores all of the relevant information we know about a directory:
// its path, stats, and contained (relevant) file names
type dirFullInfo struct {
	pathAndStats

	FileNames []string
}
395
// a PersistedDirInfo is the information about a dir that we save to our cache on disk
type PersistedDirInfo struct {
	// These field names are short because they are repeated many times in the output json file
	P string   // path (relative to the PersistedDirs.Root it is grouped under)
	T int64    // modification time
	I uint64   // inode number
	F []string // relevant filenames contained
}
404
// a PersistedDirs is the information that we persist for a group of dirs
// that all live on the same device
type PersistedDirs struct {
	// the device on which each directory is stored
	Device uint64
	// the common root path to which all contained dirs are relative
	Root string
	// the directories themselves
	Dirs []PersistedDirInfo
}
414
// a CacheEntry is the smallest unit that can be read and parsed from the cache (on disk) at a time;
// one CacheEntry corresponds to one newline-delimited json block in the db file
type CacheEntry []PersistedDirs
417
// a DirEntries lists the files and directories contained directly within a specific directory
type DirEntries struct {
	Path string

	// elements of DirNames are just the dir names; they don't include any '/' character
	DirNames []string
	// elements of FileNames are just the file names; they don't include '/' character
	FileNames []string
}
427
// a WalkFunc is the type that is passed into various Find functions for determining which
// directories the caller wishes be walked. The WalkFunc is expected to decide which
// directories to walk and which files to consider as matches to the original query.
// It may be invoked from many goroutines at once and must be threadsafe.
type WalkFunc func(DirEntries) (dirs []string, files []string)
432
// a mapNode stores the relevant stats about a directory to be stored in a pathMap
type mapNode struct {
	statResponse
	// FileNames lists the relevant files directly inside this directory
	FileNames []string
}
438
// a pathMap implements the directory tree structure of nodes.
// Each pathMap represents one directory: its stats, its relevant files,
// and its subdirectories. See the concurrency invariants at the top of this file
// before accessing <children> from multiple goroutines.
type pathMap struct {
	mapNode

	path string

	// children maps a child directory's base name to its node
	children map[string]*pathMap

	// number of descendant nodes, including self (approximate; used to balance
	// parallel searches)
	approximateNumDescendents int
}
450
451func newPathMap(path string) *pathMap {
452 result := &pathMap{path: path, children: make(map[string]*pathMap, 4),
453 approximateNumDescendents: 1}
454 return result
455}
456
457// GetNode returns the node at <path>
458func (m *pathMap) GetNode(path string, createIfNotFound bool) *pathMap {
459 if len(path) > 0 && path[0] == '/' {
460 path = path[1:]
461 }
462
463 node := m
464 for {
465 if path == "" {
466 return node
467 }
468
469 index := strings.Index(path, "/")
470 var firstComponent string
471 if index >= 0 {
472 firstComponent = path[:index]
473 path = path[index+1:]
474 } else {
475 firstComponent = path
476 path = ""
477 }
478
479 child, found := node.children[firstComponent]
480
481 if !found {
482 if createIfNotFound {
483 child = node.newChild(firstComponent)
484 } else {
485 return nil
486 }
487 }
488
489 node = child
490 }
491}
492
493func (m *pathMap) newChild(name string) (child *pathMap) {
494 path := joinCleanPaths(m.path, name)
495 newChild := newPathMap(path)
496 m.children[name] = newChild
497
498 return m.children[name]
499}
500
501func (m *pathMap) UpdateNumDescendents() int {
502 count := 1
503 for _, child := range m.children {
504 count += child.approximateNumDescendents
505 }
506 m.approximateNumDescendents = count
507 return count
508}
509
// UpdateNumDescendentsRecursive recomputes the descendant counts of this whole
// subtree, children-first so each parent sums up-to-date child counts
func (m *pathMap) UpdateNumDescendentsRecursive() {
	for _, child := range m.children {
		child.UpdateNumDescendentsRecursive()
	}
	m.UpdateNumDescendents()
}
516
// MergeIn incorporates the tree rooted at <other> into the tree rooted at <m>:
// children present in both are merged recursively, children only in <other>
// are adopted as-is, and <other>'s stats replace <m>'s whenever <other> has
// real stat data (a nonzero ModTime).
// See the concurrency invariants at the top of this file: only one goroutine
// may access a node's <children> at a time.
func (m *pathMap) MergeIn(other *pathMap) {
	for key, theirs := range other.children {
		ours, found := m.children[key]
		if found {
			ours.MergeIn(theirs)
		} else {
			m.children[key] = theirs
		}
	}
	if other.ModTime != 0 {
		m.mapNode = other.mapNode
	}
	m.UpdateNumDescendents()
}
531
// DumpAll flattens this entire subtree into a list of dirFullInfo records,
// suitable for serialization into the cache db
func (m *pathMap) DumpAll() []dirFullInfo {
	results := []dirFullInfo{}
	m.dumpInto("", &results)
	return results
}
537
// dumpInto appends this node's info to <results> under the name <path>,
// then recurses into its children
func (m *pathMap) dumpInto(path string, results *[]dirFullInfo) {
	*results = append(*results,
		dirFullInfo{
			pathAndStats{statResponse: m.statResponse, Path: path},
			m.FileNames},
	)
	for key, child := range m.children {
		childPath := joinCleanPaths(path, key)
		if len(childPath) == 0 || childPath[0] != '/' {
			// keep recorded paths rooted with a leading '/'
			childPath = "/" + childPath
		}
		child.dumpInto(childPath, results)
	}
}
552
// a semaphore can be locked by up to <capacity> callers at once.
// Each Lock claims a slot in the buffered channel; Unlock releases one.
type semaphore struct {
	pool chan struct{}
}

// newSemaphore returns a semaphore permitting <capacity> concurrent holders.
func newSemaphore(capacity int) *semaphore {
	return &semaphore{pool: make(chan struct{}, capacity)}
}

// Lock blocks until one of the <capacity> slots is free, then claims it.
func (s *semaphore) Lock() {
	s.pool <- struct{}{}
}

// Unlock releases one previously claimed slot.
func (s *semaphore) Unlock() {
	<-s.pool
}
569
570// A threadPool runs goroutines and supports throttling and waiting.
571// Without throttling, Go may exhaust the maximum number of various resources, such as
572// threads or file descriptors, and crash the program.
573type threadPool struct {
574 receivedRequests sync.WaitGroup
575 activeRequests semaphore
576}
577
578func newThreadPool(maxNumConcurrentThreads int) *threadPool {
579 return &threadPool{
580 receivedRequests: sync.WaitGroup{},
581 activeRequests: *newSemaphore(maxNumConcurrentThreads),
582 }
583}
584
// Run requests to run the given function in its own goroutine.
// Run returns immediately; <function> executes once a concurrency slot is free.
func (p *threadPool) Run(function func()) {
	p.receivedRequests.Add(1)
	// If Run() was called from within a goroutine spawned by this threadPool,
	// then we may need to return from Run() before having capacity to actually
	// run <function>.
	//
	// It's possible that the body of <function> contains a statement (such as a syscall)
	// that will cause Go to pin it to a thread, or will contain a statement that uses
	// another resource that is in short supply (such as a file descriptor), so we can't
	// actually run <function> until we have capacity.
	//
	// However, the semaphore used for synchronization is implemented via a channel and
	// shouldn't require a new thread for each access.
	go func() {
		p.activeRequests.Lock()
		function()
		p.activeRequests.Unlock()
		p.receivedRequests.Done()
	}()
}
606
// Wait waits until all goroutines are done, just like sync.WaitGroup's Wait.
// This includes functions that were queued but had not yet acquired a slot
// when Wait was called.
func (p *threadPool) Wait() {
	p.receivedRequests.Wait()
}
611
Jeff Gastonb629e182017-08-14 16:49:18 -0700612type fsErr struct {
613 path string
614 err error
615}
616
617func (e fsErr) String() string {
618 return e.path + ": " + e.err.Error()
619}
620
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700621func (f *Finder) serializeCacheEntry(dirInfos []dirFullInfo) ([]byte, error) {
622 // group each dirFullInfo by its Device, to avoid having to repeat it in the output
623 dirsByDevice := map[uint64][]PersistedDirInfo{}
624 for _, entry := range dirInfos {
625 _, found := dirsByDevice[entry.Device]
626 if !found {
627 dirsByDevice[entry.Device] = []PersistedDirInfo{}
628 }
629 dirsByDevice[entry.Device] = append(dirsByDevice[entry.Device],
630 PersistedDirInfo{P: entry.Path, T: entry.ModTime, I: entry.Inode, F: entry.FileNames})
631 }
632
633 cacheEntry := CacheEntry{}
634
635 for device, infos := range dirsByDevice {
636 // find common prefix
637 prefix := ""
638 if len(infos) > 0 {
639 prefix = infos[0].P
640 }
641 for _, info := range infos {
642 for !strings.HasPrefix(info.P+"/", prefix+"/") {
643 prefix = filepath.Dir(prefix)
644 }
645 }
646 // remove common prefix
647 for i := range infos {
648 suffix := strings.Replace(infos[i].P, prefix, "", 1)
649 if len(suffix) > 0 && suffix[0] == '/' {
650 suffix = suffix[1:]
651 }
652 infos[i].P = suffix
653 }
654
655 // turn the map (keyed by device) into a list of structs with labeled fields
656 // this is to improve readability of the output
657 cacheEntry = append(cacheEntry, PersistedDirs{Device: device, Root: prefix, Dirs: infos})
658 }
659
660 // convert to json.
661 // it would save some space to use a different format than json for the db file,
662 // but the space and time savings are small, and json is easy for humans to read
663 bytes, err := json.Marshal(cacheEntry)
664 return bytes, err
665}
666
667func (f *Finder) parseCacheEntry(bytes []byte) ([]dirFullInfo, error) {
668 var cacheEntry CacheEntry
669 err := json.Unmarshal(bytes, &cacheEntry)
670 if err != nil {
671 return nil, err
672 }
673
674 // convert from a CacheEntry to a []dirFullInfo (by copying a few fields)
675 capacity := 0
676 for _, element := range cacheEntry {
677 capacity += len(element.Dirs)
678 }
679 nodes := make([]dirFullInfo, capacity)
680 count := 0
681 for _, element := range cacheEntry {
682 for _, dir := range element.Dirs {
683 path := joinCleanPaths(element.Root, dir.P)
684
685 nodes[count] = dirFullInfo{
686 pathAndStats: pathAndStats{
687 statResponse: statResponse{
688 ModTime: dir.T, Inode: dir.I, Device: element.Device,
689 },
690 Path: path},
691 FileNames: dir.F}
692 count++
693 }
694 }
695 return nodes, nil
696}
697
// We use the following separator byte to distinguish individually parseable blocks of json
// because we know this separator won't appear in the json that we're parsing.
//
// The newline byte can only appear in a UTF-8 stream if the newline character appears, because:
// - The newline character is encoded as "0000 1010" in binary ("0a" in hex)
// - UTF-8 dictates that bytes beginning with a "0" bit are never emitted as part of a multibyte
//   character.
//
// We know that the newline character will never appear in our json string, because:
// - If a newline character appears as part of a data string, then json encoding will
//   emit two characters instead: '\' and 'n'.
// - The json encoder that we use doesn't emit the optional newlines between any of its
//   other outputs.
//
// lineSeparator delimits both the two header lines and each json block of the
// db file (see readLine and validateCacheHeader).
const lineSeparator = byte('\n')
712
// readLine reads from <reader> up to and including the next block separator,
// returning the bytes read (the trailing separator included, if one was found)
func (f *Finder) readLine(reader *bufio.Reader) ([]byte, error) {
	return reader.ReadBytes(lineSeparator)
}
716
717// validateCacheHeader reads the cache header from cacheReader and tells whether the cache is compatible with this Finder
718func (f *Finder) validateCacheHeader(cacheReader *bufio.Reader) bool {
719 cacheVersionBytes, err := f.readLine(cacheReader)
720 if err != nil {
721 f.verbosef("Failed to read database header; database is invalid\n")
722 return false
723 }
724 if len(cacheVersionBytes) > 0 && cacheVersionBytes[len(cacheVersionBytes)-1] == lineSeparator {
725 cacheVersionBytes = cacheVersionBytes[:len(cacheVersionBytes)-1]
726 }
727 cacheVersionString := string(cacheVersionBytes)
728 currentVersion := f.cacheMetadata.Version
729 if cacheVersionString != currentVersion {
730 f.verbosef("Version changed from %q to %q, database is not applicable\n", cacheVersionString, currentVersion)
731 return false
732 }
733
734 cacheParamBytes, err := f.readLine(cacheReader)
735 if err != nil {
736 f.verbosef("Failed to read database search params; database is invalid\n")
737 return false
738 }
739
740 if len(cacheParamBytes) > 0 && cacheParamBytes[len(cacheParamBytes)-1] == lineSeparator {
741 cacheParamBytes = cacheParamBytes[:len(cacheParamBytes)-1]
742 }
743
744 currentParamBytes, err := f.cacheMetadata.Config.Dump()
745 if err != nil {
746 panic("Finder failed to serialize its parameters")
747 }
748 cacheParamString := string(cacheParamBytes)
749 currentParamString := string(currentParamBytes)
750 if cacheParamString != currentParamString {
751 f.verbosef("Params changed from %q to %q, database is not applicable\n", cacheParamString, currentParamString)
752 return false
753 }
754 return true
755}
756
757// loadBytes compares the cache info in <data> to the state of the filesystem
758// loadBytes returns a map representing <data> and also a slice of dirs that need to be re-walked
759func (f *Finder) loadBytes(id int, data []byte) (m *pathMap, dirsToWalk []string, err error) {
760
761 helperStartTime := time.Now()
762
763 cachedNodes, err := f.parseCacheEntry(data)
764 if err != nil {
765 return nil, nil, fmt.Errorf("Failed to parse block %v: %v\n", id, err.Error())
766 }
767
768 unmarshalDate := time.Now()
769 f.verbosef("Unmarshaled %v objects for %v in %v\n",
770 len(cachedNodes), id, unmarshalDate.Sub(helperStartTime))
771
772 tempMap := newPathMap("/")
773 stats := make([]statResponse, len(cachedNodes))
774
775 for i, node := range cachedNodes {
776 // check the file system for an updated timestamp
777 stats[i] = f.statDirSync(node.Path)
778 }
779
780 dirsToWalk = []string{}
781 for i, cachedNode := range cachedNodes {
782 updated := stats[i]
783 // save the cached value
784 container := tempMap.GetNode(cachedNode.Path, true)
785 container.mapNode = mapNode{statResponse: updated}
786
787 // if the metadata changed and the directory still exists, then
788 // make a note to walk it later
789 if !f.isInfoUpToDate(cachedNode.statResponse, updated) && updated.ModTime != 0 {
790 f.setModified()
791 // make a note that the directory needs to be walked
792 dirsToWalk = append(dirsToWalk, cachedNode.Path)
793 } else {
794 container.mapNode.FileNames = cachedNode.FileNames
795 }
796 }
797 // count the number of nodes to improve our understanding of the shape of the tree,
798 // thereby improving parallelism of subsequent searches
799 tempMap.UpdateNumDescendentsRecursive()
800
801 f.verbosef("Statted inodes of block %v in %v\n", id, time.Now().Sub(unmarshalDate))
802 return tempMap, dirsToWalk, nil
803}
804
805// startFromExternalCache loads the cache database from disk
806// startFromExternalCache waits to return until the load of the cache db is complete, but
807// startFromExternalCache does not wait for all every listDir() or statDir() request to complete
808func (f *Finder) startFromExternalCache() (err error) {
809 startTime := time.Now()
810 dbPath := f.DbPath
811
812 // open cache file and validate its header
813 reader, err := f.filesystem.Open(dbPath)
814 if err != nil {
815 return errors.New("No data to load from database\n")
816 }
817 bufferedReader := bufio.NewReader(reader)
818 if !f.validateCacheHeader(bufferedReader) {
819 return errors.New("Cache header does not match")
820 }
821 f.verbosef("Database header matches, will attempt to use database %v\n", f.DbPath)
822
823 // read the file and spawn threads to process it
824 nodesToWalk := [][]*pathMap{}
825 mainTree := newPathMap("/")
826
827 // read the blocks and stream them into <blockChannel>
828 type dataBlock struct {
829 id int
830 err error
831 data []byte
832 }
833 blockChannel := make(chan dataBlock, f.numDbLoadingThreads)
834 readBlocks := func() {
835 index := 0
836 for {
837 // It takes some time to unmarshal the input from json, so we want
838 // to unmarshal it in parallel. In order to find valid places to
839 // break the input, we scan for the line separators that we inserted
840 // (for this purpose) when we dumped the database.
841 data, err := f.readLine(bufferedReader)
842 var response dataBlock
843 done := false
844 if err != nil && err != io.EOF {
845 response = dataBlock{id: index, err: err, data: nil}
846 done = true
847 } else {
848 done = (err == io.EOF)
849 response = dataBlock{id: index, err: nil, data: data}
850 }
851 blockChannel <- response
852 index++
853 duration := time.Since(startTime)
854 f.verbosef("Read block %v after %v\n", index, duration)
855 if done {
856 f.verbosef("Read %v blocks in %v\n", index, duration)
857 close(blockChannel)
858 return
859 }
860 }
861 }
862 go readBlocks()
863
864 // Read from <blockChannel> and stream the responses into <resultChannel>.
865 type workResponse struct {
866 id int
867 err error
868 tree *pathMap
869 updatedDirs []string
870 }
871 resultChannel := make(chan workResponse)
872 processBlocks := func() {
873 numProcessed := 0
874 threadPool := newThreadPool(f.numDbLoadingThreads)
875 for {
876 // get a block to process
877 block, received := <-blockChannel
878 if !received {
879 break
880 }
881
882 if block.err != nil {
883 resultChannel <- workResponse{err: block.err}
884 break
885 }
886 numProcessed++
887 // wait until there is CPU available to process it
888 threadPool.Run(
889 func() {
890 processStartTime := time.Now()
891 f.verbosef("Starting to process block %v after %v\n",
892 block.id, processStartTime.Sub(startTime))
893 tempMap, updatedDirs, err := f.loadBytes(block.id, block.data)
894 var response workResponse
895 if err != nil {
896 f.verbosef(
897 "Block %v failed to parse with error %v\n",
898 block.id, err)
899 response = workResponse{err: err}
900 } else {
901 response = workResponse{
902 id: block.id,
903 err: nil,
904 tree: tempMap,
905 updatedDirs: updatedDirs,
906 }
907 }
908 f.verbosef("Processed block %v in %v\n",
909 block.id, time.Since(processStartTime),
910 )
911 resultChannel <- response
912 },
913 )
914 }
915 threadPool.Wait()
916 f.verbosef("Finished processing %v blocks in %v\n",
917 numProcessed, time.Since(startTime))
918 close(resultChannel)
919 }
920 go processBlocks()
921
922 // Read from <resultChannel> and use the results
923 combineResults := func() (err error) {
924 for {
925 result, received := <-resultChannel
926 if !received {
927 break
928 }
929 if err != nil {
930 // In case of an error, wait for work to complete before
931 // returning the error. This ensures that any subsequent
932 // work doesn't need to compete for resources (and possibly
933 // fail due to, for example, a filesystem limit on the number of
934 // concurrently open files) with past work.
935 continue
936 }
937 if result.err != nil {
938 err = result.err
939 continue
940 }
941 // update main tree
942 mainTree.MergeIn(result.tree)
943 // record any new directories that we will need to Stat()
944 updatedNodes := make([]*pathMap, len(result.updatedDirs))
945 for j, dir := range result.updatedDirs {
946 node := mainTree.GetNode(dir, false)
947 updatedNodes[j] = node
948 }
949 nodesToWalk = append(nodesToWalk, updatedNodes)
950 }
951 return err
952 }
953 err = combineResults()
954 if err != nil {
955 return err
956 }
957
958 f.nodes = *mainTree
959
960 // after having loaded the entire db and therefore created entries for
961 // the directories we know of, now it's safe to start calling ReadDir on
962 // any updated directories
963 for i := range nodesToWalk {
964 f.listDirsAsync(nodesToWalk[i])
965 }
Jeff Gastonb629e182017-08-14 16:49:18 -0700966 f.verbosef("Loaded db and statted known dirs in %v\n", time.Since(startTime))
967 f.threadPool.Wait()
968 f.verbosef("Loaded db and statted all dirs in %v\n", time.Now().Sub(startTime))
969
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700970 return err
971}
972
973// startWithoutExternalCache starts scanning the filesystem according to the cache config
974// startWithoutExternalCache should be called if startFromExternalCache is not applicable
975func (f *Finder) startWithoutExternalCache() {
Jeff Gastonb629e182017-08-14 16:49:18 -0700976 startTime := time.Now()
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700977 configDirs := f.cacheMetadata.Config.RootDirs
978
979 // clean paths
980 candidates := make([]string, len(configDirs))
981 for i, dir := range configDirs {
982 candidates[i] = filepath.Clean(dir)
983 }
984 // remove duplicates
985 dirsToScan := make([]string, 0, len(configDirs))
986 for _, candidate := range candidates {
987 include := true
988 for _, included := range dirsToScan {
989 if included == "/" || strings.HasPrefix(candidate+"/", included+"/") {
990 include = false
991 break
992 }
993 }
994 if include {
995 dirsToScan = append(dirsToScan, candidate)
996 }
997 }
998
999 // start searching finally
1000 for _, path := range dirsToScan {
1001 f.verbosef("Starting find of %v\n", path)
1002 f.startFind(path)
1003 }
Jeff Gastonb629e182017-08-14 16:49:18 -07001004
1005 f.threadPool.Wait()
1006
1007 f.verbosef("Scanned filesystem (not using cache) in %v\n", time.Now().Sub(startTime))
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001008}
1009
1010// isInfoUpToDate tells whether <new> can confirm that results computed at <old> are still valid
1011func (f *Finder) isInfoUpToDate(old statResponse, new statResponse) (equal bool) {
1012 if old.Inode != new.Inode {
1013 return false
1014 }
1015 if old.ModTime != new.ModTime {
1016 return false
1017 }
1018 if old.Device != new.Device {
1019 return false
1020 }
1021 return true
1022}
1023
1024func (f *Finder) wasModified() bool {
1025 return atomic.LoadInt32(&f.modifiedFlag) > 0
1026}
1027
1028func (f *Finder) setModified() {
1029 var newVal int32
1030 newVal = 1
1031 atomic.StoreInt32(&f.modifiedFlag, newVal)
1032}
1033
1034// sortedDirEntries exports directory entries to facilitate dumping them to the external cache
1035func (f *Finder) sortedDirEntries() []dirFullInfo {
1036 startTime := time.Now()
1037 nodes := make([]dirFullInfo, 0)
1038 for _, node := range f.nodes.DumpAll() {
1039 if node.ModTime != 0 {
1040 nodes = append(nodes, node)
1041 }
1042 }
1043 discoveryDate := time.Now()
1044 f.verbosef("Generated %v cache entries in %v\n", len(nodes), discoveryDate.Sub(startTime))
1045 less := func(i int, j int) bool {
1046 return nodes[i].Path < nodes[j].Path
1047 }
1048 sort.Slice(nodes, less)
1049 sortDate := time.Now()
1050 f.verbosef("Sorted %v cache entries in %v\n", len(nodes), sortDate.Sub(discoveryDate))
1051
1052 return nodes
1053}
1054
// serializeDb converts the cache database into a form to save to disk.
// The output is a header (version line + config dump) followed by numBlocks
// independently-parseable blocks, all joined by lineSeparator, so that the
// next execution can parse the blocks in parallel.
func (f *Finder) serializeDb() ([]byte, error) {
	// sort dir entries
	var entryList = f.sortedDirEntries()

	// Generate an output file that can be conveniently loaded using the same number of threads
	// as were used in this execution (because presumably that will be the number of threads
	// used in the next execution too)

	// generate header
	header := []byte{}
	header = append(header, []byte(f.cacheMetadata.Version)...)
	header = append(header, lineSeparator)
	configDump, err := f.cacheMetadata.Config.Dump()
	if err != nil {
		return nil, err
	}
	header = append(header, configDump...)

	// serialize individual blocks in parallel
	// (cap the block count so no block is empty)
	numBlocks := f.numDbLoadingThreads
	if numBlocks > len(entryList) {
		numBlocks = len(entryList)
	}
	blocks := make([][]byte, 1+numBlocks)
	blocks[0] = header
	blockMin := 0
	wg := sync.WaitGroup{}
	// errLock guards the workers' writes to the shared <err>; the read after
	// wg.Wait() below needs no lock because Wait establishes a happens-before
	// edge with every worker.
	var errLock sync.Mutex

	for i := 1; i <= numBlocks; i++ {
		// identify next block
		// (block i ends at len(entryList)*i/numBlocks, so block sizes differ by
		// at most one and the final block ends exactly at len(entryList))
		blockMax := len(entryList) * i / numBlocks
		block := entryList[blockMin:blockMax]

		// process block
		wg.Add(1)
		go func(index int, block []dirFullInfo) {
			byteBlock, subErr := f.serializeCacheEntry(block)
			f.verbosef("Serialized block %v into %v bytes\n", index, len(byteBlock))
			if subErr != nil {
				f.verbosef("%v\n", subErr.Error())
				errLock.Lock()
				// if several blocks fail, the last writer wins; any one error
				// is enough to abort the dump
				err = subErr
				errLock.Unlock()
			} else {
				// each worker writes a distinct slice element, so no lock needed
				blocks[index] = byteBlock
			}
			wg.Done()
		}(i, block)

		blockMin = blockMax
	}

	wg.Wait()

	if err != nil {
		return nil, err
	}

	content := bytes.Join(blocks, []byte{lineSeparator})

	return content, nil
}
1119
1120// dumpDb saves the cache database to disk
1121func (f *Finder) dumpDb() error {
1122 startTime := time.Now()
1123 f.verbosef("Dumping db\n")
1124
1125 tempPath := f.DbPath + ".tmp"
1126
1127 bytes, err := f.serializeDb()
1128 if err != nil {
1129 return err
1130 }
1131 serializeDate := time.Now()
1132 f.verbosef("Serialized db in %v\n", serializeDate.Sub(startTime))
1133 // dump file and atomically move
1134 err = f.filesystem.WriteFile(tempPath, bytes, 0777)
1135 if err != nil {
1136 return err
1137 }
1138 err = f.filesystem.Rename(tempPath, f.DbPath)
1139 if err != nil {
1140 return err
1141 }
1142
1143 f.verbosef("Wrote db in %v\n", time.Now().Sub(serializeDate))
1144 return nil
Jeff Gastonb629e182017-08-14 16:49:18 -07001145
1146}
1147
1148// canIgnoreFsErr checks for certain classes of filesystem errors that are safe to ignore
1149func (f *Finder) canIgnoreFsErr(err error) bool {
1150 pathErr, isPathErr := err.(*os.PathError)
1151 if !isPathErr {
1152 // Don't recognize this error
1153 return false
1154 }
1155 if pathErr.Err == os.ErrPermission {
1156 // Permission errors are ignored:
1157 // https://issuetracker.google.com/37553659
1158 // https://github.com/google/kati/pull/116
1159 return true
1160 }
1161 if pathErr.Err == os.ErrNotExist {
1162 // If a directory doesn't exist, that generally means the cache is out-of-date
1163 return true
1164 }
1165 // Don't recognize this error
1166 return false
1167}
1168
1169// onFsError should be called whenever a potentially fatal error is returned from a filesystem call
1170func (f *Finder) onFsError(path string, err error) {
1171 if !f.canIgnoreFsErr(err) {
1172 // We could send the errors through a channel instead, although that would cause this call
1173 // to block unless we preallocated a sufficient buffer or spawned a reader thread.
1174 // Although it wouldn't be too complicated to spawn a reader thread, it's still slightly
1175 // more convenient to use a lock. Only in an unusual situation should this code be
1176 // invoked anyway.
1177 f.errlock.Lock()
1178 f.fsErrs = append(f.fsErrs, fsErr{path: path, err: err})
1179 f.errlock.Unlock()
1180 }
1181}
1182
1183// discardErrsForPrunedPaths removes any errors for paths that are no longer included in the cache
1184func (f *Finder) discardErrsForPrunedPaths() {
1185 // This function could be somewhat inefficient due to being single-threaded,
1186 // but the length of f.fsErrs should be approximately 0, so it shouldn't take long anyway.
1187 relevantErrs := make([]fsErr, 0, len(f.fsErrs))
1188 for _, fsErr := range f.fsErrs {
1189 path := fsErr.path
1190 node := f.nodes.GetNode(path, false)
1191 if node != nil {
1192 // The path in question wasn't pruned due to a failure to process a parent directory.
1193 // So, the failure to process this path is important
1194 relevantErrs = append(relevantErrs, fsErr)
1195 }
1196 }
1197 f.fsErrs = relevantErrs
1198}
1199
1200// getErr returns an error based on previous calls to onFsErr, if any
1201func (f *Finder) getErr() error {
1202 f.discardErrsForPrunedPaths()
1203
1204 numErrs := len(f.fsErrs)
1205 if numErrs < 1 {
1206 return nil
1207 }
1208
1209 maxNumErrsToInclude := 10
1210 message := ""
1211 if numErrs > maxNumErrsToInclude {
1212 message = fmt.Sprintf("finder encountered %v errors: %v...", numErrs, f.fsErrs[:maxNumErrsToInclude])
1213 } else {
1214 message = fmt.Sprintf("finder encountered %v errors: %v", numErrs, f.fsErrs)
1215 }
1216
1217 return errors.New(message)
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001218}
1219
// statDirAsync queues a task that re-stats <dir>; if the stats changed, it
// resets the node's cached contents and queues a re-listing of the directory.
func (f *Finder) statDirAsync(dir *pathMap) {
	node := dir
	path := dir.path
	f.threadPool.Run(
		func() {
			updatedStats := f.statDirSync(path)

			if !f.isInfoUpToDate(node.statResponse, updatedStats) {
				// stats changed, so any previously cached listing is stale:
				// keep the fresh stats and clear the file list
				node.mapNode = mapNode{
					statResponse: updatedStats,
					FileNames:    []string{},
				}
				f.setModified()
				// note: node.statResponse was just reassigned above, so this
				// reads the *new* ModTime (0 means the dir is missing/unstattable)
				if node.statResponse.ModTime != 0 {
					// modification time was updated, so re-scan for
					// child directories
					f.listDirAsync(dir)
				}
			}
		},
	)
}
1242
1243func (f *Finder) statDirSync(path string) statResponse {
1244
1245 fileInfo, err := f.filesystem.Lstat(path)
1246
1247 var stats statResponse
1248 if err != nil {
Jeff Gastonb629e182017-08-14 16:49:18 -07001249 // possibly record this error
1250 f.onFsError(path, err)
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001251 // in case of a failure to stat the directory, treat the directory as missing (modTime = 0)
1252 return stats
1253 }
1254 modTime := fileInfo.ModTime()
1255 stats = statResponse{}
1256 inode, err := f.filesystem.InodeNumber(fileInfo)
1257 if err != nil {
1258 panic(fmt.Sprintf("Could not get inode number of %v: %v\n", path, err.Error()))
1259 }
1260 stats.Inode = inode
1261 device, err := f.filesystem.DeviceNumber(fileInfo)
1262 if err != nil {
1263 panic(fmt.Sprintf("Could not get device number of %v: %v\n", path, err.Error()))
1264 }
1265 stats.Device = device
1266 permissionsChangeTime, err := f.filesystem.PermTime(fileInfo)
1267
1268 if err != nil {
1269 panic(fmt.Sprintf("Could not get permissions modification time (CTime) of %v: %v\n", path, err.Error()))
1270 }
1271 // We're only interested in knowing whether anything about the directory
1272 // has changed since last check, so we use the latest of the two
1273 // modification times (content modification (mtime) and
1274 // permission modification (ctime))
1275 if permissionsChangeTime.After(modTime) {
1276 modTime = permissionsChangeTime
1277 }
1278 stats.ModTime = modTime.UnixNano()
1279
1280 return stats
1281}
1282
1283// pruneCacheCandidates removes the items that we don't want to include in our persistent cache
1284func (f *Finder) pruneCacheCandidates(items *DirEntries) {
1285
1286 for _, fileName := range items.FileNames {
1287 for _, abortedName := range f.cacheMetadata.Config.PruneFiles {
1288 if fileName == abortedName {
1289 items.FileNames = []string{}
1290 items.DirNames = []string{}
1291 return
1292 }
1293 }
1294 }
1295
1296 // remove any files that aren't the ones we want to include
1297 writeIndex := 0
1298 for _, fileName := range items.FileNames {
1299 // include only these files
1300 for _, includedName := range f.cacheMetadata.Config.IncludeFiles {
1301 if fileName == includedName {
1302 items.FileNames[writeIndex] = fileName
1303 writeIndex++
1304 break
1305 }
1306 }
1307 }
1308 // resize
1309 items.FileNames = items.FileNames[:writeIndex]
1310
1311 writeIndex = 0
1312 for _, dirName := range items.DirNames {
1313 items.DirNames[writeIndex] = dirName
1314 // ignore other dirs that are known to not be inputs to the build process
1315 include := true
1316 for _, excludedName := range f.cacheMetadata.Config.ExcludeDirs {
1317 if dirName == excludedName {
1318 // don't include
1319 include = false
1320 break
1321 }
1322 }
1323 if include {
1324 writeIndex++
1325 }
1326 }
1327 // resize
1328 items.DirNames = items.DirNames[:writeIndex]
1329}
1330
1331func (f *Finder) listDirsAsync(nodes []*pathMap) {
1332 f.threadPool.Run(
1333 func() {
1334 for i := range nodes {
1335 f.listDirSync(nodes[i])
1336 }
1337 },
1338 )
1339}
1340
1341func (f *Finder) listDirAsync(node *pathMap) {
1342 f.threadPool.Run(
1343 func() {
1344 f.listDirSync(node)
1345 },
1346 )
1347}
1348
// listDirSync reads the contents of <dir> from the filesystem and updates the
// node in place: its file list, and one child *pathMap per relevant
// subdirectory. Newly discovered subdirectories are queued to be statted
// asynchronously; previously known ones already have a Stat pending.
func (f *Finder) listDirSync(dir *pathMap) {
	path := dir.path
	children, err := f.filesystem.ReadDir(path)

	if err != nil {
		// possibly record this error
		f.onFsError(path, err)
		// if listing the contents of the directory fails (presumably due to
		// permission denied), then treat the directory as empty
		children = []os.FileInfo{}
	}

	var subdirs []string
	var subfiles []string

	// split entries into subdirectories and files
	for _, child := range children {
		linkBits := child.Mode() & os.ModeSymlink
		isLink := linkBits != 0
		if child.IsDir() {
			if !isLink {
				// Skip symlink dirs.
				// We don't have to support symlink dirs because
				// that would cause duplicates.
				subdirs = append(subdirs, child.Name())
			}
		} else {
			// We do have to support symlink files because the link name might be
			// different than the target name
			// (for example, Android.bp -> build/soong/root.bp)
			subfiles = append(subfiles, child.Name())
		}

	}
	parentNode := dir

	// apply the cache config's include/exclude/prune rules before storing
	entry := &DirEntries{Path: path, DirNames: subdirs, FileNames: subfiles}
	f.pruneCacheCandidates(entry)

	// create a pathMap node for each relevant subdirectory
	relevantChildren := map[string]*pathMap{}
	for _, subdirName := range entry.DirNames {
		childNode, found := parentNode.children[subdirName]
		// if we already knew of this directory, then we already have a request pending to Stat it
		// if we didn't already know of this directory, then we must Stat it now
		if !found {
			childNode = parentNode.newChild(subdirName)
			f.statDirAsync(childNode)
		}
		relevantChildren[subdirName] = childNode
	}
	// Note that in rare cases, it's possible that we're reducing the set of
	// children via this statement, if these are all true:
	// 1. we previously had a cache that knew about subdirectories of parentNode
	// 2. the user created a prune-file (described in pruneCacheCandidates)
	//    inside <parentNode>, which specifies that the contents of parentNode
	//    are to be ignored.
	// The fact that it's possible to remove children here means that *pathMap structs
	// must not be looked up from f.nodes by filepath (and instead must be accessed by
	// direct pointer) until after every listDirSync completes
	parentNode.FileNames = entry.FileNames
	parentNode.children = relevantChildren

}
1412
1413// listMatches takes a node and a function that specifies which subdirectories and
1414// files to include, and listMatches returns the matches
1415func (f *Finder) listMatches(node *pathMap,
1416 filter WalkFunc) (subDirs []*pathMap, filePaths []string) {
1417 entries := DirEntries{
1418 FileNames: node.FileNames,
1419 }
1420 entries.DirNames = make([]string, 0, len(node.children))
1421 for childName := range node.children {
1422 entries.DirNames = append(entries.DirNames, childName)
1423 }
1424
1425 dirNames, fileNames := filter(entries)
1426
1427 subDirs = []*pathMap{}
1428 filePaths = make([]string, 0, len(fileNames))
1429 for _, fileName := range fileNames {
1430 filePaths = append(filePaths, joinCleanPaths(node.path, fileName))
1431 }
1432 subDirs = make([]*pathMap, 0, len(dirNames))
1433 for _, childName := range dirNames {
1434 child, ok := node.children[childName]
1435 if ok {
1436 subDirs = append(subDirs, child)
1437 }
1438 }
1439
1440 return subDirs, filePaths
1441}
1442
1443// findInCacheMultithreaded spawns potentially multiple goroutines with which to search the cache.
1444func (f *Finder) findInCacheMultithreaded(node *pathMap, filter WalkFunc,
1445 approxNumThreads int) []string {
1446
1447 if approxNumThreads < 2 {
1448 // Done spawning threads; process remaining directories
1449 return f.findInCacheSinglethreaded(node, filter)
1450 }
1451
1452 totalWork := 0
1453 for _, child := range node.children {
1454 totalWork += child.approximateNumDescendents
1455 }
1456 childrenResults := make(chan []string, len(node.children))
1457
1458 subDirs, filePaths := f.listMatches(node, filter)
1459
1460 // process child directories
1461 for _, child := range subDirs {
1462 numChildThreads := approxNumThreads * child.approximateNumDescendents / totalWork
1463 childProcessor := func(child *pathMap) {
1464 childResults := f.findInCacheMultithreaded(child, filter, numChildThreads)
1465 childrenResults <- childResults
1466 }
1467 // If we're allowed to use more than 1 thread to process this directory,
1468 // then instead we use 1 thread for each subdirectory.
1469 // It would be strange to spawn threads for only some subdirectories.
1470 go childProcessor(child)
1471 }
1472
1473 // collect results
1474 for i := 0; i < len(subDirs); i++ {
1475 childResults := <-childrenResults
1476 filePaths = append(filePaths, childResults...)
1477 }
1478 close(childrenResults)
1479
1480 return filePaths
1481}
1482
1483// findInCacheSinglethreaded synchronously searches the cache for all matching file paths
1484// note findInCacheSinglethreaded runs 2X to 4X as fast by being iterative rather than recursive
1485func (f *Finder) findInCacheSinglethreaded(node *pathMap, filter WalkFunc) []string {
1486 if node == nil {
1487 return []string{}
1488 }
1489
1490 nodes := []*pathMap{node}
1491 matches := []string{}
1492
1493 for len(nodes) > 0 {
1494 currentNode := nodes[0]
1495 nodes = nodes[1:]
1496
1497 subDirs, filePaths := f.listMatches(currentNode, filter)
1498
1499 nodes = append(nodes, subDirs...)
1500
1501 matches = append(matches, filePaths...)
1502 }
1503 return matches
1504}