blob: c2df928925298035e65ffc78fceaae930ad0323e [file] [log] [blame]
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001// Copyright 2017 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package finder
16
import (
	"bufio"
	"bytes"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"runtime"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"android/soong/fs"
)
35
36// This file provides a Finder struct that can quickly search for files satisfying
37// certain criteria.
38// This Finder gets its speed partially from parallelism and partially from caching.
39// If a Stat call returns the same result as last time, then it means Finder
40// can skip the ReadDir call for that dir.
41
42// The primary data structure used by the finder is the field Finder.nodes ,
43// which is a tree of nodes of type *pathMap .
44// Each node represents a directory on disk, along with its stats, subdirectories,
45// and contained files.
46
47// The common use case for the Finder is that the caller creates a Finder and gives
48// it the same query that was given to it in the previous execution.
49// In this situation, the major events that take place are:
50// 1. The Finder begins to load its db
51// 2. The Finder begins to stat the directories mentioned in its db (using multiple threads)
52// Calling Stat on each of these directories is generally a large fraction of the total time
53// 3. The Finder begins to construct a separate tree of nodes in each of its threads
54// 4. The Finder merges the individual node trees into the main node tree
55// 5. The Finder may call ReadDir a few times if there are a few directories that are out-of-date
56// These ReadDir calls might prompt additional Stat calls, etc
57// 6. The Finder waits for all loading to complete
58// 7. The Finder searches the cache for files matching the user's query (using multiple threads)
59
60// These are the invariants regarding concurrency:
61// 1. The public methods of Finder are threadsafe.
62// The public methods are only performance-optimized for one caller at a time, however.
63// For the moment, multiple concurrent callers shouldn't expect any better performance than
64// multiple serial callers.
65// 2. While building the node tree, only one thread may ever access the <children> collection of a
66// *pathMap at once.
67// a) The thread that accesses the <children> collection is the thread that discovers the
68// children (by reading them from the cache or by having received a response to ReadDir).
69// 1) Consequently, the thread that discovers the children also spawns requests to stat
70// subdirs.
71// b) Consequently, while building the node tree, no thread may do a lookup of its
72// *pathMap via filepath because another thread may be adding children to the
73// <children> collection of an ancestor node. Additionally, in rare cases, another thread
74// may be removing children from an ancestor node if the children were only discovered to
75// be irrelevant after calling ReadDir (which happens if a prune-file was just added).
76// 3. No query will begin to be serviced until all loading (both reading the db
77// and scanning the filesystem) is complete.
78// Tests indicate that it only takes about 10% as long to search the in-memory cache as to
79// generate it, making this not a huge loss in performance.
80// 4. The parsing of the db and the initial setup of the pathMap tree must complete before
81// beginning to call listDirSync (because listDirSync can create new entries in the pathMap)
82
83// see cmd/finder.go or finder_test.go for usage examples
84
// versionString is written as the first line of the cache db.
// Update versionString whenever making a backwards-incompatible change to the cache file format.
const versionString = "Android finder version 1"
87
// a CacheParams specifies which files and directories the user wishes be scanned and
// potentially added to the cache
type CacheParams struct {
	// WorkingDirectory is used as a base for any relative file paths given to the Finder
	WorkingDirectory string

	// RootDirs are the root directories used to initiate the search
	RootDirs []string

	// ExcludeDirs are directory names that if encountered are removed from the search
	ExcludeDirs []string

	// PruneFiles are file names that if encountered prune their entire directory
	// (including siblings)
	PruneFiles []string

	// IncludeFiles are file names to include as matches
	IncludeFiles []string
}
107
// a cacheConfig stores the inputs that determine what should be included in the cache.
// It is serialized into the cache db header (see Dump) so that a stale or foreign
// cache can be detected and discarded on load.
type cacheConfig struct {
	CacheParams

	// FilesystemView is a unique identifier telling which parts of which file systems
	// are readable by the Finder. In practice its value is essentially username@hostname.
	// FilesystemView is set to ensure that a cache file copied to another host or
	// found by another user doesn't inadvertently get reused.
	FilesystemView string
}
118
119func (p *cacheConfig) Dump() ([]byte, error) {
120 bytes, err := json.Marshal(p)
121 return bytes, err
122}
123
// a cacheMetadata stores version information about the cache
type cacheMetadata struct {
	// The Version enables the Finder to determine whether it can even parse the file
	// If the version changes, the entire cache file must be regenerated
	Version string

	// The CacheParams enables the Finder to determine whether the parameters match
	// If the CacheParams change, the Finder can choose how much of the cache file to reuse
	// (although in practice, the Finder will probably choose to ignore the entire file anyway)
	Config cacheConfig
}

// Logger is the minimal logging interface the Finder needs; log.Logger satisfies it.
type Logger interface {
	Output(calldepth int, s string) error
}
139
// the Finder is the main struct that callers will want to use
type Finder struct {
	// configuration
	DbPath              string        // location of the on-disk cache db
	numDbLoadingThreads int           // parallelism for loading/validating the db
	numSearchingThreads int           // parallelism for servicing queries
	cacheMetadata       cacheMetadata // version + params that the db header must match
	logger              Logger
	filesystem          fs.FileSystem

	// temporary state (only non-nil/active while loading or dumping)
	threadPool        *threadPool
	mutex             sync.Mutex // guards searches; see lock()/FindMatching
	fsErrs            []fsErr
	errlock           sync.Mutex // guards fsErrs
	shutdownWaitgroup sync.WaitGroup

	// non-temporary state
	modifiedFlag int32 // accessed atomically; nonzero if cache differs from the db on disk
	nodes        pathMap
}
161
162// New creates a new Finder for use
163func New(cacheParams CacheParams, filesystem fs.FileSystem,
Jeff Gastonb629e182017-08-14 16:49:18 -0700164 logger Logger, dbPath string) (f *Finder, err error) {
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700165
166 numThreads := runtime.NumCPU() * 2
167 numDbLoadingThreads := numThreads
168 numSearchingThreads := numThreads
169
170 metadata := cacheMetadata{
171 Version: versionString,
172 Config: cacheConfig{
173 CacheParams: cacheParams,
174 FilesystemView: filesystem.ViewId(),
175 },
176 }
177
Jeff Gastonb629e182017-08-14 16:49:18 -0700178 f = &Finder{
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700179 numDbLoadingThreads: numDbLoadingThreads,
180 numSearchingThreads: numSearchingThreads,
181 cacheMetadata: metadata,
182 logger: logger,
183 filesystem: filesystem,
184
185 nodes: *newPathMap("/"),
186 DbPath: dbPath,
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700187
188 shutdownWaitgroup: sync.WaitGroup{},
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700189 }
190
Jeff Gastonb629e182017-08-14 16:49:18 -0700191 f.loadFromFilesystem()
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700192
Jeff Gastonb629e182017-08-14 16:49:18 -0700193 // check for any filesystem errors
194 err = f.getErr()
195 if err != nil {
196 return nil, err
197 }
198
199 // confirm that every path mentioned in the CacheConfig exists
200 for _, path := range cacheParams.RootDirs {
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700201 if !filepath.IsAbs(path) {
202 path = filepath.Join(f.cacheMetadata.Config.WorkingDirectory, path)
203 }
Jeff Gastonb629e182017-08-14 16:49:18 -0700204 node := f.nodes.GetNode(filepath.Clean(path), false)
205 if node == nil || node.ModTime == 0 {
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700206 return nil, fmt.Errorf("path %v was specified to be included in the cache but does not exist\n", path)
Jeff Gastonb629e182017-08-14 16:49:18 -0700207 }
208 }
209
210 return f, nil
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700211}
212
// FindAll returns every file in the cache
func (f *Finder) FindAll() []string {
	return f.FindAt("/")
}
217
// FindAt returns every cached file under <rootDir>
func (f *Finder) FindAt(rootDir string) []string {
	// keep all dirs (to recurse everywhere) and all files (every file matches)
	filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
		return entries.DirNames, entries.FileNames
	}
	return f.FindMatching(rootDir, filter)
}
225
// FindNamed searches for every cached file named <fileName>
func (f *Finder) FindNamed(fileName string) []string {
	return f.FindNamedAt("/", fileName)
}
230
231// FindNamedAt searches under <rootPath> for every file named <fileName>
232// The reason a caller might use FindNamedAt instead of FindNamed is if they want
233// to limit their search to a subset of the cache
234func (f *Finder) FindNamedAt(rootPath string, fileName string) []string {
235 filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
236 matches := []string{}
237 for _, foundName := range entries.FileNames {
238 if foundName == fileName {
239 matches = append(matches, foundName)
240 }
241 }
242 return entries.DirNames, matches
243
244 }
245 return f.FindMatching(rootPath, filter)
246}
247
// FindFirstNamed searches for every file named <fileName>
// Whenever it finds a match, it stops searching that match's subdirectories
func (f *Finder) FindFirstNamed(fileName string) []string {
	return f.FindFirstNamedAt("/", fileName)
}
253
254// FindFirstNamedAt searches for every file named <fileName>
255// Whenever it finds a match, it stops search subdirectories
256func (f *Finder) FindFirstNamedAt(rootPath string, fileName string) []string {
257 filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
258 matches := []string{}
259 for _, foundName := range entries.FileNames {
260 if foundName == fileName {
261 matches = append(matches, foundName)
262 }
263 }
264
265 if len(matches) > 0 {
266 return []string{}, matches
267 }
268 return entries.DirNames, matches
269 }
270 return f.FindMatching(rootPath, filter)
271}
272
// FindMatching is the most general exported function for searching for files in the cache
// The WalkFunc will be invoked repeatedly and is expected to modify the provided DirEntries
// in place, removing file paths and directories as desired.
// WalkFunc will be invoked potentially many times in parallel, and must be threadsafe.
func (f *Finder) FindMatching(rootPath string, filter WalkFunc) []string {
	// set up some parameters
	scanStart := time.Now()
	var isRel bool
	workingDir := f.cacheMetadata.Config.WorkingDirectory

	// a relative rootPath is interpreted relative to the configured working directory
	isRel = !filepath.IsAbs(rootPath)
	if isRel {
		rootPath = filepath.Join(workingDir, rootPath)
	}

	rootPath = filepath.Clean(rootPath)

	// ensure nothing else is using the Finder
	f.verbosef("FindMatching waiting for finder to be idle\n")
	f.lock()
	defer f.unlock()

	node := f.nodes.GetNode(rootPath, false)
	if node == nil {
		f.verbosef("No data for path %v ; apparently not included in cache params: %v\n",
			rootPath, f.cacheMetadata.Config.CacheParams)
		// path is not found; don't do a search
		return []string{}
	}

	// search for matching files
	f.verbosef("Finder finding %v using cache\n", rootPath)
	results := f.findInCacheMultithreaded(node, filter, f.numSearchingThreads)

	// format and return results
	if isRel {
		// convert the absolute results back to the caller's relative form
		for i := 0; i < len(results); i++ {
			results[i] = strings.Replace(results[i], workingDir+"/", "", 1)
		}
	}
	sort.Strings(results)
	f.verbosef("Found %v files under %v in %v using cache\n",
		len(results), rootPath, time.Since(scanStart))
	return results
}
318
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700319// Shutdown declares that the finder is no longer needed and waits for its cleanup to complete
320// Currently, that only entails waiting for the database dump to complete.
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700321func (f *Finder) Shutdown() {
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700322 f.waitForDbDump()
323}
324
325// End of public api
326
327func (f *Finder) goDumpDb() {
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700328 if f.wasModified() {
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700329 f.shutdownWaitgroup.Add(1)
330 go func() {
331 err := f.dumpDb()
332 if err != nil {
333 f.verbosef("%v\n", err)
334 }
335 f.shutdownWaitgroup.Done()
336 }()
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700337 } else {
338 f.verbosef("Skipping dumping unmodified db\n")
339 }
340}
341
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700342func (f *Finder) waitForDbDump() {
343 f.shutdownWaitgroup.Wait()
344}
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700345
// joinCleanPaths is like filepath.Join but is faster because
// joinCleanPaths doesn't have to support paths ending in "/" or containing "..".
// An empty base yields the leaf unchanged (and vice versa), and a base of "/"
// is concatenated without inserting an extra separator.
func joinCleanPaths(base string, leaf string) string {
	switch {
	case base == "":
		return leaf
	case base == "/":
		return base + leaf
	case leaf == "":
		return base
	default:
		return base + "/" + leaf
	}
}
360
// verbosef formats a message and sends it to the logger.
// The calldepth of 2 attributes the log line to verbosef's caller.
func (f *Finder) verbosef(format string, args ...interface{}) {
	f.logger.Output(2, fmt.Sprintf(format, args...))
}
364
// loadFromFilesystem populates the in-memory cache based on the contents of the filesystem
func (f *Finder) loadFromFilesystem() {
	f.threadPool = newThreadPool(f.numDbLoadingThreads)

	// prefer loading from the existing cache db; fall back to a full scan
	err := f.startFromExternalCache()
	if err != nil {
		f.startWithoutExternalCache()
	}

	// write the (possibly updated) db back to disk in the background
	f.goDumpDb()

	f.threadPool = nil
}
378
// startFind resolves <path> against the working directory, creates a node
// for it, and asynchronously begins statting it.
func (f *Finder) startFind(path string) {
	if !filepath.IsAbs(path) {
		path = filepath.Join(f.cacheMetadata.Config.WorkingDirectory, path)
	}
	node := f.nodes.GetNode(path, true)
	f.statDirAsync(node)
}
386
// lock acquires the Finder's main mutex, which serializes searches (see FindMatching).
func (f *Finder) lock() {
	f.mutex.Lock()
}

// unlock releases the Finder's main mutex.
func (f *Finder) unlock() {
	f.mutex.Unlock()
}
394
// a statResponse is the relevant portion of the response from the filesystem to a Stat call
type statResponse struct {
	ModTime int64 // a ModTime of 0 is used throughout this file to mean "never successfully statted"
	Inode   uint64
	Device  uint64
}

// a pathAndStats stores a path and its stats
type pathAndStats struct {
	statResponse

	Path string
}

// a dirFullInfo stores all of the relevant information we know about a directory
type dirFullInfo struct {
	pathAndStats

	FileNames []string
}
415
// a PersistedDirInfo is the information about a dir that we save to our cache on disk
type PersistedDirInfo struct {
	// These field names are short because they are repeated many times in the output json file
	P string   // path (relative to the enclosing PersistedDirs.Root)
	T int64    // modification time
	I uint64   // inode number
	F []string // relevant filenames contained
}

// a PersistedDirs is the information that we persist for a group of dirs
type PersistedDirs struct {
	// the device on which each directory is stored
	Device uint64
	// the common root path to which all contained dirs are relative
	Root string
	// the directories themselves
	Dirs []PersistedDirInfo
}

// a CacheEntry is the smallest unit that can be read and parsed from the cache (on disk) at a time
type CacheEntry []PersistedDirs
437
// a DirEntries lists the files and directories contained directly within a specific directory
type DirEntries struct {
	Path string

	// elements of DirNames are just the dir names; they don't include any '/' character
	DirNames []string
	// elements of FileNames are just the file names; they don't include '/' character
	FileNames []string
}

// a WalkFunc is the type that is passed into various Find functions for determining which
// directories the caller wishes be walked. The WalkFunc is expected to decide which
// directories to walk and which files to consider as matches to the original query.
type WalkFunc func(DirEntries) (dirs []string, files []string)

// a mapNode stores the relevant stats about a directory to be stored in a pathMap
type mapNode struct {
	statResponse
	FileNames []string
}
458
// a pathMap implements the directory tree structure of nodes
type pathMap struct {
	mapNode

	// absolute path of the directory this node represents
	path string

	// children maps a single path component (no '/') to its subtree
	children map[string]*pathMap

	// number of descendent nodes, including self
	approximateNumDescendents int
}
470
471func newPathMap(path string) *pathMap {
472 result := &pathMap{path: path, children: make(map[string]*pathMap, 4),
473 approximateNumDescendents: 1}
474 return result
475}
476
// GetNode returns the node at <path>, or nil if it does not exist and
// createIfNotFound is false. When createIfNotFound is true, all missing
// intermediate nodes are created along the way.
func (m *pathMap) GetNode(path string, createIfNotFound bool) *pathMap {
	// interpret the path as relative to this node by stripping a leading "/"
	if len(path) > 0 && path[0] == '/' {
		path = path[1:]
	}

	node := m
	for {
		if path == "" {
			return node
		}

		// split off the first path component
		index := strings.Index(path, "/")
		var firstComponent string
		if index >= 0 {
			firstComponent = path[:index]
			path = path[index+1:]
		} else {
			firstComponent = path
			path = ""
		}

		child, found := node.children[firstComponent]

		if !found {
			if createIfNotFound {
				child = node.newChild(firstComponent)
			} else {
				return nil
			}
		}

		node = child
	}
}
512
513func (m *pathMap) newChild(name string) (child *pathMap) {
514 path := joinCleanPaths(m.path, name)
515 newChild := newPathMap(path)
516 m.children[name] = newChild
517
518 return m.children[name]
519}
520
521func (m *pathMap) UpdateNumDescendents() int {
522 count := 1
523 for _, child := range m.children {
524 count += child.approximateNumDescendents
525 }
526 m.approximateNumDescendents = count
527 return count
528}
529
// UpdateNumDescendentsRecursive recomputes descendent counts for this entire
// subtree, children first so each parent sums up-to-date values.
func (m *pathMap) UpdateNumDescendentsRecursive() {
	for _, child := range m.children {
		child.UpdateNumDescendentsRecursive()
	}
	m.UpdateNumDescendents()
}
536
// MergeIn recursively merges <other> into <m>.
// Children present only in <other> are adopted by reference (not copied).
func (m *pathMap) MergeIn(other *pathMap) {
	for key, theirs := range other.children {
		ours, found := m.children[key]
		if found {
			ours.MergeIn(theirs)
		} else {
			m.children[key] = theirs
		}
	}
	// a zero ModTime means <other> never statted this dir, so keep our own stats
	if other.ModTime != 0 {
		m.mapNode = other.mapNode
	}
	m.UpdateNumDescendents()
}
551
// DumpAll flattens the entire subtree into a list of dirFullInfo, one per node.
func (m *pathMap) DumpAll() []dirFullInfo {
	results := []dirFullInfo{}
	m.dumpInto("", &results)
	return results
}

// dumpInto appends this node (recorded at <path>) and all of its descendents
// to <results>.
func (m *pathMap) dumpInto(path string, results *[]dirFullInfo) {
	*results = append(*results,
		dirFullInfo{
			pathAndStats{statResponse: m.statResponse, Path: path},
			m.FileNames},
	)
	for key, child := range m.children {
		childPath := joinCleanPaths(path, key)
		// ensure every recorded path is absolute
		if len(childPath) == 0 || childPath[0] != '/' {
			childPath = "/" + childPath
		}
		child.dumpInto(childPath, results)
	}
}
572
// a semaphore can be locked by up to <capacity> callers at once
type semaphore struct {
	pool chan bool
}

// newSemaphore returns a semaphore permitting up to <capacity> concurrent holders.
func newSemaphore(capacity int) *semaphore {
	return &semaphore{pool: make(chan bool, capacity)}
}

// Lock acquires a slot, blocking while <capacity> other callers already hold one.
func (s *semaphore) Lock() {
	s.pool <- true
}

// Unlock releases a previously acquired slot.
func (s *semaphore) Unlock() {
	<-s.pool
}
589
// A threadPool runs goroutines and supports throttling and waiting.
// Without throttling, Go may exhaust the maximum number of various resources, such as
// threads or file descriptors, and crash the program.
type threadPool struct {
	// receivedRequests tracks every Run call until its function completes (see Wait)
	receivedRequests sync.WaitGroup
	// activeRequests limits how many submitted functions execute concurrently
	activeRequests semaphore
}
597
598func newThreadPool(maxNumConcurrentThreads int) *threadPool {
599 return &threadPool{
600 receivedRequests: sync.WaitGroup{},
601 activeRequests: *newSemaphore(maxNumConcurrentThreads),
602 }
603}
604
// Run requests to run the given function in its own goroutine
func (p *threadPool) Run(function func()) {
	p.receivedRequests.Add(1)
	// If Run() was called from within a goroutine spawned by this threadPool,
	// then we may need to return from Run() before having capacity to actually
	// run <function>.
	//
	// It's possible that the body of <function> contains a statement (such as a syscall)
	// that will cause Go to pin it to a thread, or will contain a statement that uses
	// another resource that is in short supply (such as a file descriptor), so we can't
	// actually run <function> until we have capacity.
	//
	// However, the semaphore used for synchronization is implemented via a channel and
	// shouldn't require a new thread for each access.
	go func() {
		// block until below the concurrency limit, then run
		p.activeRequests.Lock()
		function()
		p.activeRequests.Unlock()
		p.receivedRequests.Done()
	}()
}
626
// Wait waits until all goroutines are done, just like sync.WaitGroup's Wait
func (p *threadPool) Wait() {
	p.receivedRequests.Wait()
}
631
Jeff Gastonb629e182017-08-14 16:49:18 -0700632type fsErr struct {
633 path string
634 err error
635}
636
637func (e fsErr) String() string {
638 return e.path + ": " + e.err.Error()
639}
640
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700641func (f *Finder) serializeCacheEntry(dirInfos []dirFullInfo) ([]byte, error) {
642 // group each dirFullInfo by its Device, to avoid having to repeat it in the output
643 dirsByDevice := map[uint64][]PersistedDirInfo{}
644 for _, entry := range dirInfos {
645 _, found := dirsByDevice[entry.Device]
646 if !found {
647 dirsByDevice[entry.Device] = []PersistedDirInfo{}
648 }
649 dirsByDevice[entry.Device] = append(dirsByDevice[entry.Device],
650 PersistedDirInfo{P: entry.Path, T: entry.ModTime, I: entry.Inode, F: entry.FileNames})
651 }
652
653 cacheEntry := CacheEntry{}
654
655 for device, infos := range dirsByDevice {
656 // find common prefix
657 prefix := ""
658 if len(infos) > 0 {
659 prefix = infos[0].P
660 }
661 for _, info := range infos {
662 for !strings.HasPrefix(info.P+"/", prefix+"/") {
663 prefix = filepath.Dir(prefix)
Jeff Gaston996716a2017-08-22 13:33:19 -0700664 if prefix == "/" {
665 break
666 }
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700667 }
668 }
669 // remove common prefix
670 for i := range infos {
671 suffix := strings.Replace(infos[i].P, prefix, "", 1)
672 if len(suffix) > 0 && suffix[0] == '/' {
673 suffix = suffix[1:]
674 }
675 infos[i].P = suffix
676 }
677
678 // turn the map (keyed by device) into a list of structs with labeled fields
679 // this is to improve readability of the output
680 cacheEntry = append(cacheEntry, PersistedDirs{Device: device, Root: prefix, Dirs: infos})
681 }
682
683 // convert to json.
684 // it would save some space to use a different format than json for the db file,
685 // but the space and time savings are small, and json is easy for humans to read
686 bytes, err := json.Marshal(cacheEntry)
687 return bytes, err
688}
689
690func (f *Finder) parseCacheEntry(bytes []byte) ([]dirFullInfo, error) {
691 var cacheEntry CacheEntry
692 err := json.Unmarshal(bytes, &cacheEntry)
693 if err != nil {
694 return nil, err
695 }
696
697 // convert from a CacheEntry to a []dirFullInfo (by copying a few fields)
698 capacity := 0
699 for _, element := range cacheEntry {
700 capacity += len(element.Dirs)
701 }
702 nodes := make([]dirFullInfo, capacity)
703 count := 0
704 for _, element := range cacheEntry {
705 for _, dir := range element.Dirs {
706 path := joinCleanPaths(element.Root, dir.P)
707
708 nodes[count] = dirFullInfo{
709 pathAndStats: pathAndStats{
710 statResponse: statResponse{
711 ModTime: dir.T, Inode: dir.I, Device: element.Device,
712 },
713 Path: path},
714 FileNames: dir.F}
715 count++
716 }
717 }
718 return nodes, nil
719}
720
// We use the following separator byte to distinguish individually parseable blocks of json
// because we know this separator won't appear in the json that we're parsing.
//
// The newline byte can only appear in a UTF-8 stream if the newline character appears, because:
// - The newline character is encoded as "0000 1010" in binary ("0a" in hex)
// - UTF-8 dictates that bytes beginning with a "0" bit are never emitted as part of a multibyte
//   character.
//
// We know that the newline character will never appear in our json string, because:
// - If a newline character appears as part of a data string, then json encoding will
//   emit two characters instead: '\' and 'n'.
// - The json encoder that we use doesn't emit the optional newlines between any of its
//   other outputs.
const lineSeparator = byte('\n')

// readLine reads from <reader> up to and including the next lineSeparator byte.
func (f *Finder) readLine(reader *bufio.Reader) ([]byte, error) {
	return reader.ReadBytes(lineSeparator)
}
739
// validateCacheHeader reads the cache header from cacheReader and tells whether the cache is compatible with this Finder
// The header consists of two lines: the version string and the serialized cacheConfig;
// both must match this Finder's values exactly for the db to be usable.
func (f *Finder) validateCacheHeader(cacheReader *bufio.Reader) bool {
	cacheVersionBytes, err := f.readLine(cacheReader)
	if err != nil {
		f.verbosef("Failed to read database header; database is invalid\n")
		return false
	}
	// strip the trailing separator that readLine leaves in place
	if len(cacheVersionBytes) > 0 && cacheVersionBytes[len(cacheVersionBytes)-1] == lineSeparator {
		cacheVersionBytes = cacheVersionBytes[:len(cacheVersionBytes)-1]
	}
	cacheVersionString := string(cacheVersionBytes)
	currentVersion := f.cacheMetadata.Version
	if cacheVersionString != currentVersion {
		f.verbosef("Version changed from %q to %q, database is not applicable\n", cacheVersionString, currentVersion)
		return false
	}

	cacheParamBytes, err := f.readLine(cacheReader)
	if err != nil {
		f.verbosef("Failed to read database search params; database is invalid\n")
		return false
	}

	if len(cacheParamBytes) > 0 && cacheParamBytes[len(cacheParamBytes)-1] == lineSeparator {
		cacheParamBytes = cacheParamBytes[:len(cacheParamBytes)-1]
	}

	// compare the db's recorded params against our current config, byte for byte
	currentParamBytes, err := f.cacheMetadata.Config.Dump()
	if err != nil {
		panic("Finder failed to serialize its parameters")
	}
	cacheParamString := string(cacheParamBytes)
	currentParamString := string(currentParamBytes)
	if cacheParamString != currentParamString {
		f.verbosef("Params changed from %q to %q, database is not applicable\n", cacheParamString, currentParamString)
		return false
	}
	return true
}
779
780// loadBytes compares the cache info in <data> to the state of the filesystem
781// loadBytes returns a map representing <data> and also a slice of dirs that need to be re-walked
782func (f *Finder) loadBytes(id int, data []byte) (m *pathMap, dirsToWalk []string, err error) {
783
784 helperStartTime := time.Now()
785
786 cachedNodes, err := f.parseCacheEntry(data)
787 if err != nil {
788 return nil, nil, fmt.Errorf("Failed to parse block %v: %v\n", id, err.Error())
789 }
790
791 unmarshalDate := time.Now()
792 f.verbosef("Unmarshaled %v objects for %v in %v\n",
793 len(cachedNodes), id, unmarshalDate.Sub(helperStartTime))
794
795 tempMap := newPathMap("/")
796 stats := make([]statResponse, len(cachedNodes))
797
798 for i, node := range cachedNodes {
799 // check the file system for an updated timestamp
800 stats[i] = f.statDirSync(node.Path)
801 }
802
803 dirsToWalk = []string{}
804 for i, cachedNode := range cachedNodes {
805 updated := stats[i]
806 // save the cached value
807 container := tempMap.GetNode(cachedNode.Path, true)
808 container.mapNode = mapNode{statResponse: updated}
809
810 // if the metadata changed and the directory still exists, then
811 // make a note to walk it later
812 if !f.isInfoUpToDate(cachedNode.statResponse, updated) && updated.ModTime != 0 {
813 f.setModified()
814 // make a note that the directory needs to be walked
815 dirsToWalk = append(dirsToWalk, cachedNode.Path)
816 } else {
817 container.mapNode.FileNames = cachedNode.FileNames
818 }
819 }
820 // count the number of nodes to improve our understanding of the shape of the tree,
821 // thereby improving parallelism of subsequent searches
822 tempMap.UpdateNumDescendentsRecursive()
823
824 f.verbosef("Statted inodes of block %v in %v\n", id, time.Now().Sub(unmarshalDate))
825 return tempMap, dirsToWalk, nil
826}
827
// startFromExternalCache loads the cache database from disk
// startFromExternalCache waits to return until the load of the cache db is complete, but
// startFromExternalCache does not wait for all every listDir() or statDir() request to complete
//
// The load is a three-stage pipeline, each stage in its own goroutine(s):
//   readBlocks -> blockChannel -> processBlocks -> resultChannel -> combineResults
func (f *Finder) startFromExternalCache() (err error) {
	startTime := time.Now()
	dbPath := f.DbPath

	// open cache file and validate its header
	reader, err := f.filesystem.Open(dbPath)
	if err != nil {
		return errors.New("No data to load from database\n")
	}
	bufferedReader := bufio.NewReader(reader)
	if !f.validateCacheHeader(bufferedReader) {
		return errors.New("Cache header does not match")
	}
	f.verbosef("Database header matches, will attempt to use database %v\n", f.DbPath)

	// read the file and spawn threads to process it
	nodesToWalk := [][]*pathMap{}
	mainTree := newPathMap("/")

	// read the blocks and stream them into <blockChannel>
	type dataBlock struct {
		id   int
		err  error
		data []byte
	}
	blockChannel := make(chan dataBlock, f.numDbLoadingThreads)
	readBlocks := func() {
		index := 0
		for {
			// It takes some time to unmarshal the input from json, so we want
			// to unmarshal it in parallel. In order to find valid places to
			// break the input, we scan for the line separators that we inserted
			// (for this purpose) when we dumped the database.
			data, err := f.readLine(bufferedReader)
			var response dataBlock
			done := false
			if err != nil && err != io.EOF {
				response = dataBlock{id: index, err: err, data: nil}
				done = true
			} else {
				// err is nil or io.EOF; the final (possibly partial) block is
				// still forwarded before shutting down
				done = (err == io.EOF)
				response = dataBlock{id: index, err: nil, data: data}
			}
			blockChannel <- response
			index++
			duration := time.Since(startTime)
			f.verbosef("Read block %v after %v\n", index, duration)
			if done {
				f.verbosef("Read %v blocks in %v\n", index, duration)
				close(blockChannel)
				return
			}
		}
	}
	go readBlocks()

	// Read from <blockChannel> and stream the responses into <resultChannel>.
	type workResponse struct {
		id          int
		err         error
		tree        *pathMap
		updatedDirs []string
	}
	resultChannel := make(chan workResponse)
	processBlocks := func() {
		numProcessed := 0
		threadPool := newThreadPool(f.numDbLoadingThreads)
		for {
			// get a block to process
			block, received := <-blockChannel
			if !received {
				break
			}

			if block.err != nil {
				resultChannel <- workResponse{err: block.err}
				break
			}
			numProcessed++
			// wait until there is CPU available to process it
			threadPool.Run(
				func() {
					processStartTime := time.Now()
					f.verbosef("Starting to process block %v after %v\n",
						block.id, processStartTime.Sub(startTime))
					tempMap, updatedDirs, err := f.loadBytes(block.id, block.data)
					var response workResponse
					if err != nil {
						f.verbosef(
							"Block %v failed to parse with error %v\n",
							block.id, err)
						response = workResponse{err: err}
					} else {
						response = workResponse{
							id:          block.id,
							err:         nil,
							tree:        tempMap,
							updatedDirs: updatedDirs,
						}
					}
					f.verbosef("Processed block %v in %v\n",
						block.id, time.Since(processStartTime),
					)
					resultChannel <- response
				},
			)
		}
		threadPool.Wait()
		f.verbosef("Finished processing %v blocks in %v\n",
			numProcessed, time.Since(startTime))
		close(resultChannel)
	}
	go processBlocks()

	// Read from <resultChannel> and use the results
	combineResults := func() (err error) {
		for {
			result, received := <-resultChannel
			if !received {
				break
			}
			if err != nil {
				// In case of an error, wait for work to complete before
				// returning the error. This ensures that any subsequent
				// work doesn't need to compete for resources (and possibly
				// fail due to, for example, a filesystem limit on the number of
				// concurrently open files) with past work.
				continue
			}
			if result.err != nil {
				err = result.err
				continue
			}
			// update main tree
			mainTree.MergeIn(result.tree)
			// record any new directories that we will need to Stat()
			updatedNodes := make([]*pathMap, len(result.updatedDirs))
			for j, dir := range result.updatedDirs {
				node := mainTree.GetNode(dir, false)
				updatedNodes[j] = node
			}
			nodesToWalk = append(nodesToWalk, updatedNodes)
		}
		return err
	}
	err = combineResults()
	if err != nil {
		return err
	}

	f.nodes = *mainTree

	// after having loaded the entire db and therefore created entries for
	// the directories we know of, now it's safe to start calling ReadDir on
	// any updated directories
	for i := range nodesToWalk {
		f.listDirsAsync(nodesToWalk[i])
	}
	f.verbosef("Loaded db and statted known dirs in %v\n", time.Since(startTime))
	f.threadPool.Wait()
	f.verbosef("Loaded db and statted all dirs in %v\n", time.Now().Sub(startTime))

	return err
}
995
996// startWithoutExternalCache starts scanning the filesystem according to the cache config
997// startWithoutExternalCache should be called if startFromExternalCache is not applicable
998func (f *Finder) startWithoutExternalCache() {
Jeff Gastonb629e182017-08-14 16:49:18 -0700999 startTime := time.Now()
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001000 configDirs := f.cacheMetadata.Config.RootDirs
1001
1002 // clean paths
1003 candidates := make([]string, len(configDirs))
1004 for i, dir := range configDirs {
1005 candidates[i] = filepath.Clean(dir)
1006 }
1007 // remove duplicates
1008 dirsToScan := make([]string, 0, len(configDirs))
1009 for _, candidate := range candidates {
1010 include := true
1011 for _, included := range dirsToScan {
1012 if included == "/" || strings.HasPrefix(candidate+"/", included+"/") {
1013 include = false
1014 break
1015 }
1016 }
1017 if include {
1018 dirsToScan = append(dirsToScan, candidate)
1019 }
1020 }
1021
1022 // start searching finally
1023 for _, path := range dirsToScan {
1024 f.verbosef("Starting find of %v\n", path)
1025 f.startFind(path)
1026 }
Jeff Gastonb629e182017-08-14 16:49:18 -07001027
1028 f.threadPool.Wait()
1029
1030 f.verbosef("Scanned filesystem (not using cache) in %v\n", time.Now().Sub(startTime))
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001031}
1032
1033// isInfoUpToDate tells whether <new> can confirm that results computed at <old> are still valid
1034func (f *Finder) isInfoUpToDate(old statResponse, new statResponse) (equal bool) {
1035 if old.Inode != new.Inode {
1036 return false
1037 }
1038 if old.ModTime != new.ModTime {
1039 return false
1040 }
1041 if old.Device != new.Device {
1042 return false
1043 }
1044 return true
1045}
1046
1047func (f *Finder) wasModified() bool {
1048 return atomic.LoadInt32(&f.modifiedFlag) > 0
1049}
1050
1051func (f *Finder) setModified() {
1052 var newVal int32
1053 newVal = 1
1054 atomic.StoreInt32(&f.modifiedFlag, newVal)
1055}
1056
1057// sortedDirEntries exports directory entries to facilitate dumping them to the external cache
1058func (f *Finder) sortedDirEntries() []dirFullInfo {
1059 startTime := time.Now()
1060 nodes := make([]dirFullInfo, 0)
1061 for _, node := range f.nodes.DumpAll() {
1062 if node.ModTime != 0 {
1063 nodes = append(nodes, node)
1064 }
1065 }
1066 discoveryDate := time.Now()
1067 f.verbosef("Generated %v cache entries in %v\n", len(nodes), discoveryDate.Sub(startTime))
1068 less := func(i int, j int) bool {
1069 return nodes[i].Path < nodes[j].Path
1070 }
1071 sort.Slice(nodes, less)
1072 sortDate := time.Now()
1073 f.verbosef("Sorted %v cache entries in %v\n", len(nodes), sortDate.Sub(discoveryDate))
1074
1075 return nodes
1076}
1077
// serializeDb converts the cache database into a form to save to disk.
// The output consists of a header (version, lineSeparator, config dump)
// followed by one serialized block per loading thread, with all pieces
// joined by lineSeparator so that the loader can split the file back into
// independently-parseable chunks.
func (f *Finder) serializeDb() ([]byte, error) {
	// sort dir entries
	var entryList = f.sortedDirEntries()

	// Generate an output file that can be conveniently loaded using the same number of threads
	// as were used in this execution (because presumably that will be the number of threads
	// used in the next execution too)

	// generate header
	header := []byte{}
	header = append(header, []byte(f.cacheMetadata.Version)...)
	header = append(header, lineSeparator)
	configDump, err := f.cacheMetadata.Config.Dump()
	if err != nil {
		return nil, err
	}
	header = append(header, configDump...)

	// serialize individual blocks in parallel; never more blocks than entries
	numBlocks := f.numDbLoadingThreads
	if numBlocks > len(entryList) {
		numBlocks = len(entryList)
	}
	// blocks[0] is the header; blocks[1..numBlocks] are filled by the goroutines below
	blocks := make([][]byte, 1+numBlocks)
	blocks[0] = header
	blockMin := 0
	wg := sync.WaitGroup{}
	// errLock guards writes to the shared err variable from the worker
	// goroutines; if several workers fail, the last writer wins, which is
	// acceptable since any single error aborts the dump.
	var errLock sync.Mutex

	for i := 1; i <= numBlocks; i++ {
		// identify next block: split entryList into numBlocks contiguous,
		// approximately equal slices
		blockMax := len(entryList) * i / numBlocks
		block := entryList[blockMin:blockMax]

		// process block; each goroutine writes only to its own blocks[index],
		// so no lock is needed for the blocks slice itself
		wg.Add(1)
		go func(index int, block []dirFullInfo) {
			byteBlock, subErr := f.serializeCacheEntry(block)
			f.verbosef("Serialized block %v into %v bytes\n", index, len(byteBlock))
			if subErr != nil {
				f.verbosef("%v\n", subErr.Error())
				errLock.Lock()
				err = subErr
				errLock.Unlock()
			} else {
				blocks[index] = byteBlock
			}
			wg.Done()
		}(i, block)

		blockMin = blockMax
	}

	// wait for all workers before reading err (which they may have written)
	wg.Wait()

	if err != nil {
		return nil, err
	}

	content := bytes.Join(blocks, []byte{lineSeparator})

	return content, nil
}
1142
1143// dumpDb saves the cache database to disk
1144func (f *Finder) dumpDb() error {
1145 startTime := time.Now()
1146 f.verbosef("Dumping db\n")
1147
1148 tempPath := f.DbPath + ".tmp"
1149
1150 bytes, err := f.serializeDb()
1151 if err != nil {
1152 return err
1153 }
1154 serializeDate := time.Now()
1155 f.verbosef("Serialized db in %v\n", serializeDate.Sub(startTime))
1156 // dump file and atomically move
1157 err = f.filesystem.WriteFile(tempPath, bytes, 0777)
1158 if err != nil {
1159 return err
1160 }
1161 err = f.filesystem.Rename(tempPath, f.DbPath)
1162 if err != nil {
1163 return err
1164 }
1165
1166 f.verbosef("Wrote db in %v\n", time.Now().Sub(serializeDate))
1167 return nil
Jeff Gastonb629e182017-08-14 16:49:18 -07001168
1169}
1170
1171// canIgnoreFsErr checks for certain classes of filesystem errors that are safe to ignore
1172func (f *Finder) canIgnoreFsErr(err error) bool {
1173 pathErr, isPathErr := err.(*os.PathError)
1174 if !isPathErr {
1175 // Don't recognize this error
1176 return false
1177 }
1178 if pathErr.Err == os.ErrPermission {
1179 // Permission errors are ignored:
1180 // https://issuetracker.google.com/37553659
1181 // https://github.com/google/kati/pull/116
1182 return true
1183 }
1184 if pathErr.Err == os.ErrNotExist {
1185 // If a directory doesn't exist, that generally means the cache is out-of-date
1186 return true
1187 }
1188 // Don't recognize this error
1189 return false
1190}
1191
1192// onFsError should be called whenever a potentially fatal error is returned from a filesystem call
1193func (f *Finder) onFsError(path string, err error) {
1194 if !f.canIgnoreFsErr(err) {
1195 // We could send the errors through a channel instead, although that would cause this call
1196 // to block unless we preallocated a sufficient buffer or spawned a reader thread.
1197 // Although it wouldn't be too complicated to spawn a reader thread, it's still slightly
1198 // more convenient to use a lock. Only in an unusual situation should this code be
1199 // invoked anyway.
1200 f.errlock.Lock()
1201 f.fsErrs = append(f.fsErrs, fsErr{path: path, err: err})
1202 f.errlock.Unlock()
1203 }
1204}
1205
1206// discardErrsForPrunedPaths removes any errors for paths that are no longer included in the cache
1207func (f *Finder) discardErrsForPrunedPaths() {
1208 // This function could be somewhat inefficient due to being single-threaded,
1209 // but the length of f.fsErrs should be approximately 0, so it shouldn't take long anyway.
1210 relevantErrs := make([]fsErr, 0, len(f.fsErrs))
1211 for _, fsErr := range f.fsErrs {
1212 path := fsErr.path
1213 node := f.nodes.GetNode(path, false)
1214 if node != nil {
1215 // The path in question wasn't pruned due to a failure to process a parent directory.
1216 // So, the failure to process this path is important
1217 relevantErrs = append(relevantErrs, fsErr)
1218 }
1219 }
1220 f.fsErrs = relevantErrs
1221}
1222
1223// getErr returns an error based on previous calls to onFsErr, if any
1224func (f *Finder) getErr() error {
1225 f.discardErrsForPrunedPaths()
1226
1227 numErrs := len(f.fsErrs)
1228 if numErrs < 1 {
1229 return nil
1230 }
1231
1232 maxNumErrsToInclude := 10
1233 message := ""
1234 if numErrs > maxNumErrsToInclude {
1235 message = fmt.Sprintf("finder encountered %v errors: %v...", numErrs, f.fsErrs[:maxNumErrsToInclude])
1236 } else {
1237 message = fmt.Sprintf("finder encountered %v errors: %v", numErrs, f.fsErrs)
1238 }
1239
1240 return errors.New(message)
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001241}
1242
1243func (f *Finder) statDirAsync(dir *pathMap) {
1244 node := dir
1245 path := dir.path
1246 f.threadPool.Run(
1247 func() {
1248 updatedStats := f.statDirSync(path)
1249
1250 if !f.isInfoUpToDate(node.statResponse, updatedStats) {
1251 node.mapNode = mapNode{
1252 statResponse: updatedStats,
1253 FileNames: []string{},
1254 }
1255 f.setModified()
1256 if node.statResponse.ModTime != 0 {
1257 // modification time was updated, so re-scan for
1258 // child directories
1259 f.listDirAsync(dir)
1260 }
1261 }
1262 },
1263 )
1264}
1265
1266func (f *Finder) statDirSync(path string) statResponse {
1267
1268 fileInfo, err := f.filesystem.Lstat(path)
1269
1270 var stats statResponse
1271 if err != nil {
Jeff Gastonb629e182017-08-14 16:49:18 -07001272 // possibly record this error
1273 f.onFsError(path, err)
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001274 // in case of a failure to stat the directory, treat the directory as missing (modTime = 0)
1275 return stats
1276 }
1277 modTime := fileInfo.ModTime()
1278 stats = statResponse{}
1279 inode, err := f.filesystem.InodeNumber(fileInfo)
1280 if err != nil {
1281 panic(fmt.Sprintf("Could not get inode number of %v: %v\n", path, err.Error()))
1282 }
1283 stats.Inode = inode
1284 device, err := f.filesystem.DeviceNumber(fileInfo)
1285 if err != nil {
1286 panic(fmt.Sprintf("Could not get device number of %v: %v\n", path, err.Error()))
1287 }
1288 stats.Device = device
1289 permissionsChangeTime, err := f.filesystem.PermTime(fileInfo)
1290
1291 if err != nil {
1292 panic(fmt.Sprintf("Could not get permissions modification time (CTime) of %v: %v\n", path, err.Error()))
1293 }
1294 // We're only interested in knowing whether anything about the directory
1295 // has changed since last check, so we use the latest of the two
1296 // modification times (content modification (mtime) and
1297 // permission modification (ctime))
1298 if permissionsChangeTime.After(modTime) {
1299 modTime = permissionsChangeTime
1300 }
1301 stats.ModTime = modTime.UnixNano()
1302
1303 return stats
1304}
1305
1306// pruneCacheCandidates removes the items that we don't want to include in our persistent cache
1307func (f *Finder) pruneCacheCandidates(items *DirEntries) {
1308
1309 for _, fileName := range items.FileNames {
1310 for _, abortedName := range f.cacheMetadata.Config.PruneFiles {
1311 if fileName == abortedName {
1312 items.FileNames = []string{}
1313 items.DirNames = []string{}
1314 return
1315 }
1316 }
1317 }
1318
1319 // remove any files that aren't the ones we want to include
1320 writeIndex := 0
1321 for _, fileName := range items.FileNames {
1322 // include only these files
1323 for _, includedName := range f.cacheMetadata.Config.IncludeFiles {
1324 if fileName == includedName {
1325 items.FileNames[writeIndex] = fileName
1326 writeIndex++
1327 break
1328 }
1329 }
1330 }
1331 // resize
1332 items.FileNames = items.FileNames[:writeIndex]
1333
1334 writeIndex = 0
1335 for _, dirName := range items.DirNames {
1336 items.DirNames[writeIndex] = dirName
1337 // ignore other dirs that are known to not be inputs to the build process
1338 include := true
1339 for _, excludedName := range f.cacheMetadata.Config.ExcludeDirs {
1340 if dirName == excludedName {
1341 // don't include
1342 include = false
1343 break
1344 }
1345 }
1346 if include {
1347 writeIndex++
1348 }
1349 }
1350 // resize
1351 items.DirNames = items.DirNames[:writeIndex]
1352}
1353
1354func (f *Finder) listDirsAsync(nodes []*pathMap) {
1355 f.threadPool.Run(
1356 func() {
1357 for i := range nodes {
1358 f.listDirSync(nodes[i])
1359 }
1360 },
1361 )
1362}
1363
1364func (f *Finder) listDirAsync(node *pathMap) {
1365 f.threadPool.Run(
1366 func() {
1367 f.listDirSync(node)
1368 },
1369 )
1370}
1371
// listDirSync reads the contents of <dir> from the filesystem, prunes the
// results according to the cache config, updates <dir>'s cached file list and
// children, and queues an async Stat for any newly-discovered subdirectory.
func (f *Finder) listDirSync(dir *pathMap) {
	path := dir.path
	children, err := f.filesystem.ReadDir(path)

	if err != nil {
		// possibly record this error
		f.onFsError(path, err)
		// if listing the contents of the directory fails (presumably due to
		// permission denied), then treat the directory as empty
		children = []os.FileInfo{}
	}

	var subdirs []string
	var subfiles []string

	// partition the children into subdirectories and files
	for _, child := range children {
		linkBits := child.Mode() & os.ModeSymlink
		isLink := linkBits != 0
		// NOTE(review): if ReadDir returns Lstat-style entries, a symlink
		// reports IsDir() == false and the isLink check below can never
		// trigger inside this branch — confirm f.filesystem.ReadDir semantics.
		if child.IsDir() {
			if !isLink {
				// Skip symlink dirs.
				// We don't have to support symlink dirs because
				// that would cause duplicates.
				subdirs = append(subdirs, child.Name())
			}
		} else {
			// We do have to support symlink files because the link name might be
			// different than the target name
			// (for example, Android.bp -> build/soong/root.bp)
			subfiles = append(subfiles, child.Name())
		}

	}
	parentNode := dir

	// discard any files/dirs the cache config says to ignore
	entry := &DirEntries{Path: path, DirNames: subdirs, FileNames: subfiles}
	f.pruneCacheCandidates(entry)

	// create a pathMap node for each relevant subdirectory
	relevantChildren := map[string]*pathMap{}
	for _, subdirName := range entry.DirNames {
		childNode, found := parentNode.children[subdirName]
		// if we already knew of this directory, then we already have a request pending to Stat it
		// if we didn't already know of this directory, then we must Stat it now
		if !found {
			childNode = parentNode.newChild(subdirName)
			f.statDirAsync(childNode)
		}
		relevantChildren[subdirName] = childNode
	}
	// Note that in rare cases, it's possible that we're reducing the set of
	// children via this statement, if these are all true:
	// 1. we previously had a cache that knew about subdirectories of parentNode
	// 2. the user created a prune-file (described in pruneCacheCandidates)
	//    inside <parentNode>, which specifies that the contents of parentNode
	//    are to be ignored.
	// The fact that it's possible to remove children here means that *pathMap structs
	// must not be looked up from f.nodes by filepath (and instead must be accessed by
	// direct pointer) until after every listDirSync completes
	parentNode.FileNames = entry.FileNames
	parentNode.children = relevantChildren

}
1435
1436// listMatches takes a node and a function that specifies which subdirectories and
1437// files to include, and listMatches returns the matches
1438func (f *Finder) listMatches(node *pathMap,
1439 filter WalkFunc) (subDirs []*pathMap, filePaths []string) {
1440 entries := DirEntries{
1441 FileNames: node.FileNames,
1442 }
1443 entries.DirNames = make([]string, 0, len(node.children))
1444 for childName := range node.children {
1445 entries.DirNames = append(entries.DirNames, childName)
1446 }
1447
1448 dirNames, fileNames := filter(entries)
1449
1450 subDirs = []*pathMap{}
1451 filePaths = make([]string, 0, len(fileNames))
1452 for _, fileName := range fileNames {
1453 filePaths = append(filePaths, joinCleanPaths(node.path, fileName))
1454 }
1455 subDirs = make([]*pathMap, 0, len(dirNames))
1456 for _, childName := range dirNames {
1457 child, ok := node.children[childName]
1458 if ok {
1459 subDirs = append(subDirs, child)
1460 }
1461 }
1462
1463 return subDirs, filePaths
1464}
1465
1466// findInCacheMultithreaded spawns potentially multiple goroutines with which to search the cache.
1467func (f *Finder) findInCacheMultithreaded(node *pathMap, filter WalkFunc,
1468 approxNumThreads int) []string {
1469
1470 if approxNumThreads < 2 {
1471 // Done spawning threads; process remaining directories
1472 return f.findInCacheSinglethreaded(node, filter)
1473 }
1474
1475 totalWork := 0
1476 for _, child := range node.children {
1477 totalWork += child.approximateNumDescendents
1478 }
1479 childrenResults := make(chan []string, len(node.children))
1480
1481 subDirs, filePaths := f.listMatches(node, filter)
1482
1483 // process child directories
1484 for _, child := range subDirs {
1485 numChildThreads := approxNumThreads * child.approximateNumDescendents / totalWork
1486 childProcessor := func(child *pathMap) {
1487 childResults := f.findInCacheMultithreaded(child, filter, numChildThreads)
1488 childrenResults <- childResults
1489 }
1490 // If we're allowed to use more than 1 thread to process this directory,
1491 // then instead we use 1 thread for each subdirectory.
1492 // It would be strange to spawn threads for only some subdirectories.
1493 go childProcessor(child)
1494 }
1495
1496 // collect results
1497 for i := 0; i < len(subDirs); i++ {
1498 childResults := <-childrenResults
1499 filePaths = append(filePaths, childResults...)
1500 }
1501 close(childrenResults)
1502
1503 return filePaths
1504}
1505
1506// findInCacheSinglethreaded synchronously searches the cache for all matching file paths
1507// note findInCacheSinglethreaded runs 2X to 4X as fast by being iterative rather than recursive
1508func (f *Finder) findInCacheSinglethreaded(node *pathMap, filter WalkFunc) []string {
1509 if node == nil {
1510 return []string{}
1511 }
1512
1513 nodes := []*pathMap{node}
1514 matches := []string{}
1515
1516 for len(nodes) > 0 {
1517 currentNode := nodes[0]
1518 nodes = nodes[1:]
1519
1520 subDirs, filePaths := f.listMatches(currentNode, filter)
1521
1522 nodes = append(nodes, subDirs...)
1523
1524 matches = append(matches, filePaths...)
1525 }
1526 return matches
1527}