blob: c2df928925298035e65ffc78fceaae930ad0323e [file] [log] [blame]
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001// Copyright 2017 Google Inc. All rights reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package finder
16
import (
	"bufio"
	"bytes"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"runtime"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"android/soong/fs"
)
35
36// This file provides a Finder struct that can quickly search for files satisfying
37// certain criteria.
38// This Finder gets its speed partially from parallelism and partially from caching.
39// If a Stat call returns the same result as last time, then it means Finder
40// can skip the ReadDir call for that dir.
41
42// The primary data structure used by the finder is the field Finder.nodes ,
43// which is a tree of nodes of type *pathMap .
44// Each node represents a directory on disk, along with its stats, subdirectories,
45// and contained files.
46
47// The common use case for the Finder is that the caller creates a Finder and gives
48// it the same query that was given to it in the previous execution.
49// In this situation, the major events that take place are:
50// 1. The Finder begins to load its db
51// 2. The Finder begins to stat the directories mentioned in its db (using multiple threads)
52// Calling Stat on each of these directories is generally a large fraction of the total time
53// 3. The Finder begins to construct a separate tree of nodes in each of its threads
54// 4. The Finder merges the individual node trees into the main node tree
55// 5. The Finder may call ReadDir a few times if there are a few directories that are out-of-date
56// These ReadDir calls might prompt additional Stat calls, etc
57// 6. The Finder waits for all loading to complete
58// 7. The Finder searches the cache for files matching the user's query (using multiple threads)
59
60// These are the invariants regarding concurrency:
61// 1. The public methods of Finder are threadsafe.
62// The public methods are only performance-optimized for one caller at a time, however.
63// For the moment, multiple concurrent callers shouldn't expect any better performance than
64// multiple serial callers.
65// 2. While building the node tree, only one thread may ever access the <children> collection of a
66// *pathMap at once.
67// a) The thread that accesses the <children> collection is the thread that discovers the
68// children (by reading them from the cache or by having received a response to ReadDir).
69// 1) Consequently, the thread that discovers the children also spawns requests to stat
70// subdirs.
71// b) Consequently, while building the node tree, no thread may do a lookup of its
72// *pathMap via filepath because another thread may be adding children to the
73// <children> collection of an ancestor node. Additionally, in rare cases, another thread
74// may be removing children from an ancestor node if the children were only discovered to
75// be irrelevant after calling ReadDir (which happens if a prune-file was just added).
76// 3. No query will begin to be serviced until all loading (both reading the db
77// and scanning the filesystem) is complete.
78// Tests indicate that it only takes about 10% as long to search the in-memory cache as to
79// generate it, making this not a huge loss in performance.
80// 4. The parsing of the db and the initial setup of the pathMap tree must complete before
81// beginning to call listDirSync (because listDirSync can create new entries in the pathMap)
82
83// see cmd/finder.go or finder_test.go for usage examples
84
// versionString is written as the first line of the cache db.
// Update versionString whenever making a backwards-incompatible change to the cache file format.
const versionString = "Android finder version 1"
87
// a CacheParams specifies which files and directories the user wishes be scanned and
// potentially added to the cache
type CacheParams struct {
	// WorkingDirectory is used as a base for any relative file paths given to the Finder
	WorkingDirectory string

	// RootDirs are the root directories used to initiate the search
	RootDirs []string

	// ExcludeDirs are directory names that if encountered are removed from the search
	ExcludeDirs []string

	// PruneFiles are file names that if encountered prune their entire directory
	// (including siblings)
	PruneFiles []string

	// IncludeFiles are file names to include as matches
	IncludeFiles []string
}
107
// a cacheConfig stores the inputs that determine what should be included in the cache.
// It is serialized into the cache db header (see Dump) so that a stale or foreign
// cache can be detected and discarded on load.
type cacheConfig struct {
	CacheParams

	// FilesystemView is a unique identifier telling which parts of which file systems
	// are readable by the Finder. In practice its value is essentially username@hostname.
	// FilesystemView is set to ensure that a cache file copied to another host or
	// found by another user doesn't inadvertently get reused.
	FilesystemView string
}
118
119func (p *cacheConfig) Dump() ([]byte, error) {
120 bytes, err := json.Marshal(p)
121 return bytes, err
122}
123
// a cacheMetadata stores version information about the cache
type cacheMetadata struct {
	// The Version enables the Finder to determine whether it can even parse the file
	// If the version changes, the entire cache file must be regenerated
	Version string

	// The CacheParams enables the Finder to determine whether the parameters match
	// If the CacheParams change, the Finder can choose how much of the cache file to reuse
	// (although in practice, the Finder will probably choose to ignore the entire file anyway)
	Config cacheConfig
}

// Logger is the minimal logging interface the Finder needs; log.Logger satisfies it.
type Logger interface {
	Output(calldepth int, s string) error
}
139
// the Finder is the main struct that callers will want to use
type Finder struct {
	// configuration
	DbPath              string        // location of the on-disk cache db
	numDbLoadingThreads int           // parallelism for loading/validating the db
	numSearchingThreads int           // parallelism for servicing queries
	cacheMetadata       cacheMetadata // version + params that the db header must match
	logger              Logger
	filesystem          fs.FileSystem

	// temporary state (only non-nil/active while loading or dumping)
	threadPool        *threadPool
	mutex             sync.Mutex // guards searches; see lock()/FindMatching
	fsErrs            []fsErr
	errlock           sync.Mutex // guards fsErrs
	shutdownWaitgroup sync.WaitGroup

	// non-temporary state
	modifiedFlag int32 // accessed atomically; nonzero if cache differs from the db on disk
	nodes        pathMap
}
161
162// New creates a new Finder for use
163func New(cacheParams CacheParams, filesystem fs.FileSystem,
Jeff Gastonb629e182017-08-14 16:49:18 -0700164 logger Logger, dbPath string) (f *Finder, err error) {
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700165
166 numThreads := runtime.NumCPU() * 2
167 numDbLoadingThreads := numThreads
168 numSearchingThreads := numThreads
169
170 metadata := cacheMetadata{
171 Version: versionString,
172 Config: cacheConfig{
173 CacheParams: cacheParams,
174 FilesystemView: filesystem.ViewId(),
175 },
176 }
177
Jeff Gastonb629e182017-08-14 16:49:18 -0700178 f = &Finder{
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700179 numDbLoadingThreads: numDbLoadingThreads,
180 numSearchingThreads: numSearchingThreads,
181 cacheMetadata: metadata,
182 logger: logger,
183 filesystem: filesystem,
184
185 nodes: *newPathMap("/"),
186 DbPath: dbPath,
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700187
188 shutdownWaitgroup: sync.WaitGroup{},
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700189 }
190
Jeff Gastonb629e182017-08-14 16:49:18 -0700191 f.loadFromFilesystem()
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700192
Jeff Gastonb629e182017-08-14 16:49:18 -0700193 // check for any filesystem errors
194 err = f.getErr()
195 if err != nil {
196 return nil, err
197 }
198
199 // confirm that every path mentioned in the CacheConfig exists
200 for _, path := range cacheParams.RootDirs {
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700201 if !filepath.IsAbs(path) {
202 path = filepath.Join(f.cacheMetadata.Config.WorkingDirectory, path)
203 }
Jeff Gastonb629e182017-08-14 16:49:18 -0700204 node := f.nodes.GetNode(filepath.Clean(path), false)
205 if node == nil || node.ModTime == 0 {
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700206 return nil, fmt.Errorf("path %v was specified to be included in the cache but does not exist\n", path)
Jeff Gastonb629e182017-08-14 16:49:18 -0700207 }
208 }
209
210 return f, nil
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700211}
212
// FindAll returns every file in the cache
func (f *Finder) FindAll() []string {
	return f.FindAt("/")
}
217
// FindAt returns every cached file under <rootDir>
func (f *Finder) FindAt(rootDir string) []string {
	// keep all dirs (to recurse everywhere) and all files (every file matches)
	filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
		return entries.DirNames, entries.FileNames
	}
	return f.FindMatching(rootDir, filter)
}
225
// FindNamed searches for every cached file named <fileName>
func (f *Finder) FindNamed(fileName string) []string {
	return f.FindNamedAt("/", fileName)
}
230
231// FindNamedAt searches under <rootPath> for every file named <fileName>
232// The reason a caller might use FindNamedAt instead of FindNamed is if they want
233// to limit their search to a subset of the cache
234func (f *Finder) FindNamedAt(rootPath string, fileName string) []string {
235 filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
236 matches := []string{}
237 for _, foundName := range entries.FileNames {
238 if foundName == fileName {
239 matches = append(matches, foundName)
240 }
241 }
242 return entries.DirNames, matches
243
244 }
245 return f.FindMatching(rootPath, filter)
246}
247
// FindFirstNamed searches for every file named <fileName>
// Whenever it finds a match, it stops searching that match's subdirectories
func (f *Finder) FindFirstNamed(fileName string) []string {
	return f.FindFirstNamedAt("/", fileName)
}
253
254// FindFirstNamedAt searches for every file named <fileName>
255// Whenever it finds a match, it stops search subdirectories
256func (f *Finder) FindFirstNamedAt(rootPath string, fileName string) []string {
257 filter := func(entries DirEntries) (dirNames []string, fileNames []string) {
258 matches := []string{}
259 for _, foundName := range entries.FileNames {
260 if foundName == fileName {
261 matches = append(matches, foundName)
262 }
263 }
264
265 if len(matches) > 0 {
266 return []string{}, matches
267 }
268 return entries.DirNames, matches
269 }
270 return f.FindMatching(rootPath, filter)
271}
272
// FindMatching is the most general exported function for searching for files in the cache
// The WalkFunc will be invoked repeatedly and is expected to modify the provided DirEntries
// in place, removing file paths and directories as desired.
// WalkFunc will be invoked potentially many times in parallel, and must be threadsafe.
func (f *Finder) FindMatching(rootPath string, filter WalkFunc) []string {
	// set up some parameters
	scanStart := time.Now()
	var isRel bool
	workingDir := f.cacheMetadata.Config.WorkingDirectory

	// a relative rootPath is interpreted relative to the configured working directory
	isRel = !filepath.IsAbs(rootPath)
	if isRel {
		rootPath = filepath.Join(workingDir, rootPath)
	}

	rootPath = filepath.Clean(rootPath)

	// ensure nothing else is using the Finder
	f.verbosef("FindMatching waiting for finder to be idle\n")
	f.lock()
	defer f.unlock()

	node := f.nodes.GetNode(rootPath, false)
	if node == nil {
		f.verbosef("No data for path %v ; apparently not included in cache params: %v\n",
			rootPath, f.cacheMetadata.Config.CacheParams)
		// path is not found; don't do a search
		return []string{}
	}

	// search for matching files
	f.verbosef("Finder finding %v using cache\n", rootPath)
	results := f.findInCacheMultithreaded(node, filter, f.numSearchingThreads)

	// format and return results
	if isRel {
		// convert the absolute results back to the caller's relative form
		for i := 0; i < len(results); i++ {
			results[i] = strings.Replace(results[i], workingDir+"/", "", 1)
		}
	}
	sort.Strings(results)
	f.verbosef("Found %v files under %v in %v using cache\n",
		len(results), rootPath, time.Since(scanStart))
	return results
}
318
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700319// Shutdown declares that the finder is no longer needed and waits for its cleanup to complete
320// Currently, that only entails waiting for the database dump to complete.
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700321func (f *Finder) Shutdown() {
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700322 f.waitForDbDump()
323}
324
325// End of public api
326
327func (f *Finder) goDumpDb() {
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700328 if f.wasModified() {
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700329 f.shutdownWaitgroup.Add(1)
330 go func() {
331 err := f.dumpDb()
332 if err != nil {
333 f.verbosef("%v\n", err)
334 }
335 f.shutdownWaitgroup.Done()
336 }()
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700337 } else {
338 f.verbosef("Skipping dumping unmodified db\n")
339 }
340}
341
Jeff Gastonb64fc1c2017-08-04 12:30:12 -0700342func (f *Finder) waitForDbDump() {
343 f.shutdownWaitgroup.Wait()
344}
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700345
// joinCleanPaths is like filepath.Join but is faster because
// joinCleanPaths doesn't have to support paths ending in "/" or containing "..".
// An empty base yields the leaf unchanged (and vice versa), and a base of "/"
// is concatenated without inserting an extra separator.
func joinCleanPaths(base string, leaf string) string {
	switch {
	case base == "":
		return leaf
	case base == "/":
		return base + leaf
	case leaf == "":
		return base
	default:
		return base + "/" + leaf
	}
}
360
// verbosef formats a message and sends it to the logger.
// The calldepth of 2 attributes the log line to verbosef's caller.
func (f *Finder) verbosef(format string, args ...interface{}) {
	f.logger.Output(2, fmt.Sprintf(format, args...))
}
364
// loadFromFilesystem populates the in-memory cache based on the contents of the filesystem
func (f *Finder) loadFromFilesystem() {
	f.threadPool = newThreadPool(f.numDbLoadingThreads)

	// prefer loading from the existing cache db; fall back to a full scan
	err := f.startFromExternalCache()
	if err != nil {
		f.startWithoutExternalCache()
	}

	// write the (possibly updated) db back to disk in the background
	f.goDumpDb()

	f.threadPool = nil
}
378
// startFind resolves <path> against the working directory, creates a node
// for it, and asynchronously begins statting it.
func (f *Finder) startFind(path string) {
	if !filepath.IsAbs(path) {
		path = filepath.Join(f.cacheMetadata.Config.WorkingDirectory, path)
	}
	node := f.nodes.GetNode(path, true)
	f.statDirAsync(node)
}
386
// lock acquires the Finder's main mutex, which serializes searches (see FindMatching).
func (f *Finder) lock() {
	f.mutex.Lock()
}

// unlock releases the Finder's main mutex.
func (f *Finder) unlock() {
	f.mutex.Unlock()
}
394
// a statResponse is the relevant portion of the response from the filesystem to a Stat call
type statResponse struct {
	ModTime int64 // a ModTime of 0 is used throughout this file to mean "never successfully statted"
	Inode   uint64
	Device  uint64
}

// a pathAndStats stores a path and its stats
type pathAndStats struct {
	statResponse

	Path string
}

// a dirFullInfo stores all of the relevant information we know about a directory
type dirFullInfo struct {
	pathAndStats

	FileNames []string
}
415
// a PersistedDirInfo is the information about a dir that we save to our cache on disk
type PersistedDirInfo struct {
	// These field names are short because they are repeated many times in the output json file
	P string   // path (relative to the enclosing PersistedDirs.Root)
	T int64    // modification time
	I uint64   // inode number
	F []string // relevant filenames contained
}

// a PersistedDirs is the information that we persist for a group of dirs
type PersistedDirs struct {
	// the device on which each directory is stored
	Device uint64
	// the common root path to which all contained dirs are relative
	Root string
	// the directories themselves
	Dirs []PersistedDirInfo
}

// a CacheEntry is the smallest unit that can be read and parsed from the cache (on disk) at a time
type CacheEntry []PersistedDirs
437
// a DirEntries lists the files and directories contained directly within a specific directory
type DirEntries struct {
	Path string

	// elements of DirNames are just the dir names; they don't include any '/' character
	DirNames []string
	// elements of FileNames are just the file names; they don't include '/' character
	FileNames []string
}

// a WalkFunc is the type that is passed into various Find functions for determining which
// directories the caller wishes be walked. The WalkFunc is expected to decide which
// directories to walk and which files to consider as matches to the original query.
type WalkFunc func(DirEntries) (dirs []string, files []string)

// a mapNode stores the relevant stats about a directory to be stored in a pathMap
type mapNode struct {
	statResponse
	FileNames []string
}
458
// a pathMap implements the directory tree structure of nodes
type pathMap struct {
	mapNode

	// absolute path of the directory this node represents
	path string

	// children maps a single path component (no '/') to its subtree
	children map[string]*pathMap

	// number of descendent nodes, including self
	approximateNumDescendents int
}
470
471func newPathMap(path string) *pathMap {
472 result := &pathMap{path: path, children: make(map[string]*pathMap, 4),
473 approximateNumDescendents: 1}
474 return result
475}
476
// GetNode returns the node at <path>, or nil if it does not exist and
// createIfNotFound is false. When createIfNotFound is true, all missing
// intermediate nodes are created along the way.
func (m *pathMap) GetNode(path string, createIfNotFound bool) *pathMap {
	// interpret the path as relative to this node by stripping a leading "/"
	if len(path) > 0 && path[0] == '/' {
		path = path[1:]
	}

	node := m
	for {
		if path == "" {
			return node
		}

		// split off the first path component
		index := strings.Index(path, "/")
		var firstComponent string
		if index >= 0 {
			firstComponent = path[:index]
			path = path[index+1:]
		} else {
			firstComponent = path
			path = ""
		}

		child, found := node.children[firstComponent]

		if !found {
			if createIfNotFound {
				child = node.newChild(firstComponent)
			} else {
				return nil
			}
		}

		node = child
	}
}
512
513func (m *pathMap) newChild(name string) (child *pathMap) {
514 path := joinCleanPaths(m.path, name)
515 newChild := newPathMap(path)
516 m.children[name] = newChild
517
518 return m.children[name]
519}
520
521func (m *pathMap) UpdateNumDescendents() int {
522 count := 1
523 for _, child := range m.children {
524 count += child.approximateNumDescendents
525 }
526 m.approximateNumDescendents = count
527 return count
528}
529
// UpdateNumDescendentsRecursive recomputes descendent counts for this entire
// subtree, children first so each parent sums up-to-date values.
func (m *pathMap) UpdateNumDescendentsRecursive() {
	for _, child := range m.children {
		child.UpdateNumDescendentsRecursive()
	}
	m.UpdateNumDescendents()
}
536
// MergeIn recursively merges <other> into <m>.
// Children present only in <other> are adopted by reference (not copied).
func (m *pathMap) MergeIn(other *pathMap) {
	for key, theirs := range other.children {
		ours, found := m.children[key]
		if found {
			ours.MergeIn(theirs)
		} else {
			m.children[key] = theirs
		}
	}
	// a zero ModTime means <other> never statted this dir, so keep our own stats
	if other.ModTime != 0 {
		m.mapNode = other.mapNode
	}
	m.UpdateNumDescendents()
}
551
// DumpAll flattens the entire subtree into a list of dirFullInfo, one per node.
func (m *pathMap) DumpAll() []dirFullInfo {
	results := []dirFullInfo{}
	m.dumpInto("", &results)
	return results
}

// dumpInto appends this node (recorded at <path>) and all of its descendents
// to <results>.
func (m *pathMap) dumpInto(path string, results *[]dirFullInfo) {
	*results = append(*results,
		dirFullInfo{
			pathAndStats{statResponse: m.statResponse, Path: path},
			m.FileNames},
	)
	for key, child := range m.children {
		childPath := joinCleanPaths(path, key)
		// ensure every recorded path is absolute
		if len(childPath) == 0 || childPath[0] != '/' {
			childPath = "/" + childPath
		}
		child.dumpInto(childPath, results)
	}
}
572
// a semaphore can be locked by up to <capacity> callers at once
type semaphore struct {
	pool chan bool
}

// newSemaphore returns a semaphore permitting up to <capacity> concurrent holders.
func newSemaphore(capacity int) *semaphore {
	return &semaphore{pool: make(chan bool, capacity)}
}

// Lock acquires a slot, blocking while <capacity> other callers already hold one.
func (s *semaphore) Lock() {
	s.pool <- true
}

// Unlock releases a previously acquired slot.
func (s *semaphore) Unlock() {
	<-s.pool
}
589
// A threadPool runs goroutines and supports throttling and waiting.
// Without throttling, Go may exhaust the maximum number of various resources, such as
// threads or file descriptors, and crash the program.
type threadPool struct {
	// receivedRequests tracks every Run call until its function completes (see Wait)
	receivedRequests sync.WaitGroup
	// activeRequests limits how many submitted functions execute concurrently
	activeRequests semaphore
}
597
598func newThreadPool(maxNumConcurrentThreads int) *threadPool {
599 return &threadPool{
600 receivedRequests: sync.WaitGroup{},
601 activeRequests: *newSemaphore(maxNumConcurrentThreads),
602 }
603}
604
// Run requests to run the given function in its own goroutine
func (p *threadPool) Run(function func()) {
	p.receivedRequests.Add(1)
	// If Run() was called from within a goroutine spawned by this threadPool,
	// then we may need to return from Run() before having capacity to actually
	// run <function>.
	//
	// It's possible that the body of <function> contains a statement (such as a syscall)
	// that will cause Go to pin it to a thread, or will contain a statement that uses
	// another resource that is in short supply (such as a file descriptor), so we can't
	// actually run <function> until we have capacity.
	//
	// However, the semaphore used for synchronization is implemented via a channel and
	// shouldn't require a new thread for each access.
	go func() {
		// block until below the concurrency limit, then run
		p.activeRequests.Lock()
		function()
		p.activeRequests.Unlock()
		p.receivedRequests.Done()
	}()
}
626
// Wait waits until all goroutines are done, just like sync.WaitGroup's Wait
func (p *threadPool) Wait() {
	p.receivedRequests.Wait()
}
631
Jeff Gastonb629e182017-08-14 16:49:18 -0700632type fsErr struct {
633 path string
634 err error
635}
636
637func (e fsErr) String() string {
638 return e.path + ": " + e.err.Error()
639}
640
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700641func (f *Finder) serializeCacheEntry(dirInfos []dirFullInfo) ([]byte, error) {
642 // group each dirFullInfo by its Device, to avoid having to repeat it in the output
643 dirsByDevice := map[uint64][]PersistedDirInfo{}
644 for _, entry := range dirInfos {
645 _, found := dirsByDevice[entry.Device]
646 if !found {
647 dirsByDevice[entry.Device] = []PersistedDirInfo{}
648 }
649 dirsByDevice[entry.Device] = append(dirsByDevice[entry.Device],
650 PersistedDirInfo{P: entry.Path, T: entry.ModTime, I: entry.Inode, F: entry.FileNames})
651 }
652
653 cacheEntry := CacheEntry{}
654
655 for device, infos := range dirsByDevice {
656 // find common prefix
657 prefix := ""
658 if len(infos) > 0 {
659 prefix = infos[0].P
660 }
661 for _, info := range infos {
662 for !strings.HasPrefix(info.P+"/", prefix+"/") {
663 prefix = filepath.Dir(prefix)
Jeff Gaston996716a2017-08-22 13:33:19 -0700664 if prefix == "/" {
665 break
666 }
Jeff Gastonf1fd45e2017-08-09 18:25:28 -0700667 }
668 }
669 // remove common prefix
670 for i := range infos {
671 suffix := strings.Replace(infos[i].P, prefix, "", 1)
672 if len(suffix) > 0 && suffix[0] == '/' {
673 suffix = suffix[1:]
674 }
675 infos[i].P = suffix
676 }
677
678 // turn the map (keyed by device) into a list of structs with labeled fields
679 // this is to improve readability of the output
680 cacheEntry = append(cacheEntry, PersistedDirs{Device: device, Root: prefix, Dirs: infos})
681 }
682
683 // convert to json.
684 // it would save some space to use a different format than json for the db file,
685 // but the space and time savings are small, and json is easy for humans to read
686 bytes, err := json.Marshal(cacheEntry)
687 return bytes, err
688}
689
690func (f *Finder) parseCacheEntry(bytes []byte) ([]dirFullInfo, error) {
691 var cacheEntry CacheEntry
692 err := json.Unmarshal(bytes, &cacheEntry)
693 if err != nil {
694 return nil, err
695 }
696
697 // convert from a CacheEntry to a []dirFullInfo (by copying a few fields)
698 capacity := 0
699 for _, element := range cacheEntry {
700 capacity += len(element.Dirs)
701 }
702 nodes := make([]dirFullInfo, capacity)
703 count := 0
704 for _, element := range cacheEntry {
705 for _, dir := range element.Dirs {
706 path := joinCleanPaths(element.Root, dir.P)
707
708 nodes[count] = dirFullInfo{
709 pathAndStats: pathAndStats{
710 statResponse: statResponse{
711 ModTime: dir.T, Inode: dir.I, Device: element.Device,
712 },
713 Path: path},
714 FileNames: dir.F}
715 count++
716 }
717 }
718 return nodes, nil
719}
720
// We use the following separator byte to distinguish individually parseable blocks of json
// because we know this separator won't appear in the json that we're parsing.
//
// The newline byte can only appear in a UTF-8 stream if the newline character appears, because:
// - The newline character is encoded as "0000 1010" in binary ("0a" in hex)
// - UTF-8 dictates that bytes beginning with a "0" bit are never emitted as part of a multibyte
//   character.
//
// We know that the newline character will never appear in our json string, because:
// - If a newline character appears as part of a data string, then json encoding will
//   emit two characters instead: '\' and 'n'.
// - The json encoder that we use doesn't emit the optional newlines between any of its
//   other outputs.
const lineSeparator = byte('\n')

// readLine reads from <reader> up to and including the next lineSeparator byte.
func (f *Finder) readLine(reader *bufio.Reader) ([]byte, error) {
	return reader.ReadBytes(lineSeparator)
}
739
// validateCacheHeader reads the cache header from cacheReader and tells whether the cache is compatible with this Finder
// The header consists of two lines: the version string and the serialized cacheConfig;
// both must match this Finder's values exactly for the db to be usable.
func (f *Finder) validateCacheHeader(cacheReader *bufio.Reader) bool {
	cacheVersionBytes, err := f.readLine(cacheReader)
	if err != nil {
		f.verbosef("Failed to read database header; database is invalid\n")
		return false
	}
	// strip the trailing separator that readLine leaves in place
	if len(cacheVersionBytes) > 0 && cacheVersionBytes[len(cacheVersionBytes)-1] == lineSeparator {
		cacheVersionBytes = cacheVersionBytes[:len(cacheVersionBytes)-1]
	}
	cacheVersionString := string(cacheVersionBytes)
	currentVersion := f.cacheMetadata.Version
	if cacheVersionString != currentVersion {
		f.verbosef("Version changed from %q to %q, database is not applicable\n", cacheVersionString, currentVersion)
		return false
	}

	cacheParamBytes, err := f.readLine(cacheReader)
	if err != nil {
		f.verbosef("Failed to read database search params; database is invalid\n")
		return false
	}

	if len(cacheParamBytes) > 0 && cacheParamBytes[len(cacheParamBytes)-1] == lineSeparator {
		cacheParamBytes = cacheParamBytes[:len(cacheParamBytes)-1]
	}

	// compare the db's recorded params against our current config, byte for byte
	currentParamBytes, err := f.cacheMetadata.Config.Dump()
	if err != nil {
		panic("Finder failed to serialize its parameters")
	}
	cacheParamString := string(cacheParamBytes)
	currentParamString := string(currentParamBytes)
	if cacheParamString != currentParamString {
		f.verbosef("Params changed from %q to %q, database is not applicable\n", cacheParamString, currentParamString)
		return false
	}
	return true
}
779
780// loadBytes compares the cache info in <data> to the state of the filesystem
781// loadBytes returns a map representing <data> and also a slice of dirs that need to be re-walked
782func (f *Finder) loadBytes(id int, data []byte) (m *pathMap, dirsToWalk []string, err error) {
783
784 helperStartTime := time.Now()
785
786 cachedNodes, err := f.parseCacheEntry(data)
787 if err != nil {
788 return nil, nil, fmt.Errorf("Failed to parse block %v: %v\n", id, err.Error())
789 }
790
791 unmarshalDate := time.Now()
792 f.verbosef("Unmarshaled %v objects for %v in %v\n",
793 len(cachedNodes), id, unmarshalDate.Sub(helperStartTime))
794
795 tempMap := newPathMap("/")
796 stats := make([]statResponse, len(cachedNodes))
797
798 for i, node := range cachedNodes {
799 // check the file system for an updated timestamp
800 stats[i] = f.statDirSync(node.Path)
801 }
802
803 dirsToWalk = []string{}
804 for i, cachedNode := range cachedNodes {
805 updated := stats[i]
806 // save the cached value
807 container := tempMap.GetNode(cachedNode.Path, true)
808 container.mapNode = mapNode{statResponse: updated}
809
810 // if the metadata changed and the directory still exists, then
811 // make a note to walk it later
812 if !f.isInfoUpToDate(cachedNode.statResponse, updated) && updated.ModTime != 0 {
813 f.setModified()
814 // make a note that the directory needs to be walked
815 dirsToWalk = append(dirsToWalk, cachedNode.Path)
816 } else {
817 container.mapNode.FileNames = cachedNode.FileNames
818 }
819 }
820 // count the number of nodes to improve our understanding of the shape of the tree,
821 // thereby improving parallelism of subsequent searches
822 tempMap.UpdateNumDescendentsRecursive()
823
824 f.verbosef("Statted inodes of block %v in %v\n", id, time.Now().Sub(unmarshalDate))
825 return tempMap, dirsToWalk, nil
826}
827
// startFromExternalCache loads the cache database from disk
// startFromExternalCache waits to return until the load of the cache db is complete, but
// startFromExternalCache does not wait for all every listDir() or statDir() request to complete
//
// The load is a three-stage pipeline, each stage in its own goroutine(s):
//   readBlocks -> blockChannel -> processBlocks -> resultChannel -> combineResults
func (f *Finder) startFromExternalCache() (err error) {
	startTime := time.Now()
	dbPath := f.DbPath

	// open cache file and validate its header
	reader, err := f.filesystem.Open(dbPath)
	if err != nil {
		return errors.New("No data to load from database\n")
	}
	bufferedReader := bufio.NewReader(reader)
	if !f.validateCacheHeader(bufferedReader) {
		return errors.New("Cache header does not match")
	}
	f.verbosef("Database header matches, will attempt to use database %v\n", f.DbPath)

	// read the file and spawn threads to process it
	nodesToWalk := [][]*pathMap{}
	mainTree := newPathMap("/")

	// read the blocks and stream them into <blockChannel>
	type dataBlock struct {
		id   int
		err  error
		data []byte
	}
	blockChannel := make(chan dataBlock, f.numDbLoadingThreads)
	readBlocks := func() {
		index := 0
		for {
			// It takes some time to unmarshal the input from json, so we want
			// to unmarshal it in parallel. In order to find valid places to
			// break the input, we scan for the line separators that we inserted
			// (for this purpose) when we dumped the database.
			data, err := f.readLine(bufferedReader)
			var response dataBlock
			done := false
			if err != nil && err != io.EOF {
				response = dataBlock{id: index, err: err, data: nil}
				done = true
			} else {
				// err is nil or io.EOF; the final (possibly partial) block is
				// still forwarded before shutting down
				done = (err == io.EOF)
				response = dataBlock{id: index, err: nil, data: data}
			}
			blockChannel <- response
			index++
			duration := time.Since(startTime)
			f.verbosef("Read block %v after %v\n", index, duration)
			if done {
				f.verbosef("Read %v blocks in %v\n", index, duration)
				close(blockChannel)
				return
			}
		}
	}
	go readBlocks()

	// Read from <blockChannel> and stream the responses into <resultChannel>.
	type workResponse struct {
		id          int
		err         error
		tree        *pathMap
		updatedDirs []string
	}
	resultChannel := make(chan workResponse)
	processBlocks := func() {
		numProcessed := 0
		threadPool := newThreadPool(f.numDbLoadingThreads)
		for {
			// get a block to process
			block, received := <-blockChannel
			if !received {
				break
			}

			if block.err != nil {
				resultChannel <- workResponse{err: block.err}
				break
			}
			numProcessed++
			// wait until there is CPU available to process it
			threadPool.Run(
				func() {
					processStartTime := time.Now()
					f.verbosef("Starting to process block %v after %v\n",
						block.id, processStartTime.Sub(startTime))
					tempMap, updatedDirs, err := f.loadBytes(block.id, block.data)
					var response workResponse
					if err != nil {
						f.verbosef(
							"Block %v failed to parse with error %v\n",
							block.id, err)
						response = workResponse{err: err}
					} else {
						response = workResponse{
							id:          block.id,
							err:         nil,
							tree:        tempMap,
							updatedDirs: updatedDirs,
						}
					}
					f.verbosef("Processed block %v in %v\n",
						block.id, time.Since(processStartTime),
					)
					resultChannel <- response
				},
			)
		}
		threadPool.Wait()
		f.verbosef("Finished processing %v blocks in %v\n",
			numProcessed, time.Since(startTime))
		close(resultChannel)
	}
	go processBlocks()

	// Read from <resultChannel> and use the results
	combineResults := func() (err error) {
		for {
			result, received := <-resultChannel
			if !received {
				break
			}
			if err != nil {
				// In case of an error, wait for work to complete before
				// returning the error. This ensures that any subsequent
				// work doesn't need to compete for resources (and possibly
				// fail due to, for example, a filesystem limit on the number of
				// concurrently open files) with past work.
				continue
			}
			if result.err != nil {
				err = result.err
				continue
			}
			// update main tree
			mainTree.MergeIn(result.tree)
			// record any new directories that we will need to Stat()
			updatedNodes := make([]*pathMap, len(result.updatedDirs))
			for j, dir := range result.updatedDirs {
				node := mainTree.GetNode(dir, false)
				updatedNodes[j] = node
			}
			nodesToWalk = append(nodesToWalk, updatedNodes)
		}
		return err
	}
	err = combineResults()
	if err != nil {
		return err
	}

	f.nodes = *mainTree

	// after having loaded the entire db and therefore created entries for
	// the directories we know of, now it's safe to start calling ReadDir on
	// any updated directories
	for i := range nodesToWalk {
		f.listDirsAsync(nodesToWalk[i])
	}
	f.verbosef("Loaded db and statted known dirs in %v\n", time.Since(startTime))
	f.threadPool.Wait()
	f.verbosef("Loaded db and statted all dirs in %v\n", time.Now().Sub(startTime))

	return err
}
995
996// startWithoutExternalCache starts scanning the filesystem according to the cache config
997// startWithoutExternalCache should be called if startFromExternalCache is not applicable
998func (f *Finder) startWithoutExternalCache() {
Jeff Gastonb629e182017-08-14 16:49:18 -0700999 startTime := time.Now()
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001000 configDirs := f.cacheMetadata.Config.RootDirs
1001
1002 // clean paths
1003 candidates := make([]string, len(configDirs))
1004 for i, dir := range configDirs {
1005 candidates[i] = filepath.Clean(dir)
1006 }
1007 // remove duplicates
1008 dirsToScan := make([]string, 0, len(configDirs))
1009 for _, candidate := range candidates {
1010 include := true
1011 for _, included := range dirsToScan {
1012 if included == "/" || strings.HasPrefix(candidate+"/", included+"/") {
1013 include = false
1014 break
1015 }
1016 }
1017 if include {
1018 dirsToScan = append(dirsToScan, candidate)
1019 }
1020 }
1021
1022 // start searching finally
1023 for _, path := range dirsToScan {
1024 f.verbosef("Starting find of %v\n", path)
1025 f.startFind(path)
1026 }
Jeff Gastonb629e182017-08-14 16:49:18 -07001027
1028 f.threadPool.Wait()
1029
1030 f.verbosef("Scanned filesystem (not using cache) in %v\n", time.Now().Sub(startTime))
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001031}
1032
1033// isInfoUpToDate tells whether <new> can confirm that results computed at <old> are still valid
1034func (f *Finder) isInfoUpToDate(old statResponse, new statResponse) (equal bool) {
1035 if old.Inode != new.Inode {
1036 return false
1037 }
1038 if old.ModTime != new.ModTime {
1039 return false
1040 }
1041 if old.Device != new.Device {
1042 return false
1043 }
1044 return true
1045}
1046
1047func (f *Finder) wasModified() bool {
1048 return atomic.LoadInt32(&f.modifiedFlag) > 0
1049}
1050
1051func (f *Finder) setModified() {
1052 var newVal int32
1053 newVal = 1
1054 atomic.StoreInt32(&f.modifiedFlag, newVal)
1055}
1056
1057// sortedDirEntries exports directory entries to facilitate dumping them to the external cache
1058func (f *Finder) sortedDirEntries() []dirFullInfo {
1059 startTime := time.Now()
1060 nodes := make([]dirFullInfo, 0)
1061 for _, node := range f.nodes.DumpAll() {
1062 if node.ModTime != 0 {
1063 nodes = append(nodes, node)
1064 }
1065 }
1066 discoveryDate := time.Now()
1067 f.verbosef("Generated %v cache entries in %v\n", len(nodes), discoveryDate.Sub(startTime))
1068 less := func(i int, j int) bool {
1069 return nodes[i].Path < nodes[j].Path
1070 }
1071 sort.Slice(nodes, less)
1072 sortDate := time.Now()
1073 f.verbosef("Sorted %v cache entries in %v\n", len(nodes), sortDate.Sub(discoveryDate))
1074
1075 return nodes
1076}
1077
// serializeDb converts the cache database into a form to save to disk.
// The output consists of a header (version, lineSeparator, config dump)
// followed by one serialized block per loading thread, with all pieces
// joined by lineSeparator so that the loader can split the file back into
// independently-parseable chunks.
func (f *Finder) serializeDb() ([]byte, error) {
	// sort dir entries
	var entryList = f.sortedDirEntries()

	// Generate an output file that can be conveniently loaded using the same number of threads
	// as were used in this execution (because presumably that will be the number of threads
	// used in the next execution too)

	// generate header
	header := []byte{}
	header = append(header, []byte(f.cacheMetadata.Version)...)
	header = append(header, lineSeparator)
	configDump, err := f.cacheMetadata.Config.Dump()
	if err != nil {
		return nil, err
	}
	header = append(header, configDump...)

	// serialize individual blocks in parallel; never more blocks than entries
	numBlocks := f.numDbLoadingThreads
	if numBlocks > len(entryList) {
		numBlocks = len(entryList)
	}
	// blocks[0] is the header; blocks[1..numBlocks] are filled by the goroutines below
	blocks := make([][]byte, 1+numBlocks)
	blocks[0] = header
	blockMin := 0
	wg := sync.WaitGroup{}
	// errLock guards writes to the shared err variable from the worker
	// goroutines; if several workers fail, the last writer wins, which is
	// acceptable since any single error aborts the dump.
	var errLock sync.Mutex

	for i := 1; i <= numBlocks; i++ {
		// identify next block: split entryList into numBlocks contiguous,
		// approximately equal slices
		blockMax := len(entryList) * i / numBlocks
		block := entryList[blockMin:blockMax]

		// process block; each goroutine writes only to its own blocks[index],
		// so no lock is needed for the blocks slice itself
		wg.Add(1)
		go func(index int, block []dirFullInfo) {
			byteBlock, subErr := f.serializeCacheEntry(block)
			f.verbosef("Serialized block %v into %v bytes\n", index, len(byteBlock))
			if subErr != nil {
				f.verbosef("%v\n", subErr.Error())
				errLock.Lock()
				err = subErr
				errLock.Unlock()
			} else {
				blocks[index] = byteBlock
			}
			wg.Done()
		}(i, block)

		blockMin = blockMax
	}

	// wait for all workers before reading err (which they may have written)
	wg.Wait()

	if err != nil {
		return nil, err
	}

	content := bytes.Join(blocks, []byte{lineSeparator})

	return content, nil
}
1142
1143// dumpDb saves the cache database to disk
1144func (f *Finder) dumpDb() error {
1145 startTime := time.Now()
1146 f.verbosef("Dumping db\n")
1147
1148 tempPath := f.DbPath + ".tmp"
1149
1150 bytes, err := f.serializeDb()
1151 if err != nil {
1152 return err
1153 }
1154 serializeDate := time.Now()
1155 f.verbosef("Serialized db in %v\n", serializeDate.Sub(startTime))
1156 // dump file and atomically move
1157 err = f.filesystem.WriteFile(tempPath, bytes, 0777)
1158 if err != nil {
1159 return err
1160 }
1161 err = f.filesystem.Rename(tempPath, f.DbPath)
1162 if err != nil {
1163 return err
1164 }
1165
1166 f.verbosef("Wrote db in %v\n", time.Now().Sub(serializeDate))
1167 return nil
Jeff Gastonb629e182017-08-14 16:49:18 -07001168
1169}
1170
1171// canIgnoreFsErr checks for certain classes of filesystem errors that are safe to ignore
1172func (f *Finder) canIgnoreFsErr(err error) bool {
1173 pathErr, isPathErr := err.(*os.PathError)
1174 if !isPathErr {
1175 // Don't recognize this error
1176 return false
1177 }
1178 if pathErr.Err == os.ErrPermission {
1179 // Permission errors are ignored:
1180 // https://issuetracker.google.com/37553659
1181 // https://github.com/google/kati/pull/116
1182 return true
1183 }
1184 if pathErr.Err == os.ErrNotExist {
1185 // If a directory doesn't exist, that generally means the cache is out-of-date
1186 return true
1187 }
1188 // Don't recognize this error
1189 return false
1190}
1191
1192// onFsError should be called whenever a potentially fatal error is returned from a filesystem call
1193func (f *Finder) onFsError(path string, err error) {
1194 if !f.canIgnoreFsErr(err) {
1195 // We could send the errors through a channel instead, although that would cause this call
1196 // to block unless we preallocated a sufficient buffer or spawned a reader thread.
1197 // Although it wouldn't be too complicated to spawn a reader thread, it's still slightly
1198 // more convenient to use a lock. Only in an unusual situation should this code be
1199 // invoked anyway.
1200 f.errlock.Lock()
1201 f.fsErrs = append(f.fsErrs, fsErr{path: path, err: err})
1202 f.errlock.Unlock()
1203 }
1204}
1205
1206// discardErrsForPrunedPaths removes any errors for paths that are no longer included in the cache
1207func (f *Finder) discardErrsForPrunedPaths() {
1208 // This function could be somewhat inefficient due to being single-threaded,
1209 // but the length of f.fsErrs should be approximately 0, so it shouldn't take long anyway.
1210 relevantErrs := make([]fsErr, 0, len(f.fsErrs))
1211 for _, fsErr := range f.fsErrs {
1212 path := fsErr.path
1213 node := f.nodes.GetNode(path, false)
1214 if node != nil {
1215 // The path in question wasn't pruned due to a failure to process a parent directory.
1216 // So, the failure to process this path is important
1217 relevantErrs = append(relevantErrs, fsErr)
1218 }
1219 }
1220 f.fsErrs = relevantErrs
1221}
1222
1223// getErr returns an error based on previous calls to onFsErr, if any
1224func (f *Finder) getErr() error {
1225 f.discardErrsForPrunedPaths()
1226
1227 numErrs := len(f.fsErrs)
1228 if numErrs < 1 {
1229 return nil
1230 }
1231
1232 maxNumErrsToInclude := 10
1233 message := ""
1234 if numErrs > maxNumErrsToInclude {
1235 message = fmt.Sprintf("finder encountered %v errors: %v...", numErrs, f.fsErrs[:maxNumErrsToInclude])
1236 } else {
1237 message = fmt.Sprintf("finder encountered %v errors: %v", numErrs, f.fsErrs)
1238 }
1239
1240 return errors.New(message)
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001241}
1242
1243func (f *Finder) statDirAsync(dir *pathMap) {
1244 node := dir
1245 path := dir.path
1246 f.threadPool.Run(
1247 func() {
1248 updatedStats := f.statDirSync(path)
1249
1250 if !f.isInfoUpToDate(node.statResponse, updatedStats) {
1251 node.mapNode = mapNode{
1252 statResponse: updatedStats,
1253 FileNames: []string{},
1254 }
1255 f.setModified()
1256 if node.statResponse.ModTime != 0 {
1257 // modification time was updated, so re-scan for
1258 // child directories
1259 f.listDirAsync(dir)
1260 }
1261 }
1262 },
1263 )
1264}
1265
1266func (f *Finder) statDirSync(path string) statResponse {
1267
1268 fileInfo, err := f.filesystem.Lstat(path)
1269
1270 var stats statResponse
1271 if err != nil {
Jeff Gastonb629e182017-08-14 16:49:18 -07001272 // possibly record this error
1273 f.onFsError(path, err)
Jeff Gastonf1fd45e2017-08-09 18:25:28 -07001274 // in case of a failure to stat the directory, treat the directory as missing (modTime = 0)
1275 return stats
1276 }
1277 modTime := fileInfo.ModTime()
1278 stats = statResponse{}
1279 inode, err := f.filesystem.InodeNumber(fileInfo)
1280 if err != nil {
1281 panic(fmt.Sprintf("Could not get inode number of %v: %v\n", path, err.Error()))
1282 }
1283 stats.Inode = inode
1284 device, err := f.filesystem.DeviceNumber(fileInfo)
1285 if err != nil {
1286 panic(fmt.Sprintf("Could not get device number of %v: %v\n", path, err.Error()))
1287 }
1288 stats.Device = device
1289 permissionsChangeTime, err := f.filesystem.PermTime(fileInfo)
1290
1291 if err != nil {
1292 panic(fmt.Sprintf("Could not get permissions modification time (CTime) of %v: %v\n", path, err.Error()))
1293 }
1294 // We're only interested in knowing whether anything about the directory
1295 // has changed since last check, so we use the latest of the two
1296 // modification times (content modification (mtime) and
1297 // permission modification (ctime))
1298 if permissionsChangeTime.After(modTime) {
1299 modTime = permissionsChangeTime
1300 }
1301 stats.ModTime = modTime.UnixNano()
1302
1303 return stats
1304}
1305
1306// pruneCacheCandidates removes the items that we don't want to include in our persistent cache
1307func (f *Finder) pruneCacheCandidates(items *DirEntries) {
1308
1309 for _, fileName := range items.FileNames {
1310 for _, abortedName := range f.cacheMetadata.Config.PruneFiles {
1311 if fileName == abortedName {
1312 items.FileNames = []string{}
1313 items.DirNames = []string{}
1314 return
1315 }
1316 }
1317 }
1318
1319 // remove any files that aren't the ones we want to include
1320 writeIndex := 0
1321 for _, fileName := range items.FileNames {
1322 // include only these files
1323 for _, includedName := range f.cacheMetadata.Config.IncludeFiles {
1324 if fileName == includedName {
1325 items.FileNames[writeIndex] = fileName
1326 writeIndex++
1327 break
1328 }
1329 }
1330 }
1331 // resize
1332 items.FileNames = items.FileNames[:writeIndex]
1333
1334 writeIndex = 0
1335 for _, dirName := range items.DirNames {
1336 items.DirNames[writeIndex] = dirName
1337 // ignore other dirs that are known to not be inputs to the build process
1338 include := true
1339 for _, excludedName := range f.cacheMetadata.Config.ExcludeDirs {
1340 if dirName == excludedName {
1341 // don't include
1342 include = false
1343 break
1344 }
1345 }
1346 if include {
1347 writeIndex++
1348 }
1349 }
1350 // resize
1351 items.DirNames = items.DirNames[:writeIndex]
1352}
1353
1354func (f *Finder) listDirsAsync(nodes []*pathMap) {
1355 f.threadPool.Run(
1356 func() {
1357 for i := range nodes {
1358 f.listDirSync(nodes[i])
1359 }
1360 },
1361 )
1362}
1363
1364func (f *Finder) listDirAsync(node *pathMap) {
1365 f.threadPool.Run(
1366 func() {
1367 f.listDirSync(node)
1368 },
1369 )
1370}
1371
// listDirSync reads the contents of <dir> from the filesystem, prunes the
// results according to the cache config, updates <dir>'s cached file list and
// children, and queues an async Stat for any newly-discovered subdirectory.
func (f *Finder) listDirSync(dir *pathMap) {
	path := dir.path
	children, err := f.filesystem.ReadDir(path)

	if err != nil {
		// possibly record this error
		f.onFsError(path, err)
		// if listing the contents of the directory fails (presumably due to
		// permission denied), then treat the directory as empty
		children = []os.FileInfo{}
	}

	var subdirs []string
	var subfiles []string

	// partition the children into subdirectories and files
	for _, child := range children {
		linkBits := child.Mode() & os.ModeSymlink
		isLink := linkBits != 0
		// NOTE(review): if ReadDir returns Lstat-style entries, a symlink
		// reports IsDir() == false and the isLink check below can never
		// trigger inside this branch — confirm f.filesystem.ReadDir semantics.
		if child.IsDir() {
			if !isLink {
				// Skip symlink dirs.
				// We don't have to support symlink dirs because
				// that would cause duplicates.
				subdirs = append(subdirs, child.Name())
			}
		} else {
			// We do have to support symlink files because the link name might be
			// different than the target name
			// (for example, Android.bp -> build/soong/root.bp)
			subfiles = append(subfiles, child.Name())
		}

	}
	parentNode := dir

	// discard any files/dirs the cache config says to ignore
	entry := &DirEntries{Path: path, DirNames: subdirs, FileNames: subfiles}
	f.pruneCacheCandidates(entry)

	// create a pathMap node for each relevant subdirectory
	relevantChildren := map[string]*pathMap{}
	for _, subdirName := range entry.DirNames {
		childNode, found := parentNode.children[subdirName]
		// if we already knew of this directory, then we already have a request pending to Stat it
		// if we didn't already know of this directory, then we must Stat it now
		if !found {
			childNode = parentNode.newChild(subdirName)
			f.statDirAsync(childNode)
		}
		relevantChildren[subdirName] = childNode
	}
	// Note that in rare cases, it's possible that we're reducing the set of
	// children via this statement, if these are all true:
	// 1. we previously had a cache that knew about subdirectories of parentNode
	// 2. the user created a prune-file (described in pruneCacheCandidates)
	//    inside <parentNode>, which specifies that the contents of parentNode
	//    are to be ignored.
	// The fact that it's possible to remove children here means that *pathMap structs
	// must not be looked up from f.nodes by filepath (and instead must be accessed by
	// direct pointer) until after every listDirSync completes
	parentNode.FileNames = entry.FileNames
	parentNode.children = relevantChildren

}
1435
1436// listMatches takes a node and a function that specifies which subdirectories and
1437// files to include, and listMatches returns the matches
1438func (f *Finder) listMatches(node *pathMap,
1439 filter WalkFunc) (subDirs []*pathMap, filePaths []string) {
1440 entries := DirEntries{
1441 FileNames: node.FileNames,
1442 }
1443 entries.DirNames = make([]string, 0, len(node.children))
1444 for childName := range node.children {
1445 entries.DirNames = append(entries.DirNames, childName)
1446 }
1447
1448 dirNames, fileNames := filter(entries)
1449
1450 subDirs = []*pathMap{}
1451 filePaths = make([]string, 0, len(fileNames))
1452 for _, fileName := range fileNames {
1453 filePaths = append(filePaths, joinCleanPaths(node.path, fileName))
1454 }
1455 subDirs = make([]*pathMap, 0, len(dirNames))
1456 for _, childName := range dirNames {
1457 child, ok := node.children[childName]
1458 if ok {
1459 subDirs = append(subDirs, child)
1460 }
1461 }
1462
1463 return subDirs, filePaths
1464}
1465
1466// findInCacheMultithreaded spawns potentially multiple goroutines with which to search the cache.
1467func (f *Finder) findInCacheMultithreaded(node *pathMap, filter WalkFunc,
1468 approxNumThreads int) []string {
1469
1470 if approxNumThreads < 2 {
1471 // Done spawning threads; process remaining directories
1472 return f.findInCacheSinglethreaded(node, filter)
1473 }
1474
1475 totalWork := 0
1476 for _, child := range node.children {
1477 totalWork += child.approximateNumDescendents
1478 }
1479 childrenResults := make(chan []string, len(node.children))
1480
1481 subDirs, filePaths := f.listMatches(node, filter)
1482
1483 // process child directories
1484 for _, child := range subDirs {
1485 numChildThreads := approxNumThreads * child.approximateNumDescendents / totalWork
1486 childProcessor := func(child *pathMap) {
1487 childResults := f.findInCacheMultithreaded(child, filter, numChildThreads)
1488 childrenResults <- childResults
1489 }
1490 // If we're allowed to use more than 1 thread to process this directory,
1491 // then instead we use 1 thread for each subdirectory.
1492 // It would be strange to spawn threads for only some subdirectories.
1493 go childProcessor(child)
1494 }
1495
1496 // collect results
1497 for i := 0; i < len(subDirs); i++ {
1498 childResults := <-childrenResults
1499 filePaths = append(filePaths, childResults...)
1500 }
1501 close(childrenResults)
1502
1503 return filePaths
1504}
1505
1506// findInCacheSinglethreaded synchronously searches the cache for all matching file paths
1507// note findInCacheSinglethreaded runs 2X to 4X as fast by being iterative rather than recursive
1508func (f *Finder) findInCacheSinglethreaded(node *pathMap, filter WalkFunc) []string {
1509 if node == nil {
1510 return []string{}
1511 }
1512
1513 nodes := []*pathMap{node}
1514 matches := []string{}
1515
1516 for len(nodes) > 0 {
1517 currentNode := nodes[0]
1518 nodes = nodes[1:]
1519
1520 subDirs, filePaths := f.listMatches(currentNode, filter)
1521
1522 nodes = append(nodes, subDirs...)
1523
1524 matches = append(matches, filePaths...)
1525 }
1526 return matches
1527}