Handle more mixed warning lines from RBE
* Use android_root or __file__ path to find
  the source tree root and its sub directories.
* Use the sub directory list to clean up RBE-returned
  warning lines, removing any mixed-in prefix that
  precedes a top-level sub directory name.
* Change some re.match rules and their order to avoid
  unnecessary comparisons.
Test: warn.py --url=http://cs/android --separator='?l=' build.log > warnings.html
Test: warn.py --gencsv build.log > warnings.csv
Bug: 198657613
Change-Id: I78cc17d04b8ab9e12935ef04797f3272298d5267
diff --git a/tools/warn/warn_common.py b/tools/warn/warn_common.py
index 61c8676..f24cfb7 100755
--- a/tools/warn/warn_common.py
+++ b/tools/warn/warn_common.py
@@ -228,6 +228,14 @@
   return ''
 
 
+def find_android_root_top_dirs(root_dir):
+  """Return directory names under root_dir, or None if root_dir is not a dir."""
+  if not os.path.isdir(root_dir):
+    return None
+  return list(filter(lambda d: os.path.isdir(root_dir + '/' + d),
+                     os.listdir(root_dir)))
+
+
 def find_android_root(buildlog):
   """Guess android source root from common prefix of file paths."""
   # Use the longest common prefix of the absolute file paths
@@ -239,8 +247,8 @@
     # We want to find android_root of a local build machine.
     # Do not use RBE warning lines, which has '/b/f/w/' path prefix.
     # Do not use /tmp/ file warnings.
-    if warning_pattern.match(line) and (
-        '/b/f/w' not in line and not line.startswith('/tmp/')):
+    if ('/b/f/w' not in line and not line.startswith('/tmp/') and
+        warning_pattern.match(line)):
       warning_lines.append(line)
       count += 1
       if count > 9999:
@@ -251,15 +259,23 @@
         path = os.path.normpath(re.sub(':.*$', '', line))
         android_root = find_warn_py_and_android_root(path)
         if android_root:
-          return android_root
+          return android_root, find_android_root_top_dirs(android_root)
   # Do not use common prefix of a small number of paths.
+  android_root = ''
   if count > 10:
     # pytype: disable=wrong-arg-types
     root_path = os.path.commonprefix(warning_lines)
     # pytype: enable=wrong-arg-types
     if len(root_path) > 2 and root_path[len(root_path) - 1] == '/':
-      return root_path[:-1]
-  return ''
+      android_root = root_path[:-1]
+  if android_root and os.path.isdir(android_root):
+    return android_root, find_android_root_top_dirs(android_root)
+  # When the build.log file is moved to a different machine where
+  # android_root is not found, use the location of this script
+  # to find the android source tree root and its sub directories.
+  # This __file__ is /..../build/make/tools/warn/warn_common.py
+  script_root = __file__.replace('/build/make/tools/warn/warn_common.py', '')
+  return android_root, find_android_root_top_dirs(script_root)
 
 
 def remove_android_root_prefix(path, android_root):
@@ -310,8 +326,6 @@
   warning_pattern = re.compile(chrome_warning_pattern)
 
   # Collect all unique warning lines
-  # Remove the duplicated warnings save ~8% of time when parsing
-  # one typical build log than before
   unique_warnings = dict()
   for line in infile:
     if warning_pattern.match(line):
@@ -354,7 +368,7 @@
   target_variant = 'unknown'
   build_id = 'unknown'
   use_rbe = False
-  android_root = find_android_root(infile)
+  android_root, root_top_dirs = find_android_root(infile)
   infile.seek(0)
 
   # rustc warning messages have two lines that should be combined:
@@ -367,24 +381,39 @@
   # C/C++ compiler warning messages have line and column numbers:
   #     some/path/file.c:line_number:column_number: warning: description
   warning_pattern = re.compile('(^[^ ]*/[^ ]*: warning: .*)|(^warning: .*)')
-  warning_without_file = re.compile('^warning: .*')
   rustc_file_position = re.compile('^[ ]+--> [^ ]*/[^ ]*:[0-9]+:[0-9]+')
 
-  # If RBE was used, try to reclaim some warning lines mixed with some
-  # leading chars from other concurrent job's stderr output .
+  # If RBE was used, try to reclaim some warning lines (from stdout)
+  # that contain leading characters from stderr.
   # The leading characters can be any character, including digits and spaces.
-  # It's impossible to correctly identify the starting point of the source
-  # file path without the file directory name knowledge.
-  # Here we can only be sure to recover lines containing "/b/f/w/".
-  rbe_warning_pattern = re.compile('.*/b/f/w/[^ ]*: warning: .*')
 
-   # Collect all unique warning lines
-  # Remove the duplicated warnings save ~8% of time when parsing
-  # one typical build log than before
+  # If a warning line's source file path contains the special RBE prefix
+  # /b/f/w/, we can remove all leading chars up to and including the "/b/f/w/".
+  bfw_warning_pattern = re.compile('.*/b/f/w/([^ ]*: warning: .*)')
+
+  # When android_root is known and available, we find its top directories
+  # and remove all leading chars before a top directory name.
+  # We assume that the leading chars from stderr do not contain "/".
+  # For example,
+  #   10external/...
+  #   12 warningsexternal/...
+  #   413 warningexternal/...
+  #   5 warnings generatedexternal/...
+  #   Suppressed 1000 warnings (packages/modules/...
+  if root_top_dirs:
+    extra_warning_pattern = re.compile(
+        '^.[^/]*((' + '|'.join(root_top_dirs) +
+        ')/[^ ]*: warning: .*)')
+  else:
+    extra_warning_pattern = re.compile('^[^/]* ([^ /]*/[^ ]*: warning: .*)')
+
+  # Collect all unique warning lines
   unique_warnings = dict()
+  checked_warning_lines = dict()
   line_counter = 0
   prev_warning = ''
   for line in infile:
+    line_counter += 1
     if prev_warning:
       if rustc_file_position.match(line):
         # must be a rustc warning, combine 2 lines into one warning
@@ -399,14 +428,32 @@
           prev_warning, flags, android_root, unique_warnings)
       prev_warning = ''
 
-    if use_rbe and rbe_warning_pattern.match(line):
-      cleaned_up_line = re.sub('.*/b/f/w/', '', line)
-      unique_warnings = add_normalized_line_to_warnings(
-          cleaned_up_line, flags, android_root, unique_warnings)
+    # re.match is slow when several warning line patterns are tried
+    # on long input lines like "TIMEOUT: ...".
+    # We save significant time by skipping lines without "warning: ".
+    # But do not skip the first 100 lines, because we want to
+    # catch build variables.
+    if line_counter > 100 and line.find('warning: ') < 0:
       continue
 
+    # A large clean build output can contain up to 90% duplicated
+    # "warning:" lines. If we can skip them quickly, we can
+    # speed up this for-loop 3X to 5X.
+    if line in checked_warning_lines:
+      continue
+    checked_warning_lines[line] = True
+
+    # Clean up extra prefix if RBE is used.
+    if use_rbe:
+      if '/b/f/w/' in line:
+        result = bfw_warning_pattern.search(line)
+      else:
+        result = extra_warning_pattern.search(line)
+      if result is not None:
+        line = result.group(1)
+
     if warning_pattern.match(line):
-      if warning_without_file.match(line):
+      if line.startswith('warning: '):
         # save this line and combine it with the next line
         prev_warning = line
       else:
@@ -416,7 +463,6 @@
 
     if line_counter < 100:
       # save a little bit of time by only doing this for the first few lines
-      line_counter += 1
       result = re.search('(?<=^PLATFORM_VERSION=).*', line)
       if result is not None:
         platform_version = result.group(0)