Sort hiddenapi monolithic files by signature

Adds a new --key_field option to merge_csv.py which specifies the name of the field that should be used to sort the input. If specified, it causes that field to be the first in each row and performs the merge step of a merge sort on the input files. This assumes that each input file is already sorted by that field.

Modifies the rules that use merge_csv.py to pass in:
    --key_field signature
to sort the rows by signature.

Bug: 180387396
Test: Verified that hiddenapi files (both aggregated ones and for the individual modules) are not affected by this change other than changing the order.
Change-Id: Idcd5f0fea373b520b604889e1c280f21ed495660

commit: 2c36f240828eaad1904d231b8809e212aecd6708 [log] [tgz]
author: Paul Duffin <paulduffin@google.com> Tue Feb 16 16:57:06 2021 +0000
committer: Paul Duffin <paulduffin@google.com> Tue Feb 16 17:38:08 2021 +0000
tree: f61e95c8dd1d2ffe5a2cbb51c44775d4af5c96c7
parent: 82b3fcf12334a9d3bc257eecc948cc6fbc6ac473 [diff]
diff --git a/java/hiddenapi.go b/java/hiddenapi.go
index 2d94ac4..da2c48f 100644
--- a/java/hiddenapi.go
+++ b/java/hiddenapi.go

@@ -254,6 +254,7 @@
 	rule.Command().
 		BuiltTool("merge_csv").
 		Flag("--zip_input").
+		Flag("--key_field signature").
 		FlagWithOutput("--output=", indexCSV).
 		Inputs(classesJars)
 	rule.Build("merged-hiddenapi-index", "Merged Hidden API index")

diff --git a/java/hiddenapi_singleton.go b/java/hiddenapi_singleton.go
index 25d39f3..82e8b3f 100644
--- a/java/hiddenapi_singleton.go
+++ b/java/hiddenapi_singleton.go

@@ -424,6 +424,7 @@
 
 	rule.Command().
 		BuiltTool("merge_csv").
+		Flag("--key_field signature").
 		FlagWithOutput("--output=", outputPath).
 		Inputs(metadataCSV)
 
@@ -535,6 +536,7 @@
 	rule := android.NewRuleBuilder(pctx, ctx)
 	rule.Command().
 		BuiltTool("merge_csv").
+		Flag("--key_field signature").
 		FlagWithArg("--header=", "signature,file,startline,startcol,endline,endcol,properties").
 		FlagWithOutput("--output=", hiddenAPISingletonPaths(ctx).index).
 		Inputs(indexes)

diff --git a/scripts/hiddenapi/merge_csv.py b/scripts/hiddenapi/merge_csv.py
index 5ad61b2..b047aab 100755
--- a/scripts/hiddenapi/merge_csv.py
+++ b/scripts/hiddenapi/merge_csv.py

@@ -20,6 +20,9 @@
 import argparse
 import csv
 import io
+import heapq
+import itertools
+import operator
 
 from zipfile import ZipFile
 
@@ -28,6 +31,10 @@
                                           'if missing determines the header from input files.')
 args_parser.add_argument('--zip_input', help='Treat files as ZIP archives containing CSV files to merge.',
                          action="store_true")
+args_parser.add_argument('--key_field', help='The name of the field by which the rows should be sorted. '
+                                             'Must be in the field names. '
+                                             'Will be the first field in the output. '
+                                             'All input files must be sorted by that field.')
 args_parser.add_argument('--output', help='Output file for merged CSV.',
                          default='-', type=argparse.FileType('w'))
 args_parser.add_argument('files', nargs=argparse.REMAINDER)
@@ -57,10 +64,29 @@
         headers = headers.union(reader.fieldnames)
     fieldnames = sorted(headers)
 
-# Concatenate all files to output:
+# By default chain the csv readers together so that the resulting output is
+# the concatenation of the rows from each of them:
+all_rows = itertools.chain.from_iterable(csv_readers)
+
+if len(csv_readers) > 0:
+    keyField = args.key_field
+    if keyField:
+        assert keyField in fieldnames, (
+            "--key_field {} not found, must be one of {}\n").format(
+            keyField, ",".join(fieldnames))
+        # Make the key field the first field in the output
+        keyFieldIndex = fieldnames.index(args.key_field)
+        fieldnames.insert(0, fieldnames.pop(keyFieldIndex))
+        # Create an iterable that performs a lazy merge sort on the csv readers
+        # sorting the rows by the key field.
+        all_rows = heapq.merge(*csv_readers, key=operator.itemgetter(keyField))
+
+# Write all rows from the input files to the output:
 writer = csv.DictWriter(args.output, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL,
                         dialect='unix', fieldnames=fieldnames)
 writer.writeheader()
-for reader in csv_readers:
-    for row in reader:
-        writer.writerow(row)
+
+# Read all the rows from the input and write them to the output in the correct
+# order:
+for row in all_rows:
+  writer.writerow(row)
commit	2c36f240828eaad1904d231b8809e212aecd6708	[log] [tgz]
author	Paul Duffin <paulduffin@google.com>	Tue Feb 16 16:57:06 2021 +0000
committer	Paul Duffin <paulduffin@google.com>	Tue Feb 16 17:38:08 2021 +0000
tree	f61e95c8dd1d2ffe5a2cbb51c44775d4af5c96c7
parent	82b3fcf12334a9d3bc257eecc948cc6fbc6ac473 [diff]