Blame - scripts/hiddenapi/merge_csv.py - android_build_soong

blob: c17ec25f1e6d2ab519bbebc3c4af48398af74203 [file] [log] [blame]

Paul Duffin	fdada68	2021-02-08 18:08:09 +0000	[diff] [blame]	1	#!/usr/bin/env python
				2	#
				3	# Copyright (C) 2018 The Android Open Source Project
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the "License");
				6	# you may not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# http://www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an "AS IS" BASIS,
				13	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
Spandan Das	2c2219b	2021-08-25 17:47:43 +0000	[diff] [blame]	16	"""Merge multiple CSV files, possibly with different columns.
Paul Duffin	fdada68	2021-02-08 18:08:09 +0000	[diff] [blame]	17	"""
				18
				19	import argparse
				20	import csv
				21	import io
Paul Duffin	2c36f24	2021-02-16 16:57:06 +0000	[diff] [blame]	22	import heapq
				23	import itertools
				24	import operator
Paul Duffin	fdada68	2021-02-08 18:08:09 +0000	[diff] [blame]	25
				26	from zipfile import ZipFile
				27
# Command line interface: optional --header/--zip_input/--key_field/--output
# flags followed by the list of input files.
args_parser = argparse.ArgumentParser(
    description='Merge given CSV files into a single one.')
args_parser.add_argument(
    '--header',
    help=('Comma separated field names; if missing determines the header '
          'from input files.'))
args_parser.add_argument(
    '--zip_input',
    action='store_true',
    help='Treat files as ZIP archives containing CSV files to merge.')
args_parser.add_argument(
    '--key_field',
    help=('The name of the field by which the rows should be sorted. '
          'Must be in the field names. '
          'Will be the first field in the output. '
          'All input files must be sorted by that field.'))
args_parser.add_argument(
    '--output',
    type=argparse.FileType('w'),
    default='-',
    help='Output file for merged CSV.')
# Everything left on the command line is treated as an input file.
args_parser.add_argument('files', nargs=argparse.REMAINDER)
args = args_parser.parse_args()
				56
				57
def dict_reader(csvfile):
    """Return a csv.DictReader over the given text stream.

    Fields are separated by ',' and may be quoted with the '|' character.

    Args:
        csvfile: An iterable of text lines (e.g. an open text file) whose
            first row holds the column names.

    Returns:
        A csv.DictReader yielding one mapping of column name to value per row.
    """
    # The quote character must be exactly one character long; a backslash
    # escaped pipe is two characters and makes the csv module raise TypeError.
    return csv.DictReader(csvfile, delimiter=',', quotechar='|')
				60
Paul Duffin	fdada68	2021-02-08 18:08:09 +0000	[diff] [blame]	61
# Collect one DictReader per input CSV: either directly from each named file,
# or from every '.uau' entry found inside each named ZIP archive.
csv_readers = []
if args.zip_input:
    for archive_path in args.files:
        with ZipFile(archive_path) as archive:
            csv_readers.extend(
                dict_reader(io.TextIOWrapper(archive.open(name, 'r')))
                for name in archive.namelist()
                if name.endswith('.uau')
            )
else:
    csv_readers.extend(dict_reader(open(path, 'r')) for path in args.files)
Paul Duffin	fdada68	2021-02-08 18:08:09 +0000	[diff] [blame]	74
# Decide the output columns: an explicit --header wins, otherwise use the
# union of all columns from the source files in first-seen order.
if args.header:
    fieldnames = args.header.split(',')
else:
    # dict.fromkeys drops duplicates while keeping insertion order.
    fieldnames = list(dict.fromkeys(
        column
        for reader in csv_readers
        for column in reader.fieldnames
    ))
Paul Duffin	fdada68	2021-02-08 18:08:09 +0000	[diff] [blame]	84
# By default chain the csv readers together so that the resulting output is
# the concatenation of the rows from each of them:
all_rows = itertools.chain.from_iterable(csv_readers)

key_field = args.key_field
if csv_readers and key_field:
    # Validate the user supplied key explicitly rather than with an assert,
    # which would be stripped when Python runs with -O.
    if key_field not in fieldnames:
        args_parser.error(
            "--key_field {} not found, must be one of {}\n".format(
                key_field, ",".join(fieldnames)))
    # Make the key field the first field in the output
    fieldnames.insert(0, fieldnames.pop(fieldnames.index(key_field)))
    # Create an iterable that performs a lazy merge sort on the csv readers
    # sorting the rows by the key field. Each reader must already yield its
    # rows ordered by that field for the merged result to be ordered.
    all_rows = heapq.merge(*csv_readers, key=operator.itemgetter(key_field))
				101
				102	# Write all rows from the input files to the output:
# Write all rows from the input files to the output. The explicit formatting
# keywords below take precedence over the corresponding 'unix' dialect values.
writer = csv.DictWriter(
    args.output,
    fieldnames=fieldnames,
    dialect='unix',
    delimiter=',',
    # Must be a single character; a backslash escaped pipe is two characters
    # and makes the csv module raise TypeError when the writer is created.
    quotechar='|',
    quoting=csv.QUOTE_MINIMAL,
)
writer.writeheader()

# Read all the rows from the input and write them to the output in the correct
# order; all_rows is consumed lazily, one row at a time.
writer.writerows(all_rows)