Blame - scripts/hiddenapi/merge_csv.py - android_build_soong

blob: a65326c51e4edec2c9fbec910475c3defe0a3f7a [file] [log] [blame]

Paul Duffin	fdada68	2021-02-08 18:08:09 +0000	[diff] [blame]	1	#!/usr/bin/env python
				2	#
				3	# Copyright (C) 2018 The Android Open Source Project
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the "License");
				6	# you may not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# http://www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an "AS IS" BASIS,
				13	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
				16	"""
				17	Merge multiple CSV files, possibly with different columns.
				18	"""
				19
				20	import argparse
				21	import csv
				22	import io
Paul Duffin	2c36f24	2021-02-16 16:57:06 +0000	[diff] [blame]	23	import heapq
				24	import itertools
				25	import operator
Paul Duffin	fdada68	2021-02-08 18:08:09 +0000	[diff] [blame]	26
				27	from zipfile import ZipFile
				28
				29	args_parser = argparse.ArgumentParser(description='Merge given CSV files into a single one.')
				30	args_parser.add_argument('--header', help='Comma separated field names; '
				31	'if missing determines the header from input files.')
Paul Duffin	031d869	2021-02-12 11:46:42 +0000	[diff] [blame]	32	args_parser.add_argument('--zip_input', help='Treat files as ZIP archives containing CSV files to merge.',
				33	action="store_true")
Paul Duffin	2c36f24	2021-02-16 16:57:06 +0000	[diff] [blame]	34	args_parser.add_argument('--key_field', help='The name of the field by which the rows should be sorted. '
				35	'Must be in the field names. '
				36	'Will be the first field in the output. '
				37	'All input files must be sorted by that field.')
Paul Duffin	fdada68	2021-02-08 18:08:09 +0000	[diff] [blame]	38	args_parser.add_argument('--output', help='Output file for merged CSV.',
				39	default='-', type=argparse.FileType('w'))
				40	args_parser.add_argument('files', nargs=argparse.REMAINDER)
				41	args = args_parser.parse_args()
				42
				43
				44	def dict_reader(input):
				45	return csv.DictReader(input, delimiter=',', quotechar='\|')
				46
# Create one csv.DictReader per input. The readers are consumed lazily when
# the rows are written out, so the underlying streams have to stay open until
# then and are not closed here.
csv_readers = []
if args.zip_input:
    for zip_path in args.files:
        with ZipFile(zip_path) as archive:
            for entry in archive.namelist():
                # Only the CSV members of the archive are merged; any other
                # member is ignored.
                if entry.endswith('.csv'):
                    csv_readers.append(
                        dict_reader(io.TextIOWrapper(archive.open(entry, 'r'))))
else:
    for csv_path in args.files:
        csv_readers.append(dict_reader(open(csv_path, 'r')))
Paul Duffin	fdada68	2021-02-08 18:08:09 +0000	[diff] [blame]	57
# Decide the output columns: an explicit --header wins, otherwise use the
# union of the columns of all input files in first-seen order.
if args.header:
    fieldnames = args.header.split(',')
else:
    # A dict is used as an insertion-ordered set of column names.
    headers = {}
    # Build union of all columns from source files:
    for reader in csv_readers:
        # An empty input has no header row, so its fieldnames is None; treat
        # it as contributing no columns instead of failing on iteration.
        for fieldname in reader.fieldnames or ():
            headers[fieldname] = ""
    fieldnames = list(headers)
Paul Duffin	fdada68	2021-02-08 18:08:09 +0000	[diff] [blame]	67
# By default chain the csv readers together so that the resulting output is
# the concatenation of the rows from each of them:
all_rows = itertools.chain.from_iterable(csv_readers)

key_field = args.key_field
if csv_readers and key_field:
    # Validate explicitly instead of with `assert`, which is stripped when
    # Python runs with -O and would let a bad key through to the merge.
    if key_field not in fieldnames:
        args_parser.error(
            "--key_field {} not found, must be one of {}\n".format(
                key_field, ",".join(fieldnames)))
    # Make the key field the first field in the output.
    fieldnames.remove(key_field)
    fieldnames.insert(0, key_field)
    # Create an iterable that performs a lazy merge sort on the csv readers,
    # sorting the rows by the key field. Each input must already be sorted by
    # that field for the merged output to be sorted.
    all_rows = heapq.merge(*csv_readers, key=operator.itemgetter(key_field))
				84
				85	# Write all rows from the input files to the output:
Paul Duffin	fdada68	2021-02-08 18:08:09 +0000	[diff] [blame]	86	writer = csv.DictWriter(args.output, delimiter=',', quotechar='\|', quoting=csv.QUOTE_MINIMAL,
				87	dialect='unix', fieldnames=fieldnames)
				88	writer.writeheader()
Paul Duffin	2c36f24	2021-02-16 16:57:06 +0000	[diff] [blame]	89
				90	# Read all the rows from the input and write them to the output in the correct
				91	# order:
				92	for row in all_rows:
				93	writer.writerow(row)