#!/usr/bin/env python
#
# Copyright (C) 2018 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Merge multiple CSV files, possibly with different columns.
"""
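
# Example invocation (the script name, file names, and key field below are
# hypothetical and shown only to illustrate the expected usage):
#
#   merge_csv.py --key_field signature --output merged.csv input1.csv input2.csv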

import argparse
import csv
import io
import heapq
import itertools
import operator

from zipfile import ZipFile

args_parser = argparse.ArgumentParser(
    description='Merge given CSV files into a single one.'
)
args_parser.add_argument(
    '--header',
    help='Comma separated field names; '
    'if missing determines the header from input files.',
)
args_parser.add_argument(
    '--zip_input',
    help='Treat files as ZIP archives containing CSV files to merge.',
    action="store_true",
)
args_parser.add_argument(
    '--key_field',
    help='The name of the field by which the rows should be sorted. '
    'Must be in the field names. '
    'Will be the first field in the output. '
    'All input files must be sorted by that field.',
)
args_parser.add_argument(
    '--output',
    help='Output file for merged CSV.',
    default='-',
    type=argparse.FileType('w'),
)
args_parser.add_argument('files', nargs=argparse.REMAINDER)
args = args_parser.parse_args()


def dict_reader(csvfile):
    return csv.DictReader(csvfile, delimiter=',', quotechar='|')


csv_readers = []
if not args.zip_input:
    for file in args.files:
        csv_readers.append(dict_reader(open(file, 'r')))
else:
    for file in args.files:
        with ZipFile(file) as zipfile:
            for entry in zipfile.namelist():
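                # Only entries with the '.uau' suffix are treated as CSV files
                # to merge; everything else in the archive is ignored.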
                if entry.endswith('.uau'):
                    csv_readers.append(
                        dict_reader(io.TextIOWrapper(zipfile.open(entry, 'r')))
                    )

if args.header:
    fieldnames = args.header.split(',')
else:
    headers = {}
    # Build union of all columns from source files:
    for reader in csv_readers:
        for fieldname in reader.fieldnames:
            headers[fieldname] = ""
    fieldnames = list(headers.keys())

# By default chain the csv readers together so that the resulting output is
# the concatenation of the rows from each of them:
all_rows = itertools.chain.from_iterable(csv_readers)

if len(csv_readers) > 0:
    keyField = args.key_field
    if keyField:
        assert keyField in fieldnames, (
            "--key_field {} not found, must be one of {}\n"
        ).format(keyField, ",".join(fieldnames))
        # Make the key field the first field in the output
        keyFieldIndex = fieldnames.index(args.key_field)
        fieldnames.insert(0, fieldnames.pop(keyFieldIndex))
        # Create an iterable that performs a lazy merge sort on the csv readers
        # sorting the rows by the key field.
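        # For example, merging readers whose rows are already sorted on a
        # hypothetical key field 'k', e.g. [{'k': 'a'}, {'k': 'c'}] and
        # [{'k': 'b'}], yields the rows in the order a, b, c without reading
        # every row into memory first.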
        all_rows = heapq.merge(*csv_readers, key=operator.itemgetter(keyField))

# Write all rows from the input files to the output:
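# Rows that lack some of the output columns are written with empty values for
# those columns (csv.DictWriter's default restval is '').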
writer = csv.DictWriter(
    args.output,
    delimiter=',',
    quotechar='|',
    quoting=csv.QUOTE_MINIMAL,
    dialect='unix',
    fieldnames=fieldnames,
)
writer.writeheader()

# Read all the rows from the input and write them to the output in the correct
# order:
for row in all_rows:
    writer.writerow(row)