Blame - scripts/generate-notice-files.py - android_build_soong

blob: 1b4acfaaf387f78135b7bf4306506db113094e7c [file] [log] [blame]

Christopher Ferris	8e5feaa	2021-09-01 16:31:59 -0700	[diff] [blame]	1	#!/usr/bin/env python3
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	2	#
				3	# Copyright (C) 2012 The Android Open Source Project
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the "License");
				6	# you may not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# http://www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an "AS IS" BASIS,
				13	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
				16	"""
				17	Usage: generate-notice-files --text-output [plain text output file] \
				18	--html-output [html output file] \
				19	--xml-output [xml output file] \
				20	-t [file title] -s [directory of notices]
				21
				22	Generate the Android notice files, including both text and html files.
				23
				24	-h to display this usage message and exit.
				25	"""
				26	from collections import defaultdict
				27	import argparse
				28	import hashlib
				29	import itertools
				30	import os
				31	import os.path
				32	import re
Christopher Ferris	8e5feaa	2021-09-01 16:31:59 -0700	[diff] [blame]	33	import struct
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	34	import sys
				35
				36	MD5_BLOCKSIZE = 1024 * 1024
				37	HTML_ESCAPE_TABLE = {
Christopher Ferris	8e5feaa	2021-09-01 16:31:59 -0700	[diff] [blame]	38	b"&": b"&",
				39	b'"': b""",
				40	b"'": b"'",
				41	b">": b">",
				42	b"<": b"<",
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	43	}
				44
def md5sum(filename):
    """Calculate an MD5 of the file given by FILENAME,
    and return hex digest as a string.
    Output should be compatible with md5sum command.

    The file is read in chunks of MD5_BLOCKSIZE bytes, so a large file is
    never held in memory in one piece.
    """
    digest = hashlib.md5()
    # The context manager closes the file even when a read raises.
    with open(filename, "rb") as f:
        # f.read() returns b"" at end of file, which ends the iteration.
        for block in iter(lambda: f.read(MD5_BLOCKSIZE), b""):
            digest.update(block)
    return digest.hexdigest()
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	59
				60
				61	def html_escape(text):
				62	"""Produce entities within text."""
Christopher Ferris	8e5feaa	2021-09-01 16:31:59 -0700	[diff] [blame]	63	# Using for i in text doesn't work since i will be an int, not a byte.
				64	# There are multiple ways to solve this, but the most performant way
				65	# to iterate over a byte array is to use unpack. Using the
				66	# for i in range(len(text)) and using that to get a byte using array
				67	# slices is twice as slow as this method.
				68	return b"".join(HTML_ESCAPE_TABLE.get(i,i) for i in struct.unpack(str(len(text)) + 'c', text))
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	69
# Inline style sheet that combine_notice_files_html() writes into the <head>
# of the generated HTML page.
HTML_OUTPUT_CSS = b"""
<style type="text/css">
body { padding: 0; font-family: sans-serif; }
.same-license { background-color: #eeeeee; border-top: 20px solid white; padding: 10px; }
.label { font-weight: bold; }
.file-list { margin-left: 1em; color: blue; }
</style>

"""
				79
				80	def combine_notice_files_html(file_hash, input_dir, output_filename):
				81	"""Combine notice files in FILE_HASH and output a HTML version to OUTPUT_FILENAME."""
				82
				83	SRC_DIR_STRIP_RE = re.compile(input_dir + "(/.*).txt")
				84
				85	# Set up a filename to row id table (anchors inside tables don't work in
				86	# most browsers, but href's to table row ids do)
				87	id_table = {}
				88	id_count = 0
				89	for value in file_hash:
				90	for filename in value:
				91	id_table[filename] = id_count
				92	id_count += 1
				93
				94	# Open the output file, and output the header pieces
				95	output_file = open(output_filename, "wb")
				96
Christopher Ferris	8e5feaa	2021-09-01 16:31:59 -0700	[diff] [blame]	97	output_file.write(b"<html><head>\n")
				98	output_file.write(HTML_OUTPUT_CSS)
				99	output_file.write(b'</head><body topmargin="0" leftmargin="0" rightmargin="0" bottommargin="0">\n')
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	100
				101	# Output our table of contents
Christopher Ferris	8e5feaa	2021-09-01 16:31:59 -0700	[diff] [blame]	102	output_file.write(b'<div class="toc">\n')
				103	output_file.write(b"<ul>\n")
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	104
				105	# Flatten the list of lists into a single list of filenames
				106	sorted_filenames = sorted(itertools.chain.from_iterable(file_hash))
				107
				108	# Print out a nice table of contents
				109	for filename in sorted_filenames:
				110	stripped_filename = SRC_DIR_STRIP_RE.sub(r"\1", filename)
Christopher Ferris	8e5feaa	2021-09-01 16:31:59 -0700	[diff] [blame]	111	output_file.write(('<li><a href="#id%d">%s</a></li>\n' % (id_table.get(filename), stripped_filename)).encode())
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	112
Christopher Ferris	8e5feaa	2021-09-01 16:31:59 -0700	[diff] [blame]	113	output_file.write(b"</ul>\n")
				114	output_file.write(b"</div><!-- table of contents -->\n")
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	115	# Output the individual notice file lists
Christopher Ferris	8e5feaa	2021-09-01 16:31:59 -0700	[diff] [blame]	116	output_file.write(b'<table cellpadding="0" cellspacing="0" border="0">\n')
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	117	for value in file_hash:
Christopher Ferris	8e5feaa	2021-09-01 16:31:59 -0700	[diff] [blame]	118	output_file.write(('<tr id="id%d"><td class="same-license">\n' % id_table.get(value[0])).encode())
				119	output_file.write(b'<div class="label">Notices for file(s):</div>\n')
				120	output_file.write(b'<div class="file-list">\n')
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	121	for filename in value:
Christopher Ferris	8e5feaa	2021-09-01 16:31:59 -0700	[diff] [blame]	122	output_file.write(("%s <br/>\n" % (SRC_DIR_STRIP_RE.sub(r"\1", filename))).encode())
				123	output_file.write(b"</div><!-- file-list -->\n\n")
				124	output_file.write(b'<pre class="license-text">\n')
				125	with open(value[0], "rb") as notice_file:
				126	output_file.write(html_escape(notice_file.read()))
				127	output_file.write(b"\n</pre><!-- license-text -->\n")
				128	output_file.write(b"</td></tr><!-- same-license -->\n\n\n\n")
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	129
				130	# Finish off the file output
Christopher Ferris	8e5feaa	2021-09-01 16:31:59 -0700	[diff] [blame]	131	output_file.write(b"</table>\n")
				132	output_file.write(b"</body></html>\n")
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	133	output_file.close()
				134
				135	def combine_notice_files_text(file_hash, input_dir, output_filename, file_title):
				136	"""Combine notice files in FILE_HASH and output a text version to OUTPUT_FILENAME."""
				137
				138	SRC_DIR_STRIP_RE = re.compile(input_dir + "(/.*).txt")
				139	output_file = open(output_filename, "wb")
Christopher Ferris	8e5feaa	2021-09-01 16:31:59 -0700	[diff] [blame]	140	output_file.write(file_title.encode())
				141	output_file.write(b"\n")
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	142	for value in file_hash:
Christopher Ferris	8e5feaa	2021-09-01 16:31:59 -0700	[diff] [blame]	143	output_file.write(b"============================================================\n")
				144	output_file.write(b"Notices for file(s):\n")
				145	for filename in value:
				146	output_file.write(SRC_DIR_STRIP_RE.sub(r"\1", filename).encode())
				147	output_file.write(b"\n")
				148	output_file.write(b"------------------------------------------------------------\n")
				149	with open(value[0], "rb") as notice_file:
				150	output_file.write(notice_file.read())
				151	output_file.write(b"\n")
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	152	output_file.close()
				153
				154	def combine_notice_files_xml(files_with_same_hash, input_dir, output_filename):
				155	"""Combine notice files in FILE_HASH and output a XML version to OUTPUT_FILENAME."""
				156
				157	SRC_DIR_STRIP_RE = re.compile(input_dir + "(/.*).txt")
				158
				159	# Set up a filename to row id table (anchors inside tables don't work in
				160	# most browsers, but href's to table row ids do)
				161	id_table = {}
Christopher Ferris	8e5feaa	2021-09-01 16:31:59 -0700	[diff] [blame]	162	for file_key, files in files_with_same_hash.items():
				163	for filename in files:
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	164	id_table[filename] = file_key
				165
				166	# Open the output file, and output the header pieces
				167	output_file = open(output_filename, "wb")
				168
Christopher Ferris	8e5feaa	2021-09-01 16:31:59 -0700	[diff] [blame]	169	output_file.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
				170	output_file.write(b"<licenses>\n")
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	171
				172	# Flatten the list of lists into a single list of filenames
Christopher Ferris	8e5feaa	2021-09-01 16:31:59 -0700	[diff] [blame]	173	sorted_filenames = sorted(list(id_table))
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	174
				175	# Print out a nice table of contents
				176	for filename in sorted_filenames:
				177	stripped_filename = SRC_DIR_STRIP_RE.sub(r"\1", filename)
Christopher Ferris	8e5feaa	2021-09-01 16:31:59 -0700	[diff] [blame]	178	output_file.write(('<file-name contentId="%s">%s</file-name>\n' % (id_table.get(filename), stripped_filename)).encode())
				179	output_file.write(b"\n\n")
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	180
				181	processed_file_keys = []
				182	# Output the individual notice file lists
				183	for filename in sorted_filenames:
				184	file_key = id_table.get(filename)
				185	if file_key in processed_file_keys:
				186	continue
				187	processed_file_keys.append(file_key)
				188
Christopher Ferris	8e5feaa	2021-09-01 16:31:59 -0700	[diff] [blame]	189	output_file.write(('<file-content contentId="%s"><![CDATA[' % file_key).encode())
				190	with open(filename, "rb") as notice_file:
				191	output_file.write(html_escape(notice_file.read()))
				192	output_file.write(b"]]></file-content>\n\n")
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	193
				194	# Finish off the file output
Christopher Ferris	8e5feaa	2021-09-01 16:31:59 -0700	[diff] [blame]	195	output_file.write(b"</licenses>\n")
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	196	output_file.close()
				197
				198	def get_args():
				199	parser = argparse.ArgumentParser()
				200	parser.add_argument(
				201	'--text-output', required=True,
				202	help='The text output file path.')
				203	parser.add_argument(
				204	'--html-output',
				205	help='The html output file path.')
				206	parser.add_argument(
				207	'--xml-output',
				208	help='The xml output file path.')
				209	parser.add_argument(
				210	'-t', '--title', required=True,
				211	help='The file title.')
				212	parser.add_argument(
				213	'-s', '--source-dir', required=True,
				214	help='The directory containing notices.')
				215	parser.add_argument(
				216	'-i', '--included-subdirs', action='append',
				217	help='The sub directories which should be included.')
				218	parser.add_argument(
				219	'-e', '--excluded-subdirs', action='append',
				220	help='The sub directories which should be excluded.')
				221	return parser.parse_args()
				222
				223	def main(argv):
				224	args = get_args()
				225
				226	txt_output_file = args.text_output
				227	html_output_file = args.html_output
				228	xml_output_file = args.xml_output
				229	file_title = args.title
				230	included_subdirs = []
				231	excluded_subdirs = []
				232	if args.included_subdirs is not None:
				233	included_subdirs = args.included_subdirs
				234	if args.excluded_subdirs is not None:
				235	excluded_subdirs = args.excluded_subdirs
				236
				237	# Find all the notice files and md5 them
				238	input_dir = os.path.normpath(args.source_dir)
				239	files_with_same_hash = defaultdict(list)
				240	for root, dir, files in os.walk(input_dir):
				241	for file in files:
				242	matched = True
				243	if len(included_subdirs) > 0:
				244	matched = False
				245	for subdir in included_subdirs:
				246	if (root == (input_dir + '/' + subdir) or
				247	root.startswith(input_dir + '/' + subdir + '/')):
				248	matched = True
				249	break
				250	elif len(excluded_subdirs) > 0:
				251	for subdir in excluded_subdirs:
				252	if (root == (input_dir + '/' + subdir) or
				253	root.startswith(input_dir + '/' + subdir + '/')):
				254	matched = False
				255	break
				256	if matched and file.endswith(".txt"):
				257	filename = os.path.join(root, file)
				258	file_md5sum = md5sum(filename)
				259	files_with_same_hash[file_md5sum].append(filename)
				260
Christopher Ferris	8e5feaa	2021-09-01 16:31:59 -0700	[diff] [blame]	261	filesets = [sorted(files_with_same_hash[md5]) for md5 in sorted(list(files_with_same_hash))]
Bob Badour	3911e6a	2020-02-10 17:08:47 -0800	[diff] [blame]	262
				263	combine_notice_files_text(filesets, input_dir, txt_output_file, file_title)
				264
				265	if html_output_file is not None:
				266	combine_notice_files_html(filesets, input_dir, html_output_file)
				267
				268	if xml_output_file is not None:
				269	combine_notice_files_xml(files_with_same_hash, input_dir, xml_output_file)
				270
				271	if __name__ == "__main__":
				272	main(sys.argv)