blob: 1b4acfaaf387f78135b7bf4306506db113094e7c [file] [log] [blame]
Christopher Ferris8e5feaa2021-09-01 16:31:59 -07001#!/usr/bin/env python3
Bob Badour3911e6a2020-02-10 17:08:47 -08002#
3# Copyright (C) 2012 The Android Open Source Project
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
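"""
Usage: generate-notice-files --text-output [plain text output file] \
               --html-output [html output file] \
               --xml-output [xml output file] \
               -t [file title] -s [directory of notices] \
               [-i included subdir]... [-e excluded subdir]...

Generate the Android notice files: a plain text file and, optionally,
html and xml versions of it.

--html-output and --xml-output are optional. -i and -e may each be given
more than once; when any -i is given, the -e options are ignored.

-h to display this usage message and exit.
"""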
26from collections import defaultdict
27import argparse
28import hashlib
29import itertools
30import os
31import os.path
32import re
Christopher Ferris8e5feaa2021-09-01 16:31:59 -070033import struct
Bob Badour3911e6a2020-02-10 17:08:47 -080034import sys
35
# Number of bytes read per iteration when hashing a notice file.
MD5_BLOCKSIZE = 1024 * 1024

# Maps each single byte that is special in HTML to the entity that replaces
# it. Keys and values are bytes because notice files are processed as bytes.
HTML_ESCAPE_TABLE = {
    b"&": b"&amp;",
    b'"': b"&quot;",
    b"'": b"&apos;",
    b">": b"&gt;",
    b"<": b"&lt;",
}
44
def md5sum(filename, blocksize=None):
    """Calculate an MD5 of the file given by FILENAME,
    and return hex digest as a string.
    Output should be compatible with md5sum command.

    BLOCKSIZE is the number of bytes read per iteration; it defaults to
    MD5_BLOCKSIZE. It only affects how the file is read, not the digest.

    Raises ValueError if BLOCKSIZE is not positive, and whatever open()/read()
    raise if the file cannot be read.
    """
    if blocksize is None:
        blocksize = MD5_BLOCKSIZE
    if blocksize < 1:
        # read(0) returns b"" immediately, which would silently produce the
        # digest of an empty file.
        raise ValueError("blocksize must be a positive number of bytes")

    digest = hashlib.md5()
    # The with-statement closes the file even if reading or hashing fails.
    with open(filename, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for block in iter(lambda: stream.read(blocksize), b""):
            digest.update(block)
    return digest.hexdigest()
Bob Badour3911e6a2020-02-10 17:08:47 -080059
60
def html_escape(text):
    """Return TEXT (a bytes object) with HTML-special bytes replaced by entities."""
    # Iterating over a bytes object directly yields ints, which would never
    # match the bytes keys of HTML_ESCAPE_TABLE. struct.unpack with an
    # "<N>c" format splits TEXT into a tuple of one-byte bytes objects
    # instead; the original author chose it over slicing for speed.
    unpack_format = "%dc" % len(text)
    pieces = []
    for single in struct.unpack(unpack_format, text):
        # Bytes without a table entry are passed through unchanged.
        pieces.append(HTML_ESCAPE_TABLE.get(single, single))
    return b"".join(pieces)
Bob Badour3911e6a2020-02-10 17:08:47 -080069
# Inline stylesheet written into the <head> of the generated HTML notice file.
# Built from adjacent bytes literals; the value starts with a newline and
# ends with a blank line.
HTML_OUTPUT_CSS = (
    b"\n"
    b'<style type="text/css">\n'
    b"body { padding: 0; font-family: sans-serif; }\n"
    b".same-license { background-color: #eeeeee; border-top: 20px solid white; padding: 10px; }\n"
    b".label { font-weight: bold; }\n"
    b".file-list { margin-left: 1em; color: blue; }\n"
    b"</style>\n"
    b"\n"
)
79
def combine_notice_files_html(file_hash, input_dir, output_filename):
    """Combine notice files in FILE_HASH and output a HTML version to OUTPUT_FILENAME.

    FILE_HASH is a list of lists of notice file paths. Each inner list is
    treated as one group sharing a single license text: every path in the
    group is listed, but only the first file of the group is read and
    emitted. INPUT_DIR is the directory prefix stripped (together with the
    ".txt" suffix) from the paths that are displayed.

    Raises IndexError if a group is empty, and whatever open() raises if the
    output file or a notice file cannot be opened.
    """

    # input_dir is a literal path, not a pattern: escape it so characters
    # such as "+" or "(" in a directory name cannot change what is matched.
    src_dir_strip_re = re.compile(re.escape(input_dir) + r"(/.*)\.txt")

    # Set up a filename to row id table (anchors inside tables don't work in
    # most browsers, but href's to table row ids do). All files of a group
    # share the id of the group's table row.
    id_table = {}
    for group_id, group in enumerate(file_hash):
        for filename in group:
            id_table[filename] = group_id

    # Flatten the list of lists into a single list of filenames
    sorted_filenames = sorted(itertools.chain.from_iterable(file_hash))

    # The with-statement closes the output file even if a notice file is
    # missing or a write fails part way through.
    with open(output_filename, "wb") as output_file:
        # Output the header pieces
        output_file.write(b"<html><head>\n")
        output_file.write(HTML_OUTPUT_CSS)
        output_file.write(b'</head><body topmargin="0" leftmargin="0" rightmargin="0" bottommargin="0">\n')

        # Output our table of contents
        output_file.write(b'<div class="toc">\n')
        output_file.write(b"<ul>\n")
        for filename in sorted_filenames:
            stripped_filename = src_dir_strip_re.sub(r"\1", filename)
            output_file.write(('<li><a href="#id%d">%s</a></li>\n' % (id_table[filename], stripped_filename)).encode())
        output_file.write(b"</ul>\n")
        output_file.write(b"</div><!-- table of contents -->\n")

        # Output the individual notice file lists
        output_file.write(b'<table cellpadding="0" cellspacing="0" border="0">\n')
        for group in file_hash:
            output_file.write(('<tr id="id%d"><td class="same-license">\n' % id_table[group[0]]).encode())
            output_file.write(b'<div class="label">Notices for file(s):</div>\n')
            output_file.write(b'<div class="file-list">\n')
            for filename in group:
                output_file.write(("%s <br/>\n" % (src_dir_strip_re.sub(r"\1", filename))).encode())
            output_file.write(b"</div><!-- file-list -->\n\n")
            output_file.write(b'<pre class="license-text">\n')
            # Only the first file of the group is read for the license text.
            with open(group[0], "rb") as notice_file:
                output_file.write(html_escape(notice_file.read()))
            output_file.write(b"\n</pre><!-- license-text -->\n")
            output_file.write(b"</td></tr><!-- same-license -->\n\n\n\n")

        # Finish off the file output
        output_file.write(b"</table>\n")
        output_file.write(b"</body></html>\n")
134
def combine_notice_files_text(file_hash, input_dir, output_filename, file_title):
    """Combine notice files in FILE_HASH and output a text version to OUTPUT_FILENAME.

    FILE_HASH is a list of lists of notice file paths. Each inner list is
    treated as one group sharing a single license text: every path in the
    group is listed, but only the first file of the group is read and
    emitted. INPUT_DIR is the directory prefix stripped (together with the
    ".txt" suffix) from the paths that are listed. FILE_TITLE is written as
    the first line of the output.

    Raises IndexError if a group is empty, and whatever open() raises if the
    output file or a notice file cannot be opened.
    """

    # input_dir is a literal path, not a pattern: escape it so characters
    # such as "+" or "(" in a directory name cannot change what is matched.
    src_dir_strip_re = re.compile(re.escape(input_dir) + r"(/.*)\.txt")

    # The with-statement closes the output file even if a notice file is
    # missing or a write fails part way through.
    with open(output_filename, "wb") as output_file:
        output_file.write(file_title.encode())
        output_file.write(b"\n")
        for group in file_hash:
            output_file.write(b"============================================================\n")
            output_file.write(b"Notices for file(s):\n")
            for filename in group:
                output_file.write(src_dir_strip_re.sub(r"\1", filename).encode())
                output_file.write(b"\n")
            output_file.write(b"------------------------------------------------------------\n")
            # Only the first file of the group is read for the license text.
            with open(group[0], "rb") as notice_file:
                output_file.write(notice_file.read())
                output_file.write(b"\n")
153
def combine_notice_files_xml(files_with_same_hash, input_dir, output_filename):
    """Combine notice files in FILES_WITH_SAME_HASH and output a XML version to OUTPUT_FILENAME.

    FILES_WITH_SAME_HASH maps a content key to the list of notice file paths
    filed under that key. The key is written as the contentId attribute that
    links each <file-name> element to its <file-content> element. INPUT_DIR
    is the directory prefix stripped (together with the ".txt" suffix) from
    the paths that are written as file names.

    Raises whatever open() raises if the output file or a notice file cannot
    be opened.
    """

    # input_dir is a literal path, not a pattern: escape it so characters
    # such as "+" or "(" in a directory name cannot change what is matched.
    src_dir_strip_re = re.compile(re.escape(input_dir) + r"(/.*)\.txt")

    # Map every filename to the content key it is filed under.
    id_table = {}
    for file_key, files in files_with_same_hash.items():
        for filename in files:
            id_table[filename] = file_key

    # All filenames, in sorted order.
    sorted_filenames = sorted(id_table)

    # The with-statement closes the output file even if a notice file is
    # missing or a write fails part way through.
    with open(output_filename, "wb") as output_file:
        # Output the header pieces
        output_file.write(b'<?xml version="1.0" encoding="utf-8"?>\n')
        output_file.write(b"<licenses>\n")

        # One <file-name> element per notice file.
        for filename in sorted_filenames:
            stripped_filename = src_dir_strip_re.sub(r"\1", filename)
            output_file.write(('<file-name contentId="%s">%s</file-name>\n' % (id_table[filename], stripped_filename)).encode())
        output_file.write(b"\n\n")

        # One <file-content> element per content key, taken from the first
        # file (in sorted order) filed under that key. A set keeps the
        # "already written" check constant-time.
        processed_file_keys = set()
        for filename in sorted_filenames:
            file_key = id_table[filename]
            if file_key in processed_file_keys:
                continue
            processed_file_keys.add(file_key)

            output_file.write(('<file-content contentId="%s"><![CDATA[' % file_key).encode())
            # The notice text is entity-escaped before being placed in the
            # CDATA section, exactly as the HTML output escapes it.
            with open(filename, "rb") as notice_file:
                output_file.write(html_escape(notice_file.read()))
            output_file.write(b"]]></file-content>\n\n")

        # Finish off the file output
        output_file.write(b"</licenses>\n")
197
def get_args():
    """Parse the process command line and return the argparse namespace.

    --text-output, -t/--title and -s/--source-dir are required. The -i and -e
    options may be repeated and are collected into lists (None when absent).
    """
    # Each entry is (option flags, keyword arguments for add_argument), in
    # the order the options are registered.
    option_specs = (
        (('--text-output',),
         dict(required=True, help='The text output file path.')),
        (('--html-output',),
         dict(help='The html output file path.')),
        (('--xml-output',),
         dict(help='The xml output file path.')),
        (('-t', '--title'),
         dict(required=True, help='The file title.')),
        (('-s', '--source-dir'),
         dict(required=True, help='The directory containing notices.')),
        (('-i', '--included-subdirs'),
         dict(action='append', help='The sub directories which should be included.')),
        (('-e', '--excluded-subdirs'),
         dict(action='append', help='The sub directories which should be excluded.')),
    )

    parser = argparse.ArgumentParser()
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)
    return parser.parse_args()
222
def main(argv):
    """Collect the notice files under the source directory and write the outputs.

    ARGV is accepted for the caller's convenience but is not read here; the
    options come from get_args(). The text output is always written; the
    html and xml outputs are written only when their paths were given.
    """
    args = get_args()

    file_title = args.title
    txt_output_file = args.text_output
    html_output_file = args.html_output
    xml_output_file = args.xml_output
    included_subdirs = args.included_subdirs if args.included_subdirs is not None else []
    excluded_subdirs = args.excluded_subdirs if args.excluded_subdirs is not None else []

    input_dir = os.path.normpath(args.source_dir)

    def is_in_subdir(root, subdir):
        """Tell whether ROOT is INPUT_DIR/SUBDIR itself or lies below it."""
        base = input_dir + '/' + subdir
        return root == base or root.startswith(base + '/')

    # Find all the notice files and group them by md5
    files_with_same_hash = defaultdict(list)
    for root, _subdirs, names in os.walk(input_dir):
        # The filter depends only on the directory, so it is decided once
        # per directory. Exclusions are consulted only when no inclusion
        # list was given.
        if included_subdirs:
            matched = any(is_in_subdir(root, subdir) for subdir in included_subdirs)
        elif excluded_subdirs:
            matched = not any(is_in_subdir(root, subdir) for subdir in excluded_subdirs)
        else:
            matched = True
        if not matched:
            continue

        for name in names:
            if name.endswith(".txt"):
                filename = os.path.join(root, name)
                files_with_same_hash[md5sum(filename)].append(filename)

    # One sorted list of filenames per digest, ordered by digest.
    filesets = [sorted(files_with_same_hash[digest]) for digest in sorted(files_with_same_hash)]

    combine_notice_files_text(filesets, input_dir, txt_output_file, file_title)

    if html_output_file is not None:
        combine_notice_files_html(filesets, input_dir, html_output_file)

    if xml_output_file is not None:
        combine_notice_files_xml(files_with_same_hash, input_dir, xml_output_file)
270
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main(sys.argv)