Merge "Fix frameworks/base/tools/localedata/extract_icu_data.py" into main
diff --git a/tools/localedata/extract_icu_data.py b/tools/localedata/extract_icu_data.py
index 81ac897..8f67fa8 100755
--- a/tools/localedata/extract_icu_data.py
+++ b/tools/localedata/extract_icu_data.py
@@ -22,6 +22,8 @@
 import os.path
 import sys
 
+import xml.etree.ElementTree as ElementTree
+
 
 def get_locale_parts(locale):
     """Split a locale into three parts, for langauge, script, and region."""
@@ -40,42 +42,43 @@
 
 def read_likely_subtags(input_file_name):
-    """Read and parse ICU's likelySubtags.txt."""
-    with open(input_file_name) as input_file:
-        likely_script_dict = {
-            # Android's additions for pseudo-locales. These internal codes make
-            # sure that the pseudo-locales would not match other English or
-            # Arabic locales. (We can't use private-use ISO 15924 codes, since
-            # they may be used by apps for other purposes.)
-            "en_XA": "~~~A",
-            "ar_XB": "~~~B",
-            # Removed data from later versions of ICU
-            "ji": "Hebr", # Old code for Yiddish, still used in Java and Android
-        }
-        representative_locales = {
-            # Android's additions
-            "en_Latn_GB", # representative for en_Latn_001
-            "es_Latn_MX", # representative for es_Latn_419
-            "es_Latn_US", # representative for es_Latn_419 (not the best idea,
-                          # but Android has been shipping with it for quite a
-                          # while. Fortunately, MX < US, so if both exist, MX
-                          # would be chosen.)
-        }
-        for line in input_file:
-            line = line.strip(u' \n\uFEFF')
-            if line.startswith('//'):
-                continue
-            if '{' in line and '}' in line:
-                from_locale = line[:line.index('{')]
-                to_locale = line[line.index('"')+1:line.rindex('"')]
-                from_lang, from_scr, from_region = get_locale_parts(from_locale)
-                _, to_scr, to_region = get_locale_parts(to_locale)
-                if from_lang == 'und':
-                    continue  # not very useful for our purposes
-                if from_region is None and to_region not in ['001', 'ZZ']:
-                    representative_locales.add(to_locale)
-                if from_scr is None:
-                    likely_script_dict[from_locale] = to_scr
-        return likely_script_dict, frozenset(representative_locales)
+    """Read and parse CLDR's likelySubtags.xml."""
+    likely_script_dict = {
+        # Android's additions for pseudo-locales. These internal codes make
+        # sure that the pseudo-locales would not match other English or
+        # Arabic locales. (We can't use private-use ISO 15924 codes, since
+        # they may be used by apps for other purposes.)
+        "en_XA": "~~~A",
+        "ar_XB": "~~~B",
+        # Removed data from later versions of ICU
+        "ji": "Hebr", # Old code for Yiddish, still used in Java and Android
+    }
+    representative_locales = {
+        # Android's additions
+        "en_Latn_GB", # representative for en_Latn_001
+        "es_Latn_MX", # representative for es_Latn_419
+        "es_Latn_US", # representative for es_Latn_419 (not the best idea,
+                      # but Android has been shipping with it for quite a
+                      # while. Fortunately, MX < US, so if both exist, MX
+                      # would be chosen.)
+    }
+    likely_subtags = ElementTree.parse(input_file_name).find('likelySubtags')
+    if likely_subtags is None:
+        raise ValueError(f'No <likelySubtags> element in {input_file_name}')
+    for child in likely_subtags:
+        from_locale = child.get('from')
+        to_locale = child.get('to')
+        if to_locale == "FAIL":
+            continue  # "FAIL" is not a locale; skip it before splitting.
+        from_lang, from_scr, from_region = get_locale_parts(from_locale)
+        _, to_scr, to_region = get_locale_parts(to_locale)
+        if from_lang == 'und':
+            continue  # not very useful for our purposes
+        if from_region is None and to_region not in ['001', 'ZZ']:
+            representative_locales.add(to_locale)
+        if from_scr is None:
+            likely_script_dict[from_locale] = to_scr
+
+    return likely_script_dict, frozenset(representative_locales)
 
 
 # From packLanguageOrRegion() in ResourceTypes.cpp
@@ -86,7 +89,7 @@
     elif len(inp) == 2:
         return ord(inp[0]), ord(inp[1])
     else:
-        assert len(inp) == 3
+        assert len(inp) == 3, f'Expects a 3-character string, but "{inp}" '
         base = ord(base)
         first = ord(inp[0]) - base
         second = ord(inp[1]) - base
@@ -161,9 +164,10 @@
     print('});')
 
 
-def read_and_dump_likely_data(icu_data_dir):
+def read_and_dump_likely_data(cldr_source_dir):
     """Read and dump the likely-script data."""
-    likely_subtags_txt = os.path.join(icu_data_dir, 'misc', 'likelySubtags.txt')
+    likely_subtags_txt = os.path.join(cldr_source_dir,
+                                      'common', 'supplemental', 'likelySubtags.xml')
     likely_script_dict, representative_locales = read_likely_subtags(
         likely_subtags_txt)
 
@@ -280,10 +284,11 @@
     icu_data_dir = os.path.join(
         source_root,
         'external', 'icu', 'icu4c', 'source', 'data')
+    cldr_source_dir = os.path.join(source_root, 'external', 'cldr')
 
     print('// Auto-generated by %s' % sys.argv[0])
     print()
-    likely_script_dict = read_and_dump_likely_data(icu_data_dir)
+    likely_script_dict = read_and_dump_likely_data(cldr_source_dir)
     read_and_dump_parent_data(icu_data_dir, likely_script_dict)