Update SORTED_CHAR_MAP[] in char_utils.cpp Change-Id: I683793a0fd0ccf41f5a378275ef11def4e70ce76

commit: 2d27ca6ce3f0d9f2d229f67c24e7a2a1456605f9 [log] [tgz]
author: Ken Wakasa <kwakasa@google.com> Mon Nov 26 15:50:26 2012 +0900
committer: Ken Wakasa <kwakasa@google.com> Mon Nov 26 17:17:16 2012 +0900
tree: 14c86cae415a0e157097cf4e2d97038e1b862a27
parent: 71e1496ab0370b2de47bb62e5cc2a86ffbc8c30c [diff]
diff --git a/native/jni/src/char_utils.cpp b/native/jni/src/char_utils.cpp
index ede1155..ee0d308 100644
--- a/native/jni/src/char_utils.cpp
+++ b/native/jni/src/char_utils.cpp

@@ -26,71 +26,61 @@
   unsigned short small;
 };
 
-// Generated from http://unicode.org/Public/UNIDATA/UnicodeData.txt
-//
-// 1. Run the following code.  Bascially taken from
-//    Dictionary::toLowerCase(unsigned short c) in dictionary.cpp.
-//    Then, get the list of chars where cc != ccc.
-//
-//    unsigned short c, cc, ccc, ccc2;
-//    for (c = 0; c < 0xFFFF ; c++) {
-//        if (c < NELEMS(BASE_CHARS)) {
-//            cc = BASE_CHARS[c];
-//        } else {
-//            cc = c;
-//        }
-//
-//        // tolower
-//        int isBase = 0;
-//        if (cc >='A' && cc <= 'Z') {
-//            ccc = (cc | 0x20);
-//            ccc2 = ccc;
-//            isBase = 1;
-//        } else if (cc > 0x7F) {
-//            ccc = u_tolower(cc);
-//            ccc2 = latin_tolower(cc);
-//        } else {
-//            ccc = cc;
-//            ccc2 = ccc;
-//        }
-//        if (!isBase && cc != ccc) {
-//            wprintf(L" 0x%04X => 0x%04X => 0x%04X  %lc => %lc => %lc \n",
-//                    c, cc, ccc, c, cc, ccc);
-//            //assert(ccc == ccc2);
-//        }
-//    }
-//
-//    Initially, started with an empty latin_tolower() as below.
-//
-//    unsigned short latin_tolower(unsigned short c) {
-//        return c;
-//    }
-//
-//
-// 2. Process the list obtained by 1 by the following perl script and apply
-//    'sort -u' as well.  Get the SORTED_CHAR_MAP[].
-//    Note that '$1' in the perl script is 'cc' in the above C code.
-//
-//    while(<>) {
-//        / 0x\w* => 0x(\w*) =/;
-//        open(HDL, "grep -iw ^" . $1 . " UnicodeData.txt | ");
-//        $line = <HDL>;
-//        chomp $line;
-//        @cols = split(/;/, $line);
-//        print "    { 0x$1, 0x$cols[13] },  // $cols[1]\n";
-//    }
-//
-//
-// 3. Update the latin_tolower() function above with SORTED_CHAR_MAP.  Enable
-//    the assert(ccc == ccc2) above and confirm the function exits successfully.
-//
-// TODO: Regenerate this map by using the updated BASE_CHARS table in this file.
+/*
+ * How to update the SORTED_CHAR_MAP[] array.
+ *
+ * 1. Download http://unicode.org/Public/UNIDATA/UnicodeData.txt
+ *
+ * 2. Have the latest version of the ICU4C dev package installed
+ *    (Note: the current data has been generated with version 4.8)
+ *    $ apt-get install libicu-dev
+ *
+ * 3. Build the following code
+ *    (You need this file, char_utils.h, and defines.h)
+ *    $ g++ -o char_utils -DUPDATING_CHAR_UTILS char_utils.cpp -licuuc
+ */
+#ifdef UPDATING_CHAR_UTILS
+#include <stdio.h>
+#include <unicode/uchar.h> // ICU4C
+
+extern "C" int main() {  // Offline tool: lists (or, in confirming mode, cross-checks) lowercase mappings against ICU4C.
+    for (unsigned short c = 0; c < 0xFFFF; c++) {  // Visits 0x0000..0xFFFE; 0xFFFF itself is never examined.
+        const unsigned short baseC = c < NELEMS(BASE_CHARS) ? BASE_CHARS[c] : c;  // Use the BASE_CHARS entry when c is inside the table, otherwise c unchanged.
+        if (baseC <= 0x7F) continue;  // Base characters in the 7-bit range are not reported.
+        const unsigned short icu4cLowerBaseC = u_tolower(baseC);  // Reference lowercase value from ICU4C (<unicode/uchar.h>).
+        const unsigned short myLowerBaseC = latin_tolower(baseC);  // Value under test; only read when CONFIRMING_CHAR_UTILS is defined.
+        if (baseC != icu4cLowerBaseC) {  // Only characters that ICU4C maps to a different code unit matter.
+#ifdef CONFIRMING_CHAR_UTILS
+            if (icu4cLowerBaseC != myLowerBaseC) {  // NOTE(review): a mismatch is only printed to stderr; nothing here changes the exit status.
+                fprintf(stderr, "icu4cLowerBaseC != myLowerBaseC, 0x%04X, 0x%04X\n",
+                        icu4cLowerBaseC, myLowerBaseC);
+            }
+#else // CONFIRMING_CHAR_UTILS
+            printf("0x%04X, 0x%04X\n", baseC, icu4cLowerBaseC);  // One "base, lower" pair per line on stdout.
+#endif // CONFIRMING_CHAR_UTILS
+        }
+    }
+}
+#endif // UPDATING_CHAR_UTILS
+/*
+ * 4. Process the list with UnicodeData.txt
+ *    (You need UnicodeData.txt in the current directory)
+ *    $ ./char_utils | sort -u | \
+ *      perl -e 'open(FH, "UnicodeData.txt"); @buf = <FH>; close(FH); \
+ *      while(<>){/0x(\w*), 0x(\w*)/; @lines = grep(/^$1/, @buf); @cols = split(/;/, $lines[0]); \
+ *      print "    { 0x$1, 0x$cols[13] },  // $cols[1]\n";}'
+ *
+ * 5. Update the SORTED_CHAR_MAP[] array below with the output above.
+ *    Then, rebuild with -DCONFIRMING_CHAR_UTILS and confirm the program exits successfully.
+ *    $ g++ -o char_utils -DUPDATING_CHAR_UTILS -DCONFIRMING_CHAR_UTILS char_utils.cpp -licuuc
+ *    $ ./char_utils
+ *    $
+ */
 static const struct LatinCapitalSmallPair SORTED_CHAR_MAP[] = {
     { 0x00C4, 0x00E4 },  // LATIN CAPITAL LETTER A WITH DIAERESIS
     { 0x00C5, 0x00E5 },  // LATIN CAPITAL LETTER A WITH RING ABOVE
     { 0x00C6, 0x00E6 },  // LATIN CAPITAL LETTER AE
     { 0x00D0, 0x00F0 },  // LATIN CAPITAL LETTER ETH
-    { 0x00D1, 0x00F1 },  // LATIN CAPITAL LETTER N WITH TILDE
     { 0x00D5, 0x00F5 },  // LATIN CAPITAL LETTER O WITH TILDE
     { 0x00D6, 0x00F6 },  // LATIN CAPITAL LETTER O WITH DIAERESIS
     { 0x00D8, 0x00F8 },  // LATIN CAPITAL LETTER O WITH STROKE
@@ -98,7 +88,6 @@
     { 0x00DE, 0x00FE },  // LATIN CAPITAL LETTER THORN
     { 0x0110, 0x0111 },  // LATIN CAPITAL LETTER D WITH STROKE
     { 0x0126, 0x0127 },  // LATIN CAPITAL LETTER H WITH STROKE
-    { 0x0141, 0x0142 },  // LATIN CAPITAL LETTER L WITH STROKE
     { 0x014A, 0x014B },  // LATIN CAPITAL LETTER ENG
     { 0x0152, 0x0153 },  // LATIN CAPITAL LIGATURE OE
     { 0x0166, 0x0167 },  // LATIN CAPITAL LETTER T WITH STROKE
@@ -322,6 +311,7 @@
     { 0x0520, 0x0521 },  // CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK
     { 0x0522, 0x0523 },  // CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK
     { 0x0524, 0x0525 },  // CYRILLIC CAPITAL LETTER PE WITH DESCENDER
+    { 0x0526, 0x0527 },  // CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER
     { 0x0531, 0x0561 },  // ARMENIAN CAPITAL LETTER AYB
     { 0x0532, 0x0562 },  // ARMENIAN CAPITAL LETTER BEN
     { 0x0533, 0x0563 },  // ARMENIAN CAPITAL LETTER GIM
@@ -795,6 +785,7 @@
     { 0xA65A, 0xA65B },  // CYRILLIC CAPITAL LETTER BLENDED YUS
     { 0xA65C, 0xA65D },  // CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITTLE YUS
     { 0xA65E, 0xA65F },  // CYRILLIC CAPITAL LETTER YN
+    { 0xA660, 0xA661 },  // CYRILLIC CAPITAL LETTER REVERSED TSE
     { 0xA662, 0xA663 },  // CYRILLIC CAPITAL LETTER SOFT DE
     { 0xA664, 0xA665 },  // CYRILLIC CAPITAL LETTER SOFT EL
     { 0xA666, 0xA667 },  // CYRILLIC CAPITAL LETTER SOFT EM
@@ -860,6 +851,13 @@
     { 0xA784, 0xA785 },  // LATIN CAPITAL LETTER INSULAR S
     { 0xA786, 0xA787 },  // LATIN CAPITAL LETTER INSULAR T
     { 0xA78B, 0xA78C },  // LATIN CAPITAL LETTER SALTILLO
+    { 0xA78D, 0x0265 },  // LATIN CAPITAL LETTER TURNED H
+    { 0xA790, 0xA791 },  // LATIN CAPITAL LETTER N WITH DESCENDER
+    { 0xA7A0, 0xA7A1 },  // LATIN CAPITAL LETTER G WITH OBLIQUE STROKE
+    { 0xA7A2, 0xA7A3 },  // LATIN CAPITAL LETTER K WITH OBLIQUE STROKE
+    { 0xA7A4, 0xA7A5 },  // LATIN CAPITAL LETTER N WITH OBLIQUE STROKE
+    { 0xA7A6, 0xA7A7 },  // LATIN CAPITAL LETTER R WITH OBLIQUE STROKE
+    { 0xA7A8, 0xA7A9 },  // LATIN CAPITAL LETTER S WITH OBLIQUE STROKE
     { 0xFF21, 0xFF41 },  // FULLWIDTH LATIN CAPITAL LETTER A
     { 0xFF22, 0xFF42 },  // FULLWIDTH LATIN CAPITAL LETTER B
     { 0xFF23, 0xFF43 },  // FULLWIDTH LATIN CAPITAL LETTER C
commit	2d27ca6ce3f0d9f2d229f67c24e7a2a1456605f9	[log] [tgz]
author	Ken Wakasa <kwakasa@google.com>	Mon Nov 26 15:50:26 2012 +0900
committer	Ken Wakasa <kwakasa@google.com>	Mon Nov 26 17:17:16 2012 +0900
tree	14c86cae415a0e157097cf4e2d97038e1b862a27
parent	71e1496ab0370b2de47bb62e5cc2a86ffbc8c30c [diff]