Bram Moolenaar | e4f25e4 | 2017-07-07 11:54:15 +0200 | [diff] [blame] | 1 | INIT |
| 2 | WANTENCODING |
| 3 | |
| 4 | !Low |
| 5 | ENCIN "123" |
| 6 | encout 0x31,0x32,0x33 |
| 7 | |
| 8 | # We want to prove the UTF-8 parser correctly handles all the sequences. |
| 9 | # An easy way to do this is to check the lowest and highest codepoints of |
| 10 | # each sequence length, as that leaves only two cases per length. |
| 11 | # |
| 12 | # These ranges are therefore: |
| 13 | # |
| 14 | # Two bytes: |
| 15 | # U+0080 = 000 10000000 => 00010 000000 |
| 16 | # => 11000010 10000000 = C2 80 |
| 17 | # U+07FF = 111 11111111 => 11111 111111 |
| 18 | # => 11011111 10111111 = DF BF |
| 19 | # |
| 20 | # Three bytes: |
| 21 | # U+0800 = 00001000 00000000 => 0000 100000 000000 |
| 22 | # => 11100000 10100000 10000000 = E0 A0 80 |
| 23 | # U+FFFD = 11111111 11111101 => 1111 111111 111101 |
| 24 | # => 11101111 10111111 10111101 = EF BF BD |
| 25 | # (We avoid U+FFFE and U+FFFF as they're Unicode noncharacters) |
| 26 | # |
| 27 | # Four bytes: |
| 28 | # U+10000 = 00001 00000000 00000000 => 000 010000 000000 000000 |
| 29 | # => 11110000 10010000 10000000 10000000 = F0 90 80 80 |
| 30 | # U+1FFFFF = 11111 11111111 11111111 => 111 111111 111111 111111 |
| 31 | # => 11110111 10111111 10111111 10111111 = F7 BF BF BF |
| 32 | |
| 33 | !2 byte |
| 34 | ENCIN "\xC2\x80\xDF\xBF" |
| 35 | encout 0x0080, 0x07FF |
| 36 | |
| 37 | !3 byte |
| 38 | ENCIN "\xE0\xA0\x80\xEF\xBF\xBD" |
| 39 | encout 0x0800,0xFFFD |
| 40 | |
| 41 | !4 byte |
| 42 | ENCIN "\xF0\x90\x80\x80\xF7\xBF\xBF\xBF" |
| 43 | encout 0x10000,0x1fffff |
| 44 | |
| 45 | # Next up, we check some invalid sequences |
| 46 | # + Early termination (back to low bytes too soon) |
| 47 | # + Early restart (another sequence introduction before the previous one was finished) |
| 48 | |
| 49 | !Early termination |
| 50 | ENCIN "\xC2!" |
| 51 | encout 0xfffd,0x21 |
| 52 | |
| 53 | ENCIN "\xE0!\xE0\xA0!" |
| 54 | encout 0xfffd,0x21,0xfffd,0x21 |
| 55 | |
| 56 | ENCIN "\xF0!\xF0\x90!\xF0\x90\x80!" |
| 57 | encout 0xfffd,0x21,0xfffd,0x21,0xfffd,0x21 |
| 58 | |
| 59 | !Early restart |
| 60 | ENCIN "\xC2\xC2\x90" |
| 61 | encout 0xfffd,0x0090 |
| 62 | |
| 63 | ENCIN "\xE0\xC2\x90\xE0\xA0\xC2\x90" |
| 64 | encout 0xfffd,0x0090,0xfffd,0x0090 |
| 65 | |
| 66 | ENCIN "\xF0\xC2\x90\xF0\x90\xC2\x90\xF0\x90\x80\xC2\x90" |
| 67 | encout 0xfffd,0x0090,0xfffd,0x0090,0xfffd,0x0090 |
| 68 | |
| 69 | # Test the overlong sequences by giving, for each sequence length, an overlong |
| 70 | # encoding of U+0000 and of the highest codepoint still too small to need it |
| 71 | # |
| 72 | # Two bytes: |
| 73 | # U+0000 = C0 80 |
| 74 | # U+007F = 000 01111111 => 00001 111111 |
| 75 | # => 11000001 10111111 = C1 BF |
| 76 | # |
| 77 | # Three bytes: |
| 78 | # U+0000 = E0 80 80 |
| 79 | # U+07FF = 00000111 11111111 => 0000 011111 111111 |
| 80 | # => 11100000 10011111 10111111 = E0 9F BF |
| 81 | # |
| 82 | # Four bytes: |
| 83 | # U+0000 = F0 80 80 80 |
| 84 | # U+FFFF = 11111111 11111111 => 000 001111 111111 111111 |
| 85 | # => 11110000 10001111 10111111 10111111 = F0 8F BF BF |
| 86 | |
| 87 | !Overlong |
| 88 | ENCIN "\xC0\x80\xC1\xBF" |
| 89 | encout 0xfffd,0xfffd |
| 90 | |
| 91 | ENCIN "\xE0\x80\x80\xE0\x9F\xBF" |
| 92 | encout 0xfffd,0xfffd |
| 93 | |
| 94 | ENCIN "\xF0\x80\x80\x80\xF0\x8F\xBF\xBF" |
| 95 | encout 0xfffd,0xfffd |
| 96 | |
| 97 | # UTF-16 surrogates U+D800 (ED A0 80) and U+DFFF (ED BF BF) are not valid in UTF-8 |
| 98 | !UTF-16 Surrogates |
| 99 | ENCIN "\xED\xA0\x80\xED\xBF\xBF" |
| 100 | encout 0xfffd,0xfffd |
| 101 | |
| | # A multi-byte sequence may arrive split across two separate inputs. |
| | # Each 2-, 3- and 4-byte sequence below is fed as two ENCIN inputs, split |
| | # at every interior byte boundary; the expected output is the one complete |
| | # codepoint, with no U+FFFD. |
| | # NOTE(review): presumably the decoder keeps its partial-sequence state |
| | # between inputs - confirm against the parser. |
| 102 | !Split write |
| 103 | ENCIN "\xC2" |
| 104 | ENCIN "\xA0" |
| 105 | encout 0x000A0 |
| 106 | |
| 107 | ENCIN "\xE0" |
| 108 | ENCIN "\xA0\x80" |
| 109 | encout 0x00800 |
| 110 | ENCIN "\xE0\xA0" |
| 111 | ENCIN "\x80" |
| 112 | encout 0x00800 |
| 113 | |
| 114 | ENCIN "\xF0" |
| 115 | ENCIN "\x90\x80\x80" |
| 116 | encout 0x10000 |
| 117 | ENCIN "\xF0\x90" |
| 118 | ENCIN "\x80\x80" |
| 119 | encout 0x10000 |
| 120 | ENCIN "\xF0\x90\x80" |
| 121 | ENCIN "\x80" |
| 122 | encout 0x10000 |