src/libvterm/t/03encoding_utf8.test - android_external_vim - Gitiles

 INIT
 WANTENCODING

 !Low
 ENCIN "123"
   encout 0x31,0x32,0x33

 # We want to prove the UTF-8 parser correctly handles all the sequences.
 # Easy way to do this is to check it does low/high boundary cases, as that
 # leaves only two for each sequence length
 #
 # These ranges are therefore:
 #
 # Two bytes:
 # U+0080 = 000 10000000 =>    00010   000000
 #                       => 11000010 10000000 = C2 80
 # U+07FF = 111 11111111 =>    11111   111111
 #                       => 11011111 10111111 = DF BF
 #
 # Three bytes:
 # U+0800 = 00001000 00000000 =>     0000   100000   000000
 #                            => 11100000 10100000 10000000 = E0 A0 80
 # U+FFFD = 11111111 11111101 =>     1111   111111   111101
 #                            => 11101111 10111111 10111101 = EF BF BD
 # (We avoid U+FFFE and U+FFFF as they're invalid codepoints)
 #
 # Four bytes:
 # U+10000  = 00001 00000000 00000000 =>      000   010000   000000   000000
 #                                    => 11110000 10010000 10000000 10000000 = F0 90 80 80
 # U+1FFFFF = 11111 11111111 11111111 =>      111   111111   111111   111111
 #                                    => 11110111 10111111 10111111 10111111 = F7 BF BF BF

 !2 byte
 ENCIN "\xC2\x80\xDF\xBF"
   encout 0x0080, 0x07FF

 !3 byte
 ENCIN "\xE0\xA0\x80\xEF\xBF\xBD"
   encout 0x0800,0xFFFD

 !4 byte
 ENCIN "\xF0\x90\x80\x80\xF7\xBF\xBF\xBF"
   encout 0x10000,0x1fffff

 # Next up, we check some invalid sequences
 #  + Early termination (back to low bytes too soon)
 #  + Early restart (another sequence introduction before the previous one was finished)

 !Early termination
 ENCIN "\xC2!"
   encout 0xfffd,0x21

 ENCIN "\xE0!\xE0\xA0!"
   encout 0xfffd,0x21,0xfffd,0x21

 ENCIN "\xF0!\xF0\x90!\xF0\x90\x80!"
   encout 0xfffd,0x21,0xfffd,0x21,0xfffd,0x21

 !Early restart
 ENCIN "\xC2\xC2\x90"
   encout 0xfffd,0x0090

 ENCIN "\xE0\xC2\x90\xE0\xA0\xC2\x90"
   encout 0xfffd,0x0090,0xfffd,0x0090

 ENCIN "\xF0\xC2\x90\xF0\x90\xC2\x90\xF0\x90\x80\xC2\x90"
   encout 0xfffd,0x0090,0xfffd,0x0090,0xfffd,0x0090

 # Test the overlong sequences by giving an overlong encoding of U+0000 and
 # an encoding of the highest codepoint still too short
 #
 # Two bytes:
 # U+0000 = C0 80
 # U+007F = 000 01111111 =>    00001   111111 =>
 #                       => 11000001 10111111 => C1 BF
 #
 # Three bytes:
 # U+0000 = E0 80 80
 # U+07FF = 00000111 11111111 =>     0000   011111   111111
 #                            => 11100000 10011111 10111111 = E0 9F BF
 #
 # Four bytes:
 # U+0000 = F0 80 80 80
 # U+FFFF = 11111111 11111111 =>      000   001111   111111   111111
 #                            => 11110000 10001111 10111111 10111111 = F0 8F BF BF

 !Overlong
 ENCIN "\xC0\x80\xC1\xBF"
   encout 0xfffd,0xfffd

 ENCIN "\xE0\x80\x80\xE0\x9F\xBF"
   encout 0xfffd,0xfffd

 ENCIN "\xF0\x80\x80\x80\xF0\x8F\xBF\xBF"
   encout 0xfffd,0xfffd

 # UTF-16 surrogates U+D800 and U+DFFF
 !UTF-16 Surrogates
 ENCIN "\xED\xA0\x80\xED\xBF\xBF"
   encout 0xfffd,0xfffd

 !Split write
 ENCIN "\xC2"
 ENCIN "\xA0"
   encout 0x000A0

 ENCIN "\xE0"
 ENCIN "\xA0\x80"
   encout 0x00800
 ENCIN "\xE0\xA0"
 ENCIN "\x80"
   encout 0x00800

 ENCIN "\xF0"
 ENCIN "\x90\x80\x80"
   encout 0x10000
 ENCIN "\xF0\x90"
 ENCIN "\x80\x80"
   encout 0x10000
 ENCIN "\xF0\x90\x80"
 ENCIN "\x80"
   encout 0x10000
	INIT
	WANTENCODING

	!Low
	ENCIN "123"
	encout 0x31,0x32,0x33

	# We want to prove the UTF-8 parser correctly handles all the sequences.
	# Easy way to do this is to check it does low/high boundary cases, as that
	# leaves only two for each sequence length
	#
	# These ranges are therefore:
	#
	# Two bytes:
	# U+0080 = 000 10000000 => 00010 000000
	# => 11000010 10000000 = C2 80
	# U+07FF = 111 11111111 => 11111 111111
	# => 11011111 10111111 = DF BF
	#
	# Three bytes:
	# U+0800 = 00001000 00000000 => 0000 100000 000000
	# => 11100000 10100000 10000000 = E0 A0 80
	# U+FFFD = 11111111 11111101 => 1111 111111 111101
	# => 11101111 10111111 10111101 = EF BF BD
	# (We avoid U+FFFE and U+FFFF as they're invalid codepoints)
	#
	# Four bytes:
	# U+10000 = 00001 00000000 00000000 => 000 010000 000000 000000
	# => 11110000 10010000 10000000 10000000 = F0 90 80 80
	# U+1FFFFF = 11111 11111111 11111111 => 111 111111 111111 111111
	# => 11110111 10111111 10111111 10111111 = F7 BF BF BF

	!2 byte
	ENCIN "\xC2\x80\xDF\xBF"
	encout 0x0080, 0x07FF

	!3 byte
	ENCIN "\xE0\xA0\x80\xEF\xBF\xBD"
	encout 0x0800,0xFFFD

	!4 byte
	ENCIN "\xF0\x90\x80\x80\xF7\xBF\xBF\xBF"
	encout 0x10000,0x1fffff

	# Next up, we check some invalid sequences
	# + Early termination (back to low bytes too soon)
	# + Early restart (another sequence introduction before the previous one was finished)

	!Early termination
	ENCIN "\xC2!"
	encout 0xfffd,0x21

	ENCIN "\xE0!\xE0\xA0!"
	encout 0xfffd,0x21,0xfffd,0x21

	ENCIN "\xF0!\xF0\x90!\xF0\x90\x80!"
	encout 0xfffd,0x21,0xfffd,0x21,0xfffd,0x21

	!Early restart
	ENCIN "\xC2\xC2\x90"
	encout 0xfffd,0x0090

	ENCIN "\xE0\xC2\x90\xE0\xA0\xC2\x90"
	encout 0xfffd,0x0090,0xfffd,0x0090

	ENCIN "\xF0\xC2\x90\xF0\x90\xC2\x90\xF0\x90\x80\xC2\x90"
	encout 0xfffd,0x0090,0xfffd,0x0090,0xfffd,0x0090

	# Test the overlong sequences by giving an overlong encoding of U+0000 and
	# an encoding of the highest codepoint still too short
	#
	# Two bytes:
	# U+0000 = C0 80
	# U+007F = 000 01111111 => 00001 111111 =>
	# => 11000001 10111111 => C1 BF
	#
	# Three bytes:
	# U+0000 = E0 80 80
	# U+07FF = 00000111 11111111 => 0000 011111 111111
	# => 11100000 10011111 10111111 = E0 9F BF
	#
	# Four bytes:
	# U+0000 = F0 80 80 80
	# U+FFFF = 11111111 11111111 => 000 001111 111111 111111
	# => 11110000 10001111 10111111 10111111 = F0 8F BF BF

	!Overlong
	ENCIN "\xC0\x80\xC1\xBF"
	encout 0xfffd,0xfffd

	ENCIN "\xE0\x80\x80\xE0\x9F\xBF"
	encout 0xfffd,0xfffd

	ENCIN "\xF0\x80\x80\x80\xF0\x8F\xBF\xBF"
	encout 0xfffd,0xfffd

	# UTF-16 surrogates U+D800 and U+DFFF
	!UTF-16 Surrogates
	ENCIN "\xED\xA0\x80\xED\xBF\xBF"
	encout 0xfffd,0xfffd

	!Split write
	ENCIN "\xC2"
	ENCIN "\xA0"
	encout 0x000A0

	ENCIN "\xE0"
	ENCIN "\xA0\x80"
	encout 0x00800
	ENCIN "\xE0\xA0"
	ENCIN "\x80"
	encout 0x00800

	ENCIN "\xF0"
	ENCIN "\x90\x80\x80"
	encout 0x10000
	ENCIN "\xF0\x90"
	ENCIN "\x80\x80"
	encout 0x10000
	ENCIN "\xF0\x90\x80"
	ENCIN "\x80"
	encout 0x10000