Blame - src/libvterm/t/03encoding_utf8.test - android_external_vim

blob: 7ee16ac641a7c88df9aaffdf9eeb20a0531e55a0 [file] [log] [blame]

Bram Moolenaar	e4f25e4	2017-07-07 11:54:15 +0200	[diff] [blame]	1	INIT
				2	WANTENCODING
				3
				4	!Low
				5	ENCIN "123"
				6	encout 0x31,0x32,0x33
				7
				8	# We want to prove that the UTF-8 parser correctly handles all sequences.
				9	# An easy way to do this is to check the low and high boundary cases, as that
				10	# leaves only two codepoints to test for each sequence length
				11	#
				12	# These ranges are therefore:
				13	#
				14	# Two bytes:
				15	# U+0080 = 000 10000000 => 00010 000000
				16	# => 11000010 10000000 = C2 80
				17	# U+07FF = 111 11111111 => 11111 111111
				18	# => 11011111 10111111 = DF BF
				19	#
				20	# Three bytes:
				21	# U+0800 = 00001000 00000000 => 0000 100000 000000
				22	# => 11100000 10100000 10000000 = E0 A0 80
				23	# U+FFFD = 11111111 11111101 => 1111 111111 111101
				24	# => 11101111 10111111 10111101 = EF BF BD
				25	# (We avoid U+FFFE and U+FFFF as they're invalid codepoints)
				26	#
				27	# Four bytes:
				28	# U+10000 = 00001 00000000 00000000 => 000 010000 000000 000000
				29	# => 11110000 10010000 10000000 10000000 = F0 90 80 80
				30	# U+1FFFFF = 11111 11111111 11111111 => 111 111111 111111 111111
				31	# => 11110111 10111111 10111111 10111111 = F7 BF BF BF
				32
				33	!2 byte
				34	ENCIN "\xC2\x80\xDF\xBF"
				35	encout 0x0080, 0x07FF
				36
				37	!3 byte
				38	ENCIN "\xE0\xA0\x80\xEF\xBF\xBD"
				39	encout 0x0800,0xFFFD
				40
				41	!4 byte
				42	ENCIN "\xF0\x90\x80\x80\xF7\xBF\xBF\xBF"
				43	encout 0x10000,0x1fffff
				44
				45	# Next up, we check some invalid sequences
				46	# + Early termination (back to low bytes too soon)
				47	# + Early restart (another sequence introduction before the previous one was finished)
				48
				49	!Early termination
				50	ENCIN "\xC2!"
				51	encout 0xfffd,0x21
				52
				53	ENCIN "\xE0!\xE0\xA0!"
				54	encout 0xfffd,0x21,0xfffd,0x21
				55
				56	ENCIN "\xF0!\xF0\x90!\xF0\x90\x80!"
				57	encout 0xfffd,0x21,0xfffd,0x21,0xfffd,0x21
				58
				59	!Early restart
				60	ENCIN "\xC2\xC2\x90"
				61	encout 0xfffd,0x0090
				62
				63	ENCIN "\xE0\xC2\x90\xE0\xA0\xC2\x90"
				64	encout 0xfffd,0x0090,0xfffd,0x0090
				65
				66	ENCIN "\xF0\xC2\x90\xF0\x90\xC2\x90\xF0\x90\x80\xC2\x90"
				67	encout 0xfffd,0x0090,0xfffd,0x0090,0xfffd,0x0090
				68
				69	# Test the overlong sequences by giving an overlong encoding of U+0000 and
				70	# an overlong encoding of the highest codepoint that fits in a shorter sequence
				71	#
				72	# Two bytes:
				73	# U+0000 = C0 80
				74	# U+007F = 000 01111111 => 00001 111111
				75	# => 11000001 10111111 = C1 BF
				76	#
				77	# Three bytes:
				78	# U+0000 = E0 80 80
				79	# U+07FF = 00000111 11111111 => 0000 011111 111111
				80	# => 11100000 10011111 10111111 = E0 9F BF
				81	#
				82	# Four bytes:
				83	# U+0000 = F0 80 80 80
				84	# U+FFFF = 00000 11111111 11111111 => 000 001111 111111 111111
				85	# => 11110000 10001111 10111111 10111111 = F0 8F BF BF
				86
				87	!Overlong
				88	ENCIN "\xC0\x80\xC1\xBF"
				89	encout 0xfffd,0xfffd
				90
				91	ENCIN "\xE0\x80\x80\xE0\x9F\xBF"
				92	encout 0xfffd,0xfffd
				93
				94	ENCIN "\xF0\x80\x80\x80\xF0\x8F\xBF\xBF"
				95	encout 0xfffd,0xfffd
				96
				97	# UTF-16 surrogates U+D800 and U+DFFF
				98	!UTF-16 Surrogates
				99	ENCIN "\xED\xA0\x80\xED\xBF\xBF"
				100	encout 0xfffd,0xfffd
				101
				102	!Split write
				103	ENCIN "\xC2"
				104	ENCIN "\xA0"
				105	encout 0x000A0
				106
				107	ENCIN "\xE0"
				108	ENCIN "\xA0\x80"
				109	encout 0x00800
				110	ENCIN "\xE0\xA0"
				111	ENCIN "\x80"
				112	encout 0x00800
				113
				114	ENCIN "\xF0"
				115	ENCIN "\x90\x80\x80"
				116	encout 0x10000
				117	ENCIN "\xF0\x90"
				118	ENCIN "\x80\x80"
				119	encout 0x10000
				120	ENCIN "\xF0\x90\x80"
				121	ENCIN "\x80"
				122	encout 0x10000