Blame - common/fltk/src/fl_utf.c - android_external_tigervnc

blob: dbdcd502e6df163e53b163ccf727c538f5ace504 [file] [log] [blame]

DRC	2ff39b8	2011-07-28 08:38:59 +0000	[diff] [blame^]	1	/*
				2	* "$Id: fl_utf.c 8585 2011-04-13 15:43:22Z ianmacarthur $"
				3	*
				4	* This is the utf.c file from fltk2 adapted for use in my fltk1.1 port
				5	*/
				6	/* Copyright 2006-2011 by Bill Spitzak and others.
				7	*
				8	* This library is free software; you can redistribute it and/or
				9	* modify it under the terms of the GNU Library General Public
				10	* License as published by the Free Software Foundation; either
				11	* version 2 of the License, or (at your option) any later version.
				12	*
				13	* This library is distributed in the hope that it will be useful,
				14	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				15	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				16	* Library General Public License for more details.
				17	*
				18	* You should have received a copy of the GNU Library General Public
				19	* License along with this library; if not, write to the Free Software
				20	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
				21	* USA.
				22	*
				23	* Please report all bugs and problems on the following page:
				24	*
				25	* http://www.fltk.org/str.php
				26	*/
				27
				28	/* Modified to obey rfc3629, which limits unicode to 0-0x10ffff */
				29
				30	#include <FL/fl_utf8.h>
				31	#include <string.h>
				32	#include <stdlib.h>
				33
				34	/** \addtogroup fl_unicode
				35	@{
				36	*/
				37
				38
				39	#if 0
				40	/**
				41	\defgroup fl_unichar Unicode Character Functions
				42	Global Functions Handling Single Unicode Characters
				43	@{ */
				44
				45	/**
				46	Converts a Unicode character into a utf-8 sequence.
				47	\param[in] uc Unicode character
				48	\param[out] text utf-8 sequence will be written here; if this pointer is
				49	\c NULL, only the length of the utf-8 sequence is calculated
				50	\return length of the sequence in bytes
				51	*/
				52	/* FL_EXPORT int fl_unichar_to_utf8(unsigned int uc, char *text); */
				53
				54	/** @} */
				55
				56	/**
				57	\defgroup fl_utf8 Unicode String Functions
				58	Global Functions Handling Unicode Text
				59	@{ */
				60
				61	/**
				62	Calculate the size of a utf-8 sequence for a Unicode character.
				63	\param[in] uc Unicode character
				64	\return length of the sequence in bytes
				65	*/
				66	/* FL_EXPORT int fl_utf8_size(unsigned int uc); */
				67
				68	/** @} */
				69	#endif /* 0 */
				70
				71	/*!Set to 1 to turn bad UTF8 bytes into ISO-8859-1. If this is to zero
				72	they are instead turned into the Unicode REPLACEMENT CHARACTER, of
				73	value 0xfffd.
				74	If this is on fl_utf8decode() will correctly map most (perhaps all)
				75	human-readable text that is in ISO-8859-1. This may allow you
				76	to completely ignore character sets in your code because virtually
				77	everything is either ISO-8859-1 or UTF-8.
				78	*/
				79	#define ERRORS_TO_ISO8859_1 1
				80
				81	/*!Set to 1 to turn bad UTF8 bytes in the 0x80-0x9f range into the
				82	Unicode index for Microsoft's CP1252 character set. You should
				83	also set ERRORS_TO_ISO8859_1. With this a huge amount of more
				84	available text (such as all web pages) are correctly converted
				85	to Unicode.
				86	*/
				87	#define ERRORS_TO_CP1252 1
				88
				89	/*!A number of Unicode code points are in fact illegal and should not
				90	be produced by a UTF-8 converter. Turn this on will replace the
				91	bytes in those encodings with errors. If you do this then converting
				92	arbitrary 16-bit data to UTF-8 and then back is not an identity,
				93	which will probably break a lot of software.
				94	*/
				95	#define STRICT_RFC3629 0
				96
				97	#if ERRORS_TO_CP1252
				98	/* Codes 0x80..0x9f from the Microsoft CP1252 character set, translated
				99	* to Unicode:
				100	*/
				101	static unsigned short cp1252[32] = {
				102	0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
				103	0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
				104	0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
				105	0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178
				106	};
				107	#endif
				108
				109	/*! Decode a single UTF-8 encoded character starting at \e p. The
				110	resulting Unicode value (in the range 0-0x10ffff) is returned,
				111	and \e len is set to the number of bytes in the UTF-8 encoding
				112	(adding \e len to \e p will point at the next character).
				113
				114	If \p p points at an illegal UTF-8 encoding, including one that
				115	would go past \e end, or where a code uses more bytes than
				116	necessary, then *(unsigned char*)p is translated as though it is
				117	in the Microsoft CP1252 character set and \e len is set to 1.
				118	Treating errors this way allows this to decode almost any
				119	ISO-8859-1 or CP1252 text that has been mistakenly placed where
				120	UTF-8 is expected, and has proven very useful.
				121
				122	If you want errors to be converted to error characters (as the
				123	standards recommend), adding a test to see if the length is
				124	unexpectedly 1 will work:
				125
				126	\code
				127	if (*p & 0x80) { // what should be a multibyte encoding
				128	code = fl_utf8decode(p,end,&len);
				129	if (len<2) code = 0xFFFD; // Turn errors into REPLACEMENT CHARACTER
				130	} else { // handle the 1-byte utf8 encoding:
				131	code = *p;
				132	len = 1;
				133	}
				134	\endcode
				135
				136	Direct testing for the 1-byte case (as shown above) will also
				137	speed up the scanning of strings where the majority of characters
				138	are ASCII.
				139	*/
				140	unsigned fl_utf8decode(const char* p, const char* end, int* len)
				141	{
				142	unsigned char c = (unsigned char)p;
				143	if (c < 0x80) {
				144	if (len) *len = 1;
				145	return c;
				146	#if ERRORS_TO_CP1252
				147	} else if (c < 0xa0) {
				148	if (len) *len = 1;
				149	return cp1252[c-0x80];
				150	#endif
				151	} else if (c < 0xc2) {
				152	goto FAIL;
				153	}
				154	if ( (end && p+1 >= end) \|\| (p[1]&0xc0) != 0x80) goto FAIL;
				155	if (c < 0xe0) {
				156	if (len) *len = 2;
				157	return
				158	((p[0] & 0x1f) << 6) +
				159	((p[1] & 0x3f));
				160	} else if (c == 0xe0) {
				161	if (((unsigned char*)p)[1] < 0xa0) goto FAIL;
				162	goto UTF8_3;
				163	#if STRICT_RFC3629
				164	} else if (c == 0xed) {
				165	/* RFC 3629 says surrogate chars are illegal. */
				166	if (((unsigned char*)p)[1] >= 0xa0) goto FAIL;
				167	goto UTF8_3;
				168	} else if (c == 0xef) {
				169	/* 0xfffe and 0xffff are also illegal characters */
				170	if (((unsigned char*)p)[1]==0xbf &&
				171	((unsigned char*)p)[2]>=0xbe) goto FAIL;
				172	goto UTF8_3;
				173	#endif
				174	} else if (c < 0xf0) {
				175	UTF8_3:
				176	if ( (end && p+2 >= end) \|\| (p[2]&0xc0) != 0x80) goto FAIL;
				177	if (len) *len = 3;
				178	return
				179	((p[0] & 0x0f) << 12) +
				180	((p[1] & 0x3f) << 6) +
				181	((p[2] & 0x3f));
				182	} else if (c == 0xf0) {
				183	if (((unsigned char*)p)[1] < 0x90) goto FAIL;
				184	goto UTF8_4;
				185	} else if (c < 0xf4) {
				186	UTF8_4:
				187	if ( (end && p+3 >= end) \|\| (p[2]&0xc0) != 0x80 \|\| (p[3]&0xc0) != 0x80) goto FAIL;
				188	if (len) *len = 4;
				189	#if STRICT_RFC3629
				190	/* RFC 3629 says all codes ending in fffe or ffff are illegal: */
				191	if ((p[1]&0xf)==0xf &&
				192	((unsigned char*)p)[2] == 0xbf &&
				193	((unsigned char*)p)[3] >= 0xbe) goto FAIL;
				194	#endif
				195	return
				196	((p[0] & 0x07) << 18) +
				197	((p[1] & 0x3f) << 12) +
				198	((p[2] & 0x3f) << 6) +
				199	((p[3] & 0x3f));
				200	} else if (c == 0xf4) {
				201	if (((unsigned char)p)[1] > 0x8f) goto FAIL; / after 0x10ffff */
				202	goto UTF8_4;
				203	} else {
				204	FAIL:
				205	if (len) *len = 1;
				206	#if ERRORS_TO_ISO8859_1
				207	return c;
				208	#else
				209	return 0xfffd; /* Unicode REPLACEMENT CHARACTER */
				210	#endif
				211	}
				212	}
				213
				214	/*! Move \p p forward until it points to the start of a UTF-8
				215	character. If it already points at the start of one then it
				216	is returned unchanged. Any UTF-8 errors are treated as though each
				217	byte of the error is an individual character.
				218
				219	\e start is the start of the string and is used to limit the
				220	backwards search for the start of a utf8 character.
				221
				222	\e end is the end of the string and is assumed to be a break
				223	between characters. It is assumed to be greater than p.
				224
				225	This function is for moving a pointer that was jumped to the
				226	middle of a string, such as when doing a binary search for
				227	a position. You should use either this or fl_utf8back() depending
				228	on which direction your algorithm can handle the pointer
				229	moving. Do not use this to scan strings, use fl_utf8decode()
				230	instead.
				231	*/
				232	const char* fl_utf8fwd(const char* p, const char* start, const char* end)
				233	{
				234	const char* a;
				235	int len;
				236	/* if we are not pointing at a continuation character, we are done: */
				237	if ((*p&0xc0) != 0x80) return p;
				238	/* search backwards for a 0xc0 starting the character: */
				239	for (a = p-1; ; --a) {
				240	if (a < start) return p;
				241	if (!(a[0]&0x80)) return p;
				242	if ((a[0]&0x40)) break;
				243	}
				244	fl_utf8decode(a,end,&len);
				245	a += len;
				246	if (a > p) return a;
				247	return p;
				248	}
				249
				250	/*! Move \p p backward until it points to the start of a UTF-8
				251	character. If it already points at the start of one then it
				252	is returned unchanged. Any UTF-8 errors are treated as though each
				253	byte of the error is an individual character.
				254
				255	\e start is the start of the string and is used to limit the
				256	backwards search for the start of a UTF-8 character.
				257
				258	\e end is the end of the string and is assumed to be a break
				259	between characters. It is assumed to be greater than p.
				260
				261	If you wish to decrement a UTF-8 pointer, pass p-1 to this.
				262	*/
				263	const char* fl_utf8back(const char* p, const char* start, const char* end)
				264	{
				265	const char* a;
				266	int len;
				267	/* if we are not pointing at a continuation character, we are done: */
				268	if ((*p&0xc0) != 0x80) return p;
				269	/* search backwards for a 0xc0 starting the character: */
				270	for (a = p-1; ; --a) {
				271	if (a < start) return p;
				272	if (!(a[0]&0x80)) return p;
				273	if ((a[0]&0x40)) break;
				274	}
				275	fl_utf8decode(a,end,&len);
				276	if (a+len > p) return a;
				277	return p;
				278	}
				279
				280	/*! Returns number of bytes that utf8encode() will use to encode the
				281	character \p ucs. */
				282	int fl_utf8bytes(unsigned ucs) {
				283	if (ucs < 0x000080U) {
				284	return 1;
				285	} else if (ucs < 0x000800U) {
				286	return 2;
				287	} else if (ucs < 0x010000U) {
				288	return 3;
				289	} else if (ucs <= 0x10ffffU) {
				290	return 4;
				291	} else {
				292	return 3; /* length of the illegal character encoding */
				293	}
				294	}
				295
				296	/*! Write the UTF-8 encoding of \e ucs into \e buf and return the
				297	number of bytes written. Up to 4 bytes may be written. If you know
				298	that \p ucs is less than 0x10000 then at most 3 bytes will be written.
				299	If you wish to speed this up, remember that anything less than 0x80
				300	is written as a single byte.
				301
				302	If ucs is greater than 0x10ffff this is an illegal character
				303	according to RFC 3629. These are converted as though they are
				304	0xFFFD (REPLACEMENT CHARACTER).
				305
				306	RFC 3629 also says many other values for \p ucs are illegal (in
				307	the range 0xd800 to 0xdfff, or ending with 0xfffe or
				308	0xffff). However I encode these as though they are legal, so that
				309	utf8encode/fl_utf8decode will be the identity for all codes between 0
				310	and 0x10ffff.
				311	*/
				312	int fl_utf8encode(unsigned ucs, char* buf) {
				313	if (ucs < 0x000080U) {
				314	buf[0] = ucs;
				315	return 1;
				316	} else if (ucs < 0x000800U) {
				317	buf[0] = 0xc0 \| (ucs >> 6);
				318	buf[1] = 0x80 \| (ucs & 0x3F);
				319	return 2;
				320	} else if (ucs < 0x010000U) {
				321	buf[0] = 0xe0 \| (ucs >> 12);
				322	buf[1] = 0x80 \| ((ucs >> 6) & 0x3F);
				323	buf[2] = 0x80 \| (ucs & 0x3F);
				324	return 3;
				325	} else if (ucs <= 0x0010ffffU) {
				326	buf[0] = 0xf0 \| (ucs >> 18);
				327	buf[1] = 0x80 \| ((ucs >> 12) & 0x3F);
				328	buf[2] = 0x80 \| ((ucs >> 6) & 0x3F);
				329	buf[3] = 0x80 \| (ucs & 0x3F);
				330	return 4;
				331	} else {
				332	/* encode 0xfffd: */
				333	buf[0] = 0xefU;
				334	buf[1] = 0xbfU;
				335	buf[2] = 0xbdU;
				336	return 3;
				337	}
				338	}
				339
				340	/*! Convert a single 32-bit Unicode codepoint into an array of 16-bit
				341	characters. These are used by some system calls, especially on Windows.
				342
				343	\p ucs is the value to convert.
				344
				345	\p dst points at an array to write, and \p dstlen is the number of
				346	locations in this array. At most \p dstlen words will be
				347	written, and a 0 terminating word will be added if \p dstlen is
				348	large enough. Thus this function will never overwrite the buffer
				349	and will attempt to return a zero-terminated string if space permits.
				350	If \p dstlen is zero then \p dst can be set to NULL and no data
				351	is written, but the length is returned.
				352
				353	The return value is the number of 16-bit words that \e would be written
				354	to \p dst if it is large enough, not counting any terminating
				355	zero.
				356
				357	If the return value is greater than \p dstlen it indicates truncation,
				358	you should then allocate a new array of size return+1 and call this again.
				359
				360	Unicode characters in the range 0x10000 to 0x10ffff are converted to
				361	"surrogate pairs" which take two words each (in UTF-16 encoding).
				362	Typically, setting \p dstlen to 2 will ensure that any valid Unicode
				363	value can be converted, and setting \p dstlen to 3 or more will allow
				364	a NULL terminated sequence to be returned.
				365	*/
				366	unsigned fl_ucs_to_Utf16(const unsigned ucs, unsigned short *dst, const unsigned dstlen)
				367	{
				368	/* The rule for direct conversion from UCS to UTF16 is:
				369	* - if UCS > 0x0010FFFF then UCS is invalid
				370	* - if UCS >= 0xD800 && UCS <= 0xDFFF UCS is invalid
				371	* - if UCS <= 0x0000FFFF then U16 = UCS, len = 1
				372	* - else
				373	* -- U16[0] = ((UCS - 0x00010000) >> 10) & 0x3FF + 0xD800
				374	* -- U16[1] = (UCS & 0x3FF) + 0xDC00
				375	* -- len = 2;
				376	*/
				377	unsigned count; /* Count of converted UTF16 cells */
				378	unsigned short u16[4]; /* Alternate buffer if dst is not set */
				379	unsigned short out; / points to the active buffer */
				380	/* Ensure we have a valid buffer to write to */
				381	if((!dstlen) \|\| (!dst)) {
				382	out = u16;
				383	} else {
				384	out = dst;
				385	}
				386	/* Convert from UCS to UTF16 */
				387	if((ucs > 0x0010FFFF) \|\| /* UCS is too large */
				388	((ucs > 0xD7FF) && (ucs < 0xE000))) { /* UCS in invalid range */
				389	out[0] = 0xFFFD; /* REPLACEMENT CHARACTER */
				390	count = 1;
				391	} else if(ucs < 0x00010000) {
				392	out[0] = (unsigned short)ucs;
				393	count = 1;
				394	} else if(dstlen < 2) { /* dst is too small for the result */
				395	out[0] = 0xFFFD; /* REPLACEMENT CHARACTER */
				396	count = 2;
				397	} else {
				398	out[0] = (((ucs - 0x00010000) >> 10) & 0x3FF) + 0xD800;
				399	out[1] = (ucs & 0x3FF) + 0xDC00;
				400	count = 2;
				401	}
				402	/* NULL terminate the output, if there is space */
				403	if(count < dstlen) { out[count] = 0; }
				404	return count;
				405	} /* fl_ucs_to_Utf16 */
				406
				407	/*! Convert a UTF-8 sequence into an array of 16-bit characters. These
				408	are used by some system calls, especially on Windows.
				409
				410	\p src points at the UTF-8, and \p srclen is the number of bytes to
				411	convert.
				412
				413	\p dst points at an array to write, and \p dstlen is the number of
				414	locations in this array. At most \p dstlen-1 words will be
				415	written there, plus a 0 terminating word. Thus this function
				416	will never overwrite the buffer and will always return a
				417	zero-terminated string. If \p dstlen is zero then \p dst can be
				418	null and no data is written, but the length is returned.
				419
				420	The return value is the number of 16-bit words that \e would be written
				421	to \p dst if it were long enough, not counting the terminating
				422	zero. If the return value is greater or equal to \p dstlen it
				423	indicates truncation, you can then allocate a new array of size
				424	return+1 and call this again.
				425
				426	Errors in the UTF-8 are converted as though each byte in the
				427	erroneous string is in the Microsoft CP1252 encoding. This allows
				428	ISO-8859-1 text mistakenly identified as UTF-8 to be printed
				429	correctly.
				430
				431	Unicode characters in the range 0x10000 to 0x10ffff are converted to
				432	"surrogate pairs" which take two words each (this is called UTF-16
				433	encoding).
				434	*/
				435	unsigned fl_utf8toUtf16(const char* src, unsigned srclen,
				436	unsigned short* dst, unsigned dstlen)
				437	{
				438	const char* p = src;
				439	const char* e = src+srclen;
				440	unsigned count = 0;
				441	if (dstlen) for (;;) {
				442	if (p >= e) {dst[count] = 0; return count;}
				443	if (!(p & 0x80)) { / ascii */
				444	dst[count] = *p++;
				445	} else {
				446	int len; unsigned ucs = fl_utf8decode(p,e,&len);
				447	p += len;
				448	if (ucs < 0x10000) {
				449	dst[count] = ucs;
				450	} else {
				451	/* make a surrogate pair: */
				452	if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;}
				453	dst[count] = (((ucs-0x10000u)>>10)&0x3ff) \| 0xd800;
				454	dst[++count] = (ucs&0x3ff) \| 0xdc00;
				455	}
				456	}
				457	if (++count == dstlen) {dst[count-1] = 0; break;}
				458	}
				459	/* we filled dst, measure the rest: */
				460	while (p < e) {
				461	if (!(*p & 0x80)) p++;
				462	else {
				463	int len; unsigned ucs = fl_utf8decode(p,e,&len);
				464	p += len;
				465	if (ucs >= 0x10000) ++count;
				466	}
				467	++count;
				468	}
				469	return count;
				470	}
				471
				472
				473	/**
				474	Converts a UTF-8 string into a wide character string.
				475
				476	This function generates 32-bit wchar_t (e.g. "ucs4" as it were) except
				477	on Windows where it is equivalent to fl_utf8toUtf16 and returns
				478	UTF-16.
				479
				480	\p src points at the UTF-8, and \p srclen is the number of bytes to
				481	convert.
				482
				483	\p dst points at an array to write, and \p dstlen is the number of
				484	locations in this array. At most \p dstlen-1 wchar_t will be
				485	written there, plus a 0 terminating wchar_t.
				486
				487	The return value is the number of wchar_t that \e would be written
				488	to \p dst if it were long enough, not counting the terminating
				489	zero. If the return value is greater or equal to \p dstlen it
				490	indicates truncation, you can then allocate a new array of size
				491	return+1 and call this again.
				492
				493	Notice that sizeof(wchar_t) is 2 on Windows and is 4 on Linux
				494	and most other systems. Where wchar_t is 16 bits, Unicode
				495	characters in the range 0x10000 to 0x10ffff are converted to
				496	"surrogate pairs" which take two words each (this is called UTF-16
				497	encoding). If wchar_t is 32 bits this rather nasty problem is
				498	avoided.
				499
				500	Note that Windows includes Cygwin, i.e. compiled with Cygwin's POSIX
				501	layer (cygwin1.dll, --enable-cygwin), either native (GDI) or X11.
				502	*/
				503	unsigned fl_utf8towc(const char* src, unsigned srclen,
				504	wchar_t* dst, unsigned dstlen)
				505	{
				506	#if defined(WIN32) \|\| defined(__CYGWIN__)
				507	return fl_utf8toUtf16(src, srclen, (unsigned short*)dst, dstlen);
				508	#else
				509	const char* p = src;
				510	const char* e = src+srclen;
				511	unsigned count = 0;
				512	if (dstlen) for (;;) {
				513	if (p >= e) {
				514	dst[count] = 0;
				515	return count;
				516	}
				517	if (!(p & 0x80)) { / ascii */
				518	dst[count] = *p++;
				519	} else {
				520	int len; unsigned ucs = fl_utf8decode(p,e,&len);
				521	p += len;
				522	dst[count] = (wchar_t)ucs;
				523	}
				524	if (++count == dstlen) {dst[count-1] = 0; break;}
				525	}
				526	/* we filled dst, measure the rest: */
				527	while (p < e) {
				528	if (!(*p & 0x80)) p++;
				529	else {
				530	int len; fl_utf8decode(p,e,&len);
				531	p += len;
				532	}
				533	++count;
				534	}
				535	return count;
				536	#endif
				537	}
				538
				539	/*! Convert a UTF-8 sequence into an array of 1-byte characters.
				540
				541	If the UTF-8 decodes to a character greater than 0xff then it is
				542	replaced with '?'.
				543
				544	Errors in the UTF-8 are converted as individual bytes, same as
				545	fl_utf8decode() does. This allows ISO-8859-1 text mistakenly identified
				546	as UTF-8 to be printed correctly (and possibly CP1252 on Windows).
				547
				548	\p src points at the UTF-8, and \p srclen is the number of bytes to
				549	convert.
				550
				551	Up to \p dstlen bytes are written to \p dst, including a null
				552	terminator. The return value is the number of bytes that would be
				553	written, not counting the null terminator. If greater or equal to
				554	\p dstlen then if you malloc a new array of size n+1 you will have
				555	the space needed for the entire string. If \p dstlen is zero then
				556	nothing is written and this call just measures the storage space
				557	needed.
				558	*/
				559	unsigned fl_utf8toa(const char* src, unsigned srclen,
				560	char* dst, unsigned dstlen)
				561	{
				562	const char* p = src;
				563	const char* e = src+srclen;
				564	unsigned count = 0;
				565	if (dstlen) for (;;) {
				566	unsigned char c;
				567	if (p >= e) {dst[count] = 0; return count;}
				568	c = (unsigned char)p;
				569	if (c < 0xC2) { /* ascii or bad code */
				570	dst[count] = c;
				571	p++;
				572	} else {
				573	int len; unsigned ucs = fl_utf8decode(p,e,&len);
				574	p += len;
				575	if (ucs < 0x100) dst[count] = ucs;
				576	else dst[count] = '?';
				577	}
				578	if (++count >= dstlen) {dst[count-1] = 0; break;}
				579	}
				580	/* we filled dst, measure the rest: */
				581	while (p < e) {
				582	if (!(*p & 0x80)) p++;
				583	else {
				584	int len;
				585	fl_utf8decode(p,e,&len);
				586	p += len;
				587	}
				588	++count;
				589	}
				590	return count;
				591	}
				592
				593	/*! Turn "wide characters" as returned by some system calls
				594	(especially on Windows) into UTF-8.
				595
				596	Up to \p dstlen bytes are written to \p dst, including a null
				597	terminator. The return value is the number of bytes that would be
				598	written, not counting the null terminator. If greater or equal to
				599	\p dstlen then if you malloc a new array of size n+1 you will have
				600	the space needed for the entire string. If \p dstlen is zero then
				601	nothing is written and this call just measures the storage space
				602	needed.
				603
				604	\p srclen is the number of words in \p src to convert. On Windows
				605	this is not necessarily the number of characters, due to there
				606	possibly being "surrogate pairs" in the UTF-16 encoding used.
				607	On Unix wchar_t is 32 bits and each location is a character.
				608
				609	On Unix if a \p src word is greater than 0x10ffff then this is an
				610	illegal character according to RFC 3629. These are converted as
				611	though they are 0xFFFD (REPLACEMENT CHARACTER). Characters in the
				612	range 0xd800 to 0xdfff, or ending with 0xfffe or 0xffff are also
				613	illegal according to RFC 3629. However I encode these as though
				614	they are legal, so that fl_utf8towc will return the original data.
				615
				616	On Windows "surrogate pairs" are converted to a single character
				617	and UTF-8 encoded (as 4 bytes). Mismatched halves of surrogate
				618	pairs are converted as though they are individual characters.
				619	*/
				620	unsigned fl_utf8fromwc(char* dst, unsigned dstlen,
				621	const wchar_t* src, unsigned srclen) {
				622	unsigned i = 0;
				623	unsigned count = 0;
				624	if (dstlen) for (;;) {
				625	unsigned ucs;
				626	if (i >= srclen) {dst[count] = 0; return count;}
				627	ucs = src[i++];
				628	if (ucs < 0x80U) {
				629	dst[count++] = ucs;
				630	if (count >= dstlen) {dst[count-1] = 0; break;}
				631	} else if (ucs < 0x800U) { /* 2 bytes */
				632	if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;}
				633	dst[count++] = 0xc0 \| (ucs >> 6);
				634	dst[count++] = 0x80 \| (ucs & 0x3F);
				635	#if defined(WIN32) \|\| defined(__CYGWIN__)
				636	} else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen &&
				637	src[i] >= 0xdc00 && src[i] <= 0xdfff) {
				638	/* surrogate pair */
				639	unsigned ucs2 = src[i++];
				640	ucs = 0x10000U + ((ucs&0x3ff)<<10) + (ucs2&0x3ff);
				641	/* all surrogate pairs turn into 4-byte utf8 */
				642	#else
				643	} else if (ucs >= 0x10000) {
				644	if (ucs > 0x10ffff) {
				645	ucs = 0xfffd;
				646	goto J1;
				647	}
				648	#endif
				649	if (count+4 >= dstlen) {dst[count] = 0; count += 4; break;}
				650	dst[count++] = 0xf0 \| (ucs >> 18);
				651	dst[count++] = 0x80 \| ((ucs >> 12) & 0x3F);
				652	dst[count++] = 0x80 \| ((ucs >> 6) & 0x3F);
				653	dst[count++] = 0x80 \| (ucs & 0x3F);
				654	} else {
				655	#if !(defined(WIN32) \|\| defined(__CYGWIN__))
				656	J1:
				657	#endif
				658	/* all others are 3 bytes: */
				659	if (count+3 >= dstlen) {dst[count] = 0; count += 3; break;}
				660	dst[count++] = 0xe0 \| (ucs >> 12);
				661	dst[count++] = 0x80 \| ((ucs >> 6) & 0x3F);
				662	dst[count++] = 0x80 \| (ucs & 0x3F);
				663	}
				664	}
				665	/* we filled dst, measure the rest: */
				666	while (i < srclen) {
				667	unsigned ucs = src[i++];
				668	if (ucs < 0x80U) {
				669	count++;
				670	} else if (ucs < 0x800U) { /* 2 bytes */
				671	count += 2;
				672	#if defined(WIN32) \|\| defined(__CYGWIN__)
				673	} else if (ucs >= 0xd800 && ucs <= 0xdbff && i < srclen-1 &&
				674	src[i+1] >= 0xdc00 && src[i+1] <= 0xdfff) {
				675	/* surrogate pair */
				676	++i;
				677	#else
				678	} else if (ucs >= 0x10000 && ucs <= 0x10ffff) {
				679	#endif
				680	count += 4;
				681	} else {
				682	count += 3;
				683	}
				684	}
				685	return count;
				686	}
				687
				688	/*! Convert an ISO-8859-1 (ie normal c-string) byte stream to UTF-8.
				689
				690	It is possible this should convert Microsoft's CP1252 to UTF-8
				691	instead. This would translate the codes in the range 0x80-0x9f
				692	to different characters. Currently it does not do this.
				693
				694	Up to \p dstlen bytes are written to \p dst, including a null
				695	terminator. The return value is the number of bytes that would be
				696	written, not counting the null terminator. If greater or equal to
				697	\p dstlen then if you malloc a new array of size n+1 you will have
				698	the space needed for the entire string. If \p dstlen is zero then
				699	nothing is written and this call just measures the storage space
				700	needed.
				701
				702	\p srclen is the number of bytes in \p src to convert.
				703
				704	If the return value equals \p srclen then this indicates that
				705	no conversion is necessary, as only ASCII characters are in the
				706	string.
				707	*/
				708	unsigned fl_utf8froma(char* dst, unsigned dstlen,
				709	const char* src, unsigned srclen) {
				710	const char* p = src;
				711	const char* e = src+srclen;
				712	unsigned count = 0;
				713	if (dstlen) for (;;) {
				714	unsigned char ucs;
				715	if (p >= e) {dst[count] = 0; return count;}
				716	ucs = (unsigned char)p++;
				717	if (ucs < 0x80U) {
				718	dst[count++] = ucs;
				719	if (count >= dstlen) {dst[count-1] = 0; break;}
				720	} else { /* 2 bytes (note that CP1252 translate could make 3 bytes!) */
				721	if (count+2 >= dstlen) {dst[count] = 0; count += 2; break;}
				722	dst[count++] = 0xc0 \| (ucs >> 6);
				723	dst[count++] = 0x80 \| (ucs & 0x3F);
				724	}
				725	}
				726	/* we filled dst, measure the rest: */
				727	while (p < e) {
				728	unsigned char ucs = (unsigned char)p++;
				729	if (ucs < 0x80U) {
				730	count++;
				731	} else {
				732	count += 2;
				733	}
				734	}
				735	return count;
				736	}
				737
				738	#ifdef WIN32
				739	# include <windows.h>
				740	#endif
				741
				742	/*! Return true if the "locale" seems to indicate that UTF-8 encoding
				743	is used. If true the fl_utf8to_mb and fl_utf8from_mb don't do anything
				744	useful.
				745
				746	<i>It is highly recommended that you change your system so this
				747	does return true.</i> On Windows this is done by setting the
				748	"codepage" to CP_UTF8. On Unix this is done by setting $LC_CTYPE
				749	to a string containing the letters "utf" or "UTF" in it, or by
				750	deleting all $LC* and $LANG environment variables. In the future
				751	it is likely that all non-Asian Unix systems will return true,
				752	due to the compatibility of UTF-8 with ISO-8859-1.
				753	*/
				754	int fl_utf8locale(void) {
				755	static int ret = 2;
				756	if (ret == 2) {
				757	#ifdef WIN32
				758	ret = GetACP() == CP_UTF8;
				759	#else
				760	char* s;
				761	ret = 1; /* assume UTF-8 if no locale */
				762	if (((s = getenv("LC_CTYPE")) && *s) \|\|
				763	((s = getenv("LC_ALL")) && *s) \|\|
				764	((s = getenv("LANG")) && *s)) {
				765	ret = (strstr(s,"utf") \|\| strstr(s,"UTF"));
				766	}
				767	#endif
				768	}
				769	return ret;
				770	}
				771
				/*! Convert the UTF-8 used by FLTK to the locale-specific encoding
				    used for filenames (and sometimes used for data in files).
				    Callers have to perform this conversion themselves wherever a
				    filename is handed to the operating system.

				    Up to \p dstlen bytes are written to \p dst, including a null
				    terminator. The return value is the number of bytes that would be
				    written, not counting the null terminator. If greater or equal to
				    \p dstlen then if you malloc a new array of size n+1 you will have
				    the space needed for the entire string. If \p dstlen is zero then
				    nothing is written and this call just measures the storage space
				    needed.

				    If fl_utf8locale() returns true then this does not change the data.
				    The same identity copy is used when the conversion fails or when a
				    temporary buffer cannot be allocated.

				    \param[in] src UTF-8 text to convert
				    \param[in] srclen number of bytes of \p src to convert
				    \param[out] dst receives the converted text; may be NULL only when
				                \p dstlen is zero
				    \param[in] dstlen size of \p dst in bytes
				    \returns number of bytes needed for the complete result, not
				             counting the null terminator
				*/
				unsigned fl_utf8to_mb(const char* src, unsigned srclen,
				                      char* dst, unsigned dstlen)
				{
				  if (!fl_utf8locale()) {
				#ifdef WIN32
				    wchar_t lbuf[1024];
				    wchar_t* buf = lbuf;
				    unsigned length = fl_utf8towc(src, srclen, buf, 1024);
				    int n = 0;
				    if (length >= 1024) {
				      /* the stack buffer was too small: redo the conversion on the heap */
				      buf = (wchar_t*)malloc(((size_t)length + 1) * sizeof(wchar_t));
				      if (buf) fl_utf8towc(src, srclen, buf, length + 1);
				    }
				    if (buf) {
				      /* Convert into at most dstlen-1 bytes so that the terminator
				       * written below always stays inside dst. A size of 0 would put
				       * WideCharToMultiByte() into measuring mode, hence dstlen > 1.
				       */
				      if (length && dstlen > 1)
				        n = WideCharToMultiByte(GetACP(), 0, buf, length,
				                                dst, dstlen - 1, 0, 0);
				      /* WideCharToMultiByte() is not relied upon to null-terminate.
				       * On failure (n == 0) dst becomes the empty string.
				       */
				      if (dstlen) dst[n] = 0;
				      /* if it overflowed or we are only measuring, get the actual length: */
				      if (length && n == 0)
				        n = WideCharToMultiByte(GetACP(), 0, buf, length, 0, 0, 0, 0);
				      if (buf != lbuf) free((void*)buf);
				      return (unsigned)n;
				    }
				    /* allocation failed: return the UTF-8 as raw text */
				#else
				    wchar_t lbuf[1024];
				    wchar_t* buf = lbuf;
				    unsigned length = fl_utf8towc(src, srclen, buf, 1024);
				    size_t needed = (size_t)-1;
				    if (length >= 1024) {
				      /* the stack buffer was too small: redo the conversion on the heap */
				      buf = (wchar_t*)malloc(((size_t)length + 1) * sizeof(wchar_t));
				      if (buf) fl_utf8towc(src, srclen, buf, length + 1);
				    }
				    if (buf) {
				      /* NOTE(review): wcstombs() needs a null-terminated source; this
				       * assumes fl_utf8towc() terminated buf - confirm.
				       */
				      /* Measure first: the result is the size to report, and an error
				       * is detected before anything is written to dst.
				       */
				      needed = wcstombs(0, buf, 0);
				      if (needed != (size_t)-1 && dstlen) {
				        /* wcstombs() does not terminate a full buffer, so convert into
				         * dstlen-1 bytes and add the terminator explicitly.
				         */
				        size_t written = wcstombs(dst, buf, dstlen - 1);
				        if (written == (size_t)-1) written = 0;
				        dst[written] = 0;
				      }
				      if (buf != lbuf) free((void*)buf);
				    }
				    if (needed != (size_t)-1) return (unsigned)needed;
				    /* on any errors we return the UTF-8 as raw text...*/
				#endif
				  }
				  /* identity transform: */
				  if (srclen < dstlen) {
				    memcpy(dst, src, srclen);
				    dst[srclen] = 0;
				  }
				  /* otherwise the buffer is insufficient or this is a size query:
				   * nothing is written
				   */
				  return srclen;
				}
				843
				/*! Convert a filename from the locale-specific multibyte encoding
				    used by Windows to UTF-8 as used by FLTK.

				    Up to \p dstlen bytes are written to \p dst, including a null
				    terminator. The return value is the number of bytes that would be
				    written, not counting the null terminator. If greater or equal to
				    \p dstlen then if you malloc a new array of size n+1 you will have
				    the space needed for the entire string. If \p dstlen is zero then
				    nothing is written and this call just measures the storage space
				    needed.

				    When fl_utf8locale() returns true this does not change the data.
				    The same identity copy is used when the conversion fails or when a
				    temporary buffer cannot be allocated.
				    You may also want to check if fl_utf8test() returns non-zero, so that
				    the filesystem can store filenames in UTF-8 encoding regardless of
				    the locale.

				    \param[out] dst receives the UTF-8 text
				    \param[in] dstlen size of \p dst in bytes
				    \param[in] src text in the locale's multibyte encoding
				    \param[in] srclen number of bytes of \p src to convert
				    \returns number of bytes needed for the complete result, not
				             counting the null terminator
				*/
				unsigned fl_utf8from_mb(char* dst, unsigned dstlen,
				                        const char* src, unsigned srclen)
				{
				  if (!fl_utf8locale()) {
				#ifdef WIN32
				    wchar_t lbuf[1024];
				    wchar_t* buf = lbuf;
				    unsigned length;
				    unsigned ret;
				    length = MultiByteToWideChar(GetACP(), 0, src, srclen, buf, 1024);
				    if ((length == 0) && (GetLastError() == ERROR_INSUFFICIENT_BUFFER)) {
				      /* the stack buffer was too small: measure, then convert on the heap */
				      length = MultiByteToWideChar(GetACP(), 0, src, srclen, 0, 0);
				      /* one spare element so that a zero length never requests 0 bytes */
				      buf = (wchar_t*)malloc(((size_t)length + 1) * sizeof(wchar_t));
				      if (buf) MultiByteToWideChar(GetACP(), 0, src, srclen, buf, length);
				    }
				    if (buf) {
				      ret = fl_utf8fromwc(dst, dstlen, buf, length);
				      if (buf != lbuf) free((void*)buf);
				      return ret;
				    }
				    /* allocation failed: return the text unchanged */
				#else
				    wchar_t lbuf[1024];
				    wchar_t* buf = lbuf;
				    size_t length;
				    unsigned ret;
				    /* NOTE(review): mbstowcs() stops at the first null byte and does not
				     * look at srclen, so src is assumed to be null-terminated - confirm
				     * against callers.
				     */
				    length = mbstowcs(buf, src, 1024);
				    if (length != (size_t)-1 && length >= 1024) {
				      /* the stack buffer was too small: measure, then convert on the heap */
				      length = mbstowcs(0, src, 0);
				      buf = 0;
				      if (length != (size_t)-1) {
				        /* room for the characters plus the terminator ... */
				        buf = (wchar_t*)malloc((length + 1) * sizeof(wchar_t));
				        if (buf) mbstowcs(buf, src, length + 1);
				      }
				    }
				    if (buf && length != (size_t)-1) {
				      /* ... but only the characters themselves are converted, so the
				       * terminator is not counted in the returned size
				       */
				      ret = fl_utf8fromwc(dst, dstlen, buf, (unsigned)length);
				      if (buf != lbuf) free((void*)buf);
				      return ret;
				    }
				    /* errors in conversion return the UTF-8 unchanged */
				#endif
				  }
				  /* identity transform: */
				  if (srclen < dstlen) {
				    memcpy(dst, src, srclen);
				    dst[srclen] = 0;
				  }
				  /* otherwise the buffer is insufficient or this is a size query:
				   * nothing is written
				   */
				  return srclen;
				}
				907
				/*! Scans the first \p srclen bytes of \p src and classifies them as
				    UTF-8 or not.

				    Bytes without the high bit are skipped one at a time. Every other
				    byte is handed to fl_utf8decode(), and the length it reports decides
				    the result:
				    - 0 is returned as soon as a sequence is reported shorter than two
				      bytes, i.e. the text is not legal UTF-8 by the rules of
				      fl_utf8decode(). Note that some UCS values considered illegal by
				      RFC 3629, such as 0xffff, are considered legal by this.
				    - 1 is returned if no byte has the high bit set (plain ASCII), which
				      includes the case of \p srclen being zero.
				    - Otherwise the length of the longest sequence found is returned:
				      2 if all characters are less than 0x800, 3 if all are less than
				      0x10000, and 4 if characters in the 0x10000 to 0x10ffff range
				      are present.

				    Because so many byte sequences are illegal in UTF-8, text in another
				    encoding is very unlikely to pass this test. That makes it useful
				    for deciding per filename whether it is stored as UTF-8 or in the
				    locale encoding.

				    \param[in] src start of the bytes to examine
				    \param[in] srclen number of bytes to examine
				    \returns 0 to 4 as described above
				*/
				int fl_utf8test(const char* src, unsigned srclen) {
				  const char* const end = src + srclen;
				  const char* cur;
				  int widest = 1;
				  for (cur = src; cur < end; ) {
				    int seqlen;
				    if (!(*cur & 0x80)) {
				      /* plain ASCII byte */
				      cur++;
				      continue;
				    }
				    fl_utf8decode(cur, end, &seqlen);
				    if (seqlen < 2) return 0;
				    if (widest < seqlen) widest = seqlen;
				    cur += seqlen;
				  }
				  return widest;
				}
				944
				945	/* forward declare mk_wcwidth() as static so the name is not visible.
				946	*/
				947	static int mk_wcwidth(unsigned int ucs);
				948
				949	/* include the c source directly so its contents are only visible here
				950	*/
				951	#include "xutf8/mk_wcwidth.c"
				952
				/** Thin FLTK wrapper around Markus Kuhn's wcwidth() implementation.
				    \param [in] ucs Unicode character value
				    \returns width of character in columns, as computed by mk_wcwidth()

				    Markus Kuhn's original implementation of wcwidth() and wcswidth()
				    for Unicode (the functions defined by POSIX, IEEE Std 1003.1-2001)
				    is available at http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c

				    \b WARNING: this function returns widths for "raw" Unicode characters.
				    It does not even try to map C1 control characters (0x80 to 0x9F) to
				    CP1252, and C0/C1 control characters and DEL will return -1.
				    You are advised to use fl_width(const char* src) instead.
				*/
				int fl_wcwidth_(unsigned int ucs) {
				  const int columns = mk_wcwidth(ucs);
				  return columns;
				}
				969
				/** extended wrapper around fl_wcwidth_(unsigned int ucs) function.
				    \param[in] src pointer to start of UTF-8 byte sequence
				    \returns width of character in columns

				    The first character at \p src is decoded with fl_utf8decode() and
				    its width is looked up with fl_wcwidth_(unsigned int ucs).

				    Depending on build options, this function may map C1 control
				    characters (0x80 to 0x9f) to CP1252, and return the width of
				    that character instead. This is not the same behaviour as
				    fl_wcwidth_(unsigned int ucs) .

				    Note that other control characters and DEL will still return -1,
				    so if you want different behaviour, you need to test for those
				    characters before calling fl_wcwidth(), and handle them separately.
				*/
				int fl_wcwidth(const char* src) {
				  int len = fl_utf8len(*src);
				  int decoded = 0; /* receives the decoded length; not used further */
				  unsigned int ucs;
				  /* fl_utf8len() is documented to return -1 for a byte that cannot start
				   * a sequence; never let the end pointer fall before src.
				   */
				  if (len < 1) len = 1;
				  ucs = fl_utf8decode(src, src + len, &decoded);
				  return fl_wcwidth_(ucs);
				}
				990
				991	/** @} */
				992
				993	/*
				994	* End of "$Id: fl_utf.c 8585 2011-04-13 15:43:22Z ianmacarthur $".
				995	*/