blob: eed0ebbe960b33677ed4c902e6b76c2e613b926e [file] [log] [blame]
Bram Moolenaarab79bcb2004-07-18 21:34:53 +00001/* vi:set ts=8 sts=4 sw=4:
2 *
3 * VIM - Vi IMproved by Bram Moolenaar
4 *
5 * Do ":help uganda" in Vim to read copying and usage conditions.
6 * Do ":help credits" in Vim to see a list of people who contributed.
7 * See README.txt for an overview of the Vim source code.
8 */
9/*
10 * os_mac_conv.c: Code specifically for Mac string conversions.
11 *
12 * This code has been put in a separate file to avoid the conflicts that are
13 * caused by including both the X11 and Carbon header files.
14 */
15
16#define NO_X11_INCLUDES
17#include "vim.h"
18
Bram Moolenaar5eb86f92004-07-26 12:53:41 +000019#ifdef FEAT_MBYTE
Bram Moolenaarab79bcb2004-07-18 21:34:53 +000020extern char_u *mac_string_convert __ARGS((char_u *ptr, int len, int *lenp, int fail_on_error, int from, int to, int *unconvlenp));
21extern int macroman2enc __ARGS((char_u *ptr, long *sizep, long real_size));
22extern int enc2macroman __ARGS((char_u *from, size_t fromlen, char_u *to, int *tolenp, int maxtolen, char_u *rest, int *restlenp));
23
Bram Moolenaar26a60b42005-02-22 08:49:11 +000024extern void mac_conv_init __ARGS((void));
25extern void mac_conv_cleanup __ARGS((void));
26extern char_u *mac_utf16_to_enc __ARGS((UniChar *from, size_t fromLen, size_t *actualLen));
27extern UniChar *mac_enc_to_utf16 __ARGS((char_u *from, size_t fromLen, size_t *actualLen));
28extern CFStringRef mac_enc_to_cfstring __ARGS((char_u *from, size_t fromLen));
29extern char_u *mac_precompose_path __ARGS((char_u *decompPath, size_t decompLen, size_t *precompLen));
30
31static char_u *mac_utf16_to_utf8 __ARGS((UniChar *from, size_t fromLen, size_t *actualLen));
32static UniChar *mac_utf8_to_utf16 __ARGS((char_u *from, size_t fromLen, size_t *actualLen));
33
34/* Converter for composing decomposed HFS+ file paths */
35static TECObjectRef gPathConverter;
36/* Converter used by mac_utf16_to_utf8 */
37static TECObjectRef gUTF16ToUTF8Converter;
38
Bram Moolenaarab79bcb2004-07-18 21:34:53 +000039/*
40 * A Mac version of string_convert_ext() for special cases.
41 */
42 char_u *
43mac_string_convert(ptr, len, lenp, fail_on_error, from_enc, to_enc, unconvlenp)
44 char_u *ptr;
45 int len;
46 int *lenp;
47 int fail_on_error;
48 int from_enc;
49 int to_enc;
50 int *unconvlenp;
51{
52 char_u *retval, *d;
53 CFStringRef cfstr;
54 int buflen, in, out, l, i;
55 CFStringEncoding from;
56 CFStringEncoding to;
57
58 switch (from_enc)
59 {
60 case 'l': from = kCFStringEncodingISOLatin1; break;
61 case 'm': from = kCFStringEncodingMacRoman; break;
62 case 'u': from = kCFStringEncodingUTF8; break;
63 default: return NULL;
64 }
65 switch (to_enc)
66 {
67 case 'l': to = kCFStringEncodingISOLatin1; break;
68 case 'm': to = kCFStringEncodingMacRoman; break;
69 case 'u': to = kCFStringEncodingUTF8; break;
70 default: return NULL;
71 }
72
73 if (unconvlenp != NULL)
74 *unconvlenp = 0;
75 cfstr = CFStringCreateWithBytes(NULL, ptr, len, from, 0);
76
Bram Moolenaar26a60b42005-02-22 08:49:11 +000077 if(cfstr == NULL)
78 fprintf(stderr, "Encoding failed\n");
Bram Moolenaarab79bcb2004-07-18 21:34:53 +000079 /* When conversion failed, try excluding bytes from the end, helps when
80 * there is an incomplete byte sequence. Only do up to 6 bytes to avoid
81 * looping a long time when there really is something unconvertable. */
82 while (cfstr == NULL && unconvlenp != NULL && len > 1 && *unconvlenp < 6)
83 {
84 --len;
85 ++*unconvlenp;
86 cfstr = CFStringCreateWithBytes(NULL, ptr, len, from, 0);
87 }
88 if (cfstr == NULL)
89 return NULL;
Bram Moolenaar26a60b42005-02-22 08:49:11 +000090
Bram Moolenaarab79bcb2004-07-18 21:34:53 +000091 if (to == kCFStringEncodingUTF8)
92 buflen = len * 6 + 1;
93 else
94 buflen = len + 1;
95 retval = alloc(buflen);
96 if (retval == NULL)
97 {
98 CFRelease(cfstr);
99 return NULL;
100 }
Bram Moolenaar26a60b42005-02-22 08:49:11 +0000101
102#if 0
103 CFRange convertRange = CFRangeMake(0, CFStringGetLength(cfstr));
104 /* Determine output buffer size */
105 CFStringGetBytes(cfstr, convertRange, to, NULL, FALSE, NULL, 0, (CFIndex *)&buflen);
106 retval = (buflen > 0) ? alloc(buflen) : NULL;
107 if (retval == NULL) {
108 CFRelease(cfstr);
109 return NULL;
110 }
111
112 if (lenp)
113 *lenp = buflen / sizeof(char_u);
114
115 if (!CFStringGetBytes(cfstr, convertRange, to, NULL, FALSE, retval, buflen, NULL))
116#endif
Bram Moolenaarab79bcb2004-07-18 21:34:53 +0000117 if (!CFStringGetCString(cfstr, retval, buflen, to))
118 {
119 CFRelease(cfstr);
120 if (fail_on_error)
121 {
122 vim_free(retval);
123 return NULL;
124 }
125
Bram Moolenaar26a60b42005-02-22 08:49:11 +0000126 fprintf(stderr, "Trying char-by-char conversion...\n");
Bram Moolenaarab79bcb2004-07-18 21:34:53 +0000127 /* conversion failed for the whole string, but maybe it will work
128 * for each character */
129 for (d = retval, in = 0, out = 0; in < len && out < buflen - 1;)
130 {
131 if (from == kCFStringEncodingUTF8)
132 l = utf_ptr2len_check(ptr + in);
133 else
134 l = 1;
135 cfstr = CFStringCreateWithBytes(NULL, ptr + in, l, from, 0);
136 if (cfstr == NULL)
137 {
138 *d++ = '?';
139 out++;
140 }
141 else
142 {
143 if (!CFStringGetCString(cfstr, d, buflen - out, to))
144 {
145 *d++ = '?';
146 out++;
147 }
148 else
149 {
150 i = strlen(d);
151 d += i;
152 out += i;
153 }
154 CFRelease(cfstr);
155 }
156 in += l;
157 }
158 *d = NUL;
159 if (lenp != NULL)
160 *lenp = out;
161 return retval;
162 }
163 CFRelease(cfstr);
164 if (lenp != NULL)
165 *lenp = strlen(retval);
Bram Moolenaar26a60b42005-02-22 08:49:11 +0000166
Bram Moolenaarab79bcb2004-07-18 21:34:53 +0000167 return retval;
168}
169
170/*
171 * Conversion from Apple MacRoman char encoding to UTF-8 or latin1, using
172 * standard Carbon framework.
173 * Input: "ptr[*sizep]".
174 * "real_size" is the size of the buffer that "ptr" points to.
175 * output is in-place, "sizep" is adjusted.
176 * Returns OK or FAIL.
177 */
178 int
179macroman2enc(ptr, sizep, real_size)
180 char_u *ptr;
181 long *sizep;
182 long real_size;
183{
184 CFStringRef cfstr;
185 CFRange r;
186 CFIndex len = *sizep;
187
188 /* MacRoman is an 8-bit encoding, no need to move bytes to
189 * conv_rest[]. */
190 cfstr = CFStringCreateWithBytes(NULL, ptr, len,
191 kCFStringEncodingMacRoman, 0);
192 /*
193 * If there is a conversion error, try using another
194 * conversion.
195 */
196 if (cfstr == NULL)
197 return FAIL;
198
199 r.location = 0;
200 r.length = CFStringGetLength(cfstr);
201 if (r.length != CFStringGetBytes(cfstr, r,
202 (enc_utf8) ? kCFStringEncodingUTF8 : kCFStringEncodingISOLatin1,
203 0, /* no lossy conversion */
204 0, /* not external representation */
205 ptr + *sizep, real_size - *sizep, &len))
206 {
207 CFRelease(cfstr);
208 return FAIL;
209 }
210 CFRelease(cfstr);
211 mch_memmove(ptr, ptr + *sizep, len);
212 *sizep = len;
213
214 return OK;
215}
216
217/*
218 * Conversion from UTF-8 or latin1 to MacRoman.
219 * Input: "from[fromlen]"
220 * Output: "to[maxtolen]" length in "*tolenp"
221 * Unconverted rest in rest[*restlenp].
222 * Returns OK or FAIL.
223 */
224 int
225enc2macroman(from, fromlen, to, tolenp, maxtolen, rest, restlenp)
226 char_u *from;
227 size_t fromlen;
228 char_u *to;
229 int *tolenp;
230 int maxtolen;
231 char_u *rest;
232 int *restlenp;
233{
234 CFStringRef cfstr;
235 CFRange r;
236 CFIndex l;
237
238 *restlenp = 0;
239 cfstr = CFStringCreateWithBytes(NULL, from, fromlen,
240 (enc_utf8) ? kCFStringEncodingUTF8 : kCFStringEncodingISOLatin1,
241 0);
242 while (cfstr == NULL && *restlenp < 3 && fromlen > 1)
243 {
244 rest[*restlenp++] = from[--fromlen];
245 cfstr = CFStringCreateWithBytes(NULL, from, fromlen,
246 (enc_utf8) ? kCFStringEncodingUTF8 : kCFStringEncodingISOLatin1,
247 0);
248 }
249 if (cfstr == NULL)
250 return FAIL;
251
252 r.location = 0;
253 r.length = CFStringGetLength(cfstr);
254 if (r.length != CFStringGetBytes(cfstr, r,
255 kCFStringEncodingMacRoman,
256 0, /* no lossy conversion */
257 0, /* not external representation (since vim
258 * handles this internally */
259 to, maxtolen, &l))
260 {
261 CFRelease(cfstr);
262 return FAIL;
263 }
264 CFRelease(cfstr);
265 *tolenp = l;
266 return OK;
267}
Bram Moolenaar5eb86f92004-07-26 12:53:41 +0000268
Bram Moolenaar26a60b42005-02-22 08:49:11 +0000269/*
270 * Initializes text converters
271 */
272 void
273mac_conv_init()
274{
275 TextEncoding utf8_encoding;
276 TextEncoding utf8_hfsplus_encoding;
277 TextEncoding utf8_canon_encoding;
278 TextEncoding utf16_encoding;
279
280 utf8_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
281 kTextEncodingDefaultVariant, kUnicodeUTF8Format);
282 utf8_hfsplus_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
283 kUnicodeHFSPlusCompVariant, kUnicodeUTF8Format);
284 utf8_canon_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
285 kUnicodeCanonicalCompVariant, kUnicodeUTF8Format);
286 utf16_encoding = CreateTextEncoding(kTextEncodingUnicodeDefault,
287 kTextEncodingDefaultVariant, kUnicode16BitFormat);
288
289 if (TECCreateConverter(&gPathConverter, utf8_encoding,
290 utf8_hfsplus_encoding) != noErr)
291 gPathConverter = NULL;
292
293 if (TECCreateConverter(&gUTF16ToUTF8Converter, utf16_encoding,
294 utf8_canon_encoding) != noErr)
Bram Moolenaar19a09a12005-03-04 23:39:37 +0000295 {
296 /* On pre-10.3, Unicode normalization is not available so
297 * fall back to non-normalizing converter */
298 if (TECCreateConverter(&gUTF16ToUTF8Converter, utf16_encoding,
299 utf8_encoding) != noErr)
300 gUTF16ToUTF8Converter = NULL;
301 }
Bram Moolenaar26a60b42005-02-22 08:49:11 +0000302}
303
304/*
305 * Destroys text converters
306 */
307 void
308mac_conv_cleanup()
309{
310 if (gUTF16ToUTF8Converter)
311 {
312 TECDisposeConverter(gUTF16ToUTF8Converter);
313 gUTF16ToUTF8Converter = NULL;
314 }
315
316 if (gPathConverter)
317 {
318 TECDisposeConverter(gPathConverter);
319 gPathConverter = NULL;
320 }
321}
322
323/*
324 * Conversion from UTF-16 UniChars to 'encoding'
325 */
326 char_u *
327mac_utf16_to_enc(from, fromLen, actualLen)
328 UniChar *from;
329 size_t fromLen;
330 size_t *actualLen;
331{
332 /* Following code borrows somewhat from os_mswin.c */
333 vimconv_T conv;
334 size_t utf8_len;
335 char_u *utf8_str;
336 char_u *result = NULL;
337
338 /* Convert to utf-8 first, works better with iconv */
339 utf8_len = 0;
340 utf8_str = mac_utf16_to_utf8(from, fromLen, &utf8_len);
341
342 if (utf8_str)
343 {
344 /* We might be called before we have p_enc set up. */
345 conv.vc_type = CONV_NONE;
346
347 /* If encoding (p_enc) is any unicode, it is actually in utf-8 (vim
348 * internal unicode is always utf-8) so don't convert in such cases */
349
350 if ((enc_canon_props(p_enc) & ENC_UNICODE) == 0)
351 convert_setup(&conv, (char_u *)"utf-8",
352 p_enc? p_enc: (char_u *)"macroman");
353 if (conv.vc_type == CONV_NONE)
354 {
355 /* p_enc is utf-8, so we're done. */
356 result = utf8_str;
357 }
358 else
359 {
360 result = string_convert(&conv, utf8_str, (int *)&utf8_len);
361 vim_free(utf8_str);
362 }
363
364 convert_setup(&conv, NULL, NULL);
365
366 if (actualLen)
367 *actualLen = utf8_len;
368 }
369 else if (actualLen)
370 *actualLen = 0;
371
372 return result;
373}
374
375/*
376 * Conversion from 'encoding' to UTF-16 UniChars
377 */
378 UniChar *
379mac_enc_to_utf16(from, fromLen, actualLen)
380 char_u *from;
381 size_t fromLen;
382 size_t *actualLen;
383{
384 /* Following code borrows somewhat from os_mswin.c */
385 vimconv_T conv;
386 size_t utf8_len;
387 char_u *utf8_str;
388 UniChar *result = NULL;
389 Boolean should_free_utf8 = FALSE;
390
391 do
392 {
393 /* Use MacRoman by default, we might be called before we have p_enc
394 * set up. Convert to utf-8 first, works better with iconv(). Does
395 * nothing if 'encoding' is "utf-8". */
396 conv.vc_type = CONV_NONE;
397 if ((enc_canon_props(p_enc) & ENC_UNICODE) == 0 &&
398 convert_setup(&conv, p_enc ? p_enc : (char_u *)"macroman",
399 (char_u *)"utf-8") == FAIL)
400 break;
401
402 if (conv.vc_type != CONV_NONE)
403 {
404 utf8_len = fromLen;
405 utf8_str = string_convert(&conv, from, (int *)&utf8_len);
406 should_free_utf8 = TRUE;
407 }
408 else
409 {
410 utf8_str = from;
411 utf8_len = fromLen;
412 }
413
414 if (utf8_str == NULL)
415 break;
416
417 convert_setup(&conv, NULL, NULL);
418
419 result = mac_utf8_to_utf16(utf8_str, utf8_len, actualLen);
420
421 if (should_free_utf8)
422 vim_free(utf8_str);
423 return result;
424 }
425 while (0);
426
427 if (actualLen)
428 *actualLen = 0;
429
430 return result;
431}
432
433/*
434 * Converts from UTF-16 UniChars to CFString
435 */
436 CFStringRef
437mac_enc_to_cfstring(from, fromLen)
438 char_u *from;
439 size_t fromLen;
440{
441 UniChar *utf16_str;
442 size_t utf16_len;
443 CFStringRef result = NULL;
444
445 utf16_str = mac_enc_to_utf16(from, fromLen, &utf16_len);
446 if (utf16_str)
447 {
448 result = CFStringCreateWithCharacters(NULL, utf16_str, utf16_len/sizeof(UniChar));
449 vim_free(utf16_str);
450 }
451
452 return result;
453}
454
455/*
456 * Converts a decomposed HFS+ UTF-8 path to precomposed UTF-8
457 */
458 char_u *
459mac_precompose_path(decompPath, decompLen, precompLen)
460 char_u *decompPath;
461 size_t decompLen;
462 size_t *precompLen;
463{
464 char_u *result = NULL;
465 size_t actualLen = 0;
466
467 if (gPathConverter)
468 {
469 result = alloc(decompLen);
470 if (result)
471 {
472 if (TECConvertText(gPathConverter, decompPath,
473 decompLen, &decompLen, result,
474 decompLen, &actualLen) != noErr)
475 {
476 vim_free(result);
477 result = NULL;
478 }
479 }
480 }
481
482 if (precompLen)
483 *precompLen = actualLen;
484
485 return result;
486}
487
488/*
489 * Converts from UTF-16 UniChars to precomposed UTF-8
490 */
491 char_u *
492mac_utf16_to_utf8(from, fromLen, actualLen)
493 UniChar *from;
494 size_t fromLen;
495 size_t *actualLen;
496{
497 ByteCount utf8_len;
498 ByteCount inputRead;
499 char_u *result;
500
501 if (gUTF16ToUTF8Converter)
502 {
503 result = alloc(fromLen * 6 + 1);
504 if (result && TECConvertText(gUTF16ToUTF8Converter, (ConstTextPtr)from,
505 fromLen, &inputRead, result,
506 (fromLen*6+1)*sizeof(char_u), &utf8_len) == noErr)
507 {
508 TECFlushText(gUTF16ToUTF8Converter, result, (fromLen*6+1)*sizeof(char_u), &inputRead);
509 utf8_len += inputRead;
510 }
511 else
512 {
513 vim_free(result);
514 result = NULL;
515 }
516 }
517 else
518 {
519 result = NULL;
520 }
521
522 if (actualLen)
523 *actualLen = result ? utf8_len : 0;
524
525 return result;
526}
527
528/*
529 * Converts from UTF-8 to UTF-16 UniChars
530 */
531 UniChar *
532mac_utf8_to_utf16(from, fromLen, actualLen)
533 char_u *from;
534 size_t fromLen;
535 size_t *actualLen;
536{
537 CFStringRef utf8_str;
538 CFRange convertRange;
539 UniChar *result = NULL;
540
541 utf8_str = CFStringCreateWithBytes(NULL, from, fromLen,
542 kCFStringEncodingUTF8, FALSE);
543
544 if (utf8_str == NULL) {
545 if (actualLen)
546 *actualLen = 0;
547 return NULL;
548 }
549
550 convertRange = CFRangeMake(0, CFStringGetLength(utf8_str));
551 result = (UniChar *)alloc(convertRange.length * sizeof(UniChar));
552
553 CFStringGetCharacters(utf8_str, convertRange, result);
554
555 CFRelease(utf8_str);
556
557 if (actualLen)
558 *actualLen = convertRange.length * sizeof(UniChar);
559
560 return result;
561}
Bram Moolenaar5eb86f92004-07-26 12:53:41 +0000562#endif /* FEAT_MBYTE */