utf8.c (18333B)
1 /* 2 Basic UTF-8 manipulation routines 3 by Jeff Bezanson 4 placed in the public domain Fall 2005 5 6 This code is designed to provide the utilities you need to manipulate 7 UTF-8 as an internal string encoding. These functions do not perform the 8 error checking normally needed when handling UTF-8 data, so if you happen 9 to be from the Unicode Consortium you will want to flay me alive. 10 I do this because error checking can be performed at the boundaries (I/O), 11 with these routines reserved for higher performance on data known to be 12 valid. 13 A UTF-8 validation routine is included. 14 */ 15 #include <stdlib.h> 16 #include <stdio.h> 17 #include <string.h> 18 #include <stdarg.h> 19 #include <stdint.h> 20 #include <wchar.h> 21 #include <wctype.h> 22 23 #ifdef WIN32 24 #include <malloc.h> 25 #define snprintf _snprintf 26 #else 27 #ifndef __FreeBSD__ 28 #include <alloca.h> 29 #endif /* __FreeBSD__ */ 30 #endif 31 #include <assert.h> 32 33 #include "utf8.h" 34 35 static const uint32_t offsetsFromUTF8[6] = { 36 0x00000000UL, 0x00003080UL, 0x000E2080UL, 37 0x03C82080UL, 0xFA082080UL, 0x82082080UL 38 }; 39 40 static const char trailingBytesForUTF8[256] = { 41 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 42 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 43 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 44 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 45 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 46 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 47 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 48 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 49 }; 50 51 /* returns length of next utf-8 sequence */ 52 size_t u8_seqlen(const char *s) 53 { 54 return trailingBytesForUTF8[(unsigned int)(unsigned char)s[0]] + 1; 55 } 56 57 /* returns the # of bytes needed to encode a certain character 58 0 means the character cannot (or should not) be encoded. */ 59 size_t u8_charlen(uint32_t ch) 60 { 61 if (ch < 0x80) 62 return 1; 63 else if (ch < 0x800) 64 return 2; 65 else if (ch < 0x10000) 66 return 3; 67 else if (ch < 0x110000) 68 return 4; 69 return 0; 70 } 71 72 size_t u8_codingsize(uint32_t *wcstr, size_t n) 73 { 74 size_t i, c=0; 75 76 for(i=0; i < n; i++) 77 c += u8_charlen(wcstr[i]); 78 return c; 79 } 80 81 /* conversions without error checking 82 only works for valid UTF-8, i.e. no 5- or 6-byte sequences 83 srcsz = source size in bytes 84 sz = dest size in # of wide characters 85 86 returns # characters converted 87 if sz == srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space. 88 */ 89 size_t u8_toucs(uint32_t *dest, size_t sz, const char *src, size_t srcsz) 90 { 91 uint32_t ch; 92 const char *src_end = src + srcsz; 93 size_t nb; 94 size_t i=0; 95 96 if (sz == 0 || srcsz == 0) 97 return 0; 98 99 while (i < sz) { 100 if (!isutf(*src)) { // invalid sequence 101 dest[i++] = 0xFFFD; 102 src++; 103 if (src >= src_end) break; 104 continue; 105 } 106 nb = trailingBytesForUTF8[(unsigned char)*src]; 107 if (src + nb >= src_end) 108 break; 109 ch = 0; 110 switch (nb) { 111 /* these fall through deliberately */ 112 case 5: ch += (unsigned char)*src++; ch <<= 6; 113 case 4: ch += (unsigned char)*src++; ch <<= 6; 114 case 3: ch += (unsigned char)*src++; ch <<= 6; 115 case 2: ch += (unsigned char)*src++; ch <<= 6; 116 case 1: ch += (unsigned char)*src++; ch <<= 6; 117 case 0: ch += (unsigned char)*src++; 118 } 119 ch -= offsetsFromUTF8[nb]; 120 dest[i++] = ch; 121 } 122 return i; 123 } 124 125 /* srcsz = number of source characters 126 sz = size of dest buffer in bytes 127 128 returns # bytes stored in dest 129 the destination string will never be bigger than the source string. 130 */ 131 size_t u8_toutf8(char *dest, size_t sz, const uint32_t *src, size_t srcsz) 132 { 133 uint32_t ch; 134 size_t i = 0; 135 char *dest0 = dest; 136 char *dest_end = dest + sz; 137 138 while (i < srcsz) { 139 ch = src[i]; 140 if (ch < 0x80) { 141 if (dest >= dest_end) 142 break; 143 *dest++ = (char)ch; 144 } 145 else if (ch < 0x800) { 146 if (dest >= dest_end-1) 147 break; 148 *dest++ = (ch>>6) | 0xC0; 149 *dest++ = (ch & 0x3F) | 0x80; 150 } 151 else if (ch < 0x10000) { 152 if (dest >= dest_end-2) 153 break; 154 *dest++ = (ch>>12) | 0xE0; 155 *dest++ = ((ch>>6) & 0x3F) | 0x80; 156 *dest++ = (ch & 0x3F) | 0x80; 157 } 158 else if (ch < 0x110000) { 159 if (dest >= dest_end-3) 160 break; 161 *dest++ = (ch>>18) | 0xF0; 162 *dest++ = ((ch>>12) & 0x3F) | 0x80; 163 *dest++ = ((ch>>6) & 0x3F) | 0x80; 164 *dest++ = (ch & 0x3F) | 0x80; 165 } 166 i++; 167 } 168 return (dest-dest0); 169 } 170 171 size_t u8_wc_toutf8(char *dest, uint32_t ch) 172 { 173 if (ch < 0x80) { 174 dest[0] = (char)ch; 175 return 1; 176 } 177 if (ch < 0x800) { 178 dest[0] = (ch>>6) | 0xC0; 179 dest[1] = (ch & 0x3F) | 0x80; 180 return 2; 181 } 182 if (ch < 0x10000) { 183 dest[0] = (ch>>12) | 0xE0; 184 dest[1] = ((ch>>6) & 0x3F) | 0x80; 185 dest[2] = (ch & 0x3F) | 0x80; 186 return 3; 187 } 188 if (ch < 0x110000) { 189 dest[0] = (ch>>18) | 0xF0; 190 dest[1] = ((ch>>12) & 0x3F) | 0x80; 191 dest[2] = ((ch>>6) & 0x3F) | 0x80; 192 dest[3] = (ch & 0x3F) | 0x80; 193 return 4; 194 } 195 return 0; 196 } 197 198 /* charnum => byte offset */ 199 size_t u8_offset(const char *s, size_t charnum) 200 { 201 size_t i=0; 202 203 while (charnum > 0) { 204 if (s[i++] & 0x80) { 205 (void)(isutf(s[++i]) || isutf(s[++i]) || ++i); 206 } 207 charnum--; 208 } 209 return i; 210 } 211 212 /* byte offset => charnum */ 213 size_t u8_charnum(const char *s, size_t offset) 214 { 215 size_t charnum = 0, i=0; 216 217 while (i < offset) { 218 if (s[i++] & 0x80) { 219 (void)(isutf(s[++i]) || isutf(s[++i]) || ++i); 220 } 221 charnum++; 222 } 223 return charnum; 224 } 225 226 /* number of characters in NUL-terminated string */ 227 size_t u8_strlen(const char *s) 228 { 229 size_t count = 0; 230 size_t i = 0, lasti; 231 232 while (1) { 233 lasti = i; 234 while (s[i] > 0) 235 i++; 236 count += (i-lasti); 237 if (s[i++]==0) break; 238 (void)(isutf(s[++i]) || isutf(s[++i]) || ++i); 239 count++; 240 } 241 return count; 242 } 243 244 int wcwidth(wchar_t c); 245 246 size_t u8_strwidth(const char *s) 247 { 248 uint32_t ch; 249 size_t nb, tot=0; 250 int w; 251 signed char sc; 252 253 while ((sc = (signed char)*s) != 0) { 254 if (sc >= 0) { 255 s++; 256 if (sc) tot++; 257 } 258 else { 259 if (!isutf(sc)) { tot++; s++; continue; } 260 nb = trailingBytesForUTF8[(unsigned char)sc]; 261 ch = 0; 262 switch (nb) { 263 /* these fall through deliberately */ 264 case 5: ch += (unsigned char)*s++; ch <<= 6; 265 case 4: ch += (unsigned char)*s++; ch <<= 6; 266 case 3: ch += (unsigned char)*s++; ch <<= 6; 267 case 2: ch += (unsigned char)*s++; ch <<= 6; 268 case 1: ch += (unsigned char)*s++; ch <<= 6; 269 case 0: ch += (unsigned char)*s++; 270 } 271 ch -= offsetsFromUTF8[nb]; 272 w = wcwidth(ch); // might return -1 273 if (w > 0) tot += w; 274 } 275 } 276 return tot; 277 } 278 279 /* reads the next utf-8 sequence out of a string, updating an index */ 280 uint32_t u8_nextchar(const char *s, size_t *i) 281 { 282 uint32_t ch = 0; 283 size_t sz = 0; 284 285 do { 286 ch <<= 6; 287 ch += (unsigned char)s[(*i)]; 288 sz++; 289 } while (s[*i] && (++(*i)) && !isutf(s[*i])); 290 ch -= offsetsFromUTF8[sz-1]; 291 292 return ch; 293 } 294 295 /* next character without NUL character terminator */ 296 uint32_t u8_nextmemchar(const char *s, size_t *i) 297 { 298 uint32_t ch = 0; 299 size_t sz = 0; 300 do { 301 ch <<= 6; 302 ch += (unsigned char)s[(*i)++]; 303 sz++; 304 } while (!isutf(s[*i])); 305 ch -= offsetsFromUTF8[sz-1]; 306 307 return ch; 308 } 309 310 void u8_inc(const char *s, size_t *i) 311 { 312 (void)(isutf(s[++(*i)]) || isutf(s[++(*i)]) || isutf(s[++(*i)]) || ++(*i)); 313 } 314 315 void u8_dec(const char *s, size_t *i) 316 { 317 (void)(isutf(s[--(*i)]) || isutf(s[--(*i)]) || isutf(s[--(*i)]) || --(*i)); 318 } 319 320 int octal_digit(char c) 321 { 322 return (c >= '0' && c <= '7'); 323 } 324 325 int hex_digit(char c) 326 { 327 return ((c >= '0' && c <= '9') || 328 (c >= 'A' && c <= 'F') || 329 (c >= 'a' && c <= 'f')); 330 } 331 332 char read_escape_control_char(char c) 333 { 334 if (c == 'n') 335 return '\n'; 336 else if (c == 't') 337 return '\t'; 338 else if (c == 'r') 339 return '\r'; 340 else if (c == 'e') 341 return 033; // '\e' 342 else if (c == 'b') 343 return '\b'; 344 else if (c == 'f') 345 return '\f'; 346 else if (c == 'v') 347 return '\v'; 348 else if (c == 'a') 349 return '\a'; 350 return c; 351 } 352 353 /* assumes that src points to the character after a backslash 354 returns number of input characters processed, 0 if error */ 355 size_t u8_read_escape_sequence(const char *str, size_t ssz, uint32_t *dest) 356 { 357 uint32_t ch; 358 char digs[10]; 359 int dno=0, ndig; 360 size_t i=1; 361 char c0 = str[0]; 362 assert(ssz > 0); 363 364 if (octal_digit(c0)) { 365 i = 0; 366 do { 367 digs[dno++] = str[i++]; 368 } while (i<ssz && octal_digit(str[i]) && dno<3); 369 digs[dno] = '\0'; 370 ch = strtol(digs, NULL, 8); 371 } 372 else if ((c0=='x' && (ndig=2)) || 373 (c0=='u' && (ndig=4)) || 374 (c0=='U' && (ndig=8))) { 375 while (i<ssz && hex_digit(str[i]) && dno<ndig) { 376 digs[dno++] = str[i++]; 377 } 378 if (dno == 0) return 0; 379 digs[dno] = '\0'; 380 ch = strtol(digs, NULL, 16); 381 } 382 else { 383 ch = (uint32_t)read_escape_control_char(c0); 384 } 385 *dest = ch; 386 387 return i; 388 } 389 390 /* convert a string with literal \uxxxx or \Uxxxxxxxx characters to UTF-8 391 example: u8_unescape(mybuf, 256, "hello\\u220e") 392 note the double backslash is needed if called on a C string literal */ 393 size_t u8_unescape(char *buf, size_t sz, const char *src) 394 { 395 size_t c=0, amt; 396 uint32_t ch = 0; 397 char temp[4]; 398 399 while (*src && c < sz) { 400 if (*src == '\\') { 401 src++; 402 amt = u8_read_escape_sequence(src, 1000, &ch); 403 } 404 else { 405 ch = (uint32_t)*src; 406 amt = 1; 407 } 408 src += amt; 409 amt = u8_wc_toutf8(temp, ch); 410 if (amt > sz-c) 411 break; 412 memcpy(&buf[c], temp, amt); 413 c += amt; 414 } 415 if (c < sz) 416 buf[c] = '\0'; 417 return c; 418 } 419 420 static int buf_put2c(char *buf, const char *src) 421 { 422 buf[0] = src[0]; 423 buf[1] = src[1]; 424 buf[2] = '\0'; 425 return 2; 426 } 427 428 int u8_escape_wchar(char *buf, size_t sz, uint32_t ch) 429 { 430 assert(sz > 2); 431 if (ch == L'\n') 432 return buf_put2c(buf, "\\n"); 433 else if (ch == L'\t') 434 return buf_put2c(buf, "\\t"); 435 else if (ch == L'\r') 436 return buf_put2c(buf, "\\r"); 437 else if (ch == 033) // L'\e' 438 return buf_put2c(buf, "\\e"); 439 else if (ch == L'\b') 440 return buf_put2c(buf, "\\b"); 441 else if (ch == L'\f') 442 return buf_put2c(buf, "\\f"); 443 else if (ch == L'\v') 444 return buf_put2c(buf, "\\v"); 445 else if (ch == L'\a') 446 return buf_put2c(buf, "\\a"); 447 else if (ch == L'\\') 448 return buf_put2c(buf, "\\\\"); 449 else if (ch < 32 || ch == 0x7f) 450 return snprintf(buf, sz, "\\x%.2hhx", (unsigned char)ch); 451 else if (ch > 0xFFFF) 452 return snprintf(buf, sz, "\\U%.8x", (uint32_t)ch); 453 else if (ch >= 0x80) 454 return snprintf(buf, sz, "\\u%.4hx", (unsigned short)ch); 455 456 buf[0] = (char)ch; 457 buf[1] = '\0'; 458 return 1; 459 } 460 461 size_t u8_escape(char *buf, size_t sz, const char *src, size_t *pi, size_t end, 462 int escape_quotes, int ascii) 463 { 464 size_t i = *pi, i0; 465 uint32_t ch; 466 char *start = buf; 467 char *blim = start + sz-11; 468 assert(sz > 11); 469 470 while (i<end && buf<blim) { 471 // sz-11: leaves room for longest escape sequence 472 if (escape_quotes && src[i] == '"') { 473 buf += buf_put2c(buf, "\\\""); 474 i++; 475 } 476 else if (src[i] == '\\') { 477 buf += buf_put2c(buf, "\\\\"); 478 i++; 479 } 480 else { 481 i0 = i; 482 ch = u8_nextmemchar(src, &i); 483 if (ascii || !iswprint((wint_t)ch)) { 484 buf += u8_escape_wchar(buf, sz - (buf-start), ch); 485 } 486 else { 487 i = i0; 488 do { 489 *buf++ = src[i++]; 490 } while (!isutf(src[i])); 491 } 492 } 493 } 494 *buf++ = '\0'; 495 *pi = i; 496 return (buf-start); 497 } 498 499 char *u8_strchr(const char *s, uint32_t ch, size_t *charn) 500 { 501 size_t i = 0, lasti=0; 502 uint32_t c; 503 504 *charn = 0; 505 while (s[i]) { 506 c = u8_nextchar(s, &i); 507 if (c == ch) { 508 /* it's const for us, but not necessarily the caller */ 509 return (char*)&s[lasti]; 510 } 511 lasti = i; 512 (*charn)++; 513 } 514 return NULL; 515 } 516 517 char *u8_memchr(const char *s, uint32_t ch, size_t sz, size_t *charn) 518 { 519 size_t i = 0, lasti=0; 520 uint32_t c; 521 int csz; 522 523 *charn = 0; 524 while (i < sz) { 525 c = csz = 0; 526 do { 527 c <<= 6; 528 c += (unsigned char)s[i++]; 529 csz++; 530 } while (i < sz && !isutf(s[i])); 531 c -= offsetsFromUTF8[csz-1]; 532 533 if (c == ch) { 534 return (char*)&s[lasti]; 535 } 536 lasti = i; 537 (*charn)++; 538 } 539 return NULL; 540 } 541 542 char *u8_memrchr(const char *s, uint32_t ch, size_t sz) 543 { 544 size_t i = sz-1, tempi=0; 545 uint32_t c; 546 547 if (sz == 0) return NULL; 548 549 while (i && !isutf(s[i])) i--; 550 551 while (1) { 552 tempi = i; 553 c = u8_nextmemchar(s, &tempi); 554 if (c == ch) { 555 return (char*)&s[i]; 556 } 557 if (i == 0) 558 break; 559 tempi = i; 560 u8_dec(s, &i); 561 if (i > tempi) 562 break; 563 } 564 return NULL; 565 } 566 567 int u8_is_locale_utf8(const char *locale) 568 { 569 /* this code based on libutf8 */ 570 const char* cp = locale; 571 572 if (locale == NULL) return 0; 573 574 for (; *cp != '\0' && *cp != '@' && *cp != '+' && *cp != ','; cp++) { 575 if (*cp == '.') { 576 const char* encoding = ++cp; 577 for (; *cp != '\0' && *cp != '@' && *cp != '+' && *cp != ','; cp++) 578 ; 579 if ((cp-encoding == 5 && !strncmp(encoding, "UTF-8", 5)) 580 || (cp-encoding == 4 && !strncmp(encoding, "utf8", 4))) 581 return 1; /* it's UTF-8 */ 582 break; 583 } 584 } 585 return 0; 586 } 587 588 size_t u8_vprintf(const char *fmt, va_list ap) 589 { 590 int cnt, sz=0, nc, needfree=0; 591 char *buf; 592 uint32_t *wcs; 593 594 sz = 512; 595 buf = (char*)alloca(sz); 596 cnt = vsnprintf(buf, sz, fmt, ap); 597 if (cnt < 0) 598 return 0; 599 if (cnt >= sz) { 600 buf = (char*)malloc(cnt + 1); 601 needfree = 1; 602 vsnprintf(buf, cnt+1, fmt, ap); 603 } 604 wcs = (uint32_t*)alloca((cnt+1) * sizeof(uint32_t)); 605 nc = u8_toucs(wcs, (size_t)cnt+1, buf, cnt); 606 wcs[nc] = 0; 607 printf("%ls", (wchar_t*)wcs); 608 if (needfree) free(buf); 609 return nc; 610 } 611 612 size_t u8_printf(const char *fmt, ...) 613 { 614 size_t cnt; 615 va_list args; 616 617 va_start(args, fmt); 618 619 cnt = u8_vprintf(fmt, args); 620 621 va_end(args); 622 return cnt; 623 } 624 625 /* based on the valid_utf8 routine from the PCRE library by Philip Hazel 626 627 length is in bytes, since without knowing whether the string is valid 628 it's hard to know how many characters there are! */ 629 int u8_isvalid(const char *str, size_t length) 630 { 631 const unsigned char *p, *pend = (unsigned char*)str + length; 632 unsigned char c; 633 int ret = 1; /* ASCII */ 634 size_t ab; 635 636 for (p = (unsigned char*)str; p < pend; p++) { 637 c = *p; 638 if (c < 128) 639 continue; 640 ret = 2; /* non-ASCII UTF-8 */ 641 if ((c & 0xc0) != 0xc0) 642 return 0; 643 ab = trailingBytesForUTF8[c]; 644 if (length < ab) 645 return 0; 646 length -= ab; 647 648 p++; 649 /* Check top bits in the second byte */ 650 if ((*p & 0xc0) != 0x80) 651 return 0; 652 653 /* Check for overlong sequences for each different length */ 654 switch (ab) { 655 /* Check for xx00 000x */ 656 case 1: 657 if ((c & 0x3e) == 0) return 0; 658 continue; /* We know there aren't any more bytes to check */ 659 660 /* Check for 1110 0000, xx0x xxxx */ 661 case 2: 662 if (c == 0xe0 && (*p & 0x20) == 0) return 0; 663 break; 664 665 /* Check for 1111 0000, xx00 xxxx */ 666 case 3: 667 if (c == 0xf0 && (*p & 0x30) == 0) return 0; 668 break; 669 670 /* Check for 1111 1000, xx00 0xxx */ 671 case 4: 672 if (c == 0xf8 && (*p & 0x38) == 0) return 0; 673 break; 674 675 /* Check for leading 0xfe or 0xff, 676 and then for 1111 1100, xx00 00xx */ 677 case 5: 678 if (c == 0xfe || c == 0xff || 679 (c == 0xfc && (*p & 0x3c) == 0)) return 0; 680 break; 681 } 682 683 /* Check for valid bytes after the 2nd, if any; all must start 10 */ 684 while (--ab > 0) { 685 if ((*(++p) & 0xc0) != 0x80) return 0; 686 } 687 } 688 689 return ret; 690 } 691 692 int u8_reverse(char *dest, char * src, size_t len) 693 { 694 size_t si=0, di=len; 695 unsigned char c; 696 697 dest[di] = '\0'; 698 while (si < len) { 699 c = (unsigned char)src[si]; 700 if ((~c) & 0x80) { 701 di--; 702 dest[di] = c; 703 si++; 704 } 705 else { 706 switch (c>>4) { 707 case 0xC: 708 case 0xD: 709 di -= 2; 710 *((int16_t*)&dest[di]) = *((int16_t*)&src[si]); 711 si += 2; 712 break; 713 case 0xE: 714 di -= 3; 715 dest[di] = src[si]; 716 *((int16_t*)&dest[di+1]) = *((int16_t*)&src[si+1]); 717 si += 3; 718 break; 719 case 0xF: 720 di -= 4; 721 *((int32_t*)&dest[di]) = *((int32_t*)&src[si]); 722 si += 4; 723 break; 724 default: 725 return 1; 726 } 727 } 728 } 729 return 0; 730 }