1 /* 2 * This file is part of gtkD. 3 * 4 * gtkD is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU Lesser General Public License 6 * as published by the Free Software Foundation; either version 3 7 * of the License, or (at your option) any later version, with 8 * some exceptions, please read the COPYING file. 9 * 10 * gtkD is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU Lesser General Public License for more details. 14 * 15 * You should have received a copy of the GNU Lesser General Public License 16 * along with gtkD; if not, write to the Free Software 17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA 18 */ 19 20 // generated automatically - do not change 21 // find conversion definition on APILookup.txt 22 // implement new conversion functionalities on the wrap.utils pakage 23 24 25 module glib.Unicode; 26 27 private import glib.ErrorG; 28 private import glib.GException; 29 private import glib.Str; 30 private import gtkc.glib; 31 public import gtkc.glibtypes; 32 33 34 /** */ 35 public struct Unicode 36 { 37 38 /** 39 * Convert a string from UCS-4 to UTF-16. A 0 character will be 40 * added to the result after the converted text. 41 * 42 * Params: 43 * str = a UCS-4 encoded string 44 * len = the maximum length (number of characters) of @str to use. 45 * If @len < 0, then the string is nul-terminated. 46 * itemsRead = location to store number of bytes read, 47 * or %NULL. If an error occurs then the index of the invalid input 48 * is stored here. 49 * itemsWritten = location to store number of #gunichar2 50 * written, or %NULL. The value stored here does not include the 51 * trailing 0. 52 * 53 * Return: a pointer to a newly allocated UTF-16 string. 54 * This value must be freed with g_free(). If an error occurs, 55 * %NULL will be returned and @error set. 
*
	 * Throws: GException on failure.
	 */
	public static wchar* ucs4ToUtf16(dchar* str, glong len, glong* itemsRead, glong* itemsWritten)
	{
		GError* err = null;

		auto p = g_ucs4_to_utf16(str, len, itemsRead, itemsWritten, &err);

		if (err !is null)
		{
			throw new GException( new ErrorG(err) );
		}

		// The raw buffer is handed to the caller, who must release it with g_free().
		return p;
	}

	/**
	 * Converts a string from UCS-4 (a 32-bit fixed-width representation)
	 * to UTF-8. The result is terminated with a 0 byte.
	 *
	 * Params:
	 *     str = a UCS-4 encoded string
	 *     len = the maximum length (number of characters) of @str to use.
	 *         If @len < 0, then the string is nul-terminated.
	 *     itemsRead = location to store the number of characters
	 *         read, or %NULL. If an error occurs, this is set to the
	 *         position of the first invalid input character.
	 *     itemsWritten = location to store the number of bytes
	 *         written, or %NULL. The value stored here does not include
	 *         the trailing 0 byte.
	 *
	 * Return: the converted text as a D string. The buffer allocated by
	 *     GLib is released before returning.
	 *
	 * Throws: GException on failure.
	 */
	public static string ucs4ToUtf8(dchar* str, glong len, glong* itemsRead, glong* itemsWritten)
	{
		GError* err = null;

		auto p = g_ucs4_to_utf8(str, len, itemsRead, itemsWritten, &err);

		if (err !is null)
		{
			throw new GException( new ErrorG(err) );
		}

		// GLib documents that the returned buffer must be freed with g_free();
		// callers only ever see the D string, so release the buffer here.
		// NOTE(review): relies on Str.toString copying the bytes - confirm.
		scope(exit) g_free(p);

		return Str.toString(p);
	}

	/**
	 * Determines the break type of @c. @c should be a Unicode character
	 * (to derive a character from UTF-8 encoded text, use
	 * g_utf8_get_char()). The break type is used to find word and line
	 * breaks ("text boundaries"), Pango implements the Unicode boundary
	 * resolution algorithms and normally you would use a function such
	 * as pango_break() instead of caring about break types yourself.
*
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: the break type of @c
	 */
	public static GUnicodeBreakType unicharBreakType(dchar c)
	{
		auto breakType = g_unichar_break_type(c);
		return breakType;
	}

	/**
	 * Looks up the canonical combining class of a Unicode character.
	 *
	 * Params:
	 *     uc = the Unicode character to classify
	 *
	 * Return: the combining class of @uc
	 *
	 * Since: 2.14
	 */
	public static int unicharCombiningClass(dchar uc)
	{
		auto combiningClass = g_unichar_combining_class(uc);
		return combiningClass;
	}

	/**
	 * Runs one composition step of the Unicode canonical composition
	 * algorithm on the pair <@a,@b>.
	 *
	 * Algorithmic Hangul Jamo composition is included, but this is not
	 * the exact inverse of g_unichar_decompose(). Neither @a nor @b may
	 * be zero in any composition. Precisely, the pair composes if and
	 * only if a Primary Composite P exists that is canonically
	 * equivalent to the sequence <@a,@b>; see the Unicode Standard for
	 * the definition of Primary Composite.
	 *
	 * When @a and @b do not compose a new character, @ch is set to zero.
	 *
	 * See
	 * [UAX#15](http://unicode.org/reports/tr15/)
	 * for details.
	 *
	 * Params:
	 *     a = first Unicode character of the pair
	 *     b = second Unicode character of the pair
	 *     ch = return location for the composed character
	 *
	 * Return: %TRUE if the characters could be composed
	 *
	 * Since: 2.30
	 */
	public static bool unicharCompose(dchar a, dchar b, dchar* ch)
	{
		auto composed = g_unichar_compose(a, b, ch);
		return composed != 0;
	}

	/**
	 * Performs a single decomposition step of the
	 * Unicode canonical decomposition algorithm.
	 *
	 * This function does not include compatibility
	 * decompositions. It does, however, include algorithmic
	 * Hangul Jamo decomposition, as well as 'singleton'
	 * decompositions which replace a character by a single
	 * other character.
In the case of singletons *@b will
	 * be set to zero.
	 *
	 * If @ch is not decomposable, *@a is set to @ch and *@b
	 * is set to zero.
	 *
	 * Note that the way Unicode decomposition pairs are
	 * defined, it is guaranteed that @b would not decompose
	 * further, but @a may itself decompose. To get the full
	 * canonical decomposition for @ch, one would need to
	 * recursively call this function on @a. Or use
	 * g_unichar_fully_decompose().
	 *
	 * See
	 * [UAX#15](http://unicode.org/reports/tr15/)
	 * for details.
	 *
	 * Params:
	 *     ch = a Unicode character
	 *     a = return location for the first component of @ch
	 *     b = return location for the second component of @ch
	 *
	 * Return: %TRUE if the character could be decomposed
	 *
	 * Since: 2.30
	 */
	public static bool unicharDecompose(dchar ch, dchar* a, dchar* b)
	{
		auto decomposed = g_unichar_decompose(ch, a, b);
		return decomposed != 0;
	}

	/**
	 * Gives the numeric value of a character interpreted as a
	 * decimal digit.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: the numeric value of @c when it is a decimal digit
	 *     (according to g_unichar_isdigit()); otherwise -1.
	 */
	public static int unicharDigitValue(dchar c)
	{
		auto digit = g_unichar_digit_value(c);
		return digit;
	}

	/**
	 * Computes the canonical or compatibility decomposition of a
	 * Unicode character. For compatibility decomposition,
	 * pass %TRUE for @compat; for canonical decomposition
	 * pass %FALSE for @compat.
	 *
	 * The decomposed sequence is placed in @result. Only up to
	 * @result_len characters are written into @result. The length
	 * of the full decomposition (irrespective of @result_len) is
	 * returned by the function. For canonical decomposition,
	 * currently all decompositions are of length at most 4, but
	 * this may change in the future (very unlikely though).
* At any rate, Unicode does guarantee that a buffer of length
	 * 18 is always enough for both compatibility and canonical
	 * decompositions, so that is the size recommended. This is provided
	 * as %G_UNICHAR_MAX_DECOMPOSITION_LENGTH.
	 *
	 * See
	 * [UAX#15](http://unicode.org/reports/tr15/)
	 * for details.
	 *
	 * Params:
	 *     ch = a Unicode character.
	 *     compat = whether perform canonical or compatibility decomposition
	 *     result = location to store decomposed result, or %NULL
	 *     resultLen = length of @result
	 *
	 * Return: the length of the full decomposition.
	 *
	 * Since: 2.30
	 */
	public static size_t unicharFullyDecompose(dchar ch, bool compat, dchar* result, size_t resultLen)
	{
		auto fullLength = g_unichar_fully_decompose(ch, compat, result, resultLen);
		return fullLength;
	}

	/**
	 * Looks up the mirror image of a character.
	 *
	 * Some Unicode characters are "mirrored": their images are flipped
	 * horizontally in text that is laid out from right to left. For
	 * instance, "(" would become its mirror image, ")", in
	 * right-to-left text.
	 *
	 * If @ch has the Unicode mirrored property, another Unicode
	 * character typically has a glyph that is the mirror image of
	 * @ch's glyph, and @mirrored_ch is set, that character is stored
	 * at the address pointed to by @mirrored_ch. Otherwise the
	 * original character is stored.
	 *
	 * Params:
	 *     ch = a Unicode character
	 *     mirroredCh = location to store the mirrored character
	 *
	 * Return: %TRUE if @ch has a mirrored character, %FALSE otherwise
	 *
	 * Since: 2.4
	 */
	public static bool unicharGetMirrorChar(dchar ch, dchar* mirroredCh)
	{
		auto hasMirror = g_unichar_get_mirror_char(ch, mirroredCh);
		return hasMirror != 0;
	}

	/**
	 * Looks up the #GUnicodeScript for a particular character (as defined
	 * by Unicode Standard Annex \#24).
No check is made for @ch being a
	 * valid Unicode character; if you pass in invalid character, the
	 * result is undefined.
	 *
	 * This function is equivalent to pango_script_for_unichar() and the
	 * two are interchangeable.
	 *
	 * Params:
	 *     ch = a Unicode character
	 *
	 * Return: the #GUnicodeScript for the character.
	 *
	 * Since: 2.14
	 */
	public static GUnicodeScript unicharGetScript(dchar ch)
	{
		auto script = g_unichar_get_script(ch);
		return script;
	}

	/**
	 * Tests whether a character is alphanumeric.
	 * To get a character value out of UTF-8 text, use
	 * g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: %TRUE if @c is an alphanumeric character
	 */
	public static bool unicharIsalnum(dchar c)
	{
		auto flag = g_unichar_isalnum(c);
		return flag != 0;
	}

	/**
	 * Tests whether a character is alphabetic (i.e. a letter).
	 * To get a character value out of UTF-8 text, use
	 * g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: %TRUE if @c is an alphabetic character
	 */
	public static bool unicharIsalpha(dchar c)
	{
		auto flag = g_unichar_isalpha(c);
		return flag != 0;
	}

	/**
	 * Tests whether a character is a control character.
	 * To get a character value out of UTF-8 text, use
	 * g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: %TRUE if @c is a control character
	 */
	public static bool unicharIscntrl(dchar c)
	{
		auto flag = g_unichar_iscntrl(c);
		return flag != 0;
	}

	/**
	 * Tests whether a given character is assigned in the Unicode
	 * standard.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: %TRUE if the character has an assigned value
	 */
	public static bool unicharIsdefined(dchar c)
	{
		auto flag = g_unichar_isdefined(c);
		return flag != 0;
	}

	/**
	 * Determines whether a character is numeric (i.e. a digit).
This
	 * covers ASCII 0-9 and also digits in other languages/scripts. Given
	 * some UTF-8 text, obtain a character value with g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: %TRUE if @c is a digit
	 */
	public static bool unicharIsdigit(dchar c)
	{
		auto flag = g_unichar_isdigit(c);
		return flag != 0;
	}

	/**
	 * Tests whether a character is printable and not a space.
	 * The result is %FALSE for control characters, format characters,
	 * and spaces. g_unichar_isprint() is similar, but returns %TRUE
	 * for spaces. To get a character value out of UTF-8 text, use
	 * g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: %TRUE if @c is printable unless it's a space
	 */
	public static bool unicharIsgraph(dchar c)
	{
		auto flag = g_unichar_isgraph(c);
		return flag != 0;
	}

	/**
	 * Tests whether a character is a lowercase letter.
	 * To get a character value out of UTF-8 text, use
	 * g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: %TRUE if @c is a lowercase letter
	 */
	public static bool unicharIslower(dchar c)
	{
		auto flag = g_unichar_islower(c);
		return flag != 0;
	}

	/**
	 * Tests whether a character is a mark (non-spacing mark,
	 * combining mark, or enclosing mark in Unicode speak).
	 * To get a character value out of UTF-8 text, use
	 * g_utf8_get_char().
	 *
	 * Note: in most cases where isalpha characters are allowed,
	 * ismark characters should be allowed too, as they are essential
	 * for writing most European languages as well as many non-Latin
	 * scripts.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: %TRUE if @c is a mark character
	 *
	 * Since: 2.14
	 */
	public static bool unicharIsmark(dchar c)
	{
		auto flag = g_unichar_ismark(c);
		return flag != 0;
	}

	/**
	 * Determines whether a character is printable.
* Unlike g_unichar_isgraph(), returns %TRUE for spaces.
	 * Given some UTF-8 text, obtain a character value with
	 * g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: %TRUE if @c is printable
	 */
	public static bool unicharIsprint(dchar c)
	{
		auto flag = g_unichar_isprint(c);
		return flag != 0;
	}

	/**
	 * Tests whether a character is punctuation or a symbol.
	 * To get a character value out of UTF-8 text, use
	 * g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: %TRUE if @c is a punctuation or symbol character
	 */
	public static bool unicharIspunct(dchar c)
	{
		auto flag = g_unichar_ispunct(c);
		return flag != 0;
	}

	/**
	 * Tests whether a character is a space, tab, or line separator
	 * (newline, carriage return, etc.). To get a character value out
	 * of UTF-8 text, use g_utf8_get_char().
	 *
	 * (Note: don't use this to do word breaking; you have to use
	 * Pango or equivalent to get word breaking right, the algorithm
	 * is fairly complex.)
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: %TRUE if @c is a space character
	 */
	public static bool unicharIsspace(dchar c)
	{
		auto flag = g_unichar_isspace(c);
		return flag != 0;
	}

	/**
	 * Tests whether a character is titlecase. Some characters in
	 * Unicode which are composites, such as the DZ digraph,
	 * have three case variants instead of just two. The titlecase
	 * form is used at the beginning of a word where only the
	 * first letter is capitalized. The titlecase form of the DZ
	 * digraph is U+01F2 LATIN CAPITAL LETTER D WITH SMALL LETTER Z.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: %TRUE if the character is titlecase
	 */
	public static bool unicharIstitle(dchar c)
	{
		auto flag = g_unichar_istitle(c);
		return flag != 0;
	}

	/**
	 * Determines if a character is uppercase.
*
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: %TRUE if @c is an uppercase character
	 */
	public static bool unicharIsupper(dchar c)
	{
		auto flag = g_unichar_isupper(c);
		return flag != 0;
	}

	/**
	 * Tests whether a character is typically rendered in a
	 * double-width cell.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: %TRUE if the character is wide
	 */
	public static bool unicharIswide(dchar c)
	{
		auto flag = g_unichar_iswide(c);
		return flag != 0;
	}

	/**
	 * Tests whether a character is typically rendered in a double-width
	 * cell under legacy East Asian locales. See the
	 * [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
	 * for details.
	 *
	 * Every character that passes the g_unichar_iswide() test also
	 * passes this test, but not the other way around. Note that some
	 * characters may pass both this test and g_unichar_iszerowidth().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: %TRUE if the character is wide in legacy East Asian locales
	 *
	 * Since: 2.12
	 */
	public static bool unicharIswideCjk(dchar c)
	{
		auto flag = g_unichar_iswide_cjk(c);
		return flag != 0;
	}

	/**
	 * Tests whether a character is a hexadecimal digit.
	 *
	 * Params:
	 *     c = a Unicode character.
	 *
	 * Return: %TRUE if the character is a hexadecimal digit
	 */
	public static bool unicharIsxdigit(dchar c)
	{
		auto flag = g_unichar_isxdigit(c);
		return flag != 0;
	}

	/**
	 * Determines if a given character typically takes zero width when rendered.
	 * The return value is %TRUE for all non-spacing and enclosing marks
	 * (e.g., combining accents), format characters, zero-width
	 * space, but not U+00AD SOFT HYPHEN.
*
	 * A typical use of this function is with one of g_unichar_iswide() or
	 * g_unichar_iswide_cjk() to determine the number of cells a string occupies
	 * when displayed on a grid display (terminals). However, note that not all
	 * terminals support zero-width rendering of zero-width marks.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: %TRUE if the character has zero width
	 *
	 * Since: 2.14
	 */
	public static bool unicharIszerowidth(dchar c)
	{
		auto flag = g_unichar_iszerowidth(c);
		return flag != 0;
	}

	/**
	 * Encodes a single character as UTF-8.
	 *
	 * NOTE(review): @outbuf is an immutable D string that is passed
	 * through Str.toStringz before reaching GLib; whether the bytes
	 * GLib writes are visible to the caller depends on Str.toStringz -
	 * confirm before relying on the buffer contents.
	 *
	 * Params:
	 *     c = a Unicode character code
	 *     outbuf = output buffer, must have at least 6 bytes of space.
	 *         If %NULL, the length will be computed and returned
	 *         and nothing will be written to @outbuf.
	 *
	 * Return: number of bytes written
	 */
	public static int unicharToUtf8(dchar c, string outbuf)
	{
		auto buffer = Str.toStringz(outbuf);
		return g_unichar_to_utf8(c, buffer);
	}

	/**
	 * Maps a character to lower case.
	 *
	 * Params:
	 *     c = a Unicode character.
	 *
	 * Return: the lowercase form of @c. If @c is not an uppercase
	 *     or titlecase character, or has no lowercase equivalent,
	 *     @c is returned unchanged.
	 */
	public static dchar unicharTolower(dchar c)
	{
		auto lowered = g_unichar_tolower(c);
		return lowered;
	}

	/**
	 * Maps a character to titlecase.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: the titlecase form of @c. If @c is not an uppercase
	 *     or lowercase character, @c is returned unchanged.
	 */
	public static dchar unicharTotitle(dchar c)
	{
		auto titled = g_unichar_totitle(c);
		return titled;
	}

	/**
	 * Converts a character to uppercase.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: the result of converting @c to uppercase.
* If @c is not an lowercase or titlecase character,
	 * or has no upper case equivalent @c is returned unchanged.
	 */
	public static dchar unicharToupper(dchar c)
	{
		auto raised = g_unichar_toupper(c);
		return raised;
	}

	/**
	 * Reports the Unicode type classification of a character.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: the type of the character.
	 */
	public static GUnicodeType unicharType(dchar c)
	{
		auto kind = g_unichar_type(c);
		return kind;
	}

	/**
	 * Tests whether @ch is a valid Unicode character. Some possible
	 * integer values of @ch will not be valid. 0 is considered a valid
	 * character, though it's normally a string terminator.
	 *
	 * Params:
	 *     ch = a Unicode character
	 *
	 * Return: %TRUE if @ch is a valid Unicode character
	 */
	public static bool unicharValidate(dchar ch)
	{
		auto valid = g_unichar_validate(ch);
		return valid != 0;
	}

	/**
	 * Gives the numeric value of a character interpreted as a
	 * hexadecimal digit.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Return: the numeric value of @c when it is a hex digit
	 *     (according to g_unichar_isxdigit()); otherwise -1.
	 */
	public static int unicharXdigitValue(dchar c)
	{
		auto digit = g_unichar_xdigit_value(c);
		return digit;
	}

	/**
	 * Computes the canonical decomposition of a Unicode character.
	 *
	 * Deprecated: Use the more flexible g_unichar_fully_decompose()
	 * instead.
	 *
	 * Params:
	 *     ch = a Unicode character.
	 *     resultLen = location to store the length of the return value.
	 *
	 * Return: a newly allocated string of Unicode characters;
	 *     @result_len is set to the resulting length of the string.
	 */
	public static dchar* unicodeCanonicalDecomposition(dchar ch, size_t* resultLen)
	{
		auto decomposition = g_unicode_canonical_decomposition(ch, resultLen);
		return decomposition;
	}

	/**
	 * Computes the canonical ordering of a string in-place.
* This rearranges decomposed characters in the string
	 * according to their combining classes. See the Unicode
	 * manual for more information.
	 *
	 * Params:
	 *     str = a UCS-4 encoded string.
	 *     len = the maximum length of @string to use.
	 */
	public static void unicodeCanonicalOrdering(dchar* str, size_t len)
	{
		// Reorders the characters of str in place; nothing is returned.
		g_unicode_canonical_ordering(str, len);
	}

	/**
	 * Finds the Unicode script that corresponds to @iso15924.
	 *
	 * ISO 15924 assigns four-letter codes to scripts; for example, the
	 * code for Arabic is 'Arab'. This function accepts four-letter codes
	 * encoded as a @guint32 in a big-endian fashion. That is, the code
	 * expected for Arabic is 0x41726162 (0x41 is ASCII code for 'A',
	 * 0x72 is ASCII code for 'r', etc).
	 *
	 * See
	 * [Codes for the representation of names of scripts](http://unicode.org/iso15924/codelists.html)
	 * for details.
	 *
	 * Params:
	 *     iso15924 = an ISO 15924 script code, encoded as described above
	 *
	 * Return: the Unicode script for @iso15924, or
	 *     %G_UNICODE_SCRIPT_INVALID_CODE if @iso15924 is zero and
	 *     %G_UNICODE_SCRIPT_UNKNOWN if @iso15924 is unknown.
	 *
	 * Since: 2.30
	 */
	public static GUnicodeScript unicodeScriptFromIso15924(uint iso15924)
	{
		auto script = g_unicode_script_from_iso15924(iso15924);
		return script;
	}

	/**
	 * Looks up the ISO 15924 code for @script. ISO 15924 assigns four-letter
	 * codes to scripts. For example, the code for Arabic is 'Arab'. The
	 * four letter codes are encoded as a @guint32 by this function in a
	 * big-endian fashion. That is, the code returned for Arabic is
	 * 0x41726162 (0x41 is ASCII code for 'A', 0x72 is ASCII code for 'r', etc).
	 *
	 * See
	 * [Codes for the representation of names of scripts](http://unicode.org/iso15924/codelists.html)
	 * for details.
*
	 * Params:
	 *     script = a Unicode script
	 *
	 * Return: the ISO 15924 code for @script, encoded as an integer,
	 * of zero if @script is %G_UNICODE_SCRIPT_INVALID_CODE or
	 * ISO 15924 code 'Zzzz' (script code for UNKNOWN) if @script is not understood.
	 *
	 * Since: 2.30
	 */
	public static uint unicodeScriptToIso15924(GUnicodeScript script)
	{
		auto code = g_unicode_script_to_iso15924(script);
		return code;
	}

	/**
	 * Converts a UTF-16 string to UCS-4. The result is nul-terminated.
	 *
	 * Params:
	 *     str = a UTF-16 encoded string
	 *     len = the maximum length (number of #gunichar2) of @str to use.
	 *         If @len < 0, then the string is nul-terminated.
	 *     itemsRead = location to store the number of words read,
	 *         or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT is
	 *         reported when @str contains a trailing partial character.
	 *         If an error occurs, the index of the invalid input is
	 *         stored here.
	 *     itemsWritten = location to store the number of characters
	 *         written, or %NULL. The value stored here does not include
	 *         the trailing 0 character.
	 *
	 * Return: a pointer to a newly allocated UCS-4 string.
	 *     This value must be freed with g_free().
	 *
	 * Throws: GException on failure.
	 */
	public static dchar* utf16ToUcs4(wchar* str, glong len, glong* itemsRead, glong* itemsWritten)
	{
		GError* failure = null;

		auto converted = g_utf16_to_ucs4(str, len, itemsRead, itemsWritten, &failure);

		if (failure is null)
		{
			return converted;
		}

		throw new GException( new ErrorG(failure) );
	}

	/**
	 * Convert a string from UTF-16 to UTF-8. The result will be
	 * terminated with a 0 byte.
	 *
	 * Note that the input is expected to be already in native endianness,
	 * an initial byte-order-mark character is not handled specially.
* g_convert() can be used to convert a byte buffer of UTF-16 data of
	 * ambiguous endianness.
	 *
	 * Further note that this function does not validate the result
	 * string; it may e.g. include embedded NUL characters. The only
	 * validation done by this function is to ensure that the input can
	 * be correctly interpreted as UTF-16, i.e. it doesn't contain
	 * things like unpaired surrogates.
	 *
	 * Params:
	 *     str = a UTF-16 encoded string
	 *     len = the maximum length (number of #gunichar2) of @str to use.
	 *         If @len < 0, then the string is nul-terminated.
	 *     itemsRead = location to store number of words read,
	 *         or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
	 *         returned in case @str contains a trailing partial character. If
	 *         an error occurs then the index of the invalid input is stored here.
	 *     itemsWritten = location to store number of bytes written,
	 *         or %NULL. The value stored here does not include the trailing 0 byte.
	 *
	 * Return: the converted text as a D string. The buffer allocated by
	 *     GLib is released before returning.
	 *
	 * Throws: GException on failure.
	 */
	public static string utf16ToUtf8(wchar* str, glong len, glong* itemsRead, glong* itemsWritten)
	{
		GError* err = null;

		auto p = g_utf16_to_utf8(str, len, itemsRead, itemsWritten, &err);

		if (err !is null)
		{
			throw new GException( new ErrorG(err) );
		}

		// GLib documents that the returned buffer must be freed with g_free();
		// callers only ever see the D string, so release the buffer here.
		// NOTE(review): relies on Str.toString copying the bytes - confirm.
		scope(exit) g_free(p);

		return Str.toString(p);
	}

	/**
	 * Converts a string into a form that is independent of case. The
	 * result will not correspond to any particular case, but can be
	 * compared for equality or ordered with the results of calling
	 * g_utf8_casefold() on other strings.
*
	 * Note that calling g_utf8_casefold() followed by g_utf8_collate() is
	 * only an approximation to the correct linguistic case insensitive
	 * ordering, though it is a fairly good one. Getting this exactly
	 * right would require a more sophisticated collation function that
	 * takes case sensitivity into account. GLib does not currently
	 * provide such a function.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *
	 * Return: a D string holding the case independent form of @str.
	 *     The buffer allocated by GLib is released before returning.
	 */
	public static string utf8Casefold(string str, ptrdiff_t len)
	{
		auto folded = g_utf8_casefold(Str.toStringz(str), len);

		// The result is newly allocated by GLib and the caller never sees the
		// raw pointer, so it has to be released here.
		// NOTE(review): relies on Str.toString copying the bytes - confirm.
		scope(exit) g_free(folded);

		return Str.toString(folded);
	}

	/**
	 * Compares two strings for ordering using the linguistically
	 * correct rules for the [current locale][setlocale].
	 * When sorting a large number of strings, it will be significantly
	 * faster to obtain collation keys with g_utf8_collate_key() and
	 * compare the keys with strcmp() when sorting instead of sorting
	 * the original strings.
	 *
	 * Params:
	 *     str1 = a UTF-8 encoded string
	 *     str2 = a UTF-8 encoded string
	 *
	 * Return: < 0 if @str1 compares before @str2,
	 *     0 if they compare equal, > 0 if @str1 compares after @str2.
	 */
	public static int utf8Collate(string str1, string str2)
	{
		return g_utf8_collate(Str.toStringz(str1), Str.toStringz(str2));
	}

	/**
	 * Converts a string into a collation key that can be compared
	 * with other collation keys produced by the same function using
	 * strcmp().
	 *
	 * The results of comparing the collation keys of two strings
	 * with strcmp() will always be the same as comparing the two
	 * original keys with g_utf8_collate().
	 *
	 * Note that this function depends on the [current locale][setlocale].
	 *
	 * Params:
	 * str = a UTF-8 encoded string.
* len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *
	 * Return: a newly allocated string. This string should
	 * be freed with g_free() when you are done with it.
	 */
	public static string utf8CollateKey(string str, ptrdiff_t len)
	{
		auto key = g_utf8_collate_key(Str.toStringz(str), len);

		// The key is newly allocated by GLib and the caller never sees the
		// raw pointer, so it has to be released here.
		// NOTE(review): relies on Str.toString copying the bytes - confirm.
		scope(exit) g_free(key);

		return Str.toString(key);
	}

	/**
	 * Converts a string into a collation key that can be compared
	 * with other collation keys produced by the same function using strcmp().
	 *
	 * In order to sort filenames correctly, this function treats the dot '.'
	 * as a special case. Most dictionary orderings seem to consider it
	 * insignificant, thus producing the ordering "event.c" "eventgenerator.c"
	 * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we
	 * would like to treat numbers intelligently so that "file1" "file10" "file5"
	 * is sorted as "file1" "file5" "file10".
	 *
	 * Note that this function depends on the [current locale][setlocale].
	 *
	 * Params:
	 *     str = a UTF-8 encoded string.
	 *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *
	 * Return: the collation key as a D string. The buffer allocated by
	 *     GLib is released before returning.
	 *
	 * Since: 2.8
	 */
	public static string utf8CollateKeyForFilename(string str, ptrdiff_t len)
	{
		auto key = g_utf8_collate_key_for_filename(Str.toStringz(str), len);

		// Same ownership situation as utf8CollateKey: free GLib's buffer once
		// the D string has been produced.
		// NOTE(review): relies on Str.toString copying the bytes - confirm.
		scope(exit) g_free(key);

		return Str.toString(key);
	}

	/**
	 * Finds the start of the next UTF-8 character in the string after @p.
	 *
	 * @p does not have to be at the beginning of a UTF-8 character. No check
	 * is made to see if the character found is actually valid other than
	 * it starts with an appropriate byte.
*
	 * Params:
	 *     p = a pointer to a position within a UTF-8 encoded string
	 *     end = a pointer to the byte following the end of the string,
	 *         or %NULL to indicate that the string is nul-terminated
	 *
	 * Return: a pointer to the found character or %NULL
	 */
	public static string utf8FindNextChar(string p, string end)
	{
		// NOTE(review): p and end are converted independently by
		// Str.toStringz - confirm that end still denotes a position
		// relative to p after conversion.
		auto start = Str.toStringz(p);
		auto limit = Str.toStringz(end);
		auto found = g_utf8_find_next_char(start, limit);
		return Str.toString(found);
	}

	/**
	 * Given a position @p within a UTF-8 encoded string @str, finds the
	 * start of the previous UTF-8 character starting before @p. The
	 * result is %NULL if no UTF-8 characters are present in @str
	 * before @p.
	 *
	 * @p does not have to be at the beginning of a UTF-8 character. No
	 * check is made to see if the character found is actually valid
	 * other than it starts with an appropriate byte.
	 *
	 * Params:
	 *     str = pointer to the beginning of a UTF-8 encoded string
	 *     p = pointer to some position within @str
	 *
	 * Return: a pointer to the found character or %NULL.
	 */
	public static string utf8FindPrevChar(string str, string p)
	{
		// NOTE(review): str and p are converted independently by
		// Str.toStringz - confirm that p still denotes a position
		// inside str after conversion.
		auto begin = Str.toStringz(str);
		auto position = Str.toStringz(p);
		auto found = g_utf8_find_prev_char(begin, position);
		return Str.toString(found);
	}

	/**
	 * Decodes a sequence of bytes encoded as UTF-8 into a Unicode
	 * character.
	 *
	 * If @p does not point to a valid UTF-8 encoded character, results
	 * are undefined. If you are not sure that the bytes are complete
	 * valid Unicode characters, you should use
	 * g_utf8_get_char_validated() instead.
	 *
	 * Params:
	 *     p = a pointer to Unicode character encoded as UTF-8
	 *
	 * Return: the resulting character
	 */
	public static dchar utf8GetChar(string p)
	{
		auto bytes = Str.toStringz(p);
		return g_utf8_get_char(bytes);
	}

	/**
	 * Convert a sequence of bytes encoded as UTF-8 to a Unicode character.
* This function checks for incomplete characters, for invalid characters
	 * such as characters that are out of the range of Unicode, and for
	 * overlong encodings of valid characters.
	 *
	 * Params:
	 *     p = a pointer to Unicode character encoded as UTF-8
	 *     maxLen = the maximum number of bytes to read, or -1, for no maximum or
	 *         if @p is nul-terminated
	 *
	 * Return: the resulting character. If @p points to a partial
	 *     sequence at the end of a string that could begin a valid
	 *     character (or if @max_len is zero), returns (gunichar)-2;
	 *     otherwise, if @p does not point to a valid UTF-8 encoded
	 *     Unicode character, returns (gunichar)-1.
	 */
	public static dchar utf8GetCharValidated(string p, ptrdiff_t maxLen)
	{
		auto bytes = Str.toStringz(p);

		return g_utf8_get_char_validated(bytes, maxLen);
	}

	/**
	 * Converts a string into canonical form, standardizing
	 * such issues as whether a character with an accent
	 * is represented as a base character and combining
	 * accent or as a single precomposed character. The
	 * string has to be valid UTF-8, otherwise %NULL is
	 * returned. You should generally call g_utf8_normalize()
	 * before comparing two Unicode strings.
	 *
	 * The normalization mode %G_NORMALIZE_DEFAULT only
	 * standardizes differences that do not affect the
	 * text content, such as the above-mentioned accent
	 * representation. %G_NORMALIZE_ALL also standardizes
	 * the "compatibility" characters in Unicode, such
	 * as SUPERSCRIPT THREE to the standard forms
	 * (in this case DIGIT THREE). Formatting information
	 * may be lost but for most text operations such
	 * characters should be considered the same.
	 *
	 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
	 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
	 * but returned a result with composed forms rather
	 * than a maximally decomposed form.
This is often
	 * useful if you intend to convert the string to
	 * a legacy encoding or pass it to a system with
	 * less capable Unicode handling.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string.
	 *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *     mode = the type of normalization to perform.
	 *
	 * Return: the normalized form of @str as a D string, or the
	 *     conversion of %NULL if @str is not valid UTF-8. The buffer
	 *     allocated by GLib is released by this wrapper.
	 */
	public static string utf8Normalize(string str, ptrdiff_t len, GNormalizeMode mode)
	{
		auto normalized = g_utf8_normalize(Str.toStringz(str), len, mode);

		// The result is newly allocated by GLib and owned by us. Str.toString
		// is relied on to return an independent copy (NOTE(review): confirm in
		// glib.Str), so release the C buffer instead of leaking it. g_free()
		// accepts the null returned for invalid input.
		scope(exit) g_free(normalized);

		return Str.toString(normalized);
	}

	/**
	 * Converts from an integer character offset to a pointer to a position
	 * within the string.
	 *
	 * Since 2.10, this function allows to pass a negative @offset to
	 * step backwards. It is usually worth stepping backwards from the end
	 * instead of forwards if @offset is in the last fourth of the string,
	 * since moving forward is about 3 times faster than moving backward.
	 *
	 * Note that this function doesn't abort when reaching the end of @str.
	 * Therefore you should be sure that @offset is within string boundaries
	 * before calling that function. Call g_utf8_strlen() when unsure.
	 * This limitation exists as this function is called frequently during
	 * text rendering and therefore has to be as fast as possible.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     offset = a character offset within @str
	 *
	 * Return: the resulting pointer
	 */
	public static string utf8OffsetToPointer(string str, glong offset)
	{
		// The result points into the input buffer, so it must not be freed.
		auto target = g_utf8_offset_to_pointer(Str.toStringz(str), offset);

		return Str.toString(target);
	}

	/**
	 * Converts from a pointer to position within a string to a integer
	 * character offset.
*
	 * Since 2.10, this function allows @pos to be before @str, and returns
	 * a negative offset in this case.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     pos = a pointer to a position within @str
	 *
	 * Return: the resulting character offset
	 */
	public static glong utf8PointerToOffset(string str, string pos)
	{
		auto basePtr = Str.toStringz(str);
		auto posPtr = Str.toStringz(pos);

		return g_utf8_pointer_to_offset(basePtr, posPtr);
	}

	/**
	 * Locates the UTF-8 character that comes before @p in the string.
	 *
	 * @p may point into the middle of a character. The character found is
	 * not validated beyond checking that it begins with an appropriate byte.
	 * When @p could be the first character of the string,
	 * g_utf8_find_prev_char() must be used instead.
	 *
	 * Params:
	 *     p = a pointer to a position within a UTF-8 encoded string
	 *
	 * Return: a pointer to the found character
	 */
	public static string utf8PrevChar(string p)
	{
		auto posPtr = Str.toStringz(p);
		auto prevPtr = g_utf8_prev_char(posPtr);

		return Str.toString(prevPtr);
	}

	/**
	 * Searches a UTF-8 encoded string for the leftmost occurrence of a
	 * Unicode character, examining at most @len bytes. A @len of -1
	 * makes the search unbounded.
	 *
	 * Params:
	 *     p = a nul-terminated UTF-8 encoded string
	 *     len = the maximum length of @p
	 *     c = a Unicode character
	 *
	 * Return: %NULL if the string does not contain the character,
	 *     otherwise, a pointer to the start of the leftmost occurrence
	 *     of the character in the string.
	 */
	public static string utf8Strchr(string p, ptrdiff_t len, dchar c)
	{
		auto haystack = Str.toStringz(p);
		auto hit = g_utf8_strchr(haystack, len, c);

		return Str.toString(hit);
	}

	/**
	 * Converts all Unicode characters in the string that have a case
	 * to lowercase.
The exact manner that this is done depends
	 * on the current locale, and may result in the number of
	 * characters in the string changing.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *
	 * Return: a D string with all characters converted to lowercase.
	 *     The buffer allocated by GLib is released by this wrapper.
	 */
	public static string utf8Strdown(string str, ptrdiff_t len)
	{
		auto lowered = g_utf8_strdown(Str.toStringz(str), len);

		// GLib returns a newly allocated buffer that we own. Str.toString is
		// relied on to return an independent copy (NOTE(review): confirm in
		// glib.Str), so free the C buffer here rather than leaking it.
		scope(exit) g_free(lowered);

		return Str.toString(lowered);
	}

	/**
	 * Computes the length of the string in characters, not including
	 * the terminating nul character. If the @max'th byte falls in the
	 * middle of a character, the last (partial) character is not counted.
	 *
	 * Params:
	 *     p = pointer to the start of a UTF-8 encoded string
	 *     max = the maximum number of bytes to examine. If @max
	 *         is less than 0, then the string is assumed to be
	 *         nul-terminated. If @max is 0, @p will not be examined and
	 *         may be %NULL. If @max is greater than 0, up to @max
	 *         bytes are examined
	 *
	 * Return: the length of the string in characters
	 */
	public static glong utf8Strlen(string p, ptrdiff_t max)
	{
		auto text = Str.toStringz(p);

		return g_utf8_strlen(text, max);
	}

	/**
	 * Like the standard C strncpy() function, but copies a given number
	 * of characters instead of a given number of bytes. The @src string
	 * must be valid UTF-8 encoded text. (Use g_utf8_validate() on all
	 * text before trying to use UTF-8 utility functions with it.)
*
	 * Params:
	 *     dest = buffer to fill with characters from @src
	 *     src = UTF-8 encoded string
	 *     n = character count
	 *
	 * Return: @dest
	 */
	public static string utf8Strncpy(string dest, string src, size_t n)
	{
		auto destPtr = Str.toStringz(dest);
		auto srcPtr = Str.toStringz(src);
		auto filled = g_utf8_strncpy(destPtr, srcPtr, n);

		return Str.toString(filled);
	}

	/**
	 * Searches a UTF-8 encoded string for the rightmost occurrence of a
	 * Unicode character, examining at most @len bytes. A @len of -1
	 * makes the search unbounded.
	 *
	 * Params:
	 *     p = a nul-terminated UTF-8 encoded string
	 *     len = the maximum length of @p
	 *     c = a Unicode character
	 *
	 * Return: %NULL if the string does not contain the character,
	 *     otherwise, a pointer to the start of the rightmost occurrence
	 *     of the character in the string.
	 */
	public static string utf8Strrchr(string p, ptrdiff_t len, dchar c)
	{
		auto haystack = Str.toStringz(p);
		auto hit = g_utf8_strrchr(haystack, len, c);

		return Str.toString(hit);
	}

	/**
	 * Reverses a UTF-8 string. @str must be valid UTF-8 encoded text.
	 * (Use g_utf8_validate() on all text before trying to use UTF-8
	 * utility functions with it.)
	 *
	 * This function is intended for programmatic uses of reversed strings.
	 * It pays no attention to decomposed characters, combining marks, byte
	 * order marks, directional indicators (LRM, LRO, etc) and similar
	 * characters which might need special handling when reversing a string
	 * for display purposes.
	 *
	 * Note that unlike g_strreverse(), this function returns
	 * newly-allocated memory, which should be freed with g_free() when
	 * no longer needed.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = the maximum length of @str to use, in bytes. If @len < 0,
	 *         then the string is nul-terminated.
*
	 * Return: a D string which is the reverse of @str. The buffer
	 *     allocated by GLib is released by this wrapper.
	 *
	 * Since: 2.2
	 */
	public static string utf8Strreverse(string str, ptrdiff_t len)
	{
		auto reversed = g_utf8_strreverse(Str.toStringz(str), len);

		// GLib returns newly allocated memory that we own. Str.toString is
		// relied on to return an independent copy (NOTE(review): confirm in
		// glib.Str), so free the C buffer here rather than leaking it.
		scope(exit) g_free(reversed);

		return Str.toString(reversed);
	}

	/**
	 * Converts all Unicode characters in the string that have a case
	 * to uppercase. The exact manner that this is done depends
	 * on the current locale, and may result in the number of
	 * characters in the string increasing. (For instance, the
	 * German ess-zet will be changed to SS.)
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *
	 * Return: a D string with all characters converted to uppercase.
	 *     The buffer allocated by GLib is released by this wrapper.
	 */
	public static string utf8Strup(string str, ptrdiff_t len)
	{
		auto raised = g_utf8_strup(Str.toStringz(str), len);

		// Owned, newly allocated result: release it after conversion
		// (NOTE(review): assumes Str.toString copies -- confirm in glib.Str).
		scope(exit) g_free(raised);

		return Str.toString(raised);
	}

	/**
	 * Copies a substring out of a UTF-8 encoded string.
	 * The substring will contain @end_pos - @start_pos characters.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     startPos = a character offset within @str
	 *     endPos = another character offset within @str
	 *
	 * Return: a D string holding a copy of the requested substring.
	 *     The buffer allocated by GLib is released by this wrapper.
	 *
	 * Since: 2.30
	 */
	public static string utf8Substring(string str, glong startPos, glong endPos)
	{
		auto piece = g_utf8_substring(Str.toStringz(str), startPos, endPos);

		// Owned, newly allocated result: release it after conversion
		// (NOTE(review): assumes Str.toString copies -- confirm in glib.Str).
		scope(exit) g_free(piece);

		return Str.toString(piece);
	}

	/**
	 * Convert a string from UTF-8 to a 32-bit fixed width
	 * representation as UCS-4. A trailing 0 character will be added to the
	 * string after the converted text.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = the maximum length of @str to use, in bytes. If @len < 0,
	 *         then the string is nul-terminated.
*     itemsRead = location to store number of bytes read, or %NULL.
	 *         If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
	 *         returned in case @str contains a trailing partial
	 *         character. If an error occurs then the index of the
	 *         invalid input is stored here.
	 *     itemsWritten = location to store number of characters
	 *         written or %NULL. The value here stored does not include the
	 *         trailing 0 character.
	 *
	 * Return: a pointer to a newly allocated UCS-4 string.
	 *     This value must be freed with g_free(). If an error occurs,
	 *     %NULL will be returned and @error set.
	 *
	 * Throws: GException on failure.
	 */
	public static dchar* utf8ToUcs4(string str, glong len, glong* itemsRead, glong* itemsWritten)
	{
		GError* failure = null;

		auto ucs4 = g_utf8_to_ucs4(Str.toStringz(str), len, itemsRead, itemsWritten, &failure);

		// No error reported: hand the raw buffer to the caller.
		if (failure is null)
		{
			return ucs4;
		}

		throw new GException( new ErrorG(failure) );
	}

	/**
	 * Converts UTF-8 text to a 32-bit fixed width UCS-4 representation
	 * on the assumption that the input is valid UTF-8.
	 * It is roughly twice as fast as g_utf8_to_ucs4() but performs no
	 * error checking on the input. A trailing 0 character is appended
	 * after the converted text.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = the maximum length of @str to use, in bytes. If @len < 0,
	 *         then the string is nul-terminated.
	 *     itemsWritten = location to store the number of
	 *         characters in the result, or %NULL.
	 *
	 * Return: a pointer to a newly allocated UCS-4 string.
	 *     This value must be freed with g_free().
	 */
	public static dchar* utf8ToUcs4Fast(string str, glong len, glong* itemsWritten)
	{
		auto utf8 = Str.toStringz(str);

		return g_utf8_to_ucs4_fast(utf8, len, itemsWritten);
	}

	/**
	 * Convert a string from UTF-8 to UTF-16.
A 0 character will be
	 * added to the result after the converted text.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = the maximum length (number of bytes) of @str to use.
	 *         If @len < 0, then the string is nul-terminated.
	 *     itemsRead = location to store number of bytes read,
	 *         or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
	 *         returned in case @str contains a trailing partial character. If
	 *         an error occurs then the index of the invalid input is stored here.
	 *     itemsWritten = location to store number of #gunichar2
	 *         written, or %NULL. The value stored here does not include the
	 *         trailing 0.
	 *
	 * Return: a pointer to a newly allocated UTF-16 string.
	 *     This value must be freed with g_free(). If an error occurs,
	 *     %NULL will be returned and @error set.
	 *
	 * Throws: GException on failure.
	 */
	public static wchar* utf8ToUtf16(string str, glong len, glong* itemsRead, glong* itemsWritten)
	{
		GError* failure = null;

		auto utf16 = g_utf8_to_utf16(Str.toStringz(str), len, itemsRead, itemsWritten, &failure);

		// No error reported: hand the raw buffer to the caller.
		if (failure is null)
		{
			return utf16;
		}

		throw new GException( new ErrorG(failure) );
	}

	/**
	 * Validates UTF-8 encoded text. @str is the text to validate;
	 * if @str is nul-terminated, then @max_len can be -1, otherwise
	 * @max_len should be the number of bytes to validate.
	 * If @end is non-%NULL, then the end of the valid range
	 * will be stored there (i.e. the start of the first invalid
	 * character if some bytes were invalid, or the end of the text
	 * being validated otherwise).
	 *
	 * Note that g_utf8_validate() returns %FALSE if @max_len is
	 * positive and any of the @max_len bytes are nul.
	 *
	 * Returns %TRUE if all of @str was valid.
Many GLib and GTK+
	 * routines require valid UTF-8 as input; so data read from a file
	 * or the network should be checked with g_utf8_validate() before
	 * doing anything else with it.
	 *
	 * This wrapper takes no length argument: it always asks GLib to
	 * validate exactly str.length bytes.
	 *
	 * Params:
	 *     str = the character data to validate
	 *     end = receives the conversion, via Str.toString, of the
	 *         end-of-valid-data pointer reported by g_utf8_validate()
	 *
	 * Return: %TRUE if the text was valid UTF-8
	 */
	public static bool utf8Validate(string str, out string end)
	{
		char* validEnd = null;

		// The byte count comes from the D slice length, not from a nul search.
		bool isValid = g_utf8_validate(Str.toStringz(str), cast(ptrdiff_t)str.length, &validEnd) != 0;

		end = Str.toString(validEnd);

		return isValid;
	}
}