1 /* 2 * This file is part of gtkD. 3 * 4 * gtkD is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU Lesser General Public License 6 * as published by the Free Software Foundation; either version 3 7 * of the License, or (at your option) any later version, with 8 * some exceptions, please read the COPYING file. 9 * 10 * gtkD is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU Lesser General Public License for more details. 14 * 15 * You should have received a copy of the GNU Lesser General Public License 16 * along with gtkD; if not, write to the Free Software 17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA 18 */ 19 20 // generated automatically - do not change 21 // find conversion definition on APILookup.txt 22 // implement new conversion functionalities on the wrap.utils pakage 23 24 25 module glib.Unicode; 26 27 private import glib.ErrorG; 28 private import glib.GException; 29 private import glib.Str; 30 private import glib.c.functions; 31 public import glib.c.types; 32 public import gtkc.glibtypes; 33 34 35 /** */ 36 public struct Unicode 37 { 38 39 /** 40 * Convert a string from UCS-4 to UTF-16. A 0 character will be 41 * added to the result after the converted text. 42 * 43 * Params: 44 * str = a UCS-4 encoded string 45 * len = the maximum length (number of characters) of @str to use. 46 * If @len < 0, then the string is nul-terminated. 47 * itemsRead = location to store number of 48 * bytes read, or %NULL. If an error occurs then the index of the invalid 49 * input is stored here. 50 * itemsWritten = location to store number 51 * of #gunichar2 written, or %NULL. The value stored here does not include 52 * the trailing 0. 53 * 54 * Returns: a pointer to a newly allocated UTF-16 string. 55 * This value must be freed with g_free(). 
If an error occurs,
	 * %NULL will be returned and @error set.
	 *
	 * Throws: GException on failure.
	 */
	public static wchar* ucs4ToUtf16(dchar* str, glong len, out glong itemsRead, out glong itemsWritten)
	{
		GError* gerr = null;
		auto utf16 = g_ucs4_to_utf16(str, len, &itemsRead, &itemsWritten, &gerr);

		// GLib reports failure through the GError out-parameter; rethrow it
		// as a D exception.
		if (gerr !is null)
			throw new GException(new ErrorG(gerr));

		return utf16;
	}

	/**
	 * Converts a UCS-4 string (32-bit fixed width representation) to UTF-8.
	 * The result is terminated with a 0 byte.
	 *
	 * Params:
	 *     str = a UCS-4 encoded string
	 *     len = the maximum length (number of characters) of @str to use.
	 *         If @len < 0, then the string is nul-terminated.
	 *     itemsRead = receives the number of characters read. If an error
	 *         occurs it is set to the position of the first invalid input
	 *         character.
	 *     itemsWritten = receives the number of bytes written; the trailing
	 *         0 byte is not counted.
	 *
	 * Returns: the converted text as a D string. The UTF-8 buffer returned by
	 *     GLib is handed to Str.freeString() before this function returns.
	 *
	 * Throws: GException on failure.
	 */
	public static string ucs4ToUtf8(dchar* str, glong len, out glong itemsRead, out glong itemsWritten)
	{
		GError* gerr = null;
		auto utf8 = g_ucs4_to_utf8(str, len, &itemsRead, &itemsWritten, &gerr);

		if (gerr !is null)
			throw new GException(new ErrorG(gerr));

		// Release the C buffer once its contents have been converted.
		scope(exit) Str.freeString(utf8);
		return Str.toString(utf8);
	}

	/**
	 * Determines the break type of @c. @c should be a Unicode character
	 * (to derive a character from UTF-8 encoded text, use
	 * g_utf8_get_char()).
The break type is used to find word and line
	 * breaks ("text boundaries"), Pango implements the Unicode boundary
	 * resolution algorithms and normally you would use a function such
	 * as pango_break() instead of caring about break types yourself.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: the break type of @c
	 */
	public static GUnicodeBreakType unicharBreakType(dchar c)
	{
		auto breakType = g_unichar_break_type(c);
		return breakType;
	}

	/**
	 * Looks up the canonical combining class of a Unicode character.
	 *
	 * Params:
	 *     uc = a Unicode character
	 *
	 * Returns: the combining class of the character
	 *
	 * Since: 2.14
	 */
	public static int unicharCombiningClass(dchar uc)
	{
		auto combiningClass = g_unichar_combining_class(uc);
		return combiningClass;
	}

	/**
	 * Performs a single composition step of the Unicode canonical
	 * composition algorithm.
	 *
	 * Algorithmic Hangul Jamo composition is included, but this function is
	 * not exactly the inverse of g_unichar_decompose(). No composition can
	 * have either of @a or @b equal to zero. To be precise, composition
	 * happens if and only if there exists a Primary Composite P which is
	 * canonically equivalent to the sequence <@a,@b>. See the Unicode
	 * Standard for the definition of Primary Composite.
	 *
	 * If @a and @b do not compose a new character, @ch is set to zero.
	 *
	 * See
	 * [UAX#15](http://unicode.org/reports/tr15/)
	 * for details.
	 *
	 * Params:
	 *     a = a Unicode character
	 *     b = a Unicode character
	 *     ch = return location for the composed character
	 *
	 * Returns: %TRUE if the characters could be composed
	 *
	 * Since: 2.30
	 */
	public static bool unicharCompose(dchar a, dchar b, dchar* ch)
	{
		// The C function returns a gboolean; map it onto a D bool.
		const composed = g_unichar_compose(a, b, ch);
		return composed != 0;
	}

	/**
	 * Performs a single decomposition step of the
	 * Unicode canonical decomposition algorithm.
*
	 * This function does not include compatibility
	 * decompositions. It does, however, include algorithmic
	 * Hangul Jamo decomposition, as well as 'singleton'
	 * decompositions which replace a character by a single
	 * other character. In the case of singletons *@b will
	 * be set to zero.
	 *
	 * If @ch is not decomposable, *@a is set to @ch and *@b
	 * is set to zero.
	 *
	 * Note that the way Unicode decomposition pairs are
	 * defined, it is guaranteed that @b would not decompose
	 * further, but @a may itself decompose. To get the full
	 * canonical decomposition for @ch, one would need to
	 * recursively call this function on @a. Or use
	 * g_unichar_fully_decompose().
	 *
	 * See
	 * [UAX#15](http://unicode.org/reports/tr15/)
	 * for details.
	 *
	 * Params:
	 *     ch = a Unicode character
	 *     a = return location for the first component of @ch
	 *     b = return location for the second component of @ch
	 *
	 * Returns: %TRUE if the character could be decomposed
	 *
	 * Since: 2.30
	 */
	public static bool unicharDecompose(dchar ch, dchar* a, dchar* b)
	{
		// The C function returns a gboolean; map it onto a D bool.
		const decomposed = g_unichar_decompose(ch, a, b);
		return decomposed != 0;
	}

	/**
	 * Gives the numeric value of a character interpreted as a decimal digit.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: If @c is a decimal digit (according to
	 *     g_unichar_isdigit()), its numeric value. Otherwise, -1.
	 */
	public static int unicharDigitValue(dchar c)
	{
		auto digit = g_unichar_digit_value(c);
		return digit;
	}

	/**
	 * Computes the canonical or compatibility decomposition of a
	 * Unicode character. For compatibility decomposition,
	 * pass %TRUE for @compat; for canonical decomposition
	 * pass %FALSE for @compat.
	 *
	 * The decomposed sequence is placed in @result. Only up to
	 * @result_len characters are written into @result.
The length
	 * of the full decomposition (irrespective of @result_len) is
	 * returned by the function. For canonical decomposition,
	 * currently all decompositions are of length at most 4, but
	 * this may change in the future (very unlikely though).
	 * At any rate, Unicode does guarantee that a buffer of length
	 * 18 is always enough for both compatibility and canonical
	 * decompositions, so that is the size recommended. This is provided
	 * as %G_UNICHAR_MAX_DECOMPOSITION_LENGTH.
	 *
	 * See
	 * [UAX#15](http://unicode.org/reports/tr15/)
	 * for details.
	 *
	 * Params:
	 *     ch = a Unicode character.
	 *     compat = whether to perform compatibility (%TRUE) or canonical
	 *         (%FALSE) decomposition
	 *     result = location to store decomposed result, or %NULL
	 *     resultLen = length of @result
	 *
	 * Returns: the length of the full decomposition.
	 *
	 * Since: 2.30
	 */
	public static size_t unicharFullyDecompose(dchar ch, bool compat, dchar* result, size_t resultLen)
	{
		auto fullLength = g_unichar_fully_decompose(ch, compat, result, resultLen);
		return fullLength;
	}

	/**
	 * In Unicode, some characters are "mirrored". This means that their
	 * images are mirrored horizontally in text that is laid out from right
	 * to left. For instance, "(" would become its mirror image, ")", in
	 * right-to-left text.
	 *
	 * If @ch has the Unicode mirrored property and there is another unicode
	 * character that typically has a glyph that is the mirror image of @ch's
	 * glyph and @mirrored_ch is set, it puts that character in the address
	 * pointed to by @mirrored_ch. Otherwise the original character is put.
*
	 * Params:
	 *     ch = a Unicode character
	 *     mirroredCh = location to store the mirrored character
	 *
	 * Returns: %TRUE if @ch has a mirrored character, %FALSE otherwise
	 *
	 * Since: 2.4
	 */
	public static bool unicharGetMirrorChar(dchar ch, dchar* mirroredCh)
	{
		// The C function returns a gboolean; map it onto a D bool.
		const hasMirror = g_unichar_get_mirror_char(ch, mirroredCh);
		return hasMirror != 0;
	}

	/**
	 * Looks up the #GUnicodeScript for a particular character (as defined
	 * by Unicode Standard Annex \#24). @ch is not checked for being a valid
	 * Unicode character; if you pass in an invalid character, the result is
	 * undefined.
	 *
	 * This function is equivalent to pango_script_for_unichar() and the
	 * two are interchangeable.
	 *
	 * Params:
	 *     ch = a Unicode character
	 *
	 * Returns: the #GUnicodeScript for the character.
	 *
	 * Since: 2.14
	 */
	public static GUnicodeScript unicharGetScript(dchar ch)
	{
		auto script = g_unichar_get_script(ch);
		return script;
	}

	/**
	 * Tests whether a character is alphanumeric.
	 * Given some UTF-8 text, obtain a character value
	 * with g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: %TRUE if @c is an alphanumeric character
	 */
	public static bool unicharIsalnum(dchar c)
	{
		const alnum = g_unichar_isalnum(c);
		return alnum != 0;
	}

	/**
	 * Tests whether a character is alphabetic (i.e. a letter).
	 * Given some UTF-8 text, obtain a character value with
	 * g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: %TRUE if @c is an alphabetic character
	 */
	public static bool unicharIsalpha(dchar c)
	{
		const alpha = g_unichar_isalpha(c);
		return alpha != 0;
	}

	/**
	 * Determines whether a character is a control character.
	 * Given some UTF-8 text, obtain a character value with
	 * g_utf8_get_char().
*
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: %TRUE if @c is a control character
	 */
	public static bool unicharIscntrl(dchar c)
	{
		// The C predicates return a gboolean; map it onto a D bool.
		const control = g_unichar_iscntrl(c);
		return control != 0;
	}

	/**
	 * Tests whether a given character is assigned in the Unicode standard.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: %TRUE if the character has an assigned value
	 */
	public static bool unicharIsdefined(dchar c)
	{
		const assigned = g_unichar_isdefined(c);
		return assigned != 0;
	}

	/**
	 * Tests whether a character is numeric (i.e. a digit). This covers
	 * ASCII 0-9 and also digits in other languages/scripts. Given some
	 * UTF-8 text, obtain a character value with g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: %TRUE if @c is a digit
	 */
	public static bool unicharIsdigit(dchar c)
	{
		const digit = g_unichar_isdigit(c);
		return digit != 0;
	}

	/**
	 * Tests whether a character is printable and not a space (returns
	 * %FALSE for control characters, format characters, and spaces).
	 * g_unichar_isprint() is similar, but returns %TRUE for spaces.
	 * Given some UTF-8 text, obtain a character value with
	 * g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: %TRUE if @c is printable unless it's a space
	 */
	public static bool unicharIsgraph(dchar c)
	{
		const graphic = g_unichar_isgraph(c);
		return graphic != 0;
	}

	/**
	 * Tests whether a character is a lowercase letter.
	 * Given some UTF-8 text, obtain a character value with
	 * g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: %TRUE if @c is a lowercase letter
	 */
	public static bool unicharIslower(dchar c)
	{
		const lower = g_unichar_islower(c);
		return lower != 0;
	}

	/**
	 * Determines whether a character is a mark (non-spacing mark,
	 * combining mark, or enclosing mark in Unicode speak).
* Given some UTF-8 text, obtain a character value
	 * with g_utf8_get_char().
	 *
	 * Note: in most cases where isalpha characters are allowed,
	 * ismark characters should be allowed too, as they are essential
	 * for writing most European languages as well as many non-Latin
	 * scripts.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: %TRUE if @c is a mark character
	 *
	 * Since: 2.14
	 */
	public static bool unicharIsmark(dchar c)
	{
		// The C predicates return a gboolean; map it onto a D bool.
		const mark = g_unichar_ismark(c);
		return mark != 0;
	}

	/**
	 * Tests whether a character is printable.
	 * Unlike g_unichar_isgraph(), returns %TRUE for spaces.
	 * Given some UTF-8 text, obtain a character value with
	 * g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: %TRUE if @c is printable
	 */
	public static bool unicharIsprint(dchar c)
	{
		const printable = g_unichar_isprint(c);
		return printable != 0;
	}

	/**
	 * Tests whether a character is punctuation or a symbol.
	 * Given some UTF-8 text, obtain a character value with
	 * g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: %TRUE if @c is a punctuation or symbol character
	 */
	public static bool unicharIspunct(dchar c)
	{
		const punctuation = g_unichar_ispunct(c);
		return punctuation != 0;
	}

	/**
	 * Tests whether a character is a space, tab, or line separator
	 * (newline, carriage return, etc.). Given some UTF-8 text, obtain a
	 * character value with g_utf8_get_char().
	 *
	 * (Note: don't use this to do word breaking; you have to use
	 * Pango or equivalent to get word breaking right, the algorithm
	 * is fairly complex.)
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: %TRUE if @c is a space character
	 */
	public static bool unicharIsspace(dchar c)
	{
		const space = g_unichar_isspace(c);
		return space != 0;
	}

	/**
	 * Determines if a character is titlecase.
Some characters in
	 * Unicode which are composites, such as the DZ digraph
	 * have three case variants instead of just two. The titlecase
	 * form is used at the beginning of a word where only the
	 * first letter is capitalized. The titlecase form of the DZ
	 * digraph is U+01F2 LATIN CAPITAL LETTER D WITH SMALL LETTER Z.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: %TRUE if the character is titlecase
	 */
	public static bool unicharIstitle(dchar c)
	{
		// The C predicates return a gboolean; map it onto a D bool.
		const title = g_unichar_istitle(c);
		return title != 0;
	}

	/**
	 * Tests whether a character is uppercase.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: %TRUE if @c is an uppercase character
	 */
	public static bool unicharIsupper(dchar c)
	{
		const upper = g_unichar_isupper(c);
		return upper != 0;
	}

	/**
	 * Tests whether a character is typically rendered in a double-width
	 * cell.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: %TRUE if the character is wide
	 */
	public static bool unicharIswide(dchar c)
	{
		const wide = g_unichar_iswide(c);
		return wide != 0;
	}

	/**
	 * Determines if a character is typically rendered in a double-width
	 * cell under legacy East Asian locales. If a character is wide according to
	 * g_unichar_iswide(), then it is also reported wide with this function, but
	 * the converse is not necessarily true. See the
	 * [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
	 * for details.
	 *
	 * If a character passes the g_unichar_iswide() test then it will also pass
	 * this test, but not the other way around. Note that some characters may
	 * pass both this test and g_unichar_iszerowidth().
*
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: %TRUE if the character is wide in legacy East Asian locales
	 *
	 * Since: 2.12
	 */
	public static bool unicharIswideCjk(dchar c)
	{
		return g_unichar_iswide_cjk(c) != 0;
	}

	/**
	 * Determines if a character is a hexadecimal digit.
	 *
	 * Params:
	 *     c = a Unicode character.
	 *
	 * Returns: %TRUE if the character is a hexadecimal digit
	 */
	public static bool unicharIsxdigit(dchar c)
	{
		return g_unichar_isxdigit(c) != 0;
	}

	/**
	 * Determines if a given character typically takes zero width when rendered.
	 * The return value is %TRUE for all non-spacing and enclosing marks
	 * (e.g., combining accents), format characters, zero-width
	 * space, but not U+00AD SOFT HYPHEN.
	 *
	 * A typical use of this function is with one of g_unichar_iswide() or
	 * g_unichar_iswide_cjk() to determine the number of cells a string occupies
	 * when displayed on a grid display (terminals). However, note that not all
	 * terminals support zero-width rendering of zero-width marks.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: %TRUE if the character has zero width
	 *
	 * Since: 2.14
	 */
	public static bool unicharIszerowidth(dchar c)
	{
		return g_unichar_iszerowidth(c) != 0;
	}

	/**
	 * Converts a single character to UTF-8.
	 *
	 * Params:
	 *     c = a Unicode character code
	 *     outbuf = receives the UTF-8 encoding of @c. Because this is an
	 *         `out` parameter, whatever the caller passed in is discarded on
	 *         entry; a fresh buffer is allocated here and trimmed to the
	 *         number of bytes actually written.
	 *
	 * Returns: number of bytes written
	 */
	public static int unicharToUtf8(dchar c, out char[] outbuf)
	{
		// D resets an `out` array to null on entry, so forwarding outbuf.ptr
		// directly would always hand GLib a NULL buffer: only the length
		// would be computed and nothing would ever be written. GLib asks for
		// at least 6 bytes of space.
		auto buffer = new char[6];
		const written = g_unichar_to_utf8(c, buffer.ptr);

		outbuf = buffer[0 .. written];
		return written;
	}

	/**
	 * Converts a character to lower case.
	 *
	 * Params:
	 *     c = a Unicode character.
	 *
	 * Returns: the result of converting @c to lower case.
* If @c is not an uppercase or titlecase character,
	 * or has no lowercase equivalent @c is returned unchanged.
	 */
	public static dchar unicharTolower(dchar c)
	{
		auto lowered = g_unichar_tolower(c);
		return lowered;
	}

	/**
	 * Converts a character to titlecase.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: the result of converting @c to titlecase.
	 *     If @c is not an uppercase or lowercase character,
	 *     @c is returned unchanged.
	 */
	public static dchar unicharTotitle(dchar c)
	{
		auto titled = g_unichar_totitle(c);
		return titled;
	}

	/**
	 * Converts a character to uppercase.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: the result of converting @c to uppercase.
	 *     If @c is not a lowercase or titlecase character,
	 *     or has no upper case equivalent @c is returned unchanged.
	 */
	public static dchar unicharToupper(dchar c)
	{
		auto raised = g_unichar_toupper(c);
		return raised;
	}

	/**
	 * Classifies a Unicode character by type.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: the type of the character.
	 */
	public static GUnicodeType unicharType(dchar c)
	{
		auto category = g_unichar_type(c);
		return category;
	}

	/**
	 * Checks whether @ch is a valid Unicode character. Some possible
	 * integer values of @ch will not be valid. 0 is considered a valid
	 * character, though it's normally a string terminator.
	 *
	 * Params:
	 *     ch = a Unicode character
	 *
	 * Returns: %TRUE if @ch is a valid Unicode character
	 */
	public static bool unicharValidate(dchar ch)
	{
		// The C function returns a gboolean; map it onto a D bool.
		const valid = g_unichar_validate(ch);
		return valid != 0;
	}

	/**
	 * Determines the numeric value of a character as a hexadecimal
	 * digit.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: If @c is a hex digit (according to
	 * g_unichar_isxdigit()), its numeric value. Otherwise, -1.
*/
	public static int unicharXdigitValue(dchar c)
	{
		auto digit = g_unichar_xdigit_value(c);
		return digit;
	}

	/**
	 * Computes the canonical decomposition of a Unicode character.
	 *
	 * Deprecated: Use the more flexible g_unichar_fully_decompose()
	 *     instead.
	 *
	 * Params:
	 *     ch = a Unicode character.
	 *     resultLen = location to store the length of the return value.
	 *
	 * Returns: a newly allocated string of Unicode characters.
	 *     @result_len is set to the resulting length of the string.
	 */
	public static dchar* unicodeCanonicalDecomposition(dchar ch, size_t* resultLen)
	{
		auto decomposition = g_unicode_canonical_decomposition(ch, resultLen);
		return decomposition;
	}

	/**
	 * Puts a string into canonical order, in place. Decomposed characters
	 * in the string are rearranged according to their combining classes.
	 * See the Unicode manual for more information.
	 *
	 * Params:
	 *     str = a UCS-4 encoded string.
	 *     len = the maximum length of @str to use.
	 */
	public static void unicodeCanonicalOrdering(dchar* str, size_t len)
	{
		// Operates directly on the caller's buffer; there is nothing to return.
		g_unicode_canonical_ordering(str, len);
	}

	/**
	 * Looks up the Unicode script for @iso15924. ISO 15924 assigns four-letter
	 * codes to scripts. For example, the code for Arabic is 'Arab'.
	 * This function accepts four letter codes encoded as a @guint32 in a
	 * big-endian fashion. That is, the code expected for Arabic is
	 * 0x41726162 (0x41 is ASCII code for 'A', 0x72 is ASCII code for 'r', etc).
	 *
	 * See
	 * [Codes for the representation of names of scripts](http://unicode.org/iso15924/codelists.html)
	 * for details.
	 *
	 * Params:
	 *     iso15924 = a Unicode script
	 *
	 * Returns: the Unicode script for @iso15924, or
	 * of %G_UNICODE_SCRIPT_INVALID_CODE if @iso15924 is zero and
	 * %G_UNICODE_SCRIPT_UNKNOWN if @iso15924 is unknown.
*
	 * Since: 2.30
	 */
	public static GUnicodeScript unicodeScriptFromIso15924(uint iso15924)
	{
		auto script = g_unicode_script_from_iso15924(iso15924);
		return script;
	}

	/**
	 * Looks up the ISO 15924 code for @script. ISO 15924 assigns four-letter
	 * codes to scripts; for example, the code for Arabic is 'Arab'. The
	 * four letter code is returned encoded as a @guint32 in a big-endian
	 * fashion. That is, the code returned for Arabic is 0x41726162
	 * (0x41 is ASCII code for 'A', 0x72 is ASCII code for 'r', etc).
	 *
	 * See
	 * [Codes for the representation of names of scripts](http://unicode.org/iso15924/codelists.html)
	 * for details.
	 *
	 * Params:
	 *     script = a Unicode script
	 *
	 * Returns: the ISO 15924 code for @script, encoded as an integer,
	 *     or zero if @script is %G_UNICODE_SCRIPT_INVALID_CODE, or the
	 *     ISO 15924 code 'Zzzz' (script code for UNKNOWN) if @script is not
	 *     understood.
	 *
	 * Since: 2.30
	 */
	public static uint unicodeScriptToIso15924(GUnicodeScript script)
	{
		auto code = g_unicode_script_to_iso15924(script);
		return code;
	}

	/**
	 * Convert a string from UTF-16 to UCS-4. The result will be
	 * nul-terminated.
	 *
	 * Params:
	 *     str = a UTF-16 encoded string
	 *     len = the maximum length (number of #gunichar2) of @str to use.
	 *         If @len < 0, then the string is nul-terminated.
	 *     itemsRead = location to store number of
	 *         words read. If an error occurs then the index of the invalid
	 *         input is stored here.
	 *     itemsWritten = location to store number
	 *         of characters written. The value stored here does not include
	 *         the trailing 0 character.
	 *
	 * Returns: a pointer to a newly allocated UCS-4 string.
	 * This value must be freed with g_free().
If an error occurs,
	 * %NULL will be returned and @error set.
	 *
	 * Throws: GException on failure.
	 */
	public static dchar* utf16ToUcs4(wchar* str, glong len, out glong itemsRead, out glong itemsWritten)
	{
		GError* gerr = null;
		auto ucs4 = g_utf16_to_ucs4(str, len, &itemsRead, &itemsWritten, &gerr);

		// GLib reports failure through the GError out-parameter; rethrow it
		// as a D exception.
		if (gerr !is null)
			throw new GException(new ErrorG(gerr));

		return ucs4;
	}

	/**
	 * Convert a string from UTF-16 to UTF-8. The result will be
	 * terminated with a 0 byte.
	 *
	 * Note that the input is expected to be already in native endianness,
	 * an initial byte-order-mark character is not handled specially.
	 * g_convert() can be used to convert a byte buffer of UTF-16 data of
	 * ambiguous endianness.
	 *
	 * Further note that this function does not validate the result
	 * string; it may e.g. include embedded NUL characters. The only
	 * validation done by this function is to ensure that the input can
	 * be correctly interpreted as UTF-16, i.e. it doesn't contain
	 * things like unpaired surrogates.
	 *
	 * Params:
	 *     str = a UTF-16 encoded string
	 *     len = the maximum length (number of #gunichar2) of @str to use.
	 *         If @len < 0, then the string is nul-terminated.
	 *     itemsRead = location to store number of
	 *         words read. If an error occurs then the index of the invalid
	 *         input is stored here.
	 *     itemsWritten = location to store number
	 *         of bytes written. The value stored here does not include the
	 *         trailing 0 byte.
	 *
	 * Returns: a pointer to a newly allocated UTF-8 string.
	 * This value must be freed with g_free(). If an error occurs,
	 * %NULL will be returned and @error set.
	 *
	 * Throws: GException on failure.
*/
	public static string utf16ToUtf8(wchar* str, glong len, out glong itemsRead, out glong itemsWritten)
	{
		GError* gerr = null;
		auto utf8 = g_utf16_to_utf8(str, len, &itemsRead, &itemsWritten, &gerr);

		if (gerr !is null)
			throw new GException(new ErrorG(gerr));

		// Release the C buffer once its contents have been converted.
		scope(exit) Str.freeString(utf8);
		return Str.toString(utf8);
	}

	/**
	 * Converts a string into a form that is independent of case. The
	 * result does not correspond to any particular case, but it can be
	 * compared for equality or ordered with the results of calling
	 * g_utf8_casefold() on other strings.
	 *
	 * Note that calling g_utf8_casefold() followed by g_utf8_collate() is
	 * only an approximation to the correct linguistic case insensitive
	 * ordering, though it is a fairly good one. Getting this exactly
	 * right would require a more sophisticated collation function that
	 * takes case sensitivity into account. GLib does not currently
	 * provide such a function.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *
	 * Returns: a case independent form of @str, as a D string. The buffer
	 *     returned by GLib is handed to Str.freeString() before returning.
	 */
	public static string utf8Casefold(string str, ptrdiff_t len)
	{
		auto folded = g_utf8_casefold(Str.toStringz(str), len);

		scope(exit) Str.freeString(folded);
		return Str.toString(folded);
	}

	/**
	 * Compares two strings for ordering using the linguistically
	 * correct rules for the [current locale][setlocale].
	 * When sorting a large number of strings, it will be significantly
	 * faster to obtain collation keys with g_utf8_collate_key() and
	 * compare the keys with strcmp() when sorting instead of sorting
	 * the original strings.
*
	 * Params:
	 *     str1 = a UTF-8 encoded string
	 *     str2 = a UTF-8 encoded string
	 *
	 * Returns: < 0 if @str1 compares before @str2,
	 *     0 if they compare equal, > 0 if @str1 compares after @str2.
	 */
	public static int utf8Collate(string str1, string str2)
	{
		auto left = Str.toStringz(str1);
		auto right = Str.toStringz(str2);

		return g_utf8_collate(left, right);
	}

	/**
	 * Builds a collation key for a string. Keys produced by this function
	 * can be compared with each other using strcmp().
	 *
	 * Comparing the collation keys of two strings with strcmp() always
	 * gives the same result as comparing the two original strings with
	 * g_utf8_collate().
	 *
	 * Note that this function depends on the [current locale][setlocale].
	 *
	 * Params:
	 *     str = a UTF-8 encoded string.
	 *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *
	 * Returns: the collation key as a D string. The buffer returned by
	 *     GLib is handed to Str.freeString() before returning.
	 */
	public static string utf8CollateKey(string str, ptrdiff_t len)
	{
		auto key = g_utf8_collate_key(Str.toStringz(str), len);

		scope(exit) Str.freeString(key);
		return Str.toString(key);
	}

	/**
	 * Converts a string into a collation key that can be compared
	 * with other collation keys produced by the same function using strcmp().
	 *
	 * In order to sort filenames correctly, this function treats the dot '.'
	 * as a special case. Most dictionary orderings seem to consider it
	 * insignificant, thus producing the ordering "event.c" "eventgenerator.c"
	 * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we
	 * would like to treat numbers intelligently so that "file1" "file10" "file5"
	 * is sorted as "file1" "file5" "file10".
	 *
	 * Note that this function depends on the [current locale][setlocale].
	 *
	 * Params:
	 * str = a UTF-8 encoded string.
* len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *
	 * Returns: a newly allocated string. This string should
	 * be freed with g_free() when you are done with it.
	 *
	 * Since: 2.8
	 */
	public static string utf8CollateKeyForFilename(string str, ptrdiff_t len)
	{
		auto retStr = g_utf8_collate_key_for_filename(Str.toStringz(str), len);

		scope(exit) Str.freeString(retStr);
		return Str.toString(retStr);
	}

	/**
	 * Finds the start of the next UTF-8 character in the string after @p.
	 *
	 * @p does not have to be at the beginning of a UTF-8 character. No check
	 * is made to see if the character found is actually valid other than
	 * it starts with an appropriate byte.
	 *
	 * If @end is %NULL, the return value will never be %NULL: if the end of the
	 * string is reached, a pointer to the terminating nul byte is returned. If
	 * @end is non-%NULL, the return value will be %NULL if the end of the string
	 * is reached.
	 *
	 * Params:
	 *     p = a position within a UTF-8 encoded string
	 *     end = the position following the end of the string, or null to
	 *         indicate that the string is nul-terminated. To be meaningful
	 *         this should be a slice of @p (for example `p[n .. $]`), because
	 *         GLib compares it with @p as a raw pointer.
	 *
	 * Returns: the text starting at the found character, copied into a D
	 *     string, or null if @end is set and is reached
	 */
	public static string utf8FindNextChar(string p, string end)
	{
		auto start = Str.toStringz(p);
		auto limit = Str.toStringz(end);

		// GLib compares the two pointers directly, so the end marker has to
		// point into the same buffer as the start. When `end` is a slice of
		// `p`, translate its offset into the zero-terminated buffer that is
		// handed to GLib; otherwise keep the independently converted pointer,
		// as before.
		// NOTE(review): assumes Str.toStringz yields the same bytes as `p`
		// followed by a terminating 0 - confirm against glib.Str.
		if (start !is null && end !is null && end.ptr >= p.ptr && end.ptr <= p.ptr + p.length)
			limit = start + (end.ptr - p.ptr);

		auto next = g_utf8_find_next_char(start, limit);

		// The result points into the input buffer rather than at a fresh
		// GLib allocation, so it must not be passed to Str.freeString();
		// just copy the text out.
		return Str.toString(next);
	}

	/**
	 * Given a position @p with a UTF-8 encoded string @str, find the start
	 * of the previous UTF-8 character starting before @p. Returns %NULL if no
	 * UTF-8 characters are present in @str before @p.
	 *
	 * @p does not have to be at the beginning of a UTF-8 character. No check
	 * is made to see if the character found is actually valid other than
	 * it starts with an appropriate byte.
*
	 * Params:
	 *     str = pointer to the beginning of a UTF-8 encoded string
	 *     p = pointer to some position within @str
	 *
	 * Returns: a pointer to the found character or %NULL.
	 */
	public static string utf8FindPrevChar(string str, string p)
	{
		// g_utf8_find_prev_char() returns a position inside the buffer it
		// was given (or %NULL), not a separate allocation, so the result
		// must not be passed to Str.freeString().
		//
		// NOTE(review): str and p are converted by two separate
		// Str.toStringz() calls; if those yield independent buffers, the
		// converted p is not a position within the converted str -
		// confirm against glib.Str.
		auto found = g_utf8_find_prev_char(Str.toStringz(str), Str.toStringz(p));

		return Str.toString(found);
	}

	/**
	 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
	 *
	 * If @p does not point to a valid UTF-8 encoded character, results
	 * are undefined. If you are not sure that the bytes are complete
	 * valid Unicode characters, you should use g_utf8_get_char_validated()
	 * instead.
	 *
	 * Params:
	 *     p = a pointer to Unicode character encoded as UTF-8
	 *
	 * Returns: the resulting character
	 */
	public static dchar utf8GetChar(string p)
	{
		return g_utf8_get_char(Str.toStringz(p));
	}

	/**
	 * Convert a sequence of bytes encoded as UTF-8 to a Unicode character.
	 * This function checks for incomplete characters, for invalid characters
	 * such as characters that are out of the range of Unicode, and for
	 * overlong encodings of valid characters.
	 *
	 * Note that g_utf8_get_char_validated() returns (gunichar)-2 if
	 * @max_len is positive and any of the bytes in the first UTF-8 character
	 * sequence are nul.
	 *
	 * Params:
	 *     p = a pointer to Unicode character encoded as UTF-8
	 *     maxLen = the maximum number of bytes to read, or -1 if @p is nul-terminated
	 *
	 * Returns: the resulting character. If @p points to a partial
	 *     sequence at the end of a string that could begin a valid
	 *     character (or if @max_len is zero), returns (gunichar)-2;
	 *     otherwise, if @p does not point to a valid UTF-8 encoded
	 *     Unicode character, returns (gunichar)-1.
*/
	public static dchar utf8GetCharValidated(string p, ptrdiff_t maxLen)
	{
		auto text = Str.toStringz(p);

		return g_utf8_get_char_validated(text, maxLen);
	}

	/**
	 * Converts a string into canonical form, standardizing
	 * such issues as whether a character with an accent
	 * is represented as a base character and combining
	 * accent or as a single precomposed character. The
	 * string has to be valid UTF-8, otherwise %NULL is
	 * returned. You should generally call g_utf8_normalize()
	 * before comparing two Unicode strings.
	 *
	 * The normalization mode %G_NORMALIZE_DEFAULT only
	 * standardizes differences that do not affect the
	 * text content, such as the above-mentioned accent
	 * representation. %G_NORMALIZE_ALL also standardizes
	 * the "compatibility" characters in Unicode, such
	 * as SUPERSCRIPT THREE to the standard forms
	 * (in this case DIGIT THREE). Formatting information
	 * may be lost but for most text operations such
	 * characters should be considered the same.
	 *
	 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
	 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
	 * but return a result with composed forms rather
	 * than a maximally decomposed form. This is often
	 * useful if you intend to convert the string to
	 * a legacy encoding or pass it to a system with
	 * less capable Unicode handling.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string.
	 *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *     mode = the type of normalization to perform.
	 *
	 * Returns: a newly allocated string, that is the
	 *     normalized form of @str, or %NULL if @str is not
	 *     valid UTF-8.
*/
	public static string utf8Normalize(string str, ptrdiff_t len, GNormalizeMode mode)
	{
		auto retStr = g_utf8_normalize(Str.toStringz(str), len, mode);

		// The normalized form is a fresh allocation made by GLib, so it is
		// released once it has been converted.
		scope(exit) Str.freeString(retStr);
		return Str.toString(retStr);
	}

	/**
	 * Converts from an integer character offset to a pointer to a position
	 * within the string.
	 *
	 * Since 2.10, this function allows to pass a negative @offset to
	 * step backwards. It is usually worth stepping backwards from the end
	 * instead of forwards if @offset is in the last fourth of the string,
	 * since moving forward is about 3 times faster than moving backward.
	 *
	 * Note that this function doesn't abort when reaching the end of @str.
	 * Therefore you should be sure that @offset is within string boundaries
	 * before calling that function. Call g_utf8_strlen() when unsure.
	 * This limitation exists as this function is called frequently during
	 * text rendering and therefore has to be as fast as possible.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     offset = a character offset within @str
	 *
	 * Returns: the text of @str starting at the requested character
	 *     offset, converted with Str.toString()
	 */
	public static string utf8OffsetToPointer(string str, glong offset)
	{
		// g_utf8_offset_to_pointer() returns a position inside the buffer
		// it was given, not a separate allocation, so the result must not
		// be passed to Str.freeString().
		auto target = g_utf8_offset_to_pointer(Str.toStringz(str), offset);

		return Str.toString(target);
	}

	/**
	 * Converts from a pointer to position within a string to a integer
	 * character offset.
	 *
	 * Since 2.10, this function allows @pos to be before @str, and returns
	 * a negative offset in this case.
*
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     pos = a pointer to a position within @str
	 *
	 * Returns: the resulting character offset
	 */
	public static glong utf8PointerToOffset(string str, string pos)
	{
		// NOTE(review): str and pos are converted by two separate
		// Str.toStringz() calls; if those yield independent buffers, the
		// converted pos is not a position within the converted str and the
		// computed offset is not meaningful - confirm against glib.Str.
		return g_utf8_pointer_to_offset(Str.toStringz(str), Str.toStringz(pos));
	}

	/**
	 * Finds the previous UTF-8 character in the string before @p.
	 *
	 * @p does not have to be at the beginning of a UTF-8 character. No check
	 * is made to see if the character found is actually valid other than
	 * it starts with an appropriate byte. If @p might be the first
	 * character of the string, you must use g_utf8_find_prev_char() instead.
	 *
	 * Params:
	 *     p = a pointer to a position within a UTF-8 encoded string
	 *
	 * Returns: a pointer to the found character
	 */
	public static string utf8PrevChar(string p)
	{
		// g_utf8_prev_char() returns a position relative to the pointer it
		// was given, not a separate allocation, so the result must not be
		// passed to Str.freeString().
		//
		// NOTE(review): the pointer handed to GLib is the start of the
		// buffer produced by Str.toStringz(p), so the search presumably
		// steps in front of that buffer - confirm whether this wrapper can
		// be used safely at all.
		auto found = g_utf8_prev_char(Str.toStringz(p));

		return Str.toString(found);
	}

	/**
	 * Finds the leftmost occurrence of the given Unicode character
	 * in a UTF-8 encoded string, while limiting the search to @len bytes.
	 * If @len is -1, allow unbounded search.
	 *
	 * Params:
	 *     p = a nul-terminated UTF-8 encoded string
	 *     len = the maximum length of @p
	 *     c = a Unicode character
	 *
	 * Returns: %NULL if the string does not contain the character,
	 *     otherwise, a pointer to the start of the leftmost occurrence
	 *     of the character in the string.
	 */
	public static string utf8Strchr(string p, ptrdiff_t len, dchar c)
	{
		// g_utf8_strchr() returns a position inside the searched buffer
		// (or %NULL), not a separate allocation, so the result must not be
		// passed to Str.freeString().
		auto found = g_utf8_strchr(Str.toStringz(p), len, c);

		return Str.toString(found);
	}

	/**
	 * Converts all Unicode characters in the string that have a case
	 * to lowercase.
* The exact manner that this is done depends
	 * on the current locale, and may result in the number of
	 * characters in the string changing.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *
	 * Returns: a newly allocated string, with all characters
	 *     converted to lowercase.
	 */
	public static string utf8Strdown(string str, ptrdiff_t len)
	{
		auto lowered = g_utf8_strdown(Str.toStringz(str), len);

		// Release the C buffer once it has been converted, even if the
		// conversion throws.
		try
		{
			return Str.toString(lowered);
		}
		finally
		{
			Str.freeString(lowered);
		}
	}

	/**
	 * Counts the characters in a string, excluding the terminating
	 * nul character. If the @max'th byte falls in the middle of a
	 * character, the last (partial) character is not counted.
	 *
	 * Params:
	 *     p = pointer to the start of a UTF-8 encoded string
	 *     max = the maximum number of bytes to examine. If @max
	 *         is less than 0, then the string is assumed to be
	 *         nul-terminated. If @max is 0, @p will not be examined and
	 *         may be %NULL. If @max is greater than 0, up to @max
	 *         bytes are examined
	 *
	 * Returns: the length of the string in characters
	 */
	public static glong utf8Strlen(string p, ptrdiff_t max)
	{
		auto text = Str.toStringz(p);

		return g_utf8_strlen(text, max);
	}

	/**
	 * Like the standard C strncpy() function, but copies a given number
	 * of characters instead of a given number of bytes. The @src string
	 * must be valid UTF-8 encoded text. (Use g_utf8_validate() on all
	 * text before trying to use UTF-8 utility functions with it.)
*
	 * Params:
	 *     dest = buffer to fill with characters from @src
	 *     src = UTF-8 encoded string
	 *     n = character count
	 *
	 * Returns: @dest
	 */
	public static string utf8Strncpy(string dest, string src, size_t n)
	{
		// g_utf8_strncpy() returns the destination pointer it was given,
		// i.e. the buffer produced by Str.toStringz(dest). It is not an
		// allocation handed over by GLib, so it must not be passed to
		// Str.freeString().
		//
		// NOTE(review): the characters are written into the buffer
		// returned by Str.toStringz(dest), so dest presumably has to be at
		// least as long (in bytes) as the copied prefix of src - confirm.
		auto filled = g_utf8_strncpy(Str.toStringz(dest), Str.toStringz(src), n);

		return Str.toString(filled);
	}

	/**
	 * Find the rightmost occurrence of the given Unicode character
	 * in a UTF-8 encoded string, while limiting the search to @len bytes.
	 * If @len is -1, allow unbounded search.
	 *
	 * Params:
	 *     p = a nul-terminated UTF-8 encoded string
	 *     len = the maximum length of @p
	 *     c = a Unicode character
	 *
	 * Returns: %NULL if the string does not contain the character,
	 *     otherwise, a pointer to the start of the rightmost occurrence
	 *     of the character in the string.
	 */
	public static string utf8Strrchr(string p, ptrdiff_t len, dchar c)
	{
		// g_utf8_strrchr() returns a position inside the searched buffer
		// (or %NULL), not a separate allocation, so the result must not be
		// passed to Str.freeString().
		auto found = g_utf8_strrchr(Str.toStringz(p), len, c);

		return Str.toString(found);
	}

	/**
	 * Reverses a UTF-8 string. @str must be valid UTF-8 encoded text.
	 * (Use g_utf8_validate() on all text before trying to use UTF-8
	 * utility functions with it.)
	 *
	 * This function is intended for programmatic uses of reversed strings.
	 * It pays no attention to decomposed characters, combining marks, byte
	 * order marks, directional indicators (LRM, LRO, etc) and similar
	 * characters which might need special handling when reversing a string
	 * for display purposes.
	 *
	 * Note that unlike g_strreverse(), this function returns
	 * newly-allocated memory, which should be freed with g_free() when
	 * no longer needed.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = the maximum length of @str to use, in bytes.
* If @len < 0,
	 *         then the string is nul-terminated.
	 *
	 * Returns: a newly-allocated string which is the reverse of @str
	 *
	 * Since: 2.2
	 */
	public static string utf8Strreverse(string str, ptrdiff_t len)
	{
		auto reversed = g_utf8_strreverse(Str.toStringz(str), len);

		// Release the C buffer once it has been converted, even if the
		// conversion throws.
		try
		{
			return Str.toString(reversed);
		}
		finally
		{
			Str.freeString(reversed);
		}
	}

	/**
	 * Converts every Unicode character in the string that has a case
	 * to uppercase. The exact manner that this is done depends
	 * on the current locale, and may result in the number of
	 * characters in the string increasing. (For instance, the
	 * German ess-zet will be changed to SS.)
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *
	 * Returns: a newly allocated string, with all characters
	 *     converted to uppercase.
	 */
	public static string utf8Strup(string str, ptrdiff_t len)
	{
		auto raised = g_utf8_strup(Str.toStringz(str), len);

		// Release the C buffer once it has been converted, even if the
		// conversion throws.
		try
		{
			return Str.toString(raised);
		}
		finally
		{
			Str.freeString(raised);
		}
	}

	/**
	 * Copies a substring out of a UTF-8 encoded string.
	 * The substring will contain @end_pos - @start_pos characters.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     startPos = a character offset within @str
	 *     endPos = another character offset within @str
	 *
	 * Returns: a newly allocated copy of the requested
	 *     substring. Free with g_free() when no longer needed.
	 *
	 * Since: 2.30
	 */
	public static string utf8Substring(string str, glong startPos, glong endPos)
	{
		auto piece = g_utf8_substring(Str.toStringz(str), startPos, endPos);

		// Release the C buffer once it has been converted, even if the
		// conversion throws.
		try
		{
			return Str.toString(piece);
		}
		finally
		{
			Str.freeString(piece);
		}
	}

	/**
	 * Convert a string from UTF-8 to a 32-bit fixed width
	 * representation as UCS-4.
* A trailing 0 character will be added to the
	 * string after the converted text.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = the maximum length of @str to use, in bytes. If @len < 0,
	 *         then the string is nul-terminated.
	 *     itemsRead = location to store number of
	 *         bytes read, or %NULL.
	 *         If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
	 *         returned in case @str contains a trailing partial
	 *         character. If an error occurs then the index of the
	 *         invalid input is stored here.
	 *     itemsWritten = location to store number
	 *         of characters written or %NULL. The value here stored does not include
	 *         the trailing 0 character.
	 *
	 * Returns: a pointer to a newly allocated UCS-4 string.
	 *     This value must be freed with g_free(). If an error occurs,
	 *     %NULL will be returned and @error set.
	 *
	 * Throws: GException on failure.
	 */
	public static dchar* utf8ToUcs4(string str, glong len, out glong itemsRead, out glong itemsWritten)
	{
		GError* failure = null;

		auto converted = g_utf8_to_ucs4(Str.toStringz(str), len, &itemsRead, &itemsWritten, &failure);

		// No error reported: hand the converted buffer straight back.
		if (failure is null)
		{
			return converted;
		}

		throw new GException( new ErrorG(failure) );
	}

	/**
	 * Convert a string from UTF-8 to a 32-bit fixed width
	 * representation as UCS-4, assuming valid UTF-8 input.
	 * This function is roughly twice as fast as g_utf8_to_ucs4()
	 * but does no error checking on the input. A trailing 0 character
	 * will be added to the string after the converted text.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = the maximum length of @str to use, in bytes. If @len < 0,
	 *         then the string is nul-terminated.
	 *     itemsWritten = location to store the
	 *         number of characters in the result, or %NULL.
	 *
	 * Returns: a pointer to a newly allocated UCS-4 string.
	 *     This value must be freed with g_free().
*/
	public static dchar* utf8ToUcs4Fast(string str, glong len, out glong itemsWritten)
	{
		auto text = Str.toStringz(str);

		return g_utf8_to_ucs4_fast(text, len, &itemsWritten);
	}

	/**
	 * Convert a string from UTF-8 to UTF-16. A 0 character will be
	 * added to the result after the converted text.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = the maximum length (number of bytes) of @str to use.
	 *         If @len < 0, then the string is nul-terminated.
	 *     itemsRead = location to store number of
	 *         bytes read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
	 *         be returned in case @str contains a trailing partial character. If
	 *         an error occurs then the index of the invalid input is stored here.
	 *     itemsWritten = location to store number
	 *         of #gunichar2 written, or %NULL. The value stored here does not include
	 *         the trailing 0.
	 *
	 * Returns: a pointer to a newly allocated UTF-16 string.
	 *     This value must be freed with g_free(). If an error occurs,
	 *     %NULL will be returned and @error set.
	 *
	 * Throws: GException on failure.
	 */
	public static wchar* utf8ToUtf16(string str, glong len, out glong itemsRead, out glong itemsWritten)
	{
		GError* failure = null;

		auto converted = g_utf8_to_utf16(Str.toStringz(str), len, &itemsRead, &itemsWritten, &failure);

		// No error reported: hand the converted buffer straight back.
		if (failure is null)
		{
			return converted;
		}

		throw new GException( new ErrorG(failure) );
	}

	/**
	 * Validates UTF-8 encoded text. @str is the text to validate;
	 * if @str is nul-terminated, then @max_len can be -1, otherwise
	 * @max_len should be the number of bytes to validate.
	 * If @end is non-%NULL, then the end of the valid range
	 * will be stored there (i.e. the start of the first invalid
	 * character if some bytes were invalid, or the end of the text
	 * being validated otherwise).
*
	 * Note that g_utf8_validate() returns %FALSE if @max_len is
	 * positive and any of the @max_len bytes are nul.
	 *
	 * Returns %TRUE if all of @str was valid. Many GLib and GTK+
	 * routines require valid UTF-8 as input; so data read from a file
	 * or the network should be checked with g_utf8_validate() before
	 * doing anything else with it.
	 *
	 * This wrapper always passes the byte length of @str as @max_len.
	 *
	 * Params:
	 *     str = a pointer to character data
	 *     end = return location for end of valid data
	 *
	 * Returns: %TRUE if the text was valid UTF-8
	 */
	public static bool utf8Validate(string str, out string end)
	{
		char* validEnd = null;
		auto byteCount = cast(ptrdiff_t)str.length;

		// g_utf8_validate() reports a C boolean; anything non-zero is success.
		bool isValid = g_utf8_validate(Str.toStringz(str), byteCount, &validEnd) != 0;

		end = Str.toString(validEnd);

		return isValid;
	}

	/**
	 * If the provided string is valid UTF-8, return a copy of it. If not,
	 * return a copy in which bytes that could not be interpreted as valid Unicode
	 * are replaced with the Unicode replacement character (U+FFFD).
	 *
	 * For example, this is an appropriate function to use if you have received
	 * a string that was incorrectly declared to be UTF-8, and you need a valid
	 * UTF-8 version of it that can be logged or displayed to the user, with the
	 * assumption that it is close enough to ASCII or UTF-8 to be mostly
	 * readable as-is.
	 *
	 * Params:
	 *     str = string to coerce into UTF-8
	 *     len = the maximum length of @str to use, in bytes. If @len < 0,
	 *         then the string is nul-terminated.
	 *
	 * Returns: a valid UTF-8 string whose content resembles @str
	 *
	 * Since: 2.52
	 */
	public static string utf8MakeValid(string str, ptrdiff_t len)
	{
		auto repaired = g_utf8_make_valid(Str.toStringz(str), len);

		// Release the C buffer once it has been converted, even if the
		// conversion throws.
		try
		{
			return Str.toString(repaired);
		}
		finally
		{
			Str.freeString(repaired);
		}
	}
}