1 /* 2 * This file is part of gtkD. 3 * 4 * gtkD is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU Lesser General Public License 6 * as published by the Free Software Foundation; either version 3 7 * of the License, or (at your option) any later version, with 8 * some exceptions, please read the COPYING file. 9 * 10 * gtkD is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU Lesser General Public License for more details. 14 * 15 * You should have received a copy of the GNU Lesser General Public License 16 * along with gtkD; if not, write to the Free Software 17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA 18 */ 19 20 // generated automatically - do not change 21 // find conversion definition on APILookup.txt 22 // implement new conversion functionalities on the wrap.utils pakage 23 24 25 module glib.Unicode; 26 27 private import glib.ErrorG; 28 private import glib.GException; 29 private import glib.Str; 30 private import gtkc.glib; 31 public import gtkc.glibtypes; 32 33 34 /** */ 35 public struct Unicode 36 { 37 38 /** 39 * Convert a string from UCS-4 to UTF-16. A 0 character will be 40 * added to the result after the converted text. 41 * 42 * Params: 43 * str = a UCS-4 encoded string 44 * len = the maximum length (number of characters) of @str to use. 45 * If @len < 0, then the string is nul-terminated. 46 * itemsRead = location to store number of 47 * bytes read, or %NULL. If an error occurs then the index of the invalid 48 * input is stored here. 49 * itemsWritten = location to store number 50 * of #gunichar2 written, or %NULL. The value stored here does not include 51 * the trailing 0. 52 * 53 * Returns: a pointer to a newly allocated UTF-16 string. 54 * This value must be freed with g_free(). If an error occurs, 55 * %NULL will be returned and @error set. 
*
     * Throws: GException on failure.
     */
    public static wchar* ucs4ToUtf16(dchar* str, glong len, out glong itemsRead, out glong itemsWritten)
    {
        GError* failure = null;

        // The addresses of the two `out` counters are handed to GLib to fill in.
        auto utf16 = g_ucs4_to_utf16(str, len, &itemsRead, &itemsWritten, &failure);

        if (failure is null)
        {
            return utf16;
        }

        throw new GException( new ErrorG(failure) );
    }

    /**
     * Converts UCS-4 text (a 32-bit fixed width representation) to UTF-8.
     * The converted text is terminated with a 0 byte.
     *
     * Params:
     *     str = a UCS-4 encoded string
     *     len = the maximum length (number of characters) of @str to use.
     *         If @len < 0, then the string is nul-terminated.
     *     itemsRead = receives the number of characters read. If an error
     *         occurs it is set to the position of the first invalid input
     *         character.
     *     itemsWritten = receives the number of bytes written, not counting
     *         the trailing 0 byte.
     *
     * Returns: the result of g_ucs4_to_utf8() converted with Str.toString().
     *     The buffer returned by GLib is released with Str.freeString()
     *     before this function returns.
     *
     * Throws: GException on failure.
     */
    public static string ucs4ToUtf8(dchar* str, glong len, out glong itemsRead, out glong itemsWritten)
    {
        GError* failure = null;

        auto utf8 = g_ucs4_to_utf8(str, len, &itemsRead, &itemsWritten, &failure);

        if (failure !is null)
        {
            throw new GException( new ErrorG(failure) );
        }

        try
        {
            return Str.toString(utf8);
        }
        finally
        {
            // Runs whether or not the conversion above throws.
            Str.freeString(utf8);
        }
    }

    /**
     * Determines the break type of @c. @c should be a Unicode character
     * (to derive a character from UTF-8 encoded text, use
     * g_utf8_get_char()).
* The break type is used to find word and line
     * breaks ("text boundaries"), Pango implements the Unicode boundary
     * resolution algorithms and normally you would use a function such
     * as pango_break() instead of caring about break types yourself.
     *
     * Params:
     *     c = a Unicode character
     *
     * Returns: the break type of @c
     */
    public static GUnicodeBreakType unicharBreakType(dchar c)
    {
        auto breakType = g_unichar_break_type(c);
        return breakType;
    }

    /**
     * Looks up the canonical combining class of a Unicode character.
     *
     * Params:
     *     uc = a Unicode character
     *
     * Returns: the combining class of @uc
     *
     * Since: 2.14
     */
    public static int unicharCombiningClass(dchar uc)
    {
        auto combiningClass = g_unichar_combining_class(uc);
        return combiningClass;
    }

    /**
     * Runs one composition step of the Unicode canonical composition
     * algorithm on the pair <@a,@b>.
     *
     * Algorithmic Hangul Jamo composition is included, but this is not
     * exactly the inverse of g_unichar_decompose(). No composition can
     * have either of @a or @b equal to zero. To be precise, the pair is
     * composed if and only if there exists a Primary Composite P which is
     * canonically equivalent to the sequence <@a,@b>. See the Unicode
     * Standard for the definition of Primary Composite.
     *
     * If @a and @b do not compose a new character, @ch is set to zero.
     *
     * See
     * [UAX#15](http://unicode.org/reports/tr15/)
     * for details.
     *
     * Params:
     *     a = a Unicode character
     *     b = a Unicode character
     *     ch = return location for the composed character
     *
     * Returns: %TRUE if the characters could be composed
     *
     * Since: 2.30
     */
    public static bool unicharCompose(dchar a, dchar b, dchar* ch)
    {
        auto composed = g_unichar_compose(a, b, ch);
        return composed != 0;
    }

    /**
     * Performs a single decomposition step of the
     * Unicode canonical decomposition algorithm.
*
     * This function does not include compatibility
     * decompositions. It does, however, include algorithmic
     * Hangul Jamo decomposition, as well as 'singleton'
     * decompositions which replace a character by a single
     * other character. In the case of singletons *@b will
     * be set to zero.
     *
     * If @ch is not decomposable, *@a is set to @ch and *@b
     * is set to zero.
     *
     * Note that the way Unicode decomposition pairs are
     * defined, it is guaranteed that @b would not decompose
     * further, but @a may itself decompose. To get the full
     * canonical decomposition for @ch, one would need to
     * recursively call this function on @a. Or use
     * g_unichar_fully_decompose().
     *
     * See
     * [UAX#15](http://unicode.org/reports/tr15/)
     * for details.
     *
     * Params:
     *     ch = a Unicode character
     *     a = return location for the first component of @ch
     *     b = return location for the second component of @ch
     *
     * Returns: %TRUE if the character could be decomposed
     *
     * Since: 2.30
     */
    public static bool unicharDecompose(dchar ch, dchar* a, dchar* b)
    {
        auto decomposed = g_unichar_decompose(ch, a, b);
        return decomposed != 0;
    }

    /**
     * Gives the numeric value of a character interpreted as a decimal digit.
     *
     * Params:
     *     c = a Unicode character
     *
     * Returns: the numeric value of @c if it is a decimal digit (according
     *     to g_unichar_isdigit()). Otherwise, -1.
     */
    public static int unicharDigitValue(dchar c)
    {
        auto digit = g_unichar_digit_value(c);
        return digit;
    }

    /**
     * Computes the canonical or compatibility decomposition of a
     * Unicode character. For compatibility decomposition,
     * pass %TRUE for @compat; for canonical decomposition
     * pass %FALSE for @compat.
     *
     * The decomposed sequence is placed in @result. Only up to
     * @result_len characters are written into @result.
* The length
     * of the full decomposition (irrespective of @result_len) is
     * returned by the function. For canonical decomposition,
     * currently all decompositions are of length at most 4, but
     * this may change in the future (very unlikely though).
     * At any rate, Unicode does guarantee that a buffer of length
     * 18 is always enough for both compatibility and canonical
     * decompositions, so that is the size recommended. This is provided
     * as %G_UNICHAR_MAX_DECOMPOSITION_LENGTH.
     *
     * See
     * [UAX#15](http://unicode.org/reports/tr15/)
     * for details.
     *
     * Params:
     *     ch = a Unicode character.
     *     compat = whether to perform compatibility (%TRUE) or canonical
     *         (%FALSE) decomposition
     *     result = location to store decomposed result, or %NULL
     *     resultLen = length of @result
     *
     * Returns: the length of the full decomposition.
     *
     * Since: 2.30
     */
    public static size_t unicharFullyDecompose(dchar ch, bool compat, dchar* result, size_t resultLen)
    {
        auto fullLength = g_unichar_fully_decompose(ch, compat, result, resultLen);
        return fullLength;
    }

    /**
     * In Unicode, some characters are "mirrored". This means that their
     * images are mirrored horizontally in text that is laid out from right
     * to left. For instance, "(" would become its mirror image, ")", in
     * right-to-left text.
     *
     * If @ch has the Unicode mirrored property and there is another unicode
     * character that typically has a glyph that is the mirror image of @ch's
     * glyph and @mirrored_ch is set, it puts that character in the address
     * pointed to by @mirrored_ch. Otherwise the original character is put.
*
     * Params:
     *     ch = a Unicode character
     *     mirroredCh = location to store the mirrored character
     *
     * Returns: %TRUE if @ch has a mirrored character, %FALSE otherwise
     *
     * Since: 2.4
     */
    public static bool unicharGetMirrorChar(dchar ch, dchar* mirroredCh)
    {
        auto hasMirror = g_unichar_get_mirror_char(ch, mirroredCh);
        return hasMirror != 0;
    }

    /**
     * Finds the #GUnicodeScript of a character (as defined by Unicode
     * Standard Annex \#24). @ch is not checked for being a valid Unicode
     * character; if you pass in an invalid character, the result is
     * undefined.
     *
     * This function is equivalent to pango_script_for_unichar() and the
     * two are interchangeable.
     *
     * Params:
     *     ch = a Unicode character
     *
     * Returns: the #GUnicodeScript for the character.
     *
     * Since: 2.14
     */
    public static GUnicodeScript unicharGetScript(dchar ch)
    {
        auto script = g_unichar_get_script(ch);
        return script;
    }

    /**
     * Tests whether a character is alphanumeric.
     * Given some UTF-8 text, obtain a character value
     * with g_utf8_get_char().
     *
     * Params:
     *     c = a Unicode character
     *
     * Returns: %TRUE if @c is an alphanumeric character
     */
    public static bool unicharIsalnum(dchar c)
    {
        auto alphanumeric = g_unichar_isalnum(c);
        return alphanumeric != 0;
    }

    /**
     * Tests whether a character is alphabetic (i.e. a letter).
     * Given some UTF-8 text, obtain a character value with
     * g_utf8_get_char().
     *
     * Params:
     *     c = a Unicode character
     *
     * Returns: %TRUE if @c is an alphabetic character
     */
    public static bool unicharIsalpha(dchar c)
    {
        auto alphabetic = g_unichar_isalpha(c);
        return alphabetic != 0;
    }

    /**
     * Determines whether a character is a control character.
     * Given some UTF-8 text, obtain a character value with
     * g_utf8_get_char().
*
     * Params:
     *     c = a Unicode character
     *
     * Returns: %TRUE if @c is a control character
     */
    public static bool unicharIscntrl(dchar c)
    {
        auto control = g_unichar_iscntrl(c);
        return control != 0;
    }

    /**
     * Tests whether a given character is assigned in the Unicode standard.
     *
     * Params:
     *     c = a Unicode character
     *
     * Returns: %TRUE if the character has an assigned value
     */
    public static bool unicharIsdefined(dchar c)
    {
        auto assigned = g_unichar_isdefined(c);
        return assigned != 0;
    }

    /**
     * Tests whether a character is numeric (i.e. a digit). This covers
     * ASCII 0-9 and also digits in other languages/scripts. Given some
     * UTF-8 text, obtain a character value with g_utf8_get_char().
     *
     * Params:
     *     c = a Unicode character
     *
     * Returns: %TRUE if @c is a digit
     */
    public static bool unicharIsdigit(dchar c)
    {
        auto digit = g_unichar_isdigit(c);
        return digit != 0;
    }

    /**
     * Tests whether a character is printable and not a space (the result
     * is %FALSE for control characters, format characters, and spaces).
     * g_unichar_isprint() is similar, but returns %TRUE for spaces.
     * Given some UTF-8 text, obtain a character value with
     * g_utf8_get_char().
     *
     * Params:
     *     c = a Unicode character
     *
     * Returns: %TRUE if @c is printable unless it's a space
     */
    public static bool unicharIsgraph(dchar c)
    {
        auto graphic = g_unichar_isgraph(c);
        return graphic != 0;
    }

    /**
     * Tests whether a character is a lowercase letter.
     * Given some UTF-8 text, obtain a character value with
     * g_utf8_get_char().
     *
     * Params:
     *     c = a Unicode character
     *
     * Returns: %TRUE if @c is a lowercase letter
     */
    public static bool unicharIslower(dchar c)
    {
        auto lowercase = g_unichar_islower(c);
        return lowercase != 0;
    }

    /**
     * Determines whether a character is a mark (non-spacing mark,
     * combining mark, or enclosing mark in Unicode speak).
* Given some UTF-8 text, obtain a character value
     * with g_utf8_get_char().
     *
     * Note: in most cases where isalpha characters are allowed,
     * ismark characters should be allowed too as they are essential
     * for writing most European languages as well as many non-Latin
     * scripts.
     *
     * Params:
     *     c = a Unicode character
     *
     * Returns: %TRUE if @c is a mark character
     *
     * Since: 2.14
     */
    public static bool unicharIsmark(dchar c)
    {
        auto mark = g_unichar_ismark(c);
        return mark != 0;
    }

    /**
     * Tests whether a character is printable.
     * Unlike g_unichar_isgraph(), the result is %TRUE for spaces.
     * Given some UTF-8 text, obtain a character value with
     * g_utf8_get_char().
     *
     * Params:
     *     c = a Unicode character
     *
     * Returns: %TRUE if @c is printable
     */
    public static bool unicharIsprint(dchar c)
    {
        auto printable = g_unichar_isprint(c);
        return printable != 0;
    }

    /**
     * Tests whether a character is punctuation or a symbol.
     * Given some UTF-8 text, obtain a character value with
     * g_utf8_get_char().
     *
     * Params:
     *     c = a Unicode character
     *
     * Returns: %TRUE if @c is a punctuation or symbol character
     */
    public static bool unicharIspunct(dchar c)
    {
        auto punctuation = g_unichar_ispunct(c);
        return punctuation != 0;
    }

    /**
     * Tests whether a character is a space, tab, or line separator
     * (newline, carriage return, etc.). Given some UTF-8 text, obtain a
     * character value with g_utf8_get_char().
     *
     * (Note: don't use this to do word breaking; you have to use
     * Pango or equivalent to get word breaking right, the algorithm
     * is fairly complex.)
     *
     * Params:
     *     c = a Unicode character
     *
     * Returns: %TRUE if @c is a space character
     */
    public static bool unicharIsspace(dchar c)
    {
        auto space = g_unichar_isspace(c);
        return space != 0;
    }

    /**
     * Determines if a character is titlecase.
* Some characters in
     * Unicode which are composites, such as the DZ digraph
     * have three case variants instead of just two. The titlecase
     * form is used at the beginning of a word where only the
     * first letter is capitalized. The titlecase form of the DZ
     * digraph is U+01F2 LATIN CAPITAL LETTER D WITH SMALL LETTER Z.
     *
     * Params:
     *     c = a Unicode character
     *
     * Returns: %TRUE if the character is titlecase
     */
    public static bool unicharIstitle(dchar c)
    {
        auto titlecase = g_unichar_istitle(c);
        return titlecase != 0;
    }

    /**
     * Tests whether a character is uppercase.
     *
     * Params:
     *     c = a Unicode character
     *
     * Returns: %TRUE if @c is an uppercase character
     */
    public static bool unicharIsupper(dchar c)
    {
        auto uppercase = g_unichar_isupper(c);
        return uppercase != 0;
    }

    /**
     * Tests whether a character is typically rendered in a double-width
     * cell.
     *
     * Params:
     *     c = a Unicode character
     *
     * Returns: %TRUE if the character is wide
     */
    public static bool unicharIswide(dchar c)
    {
        auto wide = g_unichar_iswide(c);
        return wide != 0;
    }

    /**
     * Determines if a character is typically rendered in a double-width
     * cell under legacy East Asian locales. If a character is wide according to
     * g_unichar_iswide(), then it is also reported wide with this function, but
     * the converse is not necessarily true. See the
     * [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
     * for details.
     *
     * If a character passes the g_unichar_iswide() test then it will also pass
     * this test, but not the other way around. Note that some characters may
     * pass both this test and g_unichar_iszerowidth().
*
     * Params:
     *     c = a Unicode character
     *
     * Returns: %TRUE if the character is wide in legacy East Asian locales
     *
     * Since: 2.12
     */
    public static bool unicharIswideCjk(dchar c)
    {
        return g_unichar_iswide_cjk(c) != 0;
    }

    /**
     * Determines if a character is a hexadecimal digit.
     *
     * Params:
     *     c = a Unicode character.
     *
     * Returns: %TRUE if the character is a hexadecimal digit
     */
    public static bool unicharIsxdigit(dchar c)
    {
        return g_unichar_isxdigit(c) != 0;
    }

    /**
     * Determines if a given character typically takes zero width when rendered.
     * The return value is %TRUE for all non-spacing and enclosing marks
     * (e.g., combining accents), format characters, zero-width
     * space, but not U+00AD SOFT HYPHEN.
     *
     * A typical use of this function is with one of g_unichar_iswide() or
     * g_unichar_iswide_cjk() to determine the number of cells a string occupies
     * when displayed on a grid display (terminals). However, note that not all
     * terminals support zero-width rendering of zero-width marks.
     *
     * Params:
     *     c = a Unicode character
     *
     * Returns: %TRUE if the character has zero width
     *
     * Since: 2.14
     */
    public static bool unicharIszerowidth(dchar c)
    {
        return g_unichar_iszerowidth(c) != 0;
    }

    /**
     * Converts a single character to UTF-8.
     *
     * Params:
     *     c = a Unicode character code
     *     outbuf = receives the UTF-8 encoding of @c. The array is allocated
     *         by this function and its length equals the return value.
     *
     * Returns: number of bytes written
     */
    public static int unicharToUtf8(dchar c, out char[] outbuf)
    {
        // An `out` parameter is reset to its initial value (an empty array
        // whose pointer is null) on entry. Without allocating storage here,
        // GLib would receive NULL, only compute the length, and the caller
        // would never see the encoded bytes. GLib requires at least 6 bytes.
        outbuf = new char[6];

        auto written = g_unichar_to_utf8(c, outbuf.ptr);

        // Trim the buffer down to the bytes that were actually produced.
        outbuf = outbuf[0 .. written];

        return written;
    }

    /**
     * Converts a character to lower case.
     *
     * Params:
     *     c = a Unicode character.
     *
     * Returns: the result of converting @c to lower case.
* If @c is not an uppercase or titlecase character,
     * or has no lowercase equivalent @c is returned unchanged.
     */
    public static dchar unicharTolower(dchar c)
    {
        auto lowered = g_unichar_tolower(c);
        return lowered;
    }

    /**
     * Maps a character to its titlecase form.
     *
     * Params:
     *     c = a Unicode character
     *
     * Returns: the result of converting @c to titlecase.
     *     If @c is not an uppercase or lowercase character,
     *     @c is returned unchanged.
     */
    public static dchar unicharTotitle(dchar c)
    {
        auto titled = g_unichar_totitle(c);
        return titled;
    }

    /**
     * Maps a character to its uppercase form.
     *
     * Params:
     *     c = a Unicode character
     *
     * Returns: the result of converting @c to uppercase.
     *     If @c is not a lowercase or titlecase character,
     *     or has no upper case equivalent @c is returned unchanged.
     */
    public static dchar unicharToupper(dchar c)
    {
        auto raised = g_unichar_toupper(c);
        return raised;
    }

    /**
     * Reports the Unicode type classification of a character.
     *
     * Params:
     *     c = a Unicode character
     *
     * Returns: the type of the character.
     */
    public static GUnicodeType unicharType(dchar c)
    {
        auto kind = g_unichar_type(c);
        return kind;
    }

    /**
     * Tests whether @ch is a valid Unicode character. Some possible
     * integer values of @ch will not be valid. 0 is considered a valid
     * character, though it's normally a string terminator.
     *
     * Params:
     *     ch = a Unicode character
     *
     * Returns: %TRUE if @ch is a valid Unicode character
     */
    public static bool unicharValidate(dchar ch)
    {
        auto valid = g_unichar_validate(ch);
        return valid != 0;
    }

    /**
     * Determines the numeric value of a character as a hexadecimal
     * digit.
     *
     * Params:
     *     c = a Unicode character
     *
     * Returns: If @c is a hex digit (according to
     * g_unichar_isxdigit()), its numeric value. Otherwise, -1.
*/
    public static int unicharXdigitValue(dchar c)
    {
        auto hexValue = g_unichar_xdigit_value(c);
        return hexValue;
    }

    /**
     * Produces the canonical decomposition of a Unicode character.
     *
     * Deprecated: Use the more flexible g_unichar_fully_decompose()
     *     instead.
     *
     * Params:
     *     ch = a Unicode character.
     *     resultLen = location to store the length of the return value.
     *
     * Returns: a newly allocated string of Unicode characters.
     *     @result_len is set to the resulting length of the string.
     */
    public static dchar* unicodeCanonicalDecomposition(dchar ch, size_t* resultLen)
    {
        auto decomposition = g_unicode_canonical_decomposition(ch, resultLen);
        return decomposition;
    }

    /**
     * Puts a string into canonical order, in place.
     * Decomposed characters in the string are rearranged according to
     * their combining classes. See the Unicode manual for more
     * information.
     *
     * Params:
     *     str = a UCS-4 encoded string.
     *     len = the maximum length of @str to use.
     */
    public static void unicodeCanonicalOrdering(dchar* str, size_t len)
    {
        // Pure pass-through: the reordering happens inside the caller's buffer.
        g_unicode_canonical_ordering(str, len);
    }

    /**
     * Looks up the Unicode script for @iso15924. ISO 15924 assigns four-letter
     * codes to scripts. For example, the code for Arabic is 'Arab'.
     * This function accepts four letter codes encoded as a @guint32 in a
     * big-endian fashion. That is, the code expected for Arabic is
     * 0x41726162 (0x41 is ASCII code for 'A', 0x72 is ASCII code for 'r', etc).
     *
     * See
     * [Codes for the representation of names of scripts](http://unicode.org/iso15924/codelists.html)
     * for details.
     *
     * Params:
     *     iso15924 = a Unicode script
     *
     * Returns: the Unicode script for @iso15924, or
     * %G_UNICODE_SCRIPT_INVALID_CODE if @iso15924 is zero and
     * %G_UNICODE_SCRIPT_UNKNOWN if @iso15924 is unknown.
*
     * Since: 2.30
     */
    public static GUnicodeScript unicodeScriptFromIso15924(uint iso15924)
    {
        auto script = g_unicode_script_from_iso15924(iso15924);
        return script;
    }

    /**
     * Gives the ISO 15924 code for @script. ISO 15924 assigns four-letter
     * codes to scripts. For example, the code for Arabic is 'Arab'. The
     * four letter codes are encoded as a @guint32 by this function in a
     * big-endian fashion. That is, the code returned for Arabic is
     * 0x41726162 (0x41 is ASCII code for 'A', 0x72 is ASCII code for 'r', etc).
     *
     * See
     * [Codes for the representation of names of scripts](http://unicode.org/iso15924/codelists.html)
     * for details.
     *
     * Params:
     *     script = a Unicode script
     *
     * Returns: the ISO 15924 code for @script, encoded as an integer,
     *     or zero if @script is %G_UNICODE_SCRIPT_INVALID_CODE or
     *     ISO 15924 code 'Zzzz' (script code for UNKNOWN) if @script is
     *     not understood.
     *
     * Since: 2.30
     */
    public static uint unicodeScriptToIso15924(GUnicodeScript script)
    {
        auto isoCode = g_unicode_script_to_iso15924(script);
        return isoCode;
    }

    /**
     * Convert a string from UTF-16 to UCS-4. The result will be
     * nul-terminated.
     *
     * Params:
     *     str = a UTF-16 encoded string
     *     len = the maximum length (number of #gunichar2) of @str to use.
     *         If @len < 0, then the string is nul-terminated.
     *     itemsRead = location to store number of
     *         words read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
     *         be returned in case @str contains a trailing partial character. If
     *         an error occurs then the index of the invalid input is stored here.
     *     itemsWritten = location to store number
     *         of characters written, or %NULL. The value stored here does not include
     *         the trailing 0 character.
     *
     * Returns: a pointer to a newly allocated UCS-4 string.
     *     This value must be freed with g_free().
* If an error occurs,
     *     %NULL will be returned and @error set.
     *
     * Throws: GException on failure.
     */
    public static dchar* utf16ToUcs4(wchar* str, glong len, out glong itemsRead, out glong itemsWritten)
    {
        GError* failure = null;

        // The addresses of the two `out` counters are handed to GLib to fill in.
        auto ucs4 = g_utf16_to_ucs4(str, len, &itemsRead, &itemsWritten, &failure);

        if (failure is null)
        {
            return ucs4;
        }

        throw new GException( new ErrorG(failure) );
    }

    /**
     * Convert a string from UTF-16 to UTF-8. The result will be
     * terminated with a 0 byte.
     *
     * Note that the input is expected to be already in native endianness,
     * an initial byte-order-mark character is not handled specially.
     * g_convert() can be used to convert a byte buffer of UTF-16 data of
     * ambiguous endianness.
     *
     * Further note that this function does not validate the result
     * string; it may e.g. include embedded NUL characters. The only
     * validation done by this function is to ensure that the input can
     * be correctly interpreted as UTF-16, i.e. it doesn't contain
     * things like unpaired surrogates.
     *
     * Params:
     *     str = a UTF-16 encoded string
     *     len = the maximum length (number of #gunichar2) of @str to use.
     *         If @len < 0, then the string is nul-terminated.
     *     itemsRead = location to store number of
     *         words read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
     *         be returned in case @str contains a trailing partial character. If
     *         an error occurs then the index of the invalid input is stored here.
     *     itemsWritten = location to store number
     *         of bytes written, or %NULL. The value stored here does not include the
     *         trailing 0 byte.
     *
     * Returns: a pointer to a newly allocated UTF-8 string.
     *     This value must be freed with g_free(). If an error occurs,
     *     %NULL will be returned and @error set.
     *
     * Throws: GException on failure.
*/
    public static string utf16ToUtf8(wchar* str, glong len, out glong itemsRead, out glong itemsWritten)
    {
        GError* failure = null;

        auto utf8 = g_utf16_to_utf8(str, len, &itemsRead, &itemsWritten, &failure);

        if (failure !is null)
        {
            throw new GException( new ErrorG(failure) );
        }

        try
        {
            return Str.toString(utf8);
        }
        finally
        {
            // Runs whether or not the conversion above throws.
            Str.freeString(utf8);
        }
    }

    /**
     * Produces a form of a string that is independent of case. The
     * result does not correspond to any particular case, but can be
     * compared for equality or ordered with the results of calling
     * g_utf8_casefold() on other strings.
     *
     * Note that calling g_utf8_casefold() followed by g_utf8_collate() is
     * only an approximation to the correct linguistic case insensitive
     * ordering, though it is a fairly good one. Getting this exactly
     * right would require a more sophisticated collation function that
     * takes case sensitivity into account. GLib does not currently
     * provide such a function.
     *
     * Params:
     *     str = a UTF-8 encoded string
     *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
     *
     * Returns: the case independent form of @str, converted with
     *     Str.toString(). The buffer returned by GLib is released with
     *     Str.freeString() before this function returns.
     */
    public static string utf8Casefold(string str, ptrdiff_t len)
    {
        auto folded = g_utf8_casefold(Str.toStringz(str), len);

        try
        {
            return Str.toString(folded);
        }
        finally
        {
            Str.freeString(folded);
        }
    }

    /**
     * Compares two strings for ordering using the linguistically
     * correct rules for the [current locale][setlocale].
     * When sorting a large number of strings, it will be significantly
     * faster to obtain collation keys with g_utf8_collate_key() and
     * compare the keys with strcmp() when sorting instead of sorting
     * the original strings.
*
     * Params:
     *     str1 = a UTF-8 encoded string
     *     str2 = a UTF-8 encoded string
     *
     * Returns: < 0 if @str1 compares before @str2,
     *     0 if they compare equal, > 0 if @str1 compares after @str2.
     */
    public static int utf8Collate(string str1, string str2)
    {
        // Converted in the same left-to-right order as the original call.
        auto first = Str.toStringz(str1);
        auto second = Str.toStringz(str2);

        return g_utf8_collate(first, second);
    }

    /**
     * Builds a collation key for a string. Keys produced by this
     * function can be compared with each other using strcmp().
     *
     * The results of comparing the collation keys of two strings
     * with strcmp() will always be the same as comparing the two
     * original strings with g_utf8_collate().
     *
     * Note that this function depends on the [current locale][setlocale].
     *
     * Params:
     *     str = a UTF-8 encoded string.
     *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
     *
     * Returns: the collation key, converted with Str.toString(). The
     *     buffer returned by GLib is released with Str.freeString()
     *     before this function returns.
     */
    public static string utf8CollateKey(string str, ptrdiff_t len)
    {
        auto key = g_utf8_collate_key(Str.toStringz(str), len);

        try
        {
            return Str.toString(key);
        }
        finally
        {
            Str.freeString(key);
        }
    }

    /**
     * Converts a string into a collation key that can be compared
     * with other collation keys produced by the same function using strcmp().
     *
     * In order to sort filenames correctly, this function treats the dot '.'
     * as a special case. Most dictionary orderings seem to consider it
     * insignificant, thus producing the ordering "event.c" "eventgenerator.c"
     * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we
     * would like to treat numbers intelligently so that "file1" "file10" "file5"
     * is sorted as "file1" "file5" "file10".
     *
     * Note that this function depends on the [current locale][setlocale].
     *
     * Params:
     *     str = a UTF-8 encoded string.
*     len = length of @str, in bytes, or -1 if @str is nul-terminated.
     *
     * Returns: a newly allocated string. This string should
     *     be freed with g_free() when you are done with it.
     *
     * Since: 2.8
     */
    public static string utf8CollateKeyForFilename(string str, ptrdiff_t len)
    {
        auto retStr = g_utf8_collate_key_for_filename(Str.toStringz(str), len);

        scope(exit) Str.freeString(retStr);
        return Str.toString(retStr);
    }

    /**
     * Finds the start of the next UTF-8 character in the string after @p.
     *
     * @p does not have to be at the beginning of a UTF-8 character. No check
     * is made to see if the character found is actually valid other than
     * it starts with an appropriate byte.
     *
     * Params:
     *     p = a pointer to a position within a UTF-8 encoded string
     *     end = a pointer to the byte following the end of the string,
     *         or %NULL to indicate that the string is nul-terminated
     *
     * Returns: the text starting at the found character, converted with
     *     Str.toString(), or the conversion of %NULL when no character
     *     was found
     */
    public static string utf8FindNextChar(string p, string end)
    {
        // NOTE(review): `p` and `end` are converted independently, so `end`
        // presumably does not point into the same buffer as `p` - confirm
        // against Str.toStringz() before relying on a non-null `end`.
        auto found = g_utf8_find_next_char(Str.toStringz(p), Str.toStringz(end));

        // The result points at an existing character inside the searched
        // text (or is null). It is not a new allocation, so it must not be
        // handed to Str.freeString().
        return Str.toString(found);
    }

    /**
     * Given a position @p with a UTF-8 encoded string @str, find the start
     * of the previous UTF-8 character starting before @p. Returns %NULL if no
     * UTF-8 characters are present in @str before @p.
     *
     * @p does not have to be at the beginning of a UTF-8 character. No check
     * is made to see if the character found is actually valid other than
     * it starts with an appropriate byte.
     *
     * Params:
     *     str = pointer to the beginning of a UTF-8 encoded string
     *     p = pointer to some position within @str
     *
     * Returns: a pointer to the found character or %NULL.
*/
	public static string utf8FindPrevChar(string str, string p)
	{
		// g_utf8_find_prev_char() returns a position inside @str rather than
		// a new allocation, so the result is only converted, never freed.
		// NOTE(review): @str and @p go through separate Str.toStringz() calls;
		// confirm that @p still refers to a position inside @str.
		return Str.toString(g_utf8_find_prev_char(Str.toStringz(str), Str.toStringz(p)));
	}

	/**
	 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
	 *
	 * If @p does not point to a valid UTF-8 encoded character, results
	 * are undefined. If you are not sure that the bytes are complete
	 * valid Unicode characters, you should use g_utf8_get_char_validated()
	 * instead.
	 *
	 * Params:
	 *     p = a pointer to Unicode character encoded as UTF-8
	 *
	 * Returns: the resulting character
	 */
	public static dchar utf8GetChar(string p)
	{
		return g_utf8_get_char(Str.toStringz(p));
	}

	/**
	 * Convert a sequence of bytes encoded as UTF-8 to a Unicode character.
	 * This function checks for incomplete characters, for invalid characters
	 * such as characters that are out of the range of Unicode, and for
	 * overlong encodings of valid characters.
	 *
	 * Params:
	 *     p = a pointer to Unicode character encoded as UTF-8
	 *     maxLen = the maximum number of bytes to read, or -1 if @p is nul-terminated
	 *
	 * Returns: the resulting character. If @p points to a partial
	 *     sequence at the end of a string that could begin a valid
	 *     character (or if @max_len is zero), returns (gunichar)-2;
	 *     otherwise, if @p does not point to a valid UTF-8 encoded
	 *     Unicode character, returns (gunichar)-1.
	 */
	public static dchar utf8GetCharValidated(string p, ptrdiff_t maxLen)
	{
		return g_utf8_get_char_validated(Str.toStringz(p), maxLen);
	}

	/**
	 * Converts a string into canonical form, standardizing
	 * such issues as whether a character with an accent
	 * is represented as a base character and combining
	 * accent or as a single precomposed character.
* The string has to be valid UTF-8, otherwise %NULL is
	 * returned. You should generally call g_utf8_normalize()
	 * before comparing two Unicode strings.
	 *
	 * The normalization mode %G_NORMALIZE_DEFAULT only
	 * standardizes differences that do not affect the
	 * text content, such as the above-mentioned accent
	 * representation. %G_NORMALIZE_ALL also standardizes
	 * the "compatibility" characters in Unicode, such
	 * as SUPERSCRIPT THREE to the standard forms
	 * (in this case DIGIT THREE). Formatting information
	 * may be lost but for most text operations such
	 * characters should be considered the same.
	 *
	 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
	 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
	 * but return a result with composed forms rather
	 * than a maximally decomposed form. This is often
	 * useful if you intend to convert the string to
	 * a legacy encoding or pass it to a system with
	 * less capable Unicode handling.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string.
	 *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *     mode = the type of normalization to perform.
	 *
	 * Returns: a newly allocated string, that is the
	 *     normalized form of @str, or %NULL if @str is not
	 *     valid UTF-8.
	 */
	public static string utf8Normalize(string str, ptrdiff_t len, GNormalizeMode mode)
	{
		auto normalized = g_utf8_normalize(Str.toStringz(str), len, mode);
		// Hand the GLib buffer back once the conversion below has run.
		scope(exit) Str.freeString(normalized);

		return Str.toString(normalized);
	}

	/**
	 * Converts from an integer character offset to a pointer to a position
	 * within the string.
	 *
	 * Since 2.10, this function allows to pass a negative @offset to
	 * step backwards.
* It is usually worth stepping backwards from the end
	 * instead of forwards if @offset is in the last fourth of the string,
	 * since moving forward is about 3 times faster than moving backward.
	 *
	 * Note that this function doesn't abort when reaching the end of @str.
	 * Therefore you should be sure that @offset is within string boundaries
	 * before calling that function. Call g_utf8_strlen() when unsure.
	 * This limitation exists as this function is called frequently during
	 * text rendering and therefore has to be as fast as possible.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     offset = a character offset within @str
	 *
	 * Returns: the resulting position, converted with Str.toString()
	 */
	public static string utf8OffsetToPointer(string str, glong offset)
	{
		// g_utf8_offset_to_pointer() returns a position inside @str, not a
		// new allocation, so the result is converted but never freed.
		return Str.toString(g_utf8_offset_to_pointer(Str.toStringz(str), offset));
	}

	/**
	 * Converts from a pointer to position within a string to a integer
	 * character offset.
	 *
	 * Since 2.10, this function allows @pos to be before @str, and returns
	 * a negative offset in this case.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     pos = a pointer to a position within @str
	 *
	 * Returns: the resulting character offset
	 */
	public static glong utf8PointerToOffset(string str, string pos)
	{
		// NOTE(review): @str and @pos go through separate Str.toStringz() calls;
		// confirm that @pos still refers to a position inside @str.
		return g_utf8_pointer_to_offset(Str.toStringz(str), Str.toStringz(pos));
	}

	/**
	 * Finds the previous UTF-8 character in the string before @p.
	 *
	 * @p does not have to be at the beginning of a UTF-8 character. No check
	 * is made to see if the character found is actually valid other than
	 * it starts with an appropriate byte. If @p might be the first
	 * character of the string, you must use g_utf8_find_prev_char() instead.
*
	 * Params:
	 *     p = a pointer to a position within a UTF-8 encoded string
	 *
	 * Returns: a pointer to the found character
	 */
	public static string utf8PrevChar(string p)
	{
		// g_utf8_prev_char() returns a position relative to its argument and
		// allocates nothing, so the result must not be freed.
		// NOTE(review): the argument is the start of the converted @p, so the
		// C function looks before the beginning of that buffer; confirm
		// whether callers can use this wrapper safely at all.
		return Str.toString(g_utf8_prev_char(Str.toStringz(p)));
	}

	/**
	 * Finds the leftmost occurrence of the given Unicode character
	 * in a UTF-8 encoded string, while limiting the search to @len bytes.
	 * If @len is -1, allow unbounded search.
	 *
	 * Params:
	 *     p = a nul-terminated UTF-8 encoded string
	 *     len = the maximum length of @p
	 *     c = a Unicode character
	 *
	 * Returns: the result of g_utf8_strchr() converted with Str.toString().
	 *     The C function yields %NULL if the string does not contain the
	 *     character, otherwise the start of its leftmost occurrence.
	 */
	public static string utf8Strchr(string p, ptrdiff_t len, dchar c)
	{
		// The match is a position inside @p, not a new allocation, so it is
		// converted without being freed.
		return Str.toString(g_utf8_strchr(Str.toStringz(p), len, c));
	}

	/**
	 * Converts all Unicode characters in the string that have a case
	 * to lowercase. The exact manner that this is done depends
	 * on the current locale, and may result in the number of
	 * characters in the string changing.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *
	 * Returns: a newly allocated string, with all characters
	 *     converted to lowercase.
	 */
	public static string utf8Strdown(string str, ptrdiff_t len)
	{
		auto retStr = g_utf8_strdown(Str.toStringz(str), len);

		// The lowercased text is a fresh GLib allocation; release it after conversion.
		scope(exit) Str.freeString(retStr);
		return Str.toString(retStr);
	}

	/**
	 * Computes the length of the string in characters, not including
	 * the terminating nul character.
* If the @max'th byte falls in the
	 * middle of a character, the last (partial) character is not counted.
	 *
	 * Params:
	 *     p = pointer to the start of a UTF-8 encoded string
	 *     max = the maximum number of bytes to examine. If @max
	 *         is less than 0, then the string is assumed to be
	 *         nul-terminated. If @max is 0, @p will not be examined and
	 *         may be %NULL. If @max is greater than 0, up to @max
	 *         bytes are examined
	 *
	 * Returns: the length of the string in characters
	 */
	public static glong utf8Strlen(string p, ptrdiff_t max)
	{
		return g_utf8_strlen(Str.toStringz(p), max);
	}

	/**
	 * Like the standard C strncpy() function, but copies a given number
	 * of characters instead of a given number of bytes. The @src string
	 * must be valid UTF-8 encoded text. (Use g_utf8_validate() on all
	 * text before trying to use UTF-8 utility functions with it.)
	 *
	 * Params:
	 *     dest = buffer to fill with characters from @src
	 *     src = UTF-8 encoded string
	 *     n = character count
	 *
	 * Returns: the pointer returned by g_utf8_strncpy() (documented
	 *     as @dest), converted with Str.toString()
	 */
	public static string utf8Strncpy(string dest, string src, size_t n)
	{
		// g_utf8_strncpy() hands back the destination buffer it was given;
		// it allocates nothing, so the result must not be freed here.
		// NOTE(review): the copy is written into the buffer produced by
		// Str.toStringz(dest); confirm that buffer is writable and large enough.
		return Str.toString(g_utf8_strncpy(Str.toStringz(dest), Str.toStringz(src), n));
	}

	/**
	 * Find the rightmost occurrence of the given Unicode character
	 * in a UTF-8 encoded string, while limiting the search to @len bytes.
	 * If @len is -1, allow unbounded search.
	 *
	 * Params:
	 *     p = a nul-terminated UTF-8 encoded string
	 *     len = the maximum length of @p
	 *     c = a Unicode character
	 *
	 * Returns: %NULL if the string does not contain the character,
	 *     otherwise, a pointer to the start of the rightmost occurrence
	 *     of the character in the string.
*/
	public static string utf8Strrchr(string p, ptrdiff_t len, dchar c)
	{
		// The match returned by g_utf8_strrchr() is a position inside @p, not
		// a new allocation, so it is converted without being freed.
		return Str.toString(g_utf8_strrchr(Str.toStringz(p), len, c));
	}

	/**
	 * Reverses a UTF-8 string. @str must be valid UTF-8 encoded text.
	 * (Use g_utf8_validate() on all text before trying to use UTF-8
	 * utility functions with it.)
	 *
	 * This function is intended for programmatic uses of reversed strings.
	 * It pays no attention to decomposed characters, combining marks, byte
	 * order marks, directional indicators (LRM, LRO, etc) and similar
	 * characters which might need special handling when reversing a string
	 * for display purposes.
	 *
	 * Note that unlike g_strreverse(), this function returns
	 * newly-allocated memory, which should be freed with g_free() when
	 * no longer needed.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = the maximum length of @str to use, in bytes. If @len < 0,
	 *         then the string is nul-terminated.
	 *
	 * Returns: a newly-allocated string which is the reverse of @str
	 *
	 * Since: 2.2
	 */
	public static string utf8Strreverse(string str, ptrdiff_t len)
	{
		auto retStr = g_utf8_strreverse(Str.toStringz(str), len);

		// The reversed text is a fresh GLib allocation; release it after conversion.
		scope(exit) Str.freeString(retStr);
		return Str.toString(retStr);
	}

	/**
	 * Converts all Unicode characters in the string that have a case
	 * to uppercase. The exact manner that this is done depends
	 * on the current locale, and may result in the number of
	 * characters in the string increasing. (For instance, the
	 * German ess-zet will be changed to SS.)
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *
	 * Returns: a newly allocated string, with all characters
	 *     converted to uppercase.
*/
	public static string utf8Strup(string str, ptrdiff_t len)
	{
		auto upper = g_utf8_strup(Str.toStringz(str), len);
		// Hand the GLib buffer back once the conversion below has run.
		scope(exit) Str.freeString(upper);

		return Str.toString(upper);
	}

	/**
	 * Extracts part of a UTF-8 encoded string. The extracted text
	 * contains @end_pos - @start_pos characters.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     startPos = a character offset within @str
	 *     endPos = another character offset within @str
	 *
	 * Returns: the requested substring converted to a D string. The
	 *     buffer obtained from g_utf8_substring() is handed to
	 *     Str.freeString() before this function returns.
	 *
	 * Since: 2.30
	 */
	public static string utf8Substring(string str, glong startPos, glong endPos)
	{
		auto piece = g_utf8_substring(Str.toStringz(str), startPos, endPos);
		// Runs on every exit path, after the substring has been converted.
		scope(exit) Str.freeString(piece);

		return Str.toString(piece);
	}

	/**
	 * Convert a string from UTF-8 to a 32-bit fixed width
	 * representation as UCS-4. A trailing 0 character will be added to the
	 * string after the converted text.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = the maximum length of @str to use, in bytes. If @len < 0,
	 *         then the string is nul-terminated.
	 *     itemsRead = location to store number of
	 *         bytes read, or %NULL.
	 *         If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
	 *         returned in case @str contains a trailing partial
	 *         character. If an error occurs then the index of the
	 *         invalid input is stored here.
	 *     itemsWritten = location to store number
	 *         of characters written or %NULL. The value here stored does not include
	 *         the trailing 0 character.
	 *
	 * Returns: a pointer to a newly allocated UCS-4 string.
	 *     This value must be freed with g_free(). If an error occurs,
	 *     %NULL will be returned and @error set.
	 *
	 * Throws: GException on failure.
*/
	public static dchar* utf8ToUcs4(string str, glong len, out glong itemsRead, out glong itemsWritten)
	{
		GError* failure = null;

		auto ucs4 = g_utf8_to_ucs4(Str.toStringz(str), len, &itemsRead, &itemsWritten, &failure);

		// A non-null GError means the conversion failed; surface it as an exception.
		if (failure is null)
		{
			return ucs4;
		}

		throw new GException( new ErrorG(failure) );
	}

	/**
	 * Converts UTF-8 text to UCS-4 (32-bit fixed width) on the
	 * assumption that the input is valid UTF-8.
	 * This function is roughly twice as fast as g_utf8_to_ucs4()
	 * but does no error checking on the input. A trailing 0 character
	 * will be added to the string after the converted text.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = the maximum length of @str to use, in bytes. If @len < 0,
	 *         then the string is nul-terminated.
	 *     itemsWritten = receives the number of characters in the result
	 *
	 * Returns: a pointer to a newly allocated UCS-4 string.
	 *     This value must be freed with g_free().
	 */
	public static dchar* utf8ToUcs4Fast(string str, glong len, out glong itemsWritten)
	{
		auto utf8 = Str.toStringz(str);

		return g_utf8_to_ucs4_fast(utf8, len, &itemsWritten);
	}

	/**
	 * Convert a string from UTF-8 to UTF-16. A 0 character will be
	 * added to the result after the converted text.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = the maximum length (number of bytes) of @str to use.
	 *         If @len < 0, then the string is nul-terminated.
	 *     itemsRead = location to store number of
	 *         bytes read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
	 *         be returned in case @str contains a trailing partial character. If
	 *         an error occurs then the index of the invalid input is stored here.
	 *     itemsWritten = location to store number
	 *         of #gunichar2 written, or %NULL. The value stored here does not include
	 *         the trailing 0.
*
	 * Returns: a pointer to a newly allocated UTF-16 string.
	 *     This value must be freed with g_free(). If an error occurs,
	 *     %NULL will be returned and @error set.
	 *
	 * Throws: GException on failure.
	 */
	public static wchar* utf8ToUtf16(string str, glong len, out glong itemsRead, out glong itemsWritten)
	{
		GError* failure = null;

		auto utf16 = g_utf8_to_utf16(Str.toStringz(str), len, &itemsRead, &itemsWritten, &failure);

		// A non-null GError means the conversion failed; surface it as an exception.
		if (failure is null)
		{
			return utf16;
		}

		throw new GException( new ErrorG(failure) );
	}

	/**
	 * Validates UTF-8 encoded text. The whole of @str is checked: its
	 * length in bytes (str.length) is passed to g_utf8_validate() as
	 * the maximum number of bytes to validate.
	 *
	 * The end of the valid range reported by g_utf8_validate() (i.e. the
	 * start of the first invalid character if some bytes were invalid,
	 * or the end of the text being validated otherwise) is converted
	 * with Str.toString() and stored in @end.
	 *
	 * Note that g_utf8_validate() returns %FALSE if the byte count is
	 * positive and any of those bytes are nul.
	 *
	 * Many GLib and GTK+ routines require valid UTF-8 as input; so data
	 * read from a file or the network should be checked with
	 * g_utf8_validate() before doing anything else with it.
	 *
	 * Params:
	 *     str = the character data to validate
	 *     end = receives the conversion of the end of the valid data
	 *
	 * Returns: %TRUE if the text was valid UTF-8
	 */
	public static bool utf8Validate(string str, out string end)
	{
		char* validEnd = null;
		auto text = Str.toStringz(str);
		auto byteCount = cast(ptrdiff_t)str.length;

		bool isValid = g_utf8_validate(text, byteCount, &validEnd) != 0;
		end = Str.toString(validEnd);

		return isValid;
	}

	/**
	 * If the provided string is valid UTF-8, return a copy of it.
* If not,
	 * return a copy in which bytes that could not be interpreted as valid Unicode
	 * are replaced with the Unicode replacement character (U+FFFD).
	 *
	 * For example, this is an appropriate function to use if you have received
	 * a string that was incorrectly declared to be UTF-8, and you need a valid
	 * UTF-8 version of it that can be logged or displayed to the user, with the
	 * assumption that it is close enough to ASCII or UTF-8 to be mostly
	 * readable as-is.
	 *
	 * Params:
	 *     str = string to coerce into UTF-8
	 *     len = the maximum length of @str to use, in bytes. If @len < 0,
	 *         then the string is nul-terminated.
	 *
	 * Returns: a valid UTF-8 string whose content resembles @str
	 *
	 * Since: 2.52
	 */
	public static string utf8MakeValid(string str, ptrdiff_t len)
	{
		auto repaired = g_utf8_make_valid(Str.toStringz(str), len);
		// Hand the GLib buffer back once the conversion below has run.
		scope(exit) Str.freeString(repaired);

		return Str.toString(repaired);
	}
}