1 /* 2 * This file is part of gtkD. 3 * 4 * gtkD is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU Lesser General Public License 6 * as published by the Free Software Foundation; either version 3 7 * of the License, or (at your option) any later version, with 8 * some exceptions, please read the COPYING file. 9 * 10 * gtkD is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU Lesser General Public License for more details. 14 * 15 * You should have received a copy of the GNU Lesser General Public License 16 * along with gtkD; if not, write to the Free Software 17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA 18 */ 19 20 // generated automatically - do not change 21 // find conversion definition on APILookup.txt 22 // implement new conversion functionalities on the wrap.utils pakage 23 24 25 module glib.Unicode; 26 27 private import glib.ErrorG; 28 private import glib.GException; 29 private import glib.Str; 30 private import glib.c.functions; 31 public import glib.c.types; 32 public import gtkc.glibtypes; 33 34 35 /** */ 36 public struct Unicode 37 { 38 39 /** 40 * Convert a string from UCS-4 to UTF-16. A 0 character will be 41 * added to the result after the converted text. 42 * 43 * Params: 44 * str = a UCS-4 encoded string 45 * len = the maximum length (number of characters) of @str to use. 46 * If @len < 0, then the string is nul-terminated. 47 * itemsRead = location to store number of 48 * bytes read, or %NULL. If an error occurs then the index of the invalid 49 * input is stored here. 50 * itemsWritten = location to store number 51 * of #gunichar2 written, or %NULL. The value stored here does not include 52 * the trailing 0. 53 * 54 * Returns: a pointer to a newly allocated UTF-16 string. 55 * This value must be freed with g_free(). 
If an error occurs, %NULL will be returned and @error set.
 *
 * Throws: GException on failure.
 */
public static wchar* ucs4ToUtf16(dchar* str, glong len, out glong itemsRead, out glong itemsWritten)
{
	GError* failure; // D default-initialises pointers to null

	auto converted = g_ucs4_to_utf16(str, len, &itemsRead, &itemsWritten, &failure);

	// Success path first; a GError set by GLib is surfaced as a D exception.
	if (failure is null)
	{
		return converted;
	}

	throw new GException(new ErrorG(failure));
}

/**
 * Converts UCS-4 text (a 32-bit fixed width representation) to UTF-8.
 * The result is terminated with a 0 byte.
 *
 * Params:
 *     str = a UCS-4 encoded string
 *     len = the maximum length (number of characters) of @str to use.
 *         If @len < 0, then the string is nul-terminated.
 *     itemsRead = receives the number of characters read. If an error
 *         occurs it is set to the position of the first invalid input
 *         character.
 *     itemsWritten = receives the number of bytes written, not counting
 *         the trailing 0 byte.
 *
 * Returns: the result of Str.toString() on the UTF-8 buffer produced by
 *     GLib; that buffer is passed to Str.freeString() before returning.
 *
 * Throws: GException on failure.
 */
public static string ucs4ToUtf8(dchar* str, glong len, out glong itemsRead, out glong itemsWritten)
{
	GError* failure;

	auto utf8 = g_ucs4_to_utf8(str, len, &itemsRead, &itemsWritten, &failure);

	if (failure !is null)
	{
		throw new GException(new ErrorG(failure));
	}

	// Release the GLib buffer once the D string has been produced,
	// whether or not the conversion to a D string succeeds.
	try
	{
		return Str.toString(utf8);
	}
	finally
	{
		Str.freeString(utf8);
	}
}

/**
 * Determines the break type of @c. @c should be a Unicode character
 * (to derive a character from UTF-8 encoded text, use
 * g_utf8_get_char()).
The break type is used to find word and line
 * breaks ("text boundaries"), Pango implements the Unicode boundary
 * resolution algorithms and normally you would use a function such
 * as pango_break() instead of caring about break types yourself.
 *
 * Params:
 *     c = a Unicode character
 *
 * Returns: the break type of @c
 */
public static GUnicodeBreakType unicharBreakType(dchar c)
{
	auto breakKind = g_unichar_break_type(c);
	return breakKind;
}

/**
 * Looks up the canonical combining class of a Unicode character.
 *
 * Params:
 *     uc = a Unicode character
 *
 * Returns: the combining class of the character
 *
 * Since: 2.14
 */
public static int unicharCombiningClass(dchar uc)
{
	auto combiningClass = g_unichar_combining_class(uc);
	return combiningClass;
}

/**
 * Runs one composition step of the Unicode canonical composition
 * algorithm on the pair <@a,@b>.
 *
 * Algorithmic Hangul Jamo composition is included, but this is not
 * exactly the inverse of g_unichar_decompose(). No composition can
 * have either of @a or @b equal to zero. To be precise, the pair is
 * composed if and only if there exists a Primary Composite P which is
 * canonically equivalent to the sequence <@a,@b>. See the Unicode
 * Standard for the definition of Primary Composite.
 *
 * If @a and @b do not compose a new character, @ch is set to zero.
 *
 * See
 * [UAX#15](http://unicode.org/reports/tr15/)
 * for details.
 *
 * Params:
 *     a = a Unicode character
 *     b = a Unicode character
 *     ch = return location for the composed character
 *
 * Returns: %TRUE if the characters could be composed
 *
 * Since: 2.30
 */
public static bool unicharCompose(dchar a, dchar b, out dchar ch)
{
	// GLib reports success as a gboolean; map it onto a D bool.
	auto composed = g_unichar_compose(a, b, &ch);
	return composed != 0;
}

/**
 * Performs a single decomposition step of the
 * Unicode canonical decomposition algorithm.
*
 * This function does not include compatibility
 * decompositions. It does, however, include algorithmic
 * Hangul Jamo decomposition, as well as 'singleton'
 * decompositions which replace a character by a single
 * other character. In the case of singletons *@b will
 * be set to zero.
 *
 * If @ch is not decomposable, *@a is set to @ch and *@b
 * is set to zero.
 *
 * Note that the way Unicode decomposition pairs are
 * defined, it is guaranteed that @b would not decompose
 * further, but @a may itself decompose. To get the full
 * canonical decomposition for @ch, one would need to
 * recursively call this function on @a. Or use
 * g_unichar_fully_decompose().
 *
 * See
 * [UAX#15](http://unicode.org/reports/tr15/)
 * for details.
 *
 * Params:
 *     ch = a Unicode character
 *     a = return location for the first component of @ch
 *     b = return location for the second component of @ch
 *
 * Returns: %TRUE if the character could be decomposed
 *
 * Since: 2.30
 */
public static bool unicharDecompose(dchar ch, out dchar a, out dchar b)
{
	// GLib reports success as a gboolean; map it onto a D bool.
	auto decomposed = g_unichar_decompose(ch, &a, &b);
	return decomposed != 0;
}

/**
 * Gives the numeric value of a character interpreted as a decimal
 * digit.
 *
 * Params:
 *     c = a Unicode character
 *
 * Returns: If @c is a decimal digit (according to
 *     g_unichar_isdigit()), its numeric value. Otherwise, -1.
 */
public static int unicharDigitValue(dchar c)
{
	auto digit = g_unichar_digit_value(c);
	return digit;
}

/**
 * Computes the canonical or compatibility decomposition of a
 * Unicode character. For compatibility decomposition,
 * pass %TRUE for @compat; for canonical decomposition
 * pass %FALSE for @compat.
 *
 * The decomposed sequence is placed in @result. Only up to
 * @result_len characters are written into @result.
The length
 * of the full decomposition (irrespective of @result_len) is
 * returned by the function. For canonical decomposition,
 * currently all decompositions are of length at most 4, but
 * this may change in the future (very unlikely though).
 * At any rate, Unicode does guarantee that a buffer of length
 * 18 is always enough for both compatibility and canonical
 * decompositions, so that is the size recommended. This is provided
 * as %G_UNICHAR_MAX_DECOMPOSITION_LENGTH.
 *
 * See
 * [UAX#15](http://unicode.org/reports/tr15/)
 * for details.
 *
 * Params:
 *     ch = a Unicode character.
 *     compat = whether perform canonical or compatibility decomposition
 *     result = location to store decomposed result, or %NULL
 *     resultLen = length of @result
 *
 * Returns: the length of the full decomposition.
 *
 * Since: 2.30
 */
public static size_t unicharFullyDecompose(dchar ch, bool compat, out dchar result, size_t resultLen)
{
	// NOTE(review): `result` is bound as a single `out dchar`, yet the
	// address of that one character is handed to GLib together with
	// resultLen. Confirm that callers never pass resultLen > 1 unless
	// `result` really is the first element of a large enough buffer.
	auto fullLength = g_unichar_fully_decompose(ch, compat, &result, resultLen);
	return fullLength;
}

/**
 * In Unicode, some characters are "mirrored". This means that their
 * images are mirrored horizontally in text that is laid out from right
 * to left. For instance, "(" would become its mirror image, ")", in
 * right-to-left text.
 *
 * If @ch has the Unicode mirrored property and there is another unicode
 * character that typically has a glyph that is the mirror image of @ch's
 * glyph and @mirrored_ch is set, it puts that character in the address
 * pointed to by @mirrored_ch. Otherwise the original character is put.
*
 * Params:
 *     ch = a Unicode character
 *     mirroredCh = location to store the mirrored character
 *
 * Returns: %TRUE if @ch has a mirrored character, %FALSE otherwise
 *
 * Since: 2.4
 */
public static bool unicharGetMirrorChar(dchar ch, dchar* mirroredCh)
{
	auto hasMirror = g_unichar_get_mirror_char(ch, mirroredCh);
	return hasMirror != 0;
}

/**
 * Returns the #GUnicodeScript of a character (as defined by Unicode
 * Standard Annex \#24). @ch is not checked for being a valid Unicode
 * character; if you pass in invalid character, the result is
 * undefined.
 *
 * This function is equivalent to pango_script_for_unichar() and the
 * two are interchangeable.
 *
 * Params:
 *     ch = a Unicode character
 *
 * Returns: the #GUnicodeScript for the character.
 *
 * Since: 2.14
 */
public static GUnicodeScript unicharGetScript(dchar ch)
{
	auto script = g_unichar_get_script(ch);
	return script;
}

/**
 * Tests whether a character is alphanumeric.
 * Given some UTF-8 text, obtain a character value
 * with g_utf8_get_char().
 *
 * Params:
 *     c = a Unicode character
 *
 * Returns: %TRUE if @c is an alphanumeric character
 */
public static bool unicharIsalnum(dchar c)
{
	auto alphanumeric = g_unichar_isalnum(c);
	return alphanumeric != 0;
}

/**
 * Tests whether a character is alphabetic (i.e. a letter).
 * Given some UTF-8 text, obtain a character value with
 * g_utf8_get_char().
 *
 * Params:
 *     c = a Unicode character
 *
 * Returns: %TRUE if @c is an alphabetic character
 */
public static bool unicharIsalpha(dchar c)
{
	auto alphabetic = g_unichar_isalpha(c);
	return alphabetic != 0;
}

/**
 * Determines whether a character is a control character.
 * Given some UTF-8 text, obtain a character value with
 * g_utf8_get_char().
*
 * Params:
 *     c = a Unicode character
 *
 * Returns: %TRUE if @c is a control character
 */
public static bool unicharIscntrl(dchar c)
{
	auto control = g_unichar_iscntrl(c);
	return control != 0;
}

/**
 * Tests whether a given character is assigned in the Unicode
 * standard.
 *
 * Params:
 *     c = a Unicode character
 *
 * Returns: %TRUE if the character has an assigned value
 */
public static bool unicharIsdefined(dchar c)
{
	auto assigned = g_unichar_isdefined(c);
	return assigned != 0;
}

/**
 * Tests whether a character is numeric (i.e. a digit). This
 * covers ASCII 0-9 and also digits in other languages/scripts. Given
 * some UTF-8 text, obtain a character value with g_utf8_get_char().
 *
 * Params:
 *     c = a Unicode character
 *
 * Returns: %TRUE if @c is a digit
 */
public static bool unicharIsdigit(dchar c)
{
	auto digit = g_unichar_isdigit(c);
	return digit != 0;
}

/**
 * Tests whether a character is printable and not a space
 * (returns %FALSE for control characters, format characters, and
 * spaces). g_unichar_isprint() is similar, but returns %TRUE for
 * spaces. Given some UTF-8 text, obtain a character value with
 * g_utf8_get_char().
 *
 * Params:
 *     c = a Unicode character
 *
 * Returns: %TRUE if @c is printable unless it's a space
 */
public static bool unicharIsgraph(dchar c)
{
	auto graphic = g_unichar_isgraph(c);
	return graphic != 0;
}

/**
 * Tests whether a character is a lowercase letter.
 * Given some UTF-8 text, obtain a character value with
 * g_utf8_get_char().
 *
 * Params:
 *     c = a Unicode character
 *
 * Returns: %TRUE if @c is a lowercase letter
 */
public static bool unicharIslower(dchar c)
{
	auto lowercase = g_unichar_islower(c);
	return lowercase != 0;
}

/**
 * Determines whether a character is a mark (non-spacing mark,
 * combining mark, or enclosing mark in Unicode speak).
* Given some UTF-8 text, obtain a character value
 * with g_utf8_get_char().
 *
 * Note: in most cases where isalpha characters are allowed,
 * ismark characters should be allowed too as they are essential
 * for writing most European languages as well as many non-Latin
 * scripts.
 *
 * Params:
 *     c = a Unicode character
 *
 * Returns: %TRUE if @c is a mark character
 *
 * Since: 2.14
 */
public static bool unicharIsmark(dchar c)
{
	auto mark = g_unichar_ismark(c);
	return mark != 0;
}

/**
 * Tests whether a character is printable.
 * Unlike g_unichar_isgraph(), returns %TRUE for spaces.
 * Given some UTF-8 text, obtain a character value with
 * g_utf8_get_char().
 *
 * Params:
 *     c = a Unicode character
 *
 * Returns: %TRUE if @c is printable
 */
public static bool unicharIsprint(dchar c)
{
	auto printable = g_unichar_isprint(c);
	return printable != 0;
}

/**
 * Tests whether a character is punctuation or a symbol.
 * Given some UTF-8 text, obtain a character value with
 * g_utf8_get_char().
 *
 * Params:
 *     c = a Unicode character
 *
 * Returns: %TRUE if @c is a punctuation or symbol character
 */
public static bool unicharIspunct(dchar c)
{
	auto punctuation = g_unichar_ispunct(c);
	return punctuation != 0;
}

/**
 * Tests whether a character is a space, tab, or line separator
 * (newline, carriage return, etc.). Given some UTF-8 text, obtain a
 * character value with g_utf8_get_char().
 *
 * (Note: don't use this to do word breaking; you have to use
 * Pango or equivalent to get word breaking right, the algorithm
 * is fairly complex.)
 *
 * Params:
 *     c = a Unicode character
 *
 * Returns: %TRUE if @c is a space character
 */
public static bool unicharIsspace(dchar c)
{
	auto space = g_unichar_isspace(c);
	return space != 0;
}

/**
 * Determines if a character is titlecase.
Some characters in
 * Unicode which are composites, such as the DZ digraph
 * have three case variants instead of just two. The titlecase
 * form is used at the beginning of a word where only the
 * first letter is capitalized. The titlecase form of the DZ
 * digraph is U+01F2 LATIN CAPITAL LETTER D WITH SMALL LETTER Z.
 *
 * Params:
 *     c = a Unicode character
 *
 * Returns: %TRUE if the character is titlecase
 */
public static bool unicharIstitle(dchar c)
{
	auto titlecase = g_unichar_istitle(c);
	return titlecase != 0;
}

/**
 * Tests whether a character is uppercase.
 *
 * Params:
 *     c = a Unicode character
 *
 * Returns: %TRUE if @c is an uppercase character
 */
public static bool unicharIsupper(dchar c)
{
	auto uppercase = g_unichar_isupper(c);
	return uppercase != 0;
}

/**
 * Tests whether a character is typically rendered in a double-width
 * cell.
 *
 * Params:
 *     c = a Unicode character
 *
 * Returns: %TRUE if the character is wide
 */
public static bool unicharIswide(dchar c)
{
	auto wide = g_unichar_iswide(c);
	return wide != 0;
}

/**
 * Determines if a character is typically rendered in a double-width
 * cell under legacy East Asian locales. If a character is wide according to
 * g_unichar_iswide(), then it is also reported wide with this function, but
 * the converse is not necessarily true. See the
 * [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
 * for details.
 *
 * If a character passes the g_unichar_iswide() test then it will also pass
 * this test, but not the other way around. Note that some characters may
 * pass both this test and g_unichar_iszerowidth().
*
 * Params:
 *     c = a Unicode character
 *
 * Returns: %TRUE if the character is wide in legacy East Asian locales
 *
 * Since: 2.12
 */
public static bool unicharIswideCjk(dchar c)
{
	return g_unichar_iswide_cjk(c) != 0;
}

/**
 * Determines if a character is a hexadecimal digit.
 *
 * Params:
 *     c = a Unicode character.
 *
 * Returns: %TRUE if the character is a hexadecimal digit
 */
public static bool unicharIsxdigit(dchar c)
{
	return g_unichar_isxdigit(c) != 0;
}

/**
 * Determines if a given character typically takes zero width when rendered.
 * The return value is %TRUE for all non-spacing and enclosing marks
 * (e.g., combining accents), format characters, zero-width
 * space, but not U+00AD SOFT HYPHEN.
 *
 * A typical use of this function is with one of g_unichar_iswide() or
 * g_unichar_iswide_cjk() to determine the number of cells a string occupies
 * when displayed on a grid display (terminals). However, note that not all
 * terminals support zero-width rendering of zero-width marks.
 *
 * Params:
 *     c = a Unicode character
 *
 * Returns: %TRUE if the character has zero width
 *
 * Since: 2.14
 */
public static bool unicharIszerowidth(dchar c)
{
	return g_unichar_iszerowidth(c) != 0;
}

/**
 * Converts a single character to UTF-8.
 *
 * Params:
 *     c = a Unicode character code
 *     outbuf = receives the UTF-8 encoding of @c as a newly allocated
 *         slice whose length equals the return value (at most 6 bytes).
 *
 * Returns: number of bytes written
 */
public static int unicharToUtf8(dchar c, out char[] outbuf)
{
	// An `out` parameter is reset to its .init value (a null slice) on
	// entry, so forwarding outbuf.ptr directly always handed GLib a null
	// buffer: only the length was computed and the caller never received
	// any bytes. Encode into a local buffer of the documented minimum
	// size (6 bytes) and hand back exactly the part that was written.
	auto scratch = new char[6];
	immutable written = g_unichar_to_utf8(c, scratch.ptr);

	outbuf = scratch[0 .. written];
	return written;
}

/**
 * Converts a character to lower case.
 *
 * Params:
 *     c = a Unicode character.
 *
 * Returns: the result of converting @c to lower case.
*     If @c is not an uppercase or titlecase character,
 *     or has no lowercase equivalent @c is returned unchanged.
 */
public static dchar unicharTolower(dchar c)
{
	auto lowered = g_unichar_tolower(c);
	return lowered;
}

/**
 * Maps a character to its titlecase form.
 *
 * Params:
 *     c = a Unicode character
 *
 * Returns: the result of converting @c to titlecase.
 *     If @c is not an uppercase or lowercase character,
 *     @c is returned unchanged.
 */
public static dchar unicharTotitle(dchar c)
{
	auto titled = g_unichar_totitle(c);
	return titled;
}

/**
 * Maps a character to its uppercase form.
 *
 * Params:
 *     c = a Unicode character
 *
 * Returns: the result of converting @c to uppercase.
 *     If @c is not a lowercase or titlecase character,
 *     or has no upper case equivalent @c is returned unchanged.
 */
public static dchar unicharToupper(dchar c)
{
	auto raised = g_unichar_toupper(c);
	return raised;
}

/**
 * Reports the Unicode type classification of a character.
 *
 * Params:
 *     c = a Unicode character
 *
 * Returns: the type of the character.
 */
public static GUnicodeType unicharType(dchar c)
{
	auto category = g_unichar_type(c);
	return category;
}

/**
 * Tests whether @ch is a valid Unicode character. Some possible
 * integer values of @ch will not be valid. 0 is considered a valid
 * character, though it's normally a string terminator.
 *
 * Params:
 *     ch = a Unicode character
 *
 * Returns: %TRUE if @ch is a valid Unicode character
 */
public static bool unicharValidate(dchar ch)
{
	auto valid = g_unichar_validate(ch);
	return valid != 0;
}

/**
 * Determines the numeric value of a character as a hexadecimal
 * digit.
 *
 * Params:
 *     c = a Unicode character
 *
 * Returns: If @c is a hex digit (according to
 *     g_unichar_isxdigit()), its numeric value. Otherwise, -1.
*/
public static int unicharXdigitValue(dchar c)
{
	auto hexValue = g_unichar_xdigit_value(c);
	return hexValue;
}

/**
 * Produces the canonical decomposition of a Unicode character.
 *
 * Deprecated: Use the more flexible g_unichar_fully_decompose()
 *     instead.
 *
 * Params:
 *     ch = a Unicode character.
 *     resultLen = location to store the length of the return value.
 *
 * Returns: a newly allocated string of Unicode characters.
 *     @result_len is set to the resulting length of the string.
 */
public static dchar* unicodeCanonicalDecomposition(dchar ch, size_t* resultLen)
{
	auto decomposition = g_unicode_canonical_decomposition(ch, resultLen);
	return decomposition;
}

/**
 * Puts a string into canonical order, in place.
 * Decomposed characters in the string are rearranged
 * according to their combining classes. See the Unicode
 * manual for more information.
 *
 * Params:
 *     string_ = a UCS-4 encoded string.
 *     len = the maximum length of @string to use.
 */
public static void unicodeCanonicalOrdering(dchar* string_, size_t len)
{
	// The buffer behind string_ is reordered by GLib; nothing is returned.
	g_unicode_canonical_ordering(string_, len);
}

/**
 * Looks up the Unicode script for @iso15924. ISO 15924 assigns four-letter
 * codes to scripts. For example, the code for Arabic is 'Arab'.
 * This function accepts four letter codes encoded as a @guint32 in a
 * big-endian fashion. That is, the code expected for Arabic is
 * 0x41726162 (0x41 is ASCII code for 'A', 0x72 is ASCII code for 'r', etc).
 *
 * See
 * [Codes for the representation of names of scripts](http://unicode.org/iso15924/codelists.html)
 * for details.
 *
 * Params:
 *     iso15924 = a Unicode script
 *
 * Returns: the Unicode script for @iso15924, or
 *     of %G_UNICODE_SCRIPT_INVALID_CODE if @iso15924 is zero and
 *     %G_UNICODE_SCRIPT_UNKNOWN if @iso15924 is unknown.
*
 * Since: 2.30
 */
public static GUnicodeScript unicodeScriptFromIso15924(uint iso15924)
{
	auto script = g_unicode_script_from_iso15924(iso15924);
	return script;
}

/**
 * Returns the ISO 15924 code for @script. ISO 15924 assigns four-letter
 * codes to scripts. For example, the code for Arabic is 'Arab'. The
 * four letter codes are encoded as a @guint32 by this function in a
 * big-endian fashion. That is, the code returned for Arabic is
 * 0x41726162 (0x41 is ASCII code for 'A', 0x72 is ASCII code for 'r', etc).
 *
 * See
 * [Codes for the representation of names of scripts](http://unicode.org/iso15924/codelists.html)
 * for details.
 *
 * Params:
 *     script = a Unicode script
 *
 * Returns: the ISO 15924 code for @script, encoded as an integer,
 *     of zero if @script is %G_UNICODE_SCRIPT_INVALID_CODE or
 *     ISO 15924 code 'Zzzz' (script code for UNKNOWN) if @script is not understood.
 *
 * Since: 2.30
 */
public static uint unicodeScriptToIso15924(GUnicodeScript script)
{
	auto code = g_unicode_script_to_iso15924(script);
	return code;
}

/**
 * Convert a string from UTF-16 to UCS-4. The result will be
 * nul-terminated.
 *
 * Params:
 *     str = a UTF-16 encoded string
 *     len = the maximum length (number of #gunichar2) of @str to use.
 *         If @len < 0, then the string is nul-terminated.
 *     itemsRead = location to store number of
 *         words read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
 *         be returned in case @str contains a trailing partial character. If
 *         an error occurs then the index of the invalid input is stored here.
 *     itemsWritten = location to store number
 *         of characters written, or %NULL. The value stored here does not include
 *         the trailing 0 character.
 *
 * Returns: a pointer to a newly allocated UCS-4 string.
 *     This value must be freed with g_free().
If an error occurs,
 *     %NULL will be returned and @error set.
 *
 * Throws: GException on failure.
 */
public static dchar* utf16ToUcs4(wchar* str, glong len, out glong itemsRead, out glong itemsWritten)
{
	GError* failure; // D default-initialises pointers to null

	auto converted = g_utf16_to_ucs4(str, len, &itemsRead, &itemsWritten, &failure);

	// Success path first; a GError set by GLib is surfaced as a D exception.
	if (failure is null)
	{
		return converted;
	}

	throw new GException(new ErrorG(failure));
}

/**
 * Convert a string from UTF-16 to UTF-8. The result will be
 * terminated with a 0 byte.
 *
 * Note that the input is expected to be already in native endianness,
 * an initial byte-order-mark character is not handled specially.
 * g_convert() can be used to convert a byte buffer of UTF-16 data of
 * ambiguous endianness.
 *
 * Further note that this function does not validate the result
 * string; it may e.g. include embedded NUL characters. The only
 * validation done by this function is to ensure that the input can
 * be correctly interpreted as UTF-16, i.e. it doesn't contain
 * things like unpaired surrogates.
 *
 * Params:
 *     str = a UTF-16 encoded string
 *     len = the maximum length (number of #gunichar2) of @str to use.
 *         If @len < 0, then the string is nul-terminated.
 *     itemsRead = location to store number of
 *         words read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
 *         be returned in case @str contains a trailing partial character. If
 *         an error occurs then the index of the invalid input is stored here.
 *     itemsWritten = location to store number
 *         of bytes written, or %NULL. The value stored here does not include the
 *         trailing 0 byte.
 *
 * Returns: a pointer to a newly allocated UTF-8 string.
 *     This value must be freed with g_free(). If an error occurs,
 *     %NULL will be returned and @error set.
 *
 * Throws: GException on failure.
*/
public static string utf16ToUtf8(wchar* str, glong len, out glong itemsRead, out glong itemsWritten)
{
	GError* failure; // D default-initialises pointers to null

	auto utf8 = g_utf16_to_utf8(str, len, &itemsRead, &itemsWritten, &failure);

	if (failure !is null)
	{
		throw new GException(new ErrorG(failure));
	}

	// Release the GLib buffer once the D string has been produced,
	// whether or not the conversion to a D string succeeds.
	try
	{
		return Str.toString(utf8);
	}
	finally
	{
		Str.freeString(utf8);
	}
}

/**
 * Produces a form of a string that is independent of case. The
 * result will not correspond to any particular case, but can be
 * compared for equality or ordered with the results of calling
 * g_utf8_casefold() on other strings.
 *
 * Note that calling g_utf8_casefold() followed by g_utf8_collate() is
 * only an approximation to the correct linguistic case insensitive
 * ordering, though it is a fairly good one. Getting this exactly
 * right would require a more sophisticated collation function that
 * takes case sensitivity into account. GLib does not currently
 * provide such a function.
 *
 * Params:
 *     str = a UTF-8 encoded string
 *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
 *
 * Returns: a string that is a case independent form of @str. The
 *     buffer returned by GLib is passed to Str.freeString() before
 *     this function returns.
 */
public static string utf8Casefold(string str, ptrdiff_t len)
{
	auto folded = g_utf8_casefold(Str.toStringz(str), len);

	try
	{
		return Str.toString(folded);
	}
	finally
	{
		Str.freeString(folded);
	}
}

/**
 * Compares two strings for ordering using the linguistically
 * correct rules for the [current locale][setlocale].
 * When sorting a large number of strings, it will be significantly
 * faster to obtain collation keys with g_utf8_collate_key() and
 * compare the keys with strcmp() when sorting instead of sorting
 * the original strings.
*
 * Params:
 *     str1 = a UTF-8 encoded string
 *     str2 = a UTF-8 encoded string
 *
 * Returns: < 0 if @str1 compares before @str2,
 *     0 if they compare equal, > 0 if @str1 compares after @str2.
 */
public static int utf8Collate(string str1, string str2)
{
	auto ordering = g_utf8_collate(Str.toStringz(str1), Str.toStringz(str2));
	return ordering;
}

/**
 * Builds a collation key for a string. Keys produced by this
 * function can be compared with each other using strcmp().
 *
 * The results of comparing the collation keys of two strings
 * with strcmp() will always be the same as comparing the two
 * original keys with g_utf8_collate().
 *
 * Note that this function depends on the [current locale][setlocale].
 *
 * Params:
 *     str = a UTF-8 encoded string.
 *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
 *
 * Returns: the collation key. The buffer returned by GLib is passed
 *     to Str.freeString() before this function returns.
 */
public static string utf8CollateKey(string str, ptrdiff_t len)
{
	auto key = g_utf8_collate_key(Str.toStringz(str), len);

	try
	{
		return Str.toString(key);
	}
	finally
	{
		Str.freeString(key);
	}
}

/**
 * Converts a string into a collation key that can be compared
 * with other collation keys produced by the same function using strcmp().
 *
 * In order to sort filenames correctly, this function treats the dot '.'
 * as a special case. Most dictionary orderings seem to consider it
 * insignificant, thus producing the ordering "event.c" "eventgenerator.c"
 * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we
 * would like to treat numbers intelligently so that "file1" "file10" "file5"
 * is sorted as "file1" "file5" "file10".
 *
 * Note that this function depends on the [current locale][setlocale].
 *
 * Params:
 *     str = a UTF-8 encoded string.
*     len = length of @str, in bytes, or -1 if @str is nul-terminated.
 *
 * Returns: a newly allocated string. This string should
 *     be freed with g_free() when you are done with it.
 *
 * Since: 2.8
 */
public static string utf8CollateKeyForFilename(string str, ptrdiff_t len)
{
	auto retStr = g_utf8_collate_key_for_filename(Str.toStringz(str), len);

	scope(exit) Str.freeString(retStr);
	return Str.toString(retStr);
}

/**
 * Finds the start of the next UTF-8 character in the string after @p.
 *
 * @p does not have to be at the beginning of a UTF-8 character. No check
 * is made to see if the character found is actually valid other than
 * it starts with an appropriate byte.
 *
 * If @end is %NULL, the return value will never be %NULL: if the end of the
 * string is reached, a pointer to the terminating nul byte is returned. If
 * @end is non-%NULL, the return value will be %NULL if the end of the string
 * is reached.
 *
 * When @end is a position inside @p (for example `p[n .. $]` or
 * `p[$ .. $]`), the search runs directly on the memory of @p and the
 * result is the slice of @p that starts at the found character.
 *
 * Params:
 *     p = a pointer to a position within a UTF-8 encoded string
 *     end = a pointer to the byte following the end of the string,
 *         or %NULL to indicate that the string is nul-terminated
 *
 * Returns: a pointer to the found character or %NULL if @end is
 *     set and is reached
 */
public static string utf8FindNextChar(string p, string end)
{
	// GLib scans while p < end, so both pointers must refer to the same
	// buffer. Converting the two D strings independently yields unrelated
	// pointers, which makes that comparison meaningless. When `end` lies
	// inside `p`, use p's own memory instead; the scan never reads at or
	// beyond `end`, so no nul terminator is needed.
	if (p.length > 0 && end.ptr > p.ptr && end.ptr <= p.ptr + p.length)
	{
		auto start = cast(char*) p.ptr;
		auto found = cast(char*) g_utf8_find_next_char(start, cast(char*) end.ptr);

		if (found is null)
		{
			return null;
		}

		return p[cast(size_t)(found - start) .. $];
	}

	// `end` is null or not a position inside `p`: previous behaviour.
	return Str.toString(g_utf8_find_next_char(Str.toStringz(p), Str.toStringz(end)));
}

/**
 * Given a position @p with a UTF-8 encoded string @str, find the start
 * of the previous UTF-8 character starting before @p. Returns %NULL if no
 * UTF-8 characters are present in @str before @p.
 *
 * @p does not have to be at the beginning of a UTF-8 character. No check
 * is made to see if the character found is actually valid other than
 * it starts with an appropriate byte.
*
	 * Params:
	 *     str = pointer to the beginning of a UTF-8 encoded string
	 *     p = pointer to some position within @str; when @p is a slice of
	 *         @str it is mapped to the same position in the converted @str
	 *
	 * Returns: a pointer to the found character or %NULL.
	 */
	public static string utf8FindPrevChar(string str, string p)
	{
		auto base = Str.toStringz(str);

		// The C function walks backwards from p towards str, so both pointers
		// must refer to the same buffer. A p that is a slice of str is
		// expressed as an offset from the converted base.
		if ( base !is null && p.ptr >= str.ptr && p.ptr <= str.ptr + str.length )
		{
			auto position = base + (p.ptr - str.ptr);

			return Str.toString(g_utf8_find_prev_char(base, position));
		}

		// Otherwise keep the previous behaviour: convert p on its own.
		// NOTE(review): the two pointers are then converted independently
		// and may be unrelated - confirm whether any caller depends on this.
		return Str.toString(g_utf8_find_prev_char(base, Str.toStringz(p)));
	}

	/**
	 * Decodes the UTF-8 sequence at the start of @p into a Unicode character.
	 *
	 * If @p does not point to a valid UTF-8 encoded character, results
	 * are undefined. If you are not sure that the bytes are complete
	 * valid Unicode characters, you should use g_utf8_get_char_validated()
	 * instead.
	 *
	 * Params:
	 *     p = a pointer to Unicode character encoded as UTF-8
	 *
	 * Returns: the resulting character
	 */
	public static dchar utf8GetChar(string p)
	{
		auto bytes = Str.toStringz(p);

		return g_utf8_get_char(bytes);
	}

	/**
	 * Convert a sequence of bytes encoded as UTF-8 to a Unicode character.
	 * This function checks for incomplete characters, for invalid characters
	 * such as characters that are out of the range of Unicode, and for
	 * overlong encodings of valid characters.
	 *
	 * Note that g_utf8_get_char_validated() returns (gunichar)-2 if
	 * @max_len is positive and any of the bytes in the first UTF-8 character
	 * sequence are nul.
	 *
	 * Params:
	 *     p = a pointer to Unicode character encoded as UTF-8
	 *     maxLen = the maximum number of bytes to read, or -1 if @p is nul-terminated
	 *
	 * Returns: the resulting character. If @p points to a partial
	 *     sequence at the end of a string that could begin a valid
	 *     character (or if @max_len is zero), returns (gunichar)-2;
	 *     otherwise, if @p does not point to a valid UTF-8 encoded
	 *     Unicode character, returns (gunichar)-1.
*/
	public static dchar utf8GetCharValidated(string p, ptrdiff_t maxLen)
	{
		// maxLen is forwarded unchanged; it is not derived from p.length.
		auto bytes = Str.toStringz(p);

		return g_utf8_get_char_validated(bytes, maxLen);
	}

	/**
	 * Converts a string into canonical form, standardizing
	 * such issues as whether a character with an accent
	 * is represented as a base character and combining
	 * accent or as a single precomposed character. The
	 * string has to be valid UTF-8, otherwise %NULL is
	 * returned. You should generally call g_utf8_normalize()
	 * before comparing two Unicode strings.
	 *
	 * The normalization mode %G_NORMALIZE_DEFAULT only
	 * standardizes differences that do not affect the
	 * text content, such as the above-mentioned accent
	 * representation. %G_NORMALIZE_ALL also standardizes
	 * the "compatibility" characters in Unicode, such
	 * as SUPERSCRIPT THREE to the standard forms
	 * (in this case DIGIT THREE). Formatting information
	 * may be lost but for most text operations such
	 * characters should be considered the same.
	 *
	 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
	 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
	 * but returned a result with composed forms rather
	 * than a maximally decomposed form. This is often
	 * useful if you intend to convert the string to
	 * a legacy encoding or pass it to a system with
	 * less capable Unicode handling.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string.
	 *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *     mode = the type of normalization to perform.
	 *
	 * Returns: a newly allocated string, that
	 *     is the normalized form of @str, or %NULL if @str
	 *     is not valid UTF-8.
*/
	public static string utf8Normalize(string str, ptrdiff_t len, GNormalizeMode mode)
	{
		auto normalized = g_utf8_normalize(Str.toStringz(str), len, mode);

		// The GLib buffer is released once the D string has been produced.
		scope(exit) Str.freeString(normalized);
		return Str.toString(normalized);
	}

	/**
	 * Converts from an integer character offset to a position within
	 * the string.
	 *
	 * Since 2.10, a negative @offset may be passed to step backwards.
	 * It is usually worth stepping backwards from the end instead of
	 * forwards if @offset is in the last fourth of the string, since
	 * moving forward is about 3 times faster than moving backward.
	 *
	 * Note that this function doesn't abort when reaching the end of @str.
	 * Therefore you should be sure that @offset is within string boundaries
	 * before calling that function. Call g_utf8_strlen() when unsure.
	 * This limitation exists as this function is called frequently during
	 * text rendering and therefore has to be as fast as possible.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     offset = a character offset within @str
	 *
	 * Returns: the result of g_utf8_offset_to_pointer(), converted with
	 *     Str.toString()
	 */
	public static string utf8OffsetToPointer(string str, glong offset)
	{
		auto start = Str.toStringz(str);
		auto target = g_utf8_offset_to_pointer(start, offset);

		return Str.toString(target);
	}

	/**
	 * Converts from a pointer to position within a string to an integer
	 * character offset.
	 *
	 * Since 2.10, this function allows @pos to be before @str, and returns
	 * a negative offset in this case.
*
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     pos = a pointer to a position within @str; when @pos is a slice
	 *         of @str it is mapped to the same position in the converted @str
	 *
	 * Returns: the resulting character offset
	 */
	public static glong utf8PointerToOffset(string str, string pos)
	{
		auto base = Str.toStringz(str);

		// The offset is measured between two pointers, so they must refer to
		// the same buffer. A pos that is a slice of str is expressed as a
		// byte offset from the converted base.
		if ( base !is null && pos.ptr >= str.ptr && pos.ptr <= str.ptr + str.length )
		{
			return g_utf8_pointer_to_offset(base, base + (pos.ptr - str.ptr));
		}

		// Otherwise keep the previous behaviour: convert pos on its own.
		// NOTE(review): the two pointers are then converted independently
		// and may be unrelated - confirm whether any caller depends on this.
		return g_utf8_pointer_to_offset(base, Str.toStringz(pos));
	}

	/**
	 * Finds the previous UTF-8 character in the string before @p.
	 *
	 * @p does not have to be at the beginning of a UTF-8 character. No check
	 * is made to see if the character found is actually valid other than
	 * it starts with an appropriate byte. If @p might be the first
	 * character of the string, you must use g_utf8_find_prev_char() instead.
	 *
	 * NOTE(review): this wrapper passes the start of the converted @p to
	 * g_utf8_prev_char(), i.e. the position of the first byte of the D
	 * string. Given the restriction above, confirm that this is a
	 * supported use before relying on the result.
	 *
	 * Params:
	 *     p = a pointer to a position within a UTF-8 encoded string
	 *
	 * Returns: a pointer to the found character
	 */
	public static string utf8PrevChar(string p)
	{
		auto position = Str.toStringz(p);

		return Str.toString(g_utf8_prev_char(position));
	}

	/**
	 * Finds the leftmost occurrence of the given Unicode character
	 * in a UTF-8 encoded string, while limiting the search to @len bytes.
	 * If @len is -1, allow unbounded search.
	 *
	 * Params:
	 *     p = a nul-terminated UTF-8 encoded string
	 *     len = the maximum length of @p
	 *     c = a Unicode character
	 *
	 * Returns: %NULL if the string does not contain the character,
	 *     otherwise, a pointer to the start of the leftmost occurrence
	 *     of the character in the string.
	 */
	public static string utf8Strchr(string p, ptrdiff_t len, dchar c)
	{
		auto haystack = Str.toStringz(p);

		return Str.toString(g_utf8_strchr(haystack, len, c));
	}

	/**
	 * Converts all Unicode characters in the string that have a case
	 * to lowercase. The exact manner that this is done depends
	 * on the current locale, and may result in the number of
	 * characters in the string changing.
*
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *
	 * Returns: the lowercase text as a D string. The buffer returned by
	 *     g_utf8_strdown() is released with Str.freeString() on exit.
	 */
	public static string utf8Strdown(string str, ptrdiff_t len)
	{
		auto lowered = g_utf8_strdown(Str.toStringz(str), len);

		// scope(exit) runs after the return value has been built from lowered.
		scope(exit) Str.freeString(lowered);
		return Str.toString(lowered);
	}

	/**
	 * Counts the characters of a UTF-8 string, not including the
	 * terminating nul character. If the @max'th byte falls in the
	 * middle of a character, the last (partial) character is not counted.
	 *
	 * Params:
	 *     p = pointer to the start of a UTF-8 encoded string
	 *     max = the maximum number of bytes to examine. If @max
	 *         is less than 0, then the string is assumed to be
	 *         nul-terminated. If @max is 0, @p will not be examined and
	 *         may be %NULL. If @max is greater than 0, up to @max
	 *         bytes are examined
	 *
	 * Returns: the length of the string in characters
	 */
	public static glong utf8Strlen(string p, ptrdiff_t max)
	{
		auto text = Str.toStringz(p);

		return g_utf8_strlen(text, max);
	}

	/**
	 * Like the standard C strncpy() function, but copies a given number
	 * of characters instead of a given number of bytes. The @src string
	 * must be valid UTF-8 encoded text. (Use g_utf8_validate() on all
	 * text before trying to use UTF-8 utility functions with it.)
*
	 * A D string is immutable and cannot serve as an output buffer, so this
	 * wrapper copies into a buffer of its own, sized from @src, instead of
	 * into @dest. The contents and length of @dest do not influence the
	 * result.
	 *
	 * Params:
	 *     dest = kept for interface compatibility; not written to
	 *     src = UTF-8 encoded string
	 *     n = character count
	 *
	 * Returns: the first @n characters of @src (or all of @src if it is
	 *     shorter), or null if @src is null
	 */
	public static string utf8Strncpy(string dest, string src, size_t n)
	{
		if ( src is null )
		{
			return null;
		}

		// For valid UTF-8 input at most src.length bytes are copied, followed
		// by the terminating nul, so this buffer is always large enough
		// regardless of how long dest is.
		auto buffer = new char[src.length + 1];

		return Str.toString(g_utf8_strncpy(buffer.ptr, Str.toStringz(src), n));
	}

	/**
	 * Find the rightmost occurrence of the given Unicode character
	 * in a UTF-8 encoded string, while limiting the search to @len bytes.
	 * If @len is -1, allow unbounded search.
	 *
	 * Params:
	 *     p = a nul-terminated UTF-8 encoded string
	 *     len = the maximum length of @p
	 *     c = a Unicode character
	 *
	 * Returns: %NULL if the string does not contain the character,
	 *     otherwise, a pointer to the start of the rightmost occurrence
	 *     of the character in the string.
	 */
	public static string utf8Strrchr(string p, ptrdiff_t len, dchar c)
	{
		auto haystack = Str.toStringz(p);

		return Str.toString(g_utf8_strrchr(haystack, len, c));
	}

	/**
	 * Reverses a UTF-8 string. @str must be valid UTF-8 encoded text.
	 * (Use g_utf8_validate() on all text before trying to use UTF-8
	 * utility functions with it.)
	 *
	 * This function is intended for programmatic uses of reversed strings.
	 * It pays no attention to decomposed characters, combining marks, byte
	 * order marks, directional indicators (LRM, LRO, etc) and similar
	 * characters which might need special handling when reversing a string
	 * for display purposes.
	 *
	 * Note that unlike g_strreverse(), this function returns
	 * newly-allocated memory, which should be freed with g_free() when
	 * no longer needed.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = the maximum length of @str to use, in bytes.
* If @len < 0,
	 *     then the string is nul-terminated.
	 *
	 * Returns: a newly-allocated string which is the reverse of @str
	 *
	 * Since: 2.2
	 */
	public static string utf8Strreverse(string str, ptrdiff_t len)
	{
		auto reversed = g_utf8_strreverse(Str.toStringz(str), len);

		// The GLib buffer is released once the D string has been produced.
		scope(exit) Str.freeString(reversed);
		return Str.toString(reversed);
	}

	/**
	 * Converts all Unicode characters in the string that have a case
	 * to uppercase. The exact manner that this is done depends
	 * on the current locale, and may result in the number of
	 * characters in the string increasing. (For instance, the
	 * German ess-zet will be changed to SS.)
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *
	 * Returns: the uppercase text as a D string. The buffer returned by
	 *     g_utf8_strup() is released with Str.freeString() on exit.
	 */
	public static string utf8Strup(string str, ptrdiff_t len)
	{
		auto raised = g_utf8_strup(Str.toStringz(str), len);

		// The GLib buffer is released once the D string has been produced.
		scope(exit) Str.freeString(raised);
		return Str.toString(raised);
	}

	/**
	 * Copies a substring out of a UTF-8 encoded string.
	 * The substring will contain @end_pos - @start_pos characters.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     startPos = a character offset within @str
	 *     endPos = another character offset within @str
	 *
	 * Returns: the requested substring as a D string. The buffer returned
	 *     by g_utf8_substring() is released with Str.freeString() on exit.
	 *
	 * Since: 2.30
	 */
	public static string utf8Substring(string str, glong startPos, glong endPos)
	{
		auto piece = g_utf8_substring(Str.toStringz(str), startPos, endPos);

		// The GLib buffer is released once the D string has been produced.
		scope(exit) Str.freeString(piece);
		return Str.toString(piece);
	}

	/**
	 * Convert a string from UTF-8 to a 32-bit fixed width
	 * representation as UCS-4.
* A trailing 0 character will be added to the
	 * string after the converted text.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = the maximum length of @str to use, in bytes. If @len < 0,
	 *         then the string is nul-terminated.
	 *     itemsRead = location to store number of
	 *         bytes read, or %NULL.
	 *         If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
	 *         returned in case @str contains a trailing partial
	 *         character. If an error occurs then the index of the
	 *         invalid input is stored here.
	 *     itemsWritten = location to store number
	 *         of characters written or %NULL. The value here stored does not include
	 *         the trailing 0 character.
	 *
	 * Returns: a pointer to a newly allocated UCS-4 string.
	 *     This value must be freed with g_free(). If an error occurs,
	 *     %NULL will be returned and @error set.
	 *
	 * Throws: GException on failure.
	 */
	public static dchar* utf8ToUcs4(string str, glong len, out glong itemsRead, out glong itemsWritten)
	{
		GError* failure = null;

		auto converted = g_utf8_to_ucs4(Str.toStringz(str), len, &itemsRead, &itemsWritten, &failure);

		// GLib reports problems through the GError out-parameter; surface
		// them to D callers as an exception.
		if ( failure is null )
		{
			return converted;
		}

		throw new GException( new ErrorG(failure) );
	}

	/**
	 * Convert a string from UTF-8 to a 32-bit fixed width
	 * representation as UCS-4, assuming valid UTF-8 input.
	 * This function is roughly twice as fast as g_utf8_to_ucs4()
	 * but does no error checking on the input. A trailing 0 character
	 * will be added to the string after the converted text.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = the maximum length of @str to use, in bytes. If @len < 0,
	 *         then the string is nul-terminated.
	 *     itemsWritten = location to store the
	 *         number of characters in the result, or %NULL.
	 *
	 * Returns: a pointer to a newly allocated UCS-4 string.
	 *     This value must be freed with g_free().
*/
	public static dchar* utf8ToUcs4Fast(string str, glong len, out glong itemsWritten)
	{
		auto input = Str.toStringz(str);

		return g_utf8_to_ucs4_fast(input, len, &itemsWritten);
	}

	/**
	 * Convert a string from UTF-8 to UTF-16. A 0 character will be
	 * added to the result after the converted text.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = the maximum length (number of bytes) of @str to use.
	 *         If @len < 0, then the string is nul-terminated.
	 *     itemsRead = location to store number of
	 *         bytes read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
	 *         be returned in case @str contains a trailing partial character. If
	 *         an error occurs then the index of the invalid input is stored here.
	 *     itemsWritten = location to store number
	 *         of #gunichar2 written, or %NULL. The value stored here does not include
	 *         the trailing 0.
	 *
	 * Returns: a pointer to a newly allocated UTF-16 string.
	 *     This value must be freed with g_free(). If an error occurs,
	 *     %NULL will be returned and @error set.
	 *
	 * Throws: GException on failure.
	 */
	public static wchar* utf8ToUtf16(string str, glong len, out glong itemsRead, out glong itemsWritten)
	{
		GError* failure = null;

		auto converted = g_utf8_to_utf16(Str.toStringz(str), len, &itemsRead, &itemsWritten, &failure);

		// GLib reports problems through the GError out-parameter; surface
		// them to D callers as an exception.
		if ( failure is null )
		{
			return converted;
		}

		throw new GException( new ErrorG(failure) );
	}

	/**
	 * Validates UTF-8 encoded text. @str is the text to validate;
	 * if @str is nul-terminated, then @max_len can be -1, otherwise
	 * @max_len should be the number of bytes to validate.
	 * If @end is non-%NULL, then the end of the valid range
	 * will be stored there (i.e. the start of the first invalid
	 * character if some bytes were invalid, or the end of the text
	 * being validated otherwise).
*
	 * Note that g_utf8_validate() returns %FALSE if @max_len is
	 * positive and any of the @max_len bytes are nul.
	 *
	 * Returns %TRUE if all of @str was valid. Many GLib and GTK+
	 * routines require valid UTF-8 as input; so data read from a file
	 * or the network should be checked with g_utf8_validate() before
	 * doing anything else with it.
	 *
	 * This wrapper always passes the byte length of @str as @max_len.
	 *
	 * Params:
	 *     str = a pointer to character data
	 *     end = return location for end of valid data
	 *
	 * Returns: %TRUE if the text was valid UTF-8
	 */
	public static bool utf8Validate(string str, out string end)
	{
		char* validEnd = null;
		auto byteCount = cast(ptrdiff_t)str.length;

		auto valid = g_utf8_validate(Str.toStringz(str), byteCount, &validEnd) != 0;

		// Hand the end-of-valid-data position back as a D string.
		end = Str.toString(validEnd);

		return valid;
	}

	/**
	 * If the provided string is valid UTF-8, return a copy of it. If not,
	 * return a copy in which bytes that could not be interpreted as valid Unicode
	 * are replaced with the Unicode replacement character (U+FFFD).
	 *
	 * For example, this is an appropriate function to use if you have received
	 * a string that was incorrectly declared to be UTF-8, and you need a valid
	 * UTF-8 version of it that can be logged or displayed to the user, with the
	 * assumption that it is close enough to ASCII or UTF-8 to be mostly
	 * readable as-is.
	 *
	 * Params:
	 *     str = string to coerce into UTF-8
	 *     len = the maximum length of @str to use, in bytes. If @len < 0,
	 *         then the string is nul-terminated.
	 *
	 * Returns: a valid UTF-8 string whose content resembles @str. The
	 *     buffer returned by g_utf8_make_valid() is released with
	 *     Str.freeString() on exit.
	 *
	 * Since: 2.52
	 */
	public static string utf8MakeValid(string str, ptrdiff_t len)
	{
		auto repaired = g_utf8_make_valid(Str.toStringz(str), len);

		// The GLib buffer is released once the D string has been produced.
		scope(exit) Str.freeString(repaired);
		return Str.toString(repaired);
	}

	/**
	 * Validates UTF-8 encoded text.
*
	 * As with g_utf8_validate(), but @max_len must be set, and hence this function
	 * will always return %FALSE if any of the bytes of @str are nul.
	 *
	 * This wrapper always passes the byte length of @str as @max_len.
	 *
	 * Params:
	 *     str = a pointer to character data
	 *     end = return location for end of valid data
	 *
	 * Returns: %TRUE if the text was valid UTF-8
	 *
	 * Since: 2.60
	 */
	public static bool utf8ValidateLen(string str, out string end)
	{
		char* validEnd = null;
		auto byteCount = cast(size_t)str.length;

		auto valid = g_utf8_validate_len(Str.toStringz(str), byteCount, &validEnd) != 0;

		// Hand the end-of-valid-data position back as a D string.
		end = Str.toString(validEnd);

		return valid;
	}
}