1 /* 2 * This file is part of gtkD. 3 * 4 * gtkD is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU Lesser General Public License 6 * as published by the Free Software Foundation; either version 3 7 * of the License, or (at your option) any later version, with 8 * some exceptions, please read the COPYING file. 9 * 10 * gtkD is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU Lesser General Public License for more details. 14 * 15 * You should have received a copy of the GNU Lesser General Public License 16 * along with gtkD; if not, write to the Free Software 17 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110, USA 18 */ 19 20 // generated automatically - do not change 21 // find conversion definition on APILookup.txt 22 // implement new conversion functionalities on the wrap.utils pakage 23 24 25 module glib.Unicode; 26 27 private import glib.ErrorG; 28 private import glib.GException; 29 private import glib.Str; 30 private import glib.c.functions; 31 public import glib.c.types; 32 33 34 /** */ 35 public struct Unicode 36 { 37 38 /** 39 * Convert a string from UCS-4 to UTF-16. A 0 character will be 40 * added to the result after the converted text. 41 * 42 * Params: 43 * str = a UCS-4 encoded string 44 * len = the maximum length (number of characters) of @str to use. 45 * If @len < 0, then the string is nul-terminated. 46 * itemsRead = location to store number of 47 * bytes read, or %NULL. If an error occurs then the index of the invalid 48 * input is stored here. 49 * itemsWritten = location to store number 50 * of #gunichar2 written, or %NULL. The value stored here does not include 51 * the trailing 0. 52 * 53 * Returns: a pointer to a newly allocated UTF-16 string. 54 * This value must be freed with g_free(). 
If an error occurs,
	 * %NULL will be returned and @error set.
	 *
	 * Throws: GException on failure.
	 */
	public static wchar* ucs4ToUtf16(dchar* str, glong len, out glong itemsRead, out glong itemsWritten)
	{
		GError* gerr = null;

		// itemsRead and itemsWritten are `out` parameters, so both addresses handed to C are valid.
		auto utf16 = g_ucs4_to_utf16(str, len, &itemsRead, &itemsWritten, &gerr);

		if (gerr is null)
		{
			return utf16;
		}

		// The C call reported a failure through gerr; surface it as a D exception.
		throw new GException( new ErrorG(gerr) );
	}

	/**
	 * Converts UCS-4 text (32-bit fixed-width characters) to UTF-8 by calling
	 * g_ucs4_to_utf8(). The C result is terminated with a 0 byte.
	 *
	 * Params:
	 *     str = a UCS-4 encoded string
	 *     len = the maximum number of characters of str to use;
	 *         if len < 0, str is treated as nul-terminated
	 *     itemsRead = receives the number of characters read; on error,
	 *         the position of the first invalid input character
	 *     itemsWritten = receives the number of bytes written, not counting
	 *         the trailing 0 byte
	 *
	 * Returns: the converted text as a D string, produced by Str.toString();
	 *     the buffer returned by the C function is handed to Str.freeString()
	 *     when this function exits.
	 *
	 * Throws: GException if the C function sets an error.
	 */
	public static string ucs4ToUtf8(dchar* str, glong len, out glong itemsRead, out glong itemsWritten)
	{
		GError* gerr = null;
		auto utf8 = g_ucs4_to_utf8(str, len, &itemsRead, &itemsWritten, &gerr);

		if (gerr !is null)
		{
			throw new GException( new ErrorG(gerr) );
		}

		// Release the C buffer once the return value has been computed.
		scope(exit) Str.freeString(utf8);
		return Str.toString(utf8);
	}

	/**
	 * Determines the break type of @c. @c should be a Unicode character
	 * (to derive a character from UTF-8 encoded text, use
	 * g_utf8_get_char()).
The break type is used to find word and line
	 * breaks ("text boundaries"), Pango implements the Unicode boundary
	 * resolution algorithms and normally you would use a function such
	 * as pango_break() instead of caring about break types yourself.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: the break type of @c
	 */
	public static GUnicodeBreakType unicharBreakType(dchar c)
	{
		auto breakKind = g_unichar_break_type(c);
		return breakKind;
	}

	/**
	 * Looks up the canonical combining class of a Unicode character
	 * via g_unichar_combining_class().
	 *
	 * Params:
	 *     uc = a Unicode character
	 *
	 * Returns: the combining class of the character
	 *
	 * Since: 2.14
	 */
	public static int unicharCombiningClass(dchar uc)
	{
		auto combiningClass = g_unichar_combining_class(uc);
		return combiningClass;
	}

	/**
	 * Runs one composition step of the Unicode canonical composition
	 * algorithm on the pair a, b.
	 *
	 * Algorithmic Hangul Jamo composition is included, but the function is
	 * not exactly the inverse of g_unichar_decompose(). No composition can
	 * have either a or b equal to zero. Composition happens if and only if
	 * a Primary Composite exists that is canonically equivalent to the
	 * sequence a, b (see the Unicode Standard for the definition of
	 * Primary Composite).
	 *
	 * When a and b do not compose to a new character, ch is set to zero.
	 *
	 * See
	 * [UAX#15](http://unicode.org/reports/tr15/)
	 * for details.
	 *
	 * Params:
	 *     a = a Unicode character
	 *     b = a Unicode character
	 *     ch = receives the composed character
	 *
	 * Returns: true if the characters could be composed
	 *
	 * Since: 2.30
	 */
	public static bool unicharCompose(dchar a, dchar b, out dchar ch)
	{
		const composed = g_unichar_compose(a, b, &ch);
		return composed != 0;
	}

	/**
	 * Performs a single decomposition step of the
	 * Unicode canonical decomposition algorithm.
*
	 * This function does not include compatibility
	 * decompositions. It does, however, include algorithmic
	 * Hangul Jamo decomposition, as well as 'singleton'
	 * decompositions which replace a character by a single
	 * other character. In the case of singletons *@b will
	 * be set to zero.
	 *
	 * If @ch is not decomposable, *@a is set to @ch and *@b
	 * is set to zero.
	 *
	 * Note that the way Unicode decomposition pairs are
	 * defined, it is guaranteed that @b would not decompose
	 * further, but @a may itself decompose. To get the full
	 * canonical decomposition for @ch, one would need to
	 * recursively call this function on @a. Or use
	 * g_unichar_fully_decompose().
	 *
	 * See
	 * [UAX#15](http://unicode.org/reports/tr15/)
	 * for details.
	 *
	 * Params:
	 *     ch = a Unicode character
	 *     a = return location for the first component of @ch
	 *     b = return location for the second component of @ch
	 *
	 * Returns: %TRUE if the character could be decomposed
	 *
	 * Since: 2.30
	 */
	public static bool unicharDecompose(dchar ch, out dchar a, out dchar b)
	{
		const decomposed = g_unichar_decompose(ch, &a, &b);
		return decomposed != 0;
	}

	/**
	 * Gives the numeric value of a character when it is read as a
	 * decimal digit.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: the numeric value of c if it is a decimal digit
	 *     (according to g_unichar_isdigit()); otherwise -1.
	 */
	public static int unicharDigitValue(dchar c)
	{
		auto digit = g_unichar_digit_value(c);
		return digit;
	}

	/**
	 * Computes the canonical or compatibility decomposition of a
	 * Unicode character. For compatibility decomposition,
	 * pass %TRUE for @compat; for canonical decomposition
	 * pass %FALSE for @compat.
	 *
	 * The decomposed sequence is placed in @result. Only up to
	 * @result_len characters are written into @result.
The length
	 * of the full decomposition (irrespective of @result_len) is
	 * returned by the function. For canonical decomposition,
	 * currently all decompositions are of length at most 4, but
	 * this may change in the future (very unlikely though).
	 * At any rate, Unicode does guarantee that a buffer of length
	 * 18 is always enough for both compatibility and canonical
	 * decompositions, so that is the size recommended. This is provided
	 * as %G_UNICHAR_MAX_DECOMPOSITION_LENGTH.
	 *
	 * Note: in this wrapper @result is a single `out dchar`, so it can
	 * hold exactly one character. Values of @resultLen greater than 1 are
	 * therefore clamped to 1 before calling the C function; the returned
	 * length is still that of the full decomposition.
	 *
	 * See
	 * [UAX#15](http://unicode.org/reports/tr15/)
	 * for details.
	 *
	 * Params:
	 *     ch = a Unicode character.
	 *     compat = true to perform compatibility decomposition,
	 *         false to perform canonical decomposition
	 *     result = receives the first character of the decomposition
	 *         (when @resultLen is at least 1)
	 *     resultLen = number of characters that may be stored in @result;
	 *         anything above 1 is treated as 1
	 *
	 * Returns: the length of the full decomposition.
	 *
	 * Since: 2.30
	 */
	public static size_t unicharFullyDecompose(dchar ch, bool compat, out dchar result, size_t resultLen)
	{
		// `result` is storage for one dchar only. Forwarding a larger resultLen would let
		// the C function write past it, so cap the advertised capacity at one element.
		const size_t capacity = resultLen > 1 ? 1 : resultLen;

		return g_unichar_fully_decompose(ch, compat, &result, capacity);
	}

	/**
	 * In Unicode, some characters are "mirrored". This means that their
	 * images are mirrored horizontally in text that is laid out from right
	 * to left. For instance, "(" would become its mirror image, ")", in
	 * right-to-left text.
	 *
	 * If @ch has the Unicode mirrored property and there is another unicode
	 * character that typically has a glyph that is the mirror image of @ch's
	 * glyph and @mirrored_ch is set, it puts that character in the address
	 * pointed to by @mirrored_ch. Otherwise the original character is put.
*
	 * Params:
	 *     ch = a Unicode character
	 *     mirroredCh = location to store the mirrored character
	 *
	 * Returns: %TRUE if @ch has a mirrored character, %FALSE otherwise
	 *
	 * Since: 2.4
	 */
	public static bool unicharGetMirrorChar(dchar ch, dchar* mirroredCh)
	{
		const hasMirror = g_unichar_get_mirror_char(ch, mirroredCh);
		return hasMirror != 0;
	}

	/**
	 * Finds the #GUnicodeScript of a character, as defined by Unicode
	 * Standard Annex \#24. The character is not checked for validity;
	 * passing an invalid character gives an undefined result.
	 *
	 * This function is equivalent to pango_script_for_unichar() and the
	 * two are interchangeable.
	 *
	 * Params:
	 *     ch = a Unicode character
	 *
	 * Returns: the #GUnicodeScript for the character.
	 *
	 * Since: 2.14
	 */
	public static GUnicodeScript unicharGetScript(dchar ch)
	{
		auto script = g_unichar_get_script(ch);
		return script;
	}

	/**
	 * Tests whether a character is alphanumeric.
	 * To get a character value from UTF-8 text, use g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: true if c is an alphanumeric character
	 */
	public static bool unicharIsalnum(dchar c)
	{
		const alnum = g_unichar_isalnum(c);
		return alnum != 0;
	}

	/**
	 * Tests whether a character is alphabetic (i.e. a letter).
	 * To get a character value from UTF-8 text, use g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: true if c is an alphabetic character
	 */
	public static bool unicharIsalpha(dchar c)
	{
		const alpha = g_unichar_isalpha(c);
		return alpha != 0;
	}

	/**
	 * Determines whether a character is a control character.
	 * Given some UTF-8 text, obtain a character value with
	 * g_utf8_get_char().
*
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: %TRUE if @c is a control character
	 */
	public static bool unicharIscntrl(dchar c)
	{
		const control = g_unichar_iscntrl(c);
		return control != 0;
	}

	/**
	 * Tests whether a character is assigned in the Unicode standard.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: true if the character has an assigned value
	 */
	public static bool unicharIsdefined(dchar c)
	{
		const assigned = g_unichar_isdefined(c);
		return assigned != 0;
	}

	/**
	 * Tests whether a character is numeric (i.e. a digit). This covers
	 * ASCII 0-9 as well as digits in other languages/scripts.
	 * To get a character value from UTF-8 text, use g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: true if c is a digit
	 */
	public static bool unicharIsdigit(dchar c)
	{
		const digit = g_unichar_isdigit(c);
		return digit != 0;
	}

	/**
	 * Tests whether a character is printable and not a space. The result
	 * is false for control characters, format characters, and spaces;
	 * g_unichar_isprint() is similar, but returns true for spaces.
	 * To get a character value from UTF-8 text, use g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: true if c is printable unless it's a space
	 */
	public static bool unicharIsgraph(dchar c)
	{
		const graphic = g_unichar_isgraph(c);
		return graphic != 0;
	}

	/**
	 * Tests whether a character is a lowercase letter.
	 * To get a character value from UTF-8 text, use g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: true if c is a lowercase letter
	 */
	public static bool unicharIslower(dchar c)
	{
		const lower = g_unichar_islower(c);
		return lower != 0;
	}

	/**
	 * Determines whether a character is a mark (non-spacing mark,
	 * combining mark, or enclosing mark in Unicode speak).
* Given some UTF-8 text, obtain a character value
	 * with g_utf8_get_char().
	 *
	 * Note: in most cases where isalpha characters are allowed,
	 * ismark characters should be allowed too as they are essential
	 * for writing most European languages as well as many non-Latin
	 * scripts.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: %TRUE if @c is a mark character
	 *
	 * Since: 2.14
	 */
	public static bool unicharIsmark(dchar c)
	{
		const mark = g_unichar_ismark(c);
		return mark != 0;
	}

	/**
	 * Tests whether a character is printable.
	 * Unlike g_unichar_isgraph(), the result is true for spaces.
	 * To get a character value from UTF-8 text, use g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: true if c is printable
	 */
	public static bool unicharIsprint(dchar c)
	{
		const printable = g_unichar_isprint(c);
		return printable != 0;
	}

	/**
	 * Tests whether a character is punctuation or a symbol.
	 * To get a character value from UTF-8 text, use g_utf8_get_char().
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: true if c is a punctuation or symbol character
	 */
	public static bool unicharIspunct(dchar c)
	{
		const punctuation = g_unichar_ispunct(c);
		return punctuation != 0;
	}

	/**
	 * Tests whether a character is a space, tab, or line separator
	 * (newline, carriage return, etc.).
	 * To get a character value from UTF-8 text, use g_utf8_get_char().
	 *
	 * (Note: don't use this to do word breaking; you have to use
	 * Pango or equivalent to get word breaking right, the algorithm
	 * is fairly complex.)
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: true if c is a space character
	 */
	public static bool unicharIsspace(dchar c)
	{
		const space = g_unichar_isspace(c);
		return space != 0;
	}

	/**
	 * Determines if a character is titlecase.
Some characters in
	 * Unicode which are composites, such as the DZ digraph
	 * have three case variants instead of just two. The titlecase
	 * form is used at the beginning of a word where only the
	 * first letter is capitalized. The titlecase form of the DZ
	 * digraph is U+01F2 LATIN CAPITAL LETTER D WITH SMALL LETTER Z.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: %TRUE if the character is titlecase
	 */
	public static bool unicharIstitle(dchar c)
	{
		const title = g_unichar_istitle(c);
		return title != 0;
	}

	/**
	 * Tests whether a character is uppercase.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: true if c is an uppercase character
	 */
	public static bool unicharIsupper(dchar c)
	{
		const upper = g_unichar_isupper(c);
		return upper != 0;
	}

	/**
	 * Tests whether a character is typically rendered in a double-width
	 * cell.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: true if the character is wide
	 */
	public static bool unicharIswide(dchar c)
	{
		const wide = g_unichar_iswide(c);
		return wide != 0;
	}

	/**
	 * Determines if a character is typically rendered in a double-width
	 * cell under legacy East Asian locales. If a character is wide according to
	 * g_unichar_iswide(), then it is also reported wide with this function, but
	 * the converse is not necessarily true. See the
	 * [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
	 * for details.
	 *
	 * If a character passes the g_unichar_iswide() test then it will also pass
	 * this test, but not the other way around. Note that some characters may
	 * pass both this test and g_unichar_iszerowidth().
*
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: %TRUE if the character is wide in legacy East Asian locales
	 *
	 * Since: 2.12
	 */
	public static bool unicharIswideCjk(dchar c)
	{
		return g_unichar_iswide_cjk(c) != 0;
	}

	/**
	 * Determines if a character is a hexadecimal digit.
	 *
	 * Params:
	 *     c = a Unicode character.
	 *
	 * Returns: %TRUE if the character is a hexadecimal digit
	 */
	public static bool unicharIsxdigit(dchar c)
	{
		return g_unichar_isxdigit(c) != 0;
	}

	/**
	 * Determines if a given character typically takes zero width when rendered.
	 * The return value is %TRUE for all non-spacing and enclosing marks
	 * (e.g., combining accents), format characters, zero-width
	 * space, but not U+00AD SOFT HYPHEN.
	 *
	 * A typical use of this function is with one of g_unichar_iswide() or
	 * g_unichar_iswide_cjk() to determine the number of cells a string occupies
	 * when displayed on a grid display (terminals). However, note that not all
	 * terminals support zero-width rendering of zero-width marks.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: %TRUE if the character has zero width
	 *
	 * Since: 2.14
	 */
	public static bool unicharIszerowidth(dchar c)
	{
		return g_unichar_iszerowidth(c) != 0;
	}

	/**
	 * Converts a single character to UTF-8.
	 *
	 * Because outbuf is an `out` parameter it is reset to an empty array on
	 * entry, so this wrapper allocates the 6 bytes that g_unichar_to_utf8()
	 * requires, lets the C function fill them, and then hands back exactly
	 * the bytes that were written.
	 *
	 * Params:
	 *     c = a Unicode character code
	 *     outbuf = receives a newly allocated array holding the UTF-8
	 *         encoding of c; its length equals the return value
	 *
	 * Returns: number of bytes written
	 */
	public static int unicharToUtf8(dchar c, out char[] outbuf)
	{
		// Passing outbuf.ptr directly would always pass null (outbuf was just reset),
		// which makes the C function compute the length without writing anything.
		char[] buffer = new char[6];

		auto written = g_unichar_to_utf8(c, buffer.ptr);

		// Expose only the bytes the C function actually produced.
		outbuf = buffer[0 .. written];
		return written;
	}

	/**
	 * Converts a character to lower case.
	 *
	 * Params:
	 *     c = a Unicode character.
	 *
	 * Returns: the result of converting @c to lower case.
* If @c is not an uppercase or titlecase character,
	 * or has no lowercase equivalent @c is returned unchanged.
	 */
	public static dchar unicharTolower(dchar c)
	{
		auto lowered = g_unichar_tolower(c);
		return lowered;
	}

	/**
	 * Maps a character to its titlecase form.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: the titlecase form of c. If c is not an uppercase or
	 *     lowercase character, c is returned unchanged.
	 */
	public static dchar unicharTotitle(dchar c)
	{
		auto titled = g_unichar_totitle(c);
		return titled;
	}

	/**
	 * Maps a character to its uppercase form.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: the uppercase form of c. If c is not a lowercase or
	 *     titlecase character, or has no upper case equivalent,
	 *     c is returned unchanged.
	 */
	public static dchar unicharToupper(dchar c)
	{
		auto raised = g_unichar_toupper(c);
		return raised;
	}

	/**
	 * Classifies a Unicode character by type.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: the type of the character.
	 */
	public static GUnicodeType unicharType(dchar c)
	{
		auto category = g_unichar_type(c);
		return category;
	}

	/**
	 * Tests whether ch is a valid Unicode character. Some possible
	 * integer values of ch are not valid. 0 counts as a valid
	 * character, though it's normally a string terminator.
	 *
	 * Params:
	 *     ch = a Unicode character
	 *
	 * Returns: true if ch is a valid Unicode character
	 */
	public static bool unicharValidate(dchar ch)
	{
		const valid = g_unichar_validate(ch);
		return valid != 0;
	}

	/**
	 * Determines the numeric value of a character as a hexadecimal
	 * digit.
	 *
	 * Params:
	 *     c = a Unicode character
	 *
	 * Returns: If @c is a hex digit (according to
	 * g_unichar_isxdigit()), its numeric value. Otherwise, -1.
*/
	public static int unicharXdigitValue(dchar c)
	{
		auto hexValue = g_unichar_xdigit_value(c);
		return hexValue;
	}

	/**
	 * Computes the canonical decomposition of a Unicode character.
	 *
	 * Deprecated: Use the more flexible g_unichar_fully_decompose()
	 * instead.
	 *
	 * Params:
	 *     ch = a Unicode character.
	 *     resultLen = location to store the length of the return value.
	 *
	 * Returns: a newly allocated string of Unicode characters;
	 *     the length of that string is stored through resultLen.
	 */
	public static dchar* unicodeCanonicalDecomposition(dchar ch, size_t* resultLen)
	{
		auto decomposition = g_unicode_canonical_decomposition(ch, resultLen);
		return decomposition;
	}

	/**
	 * Puts a string into canonical ordering, in place.
	 * Decomposed characters in the string are rearranged
	 * according to their combining classes. See the Unicode
	 * manual for more information.
	 *
	 * Params:
	 *     string_ = a UCS-4 encoded string.
	 *     len = the maximum length of string_ to use.
	 */
	public static void unicodeCanonicalOrdering(dchar* string_, size_t len)
	{
		// The C function reorders the characters of string_ directly; nothing is returned.
		g_unicode_canonical_ordering(string_, len);
	}

	/**
	 * Looks up the Unicode script for @iso15924. ISO 15924 assigns four-letter
	 * codes to scripts. For example, the code for Arabic is 'Arab'.
	 * This function accepts four letter codes encoded as a @guint32 in a
	 * big-endian fashion. That is, the code expected for Arabic is
	 * 0x41726162 (0x41 is ASCII code for 'A', 0x72 is ASCII code for 'r', etc).
	 *
	 * See
	 * [Codes for the representation of names of scripts](http://unicode.org/iso15924/codelists.html)
	 * for details.
	 *
	 * Params:
	 *     iso15924 = a Unicode script
	 *
	 * Returns: the Unicode script for @iso15924, or
	 * %G_UNICODE_SCRIPT_INVALID_CODE if @iso15924 is zero and
	 * %G_UNICODE_SCRIPT_UNKNOWN if @iso15924 is unknown.
*
	 * Since: 2.30
	 */
	public static GUnicodeScript unicodeScriptFromIso15924(uint iso15924)
	{
		auto script = g_unicode_script_from_iso15924(iso15924);
		return script;
	}

	/**
	 * Finds the ISO 15924 code for a script. ISO 15924 assigns four-letter
	 * codes to scripts; for example, the code for Arabic is 'Arab'. The
	 * four letters are packed into a 32-bit integer in big-endian fashion,
	 * so the code returned for Arabic is 0x41726162 (0x41 is ASCII code
	 * for 'A', 0x72 is ASCII code for 'r', etc).
	 *
	 * See
	 * [Codes for the representation of names of scripts](http://unicode.org/iso15924/codelists.html)
	 * for details.
	 *
	 * Params:
	 *     script = a Unicode script
	 *
	 * Returns: the ISO 15924 code for script, encoded as an integer,
	 *     or zero if script is %G_UNICODE_SCRIPT_INVALID_CODE, or the
	 *     ISO 15924 code 'Zzzz' (script code for UNKNOWN) if script is
	 *     not understood.
	 *
	 * Since: 2.30
	 */
	public static uint unicodeScriptToIso15924(GUnicodeScript script)
	{
		auto code = g_unicode_script_to_iso15924(script);
		return code;
	}

	/**
	 * Convert a string from UTF-16 to UCS-4. The result will be
	 * nul-terminated.
	 *
	 * Params:
	 *     str = a UTF-16 encoded string
	 *     len = the maximum length (number of #gunichar2) of @str to use.
	 *         If @len < 0, then the string is nul-terminated.
	 *     itemsRead = location to store number of
	 *         words read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
	 *         be returned in case @str contains a trailing partial character. If
	 *         an error occurs then the index of the invalid input is stored here.
	 *     itemsWritten = location to store number
	 *         of characters written, or %NULL. The value stored here does not include
	 *         the trailing 0 character.
	 *
	 * Returns: a pointer to a newly allocated UCS-4 string.
	 * This value must be freed with g_free().
If an error occurs,
	 * %NULL will be returned and @error set.
	 *
	 * Throws: GException on failure.
	 */
	public static dchar* utf16ToUcs4(wchar* str, glong len, out glong itemsRead, out glong itemsWritten)
	{
		GError* gerr = null;

		// Both counters are `out` parameters, so their addresses are always valid for the C call.
		auto ucs4 = g_utf16_to_ucs4(str, len, &itemsRead, &itemsWritten, &gerr);

		if (gerr is null)
		{
			return ucs4;
		}

		// The C call reported a failure through gerr; surface it as a D exception.
		throw new GException( new ErrorG(gerr) );
	}

	/**
	 * Convert a string from UTF-16 to UTF-8. The result will be
	 * terminated with a 0 byte.
	 *
	 * Note that the input is expected to be already in native endianness,
	 * an initial byte-order-mark character is not handled specially.
	 * g_convert() can be used to convert a byte buffer of UTF-16 data of
	 * ambiguous endianness.
	 *
	 * Further note that this function does not validate the result
	 * string; it may e.g. include embedded NUL characters. The only
	 * validation done by this function is to ensure that the input can
	 * be correctly interpreted as UTF-16, i.e. it doesn't contain
	 * unpaired surrogates or partial character sequences.
	 *
	 * Params:
	 *     str = a UTF-16 encoded string
	 *     len = the maximum length (number of #gunichar2) of @str to use.
	 *         If @len < 0, then the string is nul-terminated.
	 *     itemsRead = location to store number of
	 *         words read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
	 *         be returned in case @str contains a trailing partial character. If
	 *         an error occurs then the index of the invalid input is stored here.
	 *     itemsWritten = location to store number
	 *         of bytes written, or %NULL. The value stored here does not include the
	 *         trailing 0 byte.
	 *
	 * Returns: a pointer to a newly allocated UTF-8 string.
	 * This value must be freed with g_free(). If an error occurs,
	 * %NULL will be returned and @error set.
	 *
	 * Throws: GException on failure.
*/
	public static string utf16ToUtf8(wchar* str, glong len, out glong itemsRead, out glong itemsWritten)
	{
		GError* gerr = null;
		auto utf8 = g_utf16_to_utf8(str, len, &itemsRead, &itemsWritten, &gerr);

		if (gerr !is null)
		{
			throw new GException( new ErrorG(gerr) );
		}

		// Release the C buffer once the return value has been computed.
		scope(exit) Str.freeString(utf8);
		return Str.toString(utf8);
	}

	/**
	 * Produces a case-independent form of a string. The result does not
	 * correspond to any particular case, but it can be compared for
	 * equality, or ordered, against the results of calling
	 * g_utf8_casefold() on other strings.
	 *
	 * Calling g_utf8_casefold() followed by g_utf8_collate() only
	 * approximates the correct linguistic case-insensitive ordering,
	 * though it is a fairly good approximation. Getting it exactly right
	 * would need a more sophisticated collation function that takes case
	 * sensitivity into account; GLib does not currently provide one.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = length of str, in bytes, or -1 if str is nul-terminated.
	 *
	 * Returns: the case-independent form of str as a D string; the buffer
	 *     returned by the C function is handed to Str.freeString() when
	 *     this function exits.
	 */
	public static string utf8Casefold(string str, ptrdiff_t len)
	{
		auto folded = g_utf8_casefold(Str.toStringz(str), len);

		scope(exit) Str.freeString(folded);
		return Str.toString(folded);
	}

	/**
	 * Compares two strings for ordering using the linguistically
	 * correct rules for the [current locale][setlocale].
	 * When sorting a large number of strings, it will be significantly
	 * faster to obtain collation keys with g_utf8_collate_key() and
	 * compare the keys with strcmp() when sorting instead of sorting
	 * the original strings.
*
	 * Params:
	 *     str1 = a UTF-8 encoded string
	 *     str2 = a UTF-8 encoded string
	 *
	 * Returns: < 0 if @str1 compares before @str2,
	 * 0 if they compare equal, > 0 if @str1 compares after @str2.
	 */
	public static int utf8Collate(string str1, string str2)
	{
		auto lhs = Str.toStringz(str1);
		auto rhs = Str.toStringz(str2);

		return g_utf8_collate(lhs, rhs);
	}

	/**
	 * Builds a collation key for a string. Keys produced by this function
	 * can be compared with one another using strcmp().
	 *
	 * Comparing the collation keys of two strings with strcmp() always
	 * gives the same result as comparing the two original strings with
	 * g_utf8_collate().
	 *
	 * Note that this function depends on the [current locale][setlocale].
	 *
	 * Params:
	 *     str = a UTF-8 encoded string.
	 *     len = length of str, in bytes, or -1 if str is nul-terminated.
	 *
	 * Returns: the collation key as a D string; the buffer returned by
	 *     the C function is handed to Str.freeString() when this
	 *     function exits.
	 */
	public static string utf8CollateKey(string str, ptrdiff_t len)
	{
		auto key = g_utf8_collate_key(Str.toStringz(str), len);

		scope(exit) Str.freeString(key);
		return Str.toString(key);
	}

	/**
	 * Converts a string into a collation key that can be compared
	 * with other collation keys produced by the same function using strcmp().
	 *
	 * In order to sort filenames correctly, this function treats the dot '.'
	 * as a special case. Most dictionary orderings seem to consider it
	 * insignificant, thus producing the ordering "event.c" "eventgenerator.c"
	 * "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we
	 * would like to treat numbers intelligently so that "file1" "file10" "file5"
	 * is sorted as "file1" "file5" "file10".
	 *
	 * Note that this function depends on the [current locale][setlocale].
	 *
	 * Params:
	 *     str = a UTF-8 encoded string.
*     len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *
	 * Returns: a newly allocated string. This string should
	 * be freed with g_free() when you are done with it.
	 *
	 * Since: 2.8
	 */
	public static string utf8CollateKeyForFilename(string str, ptrdiff_t len)
	{
		auto key = g_utf8_collate_key_for_filename(Str.toStringz(str), len);

		scope(exit) Str.freeString(key);
		return Str.toString(key);
	}

	/**
	 * Locates the start of the next UTF-8 character in the string after p.
	 *
	 * p does not have to be at the beginning of a UTF-8 character. The
	 * character found is not checked for validity beyond starting with an
	 * appropriate byte.
	 *
	 * If end is %NULL, the C function never returns %NULL: when the end of
	 * the string is reached, a pointer to the terminating nul byte is
	 * returned. If end is non-%NULL, the C function returns %NULL when the
	 * end of the string is reached.
	 *
	 * Params:
	 *     p = a position within a UTF-8 encoded string
	 *     end = the byte following the end of the string,
	 *         or %NULL to indicate that the string is nul-terminated
	 *
	 * Returns: the result of g_utf8_find_next_char() converted with
	 *     Str.toString()
	 */
	public static string utf8FindNextChar(string p, string end)
	{
		// NOTE(review): p and end are converted by two separate Str.toStringz() calls.
		// If those yield unrelated buffers, the C function's comparison against `end`
		// cannot be meaningful — confirm against Str.toStringz before relying on `end`.
		auto start = Str.toStringz(p);
		auto limit = Str.toStringz(end);

		return Str.toString(g_utf8_find_next_char(start, limit));
	}

	/**
	 * Given a position @p with a UTF-8 encoded string @str, find the start
	 * of the previous UTF-8 character starting before @p. Returns %NULL if no
	 * UTF-8 characters are present in @str before @p.
	 *
	 * @p does not have to be at the beginning of a UTF-8 character. No check
	 * is made to see if the character found is actually valid other than
	 * it starts with an appropriate byte.
*
	 * NOTE(review): @str and @p are converted by two independent
	 * Str.toStringz() calls. If that helper copies its argument, @p no
	 * longer points inside @str when GLib sees it -- confirm.
	 *
	 * Params:
	 *     str = pointer to the beginning of a UTF-8 encoded string
	 *     p = pointer to some position within @str
	 *
	 * Returns: a pointer to the found character or %NULL.
	 */
	public static string utf8FindPrevChar(string str, string p)
	{
		auto cStart = Str.toStringz(str);
		auto cPos = Str.toStringz(p);

		return Str.toString(g_utf8_find_prev_char(cStart, cPos));
	}

	/**
	 * Decodes the UTF-8 byte sequence at @p into a Unicode character.
	 *
	 * The result is undefined when @p does not point to a valid UTF-8
	 * encoded character. Use g_utf8_get_char_validated() instead when the
	 * bytes are not known to be complete, valid Unicode characters.
	 *
	 * Params:
	 *     p = a pointer to Unicode character encoded as UTF-8
	 *
	 * Returns: the resulting character
	 */
	public static dchar utf8GetChar(string p)
	{
		auto cStr = Str.toStringz(p);

		return g_utf8_get_char(cStr);
	}

	/**
	 * Convert a sequence of bytes encoded as UTF-8 to a Unicode character.
	 * This function checks for incomplete characters, for invalid characters
	 * such as characters that are out of the range of Unicode, and for
	 * overlong encodings of valid characters.
	 *
	 * Note that g_utf8_get_char_validated() returns (gunichar)-2 if
	 * @max_len is positive and any of the bytes in the first UTF-8 character
	 * sequence are nul.
	 *
	 * Params:
	 *     p = a pointer to Unicode character encoded as UTF-8
	 *     maxLen = the maximum number of bytes to read, or -1 if @p is nul-terminated
	 *
	 * Returns: the resulting character. If @p points to a partial
	 *     sequence at the end of a string that could begin a valid
	 *     character (or if @max_len is zero), returns (gunichar)-2;
	 *     otherwise, if @p does not point to a valid UTF-8 encoded
	 *     Unicode character, returns (gunichar)-1.
*/
	public static dchar utf8GetCharValidated(string p, ptrdiff_t maxLen)
	{
		auto cStr = Str.toStringz(p);

		return g_utf8_get_char_validated(cStr, maxLen);
	}

	/**
	 * Brings a string into canonical form, settling questions such as
	 * whether an accented character is stored as a base character plus a
	 * combining accent or as one precomposed character. The input has to
	 * be valid UTF-8, otherwise %NULL is returned. g_utf8_normalize()
	 * should generally be called before two Unicode strings are compared.
	 *
	 * %G_NORMALIZE_DEFAULT only standardizes differences that do not
	 * affect the text content, such as the accent representation
	 * mentioned above. %G_NORMALIZE_ALL additionally standardizes the
	 * "compatibility" characters in Unicode, for example SUPERSCRIPT THREE
	 * becomes the standard form DIGIT THREE. Formatting information may be
	 * lost, but for most text operations such characters should be
	 * considered the same.
	 *
	 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE behave
	 * like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL, except that the
	 * result uses composed forms rather than a maximally decomposed form.
	 * This is often useful when the string is going to be converted to a
	 * legacy encoding or passed to a system with less capable Unicode
	 * handling.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string.
	 *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *     mode = the type of normalization to perform.
	 *
	 * Returns: a newly allocated string, that
	 *     is the normalized form of @str, or %NULL if @str
	 *     is not valid UTF-8.
*/
	public static string utf8Normalize(string str, ptrdiff_t len, GNormalizeMode mode)
	{
		auto cNormalized = g_utf8_normalize(Str.toStringz(str), len, mode);
		// The GLib buffer is released on every exit path, after it has been converted.
		scope(exit) Str.freeString(cNormalized);

		return Str.toString(cNormalized);
	}

	/**
	 * Turns an integer character offset into a pointer to the matching
	 * position within the string.
	 *
	 * Since 2.10 a negative @offset may be passed to step backwards.
	 * Stepping backwards from the end is usually only worthwhile when
	 * @offset lies in the last fourth of the string, because moving
	 * forward is about 3 times faster than moving backward.
	 *
	 * This function does not stop when it reaches the end of @str, so the
	 * caller has to make sure @offset is within the string boundaries
	 * (call g_utf8_strlen() when unsure). The limitation exists because
	 * the function is called frequently during text rendering and has to
	 * be as fast as possible.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     offset = a character offset within @str
	 *
	 * Returns: the resulting pointer, converted with Str.toString()
	 */
	public static string utf8OffsetToPointer(string str, glong offset)
	{
		auto cStr = Str.toStringz(str);

		return Str.toString(g_utf8_offset_to_pointer(cStr, offset));
	}

	/**
	 * Converts from a pointer to position within a string to an integer
	 * character offset.
	 *
	 * Since 2.10, this function allows @pos to be before @str, and returns
	 * a negative offset in this case.
*
	 * NOTE(review): @str and @pos are converted by two independent
	 * Str.toStringz() calls. If that helper copies its argument, the
	 * offset is computed between unrelated buffers -- confirm.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     pos = a pointer to a position within @str
	 *
	 * Returns: the resulting character offset
	 */
	public static glong utf8PointerToOffset(string str, string pos)
	{
		auto cStart = Str.toStringz(str);
		auto cPos = Str.toStringz(pos);

		return g_utf8_pointer_to_offset(cStart, cPos);
	}

	/**
	 * Locates the UTF-8 character that precedes @p in the string.
	 *
	 * @p may point into the middle of a character. The character that is
	 * found is not validated beyond checking that it starts with an
	 * appropriate byte. When @p might be the first character of the
	 * string, g_utf8_find_prev_char() must be used instead.
	 *
	 * NOTE(review): the pointer given to GLib is whatever
	 * Str.toStringz(p) returns; if that is the start of a buffer, stepping
	 * backwards leaves it -- confirm how callers are expected to use this.
	 *
	 * Params:
	 *     p = a pointer to a position within a UTF-8 encoded string
	 *
	 * Returns: a pointer to the found character
	 */
	public static string utf8PrevChar(string p)
	{
		auto cPos = Str.toStringz(p);

		return Str.toString(g_utf8_prev_char(cPos));
	}

	/**
	 * Searches a UTF-8 encoded string for the leftmost occurrence of a
	 * Unicode character, looking at no more than @len bytes. A @len of -1
	 * allows an unbounded search.
	 *
	 * Params:
	 *     p = a nul-terminated UTF-8 encoded string
	 *     len = the maximum length of @p
	 *     c = a Unicode character
	 *
	 * Returns: %NULL if the string does not contain the character,
	 *     otherwise, a pointer to the start of the leftmost occurrence
	 *     of the character in the string.
	 */
	public static string utf8Strchr(string p, ptrdiff_t len, dchar c)
	{
		auto cHaystack = Str.toStringz(p);

		return Str.toString(g_utf8_strchr(cHaystack, len, c));
	}

	/**
	 * Converts all Unicode characters in the string that have a case
	 * to lowercase. The exact manner that this is done depends
	 * on the current locale, and may result in the number of
	 * characters in the string changing.
*
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *
	 * Returns: the lowercase string, converted with Str.toString(). The
	 *     buffer returned by GLib is handed to Str.freeString() when the
	 *     function exits.
	 */
	public static string utf8Strdown(string str, ptrdiff_t len)
	{
		auto cLower = g_utf8_strdown(Str.toStringz(str), len);
		scope(exit) Str.freeString(cLower);

		return Str.toString(cLower);
	}

	/**
	 * Counts the characters in a UTF-8 string, excluding the terminating
	 * nul character. When the @max'th byte falls in the middle of a
	 * character, that last (partial) character is not counted.
	 *
	 * Params:
	 *     p = pointer to the start of a UTF-8 encoded string
	 *     max = the maximum number of bytes to examine. If @max
	 *         is less than 0, then the string is assumed to be
	 *         nul-terminated. If @max is 0, @p will not be examined and
	 *         may be %NULL. If @max is greater than 0, up to @max
	 *         bytes are examined
	 *
	 * Returns: the length of the string in characters
	 */
	public static glong utf8Strlen(string p, ptrdiff_t max)
	{
		auto cStr = Str.toStringz(p);

		return g_utf8_strlen(cStr, max);
	}

	/**
	 * Like the standard C strncpy() function, but copies a given number
	 * of characters instead of a given number of bytes. The @src string
	 * must be valid UTF-8 encoded text. (Use g_utf8_validate() on all
	 * text before trying to use UTF-8 utility functions with it.)
*
	 * Note you must ensure @dest is at least 4 * @n to fit the
	 * largest possible UTF-8 characters
	 *
	 * NOTE(review): @dest is an immutable D string that reaches GLib
	 * through Str.toStringz(). Whether GLib ends up writing into the
	 * caller's storage or into a temporary is not visible here -- confirm
	 * before relying on @dest being modified; use the return value.
	 *
	 * Params:
	 *     dest = buffer to fill with characters from @src
	 *     src = UTF-8 encoded string
	 *     n = character count
	 *
	 * Returns: @dest, as seen by GLib, converted with Str.toString()
	 */
	public static string utf8Strncpy(string dest, string src, size_t n)
	{
		auto cDest = Str.toStringz(dest);
		auto cSrc = Str.toStringz(src);

		return Str.toString(g_utf8_strncpy(cDest, cSrc, n));
	}

	/**
	 * Searches a UTF-8 encoded string for the rightmost occurrence of a
	 * Unicode character, looking at no more than @len bytes. A @len of -1
	 * allows an unbounded search.
	 *
	 * Params:
	 *     p = a nul-terminated UTF-8 encoded string
	 *     len = the maximum length of @p
	 *     c = a Unicode character
	 *
	 * Returns: %NULL if the string does not contain the character,
	 *     otherwise, a pointer to the start of the rightmost occurrence
	 *     of the character in the string.
	 */
	public static string utf8Strrchr(string p, ptrdiff_t len, dchar c)
	{
		auto cHaystack = Str.toStringz(p);

		return Str.toString(g_utf8_strrchr(cHaystack, len, c));
	}

	/**
	 * Reverses a UTF-8 string. @str must be valid UTF-8 encoded text.
	 * (Use g_utf8_validate() on all text before trying to use UTF-8
	 * utility functions with it.)
	 *
	 * This function is intended for programmatic uses of reversed strings.
	 * It pays no attention to decomposed characters, combining marks, byte
	 * order marks, directional indicators (LRM, LRO, etc) and similar
	 * characters which might need special handling when reversing a string
	 * for display purposes.
	 *
	 * Note that unlike g_strreverse(), this function returns
	 * newly-allocated memory, which should be freed with g_free() when
	 * no longer needed.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = the maximum length of @str to use, in bytes.
*         If @len < 0,
	 *         then the string is nul-terminated.
	 *
	 * Returns: a newly-allocated string which is the reverse of @str
	 *
	 * Since: 2.2
	 */
	public static string utf8Strreverse(string str, ptrdiff_t len)
	{
		auto cReversed = g_utf8_strreverse(Str.toStringz(str), len);
		// The GLib buffer is released on every exit path, after it has been converted.
		scope(exit) Str.freeString(cReversed);

		return Str.toString(cReversed);
	}

	/**
	 * Uppercases every Unicode character in the string that has a case.
	 * How exactly this happens depends on the current locale, and the
	 * number of characters in the string may grow. (For instance, the
	 * German ess-zet will be changed to SS.)
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = length of @str, in bytes, or -1 if @str is nul-terminated.
	 *
	 * Returns: the uppercase string, converted with Str.toString(). The
	 *     buffer returned by GLib is handed to Str.freeString() when the
	 *     function exits.
	 */
	public static string utf8Strup(string str, ptrdiff_t len)
	{
		auto cUpper = g_utf8_strup(Str.toStringz(str), len);
		scope(exit) Str.freeString(cUpper);

		return Str.toString(cUpper);
	}

	/**
	 * Extracts a substring from a UTF-8 encoded string.
	 * The substring holds @end_pos - @start_pos characters.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     startPos = a character offset within @str
	 *     endPos = another character offset within @str
	 *
	 * Returns: the requested substring, converted with Str.toString().
	 *     The buffer returned by GLib is handed to Str.freeString() when
	 *     the function exits.
	 *
	 * Since: 2.30
	 */
	public static string utf8Substring(string str, glong startPos, glong endPos)
	{
		auto cSlice = g_utf8_substring(Str.toStringz(str), startPos, endPos);
		scope(exit) Str.freeString(cSlice);

		return Str.toString(cSlice);
	}

	/**
	 * Convert a string from UTF-8 to a 32-bit fixed width
	 * representation as UCS-4.
* A trailing 0 character will be added to the
	 * string after the converted text.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = the maximum length of @str to use, in bytes. If @len < 0,
	 *         then the string is nul-terminated.
	 *     itemsRead = receives the number of bytes read. If an error
	 *         occurs then the index of the invalid input is stored here.
	 *     itemsWritten = receives the number of characters written. The
	 *         value stored here does not include the trailing 0 character.
	 *
	 * Returns: a pointer to a newly allocated UCS-4 string.
	 *     This value must be freed with g_free().
	 *
	 * Throws: GException when GLib reports an error.
	 */
	public static dchar* utf8ToUcs4(string str, glong len, out glong itemsRead, out glong itemsWritten)
	{
		GError* gerr = null;
		auto cStr = Str.toStringz(str);

		auto ucs4 = g_utf8_to_ucs4(cStr, len, &itemsRead, &itemsWritten, &gerr);

		// Success path first: no error was set, hand the raw buffer back.
		if (gerr is null)
		{
			return ucs4;
		}

		throw new GException( new ErrorG(gerr) );
	}

	/**
	 * Converts UTF-8 text to a 32-bit fixed width UCS-4 representation
	 * under the assumption that the input is valid UTF-8.
	 * This function is roughly twice as fast as g_utf8_to_ucs4()
	 * but performs no error checking on the input. A trailing 0 character
	 * is added to the string after the converted text.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = the maximum length of @str to use, in bytes. If @len < 0,
	 *         then the string is nul-terminated.
	 *     itemsWritten = receives the number of characters in the result.
	 *
	 * Returns: a pointer to a newly allocated UCS-4 string.
	 *     This value must be freed with g_free().
*/
	public static dchar* utf8ToUcs4Fast(string str, glong len, out glong itemsWritten)
	{
		auto cStr = Str.toStringz(str);

		return g_utf8_to_ucs4_fast(cStr, len, &itemsWritten);
	}

	/**
	 * Converts UTF-8 text to UTF-16. A 0 character is appended to the
	 * result after the converted text.
	 *
	 * Params:
	 *     str = a UTF-8 encoded string
	 *     len = the maximum length (number of bytes) of @str to use.
	 *         If @len < 0, then the string is nul-terminated.
	 *     itemsRead = receives the number of bytes read. If an error
	 *         occurs then the index of the invalid input is stored here.
	 *     itemsWritten = receives the number of #gunichar2 written. The
	 *         value stored here does not include the trailing 0.
	 *
	 * Returns: a pointer to a newly allocated UTF-16 string.
	 *     This value must be freed with g_free().
	 *
	 * Throws: GException when GLib reports an error.
	 */
	public static wchar* utf8ToUtf16(string str, glong len, out glong itemsRead, out glong itemsWritten)
	{
		GError* gerr = null;
		auto cStr = Str.toStringz(str);

		auto utf16 = g_utf8_to_utf16(cStr, len, &itemsRead, &itemsWritten, &gerr);

		// Success path first: no error was set, hand the raw buffer back.
		if (gerr is null)
		{
			return utf16;
		}

		throw new GException( new ErrorG(gerr) );
	}

	/**
	 * Validates UTF-8 encoded text. @str is the text to validate;
	 * if @str is nul-terminated, then @max_len can be -1, otherwise
	 * @max_len should be the number of bytes to validate.
	 * If @end is non-%NULL, then the end of the valid range
	 * will be stored there (i.e. the start of the first invalid
	 * character if some bytes were invalid, or the end of the text
	 * being validated otherwise).
*
	 * Note that g_utf8_validate() returns %FALSE if @max_len is
	 * positive and any of the @max_len bytes are nul.
	 *
	 * Returns %TRUE if all of @str was valid. Many GLib and GTK+
	 * routines require valid UTF-8 as input; so data read from a file
	 * or the network should be checked with g_utf8_validate() before
	 * doing anything else with it.
	 *
	 * This wrapper passes the byte length of @str as @max_len.
	 *
	 * Params:
	 *     str = a pointer to character data
	 *     end = return location for end of valid data
	 *
	 * Returns: %TRUE if the text was valid UTF-8
	 */
	public static bool utf8Validate(string str, out string end)
	{
		char* validEnd = null;
		auto cStr = Str.toStringz(str);
		auto byteCount = cast(ptrdiff_t)str.length;

		bool isValid = g_utf8_validate(cStr, byteCount, &validEnd) != 0;

		// Publish the end-of-valid-data pointer as a D string.
		end = Str.toString(validEnd);

		return isValid;
	}

	/**
	 * Returns a copy of the given string when it is valid UTF-8. When it
	 * is not, the copy has every byte that could not be interpreted as
	 * valid Unicode replaced with the Unicode replacement character
	 * (U+FFFD).
	 *
	 * For example, this is an appropriate function to use if you have
	 * received a string that was incorrectly declared to be UTF-8, and you
	 * need a valid UTF-8 version of it that can be logged or displayed to
	 * the user, with the assumption that it is close enough to ASCII or
	 * UTF-8 to be mostly readable as-is.
	 *
	 * Params:
	 *     str = string to coerce into UTF-8
	 *     len = the maximum length of @str to use, in bytes. If @len < 0,
	 *         then the string is nul-terminated.
	 *
	 * Returns: a valid UTF-8 string whose content resembles @str
	 *
	 * Since: 2.52
	 */
	public static string utf8MakeValid(string str, ptrdiff_t len)
	{
		auto cValid = g_utf8_make_valid(Str.toStringz(str), len);
		// The GLib buffer is released on every exit path, after it has been converted.
		scope(exit) Str.freeString(cValid);

		return Str.toString(cValid);
	}

	/**
	 * Validates UTF-8 encoded text.
*
	 * As with g_utf8_validate(), but @max_len must be set, and hence this function
	 * will always return %FALSE if any of the bytes of @str are nul.
	 *
	 * This wrapper passes the byte length of @str as @max_len.
	 *
	 * Params:
	 *     str = a pointer to character data
	 *     end = return location for end of valid data
	 *
	 * Returns: %TRUE if the text was valid UTF-8
	 *
	 * Since: 2.60
	 */
	public static bool utf8ValidateLen(string str, out string end)
	{
		char* validEnd = null;
		auto cStr = Str.toStringz(str);
		auto byteCount = cast(size_t)str.length;

		bool isValid = g_utf8_validate_len(cStr, byteCount, &validEnd) != 0;

		// Publish the end-of-valid-data pointer as a D string.
		end = Str.toString(validEnd);

		return isValid;
	}
}