Unicode

This section describes a number of functions for dealing with Unicode characters and strings. There are analogues of the traditional ctype.h character classification and case conversion functions, UTF-8 analogues of some string utility functions, functions to perform normalization, case conversion and collation on UTF-8 strings and finally functions to convert between the UTF-8, UTF-16 and UCS-4 encodings of Unicode.

The implementations of the Unicode functions in GLib are based on the Unicode Character Data tables, which are available from www.unicode.org. GLib 2.8 supports Unicode 4.0, GLib 2.10 supports Unicode 4.1, GLib 2.12 supports Unicode 5.0, GLib 2.16.3 supports Unicode 5.1, GLib 2.30 supports Unicode 6.0.

class Unicode {

static int unicharValidate(gunichar ch);

static int unicharIsalnum(gunichar c);

static int unicharIsalpha(gunichar c);

static int unicharIscntrl(gunichar c);

static int unicharIsdefined(gunichar c);

static int unicharIsdigit(gunichar c);

static int unicharIsgraph(gunichar c);

static int unicharIslower(gunichar c);

static int unicharIsmark(gunichar c);

static int unicharIsprint(gunichar c);

static int unicharIspunct(gunichar c);

static int unicharIsspace(gunichar c);

static int unicharIstitle(gunichar c);

static int unicharIsupper(gunichar c);

static int unicharIsxdigit(gunichar c);

static int unicharIswide(gunichar c);

static int unicharIswideCjk(gunichar c);

static int unicharIszerowidth(gunichar c);

static gunichar unicharToupper(gunichar c);

static gunichar unicharTolower(gunichar c);

static gunichar unicharTotitle(gunichar c);

static int unicharDigitValue(gunichar c);

static int unicharXdigitValue(gunichar c);

static int unicharCompose(gunichar a, gunichar b, gunichar ch);

static int unicharDecompose(gunichar ch, gunichar a, gunichar b);

static gsize unicharFullyDecompose(gunichar ch, int compat, gunichar[] result);

static GUnicodeType unicharType(gunichar c);

static GUnicodeBreakType unicharBreakType(gunichar c);

static int unicharCombiningClass(gunichar uc);

static void unicodeCanonicalOrdering(gunichar[] string);

static gunichar[] unicodeCanonicalDecomposition(gunichar ch);

static int unicharGetMirrorChar(gunichar ch, gunichar mirroredCh);

static GUnicodeScript unicharGetScript(gunichar ch);

static GUnicodeScript unicodeScriptFromIso15924(uint iso15924);

static uint unicodeScriptToIso15924(GUnicodeScript script);

static gunichar utf8_GetChar(string p);

static gunichar utf8_GetCharValidated(string p);

static string utf8_OffsetToPointer(string str, glong offset);

static glong utf8_PointerToOffset(string str, string pos);

static string utf8_PrevChar(string p);

static string utf8_FindNextChar(string p, string end);

static string utf8_FindPrevChar(string str, string p);

static glong utf8_Strlen(string p);

static string utf8_Strncpy(string dest, string src, gsize n);

static string utf8_Strchr(string p, gssize len, gunichar c);

static string utf8_Strrchr(string p, gssize len, gunichar c);

static string utf8_Strreverse(string str);

static string utf8_Substring(string str, glong startPos, glong endPos);

static int utf8_Validate(string str, string end);

static string utf8_Strup(string str);

static string utf8_Strdown(string str);

static string utf8_Casefold(string str);

static string utf8_Normalize(string str, GNormalizeMode mode);

static int utf8_Collate(string str1, string str2);

static string utf8_CollateKey(string str);

static string utf8_CollateKeyForFilename(string str);

static gunichar2[] utf8_ToUtf16(string str, glong itemsRead);

static gunichar[] utf8_ToUcs4(string str, glong itemsRead);

static gunichar[] utf8_ToUcs4_Fast(string str);

static gunichar[] utf16_ToUcs4(gunichar2[] str, glong itemsRead);

static string utf16_ToUtf8(gunichar2[] str, glong itemsRead);

static gunichar2[] ucs4_ToUtf16(gunichar[] str, glong itemsRead);

static string ucs4_ToUtf8(gunichar[] str, glong itemsRead);

static int unicharToUtf8(gunichar c, string outbuf);

}

Members

Static functions

ucs4_ToUtf16 gunichar2[] ucs4_ToUtf16(gunichar[] str, glong itemsRead): Convert a string from UCS-4 to UTF-16. A 0 character will be added to the result after the converted text.
ucs4_ToUtf8 string ucs4_ToUtf8(gunichar[] str, glong itemsRead): Convert a string from a 32-bit fixed width representation as UCS-4. to UTF-8. The result will be terminated with a 0 byte.
unicharBreakType GUnicodeBreakType unicharBreakType(gunichar c): Determines the break type of c. c should be a Unicode character (to derive a character from UTF-8 encoded text, use g_utf8_get_char()). The break type is used to find word and line breaks ("text boundaries"), Pango implements the Unicode boundary resolution algorithms and normally you would use a function such as pango_break() instead of caring about break types yourself.
unicharCombiningClass int unicharCombiningClass(gunichar uc): Determines the canonical combining class of a Unicode character. Since 2.14
unicharCompose int unicharCompose(gunichar a, gunichar b, gunichar ch): Performs a single composition step of the Unicode canonical composition algorithm. This function includes algorithmic Hangul Jamo composition, but it is not exactly the inverse of g_unichar_decompose(). No composition can have either of a or b equal to zero. To be precise, this function composes if and only if there exists a Primary Composite P which is canonically equivalent to the sequence <a,b>. See the Unicode Standard for the definition of Primary Composite. If a and b do not compose a new character, ch is set to zero. See UAX#15 for details. Since 2.30
unicharDecompose int unicharDecompose(gunichar ch, gunichar a, gunichar b): Performs a single decomposition step of the Unicode canonical decomposition algorithm. This function does not include compatibility decompositions. It does, however, include algorithmic Hangul Jamo decomposition, as well as 'singleton' decompositions which replace a character by a single other character. In the case of singletons *b will be set to zero. If ch is not decomposable, *a is set to ch and *b is set to zero. Note that the way Unicode decomposition pairs are defined, it is guaranteed that b would not decompose further, but a may itself decompose. To get the full canonical decomposition for ch, one would need to recursively call this function on a. Or use g_unichar_fully_decompose(). See UAX#15 for details. Since 2.30
unicharDigitValue int unicharDigitValue(gunichar c): Determines the numeric value of a character as a decimal digit.
unicharFullyDecompose gsize unicharFullyDecompose(gunichar ch, int compat, gunichar[] result): Computes the canonical or compatibility decomposition of a Unicode character. For compatibility decomposition, pass TRUE for compat; for canonical decomposition pass FALSE for compat. The decomposed sequence is placed in result. Only up to result_len characters are written into result. The length of the full decomposition (irrespective of result_len) is returned by the function. For canonical decomposition, currently all decompositions are of length at most 4, but this may change in the future (very unlikely though). At any rate, Unicode does guarantee that a buffer of length 18 is always enough for both compatibility and canonical decompositions, so that is the size recommended. This is provided as G_UNICHAR_MAX_DECOMPOSITION_LENGTH. See UAX#15 for details. Since 2.30
unicharGetMirrorChar int unicharGetMirrorChar(gunichar ch, gunichar mirroredCh): In Unicode, some characters are mirrored. This means that their images are mirrored horizontally in text that is laid out from right to left. For instance, "(" would become its mirror image, ")", in right-to-left text. If ch has the Unicode mirrored property and there is another unicode character that typically has a glyph that is the mirror image of ch's glyph and mirrored_ch is set, it puts that character in the address pointed to by mirrored_ch. Otherwise the original character is put. Since 2.4
unicharGetScript GUnicodeScript unicharGetScript(gunichar ch): Looks up the GUnicodeScript for a particular character (as defined by Unicode Standard Annex #24). No check is made for ch being a valid Unicode character; if you pass in invalid character, the result is undefined. This function is equivalent to pango_script_for_unichar() and the two are interchangeable. Since 2.14
unicharIsalnum int unicharIsalnum(gunichar c): Determines whether a character is alphanumeric. Given some UTF-8 text, obtain a character value with g_utf8_get_char().
unicharIsalpha int unicharIsalpha(gunichar c): Determines whether a character is alphabetic (i.e. a letter). Given some UTF-8 text, obtain a character value with g_utf8_get_char().
unicharIscntrl int unicharIscntrl(gunichar c): Determines whether a character is a control character. Given some UTF-8 text, obtain a character value with g_utf8_get_char().
unicharIsdefined int unicharIsdefined(gunichar c): Determines if a given character is assigned in the Unicode standard.
unicharIsdigit int unicharIsdigit(gunichar c): Determines whether a character is numeric (i.e. a digit). This covers ASCII 0-9 and also digits in other languages/scripts. Given some UTF-8 text, obtain a character value with g_utf8_get_char().
unicharIsgraph int unicharIsgraph(gunichar c): Determines whether a character is printable and not a space (returns FALSE for control characters, format characters, and spaces). g_unichar_isprint() is similar, but returns TRUE for spaces. Given some UTF-8 text, obtain a character value with g_utf8_get_char().
unicharIslower int unicharIslower(gunichar c): Determines whether a character is a lowercase letter. Given some UTF-8 text, obtain a character value with g_utf8_get_char().
unicharIsmark int unicharIsmark(gunichar c): Determines whether a character is a mark (non-spacing mark, combining mark, or enclosing mark in Unicode speak). Given some UTF-8 text, obtain a character value with g_utf8_get_char(). Note: in most cases where isalpha characters are allowed, ismark characters should be allowed to as they are essential for writing most European languages as well as many non-Latin scripts. Since 2.14
unicharIsprint int unicharIsprint(gunichar c): Determines whether a character is printable. Unlike g_unichar_isgraph(), returns TRUE for spaces. Given some UTF-8 text, obtain a character value with g_utf8_get_char().
unicharIspunct int unicharIspunct(gunichar c): Determines whether a character is punctuation or a symbol. Given some UTF-8 text, obtain a character value with g_utf8_get_char().
unicharIsspace int unicharIsspace(gunichar c): Determines whether a character is a space, tab, or line separator (newline, carriage return, etc.). Given some UTF-8 text, obtain a character value with g_utf8_get_char(). (Note: don't use this to do word breaking; you have to use Pango or equivalent to get word breaking right, the algorithm is fairly complex.)
unicharIstitle int unicharIstitle(gunichar c): Determines if a character is titlecase. Some characters in Unicode which are composites, such as the DZ digraph have three case variants instead of just two. The titlecase form is used at the beginning of a word where only the first letter is capitalized. The titlecase form of the DZ digraph is U+01F2 LATIN CAPITAL LETTTER D WITH SMALL LETTER Z.
unicharIsupper int unicharIsupper(gunichar c): Determines if a character is uppercase.
unicharIswide int unicharIswide(gunichar c): Determines if a character is typically rendered in a double-width cell.
unicharIswideCjk int unicharIswideCjk(gunichar c): Determines if a character is typically rendered in a double-width cell under legacy East Asian locales. If a character is wide according to g_unichar_iswide(), then it is also reported wide with this function, but the converse is not necessarily true. See the Unicode Standard Annex #11 for details. If a character passes the g_unichar_iswide() test then it will also pass this test, but not the other way around. Note that some characters may pas both this test and g_unichar_iszerowidth(). Since 2.12
unicharIsxdigit int unicharIsxdigit(gunichar c): Determines if a character is a hexidecimal digit.
unicharIszerowidth int unicharIszerowidth(gunichar c): Determines if a given character typically takes zero width when rendered. The return value is TRUE for all non-spacing and enclosing marks (e.g., combining accents), format characters, zero-width space, but not U+00AD SOFT HYPHEN. A typical use of this function is with one of g_unichar_iswide() or g_unichar_iswide_cjk() to determine the number of cells a string occupies when displayed on a grid display (terminals). However, note that not all terminals support zero-width rendering of zero-width marks. Since 2.14
unicharToUtf8 int unicharToUtf8(gunichar c, string outbuf): Converts a single character to UTF-8.
unicharTolower gunichar unicharTolower(gunichar c): Converts a character to lower case.
unicharTotitle gunichar unicharTotitle(gunichar c): Converts a character to the titlecase.
unicharToupper gunichar unicharToupper(gunichar c): Converts a character to uppercase.
unicharType GUnicodeType unicharType(gunichar c): Classifies a Unicode character by type.
unicharValidate int unicharValidate(gunichar ch): Checks whether ch is a valid Unicode character. Some possible integer values of ch will not be valid. 0 is considered a valid character, though it's normally a string terminator.
unicharXdigitValue int unicharXdigitValue(gunichar c): Determines the numeric value of a character as a hexidecimal digit.
unicodeCanonicalDecomposition gunichar[] unicodeCanonicalDecomposition(gunichar ch): Warning g_unicode_canonical_decomposition has been deprecated since version 2.30 and should not be used in newly-written code. Use the more flexible g_unichar_fully_decompose() instead. Computes the canonical decomposition of a Unicode character.
unicodeCanonicalOrdering void unicodeCanonicalOrdering(gunichar[] string): Computes the canonical ordering of a string in-place. This rearranges decomposed characters in the string according to their combining classes. See the Unicode manual for more information.
unicodeScriptFromIso15924 GUnicodeScript unicodeScriptFromIso15924(uint iso15924): Looks up the Unicode script for iso15924. ISO 15924 assigns four-letter codes to scripts. For example, the code for Arabic is 'Arab'. This function accepts four letter codes encoded as a guint32 in a big-endian fashion. That is, the code expected for Arabic is 0x41726162 (0x41 is ASCII code for 'A', 0x72 is ASCII code for 'r', etc). See Codes for the representation of names of scripts for details. Since 2.30
unicodeScriptToIso15924 uint unicodeScriptToIso15924(GUnicodeScript script): Looks up the ISO 15924 code for script. ISO 15924 assigns four-letter codes to scripts. For example, the code for Arabic is 'Arab'. The four letter codes are encoded as a guint32 by this function in a big-endian fashion. That is, the code returned for Arabic is 0x41726162 (0x41 is ASCII code for 'A', 0x72 is ASCII code for 'r', etc). See Codes for the representation of names of scripts for details. Since 2.30
utf16_ToUcs4 gunichar[] utf16_ToUcs4(gunichar2[] str, glong itemsRead): Convert a string from UTF-16 to UCS-4. The result will be nul-terminated.
utf16_ToUtf8 string utf16_ToUtf8(gunichar2[] str, glong itemsRead): Convert a string from UTF-16 to UTF-8. The result will be terminated with a 0 byte. Note that the input is expected to be already in native endianness, an initial byte-order-mark character is not handled specially. g_convert() can be used to convert a byte buffer of UTF-16 data of ambiguous endianess. Further note that this function does not validate the result string; it may e.g. include embedded NUL characters. The only validation done by this function is to ensure that the input can be correctly interpreted as UTF-16, i.e. it doesn't contain things unpaired surrogates.
utf8_Casefold string utf8_Casefold(string str): Converts a string into a form that is independent of case. The result will not correspond to any particular case, but can be compared for equality or ordered with the results of calling g_utf8_casefold() on other strings. Note that calling g_utf8_casefold() followed by g_utf8_collate() is only an approximation to the correct linguistic case insensitive ordering, though it is a fairly good one. Getting this exactly right would require a more sophisticated collation function that takes case sensitivity into account. GLib does not currently provide such a function.
utf8_Collate int utf8_Collate(string str1, string str2): Compares two strings for ordering using the linguistically correct rules for the current locale. When sorting a large number of strings, it will be significantly faster to obtain collation keys with g_utf8_collate_key() and compare the keys with strcmp() when sorting instead of sorting the original strings.
utf8_CollateKey string utf8_CollateKey(string str): Converts a string into a collation key that can be compared with other collation keys produced by the same function using strcmp(). The results of comparing the collation keys of two strings with strcmp() will always be the same as comparing the two original keys with g_utf8_collate(). Note that this function depends on the current locale.
utf8_CollateKeyForFilename string utf8_CollateKeyForFilename(string str): Converts a string into a collation key that can be compared with other collation keys produced by the same function using strcmp(). In order to sort filenames correctly, this function treats the dot '.' as a special case. Most dictionary orderings seem to consider it insignificant, thus producing the ordering "event.c" "eventgenerator.c" "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we would like to treat numbers intelligently so that "file1" "file10" "file5" is sorted as "file1" "file5" "file10". Note that this function depends on the current locale. Since 2.8
utf8_FindNextChar string utf8_FindNextChar(string p, string end): Finds the start of the next UTF-8 character in the string after p. p does not have to be at the beginning of a UTF-8 character. No check is made to see if the character found is actually valid other than it starts with an appropriate byte.
utf8_FindPrevChar string utf8_FindPrevChar(string str, string p): Given a position p with a UTF-8 encoded string str, find the start of the previous UTF-8 character starting before p. Returns NULL if no UTF-8 characters are present in str before p. p does not have to be at the beginning of a UTF-8 character. No check is made to see if the character found is actually valid other than it starts with an appropriate byte.
utf8_GetChar gunichar utf8_GetChar(string p): Converts a sequence of bytes encoded as UTF-8 to a Unicode character. If p does not point to a valid UTF-8 encoded character, results are undefined. If you are not sure that the bytes are complete valid Unicode characters, you should use g_utf8_get_char_validated() instead.
utf8_GetCharValidated gunichar utf8_GetCharValidated(string p): Convert a sequence of bytes encoded as UTF-8 to a Unicode character. This function checks for incomplete characters, for invalid characters such as characters that are out of the range of Unicode, and for overlong encodings of valid characters.
utf8_Normalize string utf8_Normalize(string str, GNormalizeMode mode): Converts a string into canonical form, standardizing such issues as whether a character with an accent is represented as a base character and combining accent or as a single precomposed character. The string has to be valid UTF-8, otherwise NULL is returned. You should generally call g_utf8_normalize() before comparing two Unicode strings. The normalization mode G_NORMALIZE_DEFAULT only standardizes differences that do not affect the text content, such as the above-mentioned accent representation. G_NORMALIZE_ALL also standardizes the "compatibility" characters in Unicode, such as SUPERSCRIPT THREE to the standard forms (in this case DIGIT THREE). Formatting information may be lost but for most text operations such characters should be considered the same. G_NORMALIZE_DEFAULT_COMPOSE and G_NORMALIZE_ALL_COMPOSE are like G_NORMALIZE_DEFAULT and G_NORMALIZE_ALL, but returned a result with composed forms rather than a maximally decomposed form. This is often useful if you intend to convert the string to a legacy encoding or pass it to a system with less capable Unicode handling.
utf8_OffsetToPointer string utf8_OffsetToPointer(string str, glong offset): Converts from an integer character offset to a pointer to a position within the string. Since 2.10, this function allows to pass a negative offset to step backwards. It is usually worth stepping backwards from the end instead of forwards if offset is in the last fourth of the string, since moving forward is about 3 times faster than moving backward. Note This function doesn't abort when reaching the end of str. Therefore you should be sure that offset is within string boundaries before calling that function. Call g_utf8_strlen() when unsure. This limitation exists as this function is called frequently during text rendering and therefore has to be as fast as possible.
utf8_PointerToOffset glong utf8_PointerToOffset(string str, string pos): Converts from a pointer to position within a string to a integer character offset. Since 2.10, this function allows pos to be before str, and returns a negative offset in this case.
utf8_PrevChar string utf8_PrevChar(string p): Finds the previous UTF-8 character in the string before p. p does not have to be at the beginning of a UTF-8 character. No check is made to see if the character found is actually valid other than it starts with an appropriate byte. If p might be the first character of the string, you must use g_utf8_find_prev_char() instead.
utf8_Strchr string utf8_Strchr(string p, gssize len, gunichar c): Finds the leftmost occurrence of the given Unicode character in a UTF-8 encoded string, while limiting the search to len bytes. If len is -1, allow unbounded search.
utf8_Strdown string utf8_Strdown(string str): Converts all Unicode characters in the string that have a case to lowercase. The exact manner that this is done depends on the current locale, and may result in the number of characters in the string changing.
utf8_Strlen glong utf8_Strlen(string p): Computes the length of the string in characters, not including the terminating nul character. If the max'th byte falls in the middle of a character, the last (partial) character is not counted.
utf8_Strncpy string utf8_Strncpy(string dest, string src, gsize n): Like the standard C strncpy() function, but copies a given number of characters instead of a given number of bytes. The src string must be valid UTF-8 encoded text. (Use g_utf8_validate() on all text before trying to use UTF-8 utility functions with it.)
utf8_Strrchr string utf8_Strrchr(string p, gssize len, gunichar c): Find the rightmost occurrence of the given Unicode character in a UTF-8 encoded string, while limiting the search to len bytes. If len is -1, allow unbounded search.
utf8_Strreverse string utf8_Strreverse(string str): Reverses a UTF-8 string. str must be valid UTF-8 encoded text. (Use g_utf8_validate() on all text before trying to use UTF-8 utility functions with it.) This function is intended for programmatic uses of reversed strings. It pays no attention to decomposed characters, combining marks, byte order marks, directional indicators (LRM, LRO, etc) and similar characters which might need special handling when reversing a string for display purposes. Note that unlike g_strreverse(), this function returns newly-allocated memory, which should be freed with g_free() when no longer needed. Since 2.2
utf8_Strup string utf8_Strup(string str): Converts all Unicode characters in the string that have a case to uppercase. The exact manner that this is done depends on the current locale, and may result in the number of characters in the string increasing. (For instance, the German ess-zet will be changed to SS.)
utf8_Substring string utf8_Substring(string str, glong startPos, glong endPos): Copies a substring out of a UTF-8 encoded string. The substring will contain end_pos - start_pos characters. Since 2.30
utf8_ToUcs4 gunichar[] utf8_ToUcs4(string str, glong itemsRead): Convert a string from UTF-8 to a 32-bit fixed width representation as UCS-4. A trailing 0 character will be added to the string after the converted text.
utf8_ToUcs4_Fast gunichar[] utf8_ToUcs4_Fast(string str): Convert a string from UTF-8 to a 32-bit fixed width representation as UCS-4, assuming valid UTF-8 input. This function is roughly twice as fast as g_utf8_to_ucs4() but does no error checking on the input. A trailing 0 character will be added to the string after the converted text.
utf8_ToUtf16 gunichar2[] utf8_ToUtf16(string str, glong itemsRead): Convert a string from UTF-8 to UTF-16. A 0 character will be added to the result after the converted text.
utf8_Validate int utf8_Validate(string str, string end): Validates UTF-8 encoded text. str is the text to validate; if str is nul-terminated, then max_len can be -1, otherwise max_len should be the number of bytes to validate. If end is non-NULL, then the end of the valid range will be stored there (i.e. the start of the first invalid character if some bytes were invalid, or the end of the text being validated otherwise). Note that g_utf8_validate() returns FALSE if max_len is positive and any of the max_len bytes are NUL. Returns TRUE if all of str was valid. Many GLib and GTK+ routines require valid UTF-8 as input; so data read from a file or the network should be checked with g_utf8_validate() before doing anything else with it.

Meta

Source

See Implementation

glib Unicode classes

Unicode