source: gsdl/trunk/trunk/mgpp/lib/unitool.h@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change committed in r16582

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 3.0 KB
Line 
1#ifndef UNITOOL_H
2#define UNITOOL_H
3
4/* This module is based on Unicode 2.1 */
5
6
7/* parse_utf8_char parses the next utf-8 character, placing its
8 * unicode equivalent in *value. The length of the utf-8 character
9 * is returned. end is the address of the last character. */
10int parse_utf8_char (const unsigned char *here,
11 const unsigned char *end,
12 unsigned short *value);
13
14/* output_utf8_char encodes a unicode character as a UTF-8 character.
15 * The length of the encoding is returned. If the string was not
16 * long enough to encode the character, 0 is returned. end is the
17 * address of the last character. */
18int output_utf8_char (unsigned short value,
19 unsigned char *here,
20 unsigned char *end);
21
22/* decompose_str will decompose a unicode string into its canonical
23 * equivalents. NULL is returned if the input array was not
24 * large enough to contain the fully decomposed string (the array
25 * will be in a correct, but partially decomposed state). The input
26 * must be null-terminated. */
27unsigned short *decompose_str (unsigned short *input,
28 int max_output_len);
29
30/* tests to see whether 'value' is a valid Unicode letter */
31int is_unicode_letter (unsigned short value);
32
33/* tests to see whether 'value' is a valid Unicode digit */
34int is_unicode_digit (unsigned short value);
35
36/* tests to see whether 'value' is a valid Unicode letter or
37 * digit */
38int is_unicode_letdig (unsigned short value);
39
40/* tests to see whether 'value' is a valid space.
41 * The test includes both "C" spaces and "Unicode" spaces, i.e.
42 * form-feed, newline, carriage return, horizontal tab,
43 * vertical tab, and the Zs, Zl, and Zp Unicode categorizations */
44int is_unicode_space (unsigned short value);
45
46/* returns the length of the unicode string */
47int unicode_strlen (const unsigned short *str);
48
49/* returns the length of the unicode string, up to a maximum */
50int unicode_strnlen (const unsigned short *str, int max_length);
51
52/* returns the upper-case equivalent of value */
53unsigned short unicode_toupper (unsigned short value);
54
55/* returns the lower-case equivalent of value */
56unsigned short unicode_tolower (unsigned short value);
57
58/* returns the simplified Chinese character equivalent of
59 * another Chinese character */
60unsigned short unicode_tosimplified (unsigned short value);
61
62
63/* converts a utf-8 word (string with length stored in the first byte)
64 * to a Unicode array. To handle all situations the output buffer should
65 * be 256 unsigned shorts long. The output will also have the length as
66 * the first entry. */
67unsigned short *utf8_word_to_unicode (const unsigned char *input,
68 unsigned short *output,
69 int max_output_length);
70
71/* converts a unicode word buffer (with the length stored in the
72 * first entry) to a utf8 encoded word output (with the length stored in
73 * the first byte). Only 255 bytes (not characters) can be stored
74 * in the output. */
75unsigned char *unicode_to_utf8_word (const unsigned short *input,
76 unsigned char *output,
77 int max_output_length);
78
79
80#endif
Note: See TracBrowser for help on using the repository browser.