source: gsdl/trunk/trunk/mg/lib/unitool.h@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change commited in r16582

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 3.0 KB
Line 
1#ifndef UNITOOL_H
2#define UNITOOL_H
3
4#ifdef __cplusplus
5extern "C" {
6#endif
7
8
9/* This module is based on Unicode 2.1 */
10
11
12/* parse_utf8_char parses the next utf-8 character, placing its
13 * unicode equivalent in *value. The length of the utf-8 character
14 * is returned. end is the address of the last character. */
15int parse_utf8_char (const unsigned char *here,
16 const unsigned char *end,
17 unsigned short *value);
18
19/* output_utf8_char encodes a unicode character as a UTF-8 character.
20 * The length of the encoding is returned. If the string was not
21 * long enough to encode the character, 0 is returned. end is the
22 * address of the last character. */
23int output_utf8_char (unsigned short value,
24 unsigned char *here,
25 unsigned char *end);
26
27/* decompose_str will decompose a unicode string into its canonical
28 * equivalents. NULL is returned if the input array was not
29 * large enough to contain the fully decomposed string (the array
30 * will be in a correct, but partially decomposed state). The input
31 * must be null-terminated. */
32unsigned short *decompose_str (unsigned short *input,
33 int max_output_len);
34
35/* tests to see whether 'value' is a valid Unicode letter */
36int is_unicode_letter (unsigned short value);
37
38/* tests to see whether 'value' is a valid Unicode digit */
39int is_unicode_digit (unsigned short value);
40
41/* tests to see whether 'value' is a valid Unicode letter or
42 * digit */
43int is_unicode_letdig (unsigned short value);
44
45/* tests to see whether 'value' is a valid space.
46 * The test includes both "C" spaces and "Unicode" spaces, i.e.
47 * form-feed, newline, carriage return, horizontal tab,
48 * vertical tab, and the Zs, Zl, and Zp Unicode categorizations */
49int is_unicode_space (unsigned short value);
50
51/* returns the length of the unicode string */
52int unicode_strlen (const unsigned short *str);
53
54/* returns the length of the unicode string, up to a maximum */
55int unicode_strnlen (const unsigned short *str, int max_length);
56
57/* returns the upper-case equivalent of value */
58unsigned short unicode_toupper (unsigned short value);
59
60/* returns the lower-case equivalent of value */
61unsigned short unicode_tolower (unsigned short value);
62
63/* returns the simplified Chinese character equivalent of
64 * another Chinese character */
65unsigned short unicode_tosimplified (unsigned short value);
66
67
68/* converts a utf-8 word (string with length stored in the first byte)
69 * to a Unicode array. To handle all situations the output buffer should
70 * be 256 unsigned shorts long. The output will also have the length as
71 * the first entry. */
72unsigned short *utf8_word_to_unicode (const unsigned char *input,
73 unsigned short *output,
74 int max_output_length);
75
76/* converts a unicode word buffer (with the length stored in the
77 * first entry) to a utf8 encoded word output (with the length stored in
78 * the first byte). Only 255 bytes (not characters) can be stored
79 * in the output. */
80unsigned char *unicode_to_utf8_word (const unsigned short *input,
81 unsigned char *output,
82 int max_output_length);
83
84#ifdef __cplusplus
85 }
86#endif
87
88#endif
Note: See TracBrowser for help on using the repository browser.