source: main/trunk/greenstone2/common-src/src/lib/unitool.h@ 21405

Last change on this file since 21405 was 21325, checked in by ak19, 14 years ago

Changes to makefiles, configure files, and source code to work with the new configure flags that allow indexers to be individually compiled up by setting each indexer to be enabled or disabled (enable-mg, enable-mgpp, enable-lucene).

  • Property svn:executable set to *
File size: 4.0 KB
Line 
1/**********************************************************************
2 *
3 * unitool.h -- Unicode support.
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * A component of the Greenstone digital library software
7 * from the New Zealand Digital Library Project at the
8 * University of Waikato, New Zealand.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 *********************************************************************/
25
26#ifndef UNITOOL_H
27#define UNITOOL_H
28
29/* This module is based on Unicode 2.1 */
30
31
32/* parse_utf8_char parses the next utf-8 character, placing its
33 * unicode equivalent in *value. The length of the utf-8 character
34 * is returned. end is the address of the last character. */
35int parse_utf8_char (const unsigned char *here,
36 const unsigned char *end,
37 unsigned short *value);
38
39/* output_utf8_char encodes a unicode character as a UTF-8 character.
40 * The length of the encoding is returned. If the string was not
41 * long enough to encode the character, 0 is returned. end is the
42 * address of the last character. */
43int output_utf8_char (unsigned short value,
44 unsigned char *here,
45 unsigned char *end);
46
47/* decompose_str will decompose a unicode string into its canonical
48 * equivalents. NULL is returned if the input array was not
49 * large enough to contain the fully decomposed string (the array
50 * will be in a correct, but partially decomposed state). The input
51 * must be null-terminated. */
52unsigned short *decompose_str (unsigned short *input,
53 int max_output_len);
54
55/* tests to see whether 'value' is a valid Unicode letter */
56int is_unicode_letter (unsigned short value);
57
58/* tests to see whether 'value' is a valid Unicode digit */
59int is_unicode_digit (unsigned short value);
60
61/* tests to see whether 'value' is a valid Unicode letter or
62 * digit */
63int is_unicode_letdig (unsigned short value);
64
65/* tests to see whether 'value' is a valid space.
66 * The test includes both "C" spaces and "Unicode" spaces, i.e.
67 * form-feed, newline, carriage return, horizontal tab,
68 * vertical tab, and the Zs, Zl, and Zp Unicode categorizations */
69int is_unicode_space (unsigned short value);
70
71/* returns the length of the unicode string */
72int unicode_strlen (const unsigned short *str);
73
74/* returns the length of the unicode string, up to a maximum */
75int unicode_strnlen (const unsigned short *str, int max_length);
76
77/* returns the upper-case equivalent of value */
78unsigned short unicode_toupper (unsigned short value);
79
80/* returns the lower-case equivalent of value */
81unsigned short unicode_tolower (unsigned short value);
82
83/* returns the simplified Chinese character equivalent of
84 * another Chinese character */
85unsigned short unicode_tosimplified (unsigned short value);
86
87
88/* converts a utf-8 word (string with length stored in the first byte)
89 * to a Unicode array. To handle all situations the output buffer should
90 * be 256 unsigned shorts long. The output will also have the length as
91 * the first entry. */
92unsigned short *utf8_word_to_unicode (const unsigned char *input,
93 unsigned short *output,
94 int max_output_length);
95
96/* converts a unicode word buffer (with the length stored in the
97 * first entry) to a utf8 encoded word output (with the length stored in
98 * the first byte). Only 255 bytes (not characters) can be stored
99 * in the output. */
100unsigned char *unicode_to_utf8_word (const unsigned short *input,
101 unsigned char *output,
102 int max_output_length);
103
104
105#endif
Note: See TracBrowser for help on using the repository browser.