Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

unitool.h@ 21399

Last change on this file since 21399 was 21325, checked in by ak19, 14 years ago
Changes to makefiles, configure files, and source code to work with the new configure flags that allow indexers to be individually compiled up by setting each indexer to be enabled or disabled (enable-mg, enable-mgpp, enable-lucene).
Property svn:executable set to *
File size: 4.0 KB

Rev	Line
[21325]	1	/**********************************************************************
	2	*
	3	* unitool.h -- Unicode support.
	4	* Copyright (C) 1999 The New Zealand Digital Library Project
	5	*
	6	* A component of the Greenstone digital library software
	7	* from the New Zealand Digital Library Project at the
	8	* University of Waikato, New Zealand.
	9	*
	10	* This program is free software; you can redistribute it and/or modify
	11	* it under the terms of the GNU General Public License as published by
	12	* the Free Software Foundation; either version 2 of the License, or
	13	* (at your option) any later version.
	14	*
	15	* This program is distributed in the hope that it will be useful,
	16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	* GNU General Public License for more details.
	19	*
	20	* You should have received a copy of the GNU General Public License
	21	* along with this program; if not, write to the Free Software
	22	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	*
	24	*********************************************************************/
	25
	26	#ifndef UNITOOL_H
	27	#define UNITOOL_H
	28
	29	/* This module is based on Unicode 2.1 */
	30
	31
	32	/* parse_utf8_char parses the next UTF-8 character, starting at 'here',
	33	* and places its Unicode equivalent in *value.
	34	*
		* here  - first byte of the UTF-8 character to parse (not modified).
		* end   - address of the last character of the input, i.e. an
		*         inclusive bound rather than one past the end.
		* value - receives the Unicode equivalent. NOTE(review): an unsigned
		*         short is typically 16 bits, so code points above U+FFFF
		*         presumably cannot be represented -- confirm.
		*
		* Returns the length of the UTF-8 character (presumably in bytes).
		* NOTE(review): the result for a malformed or truncated sequence is
		* not stated here -- check the implementation before relying on it. */
	35	int parse_utf8_char (const unsigned char *here,
	36	const unsigned char *end,
	37	unsigned short *value);
	38
	39	/* output_utf8_char encodes the Unicode character 'value' as a UTF-8
	40	* character.
	41	*
	42	* value - Unicode character to encode.
		* here  - output position; presumably the encoding is written
		*         starting here -- confirm against the implementation.
		* end   - address of the last character of the output string (an
		*         inclusive bound).
		*
		* Returns the length of the encoding, or 0 if the string was not
		* long enough to encode the character. */
	43	int output_utf8_char (unsigned short value,
	44	unsigned char *here,
	45	unsigned char *end);
	46
	/* decompose_str decomposes a Unicode string, in the array it is passed
	 * in, into its canonical equivalents.
	 *
	 * input          - null-terminated array of Unicode characters; it is
	 *                  both the source and the destination.
	 * max_output_len - NOTE(review): presumably the capacity of 'input',
	 *                  counted in unsigned shorts -- confirm.
	 *
	 * Returns NULL if the array was not large enough to contain the fully
	 * decomposed string; the array is then left in a correct, but only
	 * partially decomposed, state. NOTE(review): the non-NULL value
	 * returned on success is presumably 'input' -- confirm.
	 *
	 * Both the return type and 'input' have to be pointers: a by-value
	 * unsigned short can neither be NULL nor name a null-terminated array. */
	unsigned short *decompose_str (unsigned short *input,
	                               int max_output_len);
	54
	55	/* Tests whether 'value' is a valid Unicode letter.
		* NOTE(review): the int result is presumably non-zero for a letter
		* and 0 otherwise -- confirm against the implementation. */
	56	int is_unicode_letter (unsigned short value);
	57
	58	/* Tests whether 'value' is a valid Unicode digit.
		* NOTE(review): the int result is presumably non-zero for a digit
		* and 0 otherwise -- confirm against the implementation. */
	59	int is_unicode_digit (unsigned short value);
	60
	61	/* Tests whether 'value' is a valid Unicode letter or digit.
	62	* NOTE(review): the int result is presumably non-zero when the test
		* succeeds and 0 otherwise -- confirm against the implementation. */
	63	int is_unicode_letdig (unsigned short value);
	64
	65	/* Tests whether 'value' is a valid space.
	66	* The test covers both "C" spaces and "Unicode" spaces, i.e.
	67	* form-feed, newline, carriage return, horizontal tab and
	68	* vertical tab, plus the Zs, Zl and Zp Unicode categorizations.
		* NOTE(review): the int result is presumably non-zero for a space
		* and 0 otherwise -- confirm against the implementation. */
	69	int is_unicode_space (unsigned short value);
	70
	71	/* Returns the length of the Unicode string 'str'.
		* NOTE(review): presumably counted in unsigned short elements up to,
		* and not including, a terminating 0 -- confirm. */
	72	int unicode_strlen (const unsigned short *str);
	73
	74	/* Returns the length of the Unicode string 'str', up to a maximum of
		* 'max_length'.
		* NOTE(review): presumably no more than max_length elements of 'str'
		* are examined -- confirm against the implementation. */
	75	int unicode_strnlen (const unsigned short *str, int max_length);
	76
	77	/* Returns the upper-case equivalent of 'value'.
		* NOTE(review): what is returned for a character that has no
		* upper-case form is not stated here -- presumably 'value' itself. */
	78	unsigned short unicode_toupper (unsigned short value);
	79
	80	/* Returns the lower-case equivalent of 'value'.
		* NOTE(review): what is returned for a character that has no
		* lower-case form is not stated here -- presumably 'value' itself. */
	81	unsigned short unicode_tolower (unsigned short value);
	82
	83	/* Returns the simplified Chinese character equivalent of the Chinese
	84	* character 'value'.
		* NOTE(review): what is returned for a character that has no
		* simplified form is not stated here -- presumably 'value' itself. */
	85	unsigned short unicode_tosimplified (unsigned short value);
	86
	87
	/* utf8_word_to_unicode converts a UTF-8 word (a string with its length
	 * stored in the first byte) to a Unicode array. The output also has its
	 * length as the first entry.
	 *
	 * input             - UTF-8 word to convert (not modified).
	 * output            - array that receives the Unicode word. To handle
	 *                     all situations it should be 256 unsigned shorts
	 *                     long.
	 * max_output_length - NOTE(review): presumably the capacity of
	 *                     'output', counted in unsigned shorts -- confirm.
	 *
	 * NOTE(review): the returned pointer is presumably 'output' -- confirm
	 * against the implementation.
	 *
	 * 'input' and the return value have to be pointers: the word is a
	 * length-prefixed string, which a by-value unsigned char cannot hold. */
	unsigned short *utf8_word_to_unicode (const unsigned char *input,
	                                      unsigned short *output,
	                                      int max_output_length);
	95
	/* unicode_to_utf8_word converts a Unicode word buffer (with its length
	 * stored in the first entry) to a UTF-8 encoded word (with its length
	 * stored in the first byte).
	 *
	 * input             - Unicode word to convert (not modified).
	 * output            - buffer that receives the UTF-8 word. Only 255
	 *                     bytes (not characters) can be stored in it.
	 * max_output_length - NOTE(review): presumably the capacity of
	 *                     'output', counted in bytes -- confirm.
	 *
	 * NOTE(review): the returned pointer is presumably 'output' -- confirm
	 * against the implementation.
	 *
	 * 'input' and the return value have to be pointers: the word is a
	 * length-prefixed buffer, which a by-value unsigned short cannot hold. */
	unsigned char *unicode_to_utf8_word (const unsigned short *input,
	                                     unsigned char *output,
	                                     int max_output_length);
	103
	104
	105	#endif

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/common-src/src/lib/unitool.h@ 21399

Download in other formats: