/**********************************************************************
 *
 * unitool.h -- Unicode support.
 * Copyright (C) 1999  The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/

#ifndef UNITOOL_H
#define UNITOOL_H

/* This module is based on Unicode 2.1 */


/* parse_utf8_char parses the next UTF-8 character starting at 'here'
 * and places its Unicode equivalent in *value. The length of the
 * UTF-8 character is returned.
 *
 * 'end' is the address of the last character of the input; as worded
 * this is an inclusive bound, not one-past-the-end.
 * NOTE(review): confirm the inclusive bound against the implementation.
 *
 * The result is stored in an unsigned short, so it is limited to the
 * values that type can hold (typically 16 bits).
 * NOTE(review): the behaviour and return value for malformed or
 * truncated input are not specified here -- check the implementation. */
int parse_utf8_char (const unsigned char *here,
		     const unsigned char *end,
		     unsigned short *value);

/* output_utf8_char encodes the Unicode character 'value' as a UTF-8
 * character, writing the encoding at 'here'.
 *
 * The length of the encoding is returned. If the output string was
 * not long enough to hold the encoded character, 0 is returned.
 *
 * 'end' is the address of the last character of the output string; as
 * worded this is an inclusive bound, not one-past-the-end.
 * NOTE(review): confirm the inclusive bound against the implementation. */
int output_utf8_char (unsigned short value,
		      unsigned char *here,
		      unsigned char *end);

/* decompose_str decomposes a Unicode string into its canonical
 * equivalents. The decomposition is written back into the 'input'
 * array, which must be null-terminated on entry.
 *
 * NULL is returned if the input array was not large enough to contain
 * the fully decomposed string; in that case the array is left in a
 * correct, but only partially decomposed, state.
 *
 * NOTE(review): max_output_len is presumably the capacity of 'input'
 * counted in unsigned shorts, and the success return value is
 * presumably 'input' itself -- confirm both against the implementation. */
unsigned short *decompose_str (unsigned short *input,
			       int max_output_len);

/* Tests whether 'value' is a valid Unicode letter.
 * NOTE(review): presumably returns non-zero when it is and 0 otherwise;
 * confirm against the implementation. */
int is_unicode_letter (unsigned short value);

/* Tests whether 'value' is a valid Unicode digit.
 * NOTE(review): presumably returns non-zero when it is and 0 otherwise;
 * confirm against the implementation. */
int is_unicode_digit (unsigned short value);

/* Tests whether 'value' is a valid Unicode letter or a valid Unicode
 * digit.
 * NOTE(review): presumably returns non-zero when it is and 0 otherwise;
 * confirm against the implementation. */
int is_unicode_letdig (unsigned short value);

/* Tests whether 'value' is a valid space character.
 *
 * The test covers both "C" spaces and "Unicode" spaces, i.e.:
 *   - form-feed, newline, carriage return, horizontal tab and
 *     vertical tab;
 *   - characters in the Zs, Zl and Zp Unicode categories.
 *
 * NOTE(review): presumably returns non-zero when it is and 0 otherwise;
 * confirm against the implementation. */
int is_unicode_space (unsigned short value);

/* Returns the length of the Unicode string 'str'.
 * NOTE(review): presumably 'str' is null-terminated and the length is
 * counted in unsigned shorts, excluding the terminator -- confirm
 * against the implementation. */
int unicode_strlen (const unsigned short *str);

/* Returns the length of the Unicode string 'str', up to a maximum of
 * 'max_length'.
 * NOTE(review): presumably no more than max_length entries of 'str'
 * are examined, as with strnlen -- confirm against the implementation. */
int unicode_strnlen (const unsigned short *str, int max_length);

/* Returns the upper-case equivalent of the Unicode character 'value'.
 * NOTE(review): the result for characters with no upper-case form is
 * not stated here (presumably 'value' unchanged) -- confirm. */
unsigned short unicode_toupper (unsigned short value);

/* Returns the lower-case equivalent of the Unicode character 'value'.
 * NOTE(review): the result for characters with no lower-case form is
 * not stated here (presumably 'value' unchanged) -- confirm. */
unsigned short unicode_tolower (unsigned short value);

/* Returns the simplified Chinese character equivalent of another
 * Chinese character, 'value'.
 * NOTE(review): the result for characters with no simplified
 * equivalent is not stated here (presumably 'value' unchanged) --
 * confirm against the implementation. */
unsigned short unicode_tosimplified (unsigned short value);


/* Converts a UTF-8 word (a string with its length stored in the first
 * byte) to a Unicode array. The output also has its length stored in
 * the first entry.
 *
 * To handle all situations the output buffer should be 256 unsigned
 * shorts long.
 *
 * NOTE(review): max_output_length is presumably the capacity of
 * 'output', and the return value is presumably 'output'; the units of
 * the capacity and the behaviour when it is too small are not stated
 * here -- confirm against the implementation. */
unsigned short *utf8_word_to_unicode (const unsigned char *input,
				      unsigned short *output,
				      int max_output_length);

/* Converts a Unicode word buffer (with the length stored in the first
 * entry) to a UTF-8 encoded word (with the length stored in the first
 * byte).
 *
 * Only 255 bytes (not characters) can be stored in the output.
 *
 * NOTE(review): max_output_length is presumably the capacity of
 * 'output', and the return value is presumably 'output'; the
 * behaviour when the encoded word does not fit is not stated here --
 * confirm against the implementation. */
unsigned char *unicode_to_utf8_word (const unsigned short *input,
				     unsigned char *output,
				     int max_output_length);


#endif