Changeset 26163
- Timestamp: 2012-09-11T11:34:31+12:00 (12 years ago)
- Files: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/common-src/indexers/mgpp/lib/unitool.cpp
r26128 r26163 1 1 #include "unitool.h" 2 2 #include "sysfuncs.h" 3 4 3 5 4 /* unicode_cat_data is a combination of the letter, digit, and … … 1300 1299 * unicode equivalent in *value. The length of the utf-8 character 1301 1300 * is returned */ 1301 /* NOTE:!!!! end points to the last valid char, NOT past the end 1302 */ 1302 1303 int parse_utf8_char(const unsigned char *here, 1303 1304 const unsigned char *end, … … 1345 1346 * The length of the encoding is returned. If the string was not 1346 1347 * long enough to encode the character 0 is returned. */ 1348 /* NOTE:!!!! end points to the last valid char, NOT past the end 1349 */ 1347 1350 int output_utf8_char (unsigned short value, 1348 1351 unsigned char *here, 1349 1352 unsigned char *end) { 1350 1353 int len = (int)(end - here + 1); /* end is the last character */ 1354 1351 1355 if (value < 0x80) { 1352 1356 /* will be encoded in one byte */ … … 1698 1702 1699 1703 1700 /* converts a utf-8 word (string with length stored in the first byte 1704 /* converts a utf-8 word (string with length stored in the first byte) 1701 1705 * to a Unicode array. To handle all situations the output buffer should 1702 1706 * be 256 unsigned shorts long. The output will also have the length as 1703 * the first entry. */ 1707 * the first entry. 1708 * NOTE! max_output_length is the maximum length of the output string, ie 1709 * one less than the length of the output array, ie should be <=255 if output 1710 * buffer is 256 long 1711 */ 1704 1712 unsigned short *utf8_word_to_unicode (const unsigned char *input, 1705 1713 unsigned short *output, … … 1707 1715 int inlen = *input; 1708 1716 const unsigned char *inhere = input+1; 1709 const unsigned char *inend = &inhere[inlen-1]; 1717 const unsigned char *inend = &inhere[inlen-1]; // points to the last valid char, not past the end. 
1710 1718 unsigned short c; 1711 1719 int clen; … … 1714 1722 unsigned short *outhere = output+1; 1715 1723 1716 while (inhere <= inend && outlen < max_output_length -1) {1724 while (inhere <= inend && outlen < max_output_length) { 1717 1725 /* decode the character */ 1718 1726 clen = parse_utf8_char (inhere, inend, &c); … … 1735 1743 * entry) to a utf8 encoded word output (with the length stored in 1736 1744 * the first byte. Only 255 bytes (not characters) can be stored 1737 * in the output. */ 1745 * in the output. 1746 * Note:!! max_output_length is the maximum length of the output string, ie 1747 * one less than the length of the output array, ie should be <=255 if output 1748 * buffer is 256 long 1749 */ 1738 1750 unsigned char *unicode_to_utf8_word (const unsigned short *input, 1739 1751 unsigned char *output, … … 1746 1758 int outlen = 0; 1747 1759 unsigned char *outhere = output+1; 1748 unsigned char *outend = output+max_output_length; 1760 unsigned char *outend = output+max_output_length; // points to last valid char, not past the end 1749 1761 1750 1762 while (in_i < inlen) {
Note: See TracChangeset for help on using the changeset viewer.