Changeset 26163

Show
Ignore:
Timestamp:
11.09.2012 11:34:31 (8 years ago)
Author:
kjdon
Message:

Added some comments to make clear whether max_output_length includes the space for the length stored in position zero (it does not). Also fixed an off-by-one error related to the same issue.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/common-src/indexers/mgpp/lib/unitool.cpp

    r26128 r26163  
    11#include "unitool.h" 
    22#include "sysfuncs.h" 
    3  
    43 
    54/* unicode_cat_data is a combination of the letter, digit, and 
     
    13001299 * unicode equivalent in *value. The length of the utf-8 character 
    13011300 * is returned */ 
     1301/* NOTE:!!!! end points to the last valid char, NOT past the end 
     1302 */ 
    13021303int parse_utf8_char(const unsigned char *here, 
    13031304            const unsigned char *end, 
     
    13451346 * The length of the encoding is returned. If the string was not 
    13461347 * long enough to encode the character 0 is returned. */ 
     1348/* NOTE:!!!! end points to the last valid char, NOT past the end 
     1349 */ 
    13471350int output_utf8_char (unsigned short value, 
    13481351              unsigned char *here, 
    13491352              unsigned char *end) { 
    13501353  int len = (int)(end - here + 1); /* end is the last character */ 
     1354 
    13511355  if (value < 0x80) { 
    13521356    /* will be encoded in one byte */ 
     
    16981702 
    16991703 
    1700 /* converts a utf-8 word (string with length stored in the first byte 
     1704/* converts a utf-8 word (string with length stored in the first byte) 
    17011705 * to a Unicode array. To handle all situations the output buffer should 
    17021706 * be 256 unsigned shorts long. The output will also have the length as 
    1703  * the first entry. */ 
     1707 * the first entry.  
     1708 * NOTE! max_output_length is the maximum length of the output string, ie  
     1709 * one less than the length of the output array, ie should be <=255 if output 
     1710 * buffer is 256 long 
     1711 */ 
    17041712unsigned short *utf8_word_to_unicode (const unsigned char *input, 
    17051713                      unsigned short *output, 
     
    17071715  int inlen = *input; 
    17081716  const unsigned char *inhere = input+1; 
    1709   const unsigned char *inend = &inhere[inlen-1]; 
     1717  const unsigned char *inend = &inhere[inlen-1]; // points to the last valid char, not past the end. 
    17101718  unsigned short c; 
    17111719  int clen; 
     
    17141722  unsigned short *outhere = output+1; 
    17151723   
    1716   while (inhere <= inend && outlen < max_output_length-1) { 
     1724  while (inhere <= inend && outlen < max_output_length) { 
    17171725    /* decode the character */ 
    17181726    clen = parse_utf8_char (inhere, inend, &c); 
     
    17351743 * entry) to a utf8 encoded word output (with the length stored in 
    17361744 * the first byte. Only 255 bytes (not characters) can be stored  
    1737  * in the output. */ 
     1745 * in the output.  
     1746 * Note:!! max_output_length is the maximum length of the output string, ie  
     1747 * one less than the length of the output array, ie should be <=255 if output 
     1748 * buffer is 256 long 
     1749*/ 
    17381750unsigned char *unicode_to_utf8_word (const unsigned short *input, 
    17391751                     unsigned char *output, 
     
    17461758  int outlen = 0; 
    17471759  unsigned char *outhere = output+1; 
    1748   unsigned char *outend = output+max_output_length; 
     1760  unsigned char *outend = output+max_output_length; // points to last valid char, not past the end 
    17491761 
    17501762  while (in_i < inlen) {