Ignore:
Timestamp:
2012-09-11T11:34:31+12:00 (9 years ago)
Author:
kjdon
Message:

Added some comments to make clear whether max_output_length includes the space for the length stored in position zero (it does not), and fixed an off-by-one error related to the same thing.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/common-src/indexers/mgpp/lib/unitool.cpp

    r26128 r26163  
    11#include "unitool.h"
    22#include "sysfuncs.h"
    3 
    43
    54/* unicode_cat_data is a combination of the letter, digit, and
     
    13001299 * unicode equivalent in *value. The length of the utf-8 character
    13011300 * is returned */
     1301/* NOTE:!!!! end points to the last valid char, NOT past the end
     1302 */
    13021303int parse_utf8_char(const unsigned char *here,
    13031304            const unsigned char *end,
     
    13451346 * The length of the encoding is returned. If the string was not
    13461347 * long enough to encode the character 0 is returned. */
     1348/* NOTE:!!!! end points to the last valid char, NOT past the end
     1349 */
    13471350int output_utf8_char (unsigned short value,
    13481351              unsigned char *here,
    13491352              unsigned char *end) {
    13501353  int len = (int)(end - here + 1); /* end is the last character */
     1354
    13511355  if (value < 0x80) {
    13521356    /* will be encoded in one byte */
     
    16981702
    16991703
    1700 /* converts a utf-8 word (string with length stored in the first byte
     1704/* converts a utf-8 word (string with length stored in the first byte)
    17011705 * to a Unicode array. To handle all situations the output buffer should
    17021706 * be 256 unsigned shorts long. The output will also have the length as
    1703  * the first entry. */
     1707 * the first entry.
     1708 * NOTE! max_output_length is the maximum length of the output string, ie
     1709 * one less than the length of the output array, ie should be <=255 if output
     1710 * buffer is 256 long
     1711 */
    17041712unsigned short *utf8_word_to_unicode (const unsigned char *input,
    17051713                      unsigned short *output,
     
    17071715  int inlen = *input;
    17081716  const unsigned char *inhere = input+1;
    1709   const unsigned char *inend = &inhere[inlen-1];
     1717  const unsigned char *inend = &inhere[inlen-1]; // points to the last valid char, not past the end.
    17101718  unsigned short c;
    17111719  int clen;
     
    17141722  unsigned short *outhere = output+1;
    17151723 
    1716   while (inhere <= inend && outlen < max_output_length-1) {
     1724  while (inhere <= inend && outlen < max_output_length) {
    17171725    /* decode the character */
    17181726    clen = parse_utf8_char (inhere, inend, &c);
     
    17351743 * entry) to a utf8 encoded word output (with the length stored in
    17361744 * the first byte. Only 255 bytes (not characters) can be stored
    1737  * in the output. */
     1745 * in the output.
     1746 * Note:!! max_output_length is the maximum length of the output string, ie
     1747 * one less than the length of the output array, ie should be <=255 if output
     1748 * buffer is 256 long
     1749*/
    17381750unsigned char *unicode_to_utf8_word (const unsigned short *input,
    17391751                     unsigned char *output,
     
    17461758  int outlen = 0;
    17471759  unsigned char *outhere = output+1;
    1748   unsigned char *outend = output+max_output_length;
     1760  unsigned char *outend = output+max_output_length; // points to last valid char, not past the end
    17491761
    17501762  while (in_i < inlen) {
Note: See TracChangeset for help on using the changeset viewer.