Changeset 2693 for trunk/gsdl/src


Ignore:
Timestamp:
2001-08-10T12:20:23+12:00 (23 years ago)
Author:
kjm18
Message:

Changed this to parse terms in the same way that the indexer does. Terms with
more than four digits are broken up, and terms that start with digits
but have letters in them are now treated as terms rather than as integers.
E.g. 12345678 is split into 1234 5678 rather than staying as 12345678, and
100F stays as 100F rather than being split into 100 F.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/mgpp/text/GSDLQueryLex.cpp

    r2468 r2693  
    5757
    5858  // read in number part
     59  int numeric=0;
    5960  el.num = 0;
    6061  el.lexType = IntegerE;
    61   while (c >= '0' && c <= '9') {
     62  /* stop integers at 4 digits */
     63  while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
    6264    el.num = el.num*10 + c - '0';
    6365    AddNChar (here, el.text, charLen);
     
    7072}
    7173
     74static bool ParsePotentialInteger(UCArray::const_iterator &here,
     75                  UCArray::const_iterator end,
     76                  LexEl &el) {
     77  el.Clear();
     78
     79  // this version of end is used in unitool
     80  UCArray::const_iterator endMinus1 = end-1;
     81
     82  int charLen=0;
     83  int length=0;
     84  unsigned short c; // one character lookahead
     85  charLen = parse_utf8_char (here, endMinus1, &c);
     86
     87  // read in number part
     88  int numeric=0;
     89  el.num = 0;
     90  el.lexType = IntegerE;
     91  /* stop integers at 4 digits */
     92  while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
     93    el.num = el.num*10 + c - '0';
     94    AddNChar (here, el.text, charLen);
     95    length += charLen;
     96    charLen = parse_utf8_char (here, endMinus1, &c);
     97  }
     98  // check the next character -if it is a letter, then have a term, not an integer
     99  if (!is_unicode_letter(c)) {
     100    // this was just an integer
     101    return (!el.text.empty());
     102  }
     103  // else its a term
     104  el.lexType = TermE;
     105  el.num = 0;
     106  /* this bit taken from ParseIndexWord in words.h*/
     107  while (length+charLen<=MAXSTEMLEN && charLen > 0 &&
     108     (is_unicode_letter(c) || (is_unicode_digit(c) &&
     109                   ++numeric <= MAXNUMERIC))) {
     110    AddNChar (here, el.text, charLen);
     111    length += charLen;
     112    charLen = parse_utf8_char (here, endMinus1, &c);
     113  }
     114
     115  return (!el.text.empty());
     116}
    72117static bool ParseTerm (UCArray::const_iterator &here,
    73118               UCArray::const_iterator end,
     
    165210    return true;
    166211   
    167   } else if (c == '+' || c == '-' ||
    168          (c >= '0' && c <= '9')) {
    169     return ParseInteger (here, end, el);
     212  } else if (c == '+' || c == '-' ) {
     213    return  ParseInteger (here, end, el);
     214  }
     215
     216  else if (c >= '0' && c <= '9') {
     217    return ParsePotentialInteger (here, end, el);
    170218  }
    171219
Note: See TracChangeset for help on using the changeset viewer.