root/main/trunk/greenstone2/common-src/indexers/mgpp/text/GSDLQueryLex.cpp @ 26294

Revision 26294, 7.7 KB (checked in by ak19, 7 years ago)

Fix for server-crashing bugs. Diego reported a bug when searching on partial numerical values such as dates of the form 28-02-2012 (spotted in collections of simple HTML files). Search results are returned, but clicking a resulting document crashes the server. During testing, it turned out that an alphanumeric string that I tried also caused the same problem in another part of the same code (same cpp file), so I fixed it in multiple places: the parsing code was reading past the end of the character array.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1/**************************************************************************
2 *
3 * GSDLQueryLex.cpp -- Lexical analyser for a simple query language
4 * Copyright (C) 2000  Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "GSDLQueryLex.h"
23#include "unitool.h"
24#include "words.h"
25
26inline void AddNChar (UCArray::const_iterator &here,
27              UCArray &text,
28              int len) {
29  if (text.capacity() < text.size() + len + 1) {
30    text.reserve(text.size() + len + 1);
31  }
32  while (len > 0) {
33    text.push_back (*here++);
34    --len;
35  }
36}
37
38static bool ParseInteger (UCArray::const_iterator &here,
39              UCArray::const_iterator end,
40              LexEl &el) {
41  el.Clear();
42
43  // this version of end is used in unitool
44  //  UCArray::const_iterator endMinus1 = end-1;
45  const unsigned char* endMinus1 = &*(end - 1);
46
47  int charLen;
48  unsigned short c; // one character lookahead
49  charLen = parse_utf8_char (&*here, endMinus1, &c);
50
51  // check for positive or negative
52  bool neg = false;
53  if (c == '+') {
54    AddNChar (here, el.text, charLen);
55    if(here != end) {
56        charLen = parse_utf8_char (&*here, endMinus1, &c);
57    }
58  } else if (c == '-') {
59    neg = true;
60    AddNChar (here, el.text, charLen);
61    if(here != end) {   
62        charLen = parse_utf8_char (&*here, endMinus1, &c);
63    }
64  }
65
66  // read in number part
67  int numeric=0;
68  el.num = 0;
69  el.lexType = IntegerE;
70  /* stop integers at 4 digits */
71  while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
72    el.num = el.num*10 + c - '0';
73    AddNChar (here, el.text, charLen);
74    if(here == end) {
75        break;
76    } else {
77        charLen = parse_utf8_char (&*here, endMinus1, &c);
78    }
79  }
80
81  if (neg) el.num *= -1;
82
83  return (!el.text.empty());
84}
85
86static bool ParsePotentialInteger(UCArray::const_iterator &here,
87                  UCArray::const_iterator end,
88                  LexEl &el) {
89  el.Clear();
90
91  // this version of end is used in unitool
92  //UCArray::const_iterator endMinus1 = end-1;
93  const unsigned char* endMinus1 = &*(end - 1);
94
95  int charLen=0;
96  int length=0;
97  unsigned short c; // one character lookahead
98  charLen = parse_utf8_char (&*here, endMinus1, &c);
99
100  // read in number part
101  int numeric=0;
102  el.num = 0;
103  el.lexType = IntegerE;
104
105  /* stop integers at 4 digits */
106  while (here != end) {
107
108      charLen = parse_utf8_char (&*here, endMinus1, &c);
109      if (c < '0' || c > '9') {
110          // reached a non-digit character
111          break;
112      }
113      el.num = el.num*10 + c - '0';
114      AddNChar (here, el.text, charLen); // advances 'here' by 'charLen'
115      length += charLen;
116
117      numeric++;
118
119      if (numeric == MAXNUMERIC) {
120          // reached the max length of a number
121          break;
122      }
123
124  }
125
126
127  // check the next character -if it is a letter, then have a term, not an integer
128  if (!is_unicode_letter(c)) {
129    // this was just an integer
130    return (!el.text.empty());
131  }
132  // else its a term
133  el.lexType = TermE;
134  el.num = 0;
135  /* this bit taken from ParseIndexWord in words.h*/
136  while (length+charLen<=MAXSTEMLEN && charLen > 0 &&
137     (is_unicode_letter(c) || (is_unicode_digit(c) &&
138                   ++numeric <= MAXNUMERIC))) {
139    AddNChar (here, el.text, charLen);
140    length += charLen;
141    if(here == end) {
142        break;
143    } else {
144        charLen = parse_utf8_char (&*here, endMinus1, &c);
145    }
146  }
147
148  return (!el.text.empty());
149}
150static bool ParseTerm (UCArray::const_iterator &here,
151               UCArray::const_iterator end,
152               UCArray &text) {
153  if (here == end)
154    return false;
155
156  //UCArray::const_iterator endMinus1 = end-1;
157  const unsigned char* endMinus1 = &*(end - 1);
158  const unsigned char* new_here = ParseIndexWord (&*here, endMinus1, text);
159  here += (new_here - &*here); // advance iterator by number of chars advanced
160  return !text.empty();
161}
162
163
164bool ParseLexEl (UCArray::const_iterator &here,
165         UCArray::const_iterator end,
166         LexEl &el) {
167  el.Clear();
168
169  // strange things can happen if here == end == 0
170  if (here == end)
171    return false;
172 
173  // this version of end is used in unitool
174  //UCArray::const_iterator endMinus1 = end-1;
175  const unsigned char* endMinus1 = &*(end - 1);
176
177  // ignore all white space
178  int charLen;
179  unsigned short c; // one character lookahead
180  charLen = parse_utf8_char (&*here, endMinus1, &c);
181  while (here != end && is_unicode_space (c)) {
182    here += charLen;
183    if (here == end) break;
184    charLen = parse_utf8_char (&*here, endMinus1, &c);
185  }
186  if (here == end) return false;
187
188  if (c == '(') {
189    el.lexType = OpenBracketE;
190    AddNChar (here, el.text, charLen);
191    return true;
192   
193  } else if (c == ')') {
194    el.lexType = CloseBracketE;
195    AddNChar (here, el.text, charLen);
196    return true;
197
198  } else if (c =='[') {
199    el.lexType = OpenSquareBracketE;
200    AddNChar (here, el.text, charLen);
201    return true;
202 
203  } else if (c ==']') {
204    el.lexType = CloseSquareBracketE;
205    AddNChar (here, el.text, charLen);
206    return true;
207 
208  } else if (c == '\"') {
209    el.lexType = QuoteE;
210    AddNChar (here, el.text, charLen);
211    return true;
212   
213  } else if (c == '/') {
214    el.lexType = TermWeightE;
215    AddNChar (here, el.text, charLen);
216    return true;
217   
218  } else if (c == '#') {
219    el.lexType = StemMethodE;
220    AddNChar (here, el.text, charLen);
221    return true;
222   
223  } else if (c == '*') {
224    el.lexType = StarE;
225    AddNChar (here, el.text, charLen);
226    return true;
227   
228  } else if (c == '^') {
229    el.lexType = RangeE;
230    AddNChar (here, el.text, charLen);
231    return true;
232   
233  } else if (c == '@') {
234    el.lexType = AtE;
235    AddNChar (here, el.text, charLen);
236    return true;
237   
238  } else if (c == ':') {
239    el.lexType = TagE;
240    AddNChar (here, el.text, charLen);
241    return true;
242   
243  } else if (c=='&') {
244    el.lexType = AndOpE;
245    AddNChar (here, el.text, charLen);
246    return true;
247   
248  } else if (c == '|') {
249    el.lexType = OrOpE;
250    AddNChar (here, el.text, charLen);
251    return true;
252   
253  } else if (c == '!') {
254    el.lexType = NotOpE;
255    AddNChar (here, el.text, charLen);
256    return true;
257   
258  } else if (c == '+' || c == '-' ) {
259    return  ParseInteger (here, end, el);
260  }
261
262  else if (c >= '0' && c <= '9') {
263    return ParsePotentialInteger (here, end, el);
264  }
265
266  // assume it is a term of some sort
267  if (!ParseTerm (here, end, el.text))  {
268    // parse term returns false if it hasn't parsed anything that is a term
269    // here should be the same as it was before
270    el.lexType = UnknownE;
271    AddNChar (here, el.text, charLen);
272    return true;
273  }
274  //return false;
275
276  //UCArray AND; SetCStr (AND, "AND");
277  //if (el.text == AND) {
278  if (UCArrayCStrEquals(el.text, "AND")) {
279    el.lexType = AndOpE;
280    return true;
281  }
282  //UCArray OR; SetCStr (OR, "OR");
283  //if (el.text == OR) {
284  if (UCArrayCStrEquals(el.text, "OR")) {
285    el.lexType = OrOpE;
286    return true;
287  }
288  //UCArray NOT; SetCStr (NOT, "NOT");
289  //if (el.text == NOT) {
290  if (UCArrayCStrEquals(el.text, "NOT")) {
291    el.lexType = NotOpE;
292    return true;
293  }
294  UCArray NEAR; SetCStr (NEAR, "NEAR", 4);
295  if (PrefixLen(el.text, NEAR)==4) {
296    el.lexType = NearOpE;
297    return true;
298  }
299  UCArray WITHIN; SetCStr (WITHIN, "WITHIN", 6);
300  if (PrefixLen(el.text, WITHIN)==6) {
301    el.lexType = WithinOpE;
302    return true;
303  }
304 
305  el.lexType = TermE;
306  return true;
307}
308
Note: See TracBrowser for help on using the browser.