source: main/trunk/greenstone2/common-src/indexers/mgpp/text/GSDLQueryLex.cpp@ 26294

Last change on this file since 26294 was 26294, checked in by ak19, 12 years ago

Fix to server crashing bugs. Diego reported a bug when searching on partial numerical values like dates of the form 28-02-2012 (spotted in collections of simple html files). Search results are returned, but clicking a resulting document crashes the server. During testing, it turned out that an alphanumeric string that I tried also caused the same problem in another part of the same code (same cpp file), so I fixed it in multiple places: it was going past the array.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.7 KB
RevLine 
[3365]1/**************************************************************************
2 *
3 * GSDLQueryLex.cpp -- Lexical analyser for a simple query language
4 * Copyright (C) 2000 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "GSDLQueryLex.h"
23#include "unitool.h"
24#include "words.h"
25
26inline void AddNChar (UCArray::const_iterator &here,
27 UCArray &text,
28 int len) {
[8692]29 if (text.capacity() < text.size() + len + 1) {
30 text.reserve(text.size() + len + 1);
31 }
[3365]32 while (len > 0) {
33 text.push_back (*here++);
[8692]34 --len;
[3365]35 }
36}
37
38static bool ParseInteger (UCArray::const_iterator &here,
39 UCArray::const_iterator end,
40 LexEl &el) {
41 el.Clear();
42
43 // this version of end is used in unitool
44 // UCArray::const_iterator endMinus1 = end-1;
[12318]45 const unsigned char* endMinus1 = &*(end - 1);
[3365]46
47 int charLen;
48 unsigned short c; // one character lookahead
49 charLen = parse_utf8_char (&*here, endMinus1, &c);
50
51 // check for positive or negative
52 bool neg = false;
53 if (c == '+') {
54 AddNChar (here, el.text, charLen);
[26294]55 if(here != end) {
56 charLen = parse_utf8_char (&*here, endMinus1, &c);
57 }
[3365]58 } else if (c == '-') {
59 neg = true;
60 AddNChar (here, el.text, charLen);
[26294]61 if(here != end) {
62 charLen = parse_utf8_char (&*here, endMinus1, &c);
63 }
[3365]64 }
65
66 // read in number part
67 int numeric=0;
68 el.num = 0;
69 el.lexType = IntegerE;
70 /* stop integers at 4 digits */
71 while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
72 el.num = el.num*10 + c - '0';
73 AddNChar (here, el.text, charLen);
[26294]74 if(here == end) {
75 break;
76 } else {
77 charLen = parse_utf8_char (&*here, endMinus1, &c);
78 }
[3365]79 }
80
81 if (neg) el.num *= -1;
82
83 return (!el.text.empty());
84}
85
86static bool ParsePotentialInteger(UCArray::const_iterator &here,
87 UCArray::const_iterator end,
88 LexEl &el) {
89 el.Clear();
90
91 // this version of end is used in unitool
92 //UCArray::const_iterator endMinus1 = end-1;
[12318]93 const unsigned char* endMinus1 = &*(end - 1);
[3365]94
95 int charLen=0;
96 int length=0;
97 unsigned short c; // one character lookahead
98 charLen = parse_utf8_char (&*here, endMinus1, &c);
99
100 // read in number part
101 int numeric=0;
102 el.num = 0;
103 el.lexType = IntegerE;
[25526]104
[3365]105 /* stop integers at 4 digits */
[25526]106 while (here != end) {
107
108 charLen = parse_utf8_char (&*here, endMinus1, &c);
109 if (c < '0' || c > '9') {
110 // reached a non-digit character
111 break;
112 }
113 el.num = el.num*10 + c - '0';
114 AddNChar (here, el.text, charLen); // advances 'here' by 'charLen'
115 length += charLen;
116
117 numeric++;
118
119 if (numeric == MAXNUMERIC) {
120 // reached the max length of a number
121 break;
122 }
123
[3365]124 }
[25526]125
126
[3365]127 // check the next character -if it is a letter, then have a term, not an integer
128 if (!is_unicode_letter(c)) {
129 // this was just an integer
130 return (!el.text.empty());
131 }
132 // else its a term
133 el.lexType = TermE;
134 el.num = 0;
135 /* this bit taken from ParseIndexWord in words.h*/
136 while (length+charLen<=MAXSTEMLEN && charLen > 0 &&
137 (is_unicode_letter(c) || (is_unicode_digit(c) &&
138 ++numeric <= MAXNUMERIC))) {
139 AddNChar (here, el.text, charLen);
140 length += charLen;
[26294]141 if(here == end) {
142 break;
143 } else {
144 charLen = parse_utf8_char (&*here, endMinus1, &c);
145 }
[3365]146 }
147
148 return (!el.text.empty());
149}
150static bool ParseTerm (UCArray::const_iterator &here,
151 UCArray::const_iterator end,
152 UCArray &text) {
[12318]153 if (here == end)
154 return false;
155
[3365]156 //UCArray::const_iterator endMinus1 = end-1;
[12318]157 const unsigned char* endMinus1 = &*(end - 1);
[3365]158 const unsigned char* new_here = ParseIndexWord (&*here, endMinus1, text);
159 here += (new_here - &*here); // advance iterator by number of chars advanced
160 return !text.empty();
161}
162
163
164bool ParseLexEl (UCArray::const_iterator &here,
165 UCArray::const_iterator end,
166 LexEl &el) {
167 el.Clear();
168
169 // strange things can happen if here == end == 0
[12318]170 if (here == end)
171 return false;
[3365]172
173 // this version of end is used in unitool
174 //UCArray::const_iterator endMinus1 = end-1;
[12318]175 const unsigned char* endMinus1 = &*(end - 1);
[3365]176
177 // ignore all white space
178 int charLen;
179 unsigned short c; // one character lookahead
180 charLen = parse_utf8_char (&*here, endMinus1, &c);
181 while (here != end && is_unicode_space (c)) {
182 here += charLen;
[18340]183 if (here == end) break;
[3365]184 charLen = parse_utf8_char (&*here, endMinus1, &c);
185 }
186 if (here == end) return false;
187
188 if (c == '(') {
189 el.lexType = OpenBracketE;
190 AddNChar (here, el.text, charLen);
191 return true;
192
193 } else if (c == ')') {
194 el.lexType = CloseBracketE;
195 AddNChar (here, el.text, charLen);
196 return true;
197
198 } else if (c =='[') {
199 el.lexType = OpenSquareBracketE;
200 AddNChar (here, el.text, charLen);
201 return true;
202
203 } else if (c ==']') {
204 el.lexType = CloseSquareBracketE;
205 AddNChar (here, el.text, charLen);
206 return true;
207
208 } else if (c == '\"') {
209 el.lexType = QuoteE;
210 AddNChar (here, el.text, charLen);
211 return true;
212
213 } else if (c == '/') {
214 el.lexType = TermWeightE;
215 AddNChar (here, el.text, charLen);
216 return true;
217
218 } else if (c == '#') {
219 el.lexType = StemMethodE;
220 AddNChar (here, el.text, charLen);
221 return true;
222
[8242]223 } else if (c == '*') {
224 el.lexType = StarE;
225 AddNChar (here, el.text, charLen);
226 return true;
227
[3365]228 } else if (c == '^') {
229 el.lexType = RangeE;
230 AddNChar (here, el.text, charLen);
231 return true;
232
233 } else if (c == '@') {
234 el.lexType = AtE;
235 AddNChar (here, el.text, charLen);
236 return true;
237
238 } else if (c == ':') {
239 el.lexType = TagE;
240 AddNChar (here, el.text, charLen);
241 return true;
242
243 } else if (c=='&') {
244 el.lexType = AndOpE;
245 AddNChar (here, el.text, charLen);
246 return true;
247
248 } else if (c == '|') {
249 el.lexType = OrOpE;
250 AddNChar (here, el.text, charLen);
251 return true;
252
253 } else if (c == '!') {
254 el.lexType = NotOpE;
255 AddNChar (here, el.text, charLen);
256 return true;
257
258 } else if (c == '+' || c == '-' ) {
259 return ParseInteger (here, end, el);
260 }
261
262 else if (c >= '0' && c <= '9') {
263 return ParsePotentialInteger (here, end, el);
264 }
265
266 // assume it is a term of some sort
[5449]267 if (!ParseTerm (here, end, el.text)) {
268 // parse term returns false if it hasn't parsed anything that is a term
269 // here should be the same as it was before
270 el.lexType = UnknownE;
271 AddNChar (here, el.text, charLen);
272 return true;
273 }
274 //return false;
[3365]275
[8692]276 //UCArray AND; SetCStr (AND, "AND");
277 //if (el.text == AND) {
278 if (UCArrayCStrEquals(el.text, "AND")) {
[3365]279 el.lexType = AndOpE;
280 return true;
281 }
[8692]282 //UCArray OR; SetCStr (OR, "OR");
283 //if (el.text == OR) {
284 if (UCArrayCStrEquals(el.text, "OR")) {
[3365]285 el.lexType = OrOpE;
286 return true;
287 }
[8692]288 //UCArray NOT; SetCStr (NOT, "NOT");
289 //if (el.text == NOT) {
290 if (UCArrayCStrEquals(el.text, "NOT")) {
[3365]291 el.lexType = NotOpE;
292 return true;
293 }
[8692]294 UCArray NEAR; SetCStr (NEAR, "NEAR", 4);
[3365]295 if (PrefixLen(el.text, NEAR)==4) {
296 el.lexType = NearOpE;
297 return true;
298 }
[8692]299 UCArray WITHIN; SetCStr (WITHIN, "WITHIN", 6);
[6119]300 if (PrefixLen(el.text, WITHIN)==6) {
301 el.lexType = WithinOpE;
302 return true;
303 }
304
[3365]305 el.lexType = TermE;
306 return true;
307}
308
Note: See TracBrowser for help on using the repository browser.