source: trunk/indexers/mgpp/text/GSDLQueryParser.cpp@ 4210

Last change on this file since 4210 was 4210, checked in by kjdon, 21 years ago

parseQuery (in GSDLQueryParser) now returns NULL if there has been a syntax error. So all the auxiliary functions either return NULl
if there is an error, or set an error flag to true. Queryer now tells the user that there has been invalid syntax rather than seg faulting

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.5 KB
Line 
1/**************************************************************************
2 *
3 * QueryParser.cpp -- Query parser for a simple query language
4 * Copyright (C) 2000 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "GSDLQueryParser.h"
23#include "GSDLQueryLex.h"
24
25
26static QueryNode *ParseExpression (UCArray::const_iterator &here,
27 UCArray::const_iterator end,
28 int defaultBoolCombine,
29 int defaultStemMethod);
30
31static QueryNode *AndAdd (QueryNode *t1, QueryNode *t2) {
32 if (t1 == NULL) return t2;
33 if (t2 == NULL) return t1;
34
35 AndQueryNode *andNode = new AndQueryNode;
36 andNode->leftNode = t1;
37 andNode->rightNode = t2;
38 return andNode;
39}
40
41static QueryNode *OrAdd (QueryNode *t1, QueryNode *t2) {
42 if (t1 == NULL) return t2;
43 if (t2 == NULL) return t1;
44
45 OrQueryNode *orNode = new OrQueryNode;
46 orNode->leftNode = t1;
47 orNode->rightNode = t2;
48 return orNode;
49}
50
51static QueryNode *NotAdd (QueryNode *t1, QueryNode *t2) {
52 if (t1 == NULL) return t2;
53 if (t2 == NULL) return t1;
54
55 NotQueryNode *notNode = new NotQueryNode;
56 notNode->queryNode = t1;
57 notNode->notNode = t2;
58 return notNode;
59}
60
61// expects the opening bracket to have already been parsed
62// and discarded
63static QueryNode *ParseBracketExpression (UCArray::const_iterator &here,
64 UCArray::const_iterator end,
65 int defaultBoolCombine,
66 int defaultStemMethod) {
67 // get everything in the expression
68 QueryNode *curTree = ParseExpression (here, end, defaultBoolCombine,
69 defaultStemMethod);
70
71 // gobble up tokens until a closing bracket is found
72 // or the end of the string
73 LexEl el;
74 while (ParseLexEl (here, end, el)) {
75 if (el.lexType == CloseBracketE) break;
76 }
77
78 return curTree;
79}
80
81static int ParseInt (UCArray::const_iterator &here,
82 UCArray::const_iterator end) {
83 LexEl el;
84 UCArray::const_iterator oldHere = here;
85 if (ParseLexEl (here, end, el) && el.lexType == IntegerE)
86 return el.num;
87
88 here = oldHere; // not an integer
89 return 0;
90}
91
92// default is within 20 words
93static void SetRangeValues (TermNode &termNode,
94 UCArray &nearby) {
95 UCArray NEARBY; SetCStr(NEARBY, "NEAR");
96 if (nearby == NEARBY) { // no modifier
97 termNode.startRange = (NEAR_DEFAULT+1)*-1;
98 termNode.endRange = NEAR_DEFAULT;
99
100 }
101 else { // extract number
102 UCArray::const_iterator here = nearby.begin()+4;
103 UCArray::const_iterator end = nearby.end();
104 int size=0;
105 while (here != end) {
106 size = size*10 + (*here-'0');
107 here++;
108 }
109 termNode.startRange = -1 * (size+1);
110 termNode.endRange = size;
111 }
112}
113
114static unsigned long GetStemMethod(LexEl &el, int defaultStemMethod) {
115 // here expect el to contain some of c,s,i,u
116 // stem method 0 = c,u 00
117 // stem method 1 = i,u 01 - default for DL
118 // stem method 2 = c, s 10
119 // stem method 3 = i,s 11
120
121 unsigned long stem = (unsigned long)defaultStemMethod;
122
123 UCArray::const_iterator here = el.text.begin();
124 UCArray::const_iterator end = el.text.end();
125
126 unsigned char c1 = *here;
127 if (!(c1 == 'c'| c1 == 'i' | c1 == 'u' | c1 == 's'))
128 return 4; // incorrect format
129
130 here++;
131 unsigned char c2 = 'a';
132 if (here !=end) {
133 c2 = *here;
134 if (!(c2 == 'c'| c2 == 'i' | c2 == 'u' | c2 == 's'))
135 return 4; // incorrect format
136 }
137
138 if (c1 == 'i'|| c2=='i') stem |= 1; // set bit 0 to 1
139 if (c1 == 'c' || c2 == 'c') stem &=0xe; //set bit 0 to 0
140 if (c1 == 's'|| c2 == 's') stem |= 2; // set bit 1 to 1
141 if (c1 == 'u' || c2 =='u') stem &=0xd; // set bit 1 to 0
142 return stem;
143}
144
145
146static void ParseTermModifiers (UCArray::const_iterator &here,
147 UCArray::const_iterator end,
148 TermNode &termNode,
149 int defaultStemMethod) {
150
151 termNode.stemMethod = defaultStemMethod;
152
153 LexEl el;
154 UCArray::const_iterator oldHere = here;
155 while (ParseLexEl (here, end, el)) {
156 if (el.lexType == TermWeightE) {
157 termNode.termWeight = ParseInt (here, end);
158
159 } else if (el.lexType == StemMethodE) {
160 oldHere = here;
161 LexEl stem;
162 if (ParseLexEl (here, end, stem) && stem.lexType == TermE) {
163 termNode.stemMethod = GetStemMethod(stem, defaultStemMethod);
164 if (termNode.stemMethod == 4) { // error so backtrack
165 here = oldHere;
166 termNode.stemMethod = (unsigned long)defaultStemMethod;
167 }
168 }else here = oldHere; //ignore - wrong syntax
169
170 } else if (el.lexType == RangeE) {
171 termNode.startRange = ParseInt (here, end);
172 termNode.endRange = ParseInt (here, end);
173
174 } else if (el.lexType == AtE) {
175 termNode.startRange = termNode.endRange = ParseInt (here, end);
176
177 } else {
178 // no term modifiers
179 here = oldHere;
180 break;
181 }
182
183 oldHere = here;
184 }
185}
186
187static void ParseProxModifiers (UCArray::const_iterator &here,
188 UCArray::const_iterator end,
189 ProxMatchQueryNode *proxNode) {
190 // so far only have one - the tag stuff
191 LexEl el;
192 UCArray::const_iterator oldHere = here;
193 while (ParseLexEl (here, end, el)) {
194 if (el.lexType == TagE) {
195 oldHere = here; // don't backtrack past here
196 if (ParseLexEl (here, end, el) && el.lexType == TermE) {
197 proxNode->tagNodePtr = new TagNode;
198 proxNode->tagNodePtr->tagName = el.text;
199
200 }
201 else { // error in tag
202 here = oldHere;
203 }
204 } // TagE
205 // add in other cases here
206 else {
207 // no modifiers
208 here = oldHere;
209 break;
210 }
211 oldHere = here;
212 }//while
213
214
215}
216
217// expects starting brackets to have been parsed
218// sets error to true if something has gone wrong
219static void ParseSquareBrackets(UCArray::const_iterator &here,
220 UCArray::const_iterator end,
221 ProxMatchQueryNode *proxNode,
222 int defaultStemMethod,
223 bool & error) {
224
225 LexEl el;
226 bool phrase=false;
227 bool first=true;
228 while (ParseLexEl (here, end, el)) {
229 if (el.lexType == TermE || el.lexType == IntegerE) {
230 TermNode termNode;
231 termNode.term = el.text;
232 ParseTermModifiers (here, end, termNode, defaultStemMethod);
233 if (phrase) {
234 if (first) first=false;
235 else {
236 termNode.startRange = -2;
237 termNode.endRange = -1;
238 }
239 }
240 proxNode->terms.push_back(termNode);
241 }
242 else if (el.lexType == CloseSquareBracketE) {
243 break;
244 }
245 else if (el.lexType == QuoteE) {
246 // phrase inside square brackets
247 if (phrase) phrase=false;
248 else phrase=true;
249 }
250 else {
251 //error - we set the proxNode to NULL,
252 cerr <<"GSDLQueryParser: bad syntax inside []\n";
253 error = true;
254 return;
255 }
256 } // while
257
258}
259// expects the starting quote to have been parsed
260// and discarded
261// now phrases use the case and stem preference options
262// ie can search for a phrase ignoring case
263static void ParsePhrase (UCArray::const_iterator &here,
264 UCArray::const_iterator end,
265 ProxMatchQueryNode &proxNode,
266 int defaultStemMethod,
267 bool &error) {
268 LexEl el;
269 bool first = true;
270 while (ParseLexEl (here, end, el)) {
271 if (el.lexType == TermE || el.lexType == IntegerE) {
272 TermNode termNode;
273 termNode.term = el.text;
274 //termNode.stemMethod = defaultStemMethod;
275 ParseTermModifiers (here, end, termNode, defaultStemMethod);
276 if (first) {
277 first = false;
278 }
279 else {
280 termNode.startRange = -2;
281 termNode.endRange = -1;
282 }
283 proxNode.terms.push_back (termNode);
284
285 } else if (el.lexType == QuoteE) {
286 break;
287
288 } else {
289 // error
290 error = true;
291 return;
292 }
293 }
294}
295
296static QueryNode *ParseTerm (UCArray::const_iterator &here,
297 UCArray::const_iterator end,
298 int defaultBoolCombine,
299 int defaultStemMethod) {
300 LexEl el;
301
302 UCArray::const_iterator oldHere = here;
303 if (!ParseLexEl (here, end, el)) return NULL;
304
305 if (el.lexType == OpenBracketE)
306 return ParseBracketExpression (here, end, defaultBoolCombine,
307 defaultStemMethod);
308
309 ProxMatchQueryNode *proxNode = new ProxMatchQueryNode;
310
311 if (el.lexType == TermE || el.lexType == IntegerE) {
312 TermNode termNode;
313 termNode.term = el.text;
314 ParseTermModifiers (here, end, termNode, defaultStemMethod);
315 oldHere = here; // dont backtrack past here
316 if (ParseLexEl(here, end, el) && el.lexType == NearOpE) {
317 delete proxNode;
318 oldHere = here;
319 // this is calling ParseTerm again, but only a subset of the things accepted by ParseTerm are appropriate here. add in some hacks to avoid segmentation faults - kjdon, 04/2003
320
321 // if the next element is a '(' have a syntax error, return NULL
322 LexEl temp_el;
323 if (ParseLexEl(here, end, temp_el) && temp_el.lexType == OpenBracketE) {
324 cerr << "GSDLQueryParser: NEAR cannot be followed by a '('\n";
325 return NULL;
326 }
327 here = oldHere; // else backtrack
328
329 proxNode = (ProxMatchQueryNode *)ParseTerm(here, end, defaultBoolCombine,
330 defaultStemMethod);
331 SetRangeValues(termNode, el.text);
332 proxNode->terms.push_back (termNode);
333 return proxNode;
334
335 } else {
336 here = oldHere; // backtrack
337 proxNode->terms.push_back (termNode);
338 ParseProxModifiers(here, end, proxNode);
339 return proxNode;
340 }
341 } else if (el.lexType == QuoteE) {
342 bool error = false;
343 ParsePhrase (here, end, *proxNode, defaultStemMethod, error);
344 if (error) {
345 delete proxNode;
346 return NULL;
347 }
348 return proxNode;
349 }
350 else if (el.lexType == OpenSquareBracketE) {
351 bool error = false;
352 ParseSquareBrackets (here, end, proxNode, defaultStemMethod, error);
353 if (error) {
354 delete proxNode;
355 return NULL;
356 }
357 ParseProxModifiers (here, end, proxNode);
358 return proxNode;
359 }
360
361 // not a term
362 here = oldHere;
363 delete proxNode;
364 return NULL;
365}
366
367
368static QueryNode *ParseExpression (UCArray::const_iterator &here,
369 UCArray::const_iterator end,
370 int defaultBoolCombine,
371 int defaultStemMethod) {
372 LexEl el;
373 QueryNode *curTree = NULL;
374 UCArray::const_iterator oldHere = here;
375 while (ParseLexEl (here, end, el)) {
376 if (el.lexType == CloseBracketE) {
377 // parsebracketexpression is waiting for the last bracket, so put it back
378 here = oldHere;
379 break;
380
381 } else if (el.lexType == OpenSquareBracketE ||
382 el.lexType == OpenBracketE ||
383 el.lexType == TermE ||
384 el.lexType == QuoteE ||
385 el.lexType == IntegerE ) {
386
387 // some type of term, back track and parse it
388 here = oldHere;
389
390 // parse the term
391 QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
392 defaultStemMethod);
393 if (newTerm == NULL) {
394 delete curTree;
395 return NULL;
396 }
397
398 // if default==1, AND, else if==0, OR
399 if (defaultBoolCombine) {
400 curTree = AndAdd (curTree, newTerm);
401 }
402 else {
403 curTree = OrAdd (curTree, newTerm);
404 }
405
406 } else if (el.lexType == AndOpE) {
407 QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
408 defaultStemMethod);
409 if (newTerm == NULL) {
410 delete curTree;
411 return NULL;
412 }
413 curTree = AndAdd (curTree, newTerm);
414
415 } else if (el.lexType == OrOpE) {
416 QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
417 defaultStemMethod);
418 if (newTerm == NULL) {
419 delete curTree;
420 return NULL;
421 }
422 curTree = OrAdd (curTree, newTerm);
423
424 } else if (el.lexType == NotOpE) {
425 QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
426 defaultStemMethod);
427 if (newTerm == NULL) {
428 delete curTree;
429 return NULL;
430 }
431 curTree = NotAdd (curTree, newTerm);
432
433 } else {
434
435 // syntax error, return NUll
436 delete curTree;
437 return NULL;
438 }
439
440 oldHere = here;
441 }
442
443 return curTree;
444}
445
446QueryNode *ParseQuery (const UCArray &queryStr, int defaultBoolCombine,
447 int defaultStemMethod) {
448 UCArray::const_iterator here = queryStr.begin();
449 UCArray::const_iterator end = queryStr.end();
450 return ParseExpression (here, end, defaultBoolCombine, defaultStemMethod);
451}
Note: See TracBrowser for help on using the repository browser.