source: trunk/indexers/mgpp/text/GSDLQueryParser.cpp@ 12321

Last change on this file since 12321 was 12321, checked in by kjdon, 15 years ago

made MAXNUMERIC a global variable instead of a \#define. Its now a command line arg to mgpp_passes, and an option in Queryer, and a parameter to ParseQuery

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 14.5 KB
Line 
1/**************************************************************************
2 *
3 * QueryParser.cpp -- Query parser for a simple query language
4 * Copyright (C) 2000 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "GSDLQueryParser.h"
23#include "GSDLQueryLex.h"
24#include "words.h"
25
26static QueryNode *ParseExpression (UCArray::const_iterator &here,
27 UCArray::const_iterator end,
28 int defaultBoolCombine,
29 int defaultStemMethod);
30
31static QueryNode *AndAdd (QueryNode *t1, QueryNode *t2) {
32 if (t1 == NULL) return t2;
33 if (t2 == NULL) return t1;
34
35 AndQueryNode *andNode = new AndQueryNode;
36 andNode->leftNode = t1;
37 andNode->rightNode = t2;
38 return andNode;
39}
40
41static QueryNode *OrAdd (QueryNode *t1, QueryNode *t2) {
42 if (t1 == NULL) return t2;
43 if (t2 == NULL) return t1;
44
45 OrQueryNode *orNode = new OrQueryNode;
46 orNode->leftNode = t1;
47 orNode->rightNode = t2;
48 return orNode;
49}
50
51static QueryNode *NotAdd (QueryNode *t1, QueryNode *t2) {
52 if (t1 == NULL) return t2;
53 if (t2 == NULL) return t1;
54
55 NotQueryNode *notNode = new NotQueryNode;
56 notNode->queryNode = t1;
57 notNode->notNode = t2;
58 return notNode;
59}
60
61// expects the opening bracket to have already been parsed
62// and discarded
63static QueryNode *ParseBracketExpression (UCArray::const_iterator &here,
64 UCArray::const_iterator end,
65 int defaultBoolCombine,
66 int defaultStemMethod) {
67 // get everything in the expression
68 QueryNode *curTree = ParseExpression (here, end, defaultBoolCombine,
69 defaultStemMethod);
70
71 // gobble up tokens until a closing bracket is found
72 // or the end of the string
73 LexEl el;
74 while (ParseLexEl (here, end, el)) {
75 if (el.lexType == CloseBracketE) break;
76 }
77
78 return curTree;
79}
80
81static int ParseInt (UCArray::const_iterator &here,
82 UCArray::const_iterator end) {
83 LexEl el;
84 UCArray::const_iterator oldHere = here;
85 if (ParseLexEl (here, end, el) && el.lexType == IntegerE)
86 return el.num;
87
88 here = oldHere; // not an integer
89 return 0;
90}
91
92// default is within 20 words
93static void SetRangeValues (TermNode &termNode,
94 UCArray &nearby,
95 bool reverse) {
96 UCArray NEARBY; SetCStr(NEARBY, "NEAR", 4);
97 UCArray WITHIN; SetCStr(WITHIN, "WITHIN", 6);
98
99 if (nearby == NEARBY) { // no modifier
100 termNode.startRange = (NEAR_DEFAULT+1)*-1;
101 termNode.endRange = NEAR_DEFAULT;
102
103 } else if (nearby == WITHIN) { // no modifier
104 if (reverse) {
105 termNode.startRange = (NEAR_DEFAULT+1)*-1;
106 termNode.endRange = -1;
107 } else {
108 termNode.startRange = NEAR_DEFAULT;
109 termNode.endRange = 0;
110 }
111 }
112 else { // extract number
113 UCArray::const_iterator here;
114 bool within = false;
115 if (PrefixLen(nearby, WITHIN)==6) {
116 within=true;
117 here = nearby.begin()+6;
118 } else {
119 here = nearby.begin()+4;
120 }
121 UCArray::const_iterator end = nearby.end();
122 int size=0;
123 while (here != end) {
124 size = size*10 + (*here-'0');
125 ++here;
126 }
127 if (within) {
128 if (reverse) {
129 termNode.startRange = size;
130 termNode.endRange = 0;
131 } else {
132 termNode.startRange = -1 * (size+1);
133 termNode.endRange = -1;
134 }
135 } else {
136 termNode.startRange = -1 * (size+1);
137 termNode.endRange = size;
138 }
139 }
140}
141
142static unsigned long GetStemMethod(LexEl &el, int defaultStemMethod) {
143 // here expect el to contain some of c,s,i,u
144 // stem method 0 = c,u 00
145 // stem method 1 = i,u 01 - default for DL
146 // stem method 2 = c, s 10
147 // stem method 3 = i,s 11
148
149 unsigned long stem = (unsigned long)defaultStemMethod;
150
151 UCArray::const_iterator here = el.text.begin();
152 UCArray::const_iterator end = el.text.end();
153
154 unsigned char c1 = *here;
155 if (!(c1 == 'c'|| c1 == 'i' || c1 == 'u' || c1 == 's'))
156 return 4; // incorrect format
157
158 ++here;
159 unsigned char c2 = 'a';
160 if (here !=end) {
161 c2 = *here;
162 if (!(c2 == 'c'|| c2 == 'i' || c2 == 'u' || c2 == 's'))
163 return 4; // incorrect format
164 }
165
166 if (c1 == 'i'|| c2=='i') stem |= 1; // set bit 0 to 1
167 if (c1 == 'c' || c2 == 'c') stem &=0xe; //set bit 0 to 0
168 if (c1 == 's'|| c2 == 's') stem |= 2; // set bit 1 to 1
169 if (c1 == 'u' || c2 =='u') stem &=0xd; // set bit 1 to 0
170 return stem;
171}
172
173
174static void ParseTermModifiers (UCArray::const_iterator &here,
175 UCArray::const_iterator end,
176 TermNode &termNode,
177 int defaultStemMethod) {
178
179 termNode.stemMethod = defaultStemMethod;
180 bool partial_match = false;
181 LexEl el;
182 UCArray::const_iterator oldHere = here;
183 while (ParseLexEl (here, end, el)) {
184 if (el.lexType == TermWeightE) {
185 termNode.termWeight = ParseInt (here, end);
186
187 } else if (el.lexType == StemMethodE) {
188 oldHere = here;
189 LexEl stem;
190 if (ParseLexEl (here, end, stem) && stem.lexType == TermE) {
191 termNode.stemMethod = GetStemMethod(stem, defaultStemMethod);
192 if (termNode.stemMethod == 4) { // error so backtrack
193 here = oldHere;
194 termNode.stemMethod = (unsigned long)defaultStemMethod;
195 }
196 }else here = oldHere; //ignore - wrong syntax
197
198 } else if (el.lexType == RangeE) {
199 termNode.startRange = ParseInt (here, end);
200 termNode.endRange = ParseInt (here, end);
201
202 } else if (el.lexType == AtE) {
203 termNode.startRange = termNode.endRange = ParseInt (here, end);
204 } else if (el.lexType == StarE) {
205 partial_match = true;
206 } else {
207 // no term modifiers
208 here = oldHere;
209 break;
210 }
211
212 if (partial_match) {
213 termNode.stemMethod |= 4; // set bit 2 to 1
214 termNode.stemMethod &=0xd; // set bit 1 to 0 // we dont have stemming on if doing partial matching.
215 }
216 oldHere = here;
217 }
218}
219
220static void ParseProxModifiers (UCArray::const_iterator &here,
221 UCArray::const_iterator end,
222 ProxMatchQueryNode *proxNode) {
223 // so far only have one - the tag stuff
224 LexEl el;
225 UCArray::const_iterator oldHere = here;
226 while (ParseLexEl (here, end, el)) {
227 if (el.lexType == TagE) {
228 oldHere = here; // don't backtrack past here
229 if (ParseLexEl (here, end, el) && el.lexType == TermE) {
230 proxNode->tagNodePtr = new TagNode;
231 proxNode->tagNodePtr->tagName = el.text;
232
233 }
234 else { // error in tag
235 here = oldHere;
236 }
237 } // TagE
238 // add in other cases here
239 else {
240 // no modifiers
241 here = oldHere;
242 break;
243 }
244 oldHere = here;
245 }//while
246
247
248}
249
250// expects starting brackets to have been parsed
251// sets error to true if something has gone wrong
252static ProxMatchQueryNode *ParseSquareBrackets(UCArray::const_iterator &here,
253 UCArray::const_iterator end,
254 /*ProxMatchQueryNode *proxNode,*/
255 int defaultStemMethod,
256 bool & error) {
257
258 ProxMatchQueryNode *proxNode = new ProxMatchQueryNode;
259 LexEl el;
260 bool phrase=false;
261 bool first=true;
262 bool prox = false;
263 UCArray near_string;
264 while (ParseLexEl (here, end, el)) {
265 // cant have AND, OR, NOT in square brackets, so assume they are words
266 if (el.lexType == TermE || el.lexType == IntegerE || el.lexType == AndOpE || el.lexType == OrOpE || el.lexType == NotOpE) {
267 TermNode termNode;
268 termNode.term = el.text;
269 ParseTermModifiers (here, end, termNode, defaultStemMethod);
270 if (phrase) {
271 if (first) first=false;
272 else {
273 termNode.startRange = -2;
274 termNode.endRange = -1;
275 }
276 } else if (prox) {
277 SetRangeValues(termNode, near_string, false);
278 prox = false;
279 }
280 proxNode->terms.push_back(termNode);
281 }
282 else if (el.lexType == CloseSquareBracketE) {
283 break;
284 }
285 else if (el.lexType == QuoteE) {
286 // phrase inside square brackets
287 if (phrase) { // end of phrase
288 phrase=false;
289 first = true;
290 } else {
291 phrase=true; // start of phrase
292 }
293 } else if (el.lexType == NearOpE || el.lexType == WithinOpE) {
294 if (phrase) {
295 // cant have proximity op in a phrase - just assume its an actual word
296 TermNode termNode;
297 termNode.term = el.text;
298 ParseTermModifiers (here, end, termNode, defaultStemMethod);
299 proxNode->terms.push_back(termNode);
300 } else {
301 // its a NEAR or within op
302 prox = true;
303 near_string = el.text;
304 }
305
306 }
307 else if (el.lexType == UnknownE) {
308 // just ignore it
309 }
310 else {
311 //error - we set the proxNode to NULL,
312 cerr <<"GSDLQueryParser: bad syntax inside []\n";
313 error = true;
314 return NULL;
315 }
316 } // while
317 return proxNode;
318}
319// expects the starting quote to have been parsed
320// and discarded
321// now phrases use the case and stem preference options
322// ie can search for a phrase ignoring case
323static void ParsePhrase (UCArray::const_iterator &here,
324 UCArray::const_iterator end,
325 ProxMatchQueryNode &proxNode,
326 int defaultStemMethod,
327 bool &error) {
328 LexEl el;
329 bool first = true;
330 while (ParseLexEl (here, end, el)) {
331 if (el.lexType == TermE || el.lexType == IntegerE) {
332 TermNode termNode;
333 termNode.term = el.text;
334 //termNode.stemMethod = defaultStemMethod;
335 ParseTermModifiers (here, end, termNode, defaultStemMethod);
336 if (first) {
337 first = false;
338 }
339 else {
340 termNode.startRange = -2;
341 termNode.endRange = -1;
342 }
343 proxNode.terms.push_back (termNode);
344
345 } else if (el.lexType == QuoteE) {
346 break;
347
348 } else if (el.lexType == UnknownE) {
349 // just ignore it
350 } else {
351 // error
352 error = true;
353 return;
354 }
355 }
356}
357
358static QueryNode *ParseTerm (UCArray::const_iterator &here,
359 UCArray::const_iterator end,
360 int defaultBoolCombine,
361 int defaultStemMethod) {
362 LexEl el;
363
364 UCArray::const_iterator oldHere = here;
365 if (!ParseLexEl (here, end, el)) return NULL;
366
367 if (el.lexType == OpenBracketE)
368 return ParseBracketExpression (here, end, defaultBoolCombine,
369 defaultStemMethod);
370
371 ProxMatchQueryNode *proxNode = new ProxMatchQueryNode;
372
373 if (el.lexType == TermE || el.lexType == IntegerE) {
374 TermNode termNode;
375 termNode.term = el.text;
376 ParseTermModifiers (here, end, termNode, defaultStemMethod);
377 oldHere = here; // dont backtrack past here
378 if (ParseLexEl(here, end, el) && (el.lexType == NearOpE || el.lexType == WithinOpE )) {
379 delete proxNode;
380 oldHere = here;
381 // this is calling ParseTerm again, but only a subset of the things accepted by ParseTerm are appropriate here. add in some hacks to avoid segmentation faults - kjdon, 04/2003
382
383 // if the next element is a '(' have a syntax error, return NULL
384 LexEl temp_el;
385 if (ParseLexEl(here, end, temp_el) && temp_el.lexType == OpenBracketE) {
386 cerr << "GSDLQueryParser: NEAR/WITHIN cannot be followed by a '('\n";
387 return NULL;
388 }
389 here = oldHere; // else backtrack
390
391 proxNode = (ProxMatchQueryNode *)ParseTerm(here, end, defaultBoolCombine,
392 defaultStemMethod);
393 SetRangeValues(termNode, el.text, true);
394 proxNode->terms.push_back (termNode);
395 return proxNode;
396
397 } else {
398 here = oldHere; // backtrack
399 proxNode->terms.push_back (termNode);
400 ParseProxModifiers(here, end, proxNode);
401 return proxNode;
402 }
403 } else if (el.lexType == QuoteE) {
404 bool error = false;
405 ParsePhrase (here, end, *proxNode, defaultStemMethod, error);
406 if (error) {
407 delete proxNode;
408 return NULL;
409 }
410 return proxNode;
411 }
412 else if (el.lexType == OpenSquareBracketE) {
413 bool error = false;
414 proxNode = ParseSquareBrackets (here, end, /*proxNode, */defaultStemMethod, error);
415 if (error) {
416 delete proxNode;
417 return NULL;
418 }
419 ParseProxModifiers (here, end, proxNode);
420 return proxNode;
421 }
422
423 // not a term
424 here = oldHere;
425 delete proxNode;
426 return NULL;
427}
428
429
430static QueryNode *ParseExpression (UCArray::const_iterator &here,
431 UCArray::const_iterator end,
432 int defaultBoolCombine,
433 int defaultStemMethod) {
434 LexEl el;
435 QueryNode *curTree = NULL;
436 UCArray::const_iterator oldHere = here;
437 while (ParseLexEl (here, end, el)) {
438 if (el.lexType == CloseBracketE) {
439 // parsebracketexpression is waiting for the last bracket, so put it back
440 here = oldHere;
441 break;
442
443 } else if (el.lexType == OpenSquareBracketE ||
444 el.lexType == OpenBracketE ||
445 el.lexType == TermE ||
446 el.lexType == QuoteE ||
447 el.lexType == IntegerE ) {
448
449 // some type of term, back track and parse it
450 here = oldHere;
451
452 // parse the term
453 QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
454 defaultStemMethod);
455 if (newTerm == NULL) {
456 delete curTree;
457 return NULL;
458 }
459
460 // if default==1, AND, else if==0, OR
461 if (defaultBoolCombine) {
462 curTree = AndAdd (curTree, newTerm);
463 }
464 else {
465 curTree = OrAdd (curTree, newTerm);
466 }
467
468 } else if (el.lexType == AndOpE) {
469 QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
470 defaultStemMethod);
471 if (newTerm == NULL) {
472 delete curTree;
473 return NULL;
474 }
475 curTree = AndAdd (curTree, newTerm);
476
477 } else if (el.lexType == OrOpE) {
478 QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
479 defaultStemMethod);
480 if (newTerm == NULL) {
481 delete curTree;
482 return NULL;
483 }
484 curTree = OrAdd (curTree, newTerm);
485
486 } else if (el.lexType == NotOpE) {
487 QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
488 defaultStemMethod);
489 if (newTerm == NULL) {
490 delete curTree;
491 return NULL;
492 }
493 curTree = NotAdd (curTree, newTerm);
494
495 } else if (el.lexType == UnknownE) {
496 // just ignore it
497 } else {
498
499 // syntax error, return NUll
500 delete curTree;
501 return NULL;
502 }
503
504 oldHere = here;
505 }
506
507 return curTree;
508}
509
510QueryNode *ParseQuery (const UCArray &queryStr, int defaultBoolCombine,
511 int defaultStemMethod, int maxnumeric) {
512 if (4 < maxnumeric < 512) {
513 MAXNUMERIC = maxnumeric;
514 }
515 UCArray::const_iterator here = queryStr.begin();
516 UCArray::const_iterator end = queryStr.end();
517 return ParseExpression (here, end, defaultBoolCombine, defaultStemMethod);
518}
Note: See TracBrowser for help on using the repository browser.