source: trunk/mgpp/text/GSDLQueryParser.cpp@ 13653

Last change on this file since 13653 was 13653, checked in by kjdon, 17 years ago

Accent folding patch thanks to Juan Grigera. parsing of stem/case/accent term
modifiers now uses defines from mg_files.h

turned off accent folding if partial matching is being done - can't do them
together due to the way the index works. also, do the accentfold cases for
the switch in GetStemMethod only if ENABLE_ACCENTFOLD is defined
changed line 528 to avoid a compile warning on windows

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.2 KB
RevLine 
/**************************************************************************
 *
 * QueryParser.cpp -- Query parser for a simple query language
 * Copyright (C) 2000  Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 **************************************************************************/
21
22#include "GSDLQueryParser.h"
23#include "GSDLQueryLex.h"
[12321]24#include "words.h"
[3365]25
26static QueryNode *ParseExpression (UCArray::const_iterator &here,
27 UCArray::const_iterator end,
[4210]28 int defaultBoolCombine,
[3365]29 int defaultStemMethod);
30
31static QueryNode *AndAdd (QueryNode *t1, QueryNode *t2) {
32 if (t1 == NULL) return t2;
33 if (t2 == NULL) return t1;
34
35 AndQueryNode *andNode = new AndQueryNode;
36 andNode->leftNode = t1;
37 andNode->rightNode = t2;
38 return andNode;
39}
40
41static QueryNode *OrAdd (QueryNode *t1, QueryNode *t2) {
42 if (t1 == NULL) return t2;
43 if (t2 == NULL) return t1;
44
45 OrQueryNode *orNode = new OrQueryNode;
46 orNode->leftNode = t1;
47 orNode->rightNode = t2;
48 return orNode;
49}
50
51static QueryNode *NotAdd (QueryNode *t1, QueryNode *t2) {
52 if (t1 == NULL) return t2;
53 if (t2 == NULL) return t1;
54
55 NotQueryNode *notNode = new NotQueryNode;
56 notNode->queryNode = t1;
57 notNode->notNode = t2;
58 return notNode;
59}
60
61// expects the opening bracket to have already been parsed
62// and discarded
63static QueryNode *ParseBracketExpression (UCArray::const_iterator &here,
64 UCArray::const_iterator end,
65 int defaultBoolCombine,
66 int defaultStemMethod) {
67 // get everything in the expression
68 QueryNode *curTree = ParseExpression (here, end, defaultBoolCombine,
69 defaultStemMethod);
70
71 // gobble up tokens until a closing bracket is found
72 // or the end of the string
73 LexEl el;
74 while (ParseLexEl (here, end, el)) {
75 if (el.lexType == CloseBracketE) break;
76 }
77
78 return curTree;
79}
80
81static int ParseInt (UCArray::const_iterator &here,
82 UCArray::const_iterator end) {
83 LexEl el;
84 UCArray::const_iterator oldHere = here;
85 if (ParseLexEl (here, end, el) && el.lexType == IntegerE)
86 return el.num;
87
88 here = oldHere; // not an integer
89 return 0;
90}
91
92// default is within 20 words
93static void SetRangeValues (TermNode &termNode,
[6119]94 UCArray &nearby,
95 bool reverse) {
[8692]96 UCArray NEARBY; SetCStr(NEARBY, "NEAR", 4);
97 UCArray WITHIN; SetCStr(WITHIN, "WITHIN", 6);
[6119]98
[3365]99 if (nearby == NEARBY) { // no modifier
[3782]100 termNode.startRange = (NEAR_DEFAULT+1)*-1;
101 termNode.endRange = NEAR_DEFAULT;
102
[6119]103 } else if (nearby == WITHIN) { // no modifier
104 if (reverse) {
105 termNode.startRange = (NEAR_DEFAULT+1)*-1;
106 termNode.endRange = -1;
107 } else {
108 termNode.startRange = NEAR_DEFAULT;
109 termNode.endRange = 0;
110 }
[3365]111 }
112 else { // extract number
[6119]113 UCArray::const_iterator here;
114 bool within = false;
115 if (PrefixLen(nearby, WITHIN)==6) {
116 within=true;
117 here = nearby.begin()+6;
118 } else {
119 here = nearby.begin()+4;
120 }
[3365]121 UCArray::const_iterator end = nearby.end();
122 int size=0;
123 while (here != end) {
124 size = size*10 + (*here-'0');
[8692]125 ++here;
[3365]126 }
[6119]127 if (within) {
128 if (reverse) {
129 termNode.startRange = size;
130 termNode.endRange = 0;
131 } else {
132 termNode.startRange = -1 * (size+1);
133 termNode.endRange = -1;
134 }
135 } else {
136 termNode.startRange = -1 * (size+1);
137 termNode.endRange = size;
138 }
[3365]139 }
140}
141
142static unsigned long GetStemMethod(LexEl &el, int defaultStemMethod) {
[13653]143 // here expect el to contain some of c,s,i,u,f,a -- see mg_files.h CHAR_FLAG_STEM_* constants
[3365]144 unsigned long stem = (unsigned long)defaultStemMethod;
145
146 UCArray::const_iterator here = el.text.begin();
147 UCArray::const_iterator end = el.text.end();
148
[13653]149 /* [JFG - Mar 06: Accent folding patch] */
150 /* Changed to use CHAR_FLAG_STEM* constants from mg_files.h */
151 while(here != end) {
152 unsigned char ch = *here;
153 if (strchr (CHAR_FLAG_STEM_Validator, ch) == NULL)
154 return STEM_INVALID; // incorrect format
155
156 switch(ch) {
157 case CHAR_FLAG_STEM_CaseFold: // ignore case (fold)
158 stem |= STEM_CaseFolding;
159 break;
160 case CHAR_FLAG_STEM_NoCaseFold: // case sensitive
161 stem &= (~STEM_CaseFolding);
162 break;
163 case CHAR_FLAG_STEM_Stemming: // stem words
164 stem |= STEM_Stemming;
165 break;
166 case CHAR_FLAG_STEM_NoStemming: // do not stem words
167 stem &= (~STEM_Stemming);
168 break;
169#ifdef ENABLE_ACCENTFOLD
170 case CHAR_FLAG_STEM_AccentFold: // accent fold
171 stem |= STEM_AccentFolding;
172 break;
173 case CHAR_FLAG_STEM_NoAccentFold: // do no accent folding
174 stem &= (~STEM_AccentFolding);
175 break;
176#endif
177 };
178
179 ++here;
[3365]180 }
181 return stem;
182}
183
184
185static void ParseTermModifiers (UCArray::const_iterator &here,
186 UCArray::const_iterator end,
187 TermNode &termNode,
188 int defaultStemMethod) {
189
190 termNode.stemMethod = defaultStemMethod;
[8242]191 bool partial_match = false;
[3365]192 LexEl el;
193 UCArray::const_iterator oldHere = here;
194 while (ParseLexEl (here, end, el)) {
195 if (el.lexType == TermWeightE) {
196 termNode.termWeight = ParseInt (here, end);
197
198 } else if (el.lexType == StemMethodE) {
199 oldHere = here;
200 LexEl stem;
201 if (ParseLexEl (here, end, stem) && stem.lexType == TermE) {
202 termNode.stemMethod = GetStemMethod(stem, defaultStemMethod);
[13653]203 /* [JFG - Mar 06: Accent folding patch] */
204 /* use STEM_INVALID instead of hardcoded 4 */
205 if (termNode.stemMethod == STEM_INVALID) { // error so backtrack
[3365]206 here = oldHere;
207 termNode.stemMethod = (unsigned long)defaultStemMethod;
208 }
[13653]209 } else here = oldHere; //ignore - wrong syntax
[3365]210
211 } else if (el.lexType == RangeE) {
212 termNode.startRange = ParseInt (here, end);
213 termNode.endRange = ParseInt (here, end);
214
215 } else if (el.lexType == AtE) {
216 termNode.startRange = termNode.endRange = ParseInt (here, end);
[8242]217 } else if (el.lexType == StarE) {
218 partial_match = true;
[3365]219 } else {
220 // no term modifiers
221 here = oldHere;
222 break;
223 }
224
[8242]225 if (partial_match) {
[13653]226 /* [JFG - Mar 06: Accent folding patch] */
227 /* use STEM_PARTIAL_MATCH flag */
228 termNode.stemMethod |= STEM_PARTIAL_MATCH; // set partial match flag
229 termNode.stemMethod &= (~STEM_Stemming); // we dont have stemming on if doing partial matching.
230 termNode.stemMethod &= (~STEM_AccentFolding); // we dont have accentfolding on if doing partial matching.
[8242]231 }
[3365]232 oldHere = here;
233 }
234}
235
236static void ParseProxModifiers (UCArray::const_iterator &here,
237 UCArray::const_iterator end,
238 ProxMatchQueryNode *proxNode) {
239 // so far only have one - the tag stuff
240 LexEl el;
241 UCArray::const_iterator oldHere = here;
242 while (ParseLexEl (here, end, el)) {
243 if (el.lexType == TagE) {
244 oldHere = here; // don't backtrack past here
245 if (ParseLexEl (here, end, el) && el.lexType == TermE) {
246 proxNode->tagNodePtr = new TagNode;
247 proxNode->tagNodePtr->tagName = el.text;
248
249 }
250 else { // error in tag
251 here = oldHere;
252 }
253 } // TagE
254 // add in other cases here
255 else {
256 // no modifiers
257 here = oldHere;
258 break;
259 }
260 oldHere = here;
261 }//while
262
263
264}
265
266// expects starting brackets to have been parsed
[4210]267// sets error to true if something has gone wrong
[6082]268static ProxMatchQueryNode *ParseSquareBrackets(UCArray::const_iterator &here,
[3365]269 UCArray::const_iterator end,
[6082]270 /*ProxMatchQueryNode *proxNode,*/
[4210]271 int defaultStemMethod,
272 bool & error) {
[3365]273
[6082]274 ProxMatchQueryNode *proxNode = new ProxMatchQueryNode;
[3365]275 LexEl el;
276 bool phrase=false;
277 bool first=true;
[6129]278 bool prox = false;
[6082]279 UCArray near_string;
[3365]280 while (ParseLexEl (here, end, el)) {
[8692]281 // cant have AND, OR, NOT in square brackets, so assume they are words
282 if (el.lexType == TermE || el.lexType == IntegerE || el.lexType == AndOpE || el.lexType == OrOpE || el.lexType == NotOpE) {
[3365]283 TermNode termNode;
284 termNode.term = el.text;
285 ParseTermModifiers (here, end, termNode, defaultStemMethod);
286 if (phrase) {
287 if (first) first=false;
288 else {
289 termNode.startRange = -2;
290 termNode.endRange = -1;
291 }
[6129]292 } else if (prox) {
[6119]293 SetRangeValues(termNode, near_string, false);
[6129]294 prox = false;
[6082]295 }
[3365]296 proxNode->terms.push_back(termNode);
297 }
298 else if (el.lexType == CloseSquareBracketE) {
299 break;
300 }
301 else if (el.lexType == QuoteE) {
302 // phrase inside square brackets
[6082]303 if (phrase) { // end of phrase
304 phrase=false;
305 first = true;
306 } else {
307 phrase=true; // start of phrase
308 }
[6119]309 } else if (el.lexType == NearOpE || el.lexType == WithinOpE) {
[6082]310 if (phrase) {
[6119]311 // cant have proximity op in a phrase - just assume its an actual word
[6082]312 TermNode termNode;
313 termNode.term = el.text;
314 ParseTermModifiers (here, end, termNode, defaultStemMethod);
315 proxNode->terms.push_back(termNode);
316 } else {
[6119]317 // its a NEAR or within op
[6129]318 prox = true;
[6082]319 near_string = el.text;
320 }
321
322 }
323 else if (el.lexType == UnknownE) {
[5449]324 // just ignore it
[3365]325 }
326 else {
[4210]327 //error - we set the proxNode to NULL,
328 cerr <<"GSDLQueryParser: bad syntax inside []\n";
329 error = true;
[6082]330 return NULL;
[3365]331 }
332 } // while
[6082]333 return proxNode;
[3365]334}
335// expects the starting quote to have been parsed
336// and discarded
337// now phrases use the case and stem preference options
338// ie can search for a phrase ignoring case
339static void ParsePhrase (UCArray::const_iterator &here,
340 UCArray::const_iterator end,
341 ProxMatchQueryNode &proxNode,
[4210]342 int defaultStemMethod,
343 bool &error) {
[3365]344 LexEl el;
345 bool first = true;
346 while (ParseLexEl (here, end, el)) {
347 if (el.lexType == TermE || el.lexType == IntegerE) {
348 TermNode termNode;
349 termNode.term = el.text;
350 //termNode.stemMethod = defaultStemMethod;
351 ParseTermModifiers (here, end, termNode, defaultStemMethod);
352 if (first) {
353 first = false;
354 }
355 else {
356 termNode.startRange = -2;
357 termNode.endRange = -1;
358 }
359 proxNode.terms.push_back (termNode);
360
361 } else if (el.lexType == QuoteE) {
362 break;
363
[5449]364 } else if (el.lexType == UnknownE) {
365 // just ignore it
[3365]366 } else {
[4210]367 // error
368 error = true;
369 return;
[3365]370 }
371 }
372}
373
374static QueryNode *ParseTerm (UCArray::const_iterator &here,
375 UCArray::const_iterator end,
376 int defaultBoolCombine,
377 int defaultStemMethod) {
378 LexEl el;
379
380 UCArray::const_iterator oldHere = here;
381 if (!ParseLexEl (here, end, el)) return NULL;
382
383 if (el.lexType == OpenBracketE)
384 return ParseBracketExpression (here, end, defaultBoolCombine,
385 defaultStemMethod);
386
387 ProxMatchQueryNode *proxNode = new ProxMatchQueryNode;
388
389 if (el.lexType == TermE || el.lexType == IntegerE) {
390 TermNode termNode;
391 termNode.term = el.text;
392 ParseTermModifiers (here, end, termNode, defaultStemMethod);
393 oldHere = here; // dont backtrack past here
[6119]394 if (ParseLexEl(here, end, el) && (el.lexType == NearOpE || el.lexType == WithinOpE )) {
[4210]395 delete proxNode;
396 oldHere = here;
397 // this is calling ParseTerm again, but only a subset of the things accepted by ParseTerm are appropriate here. add in some hacks to avoid segmentation faults - kjdon, 04/2003
398
399 // if the next element is a '(' have a syntax error, return NULL
400 LexEl temp_el;
401 if (ParseLexEl(here, end, temp_el) && temp_el.lexType == OpenBracketE) {
[6119]402 cerr << "GSDLQueryParser: NEAR/WITHIN cannot be followed by a '('\n";
[4210]403 return NULL;
404 }
405 here = oldHere; // else backtrack
406
[3365]407 proxNode = (ProxMatchQueryNode *)ParseTerm(here, end, defaultBoolCombine,
408 defaultStemMethod);
[6119]409 SetRangeValues(termNode, el.text, true);
[3365]410 proxNode->terms.push_back (termNode);
411 return proxNode;
[4210]412
413 } else {
[3365]414 here = oldHere; // backtrack
415 proxNode->terms.push_back (termNode);
416 ParseProxModifiers(here, end, proxNode);
417 return proxNode;
418 }
419 } else if (el.lexType == QuoteE) {
[4210]420 bool error = false;
421 ParsePhrase (here, end, *proxNode, defaultStemMethod, error);
422 if (error) {
423 delete proxNode;
424 return NULL;
425 }
[3365]426 return proxNode;
427 }
428 else if (el.lexType == OpenSquareBracketE) {
[4210]429 bool error = false;
[6082]430 proxNode = ParseSquareBrackets (here, end, /*proxNode, */defaultStemMethod, error);
[4210]431 if (error) {
432 delete proxNode;
433 return NULL;
434 }
[3365]435 ParseProxModifiers (here, end, proxNode);
436 return proxNode;
437 }
438
439 // not a term
440 here = oldHere;
441 delete proxNode;
442 return NULL;
443}
444
445
446static QueryNode *ParseExpression (UCArray::const_iterator &here,
447 UCArray::const_iterator end,
448 int defaultBoolCombine,
449 int defaultStemMethod) {
450 LexEl el;
451 QueryNode *curTree = NULL;
452 UCArray::const_iterator oldHere = here;
453 while (ParseLexEl (here, end, el)) {
[4210]454 if (el.lexType == CloseBracketE) {
455 // parsebracketexpression is waiting for the last bracket, so put it back
456 here = oldHere;
457 break;
458
459 } else if (el.lexType == OpenSquareBracketE ||
460 el.lexType == OpenBracketE ||
461 el.lexType == TermE ||
462 el.lexType == QuoteE ||
463 el.lexType == IntegerE ) {
464
[3365]465 // some type of term, back track and parse it
466 here = oldHere;
[4210]467
468 // parse the term
469 QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
470 defaultStemMethod);
471 if (newTerm == NULL) {
472 delete curTree;
473 return NULL;
474 }
475
[3365]476 // if default==1, AND, else if==0, OR
477 if (defaultBoolCombine) {
[4210]478 curTree = AndAdd (curTree, newTerm);
[3365]479 }
480 else {
[4210]481 curTree = OrAdd (curTree, newTerm);
[3365]482 }
[4210]483
[3365]484 } else if (el.lexType == AndOpE) {
[4210]485 QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
486 defaultStemMethod);
487 if (newTerm == NULL) {
488 delete curTree;
489 return NULL;
490 }
491 curTree = AndAdd (curTree, newTerm);
[3365]492
493 } else if (el.lexType == OrOpE) {
[4210]494 QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
495 defaultStemMethod);
496 if (newTerm == NULL) {
497 delete curTree;
498 return NULL;
499 }
500 curTree = OrAdd (curTree, newTerm);
[3365]501
502 } else if (el.lexType == NotOpE) {
[4210]503 QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
504 defaultStemMethod);
505 if (newTerm == NULL) {
506 delete curTree;
507 return NULL;
508 }
509 curTree = NotAdd (curTree, newTerm);
[3365]510
[5449]511 } else if (el.lexType == UnknownE) {
512 // just ignore it
[4210]513 } else {
514
515 // syntax error, return NUll
516 delete curTree;
517 return NULL;
518 }
519
[3365]520 oldHere = here;
521 }
522
523 return curTree;
524}
525
526QueryNode *ParseQuery (const UCArray &queryStr, int defaultBoolCombine,
[12321]527 int defaultStemMethod, int maxnumeric) {
[13653]528 if (4 < maxnumeric && maxnumeric < 512) {
[12321]529 MAXNUMERIC = maxnumeric;
530 }
[3365]531 UCArray::const_iterator here = queryStr.begin();
532 UCArray::const_iterator end = queryStr.end();
533 return ParseExpression (here, end, defaultBoolCombine, defaultStemMethod);
534}
Note: See TracBrowser for help on using the repository browser.