source: trunk/gsdl/src/mgpp/text/GSDLQueryParser.cpp@ 12884

Last change on this file since 12884 was 12884, checked in by kjdon, 18 years ago

Accent folding patch thanks to Juan Grigera. parsing of stem/case/accent term modifiers now uses defines from mg_files.h

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.0 KB
Line 
1/**************************************************************************
2 *
3 * QueryParser.cpp -- Query parser for a simple query language
4 * Copyright (C) 2000 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "GSDLQueryParser.h"
23#include "GSDLQueryLex.h"
24#include "words.h"
25
26static QueryNode *ParseExpression (UCArray::const_iterator &here,
27 UCArray::const_iterator end,
28 int defaultBoolCombine,
29 int defaultStemMethod);
30
31static QueryNode *AndAdd (QueryNode *t1, QueryNode *t2) {
32 if (t1 == NULL) return t2;
33 if (t2 == NULL) return t1;
34
35 AndQueryNode *andNode = new AndQueryNode;
36 andNode->leftNode = t1;
37 andNode->rightNode = t2;
38 return andNode;
39}
40
41static QueryNode *OrAdd (QueryNode *t1, QueryNode *t2) {
42 if (t1 == NULL) return t2;
43 if (t2 == NULL) return t1;
44
45 OrQueryNode *orNode = new OrQueryNode;
46 orNode->leftNode = t1;
47 orNode->rightNode = t2;
48 return orNode;
49}
50
51static QueryNode *NotAdd (QueryNode *t1, QueryNode *t2) {
52 if (t1 == NULL) return t2;
53 if (t2 == NULL) return t1;
54
55 NotQueryNode *notNode = new NotQueryNode;
56 notNode->queryNode = t1;
57 notNode->notNode = t2;
58 return notNode;
59}
60
61// expects the opening bracket to have already been parsed
62// and discarded
63static QueryNode *ParseBracketExpression (UCArray::const_iterator &here,
64 UCArray::const_iterator end,
65 int defaultBoolCombine,
66 int defaultStemMethod) {
67 // get everything in the expression
68 QueryNode *curTree = ParseExpression (here, end, defaultBoolCombine,
69 defaultStemMethod);
70
71 // gobble up tokens until a closing bracket is found
72 // or the end of the string
73 LexEl el;
74 while (ParseLexEl (here, end, el)) {
75 if (el.lexType == CloseBracketE) break;
76 }
77
78 return curTree;
79}
80
81static int ParseInt (UCArray::const_iterator &here,
82 UCArray::const_iterator end) {
83 LexEl el;
84 UCArray::const_iterator oldHere = here;
85 if (ParseLexEl (here, end, el) && el.lexType == IntegerE)
86 return el.num;
87
88 here = oldHere; // not an integer
89 return 0;
90}
91
92// default is within 20 words
93static void SetRangeValues (TermNode &termNode,
94 UCArray &nearby,
95 bool reverse) {
96 UCArray NEARBY; SetCStr(NEARBY, "NEAR", 4);
97 UCArray WITHIN; SetCStr(WITHIN, "WITHIN", 6);
98
99 if (nearby == NEARBY) { // no modifier
100 termNode.startRange = (NEAR_DEFAULT+1)*-1;
101 termNode.endRange = NEAR_DEFAULT;
102
103 } else if (nearby == WITHIN) { // no modifier
104 if (reverse) {
105 termNode.startRange = (NEAR_DEFAULT+1)*-1;
106 termNode.endRange = -1;
107 } else {
108 termNode.startRange = NEAR_DEFAULT;
109 termNode.endRange = 0;
110 }
111 }
112 else { // extract number
113 UCArray::const_iterator here;
114 bool within = false;
115 if (PrefixLen(nearby, WITHIN)==6) {
116 within=true;
117 here = nearby.begin()+6;
118 } else {
119 here = nearby.begin()+4;
120 }
121 UCArray::const_iterator end = nearby.end();
122 int size=0;
123 while (here != end) {
124 size = size*10 + (*here-'0');
125 ++here;
126 }
127 if (within) {
128 if (reverse) {
129 termNode.startRange = size;
130 termNode.endRange = 0;
131 } else {
132 termNode.startRange = -1 * (size+1);
133 termNode.endRange = -1;
134 }
135 } else {
136 termNode.startRange = -1 * (size+1);
137 termNode.endRange = size;
138 }
139 }
140}
141
142static unsigned long GetStemMethod(LexEl &el, int defaultStemMethod) {
143 // here expect el to contain some of c,s,i,u,f,a -- see mg_files.h CHAR_FLAG_STEM_* constants
144 unsigned long stem = (unsigned long)defaultStemMethod;
145
146 UCArray::const_iterator here = el.text.begin();
147 UCArray::const_iterator end = el.text.end();
148
149 /* [JFG - Mar 06: Accent folding patch] */
150 /* Changed to use CHAR_FLAG_STEM* constants from mg_files.h */
151 while(here != end) {
152 unsigned char ch = *here;
153 if (strchr (CHAR_FLAG_STEM_Validator, ch) == NULL)
154 return STEM_INVALID; // incorrect format
155
156 switch(ch) {
157 case CHAR_FLAG_STEM_CaseFold: // ignore case (fold)
158 stem |= STEM_CaseFolding;
159 break;
160 case CHAR_FLAG_STEM_NoCaseFold: // case sensitive
161 stem &= (~STEM_CaseFolding);
162 break;
163 case CHAR_FLAG_STEM_Stemming: // stem words
164 stem |= STEM_Stemming;
165 break;
166 case CHAR_FLAG_STEM_NoStemming: // do not stem words
167 stem &= (~STEM_Stemming);
168 break;
169 case CHAR_FLAG_STEM_AccentFold: // accent fold
170 stem |= STEM_AccentFolding;
171 break;
172 case CHAR_FLAG_STEM_NoAccentFold: // do no accent folding
173 stem &= (~STEM_AccentFolding);
174 break;
175 };
176
177 ++here;
178 }
179 return stem;
180}
181
182
183static void ParseTermModifiers (UCArray::const_iterator &here,
184 UCArray::const_iterator end,
185 TermNode &termNode,
186 int defaultStemMethod) {
187
188 termNode.stemMethod = defaultStemMethod;
189 bool partial_match = false;
190 LexEl el;
191 UCArray::const_iterator oldHere = here;
192 while (ParseLexEl (here, end, el)) {
193 if (el.lexType == TermWeightE) {
194 termNode.termWeight = ParseInt (here, end);
195
196 } else if (el.lexType == StemMethodE) {
197 oldHere = here;
198 LexEl stem;
199 if (ParseLexEl (here, end, stem) && stem.lexType == TermE) {
200 termNode.stemMethod = GetStemMethod(stem, defaultStemMethod);
201 /* [JFG - Mar 06: Accent folding patch] */
202 /* use STEM_INVALID instead of hardcoded 4 */
203 if (termNode.stemMethod == STEM_INVALID) { // error so backtrack
204 here = oldHere;
205 termNode.stemMethod = (unsigned long)defaultStemMethod;
206 }
207 }else here = oldHere; //ignore - wrong syntax
208
209 } else if (el.lexType == RangeE) {
210 termNode.startRange = ParseInt (here, end);
211 termNode.endRange = ParseInt (here, end);
212
213 } else if (el.lexType == AtE) {
214 termNode.startRange = termNode.endRange = ParseInt (here, end);
215 } else if (el.lexType == StarE) {
216 partial_match = true;
217 } else {
218 // no term modifiers
219 here = oldHere;
220 break;
221 }
222
223 if (partial_match) {
224 /* [JFG - Mar 06: Accent folding patch] */
225 /* use STEM_PARTIAL_MATCH flag */
226 termNode.stemMethod |= STEM_PARTIAL_MATCH; // set partial match flag
227 termNode.stemMethod &= (~STEM_Stemming); // we dont have stemming on if doing partial matching.
228 }
229 oldHere = here;
230 }
231}
232
233static void ParseProxModifiers (UCArray::const_iterator &here,
234 UCArray::const_iterator end,
235 ProxMatchQueryNode *proxNode) {
236 // so far only have one - the tag stuff
237 LexEl el;
238 UCArray::const_iterator oldHere = here;
239 while (ParseLexEl (here, end, el)) {
240 if (el.lexType == TagE) {
241 oldHere = here; // don't backtrack past here
242 if (ParseLexEl (here, end, el) && el.lexType == TermE) {
243 proxNode->tagNodePtr = new TagNode;
244 proxNode->tagNodePtr->tagName = el.text;
245
246 }
247 else { // error in tag
248 here = oldHere;
249 }
250 } // TagE
251 // add in other cases here
252 else {
253 // no modifiers
254 here = oldHere;
255 break;
256 }
257 oldHere = here;
258 }//while
259
260
261}
262
263// expects starting brackets to have been parsed
264// sets error to true if something has gone wrong
265static ProxMatchQueryNode *ParseSquareBrackets(UCArray::const_iterator &here,
266 UCArray::const_iterator end,
267 /*ProxMatchQueryNode *proxNode,*/
268 int defaultStemMethod,
269 bool & error) {
270
271 ProxMatchQueryNode *proxNode = new ProxMatchQueryNode;
272 LexEl el;
273 bool phrase=false;
274 bool first=true;
275 bool prox = false;
276 UCArray near_string;
277 while (ParseLexEl (here, end, el)) {
278 // cant have AND, OR, NOT in square brackets, so assume they are words
279 if (el.lexType == TermE || el.lexType == IntegerE || el.lexType == AndOpE || el.lexType == OrOpE || el.lexType == NotOpE) {
280 TermNode termNode;
281 termNode.term = el.text;
282 ParseTermModifiers (here, end, termNode, defaultStemMethod);
283 if (phrase) {
284 if (first) first=false;
285 else {
286 termNode.startRange = -2;
287 termNode.endRange = -1;
288 }
289 } else if (prox) {
290 SetRangeValues(termNode, near_string, false);
291 prox = false;
292 }
293 proxNode->terms.push_back(termNode);
294 }
295 else if (el.lexType == CloseSquareBracketE) {
296 break;
297 }
298 else if (el.lexType == QuoteE) {
299 // phrase inside square brackets
300 if (phrase) { // end of phrase
301 phrase=false;
302 first = true;
303 } else {
304 phrase=true; // start of phrase
305 }
306 } else if (el.lexType == NearOpE || el.lexType == WithinOpE) {
307 if (phrase) {
308 // cant have proximity op in a phrase - just assume its an actual word
309 TermNode termNode;
310 termNode.term = el.text;
311 ParseTermModifiers (here, end, termNode, defaultStemMethod);
312 proxNode->terms.push_back(termNode);
313 } else {
314 // its a NEAR or within op
315 prox = true;
316 near_string = el.text;
317 }
318
319 }
320 else if (el.lexType == UnknownE) {
321 // just ignore it
322 }
323 else {
324 //error - we set the proxNode to NULL,
325 cerr <<"GSDLQueryParser: bad syntax inside []\n";
326 error = true;
327 return NULL;
328 }
329 } // while
330 return proxNode;
331}
332// expects the starting quote to have been parsed
333// and discarded
334// now phrases use the case and stem preference options
335// ie can search for a phrase ignoring case
336static void ParsePhrase (UCArray::const_iterator &here,
337 UCArray::const_iterator end,
338 ProxMatchQueryNode &proxNode,
339 int defaultStemMethod,
340 bool &error) {
341 LexEl el;
342 bool first = true;
343 while (ParseLexEl (here, end, el)) {
344 if (el.lexType == TermE || el.lexType == IntegerE) {
345 TermNode termNode;
346 termNode.term = el.text;
347 //termNode.stemMethod = defaultStemMethod;
348 ParseTermModifiers (here, end, termNode, defaultStemMethod);
349 if (first) {
350 first = false;
351 }
352 else {
353 termNode.startRange = -2;
354 termNode.endRange = -1;
355 }
356 proxNode.terms.push_back (termNode);
357
358 } else if (el.lexType == QuoteE) {
359 break;
360
361 } else if (el.lexType == UnknownE) {
362 // just ignore it
363 } else {
364 // error
365 error = true;
366 return;
367 }
368 }
369}
370
371static QueryNode *ParseTerm (UCArray::const_iterator &here,
372 UCArray::const_iterator end,
373 int defaultBoolCombine,
374 int defaultStemMethod) {
375 LexEl el;
376
377 UCArray::const_iterator oldHere = here;
378 if (!ParseLexEl (here, end, el)) return NULL;
379
380 if (el.lexType == OpenBracketE)
381 return ParseBracketExpression (here, end, defaultBoolCombine,
382 defaultStemMethod);
383
384 ProxMatchQueryNode *proxNode = new ProxMatchQueryNode;
385
386 if (el.lexType == TermE || el.lexType == IntegerE) {
387 TermNode termNode;
388 termNode.term = el.text;
389 ParseTermModifiers (here, end, termNode, defaultStemMethod);
390 oldHere = here; // dont backtrack past here
391 if (ParseLexEl(here, end, el) && (el.lexType == NearOpE || el.lexType == WithinOpE )) {
392 delete proxNode;
393 oldHere = here;
394 // this is calling ParseTerm again, but only a subset of the things accepted by ParseTerm are appropriate here. add in some hacks to avoid segmentation faults - kjdon, 04/2003
395
396 // if the next element is a '(' have a syntax error, return NULL
397 LexEl temp_el;
398 if (ParseLexEl(here, end, temp_el) && temp_el.lexType == OpenBracketE) {
399 cerr << "GSDLQueryParser: NEAR/WITHIN cannot be followed by a '('\n";
400 return NULL;
401 }
402 here = oldHere; // else backtrack
403
404 proxNode = (ProxMatchQueryNode *)ParseTerm(here, end, defaultBoolCombine,
405 defaultStemMethod);
406 SetRangeValues(termNode, el.text, true);
407 proxNode->terms.push_back (termNode);
408 return proxNode;
409
410 } else {
411 here = oldHere; // backtrack
412 proxNode->terms.push_back (termNode);
413 ParseProxModifiers(here, end, proxNode);
414 return proxNode;
415 }
416 } else if (el.lexType == QuoteE) {
417 bool error = false;
418 ParsePhrase (here, end, *proxNode, defaultStemMethod, error);
419 if (error) {
420 delete proxNode;
421 return NULL;
422 }
423 return proxNode;
424 }
425 else if (el.lexType == OpenSquareBracketE) {
426 bool error = false;
427 proxNode = ParseSquareBrackets (here, end, /*proxNode, */defaultStemMethod, error);
428 if (error) {
429 delete proxNode;
430 return NULL;
431 }
432 ParseProxModifiers (here, end, proxNode);
433 return proxNode;
434 }
435
436 // not a term
437 here = oldHere;
438 delete proxNode;
439 return NULL;
440}
441
442
443static QueryNode *ParseExpression (UCArray::const_iterator &here,
444 UCArray::const_iterator end,
445 int defaultBoolCombine,
446 int defaultStemMethod) {
447 LexEl el;
448 QueryNode *curTree = NULL;
449 UCArray::const_iterator oldHere = here;
450 while (ParseLexEl (here, end, el)) {
451 if (el.lexType == CloseBracketE) {
452 // parsebracketexpression is waiting for the last bracket, so put it back
453 here = oldHere;
454 break;
455
456 } else if (el.lexType == OpenSquareBracketE ||
457 el.lexType == OpenBracketE ||
458 el.lexType == TermE ||
459 el.lexType == QuoteE ||
460 el.lexType == IntegerE ) {
461
462 // some type of term, back track and parse it
463 here = oldHere;
464
465 // parse the term
466 QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
467 defaultStemMethod);
468 if (newTerm == NULL) {
469 delete curTree;
470 return NULL;
471 }
472
473 // if default==1, AND, else if==0, OR
474 if (defaultBoolCombine) {
475 curTree = AndAdd (curTree, newTerm);
476 }
477 else {
478 curTree = OrAdd (curTree, newTerm);
479 }
480
481 } else if (el.lexType == AndOpE) {
482 QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
483 defaultStemMethod);
484 if (newTerm == NULL) {
485 delete curTree;
486 return NULL;
487 }
488 curTree = AndAdd (curTree, newTerm);
489
490 } else if (el.lexType == OrOpE) {
491 QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
492 defaultStemMethod);
493 if (newTerm == NULL) {
494 delete curTree;
495 return NULL;
496 }
497 curTree = OrAdd (curTree, newTerm);
498
499 } else if (el.lexType == NotOpE) {
500 QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
501 defaultStemMethod);
502 if (newTerm == NULL) {
503 delete curTree;
504 return NULL;
505 }
506 curTree = NotAdd (curTree, newTerm);
507
508 } else if (el.lexType == UnknownE) {
509 // just ignore it
510 } else {
511
512 // syntax error, return NUll
513 delete curTree;
514 return NULL;
515 }
516
517 oldHere = here;
518 }
519
520 return curTree;
521}
522
523QueryNode *ParseQuery (const UCArray &queryStr, int defaultBoolCombine,
524 int defaultStemMethod, int maxnumeric) {
525 if (4 < maxnumeric < 512) {
526 MAXNUMERIC = maxnumeric;
527 }
528 UCArray::const_iterator here = queryStr.begin();
529 UCArray::const_iterator end = queryStr.end();
530 return ParseExpression (here, end, defaultBoolCombine, defaultStemMethod);
531}
Note: See TracBrowser for help on using the repository browser.