root/main/trunk/greenstone2/common-src/indexers/mgpp/text/GSDLQueryParser.cpp @ 25147

Revision 25147, 15.2 KB (checked in by kjdon, 9 years ago)

merged 64_bit_Greenstone branch into trunk, rev 25139

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1/**************************************************************************
2 *
3 * QueryParser.cpp -- Query parser for a simple query language
4 * Copyright (C) 2000  Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "GSDLQueryParser.h"
23#include "GSDLQueryLex.h"
24#include "words.h"
25
26static QueryNode *ParseExpression (UCArray::const_iterator &here,
27                   UCArray::const_iterator end,
28                   int defaultBoolCombine,
29                   int defaultStemMethod);
30
31static QueryNode *AndAdd (QueryNode *t1, QueryNode *t2) {
32  if (t1 == NULL) return t2;
33  if (t2 == NULL) return t1;
34
35  AndQueryNode *andNode = new AndQueryNode;
36  andNode->leftNode = t1;
37  andNode->rightNode = t2;
38  return andNode;
39}
40
41static QueryNode *OrAdd (QueryNode *t1, QueryNode *t2) {
42  if (t1 == NULL) return t2;
43  if (t2 == NULL) return t1;
44
45  OrQueryNode *orNode = new OrQueryNode;
46  orNode->leftNode = t1;
47  orNode->rightNode = t2;
48  return orNode;
49}
50
51static QueryNode *NotAdd (QueryNode *t1, QueryNode *t2) {
52  if (t1 == NULL) return t2;
53  if (t2 == NULL) return t1;
54
55  NotQueryNode *notNode = new NotQueryNode;
56  notNode->queryNode = t1;
57  notNode->notNode = t2;
58  return notNode;
59}
60
61// expects the opening bracket to have already been parsed
62// and discarded
63static QueryNode *ParseBracketExpression (UCArray::const_iterator &here,
64                      UCArray::const_iterator end,
65                      int defaultBoolCombine,
66                      int defaultStemMethod) {
67  // get everything in the expression
68  QueryNode *curTree = ParseExpression (here, end, defaultBoolCombine,
69                    defaultStemMethod);
70 
71  // gobble up tokens until a closing bracket is found
72  // or the end of the string
73  LexEl el;
74  while (ParseLexEl (here, end, el)) {
75    if (el.lexType == CloseBracketE) break;
76  }
77
78  return curTree;
79}
80
81static int ParseInt (UCArray::const_iterator &here,
82             UCArray::const_iterator end) {
83  LexEl el;
84  UCArray::const_iterator oldHere = here;
85  if (ParseLexEl (here, end, el) && el.lexType == IntegerE)
86    return el.num;
87 
88  here = oldHere; // not an integer
89  return 0;
90}
91
92// default is within 20 words
93static void SetRangeValues (TermNode &termNode,
94                UCArray &nearby,
95                bool reverse) {
96  UCArray NEARBY; SetCStr(NEARBY, "NEAR", 4);
97  UCArray WITHIN; SetCStr(WITHIN, "WITHIN", 6);
98 
99  if (nearby == NEARBY) { // no modifier
100    termNode.startRange = (NEAR_DEFAULT+1)*-1;
101    termNode.endRange = NEAR_DEFAULT;
102
103  } else if (nearby == WITHIN) { // no modifier
104    if (reverse) {
105      termNode.startRange = (NEAR_DEFAULT+1)*-1;
106      termNode.endRange = -1;
107    } else {
108      termNode.startRange = NEAR_DEFAULT;
109      termNode.endRange = 0;
110    }
111  }
112  else { // extract number
113    UCArray::const_iterator here;
114    bool within = false;
115    if (PrefixLen(nearby, WITHIN)==6) {
116      within=true;
117      here = nearby.begin()+6;
118    } else {
119      here = nearby.begin()+4;
120    }
121    UCArray::const_iterator end = nearby.end();
122    int size=0;
123    while (here != end) {
124      size = size*10 + (*here-'0');
125      ++here;
126    }
127    if (within) {
128      if (reverse) {
129    termNode.startRange = size;
130    termNode.endRange = 0;
131      } else {
132    termNode.startRange = -1 * (size+1);
133    termNode.endRange = -1;
134      }
135    } else {
136      termNode.startRange = -1 * (size+1);
137      termNode.endRange = size;
138    }
139  }
140}
141
142static mg_u_long GetStemMethod(LexEl &el, int defaultStemMethod) {
143  // here expect el to contain some of c,s,i,u,f,a -- see mg_files.h CHAR_FLAG_STEM_* constants
144  mg_u_long stem = (mg_u_long)defaultStemMethod;
145     
146  UCArray::const_iterator here = el.text.begin();
147  UCArray::const_iterator end = el.text.end();
148
149  /* [JFG - Mar 06: Accent folding patch] */
150  /* Changed to use CHAR_FLAG_STEM* constants from mg_files.h */
151  while(here != end) {
152    unsigned char ch = *here;
153    if (strchr (CHAR_FLAG_STEM_Validator, ch) == NULL)
154      return STEM_INVALID; // incorrect format
155   
156    switch(ch) {
157    case CHAR_FLAG_STEM_CaseFold:       // ignore case (fold)
158      stem |= STEM_CaseFolding;
159      break;
160    case CHAR_FLAG_STEM_NoCaseFold:     // case sensitive
161      stem &= (~STEM_CaseFolding);
162      break;
163    case CHAR_FLAG_STEM_Stemming:       // stem words
164      stem |= STEM_Stemming;
165      break;
166    case CHAR_FLAG_STEM_NoStemming:     // do not stem words
167      stem &= (~STEM_Stemming);
168      break;
169#ifdef ENABLE_ACCENTFOLD
170   case CHAR_FLAG_STEM_AccentFold:      // accent fold
171      stem |= STEM_AccentFolding;
172      break;
173    case CHAR_FLAG_STEM_NoAccentFold:   // do no accent folding
174      stem &= (~STEM_AccentFolding);
175      break;
176#endif
177    };
178   
179    ++here;     
180  }
181  return stem;
182}
183 
184
185static void ParseTermModifiers (UCArray::const_iterator &here,
186                UCArray::const_iterator end,
187                TermNode &termNode,
188                int defaultStemMethod) {
189 
190  termNode.stemMethod = defaultStemMethod;
191  bool partial_match = false;
192  LexEl el;
193  UCArray::const_iterator oldHere = here;
194  while (ParseLexEl (here, end, el)) {
195    if (el.lexType == TermWeightE) {
196      termNode.termWeight = ParseInt (here, end);
197     
198    } else if (el.lexType == StemMethodE) {
199      oldHere = here;
200      LexEl stem;
201      if (ParseLexEl (here, end, stem) && stem.lexType == TermE) {
202    termNode.stemMethod = GetStemMethod(stem, defaultStemMethod);
203    /* [JFG - Mar 06: Accent folding patch] */
204    /* use STEM_INVALID instead of hardcoded 4 */
205    if (termNode.stemMethod == STEM_INVALID) { // error so backtrack
206      here = oldHere;
207      termNode.stemMethod = (mg_u_long)defaultStemMethod;
208    }
209      } else here = oldHere; //ignore - wrong syntax
210
211    } else if (el.lexType == RangeE) {
212      termNode.startRange = ParseInt (here, end);
213      termNode.endRange = ParseInt (here, end);
214     
215    } else if (el.lexType == AtE) {
216      termNode.startRange = termNode.endRange = ParseInt (here, end);
217    } else if (el.lexType == StarE) {
218      partial_match = true;
219    } else {
220      // no term modifiers
221      here = oldHere;
222      break;
223    }
224   
225    if (partial_match) {
226      /* [JFG - Mar 06: Accent folding patch] */
227      /* use STEM_PARTIAL_MATCH flag */
228      termNode.stemMethod |= STEM_PARTIAL_MATCH; // set partial match flag
229      termNode.stemMethod &= (~STEM_Stemming); // we dont have stemming on if doing partial matching.
230      termNode.stemMethod &= (~STEM_AccentFolding); // we dont have accentfolding on if doing partial matching.
231    }
232    oldHere = here;
233  } 
234}
235
236static void ParseProxModifiers (UCArray::const_iterator &here,
237                UCArray::const_iterator end,
238                ProxMatchQueryNode *proxNode) {
239  // so far only have one - the tag stuff
240  LexEl el;
241  UCArray::const_iterator oldHere = here;
242  while (ParseLexEl (here, end, el)) {
243    if (el.lexType == TagE) {
244      oldHere = here; // don't backtrack past here
245      if (ParseLexEl (here, end, el) && el.lexType == TermE) {
246    proxNode->tagNodePtr = new TagNode;
247    proxNode->tagNodePtr->tagName = el.text;
248   
249      }
250      else { // error in tag
251    here = oldHere;
252      }
253    } // TagE
254    // add in other cases here
255    else {
256      // no modifiers
257      here = oldHere;
258      break;
259    }
260    oldHere = here;
261  }//while
262
263
264}
265
266// expects starting brackets to have been parsed
267// sets error to true if something has gone wrong
268static ProxMatchQueryNode *ParseSquareBrackets(UCArray::const_iterator &here,
269                UCArray::const_iterator end,
270                /*ProxMatchQueryNode *proxNode,*/
271                int defaultStemMethod,
272                bool & error) {
273
274  ProxMatchQueryNode *proxNode = new ProxMatchQueryNode;
275  LexEl el;
276  bool phrase=false;
277  bool first=true;
278  bool prox = false;
279  UCArray near_string;
280  while (ParseLexEl (here, end, el)) {
281    // cant have AND, OR, NOT in square brackets, so assume they are words
282    if (el.lexType == TermE || el.lexType == IntegerE || el.lexType == AndOpE || el.lexType == OrOpE || el.lexType == NotOpE) {
283      TermNode termNode;
284      termNode.term = el.text;
285      ParseTermModifiers (here, end, termNode, defaultStemMethod);
286      if (phrase) {
287        if (first) first=false;
288        else {
289          termNode.startRange = -2;
290          termNode.endRange = -1;
291        }
292      } else if (prox) {
293    SetRangeValues(termNode, near_string, false);
294    prox = false;
295      }
296      proxNode->terms.push_back(termNode);
297    }
298    else if (el.lexType == CloseSquareBracketE) {
299      break;
300    }
301    else if (el.lexType == QuoteE) {
302      // phrase inside square brackets
303      if (phrase) { // end of phrase
304    phrase=false;
305    first = true;
306      } else {
307    phrase=true; // start of phrase
308      }
309    } else if (el.lexType == NearOpE || el.lexType == WithinOpE) {
310      if (phrase) {
311    // cant have proximity  op in a phrase - just assume its an actual word
312    TermNode termNode;
313    termNode.term = el.text;
314    ParseTermModifiers (here, end, termNode, defaultStemMethod);
315    proxNode->terms.push_back(termNode);
316      } else {
317    // its a NEAR or within op
318    prox = true;
319    near_string = el.text;
320      }
321     
322    }
323    else if (el.lexType == UnknownE) {
324      // just ignore it
325    }
326    else {
327      //error - we set the proxNode to NULL,
328      cerr <<"GSDLQueryParser: bad syntax inside []\n";
329      error = true;
330      return NULL;
331    }
332  } // while
333  return proxNode;
334}
335// expects the starting quote to have been parsed
336// and discarded
337// now phrases use the case and stem preference options
338// ie can search for a phrase ignoring case
339static void ParsePhrase (UCArray::const_iterator &here,
340             UCArray::const_iterator end,
341             ProxMatchQueryNode &proxNode,
342             int defaultStemMethod,
343             bool &error) {
344  LexEl el;
345  bool first = true;
346  while (ParseLexEl (here, end, el)) {
347    if (el.lexType == TermE || el.lexType == IntegerE) {
348      TermNode termNode;
349      termNode.term = el.text;
350      //termNode.stemMethod = defaultStemMethod;
351      ParseTermModifiers (here, end, termNode, defaultStemMethod);
352      if (first) {
353    first = false;
354      }
355      else {
356    termNode.startRange = -2;
357    termNode.endRange = -1;
358      }
359      proxNode.terms.push_back (termNode);
360     
361    } else if (el.lexType == QuoteE) {
362      break;
363     
364    } else if (el.lexType == UnknownE) {
365      // just ignore it
366    } else {
367      // error
368      error = true;
369      return;
370    }
371  }
372}
373
374static QueryNode *ParseTerm (UCArray::const_iterator &here,
375                 UCArray::const_iterator end,
376                 int defaultBoolCombine,
377                 int defaultStemMethod) {
378  LexEl el;
379
380  UCArray::const_iterator oldHere = here;
381  if (!ParseLexEl (here, end, el)) return NULL;
382
383  if (el.lexType == OpenBracketE)
384    return ParseBracketExpression (here, end, defaultBoolCombine,
385                   defaultStemMethod);
386
387  ProxMatchQueryNode *proxNode = new ProxMatchQueryNode;
388
389  if (el.lexType == TermE || el.lexType == IntegerE) {
390    TermNode termNode;
391    termNode.term = el.text;
392    ParseTermModifiers (here, end, termNode, defaultStemMethod);
393    oldHere = here;  // dont backtrack past here
394    if (ParseLexEl(here, end, el) && (el.lexType == NearOpE || el.lexType == WithinOpE )) {
395    delete proxNode;
396    oldHere = here;
397      // this is calling ParseTerm again, but only a subset of the things accepted by ParseTerm are appropriate here. add in some hacks to avoid segmentation faults - kjdon, 04/2003
398     
399      // if the next element is a '(' have a syntax error, return NULL
400      LexEl temp_el;
401      if (ParseLexEl(here, end, temp_el) && temp_el.lexType == OpenBracketE) {
402    cerr << "GSDLQueryParser: NEAR/WITHIN cannot be followed by a '('\n";
403    return NULL;
404      }
405      here = oldHere; // else backtrack
406   
407      proxNode = (ProxMatchQueryNode *)ParseTerm(here, end, defaultBoolCombine,
408                         defaultStemMethod);
409      SetRangeValues(termNode, el.text, true);
410      proxNode->terms.push_back (termNode);
411      return proxNode;
412     
413    } else {
414      here = oldHere; // backtrack
415      proxNode->terms.push_back (termNode);
416      ParseProxModifiers(here, end, proxNode);
417      return proxNode;
418    }
419  } else if (el.lexType == QuoteE) {
420    bool error = false;
421    ParsePhrase (here, end, *proxNode, defaultStemMethod, error);
422    if (error) {
423      delete proxNode;
424      return NULL;
425    }
426    return proxNode;
427  }
428  else if (el.lexType == OpenSquareBracketE) {
429    bool error = false;
430    proxNode = ParseSquareBrackets (here, end, /*proxNode, */defaultStemMethod, error);
431    if (error) {
432      delete proxNode;
433      return NULL;
434    }
435    ParseProxModifiers (here, end, proxNode);
436    return proxNode;
437  }
438
439  // not a term
440  here = oldHere;
441  delete proxNode;
442  return NULL;
443}
444
445
446static QueryNode *ParseExpression (UCArray::const_iterator &here,
447                   UCArray::const_iterator end,
448                   int defaultBoolCombine,
449                   int defaultStemMethod) {
450  LexEl el;
451  QueryNode *curTree = NULL;
452  UCArray::const_iterator oldHere = here;
453  while (ParseLexEl (here, end, el)) {
454    if (el.lexType == CloseBracketE) {
455      // parsebracketexpression is waiting for the last bracket, so put it back
456      here = oldHere;
457      break;
458     
459    } else if (el.lexType == OpenSquareBracketE ||
460           el.lexType == OpenBracketE ||
461           el.lexType == TermE ||
462           el.lexType == QuoteE ||
463           el.lexType == IntegerE ) {
464     
465      // some type of term, back track and parse it
466      here = oldHere;
467
468      //  parse the term
469      QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
470                       defaultStemMethod);
471      if (newTerm == NULL) {
472    delete curTree;
473    return NULL;
474      }
475
476      // if default==1, AND, else if==0, OR
477      if (defaultBoolCombine) {
478    curTree = AndAdd (curTree, newTerm);
479      }
480      else {
481    curTree = OrAdd (curTree, newTerm);
482      }
483     
484    } else if (el.lexType == AndOpE) {
485      QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
486                       defaultStemMethod);
487      if (newTerm == NULL) {
488    delete curTree;
489    return NULL;
490      }
491      curTree = AndAdd (curTree, newTerm);
492     
493    } else if (el.lexType == OrOpE) {
494      QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
495                       defaultStemMethod);
496      if (newTerm == NULL) {
497    delete curTree;
498    return NULL;
499      }
500      curTree = OrAdd (curTree, newTerm);
501     
502    } else if (el.lexType == NotOpE) {
503      QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
504                       defaultStemMethod);
505      if (newTerm == NULL) {
506    delete curTree;
507    return NULL;
508      }
509      curTree = NotAdd (curTree, newTerm);
510     
511    } else if (el.lexType == UnknownE) {
512      // just ignore it
513    } else {
514
515      // syntax error, return NUll
516      delete curTree;
517      return NULL;
518    }
519
520    oldHere = here;
521  }
522
523  return curTree;
524}
525
526QueryNode *ParseQuery (const UCArray &queryStr, int defaultBoolCombine,
527               int defaultStemMethod, int maxnumeric) {
528  if (4 < maxnumeric && maxnumeric < 512) {
529    MAXNUMERIC = maxnumeric;
530  }
531  UCArray::const_iterator here = queryStr.begin();
532  UCArray::const_iterator end = queryStr.end();
533  return ParseExpression (here, end, defaultBoolCombine, defaultStemMethod);
534}
Note: See TracBrowser for help on using the browser.