/************************************************************************** * * QueryParser.cpp -- Query parser for a simple query language * Copyright (C) 2000 Rodger McNab * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * **************************************************************************/ #include "GSDLQueryParser.h" #include "GSDLQueryLex.h" #include "words.h" static QueryNode *ParseExpression (UCArray::const_iterator &here, UCArray::const_iterator end, int defaultBoolCombine, int defaultStemMethod); static QueryNode *AndAdd (QueryNode *t1, QueryNode *t2) { if (t1 == NULL) return t2; if (t2 == NULL) return t1; AndQueryNode *andNode = new AndQueryNode; andNode->leftNode = t1; andNode->rightNode = t2; return andNode; } static QueryNode *OrAdd (QueryNode *t1, QueryNode *t2) { if (t1 == NULL) return t2; if (t2 == NULL) return t1; OrQueryNode *orNode = new OrQueryNode; orNode->leftNode = t1; orNode->rightNode = t2; return orNode; } static QueryNode *NotAdd (QueryNode *t1, QueryNode *t2) { if (t1 == NULL) return t2; if (t2 == NULL) return t1; NotQueryNode *notNode = new NotQueryNode; notNode->queryNode = t1; notNode->notNode = t2; return notNode; } // expects the opening bracket to have already been parsed // and discarded static QueryNode *ParseBracketExpression (UCArray::const_iterator &here, UCArray::const_iterator end, int defaultBoolCombine, int defaultStemMethod) { // get everything in the expression QueryNode *curTree = ParseExpression (here, end, defaultBoolCombine, defaultStemMethod); // gobble up tokens until a closing bracket is found // or the end of the string LexEl el; while (ParseLexEl (here, end, el)) { if (el.lexType == CloseBracketE) break; } return curTree; } static int ParseInt (UCArray::const_iterator &here, UCArray::const_iterator end) { LexEl el; UCArray::const_iterator oldHere = here; if (ParseLexEl (here, end, el) && el.lexType == IntegerE) return el.num; here = oldHere; // not an integer return 0; } // default is within 20 words static void SetRangeValues (TermNode &termNode, UCArray &nearby, bool reverse) { UCArray NEARBY; SetCStr(NEARBY, "NEAR", 4); UCArray WITHIN; SetCStr(WITHIN, "WITHIN", 6); if (nearby == NEARBY) { // no modifier termNode.startRange = (NEAR_DEFAULT+1)*-1; termNode.endRange = NEAR_DEFAULT; } else if (nearby == WITHIN) { // no modifier if (reverse) { termNode.startRange = (NEAR_DEFAULT+1)*-1; termNode.endRange = -1; } else { termNode.startRange = NEAR_DEFAULT; termNode.endRange = 0; } } else { // extract number UCArray::const_iterator here; bool within = false; if (PrefixLen(nearby, WITHIN)==6) { within=true; here = nearby.begin()+6; } else { here = nearby.begin()+4; } UCArray::const_iterator end = nearby.end(); int size=0; while (here != end) { size = size*10 + (*here-'0'); ++here; } if (within) { if (reverse) { termNode.startRange = size; termNode.endRange = 0; } else { termNode.startRange = -1 * (size+1); termNode.endRange = -1; } } else { termNode.startRange = -1 * (size+1); termNode.endRange = size; } } } static unsigned long GetStemMethod(LexEl &el, int defaultStemMethod) { // here expect el to contain some of c,s,i,u,f,a -- see mg_files.h CHAR_FLAG_STEM_* constants unsigned long stem = (unsigned long)defaultStemMethod; UCArray::const_iterator here = el.text.begin(); UCArray::const_iterator end = el.text.end(); /* [JFG - Mar 06: Accent folding patch] */ /* Changed to use CHAR_FLAG_STEM* constants from mg_files.h */ while(here != end) { unsigned char ch = *here; if (strchr (CHAR_FLAG_STEM_Validator, ch) == NULL) return STEM_INVALID; // incorrect format switch(ch) { case CHAR_FLAG_STEM_CaseFold: // ignore case (fold) stem |= STEM_CaseFolding; break; case CHAR_FLAG_STEM_NoCaseFold: // case sensitive stem &= (~STEM_CaseFolding); break; case CHAR_FLAG_STEM_Stemming: // stem words stem |= STEM_Stemming; break; case CHAR_FLAG_STEM_NoStemming: // do not stem words stem &= (~STEM_Stemming); break; #ifdef ENABLE_ACCENTFOLD case CHAR_FLAG_STEM_AccentFold: // accent fold stem |= STEM_AccentFolding; break; case CHAR_FLAG_STEM_NoAccentFold: // do no accent folding stem &= (~STEM_AccentFolding); break; #endif }; ++here; } return stem; } static void ParseTermModifiers (UCArray::const_iterator &here, UCArray::const_iterator end, TermNode &termNode, int defaultStemMethod) { termNode.stemMethod = defaultStemMethod; bool partial_match = false; LexEl el; UCArray::const_iterator oldHere = here; while (ParseLexEl (here, end, el)) { if (el.lexType == TermWeightE) { termNode.termWeight = ParseInt (here, end); } else if (el.lexType == StemMethodE) { oldHere = here; LexEl stem; if (ParseLexEl (here, end, stem) && stem.lexType == TermE) { termNode.stemMethod = GetStemMethod(stem, defaultStemMethod); /* [JFG - Mar 06: Accent folding patch] */ /* use STEM_INVALID instead of hardcoded 4 */ if (termNode.stemMethod == STEM_INVALID) { // error so backtrack here = oldHere; termNode.stemMethod = (unsigned long)defaultStemMethod; } } else here = oldHere; //ignore - wrong syntax } else if (el.lexType == RangeE) { termNode.startRange = ParseInt (here, end); termNode.endRange = ParseInt (here, end); } else if (el.lexType == AtE) { termNode.startRange = termNode.endRange = ParseInt (here, end); } else if (el.lexType == StarE) { partial_match = true; } else { // no term modifiers here = oldHere; break; } if (partial_match) { /* [JFG - Mar 06: Accent folding patch] */ /* use STEM_PARTIAL_MATCH flag */ termNode.stemMethod |= STEM_PARTIAL_MATCH; // set partial match flag termNode.stemMethod &= (~STEM_Stemming); // we dont have stemming on if doing partial matching. termNode.stemMethod &= (~STEM_AccentFolding); // we dont have accentfolding on if doing partial matching. } oldHere = here; } } static void ParseProxModifiers (UCArray::const_iterator &here, UCArray::const_iterator end, ProxMatchQueryNode *proxNode) { // so far only have one - the tag stuff LexEl el; UCArray::const_iterator oldHere = here; while (ParseLexEl (here, end, el)) { if (el.lexType == TagE) { oldHere = here; // don't backtrack past here if (ParseLexEl (here, end, el) && el.lexType == TermE) { proxNode->tagNodePtr = new TagNode; proxNode->tagNodePtr->tagName = el.text; } else { // error in tag here = oldHere; } } // TagE // add in other cases here else { // no modifiers here = oldHere; break; } oldHere = here; }//while } // expects starting brackets to have been parsed // sets error to true if something has gone wrong static ProxMatchQueryNode *ParseSquareBrackets(UCArray::const_iterator &here, UCArray::const_iterator end, /*ProxMatchQueryNode *proxNode,*/ int defaultStemMethod, bool & error) { ProxMatchQueryNode *proxNode = new ProxMatchQueryNode; LexEl el; bool phrase=false; bool first=true; bool prox = false; UCArray near_string; while (ParseLexEl (here, end, el)) { // cant have AND, OR, NOT in square brackets, so assume they are words if (el.lexType == TermE || el.lexType == IntegerE || el.lexType == AndOpE || el.lexType == OrOpE || el.lexType == NotOpE) { TermNode termNode; termNode.term = el.text; ParseTermModifiers (here, end, termNode, defaultStemMethod); if (phrase) { if (first) first=false; else { termNode.startRange = -2; termNode.endRange = -1; } } else if (prox) { SetRangeValues(termNode, near_string, false); prox = false; } proxNode->terms.push_back(termNode); } else if (el.lexType == CloseSquareBracketE) { break; } else if (el.lexType == QuoteE) { // phrase inside square brackets if (phrase) { // end of phrase phrase=false; first = true; } else { phrase=true; // start of phrase } } else if (el.lexType == NearOpE || el.lexType == WithinOpE) { if (phrase) { // cant have proximity op in a phrase - just assume its an actual word TermNode termNode; termNode.term = el.text; ParseTermModifiers (here, end, termNode, defaultStemMethod); proxNode->terms.push_back(termNode); } else { // its a NEAR or within op prox = true; near_string = el.text; } } else if (el.lexType == UnknownE) { // just ignore it } else { //error - we set the proxNode to NULL, cerr <<"GSDLQueryParser: bad syntax inside []\n"; error = true; return NULL; } } // while return proxNode; } // expects the starting quote to have been parsed // and discarded // now phrases use the case and stem preference options // ie can search for a phrase ignoring case static void ParsePhrase (UCArray::const_iterator &here, UCArray::const_iterator end, ProxMatchQueryNode &proxNode, int defaultStemMethod, bool &error) { LexEl el; bool first = true; while (ParseLexEl (here, end, el)) { if (el.lexType == TermE || el.lexType == IntegerE) { TermNode termNode; termNode.term = el.text; //termNode.stemMethod = defaultStemMethod; ParseTermModifiers (here, end, termNode, defaultStemMethod); if (first) { first = false; } else { termNode.startRange = -2; termNode.endRange = -1; } proxNode.terms.push_back (termNode); } else if (el.lexType == QuoteE) { break; } else if (el.lexType == UnknownE) { // just ignore it } else { // error error = true; return; } } } static QueryNode *ParseTerm (UCArray::const_iterator &here, UCArray::const_iterator end, int defaultBoolCombine, int defaultStemMethod) { LexEl el; UCArray::const_iterator oldHere = here; if (!ParseLexEl (here, end, el)) return NULL; if (el.lexType == OpenBracketE) return ParseBracketExpression (here, end, defaultBoolCombine, defaultStemMethod); ProxMatchQueryNode *proxNode = new ProxMatchQueryNode; if (el.lexType == TermE || el.lexType == IntegerE) { TermNode termNode; termNode.term = el.text; ParseTermModifiers (here, end, termNode, defaultStemMethod); oldHere = here; // dont backtrack past here if (ParseLexEl(here, end, el) && (el.lexType == NearOpE || el.lexType == WithinOpE )) { delete proxNode; oldHere = here; // this is calling ParseTerm again, but only a subset of the things accepted by ParseTerm are appropriate here. add in some hacks to avoid segmentation faults - kjdon, 04/2003 // if the next element is a '(' have a syntax error, return NULL LexEl temp_el; if (ParseLexEl(here, end, temp_el) && temp_el.lexType == OpenBracketE) { cerr << "GSDLQueryParser: NEAR/WITHIN cannot be followed by a '('\n"; return NULL; } here = oldHere; // else backtrack proxNode = (ProxMatchQueryNode *)ParseTerm(here, end, defaultBoolCombine, defaultStemMethod); SetRangeValues(termNode, el.text, true); proxNode->terms.push_back (termNode); return proxNode; } else { here = oldHere; // backtrack proxNode->terms.push_back (termNode); ParseProxModifiers(here, end, proxNode); return proxNode; } } else if (el.lexType == QuoteE) { bool error = false; ParsePhrase (here, end, *proxNode, defaultStemMethod, error); if (error) { delete proxNode; return NULL; } return proxNode; } else if (el.lexType == OpenSquareBracketE) { bool error = false; proxNode = ParseSquareBrackets (here, end, /*proxNode, */defaultStemMethod, error); if (error) { delete proxNode; return NULL; } ParseProxModifiers (here, end, proxNode); return proxNode; } // not a term here = oldHere; delete proxNode; return NULL; } static QueryNode *ParseExpression (UCArray::const_iterator &here, UCArray::const_iterator end, int defaultBoolCombine, int defaultStemMethod) { LexEl el; QueryNode *curTree = NULL; UCArray::const_iterator oldHere = here; while (ParseLexEl (here, end, el)) { if (el.lexType == CloseBracketE) { // parsebracketexpression is waiting for the last bracket, so put it back here = oldHere; break; } else if (el.lexType == OpenSquareBracketE || el.lexType == OpenBracketE || el.lexType == TermE || el.lexType == QuoteE || el.lexType == IntegerE ) { // some type of term, back track and parse it here = oldHere; // parse the term QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine, defaultStemMethod); if (newTerm == NULL) { delete curTree; return NULL; } // if default==1, AND, else if==0, OR if (defaultBoolCombine) { curTree = AndAdd (curTree, newTerm); } else { curTree = OrAdd (curTree, newTerm); } } else if (el.lexType == AndOpE) { QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine, defaultStemMethod); if (newTerm == NULL) { delete curTree; return NULL; } curTree = AndAdd (curTree, newTerm); } else if (el.lexType == OrOpE) { QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine, defaultStemMethod); if (newTerm == NULL) { delete curTree; return NULL; } curTree = OrAdd (curTree, newTerm); } else if (el.lexType == NotOpE) { QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine, defaultStemMethod); if (newTerm == NULL) { delete curTree; return NULL; } curTree = NotAdd (curTree, newTerm); } else if (el.lexType == UnknownE) { // just ignore it } else { // syntax error, return NUll delete curTree; return NULL; } oldHere = here; } return curTree; } QueryNode *ParseQuery (const UCArray &queryStr, int defaultBoolCombine, int defaultStemMethod, int maxnumeric) { if (4 < maxnumeric && maxnumeric < 512) { MAXNUMERIC = maxnumeric; } UCArray::const_iterator here = queryStr.begin(); UCArray::const_iterator end = queryStr.end(); return ParseExpression (here, end, defaultBoolCombine, defaultStemMethod); }