Context Navigation

source: trunk/mgpp/text/GSDLQueryParser.cpp@ 13653

Last change on this file since 13653 was 13653, checked in by kjdon, 17 years ago

Accent folding patch, thanks to Juan Grigera. Parsing of the stem/case/accent term
modifiers now uses the defines from mg_files.h.

Turned off accent folding when partial matching is being done - the two can't be
used together due to the way the index works. Also, the accent-fold cases of
the switch in GetStemMethod are compiled only if ENABLE_ACCENTFOLD is defined.
Changed line 528 to avoid a compile warning on Windows.

Property svn:executable set to *
Property svn:keywords set to Author Date Id Revision

File size: 15.2 KB

Rev	Line
[3365]	1	/**************************************************************************
	2	*
	3	* QueryParser.cpp -- Query parser for a simple query language
	4	* Copyright (C) 2000 Rodger McNab
	5	*
	6	* This program is free software; you can redistribute it and/or modify
	7	* it under the terms of the GNU General Public License as published by
	8	* the Free Software Foundation; either version 2 of the License, or
	9	* (at your option) any later version.
	10	*
	11	* This program is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	* GNU General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU General Public License
	17	* along with this program; if not, write to the Free Software
	18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	19	*
	20	**************************************************************************/
	21
	22	#include "GSDLQueryParser.h"
	23	#include "GSDLQueryLex.h"
[12321]	24	#include "words.h"
[3365]	25
	26	static QueryNode *ParseExpression (UCArray::const_iterator &here,
	27	UCArray::const_iterator end,
[4210]	28	int defaultBoolCombine,
[3365]	29	int defaultStemMethod);
	30
	31	static QueryNode AndAdd (QueryNode t1, QueryNode *t2) {
	32	if (t1 == NULL) return t2;
	33	if (t2 == NULL) return t1;
	34
	35	AndQueryNode *andNode = new AndQueryNode;
	36	andNode->leftNode = t1;
	37	andNode->rightNode = t2;
	38	return andNode;
	39	}
	40
	41	static QueryNode OrAdd (QueryNode t1, QueryNode *t2) {
	42	if (t1 == NULL) return t2;
	43	if (t2 == NULL) return t1;
	44
	45	OrQueryNode *orNode = new OrQueryNode;
	46	orNode->leftNode = t1;
	47	orNode->rightNode = t2;
	48	return orNode;
	49	}
	50
	51	static QueryNode NotAdd (QueryNode t1, QueryNode *t2) {
	52	if (t1 == NULL) return t2;
	53	if (t2 == NULL) return t1;
	54
	55	NotQueryNode *notNode = new NotQueryNode;
	56	notNode->queryNode = t1;
	57	notNode->notNode = t2;
	58	return notNode;
	59	}
	60
	61	// expects the opening bracket to have already been parsed
	62	// and discarded
	63	static QueryNode *ParseBracketExpression (UCArray::const_iterator &here,
	64	UCArray::const_iterator end,
	65	int defaultBoolCombine,
	66	int defaultStemMethod) {
	67	// get everything in the expression
	68	QueryNode *curTree = ParseExpression (here, end, defaultBoolCombine,
	69	defaultStemMethod);
	70
	71	// gobble up tokens until a closing bracket is found
	72	// or the end of the string
	73	LexEl el;
	74	while (ParseLexEl (here, end, el)) {
	75	if (el.lexType == CloseBracketE) break;
	76	}
	77
	78	return curTree;
	79	}
	80
	81	static int ParseInt (UCArray::const_iterator &here,
	82	UCArray::const_iterator end) {
	83	LexEl el;
	84	UCArray::const_iterator oldHere = here;
	85	if (ParseLexEl (here, end, el) && el.lexType == IntegerE)
	86	return el.num;
	87
	88	here = oldHere; // not an integer
	89	return 0;
	90	}
	91
	92	// default is within 20 words
	93	static void SetRangeValues (TermNode &termNode,
[6119]	94	UCArray &nearby,
	95	bool reverse) {
[8692]	96	UCArray NEARBY; SetCStr(NEARBY, "NEAR", 4);
	97	UCArray WITHIN; SetCStr(WITHIN, "WITHIN", 6);
[6119]	98
[3365]	99	if (nearby == NEARBY) { // no modifier
[3782]	100	termNode.startRange = (NEAR_DEFAULT+1)*-1;
	101	termNode.endRange = NEAR_DEFAULT;
	102
[6119]	103	} else if (nearby == WITHIN) { // no modifier
	104	if (reverse) {
	105	termNode.startRange = (NEAR_DEFAULT+1)*-1;
	106	termNode.endRange = -1;
	107	} else {
	108	termNode.startRange = NEAR_DEFAULT;
	109	termNode.endRange = 0;
	110	}
[3365]	111	}
	112	else { // extract number
[6119]	113	UCArray::const_iterator here;
	114	bool within = false;
	115	if (PrefixLen(nearby, WITHIN)==6) {
	116	within=true;
	117	here = nearby.begin()+6;
	118	} else {
	119	here = nearby.begin()+4;
	120	}
[3365]	121	UCArray::const_iterator end = nearby.end();
	122	int size=0;
	123	while (here != end) {
	124	size = size10 + (here-'0');
[8692]	125	++here;
[3365]	126	}
[6119]	127	if (within) {
	128	if (reverse) {
	129	termNode.startRange = size;
	130	termNode.endRange = 0;
	131	} else {
	132	termNode.startRange = -1 * (size+1);
	133	termNode.endRange = -1;
	134	}
	135	} else {
	136	termNode.startRange = -1 * (size+1);
	137	termNode.endRange = size;
	138	}
[3365]	139	}
	140	}
	141
	142	static unsigned long GetStemMethod(LexEl &el, int defaultStemMethod) {
[13653]	143	// here expect el to contain some of c,s,i,u,f,a -- see mg_files.h CHAR_FLAG_STEM_* constants
[3365]	144	unsigned long stem = (unsigned long)defaultStemMethod;
	145
	146	UCArray::const_iterator here = el.text.begin();
	147	UCArray::const_iterator end = el.text.end();
	148
[13653]	149	/* [JFG - Mar 06: Accent folding patch] */
	150	/* Changed to use CHAR_FLAG_STEM* constants from mg_files.h */
	151	while(here != end) {
	152	unsigned char ch = *here;
	153	if (strchr (CHAR_FLAG_STEM_Validator, ch) == NULL)
	154	return STEM_INVALID; // incorrect format
	155
	156	switch(ch) {
	157	case CHAR_FLAG_STEM_CaseFold: // ignore case (fold)
	158	stem \|= STEM_CaseFolding;
	159	break;
	160	case CHAR_FLAG_STEM_NoCaseFold: // case sensitive
	161	stem &= (~STEM_CaseFolding);
	162	break;
	163	case CHAR_FLAG_STEM_Stemming: // stem words
	164	stem \|= STEM_Stemming;
	165	break;
	166	case CHAR_FLAG_STEM_NoStemming: // do not stem words
	167	stem &= (~STEM_Stemming);
	168	break;
	169	#ifdef ENABLE_ACCENTFOLD
	170	case CHAR_FLAG_STEM_AccentFold: // accent fold
	171	stem \|= STEM_AccentFolding;
	172	break;
	173	case CHAR_FLAG_STEM_NoAccentFold: // do no accent folding
	174	stem &= (~STEM_AccentFolding);
	175	break;
	176	#endif
	177	};
	178
	179	++here;
[3365]	180	}
	181	return stem;
	182	}
	183
	184
	185	static void ParseTermModifiers (UCArray::const_iterator &here,
	186	UCArray::const_iterator end,
	187	TermNode &termNode,
	188	int defaultStemMethod) {
	189
	190	termNode.stemMethod = defaultStemMethod;
[8242]	191	bool partial_match = false;
[3365]	192	LexEl el;
	193	UCArray::const_iterator oldHere = here;
	194	while (ParseLexEl (here, end, el)) {
	195	if (el.lexType == TermWeightE) {
	196	termNode.termWeight = ParseInt (here, end);
	197
	198	} else if (el.lexType == StemMethodE) {
	199	oldHere = here;
	200	LexEl stem;
	201	if (ParseLexEl (here, end, stem) && stem.lexType == TermE) {
	202	termNode.stemMethod = GetStemMethod(stem, defaultStemMethod);
[13653]	203	/* [JFG - Mar 06: Accent folding patch] */
	204	/* use STEM_INVALID instead of hardcoded 4 */
	205	if (termNode.stemMethod == STEM_INVALID) { // error so backtrack
[3365]	206	here = oldHere;
	207	termNode.stemMethod = (unsigned long)defaultStemMethod;
	208	}
[13653]	209	} else here = oldHere; //ignore - wrong syntax
[3365]	210
	211	} else if (el.lexType == RangeE) {
	212	termNode.startRange = ParseInt (here, end);
	213	termNode.endRange = ParseInt (here, end);
	214
	215	} else if (el.lexType == AtE) {
	216	termNode.startRange = termNode.endRange = ParseInt (here, end);
[8242]	217	} else if (el.lexType == StarE) {
	218	partial_match = true;
[3365]	219	} else {
	220	// no term modifiers
	221	here = oldHere;
	222	break;
	223	}
	224
[8242]	225	if (partial_match) {
[13653]	226	/* [JFG - Mar 06: Accent folding patch] */
	227	/* use STEM_PARTIAL_MATCH flag */
	228	termNode.stemMethod \|= STEM_PARTIAL_MATCH; // set partial match flag
	229	termNode.stemMethod &= (~STEM_Stemming); // we dont have stemming on if doing partial matching.
	230	termNode.stemMethod &= (~STEM_AccentFolding); // we dont have accentfolding on if doing partial matching.
[8242]	231	}
[3365]	232	oldHere = here;
	233	}
	234	}
	235
	236	static void ParseProxModifiers (UCArray::const_iterator &here,
	237	UCArray::const_iterator end,
	238	ProxMatchQueryNode *proxNode) {
	239	// so far only have one - the tag stuff
	240	LexEl el;
	241	UCArray::const_iterator oldHere = here;
	242	while (ParseLexEl (here, end, el)) {
	243	if (el.lexType == TagE) {
	244	oldHere = here; // don't backtrack past here
	245	if (ParseLexEl (here, end, el) && el.lexType == TermE) {
	246	proxNode->tagNodePtr = new TagNode;
	247	proxNode->tagNodePtr->tagName = el.text;
	248
	249	}
	250	else { // error in tag
	251	here = oldHere;
	252	}
	253	} // TagE
	254	// add in other cases here
	255	else {
	256	// no modifiers
	257	here = oldHere;
	258	break;
	259	}
	260	oldHere = here;
	261	}//while
	262
	263
	264	}
	265
	266	// expects starting brackets to have been parsed
[4210]	267	// sets error to true if something has gone wrong
[6082]	268	static ProxMatchQueryNode *ParseSquareBrackets(UCArray::const_iterator &here,
[3365]	269	UCArray::const_iterator end,
[6082]	270	/ProxMatchQueryNode proxNode,*/
[4210]	271	int defaultStemMethod,
	272	bool & error) {
[3365]	273
[6082]	274	ProxMatchQueryNode *proxNode = new ProxMatchQueryNode;
[3365]	275	LexEl el;
	276	bool phrase=false;
	277	bool first=true;
[6129]	278	bool prox = false;
[6082]	279	UCArray near_string;
[3365]	280	while (ParseLexEl (here, end, el)) {
[8692]	281	// cant have AND, OR, NOT in square brackets, so assume they are words
	282	if (el.lexType == TermE \|\| el.lexType == IntegerE \|\| el.lexType == AndOpE \|\| el.lexType == OrOpE \|\| el.lexType == NotOpE) {
[3365]	283	TermNode termNode;
	284	termNode.term = el.text;
	285	ParseTermModifiers (here, end, termNode, defaultStemMethod);
	286	if (phrase) {
	287	if (first) first=false;
	288	else {
	289	termNode.startRange = -2;
	290	termNode.endRange = -1;
	291	}
[6129]	292	} else if (prox) {
[6119]	293	SetRangeValues(termNode, near_string, false);
[6129]	294	prox = false;
[6082]	295	}
[3365]	296	proxNode->terms.push_back(termNode);
	297	}
	298	else if (el.lexType == CloseSquareBracketE) {
	299	break;
	300	}
	301	else if (el.lexType == QuoteE) {
	302	// phrase inside square brackets
[6082]	303	if (phrase) { // end of phrase
	304	phrase=false;
	305	first = true;
	306	} else {
	307	phrase=true; // start of phrase
	308	}
[6119]	309	} else if (el.lexType == NearOpE \|\| el.lexType == WithinOpE) {
[6082]	310	if (phrase) {
[6119]	311	// cant have proximity op in a phrase - just assume its an actual word
[6082]	312	TermNode termNode;
	313	termNode.term = el.text;
	314	ParseTermModifiers (here, end, termNode, defaultStemMethod);
	315	proxNode->terms.push_back(termNode);
	316	} else {
[6119]	317	// its a NEAR or within op
[6129]	318	prox = true;
[6082]	319	near_string = el.text;
	320	}
	321
	322	}
	323	else if (el.lexType == UnknownE) {
[5449]	324	// just ignore it
[3365]	325	}
	326	else {
[4210]	327	//error - we set the proxNode to NULL,
	328	cerr <<"GSDLQueryParser: bad syntax inside []\n";
	329	error = true;
[6082]	330	return NULL;
[3365]	331	}
	332	} // while
[6082]	333	return proxNode;
[3365]	334	}
	335	// expects the starting quote to have been parsed
	336	// and discarded
	337	// now phrases use the case and stem preference options
	338	// ie can search for a phrase ignoring case
	339	static void ParsePhrase (UCArray::const_iterator &here,
	340	UCArray::const_iterator end,
	341	ProxMatchQueryNode &proxNode,
[4210]	342	int defaultStemMethod,
	343	bool &error) {
[3365]	344	LexEl el;
	345	bool first = true;
	346	while (ParseLexEl (here, end, el)) {
	347	if (el.lexType == TermE \|\| el.lexType == IntegerE) {
	348	TermNode termNode;
	349	termNode.term = el.text;
	350	//termNode.stemMethod = defaultStemMethod;
	351	ParseTermModifiers (here, end, termNode, defaultStemMethod);
	352	if (first) {
	353	first = false;
	354	}
	355	else {
	356	termNode.startRange = -2;
	357	termNode.endRange = -1;
	358	}
	359	proxNode.terms.push_back (termNode);
	360
	361	} else if (el.lexType == QuoteE) {
	362	break;
	363
[5449]	364	} else if (el.lexType == UnknownE) {
	365	// just ignore it
[3365]	366	} else {
[4210]	367	// error
	368	error = true;
	369	return;
[3365]	370	}
	371	}
	372	}
	373
	374	static QueryNode *ParseTerm (UCArray::const_iterator &here,
	375	UCArray::const_iterator end,
	376	int defaultBoolCombine,
	377	int defaultStemMethod) {
	378	LexEl el;
	379
	380	UCArray::const_iterator oldHere = here;
	381	if (!ParseLexEl (here, end, el)) return NULL;
	382
	383	if (el.lexType == OpenBracketE)
	384	return ParseBracketExpression (here, end, defaultBoolCombine,
	385	defaultStemMethod);
	386
	387	ProxMatchQueryNode *proxNode = new ProxMatchQueryNode;
	388
	389	if (el.lexType == TermE \|\| el.lexType == IntegerE) {
	390	TermNode termNode;
	391	termNode.term = el.text;
	392	ParseTermModifiers (here, end, termNode, defaultStemMethod);
	393	oldHere = here; // dont backtrack past here
[6119]	394	if (ParseLexEl(here, end, el) && (el.lexType == NearOpE \|\| el.lexType == WithinOpE )) {
[4210]	395	delete proxNode;
	396	oldHere = here;
	397	// this is calling ParseTerm again, but only a subset of the things accepted by ParseTerm are appropriate here. add in some hacks to avoid segmentation faults - kjdon, 04/2003
	398
	399	// if the next element is a '(' have a syntax error, return NULL
	400	LexEl temp_el;
	401	if (ParseLexEl(here, end, temp_el) && temp_el.lexType == OpenBracketE) {
[6119]	402	cerr << "GSDLQueryParser: NEAR/WITHIN cannot be followed by a '('\n";
[4210]	403	return NULL;
	404	}
	405	here = oldHere; // else backtrack
	406
[3365]	407	proxNode = (ProxMatchQueryNode *)ParseTerm(here, end, defaultBoolCombine,
	408	defaultStemMethod);
[6119]	409	SetRangeValues(termNode, el.text, true);
[3365]	410	proxNode->terms.push_back (termNode);
	411	return proxNode;
[4210]	412
	413	} else {
[3365]	414	here = oldHere; // backtrack
	415	proxNode->terms.push_back (termNode);
	416	ParseProxModifiers(here, end, proxNode);
	417	return proxNode;
	418	}
	419	} else if (el.lexType == QuoteE) {
[4210]	420	bool error = false;
	421	ParsePhrase (here, end, *proxNode, defaultStemMethod, error);
	422	if (error) {
	423	delete proxNode;
	424	return NULL;
	425	}
[3365]	426	return proxNode;
	427	}
	428	else if (el.lexType == OpenSquareBracketE) {
[4210]	429	bool error = false;
[6082]	430	proxNode = ParseSquareBrackets (here, end, /proxNode, /defaultStemMethod, error);
[4210]	431	if (error) {
	432	delete proxNode;
	433	return NULL;
	434	}
[3365]	435	ParseProxModifiers (here, end, proxNode);
	436	return proxNode;
	437	}
	438
	439	// not a term
	440	here = oldHere;
	441	delete proxNode;
	442	return NULL;
	443	}
	444
	445
	446	static QueryNode *ParseExpression (UCArray::const_iterator &here,
	447	UCArray::const_iterator end,
	448	int defaultBoolCombine,
	449	int defaultStemMethod) {
	450	LexEl el;
	451	QueryNode *curTree = NULL;
	452	UCArray::const_iterator oldHere = here;
	453	while (ParseLexEl (here, end, el)) {
[4210]	454	if (el.lexType == CloseBracketE) {
	455	// parsebracketexpression is waiting for the last bracket, so put it back
	456	here = oldHere;
	457	break;
	458
	459	} else if (el.lexType == OpenSquareBracketE \|\|
	460	el.lexType == OpenBracketE \|\|
	461	el.lexType == TermE \|\|
	462	el.lexType == QuoteE \|\|
	463	el.lexType == IntegerE ) {
	464
[3365]	465	// some type of term, back track and parse it
	466	here = oldHere;
[4210]	467
	468	// parse the term
	469	QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
	470	defaultStemMethod);
	471	if (newTerm == NULL) {
	472	delete curTree;
	473	return NULL;
	474	}
	475
[3365]	476	// if default==1, AND, else if==0, OR
	477	if (defaultBoolCombine) {
[4210]	478	curTree = AndAdd (curTree, newTerm);
[3365]	479	}
	480	else {
[4210]	481	curTree = OrAdd (curTree, newTerm);
[3365]	482	}
[4210]	483
[3365]	484	} else if (el.lexType == AndOpE) {
[4210]	485	QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
	486	defaultStemMethod);
	487	if (newTerm == NULL) {
	488	delete curTree;
	489	return NULL;
	490	}
	491	curTree = AndAdd (curTree, newTerm);
[3365]	492
	493	} else if (el.lexType == OrOpE) {
[4210]	494	QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
	495	defaultStemMethod);
	496	if (newTerm == NULL) {
	497	delete curTree;
	498	return NULL;
	499	}
	500	curTree = OrAdd (curTree, newTerm);
[3365]	501
	502	} else if (el.lexType == NotOpE) {
[4210]	503	QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
	504	defaultStemMethod);
	505	if (newTerm == NULL) {
	506	delete curTree;
	507	return NULL;
	508	}
	509	curTree = NotAdd (curTree, newTerm);
[3365]	510
[5449]	511	} else if (el.lexType == UnknownE) {
	512	// just ignore it
[4210]	513	} else {
	514
	515	// syntax error, return NUll
	516	delete curTree;
	517	return NULL;
	518	}
	519
[3365]	520	oldHere = here;
	521	}
	522
	523	return curTree;
	524	}
	525
	526	QueryNode *ParseQuery (const UCArray &queryStr, int defaultBoolCombine,
[12321]	527	int defaultStemMethod, int maxnumeric) {
[13653]	528	if (4 < maxnumeric && maxnumeric < 512) {
[12321]	529	MAXNUMERIC = maxnumeric;
	530	}
[3365]	531	UCArray::const_iterator here = queryStr.begin();
	532	UCArray::const_iterator end = queryStr.end();
	533	return ParseExpression (here, end, defaultBoolCombine, defaultStemMethod);
	534	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: