Context Navigation

source: trunk/mgpp/text/GSDLQueryParser.cpp@ 13653

Last change on this file since 13653 was 13653, checked in by kjdon, 17 years ago

Accent folding patch thanks to Juan Grigera. parsing of stem/case/accent term
modifiers now uses defines from mg_files.h

turned off accent folding if partial matching is being done - can't do them
together due to the way the index works. also, do the accentfold cases for
the switch in GetStemMethod only if ENABLE_ACCENTFOLD is defined
changed line 528 to avoid a compile warning on windows

Property svn:executable set to *
Property svn:keywords set to Author Date Id Revision

File size: 15.2 KB

Line
1	/**************************************************************************
2	*
3	* QueryParser.cpp -- Query parser for a simple query language
4	* Copyright (C) 2000 Rodger McNab
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	**************************************************************************/
21
22	#include "GSDLQueryParser.h"
23	#include "GSDLQueryLex.h"
24	#include "words.h"
25
26	static QueryNode *ParseExpression (UCArray::const_iterator &here,
27	UCArray::const_iterator end,
28	int defaultBoolCombine,
29	int defaultStemMethod);
30
31	static QueryNode AndAdd (QueryNode t1, QueryNode *t2) {
32	if (t1 == NULL) return t2;
33	if (t2 == NULL) return t1;
34
35	AndQueryNode *andNode = new AndQueryNode;
36	andNode->leftNode = t1;
37	andNode->rightNode = t2;
38	return andNode;
39	}
40
41	static QueryNode OrAdd (QueryNode t1, QueryNode *t2) {
42	if (t1 == NULL) return t2;
43	if (t2 == NULL) return t1;
44
45	OrQueryNode *orNode = new OrQueryNode;
46	orNode->leftNode = t1;
47	orNode->rightNode = t2;
48	return orNode;
49	}
50
51	static QueryNode NotAdd (QueryNode t1, QueryNode *t2) {
52	if (t1 == NULL) return t2;
53	if (t2 == NULL) return t1;
54
55	NotQueryNode *notNode = new NotQueryNode;
56	notNode->queryNode = t1;
57	notNode->notNode = t2;
58	return notNode;
59	}
60
61	// expects the opening bracket to have already been parsed
62	// and discarded
63	static QueryNode *ParseBracketExpression (UCArray::const_iterator &here,
64	UCArray::const_iterator end,
65	int defaultBoolCombine,
66	int defaultStemMethod) {
67	// get everything in the expression
68	QueryNode *curTree = ParseExpression (here, end, defaultBoolCombine,
69	defaultStemMethod);
70
71	// gobble up tokens until a closing bracket is found
72	// or the end of the string
73	LexEl el;
74	while (ParseLexEl (here, end, el)) {
75	if (el.lexType == CloseBracketE) break;
76	}
77
78	return curTree;
79	}
80
81	static int ParseInt (UCArray::const_iterator &here,
82	UCArray::const_iterator end) {
83	LexEl el;
84	UCArray::const_iterator oldHere = here;
85	if (ParseLexEl (here, end, el) && el.lexType == IntegerE)
86	return el.num;
87
88	here = oldHere; // not an integer
89	return 0;
90	}
91
92	// default is within 20 words
93	static void SetRangeValues (TermNode &termNode,
94	UCArray &nearby,
95	bool reverse) {
96	UCArray NEARBY; SetCStr(NEARBY, "NEAR", 4);
97	UCArray WITHIN; SetCStr(WITHIN, "WITHIN", 6);
98
99	if (nearby == NEARBY) { // no modifier
100	termNode.startRange = (NEAR_DEFAULT+1)*-1;
101	termNode.endRange = NEAR_DEFAULT;
102
103	} else if (nearby == WITHIN) { // no modifier
104	if (reverse) {
105	termNode.startRange = (NEAR_DEFAULT+1)*-1;
106	termNode.endRange = -1;
107	} else {
108	termNode.startRange = NEAR_DEFAULT;
109	termNode.endRange = 0;
110	}
111	}
112	else { // extract number
113	UCArray::const_iterator here;
114	bool within = false;
115	if (PrefixLen(nearby, WITHIN)==6) {
116	within=true;
117	here = nearby.begin()+6;
118	} else {
119	here = nearby.begin()+4;
120	}
121	UCArray::const_iterator end = nearby.end();
122	int size=0;
123	while (here != end) {
124	size = size10 + (here-'0');
125	++here;
126	}
127	if (within) {
128	if (reverse) {
129	termNode.startRange = size;
130	termNode.endRange = 0;
131	} else {
132	termNode.startRange = -1 * (size+1);
133	termNode.endRange = -1;
134	}
135	} else {
136	termNode.startRange = -1 * (size+1);
137	termNode.endRange = size;
138	}
139	}
140	}
141
142	static unsigned long GetStemMethod(LexEl &el, int defaultStemMethod) {
143	// here expect el to contain some of c,s,i,u,f,a -- see mg_files.h CHAR_FLAG_STEM_* constants
144	unsigned long stem = (unsigned long)defaultStemMethod;
145
146	UCArray::const_iterator here = el.text.begin();
147	UCArray::const_iterator end = el.text.end();
148
149	/* [JFG - Mar 06: Accent folding patch] */
150	/* Changed to use CHAR_FLAG_STEM* constants from mg_files.h */
151	while(here != end) {
152	unsigned char ch = *here;
153	if (strchr (CHAR_FLAG_STEM_Validator, ch) == NULL)
154	return STEM_INVALID; // incorrect format
155
156	switch(ch) {
157	case CHAR_FLAG_STEM_CaseFold: // ignore case (fold)
158	stem \|= STEM_CaseFolding;
159	break;
160	case CHAR_FLAG_STEM_NoCaseFold: // case sensitive
161	stem &= (~STEM_CaseFolding);
162	break;
163	case CHAR_FLAG_STEM_Stemming: // stem words
164	stem \|= STEM_Stemming;
165	break;
166	case CHAR_FLAG_STEM_NoStemming: // do not stem words
167	stem &= (~STEM_Stemming);
168	break;
169	#ifdef ENABLE_ACCENTFOLD
170	case CHAR_FLAG_STEM_AccentFold: // accent fold
171	stem \|= STEM_AccentFolding;
172	break;
173	case CHAR_FLAG_STEM_NoAccentFold: // do no accent folding
174	stem &= (~STEM_AccentFolding);
175	break;
176	#endif
177	};
178
179	++here;
180	}
181	return stem;
182	}
183
184
185	static void ParseTermModifiers (UCArray::const_iterator &here,
186	UCArray::const_iterator end,
187	TermNode &termNode,
188	int defaultStemMethod) {
189
190	termNode.stemMethod = defaultStemMethod;
191	bool partial_match = false;
192	LexEl el;
193	UCArray::const_iterator oldHere = here;
194	while (ParseLexEl (here, end, el)) {
195	if (el.lexType == TermWeightE) {
196	termNode.termWeight = ParseInt (here, end);
197
198	} else if (el.lexType == StemMethodE) {
199	oldHere = here;
200	LexEl stem;
201	if (ParseLexEl (here, end, stem) && stem.lexType == TermE) {
202	termNode.stemMethod = GetStemMethod(stem, defaultStemMethod);
203	/* [JFG - Mar 06: Accent folding patch] */
204	/* use STEM_INVALID instead of hardcoded 4 */
205	if (termNode.stemMethod == STEM_INVALID) { // error so backtrack
206	here = oldHere;
207	termNode.stemMethod = (unsigned long)defaultStemMethod;
208	}
209	} else here = oldHere; //ignore - wrong syntax
210
211	} else if (el.lexType == RangeE) {
212	termNode.startRange = ParseInt (here, end);
213	termNode.endRange = ParseInt (here, end);
214
215	} else if (el.lexType == AtE) {
216	termNode.startRange = termNode.endRange = ParseInt (here, end);
217	} else if (el.lexType == StarE) {
218	partial_match = true;
219	} else {
220	// no term modifiers
221	here = oldHere;
222	break;
223	}
224
225	if (partial_match) {
226	/* [JFG - Mar 06: Accent folding patch] */
227	/* use STEM_PARTIAL_MATCH flag */
228	termNode.stemMethod \|= STEM_PARTIAL_MATCH; // set partial match flag
229	termNode.stemMethod &= (~STEM_Stemming); // we dont have stemming on if doing partial matching.
230	termNode.stemMethod &= (~STEM_AccentFolding); // we dont have accentfolding on if doing partial matching.
231	}
232	oldHere = here;
233	}
234	}
235
236	static void ParseProxModifiers (UCArray::const_iterator &here,
237	UCArray::const_iterator end,
238	ProxMatchQueryNode *proxNode) {
239	// so far only have one - the tag stuff
240	LexEl el;
241	UCArray::const_iterator oldHere = here;
242	while (ParseLexEl (here, end, el)) {
243	if (el.lexType == TagE) {
244	oldHere = here; // don't backtrack past here
245	if (ParseLexEl (here, end, el) && el.lexType == TermE) {
246	proxNode->tagNodePtr = new TagNode;
247	proxNode->tagNodePtr->tagName = el.text;
248
249	}
250	else { // error in tag
251	here = oldHere;
252	}
253	} // TagE
254	// add in other cases here
255	else {
256	// no modifiers
257	here = oldHere;
258	break;
259	}
260	oldHere = here;
261	}//while
262
263
264	}
265
266	// expects starting brackets to have been parsed
267	// sets error to true if something has gone wrong
268	static ProxMatchQueryNode *ParseSquareBrackets(UCArray::const_iterator &here,
269	UCArray::const_iterator end,
270	/ProxMatchQueryNode proxNode,*/
271	int defaultStemMethod,
272	bool & error) {
273
274	ProxMatchQueryNode *proxNode = new ProxMatchQueryNode;
275	LexEl el;
276	bool phrase=false;
277	bool first=true;
278	bool prox = false;
279	UCArray near_string;
280	while (ParseLexEl (here, end, el)) {
281	// cant have AND, OR, NOT in square brackets, so assume they are words
282	if (el.lexType == TermE \|\| el.lexType == IntegerE \|\| el.lexType == AndOpE \|\| el.lexType == OrOpE \|\| el.lexType == NotOpE) {
283	TermNode termNode;
284	termNode.term = el.text;
285	ParseTermModifiers (here, end, termNode, defaultStemMethod);
286	if (phrase) {
287	if (first) first=false;
288	else {
289	termNode.startRange = -2;
290	termNode.endRange = -1;
291	}
292	} else if (prox) {
293	SetRangeValues(termNode, near_string, false);
294	prox = false;
295	}
296	proxNode->terms.push_back(termNode);
297	}
298	else if (el.lexType == CloseSquareBracketE) {
299	break;
300	}
301	else if (el.lexType == QuoteE) {
302	// phrase inside square brackets
303	if (phrase) { // end of phrase
304	phrase=false;
305	first = true;
306	} else {
307	phrase=true; // start of phrase
308	}
309	} else if (el.lexType == NearOpE \|\| el.lexType == WithinOpE) {
310	if (phrase) {
311	// cant have proximity op in a phrase - just assume its an actual word
312	TermNode termNode;
313	termNode.term = el.text;
314	ParseTermModifiers (here, end, termNode, defaultStemMethod);
315	proxNode->terms.push_back(termNode);
316	} else {
317	// its a NEAR or within op
318	prox = true;
319	near_string = el.text;
320	}
321
322	}
323	else if (el.lexType == UnknownE) {
324	// just ignore it
325	}
326	else {
327	//error - we set the proxNode to NULL,
328	cerr <<"GSDLQueryParser: bad syntax inside []\n";
329	error = true;
330	return NULL;
331	}
332	} // while
333	return proxNode;
334	}
335	// expects the starting quote to have been parsed
336	// and discarded
337	// now phrases use the case and stem preference options
338	// ie can search for a phrase ignoring case
339	static void ParsePhrase (UCArray::const_iterator &here,
340	UCArray::const_iterator end,
341	ProxMatchQueryNode &proxNode,
342	int defaultStemMethod,
343	bool &error) {
344	LexEl el;
345	bool first = true;
346	while (ParseLexEl (here, end, el)) {
347	if (el.lexType == TermE \|\| el.lexType == IntegerE) {
348	TermNode termNode;
349	termNode.term = el.text;
350	//termNode.stemMethod = defaultStemMethod;
351	ParseTermModifiers (here, end, termNode, defaultStemMethod);
352	if (first) {
353	first = false;
354	}
355	else {
356	termNode.startRange = -2;
357	termNode.endRange = -1;
358	}
359	proxNode.terms.push_back (termNode);
360
361	} else if (el.lexType == QuoteE) {
362	break;
363
364	} else if (el.lexType == UnknownE) {
365	// just ignore it
366	} else {
367	// error
368	error = true;
369	return;
370	}
371	}
372	}
373
374	static QueryNode *ParseTerm (UCArray::const_iterator &here,
375	UCArray::const_iterator end,
376	int defaultBoolCombine,
377	int defaultStemMethod) {
378	LexEl el;
379
380	UCArray::const_iterator oldHere = here;
381	if (!ParseLexEl (here, end, el)) return NULL;
382
383	if (el.lexType == OpenBracketE)
384	return ParseBracketExpression (here, end, defaultBoolCombine,
385	defaultStemMethod);
386
387	ProxMatchQueryNode *proxNode = new ProxMatchQueryNode;
388
389	if (el.lexType == TermE \|\| el.lexType == IntegerE) {
390	TermNode termNode;
391	termNode.term = el.text;
392	ParseTermModifiers (here, end, termNode, defaultStemMethod);
393	oldHere = here; // dont backtrack past here
394	if (ParseLexEl(here, end, el) && (el.lexType == NearOpE \|\| el.lexType == WithinOpE )) {
395	delete proxNode;
396	oldHere = here;
397	// this is calling ParseTerm again, but only a subset of the things accepted by ParseTerm are appropriate here. add in some hacks to avoid segmentation faults - kjdon, 04/2003
398
399	// if the next element is a '(' have a syntax error, return NULL
400	LexEl temp_el;
401	if (ParseLexEl(here, end, temp_el) && temp_el.lexType == OpenBracketE) {
402	cerr << "GSDLQueryParser: NEAR/WITHIN cannot be followed by a '('\n";
403	return NULL;
404	}
405	here = oldHere; // else backtrack
406
407	proxNode = (ProxMatchQueryNode *)ParseTerm(here, end, defaultBoolCombine,
408	defaultStemMethod);
409	SetRangeValues(termNode, el.text, true);
410	proxNode->terms.push_back (termNode);
411	return proxNode;
412
413	} else {
414	here = oldHere; // backtrack
415	proxNode->terms.push_back (termNode);
416	ParseProxModifiers(here, end, proxNode);
417	return proxNode;
418	}
419	} else if (el.lexType == QuoteE) {
420	bool error = false;
421	ParsePhrase (here, end, *proxNode, defaultStemMethod, error);
422	if (error) {
423	delete proxNode;
424	return NULL;
425	}
426	return proxNode;
427	}
428	else if (el.lexType == OpenSquareBracketE) {
429	bool error = false;
430	proxNode = ParseSquareBrackets (here, end, /proxNode, /defaultStemMethod, error);
431	if (error) {
432	delete proxNode;
433	return NULL;
434	}
435	ParseProxModifiers (here, end, proxNode);
436	return proxNode;
437	}
438
439	// not a term
440	here = oldHere;
441	delete proxNode;
442	return NULL;
443	}
444
445
446	static QueryNode *ParseExpression (UCArray::const_iterator &here,
447	UCArray::const_iterator end,
448	int defaultBoolCombine,
449	int defaultStemMethod) {
450	LexEl el;
451	QueryNode *curTree = NULL;
452	UCArray::const_iterator oldHere = here;
453	while (ParseLexEl (here, end, el)) {
454	if (el.lexType == CloseBracketE) {
455	// parsebracketexpression is waiting for the last bracket, so put it back
456	here = oldHere;
457	break;
458
459	} else if (el.lexType == OpenSquareBracketE \|\|
460	el.lexType == OpenBracketE \|\|
461	el.lexType == TermE \|\|
462	el.lexType == QuoteE \|\|
463	el.lexType == IntegerE ) {
464
465	// some type of term, back track and parse it
466	here = oldHere;
467
468	// parse the term
469	QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
470	defaultStemMethod);
471	if (newTerm == NULL) {
472	delete curTree;
473	return NULL;
474	}
475
476	// if default==1, AND, else if==0, OR
477	if (defaultBoolCombine) {
478	curTree = AndAdd (curTree, newTerm);
479	}
480	else {
481	curTree = OrAdd (curTree, newTerm);
482	}
483
484	} else if (el.lexType == AndOpE) {
485	QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
486	defaultStemMethod);
487	if (newTerm == NULL) {
488	delete curTree;
489	return NULL;
490	}
491	curTree = AndAdd (curTree, newTerm);
492
493	} else if (el.lexType == OrOpE) {
494	QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
495	defaultStemMethod);
496	if (newTerm == NULL) {
497	delete curTree;
498	return NULL;
499	}
500	curTree = OrAdd (curTree, newTerm);
501
502	} else if (el.lexType == NotOpE) {
503	QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
504	defaultStemMethod);
505	if (newTerm == NULL) {
506	delete curTree;
507	return NULL;
508	}
509	curTree = NotAdd (curTree, newTerm);
510
511	} else if (el.lexType == UnknownE) {
512	// just ignore it
513	} else {
514
515	// syntax error, return NUll
516	delete curTree;
517	return NULL;
518	}
519
520	oldHere = here;
521	}
522
523	return curTree;
524	}
525
526	QueryNode *ParseQuery (const UCArray &queryStr, int defaultBoolCombine,
527	int defaultStemMethod, int maxnumeric) {
528	if (4 < maxnumeric && maxnumeric < 512) {
529	MAXNUMERIC = maxnumeric;
530	}
531	UCArray::const_iterator here = queryStr.begin();
532	UCArray::const_iterator end = queryStr.end();
533	return ParseExpression (here, end, defaultBoolCombine, defaultStemMethod);
534	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: