source: trunk/gsdl/src/mgpp/text/GSDLQueryParser.cpp@ 2692

Last change on this file since 2692 was 2692, checked in by kjm18, 23 years ago

phrases now use the case and stem preference settings, they are no longer
exact phrases unless stemming and casefolding are both turned off.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.1 KB
Line 
1/**************************************************************************
2 *
3 * QueryParser.cpp -- Query parser for a simple query language
4 * Copyright (C) 2000 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "GSDLQueryParser.h"
23#include "GSDLQueryLex.h"
24
25
26static QueryNode *ParseExpression (UCArray::const_iterator &here,
27 UCArray::const_iterator end,
28 int defaultBoolCOmbine,
29 int defaultStemMethod);
30
31static QueryNode *AndAdd (QueryNode *t1, QueryNode *t2) {
32 if (t1 == NULL) return t2;
33 if (t2 == NULL) return t1;
34
35 AndQueryNode *andNode = new AndQueryNode;
36 andNode->leftNode = t1;
37 andNode->rightNode = t2;
38 return andNode;
39}
40
41static QueryNode *OrAdd (QueryNode *t1, QueryNode *t2) {
42 if (t1 == NULL) return t2;
43 if (t2 == NULL) return t1;
44
45 OrQueryNode *orNode = new OrQueryNode;
46 orNode->leftNode = t1;
47 orNode->rightNode = t2;
48 return orNode;
49}
50
51static QueryNode *NotAdd (QueryNode *t1, QueryNode *t2) {
52 if (t1 == NULL) return t2;
53 if (t2 == NULL) return t1;
54
55 NotQueryNode *notNode = new NotQueryNode;
56 notNode->queryNode = t1;
57 notNode->notNode = t2;
58 return notNode;
59}
60
61// expects the opening bracket to have already been parsed
62// and discarded
63static QueryNode *ParseBracketExpression (UCArray::const_iterator &here,
64 UCArray::const_iterator end,
65 int defaultBoolCombine,
66 int defaultStemMethod) {
67 // get everything in the expression
68 QueryNode *curTree = ParseExpression (here, end, defaultBoolCombine,
69 defaultStemMethod);
70
71 // gobble up tokens until a closing bracket is found
72 // or the end of the string
73 LexEl el;
74 while (ParseLexEl (here, end, el)) {
75 if (el.lexType == CloseBracketE) break;
76 }
77
78 return curTree;
79}
80
81static int ParseInt (UCArray::const_iterator &here,
82 UCArray::const_iterator end) {
83 LexEl el;
84 UCArray::const_iterator oldHere = here;
85 if (ParseLexEl (here, end, el) && el.lexType == IntegerE)
86 return el.num;
87
88 here = oldHere; // not an integer
89 return 0;
90}
91
92// default is within 20 words
93static void SetRangeValues (TermNode &termNode,
94 UCArray &nearby) {
95 UCArray NEARBY; SetCStr(NEARBY, "NEAR");
96 if (nearby == NEARBY) { // no modifier
97 termNode.startRange = -21;
98 termNode.endRange = 20;
99 }
100 else { // extract number
101 UCArray::const_iterator here = nearby.begin()+4;
102 UCArray::const_iterator end = nearby.end();
103 int size=0;
104 while (here != end) {
105 size = size*10 + (*here-'0');
106 here++;
107 }
108 cerr <<"size= "<<size<<endl;
109 termNode.startRange = -1 * (size+1);
110 termNode.endRange = size;
111 }
112}
113
114static unsigned long GetStemMethod(LexEl &el, int defaultStemMethod) {
115 // here expect el to contain some of c,s,i,u
116 // stem method 0 = c,u 00
117 // stem method 1 = i,u 01 - default for DL
118 // stem method 2 = c, s 10
119 // stem method 3 = i,s 11
120
121 unsigned long stem = (unsigned long)defaultStemMethod;
122
123 UCArray::const_iterator here = el.text.begin();
124 UCArray::const_iterator end = el.text.end();
125
126 unsigned char c1 = *here;
127 if (!(c1 == 'c'| c1 == 'i' | c1 == 'u' | c1 == 's'))
128 return 4; // incorrect format
129
130 here++;
131 unsigned char c2 = 'a';
132 if (here !=end) {
133 c2 = *here;
134 if (!(c2 == 'c'| c2 == 'i' | c2 == 'u' | c2 == 's'))
135 return 4; // incorrect format
136 }
137
138 if (c1 == 'i'|| c2=='i') stem |= 1; // set bit 0 to 1
139 if (c1 == 'c' || c2 == 'c') stem &=0xe; //set bit 0 to 0
140 if (c1 == 's'|| c2 == 's') stem |= 2; // set bit 1 to 1
141 if (c1 == 'u' || c2 =='u') stem &=0xd; // set bit 1 to 0
142 cerr << "stem is "<<stem<<endl;
143 return stem;
144}
145
146
147static void ParseTermModifiers (UCArray::const_iterator &here,
148 UCArray::const_iterator end,
149 TermNode &termNode,
150 int defaultStemMethod) {
151
152 termNode.stemMethod = defaultStemMethod;
153
154 LexEl el;
155 UCArray::const_iterator oldHere = here;
156 while (ParseLexEl (here, end, el)) {
157 if (el.lexType == TermWeightE) {
158 termNode.termWeight = ParseInt (here, end);
159
160 } else if (el.lexType == StemMethodE) {
161 oldHere = here;
162 LexEl stem;
163 if (ParseLexEl (here, end, stem) && stem.lexType == TermE) {
164 termNode.stemMethod = GetStemMethod(stem, defaultStemMethod);
165 if (termNode.stemMethod == 4) { // error so backtrack
166 here = oldHere;
167 termNode.stemMethod = (unsigned long)defaultStemMethod;
168 }
169 }else here = oldHere; //ignore - wrong syntax
170
171 } else if (el.lexType == RangeE) {
172 termNode.startRange = ParseInt (here, end);
173 termNode.endRange = ParseInt (here, end);
174
175 } else if (el.lexType == AtE) {
176 termNode.startRange = termNode.endRange = ParseInt (here, end);
177
178 } else {
179 // no term modifiers
180 here = oldHere;
181 break;
182 }
183
184 oldHere = here;
185 }
186}
187
188static void ParseProxModifiers (UCArray::const_iterator &here,
189 UCArray::const_iterator end,
190 ProxMatchQueryNode *proxNode) {
191 // so far only have one - the tag stuff
192 LexEl el;
193 UCArray::const_iterator oldHere = here;
194 while (ParseLexEl (here, end, el)) {
195 if (el.lexType == TagE) {
196 oldHere = here; // don't backtrack past here
197 if (ParseLexEl (here, end, el) && el.lexType == TermE) {
198 proxNode->tagNodePtr = new TagNode;
199 proxNode->tagNodePtr->tagName = el.text;
200
201 }
202 else { // error in tag
203 here = oldHere;
204 }
205 } // TagE
206 // add in other cases here
207 else {
208 // no modifiers
209 here = oldHere;
210 break;
211 }
212 oldHere = here;
213 }//while
214
215
216}
217
218// expects starting brackets to have been parsed
219static void ParseSquareBrackets(UCArray::const_iterator &here,
220 UCArray::const_iterator end,
221 ProxMatchQueryNode *proxNode,
222 int defaultStemMethod) {
223
224 LexEl el;
225 while (ParseLexEl (here, end, el)) {
226 if (el.lexType == TermE || el.lexType == IntegerE) {
227 TermNode termNode;
228 termNode.term = el.text;
229 ParseTermModifiers (here, end, termNode, defaultStemMethod);
230 proxNode->terms.push_back(termNode);
231 }
232 else if (el.lexType == CloseSquareBracketE) {
233 break;
234 }
235 else if (el.lexType == AndOpE) {
236 // ignore, the words are AND'ed anyway
237 cerr << "and inside []\n";
238 }
239 else if (el.lexType == OrOpE) {
240 cerr << "or inside []\n";
241 }
242 else {
243 //error
244 // just ignore for now
245 cerr <<"bad syntax inside []\n";
246 }
247 } // while
248
249}
250// expects the starting quote to have been parsed
251// and discarded
252// now phrases use the case and stem preference options
253// ie can search for a phrase ignoring case
254static void ParsePhrase (UCArray::const_iterator &here,
255 UCArray::const_iterator end,
256 ProxMatchQueryNode &proxNode,
257 int defaultStemMethod) {
258 LexEl el;
259 bool first = true;
260 while (ParseLexEl (here, end, el)) {
261 if (el.lexType == TermE || el.lexType == IntegerE) {
262 TermNode termNode;
263 termNode.term = el.text;
264 termNode.stemMethod = defaultStemMethod;
265 //ParseTermModifiers (here, end, termNode, defaultStemMethod);
266 if (first) {
267 first = false;
268 }
269 else {
270 termNode.startRange = -2;
271 termNode.endRange = -1;
272 }
273 proxNode.terms.push_back (termNode);
274
275 } else if (el.lexType == QuoteE) {
276 break;
277
278 } else {
279 // error
280 break;
281 }
282 }
283}
284
285static QueryNode *ParseTerm (UCArray::const_iterator &here,
286 UCArray::const_iterator end,
287 int defaultBoolCombine,
288 int defaultStemMethod) {
289 LexEl el;
290
291 UCArray::const_iterator oldHere = here;
292 if (!ParseLexEl (here, end, el)) return NULL;
293
294 if (el.lexType == OpenBracketE)
295 return ParseBracketExpression (here, end, defaultBoolCombine,
296 defaultStemMethod);
297
298 ProxMatchQueryNode *proxNode = new ProxMatchQueryNode;
299
300 if (el.lexType == TermE || el.lexType == IntegerE) {
301 TermNode termNode;
302 termNode.term = el.text;
303 ParseTermModifiers (here, end, termNode, defaultStemMethod);
304 oldHere = here; // dont backtrack past here
305 if (ParseLexEl(here, end, el) && el.lexType == NearOpE) {
306 delete proxNode;
307 proxNode = (ProxMatchQueryNode *)ParseTerm(here, end, defaultBoolCombine,
308 defaultStemMethod);
309 SetRangeValues(termNode, el.text);
310 proxNode->terms.push_back (termNode);
311 return proxNode;
312 }
313 else {
314 here = oldHere; // backtrack
315 proxNode->terms.push_back (termNode);
316 ParseProxModifiers(here, end, proxNode);
317 return proxNode;
318 }
319 } else if (el.lexType == QuoteE) {
320 ParsePhrase (here, end, *proxNode, defaultStemMethod);
321 return proxNode;
322 }
323 else if (el.lexType == OpenSquareBracketE) {
324 ParseSquareBrackets (here, end, proxNode, defaultStemMethod);
325 ParseProxModifiers (here, end, proxNode);
326 return proxNode;
327 }
328
329 // not a term
330 here = oldHere;
331 delete proxNode;
332 return NULL;
333}
334
335
336static QueryNode *ParseExpression (UCArray::const_iterator &here,
337 UCArray::const_iterator end,
338 int defaultBoolCombine,
339 int defaultStemMethod) {
340 LexEl el;
341 QueryNode *curTree = NULL;
342
343 UCArray::const_iterator oldHere = here;
344 while (ParseLexEl (here, end, el)) {
345 if (el.lexType == OpenSquareBracketE ||
346 el.lexType == OpenBracketE ||
347 el.lexType == TermE ||
348 el.lexType == QuoteE ||
349 el.lexType == IntegerE ) {
350 // el.lexType == TagE) { //tag at end of term now
351 // some type of term, back track and parse it
352 here = oldHere;
353 // if default==1, AND, else if==0, OR
354 if (defaultBoolCombine) {
355 curTree = AndAdd (curTree, ParseTerm (here, end, defaultBoolCombine,
356 defaultStemMethod));
357 }
358 else {
359 curTree = OrAdd (curTree, ParseTerm (here, end, defaultBoolCombine,
360 defaultStemMethod));
361 }
362
363 } else if (el.lexType == AndOpE) {
364 curTree = AndAdd (curTree, ParseTerm (here, end, defaultBoolCombine,
365 defaultStemMethod));
366
367 } else if (el.lexType == OrOpE) {
368 curTree = OrAdd (curTree, ParseTerm (here, end, defaultBoolCombine,
369 defaultStemMethod));
370
371 } else if (el.lexType == NotOpE) {
372 curTree = NotAdd (curTree, ParseTerm (here, end, defaultBoolCombine,
373 defaultStemMethod));
374
375 } else if (el.lexType == CloseBracketE) {
376 // parsebracketexpression is waiting for the last bracket, so put it back
377 here = oldHere;
378 break;
379
380 } else break;
381
382 oldHere = here;
383 }
384
385 return curTree;
386}
387
388QueryNode *ParseQuery (const UCArray &queryStr, int defaultBoolCombine,
389 int defaultStemMethod) {
390 UCArray::const_iterator here = queryStr.begin();
391 UCArray::const_iterator end = queryStr.end();
392 return ParseExpression (here, end, defaultBoolCombine, defaultStemMethod);
393}
Note: See TracBrowser for help on using the repository browser.