source: trunk/gsdl/src/mgpp/text/GSDLQueryParser.cpp@ 6083

Last change on this file since 6083 was 6083, checked in by kjdon, 20 years ago

added NEAR inside []

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 13.3 KB
Line 
1/**************************************************************************
2 *
3 * QueryParser.cpp -- Query parser for a simple query language
4 * Copyright (C) 2000 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "GSDLQueryParser.h"
23#include "GSDLQueryLex.h"
24
25
26static QueryNode *ParseExpression (UCArray::const_iterator &here,
27 UCArray::const_iterator end,
28 int defaultBoolCombine,
29 int defaultStemMethod);
30
31static QueryNode *AndAdd (QueryNode *t1, QueryNode *t2) {
32 if (t1 == NULL) return t2;
33 if (t2 == NULL) return t1;
34
35 AndQueryNode *andNode = new AndQueryNode;
36 andNode->leftNode = t1;
37 andNode->rightNode = t2;
38 return andNode;
39}
40
41static QueryNode *OrAdd (QueryNode *t1, QueryNode *t2) {
42 if (t1 == NULL) return t2;
43 if (t2 == NULL) return t1;
44
45 OrQueryNode *orNode = new OrQueryNode;
46 orNode->leftNode = t1;
47 orNode->rightNode = t2;
48 return orNode;
49}
50
51static QueryNode *NotAdd (QueryNode *t1, QueryNode *t2) {
52 if (t1 == NULL) return t2;
53 if (t2 == NULL) return t1;
54
55 NotQueryNode *notNode = new NotQueryNode;
56 notNode->queryNode = t1;
57 notNode->notNode = t2;
58 return notNode;
59}
60
61// expects the opening bracket to have already been parsed
62// and discarded
63static QueryNode *ParseBracketExpression (UCArray::const_iterator &here,
64 UCArray::const_iterator end,
65 int defaultBoolCombine,
66 int defaultStemMethod) {
67 // get everything in the expression
68 QueryNode *curTree = ParseExpression (here, end, defaultBoolCombine,
69 defaultStemMethod);
70
71 // gobble up tokens until a closing bracket is found
72 // or the end of the string
73 LexEl el;
74 while (ParseLexEl (here, end, el)) {
75 if (el.lexType == CloseBracketE) break;
76 }
77
78 return curTree;
79}
80
81static int ParseInt (UCArray::const_iterator &here,
82 UCArray::const_iterator end) {
83 LexEl el;
84 UCArray::const_iterator oldHere = here;
85 if (ParseLexEl (here, end, el) && el.lexType == IntegerE)
86 return el.num;
87
88 here = oldHere; // not an integer
89 return 0;
90}
91
92// default is within 20 words
93static void SetRangeValues (TermNode &termNode,
94 UCArray &nearby) {
95 UCArray NEARBY; SetCStr(NEARBY, "NEAR");
96 if (nearby == NEARBY) { // no modifier
97 termNode.startRange = (NEAR_DEFAULT+1)*-1;
98 termNode.endRange = NEAR_DEFAULT;
99
100 }
101 else { // extract number
102 UCArray::const_iterator here = nearby.begin()+4;
103 UCArray::const_iterator end = nearby.end();
104 int size=0;
105 while (here != end) {
106 size = size*10 + (*here-'0');
107 here++;
108 }
109 termNode.startRange = -1 * (size+1);
110 termNode.endRange = size;
111 }
112}
113
114static unsigned long GetStemMethod(LexEl &el, int defaultStemMethod) {
115 // here expect el to contain some of c,s,i,u
116 // stem method 0 = c,u 00
117 // stem method 1 = i,u 01 - default for DL
118 // stem method 2 = c, s 10
119 // stem method 3 = i,s 11
120
121 unsigned long stem = (unsigned long)defaultStemMethod;
122
123 UCArray::const_iterator here = el.text.begin();
124 UCArray::const_iterator end = el.text.end();
125
126 unsigned char c1 = *here;
127 if (!(c1 == 'c'| c1 == 'i' | c1 == 'u' | c1 == 's'))
128 return 4; // incorrect format
129
130 here++;
131 unsigned char c2 = 'a';
132 if (here !=end) {
133 c2 = *here;
134 if (!(c2 == 'c'| c2 == 'i' | c2 == 'u' | c2 == 's'))
135 return 4; // incorrect format
136 }
137
138 if (c1 == 'i'|| c2=='i') stem |= 1; // set bit 0 to 1
139 if (c1 == 'c' || c2 == 'c') stem &=0xe; //set bit 0 to 0
140 if (c1 == 's'|| c2 == 's') stem |= 2; // set bit 1 to 1
141 if (c1 == 'u' || c2 =='u') stem &=0xd; // set bit 1 to 0
142 return stem;
143}
144
145
146static void ParseTermModifiers (UCArray::const_iterator &here,
147 UCArray::const_iterator end,
148 TermNode &termNode,
149 int defaultStemMethod) {
150
151 termNode.stemMethod = defaultStemMethod;
152
153 LexEl el;
154 UCArray::const_iterator oldHere = here;
155 while (ParseLexEl (here, end, el)) {
156 if (el.lexType == TermWeightE) {
157 termNode.termWeight = ParseInt (here, end);
158
159 } else if (el.lexType == StemMethodE) {
160 oldHere = here;
161 LexEl stem;
162 if (ParseLexEl (here, end, stem) && stem.lexType == TermE) {
163 termNode.stemMethod = GetStemMethod(stem, defaultStemMethod);
164 if (termNode.stemMethod == 4) { // error so backtrack
165 here = oldHere;
166 termNode.stemMethod = (unsigned long)defaultStemMethod;
167 }
168 }else here = oldHere; //ignore - wrong syntax
169
170 } else if (el.lexType == RangeE) {
171 termNode.startRange = ParseInt (here, end);
172 termNode.endRange = ParseInt (here, end);
173
174 } else if (el.lexType == AtE) {
175 termNode.startRange = termNode.endRange = ParseInt (here, end);
176
177 } else {
178 // no term modifiers
179 here = oldHere;
180 break;
181 }
182
183 oldHere = here;
184 }
185}
186
187static void ParseProxModifiers (UCArray::const_iterator &here,
188 UCArray::const_iterator end,
189 ProxMatchQueryNode *proxNode) {
190 // so far only have one - the tag stuff
191 LexEl el;
192 UCArray::const_iterator oldHere = here;
193 while (ParseLexEl (here, end, el)) {
194 if (el.lexType == TagE) {
195 oldHere = here; // don't backtrack past here
196 if (ParseLexEl (here, end, el) && el.lexType == TermE) {
197 proxNode->tagNodePtr = new TagNode;
198 proxNode->tagNodePtr->tagName = el.text;
199
200 }
201 else { // error in tag
202 here = oldHere;
203 }
204 } // TagE
205 // add in other cases here
206 else {
207 // no modifiers
208 here = oldHere;
209 break;
210 }
211 oldHere = here;
212 }//while
213
214
215}
216
217// expects starting brackets to have been parsed
218// sets error to true if something has gone wrong
219static ProxMatchQueryNode *ParseSquareBrackets(UCArray::const_iterator &here,
220 UCArray::const_iterator end,
221 /*ProxMatchQueryNode *proxNode,*/
222 int defaultStemMethod,
223 bool & error) {
224
225 ProxMatchQueryNode *proxNode = new ProxMatchQueryNode;
226 LexEl el;
227 bool phrase=false;
228 bool first=true;
229 bool near = false;
230 UCArray near_string;
231 while (ParseLexEl (here, end, el)) {
232 if (el.lexType == TermE || el.lexType == IntegerE) {
233 TermNode termNode;
234 termNode.term = el.text;
235 ParseTermModifiers (here, end, termNode, defaultStemMethod);
236 if (phrase) {
237 if (first) first=false;
238 else {
239 termNode.startRange = -2;
240 termNode.endRange = -1;
241 }
242 } else if (near) {
243 SetRangeValues(termNode, near_string);
244 near = false;
245 }
246 proxNode->terms.push_back(termNode);
247 }
248 else if (el.lexType == CloseSquareBracketE) {
249 break;
250 }
251 else if (el.lexType == QuoteE) {
252 // phrase inside square brackets
253 if (phrase) { // end of phrase
254 phrase=false;
255 first = true;
256 } else {
257 phrase=true; // start of phrase
258 }
259 } else if (el.lexType == NearOpE) {
260 if (phrase) {
261 // cant have NEAR op in a phrase - just assume its an actual word
262 TermNode termNode;
263 termNode.term = el.text;
264 ParseTermModifiers (here, end, termNode, defaultStemMethod);
265 proxNode->terms.push_back(termNode);
266 } else {
267 // its a NEAR op
268 near = true;
269 near_string = el.text;
270 }
271
272 }
273 else if (el.lexType == UnknownE) {
274 // just ignore it
275 }
276 else {
277 //error - we set the proxNode to NULL,
278 cerr <<"GSDLQueryParser: bad syntax inside []\n";
279 error = true;
280 return NULL;
281 }
282 } // while
283 return proxNode;
284}
285// expects the starting quote to have been parsed
286// and discarded
287// now phrases use the case and stem preference options
288// ie can search for a phrase ignoring case
289static void ParsePhrase (UCArray::const_iterator &here,
290 UCArray::const_iterator end,
291 ProxMatchQueryNode &proxNode,
292 int defaultStemMethod,
293 bool &error) {
294 LexEl el;
295 bool first = true;
296 while (ParseLexEl (here, end, el)) {
297 if (el.lexType == TermE || el.lexType == IntegerE) {
298 TermNode termNode;
299 termNode.term = el.text;
300 //termNode.stemMethod = defaultStemMethod;
301 ParseTermModifiers (here, end, termNode, defaultStemMethod);
302 if (first) {
303 first = false;
304 }
305 else {
306 termNode.startRange = -2;
307 termNode.endRange = -1;
308 }
309 proxNode.terms.push_back (termNode);
310
311 } else if (el.lexType == QuoteE) {
312 break;
313
314 } else if (el.lexType == UnknownE) {
315 // just ignore it
316 } else {
317 // error
318 error = true;
319 return;
320 }
321 }
322}
323
324static QueryNode *ParseTerm (UCArray::const_iterator &here,
325 UCArray::const_iterator end,
326 int defaultBoolCombine,
327 int defaultStemMethod) {
328 LexEl el;
329
330 UCArray::const_iterator oldHere = here;
331 if (!ParseLexEl (here, end, el)) return NULL;
332
333 if (el.lexType == OpenBracketE)
334 return ParseBracketExpression (here, end, defaultBoolCombine,
335 defaultStemMethod);
336
337 ProxMatchQueryNode *proxNode = new ProxMatchQueryNode;
338
339 if (el.lexType == TermE || el.lexType == IntegerE) {
340 TermNode termNode;
341 termNode.term = el.text;
342 ParseTermModifiers (here, end, termNode, defaultStemMethod);
343 oldHere = here; // dont backtrack past here
344 if (ParseLexEl(here, end, el) && el.lexType == NearOpE) {
345 delete proxNode;
346 oldHere = here;
347 // this is calling ParseTerm again, but only a subset of the things accepted by ParseTerm are appropriate here. add in some hacks to avoid segmentation faults - kjdon, 04/2003
348
349 // if the next element is a '(' have a syntax error, return NULL
350 LexEl temp_el;
351 if (ParseLexEl(here, end, temp_el) && temp_el.lexType == OpenBracketE) {
352 cerr << "GSDLQueryParser: NEAR cannot be followed by a '('\n";
353 return NULL;
354 }
355 here = oldHere; // else backtrack
356
357 proxNode = (ProxMatchQueryNode *)ParseTerm(here, end, defaultBoolCombine,
358 defaultStemMethod);
359 SetRangeValues(termNode, el.text);
360 proxNode->terms.push_back (termNode);
361 return proxNode;
362
363 } else {
364 here = oldHere; // backtrack
365 proxNode->terms.push_back (termNode);
366 ParseProxModifiers(here, end, proxNode);
367 return proxNode;
368 }
369 } else if (el.lexType == QuoteE) {
370 bool error = false;
371 ParsePhrase (here, end, *proxNode, defaultStemMethod, error);
372 if (error) {
373 delete proxNode;
374 return NULL;
375 }
376 return proxNode;
377 }
378 else if (el.lexType == OpenSquareBracketE) {
379 bool error = false;
380 proxNode = ParseSquareBrackets (here, end, /*proxNode, */defaultStemMethod, error);
381 if (error) {
382 delete proxNode;
383 return NULL;
384 }
385 ParseProxModifiers (here, end, proxNode);
386 return proxNode;
387 }
388
389 // not a term
390 here = oldHere;
391 delete proxNode;
392 return NULL;
393}
394
395
396static QueryNode *ParseExpression (UCArray::const_iterator &here,
397 UCArray::const_iterator end,
398 int defaultBoolCombine,
399 int defaultStemMethod) {
400 LexEl el;
401 QueryNode *curTree = NULL;
402 UCArray::const_iterator oldHere = here;
403 while (ParseLexEl (here, end, el)) {
404 if (el.lexType == CloseBracketE) {
405 // parsebracketexpression is waiting for the last bracket, so put it back
406 here = oldHere;
407 break;
408
409 } else if (el.lexType == OpenSquareBracketE ||
410 el.lexType == OpenBracketE ||
411 el.lexType == TermE ||
412 el.lexType == QuoteE ||
413 el.lexType == IntegerE ) {
414
415 // some type of term, back track and parse it
416 here = oldHere;
417
418 // parse the term
419 QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
420 defaultStemMethod);
421 if (newTerm == NULL) {
422 delete curTree;
423 return NULL;
424 }
425
426 // if default==1, AND, else if==0, OR
427 if (defaultBoolCombine) {
428 curTree = AndAdd (curTree, newTerm);
429 }
430 else {
431 curTree = OrAdd (curTree, newTerm);
432 }
433
434 } else if (el.lexType == AndOpE) {
435 QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
436 defaultStemMethod);
437 if (newTerm == NULL) {
438 delete curTree;
439 return NULL;
440 }
441 curTree = AndAdd (curTree, newTerm);
442
443 } else if (el.lexType == OrOpE) {
444 QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
445 defaultStemMethod);
446 if (newTerm == NULL) {
447 delete curTree;
448 return NULL;
449 }
450 curTree = OrAdd (curTree, newTerm);
451
452 } else if (el.lexType == NotOpE) {
453 QueryNode * newTerm = ParseTerm (here, end, defaultBoolCombine,
454 defaultStemMethod);
455 if (newTerm == NULL) {
456 delete curTree;
457 return NULL;
458 }
459 curTree = NotAdd (curTree, newTerm);
460
461 } else if (el.lexType == UnknownE) {
462 // just ignore it
463 } else {
464
465 // syntax error, return NUll
466 delete curTree;
467 return NULL;
468 }
469
470 oldHere = here;
471 }
472
473 return curTree;
474}
475
476QueryNode *ParseQuery (const UCArray &queryStr, int defaultBoolCombine,
477 int defaultStemMethod) {
478 UCArray::const_iterator here = queryStr.begin();
479 UCArray::const_iterator end = queryStr.end();
480 return ParseExpression (here, end, defaultBoolCombine, defaultStemMethod);
481}
Note: See TracBrowser for help on using the repository browser.