source: trunk/gsdl/src/mgpp/text/GSDLQueryParser.cpp@ 1770

Last change on this file since 1770 was 1770, checked in by kjm18, 23 years ago

changed parseQuery to take a new parameter - defaultBoolCombine - this
determines if terms are AND'd or OR'd together by default if operators
aren't explicitly stated

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.9 KB
Line 
1/**************************************************************************
2 *
3 * QueryParser.cpp -- Query parser for a simple query language
4 * Copyright (C) 2000 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: GSDLQueryParser.cpp 1770 2000-12-07 22:36:32Z kjm18 $
21 *
22 **************************************************************************/
23
24#include "GSDLQueryParser.h"
25#include "GSDLQueryLex.h"
26
27
28static QueryNode *ParseExpression (UCArray::const_iterator &here,
29 UCArray::const_iterator end,
30 int defaultBoolCOmbine,
31 int defaultStemMethod);
32
33static QueryNode *AndAdd (QueryNode *t1, QueryNode *t2) {
34 if (t1 == NULL) return t2;
35 if (t2 == NULL) return t1;
36
37 AndQueryNode *andNode = new AndQueryNode;
38 andNode->leftNode = t1;
39 andNode->rightNode = t2;
40 return andNode;
41}
42
43static QueryNode *OrAdd (QueryNode *t1, QueryNode *t2) {
44 if (t1 == NULL) return t2;
45 if (t2 == NULL) return t1;
46
47 OrQueryNode *orNode = new OrQueryNode;
48 orNode->leftNode = t1;
49 orNode->rightNode = t2;
50 return orNode;
51}
52
53static QueryNode *NotAdd (QueryNode *t1, QueryNode *t2) {
54 if (t1 == NULL) return t2;
55 if (t2 == NULL) return t1;
56
57 NotQueryNode *notNode = new NotQueryNode;
58 notNode->queryNode = t1;
59 notNode->notNode = t2;
60 return notNode;
61}
62
63// expects the opening bracket to have already been parsed
64// and discarded
65static QueryNode *ParseBracketExpression (UCArray::const_iterator &here,
66 UCArray::const_iterator end,
67 int defaultBoolCombine,
68 int defaultStemMethod) {
69 // get everything in the expression
70 QueryNode *curTree = ParseExpression (here, end, defaultBoolCombine,
71 defaultStemMethod);
72
73 // gobble up tokens until a closing bracket is found
74 // or the end of the string
75 LexEl el;
76 while (ParseLexEl (here, end, el)) {
77 if (el.lexType == CloseBracketE) break;
78 }
79
80 return curTree;
81}
82
83static int ParseInt (UCArray::const_iterator &here,
84 UCArray::const_iterator end) {
85 LexEl el;
86 UCArray::const_iterator oldHere = here;
87 if (ParseLexEl (here, end, el) && el.lexType == IntegerE)
88 return el.num;
89
90 here = oldHere; // not an integer
91 return 0;
92}
93
94// default is within 20 words
95static void SetRangeValues (TermNode &termNode,
96 UCArray &near) {
97 UCArray NEAR; SetCStr(NEAR, "NEAR");
98 if (near == NEAR) { // no modifier
99 termNode.startRange = -21;
100 termNode.endRange = 20;
101 }
102 else { // extract number
103 UCArray::const_iterator here = near.begin()+4;
104 UCArray::const_iterator end = near.end();
105 int size=0;
106 while (here != end) {
107 size = size*10 + (*here-'0');
108 here++;
109 }
110 cerr <<"size= "<<size<<endl;
111 termNode.startRange = -1 * (size+1);
112 termNode.endRange = size;
113 }
114
115
116}
117static unsigned long GetStemMethod(LexEl &el, int defaultStemMethod) {
118 // here expect el to contain some of c,s,i,u
119 // stem method 0 = c,u 00
120 // stem method 1 = i,u 01 - default for DL
121 // stem method 2 = c, s 10
122 // stem method 3 = i,s 11
123
124 unsigned long stem = (unsigned long)defaultStemMethod;
125
126 UCArray::const_iterator here = el.text.begin();
127 UCArray::const_iterator end = el.text.end();
128
129 unsigned char c1 = *here;
130 if (!(c1 == 'c'| c1 == 'i' | c1 == 'u' | c1 == 's'))
131 return 4; // incorrect format
132
133 here++;
134 unsigned char c2 = 'a';
135 if (here !=end) {
136 c2 = *here;
137 if (!(c2 == 'c'| c2 == 'i' | c2 == 'u' | c2 == 's'))
138 return 4; // incorrect format
139 }
140
141 if (c1 == 'i'|| c2=='i') stem |= 1; // set bit 0 to 1
142 if (c1 == 'c' || c2 == 'c') stem &=0xe; //set bit 0 to 0
143 if (c1 == 's'|| c2 == 's') stem |= 2; // set bit 1 to 1
144 if (c1 == 'u' || c2 =='u') stem &=0xd; // set bit 1 to 0
145 cerr << "stem is "<<stem<<endl;
146 return stem;
147}
148
149
150static void ParseTermModifiers (UCArray::const_iterator &here,
151 UCArray::const_iterator end,
152 TermNode &termNode,
153 int defaultStemMethod) {
154
155 termNode.stemMethod = defaultStemMethod;
156
157 LexEl el;
158 UCArray::const_iterator oldHere = here;
159 while (ParseLexEl (here, end, el)) {
160 if (el.lexType == TermWeightE) {
161 termNode.termWeight = ParseInt (here, end);
162
163 } else if (el.lexType == StemMethodE) {
164 oldHere = here;
165 LexEl stem;
166 if (ParseLexEl (here, end, stem) && stem.lexType == TermE) {
167 termNode.stemMethod = GetStemMethod(stem, defaultStemMethod);
168 if (termNode.stemMethod == 4) { // error so backtrack
169 here = oldHere;
170 termNode.stemMethod = (unsigned long)defaultStemMethod;
171 }
172 }else here = oldHere; //ignore - wrong syntax
173
174 } else if (el.lexType == RangeE) {
175 termNode.startRange = ParseInt (here, end);
176 termNode.endRange = ParseInt (here, end);
177
178 } else if (el.lexType == AtE) {
179 termNode.startRange = termNode.endRange = ParseInt (here, end);
180
181 } else {
182 // no term modifiers
183 here = oldHere;
184 break;
185 }
186
187 oldHere = here;
188 }
189}
190
191static void ParseProxModifiers (UCArray::const_iterator &here,
192 UCArray::const_iterator end,
193 ProxMatchQueryNode *proxNode) {
194 // so far only have one - the tag stuff
195 LexEl el;
196 UCArray::const_iterator oldHere = here;
197 while (ParseLexEl (here, end, el)) {
198 if (el.lexType == TagE) {
199 oldHere = here; // don't backtrack past here
200 if (ParseLexEl (here, end, el) && el.lexType == TermE) {
201 proxNode->tagNodePtr = new TagNode;
202 proxNode->tagNodePtr->tagName = el.text;
203
204 }
205 else { // error in tag
206 here = oldHere;
207 }
208 } // TagE
209 // add in other cases here
210 else {
211 // no modifiers
212 here = oldHere;
213 break;
214 }
215 oldHere = here;
216 }//while
217
218
219}
220
221// expects starting brackets to have been parsed
222static void ParseSquareBrackets(UCArray::const_iterator &here,
223 UCArray::const_iterator end,
224 ProxMatchQueryNode *proxNode,
225 int defaultStemMethod) {
226
227 LexEl el;
228 while (ParseLexEl (here, end, el)) {
229 if (el.lexType == TermE || el.lexType == IntegerE) {
230 TermNode termNode;
231 termNode.term = el.text;
232 ParseTermModifiers (here, end, termNode, defaultStemMethod);
233 proxNode->terms.push_back(termNode);
234 }
235 else if (el.lexType == CloseSquareBracketE) {
236 break;
237 }
238 else if (el.lexType == AndOpE) {
239 // ignore, the words are AND'ed anyway
240 cerr << "and inside []\n";
241 }
242 else if (el.lexType == OrOpE) {
243 cerr << "or inside []\n";
244 }
245 else {
246 //error
247 // just ignore for now
248 cerr <<"bad syntax inside []\n";
249 }
250 } // while
251
252}
253// expects the starting quote to have been parsed
254// and discarded
255// is now for exact phrases
256static void ParsePhrase (UCArray::const_iterator &here,
257 UCArray::const_iterator end,
258 ProxMatchQueryNode &proxNode) {
259 LexEl el;
260 bool first = true;
261 while (ParseLexEl (here, end, el)) {
262 if (el.lexType == TermE || el.lexType == IntegerE) {
263 TermNode termNode;
264 termNode.term = el.text;
265 //ParseTermModifiers (here, end, termNode);
266 if (first) {
267 first = false;
268 }
269 else {
270 termNode.startRange = -2;
271 termNode.endRange = -1;
272 }
273 proxNode.terms.push_back (termNode);
274
275 } else if (el.lexType == QuoteE) {
276 break;
277
278 } else {
279 // error
280 break;
281 }
282 }
283}
284
285static QueryNode *ParseTerm (UCArray::const_iterator &here,
286 UCArray::const_iterator end,
287 int defaultBoolCombine,
288 int defaultStemMethod) {
289 LexEl el;
290
291 UCArray::const_iterator oldHere = here;
292 if (!ParseLexEl (here, end, el)) return NULL;
293
294 if (el.lexType == OpenBracketE)
295 return ParseBracketExpression (here, end, defaultBoolCombine,
296 defaultStemMethod);
297
298 ProxMatchQueryNode *proxNode = new ProxMatchQueryNode;
299
300 if (el.lexType == TermE || el.lexType == IntegerE) {
301 TermNode termNode;
302 termNode.term = el.text;
303 ParseTermModifiers (here, end, termNode, defaultStemMethod);
304 oldHere = here; // dont backtrack past here
305 if (ParseLexEl(here, end, el) && el.lexType == NearOpE) {
306 delete proxNode;
307 proxNode = (ProxMatchQueryNode *)ParseTerm(here, end, defaultBoolCombine,
308 defaultStemMethod);
309 SetRangeValues(termNode, el.text);
310 proxNode->terms.push_back (termNode);
311 return proxNode;
312 }
313 else {
314 here = oldHere; // backtrack
315 proxNode->terms.push_back (termNode);
316 ParseProxModifiers(here, end, proxNode);
317 return proxNode;
318 }
319 } else if (el.lexType == QuoteE) {
320 ParsePhrase (here, end, *proxNode);
321 return proxNode;
322 }
323 else if (el.lexType == OpenSquareBracketE) {
324 ParseSquareBrackets (here, end, proxNode, defaultStemMethod);
325 ParseProxModifiers (here, end, proxNode);
326 return proxNode;
327 }
328
329 // not a term
330 here = oldHere;
331 delete proxNode;
332 return NULL;
333}
334
335
336static QueryNode *ParseExpression (UCArray::const_iterator &here,
337 UCArray::const_iterator end,
338 int defaultBoolCombine,
339 int defaultStemMethod) {
340 LexEl el;
341 QueryNode *curTree = NULL;
342
343 UCArray::const_iterator oldHere = here;
344 while (ParseLexEl (here, end, el)) {
345 if (el.lexType == OpenSquareBracketE ||
346 el.lexType == OpenBracketE ||
347 el.lexType == TermE ||
348 el.lexType == QuoteE ||
349 el.lexType == IntegerE ) {
350 // el.lexType == TagE) { //tag at end of term now
351 // some type of term, back track and parse it
352 here = oldHere;
353 // if default==1, AND, else if==0, OR
354 if (defaultBoolCombine) {
355 curTree = AndAdd (curTree, ParseTerm (here, end, defaultBoolCombine,
356 defaultStemMethod));
357 }
358 else {
359 curTree = OrAdd (curTree, ParseTerm (here, end, defaultBoolCombine,
360 defaultStemMethod));
361 }
362
363 } else if (el.lexType == AndOpE) {
364 curTree = AndAdd (curTree, ParseTerm (here, end, defaultBoolCombine,
365 defaultStemMethod));
366
367 } else if (el.lexType == OrOpE) {
368 curTree = OrAdd (curTree, ParseTerm (here, end, defaultBoolCombine,
369 defaultStemMethod));
370
371 } else if (el.lexType == NotOpE) {
372 curTree = NotAdd (curTree, ParseTerm (here, end, defaultBoolCombine,
373 defaultStemMethod));
374
375 } else if (el.lexType == CloseBracketE) {
376 // parsebracketexpression is waiting for the last bracket, so put it back
377 here = oldHere;
378 break;
379
380 } else break;
381
382 oldHere = here;
383 }
384
385 return curTree;
386}
387
388QueryNode *ParseQuery (const UCArray &queryStr, int defaultBoolCombine,
389 int defaultStemMethod) {
390 UCArray::const_iterator here = queryStr.begin();
391 UCArray::const_iterator end = queryStr.end();
392 return ParseExpression (here, end, defaultBoolCombine, defaultStemMethod);
393}
Note: See TracBrowser for help on using the repository browser.