source: trunk/gsdl/src/mgpp/text/GSDLQueryParser.cpp@ 3165

Last change on this file since 3165 was 3165, checked in by jrm21, 22 years ago

removed some debugging that was going to stderr.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.3 KB
Line 
1/**************************************************************************
2 *
3 * QueryParser.cpp -- Query parser for a simple query language
4 * Copyright (C) 2000 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "GSDLQueryParser.h"
23#include "GSDLQueryLex.h"
24
25
26static QueryNode *ParseExpression (UCArray::const_iterator &here,
27 UCArray::const_iterator end,
28 int defaultBoolCOmbine,
29 int defaultStemMethod);
30
31static QueryNode *AndAdd (QueryNode *t1, QueryNode *t2) {
32 if (t1 == NULL) return t2;
33 if (t2 == NULL) return t1;
34
35 AndQueryNode *andNode = new AndQueryNode;
36 andNode->leftNode = t1;
37 andNode->rightNode = t2;
38 return andNode;
39}
40
41static QueryNode *OrAdd (QueryNode *t1, QueryNode *t2) {
42 if (t1 == NULL) return t2;
43 if (t2 == NULL) return t1;
44
45 OrQueryNode *orNode = new OrQueryNode;
46 orNode->leftNode = t1;
47 orNode->rightNode = t2;
48 return orNode;
49}
50
51static QueryNode *NotAdd (QueryNode *t1, QueryNode *t2) {
52 if (t1 == NULL) return t2;
53 if (t2 == NULL) return t1;
54
55 NotQueryNode *notNode = new NotQueryNode;
56 notNode->queryNode = t1;
57 notNode->notNode = t2;
58 return notNode;
59}
60
61// expects the opening bracket to have already been parsed
62// and discarded
63static QueryNode *ParseBracketExpression (UCArray::const_iterator &here,
64 UCArray::const_iterator end,
65 int defaultBoolCombine,
66 int defaultStemMethod) {
67 // get everything in the expression
68 QueryNode *curTree = ParseExpression (here, end, defaultBoolCombine,
69 defaultStemMethod);
70
71 // gobble up tokens until a closing bracket is found
72 // or the end of the string
73 LexEl el;
74 while (ParseLexEl (here, end, el)) {
75 if (el.lexType == CloseBracketE) break;
76 }
77
78 return curTree;
79}
80
81static int ParseInt (UCArray::const_iterator &here,
82 UCArray::const_iterator end) {
83 LexEl el;
84 UCArray::const_iterator oldHere = here;
85 if (ParseLexEl (here, end, el) && el.lexType == IntegerE)
86 return el.num;
87
88 here = oldHere; // not an integer
89 return 0;
90}
91
92// default is within 20 words
93static void SetRangeValues (TermNode &termNode,
94 UCArray &nearby) {
95 UCArray NEARBY; SetCStr(NEARBY, "NEAR");
96 if (nearby == NEARBY) { // no modifier
97 termNode.startRange = -21;
98 termNode.endRange = 20;
99 }
100 else { // extract number
101 UCArray::const_iterator here = nearby.begin()+4;
102 UCArray::const_iterator end = nearby.end();
103 int size=0;
104 while (here != end) {
105 size = size*10 + (*here-'0');
106 here++;
107 }
108 termNode.startRange = -1 * (size+1);
109 termNode.endRange = size;
110 }
111}
112
113static unsigned long GetStemMethod(LexEl &el, int defaultStemMethod) {
114 // here expect el to contain some of c,s,i,u
115 // stem method 0 = c,u 00
116 // stem method 1 = i,u 01 - default for DL
117 // stem method 2 = c, s 10
118 // stem method 3 = i,s 11
119
120 unsigned long stem = (unsigned long)defaultStemMethod;
121
122 UCArray::const_iterator here = el.text.begin();
123 UCArray::const_iterator end = el.text.end();
124
125 unsigned char c1 = *here;
126 if (!(c1 == 'c'| c1 == 'i' | c1 == 'u' | c1 == 's'))
127 return 4; // incorrect format
128
129 here++;
130 unsigned char c2 = 'a';
131 if (here !=end) {
132 c2 = *here;
133 if (!(c2 == 'c'| c2 == 'i' | c2 == 'u' | c2 == 's'))
134 return 4; // incorrect format
135 }
136
137 if (c1 == 'i'|| c2=='i') stem |= 1; // set bit 0 to 1
138 if (c1 == 'c' || c2 == 'c') stem &=0xe; //set bit 0 to 0
139 if (c1 == 's'|| c2 == 's') stem |= 2; // set bit 1 to 1
140 if (c1 == 'u' || c2 =='u') stem &=0xd; // set bit 1 to 0
141 return stem;
142}
143
144
145static void ParseTermModifiers (UCArray::const_iterator &here,
146 UCArray::const_iterator end,
147 TermNode &termNode,
148 int defaultStemMethod) {
149
150 termNode.stemMethod = defaultStemMethod;
151
152 LexEl el;
153 UCArray::const_iterator oldHere = here;
154 while (ParseLexEl (here, end, el)) {
155 if (el.lexType == TermWeightE) {
156 termNode.termWeight = ParseInt (here, end);
157
158 } else if (el.lexType == StemMethodE) {
159 oldHere = here;
160 LexEl stem;
161 if (ParseLexEl (here, end, stem) && stem.lexType == TermE) {
162 termNode.stemMethod = GetStemMethod(stem, defaultStemMethod);
163 if (termNode.stemMethod == 4) { // error so backtrack
164 here = oldHere;
165 termNode.stemMethod = (unsigned long)defaultStemMethod;
166 }
167 }else here = oldHere; //ignore - wrong syntax
168
169 } else if (el.lexType == RangeE) {
170 termNode.startRange = ParseInt (here, end);
171 termNode.endRange = ParseInt (here, end);
172
173 } else if (el.lexType == AtE) {
174 termNode.startRange = termNode.endRange = ParseInt (here, end);
175
176 } else {
177 // no term modifiers
178 here = oldHere;
179 break;
180 }
181
182 oldHere = here;
183 }
184}
185
186static void ParseProxModifiers (UCArray::const_iterator &here,
187 UCArray::const_iterator end,
188 ProxMatchQueryNode *proxNode) {
189 // so far only have one - the tag stuff
190 LexEl el;
191 UCArray::const_iterator oldHere = here;
192 while (ParseLexEl (here, end, el)) {
193 if (el.lexType == TagE) {
194 oldHere = here; // don't backtrack past here
195 if (ParseLexEl (here, end, el) && el.lexType == TermE) {
196 proxNode->tagNodePtr = new TagNode;
197 proxNode->tagNodePtr->tagName = el.text;
198
199 }
200 else { // error in tag
201 here = oldHere;
202 }
203 } // TagE
204 // add in other cases here
205 else {
206 // no modifiers
207 here = oldHere;
208 break;
209 }
210 oldHere = here;
211 }//while
212
213
214}
215
216// expects starting brackets to have been parsed
217static void ParseSquareBrackets(UCArray::const_iterator &here,
218 UCArray::const_iterator end,
219 ProxMatchQueryNode *proxNode,
220 int defaultStemMethod) {
221
222 LexEl el;
223 bool phrase=false;
224 bool first=true;
225 while (ParseLexEl (here, end, el)) {
226 if (el.lexType == TermE || el.lexType == IntegerE) {
227 TermNode termNode;
228 termNode.term = el.text;
229 ParseTermModifiers (here, end, termNode, defaultStemMethod);
230 if (phrase) {
231 if (first) first=false;
232 else {
233 termNode.startRange = -2;
234 termNode.endRange = -1;
235 }
236 }
237 proxNode->terms.push_back(termNode);
238 }
239 else if (el.lexType == CloseSquareBracketE) {
240 break;
241 }
242 else if (el.lexType == AndOpE) {
243 // ignore, the words are AND'ed anyway
244 cerr << "and inside []\n";
245 }
246 else if (el.lexType == OrOpE) {
247 cerr << "or inside []\n";
248 }
249 else if (el.lexType == QuoteE) {
250 // phrase inside square brackets
251 if (phrase) phrase=false;
252 else phrase=true;
253 }
254 else {
255 //error
256 // just ignore for now
257 cerr <<"bad syntax inside []\n";
258 }
259 } // while
260
261}
262// expects the starting quote to have been parsed
263// and discarded
264// now phrases use the case and stem preference options
265// ie can search for a phrase ignoring case
266static void ParsePhrase (UCArray::const_iterator &here,
267 UCArray::const_iterator end,
268 ProxMatchQueryNode &proxNode,
269 int defaultStemMethod) {
270 LexEl el;
271 bool first = true;
272 while (ParseLexEl (here, end, el)) {
273 if (el.lexType == TermE || el.lexType == IntegerE) {
274 TermNode termNode;
275 termNode.term = el.text;
276 //termNode.stemMethod = defaultStemMethod;
277 ParseTermModifiers (here, end, termNode, defaultStemMethod);
278 if (first) {
279 first = false;
280 }
281 else {
282 termNode.startRange = -2;
283 termNode.endRange = -1;
284 }
285 proxNode.terms.push_back (termNode);
286
287 } else if (el.lexType == QuoteE) {
288 break;
289
290 } else {
291 // error
292 break;
293 }
294 }
295}
296
297static QueryNode *ParseTerm (UCArray::const_iterator &here,
298 UCArray::const_iterator end,
299 int defaultBoolCombine,
300 int defaultStemMethod) {
301 LexEl el;
302
303 UCArray::const_iterator oldHere = here;
304 if (!ParseLexEl (here, end, el)) return NULL;
305
306 if (el.lexType == OpenBracketE)
307 return ParseBracketExpression (here, end, defaultBoolCombine,
308 defaultStemMethod);
309
310 ProxMatchQueryNode *proxNode = new ProxMatchQueryNode;
311
312 if (el.lexType == TermE || el.lexType == IntegerE) {
313 TermNode termNode;
314 termNode.term = el.text;
315 ParseTermModifiers (here, end, termNode, defaultStemMethod);
316 oldHere = here; // dont backtrack past here
317 if (ParseLexEl(here, end, el) && el.lexType == NearOpE) {
318 delete proxNode;
319 proxNode = (ProxMatchQueryNode *)ParseTerm(here, end, defaultBoolCombine,
320 defaultStemMethod);
321 SetRangeValues(termNode, el.text);
322 proxNode->terms.push_back (termNode);
323 return proxNode;
324 }
325 else {
326 here = oldHere; // backtrack
327 proxNode->terms.push_back (termNode);
328 ParseProxModifiers(here, end, proxNode);
329 return proxNode;
330 }
331 } else if (el.lexType == QuoteE) {
332 ParsePhrase (here, end, *proxNode, defaultStemMethod);
333 return proxNode;
334 }
335 else if (el.lexType == OpenSquareBracketE) {
336 ParseSquareBrackets (here, end, proxNode, defaultStemMethod);
337 ParseProxModifiers (here, end, proxNode);
338 return proxNode;
339 }
340
341 // not a term
342 here = oldHere;
343 delete proxNode;
344 return NULL;
345}
346
347
348static QueryNode *ParseExpression (UCArray::const_iterator &here,
349 UCArray::const_iterator end,
350 int defaultBoolCombine,
351 int defaultStemMethod) {
352 LexEl el;
353 QueryNode *curTree = NULL;
354
355 UCArray::const_iterator oldHere = here;
356 while (ParseLexEl (here, end, el)) {
357 if (el.lexType == OpenSquareBracketE ||
358 el.lexType == OpenBracketE ||
359 el.lexType == TermE ||
360 el.lexType == QuoteE ||
361 el.lexType == IntegerE ) {
362 // el.lexType == TagE) { //tag at end of term now
363 // some type of term, back track and parse it
364 here = oldHere;
365 // if default==1, AND, else if==0, OR
366 if (defaultBoolCombine) {
367 curTree = AndAdd (curTree, ParseTerm (here, end, defaultBoolCombine,
368 defaultStemMethod));
369 }
370 else {
371 curTree = OrAdd (curTree, ParseTerm (here, end, defaultBoolCombine,
372 defaultStemMethod));
373 }
374
375 } else if (el.lexType == AndOpE) {
376 curTree = AndAdd (curTree, ParseTerm (here, end, defaultBoolCombine,
377 defaultStemMethod));
378
379 } else if (el.lexType == OrOpE) {
380 curTree = OrAdd (curTree, ParseTerm (here, end, defaultBoolCombine,
381 defaultStemMethod));
382
383 } else if (el.lexType == NotOpE) {
384 curTree = NotAdd (curTree, ParseTerm (here, end, defaultBoolCombine,
385 defaultStemMethod));
386
387 } else if (el.lexType == CloseBracketE) {
388 // parsebracketexpression is waiting for the last bracket, so put it back
389 here = oldHere;
390 break;
391
392 } else break;
393
394 oldHere = here;
395 }
396
397 return curTree;
398}
399
400QueryNode *ParseQuery (const UCArray &queryStr, int defaultBoolCombine,
401 int defaultStemMethod) {
402 UCArray::const_iterator here = queryStr.begin();
403 UCArray::const_iterator end = queryStr.end();
404 return ParseExpression (here, end, defaultBoolCombine, defaultStemMethod);
405}
Note: See TracBrowser for help on using the repository browser.