source: trunk/gsdl/src/mgpp/text/GSDLQueryParser.cpp@ 2468

Last change on this file since 2468 was 2468, checked in by sjboddie, 23 years ago

Fiddled about with mgpp to get it compiling on Windows under VC++ 6.0. I
still can't get it to compile under VC++ 4.2 because of some weird
behaviour in STLport.

Also tidied up a little and removed some of the old log information
that was scattered about in some of the files.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.9 KB
Line 
1/**************************************************************************
2 *
3 * QueryParser.cpp -- Query parser for a simple query language
4 * Copyright (C) 2000 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "GSDLQueryParser.h"
23#include "GSDLQueryLex.h"
24
25
26static QueryNode *ParseExpression (UCArray::const_iterator &here,
27 UCArray::const_iterator end,
28 int defaultBoolCOmbine,
29 int defaultStemMethod);
30
31static QueryNode *AndAdd (QueryNode *t1, QueryNode *t2) {
32 if (t1 == NULL) return t2;
33 if (t2 == NULL) return t1;
34
35 AndQueryNode *andNode = new AndQueryNode;
36 andNode->leftNode = t1;
37 andNode->rightNode = t2;
38 return andNode;
39}
40
41static QueryNode *OrAdd (QueryNode *t1, QueryNode *t2) {
42 if (t1 == NULL) return t2;
43 if (t2 == NULL) return t1;
44
45 OrQueryNode *orNode = new OrQueryNode;
46 orNode->leftNode = t1;
47 orNode->rightNode = t2;
48 return orNode;
49}
50
51static QueryNode *NotAdd (QueryNode *t1, QueryNode *t2) {
52 if (t1 == NULL) return t2;
53 if (t2 == NULL) return t1;
54
55 NotQueryNode *notNode = new NotQueryNode;
56 notNode->queryNode = t1;
57 notNode->notNode = t2;
58 return notNode;
59}
60
61// expects the opening bracket to have already been parsed
62// and discarded
63static QueryNode *ParseBracketExpression (UCArray::const_iterator &here,
64 UCArray::const_iterator end,
65 int defaultBoolCombine,
66 int defaultStemMethod) {
67 // get everything in the expression
68 QueryNode *curTree = ParseExpression (here, end, defaultBoolCombine,
69 defaultStemMethod);
70
71 // gobble up tokens until a closing bracket is found
72 // or the end of the string
73 LexEl el;
74 while (ParseLexEl (here, end, el)) {
75 if (el.lexType == CloseBracketE) break;
76 }
77
78 return curTree;
79}
80
81static int ParseInt (UCArray::const_iterator &here,
82 UCArray::const_iterator end) {
83 LexEl el;
84 UCArray::const_iterator oldHere = here;
85 if (ParseLexEl (here, end, el) && el.lexType == IntegerE)
86 return el.num;
87
88 here = oldHere; // not an integer
89 return 0;
90}
91
92// default is within 20 words
93static void SetRangeValues (TermNode &termNode,
94 UCArray &nearby) {
95 UCArray NEARBY; SetCStr(NEARBY, "NEAR");
96 if (nearby == NEARBY) { // no modifier
97 termNode.startRange = -21;
98 termNode.endRange = 20;
99 }
100 else { // extract number
101 UCArray::const_iterator here = nearby.begin()+4;
102 UCArray::const_iterator end = nearby.end();
103 int size=0;
104 while (here != end) {
105 size = size*10 + (*here-'0');
106 here++;
107 }
108 cerr <<"size= "<<size<<endl;
109 termNode.startRange = -1 * (size+1);
110 termNode.endRange = size;
111 }
112}
113
114static unsigned long GetStemMethod(LexEl &el, int defaultStemMethod) {
115 // here expect el to contain some of c,s,i,u
116 // stem method 0 = c,u 00
117 // stem method 1 = i,u 01 - default for DL
118 // stem method 2 = c, s 10
119 // stem method 3 = i,s 11
120
121 unsigned long stem = (unsigned long)defaultStemMethod;
122
123 UCArray::const_iterator here = el.text.begin();
124 UCArray::const_iterator end = el.text.end();
125
126 unsigned char c1 = *here;
127 if (!(c1 == 'c'| c1 == 'i' | c1 == 'u' | c1 == 's'))
128 return 4; // incorrect format
129
130 here++;
131 unsigned char c2 = 'a';
132 if (here !=end) {
133 c2 = *here;
134 if (!(c2 == 'c'| c2 == 'i' | c2 == 'u' | c2 == 's'))
135 return 4; // incorrect format
136 }
137
138 if (c1 == 'i'|| c2=='i') stem |= 1; // set bit 0 to 1
139 if (c1 == 'c' || c2 == 'c') stem &=0xe; //set bit 0 to 0
140 if (c1 == 's'|| c2 == 's') stem |= 2; // set bit 1 to 1
141 if (c1 == 'u' || c2 =='u') stem &=0xd; // set bit 1 to 0
142 cerr << "stem is "<<stem<<endl;
143 return stem;
144}
145
146
147static void ParseTermModifiers (UCArray::const_iterator &here,
148 UCArray::const_iterator end,
149 TermNode &termNode,
150 int defaultStemMethod) {
151
152 termNode.stemMethod = defaultStemMethod;
153
154 LexEl el;
155 UCArray::const_iterator oldHere = here;
156 while (ParseLexEl (here, end, el)) {
157 if (el.lexType == TermWeightE) {
158 termNode.termWeight = ParseInt (here, end);
159
160 } else if (el.lexType == StemMethodE) {
161 oldHere = here;
162 LexEl stem;
163 if (ParseLexEl (here, end, stem) && stem.lexType == TermE) {
164 termNode.stemMethod = GetStemMethod(stem, defaultStemMethod);
165 if (termNode.stemMethod == 4) { // error so backtrack
166 here = oldHere;
167 termNode.stemMethod = (unsigned long)defaultStemMethod;
168 }
169 }else here = oldHere; //ignore - wrong syntax
170
171 } else if (el.lexType == RangeE) {
172 termNode.startRange = ParseInt (here, end);
173 termNode.endRange = ParseInt (here, end);
174
175 } else if (el.lexType == AtE) {
176 termNode.startRange = termNode.endRange = ParseInt (here, end);
177
178 } else {
179 // no term modifiers
180 here = oldHere;
181 break;
182 }
183
184 oldHere = here;
185 }
186}
187
188static void ParseProxModifiers (UCArray::const_iterator &here,
189 UCArray::const_iterator end,
190 ProxMatchQueryNode *proxNode) {
191 // so far only have one - the tag stuff
192 LexEl el;
193 UCArray::const_iterator oldHere = here;
194 while (ParseLexEl (here, end, el)) {
195 if (el.lexType == TagE) {
196 oldHere = here; // don't backtrack past here
197 if (ParseLexEl (here, end, el) && el.lexType == TermE) {
198 proxNode->tagNodePtr = new TagNode;
199 proxNode->tagNodePtr->tagName = el.text;
200
201 }
202 else { // error in tag
203 here = oldHere;
204 }
205 } // TagE
206 // add in other cases here
207 else {
208 // no modifiers
209 here = oldHere;
210 break;
211 }
212 oldHere = here;
213 }//while
214
215
216}
217
218// expects starting brackets to have been parsed
219static void ParseSquareBrackets(UCArray::const_iterator &here,
220 UCArray::const_iterator end,
221 ProxMatchQueryNode *proxNode,
222 int defaultStemMethod) {
223
224 LexEl el;
225 while (ParseLexEl (here, end, el)) {
226 if (el.lexType == TermE || el.lexType == IntegerE) {
227 TermNode termNode;
228 termNode.term = el.text;
229 ParseTermModifiers (here, end, termNode, defaultStemMethod);
230 proxNode->terms.push_back(termNode);
231 }
232 else if (el.lexType == CloseSquareBracketE) {
233 break;
234 }
235 else if (el.lexType == AndOpE) {
236 // ignore, the words are AND'ed anyway
237 cerr << "and inside []\n";
238 }
239 else if (el.lexType == OrOpE) {
240 cerr << "or inside []\n";
241 }
242 else {
243 //error
244 // just ignore for now
245 cerr <<"bad syntax inside []\n";
246 }
247 } // while
248
249}
250// expects the starting quote to have been parsed
251// and discarded
252// is now for exact phrases
253static void ParsePhrase (UCArray::const_iterator &here,
254 UCArray::const_iterator end,
255 ProxMatchQueryNode &proxNode) {
256 LexEl el;
257 bool first = true;
258 while (ParseLexEl (here, end, el)) {
259 if (el.lexType == TermE || el.lexType == IntegerE) {
260 TermNode termNode;
261 termNode.term = el.text;
262 //ParseTermModifiers (here, end, termNode);
263 if (first) {
264 first = false;
265 }
266 else {
267 termNode.startRange = -2;
268 termNode.endRange = -1;
269 }
270 proxNode.terms.push_back (termNode);
271
272 } else if (el.lexType == QuoteE) {
273 break;
274
275 } else {
276 // error
277 break;
278 }
279 }
280}
281
282static QueryNode *ParseTerm (UCArray::const_iterator &here,
283 UCArray::const_iterator end,
284 int defaultBoolCombine,
285 int defaultStemMethod) {
286 LexEl el;
287
288 UCArray::const_iterator oldHere = here;
289 if (!ParseLexEl (here, end, el)) return NULL;
290
291 if (el.lexType == OpenBracketE)
292 return ParseBracketExpression (here, end, defaultBoolCombine,
293 defaultStemMethod);
294
295 ProxMatchQueryNode *proxNode = new ProxMatchQueryNode;
296
297 if (el.lexType == TermE || el.lexType == IntegerE) {
298 TermNode termNode;
299 termNode.term = el.text;
300 ParseTermModifiers (here, end, termNode, defaultStemMethod);
301 oldHere = here; // dont backtrack past here
302 if (ParseLexEl(here, end, el) && el.lexType == NearOpE) {
303 delete proxNode;
304 proxNode = (ProxMatchQueryNode *)ParseTerm(here, end, defaultBoolCombine,
305 defaultStemMethod);
306 SetRangeValues(termNode, el.text);
307 proxNode->terms.push_back (termNode);
308 return proxNode;
309 }
310 else {
311 here = oldHere; // backtrack
312 proxNode->terms.push_back (termNode);
313 ParseProxModifiers(here, end, proxNode);
314 return proxNode;
315 }
316 } else if (el.lexType == QuoteE) {
317 ParsePhrase (here, end, *proxNode);
318 return proxNode;
319 }
320 else if (el.lexType == OpenSquareBracketE) {
321 ParseSquareBrackets (here, end, proxNode, defaultStemMethod);
322 ParseProxModifiers (here, end, proxNode);
323 return proxNode;
324 }
325
326 // not a term
327 here = oldHere;
328 delete proxNode;
329 return NULL;
330}
331
332
333static QueryNode *ParseExpression (UCArray::const_iterator &here,
334 UCArray::const_iterator end,
335 int defaultBoolCombine,
336 int defaultStemMethod) {
337 LexEl el;
338 QueryNode *curTree = NULL;
339
340 UCArray::const_iterator oldHere = here;
341 while (ParseLexEl (here, end, el)) {
342 if (el.lexType == OpenSquareBracketE ||
343 el.lexType == OpenBracketE ||
344 el.lexType == TermE ||
345 el.lexType == QuoteE ||
346 el.lexType == IntegerE ) {
347 // el.lexType == TagE) { //tag at end of term now
348 // some type of term, back track and parse it
349 here = oldHere;
350 // if default==1, AND, else if==0, OR
351 if (defaultBoolCombine) {
352 curTree = AndAdd (curTree, ParseTerm (here, end, defaultBoolCombine,
353 defaultStemMethod));
354 }
355 else {
356 curTree = OrAdd (curTree, ParseTerm (here, end, defaultBoolCombine,
357 defaultStemMethod));
358 }
359
360 } else if (el.lexType == AndOpE) {
361 curTree = AndAdd (curTree, ParseTerm (here, end, defaultBoolCombine,
362 defaultStemMethod));
363
364 } else if (el.lexType == OrOpE) {
365 curTree = OrAdd (curTree, ParseTerm (here, end, defaultBoolCombine,
366 defaultStemMethod));
367
368 } else if (el.lexType == NotOpE) {
369 curTree = NotAdd (curTree, ParseTerm (here, end, defaultBoolCombine,
370 defaultStemMethod));
371
372 } else if (el.lexType == CloseBracketE) {
373 // parsebracketexpression is waiting for the last bracket, so put it back
374 here = oldHere;
375 break;
376
377 } else break;
378
379 oldHere = here;
380 }
381
382 return curTree;
383}
384
385QueryNode *ParseQuery (const UCArray &queryStr, int defaultBoolCombine,
386 int defaultStemMethod) {
387 UCArray::const_iterator here = queryStr.begin();
388 UCArray::const_iterator end = queryStr.end();
389 return ParseExpression (here, end, defaultBoolCombine, defaultStemMethod);
390}
Note: See TracBrowser for help on using the repository browser.