source: main/trunk/greenstone2/common-src/indexers/mgpp/text/GSDLQueryLex.cpp@ 25526

Last change on this file since 25526 was 25526, checked in by ak19, 12 years ago

Dr Bainbridge fixed the problem noticed by Diego and which was thought to be fixed earlier. It had to do with searching for (3 digit) numbers, however Diego particularly experienced the problem when trying the ifl=1 argument (I Feel Lucky) for searching.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.6 KB
RevLine 
[3365]1/**************************************************************************
2 *
3 * GSDLQueryLex.cpp -- Lexical analyser for a simple query language
4 * Copyright (C) 2000 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "GSDLQueryLex.h"
23#include "unitool.h"
24#include "words.h"
25
26inline void AddNChar (UCArray::const_iterator &here,
27 UCArray &text,
28 int len) {
[8692]29 if (text.capacity() < text.size() + len + 1) {
30 text.reserve(text.size() + len + 1);
31 }
[3365]32 while (len > 0) {
33 text.push_back (*here++);
[8692]34 --len;
[3365]35 }
36}
37
38static bool ParseInteger (UCArray::const_iterator &here,
39 UCArray::const_iterator end,
40 LexEl &el) {
41 el.Clear();
42
43 // this version of end is used in unitool
44 // UCArray::const_iterator endMinus1 = end-1;
[12318]45 const unsigned char* endMinus1 = &*(end - 1);
[3365]46
47 int charLen;
48 unsigned short c; // one character lookahead
49 charLen = parse_utf8_char (&*here, endMinus1, &c);
50
51 // check for positive or negative
52 bool neg = false;
53 if (c == '+') {
54 AddNChar (here, el.text, charLen);
55 charLen = parse_utf8_char (&*here, endMinus1, &c);
56 } else if (c == '-') {
57 neg = true;
58 AddNChar (here, el.text, charLen);
59 charLen = parse_utf8_char (&*here, endMinus1, &c);
60 }
61
62 // read in number part
63 int numeric=0;
64 el.num = 0;
65 el.lexType = IntegerE;
66 /* stop integers at 4 digits */
67 while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
68 el.num = el.num*10 + c - '0';
69 AddNChar (here, el.text, charLen);
70 charLen = parse_utf8_char (&*here, endMinus1, &c);
71 }
72
73 if (neg) el.num *= -1;
74
75 return (!el.text.empty());
76}
77
78static bool ParsePotentialInteger(UCArray::const_iterator &here,
79 UCArray::const_iterator end,
80 LexEl &el) {
81 el.Clear();
82
83 // this version of end is used in unitool
84 //UCArray::const_iterator endMinus1 = end-1;
[12318]85 const unsigned char* endMinus1 = &*(end - 1);
[3365]86
87 int charLen=0;
88 int length=0;
89 unsigned short c; // one character lookahead
90 charLen = parse_utf8_char (&*here, endMinus1, &c);
91
92 // read in number part
93 int numeric=0;
94 el.num = 0;
95 el.lexType = IntegerE;
[25526]96
[3365]97 /* stop integers at 4 digits */
[25526]98 while (here != end) {
99
100 charLen = parse_utf8_char (&*here, endMinus1, &c);
101 if (c < '0' || c > '9') {
102 // reached a non-digit character
103 break;
104 }
105 el.num = el.num*10 + c - '0';
106 AddNChar (here, el.text, charLen); // advances 'here' by 'charLen'
107 length += charLen;
108
109 numeric++;
110
111 if (numeric == MAXNUMERIC) {
112 // reached the max length of a number
113 break;
114 }
115
[3365]116 }
[25526]117
118
[3365]119 // check the next character -if it is a letter, then have a term, not an integer
120 if (!is_unicode_letter(c)) {
121 // this was just an integer
122 return (!el.text.empty());
123 }
124 // else its a term
125 el.lexType = TermE;
126 el.num = 0;
127 /* this bit taken from ParseIndexWord in words.h*/
128 while (length+charLen<=MAXSTEMLEN && charLen > 0 &&
129 (is_unicode_letter(c) || (is_unicode_digit(c) &&
130 ++numeric <= MAXNUMERIC))) {
131 AddNChar (here, el.text, charLen);
132 length += charLen;
133 charLen = parse_utf8_char (&*here, endMinus1, &c);
134 }
135
136 return (!el.text.empty());
137}
138static bool ParseTerm (UCArray::const_iterator &here,
139 UCArray::const_iterator end,
140 UCArray &text) {
[12318]141 if (here == end)
142 return false;
143
[3365]144 //UCArray::const_iterator endMinus1 = end-1;
[12318]145 const unsigned char* endMinus1 = &*(end - 1);
[3365]146 const unsigned char* new_here = ParseIndexWord (&*here, endMinus1, text);
147 here += (new_here - &*here); // advance iterator by number of chars advanced
148 return !text.empty();
149}
150
151
152bool ParseLexEl (UCArray::const_iterator &here,
153 UCArray::const_iterator end,
154 LexEl &el) {
155 el.Clear();
156
157 // strange things can happen if here == end == 0
[12318]158 if (here == end)
159 return false;
[3365]160
161 // this version of end is used in unitool
162 //UCArray::const_iterator endMinus1 = end-1;
[12318]163 const unsigned char* endMinus1 = &*(end - 1);
[3365]164
165 // ignore all white space
166 int charLen;
167 unsigned short c; // one character lookahead
168 charLen = parse_utf8_char (&*here, endMinus1, &c);
169 while (here != end && is_unicode_space (c)) {
170 here += charLen;
[18340]171 if (here == end) break;
[3365]172 charLen = parse_utf8_char (&*here, endMinus1, &c);
173 }
174 if (here == end) return false;
175
176 if (c == '(') {
177 el.lexType = OpenBracketE;
178 AddNChar (here, el.text, charLen);
179 return true;
180
181 } else if (c == ')') {
182 el.lexType = CloseBracketE;
183 AddNChar (here, el.text, charLen);
184 return true;
185
186 } else if (c =='[') {
187 el.lexType = OpenSquareBracketE;
188 AddNChar (here, el.text, charLen);
189 return true;
190
191 } else if (c ==']') {
192 el.lexType = CloseSquareBracketE;
193 AddNChar (here, el.text, charLen);
194 return true;
195
196 } else if (c == '\"') {
197 el.lexType = QuoteE;
198 AddNChar (here, el.text, charLen);
199 return true;
200
201 } else if (c == '/') {
202 el.lexType = TermWeightE;
203 AddNChar (here, el.text, charLen);
204 return true;
205
206 } else if (c == '#') {
207 el.lexType = StemMethodE;
208 AddNChar (here, el.text, charLen);
209 return true;
210
[8242]211 } else if (c == '*') {
212 el.lexType = StarE;
213 AddNChar (here, el.text, charLen);
214 return true;
215
[3365]216 } else if (c == '^') {
217 el.lexType = RangeE;
218 AddNChar (here, el.text, charLen);
219 return true;
220
221 } else if (c == '@') {
222 el.lexType = AtE;
223 AddNChar (here, el.text, charLen);
224 return true;
225
226 } else if (c == ':') {
227 el.lexType = TagE;
228 AddNChar (here, el.text, charLen);
229 return true;
230
231 } else if (c=='&') {
232 el.lexType = AndOpE;
233 AddNChar (here, el.text, charLen);
234 return true;
235
236 } else if (c == '|') {
237 el.lexType = OrOpE;
238 AddNChar (here, el.text, charLen);
239 return true;
240
241 } else if (c == '!') {
242 el.lexType = NotOpE;
243 AddNChar (here, el.text, charLen);
244 return true;
245
246 } else if (c == '+' || c == '-' ) {
247 return ParseInteger (here, end, el);
248 }
249
250 else if (c >= '0' && c <= '9') {
251 return ParsePotentialInteger (here, end, el);
252 }
253
254 // assume it is a term of some sort
[5449]255 if (!ParseTerm (here, end, el.text)) {
256 // parse term returns false if it hasn't parsed anything that is a term
257 // here should be the same as it was before
258 el.lexType = UnknownE;
259 AddNChar (here, el.text, charLen);
260 return true;
261 }
262 //return false;
[3365]263
[8692]264 //UCArray AND; SetCStr (AND, "AND");
265 //if (el.text == AND) {
266 if (UCArrayCStrEquals(el.text, "AND")) {
[3365]267 el.lexType = AndOpE;
268 return true;
269 }
[8692]270 //UCArray OR; SetCStr (OR, "OR");
271 //if (el.text == OR) {
272 if (UCArrayCStrEquals(el.text, "OR")) {
[3365]273 el.lexType = OrOpE;
274 return true;
275 }
[8692]276 //UCArray NOT; SetCStr (NOT, "NOT");
277 //if (el.text == NOT) {
278 if (UCArrayCStrEquals(el.text, "NOT")) {
[3365]279 el.lexType = NotOpE;
280 return true;
281 }
[8692]282 UCArray NEAR; SetCStr (NEAR, "NEAR", 4);
[3365]283 if (PrefixLen(el.text, NEAR)==4) {
284 el.lexType = NearOpE;
285 return true;
286 }
[8692]287 UCArray WITHIN; SetCStr (WITHIN, "WITHIN", 6);
[6119]288 if (PrefixLen(el.text, WITHIN)==6) {
289 el.lexType = WithinOpE;
290 return true;
291 }
292
[3365]293 el.lexType = TermE;
294 return true;
295}
296
Note: See TracBrowser for help on using the repository browser.