source: indexers/trunk/mgpp/text/GSDLQueryLex.cpp@ 18340

Last change on this file since 18340 was 18340, checked in by davidb, 15 years ago

Extra test added inside while loop to prevent parser trying to read past end of string

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.4 KB
Line 
1/**************************************************************************
2 *
3 * GSDLQueryLex.cpp -- Lexical analyser for a simple query language
4 * Copyright (C) 2000 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "GSDLQueryLex.h"
23#include "unitool.h"
24#include "words.h"
25
26inline void AddNChar (UCArray::const_iterator &here,
27 UCArray &text,
28 int len) {
29 if (text.capacity() < text.size() + len + 1) {
30 text.reserve(text.size() + len + 1);
31 }
32 while (len > 0) {
33 text.push_back (*here++);
34 --len;
35 }
36}
37
38static bool ParseInteger (UCArray::const_iterator &here,
39 UCArray::const_iterator end,
40 LexEl &el) {
41 el.Clear();
42
43 // this version of end is used in unitool
44 // UCArray::const_iterator endMinus1 = end-1;
45 const unsigned char* endMinus1 = &*(end - 1);
46
47 int charLen;
48 unsigned short c; // one character lookahead
49 charLen = parse_utf8_char (&*here, endMinus1, &c);
50
51 // check for positive or negative
52 bool neg = false;
53 if (c == '+') {
54 AddNChar (here, el.text, charLen);
55 charLen = parse_utf8_char (&*here, endMinus1, &c);
56 } else if (c == '-') {
57 neg = true;
58 AddNChar (here, el.text, charLen);
59 charLen = parse_utf8_char (&*here, endMinus1, &c);
60 }
61
62 // read in number part
63 int numeric=0;
64 el.num = 0;
65 el.lexType = IntegerE;
66 /* stop integers at 4 digits */
67 while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
68 el.num = el.num*10 + c - '0';
69 AddNChar (here, el.text, charLen);
70 charLen = parse_utf8_char (&*here, endMinus1, &c);
71 }
72
73 if (neg) el.num *= -1;
74
75 return (!el.text.empty());
76}
77
78static bool ParsePotentialInteger(UCArray::const_iterator &here,
79 UCArray::const_iterator end,
80 LexEl &el) {
81 el.Clear();
82
83 // this version of end is used in unitool
84 //UCArray::const_iterator endMinus1 = end-1;
85 const unsigned char* endMinus1 = &*(end - 1);
86
87 int charLen=0;
88 int length=0;
89 unsigned short c; // one character lookahead
90 charLen = parse_utf8_char (&*here, endMinus1, &c);
91
92 // read in number part
93 int numeric=0;
94 el.num = 0;
95 el.lexType = IntegerE;
96 /* stop integers at 4 digits */
97 while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
98 el.num = el.num*10 + c - '0';
99 AddNChar (here, el.text, charLen);
100 length += charLen;
101 charLen = parse_utf8_char (&*here, endMinus1, &c);
102 }
103 // check the next character -if it is a letter, then have a term, not an integer
104 if (!is_unicode_letter(c)) {
105 // this was just an integer
106 return (!el.text.empty());
107 }
108 // else its a term
109 el.lexType = TermE;
110 el.num = 0;
111 /* this bit taken from ParseIndexWord in words.h*/
112 while (length+charLen<=MAXSTEMLEN && charLen > 0 &&
113 (is_unicode_letter(c) || (is_unicode_digit(c) &&
114 ++numeric <= MAXNUMERIC))) {
115 AddNChar (here, el.text, charLen);
116 length += charLen;
117 charLen = parse_utf8_char (&*here, endMinus1, &c);
118 }
119
120 return (!el.text.empty());
121}
122static bool ParseTerm (UCArray::const_iterator &here,
123 UCArray::const_iterator end,
124 UCArray &text) {
125 if (here == end)
126 return false;
127
128 //UCArray::const_iterator endMinus1 = end-1;
129 const unsigned char* endMinus1 = &*(end - 1);
130 const unsigned char* new_here = ParseIndexWord (&*here, endMinus1, text);
131 here += (new_here - &*here); // advance iterator by number of chars advanced
132 return !text.empty();
133}
134
135
136bool ParseLexEl (UCArray::const_iterator &here,
137 UCArray::const_iterator end,
138 LexEl &el) {
139 el.Clear();
140
141 // strange things can happen if here == end == 0
142 if (here == end)
143 return false;
144
145 // this version of end is used in unitool
146 //UCArray::const_iterator endMinus1 = end-1;
147 const unsigned char* endMinus1 = &*(end - 1);
148
149 // ignore all white space
150 int charLen;
151 unsigned short c; // one character lookahead
152 charLen = parse_utf8_char (&*here, endMinus1, &c);
153 while (here != end && is_unicode_space (c)) {
154 here += charLen;
155 if (here == end) break;
156 charLen = parse_utf8_char (&*here, endMinus1, &c);
157 }
158 if (here == end) return false;
159
160 if (c == '(') {
161 el.lexType = OpenBracketE;
162 AddNChar (here, el.text, charLen);
163 return true;
164
165 } else if (c == ')') {
166 el.lexType = CloseBracketE;
167 AddNChar (here, el.text, charLen);
168 return true;
169
170 } else if (c =='[') {
171 el.lexType = OpenSquareBracketE;
172 AddNChar (here, el.text, charLen);
173 return true;
174
175 } else if (c ==']') {
176 el.lexType = CloseSquareBracketE;
177 AddNChar (here, el.text, charLen);
178 return true;
179
180 } else if (c == '\"') {
181 el.lexType = QuoteE;
182 AddNChar (here, el.text, charLen);
183 return true;
184
185 } else if (c == '/') {
186 el.lexType = TermWeightE;
187 AddNChar (here, el.text, charLen);
188 return true;
189
190 } else if (c == '#') {
191 el.lexType = StemMethodE;
192 AddNChar (here, el.text, charLen);
193 return true;
194
195 } else if (c == '*') {
196 el.lexType = StarE;
197 AddNChar (here, el.text, charLen);
198 return true;
199
200 } else if (c == '^') {
201 el.lexType = RangeE;
202 AddNChar (here, el.text, charLen);
203 return true;
204
205 } else if (c == '@') {
206 el.lexType = AtE;
207 AddNChar (here, el.text, charLen);
208 return true;
209
210 } else if (c == ':') {
211 el.lexType = TagE;
212 AddNChar (here, el.text, charLen);
213 return true;
214
215 } else if (c=='&') {
216 el.lexType = AndOpE;
217 AddNChar (here, el.text, charLen);
218 return true;
219
220 } else if (c == '|') {
221 el.lexType = OrOpE;
222 AddNChar (here, el.text, charLen);
223 return true;
224
225 } else if (c == '!') {
226 el.lexType = NotOpE;
227 AddNChar (here, el.text, charLen);
228 return true;
229
230 } else if (c == '+' || c == '-' ) {
231 return ParseInteger (here, end, el);
232 }
233
234 else if (c >= '0' && c <= '9') {
235 return ParsePotentialInteger (here, end, el);
236 }
237
238 // assume it is a term of some sort
239 if (!ParseTerm (here, end, el.text)) {
240 // parse term returns false if it hasn't parsed anything that is a term
241 // here should be the same as it was before
242 el.lexType = UnknownE;
243 AddNChar (here, el.text, charLen);
244 return true;
245 }
246 //return false;
247
248 //UCArray AND; SetCStr (AND, "AND");
249 //if (el.text == AND) {
250 if (UCArrayCStrEquals(el.text, "AND")) {
251 el.lexType = AndOpE;
252 return true;
253 }
254 //UCArray OR; SetCStr (OR, "OR");
255 //if (el.text == OR) {
256 if (UCArrayCStrEquals(el.text, "OR")) {
257 el.lexType = OrOpE;
258 return true;
259 }
260 //UCArray NOT; SetCStr (NOT, "NOT");
261 //if (el.text == NOT) {
262 if (UCArrayCStrEquals(el.text, "NOT")) {
263 el.lexType = NotOpE;
264 return true;
265 }
266 UCArray NEAR; SetCStr (NEAR, "NEAR", 4);
267 if (PrefixLen(el.text, NEAR)==4) {
268 el.lexType = NearOpE;
269 return true;
270 }
271 UCArray WITHIN; SetCStr (WITHIN, "WITHIN", 6);
272 if (PrefixLen(el.text, WITHIN)==6) {
273 el.lexType = WithinOpE;
274 return true;
275 }
276
277 el.lexType = TermE;
278 return true;
279}
280
Note: See TracBrowser for help on using the repository browser.