source: trunk/gsdl/src/mgpp/text/GSDLQueryLex.cpp@ 11259

Last change on this file since 11259 was 11259, checked in by mdewsnip, 18 years ago

Various little bug fixes and improvements (many to get things working with Visual Studio 2005), by Emanuel Dejanu.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.4 KB
Line 
/**************************************************************************
 *
 * GSDLQueryLex.cpp -- Lexical analyser for a simple query language
 * Copyright (C) 2000 Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 **************************************************************************/
21
22#include "GSDLQueryLex.h"
23#include "unitool.h"
24#include "words.h"
25
26inline void AddNChar (UCArray::const_iterator &here,
27 UCArray &text,
28 int len) {
29 if (text.capacity() < text.size() + len + 1) {
30 text.reserve(text.size() + len + 1);
31 }
32 while (len > 0) {
33 text.push_back (*here++);
34 --len;
35 }
36}
37
38static bool ParseInteger (UCArray::const_iterator &here,
39 UCArray::const_iterator end,
40 LexEl &el) {
41 el.Clear();
42
43 // this version of end is used in unitool
44 // UCArray::const_iterator endMinus1 = end-1;
45 const unsigned char* endMinus1 = &*(end - 1);
46
47 int charLen;
48 unsigned short c; // one character lookahead
49 charLen = parse_utf8_char (&*here, endMinus1, &c);
50
51 // check for positive or negative
52 bool neg = false;
53 if (c == '+') {
54 AddNChar (here, el.text, charLen);
55 charLen = parse_utf8_char (&*here, endMinus1, &c);
56 } else if (c == '-') {
57 neg = true;
58 AddNChar (here, el.text, charLen);
59 charLen = parse_utf8_char (&*here, endMinus1, &c);
60 }
61
62 // read in number part
63 int numeric=0;
64 el.num = 0;
65 el.lexType = IntegerE;
66 /* stop integers at 4 digits */
67 while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
68 el.num = el.num*10 + c - '0';
69 AddNChar (here, el.text, charLen);
70 charLen = parse_utf8_char (&*here, endMinus1, &c);
71 }
72
73 if (neg) el.num *= -1;
74
75 return (!el.text.empty());
76}
77
78static bool ParsePotentialInteger(UCArray::const_iterator &here,
79 UCArray::const_iterator end,
80 LexEl &el) {
81 el.Clear();
82
83 // this version of end is used in unitool
84 //UCArray::const_iterator endMinus1 = end-1;
85 const unsigned char* endMinus1 = &*(end - 1);
86
87 int charLen=0;
88 int length=0;
89 unsigned short c; // one character lookahead
90 charLen = parse_utf8_char (&*here, endMinus1, &c);
91
92 // read in number part
93 int numeric=0;
94 el.num = 0;
95 el.lexType = IntegerE;
96 /* stop integers at 4 digits */
97 while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
98 el.num = el.num*10 + c - '0';
99 AddNChar (here, el.text, charLen);
100 length += charLen;
101 charLen = parse_utf8_char (&*here, endMinus1, &c);
102 }
103 // check the next character -if it is a letter, then have a term, not an integer
104 if (!is_unicode_letter(c)) {
105 // this was just an integer
106 return (!el.text.empty());
107 }
108 // else its a term
109 el.lexType = TermE;
110 el.num = 0;
111 /* this bit taken from ParseIndexWord in words.h*/
112 while (length+charLen<=MAXSTEMLEN && charLen > 0 &&
113 (is_unicode_letter(c) || (is_unicode_digit(c) &&
114 ++numeric <= MAXNUMERIC))) {
115 AddNChar (here, el.text, charLen);
116 length += charLen;
117 charLen = parse_utf8_char (&*here, endMinus1, &c);
118 }
119
120 return (!el.text.empty());
121}
122static bool ParseTerm (UCArray::const_iterator &here,
123 UCArray::const_iterator end,
124 UCArray &text) {
125 if (here == end)
126 return false;
127
128 //UCArray::const_iterator endMinus1 = end-1;
129 const unsigned char* endMinus1 = &*(end - 1);
130 const unsigned char* new_here = ParseIndexWord (&*here, endMinus1, text);
131 here += (new_here - &*here); // advance iterator by number of chars advanced
132 return !text.empty();
133}
134
135
136bool ParseLexEl (UCArray::const_iterator &here,
137 UCArray::const_iterator end,
138 LexEl &el) {
139 el.Clear();
140
141 // strange things can happen if here == end == 0
142 if (here == end)
143 return false;
144
145 // this version of end is used in unitool
146 //UCArray::const_iterator endMinus1 = end-1;
147 const unsigned char* endMinus1 = &*(end - 1);
148
149 // ignore all white space
150 int charLen;
151 unsigned short c; // one character lookahead
152 charLen = parse_utf8_char (&*here, endMinus1, &c);
153 while (here != end && is_unicode_space (c)) {
154 here += charLen;
155 charLen = parse_utf8_char (&*here, endMinus1, &c);
156 }
157 if (here == end) return false;
158
159 if (c == '(') {
160 el.lexType = OpenBracketE;
161 AddNChar (here, el.text, charLen);
162 return true;
163
164 } else if (c == ')') {
165 el.lexType = CloseBracketE;
166 AddNChar (here, el.text, charLen);
167 return true;
168
169 } else if (c =='[') {
170 el.lexType = OpenSquareBracketE;
171 AddNChar (here, el.text, charLen);
172 return true;
173
174 } else if (c ==']') {
175 el.lexType = CloseSquareBracketE;
176 AddNChar (here, el.text, charLen);
177 return true;
178
179 } else if (c == '\"') {
180 el.lexType = QuoteE;
181 AddNChar (here, el.text, charLen);
182 return true;
183
184 } else if (c == '/') {
185 el.lexType = TermWeightE;
186 AddNChar (here, el.text, charLen);
187 return true;
188
189 } else if (c == '#') {
190 el.lexType = StemMethodE;
191 AddNChar (here, el.text, charLen);
192 return true;
193
194 } else if (c == '*') {
195 el.lexType = StarE;
196 AddNChar (here, el.text, charLen);
197 return true;
198
199 } else if (c == '^') {
200 el.lexType = RangeE;
201 AddNChar (here, el.text, charLen);
202 return true;
203
204 } else if (c == '@') {
205 el.lexType = AtE;
206 AddNChar (here, el.text, charLen);
207 return true;
208
209 } else if (c == ':') {
210 el.lexType = TagE;
211 AddNChar (here, el.text, charLen);
212 return true;
213
214 } else if (c=='&') {
215 el.lexType = AndOpE;
216 AddNChar (here, el.text, charLen);
217 return true;
218
219 } else if (c == '|') {
220 el.lexType = OrOpE;
221 AddNChar (here, el.text, charLen);
222 return true;
223
224 } else if (c == '!') {
225 el.lexType = NotOpE;
226 AddNChar (here, el.text, charLen);
227 return true;
228
229 } else if (c == '+' || c == '-' ) {
230 return ParseInteger (here, end, el);
231 }
232
233 else if (c >= '0' && c <= '9') {
234 return ParsePotentialInteger (here, end, el);
235 }
236
237 // assume it is a term of some sort
238 if (!ParseTerm (here, end, el.text)) {
239 // parse term returns false if it hasn't parsed anything that is a term
240 // here should be the same as it was before
241 el.lexType = UnknownE;
242 AddNChar (here, el.text, charLen);
243 return true;
244 }
245 //return false;
246
247 //UCArray AND; SetCStr (AND, "AND");
248 //if (el.text == AND) {
249 if (UCArrayCStrEquals(el.text, "AND")) {
250 el.lexType = AndOpE;
251 return true;
252 }
253 //UCArray OR; SetCStr (OR, "OR");
254 //if (el.text == OR) {
255 if (UCArrayCStrEquals(el.text, "OR")) {
256 el.lexType = OrOpE;
257 return true;
258 }
259 //UCArray NOT; SetCStr (NOT, "NOT");
260 //if (el.text == NOT) {
261 if (UCArrayCStrEquals(el.text, "NOT")) {
262 el.lexType = NotOpE;
263 return true;
264 }
265 UCArray NEAR; SetCStr (NEAR, "NEAR", 4);
266 if (PrefixLen(el.text, NEAR)==4) {
267 el.lexType = NearOpE;
268 return true;
269 }
270 UCArray WITHIN; SetCStr (WITHIN, "WITHIN", 6);
271 if (PrefixLen(el.text, WITHIN)==6) {
272 el.lexType = WithinOpE;
273 return true;
274 }
275
276 el.lexType = TermE;
277 return true;
278}
279
Note: See TracBrowser for help on using the repository browser.