source: trunk/gsdl/src/mgpp/text/GSDLQueryLex.cpp@ 8691

Last change on this file since 8691 was 8691, checked in by kjdon, 19 years ago

Added the changes from Emanuel Dejanu (Simple Words) - mostly efficiency changes. For example, changing i++ to ++i, delete xxx to delete []xxx, some stuff to do with UCArrays...

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.3 KB
Line 
1/**************************************************************************
2 *
3 * GSDLQueryLex.cpp -- Lexical analyser for a simple query language
4 * Copyright (C) 2000 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "GSDLQueryLex.h"
23#include "unitool.h"
24#include "words.h"
25
26inline void AddNChar (UCArray::const_iterator &here,
27 UCArray &text,
28 int len) {
29 if (text.capacity() < text.size() + len + 1) {
30 text.reserve(text.size() + len + 1);
31 }
32 while (len > 0) {
33 text.push_back (*here++);
34 --len;
35 }
36}
37
38static bool ParseInteger (UCArray::const_iterator &here,
39 UCArray::const_iterator end,
40 LexEl &el) {
41 el.Clear();
42
43 // this version of end is used in unitool
44 // UCArray::const_iterator endMinus1 = end-1;
45 const unsigned char* endMinus1 = &(*end)-1;
46
47 int charLen;
48 unsigned short c; // one character lookahead
49 charLen = parse_utf8_char (&*here, endMinus1, &c);
50
51 // check for positive or negative
52 bool neg = false;
53 if (c == '+') {
54 AddNChar (here, el.text, charLen);
55 charLen = parse_utf8_char (&*here, endMinus1, &c);
56 } else if (c == '-') {
57 neg = true;
58 AddNChar (here, el.text, charLen);
59 charLen = parse_utf8_char (&*here, endMinus1, &c);
60 }
61
62 // read in number part
63 int numeric=0;
64 el.num = 0;
65 el.lexType = IntegerE;
66 /* stop integers at 4 digits */
67 while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
68 el.num = el.num*10 + c - '0';
69 AddNChar (here, el.text, charLen);
70 charLen = parse_utf8_char (&*here, endMinus1, &c);
71 }
72
73 if (neg) el.num *= -1;
74
75 return (!el.text.empty());
76}
77
78static bool ParsePotentialInteger(UCArray::const_iterator &here,
79 UCArray::const_iterator end,
80 LexEl &el) {
81 el.Clear();
82
83 // this version of end is used in unitool
84 //UCArray::const_iterator endMinus1 = end-1;
85 const unsigned char* endMinus1 = &(*end)-1;
86
87 int charLen=0;
88 int length=0;
89 unsigned short c; // one character lookahead
90 charLen = parse_utf8_char (&*here, endMinus1, &c);
91
92 // read in number part
93 int numeric=0;
94 el.num = 0;
95 el.lexType = IntegerE;
96 /* stop integers at 4 digits */
97 while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
98 el.num = el.num*10 + c - '0';
99 AddNChar (here, el.text, charLen);
100 length += charLen;
101 charLen = parse_utf8_char (&*here, endMinus1, &c);
102 }
103 // check the next character -if it is a letter, then have a term, not an integer
104 if (!is_unicode_letter(c)) {
105 // this was just an integer
106 return (!el.text.empty());
107 }
108 // else its a term
109 el.lexType = TermE;
110 el.num = 0;
111 /* this bit taken from ParseIndexWord in words.h*/
112 while (length+charLen<=MAXSTEMLEN && charLen > 0 &&
113 (is_unicode_letter(c) || (is_unicode_digit(c) &&
114 ++numeric <= MAXNUMERIC))) {
115 AddNChar (here, el.text, charLen);
116 length += charLen;
117 charLen = parse_utf8_char (&*here, endMinus1, &c);
118 }
119
120 return (!el.text.empty());
121}
122static bool ParseTerm (UCArray::const_iterator &here,
123 UCArray::const_iterator end,
124 UCArray &text) {
125 //UCArray::const_iterator endMinus1 = end-1;
126 const unsigned char* endMinus1 = &(*end)-1;
127 const unsigned char* new_here = ParseIndexWord (&*here, endMinus1, text);
128 here += (new_here - &*here); // advance iterator by number of chars advanced
129 return !text.empty();
130}
131
132
133bool ParseLexEl (UCArray::const_iterator &here,
134 UCArray::const_iterator end,
135 LexEl &el) {
136 el.Clear();
137
138 // strange things can happen if here == end == 0
139 if (here == end) return false;
140
141 // this version of end is used in unitool
142 //UCArray::const_iterator endMinus1 = end-1;
143 const unsigned char* endMinus1 = &(*end)-1;
144
145 // ignore all white space
146 int charLen;
147 unsigned short c; // one character lookahead
148 charLen = parse_utf8_char (&*here, endMinus1, &c);
149 while (here != end && is_unicode_space (c)) {
150 here += charLen;
151 charLen = parse_utf8_char (&*here, endMinus1, &c);
152 }
153 if (here == end) return false;
154
155 if (c == '(') {
156 el.lexType = OpenBracketE;
157 AddNChar (here, el.text, charLen);
158 return true;
159
160 } else if (c == ')') {
161 el.lexType = CloseBracketE;
162 AddNChar (here, el.text, charLen);
163 return true;
164
165 } else if (c =='[') {
166 el.lexType = OpenSquareBracketE;
167 AddNChar (here, el.text, charLen);
168 return true;
169
170 } else if (c ==']') {
171 el.lexType = CloseSquareBracketE;
172 AddNChar (here, el.text, charLen);
173 return true;
174
175 } else if (c == '\"') {
176 el.lexType = QuoteE;
177 AddNChar (here, el.text, charLen);
178 return true;
179
180 } else if (c == '/') {
181 el.lexType = TermWeightE;
182 AddNChar (here, el.text, charLen);
183 return true;
184
185 } else if (c == '#') {
186 el.lexType = StemMethodE;
187 AddNChar (here, el.text, charLen);
188 return true;
189
190 } else if (c == '*') {
191 el.lexType = StarE;
192 AddNChar (here, el.text, charLen);
193 return true;
194
195 } else if (c == '^') {
196 el.lexType = RangeE;
197 AddNChar (here, el.text, charLen);
198 return true;
199
200 } else if (c == '@') {
201 el.lexType = AtE;
202 AddNChar (here, el.text, charLen);
203 return true;
204
205 } else if (c == ':') {
206 el.lexType = TagE;
207 AddNChar (here, el.text, charLen);
208 return true;
209
210 } else if (c=='&') {
211 el.lexType = AndOpE;
212 AddNChar (here, el.text, charLen);
213 return true;
214
215 } else if (c == '|') {
216 el.lexType = OrOpE;
217 AddNChar (here, el.text, charLen);
218 return true;
219
220 } else if (c == '!') {
221 el.lexType = NotOpE;
222 AddNChar (here, el.text, charLen);
223 return true;
224
225 } else if (c == '+' || c == '-' ) {
226 return ParseInteger (here, end, el);
227 }
228
229 else if (c >= '0' && c <= '9') {
230 return ParsePotentialInteger (here, end, el);
231 }
232
233 // assume it is a term of some sort
234 if (!ParseTerm (here, end, el.text)) {
235 // parse term returns false if it hasn't parsed anything that is a term
236 // here should be the same as it was before
237 el.lexType = UnknownE;
238 AddNChar (here, el.text, charLen);
239 return true;
240 }
241 //return false;
242
243 //UCArray AND; SetCStr (AND, "AND");
244 //if (el.text == AND) {
245 if (UCArrayCStrEquals(el.text, "AND")) {
246 el.lexType = AndOpE;
247 return true;
248 }
249 //UCArray OR; SetCStr (OR, "OR");
250 //if (el.text == OR) {
251 if (UCArrayCStrEquals(el.text, "OR")) {
252 el.lexType = OrOpE;
253 return true;
254 }
255 //UCArray NOT; SetCStr (NOT, "NOT");
256 //if (el.text == NOT) {
257 if (UCArrayCStrEquals(el.text, "NOT")) {
258 el.lexType = NotOpE;
259 return true;
260 }
261 UCArray NEAR; SetCStr (NEAR, "NEAR", 4);
262 if (PrefixLen(el.text, NEAR)==4) {
263 el.lexType = NearOpE;
264 return true;
265 }
266 UCArray WITHIN; SetCStr (WITHIN, "WITHIN", 6);
267 if (PrefixLen(el.text, WITHIN)==6) {
268 el.lexType = WithinOpE;
269 return true;
270 }
271
272 el.lexType = TermE;
273 return true;
274}
275
Note: See TracBrowser for help on using the repository browser.