source: trunk/gsdl/src/mgpp/text/GSDLQueryLex.cpp@ 8691

Last change on this file since 8691 was 8691, checked in by kjdon, 19 years ago

Added the changes from Emanuel Dejanu (Simple Words) - mostly efficiency changes. For example, changing i++ to ++i, delete xxx to delete []xxx, some stuff to do with UCArrays...

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.3 KB
RevLine 
[1127]1/**************************************************************************
2 *
3 * GSDLQueryLex.cpp -- Lexical analyser for a simple query language
4 * Copyright (C) 2000 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "GSDLQueryLex.h"
23#include "unitool.h"
24#include "words.h"
25
26inline void AddNChar (UCArray::const_iterator &here,
27 UCArray &text,
28 int len) {
[8691]29 if (text.capacity() < text.size() + len + 1) {
30 text.reserve(text.size() + len + 1);
31 }
[1127]32 while (len > 0) {
33 text.push_back (*here++);
[8691]34 --len;
[1127]35 }
36}
37
38static bool ParseInteger (UCArray::const_iterator &here,
39 UCArray::const_iterator end,
40 LexEl &el) {
41 el.Clear();
42
43 // this version of end is used in unitool
[3008]44 // UCArray::const_iterator endMinus1 = end-1;
45 const unsigned char* endMinus1 = &(*end)-1;
[1127]46
47 int charLen;
48 unsigned short c; // one character lookahead
[3008]49 charLen = parse_utf8_char (&*here, endMinus1, &c);
[1127]50
51 // check for positive or negative
52 bool neg = false;
53 if (c == '+') {
54 AddNChar (here, el.text, charLen);
[3008]55 charLen = parse_utf8_char (&*here, endMinus1, &c);
[1127]56 } else if (c == '-') {
57 neg = true;
58 AddNChar (here, el.text, charLen);
[3008]59 charLen = parse_utf8_char (&*here, endMinus1, &c);
[1127]60 }
61
62 // read in number part
[2693]63 int numeric=0;
[1127]64 el.num = 0;
65 el.lexType = IntegerE;
[2693]66 /* stop integers at 4 digits */
67 while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
[1127]68 el.num = el.num*10 + c - '0';
69 AddNChar (here, el.text, charLen);
[3008]70 charLen = parse_utf8_char (&*here, endMinus1, &c);
[1127]71 }
72
73 if (neg) el.num *= -1;
74
75 return (!el.text.empty());
76}
77
[2693]78static bool ParsePotentialInteger(UCArray::const_iterator &here,
79 UCArray::const_iterator end,
80 LexEl &el) {
81 el.Clear();
82
83 // this version of end is used in unitool
[3008]84 //UCArray::const_iterator endMinus1 = end-1;
85 const unsigned char* endMinus1 = &(*end)-1;
[2693]86
87 int charLen=0;
88 int length=0;
89 unsigned short c; // one character lookahead
[3008]90 charLen = parse_utf8_char (&*here, endMinus1, &c);
[2693]91
92 // read in number part
93 int numeric=0;
94 el.num = 0;
95 el.lexType = IntegerE;
96 /* stop integers at 4 digits */
97 while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
98 el.num = el.num*10 + c - '0';
99 AddNChar (here, el.text, charLen);
100 length += charLen;
[3008]101 charLen = parse_utf8_char (&*here, endMinus1, &c);
[2693]102 }
103 // check the next character -if it is a letter, then have a term, not an integer
104 if (!is_unicode_letter(c)) {
105 // this was just an integer
106 return (!el.text.empty());
107 }
108 // else its a term
109 el.lexType = TermE;
110 el.num = 0;
111 /* this bit taken from ParseIndexWord in words.h*/
112 while (length+charLen<=MAXSTEMLEN && charLen > 0 &&
113 (is_unicode_letter(c) || (is_unicode_digit(c) &&
114 ++numeric <= MAXNUMERIC))) {
115 AddNChar (here, el.text, charLen);
116 length += charLen;
[3008]117 charLen = parse_utf8_char (&*here, endMinus1, &c);
[2693]118 }
119
120 return (!el.text.empty());
121}
[1127]122static bool ParseTerm (UCArray::const_iterator &here,
123 UCArray::const_iterator end,
124 UCArray &text) {
[3008]125 //UCArray::const_iterator endMinus1 = end-1;
126 const unsigned char* endMinus1 = &(*end)-1;
127 const unsigned char* new_here = ParseIndexWord (&*here, endMinus1, text);
128 here += (new_here - &*here); // advance iterator by number of chars advanced
[1127]129 return !text.empty();
130}
131
132
133bool ParseLexEl (UCArray::const_iterator &here,
134 UCArray::const_iterator end,
135 LexEl &el) {
136 el.Clear();
137
138 // strange things can happen if here == end == 0
139 if (here == end) return false;
140
141 // this version of end is used in unitool
[3008]142 //UCArray::const_iterator endMinus1 = end-1;
143 const unsigned char* endMinus1 = &(*end)-1;
144
[1127]145 // ignore all white space
146 int charLen;
147 unsigned short c; // one character lookahead
[3008]148 charLen = parse_utf8_char (&*here, endMinus1, &c);
[1127]149 while (here != end && is_unicode_space (c)) {
150 here += charLen;
[3008]151 charLen = parse_utf8_char (&*here, endMinus1, &c);
[1127]152 }
153 if (here == end) return false;
154
155 if (c == '(') {
156 el.lexType = OpenBracketE;
157 AddNChar (here, el.text, charLen);
158 return true;
159
160 } else if (c == ')') {
161 el.lexType = CloseBracketE;
162 AddNChar (here, el.text, charLen);
163 return true;
164
165 } else if (c =='[') {
166 el.lexType = OpenSquareBracketE;
167 AddNChar (here, el.text, charLen);
168 return true;
169
170 } else if (c ==']') {
171 el.lexType = CloseSquareBracketE;
172 AddNChar (here, el.text, charLen);
173 return true;
174
175 } else if (c == '\"') {
176 el.lexType = QuoteE;
177 AddNChar (here, el.text, charLen);
178 return true;
179
180 } else if (c == '/') {
181 el.lexType = TermWeightE;
182 AddNChar (here, el.text, charLen);
183 return true;
184
185 } else if (c == '#') {
186 el.lexType = StemMethodE;
187 AddNChar (here, el.text, charLen);
188 return true;
189
[8244]190 } else if (c == '*') {
191 el.lexType = StarE;
192 AddNChar (here, el.text, charLen);
193 return true;
194
[1127]195 } else if (c == '^') {
196 el.lexType = RangeE;
197 AddNChar (here, el.text, charLen);
198 return true;
199
200 } else if (c == '@') {
201 el.lexType = AtE;
202 AddNChar (here, el.text, charLen);
203 return true;
204
205 } else if (c == ':') {
206 el.lexType = TagE;
207 AddNChar (here, el.text, charLen);
208 return true;
209
210 } else if (c=='&') {
211 el.lexType = AndOpE;
212 AddNChar (here, el.text, charLen);
213 return true;
214
215 } else if (c == '|') {
216 el.lexType = OrOpE;
217 AddNChar (here, el.text, charLen);
218 return true;
219
220 } else if (c == '!') {
221 el.lexType = NotOpE;
222 AddNChar (here, el.text, charLen);
223 return true;
224
[2693]225 } else if (c == '+' || c == '-' ) {
226 return ParseInteger (here, end, el);
[1127]227 }
228
[2693]229 else if (c >= '0' && c <= '9') {
230 return ParsePotentialInteger (here, end, el);
231 }
232
[1127]233 // assume it is a term of some sort
[6121]234 if (!ParseTerm (here, end, el.text)) {
[5448]235 // parse term returns false if it hasn't parsed anything that is a term
236 // here should be the same as it was before
237 el.lexType = UnknownE;
238 AddNChar (here, el.text, charLen);
239 return true;
240 }
241 //return false;
[1127]242
[8691]243 //UCArray AND; SetCStr (AND, "AND");
244 //if (el.text == AND) {
245 if (UCArrayCStrEquals(el.text, "AND")) {
[1127]246 el.lexType = AndOpE;
247 return true;
248 }
[8691]249 //UCArray OR; SetCStr (OR, "OR");
250 //if (el.text == OR) {
251 if (UCArrayCStrEquals(el.text, "OR")) {
[1127]252 el.lexType = OrOpE;
253 return true;
254 }
[8691]255 //UCArray NOT; SetCStr (NOT, "NOT");
256 //if (el.text == NOT) {
257 if (UCArrayCStrEquals(el.text, "NOT")) {
[1127]258 el.lexType = NotOpE;
259 return true;
260 }
[8691]261 UCArray NEAR; SetCStr (NEAR, "NEAR", 4);
[1127]262 if (PrefixLen(el.text, NEAR)==4) {
263 el.lexType = NearOpE;
264 return true;
265 }
[8691]266 UCArray WITHIN; SetCStr (WITHIN, "WITHIN", 6);
[6121]267 if (PrefixLen(el.text, WITHIN)==6) {
268 el.lexType = WithinOpE;
269 return true;
270 }
271
[1127]272 el.lexType = TermE;
273 return true;
274}
275
Note: See TracBrowser for help on using the repository browser.