source: main/branches/64_bit_Greenstone/greenstone2/common-src/indexers/mgpp/text/GSDLQueryLex.cpp@ 25139

Last change on this file since 25139 was 25139, checked in by kjdon, 12 years ago

merged with trunk rev 25137

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.5 KB
Line 
1/**************************************************************************
2 *
3 * GSDLQueryLex.cpp -- Lexical analyser for a simple query language
4 * Copyright (C) 2000 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "GSDLQueryLex.h"
23#include "unitool.h"
24#include "words.h"
25
26inline void AddNChar (UCArray::const_iterator &here,
27 UCArray &text,
28 int len) {
29 if (text.capacity() < text.size() + len + 1) {
30 text.reserve(text.size() + len + 1);
31 }
32 while (len > 0) {
33 text.push_back (*here++);
34 --len;
35 }
36}
37
38static bool ParseInteger (UCArray::const_iterator &here,
39 UCArray::const_iterator end,
40 LexEl &el) {
41 el.Clear();
42
43 // this version of end is used in unitool
44 // UCArray::const_iterator endMinus1 = end-1;
45 const unsigned char* endMinus1 = &*(end - 1);
46
47 int charLen;
48 unsigned short c; // one character lookahead
49 charLen = parse_utf8_char (&*here, endMinus1, &c);
50
51 // check for positive or negative
52 bool neg = false;
53 if (c == '+') {
54 AddNChar (here, el.text, charLen);
55 charLen = parse_utf8_char (&*here, endMinus1, &c);
56 } else if (c == '-') {
57 neg = true;
58 AddNChar (here, el.text, charLen);
59 charLen = parse_utf8_char (&*here, endMinus1, &c);
60 }
61
62 // read in number part
63 int numeric=0;
64 el.num = 0;
65 el.lexType = IntegerE;
66 /* stop integers at 4 digits */
67 while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
68 el.num = el.num*10 + c - '0';
69 AddNChar (here, el.text, charLen);
70 charLen = parse_utf8_char (&*here, endMinus1, &c);
71 }
72
73 if (neg) el.num *= -1;
74
75 return (!el.text.empty());
76}
77
78static bool ParsePotentialInteger(UCArray::const_iterator &here,
79 UCArray::const_iterator end,
80 LexEl &el) {
81 el.Clear();
82
83 // this version of end is used in unitool
84 //UCArray::const_iterator endMinus1 = end-1;
85 const unsigned char* endMinus1 = &*(end - 1);
86
87 int charLen=0;
88 int length=0;
89 unsigned short c; // one character lookahead
90 charLen = parse_utf8_char (&*here, endMinus1, &c);
91
92 // read in number part
93 int numeric=0;
94 el.num = 0;
95 el.lexType = IntegerE;
96 /* stop integers at 4 digits */
97 while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
98 el.num = el.num*10 + c - '0';
99 AddNChar (here, el.text, charLen);
100 length += charLen;
101 if(numeric < MAXNUMERIC) { // server crash bugfix: don't go past the end with the endMinus1 pointer
102 charLen = parse_utf8_char (&*here, endMinus1, &c);
103 }
104 }
105 // check the next character -if it is a letter, then have a term, not an integer
106 if (!is_unicode_letter(c)) {
107 // this was just an integer
108 return (!el.text.empty());
109 }
110 // else its a term
111 el.lexType = TermE;
112 el.num = 0;
113 /* this bit taken from ParseIndexWord in words.h*/
114 while (length+charLen<=MAXSTEMLEN && charLen > 0 &&
115 (is_unicode_letter(c) || (is_unicode_digit(c) &&
116 ++numeric <= MAXNUMERIC))) {
117 AddNChar (here, el.text, charLen);
118 length += charLen;
119 charLen = parse_utf8_char (&*here, endMinus1, &c);
120 }
121
122 return (!el.text.empty());
123}
124static bool ParseTerm (UCArray::const_iterator &here,
125 UCArray::const_iterator end,
126 UCArray &text) {
127 if (here == end)
128 return false;
129
130 //UCArray::const_iterator endMinus1 = end-1;
131 const unsigned char* endMinus1 = &*(end - 1);
132 const unsigned char* new_here = ParseIndexWord (&*here, endMinus1, text);
133 here += (new_here - &*here); // advance iterator by number of chars advanced
134 return !text.empty();
135}
136
137
138bool ParseLexEl (UCArray::const_iterator &here,
139 UCArray::const_iterator end,
140 LexEl &el) {
141 el.Clear();
142
143 // strange things can happen if here == end == 0
144 if (here == end)
145 return false;
146
147 // this version of end is used in unitool
148 //UCArray::const_iterator endMinus1 = end-1;
149 const unsigned char* endMinus1 = &*(end - 1);
150
151 // ignore all white space
152 int charLen;
153 unsigned short c; // one character lookahead
154 charLen = parse_utf8_char (&*here, endMinus1, &c);
155 while (here != end && is_unicode_space (c)) {
156 here += charLen;
157 if (here == end) break;
158 charLen = parse_utf8_char (&*here, endMinus1, &c);
159 }
160 if (here == end) return false;
161
162 if (c == '(') {
163 el.lexType = OpenBracketE;
164 AddNChar (here, el.text, charLen);
165 return true;
166
167 } else if (c == ')') {
168 el.lexType = CloseBracketE;
169 AddNChar (here, el.text, charLen);
170 return true;
171
172 } else if (c =='[') {
173 el.lexType = OpenSquareBracketE;
174 AddNChar (here, el.text, charLen);
175 return true;
176
177 } else if (c ==']') {
178 el.lexType = CloseSquareBracketE;
179 AddNChar (here, el.text, charLen);
180 return true;
181
182 } else if (c == '\"') {
183 el.lexType = QuoteE;
184 AddNChar (here, el.text, charLen);
185 return true;
186
187 } else if (c == '/') {
188 el.lexType = TermWeightE;
189 AddNChar (here, el.text, charLen);
190 return true;
191
192 } else if (c == '#') {
193 el.lexType = StemMethodE;
194 AddNChar (here, el.text, charLen);
195 return true;
196
197 } else if (c == '*') {
198 el.lexType = StarE;
199 AddNChar (here, el.text, charLen);
200 return true;
201
202 } else if (c == '^') {
203 el.lexType = RangeE;
204 AddNChar (here, el.text, charLen);
205 return true;
206
207 } else if (c == '@') {
208 el.lexType = AtE;
209 AddNChar (here, el.text, charLen);
210 return true;
211
212 } else if (c == ':') {
213 el.lexType = TagE;
214 AddNChar (here, el.text, charLen);
215 return true;
216
217 } else if (c=='&') {
218 el.lexType = AndOpE;
219 AddNChar (here, el.text, charLen);
220 return true;
221
222 } else if (c == '|') {
223 el.lexType = OrOpE;
224 AddNChar (here, el.text, charLen);
225 return true;
226
227 } else if (c == '!') {
228 el.lexType = NotOpE;
229 AddNChar (here, el.text, charLen);
230 return true;
231
232 } else if (c == '+' || c == '-' ) {
233 return ParseInteger (here, end, el);
234 }
235
236 else if (c >= '0' && c <= '9') {
237 return ParsePotentialInteger (here, end, el);
238 }
239
240 // assume it is a term of some sort
241 if (!ParseTerm (here, end, el.text)) {
242 // parse term returns false if it hasn't parsed anything that is a term
243 // here should be the same as it was before
244 el.lexType = UnknownE;
245 AddNChar (here, el.text, charLen);
246 return true;
247 }
248 //return false;
249
250 //UCArray AND; SetCStr (AND, "AND");
251 //if (el.text == AND) {
252 if (UCArrayCStrEquals(el.text, "AND")) {
253 el.lexType = AndOpE;
254 return true;
255 }
256 //UCArray OR; SetCStr (OR, "OR");
257 //if (el.text == OR) {
258 if (UCArrayCStrEquals(el.text, "OR")) {
259 el.lexType = OrOpE;
260 return true;
261 }
262 //UCArray NOT; SetCStr (NOT, "NOT");
263 //if (el.text == NOT) {
264 if (UCArrayCStrEquals(el.text, "NOT")) {
265 el.lexType = NotOpE;
266 return true;
267 }
268 UCArray NEAR; SetCStr (NEAR, "NEAR", 4);
269 if (PrefixLen(el.text, NEAR)==4) {
270 el.lexType = NearOpE;
271 return true;
272 }
273 UCArray WITHIN; SetCStr (WITHIN, "WITHIN", 6);
274 if (PrefixLen(el.text, WITHIN)==6) {
275 el.lexType = WithinOpE;
276 return true;
277 }
278
279 el.lexType = TermE;
280 return true;
281}
282
Note: See TracBrowser for help on using the repository browser.