Context Navigation

source: indexers/trunk/mgpp/text/GSDLQueryLex.cpp@ 18340

Last change on this file since 18340 was 18340, checked in by davidb, 15 years ago
Extra test added inside while loop to prevent parser trying to read past end of string
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 7.4 KB

Line
1	/**************************************************************************
2	*
3	* GSDLQueryLex.cpp -- Lexical analyser for a simple query language
4	* Copyright (C) 2000 Rodger McNab
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	**************************************************************************/
21
22	#include "GSDLQueryLex.h"
23	#include "unitool.h"
24	#include "words.h"
25
26	inline void AddNChar (UCArray::const_iterator &here,
27	UCArray &text,
28	int len) {
29	if (text.capacity() < text.size() + len + 1) {
30	text.reserve(text.size() + len + 1);
31	}
32	while (len > 0) {
33	text.push_back (*here++);
34	--len;
35	}
36	}
37
38	static bool ParseInteger (UCArray::const_iterator &here,
39	UCArray::const_iterator end,
40	LexEl &el) {
41	el.Clear();
42
43	// this version of end is used in unitool
44	// UCArray::const_iterator endMinus1 = end-1;
45	const unsigned char* endMinus1 = &*(end - 1);
46
47	int charLen;
48	unsigned short c; // one character lookahead
49	charLen = parse_utf8_char (&*here, endMinus1, &c);
50
51	// check for positive or negative
52	bool neg = false;
53	if (c == '+') {
54	AddNChar (here, el.text, charLen);
55	charLen = parse_utf8_char (&*here, endMinus1, &c);
56	} else if (c == '-') {
57	neg = true;
58	AddNChar (here, el.text, charLen);
59	charLen = parse_utf8_char (&*here, endMinus1, &c);
60	}
61
62	// read in number part
63	int numeric=0;
64	el.num = 0;
65	el.lexType = IntegerE;
66	/* stop integers at 4 digits */
67	while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
68	el.num = el.num*10 + c - '0';
69	AddNChar (here, el.text, charLen);
70	charLen = parse_utf8_char (&*here, endMinus1, &c);
71	}
72
73	if (neg) el.num *= -1;
74
75	return (!el.text.empty());
76	}
77
78	static bool ParsePotentialInteger(UCArray::const_iterator &here,
79	UCArray::const_iterator end,
80	LexEl &el) {
81	el.Clear();
82
83	// this version of end is used in unitool
84	//UCArray::const_iterator endMinus1 = end-1;
85	const unsigned char* endMinus1 = &*(end - 1);
86
87	int charLen=0;
88	int length=0;
89	unsigned short c; // one character lookahead
90	charLen = parse_utf8_char (&*here, endMinus1, &c);
91
92	// read in number part
93	int numeric=0;
94	el.num = 0;
95	el.lexType = IntegerE;
96	/* stop integers at 4 digits */
97	while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
98	el.num = el.num*10 + c - '0';
99	AddNChar (here, el.text, charLen);
100	length += charLen;
101	charLen = parse_utf8_char (&*here, endMinus1, &c);
102	}
103	// check the next character -if it is a letter, then have a term, not an integer
104	if (!is_unicode_letter(c)) {
105	// this was just an integer
106	return (!el.text.empty());
107	}
108	// else its a term
109	el.lexType = TermE;
110	el.num = 0;
111	/* this bit taken from ParseIndexWord in words.h*/
112	while (length+charLen<=MAXSTEMLEN && charLen > 0 &&
113	(is_unicode_letter(c) \|\| (is_unicode_digit(c) &&
114	++numeric <= MAXNUMERIC))) {
115	AddNChar (here, el.text, charLen);
116	length += charLen;
117	charLen = parse_utf8_char (&*here, endMinus1, &c);
118	}
119
120	return (!el.text.empty());
121	}
122	static bool ParseTerm (UCArray::const_iterator &here,
123	UCArray::const_iterator end,
124	UCArray &text) {
125	if (here == end)
126	return false;
127
128	//UCArray::const_iterator endMinus1 = end-1;
129	const unsigned char* endMinus1 = &*(end - 1);
130	const unsigned char* new_here = ParseIndexWord (&*here, endMinus1, text);
131	here += (new_here - &*here); // advance iterator by number of chars advanced
132	return !text.empty();
133	}
134
135
136	bool ParseLexEl (UCArray::const_iterator &here,
137	UCArray::const_iterator end,
138	LexEl &el) {
139	el.Clear();
140
141	// strange things can happen if here == end == 0
142	if (here == end)
143	return false;
144
145	// this version of end is used in unitool
146	//UCArray::const_iterator endMinus1 = end-1;
147	const unsigned char* endMinus1 = &*(end - 1);
148
149	// ignore all white space
150	int charLen;
151	unsigned short c; // one character lookahead
152	charLen = parse_utf8_char (&*here, endMinus1, &c);
153	while (here != end && is_unicode_space (c)) {
154	here += charLen;
155	if (here == end) break;
156	charLen = parse_utf8_char (&*here, endMinus1, &c);
157	}
158	if (here == end) return false;
159
160	if (c == '(') {
161	el.lexType = OpenBracketE;
162	AddNChar (here, el.text, charLen);
163	return true;
164
165	} else if (c == ')') {
166	el.lexType = CloseBracketE;
167	AddNChar (here, el.text, charLen);
168	return true;
169
170	} else if (c =='[') {
171	el.lexType = OpenSquareBracketE;
172	AddNChar (here, el.text, charLen);
173	return true;
174
175	} else if (c ==']') {
176	el.lexType = CloseSquareBracketE;
177	AddNChar (here, el.text, charLen);
178	return true;
179
180	} else if (c == '\"') {
181	el.lexType = QuoteE;
182	AddNChar (here, el.text, charLen);
183	return true;
184
185	} else if (c == '/') {
186	el.lexType = TermWeightE;
187	AddNChar (here, el.text, charLen);
188	return true;
189
190	} else if (c == '#') {
191	el.lexType = StemMethodE;
192	AddNChar (here, el.text, charLen);
193	return true;
194
195	} else if (c == '*') {
196	el.lexType = StarE;
197	AddNChar (here, el.text, charLen);
198	return true;
199
200	} else if (c == '^') {
201	el.lexType = RangeE;
202	AddNChar (here, el.text, charLen);
203	return true;
204
205	} else if (c == '@') {
206	el.lexType = AtE;
207	AddNChar (here, el.text, charLen);
208	return true;
209
210	} else if (c == ':') {
211	el.lexType = TagE;
212	AddNChar (here, el.text, charLen);
213	return true;
214
215	} else if (c=='&') {
216	el.lexType = AndOpE;
217	AddNChar (here, el.text, charLen);
218	return true;
219
220	} else if (c == '\|') {
221	el.lexType = OrOpE;
222	AddNChar (here, el.text, charLen);
223	return true;
224
225	} else if (c == '!') {
226	el.lexType = NotOpE;
227	AddNChar (here, el.text, charLen);
228	return true;
229
230	} else if (c == '+' \|\| c == '-' ) {
231	return ParseInteger (here, end, el);
232	}
233
234	else if (c >= '0' && c <= '9') {
235	return ParsePotentialInteger (here, end, el);
236	}
237
238	// assume it is a term of some sort
239	if (!ParseTerm (here, end, el.text)) {
240	// parse term returns false if it hasn't parsed anything that is a term
241	// here should be the same as it was before
242	el.lexType = UnknownE;
243	AddNChar (here, el.text, charLen);
244	return true;
245	}
246	//return false;
247
248	//UCArray AND; SetCStr (AND, "AND");
249	//if (el.text == AND) {
250	if (UCArrayCStrEquals(el.text, "AND")) {
251	el.lexType = AndOpE;
252	return true;
253	}
254	//UCArray OR; SetCStr (OR, "OR");
255	//if (el.text == OR) {
256	if (UCArrayCStrEquals(el.text, "OR")) {
257	el.lexType = OrOpE;
258	return true;
259	}
260	//UCArray NOT; SetCStr (NOT, "NOT");
261	//if (el.text == NOT) {
262	if (UCArrayCStrEquals(el.text, "NOT")) {
263	el.lexType = NotOpE;
264	return true;
265	}
266	UCArray NEAR; SetCStr (NEAR, "NEAR", 4);
267	if (PrefixLen(el.text, NEAR)==4) {
268	el.lexType = NearOpE;
269	return true;
270	}
271	UCArray WITHIN; SetCStr (WITHIN, "WITHIN", 6);
272	if (PrefixLen(el.text, WITHIN)==6) {
273	el.lexType = WithinOpE;
274	return true;
275	}
276
277	el.lexType = TermE;
278	return true;
279	}
280

Note: See TracBrowser for help on using the repository browser.

Download in other formats: