Context Navigation

GSDLQueryLex.cpp@ 25139

Last change on this file since 25139 was 25139, checked in by kjdon, 12 years ago
merged with trunk rev 25137
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 7.5 KB

Line
1	/**************************************************************************
2	*
3	* GSDLQueryLex.cpp -- Lexical analyser for a simple query language
4	* Copyright (C) 2000 Rodger McNab
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	**************************************************************************/
21
22	#include "GSDLQueryLex.h"
23	#include "unitool.h"
24	#include "words.h"
25
26	inline void AddNChar (UCArray::const_iterator &here,
27	UCArray &text,
28	int len) {
29	if (text.capacity() < text.size() + len + 1) {
30	text.reserve(text.size() + len + 1);
31	}
32	while (len > 0) {
33	text.push_back (*here++);
34	--len;
35	}
36	}
37
38	static bool ParseInteger (UCArray::const_iterator &here,
39	UCArray::const_iterator end,
40	LexEl &el) {
41	el.Clear();
42
43	// this version of end is used in unitool
44	// UCArray::const_iterator endMinus1 = end-1;
45	const unsigned char* endMinus1 = &*(end - 1);
46
47	int charLen;
48	unsigned short c; // one character lookahead
49	charLen = parse_utf8_char (&*here, endMinus1, &c);
50
51	// check for positive or negative
52	bool neg = false;
53	if (c == '+') {
54	AddNChar (here, el.text, charLen);
55	charLen = parse_utf8_char (&*here, endMinus1, &c);
56	} else if (c == '-') {
57	neg = true;
58	AddNChar (here, el.text, charLen);
59	charLen = parse_utf8_char (&*here, endMinus1, &c);
60	}
61
62	// read in number part
63	int numeric=0;
64	el.num = 0;
65	el.lexType = IntegerE;
66	/* stop integers at 4 digits */
67	while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
68	el.num = el.num*10 + c - '0';
69	AddNChar (here, el.text, charLen);
70	charLen = parse_utf8_char (&*here, endMinus1, &c);
71	}
72
73	if (neg) el.num *= -1;
74
75	return (!el.text.empty());
76	}
77
78	static bool ParsePotentialInteger(UCArray::const_iterator &here,
79	UCArray::const_iterator end,
80	LexEl &el) {
81	el.Clear();
82
83	// this version of end is used in unitool
84	//UCArray::const_iterator endMinus1 = end-1;
85	const unsigned char* endMinus1 = &*(end - 1);
86
87	int charLen=0;
88	int length=0;
89	unsigned short c; // one character lookahead
90	charLen = parse_utf8_char (&*here, endMinus1, &c);
91
92	// read in number part
93	int numeric=0;
94	el.num = 0;
95	el.lexType = IntegerE;
96	/* stop integers at 4 digits */
97	while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
98	el.num = el.num*10 + c - '0';
99	AddNChar (here, el.text, charLen);
100	length += charLen;
101	if(numeric < MAXNUMERIC) { // server crash bugfix: don't go past the end with the endMinus1 pointer
102	charLen = parse_utf8_char (&*here, endMinus1, &c);
103	}
104	}
105	// check the next character -if it is a letter, then have a term, not an integer
106	if (!is_unicode_letter(c)) {
107	// this was just an integer
108	return (!el.text.empty());
109	}
110	// else its a term
111	el.lexType = TermE;
112	el.num = 0;
113	/* this bit taken from ParseIndexWord in words.h*/
114	while (length+charLen<=MAXSTEMLEN && charLen > 0 &&
115	(is_unicode_letter(c) \|\| (is_unicode_digit(c) &&
116	++numeric <= MAXNUMERIC))) {
117	AddNChar (here, el.text, charLen);
118	length += charLen;
119	charLen = parse_utf8_char (&*here, endMinus1, &c);
120	}
121
122	return (!el.text.empty());
123	}
124	static bool ParseTerm (UCArray::const_iterator &here,
125	UCArray::const_iterator end,
126	UCArray &text) {
127	if (here == end)
128	return false;
129
130	//UCArray::const_iterator endMinus1 = end-1;
131	const unsigned char* endMinus1 = &*(end - 1);
132	const unsigned char* new_here = ParseIndexWord (&*here, endMinus1, text);
133	here += (new_here - &*here); // advance iterator by number of chars advanced
134	return !text.empty();
135	}
136
137
138	bool ParseLexEl (UCArray::const_iterator &here,
139	UCArray::const_iterator end,
140	LexEl &el) {
141	el.Clear();
142
143	// strange things can happen if here == end == 0
144	if (here == end)
145	return false;
146
147	// this version of end is used in unitool
148	//UCArray::const_iterator endMinus1 = end-1;
149	const unsigned char* endMinus1 = &*(end - 1);
150
151	// ignore all white space
152	int charLen;
153	unsigned short c; // one character lookahead
154	charLen = parse_utf8_char (&*here, endMinus1, &c);
155	while (here != end && is_unicode_space (c)) {
156	here += charLen;
157	if (here == end) break;
158	charLen = parse_utf8_char (&*here, endMinus1, &c);
159	}
160	if (here == end) return false;
161
162	if (c == '(') {
163	el.lexType = OpenBracketE;
164	AddNChar (here, el.text, charLen);
165	return true;
166
167	} else if (c == ')') {
168	el.lexType = CloseBracketE;
169	AddNChar (here, el.text, charLen);
170	return true;
171
172	} else if (c =='[') {
173	el.lexType = OpenSquareBracketE;
174	AddNChar (here, el.text, charLen);
175	return true;
176
177	} else if (c ==']') {
178	el.lexType = CloseSquareBracketE;
179	AddNChar (here, el.text, charLen);
180	return true;
181
182	} else if (c == '\"') {
183	el.lexType = QuoteE;
184	AddNChar (here, el.text, charLen);
185	return true;
186
187	} else if (c == '/') {
188	el.lexType = TermWeightE;
189	AddNChar (here, el.text, charLen);
190	return true;
191
192	} else if (c == '#') {
193	el.lexType = StemMethodE;
194	AddNChar (here, el.text, charLen);
195	return true;
196
197	} else if (c == '*') {
198	el.lexType = StarE;
199	AddNChar (here, el.text, charLen);
200	return true;
201
202	} else if (c == '^') {
203	el.lexType = RangeE;
204	AddNChar (here, el.text, charLen);
205	return true;
206
207	} else if (c == '@') {
208	el.lexType = AtE;
209	AddNChar (here, el.text, charLen);
210	return true;
211
212	} else if (c == ':') {
213	el.lexType = TagE;
214	AddNChar (here, el.text, charLen);
215	return true;
216
217	} else if (c=='&') {
218	el.lexType = AndOpE;
219	AddNChar (here, el.text, charLen);
220	return true;
221
222	} else if (c == '\|') {
223	el.lexType = OrOpE;
224	AddNChar (here, el.text, charLen);
225	return true;
226
227	} else if (c == '!') {
228	el.lexType = NotOpE;
229	AddNChar (here, el.text, charLen);
230	return true;
231
232	} else if (c == '+' \|\| c == '-' ) {
233	return ParseInteger (here, end, el);
234	}
235
236	else if (c >= '0' && c <= '9') {
237	return ParsePotentialInteger (here, end, el);
238	}
239
240	// assume it is a term of some sort
241	if (!ParseTerm (here, end, el.text)) {
242	// parse term returns false if it hasn't parsed anything that is a term
243	// here should be the same as it was before
244	el.lexType = UnknownE;
245	AddNChar (here, el.text, charLen);
246	return true;
247	}
248	//return false;
249
250	//UCArray AND; SetCStr (AND, "AND");
251	//if (el.text == AND) {
252	if (UCArrayCStrEquals(el.text, "AND")) {
253	el.lexType = AndOpE;
254	return true;
255	}
256	//UCArray OR; SetCStr (OR, "OR");
257	//if (el.text == OR) {
258	if (UCArrayCStrEquals(el.text, "OR")) {
259	el.lexType = OrOpE;
260	return true;
261	}
262	//UCArray NOT; SetCStr (NOT, "NOT");
263	//if (el.text == NOT) {
264	if (UCArrayCStrEquals(el.text, "NOT")) {
265	el.lexType = NotOpE;
266	return true;
267	}
268	UCArray NEAR; SetCStr (NEAR, "NEAR", 4);
269	if (PrefixLen(el.text, NEAR)==4) {
270	el.lexType = NearOpE;
271	return true;
272	}
273	UCArray WITHIN; SetCStr (WITHIN, "WITHIN", 6);
274	if (PrefixLen(el.text, WITHIN)==6) {
275	el.lexType = WithinOpE;
276	return true;
277	}
278
279	el.lexType = TermE;
280	return true;
281	}
282

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/branches/64_bit_Greenstone/greenstone2/common-src/indexers/mgpp/text/GSDLQueryLex.cpp@ 25139

Download in other formats: