Context Navigation

source: main/trunk/greenstone2/common-src/indexers/mgpp/text/GSDLQueryLex.cpp@ 25526

Last change on this file since 25526 was 25526, checked in by ak19, 12 years ago
Dr Bainbridge fixed the problem noticed by Diego and which was thought to be fixed earlier. It had to do with searching for (3 digit) numbers, however Diego particularly experienced the problem when trying the ifl=1 argument (I Feel Lucky) for searching.
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 7.6 KB

Line
1	/**************************************************************************
2	*
3	* GSDLQueryLex.cpp -- Lexical analyser for a simple query language
4	* Copyright (C) 2000 Rodger McNab
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	**************************************************************************/
21
22	#include "GSDLQueryLex.h"
23	#include "unitool.h"
24	#include "words.h"
25
26	inline void AddNChar (UCArray::const_iterator &here,
27	UCArray &text,
28	int len) {
29	if (text.capacity() < text.size() + len + 1) {
30	text.reserve(text.size() + len + 1);
31	}
32	while (len > 0) {
33	text.push_back (*here++);
34	--len;
35	}
36	}
37
38	static bool ParseInteger (UCArray::const_iterator &here,
39	UCArray::const_iterator end,
40	LexEl &el) {
41	el.Clear();
42
43	// this version of end is used in unitool
44	// UCArray::const_iterator endMinus1 = end-1;
45	const unsigned char* endMinus1 = &*(end - 1);
46
47	int charLen;
48	unsigned short c; // one character lookahead
49	charLen = parse_utf8_char (&*here, endMinus1, &c);
50
51	// check for positive or negative
52	bool neg = false;
53	if (c == '+') {
54	AddNChar (here, el.text, charLen);
55	charLen = parse_utf8_char (&*here, endMinus1, &c);
56	} else if (c == '-') {
57	neg = true;
58	AddNChar (here, el.text, charLen);
59	charLen = parse_utf8_char (&*here, endMinus1, &c);
60	}
61
62	// read in number part
63	int numeric=0;
64	el.num = 0;
65	el.lexType = IntegerE;
66	/* stop integers at 4 digits */
67	while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
68	el.num = el.num*10 + c - '0';
69	AddNChar (here, el.text, charLen);
70	charLen = parse_utf8_char (&*here, endMinus1, &c);
71	}
72
73	if (neg) el.num *= -1;
74
75	return (!el.text.empty());
76	}
77
78	static bool ParsePotentialInteger(UCArray::const_iterator &here,
79	UCArray::const_iterator end,
80	LexEl &el) {
81	el.Clear();
82
83	// this version of end is used in unitool
84	//UCArray::const_iterator endMinus1 = end-1;
85	const unsigned char* endMinus1 = &*(end - 1);
86
87	int charLen=0;
88	int length=0;
89	unsigned short c; // one character lookahead
90	charLen = parse_utf8_char (&*here, endMinus1, &c);
91
92	// read in number part
93	int numeric=0;
94	el.num = 0;
95	el.lexType = IntegerE;
96
97	/* stop integers at 4 digits */
98	while (here != end) {
99
100	charLen = parse_utf8_char (&*here, endMinus1, &c);
101	if (c < '0' \|\| c > '9') {
102	// reached a non-digit character
103	break;
104	}
105	el.num = el.num*10 + c - '0';
106	AddNChar (here, el.text, charLen); // advances 'here' by 'charLen'
107	length += charLen;
108
109	numeric++;
110
111	if (numeric == MAXNUMERIC) {
112	// reached the max length of a number
113	break;
114	}
115
116	}
117
118
119	// check the next character -if it is a letter, then have a term, not an integer
120	if (!is_unicode_letter(c)) {
121	// this was just an integer
122	return (!el.text.empty());
123	}
124	// else its a term
125	el.lexType = TermE;
126	el.num = 0;
127	/* this bit taken from ParseIndexWord in words.h*/
128	while (length+charLen<=MAXSTEMLEN && charLen > 0 &&
129	(is_unicode_letter(c) \|\| (is_unicode_digit(c) &&
130	++numeric <= MAXNUMERIC))) {
131	AddNChar (here, el.text, charLen);
132	length += charLen;
133	charLen = parse_utf8_char (&*here, endMinus1, &c);
134	}
135
136	return (!el.text.empty());
137	}
138	static bool ParseTerm (UCArray::const_iterator &here,
139	UCArray::const_iterator end,
140	UCArray &text) {
141	if (here == end)
142	return false;
143
144	//UCArray::const_iterator endMinus1 = end-1;
145	const unsigned char* endMinus1 = &*(end - 1);
146	const unsigned char* new_here = ParseIndexWord (&*here, endMinus1, text);
147	here += (new_here - &*here); // advance iterator by number of chars advanced
148	return !text.empty();
149	}
150
151
152	bool ParseLexEl (UCArray::const_iterator &here,
153	UCArray::const_iterator end,
154	LexEl &el) {
155	el.Clear();
156
157	// strange things can happen if here == end == 0
158	if (here == end)
159	return false;
160
161	// this version of end is used in unitool
162	//UCArray::const_iterator endMinus1 = end-1;
163	const unsigned char* endMinus1 = &*(end - 1);
164
165	// ignore all white space
166	int charLen;
167	unsigned short c; // one character lookahead
168	charLen = parse_utf8_char (&*here, endMinus1, &c);
169	while (here != end && is_unicode_space (c)) {
170	here += charLen;
171	if (here == end) break;
172	charLen = parse_utf8_char (&*here, endMinus1, &c);
173	}
174	if (here == end) return false;
175
176	if (c == '(') {
177	el.lexType = OpenBracketE;
178	AddNChar (here, el.text, charLen);
179	return true;
180
181	} else if (c == ')') {
182	el.lexType = CloseBracketE;
183	AddNChar (here, el.text, charLen);
184	return true;
185
186	} else if (c =='[') {
187	el.lexType = OpenSquareBracketE;
188	AddNChar (here, el.text, charLen);
189	return true;
190
191	} else if (c ==']') {
192	el.lexType = CloseSquareBracketE;
193	AddNChar (here, el.text, charLen);
194	return true;
195
196	} else if (c == '\"') {
197	el.lexType = QuoteE;
198	AddNChar (here, el.text, charLen);
199	return true;
200
201	} else if (c == '/') {
202	el.lexType = TermWeightE;
203	AddNChar (here, el.text, charLen);
204	return true;
205
206	} else if (c == '#') {
207	el.lexType = StemMethodE;
208	AddNChar (here, el.text, charLen);
209	return true;
210
211	} else if (c == '*') {
212	el.lexType = StarE;
213	AddNChar (here, el.text, charLen);
214	return true;
215
216	} else if (c == '^') {
217	el.lexType = RangeE;
218	AddNChar (here, el.text, charLen);
219	return true;
220
221	} else if (c == '@') {
222	el.lexType = AtE;
223	AddNChar (here, el.text, charLen);
224	return true;
225
226	} else if (c == ':') {
227	el.lexType = TagE;
228	AddNChar (here, el.text, charLen);
229	return true;
230
231	} else if (c=='&') {
232	el.lexType = AndOpE;
233	AddNChar (here, el.text, charLen);
234	return true;
235
236	} else if (c == '\|') {
237	el.lexType = OrOpE;
238	AddNChar (here, el.text, charLen);
239	return true;
240
241	} else if (c == '!') {
242	el.lexType = NotOpE;
243	AddNChar (here, el.text, charLen);
244	return true;
245
246	} else if (c == '+' \|\| c == '-' ) {
247	return ParseInteger (here, end, el);
248	}
249
250	else if (c >= '0' && c <= '9') {
251	return ParsePotentialInteger (here, end, el);
252	}
253
254	// assume it is a term of some sort
255	if (!ParseTerm (here, end, el.text)) {
256	// parse term returns false if it hasn't parsed anything that is a term
257	// here should be the same as it was before
258	el.lexType = UnknownE;
259	AddNChar (here, el.text, charLen);
260	return true;
261	}
262	//return false;
263
264	//UCArray AND; SetCStr (AND, "AND");
265	//if (el.text == AND) {
266	if (UCArrayCStrEquals(el.text, "AND")) {
267	el.lexType = AndOpE;
268	return true;
269	}
270	//UCArray OR; SetCStr (OR, "OR");
271	//if (el.text == OR) {
272	if (UCArrayCStrEquals(el.text, "OR")) {
273	el.lexType = OrOpE;
274	return true;
275	}
276	//UCArray NOT; SetCStr (NOT, "NOT");
277	//if (el.text == NOT) {
278	if (UCArrayCStrEquals(el.text, "NOT")) {
279	el.lexType = NotOpE;
280	return true;
281	}
282	UCArray NEAR; SetCStr (NEAR, "NEAR", 4);
283	if (PrefixLen(el.text, NEAR)==4) {
284	el.lexType = NearOpE;
285	return true;
286	}
287	UCArray WITHIN; SetCStr (WITHIN, "WITHIN", 6);
288	if (PrefixLen(el.text, WITHIN)==6) {
289	el.lexType = WithinOpE;
290	return true;
291	}
292
293	el.lexType = TermE;
294	return true;
295	}
296

Note: See TracBrowser for help on using the repository browser.

Download in other formats: