Context Navigation

GSDLQueryLex.cpp@ 26294

Last change on this file since 26294 was 26294, checked in by ak19, 12 years ago
Fix to server crashing bugs. Diego reported a bug when searching on partial numerical values like dates of the form 28-02-2012 (spotted in collections of simple html files). Search results are returned, but clicking a resulting document crashes the server. During testing, it turned out that an alphanumeric string that I tried also caused the same problem in another part of the same code (same cpp file), so I fixed it in multiple places: it was going past the array.
Property svn:executable set to `*`; Property svn:keywords set to `Author Date Id Revision`
File size: 7.7 KB

Line
1	/**************************************************************************
2	*
3	* GSDLQueryLex.cpp -- Lexical analyser for a simple query language
4	* Copyright (C) 2000 Rodger McNab
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	**************************************************************************/
21
22	#include "GSDLQueryLex.h"
23	#include "unitool.h"
24	#include "words.h"
25
26	inline void AddNChar (UCArray::const_iterator &here,
27	UCArray &text,
28	int len) {
29	if (text.capacity() < text.size() + len + 1) {
30	text.reserve(text.size() + len + 1);
31	}
32	while (len > 0) {
33	text.push_back (*here++);
34	--len;
35	}
36	}
37
38	static bool ParseInteger (UCArray::const_iterator &here,
39	UCArray::const_iterator end,
40	LexEl &el) {
41	el.Clear();
42
43	// this version of end is used in unitool
44	// UCArray::const_iterator endMinus1 = end-1;
45	const unsigned char* endMinus1 = &*(end - 1);
46
47	int charLen;
48	unsigned short c; // one character lookahead
49	charLen = parse_utf8_char (&*here, endMinus1, &c);
50
51	// check for positive or negative
52	bool neg = false;
53	if (c == '+') {
54	AddNChar (here, el.text, charLen);
55	if(here != end) {
56	charLen = parse_utf8_char (&*here, endMinus1, &c);
57	}
58	} else if (c == '-') {
59	neg = true;
60	AddNChar (here, el.text, charLen);
61	if(here != end) {
62	charLen = parse_utf8_char (&*here, endMinus1, &c);
63	}
64	}
65
66	// read in number part
67	int numeric=0;
68	el.num = 0;
69	el.lexType = IntegerE;
70	/* stop integers at 4 digits */
71	while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
72	el.num = el.num*10 + c - '0';
73	AddNChar (here, el.text, charLen);
74	if(here == end) {
75	break;
76	} else {
77	charLen = parse_utf8_char (&*here, endMinus1, &c);
78	}
79	}
80
81	if (neg) el.num *= -1;
82
83	return (!el.text.empty());
84	}
85
86	static bool ParsePotentialInteger(UCArray::const_iterator &here,
87	UCArray::const_iterator end,
88	LexEl &el) {
89	el.Clear();
90
91	// this version of end is used in unitool
92	//UCArray::const_iterator endMinus1 = end-1;
93	const unsigned char* endMinus1 = &*(end - 1);
94
95	int charLen=0;
96	int length=0;
97	unsigned short c; // one character lookahead
98	charLen = parse_utf8_char (&*here, endMinus1, &c);
99
100	// read in number part
101	int numeric=0;
102	el.num = 0;
103	el.lexType = IntegerE;
104
105	/* stop integers at 4 digits */
106	while (here != end) {
107
108	charLen = parse_utf8_char (&*here, endMinus1, &c);
109	if (c < '0' \|\| c > '9') {
110	// reached a non-digit character
111	break;
112	}
113	el.num = el.num*10 + c - '0';
114	AddNChar (here, el.text, charLen); // advances 'here' by 'charLen'
115	length += charLen;
116
117	numeric++;
118
119	if (numeric == MAXNUMERIC) {
120	// reached the max length of a number
121	break;
122	}
123
124	}
125
126
127	// check the next character -if it is a letter, then have a term, not an integer
128	if (!is_unicode_letter(c)) {
129	// this was just an integer
130	return (!el.text.empty());
131	}
132	// else its a term
133	el.lexType = TermE;
134	el.num = 0;
135	/* this bit taken from ParseIndexWord in words.h*/
136	while (length+charLen<=MAXSTEMLEN && charLen > 0 &&
137	(is_unicode_letter(c) \|\| (is_unicode_digit(c) &&
138	++numeric <= MAXNUMERIC))) {
139	AddNChar (here, el.text, charLen);
140	length += charLen;
141	if(here == end) {
142	break;
143	} else {
144	charLen = parse_utf8_char (&*here, endMinus1, &c);
145	}
146	}
147
148	return (!el.text.empty());
149	}
150	static bool ParseTerm (UCArray::const_iterator &here,
151	UCArray::const_iterator end,
152	UCArray &text) {
153	if (here == end)
154	return false;
155
156	//UCArray::const_iterator endMinus1 = end-1;
157	const unsigned char* endMinus1 = &*(end - 1);
158	const unsigned char* new_here = ParseIndexWord (&*here, endMinus1, text);
159	here += (new_here - &*here); // advance iterator by number of chars advanced
160	return !text.empty();
161	}
162
163
164	bool ParseLexEl (UCArray::const_iterator &here,
165	UCArray::const_iterator end,
166	LexEl &el) {
167	el.Clear();
168
169	// strange things can happen if here == end == 0
170	if (here == end)
171	return false;
172
173	// this version of end is used in unitool
174	//UCArray::const_iterator endMinus1 = end-1;
175	const unsigned char* endMinus1 = &*(end - 1);
176
177	// ignore all white space
178	int charLen;
179	unsigned short c; // one character lookahead
180	charLen = parse_utf8_char (&*here, endMinus1, &c);
181	while (here != end && is_unicode_space (c)) {
182	here += charLen;
183	if (here == end) break;
184	charLen = parse_utf8_char (&*here, endMinus1, &c);
185	}
186	if (here == end) return false;
187
188	if (c == '(') {
189	el.lexType = OpenBracketE;
190	AddNChar (here, el.text, charLen);
191	return true;
192
193	} else if (c == ')') {
194	el.lexType = CloseBracketE;
195	AddNChar (here, el.text, charLen);
196	return true;
197
198	} else if (c =='[') {
199	el.lexType = OpenSquareBracketE;
200	AddNChar (here, el.text, charLen);
201	return true;
202
203	} else if (c ==']') {
204	el.lexType = CloseSquareBracketE;
205	AddNChar (here, el.text, charLen);
206	return true;
207
208	} else if (c == '\"') {
209	el.lexType = QuoteE;
210	AddNChar (here, el.text, charLen);
211	return true;
212
213	} else if (c == '/') {
214	el.lexType = TermWeightE;
215	AddNChar (here, el.text, charLen);
216	return true;
217
218	} else if (c == '#') {
219	el.lexType = StemMethodE;
220	AddNChar (here, el.text, charLen);
221	return true;
222
223	} else if (c == '*') {
224	el.lexType = StarE;
225	AddNChar (here, el.text, charLen);
226	return true;
227
228	} else if (c == '^') {
229	el.lexType = RangeE;
230	AddNChar (here, el.text, charLen);
231	return true;
232
233	} else if (c == '@') {
234	el.lexType = AtE;
235	AddNChar (here, el.text, charLen);
236	return true;
237
238	} else if (c == ':') {
239	el.lexType = TagE;
240	AddNChar (here, el.text, charLen);
241	return true;
242
243	} else if (c=='&') {
244	el.lexType = AndOpE;
245	AddNChar (here, el.text, charLen);
246	return true;
247
248	} else if (c == '\|') {
249	el.lexType = OrOpE;
250	AddNChar (here, el.text, charLen);
251	return true;
252
253	} else if (c == '!') {
254	el.lexType = NotOpE;
255	AddNChar (here, el.text, charLen);
256	return true;
257
258	} else if (c == '+' \|\| c == '-' ) {
259	return ParseInteger (here, end, el);
260	}
261
262	else if (c >= '0' && c <= '9') {
263	return ParsePotentialInteger (here, end, el);
264	}
265
266	// assume it is a term of some sort
267	if (!ParseTerm (here, end, el.text)) {
268	// parse term returns false if it hasn't parsed anything that is a term
269	// here should be the same as it was before
270	el.lexType = UnknownE;
271	AddNChar (here, el.text, charLen);
272	return true;
273	}
274	//return false;
275
276	//UCArray AND; SetCStr (AND, "AND");
277	//if (el.text == AND) {
278	if (UCArrayCStrEquals(el.text, "AND")) {
279	el.lexType = AndOpE;
280	return true;
281	}
282	//UCArray OR; SetCStr (OR, "OR");
283	//if (el.text == OR) {
284	if (UCArrayCStrEquals(el.text, "OR")) {
285	el.lexType = OrOpE;
286	return true;
287	}
288	//UCArray NOT; SetCStr (NOT, "NOT");
289	//if (el.text == NOT) {
290	if (UCArrayCStrEquals(el.text, "NOT")) {
291	el.lexType = NotOpE;
292	return true;
293	}
294	UCArray NEAR; SetCStr (NEAR, "NEAR", 4);
295	if (PrefixLen(el.text, NEAR)==4) {
296	el.lexType = NearOpE;
297	return true;
298	}
299	UCArray WITHIN; SetCStr (WITHIN, "WITHIN", 6);
300	if (PrefixLen(el.text, WITHIN)==6) {
301	el.lexType = WithinOpE;
302	return true;
303	}
304
305	el.lexType = TermE;
306	return true;
307	}
308

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/common-src/indexers/mgpp/text/GSDLQueryLex.cpp@ 26294

Download in other formats: