Context Navigation

GSDLQueryLex.cpp@ 26294

Last change on this file since 26294 was 26294, checked in by ak19, 12 years ago
Fix to server crashing bugs. Diego reported a bug when searching on partial numerical values like dates of the form 28-02-2012 (spotted in collections of simple html files). Search results are returned, but clicking a resulting document crashes the server. During testing, it turned out that an alphanumeric string that I tried also caused the same problem in another part of the same code (same cpp file), so I fixed it in multiple places: it was going past the array.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 7.7 KB

Rev	Line
[3365]	1	/**************************************************************************
	2	*
	3	* GSDLQueryLex.cpp -- Lexical analyser for a simple query language
	4	* Copyright (C) 2000 Rodger McNab
	5	*
	6	* This program is free software; you can redistribute it and/or modify
	7	* it under the terms of the GNU General Public License as published by
	8	* the Free Software Foundation; either version 2 of the License, or
	9	* (at your option) any later version.
	10	*
	11	* This program is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	* GNU General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU General Public License
	17	* along with this program; if not, write to the Free Software
	18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	19	*
	20	**************************************************************************/
	21
	22	#include "GSDLQueryLex.h"
	23	#include "unitool.h"
	24	#include "words.h"
	25
	26	inline void AddNChar (UCArray::const_iterator &here,
	27	UCArray &text,
	28	int len) {
[8692]	29	if (text.capacity() < text.size() + len + 1) {
	30	text.reserve(text.size() + len + 1);
	31	}
[3365]	32	while (len > 0) {
	33	text.push_back (*here++);
[8692]	34	--len;
[3365]	35	}
	36	}
	37
	38	static bool ParseInteger (UCArray::const_iterator &here,
	39	UCArray::const_iterator end,
	40	LexEl &el) {
	41	el.Clear();
	42
	43	// this version of end is used in unitool
	44	// UCArray::const_iterator endMinus1 = end-1;
[12318]	45	const unsigned char* endMinus1 = &*(end - 1);
[3365]	46
	47	int charLen;
	48	unsigned short c; // one character lookahead
	49	charLen = parse_utf8_char (&*here, endMinus1, &c);
	50
	51	// check for positive or negative
	52	bool neg = false;
	53	if (c == '+') {
	54	AddNChar (here, el.text, charLen);
[26294]	55	if(here != end) {
	56	charLen = parse_utf8_char (&*here, endMinus1, &c);
	57	}
[3365]	58	} else if (c == '-') {
	59	neg = true;
	60	AddNChar (here, el.text, charLen);
[26294]	61	if(here != end) {
	62	charLen = parse_utf8_char (&*here, endMinus1, &c);
	63	}
[3365]	64	}
	65
	66	// read in number part
	67	int numeric=0;
	68	el.num = 0;
	69	el.lexType = IntegerE;
	70	/* stop integers at 4 digits */
	71	while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
	72	el.num = el.num*10 + c - '0';
	73	AddNChar (here, el.text, charLen);
[26294]	74	if(here == end) {
	75	break;
	76	} else {
	77	charLen = parse_utf8_char (&*here, endMinus1, &c);
	78	}
[3365]	79	}
	80
	81	if (neg) el.num *= -1;
	82
	83	return (!el.text.empty());
	84	}
	85
	86	static bool ParsePotentialInteger(UCArray::const_iterator &here,
	87	UCArray::const_iterator end,
	88	LexEl &el) {
	89	el.Clear();
	90
	91	// this version of end is used in unitool
	92	//UCArray::const_iterator endMinus1 = end-1;
[12318]	93	const unsigned char* endMinus1 = &*(end - 1);
[3365]	94
	95	int charLen=0;
	96	int length=0;
	97	unsigned short c; // one character lookahead
	98	charLen = parse_utf8_char (&*here, endMinus1, &c);
	99
	100	// read in number part
	101	int numeric=0;
	102	el.num = 0;
	103	el.lexType = IntegerE;
[25526]	104
[3365]	105	/* stop integers at 4 digits */
[25526]	106	while (here != end) {
	107
	108	charLen = parse_utf8_char (&*here, endMinus1, &c);
	109	if (c < '0' \|\| c > '9') {
	110	// reached a non-digit character
	111	break;
	112	}
	113	el.num = el.num*10 + c - '0';
	114	AddNChar (here, el.text, charLen); // advances 'here' by 'charLen'
	115	length += charLen;
	116
	117	numeric++;
	118
	119	if (numeric == MAXNUMERIC) {
	120	// reached the max length of a number
	121	break;
	122	}
	123
[3365]	124	}
[25526]	125
	126
[3365]	127	// check the next character -if it is a letter, then have a term, not an integer
	128	if (!is_unicode_letter(c)) {
	129	// this was just an integer
	130	return (!el.text.empty());
	131	}
	132	// else its a term
	133	el.lexType = TermE;
	134	el.num = 0;
	135	/* this bit taken from ParseIndexWord in words.h*/
	136	while (length+charLen<=MAXSTEMLEN && charLen > 0 &&
	137	(is_unicode_letter(c) \|\| (is_unicode_digit(c) &&
	138	++numeric <= MAXNUMERIC))) {
	139	AddNChar (here, el.text, charLen);
	140	length += charLen;
[26294]	141	if(here == end) {
	142	break;
	143	} else {
	144	charLen = parse_utf8_char (&*here, endMinus1, &c);
	145	}
[3365]	146	}
	147
	148	return (!el.text.empty());
	149	}
	150	static bool ParseTerm (UCArray::const_iterator &here,
	151	UCArray::const_iterator end,
	152	UCArray &text) {
[12318]	153	if (here == end)
	154	return false;
	155
[3365]	156	//UCArray::const_iterator endMinus1 = end-1;
[12318]	157	const unsigned char* endMinus1 = &*(end - 1);
[3365]	158	const unsigned char* new_here = ParseIndexWord (&*here, endMinus1, text);
	159	here += (new_here - &*here); // advance iterator by number of chars advanced
	160	return !text.empty();
	161	}
	162
	163
	164	bool ParseLexEl (UCArray::const_iterator &here,
	165	UCArray::const_iterator end,
	166	LexEl &el) {
	167	el.Clear();
	168
	169	// strange things can happen if here == end == 0
[12318]	170	if (here == end)
	171	return false;
[3365]	172
	173	// this version of end is used in unitool
	174	//UCArray::const_iterator endMinus1 = end-1;
[12318]	175	const unsigned char* endMinus1 = &*(end - 1);
[3365]	176
	177	// ignore all white space
	178	int charLen;
	179	unsigned short c; // one character lookahead
	180	charLen = parse_utf8_char (&*here, endMinus1, &c);
	181	while (here != end && is_unicode_space (c)) {
	182	here += charLen;
[18340]	183	if (here == end) break;
[3365]	184	charLen = parse_utf8_char (&*here, endMinus1, &c);
	185	}
	186	if (here == end) return false;
	187
	188	if (c == '(') {
	189	el.lexType = OpenBracketE;
	190	AddNChar (here, el.text, charLen);
	191	return true;
	192
	193	} else if (c == ')') {
	194	el.lexType = CloseBracketE;
	195	AddNChar (here, el.text, charLen);
	196	return true;
	197
	198	} else if (c =='[') {
	199	el.lexType = OpenSquareBracketE;
	200	AddNChar (here, el.text, charLen);
	201	return true;
	202
	203	} else if (c ==']') {
	204	el.lexType = CloseSquareBracketE;
	205	AddNChar (here, el.text, charLen);
	206	return true;
	207
	208	} else if (c == '\"') {
	209	el.lexType = QuoteE;
	210	AddNChar (here, el.text, charLen);
	211	return true;
	212
	213	} else if (c == '/') {
	214	el.lexType = TermWeightE;
	215	AddNChar (here, el.text, charLen);
	216	return true;
	217
	218	} else if (c == '#') {
	219	el.lexType = StemMethodE;
	220	AddNChar (here, el.text, charLen);
	221	return true;
	222
[8242]	223	} else if (c == '*') {
	224	el.lexType = StarE;
	225	AddNChar (here, el.text, charLen);
	226	return true;
	227
[3365]	228	} else if (c == '^') {
	229	el.lexType = RangeE;
	230	AddNChar (here, el.text, charLen);
	231	return true;
	232
	233	} else if (c == '@') {
	234	el.lexType = AtE;
	235	AddNChar (here, el.text, charLen);
	236	return true;
	237
	238	} else if (c == ':') {
	239	el.lexType = TagE;
	240	AddNChar (here, el.text, charLen);
	241	return true;
	242
	243	} else if (c=='&') {
	244	el.lexType = AndOpE;
	245	AddNChar (here, el.text, charLen);
	246	return true;
	247
	248	} else if (c == '\|') {
	249	el.lexType = OrOpE;
	250	AddNChar (here, el.text, charLen);
	251	return true;
	252
	253	} else if (c == '!') {
	254	el.lexType = NotOpE;
	255	AddNChar (here, el.text, charLen);
	256	return true;
	257
	258	} else if (c == '+' \|\| c == '-' ) {
	259	return ParseInteger (here, end, el);
	260	}
	261
	262	else if (c >= '0' && c <= '9') {
	263	return ParsePotentialInteger (here, end, el);
	264	}
	265
	266	// assume it is a term of some sort
[5449]	267	if (!ParseTerm (here, end, el.text)) {
	268	// parse term returns false if it hasn't parsed anything that is a term
	269	// here should be the same as it was before
	270	el.lexType = UnknownE;
	271	AddNChar (here, el.text, charLen);
	272	return true;
	273	}
	274	//return false;
[3365]	275
[8692]	276	//UCArray AND; SetCStr (AND, "AND");
	277	//if (el.text == AND) {
	278	if (UCArrayCStrEquals(el.text, "AND")) {
[3365]	279	el.lexType = AndOpE;
	280	return true;
	281	}
[8692]	282	//UCArray OR; SetCStr (OR, "OR");
	283	//if (el.text == OR) {
	284	if (UCArrayCStrEquals(el.text, "OR")) {
[3365]	285	el.lexType = OrOpE;
	286	return true;
	287	}
[8692]	288	//UCArray NOT; SetCStr (NOT, "NOT");
	289	//if (el.text == NOT) {
	290	if (UCArrayCStrEquals(el.text, "NOT")) {
[3365]	291	el.lexType = NotOpE;
	292	return true;
	293	}
[8692]	294	UCArray NEAR; SetCStr (NEAR, "NEAR", 4);
[3365]	295	if (PrefixLen(el.text, NEAR)==4) {
	296	el.lexType = NearOpE;
	297	return true;
	298	}
[8692]	299	UCArray WITHIN; SetCStr (WITHIN, "WITHIN", 6);
[6119]	300	if (PrefixLen(el.text, WITHIN)==6) {
	301	el.lexType = WithinOpE;
	302	return true;
	303	}
	304
[3365]	305	el.lexType = TermE;
	306	return true;
	307	}
	308

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/common-src/indexers/mgpp/text/GSDLQueryLex.cpp@ 26294

Download in other formats: