Context Navigation

source: main/trunk/greenstone2/common-src/indexers/mgpp/text/GSDLQueryLex.cpp@ 25526

Last change on this file since 25526 was 25526, checked in by ak19, 12 years ago
Dr Bainbridge fixed the problem noticed by Diego and which was thought to be fixed earlier. It had to do with searching for (3 digit) numbers, however Diego particularly experienced the problem when trying the ifl=1 argument (I Feel Lucky) for searching.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 7.6 KB

Rev	Line
[3365]	1	/**************************************************************************
	2	*
	3	* GSDLQueryLex.cpp -- Lexical analyser for a simple query language
	4	* Copyright (C) 2000 Rodger McNab
	5	*
	6	* This program is free software; you can redistribute it and/or modify
	7	* it under the terms of the GNU General Public License as published by
	8	* the Free Software Foundation; either version 2 of the License, or
	9	* (at your option) any later version.
	10	*
	11	* This program is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	* GNU General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU General Public License
	17	* along with this program; if not, write to the Free Software
	18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	19	*
	20	**************************************************************************/
	21
	22	#include "GSDLQueryLex.h"
	23	#include "unitool.h"
	24	#include "words.h"
	25
	26	inline void AddNChar (UCArray::const_iterator &here,
	27	UCArray &text,
	28	int len) {
[8692]	29	if (text.capacity() < text.size() + len + 1) {
	30	text.reserve(text.size() + len + 1);
	31	}
[3365]	32	while (len > 0) {
	33	text.push_back (*here++);
[8692]	34	--len;
[3365]	35	}
	36	}
	37
	38	static bool ParseInteger (UCArray::const_iterator &here,
	39	UCArray::const_iterator end,
	40	LexEl &el) {
	41	el.Clear();
	42
	43	// this version of end is used in unitool
	44	// UCArray::const_iterator endMinus1 = end-1;
[12318]	45	const unsigned char* endMinus1 = &*(end - 1);
[3365]	46
	47	int charLen;
	48	unsigned short c; // one character lookahead
	49	charLen = parse_utf8_char (&*here, endMinus1, &c);
	50
	51	// check for positive or negative
	52	bool neg = false;
	53	if (c == '+') {
	54	AddNChar (here, el.text, charLen);
	55	charLen = parse_utf8_char (&*here, endMinus1, &c);
	56	} else if (c == '-') {
	57	neg = true;
	58	AddNChar (here, el.text, charLen);
	59	charLen = parse_utf8_char (&*here, endMinus1, &c);
	60	}
	61
	62	// read in number part
	63	int numeric=0;
	64	el.num = 0;
	65	el.lexType = IntegerE;
	66	/* stop integers at 4 digits */
	67	while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
	68	el.num = el.num*10 + c - '0';
	69	AddNChar (here, el.text, charLen);
	70	charLen = parse_utf8_char (&*here, endMinus1, &c);
	71	}
	72
	73	if (neg) el.num *= -1;
	74
	75	return (!el.text.empty());
	76	}
	77
	78	static bool ParsePotentialInteger(UCArray::const_iterator &here,
	79	UCArray::const_iterator end,
	80	LexEl &el) {
	81	el.Clear();
	82
	83	// this version of end is used in unitool
	84	//UCArray::const_iterator endMinus1 = end-1;
[12318]	85	const unsigned char* endMinus1 = &*(end - 1);
[3365]	86
	87	int charLen=0;
	88	int length=0;
	89	unsigned short c; // one character lookahead
	90	charLen = parse_utf8_char (&*here, endMinus1, &c);
	91
	92	// read in number part
	93	int numeric=0;
	94	el.num = 0;
	95	el.lexType = IntegerE;
[25526]	96
[3365]	97	/* stop integers at 4 digits */
[25526]	98	while (here != end) {
	99
	100	charLen = parse_utf8_char (&*here, endMinus1, &c);
	101	if (c < '0' \|\| c > '9') {
	102	// reached a non-digit character
	103	break;
	104	}
	105	el.num = el.num*10 + c - '0';
	106	AddNChar (here, el.text, charLen); // advances 'here' by 'charLen'
	107	length += charLen;
	108
	109	numeric++;
	110
	111	if (numeric == MAXNUMERIC) {
	112	// reached the max length of a number
	113	break;
	114	}
	115
[3365]	116	}
[25526]	117
	118
[3365]	119	// check the next character -if it is a letter, then have a term, not an integer
	120	if (!is_unicode_letter(c)) {
	121	// this was just an integer
	122	return (!el.text.empty());
	123	}
	124	// else its a term
	125	el.lexType = TermE;
	126	el.num = 0;
	127	/* this bit taken from ParseIndexWord in words.h*/
	128	while (length+charLen<=MAXSTEMLEN && charLen > 0 &&
	129	(is_unicode_letter(c) \|\| (is_unicode_digit(c) &&
	130	++numeric <= MAXNUMERIC))) {
	131	AddNChar (here, el.text, charLen);
	132	length += charLen;
	133	charLen = parse_utf8_char (&*here, endMinus1, &c);
	134	}
	135
	136	return (!el.text.empty());
	137	}
	138	static bool ParseTerm (UCArray::const_iterator &here,
	139	UCArray::const_iterator end,
	140	UCArray &text) {
[12318]	141	if (here == end)
	142	return false;
	143
[3365]	144	//UCArray::const_iterator endMinus1 = end-1;
[12318]	145	const unsigned char* endMinus1 = &*(end - 1);
[3365]	146	const unsigned char* new_here = ParseIndexWord (&*here, endMinus1, text);
	147	here += (new_here - &*here); // advance iterator by number of chars advanced
	148	return !text.empty();
	149	}
	150
	151
	152	bool ParseLexEl (UCArray::const_iterator &here,
	153	UCArray::const_iterator end,
	154	LexEl &el) {
	155	el.Clear();
	156
	157	// strange things can happen if here == end == 0
[12318]	158	if (here == end)
	159	return false;
[3365]	160
	161	// this version of end is used in unitool
	162	//UCArray::const_iterator endMinus1 = end-1;
[12318]	163	const unsigned char* endMinus1 = &*(end - 1);
[3365]	164
	165	// ignore all white space
	166	int charLen;
	167	unsigned short c; // one character lookahead
	168	charLen = parse_utf8_char (&*here, endMinus1, &c);
	169	while (here != end && is_unicode_space (c)) {
	170	here += charLen;
[18340]	171	if (here == end) break;
[3365]	172	charLen = parse_utf8_char (&*here, endMinus1, &c);
	173	}
	174	if (here == end) return false;
	175
	176	if (c == '(') {
	177	el.lexType = OpenBracketE;
	178	AddNChar (here, el.text, charLen);
	179	return true;
	180
	181	} else if (c == ')') {
	182	el.lexType = CloseBracketE;
	183	AddNChar (here, el.text, charLen);
	184	return true;
	185
	186	} else if (c =='[') {
	187	el.lexType = OpenSquareBracketE;
	188	AddNChar (here, el.text, charLen);
	189	return true;
	190
	191	} else if (c ==']') {
	192	el.lexType = CloseSquareBracketE;
	193	AddNChar (here, el.text, charLen);
	194	return true;
	195
	196	} else if (c == '\"') {
	197	el.lexType = QuoteE;
	198	AddNChar (here, el.text, charLen);
	199	return true;
	200
	201	} else if (c == '/') {
	202	el.lexType = TermWeightE;
	203	AddNChar (here, el.text, charLen);
	204	return true;
	205
	206	} else if (c == '#') {
	207	el.lexType = StemMethodE;
	208	AddNChar (here, el.text, charLen);
	209	return true;
	210
[8242]	211	} else if (c == '*') {
	212	el.lexType = StarE;
	213	AddNChar (here, el.text, charLen);
	214	return true;
	215
[3365]	216	} else if (c == '^') {
	217	el.lexType = RangeE;
	218	AddNChar (here, el.text, charLen);
	219	return true;
	220
	221	} else if (c == '@') {
	222	el.lexType = AtE;
	223	AddNChar (here, el.text, charLen);
	224	return true;
	225
	226	} else if (c == ':') {
	227	el.lexType = TagE;
	228	AddNChar (here, el.text, charLen);
	229	return true;
	230
	231	} else if (c=='&') {
	232	el.lexType = AndOpE;
	233	AddNChar (here, el.text, charLen);
	234	return true;
	235
	236	} else if (c == '\|') {
	237	el.lexType = OrOpE;
	238	AddNChar (here, el.text, charLen);
	239	return true;
	240
	241	} else if (c == '!') {
	242	el.lexType = NotOpE;
	243	AddNChar (here, el.text, charLen);
	244	return true;
	245
	246	} else if (c == '+' \|\| c == '-' ) {
	247	return ParseInteger (here, end, el);
	248	}
	249
	250	else if (c >= '0' && c <= '9') {
	251	return ParsePotentialInteger (here, end, el);
	252	}
	253
	254	// assume it is a term of some sort
[5449]	255	if (!ParseTerm (here, end, el.text)) {
	256	// parse term returns false if it hasn't parsed anything that is a term
	257	// here should be the same as it was before
	258	el.lexType = UnknownE;
	259	AddNChar (here, el.text, charLen);
	260	return true;
	261	}
	262	//return false;
[3365]	263
[8692]	264	//UCArray AND; SetCStr (AND, "AND");
	265	//if (el.text == AND) {
	266	if (UCArrayCStrEquals(el.text, "AND")) {
[3365]	267	el.lexType = AndOpE;
	268	return true;
	269	}
[8692]	270	//UCArray OR; SetCStr (OR, "OR");
	271	//if (el.text == OR) {
	272	if (UCArrayCStrEquals(el.text, "OR")) {
[3365]	273	el.lexType = OrOpE;
	274	return true;
	275	}
[8692]	276	//UCArray NOT; SetCStr (NOT, "NOT");
	277	//if (el.text == NOT) {
	278	if (UCArrayCStrEquals(el.text, "NOT")) {
[3365]	279	el.lexType = NotOpE;
	280	return true;
	281	}
[8692]	282	UCArray NEAR; SetCStr (NEAR, "NEAR", 4);
[3365]	283	if (PrefixLen(el.text, NEAR)==4) {
	284	el.lexType = NearOpE;
	285	return true;
	286	}
[8692]	287	UCArray WITHIN; SetCStr (WITHIN, "WITHIN", 6);
[6119]	288	if (PrefixLen(el.text, WITHIN)==6) {
	289	el.lexType = WithinOpE;
	290	return true;
	291	}
	292
[3365]	293	el.lexType = TermE;
	294	return true;
	295	}
	296

Note: See TracBrowser for help on using the repository browser.

Download in other formats: