Context Navigation

GSDLQueryLex.cpp@ 25139

Last change on this file since 25139 was 25139, checked in by kjdon, 12 years ago
merged with trunk rev 25137
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 7.5 KB

Rev	Line
[3365]	1	/**************************************************************************
	2	*
	3	* GSDLQueryLex.cpp -- Lexical analyser for a simple query language
	4	* Copyright (C) 2000 Rodger McNab
	5	*
	6	* This program is free software; you can redistribute it and/or modify
	7	* it under the terms of the GNU General Public License as published by
	8	* the Free Software Foundation; either version 2 of the License, or
	9	* (at your option) any later version.
	10	*
	11	* This program is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	* GNU General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU General Public License
	17	* along with this program; if not, write to the Free Software
	18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	19	*
	20	**************************************************************************/
	21
	22	#include "GSDLQueryLex.h"
	23	#include "unitool.h"
	24	#include "words.h"
	25
	26	inline void AddNChar (UCArray::const_iterator &here,
	27	UCArray &text,
	28	int len) {
[8692]	29	if (text.capacity() < text.size() + len + 1) {
	30	text.reserve(text.size() + len + 1);
	31	}
[3365]	32	while (len > 0) {
	33	text.push_back (*here++);
[8692]	34	--len;
[3365]	35	}
	36	}
	37
	38	static bool ParseInteger (UCArray::const_iterator &here,
	39	UCArray::const_iterator end,
	40	LexEl &el) {
	41	el.Clear();
	42
	43	// this version of end is used in unitool
	44	// UCArray::const_iterator endMinus1 = end-1;
[12318]	45	const unsigned char* endMinus1 = &*(end - 1);
[3365]	46
	47	int charLen;
	48	unsigned short c; // one character lookahead
	49	charLen = parse_utf8_char (&*here, endMinus1, &c);
	50
	51	// check for positive or negative
	52	bool neg = false;
	53	if (c == '+') {
	54	AddNChar (here, el.text, charLen);
	55	charLen = parse_utf8_char (&*here, endMinus1, &c);
	56	} else if (c == '-') {
	57	neg = true;
	58	AddNChar (here, el.text, charLen);
	59	charLen = parse_utf8_char (&*here, endMinus1, &c);
	60	}
	61
	62	// read in number part
	63	int numeric=0;
	64	el.num = 0;
	65	el.lexType = IntegerE;
	66	/* stop integers at 4 digits */
	67	while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
	68	el.num = el.num*10 + c - '0';
	69	AddNChar (here, el.text, charLen);
	70	charLen = parse_utf8_char (&*here, endMinus1, &c);
	71	}
	72
	73	if (neg) el.num *= -1;
	74
	75	return (!el.text.empty());
	76	}
	77
	78	static bool ParsePotentialInteger(UCArray::const_iterator &here,
	79	UCArray::const_iterator end,
	80	LexEl &el) {
	81	el.Clear();
	82
	83	// this version of end is used in unitool
	84	//UCArray::const_iterator endMinus1 = end-1;
[12318]	85	const unsigned char* endMinus1 = &*(end - 1);
[3365]	86
	87	int charLen=0;
	88	int length=0;
	89	unsigned short c; // one character lookahead
	90	charLen = parse_utf8_char (&*here, endMinus1, &c);
	91
	92	// read in number part
	93	int numeric=0;
	94	el.num = 0;
	95	el.lexType = IntegerE;
	96	/* stop integers at 4 digits */
	97	while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
	98	el.num = el.num*10 + c - '0';
	99	AddNChar (here, el.text, charLen);
	100	length += charLen;
[25139]	101	if(numeric < MAXNUMERIC) { // server crash bugfix: don't go past the end with the endMinus1 pointer
	102	charLen = parse_utf8_char (&*here, endMinus1, &c);
	103	}
[3365]	104	}
	105	// check the next character -if it is a letter, then have a term, not an integer
	106	if (!is_unicode_letter(c)) {
	107	// this was just an integer
	108	return (!el.text.empty());
	109	}
	110	// else its a term
	111	el.lexType = TermE;
	112	el.num = 0;
	113	/* this bit taken from ParseIndexWord in words.h*/
	114	while (length+charLen<=MAXSTEMLEN && charLen > 0 &&
	115	(is_unicode_letter(c) \|\| (is_unicode_digit(c) &&
	116	++numeric <= MAXNUMERIC))) {
	117	AddNChar (here, el.text, charLen);
	118	length += charLen;
	119	charLen = parse_utf8_char (&*here, endMinus1, &c);
	120	}
	121
	122	return (!el.text.empty());
	123	}
	124	static bool ParseTerm (UCArray::const_iterator &here,
	125	UCArray::const_iterator end,
	126	UCArray &text) {
[12318]	127	if (here == end)
	128	return false;
	129
[3365]	130	//UCArray::const_iterator endMinus1 = end-1;
[12318]	131	const unsigned char* endMinus1 = &*(end - 1);
[3365]	132	const unsigned char* new_here = ParseIndexWord (&*here, endMinus1, text);
	133	here += (new_here - &*here); // advance iterator by number of chars advanced
	134	return !text.empty();
	135	}
	136
	137
	138	bool ParseLexEl (UCArray::const_iterator &here,
	139	UCArray::const_iterator end,
	140	LexEl &el) {
	141	el.Clear();
	142
	143	// strange things can happen if here == end == 0
[12318]	144	if (here == end)
	145	return false;
[3365]	146
	147	// this version of end is used in unitool
	148	//UCArray::const_iterator endMinus1 = end-1;
[12318]	149	const unsigned char* endMinus1 = &*(end - 1);
[3365]	150
	151	// ignore all white space
	152	int charLen;
	153	unsigned short c; // one character lookahead
	154	charLen = parse_utf8_char (&*here, endMinus1, &c);
	155	while (here != end && is_unicode_space (c)) {
	156	here += charLen;
[18340]	157	if (here == end) break;
[3365]	158	charLen = parse_utf8_char (&*here, endMinus1, &c);
	159	}
	160	if (here == end) return false;
	161
	162	if (c == '(') {
	163	el.lexType = OpenBracketE;
	164	AddNChar (here, el.text, charLen);
	165	return true;
	166
	167	} else if (c == ')') {
	168	el.lexType = CloseBracketE;
	169	AddNChar (here, el.text, charLen);
	170	return true;
	171
	172	} else if (c =='[') {
	173	el.lexType = OpenSquareBracketE;
	174	AddNChar (here, el.text, charLen);
	175	return true;
	176
	177	} else if (c ==']') {
	178	el.lexType = CloseSquareBracketE;
	179	AddNChar (here, el.text, charLen);
	180	return true;
	181
	182	} else if (c == '\"') {
	183	el.lexType = QuoteE;
	184	AddNChar (here, el.text, charLen);
	185	return true;
	186
	187	} else if (c == '/') {
	188	el.lexType = TermWeightE;
	189	AddNChar (here, el.text, charLen);
	190	return true;
	191
	192	} else if (c == '#') {
	193	el.lexType = StemMethodE;
	194	AddNChar (here, el.text, charLen);
	195	return true;
	196
[8242]	197	} else if (c == '*') {
	198	el.lexType = StarE;
	199	AddNChar (here, el.text, charLen);
	200	return true;
	201
[3365]	202	} else if (c == '^') {
	203	el.lexType = RangeE;
	204	AddNChar (here, el.text, charLen);
	205	return true;
	206
	207	} else if (c == '@') {
	208	el.lexType = AtE;
	209	AddNChar (here, el.text, charLen);
	210	return true;
	211
	212	} else if (c == ':') {
	213	el.lexType = TagE;
	214	AddNChar (here, el.text, charLen);
	215	return true;
	216
	217	} else if (c=='&') {
	218	el.lexType = AndOpE;
	219	AddNChar (here, el.text, charLen);
	220	return true;
	221
	222	} else if (c == '\|') {
	223	el.lexType = OrOpE;
	224	AddNChar (here, el.text, charLen);
	225	return true;
	226
	227	} else if (c == '!') {
	228	el.lexType = NotOpE;
	229	AddNChar (here, el.text, charLen);
	230	return true;
	231
	232	} else if (c == '+' \|\| c == '-' ) {
	233	return ParseInteger (here, end, el);
	234	}
	235
	236	else if (c >= '0' && c <= '9') {
	237	return ParsePotentialInteger (here, end, el);
	238	}
	239
	240	// assume it is a term of some sort
[5449]	241	if (!ParseTerm (here, end, el.text)) {
	242	// parse term returns false if it hasn't parsed anything that is a term
	243	// here should be the same as it was before
	244	el.lexType = UnknownE;
	245	AddNChar (here, el.text, charLen);
	246	return true;
	247	}
	248	//return false;
[3365]	249
[8692]	250	//UCArray AND; SetCStr (AND, "AND");
	251	//if (el.text == AND) {
	252	if (UCArrayCStrEquals(el.text, "AND")) {
[3365]	253	el.lexType = AndOpE;
	254	return true;
	255	}
[8692]	256	//UCArray OR; SetCStr (OR, "OR");
	257	//if (el.text == OR) {
	258	if (UCArrayCStrEquals(el.text, "OR")) {
[3365]	259	el.lexType = OrOpE;
	260	return true;
	261	}
[8692]	262	//UCArray NOT; SetCStr (NOT, "NOT");
	263	//if (el.text == NOT) {
	264	if (UCArrayCStrEquals(el.text, "NOT")) {
[3365]	265	el.lexType = NotOpE;
	266	return true;
	267	}
[8692]	268	UCArray NEAR; SetCStr (NEAR, "NEAR", 4);
[3365]	269	if (PrefixLen(el.text, NEAR)==4) {
	270	el.lexType = NearOpE;
	271	return true;
	272	}
[8692]	273	UCArray WITHIN; SetCStr (WITHIN, "WITHIN", 6);
[6119]	274	if (PrefixLen(el.text, WITHIN)==6) {
	275	el.lexType = WithinOpE;
	276	return true;
	277	}
	278
[3365]	279	el.lexType = TermE;
	280	return true;
	281	}
	282

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/branches/64_bit_Greenstone/greenstone2/common-src/indexers/mgpp/text/GSDLQueryLex.cpp@ 25139

Download in other formats: