Context Navigation

source: trunk/gsdl/src/mgpp/text/GSDLQueryLex.cpp@ 8691

Last change on this file since 8691 was 8691, checked in by kjdon, 20 years ago
Added the changes from Emanuel Dejanu (Simple Words) - mostly efficiency changes. For example, changing i++ to ++i, delete xxx to delete []xxx, some stuff to do with UCArrays...
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 7.3 KB

Rev	Line
[1127]	1	/**************************************************************************
	2	*
	3	* GSDLQueryLex.cpp -- Lexical analyser for a simple query language
	4	* Copyright (C) 2000 Rodger McNab
	5	*
	6	* This program is free software; you can redistribute it and/or modify
	7	* it under the terms of the GNU General Public License as published by
	8	* the Free Software Foundation; either version 2 of the License, or
	9	* (at your option) any later version.
	10	*
	11	* This program is distributed in the hope that it will be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	* GNU General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU General Public License
	17	* along with this program; if not, write to the Free Software
	18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	19	*
	20	**************************************************************************/
	21
	22	#include "GSDLQueryLex.h"
	23	#include "unitool.h"
	24	#include "words.h"
	25
	26	inline void AddNChar (UCArray::const_iterator &here,
	27	UCArray &text,
	28	int len) {
[8691]	29	if (text.capacity() < text.size() + len + 1) {
	30	text.reserve(text.size() + len + 1);
	31	}
[1127]	32	while (len > 0) {
	33	text.push_back (*here++);
[8691]	34	--len;
[1127]	35	}
	36	}
	37
	38	static bool ParseInteger (UCArray::const_iterator &here,
	39	UCArray::const_iterator end,
	40	LexEl &el) {
	41	el.Clear();
	42
	43	// this version of end is used in unitool
[3008]	44	// UCArray::const_iterator endMinus1 = end-1;
	45	const unsigned char* endMinus1 = &(*end)-1;
[1127]	46
	47	int charLen;
	48	unsigned short c; // one character lookahead
[3008]	49	charLen = parse_utf8_char (&*here, endMinus1, &c);
[1127]	50
	51	// check for positive or negative
	52	bool neg = false;
	53	if (c == '+') {
	54	AddNChar (here, el.text, charLen);
[3008]	55	charLen = parse_utf8_char (&*here, endMinus1, &c);
[1127]	56	} else if (c == '-') {
	57	neg = true;
	58	AddNChar (here, el.text, charLen);
[3008]	59	charLen = parse_utf8_char (&*here, endMinus1, &c);
[1127]	60	}
	61
	62	// read in number part
[2693]	63	int numeric=0;
[1127]	64	el.num = 0;
	65	el.lexType = IntegerE;
[2693]	66	/* stop integers at 4 digits */
	67	while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
[1127]	68	el.num = el.num*10 + c - '0';
	69	AddNChar (here, el.text, charLen);
[3008]	70	charLen = parse_utf8_char (&*here, endMinus1, &c);
[1127]	71	}
	72
	73	if (neg) el.num *= -1;
	74
	75	return (!el.text.empty());
	76	}
	77
[2693]	78	static bool ParsePotentialInteger(UCArray::const_iterator &here,
	79	UCArray::const_iterator end,
	80	LexEl &el) {
	81	el.Clear();
	82
	83	// this version of end is used in unitool
[3008]	84	//UCArray::const_iterator endMinus1 = end-1;
	85	const unsigned char* endMinus1 = &(*end)-1;
[2693]	86
	87	int charLen=0;
	88	int length=0;
	89	unsigned short c; // one character lookahead
[3008]	90	charLen = parse_utf8_char (&*here, endMinus1, &c);
[2693]	91
	92	// read in number part
	93	int numeric=0;
	94	el.num = 0;
	95	el.lexType = IntegerE;
	96	/* stop integers at 4 digits */
	97	while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
	98	el.num = el.num*10 + c - '0';
	99	AddNChar (here, el.text, charLen);
	100	length += charLen;
[3008]	101	charLen = parse_utf8_char (&*here, endMinus1, &c);
[2693]	102	}
	103	// check the next character -if it is a letter, then have a term, not an integer
	104	if (!is_unicode_letter(c)) {
	105	// this was just an integer
	106	return (!el.text.empty());
	107	}
	108	// else its a term
	109	el.lexType = TermE;
	110	el.num = 0;
	111	/* this bit taken from ParseIndexWord in words.h*/
	112	while (length+charLen<=MAXSTEMLEN && charLen > 0 &&
	113	(is_unicode_letter(c) \|\| (is_unicode_digit(c) &&
	114	++numeric <= MAXNUMERIC))) {
	115	AddNChar (here, el.text, charLen);
	116	length += charLen;
[3008]	117	charLen = parse_utf8_char (&*here, endMinus1, &c);
[2693]	118	}
	119
	120	return (!el.text.empty());
	121	}
[1127]	122	static bool ParseTerm (UCArray::const_iterator &here,
	123	UCArray::const_iterator end,
	124	UCArray &text) {
[3008]	125	//UCArray::const_iterator endMinus1 = end-1;
	126	const unsigned char* endMinus1 = &(*end)-1;
	127	const unsigned char* new_here = ParseIndexWord (&*here, endMinus1, text);
	128	here += (new_here - &*here); // advance iterator by number of chars advanced
[1127]	129	return !text.empty();
	130	}
	131
	132
	133	bool ParseLexEl (UCArray::const_iterator &here,
	134	UCArray::const_iterator end,
	135	LexEl &el) {
	136	el.Clear();
	137
	138	// strange things can happen if here == end == 0
	139	if (here == end) return false;
	140
	141	// this version of end is used in unitool
[3008]	142	//UCArray::const_iterator endMinus1 = end-1;
	143	const unsigned char* endMinus1 = &(*end)-1;
	144
[1127]	145	// ignore all white space
	146	int charLen;
	147	unsigned short c; // one character lookahead
[3008]	148	charLen = parse_utf8_char (&*here, endMinus1, &c);
[1127]	149	while (here != end && is_unicode_space (c)) {
	150	here += charLen;
[3008]	151	charLen = parse_utf8_char (&*here, endMinus1, &c);
[1127]	152	}
	153	if (here == end) return false;
	154
	155	if (c == '(') {
	156	el.lexType = OpenBracketE;
	157	AddNChar (here, el.text, charLen);
	158	return true;
	159
	160	} else if (c == ')') {
	161	el.lexType = CloseBracketE;
	162	AddNChar (here, el.text, charLen);
	163	return true;
	164
	165	} else if (c =='[') {
	166	el.lexType = OpenSquareBracketE;
	167	AddNChar (here, el.text, charLen);
	168	return true;
	169
	170	} else if (c ==']') {
	171	el.lexType = CloseSquareBracketE;
	172	AddNChar (here, el.text, charLen);
	173	return true;
	174
	175	} else if (c == '\"') {
	176	el.lexType = QuoteE;
	177	AddNChar (here, el.text, charLen);
	178	return true;
	179
	180	} else if (c == '/') {
	181	el.lexType = TermWeightE;
	182	AddNChar (here, el.text, charLen);
	183	return true;
	184
	185	} else if (c == '#') {
	186	el.lexType = StemMethodE;
	187	AddNChar (here, el.text, charLen);
	188	return true;
	189
[8244]	190	} else if (c == '*') {
	191	el.lexType = StarE;
	192	AddNChar (here, el.text, charLen);
	193	return true;
	194
[1127]	195	} else if (c == '^') {
	196	el.lexType = RangeE;
	197	AddNChar (here, el.text, charLen);
	198	return true;
	199
	200	} else if (c == '@') {
	201	el.lexType = AtE;
	202	AddNChar (here, el.text, charLen);
	203	return true;
	204
	205	} else if (c == ':') {
	206	el.lexType = TagE;
	207	AddNChar (here, el.text, charLen);
	208	return true;
	209
	210	} else if (c=='&') {
	211	el.lexType = AndOpE;
	212	AddNChar (here, el.text, charLen);
	213	return true;
	214
	215	} else if (c == '\|') {
	216	el.lexType = OrOpE;
	217	AddNChar (here, el.text, charLen);
	218	return true;
	219
	220	} else if (c == '!') {
	221	el.lexType = NotOpE;
	222	AddNChar (here, el.text, charLen);
	223	return true;
	224
[2693]	225	} else if (c == '+' \|\| c == '-' ) {
	226	return ParseInteger (here, end, el);
[1127]	227	}
	228
[2693]	229	else if (c >= '0' && c <= '9') {
	230	return ParsePotentialInteger (here, end, el);
	231	}
	232
[1127]	233	// assume it is a term of some sort
[6121]	234	if (!ParseTerm (here, end, el.text)) {
[5448]	235	// parse term returns false if it hasn't parsed anything that is a term
	236	// here should be the same as it was before
	237	el.lexType = UnknownE;
	238	AddNChar (here, el.text, charLen);
	239	return true;
	240	}
	241	//return false;
[1127]	242
[8691]	243	//UCArray AND; SetCStr (AND, "AND");
	244	//if (el.text == AND) {
	245	if (UCArrayCStrEquals(el.text, "AND")) {
[1127]	246	el.lexType = AndOpE;
	247	return true;
	248	}
[8691]	249	//UCArray OR; SetCStr (OR, "OR");
	250	//if (el.text == OR) {
	251	if (UCArrayCStrEquals(el.text, "OR")) {
[1127]	252	el.lexType = OrOpE;
	253	return true;
	254	}
[8691]	255	//UCArray NOT; SetCStr (NOT, "NOT");
	256	//if (el.text == NOT) {
	257	if (UCArrayCStrEquals(el.text, "NOT")) {
[1127]	258	el.lexType = NotOpE;
	259	return true;
	260	}
[8691]	261	UCArray NEAR; SetCStr (NEAR, "NEAR", 4);
[1127]	262	if (PrefixLen(el.text, NEAR)==4) {
	263	el.lexType = NearOpE;
	264	return true;
	265	}
[8691]	266	UCArray WITHIN; SetCStr (WITHIN, "WITHIN", 6);
[6121]	267	if (PrefixLen(el.text, WITHIN)==6) {
	268	el.lexType = WithinOpE;
	269	return true;
	270	}
	271
[1127]	272	el.lexType = TermE;
	273	return true;
	274	}
	275

Note: See TracBrowser for help on using the repository browser.

Download in other formats: