Context Navigation

source: trunk/mgpp/text/GSDLQueryLex.cpp@ 5638

Last change on this file since 5638 was 5449, checked in by kjdon, 21 years ago
added a new token type to the Lex module - UnknownE. this will match any unrecognisable char - can now just skip over unknown chars rather than spitting the dummy and stopping the parsing
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 6.9 KB

Line
1	/**************************************************************************
2	*
3	* GSDLQueryLex.cpp -- Lexical analyser for a simple query language
4	* Copyright (C) 2000 Rodger McNab
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	**************************************************************************/
21
22	#include "GSDLQueryLex.h"
23	#include "unitool.h"
24	#include "words.h"
25
26	inline void AddNChar (UCArray::const_iterator &here,
27	UCArray &text,
28	int len) {
29	while (len > 0) {
30	text.push_back (*here++);
31	len--;
32	}
33	}
34
35	static bool ParseInteger (UCArray::const_iterator &here,
36	UCArray::const_iterator end,
37	LexEl &el) {
38	el.Clear();
39
40	// this version of end is used in unitool
41	// UCArray::const_iterator endMinus1 = end-1;
42	const unsigned char* endMinus1 = &(*end)-1;
43
44	int charLen;
45	unsigned short c; // one character lookahead
46	charLen = parse_utf8_char (&*here, endMinus1, &c);
47
48	// check for positive or negative
49	bool neg = false;
50	if (c == '+') {
51	AddNChar (here, el.text, charLen);
52	charLen = parse_utf8_char (&*here, endMinus1, &c);
53	} else if (c == '-') {
54	neg = true;
55	AddNChar (here, el.text, charLen);
56	charLen = parse_utf8_char (&*here, endMinus1, &c);
57	}
58
59	// read in number part
60	int numeric=0;
61	el.num = 0;
62	el.lexType = IntegerE;
63	/* stop integers at 4 digits */
64	while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
65	el.num = el.num*10 + c - '0';
66	AddNChar (here, el.text, charLen);
67	charLen = parse_utf8_char (&*here, endMinus1, &c);
68	}
69
70	if (neg) el.num *= -1;
71
72	return (!el.text.empty());
73	}
74
75	static bool ParsePotentialInteger(UCArray::const_iterator &here,
76	UCArray::const_iterator end,
77	LexEl &el) {
78	el.Clear();
79
80	// this version of end is used in unitool
81	//UCArray::const_iterator endMinus1 = end-1;
82	const unsigned char* endMinus1 = &(*end)-1;
83
84	int charLen=0;
85	int length=0;
86	unsigned short c; // one character lookahead
87	charLen = parse_utf8_char (&*here, endMinus1, &c);
88
89	// read in number part
90	int numeric=0;
91	el.num = 0;
92	el.lexType = IntegerE;
93	/* stop integers at 4 digits */
94	while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
95	el.num = el.num*10 + c - '0';
96	AddNChar (here, el.text, charLen);
97	length += charLen;
98	charLen = parse_utf8_char (&*here, endMinus1, &c);
99	}
100	// check the next character -if it is a letter, then have a term, not an integer
101	if (!is_unicode_letter(c)) {
102	// this was just an integer
103	return (!el.text.empty());
104	}
105	// else its a term
106	el.lexType = TermE;
107	el.num = 0;
108	/* this bit taken from ParseIndexWord in words.h*/
109	while (length+charLen<=MAXSTEMLEN && charLen > 0 &&
110	(is_unicode_letter(c) \|\| (is_unicode_digit(c) &&
111	++numeric <= MAXNUMERIC))) {
112	AddNChar (here, el.text, charLen);
113	length += charLen;
114	charLen = parse_utf8_char (&*here, endMinus1, &c);
115	}
116
117	return (!el.text.empty());
118	}
119	static bool ParseTerm (UCArray::const_iterator &here,
120	UCArray::const_iterator end,
121	UCArray &text) {
122	//UCArray::const_iterator endMinus1 = end-1;
123	const unsigned char* endMinus1 = &(*end)-1;
124	const unsigned char* new_here = ParseIndexWord (&*here, endMinus1, text);
125	here += (new_here - &*here); // advance iterator by number of chars advanced
126	return !text.empty();
127	}
128
129
130	bool ParseLexEl (UCArray::const_iterator &here,
131	UCArray::const_iterator end,
132	LexEl &el) {
133	el.Clear();
134
135	// strange things can happen if here == end == 0
136	if (here == end) return false;
137
138	// this version of end is used in unitool
139	//UCArray::const_iterator endMinus1 = end-1;
140	const unsigned char* endMinus1 = &(*end)-1;
141
142	// ignore all white space
143	int charLen;
144	unsigned short c; // one character lookahead
145	charLen = parse_utf8_char (&*here, endMinus1, &c);
146	while (here != end && is_unicode_space (c)) {
147	here += charLen;
148	charLen = parse_utf8_char (&*here, endMinus1, &c);
149	}
150	if (here == end) return false;
151
152	if (c == '(') {
153	el.lexType = OpenBracketE;
154	AddNChar (here, el.text, charLen);
155	return true;
156
157	} else if (c == ')') {
158	el.lexType = CloseBracketE;
159	AddNChar (here, el.text, charLen);
160	return true;
161
162	} else if (c =='[') {
163	el.lexType = OpenSquareBracketE;
164	AddNChar (here, el.text, charLen);
165	return true;
166
167	} else if (c ==']') {
168	el.lexType = CloseSquareBracketE;
169	AddNChar (here, el.text, charLen);
170	return true;
171
172	} else if (c == '\"') {
173	el.lexType = QuoteE;
174	AddNChar (here, el.text, charLen);
175	return true;
176
177	} else if (c == '/') {
178	el.lexType = TermWeightE;
179	AddNChar (here, el.text, charLen);
180	return true;
181
182	} else if (c == '#') {
183	el.lexType = StemMethodE;
184	AddNChar (here, el.text, charLen);
185	return true;
186
187	} else if (c == '^') {
188	el.lexType = RangeE;
189	AddNChar (here, el.text, charLen);
190	return true;
191
192	} else if (c == '@') {
193	el.lexType = AtE;
194	AddNChar (here, el.text, charLen);
195	return true;
196
197	} else if (c == ':') {
198	el.lexType = TagE;
199	AddNChar (here, el.text, charLen);
200	return true;
201
202	} else if (c=='&') {
203	el.lexType = AndOpE;
204	AddNChar (here, el.text, charLen);
205	return true;
206
207	} else if (c == '\|') {
208	el.lexType = OrOpE;
209	AddNChar (here, el.text, charLen);
210	return true;
211
212	} else if (c == '!') {
213	el.lexType = NotOpE;
214	AddNChar (here, el.text, charLen);
215	return true;
216
217	} else if (c == '+' \|\| c == '-' ) {
218	return ParseInteger (here, end, el);
219	}
220
221	else if (c >= '0' && c <= '9') {
222	return ParsePotentialInteger (here, end, el);
223	}
224
225	// assume it is a term of some sort
226	if (!ParseTerm (here, end, el.text)) {
227	// parse term returns false if it hasn't parsed anything that is a term
228	// here should be the same as it was before
229	el.lexType = UnknownE;
230	AddNChar (here, el.text, charLen);
231	return true;
232	}
233	//return false;
234
235	UCArray AND; SetCStr (AND, "AND");
236	if (el.text == AND) {
237	el.lexType = AndOpE;
238	return true;
239	}
240	UCArray OR; SetCStr (OR, "OR");
241	if (el.text == OR) {
242	el.lexType = OrOpE;
243	return true;
244	}
245	UCArray NOT; SetCStr (NOT, "NOT");
246	if (el.text == NOT) {
247	el.lexType = NotOpE;
248	return true;
249	}
250	UCArray NEAR; SetCStr (NEAR, "NEAR");
251	if (PrefixLen(el.text, NEAR)==4) {
252	el.lexType = NearOpE;
253	return true;
254	}
255	el.lexType = TermE;
256	return true;
257	}
258

Note: See TracBrowser for help on using the repository browser.

Download in other formats: