root/main/trunk/greenstone2/common-src/indexers/mgpp/text/GSDLQueryLex.cpp @ 25526

Revision 25526, 7.6 KB (checked in by ak19, 7 years ago)

Dr Bainbridge fixed the problem noticed by Diego, which was thought to have been fixed earlier. It had to do with searching for (3-digit) numbers; Diego in particular experienced the problem when using the ifl=1 argument (I Feel Lucky) for searching.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1/**************************************************************************
2 *
3 * GSDLQueryLex.cpp -- Lexical analyser for a simple query language
4 * Copyright (C) 2000  Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#include "GSDLQueryLex.h"
23#include "unitool.h"
24#include "words.h"
25
26inline void AddNChar (UCArray::const_iterator &here,
27              UCArray &text,
28              int len) {
29  if (text.capacity() < text.size() + len + 1) {
30    text.reserve(text.size() + len + 1);
31  }
32  while (len > 0) {
33    text.push_back (*here++);
34    --len;
35  }
36}
37
38static bool ParseInteger (UCArray::const_iterator &here,
39              UCArray::const_iterator end,
40              LexEl &el) {
41  el.Clear();
42
43  // this version of end is used in unitool
44  //  UCArray::const_iterator endMinus1 = end-1;
45  const unsigned char* endMinus1 = &*(end - 1);
46
47  int charLen;
48  unsigned short c; // one character lookahead
49  charLen = parse_utf8_char (&*here, endMinus1, &c);
50
51  // check for positive or negative
52  bool neg = false;
53  if (c == '+') {
54    AddNChar (here, el.text, charLen);
55    charLen = parse_utf8_char (&*here, endMinus1, &c);
56  } else if (c == '-') {
57    neg = true;
58    AddNChar (here, el.text, charLen);
59    charLen = parse_utf8_char (&*here, endMinus1, &c);
60  }
61
62  // read in number part
63  int numeric=0;
64  el.num = 0;
65  el.lexType = IntegerE;
66  /* stop integers at 4 digits */
67  while (c >= '0' && c <= '9'&& ++numeric<=MAXNUMERIC) {
68    el.num = el.num*10 + c - '0';
69    AddNChar (here, el.text, charLen);
70    charLen = parse_utf8_char (&*here, endMinus1, &c);
71  }
72
73  if (neg) el.num *= -1;
74
75  return (!el.text.empty());
76}
77
78static bool ParsePotentialInteger(UCArray::const_iterator &here,
79                  UCArray::const_iterator end,
80                  LexEl &el) {
81  el.Clear();
82
83  // this version of end is used in unitool
84  //UCArray::const_iterator endMinus1 = end-1;
85  const unsigned char* endMinus1 = &*(end - 1);
86
87  int charLen=0;
88  int length=0;
89  unsigned short c; // one character lookahead
90  charLen = parse_utf8_char (&*here, endMinus1, &c);
91
92  // read in number part
93  int numeric=0;
94  el.num = 0;
95  el.lexType = IntegerE;
96
97  /* stop integers at 4 digits */
98  while (here != end) {
99
100      charLen = parse_utf8_char (&*here, endMinus1, &c);
101      if (c < '0' || c > '9') {
102          // reached a non-digit character
103          break;
104      }
105      el.num = el.num*10 + c - '0';
106      AddNChar (here, el.text, charLen); // advances 'here' by 'charLen'
107      length += charLen;
108
109      numeric++;
110
111      if (numeric == MAXNUMERIC) {
112          // reached the max length of a number
113          break;
114      }
115
116  }
117
118
119  // check the next character -if it is a letter, then have a term, not an integer
120  if (!is_unicode_letter(c)) {
121    // this was just an integer
122    return (!el.text.empty());
123  }
124  // else its a term
125  el.lexType = TermE;
126  el.num = 0;
127  /* this bit taken from ParseIndexWord in words.h*/
128  while (length+charLen<=MAXSTEMLEN && charLen > 0 &&
129     (is_unicode_letter(c) || (is_unicode_digit(c) &&
130                   ++numeric <= MAXNUMERIC))) {
131    AddNChar (here, el.text, charLen);
132    length += charLen;
133    charLen = parse_utf8_char (&*here, endMinus1, &c);
134  }
135
136  return (!el.text.empty());
137}
138static bool ParseTerm (UCArray::const_iterator &here,
139               UCArray::const_iterator end,
140               UCArray &text) {
141  if (here == end)
142    return false;
143
144  //UCArray::const_iterator endMinus1 = end-1;
145  const unsigned char* endMinus1 = &*(end - 1);
146  const unsigned char* new_here = ParseIndexWord (&*here, endMinus1, text);
147  here += (new_here - &*here); // advance iterator by number of chars advanced
148  return !text.empty();
149}
150
151
152bool ParseLexEl (UCArray::const_iterator &here,
153         UCArray::const_iterator end,
154         LexEl &el) {
155  el.Clear();
156
157  // strange things can happen if here == end == 0
158  if (here == end)
159    return false;
160 
161  // this version of end is used in unitool
162  //UCArray::const_iterator endMinus1 = end-1;
163  const unsigned char* endMinus1 = &*(end - 1);
164
165  // ignore all white space
166  int charLen;
167  unsigned short c; // one character lookahead
168  charLen = parse_utf8_char (&*here, endMinus1, &c);
169  while (here != end && is_unicode_space (c)) {
170    here += charLen;
171    if (here == end) break;
172    charLen = parse_utf8_char (&*here, endMinus1, &c);
173  }
174  if (here == end) return false;
175
176  if (c == '(') {
177    el.lexType = OpenBracketE;
178    AddNChar (here, el.text, charLen);
179    return true;
180   
181  } else if (c == ')') {
182    el.lexType = CloseBracketE;
183    AddNChar (here, el.text, charLen);
184    return true;
185
186  } else if (c =='[') {
187    el.lexType = OpenSquareBracketE;
188    AddNChar (here, el.text, charLen);
189    return true;
190 
191  } else if (c ==']') {
192    el.lexType = CloseSquareBracketE;
193    AddNChar (here, el.text, charLen);
194    return true;
195 
196  } else if (c == '\"') {
197    el.lexType = QuoteE;
198    AddNChar (here, el.text, charLen);
199    return true;
200   
201  } else if (c == '/') {
202    el.lexType = TermWeightE;
203    AddNChar (here, el.text, charLen);
204    return true;
205   
206  } else if (c == '#') {
207    el.lexType = StemMethodE;
208    AddNChar (here, el.text, charLen);
209    return true;
210   
211  } else if (c == '*') {
212    el.lexType = StarE;
213    AddNChar (here, el.text, charLen);
214    return true;
215   
216  } else if (c == '^') {
217    el.lexType = RangeE;
218    AddNChar (here, el.text, charLen);
219    return true;
220   
221  } else if (c == '@') {
222    el.lexType = AtE;
223    AddNChar (here, el.text, charLen);
224    return true;
225   
226  } else if (c == ':') {
227    el.lexType = TagE;
228    AddNChar (here, el.text, charLen);
229    return true;
230   
231  } else if (c=='&') {
232    el.lexType = AndOpE;
233    AddNChar (here, el.text, charLen);
234    return true;
235   
236  } else if (c == '|') {
237    el.lexType = OrOpE;
238    AddNChar (here, el.text, charLen);
239    return true;
240   
241  } else if (c == '!') {
242    el.lexType = NotOpE;
243    AddNChar (here, el.text, charLen);
244    return true;
245   
246  } else if (c == '+' || c == '-' ) {
247    return  ParseInteger (here, end, el);
248  }
249
250  else if (c >= '0' && c <= '9') {
251    return ParsePotentialInteger (here, end, el);
252  }
253
254  // assume it is a term of some sort
255  if (!ParseTerm (here, end, el.text))  {
256    // parse term returns false if it hasn't parsed anything that is a term
257    // here should be the same as it was before
258    el.lexType = UnknownE;
259    AddNChar (here, el.text, charLen);
260    return true;
261  }
262  //return false;
263
264  //UCArray AND; SetCStr (AND, "AND");
265  //if (el.text == AND) {
266  if (UCArrayCStrEquals(el.text, "AND")) {
267    el.lexType = AndOpE;
268    return true;
269  }
270  //UCArray OR; SetCStr (OR, "OR");
271  //if (el.text == OR) {
272  if (UCArrayCStrEquals(el.text, "OR")) {
273    el.lexType = OrOpE;
274    return true;
275  }
276  //UCArray NOT; SetCStr (NOT, "NOT");
277  //if (el.text == NOT) {
278  if (UCArrayCStrEquals(el.text, "NOT")) {
279    el.lexType = NotOpE;
280    return true;
281  }
282  UCArray NEAR; SetCStr (NEAR, "NEAR", 4);
283  if (PrefixLen(el.text, NEAR)==4) {
284    el.lexType = NearOpE;
285    return true;
286  }
287  UCArray WITHIN; SetCStr (WITHIN, "WITHIN", 6);
288  if (PrefixLen(el.text, WITHIN)==6) {
289    el.lexType = WithinOpE;
290    return true;
291  }
292 
293  el.lexType = TermE;
294  return true;
295}
296
Note: See TracBrowser for help on using the browser.