/**************************************************************************
 *
 * GSDLQueryLex.cpp -- Lexical analyser for a simple query language
 * Copyright (C) 2000  Rodger McNab
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 **************************************************************************/

#include "GSDLQueryLex.h"
#include "unitool.h"
#include "words.h"


// Copies the next `len` bytes starting at `here` onto the end of `text`
// and advances `here` past them.  The caller is responsible for making
// sure that at least `len` bytes are available at `here`.
inline void AddNChar (UCArray::const_iterator &here,
                      UCArray &text,
                      int len) {
  if (text.capacity() < text.size() + len + 1) {
    text.reserve(text.size() + len + 1);
  }
  while (len > 0) {
    text.push_back (*here++);
    --len;
  }
}

// One character lookahead: decodes the UTF-8 character at `here` into `c`
// without consuming it and returns its length in bytes.
//
// When `here` has reached `end` nothing is dereferenced; `c` is set to 0
// and 0 is returned.  `c` is also reset to 0 whenever the decoder reports
// a non-positive length, so callers never act on a stale or uninitialised
// lookahead character.
static int PeekChar (UCArray::const_iterator here,
                     UCArray::const_iterator end,
                     unsigned short &c) {
  if (here == end) {
    c = 0;
    return 0;
  }

  // the unitool routines are handed a pointer to the last byte of the
  // input rather than one past it
  const unsigned char *endMinus1 = &*(end - 1);
  const int charLen = parse_utf8_char (&*here, endMinus1, &c);
  if (charLen <= 0) {
    c = 0;
    return 0;
  }
  return charLen;
}

// Parses an optionally signed integer starting at `here`.
//
// On return el.lexType is IntegerE, el.text holds the characters that were
// consumed (including any sign), el.num holds the value and `here` points
// just past the last consumed character.  At most MAXNUMERIC digits are
// consumed (the original comment says this stops integers at 4 digits;
// the constant is defined outside this file).
//
// Returns true if at least one character was consumed.  Note that a sign
// on its own is accepted and yields an IntegerE whose value is 0.
static bool ParseInteger (UCArray::const_iterator &here,
                          UCArray::const_iterator end,
                          LexEl &el) {
  el.Clear();

  unsigned short c = 0;  // one character lookahead
  int charLen = PeekChar (here, end, c);

  // check for positive or negative
  bool neg = false;
  if (c == '+' || c == '-') {
    neg = (c == '-');
    AddNChar (here, el.text, charLen);
    charLen = PeekChar (here, end, c);
  }

  // read in number part; charLen > 0 stops the loop at the end of the
  // input instead of looking at bytes past it
  int numeric = 0;
  el.num = 0;
  el.lexType = IntegerE;
  while (charLen > 0 && c >= '0' && c <= '9' && ++numeric <= MAXNUMERIC) {
    el.num = el.num*10 + c - '0';
    AddNChar (here, el.text, charLen);
    charLen = PeekChar (here, end, c);
  }

  if (neg) el.num *= -1;

  return (!el.text.empty());
}

// Parses a token that starts with a digit.  This is either an integer or,
// if the digits are directly followed by a letter, a term.
//
// For an integer el.lexType is IntegerE and el.num holds the value.  For a
// term el.lexType is TermE and el.num is 0.  In both cases el.text holds
// the consumed characters and `here` is advanced past them.
//
// Returns true if at least one character was consumed.
static bool ParsePotentialInteger (UCArray::const_iterator &here,
                                   UCArray::const_iterator end,
                                   LexEl &el) {
  el.Clear();

  unsigned short c = 0;  // one character lookahead
  int charLen = PeekChar (here, end, c);
  int length = 0;        // bytes consumed so far

  // read in number part (at most MAXNUMERIC digits)
  int numeric = 0;
  el.num = 0;
  el.lexType = IntegerE;
  while (charLen > 0 && c >= '0' && c <= '9' && ++numeric <= MAXNUMERIC) {
    el.num = el.num*10 + c - '0';
    AddNChar (here, el.text, charLen);
    length += charLen;
    // PeekChar never reads past the end of the input, so the lookahead
    // can always be refreshed; this keeps the letter test below looking
    // at the character that really follows the digits
    charLen = PeekChar (here, end, c);
  }

  // check the next character - if it is a letter, then have a term,
  // not an integer
  if (charLen <= 0 || !is_unicode_letter(c)) {
    // this was just an integer
    return (!el.text.empty());
  }

  // else its a term
  el.lexType = TermE;
  el.num = 0;

  // this bit taken from ParseIndexWord in words.h: keep consuming letters,
  // and digits while the digit count stays within MAXNUMERIC, until the
  // term would become longer than MAXSTEMLEN bytes
  while (length+charLen <= MAXSTEMLEN && charLen > 0 &&
         (is_unicode_letter(c) || (is_unicode_digit(c) &&
                                   ++numeric <= MAXNUMERIC))) {
    AddNChar (here, el.text, charLen);
    length += charLen;
    charLen = PeekChar (here, end, c);
  }

  return (!el.text.empty());
}

// Parses a term starting at `here` by handing the input to ParseIndexWord
// (words.h), which fills `text`.  `here` is advanced by the number of bytes
// ParseIndexWord consumed.
//
// Returns true if `text` is non-empty afterwards, false if the input was
// already exhausted or nothing was parsed.
static bool ParseTerm (UCArray::const_iterator &here,
                       UCArray::const_iterator end,
                       UCArray &text) {
  if (here == end) return false;

  // ParseIndexWord works on raw pointers and wants the last valid byte
  const unsigned char *start = &*here;
  const unsigned char *endMinus1 = &*(end - 1);
  const unsigned char *new_here = ParseIndexWord (start, endMinus1, text);

  here += (new_here - start);  // advance iterator by number of chars advanced
  return !text.empty();
}

// Reads the next lexical element from [here, end) into `el`, skipping any
// leading white space, and advances `here` past the element.
//
// Returns false if there is no further element (the input was empty or
// contained only white space); otherwise returns the result of the
// matching sub-parser, or true.  el.lexType is set as follows:
//   - one of the single-character token types for ( ) [ ] " / # * ^ @ : & | !
//   - IntegerE for a token starting with '+' or '-'
//   - IntegerE or TermE for a token starting with a digit
//   - AndOpE / OrOpE / NotOpE for the words AND / OR / NOT
//   - NearOpE / WithinOpE for any word that begins with NEAR / WITHIN
//   - TermE for any other word
//   - UnknownE for a character that cannot start a term; that single
//     character is consumed
bool ParseLexEl (UCArray::const_iterator &here,
                 UCArray::const_iterator end,
                 LexEl &el) {
  el.Clear();

  // strange things can happen if here == end == 0
  if (here == end) return false;

  // ignore all white space
  unsigned short c = 0;  // one character lookahead
  int charLen = PeekChar (here, end, c);
  while (charLen > 0 && is_unicode_space (c)) {
    here += charLen;
    charLen = PeekChar (here, end, c);
  }
  if (here == end) return false;

  // single character tokens
  bool singleChar = true;
  switch (c) {
  case '(':  el.lexType = OpenBracketE;        break;
  case ')':  el.lexType = CloseBracketE;       break;
  case '[':  el.lexType = OpenSquareBracketE;  break;
  case ']':  el.lexType = CloseSquareBracketE; break;
  case '\"': el.lexType = QuoteE;              break;
  case '/':  el.lexType = TermWeightE;         break;
  case '#':  el.lexType = StemMethodE;         break;
  case '*':  el.lexType = StarE;               break;
  case '^':  el.lexType = RangeE;              break;
  case '@':  el.lexType = AtE;                 break;
  case ':':  el.lexType = TagE;                break;
  case '&':  el.lexType = AndOpE;              break;
  case '|':  el.lexType = OrOpE;               break;
  case '!':  el.lexType = NotOpE;              break;
  default:   singleChar = false;               break;
  }
  if (singleChar) {
    AddNChar (here, el.text, charLen);
    return true;
  }

  // numbers (and terms that start with a digit)
  if (c == '+' || c == '-') {
    return ParseInteger (here, end, el);
  }
  if (c >= '0' && c <= '9') {
    return ParsePotentialInteger (here, end, el);
  }

  // assume it is a term of some sort
  if (!ParseTerm (here, end, el.text)) {
    // parse term returns false if it hasn't parsed anything that is a term
    // here should be the same as it was before
    el.lexType = UnknownE;
    // defensive: always consume at least one byte so that a caller looping
    // over the input makes progress even if the decoder reported no length
    AddNChar (here, el.text, (charLen > 0 ? charLen : 1));
    return true;
  }

  // boolean operators written out as words
  if (UCArrayCStrEquals(el.text, "AND")) {
    el.lexType = AndOpE;
    return true;
  }
  if (UCArrayCStrEquals(el.text, "OR")) {
    el.lexType = OrOpE;
    return true;
  }
  if (UCArrayCStrEquals(el.text, "NOT")) {
    el.lexType = NotOpE;
    return true;
  }

  // NEAR and WITHIN only have to match as a prefix of the term; whatever
  // follows the keyword stays in el.text
  UCArray NEAR; SetCStr (NEAR, "NEAR", 4);
  if (PrefixLen(el.text, NEAR)==4) {
    el.lexType = NearOpE;
    return true;
  }
  UCArray WITHIN; SetCStr (WITHIN, "WITHIN", 6);
  if (PrefixLen(el.text, WITHIN)==6) {
    el.lexType = WithinOpE;
    return true;
  }

  el.lexType = TermE;
  return true;
}