/************************************************************************** * * words.h -- Macros for parsing out words from the source text * Copyright (C) 1994 Neil Sharman * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * * $Id: words.h 13660 2007-01-16 23:13:03Z kjdon $ * **************************************************************************/ #include "sysfuncs.h" #include "unitool.h" /* * This has been cleaned up by Tim Shimmin. */ /* * ---NOTE--- * * "WORD" refers to a word in the compressed text. * "STEM" or "STEM_WORD" refers to a word for indexing on * */ #define MAXWORDLEN 15 /* Maximum length in bytes of any word or non-word. Note that variations to MAXWORDLEN may have dramatic effects on the rest of the program, as the length and the prefix match are packed together into a four bit nibble, and there is not check that this is possible, i.e., leave MAXWORDLEN alone... */ #define MAXSTEMLEN 255 /* Maximum length in bytes of any stem. Note that variations to MAXSTEMLEN may have dramatic effects on the rest of the program, , i.e., leave MAXSTEMLEN alone... */ /*#define MAXNUMERIC 4*/ /* Maximum number of numeric characters permitted in a word. This avoids long sequences of numbers creating just one word occurrence for each number. At most 10,000 all numeric words will be permitted. */ /* [RPAP - Jan 97: Stem Index Change] */ #define MAXPARAMLEN 20 /* Maximum number of bytes to read for a parameter value for a term in a query. */ #define WEIGHTPARAM '/' #define STEMPARAM '#' /* [RJM 07/97: Ranked Required Terms] */ #define MUSTMATCHPARAM '+' #define PESINAWORD(c) (isalnum(c) || ((c) >= 0x80 && (c) <= 0xff)) /* The definition of what characters are permitted in a word. This macro is pessimistic, you cannot tell from a particular byte above 0x80 whether it is a character or not. This function is needed by various functions relating to huffman coding where frequency counts need to be primed, it should not be used in parsing the UTF-8 encoded input. */ int inaword (const u_char *here, const u_char *end); /* Takes the place of the old INAWORD macro. It determines whether a given place in a UTF-8 encoded Unicode string is part of a word. */ int isaspace (const u_char *here, const u_char *end); /* It determines whether a given place in a UTF-8 encoded Unicode string is a unicode space. */ u_char *skipspace(u_char *here, u_char *end); /* Return a the UTF-8 encoded Unicode string with beginning unicode spaces skipped. */ /* ========================================================================= * Macro: PARSE_WORD * Description: * Extract a word out for compressing text * Input: * s_in = string start in buffer * end = string end in buffer * Output: * Word = extracted word with length in 1st byte * s_in = ptr to next character in buffer yet to be processed * ========================================================================= */ #define PARSE_WORD(Word, s_in, end) \ do { \ register int charlength = 0; \ register u_char *wptr = (Word)+1; \ register int length = 0; \ register int numeric = 0; \ unsigned short c; \ register int maxnumeric = IntEnv (GetEnv ("maxnumeric"), 4); \ \ charlength = parse_utf8_char((s_in),(end),&c); \ \ while (length+charlength <= MAXWORDLEN && charlength > 0 && \ (is_unicode_letter(c) || (is_unicode_digit(c) && \ ++numeric <= maxnumeric))) { \ while (charlength-- > 0) { \ *wptr++ = *(s_in)++; ++length; \ } \ charlength = parse_utf8_char((s_in),(end),&c); \ } \ *(Word) = length; \ }while(0) /* #define PARSE_WORD(Word, s_in, end) \ do { \ register u_char *wptr = (Word)+1; \ register int length = 0; \ register int c = *(s_in); \ register int numeric = 0; \ \ while( length < MAXWORDLEN && INAWORD(c) && (s_in)<=(end)) \ { \ if ((numeric += INNUMBER(c)) > MAXNUMERIC) \ break; \ *wptr++ = c; \ ++length; \ c = *++(s_in); \ } \ *(Word) = length; \ }while(0) */ /* ========================================================================= * Macro: PARSE_NON_WORD * Description: * Extract a non-word out for storing compressed text * Input: as above * Output: as above * ========================================================================= */ #define PARSE_NON_WORD(Word, s_in, end) \ do { \ register int charlength = 0; \ register u_char *wptr = (Word)+1; \ register int length = 0; \ unsigned short c; \ \ charlength = parse_utf8_char((s_in),(end),&c); \ \ while (length+charlength <= MAXWORDLEN && charlength > 0 && \ !is_unicode_letdig(c)) { \ while (charlength-- > 0) { \ *wptr++ = *(s_in)++; ++length; \ } \ charlength = parse_utf8_char((s_in),(end),&c); \ } \ *(Word) = length; \ }while(0) /* #define PARSE_NON_WORD(Word, s_in, end) \ do { \ register u_char *wptr = (Word)+1; \ register int length = 0; \ register int c = *(s_in); \ \ while( length < MAXWORDLEN && !INAWORD(c) && (s_in)<=(end) ) \ { \ *wptr++ = c; \ ++length; \ c = *++(s_in); \ } \ *(Word) = length; \ }while(0) */ /* ========================================================================= * Macro: PARSE_STEM_WORD * Description: * Extracts out Word. * Input: * s_in points to 1st letter in buffer to test * end points to last letter in buffer * Output: * s_in is modified to move to next word * Returns Word filled in with length in 1st byte. * ========================================================================= */ #define PARSE_STEM_WORD(Word, s_in, end) \ do { \ register int charlength = 0; \ register u_char *wptr = (Word)+1; \ register int length = 0; \ register int numeric = 0; \ unsigned short c; \ register int maxnumeric = IntEnv (GetEnv ("maxnumeric"), 4); \ \ charlength = parse_utf8_char((s_in),(end),&c); \ \ while (length+charlength <= MAXSTEMLEN && charlength > 0 && \ (is_unicode_letter(c) || (is_unicode_digit(c) && \ ++numeric <= maxnumeric))) { \ while (charlength-- > 0) { \ *wptr++ = *(s_in)++; ++length; \ } \ charlength = parse_utf8_char((s_in),(end),&c); \ } \ *(Word) = length; \ }while(0) /* #define PARSE_STEM_WORD(Word, s_in, end) \ do \ { \ register u_char *wptr = (Word)+1; \ register int length = 0; \ register int c = *(s_in); \ register int numeric = 0; \ \ while ( length < MAXSTEMLEN && INAWORD(c) && (s_in)<=(end)) \ { \ if ((numeric += INNUMBER(c)) > MAXNUMERIC) \ break; \ *wptr++ = c; \ ++length; \ c = *++(s_in); \ } \ *(Word) = length; \ }while(0) */ /* ========================================================================= * Macro: PARSE_NON_STEM_WORD * Description: * Eat up non-word. Do not store non-word. * It is not needed in index only in text ! * * Input: as above but no Word needed * Output: as above * ========================================================================= */ #define PARSE_NON_STEM_WORD(s_in, end) \ do { \ register int charlength = 0; \ unsigned short c; \ \ charlength = parse_utf8_char((s_in),(end),&c); \ \ while (charlength > 0 && !is_unicode_letdig(c)) { \ (s_in) += charlength; \ charlength = parse_utf8_char((s_in),(end),&c); \ } \ }while(0) /* #define PARSE_NON_STEM_WORD(s_in, end) \ do \ { \ while (!INAWORD(*(s_in)) && (s_in)<=(end)) \ (s_in)++; \ }while(0) */ /* ========================================================================= * Macro: PARSE_NON_STEM_WORD_OR_SGML_TAG * Description: * Like PARSE_NON_STEM_WORD but also eats up SGML tags * Input: as above * Output: as above * ========================================================================= */ #define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end) \ do { \ register int charlength = 0; \ unsigned short c; \ \ charlength = parse_utf8_char((s_in),(end),&c); \ \ while (charlength > 0 && !is_unicode_letdig(c)) { \ if (c == '<') { \ while (charlength > 0 && c != '>') { \ (s_in) += charlength; \ charlength = parse_utf8_char((s_in),(end),&c); \ } \ } \ (s_in) += charlength; \ charlength = parse_utf8_char((s_in),(end),&c); \ } \ }while(0) /* #define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end) \ do \ { \ register int c = *(s_in); \ \ while (!INAWORD(c) && (s_in)<=(end)) \ { \ if (c == '<') \ { \ while (c != '>' && (s_in)<=(end)) \ c = *++(s_in); \ } \ if ((s_in)<=(end)) \ c = *++(s_in); \ } \ }while(0) */ /* ========================================================================= * Macro: PARSE_OPT_TERM_PARAM [RPAP - Jan 97: Stem Index Change] * Description: * Extracts out optional paramater for query term. * Needed only in parsing the query line ! * * Note: that this function has not been converted to use UTF-8 * as it should still work as it is (only uses ascii * characters) * * Input: as above but no Word needed * Output: as above * ========================================================================= */ #define PARSE_OPT_TERM_PARAM(Param, type, s_in, end) \ do \ { \ register u_char *wptr = (Param); \ register int length = 0; \ register int c = *(s_in); \ \ if (c == WEIGHTPARAM || c == STEMPARAM) \ { \ type = c; \ c = *++(s_in); \ while( length < MAXPARAMLEN && isdigit(c) && (s_in)<=(end)) \ { \ *wptr++ = c; \ ++length; \ c = *++(s_in); \ } \ *wptr = '\0'; \ for (; isdigit(c) && (s_in)<=(end); c = *++(s_in)) \ ; \ } \ }while(0) /* ========================================================================= * Macro: PARSE_RANKED_NON_STEM_WORD [RJM 07/97: Ranked Required Terms] * Description: * Eat up non-word. Do not store non-word. * If come across a match requirement store it in require_match * It is not needed in index only in text ! * * Input: as above * Output: the requirement mode for the next term. -1=must not match, * 0=optional match, 1=must match * ========================================================================= */ #define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end) \ do { \ register int charlength = 0; \ unsigned short c; \ (require_match) = 0; \ \ charlength = parse_utf8_char((s_in),(end),&c); \ \ while (charlength > 0 && !is_unicode_letdig(c)) { \ if (c == MUSTMATCHPARAM) (require_match) = 1; \ (s_in) += charlength; \ charlength = parse_utf8_char((s_in),(end),&c); \ } \ }while(0) /* #define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end) \ do { \ (require_match) = 0; \ while (!INAWORD(*(s_in)) && (s_in)<=(end)) { \ if (*(s_in) == MUSTMATCHPARAM) { \ (require_match) = 1; \ } \ (s_in)++; \ } \ } while (0) */