[3745] | 1 | /**************************************************************************
|
---|
| 2 | *
|
---|
| 3 | * words.h -- Macros for parsing out words from the source text
|
---|
| 4 | * Copyright (C) 1994 Neil Sharman
|
---|
| 5 | *
|
---|
| 6 | * This program is free software; you can redistribute it and/or modify
|
---|
| 7 | * it under the terms of the GNU General Public License as published by
|
---|
| 8 | * the Free Software Foundation; either version 2 of the License, or
|
---|
| 9 | * (at your option) any later version.
|
---|
| 10 | *
|
---|
| 11 | * This program is distributed in the hope that it will be useful,
|
---|
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 14 | * GNU General Public License for more details.
|
---|
| 15 | *
|
---|
| 16 | * You should have received a copy of the GNU General Public License
|
---|
| 17 | * along with this program; if not, write to the Free Software
|
---|
| 18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 19 | *
|
---|
| 20 | * $Id: words.h 13660 2007-01-16 23:13:03Z kjdon $
|
---|
| 21 | *
|
---|
| 22 | **************************************************************************/
|
---|
| 23 |
|
---|
[7228] | 24 |
|
---|
[3745] | 25 | #include "sysfuncs.h"
|
---|
| 26 | #include "unitool.h"
|
---|
| 27 |
|
---|
| 28 | /*
|
---|
| 29 | * This has been cleaned up by Tim Shimmin.
|
---|
| 30 | */
|
---|
| 31 |
|
---|
| 32 | /*
|
---|
| 33 | * ---NOTE---
|
---|
| 34 | *
|
---|
| 35 | * "WORD" refers to a word in the compressed text.
|
---|
| 36 | * "STEM" or "STEM_WORD" refers to a word for indexing on
|
---|
| 37 | *
|
---|
| 38 | */
|
---|
| 39 |
|
---|
#define MAXWORDLEN 15
/* Maximum length in bytes of any word or non-word.  Note that
   variations to MAXWORDLEN may have dramatic effects on the rest
   of the program, as the length and the prefix match are packed
   together into a four bit nibble, i.e., leave MAXWORDLEN alone...
   The guard below turns a value that cannot fit in four bits into
   a build error instead of silent corruption. */
#if MAXWORDLEN > 15
#error "MAXWORDLEN must fit in a four bit nibble (at most 15)"
#endif

#define MAXSTEMLEN 255
/* Maximum length in bytes of any stem.  Note that
   variations to MAXSTEMLEN may have dramatic effects on the rest
   of the program, i.e., leave MAXSTEMLEN alone...
   PARSE_STEM_WORD stores the stem length in the single byte in
   front of the stem, so the guard below rejects values that a
   byte cannot hold. */
#if MAXSTEMLEN > 255
#error "MAXSTEMLEN must fit in the one byte length prefix (at most 255)"
#endif

/*#define MAXNUMERIC 4*/

/* Maximum number of numeric characters permitted in a word.
   This avoids long sequences of numbers creating just one
   word occurrence for each number.  With a limit of 4, at most
   10,000 all numeric words will be permitted.
   The compile-time constant above is disabled: PARSE_WORD and
   PARSE_STEM_WORD obtain the limit at run time from
   IntEnv (GetEnv ("maxnumeric"), 4) instead. */

/* [RPAP - Jan 97: Stem Index Change] */
#define MAXPARAMLEN 20
/* Maximum number of bytes to read for a parameter value for a
   term in a query.  PARSE_OPT_TERM_PARAM stores up to MAXPARAMLEN
   bytes followed by a terminating '\0', so parameter buffers need
   MAXPARAMLEN + 1 bytes. */
#define WEIGHTPARAM '/'
#define STEMPARAM '#'
/* The two marker characters that PARSE_OPT_TERM_PARAM accepts in
   front of a parameter value. */

/* [RJM 07/97: Ranked Required Terms] */
#define MUSTMATCHPARAM '+'
/* Marker character that makes PARSE_RANKED_NON_STEM_WORD set its
   require_match output to 1. */
|
---|
| 68 |
|
---|
| 69 |
|
---|
#define PESINAWORD(c) \
  (((c) >= 0x80 && (c) <= 0xff) || ((((c) & ~0x7f) == 0) && isalnum (c)))
/* The definition of what characters are permitted in a word.
   This macro is pessimistic, you cannot tell from a particular
   byte above 0x80 whether it is a character or not.  This function
   is needed by various functions relating to huffman coding
   where frequency counts need to be primed, it should not be
   used in parsing the UTF-8 encoded input.

   Evaluates to 1 for every value in 0x80..0xff and for ASCII
   letters and digits, and to 0 for everything else.  isalnum() is
   only reached for values in 0..0x7f, so negative or over-wide
   arguments yield 0 instead of undefined behaviour.  Pass bytes as
   unsigned values: a byte >= 0x80 held in a signed char is negative
   and is therefore not recognised.  The argument is expanded several
   times, so it must not have side effects. */
|
---|
| 77 |
|
---|
| 78 | int inaword (const u_char *here, const u_char *end);
|
---|
| 79 | /* Takes the place of the old INAWORD macro. It determines
|
---|
| 80 | whether a given place in a UTF-8 encoded Unicode string
|
---|
| 81 | is part of a word. here is the place to test and end bounds the string; NOTE(review): the exact return values and whether end is inclusive are set by the implementation, which is not visible in this header. */
|
---|
| 82 |
|
---|
[8694] | 83 | int isaspace (const u_char *here, const u_char *end);
|
---|
| 84 | /* Determines whether a given place in a UTF-8 encoded
|
---|
| 85 | Unicode string is a Unicode space. Takes the same here/end pair as inaword. */
|
---|
[13660] | 86 |
|
---|
[8694] | 87 | u_char *skipspace(u_char *here, u_char *end);
|
---|
[13660] | 88 | /* Returns the UTF-8 encoded Unicode string with its leading
|
---|
| 89 | Unicode spaces skipped (presumably a pointer into the same buffer -- confirm against the implementation). */
|
---|
| 90 |
|
---|
[8694] | 91 |
|
---|
/* =========================================================================
 * Macro: PARSE_WORD
 * Description:
 *      Copy the next word of the source text into Word for the text
 *      compressor.  Characters are decoded one at a time with
 *      parse_utf8_char(); copying goes on while the decoded character is
 *      a letter, or a digit that keeps the running digit count within
 *      the limit given by IntEnv (GetEnv ("maxnumeric"), 4), and while
 *      the complete character still fits within MAXWORDLEN bytes.
 * Input:
 *      Word = u_char buffer for the result; Word[0..MAXWORDLEN] may be
 *             written, so it needs MAXWORDLEN + 1 bytes
 *      s_in = current position in the source buffer (lvalue, advanced)
 *      end  = end of the source buffer, handed on to parse_utf8_char()
 * Output:
 *      Word = the copied bytes in Word[1..] with their count in Word[0]
 *             (0 when the text at s_in does not start a word)
 *      s_in = first byte that was not consumed
 * Note:
 *      Every argument is expanded more than once; pass plain variables
 *      without side effects.
 * ========================================================================= */
#define PARSE_WORD(Word, s_in, end) \
  do { \
    register u_char *pw_out = (Word) + 1; \
    register int pw_total = 0; \
    register int pw_digits = 0; \
    register int pw_nbytes; \
    unsigned short pw_ch; \
    register int pw_digit_limit = IntEnv (GetEnv ("maxnumeric"), 4); \
    \
    for (pw_nbytes = parse_utf8_char ((s_in), (end), &pw_ch); \
         pw_nbytes > 0 && pw_total + pw_nbytes <= MAXWORDLEN; \
         pw_nbytes = parse_utf8_char ((s_in), (end), &pw_ch)) { \
      if (!is_unicode_letter (pw_ch)) { \
        if (!is_unicode_digit (pw_ch) || ++pw_digits > pw_digit_limit) { \
          break; \
        } \
      } \
      pw_total += pw_nbytes; \
      for (; pw_nbytes > 0; --pw_nbytes) { \
        *pw_out++ = *(s_in)++; \
      } \
    } \
    *(Word) = pw_total; \
  } while (0)
|
---|
| 124 |
|
---|
| 125 | /*
|
---|
| 126 | #define PARSE_WORD(Word, s_in, end) \
|
---|
| 127 | do { \
|
---|
| 128 | register u_char *wptr = (Word)+1; \
|
---|
| 129 | register int length = 0; \
|
---|
| 130 | register int c = *(s_in); \
|
---|
| 131 | register int numeric = 0; \
|
---|
| 132 | \
|
---|
| 133 | while( length < MAXWORDLEN && INAWORD(c) && (s_in)<=(end)) \
|
---|
| 134 | { \
|
---|
| 135 | if ((numeric += INNUMBER(c)) > MAXNUMERIC) \
|
---|
| 136 | break; \
|
---|
| 137 | *wptr++ = c; \
|
---|
[13660] | 138 | ++length; \
|
---|
[3745] | 139 | c = *++(s_in); \
|
---|
| 140 | } \
|
---|
| 141 | *(Word) = length; \
|
---|
| 142 | }while(0)
|
---|
| 143 | */
|
---|
| 144 |
|
---|
| 145 |
|
---|
/* =========================================================================
 * Macro: PARSE_NON_WORD
 * Description:
 *      Copy the next non-word (the text between two words) into Word
 *      for storing compressed text.  Characters are decoded with
 *      parse_utf8_char() and copied while is_unicode_letdig() rejects
 *      them and the complete character still fits within MAXWORDLEN
 *      bytes.
 * Input:
 *      Word = u_char buffer for the result; Word[0..MAXWORDLEN] may be
 *             written, so it needs MAXWORDLEN + 1 bytes
 *      s_in = current position in the source buffer (lvalue, advanced)
 *      end  = end of the source buffer, handed on to parse_utf8_char()
 * Output:
 *      Word = the copied bytes in Word[1..] with their count in Word[0]
 *      s_in = first byte that was not consumed
 * Note:
 *      Every argument is expanded more than once; pass plain variables
 *      without side effects.
 * ========================================================================= */
#define PARSE_NON_WORD(Word, s_in, end) \
  do { \
    register u_char *pnw_out = (Word) + 1; \
    register int pnw_total = 0; \
    register int pnw_nbytes; \
    unsigned short pnw_ch; \
    \
    for (pnw_nbytes = parse_utf8_char ((s_in), (end), &pnw_ch); \
         pnw_nbytes > 0 && pnw_total + pnw_nbytes <= MAXWORDLEN \
           && !is_unicode_letdig (pnw_ch); \
         pnw_nbytes = parse_utf8_char ((s_in), (end), &pnw_ch)) { \
      pnw_total += pnw_nbytes; \
      for (; pnw_nbytes > 0; --pnw_nbytes) { \
        *pnw_out++ = *(s_in)++; \
      } \
    } \
    *(Word) = pnw_total; \
  } while (0)
|
---|
| 171 |
|
---|
| 172 | /*
|
---|
| 173 | #define PARSE_NON_WORD(Word, s_in, end) \
|
---|
| 174 | do { \
|
---|
| 175 | register u_char *wptr = (Word)+1; \
|
---|
| 176 | register int length = 0; \
|
---|
| 177 | register int c = *(s_in); \
|
---|
| 178 | \
|
---|
| 179 | while( length < MAXWORDLEN && !INAWORD(c) && (s_in)<=(end) ) \
|
---|
| 180 | { \
|
---|
| 181 | *wptr++ = c; \
|
---|
[13660] | 182 | ++length; \
|
---|
[3745] | 183 | c = *++(s_in); \
|
---|
| 184 | } \
|
---|
| 185 | *(Word) = length; \
|
---|
| 186 | }while(0)
|
---|
| 187 | */
|
---|
| 188 |
|
---|
| 189 |
|
---|
/* =========================================================================
 * Macro: PARSE_STEM_WORD
 * Description:
 *      Extract the next word to index on.  Works like PARSE_WORD but
 *      allows up to MAXSTEMLEN bytes: characters decoded by
 *      parse_utf8_char() are copied while they are letters, or digits
 *      that keep the running digit count within the limit given by
 *      IntEnv (GetEnv ("maxnumeric"), 4).
 * Input:
 *      Word = u_char buffer for the result; Word[0..MAXSTEMLEN] may be
 *             written, so it needs MAXSTEMLEN + 1 bytes
 *      s_in = points to the first byte in the buffer to test (lvalue)
 *      end  = points to the last byte in the buffer; handed on to
 *             parse_utf8_char()
 * Output:
 *      Word = the copied bytes in Word[1..] with their count in Word[0]
 *      s_in = moved past the bytes that were copied
 * Note:
 *      Every argument is expanded more than once; pass plain variables
 *      without side effects.
 * ========================================================================= */
#define PARSE_STEM_WORD(Word, s_in, end) \
  do { \
    register u_char *psw_out = (Word) + 1; \
    register int psw_total = 0; \
    register int psw_digits = 0; \
    register int psw_nbytes; \
    unsigned short psw_ch; \
    register int psw_digit_limit = IntEnv (GetEnv ("maxnumeric"), 4); \
    \
    for (psw_nbytes = parse_utf8_char ((s_in), (end), &psw_ch); \
         psw_nbytes > 0 && psw_total + psw_nbytes <= MAXSTEMLEN; \
         psw_nbytes = parse_utf8_char ((s_in), (end), &psw_ch)) { \
      if (!is_unicode_letter (psw_ch)) { \
        if (!is_unicode_digit (psw_ch) \
            || ++psw_digits > psw_digit_limit) { \
          break; \
        } \
      } \
      psw_total += psw_nbytes; \
      for (; psw_nbytes > 0; --psw_nbytes) { \
        *psw_out++ = *(s_in)++; \
      } \
    } \
    *(Word) = psw_total; \
  } while (0)
|
---|
| 222 | /*
|
---|
| 223 | #define PARSE_STEM_WORD(Word, s_in, end) \
|
---|
| 224 | do \
|
---|
| 225 | { \
|
---|
| 226 | register u_char *wptr = (Word)+1; \
|
---|
| 227 | register int length = 0; \
|
---|
| 228 | register int c = *(s_in); \
|
---|
| 229 | register int numeric = 0; \
|
---|
| 230 | \
|
---|
| 231 | while ( length < MAXSTEMLEN && INAWORD(c) && (s_in)<=(end)) \
|
---|
| 232 | { \
|
---|
| 233 | if ((numeric += INNUMBER(c)) > MAXNUMERIC) \
|
---|
| 234 | break; \
|
---|
| 235 | *wptr++ = c; \
|
---|
[13660] | 236 | ++length; \
|
---|
[3745] | 237 | c = *++(s_in); \
|
---|
| 238 | } \
|
---|
| 239 | *(Word) = length; \
|
---|
| 240 | }while(0)
|
---|
| 241 | */
|
---|
| 242 |
|
---|
| 243 |
|
---|
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD
 * Description:
 *      Skip over a non-word without storing it; the index has no use
 *      for non-words, only the text does.  s_in is advanced one decoded
 *      character at a time until parse_utf8_char() returns a length
 *      that is not positive or is_unicode_letdig() accepts the
 *      character.
 * Input:
 *      s_in = current position in the source buffer (lvalue, advanced)
 *      end  = end of the source buffer, handed on to parse_utf8_char()
 * Output:
 *      s_in = position where the skipping stopped
 * ========================================================================= */
#define PARSE_NON_STEM_WORD(s_in, end) \
  do { \
    register int pnsw_nbytes; \
    unsigned short pnsw_ch; \
    \
    for (;;) { \
      pnsw_nbytes = parse_utf8_char ((s_in), (end), &pnsw_ch); \
      if (pnsw_nbytes <= 0 || is_unicode_letdig (pnsw_ch)) { \
        break; \
      } \
      (s_in) += pnsw_nbytes; \
    } \
  } while (0)
|
---|
| 265 |
|
---|
| 266 | /*
|
---|
| 267 | #define PARSE_NON_STEM_WORD(s_in, end) \
|
---|
| 268 | do \
|
---|
| 269 | { \
|
---|
| 270 | while (!INAWORD(*(s_in)) && (s_in)<=(end)) \
|
---|
| 271 | (s_in)++; \
|
---|
| 272 | }while(0)
|
---|
| 273 | */
|
---|
| 274 |
|
---|
| 275 |
|
---|
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD_OR_SGML_TAG
 * Description:
 *      Like PARSE_NON_STEM_WORD, but a '<' also swallows everything up
 *      to and including the next '>' (an SGML tag), whatever characters
 *      the tag contains.  If no '>' is found, skipping stops when
 *      parse_utf8_char() stops returning a positive length.
 * Input:
 *      s_in = current position in the source buffer (lvalue, advanced)
 *      end  = end of the source buffer, handed on to parse_utf8_char()
 * Output:
 *      s_in = position where the skipping stopped
 * ========================================================================= */
#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end) \
  do { \
    register int psgml_nbytes; \
    unsigned short psgml_ch; \
    \
    for (psgml_nbytes = parse_utf8_char ((s_in), (end), &psgml_ch); \
         psgml_nbytes > 0 && !is_unicode_letdig (psgml_ch); \
         psgml_nbytes = parse_utf8_char ((s_in), (end), &psgml_ch)) { \
      if (psgml_ch == '<') { \
        do { \
          (s_in) += psgml_nbytes; \
          psgml_nbytes = parse_utf8_char ((s_in), (end), &psgml_ch); \
        } while (psgml_nbytes > 0 && psgml_ch != '>'); \
      } \
      (s_in) += psgml_nbytes; \
    } \
  } while (0)
|
---|
| 301 |
|
---|
| 302 | /*
|
---|
| 303 | #define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end) \
|
---|
| 304 | do \
|
---|
| 305 | { \
|
---|
| 306 | register int c = *(s_in); \
|
---|
| 307 | \
|
---|
| 308 | while (!INAWORD(c) && (s_in)<=(end)) \
|
---|
| 309 | { \
|
---|
| 310 | if (c == '<') \
|
---|
| 311 | { \
|
---|
| 312 | while (c != '>' && (s_in)<=(end)) \
|
---|
| 313 | c = *++(s_in); \
|
---|
| 314 | } \
|
---|
| 315 | if ((s_in)<=(end)) \
|
---|
| 316 | c = *++(s_in); \
|
---|
| 317 | } \
|
---|
| 318 | }while(0)
|
---|
| 319 | */
|
---|
| 320 |
|
---|
| 321 |
|
---|
/* =========================================================================
 * Macro: PARSE_OPT_TERM_PARAM [RPAP - Jan 97: Stem Index Change]
 * Description:
 *      Extracts the optional parameter of a query term: a WEIGHTPARAM
 *      or STEMPARAM marker followed by decimal digits.
 *      Needed only in parsing the query line !
 *
 *      Note: this macro has not been converted to use UTF-8 as it
 *            should still work as it is (only uses ascii characters)
 *
 *      No byte is read unless it lies within s_in..end, so a marker
 *      sitting beyond end is ignored and nothing past end is examined.
 * Input:
 *      Param = u_char buffer for the digits; needs MAXPARAMLEN + 1 bytes
 *      type  = lvalue that receives the marker character
 *      s_in  = current position in the query buffer (lvalue, advanced)
 *      end   = last byte of the query buffer (inclusive)
 * Output:
 *      If a marker is present at s_in: type holds the marker, Param
 *      holds at most MAXPARAMLEN digits followed by '\0' (further
 *      digits are skipped, not stored) and s_in points at the first
 *      byte after the digits.  Otherwise Param, type and s_in are
 *      left untouched.
 * Note:
 *      s_in and end are expanded more than once; pass plain variables
 *      without side effects.
 * ========================================================================= */
#define PARSE_OPT_TERM_PARAM(Param, type, s_in, end) \
  do { \
    register u_char *potp_dst = (Param); \
    register int potp_len = 0; \
    \
    if ((s_in) <= (end) \
        && (*(s_in) == WEIGHTPARAM || *(s_in) == STEMPARAM)) { \
      (type) = *(s_in); \
      ++(s_in); \
      while (potp_len < MAXPARAMLEN && (s_in) <= (end) \
             && isdigit ((unsigned char) *(s_in))) { \
        *potp_dst++ = *(s_in)++; \
        ++potp_len; \
      } \
      *potp_dst = '\0'; \
      while ((s_in) <= (end) && isdigit ((unsigned char) *(s_in))) { \
        ++(s_in); \
      } \
    } \
  } while (0)
|
---|
| 357 |
|
---|
/* =========================================================================
 * Macro: PARSE_RANKED_NON_STEM_WORD [RJM 07/97: Ranked Required Terms]
 * Description:
 *      Skip over a non-word without storing it (it is not needed in
 *      the index, only in the text) and note whether it contained a
 *      MUSTMATCHPARAM character.
 * Input:
 *      s_in = current position in the source buffer (lvalue, advanced)
 *      end  = end of the source buffer, handed on to parse_utf8_char()
 * Output:
 *      require_match = requirement mode for the next term:
 *                      -1=must not match, 0=optional match, 1=must match.
 *                      This macro itself only ever stores 0 or 1: it
 *                      resets the value to 0 and sets it to 1 when a
 *                      MUSTMATCHPARAM is skipped.
 *      s_in          = position where the skipping stopped
 * ========================================================================= */
#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end) \
  do { \
    register int prnsw_nbytes; \
    unsigned short prnsw_ch; \
    \
    (require_match) = 0; \
    for (;;) { \
      prnsw_nbytes = parse_utf8_char ((s_in), (end), &prnsw_ch); \
      if (prnsw_nbytes <= 0 || is_unicode_letdig (prnsw_ch)) { \
        break; \
      } \
      if (prnsw_ch == MUSTMATCHPARAM) { \
        (require_match) = 1; \
      } \
      (s_in) += prnsw_nbytes; \
    } \
  } while (0)
|
---|
| 383 |
|
---|
| 384 | /*
|
---|
| 385 | #define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end) \
|
---|
| 386 | do { \
|
---|
| 387 | (require_match) = 0; \
|
---|
| 388 | while (!INAWORD(*(s_in)) && (s_in)<=(end)) { \
|
---|
| 389 | if (*(s_in) == MUSTMATCHPARAM) { \
|
---|
| 390 | (require_match) = 1; \
|
---|
| 391 | } \
|
---|
| 392 | (s_in)++; \
|
---|
| 393 | } \
|
---|
| 394 | } while (0)
|
---|
| 395 | */
|
---|