source: trunk/gsdl/src/mgpp/text/words.h@ 879

Last change on this file since 879 was 856, checked in by sjboddie, 24 years ago

Rodgers new C++ mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.5 KB
Line 
1/**************************************************************************
2 *
3 * words.h -- Macros for parsing out words from the source text
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: words.h 856 2000-01-14 02:26:25Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "unitool.h"
27#include "UCArray.h"
28
29
30/*
31 * This has been cleaned up by Tim Shimmin.
32 */
33
34/*
35 * ---NOTE---
36 *
37 * "WORD" refers to a word in the compressed text.
38 * "STEM" or "STEM_WORD" refers to a word for indexing on
39 *
40 */
41
#define MAXWORDLEN 15
  /* Maximum length in bytes of any word or non-word.  Note that
     variations to MAXWORDLEN may have dramatic effects on the rest
     of the program, as the length and the prefix match are packed
     together into a four bit nibble, i.e., leave MAXWORDLEN alone...
     The guard below rejects values that cannot be represented in
     four bits (0..15). */
#if MAXWORDLEN > 15
#error "MAXWORDLEN must not exceed 15 (lengths are packed into a four bit nibble)"
#endif
48
#define MAXSTEMLEN 255
  /* Maximum length in bytes of any stem.  Note that variations to
     MAXSTEMLEN may have dramatic effects on the rest of the program,
     i.e., leave MAXSTEMLEN alone...
     NOTE(review): 255 is the largest value a single length byte can
     hold; presumably that is why this limit was chosen -- confirm. */
53
#define MAXNUMERIC 4
  /* Maximum number of numeric characters permitted in a word.
     This avoids long sequences of numbers creating just one
     word occurrence for each number.  At most 10,000 all numeric
     words will be permitted.
     PARSE_WORD counts every digit it meets in the word (not only
     consecutive ones) and ends the word before the digit that would
     exceed this limit. */
59
/* [RPAP - Jan 97: Stem Index Change] */
#define MAXPARAMLEN 20
  /* Maximum number of bytes to read for a parameter value for a
     term in a query. */
#define WEIGHTPARAM '/'
  /* NOTE(review): presumably the character that introduces a weight
     parameter on a query term -- confirm against the query parser. */
#define STEMPARAM '#'
  /* NOTE(review): presumably the character that introduces a stemming
     parameter on a query term -- confirm against the query parser. */
66
/* [RJM 07/97: Ranked Required Terms] */
#define MUSTMATCHPARAM '+'
  /* NOTE(review): presumably the character that marks a query term as
     required ("must match") in a ranked query -- confirm against the
     query parser. */
69
70
#define PESINAWORD(c) (isalnum(c) || ((c) >= 0x80 && (c) <= 0xff))
  /* The definition of what characters are permitted in a word.
     This macro is pessimistic: you cannot tell from a particular
     byte in the range 0x80..0xff whether it belongs to a word
     character or not, so every such byte is accepted.  The macro
     is needed by various functions relating to huffman coding
     where frequency counts need to be primed; it should not be
     used in parsing the UTF-8 encoded input.

     The argument is evaluated more than once, so it must have no
     side effects.  Pass a byte value in 0..255 (e.g. an unsigned
     char or an int holding one): a negative plain char fails the
     0x80..0xff range test and is not a valid argument to isalnum(). */
78
79#ifdef __cplusplus
80extern "C"
81#endif
82int inaword (const u_char *here, const u_char *end);
83 /* Takes the place of the old INAWORD macro. It determines
84 whether a given place in a UTF-8 encoded Unicode string
85 is part of a word. */
86
87
88const unsigned char *ParseIndexWord (const unsigned char *textHere,
89 const unsigned char *textEnd,
90 UCArray &word);
91const unsigned char *ParseIndexMGWord (const unsigned char *textHere,
92 const unsigned char *textEnd,
93 unsigned char *mgWord);
94const unsigned char *ParseNonindexWord (const unsigned char *textHere,
95 const unsigned char *textEnd);
96
97
98
/* =========================================================================
 * Macro: PARSE_WORD
 * Description:
 *      Extract a word out for compressing text.  Whole UTF-8 characters
 *      are copied from s_in while they are Unicode letters or digits.
 *      Copying stops before the character that would make the word longer
 *      than MAXWORDLEN bytes, before the digit that would be number
 *      MAXNUMERIC+1 in the word, at the first character that is neither a
 *      letter nor a digit, or when parse_utf8_char() returns a length
 *      that is not positive.
 * Input:
 *      s_in = string start in buffer; must be a modifiable pointer lvalue
 *      end  = string end in buffer; handed unchanged to parse_utf8_char()
 * Output:
 *      Word = extracted word with length in 1st byte; the buffer must
 *             have room for MAXWORDLEN+1 bytes
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      Word, s_in and end are each evaluated more than once, so the
 *      argument expressions must have no side effects.
 *      The locals carry a pw_ prefix so that they cannot capture
 *      identifiers used in the argument expressions, and they are not
 *      declared `register', which is no longer valid in C++17.
 * ========================================================================= */
#define PARSE_WORD(Word, s_in, end)                                        \
  do {                                                                     \
    u_char *pw_dst = (Word) + 1;                                           \
    int pw_len = 0;                                                        \
    int pw_digits = 0;                                                     \
    unsigned short pw_ch;                                                  \
    int pw_charlen = parse_utf8_char((s_in), (end), &pw_ch);               \
                                                                           \
    while (pw_len + pw_charlen <= MAXWORDLEN && pw_charlen > 0 &&          \
           (is_unicode_letter(pw_ch) ||                                    \
            (is_unicode_digit(pw_ch) && ++pw_digits <= MAXNUMERIC))) {     \
      while (pw_charlen-- > 0) {                                           \
        *pw_dst++ = *(s_in)++;                                             \
        pw_len++;                                                          \
      }                                                                    \
      pw_charlen = parse_utf8_char((s_in), (end), &pw_ch);                 \
    }                                                                      \
    *(Word) = pw_len;                                                      \
  } while (0)
130
131
132
/* =========================================================================
 * Macro: PARSE_NON_WORD
 * Description:
 *      Extract a non-word out for storing compressed text.  Whole UTF-8
 *      characters are copied from s_in while they are NOT Unicode letters
 *      or digits.  Copying stops before the character that would make the
 *      non-word longer than MAXWORDLEN bytes, at the first letter or
 *      digit, or when parse_utf8_char() returns a length that is not
 *      positive.
 * Input:
 *      s_in = string start in buffer; must be a modifiable pointer lvalue
 *      end  = string end in buffer; handed unchanged to parse_utf8_char()
 * Output:
 *      Word = extracted non-word with length in 1st byte; the buffer must
 *             have room for MAXWORDLEN+1 bytes
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      Word, s_in and end are each evaluated more than once, so the
 *      argument expressions must have no side effects.
 *      The locals carry a pnw_ prefix so that they cannot capture
 *      identifiers used in the argument expressions, and they are not
 *      declared `register', which is no longer valid in C++17.
 * ========================================================================= */
#define PARSE_NON_WORD(Word, s_in, end)                                    \
  do {                                                                     \
    u_char *pnw_dst = (Word) + 1;                                          \
    int pnw_len = 0;                                                       \
    unsigned short pnw_ch;                                                 \
    int pnw_charlen = parse_utf8_char((s_in), (end), &pnw_ch);             \
                                                                           \
    while (pnw_len + pnw_charlen <= MAXWORDLEN && pnw_charlen > 0 &&       \
           !is_unicode_letdig(pnw_ch)) {                                   \
      while (pnw_charlen-- > 0) {                                          \
        *pnw_dst++ = *(s_in)++;                                            \
        pnw_len++;                                                         \
      }                                                                    \
      pnw_charlen = parse_utf8_char((s_in), (end), &pnw_ch);               \
    }                                                                      \
    *(Word) = pnw_len;                                                     \
  } while (0)
Note: See TracBrowser for help on using the repository browser.