source: trunk/gsdl3/src/packages/mg/src/text/words.h@ 7431

Last change on this file since 7431 was 7228, checked in by kjdon, 20 years ago

added a new -M option to mg_passes, allowing maxnumeric to be altered - made this change to keep gsdl3 mg inline with gsdl2 mg.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.3 KB
Line 
1/**************************************************************************
2 *
3 * words.h -- Macros for parsing out words from the source text
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: words.h 7228 2004-04-25 23:01:18Z kjdon $
21 *
22 **************************************************************************/
23
24
25#include "sysfuncs.h"
26#include "unitool.h"
27
28/*
29 * This has been cleaned up by Tim Shimmin.
30 */
31
32/*
33 * ---NOTE---
34 *
35 * "WORD" refers to a word in the compressed text.
36 * "STEM" or "STEM_WORD" refers to a word for indexing on
37 *
38 */
39
/*
 * Maximum length in bytes of any word or non-word.
 *
 * Changing MAXWORDLEN may have dramatic effects on the rest of the
 * program: the length and the prefix match are packed together into a
 * four bit nibble, and there is no check that this is possible.
 * In short, leave MAXWORDLEN alone.
 */
#define MAXWORDLEN 15

/*
 * Maximum length in bytes of any stem.
 *
 * Changing MAXSTEMLEN may have dramatic effects on the rest of the
 * program, i.e. leave MAXSTEMLEN alone.
 */
#define MAXSTEMLEN 255

/*#define MAXNUMERIC 4*/

/*
 * Maximum number of numeric characters permitted in a word.
 * This avoids long sequences of numbers creating just one word
 * occurrence for each number. With a limit of 4, at most 10,000
 * all-numeric words will be permitted.
 *
 * The compile-time constant above is disabled: PARSE_WORD and
 * PARSE_STEM_WORD obtain the limit at run time with
 * IntEnv (GetEnv ("maxnumeric"), 4).
 */

/* [RPAP - Jan 97: Stem Index Change] */

/*
 * Maximum number of bytes to read for a parameter value for a
 * term in a query.
 */
#define MAXPARAMLEN 20

/*
 * Characters that introduce an optional term parameter in a query;
 * both are recognised by PARSE_OPT_TERM_PARAM.
 */
#define WEIGHTPARAM '/'
#define STEMPARAM '#'

/* [RJM 07/97: Ranked Required Terms] */

/*
 * Character that makes PARSE_RANKED_NON_STEM_WORD flag the following
 * term as required (require_match = 1).
 */
#define MUSTMATCHPARAM '+'
68
69
/*
 * PESINAWORD(c) -- pessimistic definition of which byte values are
 * permitted in a word.
 *
 * Yields non-zero when c is an alphanumeric character in the range
 * 0x00..0x7f, or any value in the range 0x80..0xff; yields zero for
 * every other value (including negative values and values above 0xff).
 *
 * This macro is pessimistic: you cannot tell from a particular byte
 * above 0x80 whether it is a character or not. It is needed by various
 * functions relating to huffman coding where frequency counts need to be
 * primed; it should not be used in parsing the UTF-8 encoded input.
 *
 * isalnum() is only called for values in 0x00..0x7f, because passing it
 * a value that is neither EOF nor representable as unsigned char is
 * undefined behaviour.
 *
 * Note: c is evaluated more than once, so it must not have side effects.
 */
#define PESINAWORD(c)                                                       \
  ((((c) >= 0 && (c) <= 0x7f) && isalnum (c)) ||                            \
   ((c) >= 0x80 && (c) <= 0xff))
77
78int inaword (const u_char *here, const u_char *end);
79 /* Takes the place of the old INAWORD macro. It determines
80 whether a given place in a UTF-8 encoded Unicode string
81 is part of a word. */
82
83
/* =========================================================================
 * Macro: PARSE_WORD
 * Description:
 *      Extract a word out of the source text for compressing text.
 *      Characters are consumed while they are Unicode letters or digits,
 *      the word still fits in MAXWORDLEN bytes, and the number of digits
 *      taken does not exceed the limit obtained from
 *      IntEnv (GetEnv ("maxnumeric"), 4).
 *      The value returned by parse_utf8_char is used as the byte length
 *      of the next character; parsing stops when it is not positive.
 * Input:
 *      s_in = string start in buffer
 *      end  = string end in buffer
 * Output:
 *      Word = extracted word with length in 1st byte
 *      s_in = ptr to next character in buffer yet to be processed
 * Note:
 *      Word, s_in and end are evaluated more than once.
 * ========================================================================= */
#define PARSE_WORD(Word, s_in, end)                                         \
  do {                                                                      \
    u_char *pw_dst = (Word) + 1;                                            \
    int pw_used = 0;                                                        \
    int pw_digits = 0;                                                      \
    unsigned short pw_ch;                                                   \
    const int pw_digit_limit = IntEnv (GetEnv ("maxnumeric"), 4);           \
    int pw_nbytes = parse_utf8_char ((s_in), (end), &pw_ch);                \
                                                                            \
    for (;;) {                                                              \
      /* stop at end of input or when the word would grow too long */       \
      if (pw_nbytes <= 0 || pw_used + pw_nbytes > MAXWORDLEN)               \
        break;                                                              \
      /* stop at a non-word character or once too many digits are seen */   \
      if (!is_unicode_letter (pw_ch) &&                                     \
          !(is_unicode_digit (pw_ch) && ++pw_digits <= pw_digit_limit))     \
        break;                                                              \
      /* copy every byte of the accepted character */                       \
      for (; pw_nbytes > 0; pw_nbytes--) {                                  \
        *pw_dst++ = *(s_in)++;                                              \
        pw_used++;                                                          \
      }                                                                     \
      pw_nbytes = parse_utf8_char ((s_in), (end), &pw_ch);                  \
    }                                                                       \
    *(Word) = pw_used;                                                      \
  } while (0)
116
117 /*
118#define PARSE_WORD(Word, s_in, end) \
119 do { \
120 register u_char *wptr = (Word)+1; \
121 register int length = 0; \
122 register int c = *(s_in); \
123 register int numeric = 0; \
124 \
125 while( length < MAXWORDLEN && INAWORD(c) && (s_in)<=(end)) \
126 { \
127 if ((numeric += INNUMBER(c)) > MAXNUMERIC) \
128 break; \
129 *wptr++ = c; \
130 length++; \
131 c = *++(s_in); \
132 } \
133 *(Word) = length; \
134 }while(0)
135 */
136
137
/* =========================================================================
 * Macro: PARSE_NON_WORD
 * Description:
 *      Extract a non-word out of the source text for storing compressed
 *      text. Characters are consumed while is_unicode_letdig rejects
 *      them and the non-word still fits in MAXWORDLEN bytes.
 *      The value returned by parse_utf8_char is used as the byte length
 *      of the next character; parsing stops when it is not positive.
 * Input:
 *      s_in = string start in buffer
 *      end  = string end in buffer
 * Output:
 *      Word = extracted non-word with length in 1st byte
 *      s_in = ptr to next character in buffer yet to be processed
 * Note:
 *      Word, s_in and end are evaluated more than once.
 * ========================================================================= */
#define PARSE_NON_WORD(Word, s_in, end)                                     \
  do {                                                                      \
    u_char *pnw_dst = (Word) + 1;                                           \
    int pnw_used = 0;                                                       \
    unsigned short pnw_ch;                                                  \
    int pnw_nbytes = parse_utf8_char ((s_in), (end), &pnw_ch);              \
                                                                            \
    for (;;) {                                                              \
      /* stop at end of input or when the non-word would grow too long */   \
      if (pnw_nbytes <= 0 || pnw_used + pnw_nbytes > MAXWORDLEN)            \
        break;                                                              \
      /* stop as soon as a word character turns up */                       \
      if (is_unicode_letdig (pnw_ch))                                       \
        break;                                                              \
      /* copy every byte of the accepted character */                       \
      for (; pnw_nbytes > 0; pnw_nbytes--) {                                \
        *pnw_dst++ = *(s_in)++;                                             \
        pnw_used++;                                                         \
      }                                                                     \
      pnw_nbytes = parse_utf8_char ((s_in), (end), &pnw_ch);                \
    }                                                                       \
    *(Word) = pnw_used;                                                     \
  } while (0)
163
164 /*
165#define PARSE_NON_WORD(Word, s_in, end) \
166 do { \
167 register u_char *wptr = (Word)+1; \
168 register int length = 0; \
169 register int c = *(s_in); \
170 \
171 while( length < MAXWORDLEN && !INAWORD(c) && (s_in)<=(end) ) \
172 { \
173 *wptr++ = c; \
174 length++; \
175 c = *++(s_in); \
176 } \
177 *(Word) = length; \
178 }while(0)
179 */
180
181
/* =========================================================================
 * Macro: PARSE_STEM_WORD
 * Description:
 *      Extracts out Word, a word for indexing on.
 *      Characters are consumed while they are Unicode letters or digits,
 *      the word still fits in MAXSTEMLEN bytes, and the number of digits
 *      taken does not exceed the limit obtained from
 *      IntEnv (GetEnv ("maxnumeric"), 4).
 *      The value returned by parse_utf8_char is used as the byte length
 *      of the next character; parsing stops when it is not positive.
 * Input:
 *      s_in points to 1st letter in buffer to test
 *      end points to last letter in buffer
 * Output:
 *      s_in is modified to move to next word
 *      Word is filled in with length in 1st byte.
 * Note:
 *      Word, s_in and end are evaluated more than once.
 * ========================================================================= */
#define PARSE_STEM_WORD(Word, s_in, end)                                    \
  do {                                                                      \
    u_char *psw_dst = (Word) + 1;                                           \
    int psw_used = 0;                                                       \
    int psw_digits = 0;                                                     \
    unsigned short psw_ch;                                                  \
    const int psw_digit_limit = IntEnv (GetEnv ("maxnumeric"), 4);          \
    int psw_nbytes = parse_utf8_char ((s_in), (end), &psw_ch);              \
                                                                            \
    for (;;) {                                                              \
      /* stop at end of input or when the stem would grow too long */       \
      if (psw_nbytes <= 0 || psw_used + psw_nbytes > MAXSTEMLEN)            \
        break;                                                              \
      /* stop at a non-word character or once too many digits are seen */   \
      if (!is_unicode_letter (psw_ch) &&                                    \
          !(is_unicode_digit (psw_ch) && ++psw_digits <= psw_digit_limit))  \
        break;                                                              \
      /* copy every byte of the accepted character */                       \
      for (; psw_nbytes > 0; psw_nbytes--) {                                \
        *psw_dst++ = *(s_in)++;                                             \
        psw_used++;                                                         \
      }                                                                     \
      psw_nbytes = parse_utf8_char ((s_in), (end), &psw_ch);                \
    }                                                                       \
    *(Word) = psw_used;                                                     \
  } while (0)
214 /*
215#define PARSE_STEM_WORD(Word, s_in, end) \
216 do \
217 { \
218 register u_char *wptr = (Word)+1; \
219 register int length = 0; \
220 register int c = *(s_in); \
221 register int numeric = 0; \
222 \
223 while ( length < MAXSTEMLEN && INAWORD(c) && (s_in)<=(end)) \
224 { \
225 if ((numeric += INNUMBER(c)) > MAXNUMERIC) \
226 break; \
227 *wptr++ = c; \
228 length++; \
229 c = *++(s_in); \
230 } \
231 *(Word) = length; \
232 }while(0)
233 */
234
235
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD
 * Description:
 *      Eat up a non-word without storing it: the non-word is
 *      not needed in the index, only in the text.
 *      s_in is advanced past every character that is_unicode_letdig
 *      rejects. The value returned by parse_utf8_char is used as the
 *      byte length of the next character; skipping stops when it is
 *      not positive.
 * Input:
 *      s_in points to 1st character in buffer to test
 *      end points to last character in buffer
 * Output:
 *      s_in is modified to move to the next word
 * Note:
 *      s_in and end are evaluated more than once.
 * ========================================================================= */
#define PARSE_NON_STEM_WORD(s_in, end)                                      \
  do {                                                                      \
    unsigned short pnsw_ch;                                                 \
    int pnsw_nbytes;                                                        \
                                                                            \
    for (pnsw_nbytes = parse_utf8_char ((s_in), (end), &pnsw_ch);           \
         pnsw_nbytes > 0 && !is_unicode_letdig (pnsw_ch);                   \
         pnsw_nbytes = parse_utf8_char ((s_in), (end), &pnsw_ch))           \
      (s_in) += pnsw_nbytes;                                                \
  } while (0)
257
258 /*
259#define PARSE_NON_STEM_WORD(s_in, end) \
260 do \
261 { \
262 while (!INAWORD(*(s_in)) && (s_in)<=(end)) \
263 (s_in)++; \
264 }while(0)
265 */
266
267
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD_OR_SGML_TAG
 * Description:
 *      Like PARSE_NON_STEM_WORD but also eats up SGML tags: when a '<'
 *      is met, everything up to and including the next '>' (or up to
 *      the point where parse_utf8_char stops returning a positive
 *      length) is skipped as well.
 * Input:
 *      s_in points to 1st character in buffer to test
 *      end points to last character in buffer
 * Output:
 *      s_in is modified to move to the next word
 * Note:
 *      s_in and end are evaluated more than once.
 * ========================================================================= */
#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end)                          \
  do {                                                                      \
    unsigned short ptag_ch;                                                 \
    int ptag_nbytes = parse_utf8_char ((s_in), (end), &ptag_ch);            \
                                                                            \
    while (ptag_nbytes > 0 && !is_unicode_letdig (ptag_ch)) {               \
      if (ptag_ch == '<') {                                                 \
        /* inside a tag: step forward until the closing '>' is current */   \
        do {                                                                \
          (s_in) += ptag_nbytes;                                            \
          ptag_nbytes = parse_utf8_char ((s_in), (end), &ptag_ch);          \
        } while (ptag_nbytes > 0 && ptag_ch != '>');                        \
      }                                                                     \
      /* step over the current character (the '>' after a tag) */           \
      (s_in) += ptag_nbytes;                                                \
      ptag_nbytes = parse_utf8_char ((s_in), (end), &ptag_ch);              \
    }                                                                       \
  } while (0)
293
294 /*
295#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end) \
296 do \
297 { \
298 register int c = *(s_in); \
299 \
300 while (!INAWORD(c) && (s_in)<=(end)) \
301 { \
302 if (c == '<') \
303 { \
304 while (c != '>' && (s_in)<=(end)) \
305 c = *++(s_in); \
306 } \
307 if ((s_in)<=(end)) \
308 c = *++(s_in); \
309 } \
310 }while(0)
311 */
312
313
/* =========================================================================
 * Macro: PARSE_OPT_TERM_PARAM             [RPAP - Jan 97: Stem Index Change]
 * Description:
 *      Extracts out the optional parameter for a query term.
 *      Needed only in parsing the query line !
 *
 *      If the byte at s_in is WEIGHTPARAM or STEMPARAM, that byte is
 *      stored in type and the run of decimal digits following it is
 *      consumed. At most MAXPARAMLEN digits are copied into Param; any
 *      further digits are skipped. If the byte at s_in is neither
 *      character (or s_in is already past end), nothing is modified.
 *
 *      Note: this macro has not been converted to use UTF-8, as it
 *            should still work as it is (only uses ascii characters).
 *
 * Input:
 *      Param = buffer of at least MAXPARAMLEN + 1 bytes
 *      type  = lvalue receiving the parameter character
 *      s_in  = current position in the query buffer
 *      end   = last valid byte of the query buffer
 * Output:
 *      Param = the digits as a '\0'-terminated string
 *      type  = WEIGHTPARAM or STEMPARAM
 *      s_in  = ptr to the first byte after the parameter
 *      (all three only when a parameter was found)
 * Note:
 *      s_in and end are evaluated more than once. The buffer bound is
 *      always tested before a byte is read, so nothing beyond end is
 *      dereferenced.
 * ========================================================================= */
#define PARSE_OPT_TERM_PARAM(Param, type, s_in, end)                        \
  do {                                                                      \
    register u_char *wptr = (Param);                                        \
    register int length = 0;                                                \
    /* only look at the first byte when it lies inside the buffer */        \
    register int c = ((s_in) <= (end)) ? *(s_in) : '\0';                    \
                                                                            \
    if (c == WEIGHTPARAM || c == STEMPARAM) {                               \
      (type) = c;                                                           \
      ++(s_in);                                                             \
      /* bound check comes first so *(s_in) is never read past end */       \
      while ((s_in) <= (end) && isdigit ((u_char) *(s_in))) {               \
        /* keep the first MAXPARAMLEN digits, silently drop the rest */     \
        if (length < MAXPARAMLEN) {                                         \
          *wptr++ = *(s_in);                                                \
          length++;                                                         \
        }                                                                   \
        ++(s_in);                                                           \
      }                                                                     \
      *wptr = '\0';                                                         \
    }                                                                       \
  } while (0)
349
/* =========================================================================
 * Macro: PARSE_RANKED_NON_STEM_WORD     [RJM 07/97: Ranked Required Terms]
 * Description:
 *      Eat up a non-word without storing it: the non-word is
 *      not needed in the index, only in the text.
 *      If a match requirement (MUSTMATCHPARAM) is come across while
 *      skipping, it is recorded in require_match.
 *      The value returned by parse_utf8_char is used as the byte length
 *      of the next character; skipping stops when it is not positive.
 * Input:
 *      s_in points to 1st character in buffer to test
 *      end points to last character in buffer
 * Output:
 *      s_in is modified to move to the next word
 *      require_match = the requirement mode for the next term:
 *                      0 = optional match, 1 = must match.
 *                      (-1 = must not match is part of the convention,
 *                      but this macro itself only ever stores 0 or 1.)
 * Note:
 *      require_match, s_in and end are evaluated more than once.
 * ========================================================================= */
#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end)                \
  do {                                                                      \
    unsigned short prw_ch;                                                  \
    int prw_nbytes;                                                         \
                                                                            \
    (require_match) = 0;                                                    \
    for (prw_nbytes = parse_utf8_char ((s_in), (end), &prw_ch);             \
         prw_nbytes > 0 && !is_unicode_letdig (prw_ch);                     \
         prw_nbytes = parse_utf8_char ((s_in), (end), &prw_ch)) {           \
      if (prw_ch == MUSTMATCHPARAM)                                         \
        (require_match) = 1;                                                \
      (s_in) += prw_nbytes;                                                 \
    }                                                                       \
  } while (0)
375
376 /*
377#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end) \
378 do { \
379 (require_match) = 0; \
380 while (!INAWORD(*(s_in)) && (s_in)<=(end)) { \
381 if (*(s_in) == MUSTMATCHPARAM) { \
382 (require_match) = 1; \
383 } \
384 (s_in)++; \
385 } \
386 } while (0)
387 */
Note: See TracBrowser for help on using the repository browser.