source: trunk/gsdl/packages/mg/src/text/words.h@ 1014

Last change on this file since 1014 was 439, checked in by sjboddie, 25 years ago

renamed mg-1.3d directory mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.2 KB
Line 
/**************************************************************************
 *
 * words.h -- Macros for parsing out words from the source text
 * Copyright (C) 1994  Neil Sharman
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: words.h 439 1999-08-10 21:23:37Z sjboddie $
 *
 **************************************************************************/
23
#include "sysfuncs.h"

#include <ctype.h>   /* isalnum(), isdigit(): used by PESINAWORD and PARSE_OPT_TERM_PARAM */

#include "unitool.h"
27
28
/*
 * This has been cleaned up by Tim Shimmin.
 */
32
/*
 * ---NOTE---
 *
 * "WORD" refers to a word in the compressed text.
 * "STEM" or "STEM_WORD" refers to a word used for indexing.
 *
 */
40
/*
 * Maximum length in bytes of any word or non-word in the compressed text.
 *
 * Leave MAXWORDLEN alone: according to the original author's note, changing
 * it may have dramatic effects on the rest of the program, because the
 * length and the prefix match are packed together using four-bit nibbles
 * and nothing else checks that the value still fits.  The guard below turns
 * an accidental increase into a compile-time error.
 */
#define MAXWORDLEN 15
#if MAXWORDLEN > 15
#error "MAXWORDLEN must not exceed 15: word lengths are packed into a four-bit nibble"
#endif
47
/*
 * Maximum length in bytes of any stem (a word used for indexing).
 *
 * Leave MAXSTEMLEN alone: according to the original author's note, changing
 * it may have dramatic effects on the rest of the program.  PARSE_STEM_WORD
 * stores the stem length in the first byte of its output buffer, so the
 * value must at least fit in one byte; the guard below enforces that much.
 */
#define MAXSTEMLEN 255
#if MAXSTEMLEN > 255
#error "MAXSTEMLEN must not exceed 255: the stem length is stored in a single byte"
#endif
52
/*
 * Maximum number of numeric characters permitted in a single word.
 *
 * PARSE_WORD and PARSE_STEM_WORD stop a word once it would contain more
 * than this many digits, so a long run of digits is cut into short pieces
 * instead of every distinct number becoming its own word.  The original
 * note puts the resulting bound at 10,000 all-numeric words.
 */
#define MAXNUMERIC 4
58
/* [RPAP - Jan 97: Stem Index Change] */

/*
 * Maximum number of bytes to read for a parameter value for a term in a
 * query.  PARSE_OPT_TERM_PARAM stores at most this many digits followed by
 * a terminating '\0', so its output buffer needs MAXPARAMLEN + 1 bytes.
 */
#define MAXPARAMLEN 20

/*
 * Characters that introduce an optional parameter after a query term; they
 * are recognised by PARSE_OPT_TERM_PARAM and reported back as the parameter
 * type.  NOTE(review): judging by the names, '/' introduces a term weight
 * and '#' a stemming method -- confirm against the query parser.
 */
#define WEIGHTPARAM '/'
#define STEMPARAM '#'
65
/* [RJM 07/97: Ranked Required Terms] */

/*
 * Character that PARSE_RANKED_NON_STEM_WORD looks for in the non-word text
 * preceding a term; when it is seen, the next term is flagged as one that
 * must match.
 */
#define MUSTMATCHPARAM '+'
68
69
/*
 * PESINAWORD(c) -- pessimistic test of whether byte value c may occur in a
 * word.
 *
 * Yields non-zero for ASCII letters and digits and for every byte value in
 * the range 0x80..0xff.  The test is pessimistic because a single byte at
 * or above 0x80 does not tell you whether the character it belongs to is a
 * word character.  It is needed by the Huffman coding routines, where
 * frequency counts have to be primed; it should not be used for parsing the
 * UTF-8 encoded input.
 *
 * isalnum() is only called for values 0..0x7f: passing it a value that is
 * neither EOF nor representable as unsigned char is undefined behaviour, and
 * the 0x80..0xff range is accepted unconditionally anyway.
 *
 * The argument is expanded several times, so it must have no side effects.
 */
#define PESINAWORD(c)                                                       \
  (((c) >= 0x80 && (c) <= 0xff) ||                                          \
   ((c) >= 0 && (c) < 0x80 && isalnum(c)))
77
78int inaword (const u_char *here, const u_char *end);
79 /* Takes the place of the old INAWORD macro. It determines
80 whether a given place in a UTF-8 encoded Unicode string
81 is part of a word. */
82
83
/* =========================================================================
 * Macro: PARSE_WORD
 * Description:
 *      Extract a word out for compressing text.
 *      Characters are decoded one at a time with parse_utf8_char() and
 *      copied while they are Unicode letters, or Unicode digits provided
 *      no more than MAXNUMERIC digits end up in the word, and while the
 *      word stays within MAXWORDLEN bytes.  A multi-byte character that
 *      would take the word past MAXWORDLEN is left unconsumed as a whole.
 *      Parsing also stops when parse_utf8_char() returns a length <= 0.
 * Input:
 *      Word = output buffer of at least MAXWORDLEN + 1 bytes
 *      s_in = string start in buffer (must be a modifiable pointer)
 *      end  = string end in buffer, handed unchanged to parse_utf8_char()
 * Output:
 *      Word = extracted word with length in 1st byte; the length may be 0
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      The arguments are expanded more than once; pass plain variables,
 *      not expressions with side effects.
 * ========================================================================= */
#define PARSE_WORD(Word, s_in, end)                                         \
  do {                                                                      \
    int charlength = 0;                                                     \
    u_char *wptr = (Word) + 1;                                              \
    int length = 0;                                                         \
    int numeric = 0;                                                        \
    unsigned short c;                                                       \
                                                                            \
    charlength = parse_utf8_char((s_in), (end), &c);                        \
                                                                            \
    while (length + charlength <= MAXWORDLEN && charlength > 0 &&           \
           (is_unicode_letter(c) ||                                         \
            (is_unicode_digit(c) && ++numeric <= MAXNUMERIC))) {            \
      while (charlength-- > 0) {                                            \
        *wptr++ = *(s_in)++;                                                \
        length++;                                                           \
      }                                                                     \
      charlength = parse_utf8_char((s_in), (end), &c);                      \
    }                                                                       \
    *(Word) = (u_char) length;                                              \
  } while (0)
115
/*
 * Superseded byte-oriented (pre-UTF-8) version of PARSE_WORD, kept for
 * reference only:
 *
#define PARSE_WORD(Word, s_in, end)                                   \
  do {                                                                \
    register u_char *wptr = (Word)+1;                                 \
    register int length = 0;                                          \
    register int c = *(s_in);                                         \
    register int numeric = 0;                                         \
                                                                      \
    while( length < MAXWORDLEN && INAWORD(c) && (s_in)<=(end))        \
      {                                                               \
        if ((numeric += INNUMBER(c)) > MAXNUMERIC)                    \
          break;                                                      \
        *wptr++ = c;                                                  \
        length++;                                                     \
        c = *++(s_in);                                                \
      }                                                               \
    *(Word) = length;                                                 \
  }while(0)
 */
135
136
/* =========================================================================
 * Macro: PARSE_NON_WORD
 * Description:
 *      Extract a non-word out for storing compressed text.
 *      Characters are decoded with parse_utf8_char() and copied while they
 *      are NOT Unicode letters or digits (is_unicode_letdig() is false)
 *      and while the non-word stays within MAXWORDLEN bytes.  A multi-byte
 *      character that would take it past MAXWORDLEN is left unconsumed as
 *      a whole.  Parsing also stops when parse_utf8_char() returns a
 *      length <= 0.
 * Input:
 *      Word = output buffer of at least MAXWORDLEN + 1 bytes
 *      s_in = string start in buffer (must be a modifiable pointer)
 *      end  = string end in buffer, handed unchanged to parse_utf8_char()
 * Output:
 *      Word = extracted non-word with length in 1st byte; may be 0
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      The arguments are expanded more than once; pass plain variables,
 *      not expressions with side effects.
 * ========================================================================= */
#define PARSE_NON_WORD(Word, s_in, end)                                     \
  do {                                                                      \
    int charlength = 0;                                                     \
    u_char *wptr = (Word) + 1;                                              \
    int length = 0;                                                         \
    unsigned short c;                                                       \
                                                                            \
    charlength = parse_utf8_char((s_in), (end), &c);                        \
                                                                            \
    while (length + charlength <= MAXWORDLEN && charlength > 0 &&           \
           !is_unicode_letdig(c)) {                                         \
      while (charlength-- > 0) {                                            \
        *wptr++ = *(s_in)++;                                                \
        length++;                                                           \
      }                                                                     \
      charlength = parse_utf8_char((s_in), (end), &c);                      \
    }                                                                       \
    *(Word) = (u_char) length;                                              \
  } while (0)
162
/*
 * Superseded byte-oriented (pre-UTF-8) version of PARSE_NON_WORD, kept for
 * reference only:
 *
#define PARSE_NON_WORD(Word, s_in, end)                               \
  do {                                                                \
    register u_char *wptr = (Word)+1;                                 \
    register int length = 0;                                          \
    register int c = *(s_in);                                         \
                                                                      \
    while( length < MAXWORDLEN && !INAWORD(c) && (s_in)<=(end) )      \
      {                                                               \
        *wptr++ = c;                                                  \
        length++;                                                     \
        c = *++(s_in);                                                \
      }                                                               \
    *(Word) = length;                                                 \
  }while(0)
 */
179
180
/* =========================================================================
 * Macro: PARSE_STEM_WORD
 * Description:
 *      Extracts out Word (a word to be indexed on).
 *      Works like PARSE_WORD but the limit is MAXSTEMLEN bytes: characters
 *      are decoded with parse_utf8_char() and copied while they are
 *      Unicode letters, or Unicode digits provided no more than MAXNUMERIC
 *      digits end up in the word.  A multi-byte character that would take
 *      the word past MAXSTEMLEN is left unconsumed as a whole.  Parsing
 *      also stops when parse_utf8_char() returns a length <= 0.
 * Input:
 *      Word = output buffer of at least MAXSTEMLEN + 1 bytes
 *      s_in points to 1st letter in buffer to test (modifiable pointer)
 *      end  points to last letter in buffer
 * Output:
 *      s_in is modified to move to the first byte after the word
 *      Word is filled in, with the length in the 1st byte; may be 0
 * Notes:
 *      The arguments are expanded more than once; pass plain variables,
 *      not expressions with side effects.
 * ========================================================================= */
#define PARSE_STEM_WORD(Word, s_in, end)                                    \
  do {                                                                      \
    int charlength = 0;                                                     \
    u_char *wptr = (Word) + 1;                                              \
    int length = 0;                                                         \
    int numeric = 0;                                                        \
    unsigned short c;                                                       \
                                                                            \
    charlength = parse_utf8_char((s_in), (end), &c);                        \
                                                                            \
    while (length + charlength <= MAXSTEMLEN && charlength > 0 &&           \
           (is_unicode_letter(c) ||                                         \
            (is_unicode_digit(c) && ++numeric <= MAXNUMERIC))) {            \
      while (charlength-- > 0) {                                            \
        *wptr++ = *(s_in)++;                                                \
        length++;                                                           \
      }                                                                     \
      charlength = parse_utf8_char((s_in), (end), &c);                      \
    }                                                                       \
    *(Word) = (u_char) length;                                              \
  } while (0)
212
/*
 * Superseded byte-oriented (pre-UTF-8) version of PARSE_STEM_WORD, kept for
 * reference only:
 *
#define PARSE_STEM_WORD(Word, s_in, end)                              \
  do                                                                  \
    {                                                                 \
      register u_char *wptr = (Word)+1;                               \
      register int length = 0;                                        \
      register int c = *(s_in);                                       \
      register int numeric = 0;                                       \
                                                                      \
      while ( length < MAXSTEMLEN && INAWORD(c) && (s_in)<=(end))     \
        {                                                             \
          if ((numeric += INNUMBER(c)) > MAXNUMERIC)                  \
            break;                                                    \
          *wptr++ = c;                                                \
          length++;                                                   \
          c = *++(s_in);                                              \
        }                                                             \
      *(Word) = length;                                               \
    }while(0)
 */
233
234
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD
 * Description:
 *      Eat up non-word.  Do not store non-word.
 *      It is not needed in index, only in text!
 *      s_in is advanced over every character for which
 *      is_unicode_letdig() is false, until a letter or digit is found or
 *      parse_utf8_char() returns a length <= 0.
 * Input:
 *      s_in = modifiable pointer to the first byte to examine
 *      end  = end of the buffer, handed unchanged to parse_utf8_char()
 * Output:
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      The arguments are expanded more than once; pass plain variables,
 *      not expressions with side effects.
 * ========================================================================= */
#define PARSE_NON_STEM_WORD(s_in, end)                                      \
  do {                                                                      \
    int charlength = 0;                                                     \
    unsigned short c;                                                       \
                                                                            \
    charlength = parse_utf8_char((s_in), (end), &c);                        \
                                                                            \
    while (charlength > 0 && !is_unicode_letdig(c)) {                       \
      (s_in) += charlength;                                                 \
      charlength = parse_utf8_char((s_in), (end), &c);                      \
    }                                                                       \
  } while (0)
256
/*
 * Superseded byte-oriented (pre-UTF-8) version of PARSE_NON_STEM_WORD, kept
 * for reference only:
 *
#define PARSE_NON_STEM_WORD(s_in, end)                                \
  do                                                                  \
    {                                                                 \
      while (!INAWORD(*(s_in)) && (s_in)<=(end))                      \
        (s_in)++;                                                     \
    }while(0)
 */
265
266
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD_OR_SGML_TAG
 * Description:
 *      Like PARSE_NON_STEM_WORD but also eats up SGML tags: when a '<' is
 *      met, everything up to and including the next '>' is skipped, even
 *      letters and digits.
 *      If the input ends (parse_utf8_char() returns a length <= 0) before
 *      the closing '>' is found, the macro stops there without moving
 *      s_in any further.
 * Input:
 *      s_in = modifiable pointer to the first byte to examine
 *      end  = end of the buffer, handed unchanged to parse_utf8_char()
 * Output:
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      The arguments are expanded more than once; pass plain variables,
 *      not expressions with side effects.
 * ========================================================================= */
#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end)                          \
  do {                                                                      \
    int charlength = 0;                                                     \
    unsigned short c;                                                       \
                                                                            \
    charlength = parse_utf8_char((s_in), (end), &c);                        \
                                                                            \
    while (charlength > 0 && !is_unicode_letdig(c)) {                       \
      if (c == '<') {                                                       \
        while (charlength > 0 && c != '>') {                                \
          (s_in) += charlength;                                             \
          charlength = parse_utf8_char((s_in), (end), &c);                  \
        }                                                                   \
        /* unterminated tag: never step by a non-positive length */         \
        if (charlength <= 0)                                                \
          break;                                                            \
      }                                                                     \
      (s_in) += charlength;                                                 \
      charlength = parse_utf8_char((s_in), (end), &c);                      \
    }                                                                       \
  } while (0)
292
/*
 * Superseded byte-oriented (pre-UTF-8) version of
 * PARSE_NON_STEM_WORD_OR_SGML_TAG, kept for reference only:
 *
#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end)                    \
  do                                                                  \
    {                                                                 \
      register int c = *(s_in);                                       \
                                                                      \
      while (!INAWORD(c) && (s_in)<=(end))                            \
        {                                                             \
          if (c == '<')                                               \
            {                                                         \
              while (c != '>' && (s_in)<=(end))                       \
                c = *++(s_in);                                        \
            }                                                         \
          if ((s_in)<=(end))                                          \
            c = *++(s_in);                                            \
        }                                                             \
    }while(0)
 */
311
312
/* =========================================================================
 * Macro: PARSE_OPT_TERM_PARAM          [RPAP - Jan 97: Stem Index Change]
 * Description:
 *      Extracts out the optional parameter for a query term.
 *      Needed only in parsing the query line!
 *
 *      If the byte at s_in is WEIGHTPARAM or STEMPARAM, that character is
 *      stored in type and the decimal digits that follow are copied into
 *      Param as a '\0'-terminated string.  At most MAXPARAMLEN digits are
 *      stored; any further digits are skipped.  If no parameter character
 *      is present, or s_in is already past end, nothing is modified.
 *
 *      Note: this macro has not been converted to use UTF-8, as it should
 *            still work as it is (it only looks at ASCII characters).
 *
 * Input:
 *      Param = output buffer of at least MAXPARAMLEN + 1 bytes
 *      type  = lvalue that receives the parameter character
 *      s_in  = modifiable pointer to the first byte to examine
 *      end   = pointer to the last valid byte of the buffer (inclusive)
 * Output:
 *      Param = digits of the parameter value (possibly an empty string)
 *      type  = WEIGHTPARAM or STEMPARAM, when a parameter was found
 *      s_in  = ptr to next character in buffer yet to be processed
 * Notes:
 *      No byte beyond end is ever read.  The arguments are expanded more
 *      than once; pass plain variables, not expressions with side effects.
 * ========================================================================= */
#define PARSE_OPT_TERM_PARAM(Param, type, s_in, end)                        \
  do {                                                                      \
    u_char *wptr = (Param);                                                 \
    int length = 0;                                                         \
    int c = ((s_in) <= (end)) ? (unsigned char) *(s_in) : '\0';             \
                                                                            \
    if (c == WEIGHTPARAM || c == STEMPARAM) {                               \
      (type) = c;                                                           \
      ++(s_in);                                                             \
      while (length < MAXPARAMLEN && (s_in) <= (end) &&                     \
             isdigit((unsigned char) *(s_in))) {                            \
        *wptr++ = *(s_in)++;                                                \
        length++;                                                           \
      }                                                                     \
      *wptr = '\0';                                                         \
      /* discard digits that did not fit into Param */                      \
      while ((s_in) <= (end) && isdigit((unsigned char) *(s_in)))           \
        ++(s_in);                                                           \
    }                                                                       \
  } while (0)
348
/* =========================================================================
 * Macro: PARSE_RANKED_NON_STEM_WORD    [RJM 07/97: Ranked Required Terms]
 * Description:
 *      Eat up non-word.  Do not store non-word.
 *      If a match requirement (MUSTMATCHPARAM) is met among the skipped
 *      characters, record it in require_match.
 *      It is not needed in index, only in text!
 *
 * Input:
 *      require_match = lvalue that receives the requirement mode
 *      s_in = modifiable pointer to the first byte to examine
 *      end  = end of the buffer, handed unchanged to parse_utf8_char()
 * Output:
 *      require_match = the requirement mode for the next term:
 *                      0 = optional match, 1 = must match.
 *                      It is always reset to 0 first.  The convention also
 *                      has -1 = must not match, but this macro never
 *                      produces that value.
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      The arguments are expanded more than once; pass plain variables,
 *      not expressions with side effects.
 * ========================================================================= */
#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end)                \
  do {                                                                      \
    int charlength = 0;                                                     \
    unsigned short c;                                                       \
    (require_match) = 0;                                                    \
                                                                            \
    charlength = parse_utf8_char((s_in), (end), &c);                        \
                                                                            \
    while (charlength > 0 && !is_unicode_letdig(c)) {                       \
      if (c == MUSTMATCHPARAM)                                              \
        (require_match) = 1;                                                \
      (s_in) += charlength;                                                 \
      charlength = parse_utf8_char((s_in), (end), &c);                      \
    }                                                                       \
  } while (0)
374
/*
 * Superseded byte-oriented (pre-UTF-8) version of
 * PARSE_RANKED_NON_STEM_WORD, kept for reference only:
 *
#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end)          \
  do {                                                                \
    (require_match) = 0;                                              \
    while (!INAWORD(*(s_in)) && (s_in)<=(end)) {                      \
      if (*(s_in) == MUSTMATCHPARAM) {                                \
        (require_match) = 1;                                          \
      }                                                               \
      (s_in)++;                                                       \
    }                                                                 \
  } while (0)
 */
387
Note: See TracBrowser for help on using the repository browser.