source: trunk/indexers/mg/src/text/words.h@ 8694

Last change on this file since 8694 was 8694, checked in by kjdon, 19 years ago

added some changes made by Emanuel Dejanu (Simple Words)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.6 KB
Line 
1/**************************************************************************
2 *
3 * words.h -- Macros for parsing out words from the source text
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: words.h 8694 2004-11-29 03:15:13Z kjdon $
21 *
22 **************************************************************************/
23
24
25#include "sysfuncs.h"
26#include "unitool.h"
27
28/*
29 * This has been cleaned up by Tim Shimmin.
30 */
31
32/*
33 * ---NOTE---
34 *
35 * "WORD" refers to a word in the compressed text.
36 * "STEM" or "STEM_WORD" refers to a word for indexing on
37 *
38 */
39
#define MAXWORDLEN 15
 /* Maximum length in bytes of any word or non-word.  Note that
    variations to MAXWORDLEN may have dramatic effects on the rest
    of the program, as the length and the prefix match are packed
    together into a four bit nibble.  The check below rejects values
    that cannot fit; otherwise, leave MAXWORDLEN alone... */
#if MAXWORDLEN > 15
#error "MAXWORDLEN must not exceed 15: word lengths are packed into a four bit nibble"
#endif

#define MAXSTEMLEN 255
 /* Maximum length in bytes of any stem.  Note that variations to
    MAXSTEMLEN may have dramatic effects on the rest of the program.
    PARSE_STEM_WORD stores the stem length in the first byte of its
    output buffer, so the value must fit in one byte, i.e., leave
    MAXSTEMLEN alone... */
#if MAXSTEMLEN > 255
#error "MAXSTEMLEN must not exceed 255: stem lengths are stored in a single byte"
#endif

/*#define MAXNUMERIC 4*/

 /* Maximum number of numeric characters permitted in a word.
    This avoids long sequences of numbers creating just one
    word occurrence for each number.  At most 10,000 all numeric
    words will be permitted.
    The limit is no longer this compile-time constant: PARSE_WORD and
    PARSE_STEM_WORD fetch it at run time with
    IntEnv (GetEnv ("maxnumeric"), 4). */

/* [RPAP - Jan 97: Stem Index Change] */
#define MAXPARAMLEN 20
 /* Maximum number of bytes to read for a parameter value for a
    term in a query.  PARSE_OPT_TERM_PARAM writes up to this many
    digits followed by a terminating '\0'. */
#define WEIGHTPARAM '/'
#define STEMPARAM '#'
 /* The two characters that PARSE_OPT_TERM_PARAM recognises as the
    start of an optional numeric parameter after a query term. */

/* [RJM 07/97: Ranked Required Terms] */
#define MUSTMATCHPARAM '+'
 /* Character that makes PARSE_RANKED_NON_STEM_WORD report the next
    term as "must match". */
68
69
#define PESINAWORD(c) \
  (isalnum ((unsigned char) (c)) || (unsigned char) (c) >= 0x80)
 /* The definition of what characters are permitted in a word.
    This macro is pessimistic, you cannot tell from a particular
    byte above 0x80 whether it is a character or not.  This macro
    is needed by various functions relating to huffman coding
    where frequency counts need to be primed, it should not be
    used in parsing the UTF-8 encoded input.

    The argument is treated as a single byte: it is converted to
    unsigned char before being classified, so a plain (possibly
    signed) char holding a byte >= 0x80 is accepted just like the
    integer values 0x80..0xff, and isalnum() is never handed a
    negative value.  The argument may be evaluated twice, so do not
    pass an expression with side effects. */
77
78int inaword (const u_char *here, const u_char *end);
79 /* Takes the place of the old INAWORD macro. It determines
80 whether a given place in a UTF-8 encoded Unicode string
81 is part of a word. */
82
83int isaspace (const u_char *here, const u_char *end);
84 /* It determines whether a given place in a UTF-8 encoded
85 Unicode string is a unicode space. */
86
87u_char *skipspace(u_char *here, u_char *end);
88 /* Return the UTF-8 encoded Unicode string with
89 beginning unicode spaces skipped. */
90
/* =========================================================================
 * Macro: PARSE_WORD
 * Description:
 *      Extract a word out for compressing text.
 *      Characters are copied while they are letters, or digits as long as
 *      no more than "maxnumeric" digits (run-time setting, default 4) have
 *      been seen, and while the word stays within MAXWORDLEN bytes.
 * Input:
 *      s_in = string start in buffer (modifiable pointer lvalue; it is
 *             evaluated several times)
 *      end  = string end in buffer, as expected by parse_utf8_char
 * Output:
 *      Word = extracted word with length in 1st byte (the buffer needs
 *             room for MAXWORDLEN+1 bytes)
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      All locals carry a pw_ prefix so that argument expressions which
 *      mention names such as `c' or `length' are not captured by the
 *      macro.  Word is evaluated exactly once.
 * ========================================================================= */
#define PARSE_WORD(Word, s_in, end) \
  do { \
    u_char *pw_word_ = (Word); \
    u_char *pw_dst_ = pw_word_ + 1; \
    int pw_len_ = 0; \
    int pw_digits_ = 0; \
    unsigned short pw_ch_; \
    int pw_maxdigits_ = IntEnv (GetEnv ("maxnumeric"), 4); \
    int pw_clen_ = parse_utf8_char ((s_in), (end), &pw_ch_); \
    \
    while (pw_len_ + pw_clen_ <= MAXWORDLEN && pw_clen_ > 0 && \
           (is_unicode_letter (pw_ch_) || \
            (is_unicode_digit (pw_ch_) && \
             ++pw_digits_ <= pw_maxdigits_))) { \
      /* copy every byte of the character just accepted */ \
      while (pw_clen_-- > 0) { \
        *pw_dst_++ = *(s_in)++; \
        pw_len_++; \
      } \
      pw_clen_ = parse_utf8_char ((s_in), (end), &pw_ch_); \
    } \
    *pw_word_ = pw_len_; \
  } while (0)
123
124 /*
125#define PARSE_WORD(Word, s_in, end) \
126 do { \
127 register u_char *wptr = (Word)+1; \
128 register int length = 0; \
129 register int c = *(s_in); \
130 register int numeric = 0; \
131 \
132 while( length < MAXWORDLEN && INAWORD(c) && (s_in)<=(end)) \
133 { \
134 if ((numeric += INNUMBER(c)) > MAXNUMERIC) \
135 break; \
136 *wptr++ = c; \
137 length++; \
138 c = *++(s_in); \
139 } \
140 *(Word) = length; \
141 }while(0)
142 */
143
144
/* =========================================================================
 * Macro: PARSE_NON_WORD
 * Description:
 *      Extract a non-word out for storing compressed text.
 *      Characters are copied while they are neither letters nor digits
 *      and while the non-word stays within MAXWORDLEN bytes.
 * Input:
 *      s_in = string start in buffer (modifiable pointer lvalue; it is
 *             evaluated several times)
 *      end  = string end in buffer, as expected by parse_utf8_char
 * Output:
 *      Word = extracted non-word with length in 1st byte (the buffer
 *             needs room for MAXWORDLEN+1 bytes)
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      All locals carry a pnw_ prefix so that argument expressions which
 *      mention names such as `c' or `length' are not captured by the
 *      macro.  Word is evaluated exactly once.
 * ========================================================================= */
#define PARSE_NON_WORD(Word, s_in, end) \
  do { \
    u_char *pnw_word_ = (Word); \
    u_char *pnw_dst_ = pnw_word_ + 1; \
    int pnw_len_ = 0; \
    unsigned short pnw_ch_; \
    int pnw_clen_ = parse_utf8_char ((s_in), (end), &pnw_ch_); \
    \
    while (pnw_len_ + pnw_clen_ <= MAXWORDLEN && pnw_clen_ > 0 && \
           !is_unicode_letdig (pnw_ch_)) { \
      /* copy every byte of the character just accepted */ \
      while (pnw_clen_-- > 0) { \
        *pnw_dst_++ = *(s_in)++; \
        pnw_len_++; \
      } \
      pnw_clen_ = parse_utf8_char ((s_in), (end), &pnw_ch_); \
    } \
    *pnw_word_ = pnw_len_; \
  } while (0)
170
171 /*
172#define PARSE_NON_WORD(Word, s_in, end) \
173 do { \
174 register u_char *wptr = (Word)+1; \
175 register int length = 0; \
176 register int c = *(s_in); \
177 \
178 while( length < MAXWORDLEN && !INAWORD(c) && (s_in)<=(end) ) \
179 { \
180 *wptr++ = c; \
181 length++; \
182 c = *++(s_in); \
183 } \
184 *(Word) = length; \
185 }while(0)
186 */
187
188
/* =========================================================================
 * Macro: PARSE_STEM_WORD
 * Description:
 *      Extracts out Word for indexing.
 *      Characters are copied while they are letters, or digits as long as
 *      no more than "maxnumeric" digits (run-time setting, default 4) have
 *      been seen, and while the stem stays within MAXSTEMLEN bytes.
 * Input:
 *      s_in points to 1st letter in buffer to test (modifiable pointer
 *           lvalue; it is evaluated several times)
 *      end  points to last letter in buffer
 * Output:
 *      s_in is modified to move to next word
 *      Returns Word filled in with length in 1st byte (the buffer needs
 *      room for MAXSTEMLEN+1 bytes).
 * Notes:
 *      All locals carry a psw_ prefix so that argument expressions which
 *      mention names such as `c' or `length' are not captured by the
 *      macro.  Word is evaluated exactly once.
 * ========================================================================= */
#define PARSE_STEM_WORD(Word, s_in, end) \
  do { \
    u_char *psw_word_ = (Word); \
    u_char *psw_dst_ = psw_word_ + 1; \
    int psw_len_ = 0; \
    int psw_digits_ = 0; \
    unsigned short psw_ch_; \
    int psw_maxdigits_ = IntEnv (GetEnv ("maxnumeric"), 4); \
    int psw_clen_ = parse_utf8_char ((s_in), (end), &psw_ch_); \
    \
    while (psw_len_ + psw_clen_ <= MAXSTEMLEN && psw_clen_ > 0 && \
           (is_unicode_letter (psw_ch_) || \
            (is_unicode_digit (psw_ch_) && \
             ++psw_digits_ <= psw_maxdigits_))) { \
      /* copy every byte of the character just accepted */ \
      while (psw_clen_-- > 0) { \
        *psw_dst_++ = *(s_in)++; \
        psw_len_++; \
      } \
      psw_clen_ = parse_utf8_char ((s_in), (end), &psw_ch_); \
    } \
    *psw_word_ = psw_len_; \
  } while (0)
221 /*
222#define PARSE_STEM_WORD(Word, s_in, end) \
223 do \
224 { \
225 register u_char *wptr = (Word)+1; \
226 register int length = 0; \
227 register int c = *(s_in); \
228 register int numeric = 0; \
229 \
230 while ( length < MAXSTEMLEN && INAWORD(c) && (s_in)<=(end)) \
231 { \
232 if ((numeric += INNUMBER(c)) > MAXNUMERIC) \
233 break; \
234 *wptr++ = c; \
235 length++; \
236 c = *++(s_in); \
237 } \
238 *(Word) = length; \
239 }while(0)
240 */
241
242
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD
 * Description:
 *      Eat up non-word. Do not store non-word.
 *      It is not needed in index only in text !
 *
 * Input:
 *      s_in = current position in buffer (modifiable pointer lvalue; it
 *             is evaluated several times)
 *      end  = string end in buffer, as expected by parse_utf8_char
 * Output:
 *      s_in is advanced past every leading character that is neither a
 *      letter nor a digit; it stops at the first letter/digit or when
 *      parse_utf8_char reports no further character.
 * Notes:
 *      The locals carry a pns_ prefix so that argument expressions which
 *      mention names such as `c' are not captured by the macro.
 * ========================================================================= */
#define PARSE_NON_STEM_WORD(s_in, end) \
  do { \
    unsigned short pns_ch_; \
    int pns_clen_; \
    \
    for (pns_clen_ = parse_utf8_char ((s_in), (end), &pns_ch_); \
         pns_clen_ > 0 && !is_unicode_letdig (pns_ch_); \
         pns_clen_ = parse_utf8_char ((s_in), (end), &pns_ch_)) \
      (s_in) += pns_clen_; \
  } while (0)
264
265 /*
266#define PARSE_NON_STEM_WORD(s_in, end) \
267 do \
268 { \
269 while (!INAWORD(*(s_in)) && (s_in)<=(end)) \
270 (s_in)++; \
271 }while(0)
272 */
273
274
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD_OR_SGML_TAG
 * Description:
 *      Like PARSE_NON_STEM_WORD but also eats up SGML tags: when a '<'
 *      is met, everything up to and including the next '>' is skipped,
 *      letters and digits inside the tag included.
 * Input:
 *      s_in = current position in buffer (modifiable pointer lvalue; it
 *             is evaluated several times)
 *      end  = string end in buffer, as expected by parse_utf8_char
 * Output:
 *      s_in is advanced to the first letter/digit outside a tag, or to
 *      the point where parse_utf8_char reports no further character.
 * Notes:
 *      The locals carry a pst_ prefix so that argument expressions which
 *      mention names such as `c' are not captured by the macro.
 * ========================================================================= */
#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end) \
  do { \
    unsigned short pst_ch_; \
    int pst_clen_ = parse_utf8_char ((s_in), (end), &pst_ch_); \
    \
    while (pst_clen_ > 0 && !is_unicode_letdig (pst_ch_)) { \
      if (pst_ch_ == '<') { \
        /* inside a tag: run forward until the closing '>' is current */ \
        while (pst_clen_ > 0 && pst_ch_ != '>') { \
          (s_in) += pst_clen_; \
          pst_clen_ = parse_utf8_char ((s_in), (end), &pst_ch_); \
        } \
      } \
      /* step over the current non-word character (or the '>') */ \
      (s_in) += pst_clen_; \
      pst_clen_ = parse_utf8_char ((s_in), (end), &pst_ch_); \
    } \
  } while (0)
300
301 /*
302#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end) \
303 do \
304 { \
305 register int c = *(s_in); \
306 \
307 while (!INAWORD(c) && (s_in)<=(end)) \
308 { \
309 if (c == '<') \
310 { \
311 while (c != '>' && (s_in)<=(end)) \
312 c = *++(s_in); \
313 } \
314 if ((s_in)<=(end)) \
315 c = *++(s_in); \
316 } \
317 }while(0)
318 */
319
320
/* =========================================================================
 * Macro: PARSE_OPT_TERM_PARAM           [RPAP - Jan 97: Stem Index Change]
 * Description:
 *      Extracts out optional parameter for query term.
 *      Needed only in parsing the query line !
 *
 *      If the character at s_in is WEIGHTPARAM or STEMPARAM, that
 *      character is stored in type and the decimal digits following it
 *      are copied into Param as a '\0'-terminated string.  At most
 *      MAXPARAMLEN digits are stored; any further digits are skipped.
 *      Otherwise nothing is modified.
 *
 * Note: that this function has not been converted to use UTF-8
 *       as it should still work as it is (only uses ascii
 *       characters)
 *
 * Input:
 *      Param = output buffer with room for MAXPARAMLEN+1 bytes
 *      type  = lvalue that receives the parameter character
 *      s_in  = current position in buffer (modifiable pointer lvalue; it
 *              is evaluated several times)
 *      end   = last valid byte of the buffer (inclusive)
 * Output:
 *      s_in is left on the first byte after the parameter.
 * Notes:
 *      Every read of *s_in is preceded by a check that s_in <= end, so
 *      no byte beyond end is ever examined and nothing is parsed when
 *      s_in already lies past end.
 * ========================================================================= */
#define PARSE_OPT_TERM_PARAM(Param, type, s_in, end) \
  do { \
    u_char *pop_dst_ = (Param); \
    int pop_len_ = 0; \
    \
    if ((s_in) <= (end) && \
        (*(s_in) == WEIGHTPARAM || *(s_in) == STEMPARAM)) { \
      (type) = *(s_in)++; \
      /* keep the first MAXPARAMLEN digits */ \
      while (pop_len_ < MAXPARAMLEN && (s_in) <= (end) && \
             isdigit ((unsigned char) *(s_in))) { \
        *pop_dst_++ = *(s_in)++; \
        pop_len_++; \
      } \
      *pop_dst_ = '\0'; \
      /* discard any digits that did not fit */ \
      while ((s_in) <= (end) && isdigit ((unsigned char) *(s_in))) \
        ++(s_in); \
    } \
  } while (0)
356
/* =========================================================================
 * Macro: PARSE_RANKED_NON_STEM_WORD    [RJM 07/97: Ranked Required Terms]
 * Description:
 *      Eat up non-word. Do not store non-word.
 *      If come across a match requirement store it in require_match
 *      It is not needed in index only in text !
 *
 * Input:
 *      require_match = lvalue that receives the requirement mode
 *      s_in = current position in buffer (modifiable pointer lvalue; it
 *             is evaluated several times)
 *      end  = string end in buffer, as expected by parse_utf8_char
 * Output:
 *      s_in is advanced past every leading character that is neither a
 *      letter nor a digit.
 *      require_match is the requirement mode for the next term:
 *      0=optional match, 1=must match (a MUSTMATCHPARAM character was
 *      among the skipped characters).  This macro never produces
 *      -1 (must not match).
 * Notes:
 *      The locals carry a prn_ prefix so that argument expressions which
 *      mention names such as `c' are not captured by the macro.
 * ========================================================================= */
#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end) \
  do { \
    unsigned short prn_ch_; \
    int prn_clen_; \
    \
    (require_match) = 0; \
    prn_clen_ = parse_utf8_char ((s_in), (end), &prn_ch_); \
    \
    while (prn_clen_ > 0 && !is_unicode_letdig (prn_ch_)) { \
      if (prn_ch_ == MUSTMATCHPARAM) \
        (require_match) = 1; \
      (s_in) += prn_clen_; \
      prn_clen_ = parse_utf8_char ((s_in), (end), &prn_ch_); \
    } \
  } while (0)
382
383 /*
384#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end) \
385 do { \
386 (require_match) = 0; \
387 while (!INAWORD(*(s_in)) && (s_in)<=(end)) { \
388 if (*(s_in) == MUSTMATCHPARAM) { \
389 (require_match) = 1; \
390 } \
391 (s_in)++; \
392 } \
393 } while (0)
394 */
Note: See TracBrowser for help on using the repository browser.