source: trunk/indexers/mg/src/text/words.h@ 13660

Last change on this file since 13660 was 13660, checked in by kjdon, 17 years ago

added some x++ -> ++x changes submitted by Emanuel Dejanu

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.6 KB
Line 
1/**************************************************************************
2 *
3 * words.h -- Macros for parsing out words from the source text
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: words.h 13660 2007-01-16 23:13:03Z kjdon $
21 *
22 **************************************************************************/
23
24
25#include "sysfuncs.h"
26#include "unitool.h"
27
28/*
29 * This has been cleaned up by Tim Shimmin.
30 */
31
32/*
33 * ---NOTE---
34 *
35 * "WORD" refers to a word in the compressed text.
36 * "STEM" or "STEM_WORD" refers to a word for indexing on
37 *
38 */
39
#define MAXWORDLEN 15
  /* Maximum length in bytes of any word or non-word.  Note that
     variations to MAXWORDLEN may have dramatic effects on the rest
     of the program, as the length and the prefix match are packed
     together into a four bit nibble, i.e., leave MAXWORDLEN alone...
     The check below rejects any value that cannot be held in four
     bits, so an accidental change fails at compile time. */
#if MAXWORDLEN > 15
#error MAXWORDLEN must fit in a four bit nibble (at most 15)
#endif
46
#define MAXSTEMLEN 255
  /* Maximum length in bytes of any stem.  Note that
     variations to MAXSTEMLEN may have dramatic effects on the rest
     of the program, i.e., leave MAXSTEMLEN alone...
     PARSE_STEM_WORD stores the stem length in the first byte of its
     output buffer, so the value must not exceed 255; the check below
     enforces that at compile time. */
#if MAXSTEMLEN > 255
#error MAXSTEMLEN must fit in a single length byte (at most 255)
#endif
51
52/*#define MAXNUMERIC 4*/
53
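  /* Maximum number of numeric characters permitted in a word.
     This avoids long sequences of numbers creating just one
     word occurrence for each number.  With a limit of 4, at most
     10,000 all-numeric words will be permitted.
     The compile-time constant above is disabled: PARSE_WORD and
     PARSE_STEM_WORD now obtain the limit at run time with
     IntEnv (GetEnv ("maxnumeric"), 4), i.e. 4 is only the default. */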
58
/* [RPAP - Jan 97: Stem Index Change] */
#define MAXPARAMLEN 20
  /* Maximum number of bytes to read for a parameter value for a
     term in a query.  PARSE_OPT_TERM_PARAM copies at most this many
     digits and then appends a terminating '\0', so the buffer it
     is given must hold MAXPARAMLEN + 1 bytes. */
#define WEIGHTPARAM '/'
#define STEMPARAM '#'
  /* The two characters PARSE_OPT_TERM_PARAM accepts as the start of
     an optional term parameter; whichever one is found is stored in
     that macro's `type' argument. */
65
/* [RJM 07/97: Ranked Required Terms] */
#define MUSTMATCHPARAM '+'
  /* When PARSE_RANKED_NON_STEM_WORD meets this character among the
     non-word characters it skips, it sets its require_match
     argument to 1. */
68
69
#define PESINAWORD(c) (isalnum(c) || ((c) >= 0x80 && (c) <= 0xff))
  /* The definition of what characters are permitted in a word.
     This macro is pessimistic: you cannot tell from a particular
     byte above 0x80 whether it is a character or not, so every byte
     in the range 0x80..0xff is accepted.  This macro is needed by
     various functions relating to huffman coding where frequency
     counts need to be primed; it should not be used in parsing the
     UTF-8 encoded input.

     The argument is evaluated up to three times, so it must not have
     side effects.  It is passed straight to isalnum(), which is only
     defined for values representable as unsigned char (or EOF): pass
     byte values held in an int or an unsigned char, never a plain
     char that may be negative. */
77
78int inaword (const u_char *here, const u_char *end);
79 /* Takes the place of the old INAWORD macro. It determines
80 whether a given place in a UTF-8 encoded Unicode string
81 is part of a word. */
82
83int isaspace (const u_char *here, const u_char *end);
84 /* It determines whether a given place in a UTF-8 encoded
85 Unicode string is a unicode space. */
86
87u_char *skipspace(u_char *here, u_char *end);
88 /* Return a the UTF-8 encoded Unicode string with beginning
89 unicode spaces skipped. */
90
91
/* =========================================================================
 * Macro: PARSE_WORD
 * Description:
 *      Extract a word out for compressing text.
 *      Whole characters, as delimited by parse_utf8_char, are copied
 *      while they are Unicode letters or digits.  Copying stops
 *        - when parse_utf8_char returns a length that is not positive,
 *        - before a character that would take the word past
 *          MAXWORDLEN bytes,
 *        - before the first character that is neither letter nor digit,
 *        - before the digit that would exceed the "maxnumeric" limit,
 *          IntEnv (GetEnv ("maxnumeric"), 4), which is looked up on
 *          every expansion of the macro.
 * Input:
 *      s_in = string start in buffer (must be a modifiable pointer)
 *      end  = string end in buffer, handed to parse_utf8_char as is
 * Output:
 *      Word = extracted word with length in 1st byte
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      Word, s_in and end are each evaluated more than once.
 *      The macro's own variables carry a pw_ prefix so that they cannot
 *      capture identifiers such as `c' or `length' used in the
 *      caller's arguments.
 * ========================================================================= */
#define PARSE_WORD(Word, s_in, end)                                         \
  do {                                                                      \
    u_char *pw_out = (Word) + 1;                                            \
    int pw_len = 0;                                                         \
    int pw_digits = 0;                                                      \
    int pw_maxdigits = IntEnv (GetEnv ("maxnumeric"), 4);                   \
    unsigned short pw_ch;                                                   \
    int pw_chlen = parse_utf8_char ((s_in), (end), &pw_ch);                 \
                                                                            \
    while (pw_chlen > 0 && pw_len + pw_chlen <= MAXWORDLEN &&               \
           (is_unicode_letter (pw_ch) ||                                    \
            (is_unicode_digit (pw_ch) && ++pw_digits <= pw_maxdigits))) {   \
      /* copy every byte of the accepted character */                       \
      for (; pw_chlen > 0; --pw_chlen) {                                    \
        *pw_out++ = *(s_in)++;                                              \
        ++pw_len;                                                           \
      }                                                                     \
      pw_chlen = parse_utf8_char ((s_in), (end), &pw_ch);                   \
    }                                                                       \
    *(Word) = (u_char) pw_len;                                              \
  } while (0)
124
125 /*
126#define PARSE_WORD(Word, s_in, end) \
127 do { \
128 register u_char *wptr = (Word)+1; \
129 register int length = 0; \
130 register int c = *(s_in); \
131 register int numeric = 0; \
132 \
133 while( length < MAXWORDLEN && INAWORD(c) && (s_in)<=(end)) \
134 { \
135 if ((numeric += INNUMBER(c)) > MAXNUMERIC) \
136 break; \
137 *wptr++ = c; \
138 ++length; \
139 c = *++(s_in); \
140 } \
141 *(Word) = length; \
142 }while(0)
143 */
144
145
/* =========================================================================
 * Macro: PARSE_NON_WORD
 * Description:
 *      Extract a non-word out for storing compressed text.
 *      Whole characters, as delimited by parse_utf8_char, are copied
 *      while is_unicode_letdig is false for them.  Copying stops when
 *      parse_utf8_char returns a length that is not positive, before a
 *      character that would take the non-word past MAXWORDLEN bytes,
 *      or before the first character for which is_unicode_letdig is
 *      true.
 * Input:
 *      s_in = string start in buffer (must be a modifiable pointer)
 *      end  = string end in buffer, handed to parse_utf8_char as is
 * Output:
 *      Word = extracted non-word with length in 1st byte
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      Word, s_in and end are each evaluated more than once.
 *      The macro's own variables carry a pnw_ prefix so that they
 *      cannot capture identifiers such as `c' or `length' used in the
 *      caller's arguments.
 * ========================================================================= */
#define PARSE_NON_WORD(Word, s_in, end)                                     \
  do {                                                                      \
    u_char *pnw_out = (Word) + 1;                                           \
    int pnw_len = 0;                                                        \
    unsigned short pnw_ch;                                                  \
    int pnw_chlen = parse_utf8_char ((s_in), (end), &pnw_ch);               \
                                                                            \
    while (pnw_chlen > 0 && pnw_len + pnw_chlen <= MAXWORDLEN &&            \
           !is_unicode_letdig (pnw_ch)) {                                   \
      /* copy every byte of the accepted character */                       \
      for (; pnw_chlen > 0; --pnw_chlen) {                                  \
        *pnw_out++ = *(s_in)++;                                             \
        ++pnw_len;                                                          \
      }                                                                     \
      pnw_chlen = parse_utf8_char ((s_in), (end), &pnw_ch);                 \
    }                                                                       \
    *(Word) = (u_char) pnw_len;                                             \
  } while (0)
171
172 /*
173#define PARSE_NON_WORD(Word, s_in, end) \
174 do { \
175 register u_char *wptr = (Word)+1; \
176 register int length = 0; \
177 register int c = *(s_in); \
178 \
179 while( length < MAXWORDLEN && !INAWORD(c) && (s_in)<=(end) ) \
180 { \
181 *wptr++ = c; \
182 ++length; \
183 c = *++(s_in); \
184 } \
185 *(Word) = length; \
186 }while(0)
187 */
188
189
/* =========================================================================
 * Macro: PARSE_STEM_WORD
 * Description:
 *      Extracts out Word.
 *      Whole characters, as delimited by parse_utf8_char, are copied
 *      while they are Unicode letters or digits.  Copying stops
 *        - when parse_utf8_char returns a length that is not positive,
 *        - before a character that would take the stem past
 *          MAXSTEMLEN bytes,
 *        - before the first character that is neither letter nor digit,
 *        - before the digit that would exceed the "maxnumeric" limit,
 *          IntEnv (GetEnv ("maxnumeric"), 4), which is looked up on
 *          every expansion of the macro.
 * Input:
 *      s_in points to 1st letter in buffer to test
 *           (must be a modifiable pointer)
 *      end points to last letter in buffer
 * Output:
 *      s_in is modified to move to next word
 *      Returns Word filled in with length in 1st byte.
 * Notes:
 *      Word, s_in and end are each evaluated more than once.
 *      The macro's own variables carry a psw_ prefix so that they
 *      cannot capture identifiers such as `c' or `length' used in the
 *      caller's arguments.
 * ========================================================================= */
#define PARSE_STEM_WORD(Word, s_in, end)                                    \
  do {                                                                      \
    u_char *psw_out = (Word) + 1;                                           \
    int psw_len = 0;                                                        \
    int psw_digits = 0;                                                     \
    int psw_maxdigits = IntEnv (GetEnv ("maxnumeric"), 4);                  \
    unsigned short psw_ch;                                                  \
    int psw_chlen = parse_utf8_char ((s_in), (end), &psw_ch);               \
                                                                            \
    while (psw_chlen > 0 && psw_len + psw_chlen <= MAXSTEMLEN &&            \
           (is_unicode_letter (psw_ch) ||                                   \
            (is_unicode_digit (psw_ch) &&                                   \
             ++psw_digits <= psw_maxdigits))) {                             \
      /* copy every byte of the accepted character */                       \
      for (; psw_chlen > 0; --psw_chlen) {                                  \
        *psw_out++ = *(s_in)++;                                             \
        ++psw_len;                                                          \
      }                                                                     \
      psw_chlen = parse_utf8_char ((s_in), (end), &psw_ch);                 \
    }                                                                       \
    *(Word) = (u_char) psw_len;                                             \
  } while (0)
222 /*
223#define PARSE_STEM_WORD(Word, s_in, end) \
224 do \
225 { \
226 register u_char *wptr = (Word)+1; \
227 register int length = 0; \
228 register int c = *(s_in); \
229 register int numeric = 0; \
230 \
231 while ( length < MAXSTEMLEN && INAWORD(c) && (s_in)<=(end)) \
232 { \
233 if ((numeric += INNUMBER(c)) > MAXNUMERIC) \
234 break; \
235 *wptr++ = c; \
236 ++length; \
237 c = *++(s_in); \
238 } \
239 *(Word) = length; \
240 }while(0)
241 */
242
243
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD
 * Description:
 *      Eat up non-word. Do not store non-word.
 *      It is not needed in index only in text !
 *      s_in is advanced one character (as delimited by
 *      parse_utf8_char) at a time until parse_utf8_char returns a
 *      length that is not positive or is_unicode_letdig is true for
 *      the character at s_in.
 *
 * Input:
 *      s_in = string start in buffer (must be a modifiable pointer)
 *      end  = string end in buffer, handed to parse_utf8_char as is
 * Output:
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      s_in and end are each evaluated more than once.
 *      The macro's own variables carry a pnsw_ prefix so that they
 *      cannot capture identifiers such as `c' used in the caller's
 *      arguments.
 * ========================================================================= */
#define PARSE_NON_STEM_WORD(s_in, end)                                      \
  do {                                                                      \
    unsigned short pnsw_ch;                                                 \
    int pnsw_chlen = parse_utf8_char ((s_in), (end), &pnsw_ch);             \
                                                                            \
    while (pnsw_chlen > 0 && !is_unicode_letdig (pnsw_ch)) {                \
      (s_in) += pnsw_chlen;                                                 \
      pnsw_chlen = parse_utf8_char ((s_in), (end), &pnsw_ch);               \
    }                                                                       \
  } while (0)
265
266 /*
267#define PARSE_NON_STEM_WORD(s_in, end) \
268 do \
269 { \
270 while (!INAWORD(*(s_in)) && (s_in)<=(end)) \
271 (s_in)++; \
272 }while(0)
273 */
274
275
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD_OR_SGML_TAG
 * Description:
 *      Like PARSE_NON_STEM_WORD but also eats up SGML tags:
 *      when a '<' is met, everything up to and including the next
 *      '>' is skipped, whatever characters lie in between.  If the
 *      input runs out before a '>' is found, s_in is left where
 *      parse_utf8_char stopped reporting characters.
 * Input:
 *      s_in = string start in buffer (must be a modifiable pointer)
 *      end  = string end in buffer, handed to parse_utf8_char as is
 * Output:
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      s_in and end are each evaluated more than once.
 *      The macro's own variables carry a pnst_ prefix so that they
 *      cannot capture identifiers such as `c' used in the caller's
 *      arguments.
 * ========================================================================= */
#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end)                          \
  do {                                                                      \
    unsigned short pnst_ch;                                                 \
    int pnst_chlen = parse_utf8_char ((s_in), (end), &pnst_ch);             \
                                                                            \
    while (pnst_chlen > 0 && !is_unicode_letdig (pnst_ch)) {                \
      if (pnst_ch == '<') {                                                 \
        /* skip the tag body; the '>' itself is stepped over below */       \
        while (pnst_chlen > 0 && pnst_ch != '>') {                          \
          (s_in) += pnst_chlen;                                             \
          pnst_chlen = parse_utf8_char ((s_in), (end), &pnst_ch);           \
        }                                                                   \
        /* input ended inside the tag: nothing left to step over */         \
        if (pnst_chlen <= 0)                                                \
          break;                                                            \
      }                                                                     \
      (s_in) += pnst_chlen;                                                 \
      pnst_chlen = parse_utf8_char ((s_in), (end), &pnst_ch);               \
    }                                                                       \
  } while (0)
301
302 /*
303#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end) \
304 do \
305 { \
306 register int c = *(s_in); \
307 \
308 while (!INAWORD(c) && (s_in)<=(end)) \
309 { \
310 if (c == '<') \
311 { \
312 while (c != '>' && (s_in)<=(end)) \
313 c = *++(s_in); \
314 } \
315 if ((s_in)<=(end)) \
316 c = *++(s_in); \
317 } \
318 }while(0)
319 */
320
321
/* =========================================================================
 * Macro: PARSE_OPT_TERM_PARAM    [RPAP - Jan 97: Stem Index Change]
 * Description:
 *      Extracts out optional parameter for query term.
 *      Needed only in parsing the query line !
 *
 *      If s_in is within the buffer and points at WEIGHTPARAM or
 *      STEMPARAM, that character is stored in type, the run of
 *      decimal digits that follows is copied to Param (at most
 *      MAXPARAMLEN of them, then a terminating '\0'), and any further
 *      digits of the same run are skipped.  Otherwise Param, type and
 *      s_in are all left untouched.
 *
 *      Note: that this function has not been converted to use UTF-8
 *            as it should still work as it is (only uses ascii
 *            characters)
 *
 * Input:
 *      Param = buffer of at least MAXPARAMLEN + 1 bytes
 *      type  = modifiable lvalue receiving the parameter character
 *      s_in  = current position in buffer (must be a modifiable pointer)
 *      end   = last valid byte of the buffer
 * Output:
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      No byte beyond end is ever read: every dereference of s_in is
 *      preceded by a check that s_in <= end.
 *      Param, type, s_in and end are each evaluated more than once.
 * ========================================================================= */
#define PARSE_OPT_TERM_PARAM(Param, type, s_in, end)                        \
  do {                                                                      \
    if ((s_in) <= (end) &&                                                  \
        (*(s_in) == WEIGHTPARAM || *(s_in) == STEMPARAM)) {                 \
      u_char *potp_out = (Param);                                           \
      int potp_len = 0;                                                     \
                                                                            \
      (type) = *(s_in);                                                     \
      ++(s_in);                                                             \
      while (potp_len < MAXPARAMLEN && (s_in) <= (end) &&                   \
             isdigit (*(s_in))) {                                           \
        *potp_out++ = *(s_in);                                              \
        ++(s_in);                                                           \
        ++potp_len;                                                         \
      }                                                                     \
      *potp_out = '\0';                                                     \
      /* discard the digits that did not fit into Param */                  \
      while ((s_in) <= (end) && isdigit (*(s_in)))                          \
        ++(s_in);                                                           \
    }                                                                       \
  } while (0)
357
/* =========================================================================
 * Macro: PARSE_RANKED_NON_STEM_WORD   [RJM 07/97: Ranked Required Terms]
 * Description:
 *      Eat up non-word. Do not store non-word.
 *      If come across a match requirement store it in require_match
 *      It is not needed in index only in text !
 *
 *      s_in is advanced one character (as delimited by
 *      parse_utf8_char) at a time until parse_utf8_char returns a
 *      length that is not positive or is_unicode_letdig is true for
 *      the character at s_in.
 *
 * Input:
 *      require_match = modifiable lvalue receiving the requirement mode
 *      s_in = string start in buffer (must be a modifiable pointer)
 *      end  = string end in buffer, handed to parse_utf8_char as is
 * Output: the requirement mode for the next term. -1=must not match,
 *         0=optional match, 1=must match
 *         This macro itself only ever stores 0 (no MUSTMATCHPARAM
 *         among the skipped characters) or 1 (at least one seen);
 *         it never produces -1.
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      require_match, s_in and end are each evaluated more than once.
 *      The macro's own variables carry a prnsw_ prefix so that they
 *      cannot capture identifiers such as `c' used in the caller's
 *      arguments.
 * ========================================================================= */
#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end)                \
  do {                                                                      \
    unsigned short prnsw_ch;                                                \
    int prnsw_chlen;                                                        \
                                                                            \
    (require_match) = 0;                                                    \
    prnsw_chlen = parse_utf8_char ((s_in), (end), &prnsw_ch);               \
    while (prnsw_chlen > 0 && !is_unicode_letdig (prnsw_ch)) {              \
      if (prnsw_ch == MUSTMATCHPARAM)                                       \
        (require_match) = 1;                                                \
      (s_in) += prnsw_chlen;                                                \
      prnsw_chlen = parse_utf8_char ((s_in), (end), &prnsw_ch);             \
    }                                                                       \
  } while (0)
383
384 /*
385#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end) \
386 do { \
387 (require_match) = 0; \
388 while (!INAWORD(*(s_in)) && (s_in)<=(end)) { \
389 if (*(s_in) == MUSTMATCHPARAM) { \
390 (require_match) = 1; \
391 } \
392 (s_in)++; \
393 } \
394 } while (0)
395 */
Note: See TracBrowser for help on using the repository browser.