source: main/trunk/greenstone2/common-src/indexers/mg/src/text/words.h@ 29327

Last change on this file since 29327 was 29327, checked in by ak19, 10 years ago

Dr Bainbridge fixed mg segmentation fault on Mac Lion, Mountain Lion and Maverick. On these Macs, mg collections didn't work. At least one mg binary (mg_passes) segfaulted. The problem was that functions GetEnv and IntEnv were declared at a point where their return types were not known. The compiler gave out a warning that the return type was being defaulted to an int (4 bytes). As a result of this, running in the debugger showed that the return values from these functions had got truncated. Dr Bainbridge allowed the files with the functions to know the return types at the necessary time during compilation by ensuring the necessary header files (mglong.h and environment.h) were included at the right time and in the right places so that they were available to the c functions and files that needed to know about them.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.7 KB
Line 
1/**************************************************************************
2 *
3 * words.h -- Macros for parsing out words from the source text
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: words.h 29327 2014-09-22 06:13:42Z ak19 $
21 *
22 **************************************************************************/
23
24
25#include "sysfuncs.h"
26#include "unitool.h"
27
28#include "environment.h"
29
30/*
31 * This has been cleaned up by Tim Shimmin.
32 */
33
34/*
35 * ---NOTE---
36 *
37 * "WORD" refers to a word in the compressed text.
38 * "STEM" or "STEM_WORD" refers to a word for indexing on
39 *
40 */
41
42#define MAXWORDLEN 15
43 /* Maximum length in bytes of any word or non-word (the copy limit
44 used by PARSE_WORD and PARSE_NON_WORD). Note that changes to
45 MAXWORDLEN may have dramatic effects on the rest of the program, as
46 the length and the prefix match are packed together into a four bit
47 nibble, and there is no check that this is possible, i.e., leave MAXWORDLEN alone... */
48
49#define MAXSTEMLEN 255
50 /* Maximum length in bytes of any stem (the copy limit used by
51 PARSE_STEM_WORD, which stores the length in the first byte of its output). Note that changes to MAXSTEMLEN may have dramatic effects on the rest
52 of the program, i.e., leave MAXSTEMLEN alone... */
53
54/*#define MAXNUMERIC 4*/
55
56 /* Maximum number of numeric characters permitted in a word.
57 This avoids long sequences of numbers creating just one
58 word occurrence for each number. At most 10,000 all numeric
59 words will be permitted. */
60
61/* [RPAP - Jan 97: Stem Index Change] */
62#define MAXPARAMLEN 20
63 /* Maximum number of digit bytes PARSE_OPT_TERM_PARAM stores for a parameter value of a
64 term in a query; the terminating NUL is written in addition to these. */
65#define WEIGHTPARAM '/' /* marker that opens a term parameter; presumably a term weight -- TODO confirm */
66#define STEMPARAM '#' /* marker that opens a term parameter; presumably a stemming method -- TODO confirm */
67
68/* [RJM 07/97: Ranked Required Terms] */
69#define MUSTMATCHPARAM '+' /* when met among the non-word characters, PARSE_RANKED_NON_STEM_WORD sets require_match to 1 */
70
71
/*
 * PESINAWORD(c) -- pessimistic test for "could this byte belong to a word?"
 *
 * Evaluates to 1 when c is an alphanumeric below 0x80 (as reported by
 * isalnum) or any value in the range 0x80..0xff, and to 0 otherwise.
 *
 * The macro is pessimistic: a single byte at or above 0x80 cannot tell
 * you whether the character it belongs to is a word character, so all
 * of them are accepted.  It is needed by various functions relating to
 * huffman coding where frequency counts need to be primed; it should
 * not be used in parsing the UTF-8 encoded input.
 *
 * Values outside 0..0xff (for example a high byte held in a signed char,
 * or EOF) are rejected before isalnum is reached, because isalnum is
 * only defined for arguments representable as unsigned char or EOF.
 *
 * Note: c is evaluated more than once, so it must not have side effects.
 */
#define PESINAWORD(c) \
  ((c) >= 0 && (c) <= 0xff && ((c) >= 0x80 || isalnum(c)))
79
80int inaword (const u_char *here, const u_char *end);
81 /* Takes the place of the old INAWORD macro. Determines whether
82 the position `here' in a UTF-8 encoded Unicode string
83 is part of a word. NOTE(review): `end' presumably bounds the string and the result is presumably non-zero for "yes" -- confirm in the implementation. */
84
85int isaspace (const u_char *here, const u_char *end);
86 /* Determines whether the position `here' in a UTF-8 encoded
87 Unicode string is a Unicode space. NOTE(review): `end' presumably bounds the string -- confirm in the implementation. */
88
89u_char *skipspace(u_char *here, u_char *end);
90 /* Returns the UTF-8 encoded Unicode string with any leading
91 Unicode spaces skipped. NOTE(review): presumably a pointer into the range [here, end] -- confirm in the implementation. */
92
93
/* =========================================================================
 * Macro: PARSE_WORD
 * Description:
 *      Copies the next word of the source text into Word, for use when
 *      compressing text.  Characters are decoded one at a time with
 *      parse_utf8_char and are copied while they are Unicode letters or
 *      digits (is_unicode_letter / is_unicode_digit).
 *
 *      Copying stops at the first of:
 *        - parse_utf8_char reporting a length of zero or less,
 *        - a character that is neither a letter nor a digit,
 *        - a digit that would take the word past the "maxnumeric" limit,
 *        - a character whose bytes would take the word past MAXWORDLEN
 *          (a character is copied whole or not at all).
 *
 *      The digit limit is fetched with IntEnv (GetEnv ("maxnumeric"), 4)
 *      each time the macro is executed.
 * Input:
 *      Word = destination buffer; needs room for 1 + MAXWORDLEN bytes.
 *             The expression is evaluated twice.
 *      s_in = modifiable pointer to the next unprocessed byte
 *      end  = end of the buffer, handed unchanged to parse_utf8_char
 * Output:
 *      Word = word length in byte 0, followed by the word's bytes
 *      s_in = advanced past the bytes that were copied
 * ========================================================================= */
#define PARSE_WORD(Word, s_in, end) \
  do { \
    register u_char *pw_out = (Word) + 1; \
    register int pw_len = 0; \
    register int pw_digits = 0; \
    register int pw_digit_cap = IntEnv (GetEnv ("maxnumeric"), 4); \
    unsigned short pw_ch; \
    register int pw_bytes = parse_utf8_char ((s_in), (end), &pw_ch); \
    \
    for (;;) { \
      if (pw_bytes <= 0 || pw_len + pw_bytes > MAXWORDLEN) \
        break; \
      /* digits are counted only when the character is not a letter */ \
      if (!is_unicode_letter (pw_ch) && \
          !(is_unicode_digit (pw_ch) && ++pw_digits <= pw_digit_cap)) \
        break; \
      for (; pw_bytes > 0; --pw_bytes) { \
        *pw_out++ = *(s_in)++; \
        ++pw_len; \
      } \
      pw_bytes = parse_utf8_char ((s_in), (end), &pw_ch); \
    } \
    *(Word) = pw_len; \
  } while (0)
126
127 /*
128#define PARSE_WORD(Word, s_in, end) \
129 do { \
130 register u_char *wptr = (Word)+1; \
131 register int length = 0; \
132 register int c = *(s_in); \
133 register int numeric = 0; \
134 \
135 while( length < MAXWORDLEN && INAWORD(c) && (s_in)<=(end)) \
136 { \
137 if ((numeric += INNUMBER(c)) > MAXNUMERIC) \
138 break; \
139 *wptr++ = c; \
140 ++length; \
141 c = *++(s_in); \
142 } \
143 *(Word) = length; \
144 }while(0)
145 */
146
147
/* =========================================================================
 * Macro: PARSE_NON_WORD
 * Description:
 *      Copies the next non-word of the source text into Word, for storing
 *      compressed text.  Characters are decoded with parse_utf8_char and
 *      copied for as long as is_unicode_letdig rejects them.
 *
 *      Copying stops when parse_utf8_char reports a length of zero or
 *      less, when a character accepted by is_unicode_letdig is met, or
 *      when the next character's bytes would take the non-word past
 *      MAXWORDLEN (a character is copied whole or not at all).
 * Input:
 *      Word = destination buffer; needs room for 1 + MAXWORDLEN bytes.
 *             The expression is evaluated twice.
 *      s_in = modifiable pointer to the next unprocessed byte
 *      end  = end of the buffer, handed unchanged to parse_utf8_char
 * Output:
 *      Word = non-word length in byte 0, followed by its bytes
 *      s_in = advanced past the bytes that were copied
 * ========================================================================= */
#define PARSE_NON_WORD(Word, s_in, end) \
  do { \
    register u_char *pnw_out = (Word) + 1; \
    register int pnw_len = 0; \
    unsigned short pnw_ch; \
    register int pnw_bytes = parse_utf8_char ((s_in), (end), &pnw_ch); \
    \
    while (pnw_bytes > 0 && pnw_len + pnw_bytes <= MAXWORDLEN && \
           !is_unicode_letdig (pnw_ch)) { \
      /* pnw_bytes is positive here, so at least one byte is copied */ \
      do { \
        *pnw_out++ = *(s_in)++; \
        ++pnw_len; \
      } while (--pnw_bytes > 0); \
      pnw_bytes = parse_utf8_char ((s_in), (end), &pnw_ch); \
    } \
    *(Word) = pnw_len; \
  } while (0)
173
174 /*
175#define PARSE_NON_WORD(Word, s_in, end) \
176 do { \
177 register u_char *wptr = (Word)+1; \
178 register int length = 0; \
179 register int c = *(s_in); \
180 \
181 while( length < MAXWORDLEN && !INAWORD(c) && (s_in)<=(end) ) \
182 { \
183 *wptr++ = c; \
184 ++length; \
185 c = *++(s_in); \
186 } \
187 *(Word) = length; \
188 }while(0)
189 */
190
191
/* =========================================================================
 * Macro: PARSE_STEM_WORD
 * Description:
 *      Extracts the next word to index on (a stem word) into Word.
 *      Works like PARSE_WORD but is bounded by MAXSTEMLEN instead of
 *      MAXWORDLEN: characters are decoded with parse_utf8_char and copied
 *      while they are Unicode letters or digits.
 *
 *      Copying stops at the first of:
 *        - parse_utf8_char reporting a length of zero or less,
 *        - a character that is neither a letter nor a digit,
 *        - a digit that would take the word past the "maxnumeric" limit,
 *        - a character whose bytes would take the word past MAXSTEMLEN
 *          (a character is copied whole or not at all).
 *
 *      The digit limit is fetched with IntEnv (GetEnv ("maxnumeric"), 4)
 *      each time the macro is executed.
 * Input:
 *      Word = destination buffer; needs room for 1 + MAXSTEMLEN bytes.
 *             The expression is evaluated twice.
 *      s_in = modifiable pointer to the first byte to test
 *      end  = points to the last byte in the buffer
 * Output:
 *      Word = word length in byte 0, followed by the word's bytes
 *      s_in = advanced past the bytes that were copied
 * ========================================================================= */
#define PARSE_STEM_WORD(Word, s_in, end) \
  do { \
    register u_char *psw_out = (Word) + 1; \
    register int psw_len = 0; \
    register int psw_digits = 0; \
    register int psw_digit_cap = IntEnv (GetEnv ("maxnumeric"), 4); \
    unsigned short psw_ch; \
    register int psw_bytes = parse_utf8_char ((s_in), (end), &psw_ch); \
    \
    for (;;) { \
      if (psw_bytes <= 0 || psw_len + psw_bytes > MAXSTEMLEN) \
        break; \
      /* digits are counted only when the character is not a letter */ \
      if (!is_unicode_letter (psw_ch) && \
          !(is_unicode_digit (psw_ch) && ++psw_digits <= psw_digit_cap)) \
        break; \
      for (; psw_bytes > 0; --psw_bytes) { \
        *psw_out++ = *(s_in)++; \
        ++psw_len; \
      } \
      psw_bytes = parse_utf8_char ((s_in), (end), &psw_ch); \
    } \
    *(Word) = psw_len; \
  } while (0)
224 /*
225#define PARSE_STEM_WORD(Word, s_in, end) \
226 do \
227 { \
228 register u_char *wptr = (Word)+1; \
229 register int length = 0; \
230 register int c = *(s_in); \
231 register int numeric = 0; \
232 \
233 while ( length < MAXSTEMLEN && INAWORD(c) && (s_in)<=(end)) \
234 { \
235 if ((numeric += INNUMBER(c)) > MAXNUMERIC) \
236 break; \
237 *wptr++ = c; \
238 ++length; \
239 c = *++(s_in); \
240 } \
241 *(Word) = length; \
242 }while(0)
243 */
244
245
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD
 * Description:
 *      Skips over a non-word without storing it; the non-word is needed
 *      in the text but not in the index.  Characters are decoded with
 *      parse_utf8_char and skipped until one is accepted by
 *      is_unicode_letdig or parse_utf8_char reports a length of zero or
 *      less.
 * Input:
 *      s_in = modifiable pointer to the next unprocessed byte
 *      end  = end of the buffer, handed unchanged to parse_utf8_char
 * Output:
 *      s_in = advanced past the skipped characters
 * ========================================================================= */
#define PARSE_NON_STEM_WORD(s_in, end) \
  do { \
    unsigned short pnsw_ch; \
    register int pnsw_bytes; \
    \
    for (pnsw_bytes = parse_utf8_char ((s_in), (end), &pnsw_ch); \
         pnsw_bytes > 0 && !is_unicode_letdig (pnsw_ch); \
         pnsw_bytes = parse_utf8_char ((s_in), (end), &pnsw_ch)) \
      (s_in) += pnsw_bytes; \
  } while (0)
267
268 /*
269#define PARSE_NON_STEM_WORD(s_in, end) \
270 do \
271 { \
272 while (!INAWORD(*(s_in)) && (s_in)<=(end)) \
273 (s_in)++; \
274 }while(0)
275 */
276
277
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD_OR_SGML_TAG
 * Description:
 *      Like PARSE_NON_STEM_WORD, but also skips SGML tags.  When a '<' is
 *      met, every character up to the next '>' is skipped regardless of
 *      whether it is a letter or digit; the '>' itself is then skipped
 *      as well.
 *
 *      Skipping stops at a character accepted by is_unicode_letdig that
 *      lies outside a tag, or when parse_utf8_char reports a length of
 *      zero or less (which may happen inside an unterminated tag).
 * Input:
 *      s_in = modifiable pointer to the next unprocessed byte
 *      end  = end of the buffer, handed unchanged to parse_utf8_char
 * Output:
 *      s_in = advanced past the skipped characters
 * ========================================================================= */
#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end) \
  do { \
    unsigned short ptag_ch; \
    register int ptag_bytes = parse_utf8_char ((s_in), (end), &ptag_ch); \
    \
    while (ptag_bytes > 0 && !is_unicode_letdig (ptag_ch)) { \
      if (ptag_ch == '<') { \
        /* swallow the tag body; leaves ptag_ch on '>' when one is found */ \
        for (; ptag_bytes > 0 && ptag_ch != '>'; \
             ptag_bytes = parse_utf8_char ((s_in), (end), &ptag_ch)) \
          (s_in) += ptag_bytes; \
      } \
      (s_in) += ptag_bytes; \
      ptag_bytes = parse_utf8_char ((s_in), (end), &ptag_ch); \
    } \
  } while (0)
303
304 /*
305#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end) \
306 do \
307 { \
308 register int c = *(s_in); \
309 \
310 while (!INAWORD(c) && (s_in)<=(end)) \
311 { \
312 if (c == '<') \
313 { \
314 while (c != '>' && (s_in)<=(end)) \
315 c = *++(s_in); \
316 } \
317 if ((s_in)<=(end)) \
318 c = *++(s_in); \
319 } \
320 }while(0)
321 */
322
323
/* =========================================================================
 * Macro: PARSE_OPT_TERM_PARAM  [RPAP - Jan 97: Stem Index Change]
 * Description:
 *      Extracts the optional parameter that may follow a query term.
 *      Needed only in parsing the query line !
 *
 *      If the byte at s_in is WEIGHTPARAM or STEMPARAM, that byte is
 *      stored in type and the decimal digits following it are copied
 *      into Param as a NUL-terminated string.  At most MAXPARAMLEN digits
 *      are stored; any further digits are skipped.  If the byte at s_in
 *      is neither marker (or s_in is already past end), nothing is
 *      changed: Param, type and s_in are all left untouched.
 *
 *      No byte beyond end is ever read: the bounds test is made before
 *      each access to the buffer.
 *
 *      Note: this macro has not been converted to use UTF-8, as it should
 *            still work as it is (it only looks at ASCII characters).
 * Input:
 *      Param = destination buffer; needs room for MAXPARAMLEN + 1 bytes
 *      type  = lvalue that receives the marker character
 *      s_in  = modifiable pointer to the next unprocessed byte
 *      end   = points to the last byte in the buffer (inclusive)
 * Output:
 *      Param = the digits as a NUL-terminated string (only when a marker
 *              was found)
 *      type  = the marker character (only when a marker was found)
 *      s_in  = advanced past the marker and all digits following it
 * ========================================================================= */
#define PARSE_OPT_TERM_PARAM(Param, type, s_in, end) \
  do { \
    register u_char *otp_out = (Param); \
    register int otp_len = 0; \
    \
    if ((s_in) <= (end) && \
        (*(s_in) == WEIGHTPARAM || *(s_in) == STEMPARAM)) { \
      (type) = *(s_in); \
      ++(s_in); \
      /* keep up to MAXPARAMLEN digits */ \
      while (otp_len < MAXPARAMLEN && (s_in) <= (end) && \
             isdigit (*(s_in))) { \
        *otp_out++ = *(s_in)++; \
        ++otp_len; \
      } \
      *otp_out = '\0'; \
      /* discard digits that did not fit */ \
      while ((s_in) <= (end) && isdigit (*(s_in))) \
        ++(s_in); \
    } \
  } while (0)
359
/* =========================================================================
 * Macro: PARSE_RANKED_NON_STEM_WORD  [RJM 07/97: Ranked Required Terms]
 * Description:
 *      Skips over a non-word without storing it, while watching for a
 *      match requirement for the term that follows.  require_match is
 *      first reset to 0 and is set to 1 if MUSTMATCHPARAM occurs anywhere
 *      in the skipped non-word.
 *
 *      Skipping stops at a character accepted by is_unicode_letdig or
 *      when parse_utf8_char reports a length of zero or less.
 * Input:
 *      require_match = lvalue that receives the requirement mode
 *      s_in          = modifiable pointer to the next unprocessed byte
 *      end           = end of the buffer, handed unchanged to
 *                      parse_utf8_char
 * Output:
 *      require_match = requirement mode for the next term:
 *                      0 = optional match, 1 = must match.
 *                      (The value -1 = must not match is never produced
 *                      by this macro.)
 *      s_in          = advanced past the skipped characters
 * ========================================================================= */
#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end) \
  do { \
    unsigned short prn_ch; \
    register int prn_bytes; \
    \
    (require_match) = 0; \
    for (prn_bytes = parse_utf8_char ((s_in), (end), &prn_ch); \
         prn_bytes > 0 && !is_unicode_letdig (prn_ch); \
         prn_bytes = parse_utf8_char ((s_in), (end), &prn_ch)) { \
      if (prn_ch == MUSTMATCHPARAM) \
        (require_match) = 1; \
      (s_in) += prn_bytes; \
    } \
  } while (0)
385
386 /*
387#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end) \
388 do { \
389 (require_match) = 0; \
390 while (!INAWORD(*(s_in)) && (s_in)<=(end)) { \
391 if (*(s_in) == MUSTMATCHPARAM) { \
392 (require_match) = 1; \
393 } \
394 (s_in)++; \
395 } \
396 } while (0)
397 */
Note: See TracBrowser for help on using the repository browser.