Context Navigation

words.h@ 29327

Last change on this file since 29327 was 29327, checked in by ak19, 10 years ago
Dr Bainbridge fixed mg segmentation fault on Mac Lion, Mountain Lion and Maverick. On these Macs, mg collections didn't work. At least one mg binary (mg_passes) segfaulted. The problem was that functions GetEnv and IntEnv were declared at a point where their return types were not known. The compiler gave out a warning that the return type was being defaulted to an int (4 bytes). As a result of this, running in the debugger showed that the return values from these functions had got truncated. Dr Bainbridge allowed the files with the functions to know the return types at the necessary time during compilation by ensuring the necessary header files (mglong.h and environment.h) were included at the right time and in the right places so that they were available to the c functions and files that needed to know about them.
Property svn:executable set to `*`
Property svn:keywords set to `Author Date Id Revision`
File size: 18.7 KB

Line
1	/**************************************************************************
2	*
3	* words.h -- Macros for parsing out words from the source text
4	* Copyright (C) 1994 Neil Sharman
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	* $Id: words.h 29327 2014-09-22 06:13:42Z ak19 $
21	*
22	**************************************************************************/
23
24
25	#include "sysfuncs.h"
26	#include "unitool.h"
27
28	#include "environment.h"
29
30	/*
31	* This has been cleaned up by Tim Shimmin.
32	*/
33
34	/*
35	* ---NOTE---
36	*
37	* "WORD" refers to a word in the compressed text.
38	* "STEM" or "STEM_WORD" refers to a word for indexing on
39	*
40	*/
41
#define MAXWORDLEN 15
/* Maximum length in bytes of any word or non-word.  Note that
   variations to MAXWORDLEN may have dramatic effects on the rest
   of the program, as the length and the prefix match are packed
   together into a four bit nibble, and there is no check that
   this is possible, i.e., leave MAXWORDLEN alone... */

#define MAXSTEMLEN 255
/* Maximum length in bytes of any stem.  Note that variations to
   MAXSTEMLEN may have dramatic effects on the rest of the program,
   i.e., leave MAXSTEMLEN alone...  (PARSE_STEM_WORD stores the stem
   length in the first byte of its output buffer, so a larger value
   would presumably no longer fit in that byte.) */
53
54	/*#define MAXNUMERIC 4*/
55
56	/* Maximum number of numeric characters permitted in a word.
57	This avoids long sequences of numbers creating just one
58	word occurrence for each number. At most 10,000 all numeric
59	words will be permitted. */
60
/* [RPAP - Jan 97: Stem Index Change] */
#define MAXPARAMLEN 20
/* Maximum number of bytes to read for a parameter value for a
   term in a query.  PARSE_OPT_TERM_PARAM stores up to this many
   digits followed by a terminating '\0'. */
#define WEIGHTPARAM '/'
/* Character that introduces a weight parameter after a query term. */
#define STEMPARAM '#'
/* Character that introduces a stem parameter after a query term. */

/* [RJM 07/97: Ranked Required Terms] */
#define MUSTMATCHPARAM '+'
/* Character that PARSE_RANKED_NON_STEM_WORD treats as "the next term
   must match". */
70
71
#define PESINAWORD(c) (isalnum(c) || ((c) >= 0x80 && (c) <= 0xff))
/* The definition of what characters are permitted in a word.
   This macro is pessimistic: you cannot tell from a particular
   byte at or above 0x80 whether it is part of a character or not,
   so every such byte is accepted.  It is needed by various functions
   relating to huffman coding where frequency counts need to be
   primed; it should not be used in parsing the UTF-8 encoded input.

   Note: `c' is evaluated up to three times, so do not pass an
   expression with side effects.  Requires <ctype.h> for isalnum(). */
79
80	int inaword (const u_char here, const u_char end);
81	/* Takes the place of the old INAWORD macro. It determines
82	whether a given place in a UTF-8 encoded Unicode string
83	is part of a word. */
84
85	int isaspace (const u_char here, const u_char end);
86	/* It determines whether a given place in a UTF-8 encoded
87	Unicode string is a unicode space. */
88
89	u_char skipspace(u_char here, u_char *end);
90	/* Return a the UTF-8 encoded Unicode string with beginning
91	unicode spaces skipped. */
92
93
/* =========================================================================
 * Macro: PARSE_WORD
 * Description:
 *      Extract a word out for compressing text.
 *      Characters are consumed while they are Unicode letters, or Unicode
 *      digits provided no more than `maxnumeric' digits have been taken
 *      for this word, and while the next character still fits within
 *      MAXWORDLEN bytes.  `maxnumeric' comes from
 *      IntEnv (GetEnv ("maxnumeric"), 4) on every expansion (presumably
 *      4 is the fallback when the setting is absent -- see environment.h).
 * Input:
 *      Word = u_char buffer of at least MAXWORDLEN+1 bytes
 *      s_in = string start in buffer (modifiable u_char pointer lvalue)
 *      end  = string end in buffer
 * Output:
 *      Word = extracted word with length in 1st byte
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      Arguments are evaluated more than once; do not pass expressions
 *      with side effects.  Parsing stops as soon as parse_utf8_char()
 *      returns a non-positive length.
 * ========================================================================= */
#define PARSE_WORD(Word, s_in, end)                                  \
  do {                                                               \
    register int charlength = 0;                                     \
    register u_char *wptr = (Word)+1;                                \
    register int length = 0;                                         \
    register int numeric = 0;                                        \
    unsigned short c;                                                \
    register int maxnumeric = IntEnv (GetEnv ("maxnumeric"), 4);     \
                                                                     \
    charlength = parse_utf8_char((s_in),(end),&c);                   \
                                                                     \
    while (length+charlength <= MAXWORDLEN && charlength > 0 &&      \
           (is_unicode_letter(c) || (is_unicode_digit(c) &&          \
           ++numeric <= maxnumeric))) {                              \
      /* copy every byte of the character just accepted */           \
      while (charlength-- > 0) {                                     \
        *wptr++ = *(s_in)++; ++length;                               \
      }                                                              \
      charlength = parse_utf8_char((s_in),(end),&c);                 \
    }                                                                \
    *(Word) = length;                                                \
  }while(0)
126
127	/*
128	#define PARSE_WORD(Word, s_in, end) \
129	do { \
130	register u_char *wptr = (Word)+1; \
131	register int length = 0; \
132	register int c = *(s_in); \
133	register int numeric = 0; \
134	\
135	while( length < MAXWORDLEN && INAWORD(c) && (s_in)<=(end)) \
136	{ \
137	if ((numeric += INNUMBER(c)) > MAXNUMERIC) \
138	break; \
139	*wptr++ = c; \
140	++length; \
141	c = *++(s_in); \
142	} \
143	*(Word) = length; \
144	}while(0)
145	*/
146
147
/* =========================================================================
 * Macro: PARSE_NON_WORD
 * Description:
 *      Extract a non-word out for storing compressed text.
 *      Characters are consumed while they are neither Unicode letters
 *      nor digits (!is_unicode_letdig) and while the next character
 *      still fits within MAXWORDLEN bytes.
 * Input:
 *      Word = u_char buffer of at least MAXWORDLEN+1 bytes
 *      s_in = string start in buffer (modifiable u_char pointer lvalue)
 *      end  = string end in buffer
 * Output:
 *      Word = extracted non-word with length in 1st byte
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      Arguments are evaluated more than once; do not pass expressions
 *      with side effects.
 * ========================================================================= */
#define PARSE_NON_WORD(Word, s_in, end)                              \
  do {                                                               \
    register int charlength = 0;                                     \
    register u_char *wptr = (Word)+1;                                \
    register int length = 0;                                         \
    unsigned short c;                                                \
                                                                     \
    charlength = parse_utf8_char((s_in),(end),&c);                   \
                                                                     \
    while (length+charlength <= MAXWORDLEN && charlength > 0 &&      \
           !is_unicode_letdig(c)) {                                  \
      /* copy every byte of the character just accepted */           \
      while (charlength-- > 0) {                                     \
        *wptr++ = *(s_in)++; ++length;                               \
      }                                                              \
      charlength = parse_utf8_char((s_in),(end),&c);                 \
    }                                                                \
    *(Word) = length;                                                \
  }while(0)
173
174	/*
175	#define PARSE_NON_WORD(Word, s_in, end) \
176	do { \
177	register u_char *wptr = (Word)+1; \
178	register int length = 0; \
179	register int c = *(s_in); \
180	\
181	while( length < MAXWORDLEN && !INAWORD(c) && (s_in)<=(end) ) \
182	{ \
183	*wptr++ = c; \
184	++length; \
185	c = *++(s_in); \
186	} \
187	*(Word) = length; \
188	}while(0)
189	*/
190
191
/* =========================================================================
 * Macro: PARSE_STEM_WORD
 * Description:
 *      Extracts out Word for indexing.  Same acceptance rule as
 *      PARSE_WORD (Unicode letters, plus Unicode digits while no more
 *      than `maxnumeric' digits have been taken), but bounded by
 *      MAXSTEMLEN bytes instead of MAXWORDLEN.  `maxnumeric' comes from
 *      IntEnv (GetEnv ("maxnumeric"), 4) on every expansion.
 * Input:
 *      Word = u_char buffer of at least MAXSTEMLEN+1 bytes
 *      s_in points to 1st letter in buffer to test
 *           (modifiable u_char pointer lvalue)
 *      end  points to last letter in buffer
 * Output:
 *      s_in is modified to move to next word
 *      Returns Word filled in with length in 1st byte.
 * Notes:
 *      Arguments are evaluated more than once; do not pass expressions
 *      with side effects.
 * ========================================================================= */
#define PARSE_STEM_WORD(Word, s_in, end)                             \
  do {                                                               \
    register int charlength = 0;                                     \
    register u_char *wptr = (Word)+1;                                \
    register int length = 0;                                         \
    register int numeric = 0;                                        \
    unsigned short c;                                                \
    register int maxnumeric = IntEnv (GetEnv ("maxnumeric"), 4);     \
                                                                     \
    charlength = parse_utf8_char((s_in),(end),&c);                   \
                                                                     \
    while (length+charlength <= MAXSTEMLEN && charlength > 0 &&      \
           (is_unicode_letter(c) || (is_unicode_digit(c) &&          \
           ++numeric <= maxnumeric))) {                              \
      /* copy every byte of the character just accepted */           \
      while (charlength-- > 0) {                                     \
        *wptr++ = *(s_in)++; ++length;                               \
      }                                                              \
      charlength = parse_utf8_char((s_in),(end),&c);                 \
    }                                                                \
    *(Word) = length;                                                \
  }while(0)
224	/*
225	#define PARSE_STEM_WORD(Word, s_in, end) \
226	do \
227	{ \
228	register u_char *wptr = (Word)+1; \
229	register int length = 0; \
230	register int c = *(s_in); \
231	register int numeric = 0; \
232	\
233	while ( length < MAXSTEMLEN && INAWORD(c) && (s_in)<=(end)) \
234	{ \
235	if ((numeric += INNUMBER(c)) > MAXNUMERIC) \
236	break; \
237	*wptr++ = c; \
238	++length; \
239	c = *++(s_in); \
240	} \
241	*(Word) = length; \
242	}while(0)
243	*/
244
245
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD
 * Description:
 *      Eat up non-word.  Do not store non-word.
 *      It is not needed in index only in text !
 *      s_in is advanced over every character for which
 *      is_unicode_letdig() is false, stopping at the first letter or
 *      digit or when parse_utf8_char() returns a non-positive length.
 * Input:
 *      s_in = current position in buffer (modifiable pointer lvalue)
 *      end  = string end in buffer
 * Output:
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      Arguments are evaluated more than once; do not pass expressions
 *      with side effects.
 * ========================================================================= */
#define PARSE_NON_STEM_WORD(s_in, end)                               \
  do {                                                               \
    register int charlength = 0;                                     \
    unsigned short c;                                                \
                                                                     \
    charlength = parse_utf8_char((s_in),(end),&c);                   \
                                                                     \
    while (charlength > 0 && !is_unicode_letdig(c)) {                \
      (s_in) += charlength;                                          \
      charlength = parse_utf8_char((s_in),(end),&c);                 \
    }                                                                \
  }while(0)
267
268	/*
269	#define PARSE_NON_STEM_WORD(s_in, end) \
270	do \
271	{ \
272	while (!INAWORD(*(s_in)) && (s_in)<=(end)) \
273	(s_in)++; \
274	}while(0)
275	*/
276
277
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD_OR_SGML_TAG
 * Description:
 *      Like PARSE_NON_STEM_WORD but also eats up SGML tags.
 *      When a '<' is met, everything up to and including the next '>'
 *      is skipped, letters and digits included.  If no '>' is found,
 *      skipping continues until parse_utf8_char() returns a
 *      non-positive length.
 * Input:
 *      s_in = current position in buffer (modifiable pointer lvalue)
 *      end  = string end in buffer
 * Output:
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      Arguments are evaluated more than once; do not pass expressions
 *      with side effects.
 * ========================================================================= */
#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end)                   \
  do {                                                               \
    register int charlength = 0;                                     \
    unsigned short c;                                                \
                                                                     \
    charlength = parse_utf8_char((s_in),(end),&c);                   \
                                                                     \
    while (charlength > 0 && !is_unicode_letdig(c)) {                \
      if (c == '<') {                                                \
        /* inside a tag: run forward to the closing '>' */           \
        while (charlength > 0 && c != '>') {                         \
          (s_in) += charlength;                                      \
          charlength = parse_utf8_char((s_in),(end),&c);             \
        }                                                            \
      }                                                              \
      /* step over the non-word character (or the closing '>') */    \
      (s_in) += charlength;                                          \
      charlength = parse_utf8_char((s_in),(end),&c);                 \
    }                                                                \
  }while(0)
303
304	/*
305	#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end) \
306	do \
307	{ \
308	register int c = *(s_in); \
309	\
310	while (!INAWORD(c) && (s_in)<=(end)) \
311	{ \
312	if (c == '<') \
313	{ \
314	while (c != '>' && (s_in)<=(end)) \
315	c = *++(s_in); \
316	} \
317	if ((s_in)<=(end)) \
318	c = *++(s_in); \
319	} \
320	}while(0)
321	*/
322
323
/* =========================================================================
 * Macro: PARSE_OPT_TERM_PARAM             [RPAP - Jan 97: Stem Index Change]
 * Description:
 *      Extracts out optional parameter for query term.
 *      Needed only in parsing the query line !
 *
 *      If *s_in is WEIGHTPARAM or STEMPARAM, that character is stored in
 *      `type', up to MAXPARAMLEN following digits are copied to Param and
 *      NUL-terminated, and any further digits are skipped.  Otherwise
 *      nothing is modified.
 *
 *      Note: that this function has not been converted to use UTF-8
 *            as it should still work as it is (only uses ascii
 *            characters)
 *
 * Input:
 *      Param = u_char buffer of at least MAXPARAMLEN+1 bytes
 *      type  = lvalue receiving the parameter marker character
 *      s_in  = current position in buffer (modifiable pointer lvalue);
 *              the caller must ensure s_in <= end on entry, because
 *              *s_in is read unconditionally
 *      end   = last valid byte in buffer
 * Output:
 *      s_in  = ptr to first byte after the marker and its digits
 * Notes:
 *      The bounds test is done before each dereference, so the byte
 *      after `end' is never read.  Arguments are evaluated more than
 *      once; do not pass expressions with side effects.
 * ========================================================================= */
#define PARSE_OPT_TERM_PARAM(Param, type, s_in, end)                 \
  do                                                                 \
    {                                                                \
      register u_char *wptr = (Param);                               \
      register int length = 0;                                       \
      register int c = *(s_in);                                      \
                                                                     \
      if (c == WEIGHTPARAM || c == STEMPARAM)                        \
        {                                                            \
          (type) = c;                                                \
          ++(s_in);                                                  \
          while (length < MAXPARAMLEN && (s_in) <= (end) &&          \
                 isdigit((u_char)*(s_in)))                           \
            {                                                        \
              *wptr++ = *(s_in)++;                                   \
              ++length;                                              \
            }                                                        \
          *wptr = '\0';                                              \
          /* discard digits that did not fit in Param */             \
          while ((s_in) <= (end) && isdigit((u_char)*(s_in)))        \
            ++(s_in);                                                \
        }                                                            \
    }while(0)
359
/* =========================================================================
 * Macro: PARSE_RANKED_NON_STEM_WORD    [RJM 07/97: Ranked Required Terms]
 * Description:
 *      Eat up non-word.  Do not store non-word.
 *      If come across a match requirement store it in require_match
 *      It is not needed in index only in text !
 *
 * Input:
 *      require_match = lvalue receiving the requirement mode
 *      s_in = current position in buffer (modifiable pointer lvalue)
 *      end  = string end in buffer
 * Output:
 *      s_in = ptr to next character in buffer yet to be processed
 *      require_match = the requirement mode for the next term.
 *              -1=must not match, 0=optional match, 1=must match
 *      NOTE: this macro itself only ever stores 0 (the initial value)
 *      or 1 (a MUSTMATCHPARAM was seen anywhere in the skipped
 *      non-word); it never produces -1.
 * Notes:
 *      Arguments are evaluated more than once; do not pass expressions
 *      with side effects.
 * ========================================================================= */
#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end)         \
  do {                                                               \
    register int charlength = 0;                                     \
    unsigned short c;                                                \
    (require_match) = 0;                                             \
                                                                     \
    charlength = parse_utf8_char((s_in),(end),&c);                   \
                                                                     \
    while (charlength > 0 && !is_unicode_letdig(c)) {                \
      if (c == MUSTMATCHPARAM) (require_match) = 1;                  \
      (s_in) += charlength;                                          \
      charlength = parse_utf8_char((s_in),(end),&c);                 \
    }                                                                \
  }while(0)
385
386	/*
387	#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end) \
388	do { \
389	(require_match) = 0; \
390	while (!INAWORD(*(s_in)) && (s_in)<=(end)) { \
391	if (*(s_in) == MUSTMATCHPARAM) { \
392	(require_match) = 1; \
393	} \
394	(s_in)++; \
395	} \
396	} while (0)
397	*/

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/common-src/indexers/mg/src/text/words.h@ 29327

Download in other formats: