Context Navigation

source: trunk/gsdl3/src/packages/mg/src/text/words.h@ 7431

Last change on this file since 7431 was 7228, checked in by kjdon, 20 years ago
added a new -M option to mg_passes, allowing maxnumeric to be altered - made this change to keep gsdl3 mg inline with gsdl2 mg.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 18.3 KB

Line
1	/**************************************************************************
2	*
3	* words.h -- Macros for parsing out words from the source text
4	* Copyright (C) 1994 Neil Sharman
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	* $Id: words.h 7228 2004-04-25 23:01:18Z kjdon $
21	*
22	**************************************************************************/
23
24
25	#include "sysfuncs.h"
26	#include "unitool.h"
27
28	/*
29	* This has been cleaned up by Tim Shimmin.
30	*/
31
32	/*
33	* ---NOTE---
34	*
35	* "WORD" refers to a word in the compressed text.
36	* "STEM" or "STEM_WORD" refers to a word for indexing on
37	*
38	*/
39
#define MAXWORDLEN 15
/* Maximum length in bytes of any word or non-word.  Note that
   variations to MAXWORDLEN may have dramatic effects on the rest
   of the program, as the length and the prefix match are packed
   together into a four bit nibble, and there is no check that
   this is possible, i.e., leave MAXWORDLEN alone... */

#define MAXSTEMLEN 255
/* Maximum length in bytes of any stem.  Note that
   variations to MAXSTEMLEN may have dramatic effects on the rest
   of the program, i.e., leave MAXSTEMLEN alone... */

/*#define MAXNUMERIC 4*/

/* Maximum number of numeric characters permitted in a word.
   This avoids long sequences of numbers creating just one
   word occurrence for each number.  At most 10,000 all numeric
   words will be permitted.
   The compile-time constant above is retired: PARSE_WORD and
   PARSE_STEM_WORD obtain the limit at run time from the "maxnumeric"
   setting and fall back to 4 when it is not set. */

/* [RPAP - Jan 97: Stem Index Change] */
#define MAXPARAMLEN 20
/* Maximum number of bytes to read for a parameter value for a
   term in a query. */
#define WEIGHTPARAM '/'
#define STEMPARAM '#'
/* Characters that introduce an optional numeric parameter after a query
   term; PARSE_OPT_TERM_PARAM reports which one it saw through its
   `type' argument. */

/* [RJM 07/97: Ranked Required Terms] */
#define MUSTMATCHPARAM '+'
/* Character that PARSE_RANKED_NON_STEM_WORD treats as "the next term
   must match". */
68
69
#define PESINAWORD(c) (isalnum(c) || ((c) >= 0x80 && (c) <= 0xff))
/* The definition of what characters are permitted in a word.
   This macro is pessimistic, you cannot tell from a particular
   byte above 0x80 whether it is a character or not.  This macro
   is needed by various functions relating to huffman coding
   where frequency counts need to be primed, it should not be
   used in parsing the UTF-8 encoded input.
   The argument is evaluated more than once, so it must not have side
   effects, and it must be a value isalnum() accepts (an unsigned char
   value or EOF). */
77
78	int inaword (const u_char here, const u_char end);
79	/* Takes the place of the old INAWORD macro. It determines
80	whether a given place in a UTF-8 encoded Unicode string
81	is part of a word. */
82
83
/* =========================================================================
 * Macro: PARSE_WORD
 * Description:
 *      Extract a word out for compressing text.
 *      Characters are decoded one at a time with parse_utf8_char() and
 *      their bytes copied verbatim.  The word ends at the first character
 *      that
 *        - would push the byte length past MAXWORDLEN,
 *        - cannot be decoded (parse_utf8_char() returns <= 0),
 *        - is neither a Unicode letter nor a Unicode digit, or
 *        - is a digit once `maxnumeric' digits have already been taken.
 *      `maxnumeric' is looked up through IntEnv(GetEnv("maxnumeric"), 4)
 *      on every expansion.
 * Input:
 *      s_in = string start in buffer (modifiable u_char pointer lvalue)
 *      end  = string end in buffer
 * Output:
 *      Word = extracted word with length in 1st byte; the buffer must hold
 *             at least MAXWORDLEN+1 bytes
 *      s_in = ptr to next character in buffer yet to be processed
 * ========================================================================= */
#define PARSE_WORD(Word, s_in, end) \
  do { \
    register int charlength = 0; \
    register u_char *wptr = (Word)+1; \
    register int length = 0; \
    register int numeric = 0; \
    unsigned short c; \
    register int maxnumeric = IntEnv (GetEnv ("maxnumeric"), 4); \
    \
    charlength = parse_utf8_char((s_in),(end),&c); \
    \
    while (length+charlength <= MAXWORDLEN && charlength > 0 && \
           (is_unicode_letter(c) || (is_unicode_digit(c) && \
            ++numeric <= maxnumeric))) { \
      /* copy every byte of the character just accepted */ \
      while (charlength-- > 0) { \
        *wptr++ = *(s_in)++; length++; \
      } \
      charlength = parse_utf8_char((s_in),(end),&c); \
    } \
    *(Word) = length; \
  } while(0)
116
117	/*
118	#define PARSE_WORD(Word, s_in, end) \
119	do { \
120	register u_char *wptr = (Word)+1; \
121	register int length = 0; \
122	register int c = *(s_in); \
123	register int numeric = 0; \
124	\
125	while( length < MAXWORDLEN && INAWORD(c) && (s_in)<=(end)) \
126	{ \
127	if ((numeric += INNUMBER(c)) > MAXNUMERIC) \
128	break; \
129	*wptr++ = c; \
130	length++; \
131	c = *++(s_in); \
132	} \
133	*(Word) = length; \
134	}while(0)
135	*/
136
137
/* =========================================================================
 * Macro: PARSE_NON_WORD
 * Description:
 *      Extract a non-word out for storing compressed text.
 *      Characters are decoded with parse_utf8_char() and their bytes copied
 *      verbatim until a character would push the byte length past
 *      MAXWORDLEN, cannot be decoded (parse_utf8_char() returns <= 0), or
 *      satisfies is_unicode_letdig().
 * Input:
 *      s_in = string start in buffer (modifiable u_char pointer lvalue)
 *      end  = string end in buffer
 * Output:
 *      Word = extracted non-word with length in 1st byte; the buffer must
 *             hold at least MAXWORDLEN+1 bytes
 *      s_in = ptr to next character in buffer yet to be processed
 * ========================================================================= */
#define PARSE_NON_WORD(Word, s_in, end) \
  do { \
    register int charlength = 0; \
    register u_char *wptr = (Word)+1; \
    register int length = 0; \
    unsigned short c; \
    \
    charlength = parse_utf8_char((s_in),(end),&c); \
    \
    while (length+charlength <= MAXWORDLEN && charlength > 0 && \
           !is_unicode_letdig(c)) { \
      /* copy every byte of the character just accepted */ \
      while (charlength-- > 0) { \
        *wptr++ = *(s_in)++; length++; \
      } \
      charlength = parse_utf8_char((s_in),(end),&c); \
    } \
    *(Word) = length; \
  } while(0)
163
164	/*
165	#define PARSE_NON_WORD(Word, s_in, end) \
166	do { \
167	register u_char *wptr = (Word)+1; \
168	register int length = 0; \
169	register int c = *(s_in); \
170	\
171	while( length < MAXWORDLEN && !INAWORD(c) && (s_in)<=(end) ) \
172	{ \
173	*wptr++ = c; \
174	length++; \
175	c = *++(s_in); \
176	} \
177	*(Word) = length; \
178	}while(0)
179	*/
180
181
/* =========================================================================
 * Macro: PARSE_STEM_WORD
 * Description:
 *      Extracts out Word (a word for indexing on).
 *      Same scanning rules as PARSE_WORD, but the byte limit is MAXSTEMLEN:
 *      the word ends at the first character that would push the byte
 *      length past MAXSTEMLEN, cannot be decoded (parse_utf8_char()
 *      returns <= 0), is neither a Unicode letter nor digit, or is a digit
 *      once `maxnumeric' digits have been taken.  `maxnumeric' is looked up
 *      through IntEnv(GetEnv("maxnumeric"), 4) on every expansion.
 * Input:
 *      s_in points to 1st letter in buffer to test (modifiable u_char
 *           pointer lvalue)
 *      end  points to last letter in buffer
 * Output:
 *      s_in is modified to move to next word
 *      Returns Word filled in with length in 1st byte; the buffer must
 *      hold at least MAXSTEMLEN+1 bytes.
 * ========================================================================= */
#define PARSE_STEM_WORD(Word, s_in, end) \
  do { \
    register int charlength = 0; \
    register u_char *wptr = (Word)+1; \
    register int length = 0; \
    register int numeric = 0; \
    unsigned short c; \
    register int maxnumeric = IntEnv (GetEnv ("maxnumeric"), 4); \
    \
    charlength = parse_utf8_char((s_in),(end),&c); \
    \
    while (length+charlength <= MAXSTEMLEN && charlength > 0 && \
           (is_unicode_letter(c) || (is_unicode_digit(c) && \
            ++numeric <= maxnumeric))) { \
      /* copy every byte of the character just accepted */ \
      while (charlength-- > 0) { \
        *wptr++ = *(s_in)++; length++; \
      } \
      charlength = parse_utf8_char((s_in),(end),&c); \
    } \
    *(Word) = length; \
  } while(0)
214	/*
215	#define PARSE_STEM_WORD(Word, s_in, end) \
216	do \
217	{ \
218	register u_char *wptr = (Word)+1; \
219	register int length = 0; \
220	register int c = *(s_in); \
221	register int numeric = 0; \
222	\
223	while ( length < MAXSTEMLEN && INAWORD(c) && (s_in)<=(end)) \
224	{ \
225	if ((numeric += INNUMBER(c)) > MAXNUMERIC) \
226	break; \
227	*wptr++ = c; \
228	length++; \
229	c = *++(s_in); \
230	} \
231	*(Word) = length; \
232	}while(0)
233	*/
234
235
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD
 * Description:
 *      Eat up non-word.  Do not store non-word.
 *      It is not needed in index only in text !
 *      s_in is advanced character by character until parse_utf8_char()
 *      returns <= 0 or the decoded character satisfies is_unicode_letdig().
 *
 * Input:
 *      s_in = current position in buffer (modifiable pointer lvalue)
 *      end  = string end in buffer
 * Output:
 *      s_in = ptr to next character in buffer yet to be processed
 * ========================================================================= */
#define PARSE_NON_STEM_WORD(s_in, end) \
  do { \
    register int charlength = 0; \
    unsigned short c; \
    \
    charlength = parse_utf8_char((s_in),(end),&c); \
    \
    while (charlength > 0 && !is_unicode_letdig(c)) { \
      (s_in) += charlength; \
      charlength = parse_utf8_char((s_in),(end),&c); \
    } \
  } while(0)
257
258	/*
259	#define PARSE_NON_STEM_WORD(s_in, end) \
260	do \
261	{ \
262	while (!INAWORD(*(s_in)) && (s_in)<=(end)) \
263	(s_in)++; \
264	}while(0)
265	*/
266
267
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD_OR_SGML_TAG
 * Description:
 *      Like PARSE_NON_STEM_WORD but also eats up SGML tags.
 *      When a '<' is met, everything up to the next '>' is skipped --
 *      letters and digits included -- and the '>' itself is then consumed
 *      by the ordinary per-character advance.  If the buffer runs out
 *      before a '>' is found, scanning simply stops there.
 * Input:
 *      s_in = current position in buffer (modifiable pointer lvalue)
 *      end  = string end in buffer
 * Output:
 *      s_in = ptr to next character in buffer yet to be processed
 * ========================================================================= */
#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end) \
  do { \
    register int charlength = 0; \
    unsigned short c; \
    \
    charlength = parse_utf8_char((s_in),(end),&c); \
    \
    while (charlength > 0 && !is_unicode_letdig(c)) { \
      if (c == '<') { \
        /* skip the tag body; leaves c == '>' or charlength <= 0 */ \
        while (charlength > 0 && c != '>') { \
          (s_in) += charlength; \
          charlength = parse_utf8_char((s_in),(end),&c); \
        } \
      } \
      (s_in) += charlength; \
      charlength = parse_utf8_char((s_in),(end),&c); \
    } \
  } while(0)
293
294	/*
295	#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end) \
296	do \
297	{ \
298	register int c = *(s_in); \
299	\
300	while (!INAWORD(c) && (s_in)<=(end)) \
301	{ \
302	if (c == '<') \
303	{ \
304	while (c != '>' && (s_in)<=(end)) \
305	c = *++(s_in); \
306	} \
307	if ((s_in)<=(end)) \
308	c = *++(s_in); \
309	} \
310	}while(0)
311	*/
312
313
/* =========================================================================
 * Macro: PARSE_OPT_TERM_PARAM [RPAP - Jan 97: Stem Index Change]
 * Description:
 *      Extracts out optional parameter for query term.
 *      Needed only in parsing the query line !
 *
 *      If the byte at s_in is WEIGHTPARAM or STEMPARAM it is stored in
 *      `type', and the run of decimal digits that follows is copied into
 *      Param as a NUL-terminated string.  At most MAXPARAMLEN digits are
 *      kept; any further digits are skipped.  If the byte at s_in is
 *      neither character, nothing is modified.
 *
 *      No byte beyond `end' is ever read: every dereference of s_in is
 *      preceded by the (s_in) <= (end) test.
 *
 * Note: that this function has not been converted to use UTF-8
 *       as it should still work as it is (only uses ascii
 *       characters)
 *
 * Input:
 *      Param = buffer of at least MAXPARAMLEN+1 bytes
 *      type  = lvalue receiving the parameter character
 *      s_in  = current position in buffer (modifiable pointer lvalue)
 *      end   = last byte in buffer
 * Output:
 *      Param, type = filled in only when a parameter was present
 *      s_in  = ptr to next character in buffer yet to be processed
 * ========================================================================= */
#define PARSE_OPT_TERM_PARAM(Param, type, s_in, end) \
  do \
    { \
      register u_char *wptr = (Param); \
      register int length = 0; \
      \
      if ((s_in) <= (end) && \
          (*(s_in) == WEIGHTPARAM || *(s_in) == STEMPARAM)) \
        { \
          (type) = *(s_in); \
          ++(s_in); \
          while (length < MAXPARAMLEN && (s_in) <= (end) && \
                 isdigit(*(s_in))) \
            { \
              *wptr++ = *(s_in); \
              ++(s_in); \
              length++; \
            } \
          *wptr = '\0'; \
          /* discard digits that did not fit into Param */ \
          while ((s_in) <= (end) && isdigit(*(s_in))) \
            ++(s_in); \
        } \
    } while(0)
349
/* =========================================================================
 * Macro: PARSE_RANKED_NON_STEM_WORD [RJM 07/97: Ranked Required Terms]
 * Description:
 *      Eat up non-word.  Do not store non-word.
 *      If come across a match requirement store it in require_match.
 *      It is not needed in index only in text !
 *
 *      require_match is reset to 0 first and set to 1 if MUSTMATCHPARAM
 *      occurs anywhere in the skipped non-word.  This macro never
 *      assigns -1.
 *
 * Input:
 *      s_in = current position in buffer (modifiable pointer lvalue)
 *      end  = string end in buffer
 * Output: the requirement mode for the next term.  -1=must not match,
 *         0=optional match, 1=must match
 *      s_in = ptr to next character in buffer yet to be processed
 * ========================================================================= */
#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end) \
  do { \
    register int charlength = 0; \
    unsigned short c; \
    (require_match) = 0; \
    \
    charlength = parse_utf8_char((s_in),(end),&c); \
    \
    while (charlength > 0 && !is_unicode_letdig(c)) { \
      if (c == MUSTMATCHPARAM) (require_match) = 1; \
      (s_in) += charlength; \
      charlength = parse_utf8_char((s_in),(end),&c); \
    } \
  } while(0)
375
376	/*
377	#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end) \
378	do { \
379	(require_match) = 0; \
380	while (!INAWORD(*(s_in)) && (s_in)<=(end)) { \
381	if (*(s_in) == MUSTMATCHPARAM) { \
382	(require_match) = 1; \
383	} \
384	(s_in)++; \
385	} \
386	} while (0)
387	*/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: