Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/packages/mg/src/text/words.h@ 1014

Last change on this file since 1014 was 439, checked in by sjboddie, 25 years ago
renamed mg-1.3d directory mg
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 18.2 KB

Line
1	/**************************************************************************
2	*
3	* words.h -- Macros for parsing out words from the source text
4	* Copyright (C) 1994 Neil Sharman
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	* $Id: words.h 439 1999-08-10 21:23:37Z sjboddie $
21	*
22	**************************************************************************/
23
24	#include "sysfuncs.h"
25
26	#include "unitool.h"
27
28
29	/*
30	* This has been cleaned up by Tim Shimmin.
31	*/
32
33	/*
34	* ---NOTE---
35	*
36	* "WORD" refers to a word in the compressed text.
37	* "STEM" or "STEM_WORD" refers to a word for indexing on
38	*
39	*/
40
#define MAXWORDLEN 15
/* Maximum length in bytes of any word or non-word. Note that
   variations to MAXWORDLEN may have dramatic effects on the rest
   of the program, as the length and the prefix match are packed
   together into a four bit nibble, and there is no check that
   this is possible, i.e., leave MAXWORDLEN alone... */

#define MAXSTEMLEN 255
/* Maximum length in bytes of any stem. Note that variations to
   MAXSTEMLEN may have dramatic effects on the rest of the program,
   i.e., leave MAXSTEMLEN alone... PARSE_STEM_WORD stores the stem
   length in a single u_char, so 255 is the largest value that fits. */

#define MAXNUMERIC 4
/* Maximum number of numeric characters permitted in a word.
   This avoids long sequences of numbers creating just one
   word occurrence for each number. At most 10,000 all numeric
   words will be permitted. */

/* [RPAP - Jan 97: Stem Index Change] */
#define MAXPARAMLEN 20
/* Maximum number of bytes to read for a parameter value for a
   term in a query. PARSE_OPT_TERM_PARAM appends a terminating '\0',
   so parameter buffers need MAXPARAMLEN + 1 bytes. */
#define WEIGHTPARAM '/'
#define STEMPARAM '#'
/* Marker characters recognised by PARSE_OPT_TERM_PARAM: either one
   introduces an optional run of digits after a query term, and the
   marker itself is reported back as the parameter type.
   NOTE(review): by their names these select a term weight and a
   stemming method respectively - confirm against the query parser. */

/* [RJM 07/97: Ranked Required Terms] */
#define MUSTMATCHPARAM '+'
/* Marker character that makes PARSE_RANKED_NON_STEM_WORD report
   require_match = 1 when it appears in the non-word text before a term. */
68
69
#define PESINAWORD(c) (isalnum(c) || ((c) >= 0x80 && (c) <= 0xff))
/* The definition of what characters are permitted in a word.
   This macro is pessimistic: it accepts whatever isalnum() accepts
   plus every byte in the range 0x80..0xff, because you cannot tell
   from a single byte at or above 0x80 whether it belongs to a word
   character or not. It is needed by various functions relating to
   huffman coding where frequency counts need to be primed; it should
   not be used in parsing the UTF-8 encoded input.
   The argument is evaluated more than once, so it must not have side
   effects, and it must be in the range isalnum() accepts (0..255 or EOF). */
77
78	int inaword (const u_char here, const u_char end);
79	/* Takes the place of the old INAWORD macro. It determines
80	whether a given place in a UTF-8 encoded Unicode string
81	is part of a word. */
82
83
/* =========================================================================
 * Macro: PARSE_WORD
 * Description:
 *      Extract one word from the source text for compressing text.
 *      Characters are taken while they are Unicode letters or digits,
 *      with two limits: the word never grows beyond MAXWORDLEN bytes
 *      (a multi-byte character is only taken if all of its bytes fit),
 *      and it never contains more than MAXNUMERIC digits.
 * Input:
 *      s_in = string start in buffer (next byte to be processed)
 *      end  = string end in buffer; passed unchanged to parse_utf8_char()
 * Output:
 *      Word = extracted word with its length in bytes in Word[0] and the
 *             bytes themselves from Word[1] on (no terminator is written).
 *             Word must have room for MAXWORDLEN + 1 bytes.
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      s_in is assigned to and, like end, evaluated several times, so it
 *      must be a modifiable pointer variable without side effects.
 *      NOTE(review): relies on parse_utf8_char() returning the byte length
 *      of the character at s_in, and a value <= 0 when nothing more can be
 *      parsed - confirm against unitool.h.
 * ========================================================================= */
#define PARSE_WORD(Word, s_in, end) \
  do { \
    register int charlength = 0; \
    register u_char *wptr = (Word)+1; \
    register int length = 0; \
    register int numeric = 0; \
    unsigned short c = 0; \
    \
    charlength = parse_utf8_char((s_in),(end),&c); \
    \
    /* stop at the byte limit, at end of input, at the first */ \
    /* non-word character, or at the (MAXNUMERIC+1)th digit */ \
    while (length+charlength <= MAXWORDLEN && charlength > 0 && \
           (is_unicode_letter(c) || (is_unicode_digit(c) && \
                                     ++numeric <= MAXNUMERIC))) { \
      /* copy every byte of the current character */ \
      while (charlength-- > 0) { \
        *wptr++ = *(s_in)++; length++; \
      } \
      charlength = parse_utf8_char((s_in),(end),&c); \
    } \
    *(Word) = length; \
  } while(0)
115
116	/*
117	#define PARSE_WORD(Word, s_in, end) \
118	do { \
119	register u_char *wptr = (Word)+1; \
120	register int length = 0; \
121	register int c = *(s_in); \
122	register int numeric = 0; \
123	\
124	while( length < MAXWORDLEN && INAWORD(c) && (s_in)<=(end)) \
125	{ \
126	if ((numeric += INNUMBER(c)) > MAXNUMERIC) \
127	break; \
128	*wptr++ = c; \
129	length++; \
130	c = *++(s_in); \
131	} \
132	*(Word) = length; \
133	}while(0)
134	*/
135
136
/* =========================================================================
 * Macro: PARSE_NON_WORD
 * Description:
 *      Extract a non-word out for storing compressed text: the run of
 *      characters for which is_unicode_letdig() is false, limited to
 *      MAXWORDLEN bytes (a multi-byte character is only taken if all of
 *      its bytes fit).
 * Input:
 *      s_in = string start in buffer (next byte to be processed)
 *      end  = string end in buffer; passed unchanged to parse_utf8_char()
 * Output:
 *      Word = extracted non-word with its length in bytes in Word[0] and
 *             the bytes themselves from Word[1] on (no terminator is
 *             written). Word must have room for MAXWORDLEN + 1 bytes.
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      s_in is assigned to and, like end, evaluated several times, so it
 *      must be a modifiable pointer variable without side effects.
 *      NOTE(review): relies on parse_utf8_char() returning the byte length
 *      of the character at s_in, and a value <= 0 when nothing more can be
 *      parsed - confirm against unitool.h.
 * ========================================================================= */
#define PARSE_NON_WORD(Word, s_in, end) \
  do { \
    register int charlength = 0; \
    register u_char *wptr = (Word)+1; \
    register int length = 0; \
    unsigned short c = 0; \
    \
    charlength = parse_utf8_char((s_in),(end),&c); \
    \
    while (length+charlength <= MAXWORDLEN && charlength > 0 && \
           !is_unicode_letdig(c)) { \
      /* copy every byte of the current character */ \
      while (charlength-- > 0) { \
        *wptr++ = *(s_in)++; length++; \
      } \
      charlength = parse_utf8_char((s_in),(end),&c); \
    } \
    *(Word) = length; \
  } while(0)
162
163	/*
164	#define PARSE_NON_WORD(Word, s_in, end) \
165	do { \
166	register u_char *wptr = (Word)+1; \
167	register int length = 0; \
168	register int c = *(s_in); \
169	\
170	while( length < MAXWORDLEN && !INAWORD(c) && (s_in)<=(end) ) \
171	{ \
172	*wptr++ = c; \
173	length++; \
174	c = *++(s_in); \
175	} \
176	*(Word) = length; \
177	}while(0)
178	*/
179
180
/* =========================================================================
 * Macro: PARSE_STEM_WORD
 * Description:
 *      Extracts out Word, a word to be indexed on.
 *      Characters are taken while they are Unicode letters or digits,
 *      with two limits: the word never grows beyond MAXSTEMLEN bytes
 *      (a multi-byte character is only taken if all of its bytes fit),
 *      and it never contains more than MAXNUMERIC digits.
 * Input:
 *      s_in points to 1st letter in buffer to test
 *      end points to last letter in buffer; passed unchanged to
 *      parse_utf8_char()
 * Output:
 *      s_in is modified to move past the extracted word
 *      Word is filled in with its length in bytes in Word[0] and the bytes
 *      themselves from Word[1] on (no terminator is written). Word must
 *      have room for MAXSTEMLEN + 1 bytes.
 * Notes:
 *      s_in is assigned to and, like end, evaluated several times, so it
 *      must be a modifiable pointer variable without side effects.
 *      NOTE(review): relies on parse_utf8_char() returning the byte length
 *      of the character at s_in, and a value <= 0 when nothing more can be
 *      parsed - confirm against unitool.h.
 * ========================================================================= */
#define PARSE_STEM_WORD(Word, s_in, end) \
  do { \
    register int charlength = 0; \
    register u_char *wptr = (Word)+1; \
    register int length = 0; \
    register int numeric = 0; \
    unsigned short c = 0; \
    \
    charlength = parse_utf8_char((s_in),(end),&c); \
    \
    /* stop at the byte limit, at end of input, at the first */ \
    /* non-word character, or at the (MAXNUMERIC+1)th digit */ \
    while (length+charlength <= MAXSTEMLEN && charlength > 0 && \
           (is_unicode_letter(c) || (is_unicode_digit(c) && \
                                     ++numeric <= MAXNUMERIC))) { \
      /* copy every byte of the current character */ \
      while (charlength-- > 0) { \
        *wptr++ = *(s_in)++; length++; \
      } \
      charlength = parse_utf8_char((s_in),(end),&c); \
    } \
    *(Word) = length; \
  } while(0)
212
213	/*
214	#define PARSE_STEM_WORD(Word, s_in, end) \
215	do \
216	{ \
217	register u_char *wptr = (Word)+1; \
218	register int length = 0; \
219	register int c = *(s_in); \
220	register int numeric = 0; \
221	\
222	while ( length < MAXSTEMLEN && INAWORD(c) && (s_in)<=(end)) \
223	{ \
224	if ((numeric += INNUMBER(c)) > MAXNUMERIC) \
225	break; \
226	*wptr++ = c; \
227	length++; \
228	c = *++(s_in); \
229	} \
230	*(Word) = length; \
231	}while(0)
232	*/
233
234
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD
 * Description:
 *      Eat up non-word. Do not store non-word.
 *      It is not needed in index only in text !
 *      Skips every character for which is_unicode_letdig() is false.
 *
 * Input:
 *      s_in = next byte in buffer to be processed
 *      end  = string end in buffer; passed unchanged to parse_utf8_char()
 * Output:
 *      s_in = advanced to the first character for which
 *             is_unicode_letdig() is true, or to wherever
 *             parse_utf8_char() stops returning a positive length
 * Notes:
 *      s_in is assigned to and, like end, evaluated several times, so it
 *      must be a modifiable pointer variable without side effects.
 *      NOTE(review): relies on parse_utf8_char() returning the byte length
 *      of the character at s_in, and a value <= 0 when nothing more can be
 *      parsed - confirm against unitool.h.
 * ========================================================================= */
#define PARSE_NON_STEM_WORD(s_in, end) \
  do { \
    register int charlength = 0; \
    unsigned short c = 0; \
    \
    charlength = parse_utf8_char((s_in),(end),&c); \
    \
    while (charlength > 0 && !is_unicode_letdig(c)) { \
      (s_in) += charlength; \
      charlength = parse_utf8_char((s_in),(end),&c); \
    } \
  } while(0)
256
257	/*
258	#define PARSE_NON_STEM_WORD(s_in, end) \
259	do \
260	{ \
261	while (!INAWORD(*(s_in)) && (s_in)<=(end)) \
262	(s_in)++; \
263	}while(0)
264	*/
265
266
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD_OR_SGML_TAG
 * Description:
 *      Like PARSE_NON_STEM_WORD but also eats up SGML tags: when a '<' is
 *      met, everything up to and including the next '>' is skipped, letters
 *      and digits included. A '<' with no matching '>' swallows the rest of
 *      the input.
 * Input:
 *      s_in = next byte in buffer to be processed
 *      end  = string end in buffer; passed unchanged to parse_utf8_char()
 * Output:
 *      s_in = advanced to the first character outside a tag for which
 *             is_unicode_letdig() is true, or to wherever
 *             parse_utf8_char() stops returning a positive length
 * Notes:
 *      s_in is assigned to and, like end, evaluated several times, so it
 *      must be a modifiable pointer variable without side effects.
 *      NOTE(review): relies on parse_utf8_char() returning the byte length
 *      of the character at s_in, and a value <= 0 when nothing more can be
 *      parsed - confirm against unitool.h.
 * ========================================================================= */
#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end) \
  do { \
    register int charlength = 0; \
    unsigned short c = 0; \
    \
    charlength = parse_utf8_char((s_in),(end),&c); \
    \
    while (charlength > 0 && !is_unicode_letdig(c)) { \
      if (c == '<') { \
        /* skip the tag body; leaves c == '>' or charlength <= 0 */ \
        while (charlength > 0 && c != '>') { \
          (s_in) += charlength; \
          charlength = parse_utf8_char((s_in),(end),&c); \
        } \
      } \
      /* step over the current character (the '>' after a tag) */ \
      (s_in) += charlength; \
      charlength = parse_utf8_char((s_in),(end),&c); \
    } \
  } while(0)
292
293	/*
294	#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end) \
295	do \
296	{ \
297	register int c = *(s_in); \
298	\
299	while (!INAWORD(c) && (s_in)<=(end)) \
300	{ \
301	if (c == '<') \
302	{ \
303	while (c != '>' && (s_in)<=(end)) \
304	c = *++(s_in); \
305	} \
306	if ((s_in)<=(end)) \
307	c = *++(s_in); \
308	} \
309	}while(0)
310	*/
311
312
/* =========================================================================
 * Macro: PARSE_OPT_TERM_PARAM [RPAP - Jan 97: Stem Index Change]
 * Description:
 *      Extracts out optional parameter for query term.
 *      Needed only in parsing the query line !
 *      If s_in points at WEIGHTPARAM or STEMPARAM, that marker is stored in
 *      type and the run of decimal digits that follows is copied to Param.
 *      Otherwise nothing is read or written.
 *
 *      Note: that this function has not been converted to use UTF-8
 *            as it should still work as it is (only uses ascii
 *            characters)
 *
 * Input:
 *      s_in = next byte in buffer to be processed
 *      end  = last valid byte in buffer (inclusive)
 * Output:
 *      type  = the marker character found; left untouched if there is none
 *      Param = the digits as a '\0'-terminated string (empty if the marker
 *              is not followed by a digit); left untouched if there is no
 *              marker. At most MAXPARAMLEN digits are stored, so Param must
 *              have room for MAXPARAMLEN + 1 bytes.
 *      s_in  = advanced past the marker and past ALL the digits that follow
 *              it, including any beyond MAXPARAMLEN that were not stored
 * Notes:
 *      No byte beyond end is ever read: the bounds test is made before
 *      each dereference.
 *      s_in is assigned to and, like end, evaluated several times, so it
 *      must be a modifiable pointer variable without side effects; type
 *      must be a modifiable lvalue.
 * ========================================================================= */
#define PARSE_OPT_TERM_PARAM(Param, type, s_in, end) \
  do { \
    register u_char *wptr = (Param); \
    register int length = 0; \
    \
    if ((s_in) <= (end) && \
        (*(s_in) == WEIGHTPARAM || *(s_in) == STEMPARAM)) { \
      (type) = *(s_in)++; \
      /* store up to MAXPARAMLEN digits */ \
      while (length < MAXPARAMLEN && (s_in) <= (end) && \
             isdigit((unsigned char)*(s_in))) { \
        *wptr++ = *(s_in)++; \
        length++; \
      } \
      *wptr = '\0'; \
      /* discard any digits that did not fit */ \
      while ((s_in) <= (end) && isdigit((unsigned char)*(s_in))) \
        (s_in)++; \
    } \
  } while(0)
348
/* =========================================================================
 * Macro: PARSE_RANKED_NON_STEM_WORD [RJM 07/97: Ranked Required Terms]
 * Description:
 *      Eat up non-word. Do not store non-word.
 *      If come across a match requirement store it in require_match
 *      It is not needed in index only in text !
 *
 * Input:
 *      s_in = next byte in buffer to be processed
 *      end  = string end in buffer; passed unchanged to parse_utf8_char()
 * Output:
 *      require_match = the requirement mode for the next term:
 *             0 = optional match, 1 = must match. It is always reset to 0
 *             first and set to 1 if MUSTMATCHPARAM occurs anywhere in the
 *             skipped non-word text. (-1 = must not match is part of the
 *             mode encoding but is never produced by this macro.)
 *      s_in = advanced to the first character for which
 *             is_unicode_letdig() is true, or to wherever
 *             parse_utf8_char() stops returning a positive length
 * Notes:
 *      s_in is assigned to and, like end, evaluated several times, so it
 *      must be a modifiable pointer variable without side effects;
 *      require_match must be a modifiable lvalue.
 *      NOTE(review): relies on parse_utf8_char() returning the byte length
 *      of the character at s_in, and a value <= 0 when nothing more can be
 *      parsed - confirm against unitool.h.
 * ========================================================================= */
#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end) \
  do { \
    register int charlength = 0; \
    unsigned short c = 0; \
    (require_match) = 0; \
    \
    charlength = parse_utf8_char((s_in),(end),&c); \
    \
    while (charlength > 0 && !is_unicode_letdig(c)) { \
      if (c == MUSTMATCHPARAM) (require_match) = 1; \
      (s_in) += charlength; \
      charlength = parse_utf8_char((s_in),(end),&c); \
    } \
  } while(0)
374
375	/*
376	#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end) \
377	do { \
378	(require_match) = 0; \
379	while (!INAWORD(*(s_in)) && (s_in)<=(end)) { \
380	if (*(s_in) == MUSTMATCHPARAM) { \
381	(require_match) = 1; \
382	} \
383	(s_in)++; \
384	} \
385	} while (0)
386	*/
387

Note: See TracBrowser for help on using the repository browser.

Download in other formats: