Context Navigation

source: trunk/indexers/mg/src/text/words.h@ 8694

Last change on this file since 8694 was 8694, checked in by kjdon, 19 years ago
added some changes made by Emanuel Dejanu (Simple Words)
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 18.6 KB

Line
1	/**************************************************************************
2	*
3	* words.h -- Macros for parsing out words from the source text
4	* Copyright (C) 1994 Neil Sharman
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	* $Id: words.h 8694 2004-11-29 03:15:13Z kjdon $
21	*
22	**************************************************************************/
23
24
25	#include "sysfuncs.h"
26	#include "unitool.h"
27
28	/*
29	* This has been cleaned up by Tim Shimmin.
30	*/
31
32	/*
33	* ---NOTE---
34	*
35	* "WORD" refers to a word in the compressed text.
36	* "STEM" or "STEM_WORD" refers to a word for indexing on
37	*
38	*/
39
#define MAXWORDLEN 15
/* Maximum length in bytes of any word or non-word.  Note that
   variations to MAXWORDLEN may have dramatic effects on the rest
   of the program, as the length and the prefix match are packed
   together into a four bit nibble, and there is no check that
   this is possible, i.e., leave MAXWORDLEN alone... */

#define MAXSTEMLEN 255
/* Maximum length in bytes of any stem.  Note that
   variations to MAXSTEMLEN may have dramatic effects on the rest
   of the program, i.e., leave MAXSTEMLEN alone... */

/* #define MAXNUMERIC 4 */

/* Maximum number of numeric characters permitted in a word.
   This avoids long sequences of numbers creating just one
   word occurrence for each number.  At most 10,000 all-numeric
   words will be permitted.
   The compile-time constant above is disabled: PARSE_WORD and
   PARSE_STEM_WORD obtain the limit at run time from
   IntEnv (GetEnv ("maxnumeric"), 4) instead. */
58
/* [RPAP - Jan 97: Stem Index Change] */
#define MAXPARAMLEN 20
/* Maximum number of bytes to read for a parameter value for a
   term in a query. */
#define WEIGHTPARAM '/'
#define STEMPARAM '#'
/* Either of these characters introduces the optional numeric
   parameter that PARSE_OPT_TERM_PARAM extracts after a query term. */

/* [RJM 07/97: Ranked Required Terms] */
#define MUSTMATCHPARAM '+'
/* Seeing this character while skipping a non-word makes
   PARSE_RANKED_NON_STEM_WORD set its require_match output to 1. */
68
69
#define PESINAWORD(c) (isalnum(c) || ((c) >= 0x80 && (c) <= 0xff))
/* The definition of what characters are permitted in a word.
   This macro is pessimistic: you cannot tell from a particular
   byte at or above 0x80 whether it is a character or not, so every
   such byte is accepted.  It is needed by various functions relating
   to huffman coding where frequency counts need to be primed; it
   should not be used in parsing the UTF-8 encoded input.
   The argument is evaluated up to three times, so it must not have
   side effects. */
77
78	int inaword (const u_char here, const u_char end);
79	/* Takes the place of the old INAWORD macro. It determines
80	whether a given place in a UTF-8 encoded Unicode string
81	is part of a word. */
82
83	int isaspace (const u_char here, const u_char end);
84	/* It determines whether a given place in a UTF-8 encoded
85	Unicode string is a unicode space. */
86
87	u_char skipspace(u_char here, u_char *end);
88	/* Return a the UTF-8 encoded Unicode string with
89	begining unicode spaces skipped. */
90
/* =========================================================================
 * Macro: PARSE_WORD
 * Description:
 *      Extract a word out for compressing text.
 *      Characters are decoded one at a time with parse_utf8_char() and
 *      their bytes copied into Word while each character is a Unicode
 *      letter, or a Unicode digit that keeps the running digit count
 *      within the "maxnumeric" limit (IntEnv (GetEnv ("maxnumeric"), 4)).
 *      Copying also stops when the next character would take the word
 *      past MAXWORDLEN bytes or when parse_utf8_char() returns a
 *      length that is not positive.
 * Input:
 *      s_in = string start in buffer
 *      end = string end in buffer
 * Output:
 *      Word = extracted word with length in 1st byte
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      Word must have room for MAXWORDLEN + 1 bytes.
 *      s_in must be a modifiable pointer; the arguments are evaluated
 *      more than once and must not mention the macro's own locals
 *      (charlength, wptr, length, numeric, c, maxnumeric).
 * ========================================================================= */
#define PARSE_WORD(Word, s_in, end) \
  do { \
    register int charlength = 0; \
    register u_char *wptr = (Word)+1; \
    register int length = 0; \
    register int numeric = 0; \
    unsigned short c; \
    register int maxnumeric = IntEnv (GetEnv ("maxnumeric"), 4); \
    \
    charlength = parse_utf8_char((s_in),(end),&c); \
    \
    while (length+charlength <= MAXWORDLEN && charlength > 0 && \
	   (is_unicode_letter(c) || (is_unicode_digit(c) && \
				     ++numeric <= maxnumeric))) { \
      /* copy every byte of the character that was just accepted */ \
      while (charlength-- > 0) { \
	*wptr++ = *(s_in)++; length++; \
      } \
      charlength = parse_utf8_char((s_in),(end),&c); \
    } \
    *(Word) = length; \
  }while(0)
123
124	/*
125	#define PARSE_WORD(Word, s_in, end) \
126	do { \
127	register u_char *wptr = (Word)+1; \
128	register int length = 0; \
129	register int c = *(s_in); \
130	register int numeric = 0; \
131	\
132	while( length < MAXWORDLEN && INAWORD(c) && (s_in)<=(end)) \
133	{ \
134	if ((numeric += INNUMBER(c)) > MAXNUMERIC) \
135	break; \
136	*wptr++ = c; \
137	length++; \
138	c = *++(s_in); \
139	} \
140	*(Word) = length; \
141	}while(0)
142	*/
143
144
/* =========================================================================
 * Macro: PARSE_NON_WORD
 * Description:
 *      Extract a non-word out for storing compressed text.
 *      Characters are decoded with parse_utf8_char() and their bytes
 *      copied into Word for as long as is_unicode_letdig() rejects them.
 *      Copying also stops when the next character would take the
 *      non-word past MAXWORDLEN bytes or when parse_utf8_char() returns
 *      a length that is not positive.
 * Input:
 *      s_in = string start in buffer
 *      end = string end in buffer
 * Output:
 *      Word = extracted non-word with length in 1st byte
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      Word must have room for MAXWORDLEN + 1 bytes.
 *      s_in must be a modifiable pointer; the arguments are evaluated
 *      more than once and must not mention the macro's own locals
 *      (charlength, wptr, length, c).
 * ========================================================================= */
#define PARSE_NON_WORD(Word, s_in, end) \
  do { \
    register int charlength = 0; \
    register u_char *wptr = (Word)+1; \
    register int length = 0; \
    unsigned short c; \
    \
    charlength = parse_utf8_char((s_in),(end),&c); \
    \
    while (length+charlength <= MAXWORDLEN && charlength > 0 && \
	   !is_unicode_letdig(c)) { \
      /* copy every byte of the character that was just accepted */ \
      while (charlength-- > 0) { \
	*wptr++ = *(s_in)++; length++; \
      } \
      charlength = parse_utf8_char((s_in),(end),&c); \
    } \
    *(Word) = length; \
  }while(0)
170
171	/*
172	#define PARSE_NON_WORD(Word, s_in, end) \
173	do { \
174	register u_char *wptr = (Word)+1; \
175	register int length = 0; \
176	register int c = *(s_in); \
177	\
178	while( length < MAXWORDLEN && !INAWORD(c) && (s_in)<=(end) ) \
179	{ \
180	*wptr++ = c; \
181	length++; \
182	c = *++(s_in); \
183	} \
184	*(Word) = length; \
185	}while(0)
186	*/
187
188
/* =========================================================================
 * Macro: PARSE_STEM_WORD
 * Description:
 *      Extracts out Word.
 *      Works like PARSE_WORD in this file but limits the result to
 *      MAXSTEMLEN bytes instead of MAXWORDLEN: characters are decoded
 *      with parse_utf8_char() and copied while each is a Unicode letter,
 *      or a Unicode digit that keeps the running digit count within the
 *      "maxnumeric" limit (IntEnv (GetEnv ("maxnumeric"), 4)).
 * Input:
 *      s_in points to 1st letter in buffer to test
 *      end points to last letter in buffer
 * Output:
 *      s_in is modified to move to next word
 *      Returns Word filled in with length in 1st byte.
 * Notes:
 *      Word must have room for MAXSTEMLEN + 1 bytes.
 *      s_in must be a modifiable pointer; the arguments are evaluated
 *      more than once and must not mention the macro's own locals
 *      (charlength, wptr, length, numeric, c, maxnumeric).
 * ========================================================================= */
#define PARSE_STEM_WORD(Word, s_in, end) \
  do { \
    register int charlength = 0; \
    register u_char *wptr = (Word)+1; \
    register int length = 0; \
    register int numeric = 0; \
    unsigned short c; \
    register int maxnumeric = IntEnv (GetEnv ("maxnumeric"), 4); \
    \
    charlength = parse_utf8_char((s_in),(end),&c); \
    \
    while (length+charlength <= MAXSTEMLEN && charlength > 0 && \
	   (is_unicode_letter(c) || (is_unicode_digit(c) && \
				     ++numeric <= maxnumeric))) { \
      /* copy every byte of the character that was just accepted */ \
      while (charlength-- > 0) { \
	*wptr++ = *(s_in)++; length++; \
      } \
      charlength = parse_utf8_char((s_in),(end),&c); \
    } \
    *(Word) = length; \
  }while(0)
221	/*
222	#define PARSE_STEM_WORD(Word, s_in, end) \
223	do \
224	{ \
225	register u_char *wptr = (Word)+1; \
226	register int length = 0; \
227	register int c = *(s_in); \
228	register int numeric = 0; \
229	\
230	while ( length < MAXSTEMLEN && INAWORD(c) && (s_in)<=(end)) \
231	{ \
232	if ((numeric += INNUMBER(c)) > MAXNUMERIC) \
233	break; \
234	*wptr++ = c; \
235	length++; \
236	c = *++(s_in); \
237	} \
238	*(Word) = length; \
239	}while(0)
240	*/
241
242
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD
 * Description:
 *      Eat up non-word. Do not store non-word.
 *      It is not needed in index only in text !
 *      s_in is advanced one decoded character at a time for as long as
 *      parse_utf8_char() returns a positive length and
 *      is_unicode_letdig() rejects the character.
 *
 * Input:
 *      s_in = string start in buffer
 *      end = string end in buffer
 * Output:
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      s_in must be a modifiable pointer; the arguments are evaluated
 *      more than once and must not mention the macro's own locals
 *      (charlength, c).
 * ========================================================================= */
#define PARSE_NON_STEM_WORD(s_in, end) \
  do { \
    register int charlength = 0; \
    unsigned short c; \
    \
    charlength = parse_utf8_char((s_in),(end),&c); \
    \
    while (charlength > 0 && !is_unicode_letdig(c)) { \
      (s_in) += charlength; \
      charlength = parse_utf8_char((s_in),(end),&c); \
    } \
  }while(0)
264
265	/*
266	#define PARSE_NON_STEM_WORD(s_in, end) \
267	do \
268	{ \
269	while (!INAWORD(*(s_in)) && (s_in)<=(end)) \
270	(s_in)++; \
271	}while(0)
272	*/
273
274
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD_OR_SGML_TAG
 * Description:
 *      Like PARSE_NON_STEM_WORD but also eats up SGML tags.
 *      When a '<' is met, everything up to and including the next '>'
 *      is skipped, letters and digits included.  If no '>' follows,
 *      the rest of the input is consumed.
 * Input:
 *      s_in = string start in buffer
 *      end = string end in buffer
 * Output:
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      s_in must be a modifiable pointer; the arguments are evaluated
 *      more than once and must not mention the macro's own locals
 *      (charlength, c).
 * ========================================================================= */
#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end) \
  do { \
    register int charlength = 0; \
    unsigned short c; \
    \
    charlength = parse_utf8_char((s_in),(end),&c); \
    \
    while (charlength > 0 && !is_unicode_letdig(c)) { \
      if (c == '<') { \
	/* run forward to the closing '>' (or to the end of input) */ \
	while (charlength > 0 && c != '>') { \
	  (s_in) += charlength; \
	  charlength = parse_utf8_char((s_in),(end),&c); \
	} \
      } \
      /* step over the current non-word character, or the '>' */ \
      (s_in) += charlength; \
      charlength = parse_utf8_char((s_in),(end),&c); \
    } \
  }while(0)
300
301	/*
302	#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end) \
303	do \
304	{ \
305	register int c = *(s_in); \
306	\
307	while (!INAWORD(c) && (s_in)<=(end)) \
308	{ \
309	if (c == '<') \
310	{ \
311	while (c != '>' && (s_in)<=(end)) \
312	c = *++(s_in); \
313	} \
314	if ((s_in)<=(end)) \
315	c = *++(s_in); \
316	} \
317	}while(0)
318	*/
319
320
/* =========================================================================
 * Macro: PARSE_OPT_TERM_PARAM   [RPAP - Jan 97: Stem Index Change]
 * Description:
 *      Extracts out optional parameter for query term.
 *      Needed only in parsing the query line !
 *
 *      If the byte at s_in is WEIGHTPARAM or STEMPARAM, that byte is
 *      stored in type and the decimal digits that follow are copied
 *      into Param as a NUL-terminated string.  At most MAXPARAMLEN
 *      digits are stored; any further digits are skipped.  When there
 *      is no parameter character, Param, type and s_in are untouched.
 *
 * Note: that this function has not been converted to use UTF-8
 *       as it should still work as it is (only uses ascii
 *       characters)
 *
 * Input:
 *      s_in = current position in buffer
 *      end = last valid byte in buffer
 * Output:
 *      Param = digits of the parameter value, NUL terminated
 *      type = the parameter character that was found
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      Param must have room for MAXPARAMLEN + 1 bytes.
 *      Every read through s_in is preceded by a check against end, so
 *      no byte beyond end is ever examined.
 *      s_in must be a modifiable pointer; the arguments are evaluated
 *      more than once and must not mention the macro's own locals
 *      (wptr, length, c).
 * ========================================================================= */
#define PARSE_OPT_TERM_PARAM(Param, type, s_in, end) \
  do \
    { \
      register u_char *wptr = (Param); \
      register int length = 0; \
      /* only look at the current byte if it lies inside the buffer */ \
      register int c = ((s_in) <= (end)) ? *(s_in) : '\0'; \
      \
      if (c == WEIGHTPARAM || c == STEMPARAM) \
	{ \
	  (type) = c; \
	  ++(s_in); \
	  while (length < MAXPARAMLEN && (s_in) <= (end) && isdigit (*(s_in))) \
	    { \
	      *wptr++ = *(s_in)++; \
	      length++; \
	    } \
	  *wptr = '\0'; \
	  /* discard digits that did not fit into Param */ \
	  while ((s_in) <= (end) && isdigit (*(s_in))) \
	    ++(s_in); \
	} \
    }while(0)
356
/* =========================================================================
 * Macro: PARSE_RANKED_NON_STEM_WORD   [RJM 07/97: Ranked Required Terms]
 * Description:
 *      Eat up non-word. Do not store non-word.
 *      If come across a match requirement store it in require_match
 *      It is not needed in index only in text !
 *
 * Input:
 *      s_in = string start in buffer
 *      end = string end in buffer
 * Output:
 *      require_match = the requirement mode for the next term.
 *              -1=must not match, 0=optional match, 1=must match
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      require_match is reset to 0 on entry and set to 1 when a
 *      MUSTMATCHPARAM character is skipped; this macro itself never
 *      assigns -1.
 *      s_in must be a modifiable pointer; the arguments are evaluated
 *      more than once and must not mention the macro's own locals
 *      (charlength, c).
 * ========================================================================= */
#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end) \
  do { \
    register int charlength = 0; \
    unsigned short c; \
    (require_match) = 0; \
    \
    charlength = parse_utf8_char((s_in),(end),&c); \
    \
    while (charlength > 0 && !is_unicode_letdig(c)) { \
      if (c == MUSTMATCHPARAM) (require_match) = 1; \
      (s_in) += charlength; \
      charlength = parse_utf8_char((s_in),(end),&c); \
    } \
  }while(0)
382
383	/*
384	#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end) \
385	do { \
386	(require_match) = 0; \
387	while (!INAWORD(*(s_in)) && (s_in)<=(end)) { \
388	if (*(s_in) == MUSTMATCHPARAM) { \
389	(require_match) = 1; \
390	} \
391	(s_in)++; \
392	} \
393	} while (0)
394	*/

Note: See TracBrowser for help on using the repository browser.

Download in other formats: