Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/packages/mg-1.3d/src/text/words.h@ 34

Last change on this file since 34 was 13, checked in by rjmcnab, 26 years ago
* empty log message *
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 11.3 KB

Line
1	/**************************************************************************
2	*
3	* words.h -- Macros for parsing out words from the source text
4	* Copyright (C) 1994 Neil Sharman
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	* $Id: words.h 13 1998-11-17 09:36:00Z rjmcnab $
21	*
22	**************************************************************************/
23
24	#include "sysfuncs.h"
25
26	/*
27	* This has been cleaned up by Tim Shimmin.
28	*/
29
30	/*
31	* ---NOTE---
32	*
33	* "WORD" refers to a word in the compressed text.
34	* "STEM" or "STEM_WORD" refers to a word for indexing on
35	*
36	*/
37
#define MAXWORDLEN	15
/* Maximum length in bytes of any word or non-word.  Note that
   variations to MAXWORDLEN may have dramatic effects on the rest
   of the program, as the length and the prefix match are packed
   together into a four bit nibble, i.e., leave MAXWORDLEN alone...
   The guard below rejects a value that cannot fit in four bits. */
#if MAXWORDLEN > 15
#error "MAXWORDLEN must fit in a four-bit nibble (at most 15)"
#endif

#define MAXSTEMLEN	255
/* Maximum length in bytes of any stem.  Note that variations to
   MAXSTEMLEN may have dramatic effects on the rest of the program,
   i.e., leave MAXSTEMLEN alone...  PARSE_STEM_WORD stores the stem
   length in a single leading byte, so the value may not exceed 255. */
#if MAXSTEMLEN > 255
#error "MAXSTEMLEN must fit in a single length byte (at most 255)"
#endif

#define MAXNUMERIC	4
/* Maximum number of numeric characters permitted in a word.
   This avoids long sequences of numbers creating just one
   word occurrence for each number.  At most 10,000 all numeric
   words will be permitted. */

/* [RPAP - Jan 97: Stem Index Change] */
#define MAXPARAMLEN	20
/* Maximum number of bytes to read for a parameter value for a
   term in a query.  The buffer handed to PARSE_OPT_TERM_PARAM needs
   one extra byte for the terminating '\0'. */
#define WEIGHTPARAM	'/'	/* introduces a weight parameter  */
#define STEMPARAM	'#'	/* introduces a stem parameter    */

/* [RJM 07/97: Ranked Required Terms] */
#define MUSTMATCHPARAM	'+'	/* marks the next term as required */
63
/* [RJM 03/98: Extended ascii] */
/* Note that this extension was based on some code by */
/* Nelson H.F. Beebe */

/*
 * words_isextletter -- helper behind the isextletter() macro.
 *
 * Returns 1 when the low eight bits of `ch' fall in one of the letter
 * ranges below, 0 otherwise.  The value is reduced to an unsigned char
 * first, so a plain (possibly negative) char is classified by its byte
 * value.  Being a function, it evaluates its argument exactly once and
 * needs no shared scratch variable, so nested or concurrent use cannot
 * clobber an intermediate result.
 */
static int
words_isextletter (int ch)
{
  unsigned char uc = (unsigned char) ch;

  return ((uc >= 65 && uc <= 90) ||	/* 'A'..'Z' in ASCII          */
	  (uc >= 97 && uc <= 122) ||	/* 'a'..'z' in ASCII          */
	  (uc >= 192 && uc <= 214) ||	/* 192..214 (215 is excluded) */
	  (uc >= 216 && uc <= 246) ||	/* 216..246 (247 is excluded) */
	  (uc >= 248));			/* 248..255                   */
}

#define isextletter(c) words_isextletter((int) (c))

#define INAWORD(c) (isalnum((unsigned char) (c)) || isextletter(c))
/* The definition of what characters are permitted in a word.
   NOTE: `c' is evaluated twice, so it must not have side effects.
   The cast keeps the <ctype.h> argument inside the unsigned char
   range even when `c' is a negative plain char. */

#define INNUMBER(c) (isdigit((unsigned char) (c)) ? 1 : 0)
/* 1 when `c' is a decimal digit, 0 otherwise. */
80
/* =========================================================================
 * Macro: PARSE_WORD
 * Description:
 *      Extract a word out for compressing text.
 *      Copies bytes while they satisfy INAWORD, stopping at the first
 *      non-word byte, after MAXWORDLEN bytes, when one more digit would
 *      push the digit count above MAXNUMERIC, or once s_in passes end.
 * Input:
 *      s_in = string start in buffer (modifiable pointer lvalue)
 *      end  = pointer to the LAST valid byte of the buffer (inclusive)
 * Output:
 *      Word = extracted word with length in 1st byte; the caller must
 *             provide at least MAXWORDLEN + 1 bytes
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      The range test is made before every dereference, so no byte
 *      beyond `end' is ever read (an empty range yields a zero-length
 *      word).  s_in and end are evaluated several times and must be
 *      free of side effects.
 * ========================================================================= */
#define PARSE_WORD(Word, s_in, end)                                        \
  do {                                                                     \
    u_char *pw_dst = (Word) + 1;                                           \
    int pw_len = 0;                                                        \
    int pw_digits = 0;                                                     \
    int pw_ch;                                                             \
                                                                           \
    while (pw_len < MAXWORDLEN && (s_in) <= (end))                         \
      {                                                                    \
        pw_ch = (unsigned char) *(s_in);                                   \
        if (!INAWORD (pw_ch))                                              \
          break;                                                           \
        /* the digit that would exceed MAXNUMERIC is left unconsumed */    \
        if ((pw_digits += INNUMBER (pw_ch)) > MAXNUMERIC)                  \
          break;                                                           \
        *pw_dst++ = (u_char) pw_ch;                                        \
        pw_len++;                                                          \
        (s_in)++;                                                          \
      }                                                                    \
    *(Word) = (u_char) pw_len;                                             \
  } while (0)
109
/* =========================================================================
 * Macro: PARSE_NON_WORD
 * Description:
 *      Extract a non-word out for storing compressed text.
 *      Copies bytes while they do NOT satisfy INAWORD, stopping at the
 *      first word byte, after MAXWORDLEN bytes, or once s_in passes end.
 * Input:
 *      s_in = string start in buffer (modifiable pointer lvalue)
 *      end  = pointer to the LAST valid byte of the buffer (inclusive)
 * Output:
 *      Word = extracted non-word with length in 1st byte; the caller
 *             must provide at least MAXWORDLEN + 1 bytes
 *      s_in = ptr to next character in buffer yet to be processed
 * Notes:
 *      The range test precedes every dereference, so no byte beyond
 *      `end' is read.  s_in and end are evaluated several times and
 *      must be free of side effects.
 * ========================================================================= */
#define PARSE_NON_WORD(Word, s_in, end)                                    \
  do {                                                                     \
    u_char *pnw_dst = (Word) + 1;                                          \
    int pnw_len = 0;                                                       \
    int pnw_ch;                                                            \
                                                                           \
    while (pnw_len < MAXWORDLEN && (s_in) <= (end))                        \
      {                                                                    \
        pnw_ch = (unsigned char) *(s_in);                                  \
        if (INAWORD (pnw_ch))                                              \
          break;                                                           \
        *pnw_dst++ = (u_char) pnw_ch;                                      \
        pnw_len++;                                                         \
        (s_in)++;                                                          \
      }                                                                    \
    *(Word) = (u_char) pnw_len;                                            \
  } while (0)
131
/* =========================================================================
 * Macro: PARSE_STEM_WORD
 * Description:
 *      Extracts out Word (a word for indexing on).
 *      Copies bytes while they satisfy INAWORD, stopping at the first
 *      non-word byte, after MAXSTEMLEN bytes, when one more digit would
 *      push the digit count above MAXNUMERIC, or once s_in passes end.
 * Input:
 *      s_in points to 1st letter in buffer to test (modifiable lvalue)
 *      end  points to last letter in buffer (inclusive)
 * Output:
 *      s_in is modified to move past the extracted word
 *      Word is filled in with the length in its 1st byte; the caller
 *      must provide at least MAXSTEMLEN + 1 bytes
 * Notes:
 *      The range test precedes every dereference, so no byte beyond
 *      `end' is read.  s_in and end are evaluated several times and
 *      must be free of side effects.
 * ========================================================================= */
#define PARSE_STEM_WORD(Word, s_in, end)                                   \
  do {                                                                     \
    u_char *psw_dst = (Word) + 1;                                          \
    int psw_len = 0;                                                       \
    int psw_digits = 0;                                                    \
    int psw_ch;                                                            \
                                                                           \
    while (psw_len < MAXSTEMLEN && (s_in) <= (end))                        \
      {                                                                    \
        psw_ch = (unsigned char) *(s_in);                                  \
        if (!INAWORD (psw_ch))                                             \
          break;                                                           \
        /* the digit that would exceed MAXNUMERIC is left unconsumed */    \
        if ((psw_digits += INNUMBER (psw_ch)) > MAXNUMERIC)                \
          break;                                                           \
        *psw_dst++ = (u_char) psw_ch;                                      \
        psw_len++;                                                         \
        (s_in)++;                                                          \
      }                                                                    \
    *(Word) = (u_char) psw_len;                                            \
  } while (0)
161
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD
 * Description:
 *      Eat up non-word. Do not store non-word.
 *      It is not needed in index only in text !
 *
 * Input:
 *      s_in = current position in buffer (modifiable pointer lvalue)
 *      end  = pointer to the LAST valid byte of the buffer (inclusive)
 * Output:
 *      s_in = first byte satisfying INAWORD, or end + 1 if none is left
 * Notes:
 *      The range test is made before the dereference, so no byte beyond
 *      `end' is read.  s_in and end are evaluated several times and
 *      must be free of side effects.
 * ========================================================================= */
#define PARSE_NON_STEM_WORD(s_in, end)                                     \
  do {                                                                     \
    while ((s_in) <= (end) && !INAWORD ((unsigned char) *(s_in)))          \
      (s_in)++;                                                            \
  } while (0)
177
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD_OR_SGML_TAG
 * Description:
 *      Like PARSE_NON_STEM_WORD but also eats up SGML tags: when a '<'
 *      is met, everything up to and including the next '>' is skipped,
 *      word characters inside the tag included.  An unterminated tag
 *      swallows the rest of the buffer.
 * Input:
 *      s_in = current position in buffer (modifiable pointer lvalue)
 *      end  = pointer to the LAST valid byte of the buffer (inclusive)
 * Output:
 *      s_in = first byte outside any tag satisfying INAWORD, or end + 1
 *             if none is left
 * Notes:
 *      Every dereference is preceded by a range test, so no byte beyond
 *      `end' is read.  s_in and end are evaluated several times and
 *      must be free of side effects.
 * ========================================================================= */
#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end)                         \
  do {                                                                     \
    int ptag_ch;                                                           \
                                                                           \
    while ((s_in) <= (end))                                                \
      {                                                                    \
        ptag_ch = (unsigned char) *(s_in);                                 \
        if (INAWORD (ptag_ch))                                             \
          break;                                                           \
        if (ptag_ch == '<')                                                \
          {                                                                \
            /* run forward to the closing '>' (or off the buffer) */       \
            while ((s_in) <= (end) && *(s_in) != '>')                      \
              (s_in)++;                                                    \
          }                                                                \
        /* step over the non-word byte, or over the '>' just found */      \
        if ((s_in) <= (end))                                               \
          (s_in)++;                                                        \
      }                                                                    \
  } while (0)
201
/* =========================================================================
 * Macro: PARSE_OPT_TERM_PARAM		[RPAP - Jan 97: Stem Index Change]
 * Description:
 *      Extracts out optional parameter for query term.
 *      Needed only in parsing the query line !
 *
 *      If the byte at s_in is WEIGHTPARAM or STEMPARAM, that byte is
 *      stored in `type', and the run of decimal digits following it is
 *      copied into Param as a '\0'-terminated string.  At most
 *      MAXPARAMLEN digits are kept; any further digits are consumed but
 *      discarded.
 * Input:
 *      s_in = current position in buffer (modifiable pointer lvalue)
 *      end  = pointer to the LAST valid byte of the buffer (inclusive)
 * Output:
 *      Param = digit string; the caller must provide at least
 *              MAXPARAMLEN + 1 bytes
 *      type  = the marker character found (modifiable lvalue)
 *      s_in  = ptr to next character in buffer yet to be processed
 *      When no marker is present inside [s_in, end], Param, type and
 *      s_in are all left untouched.
 * Notes:
 *      Every dereference is preceded by a range test, so a marker lying
 *      beyond `end' is never consumed.  s_in and end are evaluated
 *      several times and must be free of side effects.
 * ========================================================================= */
#define PARSE_OPT_TERM_PARAM(Param, type, s_in, end)                       \
  do {                                                                     \
    u_char *popt_dst = (Param);                                            \
    int popt_len = 0;                                                      \
    int popt_ch;                                                           \
                                                                           \
    if ((s_in) > (end))                                                    \
      break;                        /* nothing left to look at */          \
    popt_ch = (unsigned char) *(s_in);                                     \
    if (popt_ch != WEIGHTPARAM && popt_ch != STEMPARAM)                    \
      break;                        /* no parameter on this term */        \
                                                                           \
    (type) = popt_ch;                                                      \
    (s_in)++;                                                              \
    while ((s_in) <= (end))                                                \
      {                                                                    \
        popt_ch = (unsigned char) *(s_in);                                 \
        if (!INNUMBER (popt_ch))                                           \
          break;                                                           \
        /* keep the first MAXPARAMLEN digits, swallow the remainder */     \
        if (popt_len < MAXPARAMLEN)                                        \
          {                                                                \
            *popt_dst++ = (u_char) popt_ch;                                \
            popt_len++;                                                    \
          }                                                                \
        (s_in)++;                                                          \
      }                                                                    \
    *popt_dst = '\0';                                                      \
  } while (0)
234
/* =========================================================================
 * Macro: PARSE_RANKED_NON_STEM_WORD	[RJM 07/97: Ranked Required Terms]
 * Description:
 *      Eat up non-word. Do not store non-word.
 *      If a MUSTMATCHPARAM character is met among the skipped bytes,
 *      record that in require_match.
 *      It is not needed in index only in text !
 *
 * Input:
 *      s_in = current position in buffer (modifiable pointer lvalue)
 *      end  = pointer to the LAST valid byte of the buffer (inclusive)
 * Output:
 *      require_match = the requirement mode for the next term:
 *                      0 = optional match, 1 = must match.
 *                      (-1 = must not match is reserved by the original
 *                      interface description, but this macro never
 *                      produces it.)
 *      s_in = first byte satisfying INAWORD, or end + 1 if none is left
 * Notes:
 *      require_match is always reset to 0 first.  The range test is made
 *      before the dereference, so no byte beyond `end' is read.  s_in
 *      and end are evaluated several times and must be free of side
 *      effects.
 * ========================================================================= */
#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end)               \
  do {                                                                     \
    (require_match) = 0;                                                   \
    while ((s_in) <= (end) && !INAWORD ((unsigned char) *(s_in)))          \
      {                                                                    \
        if (*(s_in) == MUSTMATCHPARAM)                                     \
          (require_match) = 1;                                             \
        (s_in)++;                                                          \
      }                                                                    \
  } while (0)

Note: See TracBrowser for help on using the repository browser.

Download in other formats: