source: trunk/gsdl/packages/mg-1.3d/src/text/words.h@ 34

Last change on this file since 34 was 13, checked in by rjmcnab, 26 years ago

* empty log message *

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.3 KB
Line 
1/**************************************************************************
2 *
3 * words.h -- Macros for parsing out words from the source text
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: words.h 13 1998-11-17 09:36:00Z rjmcnab $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26/*
27 * This has been cleaned up by Tim Shimmin.
28 */
29
30/*
31 * ---NOTE---
32 *
33 * "WORD" refers to a word in the compressed text.
34 * "STEM" or "STEM_WORD" refers to a word for indexing on
35 *
36 */
37
#define MAXWORDLEN 15
  /* Maximum length in bytes of any word or non-word.  Note that
     variations to MAXWORDLEN may have dramatic effects on the rest
     of the program, as the length and the prefix match are packed
     together into a four bit nibble, i.e., leave MAXWORDLEN alone... */
#if MAXWORDLEN > 15
#error "MAXWORDLEN must not exceed 15: the word length is packed into a four bit nibble"
#endif

#define MAXSTEMLEN 255
  /* Maximum length in bytes of any stem.  Note that variations to
     MAXSTEMLEN may have dramatic effects on the rest of the program,
     i.e., leave MAXSTEMLEN alone...  PARSE_STEM_WORD stores the stem
     length in a single leading byte, hence the check below. */
#if MAXSTEMLEN > 255
#error "MAXSTEMLEN must not exceed 255: the stem length is stored in a single leading byte"
#endif

#define MAXNUMERIC 4
  /* Maximum number of numeric characters permitted in a word.
     This avoids long sequences of numbers creating just one
     word occurrence for each number.  At most 10,000 all numeric
     words will be permitted. */

/* [RPAP - Jan 97: Stem Index Change] */
#define MAXPARAMLEN 20
  /* Maximum number of bytes to read for a parameter value for a
     term in a query. */
#define WEIGHTPARAM '/'
  /* Character that introduces a weight parameter after a query term. */
#define STEMPARAM '#'
  /* Character that introduces a stem parameter after a query term. */

/* [RJM 07/97: Ranked Required Terms] */
#define MUSTMATCHPARAM '+'
  /* Character that marks the following query term as "must match". */
63
64/* [RJM 03/98: Extended ascii] */
65/* Note that this extension was based on some code by */
66/* Nelson H.F. Beebe */
67static u_char c__;
68#define isextletter(c) (c__ = (c), \
69 ((c__ >= 65 && c__ <= 90) || \
70 (c__ >= 97 && c__ <= 122) || \
71 (c__ >= 192 && c__ <= 214) || \
72 (c__ >= 216 && c__ <= 246) || \
73 (c__ >= 248 && c__ <= 255)))
74
/*
 * INAWORD(c) -- the definition of what characters are permitted in a
 * word: anything isalnum() accepts, plus the extended letters accepted
 * by isextletter().
 *
 * The argument is evaluated exactly once: isextletter() stores it (as
 * an unsigned byte) in c__, and isalnum() is then applied to that
 * stored byte rather than to the argument expression again.  This
 * makes INAWORD(*p++) safe and guarantees isalnum() never sees a
 * negative char value.
 */
#define INAWORD(c) (isextletter(c) || isalnum(c__))
78
/* INNUMBER(c) -- exactly 1 when isdigit(c) is true, otherwise exactly 0,
   so the result can be added straight into a digit counter. */
#define INNUMBER(c) (isdigit(c) != 0)
80
/* =========================================================================
 * Macro: PARSE_WORD
 * Description:
 *      Extract a word out for compressing text.
 *      Copies bytes while they satisfy INAWORD, stopping after
 *      MAXWORDLEN bytes, at the end of the buffer, or before the digit
 *      that would take the word past MAXNUMERIC digits.
 * Input:
 *      Word = destination buffer (u_char *); needs MAXWORDLEN+1 bytes
 *      s_in = string start in buffer (modifiable pointer)
 *      end  = string end in buffer (address of the last valid byte)
 * Output:
 *      Word = extracted word with length in 1st byte
 *      s_in = ptr to next character in buffer yet to be processed
 * Note:
 *      The buffer is never read beyond `end': the bounds test is made
 *      before each byte is fetched.
 * ========================================================================= */
#define PARSE_WORD(Word, s_in, end)                                     \
  do {                                                                  \
    register u_char *wptr = (Word) + 1;                                 \
    register int length = 0;                                            \
    register int numeric = 0;                                           \
                                                                        \
    while (length < MAXWORDLEN && (s_in) <= (end))                      \
      {                                                                 \
        register int c = *(s_in);                                       \
        if (!INAWORD(c))                                                \
          break;                                                        \
        if ((numeric += INNUMBER(c)) > MAXNUMERIC)                      \
          break;                                                        \
        *wptr++ = c;                                                    \
        length++;                                                       \
        (s_in)++;                                                       \
      }                                                                 \
    *(Word) = length;                                                   \
  } while (0)
109
/* =========================================================================
 * Macro: PARSE_NON_WORD
 * Description:
 *      Extract a non-word out for storing compressed text.
 *      Copies bytes while they do NOT satisfy INAWORD, stopping after
 *      MAXWORDLEN bytes or at the end of the buffer.
 * Input:
 *      Word = destination buffer (u_char *); needs MAXWORDLEN+1 bytes
 *      s_in = string start in buffer (modifiable pointer)
 *      end  = string end in buffer (address of the last valid byte)
 * Output:
 *      Word = extracted non-word with length in 1st byte
 *      s_in = ptr to next character in buffer yet to be processed
 * Note:
 *      The buffer is never read beyond `end': the bounds test is made
 *      before each byte is fetched.
 * ========================================================================= */
#define PARSE_NON_WORD(Word, s_in, end)                                 \
  do {                                                                  \
    register u_char *wptr = (Word) + 1;                                 \
    register int length = 0;                                            \
                                                                        \
    while (length < MAXWORDLEN && (s_in) <= (end))                      \
      {                                                                 \
        register int c = *(s_in);                                       \
        if (INAWORD(c))                                                 \
          break;                                                        \
        *wptr++ = c;                                                    \
        length++;                                                       \
        (s_in)++;                                                       \
      }                                                                 \
    *(Word) = length;                                                   \
  } while (0)
131
132/* =========================================================================
133 * Macro: PARSE_STEM_WORD
134 * Description:
135 * Extracts out Word.
136 * Input:
137 * s_in points to 1st letter in buffer to test
138 * end points to last letter in buffer
139 * Output:
140 * s_in is modified to move to next word
141 * Returns Word filled in with length in 1st byte.
142 * ========================================================================= */
143#define PARSE_STEM_WORD(Word, s_in, end) \
144 do \
145 { \
146 register u_char *wptr = (Word)+1; \
147 register int length = 0; \
148 register int c = *(s_in); \
149 register int numeric = 0; \
150 \
151 while ( length < MAXSTEMLEN && INAWORD(c) && (s_in)<=(end)) \
152 { \
153 if ((numeric += INNUMBER(c)) > MAXNUMERIC) \
154 break; \
155 *wptr++ = c; \
156 length++; \
157 c = *++(s_in); \
158 } \
159 *(Word) = length; \
160 }while(0)
161
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD
 * Description:
 *      Eat up non-word.  Do not store non-word.
 *      It is not needed in index only in text !
 *
 * Input:
 *      s_in = current position in buffer (modifiable pointer)
 *      end  = address of the last valid byte in buffer
 * Output:
 *      s_in = first position at or after the starting point that holds
 *             an INAWORD character, or end+1 if there is none
 * Note:
 *      The bounds test comes first, so *(s_in) is never evaluated once
 *      s_in has moved past `end'.
 * ========================================================================= */
#define PARSE_NON_STEM_WORD(s_in, end)                                  \
  do {                                                                  \
    while ((s_in) <= (end) && !INAWORD(*(s_in)))                        \
      (s_in)++;                                                         \
  } while (0)
177
/* =========================================================================
 * Macro: PARSE_NON_STEM_WORD_OR_SGML_TAG
 * Description:
 *      Like PARSE_NON_STEM_WORD but also eats up SGML tags: when a '<'
 *      is met, everything up to and including the next '>' is skipped,
 *      even if it contains word characters.  An unterminated tag
 *      swallows the rest of the buffer.
 * Input:
 *      s_in = current position in buffer (modifiable pointer)
 *      end  = address of the last valid byte in buffer
 * Output:
 *      s_in = next INAWORD character outside any tag, or end+1 if the
 *             buffer is exhausted first
 * Note:
 *      Every read of *(s_in) is guarded by a bounds test, so the buffer
 *      is never read beyond `end'.
 * ========================================================================= */
#define PARSE_NON_STEM_WORD_OR_SGML_TAG(s_in, end)                      \
  do {                                                                  \
    while ((s_in) <= (end) && !INAWORD(*(s_in)))                        \
      {                                                                 \
        if (*(s_in) == '<')                                             \
          {                                                             \
            /* run forward to the closing '>' (or off the end) */       \
            while ((s_in) <= (end) && *(s_in) != '>')                   \
              (s_in)++;                                                 \
          }                                                             \
        /* step over the non-word byte, or over the '>' */              \
        if ((s_in) <= (end))                                            \
          (s_in)++;                                                     \
      }                                                                 \
  } while (0)
201
/* =========================================================================
 * Macro: PARSE_OPT_TERM_PARAM      [RPAP - Jan 97: Stem Index Change]
 * Description:
 *      Extracts out optional parameter for query term.
 *      Needed only in parsing the query line !
 *
 *      If s_in points at WEIGHTPARAM or STEMPARAM, that marker is
 *      stored in `type' and the run of digits following it is copied
 *      into Param as a NUL-terminated string.  At most MAXPARAMLEN
 *      digits are kept; any further digits are skipped, not stored.
 *      If s_in does not point at a marker, nothing is changed: Param,
 *      type and s_in are all left as they were.
 * Input:
 *      Param = destination buffer (u_char *); needs MAXPARAMLEN+1 bytes
 *      type  = lvalue that receives the marker character
 *      s_in  = current position in buffer (modifiable pointer)
 *      end   = address of the last valid byte in buffer
 * Output:
 *      Param, type = filled in as described above
 *      s_in  = first byte after the marker and its digits
 * Note:
 *      Every read of *(s_in) is guarded by a bounds test, so the buffer
 *      is never read beyond `end'.
 * ========================================================================= */
#define PARSE_OPT_TERM_PARAM(Param, type, s_in, end)                    \
  do {                                                                  \
    register u_char *wptr = (Param);                                    \
    register int length = 0;                                            \
                                                                        \
    if ((s_in) <= (end) &&                                              \
        (*(s_in) == WEIGHTPARAM || *(s_in) == STEMPARAM))               \
      {                                                                 \
        (type) = *(s_in);                                               \
        (s_in)++;                                                       \
        while (length < MAXPARAMLEN && (s_in) <= (end) &&               \
               INNUMBER(*(s_in)))                                       \
          {                                                             \
            *wptr++ = *(s_in);                                          \
            length++;                                                   \
            (s_in)++;                                                   \
          }                                                             \
        *wptr = '\0';                                                   \
        /* discard digits that did not fit into Param */                \
        while ((s_in) <= (end) && INNUMBER(*(s_in)))                    \
          (s_in)++;                                                     \
      }                                                                 \
  } while (0)
234
/* =========================================================================
 * Macro: PARSE_RANKED_NON_STEM_WORD  [RJM 07/97: Ranked Required Terms]
 * Description:
 *      Eat up non-word.  Do not store non-word.
 *      If come across a match requirement store it in require_match.
 *      It is not needed in index only in text !
 *
 * Input:
 *      require_match = lvalue that receives the requirement mode
 *      s_in = current position in buffer (modifiable pointer)
 *      end  = address of the last valid byte in buffer
 * Output:
 *      require_match = the requirement mode for the next term:
 *                      0 = optional match,
 *                      1 = must match (a MUSTMATCHPARAM was skipped).
 *                      This macro only ever stores 0 or 1; it has no
 *                      code path that produces -1 (must not match).
 *      s_in = next INAWORD character, or end+1 if there is none
 * Note:
 *      The bounds test comes first, so *(s_in) is never evaluated once
 *      s_in has moved past `end'.
 * ========================================================================= */
#define PARSE_RANKED_NON_STEM_WORD(require_match, s_in, end)            \
  do {                                                                  \
    (require_match) = 0;                                                \
    while ((s_in) <= (end) && !INAWORD(*(s_in)))                        \
      {                                                                 \
        if (*(s_in) == MUSTMATCHPARAM)                                  \
          (require_match) = 1;                                          \
        (s_in)++;                                                       \
      }                                                                 \
  } while (0)
Note: See TracBrowser for help on using the repository browser.