1 | /**************************************************************************
|
---|
2 | *
|
---|
3 | * bool_parser - boolean query parser
|
---|
4 | * Copyright (C) 1994 Neil Sharman & Tim Shimmin
|
---|
5 | *
|
---|
6 | * This program is free software; you can redistribute it and/or modify
|
---|
7 | * it under the terms of the GNU General Public License as published by
|
---|
8 | * the Free Software Foundation; either version 2 of the License, or
|
---|
9 | * (at your option) any later version.
|
---|
10 | *
|
---|
11 | * This program is distributed in the hope that it will be useful,
|
---|
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
14 | * GNU General Public License for more details.
|
---|
15 | *
|
---|
16 | * You should have received a copy of the GNU General Public License
|
---|
17 | * along with this program; if not, write to the Free Software
|
---|
18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
19 | *
|
---|
20 | *
|
---|
21 | **************************************************************************/
|
---|
22 |
|
---|
23 | /**************************************************************************/
|
---|
24 | %{
|
---|
25 |
|
---|
26 | #include "sysfuncs.h"
|
---|
27 |
|
---|
28 | #include "messages.h"
|
---|
29 |
|
---|
30 | #include "memlib.h"
|
---|
31 | #include "words.h"
|
---|
32 | #include "stemmer.h"
|
---|
33 | #include "term_lists.h"
|
---|
34 | #include "bool_tree.h"
|
---|
35 | /* [RPAP - Jan 97: Stem Index Change] */
|
---|
36 | #include "backend.h" /* for stemmed_dict def */
|
---|
37 | #include "stem_search.h"
|
---|
38 |
|
---|
39 | #include "query_term_list.h" /* [RPAP - Feb 97: Term Frequency] */
|
---|
40 |
|
---|
41 | /* --- routines --- */
|
---|
42 | static int query_lex();
|
---|
43 | static int yyerror(char *);
|
---|
44 | #define yylex() query_lex(&ch_buf, end_buf)
|
---|
45 |
|
---|
46 | /* --- module variables --- */
|
---|
47 | static char *ch_buf; /* ptr to the character query line buffer */
|
---|
48 | static char *end_buf; /* ptr to the last character of the line buffer */
|
---|
49 | static bool_tree_node *tree_base = NULL;
|
---|
50 | static TermList **term_list;
|
---|
51 | static int stemmer_num;
|
---|
52 | static int stem_method;
|
---|
53 | /* [RPAP - Jan 97: Stem Index Change] */
|
---|
54 | stemmed_dict *p__sd;
|
---|
55 | static int indexed;
|
---|
56 | /* [RPAP - Feb 97: Term Frequency] */
|
---|
57 | static QueryTermList **query_term_list;
|
---|
58 | static int word_num;
|
---|
59 | static mg_u_long count;
|
---|
60 | static mg_u_long doc_count;
|
---|
61 | static mg_u_long invf_ptr;
|
---|
62 | static mg_u_long invf_len;
|
---|
63 | %}
|
---|
64 |
|
---|
65 |
|
---|
66 | %union {
|
---|
67 | char *text;
|
---|
68 | bool_tree_node *node;
|
---|
69 | }
|
---|
70 |
|
---|
71 | %token <text> TERM
|
---|
72 | %type <node> query term not and or
|
---|
73 |
|
---|
74 | %%
|
---|
75 |
|
---|
76 | query: or { tree_base = $1;}
|
---|
77 | ;
|
---|
78 |
|
---|
79 |
|
---|
80 | term: TERM { $$ = CreateBoolTermNode(term_list, $1, 1, word_num, count, doc_count, invf_ptr, invf_len, stemmer_num); }
|
---|
81 | | '(' or ')' { $$ = $2; }
|
---|
82 | | '*' { $$ = CreateBoolTreeNode(N_all, NULL, NULL); }
|
---|
83 | | '_' { $$ = CreateBoolTreeNode(N_none, NULL, NULL); }
|
---|
84 | ;
|
---|
85 |
|
---|
86 | not: term
|
---|
87 | | '!' not { $$ = CreateBoolTreeNode(N_not, $2, NULL); }
|
---|
88 | ;
|
---|
89 |
|
---|
90 | and: and '&' not { $$ = CreateBoolTreeNode(N_and, $1, $3); }
|
---|
91 | | and not { $$ = CreateBoolTreeNode(N_and, $1, $2); }
|
---|
92 | | not
|
---|
93 | ;
|
---|
94 |
|
---|
95 | or: or '|' and { $$ = CreateBoolTreeNode(N_or, $1, $3); }
|
---|
96 | | and
|
---|
97 | ;
|
---|
98 |
|
---|
99 | %%
|
---|
100 |
|
---|
101 | /* Bison on one mips machine defined "const" to be nothing but
|
---|
102 | then did not undef it */
|
---|
103 | #ifdef const
|
---|
104 | #undef const
|
---|
105 | #endif
|
---|
106 |
|
---|
107 | /**************************************************************************/
|
---|
108 |
|
---|
109 |
|
---|
110 | /* =========================================================================
|
---|
111 | * Function: query_lex
|
---|
112 | * Description:
|
---|
113 | * Hand written lexical analyser for the parser.
|
---|
114 | * Input:
|
---|
115 | * ptr = ptr to a ptr into character query-line buffer
|
---|
116 | * end = ptr to last char in buffer
|
---|
117 | * Output:
|
---|
118 | * yylval.text = the token's text
|
---|
119 | * Notes:
|
---|
120 | * does NOT produce WILD tokens at the moment
|
---|
121 | * ========================================================================= */
|
---|
122 |
|
---|
123 | /* [RPAP - Jan 97: Stem Index Change]
|
---|
124 | state mode:
|
---|
125 | 0 = Read next token
|
---|
126 | 1 = Output word
|
---|
127 | 2 = Output '|' or ')'
|
---|
128 | */
|
---|
129 | static int query_lex(char **ptr, const char *end)
|
---|
130 | {
|
---|
131 | char *buf_ptr = *ptr;
|
---|
132 | static int mode = 0;
|
---|
133 | static int termnum = 0;
|
---|
134 | static TermList *Terms = NULL;
|
---|
135 |
|
---|
136 | if (mode == 0)
|
---|
137 | {
|
---|
138 | /* jump over whitespace */
|
---|
139 | buf_ptr = skipspace(buf_ptr, end);
|
---|
140 |
|
---|
141 | if (inaword(buf_ptr, end))
|
---|
142 | {
|
---|
143 | static char word[MAXSTEMLEN + 1]; /* [RJM 07/98: Memory Leak] */
|
---|
144 | char *sWord = Xmalloc(MAXSTEMLEN + 1);
|
---|
145 | int stem_to_apply, method_using = -1;
|
---|
146 |
|
---|
147 | PARSE_STEM_WORD(word, buf_ptr, end);
|
---|
148 |
|
---|
149 | /* Extract any parameters */
|
---|
150 | stem_to_apply = stem_method;
|
---|
151 | while (buf_ptr <= end)
|
---|
152 | {
|
---|
153 | int stem_param, param_type;
|
---|
154 | char param[MAXPARAMLEN + 1];
|
---|
155 |
|
---|
156 | param_type = 0;
|
---|
157 | PARSE_OPT_TERM_PARAM (param, param_type, buf_ptr, end);
|
---|
158 | if (!param_type)
|
---|
159 | break;
|
---|
160 |
|
---|
161 | if (param_type == STEMPARAM)
|
---|
162 | {
|
---|
163 | stem_param = atoi (param);
|
---|
164 | if (errno != ERANGE && indexed && stem_param >= 0 && stem_param <= 3)
|
---|
165 | method_using = stem_to_apply = stem_param;
|
---|
166 | }
|
---|
167 | }
|
---|
168 |
|
---|
169 | bcopy ((char *) word, (char *) sWord, *word + 1);
|
---|
170 | stemmer (stem_to_apply, stemmer_num, sWord);
|
---|
171 |
|
---|
172 | if (stem_to_apply == 0 || !indexed || p__sd == NULL)
|
---|
173 | {
|
---|
174 | /* [RPAP - Feb 97: Term Frequency] */
|
---|
175 | word_num = FindWord (p__sd, sWord, &count, &doc_count, &invf_ptr, &invf_len);
|
---|
176 | if (word_num == -1)
|
---|
177 | count = doc_count = invf_ptr = invf_len = 0;
|
---|
178 | AddQueryTerm (query_term_list, (u_char *) word, count, method_using);
|
---|
179 |
|
---|
180 | yylval.text = word;
|
---|
181 | *ptr = buf_ptr; /* fix up ptr */
|
---|
182 | Xfree (sWord);
|
---|
183 | return TERM;
|
---|
184 | }
|
---|
185 | else
|
---|
186 | {
|
---|
187 | *ptr = buf_ptr; /* fix up ptr */
|
---|
188 | termnum = 0;
|
---|
189 | ResetTermList (&Terms);
|
---|
190 | if (FindWords (p__sd, (u_char *) sWord, stem_to_apply, &Terms) > 0)
|
---|
191 | {
|
---|
192 | /* [RPAP - Feb 97: Term Frequency] */
|
---|
193 | int i, freq = 0;
|
---|
194 | for (i = 0; i < Terms->num; i++)
|
---|
195 | freq += Terms->TE[i].WE.count;
|
---|
196 | AddQueryTerm (query_term_list, word, freq, method_using);
|
---|
197 |
|
---|
198 | Xfree (sWord);
|
---|
199 | mode = 1;
|
---|
200 | return '(';
|
---|
201 | }
|
---|
202 | else
|
---|
203 | {
|
---|
204 | /* Word does not exists - include in tree anyway */
|
---|
205 | Xfree (sWord);
|
---|
206 |
|
---|
207 | /* [RPAP - Feb 97: Term Frequency] */
|
---|
208 | word_num = -1;
|
---|
209 | count = doc_count = invf_ptr = invf_len = 0;
|
---|
210 | AddQueryTerm (query_term_list, (u_char *) word, count, method_using);
|
---|
211 |
|
---|
212 | yylval.text = word;
|
---|
213 | return TERM;
|
---|
214 | }
|
---|
215 | }
|
---|
216 | }
|
---|
217 | else /* NON-WORD */
|
---|
218 | {
|
---|
219 | if (*buf_ptr == '\0')
|
---|
220 | {
|
---|
221 | /* return null-char if it is one */
|
---|
222 | *ptr = buf_ptr; /* fix up ptr */
|
---|
223 | return 0;
|
---|
224 | }
|
---|
225 | else
|
---|
226 | {
|
---|
227 | /* return 1st char, and delete from buffer */
|
---|
228 | char c = *buf_ptr++;
|
---|
229 | *ptr = buf_ptr; /* fix up ptr */
|
---|
230 | return c;
|
---|
231 | }
|
---|
232 | }
|
---|
233 | }
|
---|
234 | else if (mode == 1)
|
---|
235 | {
|
---|
236 | yylval.text = Terms->TE[termnum].Word;
|
---|
237 |
|
---|
238 | /* [RPAP - Feb 97: Term Frequency] */
|
---|
239 | word_num = Terms->TE[termnum].WE.word_num;
|
---|
240 | count = Terms->TE[termnum].WE.count;
|
---|
241 | doc_count = Terms->TE[termnum].WE.doc_count;
|
---|
242 | invf_ptr = Terms->TE[termnum].WE.invf_ptr;
|
---|
243 | invf_len = Terms->TE[termnum].WE.invf_len;
|
---|
244 |
|
---|
245 | termnum++;
|
---|
246 | mode = 2;
|
---|
247 | return TERM;
|
---|
248 | }
|
---|
249 | else /* mode == 2 */
|
---|
250 | {
|
---|
251 | if (termnum >= Terms->num)
|
---|
252 | {
|
---|
253 | mode = 0;
|
---|
254 | return ')';
|
---|
255 | }
|
---|
256 | else
|
---|
257 | {
|
---|
258 | mode = 1;
|
---|
259 | return '|';
|
---|
260 | }
|
---|
261 | }
|
---|
262 | }/*query_lex*/
|
---|
263 |
|
---|
264 | /* =========================================================================
|
---|
265 | * Function: yyerror
|
---|
266 | * Description:
|
---|
267 | * Input:
|
---|
268 | * Output:
|
---|
269 | * ========================================================================= */
|
---|
270 | static int yyerror(char *s)
|
---|
271 | {
|
---|
272 | Message("%s", s);
|
---|
273 | return(1);
|
---|
274 | }
|
---|
275 |
|
---|
276 |
|
---|
277 | /* =========================================================================
|
---|
278 | * Function: ParseBool
|
---|
279 | * Description:
|
---|
280 | * Parse a boolean query string into a term-list and a boolean parse tree
|
---|
281 | * Input:
|
---|
282 | * query_line = query line string
|
---|
283 | * query_len = query line length
|
---|
284 | * the_stem_method = stem method id used for stemming
|
---|
285 | * Output:
|
---|
286 | * the_term_list = the list of terms
|
---|
287 | * res = parser result code
|
---|
288 | * ========================================================================= */
|
---|
289 |
|
---|
290 | bool_tree_node *
|
---|
291 | ParseBool(char *query_line, int query_len,
|
---|
292 | TermList **the_term_list, int the_stemmer_num, int the_stem_method, int *res,
|
---|
293 | stemmed_dict * the_sd, int is_indexed, /* [RPAP - Jan 97: Stem Index Change] */
|
---|
294 | QueryTermList **the_query_term_list) /* [RPAP - Feb 97: Term Frequency] */
|
---|
295 | {
|
---|
296 | /* global variables to be accessed by bison/yacc created parser */
|
---|
297 | term_list = the_term_list;
|
---|
298 | stemmer_num = the_stemmer_num;
|
---|
299 | stem_method = the_stem_method;
|
---|
300 | ch_buf = query_line;
|
---|
301 | end_buf = query_line + query_len;
|
---|
302 | p__sd = the_sd; /* [RPAP - Jan 97: Stem Index Change] */
|
---|
303 | indexed = is_indexed; /* [RPAP - Jan 97: Stem Index Change] */
|
---|
304 | query_term_list = the_query_term_list; /* [RPAP - Feb 97: Term Frequency] */
|
---|
305 |
|
---|
306 | FreeBoolTree(&(tree_base));
|
---|
307 |
|
---|
308 | ResetTermList(term_list);
|
---|
309 | ResetQueryTermList(query_term_list); /* [RPAP - Feb 97: Term Frequency] */
|
---|
310 |
|
---|
311 | *res = yyparse();
|
---|
312 |
|
---|
313 | return tree_base;
|
---|
314 | }
|
---|
315 |
|
---|
316 |
|
---|