[13] | 1 | /**************************************************************************
|
---|
| 2 | *
|
---|
| 3 | * bool_parser - boolean query parser
|
---|
| 4 | * Copyright (C) 1994 Neil Sharman & Tim Shimmin
|
---|
| 5 | *
|
---|
| 6 | * This program is free software; you can redistribute it and/or modify
|
---|
| 7 | * it under the terms of the GNU General Public License as published by
|
---|
| 8 | * the Free Software Foundation; either version 2 of the License, or
|
---|
| 9 | * (at your option) any later version.
|
---|
| 10 | *
|
---|
| 11 | * This program is distributed in the hope that it will be useful,
|
---|
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 14 | * GNU General Public License for more details.
|
---|
| 15 | *
|
---|
| 16 | * You should have received a copy of the GNU General Public License
|
---|
| 17 | * along with this program; if not, write to the Free Software
|
---|
| 18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 19 | *
|
---|
| 20 | *
|
---|
| 21 | **************************************************************************/
|
---|
| 22 |
|
---|
| 23 | /**************************************************************************/
|
---|
| 24 | %{
|
---|
| 25 |
|
---|
| 26 | #include "sysfuncs.h"
|
---|
| 27 |
|
---|
| 28 | #include "messages.h"
|
---|
| 29 |
|
---|
| 30 | #include "memlib.h"
|
---|
| 31 | #include "words.h"
|
---|
| 32 | #include "stemmer.h"
|
---|
| 33 | #include "term_lists.h"
|
---|
| 34 | #include "bool_tree.h"
|
---|
| 35 | /* [RPAP - Jan 97: Stem Index Change] */
|
---|
| 36 | #include "backend.h" /* for stemmed_dict def */
|
---|
| 37 | #include "stem_search.h"
|
---|
| 38 |
|
---|
| 39 | #include "query_term_list.h" /* [RPAP - Feb 97: Term Frequency] */
|
---|
| 40 |
|
---|
| 41 | /* --- routines --- */
|
---|
| 42 | static int query_lex();
|
---|
| 43 | static int yyerror(char *);
|
---|
| 44 | #define yylex() query_lex(&ch_buf, end_buf)
|
---|
| 45 |
|
---|
| 46 | /* --- module variables --- */
|
---|
| 47 | static char *ch_buf; /* ptr to the character query line buffer */
|
---|
| 48 | static char *end_buf; /* ptr to the last character of the line buffer */
|
---|
| 49 | static bool_tree_node *tree_base = NULL;
|
---|
| 50 | static TermList **term_list;
|
---|
| 51 | static int stem_method;
|
---|
| 52 | /* [RPAP - Jan 97: Stem Index Change] */
|
---|
| 53 | stemmed_dict *p__sd;
|
---|
| 54 | static int indexed;
|
---|
| 55 | /* [RPAP - Feb 97: Term Frequency] */
|
---|
| 56 | static QueryTermList **query_term_list;
|
---|
| 57 | static int word_num;
|
---|
| 58 | static u_long count;
|
---|
| 59 | static u_long doc_count;
|
---|
| 60 | static u_long invf_ptr;
|
---|
| 61 | static u_long invf_len;
|
---|
| 62 | %}
|
---|
| 63 |
|
---|
| 64 |
|
---|
| 65 | %union {
|
---|
| 66 | char *text;
|
---|
| 67 | bool_tree_node *node;
|
---|
| 68 | }
|
---|
| 69 |
|
---|
| 70 | %token <text> TERM
|
---|
| 71 | %type <node> query term not and or
|
---|
| 72 |
|
---|
| 73 | %%
|
---|
| 74 |
|
---|
| 75 | query: or { tree_base = $1;}
|
---|
| 76 | ;
|
---|
| 77 |
|
---|
| 78 |
|
---|
| 79 | term: TERM { $$ = CreateBoolTermNode(term_list, $1, 1, word_num, count, doc_count, invf_ptr, invf_len); }
|
---|
| 80 | | '(' or ')' { $$ = $2; }
|
---|
| 81 | | '*' { $$ = CreateBoolTreeNode(N_all, NULL, NULL); }
|
---|
| 82 | | '_' { $$ = CreateBoolTreeNode(N_none, NULL, NULL); }
|
---|
| 83 | ;
|
---|
| 84 |
|
---|
| 85 | not: term
|
---|
| 86 | | '!' not { $$ = CreateBoolTreeNode(N_not, $2, NULL); }
|
---|
| 87 | ;
|
---|
| 88 |
|
---|
| 89 | and: and '&' not { $$ = CreateBoolTreeNode(N_and, $1, $3); }
|
---|
| 90 | | and not { $$ = CreateBoolTreeNode(N_and, $1, $2); }
|
---|
| 91 | | not
|
---|
| 92 | ;
|
---|
| 93 |
|
---|
| 94 | or: or '|' and { $$ = CreateBoolTreeNode(N_or, $1, $3); }
|
---|
| 95 | | and
|
---|
| 96 | ;
|
---|
| 97 |
|
---|
| 98 | %%
|
---|
| 99 |
|
---|
| 100 | /* Bison on one mips machine defined "const" to be nothing but
|
---|
| 101 | then did not undef it */
|
---|
| 102 | #ifdef const
|
---|
| 103 | #undef const
|
---|
| 104 | #endif
|
---|
| 105 |
|
---|
| 106 | /**************************************************************************/
|
---|
| 107 |
|
---|
| 108 |
|
---|
| 109 | /* =========================================================================
|
---|
| 110 | * Function: query_lex
|
---|
| 111 | * Description:
|
---|
| 112 | * Hand written lexical analyser for the parser.
|
---|
| 113 | * Input:
|
---|
| 114 | * ptr = ptr to a ptr into character query-line buffer
|
---|
| 115 | * end = ptr to last char in buffer
|
---|
| 116 | * Output:
|
---|
| 117 | * yylval.text = the token's text
|
---|
| 118 | * Notes:
|
---|
| 119 | * does NOT produce WILD tokens at the moment
|
---|
| 120 | * ========================================================================= */
|
---|
| 121 |
|
---|
| 122 | /* [RPAP - Jan 97: Stem Index Change]
|
---|
| 123 | state mode:
|
---|
| 124 | 0 = Read next token
|
---|
| 125 | 1 = Output word
|
---|
| 126 | 2 = Output '|' or ')'
|
---|
| 127 | */
|
---|
| 128 | static int query_lex(char **ptr, const char *end)
|
---|
| 129 | {
|
---|
| 130 | char *buf_ptr = *ptr;
|
---|
| 131 | static int mode = 0;
|
---|
| 132 | static int termnum = 0;
|
---|
| 133 | static TermList *Terms = NULL;
|
---|
| 134 |
|
---|
| 135 | if (mode == 0)
|
---|
| 136 | {
|
---|
| 137 | /* jump over whitespace */
|
---|
| 138 | while (isspace(*buf_ptr))
|
---|
| 139 | buf_ptr++;
|
---|
| 140 |
|
---|
| 141 | if (INAWORD(*buf_ptr))
|
---|
| 142 | {
|
---|
| 143 | static char word[MAXSTEMLEN + 1]; /* [RJM 07/98: Memory Leak] */
|
---|
| 144 | char *sWord = Xmalloc(MAXSTEMLEN + 1);
|
---|
| 145 | int stem_to_apply, method_using = -1;
|
---|
| 146 |
|
---|
| 147 | PARSE_STEM_WORD(word, buf_ptr, end);
|
---|
| 148 |
|
---|
| 149 | /* Extract any parameters */
|
---|
| 150 | stem_to_apply = stem_method;
|
---|
| 151 | while (buf_ptr <= end)
|
---|
| 152 | {
|
---|
| 153 | int stem_param, param_type;
|
---|
| 154 | char param[MAXPARAMLEN + 1];
|
---|
| 155 |
|
---|
| 156 | param_type = 0;
|
---|
| 157 | PARSE_OPT_TERM_PARAM (param, param_type, buf_ptr, end);
|
---|
| 158 | if (!param_type)
|
---|
| 159 | break;
|
---|
| 160 |
|
---|
| 161 | if (param_type == STEMPARAM)
|
---|
| 162 | {
|
---|
| 163 | stem_param = atoi (param);
|
---|
| 164 | if (errno != ERANGE && indexed && stem_param >= 0 && stem_param <= 3)
|
---|
| 165 | method_using = stem_to_apply = stem_param;
|
---|
| 166 | }
|
---|
| 167 | }
|
---|
| 168 |
|
---|
| 169 | bcopy ((char *) word, (char *) sWord, *word + 1);
|
---|
| 170 | stemmer (stem_to_apply, sWord);
|
---|
| 171 |
|
---|
| 172 | if (stem_to_apply == 0 || !indexed || p__sd == NULL)
|
---|
| 173 | {
|
---|
| 174 | /* [RPAP - Feb 97: Term Frequency] */
|
---|
| 175 | word_num = FindWord (p__sd, sWord, &count, &doc_count, &invf_ptr, &invf_len);
|
---|
| 176 | if (word_num == -1)
|
---|
| 177 | count = doc_count = invf_ptr = invf_len = 0;
|
---|
| 178 | AddQueryTerm (query_term_list, (u_char *) word, count, method_using);
|
---|
| 179 |
|
---|
| 180 | yylval.text = word;
|
---|
| 181 | *ptr = buf_ptr; /* fix up ptr */
|
---|
| 182 | Xfree (sWord);
|
---|
| 183 | return TERM;
|
---|
| 184 | }
|
---|
| 185 | else
|
---|
| 186 | {
|
---|
| 187 | *ptr = buf_ptr; /* fix up ptr */
|
---|
| 188 | termnum = 0;
|
---|
| 189 | ResetTermList (&Terms);
|
---|
| 190 | if (FindWords (p__sd, (u_char *) sWord, stem_to_apply, &Terms) > 0)
|
---|
| 191 | {
|
---|
| 192 | /* [RPAP - Feb 97: Term Frequency] */
|
---|
| 193 | int i, freq = 0;
|
---|
| 194 | for (i = 0; i < Terms->num; i++)
|
---|
| 195 | freq += Terms->TE[i].WE.count;
|
---|
| 196 | AddQueryTerm (query_term_list, word, freq, method_using);
|
---|
| 197 |
|
---|
| 198 | Xfree (sWord);
|
---|
| 199 | mode = 1;
|
---|
| 200 | return '(';
|
---|
| 201 | }
|
---|
| 202 | else
|
---|
| 203 | {
|
---|
| 204 | /* Word does not exists - include in tree anyway */
|
---|
| 205 | Xfree (sWord);
|
---|
| 206 |
|
---|
| 207 | /* [RPAP - Feb 97: Term Frequency] */
|
---|
| 208 | word_num = -1;
|
---|
| 209 | count = doc_count = invf_ptr = invf_len = 0;
|
---|
| 210 | AddQueryTerm (query_term_list, (u_char *) word, count, method_using);
|
---|
| 211 |
|
---|
| 212 | yylval.text = word;
|
---|
| 213 | return TERM;
|
---|
| 214 | }
|
---|
| 215 | }
|
---|
| 216 | }
|
---|
| 217 | else /* NON-WORD */
|
---|
| 218 | {
|
---|
| 219 | if (*buf_ptr == '\0')
|
---|
| 220 | {
|
---|
| 221 | /* return null-char if it is one */
|
---|
| 222 | *ptr = buf_ptr; /* fix up ptr */
|
---|
| 223 | return 0;
|
---|
| 224 | }
|
---|
| 225 | else
|
---|
| 226 | {
|
---|
| 227 | /* return 1st char, and delete from buffer */
|
---|
| 228 | char c = *buf_ptr++;
|
---|
| 229 | *ptr = buf_ptr; /* fix up ptr */
|
---|
| 230 | return c;
|
---|
| 231 | }
|
---|
| 232 | }
|
---|
| 233 | }
|
---|
| 234 | else if (mode == 1)
|
---|
| 235 | {
|
---|
| 236 | yylval.text = Terms->TE[termnum].Word;
|
---|
| 237 |
|
---|
| 238 | /* [RPAP - Feb 97: Term Frequency] */
|
---|
| 239 | word_num = Terms->TE[termnum].WE.word_num;
|
---|
| 240 | count = Terms->TE[termnum].WE.count;
|
---|
| 241 | doc_count = Terms->TE[termnum].WE.doc_count;
|
---|
| 242 | invf_ptr = Terms->TE[termnum].WE.invf_ptr;
|
---|
| 243 | invf_len = Terms->TE[termnum].WE.invf_len;
|
---|
| 244 |
|
---|
| 245 | termnum++;
|
---|
| 246 | mode = 2;
|
---|
| 247 | return TERM;
|
---|
| 248 | }
|
---|
| 249 | else /* mode == 2 */
|
---|
| 250 | {
|
---|
| 251 | if (termnum >= Terms->num)
|
---|
| 252 | {
|
---|
| 253 | mode = 0;
|
---|
| 254 | return ')';
|
---|
| 255 | }
|
---|
| 256 | else
|
---|
| 257 | {
|
---|
| 258 | mode = 1;
|
---|
| 259 | return '|';
|
---|
| 260 | }
|
---|
| 261 | }
|
---|
| 262 | }/*query_lex*/
|
---|
| 263 |
|
---|
| 264 | /* =========================================================================
|
---|
| 265 | * Function: yyerror
|
---|
| 266 | * Description:
|
---|
| 267 | * Input:
|
---|
| 268 | * Output:
|
---|
| 269 | * ========================================================================= */
|
---|
| 270 | static int yyerror(char *s)
|
---|
| 271 | {
|
---|
| 272 | Message("%s", s);
|
---|
| 273 | return(1);
|
---|
| 274 | }
|
---|
| 275 |
|
---|
| 276 |
|
---|
| 277 | /* =========================================================================
|
---|
| 278 | * Function: ParseBool
|
---|
| 279 | * Description:
|
---|
| 280 | * Parse a boolean query string into a term-list and a boolean parse tree
|
---|
| 281 | * Input:
|
---|
| 282 | * query_line = query line string
|
---|
| 283 | * query_len = query line length
|
---|
| 284 | * the_stem_method = stem method id used for stemming
|
---|
| 285 | * Output:
|
---|
| 286 | * the_term_list = the list of terms
|
---|
| 287 | * res = parser result code
|
---|
| 288 | * ========================================================================= */
|
---|
| 289 |
|
---|
| 290 | bool_tree_node *
|
---|
| 291 | ParseBool(char *query_line, int query_len,
|
---|
| 292 | TermList **the_term_list, int the_stem_method, int *res,
|
---|
| 293 | stemmed_dict * the_sd, int is_indexed, /* [RPAP - Jan 97: Stem Index Change] */
|
---|
| 294 | QueryTermList **the_query_term_list) /* [RPAP - Feb 97: Term Frequency] */
|
---|
| 295 | {
|
---|
| 296 | /* global variables to be accessed by bison/yacc created parser */
|
---|
| 297 | term_list = the_term_list;
|
---|
| 298 | stem_method = the_stem_method;
|
---|
| 299 | ch_buf = query_line;
|
---|
| 300 | end_buf = query_line + query_len;
|
---|
| 301 | p__sd = the_sd; /* [RPAP - Jan 97: Stem Index Change] */
|
---|
| 302 | indexed = is_indexed; /* [RPAP - Jan 97: Stem Index Change] */
|
---|
| 303 | query_term_list = the_query_term_list; /* [RPAP - Feb 97: Term Frequency] */
|
---|
| 304 |
|
---|
| 305 | FreeBoolTree(&(tree_base));
|
---|
| 306 |
|
---|
| 307 | ResetTermList(term_list);
|
---|
| 308 | ResetQueryTermList(query_term_list); /* [RPAP - Feb 97: Term Frequency] */
|
---|
| 309 |
|
---|
| 310 | *res = yyparse();
|
---|
| 311 |
|
---|
| 312 | return tree_base;
|
---|
| 313 | }
|
---|
| 314 |
|
---|
| 315 |
|
---|