source: main/tags/2.80/indexers/mg/src/text/bool_query.c@ 24541

Last change on this file since 24541 was 3745, checked in by mdewsnip, 21 years ago

Addition of MG package for search and retrieval

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.7 KB
Line 
1/**************************************************************************
2 *
3 * bool_query.c -- Boolean query evaluation
4 * Copyright (C) 1994 Neil Sharman & Tim Shimmin
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 *
21 **************************************************************************/
22
23#include "sysfuncs.h"
24
25#include "memlib.h"
26#include "messages.h"
27#include "filestats.h"
28#include "timing.h"
29#include "sptree.h"
30
31
32#include "mg.h"
33#include "text.h"
34#include "invf.h"
35#include "lists.h"
36#include "term_lists.h"
37#include "backend.h"
38#include "words.h"
39#include "stemmer.h"
40#include "stem_search.h"
41#include "invf_get.h"
42#include "text_get.h"
43#include "read_line.h"
44#include "bool_tree.h"
45#include "bool_parser.h"
46#include "bool_optimiser.h"
47#include "environment.h"
48
49
50/**********************
51 [RPAP - Feb 97: Term Frequency]
52
53 Shifted to bool_parser.y
54
55 *********************/
56
57/* =========================================================================
58 * Function: ReadInTermInfo
59 * Description:
60 * Input:
61 * Output:
62 * ========================================================================= */
63/**************************
64static void
65ReadInTermInfo (query_data * qd)
66{
67 int i;
68 TermList *tl = qd->TL;
69
70 for (i = 0; i < tl->num; i++)
71 {
72 TermEntry *te = &(tl->TE[i]);
73 WordEntry *we = &(te->WE);
74
75 we->word_num = FindWord (qd->sd, te->Word,
76 &we->count, &we->doc_count,
77 &we->invf_ptr, &we->invf_len);
78 if (we->word_num == -1)
79 we->count = we->doc_count = 0;
80 }
81}
82
83***********************/
84
85
86
87/* =========================================================================
88 * Function: FindNoneTerms
89 * Description:
90 * Set any terms which are not in dictionary to N_none (i.e. false)
91 * Assumes a prior call to ReadInTermInfo
92 * Input:
93 * Output:
94 * ========================================================================= */
95
96static void
97FindNoneTerms (query_data * qd, bool_tree_node * tree)
98{
99 if (BOOL_HAS_CHILD (tree))
100 {
101 bool_tree_node *child = BOOL_CHILD (tree);
102 while (child)
103 {
104 FindNoneTerms (qd, child);
105 child = BOOL_SIBLING (child);
106 }
107 }
108 else if (BOOL_TAG (tree) == N_term)
109 {
110 int doc_count = qd->TL->TE[BOOL_TERM (tree)].WE.doc_count;
111 if (!doc_count)
112 BOOL_TAG (tree) = N_none;
113 }
114
115} /*FindNoneTerms */
116
117/* =========================================================================
118 * Function: ReduceList
119 * Description:
120 * Go thru doc-set and get rid of all docs which are unmarked
121 * The motivation for this routine is in the case of CNF queries.
122 * Input:
123 * Output:
124 * ========================================================================= */
125
126void
127ReduceList (DocList * cset)
128{
129 DocEntry *src;
130 DocEntry *dest;
131
132 src = dest = cset->DE;
133 for (src = dest = cset->DE; src < cset->DE + cset->num; src++)
134 {
135 if (src->or_included)
136 {
137 src->or_included = 0;
138 *dest++ = *src;
139 }
140 }
141 cset->num = dest - cset->DE;
142
143} /*ReduceList */
144
145/* =========================================================================
146 * Function: BooleanGet
147 * Description:
148 * Input:
149 * Output:
150 * ========================================================================= */
151static DocList *
152BooleanGet (query_data * qd, bool_tree_node * n,
153 BooleanQueryInfo * bqi)
154{
155 if (n)
156 {
157 DocList *res = NULL;
158 switch (BOOL_TAG (n))
159 {
160 case N_all:
161 Message ("Unexpected \"all\" node in the parse tree.");
162 res = MakeDocList (0);
163 break;
164 case N_not:
165 Message ("Unexpected \"not\" node in the parse tree.");
166 res = MakeDocList (0);
167 break;
168 case N_none:
169 {
170 res = MakeDocList (0);
171 break;
172 }
173 case N_diff:
174 {
175 res = BooleanGet (qd, BOOL_CHILD (n), bqi);
176 n = BOOL_SIBLING (BOOL_CHILD (n));
177 while (n)
178 {
179 bool_tree_node *next = BOOL_SIBLING (n);
180 if (BOOL_TAG (n) == N_term)
181 {
182 WordEntry *we = GetNthWE (qd->TL, BOOL_TERM (n));
183 res = GetDocsOp (qd, we, op_diff_terms, res);
184 }
185 else
186 res = DiffLists (res, BooleanGet (qd, n, bqi));
187 n = next;
188 }
189 break;
190 }
191 case N_and:
192 {
193 res = BooleanGet (qd, BOOL_CHILD (n), bqi); /* initial fill of c-set */
194 n = BOOL_SIBLING (BOOL_CHILD (n));
195 while (n)
196 {
197 bool_tree_node *next = BOOL_SIBLING (n);
198 switch (BOOL_TAG (n))
199 {
200 case N_term:
201 {
202 WordEntry *we = GetNthWE (qd->TL, BOOL_TERM (n));
203 res = GetDocsOp (qd, we, op_and_terms, res);
204 }
205 break;
206 case N_or_terms:
207 {
208 bool_tree_node *child = BOOL_CHILD (n);
209 /* note: all children are terms - predetermined */
210 while (child)
211 {
212 WordEntry *we = GetNthWE (qd->TL, BOOL_TERM (child));
213 res = GetDocsOp (qd, we, op_and_or_terms, res);
214 child = BOOL_SIBLING (child);
215 }
216 ReduceList (res);
217 }
218 break;
219 default:
220 res = IntersectLists (res, BooleanGet (qd, n, bqi));
221 } /*switch */
222 n = next;
223 }
224 break;
225 }
226 case N_or:
227 case N_or_terms:
228 {
229 res = BooleanGet (qd, BOOL_CHILD (n), bqi);
230 n = BOOL_SIBLING (BOOL_CHILD (n));
231 while (n)
232 {
233 bool_tree_node *next = BOOL_SIBLING (n);
234 res = MergeLists (res, BooleanGet (qd, n, bqi));
235 n = next;
236 }
237 break;
238 }
239 case N_term:
240 {
241 res = GetDocsOp (qd, GetNthWE (qd->TL, BOOL_TERM (n)), op_term, NULL);
242 break;
243 }
244 }
245 return (res ? res : MakeDocList (0));
246 }
247 else
248 return MakeDocList (0);
249}
250
251/* =========================================================================
252 * Function: AdjustParaDocs
253 * Description:
254 * Input:
255 * Output:
256 * ========================================================================= */
257static int
258DE_comp (void *a, void *b)
259{
260 return ((DocEntry *) a)->SeekPos - ((DocEntry *) b)->SeekPos;
261}
262
263static void
264AdjustParaDocs (query_data * qd, DocList * Docs)
265{
266 DocEntry *DE = Docs->DE;
267 int s, d;
268 Splay_Tree *SP;
269
270
271 /* Make the doc-list contain only references to unique
272 * real documents. So can get rid of cases where there
273 * are multiple paragraphs for a real document.
274 */
275
276 /* Note: [TS: 29/Sep/94] */
277 /* reads in seek-pos for docnums so can compare docs */
278 /* seek-pos used as real doc identifier */
279 /* whereas doc-num entries refer to paragraphs -- I think */
280 for (s = 0; s < Docs->num; s++)
281 FetchDocStart (qd, DE[s].DocNum, &DE[s].SeekPos, &DE[s].Len);
282
283 Message ("Original Number = %d\n", Docs->num);
284 SP = SP_createset (DE_comp);
285 d = 0;
286 for (s = 0; s < Docs->num; s++)
287 if (!SP_member (&DE[s], SP))
288 {
289 DE[d] = DE[s];
290 SP_insert (&DE[d], SP);
291 d++;
292 }
293 SP_freeset (SP);
294 Docs->num = d;
295 Message ("Final Number = %d\n", Docs->num);
296}
297
298/* =========================================================================
299 * Function: BooleanQuery
300 * Description:
301 * The main control function for the boolean query.
302 * Input:
303 * Output:
304 * qd->DL
305 * ========================================================================= */
306
307void
308BooleanQuery (query_data * qd, char *Query, BooleanQueryInfo * bqi,
309 int stem_method)
310{
311 bool_tree_node *tree;
312 DocList *Docs;
313 int res = 0;
314
315 tree = ParseBool (Query, MAXLINEBUFFERLEN, &(qd->TL),
316 qd->sd->sdh.stemmer_num, stem_method, &res,
317 qd->sd, qd->sd->sdh.indexed, /* [RPAP - Jan 97: Stem Index Change] */
318 &(qd->QTL)); /* [RPAP - Feb 97: Term Frequency] */
319
320 /* [TS:Aug/94] no longer tries to continue after parse failure */
321 if (res != 0) /* failed the parse */
322 {
323 qd->DL = NULL; /* have empty doc-list */
324 return;
325 }
326
327 /* [RPAP - Feb 97: Term Frequency] */
328 /* ReadInTermInfo (qd); */
329
330 FindNoneTerms (qd, tree);
331
332 OptimiseBoolTree (tree, qd->TL, atoi (GetEnv ("optimise_type")));
333
334 qd->hops_taken = qd->num_of_ptrs = 0;
335 qd->num_of_terms = 0;
336
337 Docs = BooleanGet (qd, tree, bqi);
338
339/* [RPAP - Feb 97: NZDL Specifics] */
340#if !defined(PARABOOLEANMATCHES) && !defined(NZDL)
341 if (qd->id->ifh.InvfLevel == 3)
342 {
343 AdjustParaDocs (qd, Docs);
344 }
345#endif
346
347 if (bqi->MaxDocsToRetrieve != -1 && Docs->num > bqi->MaxDocsToRetrieve)
348 Docs->num = bqi->MaxDocsToRetrieve;
349
350 FreeQueryDocs (qd);
351
352 qd->DL = Docs;
353}
Note: See TracBrowser for help on using the repository browser.