/**************************************************************************
 *
 * backend.c -- Underlying routines for mgquery
 * Copyright (C) 1994  Neil Sharman
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: backend.cpp 711 1999-10-17 23:43:31Z cs025 $
 *
 **************************************************************************/
23 |
|
---|
24 | #include "sysfuncs.h"
|
---|
25 |
|
---|
26 | #include "memlib.h"
|
---|
27 | #include "messages.h"
|
---|
28 | #include "timing.h"
|
---|
29 | #include "filestats.h"
|
---|
30 | #include "sptree.h"
|
---|
31 | #include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
|
---|
32 |
|
---|
33 |
|
---|
34 | #include "mg_files.h"
|
---|
35 | #include "mg.h"
|
---|
36 | #include "invf.h"
|
---|
37 | #include "text.h"
|
---|
38 | #include "lists.h"
|
---|
39 | #include "backend.h"
|
---|
40 | #include "stem_search.h"
|
---|
41 | #include "StemBlock.h"
|
---|
42 | #include "invf_get.h"
|
---|
43 | #include "text_get.h"
|
---|
44 | #include "weights.h"
|
---|
45 | #include "locallib.h"
|
---|
46 | #include "mg_errors.h"
|
---|
47 | #include "DocEntry.h"
|
---|
48 |
|
---|
49 | static File *
|
---|
50 | OpenFile (char *base, char *suffix, unsigned long magic, int *ok)
|
---|
51 | {
|
---|
52 | char FileName[512];
|
---|
53 | File *F;
|
---|
54 | sprintf (FileName, "%s%s", base, suffix);
|
---|
55 | if (!(F = Fopen (FileName, "rb", 0))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
56 | {
|
---|
57 | mg_errno = MG_NOFILE;
|
---|
58 | MgErrorData (FileName);
|
---|
59 | if (ok)
|
---|
60 | *ok = 0;
|
---|
61 | return (NULL);
|
---|
62 | }
|
---|
63 | if (magic)
|
---|
64 | {
|
---|
65 | unsigned long m;
|
---|
66 | if (fread ((char *) &m, sizeof (m), 1, F->f) == 0)
|
---|
67 | {
|
---|
68 | mg_errno = MG_READERR;
|
---|
69 | MgErrorData (FileName);
|
---|
70 | if (ok)
|
---|
71 | *ok = 0;
|
---|
72 | Fclose (F);
|
---|
73 | return (NULL);
|
---|
74 | }
|
---|
75 | NTOHUL(m); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
76 | if (m != magic)
|
---|
77 | {
|
---|
78 | mg_errno = MG_BADMAGIC;
|
---|
79 | MgErrorData (FileName);
|
---|
80 | if (ok)
|
---|
81 | *ok = 0;
|
---|
82 | Fclose (F);
|
---|
83 | return (NULL);
|
---|
84 | }
|
---|
85 | }
|
---|
86 | return (F);
|
---|
87 | }
|
---|
88 |
|
---|
89 | /* --
|
---|
90 | -- GRB: Open text files only; based on open_all_files in MG1.2.1;
|
---|
91 | -- should only be called after open_index_files
|
---|
92 | --
|
---|
93 | */
|
---|
94 | static int
|
---|
95 | open_text_files (query_data * qd)
|
---|
96 | {
|
---|
97 | int ok = 1;
|
---|
98 |
|
---|
99 | qd->File_text = OpenFile (qd->textpathname, TEXT_SUFFIX, /* [RJM 06/97: text filename] */
|
---|
100 | MAGIC_TEXT, &ok);
|
---|
101 | qd->File_fast_comp_dict = OpenFile (qd->textpathname, /* [RJM 06/97: text filename] */
|
---|
102 | TEXT_DICT_FAST_SUFFIX, MAGIC_FAST_DICT, NULL);
|
---|
103 | if (!qd->File_fast_comp_dict)
|
---|
104 | {
|
---|
105 | qd->File_comp_dict = OpenFile (qd->textpathname, /* [RJM 06/97: text filename] */
|
---|
106 | TEXT_DICT_SUFFIX, MAGIC_DICT, &ok);
|
---|
107 | qd->File_aux_dict = OpenFile (qd->textpathname, /* [RJM 06/97: text filename] */
|
---|
108 | TEXT_DICT_AUX_SUFFIX, MAGIC_AUX_DICT, NULL);
|
---|
109 | }
|
---|
110 | else
|
---|
111 | qd->File_comp_dict = qd->File_aux_dict = NULL;
|
---|
112 |
|
---|
113 | /* This will fail if a level 1 inverted file was created because there
|
---|
114 | will be no document weights */
|
---|
115 | qd->File_text_idx_wgt = OpenFile (qd->pathname, TEXT_IDX_WGT_SUFFIX,
|
---|
116 | MAGIC_TEXI_WGT, NULL);
|
---|
117 | if (qd->File_text_idx_wgt == NULL && qd->File_weight_approx == NULL)
|
---|
118 | qd->File_text_idx = OpenFile (qd->textpathname, /* [RJM 06/97: text filename] */
|
---|
119 | TEXT_IDX_SUFFIX, MAGIC_TEXI, NULL);
|
---|
120 | else
|
---|
121 | qd->File_text_idx = NULL;
|
---|
122 |
|
---|
123 | /**
|
---|
124 | * GRB: Additional check; if we didn't get idx_wgt and we are on
|
---|
125 | * a level 2 or later inversion fileset, then error.
|
---|
126 | * As the id structure may not have been initialised, only
|
---|
127 | * error if it has been; we'll assume that if it isn't
|
---|
128 | * initialised yet then something bespoke is happening and
|
---|
129 | * the programmer should deal with it;
|
---|
130 | * open_all_files() calls this fn after open_index_files,
|
---|
131 | * in which case id is NULL if when open_all_files is called
|
---|
132 | * inside __InitQuerySystem (ie. when MG is just run as normal).
|
---|
133 | * Programmers doing their own thing should prepare their
|
---|
134 | * code to cope with the 2-stage initialisation of level 2
|
---|
135 | * or higher indexes. The idx_wgt index is only used in
|
---|
136 | * retrieving text; it has no effect on indexed searches.
|
---|
137 | */
|
---|
138 | if (qd->id != NULL &&
|
---|
139 | ((qd->allfiles == 1 && qd->File_text_idx_wgt == NULL) ||
|
---|
140 | qd->File_weight_approx == NULL) &&
|
---|
141 | qd->id->ifh.InvfLevel >= 2)
|
---|
142 | { ok = 0;
|
---|
143 | }
|
---|
144 |
|
---|
145 |
|
---|
146 | if (!ok)
|
---|
147 | { if (qd->File_text)
|
---|
148 | Fclose (qd->File_text);
|
---|
149 | if (qd->File_fast_comp_dict)
|
---|
150 | Fclose (qd->File_fast_comp_dict);
|
---|
151 | if (qd->File_aux_dict)
|
---|
152 | Fclose (qd->File_aux_dict);
|
---|
153 | if (qd->File_comp_dict)
|
---|
154 | Fclose (qd->File_comp_dict);
|
---|
155 |
|
---|
156 | if (qd->File_text_idx_wgt)
|
---|
157 | Fclose (qd->File_text_idx_wgt);
|
---|
158 | if (qd->File_text_idx)
|
---|
159 | Fclose (qd->File_text_idx);
|
---|
160 | return (-1);
|
---|
161 | }
|
---|
162 | return (0);
|
---|
163 |
|
---|
164 | }
|
---|
165 |
|
---|
166 | /* --
|
---|
167 | -- GRB: Open index files only; based on open_all_files in MG1.2.1
|
---|
168 | --
|
---|
169 | */
|
---|
170 | static int open_index_files(query_data *qd)
|
---|
171 | {
|
---|
172 | int ok = 1;
|
---|
173 |
|
---|
174 | qd->File_text = NULL;
|
---|
175 | qd->File_fast_comp_dict = NULL;
|
---|
176 | qd->File_comp_dict = NULL;
|
---|
177 | qd->File_aux_dict = NULL;
|
---|
178 |
|
---|
179 | qd->File_stem = OpenFile (qd->pathname, INVF_DICT_BLOCKED_SUFFIX,
|
---|
180 | MAGIC_STEM, &ok);
|
---|
181 |
|
---|
182 | /* [RPAP - Jan 97: Stem Index Change]
|
---|
183 | These will fail if collection not built with stem indexes */
|
---|
184 | qd->File_stem1 = OpenFile (qd->pathname, INVF_DICT_BLOCKED_1_SUFFIX,
|
---|
185 | MAGIC_STEM_1, NULL);
|
---|
186 | qd->File_stem2 = OpenFile (qd->pathname, INVF_DICT_BLOCKED_2_SUFFIX,
|
---|
187 | MAGIC_STEM_2, NULL);
|
---|
188 | qd->File_stem3 = OpenFile (qd->pathname, INVF_DICT_BLOCKED_3_SUFFIX,
|
---|
189 | MAGIC_STEM_3, NULL);
|
---|
190 |
|
---|
191 | qd->File_invf = OpenFile (qd->pathname, INVF_SUFFIX,
|
---|
192 | MAGIC_INVF, &ok);
|
---|
193 |
|
---|
194 | /* Ths will fail if a level 1 inverted file was created because there
|
---|
195 | will be no document weights */
|
---|
196 | qd->File_weight_approx = OpenFile (qd->pathname, APPROX_WEIGHTS_SUFFIX,
|
---|
197 | MAGIC_WGHT_APPROX, NULL);
|
---|
198 |
|
---|
199 | qd->File_text_idx_wgt = NULL;
|
---|
200 | qd->File_text_idx = NULL;
|
---|
201 |
|
---|
202 | if (!ok)
|
---|
203 | { Fclose (qd->File_stem);
|
---|
204 |
|
---|
205 | /* [RPAP - Jan 97: Stem Index Change] */
|
---|
206 | if (qd->File_stem1)
|
---|
207 | Fclose (qd->File_stem1);
|
---|
208 | if (qd->File_stem2)
|
---|
209 | Fclose (qd->File_stem2);
|
---|
210 | if (qd->File_stem3)
|
---|
211 | Fclose (qd->File_stem3);
|
---|
212 |
|
---|
213 | Fclose (qd->File_invf);
|
---|
214 | if (qd->File_weight_approx)
|
---|
215 | Fclose (qd->File_weight_approx);
|
---|
216 | return (-1);
|
---|
217 | }
|
---|
218 | return (0);
|
---|
219 |
|
---|
220 | }
|
---|
221 |
|
---|
222 | static int
|
---|
223 | open_all_files (query_data * qd)
|
---|
224 | {
|
---|
225 | int ok = 1;
|
---|
226 |
|
---|
227 | if (open_index_files(qd) != 0)
|
---|
228 | {
|
---|
229 | return -1;
|
---|
230 | }
|
---|
231 | if (open_text_files(qd) != 0)
|
---|
232 | {
|
---|
233 | return -1;
|
---|
234 | }
|
---|
235 | return 0;
|
---|
236 | }
|
---|
237 |
|
---|
238 | static void
|
---|
239 | close_all_files (query_data * qd)
|
---|
240 | { if (qd->File_text)
|
---|
241 | Fclose (qd->File_text);
|
---|
242 | if (qd->File_fast_comp_dict)
|
---|
243 | Fclose (qd->File_fast_comp_dict);
|
---|
244 | if (qd->File_aux_dict)
|
---|
245 | Fclose (qd->File_aux_dict);
|
---|
246 | if (qd->File_comp_dict)
|
---|
247 | Fclose (qd->File_comp_dict);
|
---|
248 | Fclose (qd->File_stem);
|
---|
249 |
|
---|
250 | /* [RPAP - Jan 97: Stem Index Change] */
|
---|
251 | if (qd->File_stem1)
|
---|
252 | Fclose (qd->File_stem1);
|
---|
253 | if (qd->File_stem2)
|
---|
254 | Fclose (qd->File_stem2);
|
---|
255 | if (qd->File_stem3)
|
---|
256 | Fclose (qd->File_stem3);
|
---|
257 |
|
---|
258 | Fclose (qd->File_invf);
|
---|
259 | if (qd->File_text_idx_wgt)
|
---|
260 | Fclose (qd->File_text_idx_wgt);
|
---|
261 | if (qd->File_weight_approx)
|
---|
262 | Fclose (qd->File_weight_approx);
|
---|
263 | if (qd->File_text_idx)
|
---|
264 | Fclose (qd->File_text_idx);
|
---|
265 | }
|
---|
266 |
|
---|
267 | /* If textname is equal to null then name will be used instead */
|
---|
268 | /* [RJM 06/97: text filename] */
|
---|
269 |
|
---|
270 | /**
|
---|
271 | * The following define and the variable abortvar in the following function are designed
|
---|
272 | * to provide an exception-like recovery system for when an error occurs.
|
---|
273 | */
|
---|
274 | #define abort(x) abortvar=x;goto Recovery;
|
---|
275 |
|
---|
276 | query_data *
|
---|
277 | __InitQuerySystem (char *dir, char *name, char *textname, InitQueryTimes * iqt, int allfiles)
|
---|
278 | {
|
---|
279 | query_data *qd;
|
---|
280 | char *s;
|
---|
281 | int abortvar;
|
---|
282 |
|
---|
283 | if (textname == NULL) textname = name; /* [RJM 06/97: text filename] */
|
---|
284 |
|
---|
285 | if (!(qd = new query_data))
|
---|
286 | {
|
---|
287 | abort(0);
|
---|
288 | }
|
---|
289 |
|
---|
290 | bzero ((char *) qd, sizeof (*qd));
|
---|
291 |
|
---|
292 | /* -- GRB: 13/09/99: note if all files were requested */
|
---|
293 | qd->allfiles = allfiles;
|
---|
294 |
|
---|
295 | qd->mem_in_use = qd->max_mem_in_use = 0;
|
---|
296 |
|
---|
297 | qd->doc_pos = qd->buf_in_use = 0;
|
---|
298 | qd->TextBufferLen = 0;
|
---|
299 | qd->DL = NULL;
|
---|
300 |
|
---|
301 | /* [RPAP - Feb 97: Term Frequency] */
|
---|
302 | qd->TL = NULL;
|
---|
303 | qd->QTL = NULL;
|
---|
304 |
|
---|
305 | qd->TextBuffer = NULL;
|
---|
306 |
|
---|
307 | qd->tot_hops_taken = 0;
|
---|
308 | qd->tot_num_of_ptrs = 0;
|
---|
309 | qd->tot_num_of_accum = 0;
|
---|
310 | qd->tot_num_of_terms = 0;
|
---|
311 | qd->tot_num_of_ans = 0;
|
---|
312 | qd->tot_text_idx_lookups = 0;
|
---|
313 |
|
---|
314 | qd->hops_taken = 0;
|
---|
315 | qd->num_of_ptrs = 0;
|
---|
316 | qd->num_of_accum = 0;
|
---|
317 | qd->num_of_terms = 0;
|
---|
318 | qd->num_of_ans = 0;
|
---|
319 | qd->text_idx_lookups = 0;
|
---|
320 |
|
---|
321 | qd->pathname = NULL; /* RJM 06/97: text filename] */
|
---|
322 | qd->textpathname = NULL; /* RJM 06/97: text filename] */
|
---|
323 |
|
---|
324 | s = strrchr (dir, '/');
|
---|
325 | if (s && *(s + 1) == '\0')
|
---|
326 | {
|
---|
327 | /* [RJM 06/97: text filename] */
|
---|
328 | if (!(qd->pathname = new char[strlen (dir) + strlen (name) + 1]) ||
|
---|
329 | !(qd->textpathname = new char[strlen (dir) + strlen (textname) + 1]))
|
---|
330 | {
|
---|
331 | abort(1);
|
---|
332 | }
|
---|
333 | sprintf (qd->pathname, "%s%s", dir, name);
|
---|
334 | sprintf (qd->textpathname, "%s%s", dir, textname); /* [RJM 06/97: text filename] */
|
---|
335 | }
|
---|
336 |
|
---|
337 | else
|
---|
338 | {
|
---|
339 | /* [RJM 06/97: text filename] */
|
---|
340 | if (!(qd->pathname = new char[strlen (dir) + strlen (name) + 2]) ||
|
---|
341 | !(qd->textpathname = new char[strlen (dir) + strlen (textname) + 2]))
|
---|
342 | {
|
---|
343 | abort(1);
|
---|
344 | }
|
---|
345 | /* [RPAP - Feb 97: WIN32 Port] */
|
---|
346 | #ifdef __WIN32__
|
---|
347 | if (dir == NULL || dir[0] == '\0') {
|
---|
348 | sprintf (qd->pathname, "%s", name);
|
---|
349 | sprintf (qd->textpathname, "%s", textname); /* [RJM 06/97: text filename] */
|
---|
350 | } else {
|
---|
351 | sprintf (qd->pathname, "%s%s", dir, name);
|
---|
352 | sprintf (qd->textpathname, "%s%s", dir, textname); /* [RJM 06/97: text filename] */
|
---|
353 | }
|
---|
354 | #else
|
---|
355 | sprintf (qd->pathname, "%s/%s", dir, name);
|
---|
356 | sprintf (qd->textpathname, "%s/%s", dir, textname); /* [RJM 06/97: text filename] */
|
---|
357 | #endif
|
---|
358 | }
|
---|
359 |
|
---|
360 | if (((allfiles == 1) && (open_all_files (qd) == -1)) ||
|
---|
361 | ((allfiles == 0) && (open_index_files (qd) == -1)))
|
---|
362 | {
|
---|
363 | abort(2);
|
---|
364 | return (NULL);
|
---|
365 | }
|
---|
366 |
|
---|
367 | if (iqt)
|
---|
368 | GetTime (&iqt->Start);
|
---|
369 |
|
---|
370 | /* Initialise the stemmed dictionary system */
|
---|
371 | if (!(qd->sd = ReadStemDictBlk (qd->File_stem, _MGErr)))
|
---|
372 | {
|
---|
373 | abort(3);
|
---|
374 | }
|
---|
375 |
|
---|
376 | /* [RPAP - Jan 97: Stem Index Change] */
|
---|
377 | if ((qd->sd->sdh.indexed & 7) && qd->File_stem1 && qd->File_stem2 && qd->File_stem3)
|
---|
378 | {
|
---|
379 | if (!(qd->sd->stem1 = ReadStemIdxBlk (qd->File_stem1)))
|
---|
380 | {
|
---|
381 | abort(4);
|
---|
382 | }
|
---|
383 | if (!(qd->sd->stem2 = ReadStemIdxBlk (qd->File_stem2)))
|
---|
384 | {
|
---|
385 | abort(4);
|
---|
386 | }
|
---|
387 | if (!(qd->sd->stem3 = ReadStemIdxBlk (qd->File_stem3)))
|
---|
388 | {
|
---|
389 | abort(4);
|
---|
390 | }
|
---|
391 | }
|
---|
392 | else if (qd->sd->sdh.indexed != 0)
|
---|
393 | {
|
---|
394 | abort(4);
|
---|
395 | }
|
---|
396 | else
|
---|
397 | {
|
---|
398 | if (qd->File_stem1)
|
---|
399 | Fclose (qd->File_stem1);
|
---|
400 | if (qd->File_stem2)
|
---|
401 | Fclose (qd->File_stem2);
|
---|
402 | if (qd->File_stem3)
|
---|
403 | Fclose (qd->File_stem3);
|
---|
404 | qd->File_stem1 = NULL;
|
---|
405 | qd->File_stem2 = NULL;
|
---|
406 | qd->File_stem3 = NULL;
|
---|
407 | qd->sd->stem1 = NULL;
|
---|
408 | qd->sd->stem2 = NULL;
|
---|
409 | qd->sd->stem3 = NULL;
|
---|
410 | }
|
---|
411 |
|
---|
412 | if (iqt)
|
---|
413 | GetTime (&iqt->StemDict);
|
---|
414 | if (qd->File_weight_approx)
|
---|
415 | {
|
---|
416 | if (!(qd->awd = LoadDocWeights (qd->File_weight_approx,
|
---|
417 | qd->sd->sdh.num_of_docs)))
|
---|
418 | {
|
---|
419 | abort(4);
|
---|
420 | }
|
---|
421 | }
|
---|
422 | else
|
---|
423 | qd->awd = NULL;
|
---|
424 |
|
---|
425 |
|
---|
426 | if (iqt)
|
---|
427 | GetTime (&iqt->ApproxWeights);
|
---|
428 |
|
---|
429 | if ((allfiles == 1) &&
|
---|
430 | !(qd->cd = LoadCompDict (qd->File_comp_dict, qd->File_aux_dict,
|
---|
431 | qd->File_fast_comp_dict)))
|
---|
432 | {
|
---|
433 | abort(5);
|
---|
434 | }
|
---|
435 | else if (allfiles == 0)
|
---|
436 | {
|
---|
437 | qd->cd = NULL;
|
---|
438 | }
|
---|
439 |
|
---|
440 | if (iqt)
|
---|
441 | GetTime (&iqt->CompDict);
|
---|
442 |
|
---|
443 | if (!(qd->id = InitInvfFile (qd->File_invf, qd->sd)))
|
---|
444 | {
|
---|
445 | abort(6);
|
---|
446 | }
|
---|
447 | if (((allfiles == 1 && qd->File_text_idx_wgt == NULL) ||
|
---|
448 | qd->File_weight_approx == NULL) &&
|
---|
449 | qd->id->ifh.InvfLevel >= 2)
|
---|
450 | {
|
---|
451 | abort(7);
|
---|
452 | }
|
---|
453 | if (iqt)
|
---|
454 | GetTime (&iqt->Invf);
|
---|
455 |
|
---|
456 | if ((allfiles == 1) &&
|
---|
457 | !(qd->td = LoadTextData (qd->File_text, qd->File_text_idx_wgt,
|
---|
458 | qd->File_text_idx)))
|
---|
459 | {
|
---|
460 | abort(8);
|
---|
461 | }
|
---|
462 | else if (allfiles == 0)
|
---|
463 | {
|
---|
464 | qd->td = NULL;
|
---|
465 | }
|
---|
466 |
|
---|
467 | /* [RPAP - Feb 97: NZDL Additions] */
|
---|
468 | #if defined(PARADOCNUM) || defined(NZDL)
|
---|
469 |
|
---|
470 | /*
|
---|
471 |
|
---|
472 | This code is based on the TREC_MODE code below to read the .paragraph
|
---|
473 | file to determine what document numbers correspond to what paragraphs.
|
---|
474 | This code is more space efficient, reading in the .paragraph file
|
---|
475 | into memory as an accumulate docnum array. Eg. the .paragraph may contain
|
---|
476 |
|
---|
477 | [5 3 6 4 7 9 4]
|
---|
478 |
|
---|
479 | indicating the first document has 5 paragraphs, the next 3, etc.
|
---|
480 | This will be stored in memory as
|
---|
481 |
|
---|
482 | [0 5 8 14 18 25 34 38]
|
---|
483 |
|
---|
484 | so a binary search can be performed. The first 0 is for convenience;
|
---|
485 | it prevents testing boundary conditions.
|
---|
486 |
|
---|
487 |
|
---|
488 | The TREC_MODE code does this differently; it stores the array
|
---|
489 |
|
---|
490 | [1 1 1 1 1 2 2 2 3 3 3 3 3 3 ....]
|
---|
491 |
|
---|
492 | allowing directy paragraph to docnum conversion, at the expense
|
---|
493 | of memory.
|
---|
494 |
|
---|
495 | */
|
---|
496 | qd->paragraph = NULL;
|
---|
497 |
|
---|
498 | if ((allfiles == 1) && (qd->id->ifh.InvfLevel == 3))
|
---|
499 | {
|
---|
500 | unsigned long magic;
|
---|
501 | FILE *paragraph;
|
---|
502 | int i;
|
---|
503 | char paraFile[512];
|
---|
504 |
|
---|
505 | sprintf(paraFile, "%s%s", qd->pathname, INVF_PARAGRAPH_SUFFIX);
|
---|
506 | paragraph = fopen(paraFile, "rb");
|
---|
507 | if (!paragraph)
|
---|
508 | FatalError(1, "Unable to open 'paraFile'.", paraFile);
|
---|
509 |
|
---|
510 | fread((void *)&magic, sizeof(magic), 1, paragraph);
|
---|
511 | qd->paragraph = new int[qd->td->cth.num_of_docs+1];
|
---|
512 | qd->paragraph[0] = 0;
|
---|
513 | for (i = 1; i <= qd->td->cth.num_of_docs; i++)
|
---|
514 | {
|
---|
515 | int count;
|
---|
516 |
|
---|
517 | if (fread((void *)&count, sizeof(count), 1, paragraph) != 1)
|
---|
518 | FatalError(1, "Unexpected EOF while reading '%s'.", paraFile);
|
---|
519 | NTOHSI(count); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
520 | qd->paragraph[i] = qd->paragraph[i-1]+count;
|
---|
521 | }
|
---|
522 |
|
---|
523 | fclose (paragraph); /* [RJM 07/98: Memory Leak] */
|
---|
524 | }
|
---|
525 |
|
---|
526 |
|
---|
527 | #endif
|
---|
528 |
|
---|
529 | #ifdef TREC_MODE
|
---|
530 | {
|
---|
531 | extern char *trec_ids;
|
---|
532 | extern long *trec_paras;
|
---|
533 | int size;
|
---|
534 | char FileName[512];
|
---|
535 | FILE *f;
|
---|
536 | if (!strstr (qd->pathname, "trec"))
|
---|
537 | goto error;
|
---|
538 | sprintf (FileName, "%s%s", qd->pathname, ".DOCIDS");
|
---|
539 | if (!(f = fopen (FileName, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
540 | {
|
---|
541 | Message ("Unable to open \"%s\"", FileName);
|
---|
542 | goto error;
|
---|
543 | }
|
---|
544 | fseek (f, 0, 2);
|
---|
545 | size = ftell (f);
|
---|
546 | fseek (f, 0, 0);
|
---|
547 | trec_ids = new char[size];
|
---|
548 | if (!trec_ids)
|
---|
549 | {
|
---|
550 | fclose (f);
|
---|
551 | goto error;
|
---|
552 | }
|
---|
553 | fread (trec_ids, 1, size, f);
|
---|
554 | fclose (f);
|
---|
555 | if ((allfiles == 1) && (qd->id->ifh.InvfLevel == 3))
|
---|
556 | {
|
---|
557 | int i, d;
|
---|
558 | unsigned long magic;
|
---|
559 | trec_paras = new long[qd->sd->sdh.num_of_docs];
|
---|
560 | if (!trec_paras)
|
---|
561 | {
|
---|
562 | delete trec_ids;
|
---|
563 | trec_ids = NULL;
|
---|
564 | goto error;
|
---|
565 | }
|
---|
566 | sprintf (FileName, "%s%s", qd->pathname, INVF_PARAGRAPH_SUFFIX);
|
---|
567 | if (!(f = fopen (FileName, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
|
---|
568 | {
|
---|
569 | Message ("Unable to open \"%s\"", FileName);
|
---|
570 | goto error;
|
---|
571 | }
|
---|
572 | if (fread ((char *) &magic, sizeof (magic), 1, f) != 1 ||
|
---|
573 | NTOHUL(magic) != MAGIC_PARAGRAPH) /* [RPAP - Jan 97: Endian Ordering] */
|
---|
574 | {
|
---|
575 | fclose (f);
|
---|
576 | Message ("Bad magic number in \"%s\"", FileName);
|
---|
577 | goto error;
|
---|
578 | }
|
---|
579 |
|
---|
580 | for (d = i = 0; i < qd->td->cth.num_of_docs; i++)
|
---|
581 | {
|
---|
582 | int count;
|
---|
583 | if (fread ((char *) &count, sizeof (count), 1, f) != 1)
|
---|
584 | {
|
---|
585 | fclose (f);
|
---|
586 | goto error;
|
---|
587 | }
|
---|
588 | NTOHSI(count); /* [RPAP - Jan 97: Endian Ordering] */
|
---|
589 | while (count--)
|
---|
590 | trec_paras[d++] = i;
|
---|
591 | }
|
---|
592 | fclose (f);
|
---|
593 | }
|
---|
594 | goto ok;
|
---|
595 | error:
|
---|
596 | if (trec_ids)
|
---|
597 | delete trec_ids;
|
---|
598 | if (trec_paras)
|
---|
599 | delete (trec_paras);
|
---|
600 | trec_ids = NULL;
|
---|
601 | trec_paras = NULL;
|
---|
602 | ok:
|
---|
603 | ;
|
---|
604 | }
|
---|
605 | #endif
|
---|
606 |
|
---|
607 | if (iqt)
|
---|
608 | GetTime (&iqt->Text);
|
---|
609 |
|
---|
610 | return (qd);
|
---|
611 |
|
---|
612 | Recovery:
|
---|
613 | switch (abortvar)
|
---|
614 | {
|
---|
615 | case 8: // NB: 8 indicates a failure after the loading of the inverted file
|
---|
616 | case 7: // NB: 7 indicates a failure during the loading of the inverted file
|
---|
617 | FreeInvfData (qd->id);
|
---|
618 | case 6:
|
---|
619 | if (qd->cd)
|
---|
620 | FreeCompDict (qd->cd);
|
---|
621 | case 5:
|
---|
622 | if (qd->awd)
|
---|
623 | {
|
---|
624 | FreeWeights (qd->awd);
|
---|
625 | }
|
---|
626 | case 4:
|
---|
627 | FreeStemDict (qd->sd);
|
---|
628 | case 3:
|
---|
629 | close_all_files (qd);
|
---|
630 | case 2:
|
---|
631 | delete (qd->textpathname); /* [RJM 06/97: text filename] */
|
---|
632 | case 1:
|
---|
633 | if (qd->pathname)
|
---|
634 | delete (qd->pathname); /* [RJM 06/97: text filename] */
|
---|
635 | delete qd;
|
---|
636 | break;
|
---|
637 | }
|
---|
638 |
|
---|
639 | /* Set mg error status for particular failures; NB not every failure results in an
|
---|
640 | * mg error value being set.
|
---|
641 | */
|
---|
642 | if (abortvar < 2)
|
---|
643 | mg_errno = MG_NOMEM;
|
---|
644 | else if (abortvar == 7)
|
---|
645 | mg_errno = MG_INVERSION;
|
---|
646 | return (NULL);
|
---|
647 | }
|
---|
648 |
|
---|
649 |
|
---|
650 |
|
---|
651 | query_data *
|
---|
652 | InitQuerySystem (char *dir, char *name, char *textname, InitQueryTimes * iqt)
|
---|
653 | { return __InitQuerySystem(dir, name, textname, iqt, 1);
|
---|
654 | }
|
---|
655 |
|
---|
656 | // This function initialises the query system without loading the text dictionary etc.
|
---|
657 | query_data *
|
---|
658 | InitQuerySystemNT (char *dir, char *name, char *textname, InitQueryTimes * iqt)
|
---|
659 | { return __InitQuerySystem(dir, name, textname, iqt, 0);
|
---|
660 | }
|
---|
661 |
|
---|
662 | /*
|
---|
663 | * Change the amount of memory currently in use
|
---|
664 | *
|
---|
665 | */
|
---|
666 | void
|
---|
667 | ChangeMemInUse (query_data * qd, long delta)
|
---|
668 | {
|
---|
669 | qd->mem_in_use += delta;
|
---|
670 | if (qd->mem_in_use > qd->max_mem_in_use)
|
---|
671 | qd->max_mem_in_use = qd->mem_in_use;
|
---|
672 | }
|
---|
673 |
|
---|
674 |
|
---|
675 | void
|
---|
676 | FinishQuerySystem (query_data * qd)
|
---|
677 | {
|
---|
678 | /* [RJM 07/98: Memory Leak] */
|
---|
679 | #if defined(PARADOCNUM) || defined(NZDL)
|
---|
680 | if (qd->paragraph != NULL) {
|
---|
681 | delete (qd->paragraph);
|
---|
682 | qd->paragraph = NULL;
|
---|
683 | }
|
---|
684 | #endif
|
---|
685 |
|
---|
686 | if (qd->td != NULL)
|
---|
687 | {
|
---|
688 | FreeTextData (qd->td);
|
---|
689 | }
|
---|
690 | FreeInvfData (qd->id);
|
---|
691 | if (qd->cd)
|
---|
692 | FreeCompDict (qd->cd);
|
---|
693 |
|
---|
694 | if (qd->awd)
|
---|
695 | FreeWeights (qd->awd);
|
---|
696 | FreeStemDict (qd->sd);
|
---|
697 | close_all_files (qd);
|
---|
698 | delete (qd->textpathname); /* [RJM 06/97: text filename] */
|
---|
699 | delete (qd->pathname);
|
---|
700 | QueryData_FreeQueryDocs (qd);
|
---|
701 | if (qd->TL != NULL) TermList_destroy(&qd->TL); /* [RJM 07/98: Memory Leak] */
|
---|
702 | if (qd->QTL != NULL) QueryTermList_free(&qd->QTL); /* [RJM 07/98: Memory Leak] */
|
---|
703 | delete qd;
|
---|
704 |
|
---|
705 | /* other global stuff hanging around */
|
---|
706 | MgErrorDeinit ();
|
---|
707 | }
|
---|
708 |
|
---|
709 |
|
---|
710 | void
|
---|
711 | ResetFileStats (query_data * qd)
|
---|
712 | {
|
---|
713 | if (qd->File_text)
|
---|
714 | ZeroFileStats (qd->File_text);
|
---|
715 | if (qd->File_comp_dict)
|
---|
716 | ZeroFileStats (qd->File_comp_dict);
|
---|
717 | if (qd->File_fast_comp_dict)
|
---|
718 | ZeroFileStats (qd->File_fast_comp_dict);
|
---|
719 | ZeroFileStats (qd->File_stem);
|
---|
720 |
|
---|
721 | /* [RPAP - Jan 97: Stem Index Change] */
|
---|
722 | if (qd->File_stem1)
|
---|
723 | ZeroFileStats (qd->File_stem1);
|
---|
724 | if (qd->File_stem2)
|
---|
725 | ZeroFileStats (qd->File_stem2);
|
---|
726 | if (qd->File_stem3)
|
---|
727 | ZeroFileStats (qd->File_stem3);
|
---|
728 |
|
---|
729 | ZeroFileStats (qd->File_invf);
|
---|
730 | if (qd->File_text_idx_wgt)
|
---|
731 | ZeroFileStats (qd->File_text_idx_wgt);
|
---|
732 | if (qd->File_weight_approx)
|
---|
733 | ZeroFileStats (qd->File_weight_approx);
|
---|
734 | if (qd->File_text_idx)
|
---|
735 | ZeroFileStats (qd->File_text_idx);
|
---|
736 | }
|
---|
737 |
|
---|
738 |
|
---|
739 | void
|
---|
740 | TransFileStats (query_data * qd)
|
---|
741 | {
|
---|
742 | qd->File_text->Current = qd->File_text->Cumulative;
|
---|
743 | if (qd->File_comp_dict)
|
---|
744 | qd->File_comp_dict->Current = qd->File_comp_dict->Cumulative;
|
---|
745 | if (qd->File_fast_comp_dict)
|
---|
746 | qd->File_fast_comp_dict->Current = qd->File_fast_comp_dict->Cumulative;
|
---|
747 | qd->File_stem->Current = qd->File_stem->Cumulative;
|
---|
748 |
|
---|
749 | /* [RPAP - Jan 97: Stem Index Change] */
|
---|
750 | if (qd->File_stem1)
|
---|
751 | qd->File_stem1->Current = qd->File_stem1->Cumulative;
|
---|
752 | if (qd->File_stem2)
|
---|
753 | qd->File_stem2->Current = qd->File_stem2->Cumulative;
|
---|
754 | if (qd->File_stem3)
|
---|
755 | qd->File_stem3->Current = qd->File_stem3->Cumulative;
|
---|
756 |
|
---|
757 | qd->File_invf->Current = qd->File_invf->Cumulative;
|
---|
758 | if (qd->File_text_idx_wgt)
|
---|
759 | qd->File_text_idx_wgt->Current = qd->File_text_idx_wgt->Cumulative;
|
---|
760 | if (qd->File_weight_approx)
|
---|
761 | qd->File_weight_approx->Current = qd->File_weight_approx->Cumulative;
|
---|
762 | if (qd->File_text_idx)
|
---|
763 | qd->File_text_idx->Current = qd->File_text_idx->Cumulative;
|
---|
764 | }
|
---|
765 |
|
---|
766 |
|
---|
/**
 * Block: Query Document handling;
 *        Provides basic loading/unloading facilities for individual
 *        documents, usually for presentation purposes (searches
 *        are performed on the inverted files).
 */
773 |
|
---|
774 | void
|
---|
775 | QueryData_FreeTextBuffer (query_data * qd)
|
---|
776 | {
|
---|
777 | if (qd->TextBuffer)
|
---|
778 | {
|
---|
779 | delete qd->TextBuffer;
|
---|
780 | ChangeMemInUse (qd, -qd->TextBufferLen);
|
---|
781 | }
|
---|
782 | qd->TextBuffer = NULL;
|
---|
783 | qd->TextBufferLen = 0;
|
---|
784 | }
|
---|
785 |
|
---|
786 | void
|
---|
787 | QueryData_FreeQueryDocs (query_data * qd)
|
---|
788 | {
|
---|
789 | qd->doc_pos = 0;
|
---|
790 | qd->buf_in_use = 0;
|
---|
791 | if (qd->DL)
|
---|
792 | {
|
---|
793 | //int i;
|
---|
794 | DocList_FreeTextBuffers(qd->DL, qd);
|
---|
795 | /*
|
---|
796 | for (i = 0; i < qd->DL->size(); i++)
|
---|
797 | DocEntry_FreeTextBuffer(&qd->DL->DE[i], qd);
|
---|
798 |
|
---|
799 | if (qd->DL->DE[i].CompTextBuffer)
|
---|
800 | {
|
---|
801 | delete qd->DL->DE[i].CompTextBuffer;
|
---|
802 | qd->DL->DE[i].CompTextBuffer = NULL;
|
---|
803 | ChangeMemInUse (qd, -qd->DL->DE[i].Len);
|
---|
804 | }
|
---|
805 | */
|
---|
806 | DocList_destroy(qd->DL);
|
---|
807 | }
|
---|
808 | qd->DL = NULL;
|
---|
809 | QueryData_FreeTextBuffer (qd);
|
---|
810 | }
|
---|
811 |
|
---|
812 | int
|
---|
813 | LoadCompressedText (query_data * qd, int max_mem)
|
---|
814 | {
|
---|
815 | DocEntry *DE;
|
---|
816 |
|
---|
817 | if (qd->DL == NULL || qd->doc_pos >= qd->DL->size())
|
---|
818 | return -1;
|
---|
819 |
|
---|
820 | /**
|
---|
821 | * GRB: added test for td (text data) per addition of loading
|
---|
822 | * indexes only; td may now be NULL; this returns a
|
---|
823 | * special value to indicated that the text was not
|
---|
824 | * loaded; mgquery etc now test on that value
|
---|
825 | * 17/09/99
|
---|
826 | */
|
---|
827 | if (qd->td == NULL)
|
---|
828 | return -2;
|
---|
829 |
|
---|
830 | DE = qd->DL->member(qd->doc_pos);
|
---|
831 |
|
---|
832 | if (!DocEntry_TextLoaded(DE))
|
---|
833 | {
|
---|
834 | //int i;
|
---|
835 | //DocEntry *de;
|
---|
836 |
|
---|
837 | DocList_FreeTextBuffers(qd->DL, qd);
|
---|
838 | /*
|
---|
839 | for (i = 0, de = qd->DL->DE; i < qd->DL->size(); i++, de++)
|
---|
840 | DocEntry_FreeTextBuffer(de);
|
---|
841 | */
|
---|
842 |
|
---|
843 | if (LoadBuffers (qd, qd->DL->member(qd->doc_pos), max_mem,
|
---|
844 | qd->DL->size() - qd->doc_pos) == -1)
|
---|
845 | return -1;
|
---|
846 | }
|
---|
847 | return 0;
|
---|
848 | }
|
---|
849 |
|
---|
850 |
|
---|
851 | int
|
---|
852 | GetDocNum (query_data * qd)
|
---|
853 | {
|
---|
854 | if (qd->DL == NULL || qd->doc_pos >= qd->DL->size())
|
---|
855 | return -1;
|
---|
856 | return qd->DL->member(qd->doc_pos)->docNum();
|
---|
857 | }
|
---|
858 |
|
---|
859 | DocEntry *
|
---|
860 | GetDocChain (query_data * qd)
|
---|
861 | {
|
---|
862 | if (qd->DL == NULL || qd->doc_pos >= qd->DL->size())
|
---|
863 | return NULL;
|
---|
864 | return qd->DL->member(qd->doc_pos);
|
---|
865 | }
|
---|
866 |
|
---|
867 | float
|
---|
868 | GetDocWeight (query_data * qd)
|
---|
869 | {
|
---|
870 | if (qd->DL == NULL || qd->doc_pos >= qd->DL->size())
|
---|
871 | return -1;
|
---|
872 | return qd->DL->member(qd->doc_pos)->docWeight();
|
---|
873 | }
|
---|
874 |
|
---|
875 | long
|
---|
876 | GetDocCompLength (query_data * qd)
|
---|
877 | {
|
---|
878 | if (qd->DL == NULL || qd->doc_pos >= qd->DL->size())
|
---|
879 | return -1;
|
---|
880 | return qd->DL->DE[qd->doc_pos].Len;
|
---|
881 | }
|
---|
882 |
|
---|
883 |
|
---|
884 | u_char *
|
---|
885 | GetDocText (query_data * qd, unsigned long *len)
|
---|
886 | {
|
---|
887 | DocEntry *DE;
|
---|
888 | int ULen;
|
---|
889 | if (qd->DL == NULL || qd->doc_pos >= qd->DL->size())
|
---|
890 | return NULL;
|
---|
891 |
|
---|
892 | DE = qd->DL->member(qd->doc_pos);
|
---|
893 |
|
---|
894 | if (!DocEntry_TextLoaded(DE))
|
---|
895 | {
|
---|
896 | fprintf (stderr, "The compressed text buffer is NULL\n");
|
---|
897 | mg_errno = MG_NOMEM;
|
---|
898 | return (NULL);
|
---|
899 | }
|
---|
900 |
|
---|
901 | QueryData_FreeTextBuffer (qd);
|
---|
902 |
|
---|
903 | qd->TextBufferLen = (int) (qd->td->cth.ratio * 1.01 *
|
---|
904 | DocEntry_length(DE)) + 100;
|
---|
905 | if (!(qd->TextBuffer = new unsigned char[qd->TextBufferLen]))
|
---|
906 | {
|
---|
907 | fprintf (stderr, "No memory for TextBuffer\n");
|
---|
908 | mg_errno = MG_NOMEM;
|
---|
909 | return (NULL);
|
---|
910 | }
|
---|
911 |
|
---|
912 | DecodeText (qd->cd, (u_char *) DocEntry_TextBuffer(DE), DE->Len,
|
---|
913 | (u_char *) (qd->TextBuffer), &ULen);
|
---|
914 | qd->TextBuffer[ULen] = '\0';
|
---|
915 |
|
---|
916 | if (ULen >= qd->TextBufferLen)
|
---|
917 | {
|
---|
918 | fprintf (stderr, "%d >= %d\n", ULen, qd->TextBufferLen);
|
---|
919 | mg_errno = MG_BUFTOOSMALL;
|
---|
920 | return (NULL);
|
---|
921 | }
|
---|
922 |
|
---|
923 | if (len)
|
---|
924 | *len = ULen;
|
---|
925 |
|
---|
926 | return qd->TextBuffer;
|
---|
927 | }
|
---|
928 |
|
---|
929 | int
|
---|
930 | NextDoc (query_data * qd)
|
---|
931 | {
|
---|
932 | if (qd->DL == NULL || qd->doc_pos >= qd->DL->size())
|
---|
933 | return 0;
|
---|
934 | qd->doc_pos++;
|
---|
935 | return qd->doc_pos < qd->DL->size();
|
---|
936 | }
|
---|
937 |
|
---|
938 |
|
---|