1 | /**********************************************************************
|
---|
2 | *
|
---|
3 | * mgq.c -- cut-down version of mgquery
|
---|
4 | * Copyright (C) 1999 The New Zealand Digital Library Project
|
---|
5 | *
|
---|
6 | * A component of the Greenstone digital library software
|
---|
7 | * from the New Zealand Digital Library Project at the
|
---|
8 | * University of Waikato, New Zealand.
|
---|
9 | *
|
---|
10 | * This program is free software; you can redistribute it and/or modify
|
---|
11 | * it under the terms of the GNU General Public License as published by
|
---|
12 | * the Free Software Foundation; either version 2 of the License, or
|
---|
13 | * (at your option) any later version.
|
---|
14 | *
|
---|
15 | * This program is distributed in the hope that it will be useful,
|
---|
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
18 | * GNU General Public License for more details.
|
---|
19 | *
|
---|
20 | * You should have received a copy of the GNU General Public License
|
---|
21 | * along with this program; if not, write to the Free Software
|
---|
22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
23 | *
|
---|
24 | * $Id: mgq.c 561 1999-09-09 04:12:22Z sjboddie $
|
---|
25 | *
|
---|
26 | *********************************************************************/
|
---|
27 |
|
---|
28 | /*
|
---|
29 | $Log$
|
---|
30 | Revision 1.8 1999/09/09 04:12:22 sjboddie
|
---|
31 | added GPL header
|
---|
32 |
|
---|
33 | Revision 1.7 1999/07/01 09:29:18 rjmcnab
|
---|
34 | Changes for better reporting of number documents which match a query. Changes
|
---|
35 | should still work as before with older versions of mg.
|
---|
36 |
|
---|
37 | Revision 1.6 1999/07/01 03:52:05 rjmcnab
|
---|
38 | Added a function to get the equivalent terms of a query term. I also
|
---|
39 | fixed a small bug that was causing massive slowdown :-^
|
---|
40 |
|
---|
41 | Revision 1.5 1999/06/30 04:04:11 rjmcnab
|
---|
42 | made stemming functions available from mgsearch and made the stems
|
---|
43 | for the query terms available in queryinfo
|
---|
44 |
|
---|
45 | Revision 1.4 1999/06/28 08:56:29 rjmcnab
|
---|
46 | A bit of hacking to remove the restriction that the index to get
|
---|
47 | a document must be a level 2 index. Now both level 2 and level 3
|
---|
48 | indexes can be used to get the text of a document.
|
---|
49 |
|
---|
50 | Revision 1.3 1999/01/19 01:38:16 rjmcnab
|
---|
51 |
|
---|
52 | Made the source more portable.
|
---|
53 |
|
---|
54 | Revision 1.2 1999/01/12 01:51:02 rjmcnab
|
---|
55 |
|
---|
56 | Standard header.
|
---|
57 |
|
---|
58 | Revision 1.1 1999/01/08 09:02:22 rjmcnab
|
---|
59 |
|
---|
60 | Moved from src/library.
|
---|
61 |
|
---|
62 | */
|
---|
63 |
|
---|
64 |
|
---|
65 | #include "mgq.h"
|
---|
66 |
|
---|
67 |
|
---|
68 | #include <stdio.h>
|
---|
69 | #include <string.h>
|
---|
70 | /* #include <io.h> */
|
---|
71 | #include <fcntl.h>
|
---|
72 |
|
---|
73 | #ifdef __cplusplus
|
---|
74 | extern "C" {
|
---|
75 | #endif
|
---|
76 |
|
---|
77 | #include "sysfuncs.h"
|
---|
78 |
|
---|
79 | #include "messages.h"
|
---|
80 | #include "memlib.h"
|
---|
81 |
|
---|
82 | #include "invf.h"
|
---|
83 | #include "text.h"
|
---|
84 | #include "lists.h"
|
---|
85 | #include "backend.h"
|
---|
86 | #include "environment.h"
|
---|
87 | #include "globals.h"
|
---|
88 | #include "mg_errors.h"
|
---|
89 | #include "commands.h"
|
---|
90 | #include "text_get.h"
|
---|
91 | #include "term_lists.h"
|
---|
92 | #include "local_strings.h"
|
---|
93 |
|
---|
94 | #include "words.h"
|
---|
95 | #include "stemmer.h"
|
---|
96 | #include "stem_search.h"
|
---|
97 |
|
---|
98 | #ifdef __cplusplus
|
---|
99 | }
|
---|
100 | #endif
|
---|
101 |
|
---|
102 |
|
---|
103 | #include "mgq.h"
|
---|
104 |
|
---|
/*
 * Number of entries kept in the database cache.  The value can be
 * supplied at compile time; when it is not, builds with GSDLSERVER
 * defined keep 10 entries and all other builds keep 2.
 */
#if !defined(MAXNUMDATABASEINFO)
# if defined(GSDLSERVER)
#  define MAXNUMDATABASEINFO 10
# else
#  define MAXNUMDATABASEINFO 2
# endif
#endif

/* sizes of the character arrays held in each database cache entry */
#define MAXCOLLECTIONLEN 16
#define MAXMGDIRLEN 256
#define MAXGENSUFFIXLEN 256
#define MAXTEXTSUFFIXLEN 256
118 |
|
---|
119 | typedef struct DatabaseInfo {
|
---|
120 | int accessnum; /* -1 = invalid record */
|
---|
121 | char collection[MAXCOLLECTIONLEN];
|
---|
122 | char mgdir[MAXMGDIRLEN];
|
---|
123 | char gensuffix[MAXGENSUFFIXLEN];
|
---|
124 | char textsuffix[MAXTEXTSUFFIXLEN];
|
---|
125 | query_data *qd;
|
---|
126 | } DatabaseInfo;
|
---|
127 |
|
---|
128 |
|
---|
129 | /* globals needed by some vague part of mg... */
|
---|
130 | FILE *OutFile = NULL, *InFile = NULL;
|
---|
131 | int OutPipe = 0, InPipe = 0;
|
---|
132 | int Quitting = 0;
|
---|
133 |
|
---|
134 | /* globals needed to handle loading of databases */
|
---|
135 | static int cur_cachenum = -1;
|
---|
136 |
|
---|
137 | /* globals needed by the database cache */
|
---|
138 | static DatabaseInfo dbcache[MAXNUMDATABASEINFO];
|
---|
139 | static int cache_nextaccessnum = 0;
|
---|
140 | static int cache_numloaded = 0;
|
---|
141 |
|
---|
142 |
|
---|
143 |
|
---|
144 | #if defined(PARADOCNUM) || defined(NZDL)
|
---|
145 | static int GetDocNumFromParaNum(query_data *qd, int paranum) {
|
---|
146 | int Documents = qd->td->cth.num_of_docs;
|
---|
147 | int *Paragraph = qd->paragraph;
|
---|
148 | int low = 1, high = Documents;
|
---|
149 | int mid = (low+high)/2;
|
---|
150 |
|
---|
151 | while ((mid = (low+high)/2) >=1 && mid <= Documents)
|
---|
152 | {
|
---|
153 | if (paranum > Paragraph[mid])
|
---|
154 | low = mid+1;
|
---|
155 | else if (paranum <= Paragraph[mid-1])
|
---|
156 | high = mid-1;
|
---|
157 | else
|
---|
158 | return mid;
|
---|
159 | }
|
---|
160 | FatalError(1, "Bad paragraph number.\n");
|
---|
161 | return 0;
|
---|
162 | }
|
---|
163 |
|
---|
164 | static int GetParaNumFromDocNum(query_data *qd, int docnum) {
|
---|
165 | int Documents = qd->td->cth.num_of_docs;
|
---|
166 | int *Paragraph = qd->paragraph;
|
---|
167 |
|
---|
168 | if (docnum > 0 && docnum <= Documents)
|
---|
169 | return Paragraph[docnum-1]+1;
|
---|
170 | return 0;
|
---|
171 | }
|
---|
172 | #endif
|
---|
173 |
|
---|
174 |
|
---|
175 | /*****************************************************************************/
|
---|
176 |
|
---|
/* MGQError writes "Fatal error: <emsg>" and a newline to stderr and */
/* then terminates the program with exit status 1; it never returns. */
static void MGQError(char *emsg)
{
  fputs("Fatal error: ", stderr);
  fputs(emsg, stderr);
  fputc('\n', stderr);
  exit(1);
}
182 |
|
---|
183 | static int ProcessDocs (query_data * qd, int skip, int howmany,
|
---|
184 | enum result_kinds kind,
|
---|
185 | int (*sender)(char *,int,int,float,void *), void *ptr) {
|
---|
186 | int max_buf = 0, output_failure = 0;
|
---|
187 | int DocCount = 0;
|
---|
188 | int need_text = (kind == result_docs);
|
---|
189 |
|
---|
190 | /* skip the requested number of documents */
|
---|
191 | while (skip > 0) {
|
---|
192 | if (!NextDoc(qd)) return 0;
|
---|
193 | skip--;
|
---|
194 | }
|
---|
195 |
|
---|
196 | /* find out the maximum size for the text buffer */
|
---|
197 | if (need_text) max_buf = atoi (GetDefEnv ("buffer", "1048576"));
|
---|
198 |
|
---|
199 | /* process each document */
|
---|
200 | do {
|
---|
201 | u_char *UDoc = NULL;
|
---|
202 | unsigned long ULen=0;
|
---|
203 |
|
---|
204 | #if defined(PARADOCNUM) || defined(NZDL)
|
---|
205 | /* adjust the document number for paragraph level result_docs */
|
---|
206 | /* this is a bit of a hack ... */
|
---|
207 | if (kind==result_docs && qd->id->ifh.InvfLevel == 3 &&
|
---|
208 | qd->DL != NULL && (int)qd->doc_pos < (int)qd->DL->num)
|
---|
209 | qd->DL->DE[qd->doc_pos].DocNum = GetParaNumFromDocNum(qd, qd->DL->DE[qd->doc_pos].DocNum);
|
---|
210 | #endif
|
---|
211 |
|
---|
212 | if (need_text) {
|
---|
213 | /* load the compressed text */
|
---|
214 | if (LoadCompressedText (qd, max_buf))
|
---|
215 | MGQError("Unable to load compressed text(memory?).");
|
---|
216 |
|
---|
217 | /* uncompress the loaded text */
|
---|
218 | UDoc = GetDocText (qd, &ULen);
|
---|
219 | if (UDoc == NULL) MGQError("UDoc is unexpectedly NULL");
|
---|
220 | }
|
---|
221 |
|
---|
222 | if (UDoc != NULL || kind == result_docnums) {
|
---|
223 | int docnum = GetDocNum(qd);
|
---|
224 | #if defined(PARADOCNUM) || defined(NZDL)
|
---|
225 | if (qd->id->ifh.InvfLevel == 3) docnum = GetDocNumFromParaNum(qd, docnum);
|
---|
226 | #endif
|
---|
227 | switch (kind) {
|
---|
228 | case result_docnums:
|
---|
229 | if (sender != NULL)
|
---|
230 | output_failure = (*sender)("",0,docnum,GetDocWeight(qd),ptr);
|
---|
231 | break;
|
---|
232 | case result_docs:
|
---|
233 | if (sender != NULL)
|
---|
234 | output_failure = (*sender)((char *)UDoc,ULen,docnum,GetDocWeight(qd),ptr);
|
---|
235 | break;
|
---|
236 | default:
|
---|
237 | break;
|
---|
238 | }
|
---|
239 | }
|
---|
240 | DocCount++;
|
---|
241 |
|
---|
242 | } while (NextDoc (qd) && output_failure == 0 && --howmany > 0);
|
---|
243 |
|
---|
244 | /* if (need_text) FreeTextBuffer (qd);*/
|
---|
245 |
|
---|
246 | return (DocCount);
|
---|
247 | }
|
---|
248 |
|
---|
249 |
|
---|
250 | static void send_query_term_freqs(QueryTermList *qtl,
|
---|
251 | int (*sender)(char *,int,int,float,void *), void *ptr)
|
---|
252 | {
|
---|
253 | int i = 0;
|
---|
254 | for (i = 0; i < qtl->num; i++)
|
---|
255 | if (sender != NULL) {
|
---|
256 | /* word = word2str(qtl->QTE[i].Term);
|
---|
257 | (* sender)(word, strlen(word), qtl->QTE[i].Count, (float)0.0, ptr); */
|
---|
258 | (* sender)((char *)(qtl->QTE[i].Term+1), qtl->QTE[i].Term[0],
|
---|
259 | qtl->QTE[i].Count, (float)0.0, ptr);
|
---|
260 | }
|
---|
261 | }
|
---|
262 |
|
---|
263 |
|
---|
264 | static void send_terms (TermList * qtl,
|
---|
265 | int (*sender)(char *,int,int,float,void *), void *ptr)
|
---|
266 | {
|
---|
267 | int i = 0;
|
---|
268 | if (sender == NULL) return;
|
---|
269 | for (i = 0; i < qtl->num; i++)
|
---|
270 | {
|
---|
271 | /* word = word2str(qtl->TE[i].Word);
|
---|
272 | (* sender)(word, strlen(word), qtl->TE[i].Count, (float)0.0, ptr);*/
|
---|
273 | (* sender)((char *)(qtl->TE[i].Word+1), qtl->TE[i].Word[0],
|
---|
274 | qtl->TE[i].Count, (float)0.0, ptr);
|
---|
275 | }
|
---|
276 | }
|
---|
277 |
|
---|
278 |
|
---|
279 | /* MoreDocs () */
|
---|
280 | /* Displays all documents in list DocList. */
|
---|
281 | /* Documents are fetched, then decompressed and displayed according to the */
|
---|
282 | /* format implied in FormString(). */
|
---|
283 |
|
---|
284 | static void
|
---|
285 | MoreDocs (query_data * qd, enum result_kinds kind,
|
---|
286 | int skip, int howmany,
|
---|
287 | int (*sender)(char *,int,int,float,void *), void *ptr)
|
---|
288 | {
|
---|
289 | qd->num_of_ans = qd->DL->num;
|
---|
290 | switch (kind) {
|
---|
291 | case result_docs:
|
---|
292 | case result_docnums:
|
---|
293 | if (qd->num_of_ans > 0)
|
---|
294 | ProcessDocs (qd, skip, howmany, kind, sender, ptr);
|
---|
295 | break;
|
---|
296 | case result_termfreqs:
|
---|
297 | send_query_term_freqs(qd->QTL, sender, ptr);
|
---|
298 | break;
|
---|
299 | case result_terms:
|
---|
300 | send_terms(qd->TL, sender, ptr);
|
---|
301 | break;
|
---|
302 | }
|
---|
303 | }
|
---|
304 |
|
---|
305 |
|
---|
306 |
|
---|
307 |
|
---|
308 |
|
---|
309 |
|
---|
310 | /******************************************
|
---|
311 | * functions to handle the database cache *
|
---|
312 | ******************************************/
|
---|
313 |
|
---|
314 | /* init_dbcache should be called at the start of each */
|
---|
315 | /* function which deals with the database cache */
|
---|
316 | static void init_dbcache (void) {
|
---|
317 | static int dbcacheinited = 0;
|
---|
318 | int i = 0;
|
---|
319 |
|
---|
320 | if (dbcacheinited) return;
|
---|
321 |
|
---|
322 | cache_numloaded = 0;
|
---|
323 |
|
---|
324 | for (i=0; i<MAXNUMDATABASEINFO; i++) {
|
---|
325 | dbcache[i].accessnum = -1;
|
---|
326 | dbcache[i].collection[0] = '\0';
|
---|
327 | dbcache[i].mgdir[0] = '\0';
|
---|
328 | dbcache[i].gensuffix[0] = '\0';
|
---|
329 | dbcache[i].textsuffix[0] = '\0';
|
---|
330 | dbcache[i].qd = NULL;
|
---|
331 | }
|
---|
332 |
|
---|
333 | dbcacheinited = 1;
|
---|
334 | }
|
---|
335 |
|
---|
336 | /* returns the next cache access number and increments it */
|
---|
337 | static int get_next_accessnum (void) {
|
---|
338 | return cache_nextaccessnum++;
|
---|
339 | }
|
---|
340 |
|
---|
341 | /* get_free_dbcache returns the cache number which */
|
---|
342 | /* was used the longest time ago */
|
---|
343 | /* init_dbcache should be called before this function */
|
---|
344 | static int get_free_dbcache (void) {
|
---|
345 | int i = 0;
|
---|
346 | int minaccessnum = cache_nextaccessnum; /* the current max */
|
---|
347 | int minpos = 0;
|
---|
348 |
|
---|
349 | for (i=0; i<MAXNUMDATABASEINFO; i++) {
|
---|
350 | if (dbcache[i].accessnum < minaccessnum) {
|
---|
351 | minaccessnum = dbcache[i].accessnum;
|
---|
352 | minpos = i;
|
---|
353 | }
|
---|
354 | }
|
---|
355 |
|
---|
356 | return minpos;
|
---|
357 | }
|
---|
358 |
|
---|
359 | /* search_collect will search for an index which */
|
---|
360 | /* belongs to a certain collection It returns -1 if none could be found. */
|
---|
361 | /* init_dbcache should be called before this function */
|
---|
362 | static int search_collect (char *collection) {
|
---|
363 | int i = 0;
|
---|
364 |
|
---|
365 | for (i=0; i<MAXNUMDATABASEINFO; i++) {
|
---|
366 | if ((dbcache[i].accessnum >= 0) &&
|
---|
367 | (dbcache[i].qd != NULL) &&
|
---|
368 | (strcmp (collection, dbcache[i].collection) == 0)
|
---|
369 | /* && (dbcache[i].qd->id->ifh.InvfLevel == 2)*/
|
---|
370 | ) {
|
---|
371 | dbcache[i].accessnum = get_next_accessnum ();
|
---|
372 | return i;
|
---|
373 | }
|
---|
374 | }
|
---|
375 |
|
---|
376 | return -1;
|
---|
377 | }
|
---|
378 |
|
---|
379 | /* search_gensuffix will search for an index which */
|
---|
380 | /* has a certain gensuffix. It returns -1 if none could be found. */
|
---|
381 | /* init_dbcache should be called before this function */
|
---|
382 | static int search_gensuffix (char *gensuffix) {
|
---|
383 | int i = 0;
|
---|
384 |
|
---|
385 | for (i=0; i<MAXNUMDATABASEINFO; i++) {
|
---|
386 | if ((dbcache[i].accessnum >= 0) &&
|
---|
387 | (dbcache[i].qd != NULL) &&
|
---|
388 | (strcmp (gensuffix, dbcache[i].gensuffix) == 0)) {
|
---|
389 | dbcache[i].accessnum = get_next_accessnum ();
|
---|
390 | return i;
|
---|
391 | }
|
---|
392 | }
|
---|
393 |
|
---|
394 | return -1;
|
---|
395 | }
|
---|
396 |
|
---|
397 | /* unload_database will unload a certain entry within */
|
---|
398 | /* the database cache, clearing it for furture use. */
|
---|
399 | static void unload_database (int i) {
|
---|
400 | /* check to see if it contains anything */
|
---|
401 | if (dbcache[i].accessnum < 0 || dbcache[i].qd == NULL)
|
---|
402 | return;
|
---|
403 |
|
---|
404 | /* unload all the query information */
|
---|
405 | FinishQuerySystem(dbcache[i].qd);
|
---|
406 |
|
---|
407 | /* reset all the db info */
|
---|
408 | dbcache[i].accessnum = -1;
|
---|
409 | dbcache[i].collection[0] = '\0';
|
---|
410 | dbcache[i].mgdir[0] = '\0';
|
---|
411 | dbcache[i].gensuffix[0] = '\0';
|
---|
412 | dbcache[i].textsuffix[0] = '\0';
|
---|
413 | dbcache[i].qd = NULL;
|
---|
414 |
|
---|
415 | cache_numloaded--;
|
---|
416 | if (cache_numloaded < 0) cache_numloaded = 0;
|
---|
417 | }
|
---|
418 |
|
---|
419 | /* cache_database will store the information about */
|
---|
420 | /* a database in the database cache. */
|
---|
421 | static void cache_database (int i, char *collection, char *mgdir, char *gensuffix,
|
---|
422 | char *textsuffix, query_data *qd) {
|
---|
423 | /* make sure this entry has been unloaded first */
|
---|
424 | if (dbcache[i].accessnum >= 0 && dbcache[i].qd != NULL)
|
---|
425 | unload_database (i);
|
---|
426 |
|
---|
427 | /* store the db info */
|
---|
428 | dbcache[i].accessnum = get_next_accessnum ();
|
---|
429 | strcpy (dbcache[i].collection, collection);
|
---|
430 | strcpy (dbcache[i].mgdir, mgdir);
|
---|
431 | strcpy (dbcache[i].gensuffix, gensuffix);
|
---|
432 | strcpy (dbcache[i].textsuffix, textsuffix);
|
---|
433 | dbcache[i].qd = qd;
|
---|
434 |
|
---|
435 | cache_numloaded++;
|
---|
436 | }
|
---|
437 |
|
---|
438 | static void make_current (int i) {
|
---|
439 | /* see if it is the current index */
|
---|
440 | if (i == cur_cachenum) return;
|
---|
441 |
|
---|
442 | /* unload the old index */
|
---|
443 | if (cur_cachenum >= 0) UninitEnv ();
|
---|
444 | cur_cachenum = -1;
|
---|
445 |
|
---|
446 | /* make sure the new one is ok */
|
---|
447 | if (i < 0 || dbcache[i].accessnum < 0 || dbcache[i].qd == NULL)
|
---|
448 | return;
|
---|
449 |
|
---|
450 | /* load the new one */
|
---|
451 |
|
---|
452 | /* Initialise the environment with default values */
|
---|
453 | InitEnv ();
|
---|
454 |
|
---|
455 | SetEnv("mgdir",dbcache[i].mgdir,NULL);
|
---|
456 | SetEnv("mgname",dbcache[i].gensuffix,NULL);
|
---|
457 | SetEnv("textname",dbcache[i].textsuffix,NULL);
|
---|
458 |
|
---|
459 | PushEnv ();
|
---|
460 |
|
---|
461 | cur_cachenum = i;
|
---|
462 | }
|
---|
463 |
|
---|
464 |
|
---|
465 |
|
---|
466 | /********************
|
---|
467 | * public functions *
|
---|
468 | ********************/
|
---|
469 |
|
---|
470 | int mgq_ask(char *line)
|
---|
471 | {
|
---|
472 | query_data *qd = (query_data *)NULL;
|
---|
473 | char QueryType = QUERY_BOOLEAN;
|
---|
474 | char OutputType = QUERY_DOCNUMS;
|
---|
475 | char *LinePtr = (char *)NULL;
|
---|
476 |
|
---|
477 | if (cur_cachenum == -1) return 0;
|
---|
478 | qd = dbcache[cur_cachenum].qd;
|
---|
479 | if (qd == NULL) return 0;
|
---|
480 |
|
---|
481 | ResetFileStats (qd);
|
---|
482 | qd->max_mem_in_use = qd->mem_in_use = 0;
|
---|
483 | qd->tot_hops_taken += qd->hops_taken;
|
---|
484 | qd->tot_num_of_ptrs += qd->num_of_ptrs;
|
---|
485 | qd->tot_num_of_accum += qd->num_of_accum;
|
---|
486 | qd->tot_num_of_terms += qd->num_of_terms;
|
---|
487 | qd->tot_num_of_ans += qd->num_of_ans;
|
---|
488 | qd->tot_text_idx_lookups += qd->text_idx_lookups;
|
---|
489 | qd->hops_taken = qd->num_of_ptrs = 0;
|
---|
490 | qd->num_of_accum = qd->num_of_ans = qd->num_of_terms = 0;
|
---|
491 | qd->text_idx_lookups = 0;
|
---|
492 |
|
---|
493 | LinePtr = ProcessCommands (line, qd);
|
---|
494 | if (CommandsErrorStr) {
|
---|
495 | fprintf (stderr, "%s\n", CommandsErrorStr);
|
---|
496 | return 0;
|
---|
497 | }
|
---|
498 | if (*LinePtr == '\0') return 1;
|
---|
499 |
|
---|
500 | FreeQueryDocs (qd);
|
---|
501 |
|
---|
502 | QueryType = get_query_type ();
|
---|
503 | OutputType = get_output_type ();
|
---|
504 | /* No point in hiliting words on a docnum query */
|
---|
505 | if (OutputType == OUTPUT_HILITE && QueryType == QUERY_DOCNUMS)
|
---|
506 | OutputType = OUTPUT_TEXT;
|
---|
507 |
|
---|
508 | switch (QueryType)
|
---|
509 | {
|
---|
510 | case QUERY_BOOLEAN:
|
---|
511 | {
|
---|
512 | char *maxdocs = (char *)NULL;
|
---|
513 | BooleanQueryInfo bqi;
|
---|
514 | maxdocs = GetDefEnv ("maxdocs", "all");
|
---|
515 | bqi.MaxDocsToRetrieve = strcmp (maxdocs, "all") ? atoi (maxdocs) : -1;
|
---|
516 | if (qd->sd->sdh.indexed)
|
---|
517 | BooleanQuery (qd, line, &bqi, (BooleanEnv (GetEnv ("casefold"), 0) |
|
---|
518 | (BooleanEnv (GetEnv ("stem"), 0) << 1)));
|
---|
519 | else
|
---|
520 | BooleanQuery (qd, line, &bqi, qd->sd->sdh.stem_method);
|
---|
521 | /* if (qd->sd->sdh.indexed) BooleanQuery (qd, line, &bqi, 3);
|
---|
522 | else BooleanQuery (qd, line, &bqi, qd->sd->sdh.stem_method); */
|
---|
523 | break;
|
---|
524 | }
|
---|
525 | case QUERY_APPROX:
|
---|
526 | case QUERY_RANKED:
|
---|
527 | {
|
---|
528 | char *maxdocs = (char *)NULL;
|
---|
529 | char *maxterms = (char *)NULL;
|
---|
530 | char *maxaccum = (char *)NULL;
|
---|
531 | RankedQueryInfo rqi;
|
---|
532 | maxdocs = GetDefEnv ("maxdocs", "all");
|
---|
533 | maxterms = GetDefEnv ("max_terms", "all");
|
---|
534 | maxaccum = GetDefEnv ("max_accumulators", "all");
|
---|
535 | rqi.Sort = BooleanEnv (GetEnv ("sorted_terms"), 0);
|
---|
536 | rqi.QueryFreqs = BooleanEnv (GetEnv ("qfreq"), 1);
|
---|
537 | rqi.Exact = QueryType == QUERY_RANKED;
|
---|
538 | rqi.MaxDocsToRetrieve = strcmp (maxdocs, "all") ? atoi (maxdocs) : -1;
|
---|
539 | rqi.MaxTerms = strcmp (maxterms, "all") ? atoi (maxterms) : -1;
|
---|
540 | rqi.MaxParasToRetrieve = rqi.MaxDocsToRetrieve;
|
---|
541 | if (qd->id->ifh.InvfLevel == 3 && GetEnv ("maxparas"))
|
---|
542 | rqi.MaxParasToRetrieve = atoi (GetEnv ("maxparas"));
|
---|
543 | rqi.AccumMethod = toupper (*GetDefEnv ("accumulator_method", "A"));
|
---|
544 | rqi.MaxAccums = strcmp (maxaccum, "all") ? atoi (maxaccum) : -1;
|
---|
545 | rqi.HashTblSize = IntEnv (GetEnv ("hash_tbl_size"), 1000);
|
---|
546 | rqi.StopAtMaxAccum = BooleanEnv (GetEnv ("stop_at_max_accum"), 0);
|
---|
547 | rqi.skip_dump = GetEnv ("skip_dump");
|
---|
548 | RankedQuery (qd, line, &rqi);
|
---|
549 | break;
|
---|
550 | }
|
---|
551 | case QUERY_DOCNUMS:
|
---|
552 | {
|
---|
553 | DocnumsQuery (qd, line);
|
---|
554 | break;
|
---|
555 | }
|
---|
556 | }
|
---|
557 |
|
---|
558 | return 1;
|
---|
559 | }
|
---|
560 |
|
---|
561 | int mgq_numdocs(void)
|
---|
562 | {
|
---|
563 | query_data *qd = NULL;
|
---|
564 | if (cur_cachenum == -1) return 0;
|
---|
565 | qd = dbcache[cur_cachenum].qd;
|
---|
566 | if (qd == NULL) return 0;
|
---|
567 |
|
---|
568 | if (qd->DL) return qd->DL->num;
|
---|
569 | else return 0;
|
---|
570 | }
|
---|
571 |
|
---|
572 | int mgq_numterms(void)
|
---|
573 | {
|
---|
574 | query_data *qd = NULL;
|
---|
575 | if (cur_cachenum == -1) return 0;
|
---|
576 | qd = dbcache[cur_cachenum].qd;
|
---|
577 | if (qd == NULL) return 0;
|
---|
578 |
|
---|
579 | if (qd->QTL) return qd->QTL->num;
|
---|
580 | else return 0;
|
---|
581 | }
|
---|
582 |
|
---|
583 | int mgq_results(enum result_kinds kind,int skip,int howmany, int (*sender)(char *, int, int, float, void *), void *ptr)
|
---|
584 | {
|
---|
585 | query_data *qd = NULL;
|
---|
586 | if (cur_cachenum == -1) return 0;
|
---|
587 | qd = dbcache[cur_cachenum].qd;
|
---|
588 | if (qd == NULL) return 0;
|
---|
589 |
|
---|
590 | if (qd->DL) {
|
---|
591 | qd->doc_pos = 0;
|
---|
592 | MoreDocs(qd, kind, skip, howmany, sender, ptr);
|
---|
593 | }
|
---|
594 | return 0;
|
---|
595 | }
|
---|
596 |
|
---|
597 | /* get all the terms that match wordstem using the current stemmer and */
|
---|
598 | /* stemming method. The callback is the same style used by mgq_results */
|
---|
599 | int mgq_equivterms (unsigned char *wordstem, int (*sender)(char *, int, int, float, void *),
|
---|
600 | void *ptr) {
|
---|
601 | int stem_method = 0;
|
---|
602 | query_data *qd = NULL;
|
---|
603 | TermList *equivterms = NULL; /* used for equivalent terms */
|
---|
604 |
|
---|
605 | if (cur_cachenum == -1) return 0;
|
---|
606 | qd = dbcache[cur_cachenum].qd;
|
---|
607 | if (qd == NULL || wordstem == NULL || sender == NULL) return 0;
|
---|
608 |
|
---|
609 | if (qd->sd->sdh.indexed) {
|
---|
610 | stem_method = BooleanEnv(GetEnv("casefold"),0) | (BooleanEnv(GetEnv("stem"),0) << 1);
|
---|
611 | } else {
|
---|
612 | stem_method = qd->sd->sdh.stem_method;
|
---|
613 | }
|
---|
614 |
|
---|
615 | /* make the term list */
|
---|
616 | equivterms = MakeTermList (0);
|
---|
617 |
|
---|
618 | /* expand out this word */
|
---|
619 | if (FindWords (qd->sd, wordstem, stem_method, &equivterms) > 0) {
|
---|
620 | int i;
|
---|
621 | for (i=0; i < equivterms->num; i++) {
|
---|
622 | (* sender)((char *)(equivterms->TE[i].Word+1), equivterms->TE[i].Word[0],
|
---|
623 | equivterms->TE[i].Count, (float)0.0, ptr);
|
---|
624 | }
|
---|
625 | }
|
---|
626 |
|
---|
627 | /* free the term list */
|
---|
628 | if (equivterms != NULL) FreeTermList (&equivterms);
|
---|
629 |
|
---|
630 | return 0;
|
---|
631 | }
|
---|
632 |
|
---|
633 | /* gets the total number of documents retrieved. If this is not available */
|
---|
634 | /* it will set total_retrieved to 0 (even when it obviously isn't) */
|
---|
635 | int mgq_docsretrieved (int *total_retrieved, int *is_approx) {
|
---|
636 | query_data *qd = NULL;
|
---|
637 |
|
---|
638 | if (cur_cachenum == -1) return 0;
|
---|
639 | qd = dbcache[cur_cachenum].qd;
|
---|
640 | if (qd == NULL || total_retrieved == NULL || is_approx == NULL) return 0;
|
---|
641 |
|
---|
642 | /* set default values */
|
---|
643 | *total_retrieved = 0;
|
---|
644 | *is_approx = 0;
|
---|
645 |
|
---|
646 | if (qd->DL == NULL) return 0;
|
---|
647 |
|
---|
648 | *total_retrieved = qd->DL->total_retrieved;
|
---|
649 | *is_approx = qd->DL->is_approx;
|
---|
650 |
|
---|
651 | return 0;
|
---|
652 | }
|
---|
653 |
|
---|
654 |
|
---|
655 | /* use mgq_getmaxstemlen to determine the length of the word stems to pass */
|
---|
656 | /* to mgq_stemword */
|
---|
657 | int mgq_getmaxstemlen () {
|
---|
658 | return MAXSTEMLEN;
|
---|
659 | }
|
---|
660 |
|
---|
661 | /* note: the stemming method and the stemmer come from the last query */
|
---|
662 | /* "word" should be at least maxstemlen+1 long and it is a string that */
|
---|
663 | /* starts with the string length */
|
---|
664 | void mgq_stemword (unsigned char *word) {
|
---|
665 | int stem_method = 0;
|
---|
666 | query_data *qd = NULL;
|
---|
667 |
|
---|
668 | if (cur_cachenum == -1) return;
|
---|
669 | qd = dbcache[cur_cachenum].qd;
|
---|
670 | if (qd == NULL || word == NULL) return;
|
---|
671 |
|
---|
672 | if (qd->sd->sdh.indexed) {
|
---|
673 | stem_method = BooleanEnv(GetEnv("casefold"),0) | (BooleanEnv(GetEnv("stem"),0) << 1);
|
---|
674 | } else {
|
---|
675 | stem_method = qd->sd->sdh.stem_method;
|
---|
676 | }
|
---|
677 |
|
---|
678 | stemmer (stem_method, qd->sd->sdh.stemmer_num, word);
|
---|
679 | }
|
---|
680 |
|
---|
681 |
|
---|
682 |
|
---|
683 | int is_dbcache_full (void) {
|
---|
684 | init_dbcache ();
|
---|
685 | if (cache_numloaded >= MAXNUMDATABASEINFO) return 1;
|
---|
686 | return 0;
|
---|
687 | }
|
---|
688 |
|
---|
689 | int load_database (char *collection, char *mgdir,
|
---|
690 | char *gensuffix, char *textsuffix) {
|
---|
691 | int i = 0;
|
---|
692 | query_data *qd = NULL;
|
---|
693 | /* FILE *deb = NULL; */
|
---|
694 | init_dbcache ();
|
---|
695 |
|
---|
696 | /* print out some debug information */
|
---|
697 | /* deb = fopen ("/home/rjmcnab/gsdl/etc/deb.txt", "a");
|
---|
698 | fprintf (deb, "\ncache_nextaccessnum: %i\n", cache_nextaccessnum);
|
---|
699 | fprintf (deb, "cache_numloaded: %i\n", cache_numloaded);
|
---|
700 | fprintf (deb, "cur_cachenum: %i\n", cur_cachenum);
|
---|
701 | fprintf (deb, "MAXNUMDATABASEINFO: %i\n\n", MAXNUMDATABASEINFO);
|
---|
702 | for (i=0; i<MAXNUMDATABASEINFO; i++) {
|
---|
703 | fprintf (deb, "Entry %i\n", i);
|
---|
704 | fprintf (deb, " accessnum: %i\n", dbcache[i].accessnum);
|
---|
705 | fprintf (deb, " collection: %s\n", dbcache[i].collection);
|
---|
706 | fprintf (deb, " mgdir: %s\n", dbcache[i].mgdir);
|
---|
707 | fprintf (deb, " gensuffix: %s\n", dbcache[i].gensuffix);
|
---|
708 | fprintf (deb, " textsuffix: %s\n", dbcache[i].textsuffix);
|
---|
709 | fprintf (deb, " qd: %x\n", (int)(dbcache[i].qd));
|
---|
710 | }
|
---|
711 | fclose (deb); */
|
---|
712 |
|
---|
713 | /* search for the index */
|
---|
714 | i = search_gensuffix (gensuffix);
|
---|
715 | if (i >= 0) {
|
---|
716 | make_current (i);
|
---|
717 | return 1;
|
---|
718 | }
|
---|
719 |
|
---|
720 | /* if there was a current database then the */
|
---|
721 | /* environment needs uninitialising */
|
---|
722 | make_current (-1);
|
---|
723 |
|
---|
724 | /* get a free cache number */
|
---|
725 | i = get_free_dbcache ();
|
---|
726 | unload_database (i);
|
---|
727 |
|
---|
728 | /* load the index */
|
---|
729 | qd = InitQuerySystem (mgdir, gensuffix, textsuffix, NULL);
|
---|
730 | if (qd == NULL) return 0;
|
---|
731 |
|
---|
732 | /* cache this index */
|
---|
733 | cache_database (i, collection, mgdir, gensuffix, textsuffix, qd);
|
---|
734 |
|
---|
735 | /* make this index current */
|
---|
736 | make_current (i);
|
---|
737 |
|
---|
738 | return 1;
|
---|
739 | }
|
---|
740 |
|
---|
/* load_text_database tries to make an index of the specified     */
/* collection current.  Only the cache is searched; nothing is    */
/* loaded.  Returns 1 when a cached index was found and made      */
/* current, 0 when the cache holds no index of the collection.    */
int load_text_database (char *collection) {
  int slot = 0;

  init_dbcache ();

  /* search for the index */
  slot = search_collect (collection);

  if (slot >= 0) {
    /* make this index current */
    make_current (slot);
    return 1;
  }

  /* none were found */
  return 0;
}
757 |
|
---|
758 | void close_all_databases (void) {
|
---|
759 | int i = 0;
|
---|
760 | init_dbcache ();
|
---|
761 |
|
---|
762 | /* unload all active databases */
|
---|
763 | for (i=0; i<MAXNUMDATABASEINFO; i++) {
|
---|
764 | unload_database (i);
|
---|
765 | }
|
---|
766 |
|
---|
767 | /* if there was a current database then the */
|
---|
768 | /* environment needs uninitialising */
|
---|
769 | make_current (-1);
|
---|
770 | }
|
---|
771 |
|
---|
772 |
|
---|
773 |
|
---|