source: trunk/gsdl/src/colservr/mgq.c@ 308

Last change on this file since 308 was 308, checked in by rjmcnab, 25 years ago

A bit of hacking to remove the restriction that the index to get
a document must be a level 2 index. Now both level 2 and level 3
indexes can be used to get the text of a document.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.5 KB
Line 
1/**********************************************************************
2 *
3 * mgq.c -- cut-down version of mgquery
4 * Copyright (C) 1999 The New Zealand Digital Library Project
5 *
6 * PUT COPYRIGHT NOTICE HERE
7 *
8 * $Id: mgq.c 308 1999-06-28 08:56:29Z rjmcnab $
9 *
10 *********************************************************************/
11
12/*
13 $Log$
14 Revision 1.4 1999/06/28 08:56:29 rjmcnab
15 A bit of hacking to remove the restriction that the index to get
16 a document must be a level 2 index. Now both level 2 and level 3
17 indexes can be used to get the text of a document.
18
19 Revision 1.3 1999/01/19 01:38:16 rjmcnab
20
21 Made the source more portable.
22
23 Revision 1.2 1999/01/12 01:51:02 rjmcnab
24
25 Standard header.
26
27 Revision 1.1 1999/01/08 09:02:22 rjmcnab
28
29 Moved from src/library.
30
31 */
32
33
34#include "mgq.h"
35
36
37#include <stdio.h>
38#include <string.h>
39/* #include <io.h> */
40#include <fcntl.h>
41
42#ifdef __cplusplus
43extern "C" {
44#endif
45
46#include "sysfuncs.h"
47
48#include "messages.h"
49#include "memlib.h"
50
51#include "invf.h"
52#include "text.h"
53#include "lists.h"
54#include "backend.h"
55#include "environment.h"
56#include "globals.h"
57#include "mg_errors.h"
58#include "commands.h"
59#include "text_get.h"
60#include "term_lists.h"
61#include "local_strings.h"
62
63#ifdef __cplusplus
64}
65#endif
66
67
68#include "mgq.h"
69
/* Number of slots in the database cache.  A build may predefine
 * MAXNUMDATABASEINFO; otherwise GSDLSERVER builds get 10 slots and
 * every other build gets 2. */
#if !defined(MAXNUMDATABASEINFO)
#  if defined(GSDLSERVER)
#    define MAXNUMDATABASEINFO 10
#  else
#    define MAXNUMDATABASEINFO 2
#  endif
#endif

/* Capacities (in bytes, terminator included) of the string fields
 * held in each DatabaseInfo record. */
#define MAXCOLLECTIONLEN 16
#define MAXMGDIRLEN 256
#define MAXGENSUFFIXLEN 256
#define MAXTEXTSUFFIXLEN 256
83
84typedef struct DatabaseInfo {
85 int accessnum; /* -1 = invalid record */
86 char collection[MAXCOLLECTIONLEN];
87 char mgdir[MAXMGDIRLEN];
88 char gensuffix[MAXGENSUFFIXLEN];
89 char textsuffix[MAXTEXTSUFFIXLEN];
90 query_data *qd;
91} DatabaseInfo;
92
93
94/* globals needed by some vague part of mg... */
95FILE *OutFile = NULL, *InFile = NULL;
96int OutPipe = 0, InPipe = 0;
97int Quitting = 0;
98
99/* globals needed to handle loading of databases */
100static int cur_cachenum = -1;
101
102/* globals needed by the database cache */
103static DatabaseInfo dbcache[MAXNUMDATABASEINFO];
104static int cache_nextaccessnum = 0;
105static int cache_numloaded = 0;
106
107
108
109#if defined(PARADOCNUM) || defined(NZDL)
110static int GetDocNumFromParaNum(query_data *qd, int paranum) {
111 int Documents = qd->td->cth.num_of_docs;
112 int *Paragraph = qd->paragraph;
113 int low = 1, high = Documents;
114 int mid = (low+high)/2;
115
116 while ((mid = (low+high)/2) >=1 && mid <= Documents)
117 {
118 if (paranum > Paragraph[mid])
119 low = mid+1;
120 else if (paranum <= Paragraph[mid-1])
121 high = mid-1;
122 else
123 return mid;
124 }
125 FatalError(1, "Bad paragraph number.\n");
126 return 0;
127}
128
129static int GetParaNumFromDocNum(query_data *qd, int docnum) {
130 int Documents = qd->td->cth.num_of_docs;
131 int *Paragraph = qd->paragraph;
132
133 if (docnum > 0 && docnum <= Documents)
134 return Paragraph[docnum-1]+1;
135 return 0;
136}
137#endif
138
139
140/*****************************************************************************/
141
/* MGQError reports an unrecoverable error on stderr and terminates */
/* the process with exit status 1.  It never returns. */
static void MGQError(char *emsg)
{
  (void) fprintf(stderr, "Fatal error: %s\n", emsg);
  exit(1);
}
147
148static int ProcessDocs (query_data * qd, int skip, int howmany,
149 enum result_kinds kind,
150 int (*sender)(char *,int,int,float,void *), void *ptr) {
151 int max_buf = 0, output_failure = 0;
152 int DocCount = 0;
153 int need_text = (kind == result_docs);
154
155 /* skip the requested number of documents */
156 while (skip > 0) {
157 if (!NextDoc(qd)) return 0;
158 skip--;
159 }
160
161 /* find out the maximum size for the text buffer */
162 if (need_text) max_buf = atoi (GetDefEnv ("buffer", "1048576"));
163
164 /* process each document */
165 do {
166 u_char *UDoc = NULL;
167 unsigned long ULen=0;
168
169#if defined(PARADOCNUM) || defined(NZDL)
170 /* adjust the document number for paragraph level result_docs */
171 /* this is a bit of a hack ... */
172 if (kind==result_docs && qd->id->ifh.InvfLevel == 3 &&
173 qd->DL != NULL && (int)qd->doc_pos < (int)qd->DL->num)
174 qd->DL->DE[qd->doc_pos].DocNum = GetParaNumFromDocNum(qd, qd->DL->DE[qd->doc_pos].DocNum);
175#endif
176
177 if (need_text) {
178 /* load the compressed text */
179 if (LoadCompressedText (qd, max_buf))
180 MGQError("Unable to load compressed text(memory?).");
181
182 /* uncompress the loaded text */
183 UDoc = GetDocText (qd, &ULen);
184 if (UDoc == NULL) MGQError("UDoc is unexpectedly NULL");
185 }
186
187 if (UDoc != NULL || kind == result_docnums) {
188 int docnum = GetDocNum(qd);
189#if defined(PARADOCNUM) || defined(NZDL)
190 if (qd->id->ifh.InvfLevel == 3) docnum = GetDocNumFromParaNum(qd, docnum);
191#endif
192 switch (kind) {
193 case result_docnums:
194 if (sender != NULL)
195 output_failure = (*sender)("",0,docnum,GetDocWeight(qd),ptr);
196 break;
197 case result_docs:
198 if (sender != NULL)
199 output_failure = (*sender)((char *)UDoc,ULen,docnum,GetDocWeight(qd),ptr);
200 break;
201 default:
202 break;
203 }
204 }
205 DocCount++;
206
207 } while (NextDoc (qd) && output_failure == 0 && --howmany > 0);
208
209 if (need_text) FreeTextBuffer (qd);
210
211 return (DocCount);
212}
213
214
215static void send_query_term_freqs(QueryTermList *qtl,
216 int (*sender)(char *,int,int,float,void *), void *ptr)
217{
218 int i = 0;
219 for (i = 0; i < qtl->num; i++)
220 if (sender != NULL) {
221 /* word = word2str(qtl->QTE[i].Term);
222 (* sender)(word, strlen(word), qtl->QTE[i].Count, (float)0.0, ptr); */
223 (* sender)((char *)(qtl->QTE[i].Term+1), qtl->QTE[i].Term[0],
224 qtl->QTE[i].Count, (float)0.0, ptr);
225 }
226}
227
228
229static void send_terms (TermList * qtl,
230 int (*sender)(char *,int,int,float,void *), void *ptr)
231{
232 int i = 0;
233 if (sender == NULL) return;
234 for (i = 0; i < qtl->num; i++)
235 {
236 /* word = word2str(qtl->TE[i].Word);
237 (* sender)(word, strlen(word), qtl->TE[i].Count, (float)0.0, ptr);*/
238 (* sender)((char *)(qtl->TE[i].Word+1), qtl->TE[i].Word[0],
239 qtl->TE[i].Count, (float)0.0, ptr);
240 }
241}
242
243
244/* MoreDocs () */
245/* Displays all documents in list DocList. */
246/* Documents are fetched, then decompressed and displayed according to the */
247/* format implied in FormString(). */
248
249static void
250MoreDocs (query_data * qd, enum result_kinds kind,
251 int skip, int howmany,
252 int (*sender)(char *,int,int,float,void *), void *ptr)
253{
254 qd->num_of_ans = qd->DL->num;
255 switch (kind) {
256 case result_docs:
257 case result_docnums:
258 if (qd->num_of_ans > 0)
259 ProcessDocs (qd, skip, howmany, kind, sender, ptr);
260 break;
261 case result_termfreqs:
262 send_query_term_freqs(qd->QTL, sender, ptr);
263 break;
264 case result_terms:
265 send_terms(qd->TL, sender, ptr);
266 break;
267 }
268}
269
270
271
272
273
274
275/******************************************
276 * functions to handle the database cache *
277 ******************************************/
278
279/* init_dbcache should be called at the start of each */
280/* function which deals with the database cache */
281static void init_dbcache (void) {
282 static int dbcacheinited = 0;
283 int i = 0;
284
285 if (dbcacheinited) return;
286
287 cache_numloaded = 0;
288
289 for (i=0; i<MAXNUMDATABASEINFO; i++) {
290 dbcache[i].accessnum = -1;
291 dbcache[i].collection[0] = '\0';
292 dbcache[i].mgdir[0] = '\0';
293 dbcache[i].gensuffix[0] = '\0';
294 dbcache[i].textsuffix[0] = '\0';
295 dbcache[i].qd = NULL;
296 }
297
298 dbcacheinited = 1;
299}
300
301/* returns the next cache access number and increments it */
302static int get_next_accessnum (void) {
303 return cache_nextaccessnum++;
304}
305
306/* get_free_dbcache returns the cache number which */
307/* was used the longest time ago */
308/* init_dbcache should be called before this function */
309static int get_free_dbcache (void) {
310 int i = 0;
311 int minaccessnum = cache_nextaccessnum; /* the current max */
312 int minpos = 0;
313
314 for (i=0; i<MAXNUMDATABASEINFO; i++) {
315 if (dbcache[i].accessnum < minaccessnum) {
316 minaccessnum = dbcache[i].accessnum;
317 minpos = i;
318 }
319 }
320
321 return minpos;
322}
323
324/* search_collect will search for an index which */
325/* belongs to a certain collection It returns -1 if none could be found. */
326/* init_dbcache should be called before this function */
327static int search_collect (char *collection) {
328 int i = 0;
329
330 for (i=0; i<MAXNUMDATABASEINFO; i++) {
331 if ((dbcache[i].accessnum >= 0) &&
332 (dbcache[i].qd != NULL) &&
333 (strcmp (collection, dbcache[i].collection) == 0)
334 /* && (dbcache[i].qd->id->ifh.InvfLevel == 2)*/
335 ) {
336 dbcache[i].accessnum = get_next_accessnum ();
337 return i;
338 }
339 }
340
341 return -1;
342}
343
344/* search_gensuffix will search for an index which */
345/* has a certain gensuffix. It returns -1 if none could be found. */
346/* init_dbcache should be called before this function */
347static int search_gensuffix (char *gensuffix) {
348 int i = 0;
349
350 for (i=0; i<MAXNUMDATABASEINFO; i++) {
351 if ((dbcache[i].accessnum >= 0) &&
352 (dbcache[i].qd != NULL) &&
353 (strcmp (gensuffix, dbcache[i].gensuffix) == 0)) {
354 dbcache[i].accessnum = get_next_accessnum ();
355 return i;
356 }
357 }
358
359 return -1;
360}
361
362/* unload_database will unload a certain entry within */
363/* the database cache, clearing it for furture use. */
364static void unload_database (int i) {
365 /* check to see if it contains anything */
366 if (dbcache[i].accessnum < 0 || dbcache[i].qd == NULL)
367 return;
368
369 /* unload all the query information */
370 FinishQuerySystem(dbcache[i].qd);
371
372 /* reset all the db info */
373 dbcache[i].accessnum = -1;
374 dbcache[i].collection[0] = '\0';
375 dbcache[i].mgdir[0] = '\0';
376 dbcache[i].gensuffix[0] = '\0';
377 dbcache[i].textsuffix[0] = '\0';
378 dbcache[i].qd = NULL;
379
380 cache_numloaded--;
381 if (cache_numloaded < 0) cache_numloaded = 0;
382}
383
384/* cache_database will store the information about */
385/* a database in the database cache. */
386static void cache_database (int i, char *collection, char *mgdir, char *gensuffix,
387 char *textsuffix, query_data *qd) {
388 /* make sure this entry has been unloaded first */
389 if (dbcache[i].accessnum >= 0 && dbcache[i].qd != NULL)
390 unload_database (i);
391
392 /* store the db info */
393 dbcache[i].accessnum = get_next_accessnum ();
394 strcpy (dbcache[i].collection, collection);
395 strcpy (dbcache[i].mgdir, mgdir);
396 strcpy (dbcache[i].gensuffix, gensuffix);
397 strcpy (dbcache[i].textsuffix, textsuffix);
398 dbcache[i].qd = qd;
399
400 cache_numloaded++;
401}
402
403static void make_current (int i) {
404 /* see if it is the current index */
405 if (i == cur_cachenum) return;
406
407 /* unload the old index */
408 if (cur_cachenum >= 0) UninitEnv ();
409 cur_cachenum = -1;
410
411 /* make sure the new one is ok */
412 if (i < 0 || dbcache[i].accessnum < 0 || dbcache[i].qd == NULL)
413 return;
414
415 /* load the new one */
416
417 /* Initialise the environment with default values */
418 InitEnv ();
419
420 SetEnv("mgdir",dbcache[i].mgdir,NULL);
421 SetEnv("mgname",dbcache[i].gensuffix,NULL);
422 SetEnv("textname",dbcache[i].textsuffix,NULL);
423
424 PushEnv ();
425
426 cur_cachenum = i;
427}
428
429
430
431/********************
432 * public functions *
433 ********************/
434
435int mgq_ask(char *line)
436{
437 query_data *qd = (query_data *)NULL;
438 char QueryType = QUERY_BOOLEAN;
439 char OutputType = QUERY_DOCNUMS;
440 char *LinePtr = (char *)NULL;
441
442 if (cur_cachenum == -1) return 0;
443 qd = dbcache[cur_cachenum].qd;
444 if (qd == NULL) return 0;
445
446 ResetFileStats (qd);
447 qd->max_mem_in_use = qd->mem_in_use = 0;
448 qd->tot_hops_taken += qd->hops_taken;
449 qd->tot_num_of_ptrs += qd->num_of_ptrs;
450 qd->tot_num_of_accum += qd->num_of_accum;
451 qd->tot_num_of_terms += qd->num_of_terms;
452 qd->tot_num_of_ans += qd->num_of_ans;
453 qd->tot_text_idx_lookups += qd->text_idx_lookups;
454 qd->hops_taken = qd->num_of_ptrs = 0;
455 qd->num_of_accum = qd->num_of_ans = qd->num_of_terms = 0;
456 qd->text_idx_lookups = 0;
457
458 LinePtr = ProcessCommands (line, qd);
459 if (CommandsErrorStr) {
460 fprintf (stderr, "%s\n", CommandsErrorStr);
461 return 0;
462 }
463 if (*LinePtr == '\0') return 1;
464
465 FreeQueryDocs (qd);
466
467 QueryType = get_query_type ();
468 OutputType = get_output_type ();
469 /* No point in hiliting words on a docnum query */
470 if (OutputType == OUTPUT_HILITE && QueryType == QUERY_DOCNUMS)
471 OutputType = OUTPUT_TEXT;
472
473 switch (QueryType)
474 {
475 case QUERY_BOOLEAN:
476 {
477 char *maxdocs = (char *)NULL;
478 BooleanQueryInfo bqi;
479 maxdocs = GetDefEnv ("maxdocs", "all");
480 bqi.MaxDocsToRetrieve = strcmp (maxdocs, "all") ? atoi (maxdocs) : -1;
481 if (qd->sd->sdh.indexed)
482 BooleanQuery (qd, line, &bqi, (BooleanEnv (GetEnv ("casefold"), 0) |
483 (BooleanEnv (GetEnv ("stem"), 0) << 1)));
484 else
485 BooleanQuery (qd, line, &bqi, qd->sd->sdh.stem_method);
486 /* if (qd->sd->sdh.indexed) BooleanQuery (qd, line, &bqi, 3);
487 else BooleanQuery (qd, line, &bqi, qd->sd->sdh.stem_method); */
488 break;
489 }
490 case QUERY_APPROX:
491 case QUERY_RANKED:
492 {
493 char *maxdocs = (char *)NULL;
494 char *maxterms = (char *)NULL;
495 char *maxaccum = (char *)NULL;
496 RankedQueryInfo rqi;
497 maxdocs = GetDefEnv ("maxdocs", "all");
498 maxterms = GetDefEnv ("max_terms", "all");
499 maxaccum = GetDefEnv ("max_accumulators", "all");
500 rqi.Sort = BooleanEnv (GetEnv ("sorted_terms"), 0);
501 rqi.QueryFreqs = BooleanEnv (GetEnv ("qfreq"), 1);
502 rqi.Exact = QueryType == QUERY_RANKED;
503 rqi.MaxDocsToRetrieve = strcmp (maxdocs, "all") ? atoi (maxdocs) : -1;
504 rqi.MaxTerms = strcmp (maxterms, "all") ? atoi (maxterms) : -1;
505 rqi.MaxParasToRetrieve = rqi.MaxDocsToRetrieve;
506 if (qd->id->ifh.InvfLevel == 3 && GetEnv ("maxparas"))
507 rqi.MaxParasToRetrieve = atoi (GetEnv ("maxparas"));
508 rqi.AccumMethod = toupper (*GetDefEnv ("accumulator_method", "A"));
509 rqi.MaxAccums = strcmp (maxaccum, "all") ? atoi (maxaccum) : -1;
510 rqi.HashTblSize = IntEnv (GetEnv ("hash_tbl_size"), 1000);
511 rqi.StopAtMaxAccum = BooleanEnv (GetEnv ("stop_at_max_accum"), 0);
512 rqi.skip_dump = GetEnv ("skip_dump");
513 RankedQuery (qd, line, &rqi);
514 break;
515 }
516 case QUERY_DOCNUMS:
517 {
518 DocnumsQuery (qd, line);
519 break;
520 }
521 }
522
523 return 1;
524}
525
526int mgq_numdocs(void)
527{
528 query_data *qd = NULL;
529 if (cur_cachenum == -1) return 0;
530 qd = dbcache[cur_cachenum].qd;
531 if (qd == NULL) return 0;
532
533 if (qd->DL) return qd->DL->num;
534 else return 0;
535}
536
537int mgq_numterms(void)
538{
539 query_data *qd = NULL;
540 if (cur_cachenum == -1) return 0;
541 qd = dbcache[cur_cachenum].qd;
542 if (qd == NULL) return 0;
543
544 if (qd->QTL) return qd->QTL->num;
545 else return 0;
546}
547
548int mgq_results(enum result_kinds kind,int skip,int howmany, int (*sender)(char *, int, int, float, void *), void *ptr)
549{
550 query_data *qd = NULL;
551 if (cur_cachenum == -1) return 0;
552 qd = dbcache[cur_cachenum].qd;
553 if (qd == NULL) return 0;
554
555 if (qd->DL) {
556 qd->doc_pos = 0;
557 MoreDocs(qd, kind, skip, howmany, sender, ptr);
558 }
559 return 0;
560}
561
562
563int is_dbcache_full (void) {
564 init_dbcache ();
565 if (cache_numloaded >= MAXNUMDATABASEINFO) return 1;
566 return 0;
567}
568
569int load_database (char *collection, char *mgdir,
570 char *gensuffix, char *textsuffix) {
571 int i = 0;
572 query_data *qd = NULL;
573 /* FILE *deb = NULL; */
574 init_dbcache ();
575
576 /* print out some debug information */
577/* deb = fopen ("/home/rjmcnab/gsdl/etc/deb.txt", "a");
578 fprintf (deb, "\ncache_nextaccessnum: %i\n", cache_nextaccessnum);
579 fprintf (deb, "cache_numloaded: %i\n", cache_numloaded);
580 fprintf (deb, "cur_cachenum: %i\n", cur_cachenum);
581 fprintf (deb, "MAXNUMDATABASEINFO: %i\n\n", MAXNUMDATABASEINFO);
582 for (i=0; i<MAXNUMDATABASEINFO; i++) {
583 fprintf (deb, "Entry %i\n", i);
584 fprintf (deb, " accessnum: %i\n", dbcache[i].accessnum);
585 fprintf (deb, " collection: %s\n", dbcache[i].collection);
586 fprintf (deb, " mgdir: %s\n", dbcache[i].mgdir);
587 fprintf (deb, " gensuffix: %s\n", dbcache[i].gensuffix);
588 fprintf (deb, " textsuffix: %s\n", dbcache[i].textsuffix);
589 fprintf (deb, " qd: %x\n", (int)(dbcache[i].qd));
590 }
591 fclose (deb); */
592
593 /* search for the index */
594 i = search_gensuffix (gensuffix);
595 if (i > 0) {
596 make_current (i);
597 return 1;
598 }
599
600 /* if there was a current database then the */
601 /* environment needs uninitialising */
602 make_current (-1);
603
604 /* get a free cache number */
605 i = get_free_dbcache ();
606 unload_database (i);
607
608 /* load the index */
609 qd = InitQuerySystem (mgdir, gensuffix, textsuffix, NULL);
610 if (qd == NULL) return 0;
611
612 /* cache this index */
613 cache_database (i, collection, mgdir, gensuffix, textsuffix, qd);
614
615 /* make this index current */
616 make_current (i);
617
618 return 1;
619}
620
/* load_text_database tries to make an already loaded index of the */
/* specified collection current.  Returns 1 if one was found and */
/* made current, 0 if the cache holds no index of that collection. */
int load_text_database (char *collection) {
  int slot = 0;
  init_dbcache ();

  /* look the collection up in the cache */
  slot = search_collect (collection);

  if (slot >= 0) {
    /* found: switch over to it */
    make_current (slot);
    return 1;
  }

  return 0;
}
637
638void close_all_databases (void) {
639 int i = 0;
640 init_dbcache ();
641
642 /* unload all active databases */
643 for (i=0; i<MAXNUMDATABASEINFO; i++) {
644 unload_database (i);
645 }
646
647 /* if there was a current database then the */
648 /* environment needs uninitialising */
649 make_current (-1);
650}
651
652
653
Note: See TracBrowser for help on using the repository browser.