/********************************************************************** * * mgq.c -- cut-down version of mgquery * Copyright (C) 1999 The New Zealand Digital Library Project * * A component of the Greenstone digital library software * from the New Zealand Digital Library Project at the * University of Waikato, New Zealand. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
* *********************************************************************/

#include "mgq.h"

/* NOTE(review): the bare include directives below (and the one inside the */
/* comment) carry no header name in this copy of the file.  The names look */
/* like they were lost before the text reached review; restore them from */
/* the original source before building. */
#include
#include
/* #include */
#include

#ifdef __cplusplus
extern "C" {
#endif

#include "sysfuncs.h"
#include "messages.h"
#include "memlib.h"
#include "invf.h"
#include "text.h"
#include "lists.h"
#include "backend.h"
#include "environment.h"
#include "globals.h"
#include "mg_errors.h"
#include "commands.h"
#include "text_get.h"
#include "term_lists.h"
#include "local_strings.h"
#include "words.h"
#include "stemmer.h"
#include "stem_search.h"

#ifdef __cplusplus
}
#endif

#include "mgq.h"

/* Number of slots in the database cache.  May be predefined by the build; */
/* otherwise it is 10 when GSDLSERVER is defined and 2 when it is not. */
#ifndef MAXNUMDATABASEINFO
# ifdef GSDLSERVER
# define MAXNUMDATABASEINFO 10
# else
# define MAXNUMDATABASEINFO 2
# endif
#endif

/* Sizes, in bytes, of the fixed string buffers in DatabaseInfo.  Each */
/* buffer has to hold the terminating NUL as well, so the longest string */
/* that fits is one character shorter than the size given here. */
#define MAXCOLLECTIONLEN 16
#define MAXMGDIRLEN 256
#define MAXGENSUFFIXLEN 256
#define MAXTEXTSUFFIXLEN 256

/* One slot of the database cache: the strings a database was loaded with */
/* and the query handle that belongs to it. */
typedef struct DatabaseInfo {
  int accessnum; /* access stamp; -1 = invalid (empty) record */
  char collection[MAXCOLLECTIONLEN];   /* collection name the slot was cached under */
  char mgdir[MAXMGDIRLEN];             /* value given to the "mgdir" setting */
  char gensuffix[MAXGENSUFFIXLEN];     /* value given to the "mgname" setting */
  char textsuffix[MAXTEXTSUFFIXLEN];   /* value given to the "textname" setting */
  query_data *qd;                      /* query handle; NULL when the slot is empty */
} DatabaseInfo;

/* globals needed by some vague part of mg... 
*/ FILE *OutFile = NULL, *InFile = NULL; int OutPipe = 0, InPipe = 0; int Quitting = 0; /* globals needed to handle loading of databases */ static int cur_cachenum = -1; /* globals needed by the database cache */ static DatabaseInfo dbcache[MAXNUMDATABASEINFO]; static int cache_nextaccessnum = 0; static int cache_numloaded = 0; #if defined(PARADOCNUM) || defined(NZDL) static int GetDocNumFromParaNum(query_data *qd, int paranum) { int Documents = qd->td->cth.num_of_docs; int *Paragraph = qd->paragraph; int low = 1, high = Documents; int mid = (low+high)/2; while ((mid = (low+high)/2) >=1 && mid <= Documents) { if (paranum > Paragraph[mid]) low = mid+1; else if (paranum <= Paragraph[mid-1]) high = mid-1; else return mid; } FatalError(1, "Bad paragraph number.\n"); return 0; } static int GetParaNumFromDocNum(query_data *qd, int docnum) { int Documents = qd->td->cth.num_of_docs; int *Paragraph = qd->paragraph; if (docnum > 0 && docnum <= Documents) return Paragraph[docnum-1]+1; return 0; } #endif /*****************************************************************************/ static void MGQError(char *emsg) { fprintf(stderr,"Fatal error: %s\n", emsg); exit(1); } static int ProcessDocs (query_data * qd, int skip, int howmany, enum result_kinds kind, int (*sender)(char *,int,int,float,void *), void *ptr) { int max_buf = 0, output_failure = 0; int DocCount = 0; int need_text = (kind == result_docs); /* skip the requested number of documents */ while (skip > 0) { if (!NextDoc(qd)) return 0; skip--; } /* find out the maximum size for the text buffer */ if (need_text) max_buf = atoi (GetDefEnv ("buffer", "1048576")); /* process each document */ do { u_char *UDoc = NULL; unsigned long ULen=0; #if defined(PARADOCNUM) || defined(NZDL) /* adjust the document number for paragraph level result_docs */ /* this is a bit of a hack ... 
*/ if (kind==result_docs && qd->id->ifh.InvfLevel == 3 && qd->DL != NULL && (int)qd->doc_pos < (int)qd->DL->num) qd->DL->DE[qd->doc_pos].DocNum = GetParaNumFromDocNum(qd, qd->DL->DE[qd->doc_pos].DocNum); #endif if (need_text) { /* load the compressed text */ if (LoadCompressedText (qd, max_buf)) MGQError("Unable to load compressed text(memory?)."); /* uncompress the loaded text */ UDoc = GetDocText (qd, &ULen); if (UDoc == NULL) MGQError("UDoc is unexpectedly NULL"); } if (UDoc != NULL || kind == result_docnums) { int docnum = GetDocNum(qd); #if defined(PARADOCNUM) || defined(NZDL) if (qd->id->ifh.InvfLevel == 3) docnum = GetDocNumFromParaNum(qd, docnum); #endif switch (kind) { case result_docnums: if (sender != NULL) output_failure = (*sender)("",0,docnum,GetDocWeight(qd),ptr); break; case result_docs: if (sender != NULL) output_failure = (*sender)((char *)UDoc,ULen,docnum,GetDocWeight(qd),ptr); break; default: break; } } DocCount++; } while (NextDoc (qd) && output_failure == 0 && --howmany > 0); /* if (need_text) FreeTextBuffer (qd);*/ return (DocCount); } static void send_query_term_freqs(QueryTermList *qtl, int (*sender)(char *,int,int,float,void *), void *ptr) { int i = 0; for (i = 0; i < qtl->num; i++) if (sender != NULL) { /* word = word2str(qtl->QTE[i].Term); (* sender)(word, strlen(word), qtl->QTE[i].Count, (float)0.0, ptr); */ (* sender)((char *)(qtl->QTE[i].Term+1), qtl->QTE[i].Term[0], qtl->QTE[i].Count, (float)0.0, ptr); } } static void send_terms (TermList * qtl, int (*sender)(char *,int,int,float,void *), void *ptr) { int i = 0; if (sender == NULL) return; for (i = 0; i < qtl->num; i++) { /* word = word2str(qtl->TE[i].Word); (* sender)(word, strlen(word), qtl->TE[i].Count, (float)0.0, ptr);*/ (* sender)((char *)(qtl->TE[i].Word+1), qtl->TE[i].Word[0], qtl->TE[i].Count, (float)0.0, ptr); } } /* MoreDocs () */ /* Displays all documents in list DocList. 
*/
/* MoreDocs records the size of the current document list in */
/* qd->num_of_ans and then dispatches on kind: */
/*   result_docs, result_docnums - ProcessDocs, only when the list is */
/*                                 not empty */
/*   result_termfreqs            - send_query_term_freqs on qd->QTL */
/*   result_terms                - send_terms on qd->TL */
/* qd->DL is dereferenced without a check, so the caller has to make */
/* sure it is not NULL. */
static void MoreDocs (query_data * qd, enum result_kinds kind,
		      int skip, int howmany,
		      int (*sender)(char *,int,int,float,void *), void *ptr) {
  qd->num_of_ans = qd->DL->num;
  switch (kind) {
    case result_docs:
    case result_docnums:
      if (qd->num_of_ans > 0)
	ProcessDocs (qd, skip, howmany, kind, sender, ptr);
      break;
    case result_termfreqs:
      send_query_term_freqs(qd->QTL, sender, ptr);
      break;
    case result_terms:
      send_terms(qd->TL, sender, ptr);
      break;
  }
}

/******************************************
 * functions to handle the database cache *
 ******************************************/

/* init_dbcache should be called at the start of each */
/* function which deals with the database cache. */
/* It returns at once when the static flag dbcacheinited is already set; */
/* otherwise it starts by resetting cache_numloaded to 0. */
static void init_dbcache (void) {
  static int dbcacheinited = 0;
  int i = 0;

  if (dbcacheinited) return;

  cache_numloaded = 0;

  /* NOTE(review): source text is missing from this point on.  The loop */
  /* header below breaks off after "i" and resumes in the middle of a */
  /* condition that compares against a parameter named collection, so the */
  /* rest of init_dbcache and the head of the search_collect routine */
  /* (called by load_text_database) are gone, presumably together with */
  /* get_next_accessnum and get_free_dbcache, which other code calls but */
  /* which are not defined anywhere in view.  Restore from the original */
  /* file; do not build from this copy. */
  for (i=0; i= 0) &&
	(dbcache[i].qd != NULL) &&
	(strcmp (collection, dbcache[i].collection) == 0)
	/* && (dbcache[i].qd->id->ifh.InvfLevel == 2)*/
	) {
      /* a hit refreshes the entry's access stamp */
      dbcache[i].accessnum = get_next_accessnum ();
      return i;
    }
  }

  return -1;
}

/* search_gensuffix will search for an index which */
/* has a certain gensuffix. It returns -1 if none could be found. */
/* On a hit the entry's access stamp is refreshed and its slot number */
/* is returned. */
/* init_dbcache should be called before this function */
static int search_gensuffix (char *gensuffix) {
  int i = 0;

  /* NOTE(review): the loop header below is damaged; the text between */
  /* "i" and "= 0)" (loop bound, increment and the start of the if */
  /* condition) is missing and has to be restored from the original file. */
  for (i=0; i= 0) &&
	(dbcache[i].qd != NULL) &&
	(strcmp (gensuffix, dbcache[i].gensuffix) == 0)) {
      dbcache[i].accessnum = get_next_accessnum ();
      return i;
    }
  }

  return -1;
}

/* unload_database will unload a certain entry within */
/* the database cache, clearing it for furture use. 
*/ static void unload_database (int i) { /* check to see if it contains anything */ if (dbcache[i].accessnum < 0 || dbcache[i].qd == NULL) return; /* unload all the query information */ FinishQuerySystem(dbcache[i].qd); /* reset all the db info */ dbcache[i].accessnum = -1; dbcache[i].collection[0] = '\0'; dbcache[i].mgdir[0] = '\0'; dbcache[i].gensuffix[0] = '\0'; dbcache[i].textsuffix[0] = '\0'; dbcache[i].qd = NULL; cache_numloaded--; if (cache_numloaded < 0) cache_numloaded = 0; } /* cache_database will store the information about */ /* a database in the database cache. */ static void cache_database (int i, char *collection, char *mgdir, char *gensuffix, char *textsuffix, query_data *qd) { /* make sure this entry has been unloaded first */ if (dbcache[i].accessnum >= 0 && dbcache[i].qd != NULL) unload_database (i); /* store the db info */ dbcache[i].accessnum = get_next_accessnum (); strcpy (dbcache[i].collection, collection); strcpy (dbcache[i].mgdir, mgdir); strcpy (dbcache[i].gensuffix, gensuffix); strcpy (dbcache[i].textsuffix, textsuffix); dbcache[i].qd = qd; cache_numloaded++; } static void make_current (int i) { /* see if it is the current index */ if (i == cur_cachenum) return; /* unload the old index */ if (cur_cachenum >= 0) UninitEnv (); cur_cachenum = -1; /* make sure the new one is ok */ if (i < 0 || dbcache[i].accessnum < 0 || dbcache[i].qd == NULL) return; /* load the new one */ /* Initialise the environment with default values */ InitEnv (); SetEnv("mgdir",dbcache[i].mgdir,NULL); SetEnv("mgname",dbcache[i].gensuffix,NULL); SetEnv("textname",dbcache[i].textsuffix,NULL); PushEnv (); cur_cachenum = i; } /******************** * public functions * ********************/ int mgq_ask(char *line) { query_data *qd = (query_data *)NULL; char QueryType = QUERY_BOOLEAN; char OutputType = QUERY_DOCNUMS; char *LinePtr = (char *)NULL; if (cur_cachenum == -1) return 0; qd = dbcache[cur_cachenum].qd; if (qd == NULL) return 0; ResetFileStats (qd); 
qd->max_mem_in_use = qd->mem_in_use = 0; qd->tot_hops_taken += qd->hops_taken; qd->tot_num_of_ptrs += qd->num_of_ptrs; qd->tot_num_of_accum += qd->num_of_accum; qd->tot_num_of_terms += qd->num_of_terms; qd->tot_num_of_ans += qd->num_of_ans; qd->tot_text_idx_lookups += qd->text_idx_lookups; qd->hops_taken = qd->num_of_ptrs = 0; qd->num_of_accum = qd->num_of_ans = qd->num_of_terms = 0; qd->text_idx_lookups = 0; LinePtr = ProcessCommands (line, qd); if (CommandsErrorStr) { fprintf (stderr, "%s\n", CommandsErrorStr); return 0; } if (*LinePtr == '\0') return 1; FreeQueryDocs (qd); QueryType = get_query_type (); OutputType = get_output_type (); /* No point in hiliting words on a docnum query */ if (OutputType == OUTPUT_HILITE && QueryType == QUERY_DOCNUMS) OutputType = OUTPUT_TEXT; switch (QueryType) { case QUERY_BOOLEAN: { char *maxdocs = (char *)NULL; BooleanQueryInfo bqi; maxdocs = GetDefEnv ("maxdocs", "all"); bqi.MaxDocsToRetrieve = strcmp (maxdocs, "all") ? atoi (maxdocs) : -1; if (qd->sd->sdh.indexed) BooleanQuery (qd, line, &bqi, (BooleanEnv (GetEnv ("casefold"), 0) | (BooleanEnv (GetEnv ("stem"), 0) << 1))); else BooleanQuery (qd, line, &bqi, qd->sd->sdh.stem_method); /* if (qd->sd->sdh.indexed) BooleanQuery (qd, line, &bqi, 3); else BooleanQuery (qd, line, &bqi, qd->sd->sdh.stem_method); */ break; } case QUERY_APPROX: case QUERY_RANKED: { char *maxdocs = (char *)NULL; char *maxterms = (char *)NULL; char *maxaccum = (char *)NULL; RankedQueryInfo rqi; maxdocs = GetDefEnv ("maxdocs", "all"); maxterms = GetDefEnv ("max_terms", "all"); maxaccum = GetDefEnv ("max_accumulators", "all"); rqi.Sort = BooleanEnv (GetEnv ("sorted_terms"), 0); rqi.QueryFreqs = BooleanEnv (GetEnv ("qfreq"), 1); rqi.Exact = QueryType == QUERY_RANKED; rqi.MaxDocsToRetrieve = strcmp (maxdocs, "all") ? atoi (maxdocs) : -1; rqi.MaxTerms = strcmp (maxterms, "all") ? 
atoi (maxterms) : -1; rqi.MaxParasToRetrieve = rqi.MaxDocsToRetrieve; if (qd->id->ifh.InvfLevel == 3 && GetEnv ("maxparas")) rqi.MaxParasToRetrieve = atoi (GetEnv ("maxparas")); rqi.AccumMethod = toupper (*GetDefEnv ("accumulator_method", "A")); rqi.MaxAccums = strcmp (maxaccum, "all") ? atoi (maxaccum) : -1; rqi.HashTblSize = IntEnv (GetEnv ("hash_tbl_size"), 1000); rqi.StopAtMaxAccum = BooleanEnv (GetEnv ("stop_at_max_accum"), 0); rqi.skip_dump = GetEnv ("skip_dump"); RankedQuery (qd, line, &rqi); break; } case QUERY_DOCNUMS: { DocnumsQuery (qd, line); break; } } return 1; } int mgq_numdocs(void) { query_data *qd = NULL; if (cur_cachenum == -1) return 0; qd = dbcache[cur_cachenum].qd; if (qd == NULL) return 0; if (qd->DL) return qd->DL->num; else return 0; } int mgq_numterms(void) { query_data *qd = NULL; if (cur_cachenum == -1) return 0; qd = dbcache[cur_cachenum].qd; if (qd == NULL) return 0; if (qd->QTL) return qd->QTL->num; else return 0; } int mgq_results(enum result_kinds kind,int skip,int howmany, int (*sender)(char *, int, int, float, void *), void *ptr) { query_data *qd = NULL; if (cur_cachenum == -1) return 0; qd = dbcache[cur_cachenum].qd; if (qd == NULL) return 0; if (qd->DL) { qd->doc_pos = 0; MoreDocs(qd, kind, skip, howmany, sender, ptr); } return 0; } /* get all the terms that match wordstem using the current stemmer and */ /* stemming method. 
The callback is the same style used by mgq_results */
/* Each matching term is reported as sender(text, length, count, 0.0, ptr), */
/* where the length is the first byte of the stored word and the text */
/* starts at the byte after it.  Nothing is done, and 0 is returned, when */
/* there is no current database or when wordstem or sender is NULL. */
/* The return value is 0 in every case. */
int mgq_equivterms (unsigned char *wordstem,
		    int (*sender)(char *, int, int, float, void *),
		    void *ptr) {
  query_data *current = NULL;
  TermList *matches = NULL;
  int method = 0;
  int pos = 0;

  if (cur_cachenum == -1)
    return 0;

  current = dbcache[cur_cachenum].qd;
  if (current == NULL)
    return 0;
  if (wordstem == NULL || sender == NULL)
    return 0;

  /* pick the stemming method: from the header unless the stem */
  /* dictionary is indexed, in which case casefold is bit 0 and */
  /* stem is bit 1 */
  method = current->sd->sdh.stem_method;
  if (current->sd->sdh.indexed) {
    int casefold_bit = BooleanEnv(GetEnv("casefold"),0);
    int stem_bit = BooleanEnv(GetEnv("stem"),0);
    method = casefold_bit | (stem_bit << 1);
  }

  /* start from an empty list and let FindWords fill it */
  matches = MakeTermList (0);

  if (FindWords (current->sd, wordstem, method, &matches) > 0) {
    /* report each expansion of the word */
    pos = 0;
    while (pos < matches->num) {
      (* sender)((char *)(matches->TE[pos].Word+1),
		 matches->TE[pos].Word[0],
		 matches->TE[pos].Count,
		 (float)0.0, ptr);
      pos++;
    }
  }

  /* release the list again */
  if (matches != NULL)
    FreeTermList (&matches);

  return 0;
}

/* gets the total number of documents retrieved. 
If this is not available */
/* it will set total_retrieved to 0 (even when it obviously isn't) */
/* Both outputs are first set to 0 and then, when the current database */
/* has a document list, overwritten with that list's total_retrieved and */
/* is_approx fields.  When there is no current database, or either */
/* pointer is NULL, the outputs are left untouched. */
/* The return value is always 0. */
int mgq_docsretrieved (int *total_retrieved, int *is_approx) {
  query_data *qd = NULL;

  if (cur_cachenum == -1) return 0;
  qd = dbcache[cur_cachenum].qd;
  if (qd == NULL || total_retrieved == NULL || is_approx == NULL) return 0;

  /* set default values */
  *total_retrieved = 0;
  *is_approx = 0;

  if (qd->DL == NULL) return 0;

  *total_retrieved = qd->DL->total_retrieved;
  *is_approx = qd->DL->is_approx;

  return 0;
}

/* use mgq_getmaxstemlen to determine the length of the word stems to pass */
/* to mgq_stemword */
/* Returns the MAXSTEMLEN constant. */
int mgq_getmaxstemlen () {
  return MAXSTEMLEN;
}

/* note: the stemming method and the stemmer come from the last query */
/* "word" should be at least maxstemlen+1 long and it is a string that */
/* starts with the string length */
/* word is passed to stemmer () together with the stemmer number from */
/* the stem dictionary header.  Nothing is done when there is no current */
/* database or word is NULL. */
void mgq_stemword (unsigned char *word) {
  int stem_method = 0;
  query_data *qd = NULL;

  if (cur_cachenum == -1) return;
  qd = dbcache[cur_cachenum].qd;
  if (qd == NULL || word == NULL) return;

  /* an indexed stem dictionary takes the method from the casefold */
  /* (bit 0) and stem (bit 1) settings, otherwise the method stored in */
  /* the stem dictionary header is used */
  if (qd->sd->sdh.indexed) {
    stem_method = BooleanEnv(GetEnv("casefold"),0) | (BooleanEnv(GetEnv("stem"),0) << 1);
  } else {
    stem_method = qd->sd->sdh.stem_method;
  }

  stemmer (stem_method, qd->sd->sdh.stemmer_num, word);
}

/* is_dbcache_full returns 1 when the number of loaded databases has */
/* reached MAXNUMDATABASEINFO and 0 otherwise. */
int is_dbcache_full (void) {
  init_dbcache ();

  if (cache_numloaded >= MAXNUMDATABASEINFO) return 1;
  return 0;
}

/* load_database loads the index named by mgdir, gensuffix and textsuffix */
/* into a cache slot, records it under collection and makes it current. */
/* Returns 1 on success and 0 when InitQuerySystem fails; in the failure */
/* case no database is left current, because make_current (-1) has */
/* already run by then. */
/* NOTE(review): the debug comment in the body is damaged.  It breaks off */
/* after "for (i=0; i" and swallows what reads like live code, */
/* "= 0) { make_current (i); return 1; }", presumably the early exit */
/* taken when the database is already cached.  As the text stands the */
/* function always reloads.  Restore the missing lines from the original */
/* file. */
int load_database (char *collection, char *mgdir,
		   char *gensuffix, char *textsuffix) {
  int i = 0;
  query_data *qd = NULL;
  /* FILE *deb = NULL; */

  init_dbcache ();

  /* print out some debug information */
  /* deb = fopen ("/home/rjmcnab/gsdl/etc/deb.txt", "a");
     fprintf (deb, "\ncache_nextaccessnum: %i\n", cache_nextaccessnum);
     fprintf (deb, "cache_numloaded: %i\n", cache_numloaded);
     fprintf (deb, "cur_cachenum: %i\n", cur_cachenum);
     fprintf (deb, "MAXNUMDATABASEINFO: %i\n\n", MAXNUMDATABASEINFO);
     for (i=0; i= 0) {
     make_current (i);
     return 1;
     }

     /* if there was a current database then the */
  /* environment needs uninitialising */
  make_current (-1);

  /* get a free cache number */
  i = get_free_dbcache ();
  unload_database (i);

  /* load the index */
  qd = InitQuerySystem (mgdir, gensuffix, textsuffix, NULL);
  if (qd == NULL) return 0;

  /* cache this index */
  cache_database (i, collection, mgdir, gensuffix, textsuffix, qd);

  /* make this index current */
  make_current (i);

  return 1;
}

/* load_text_database tries to make an index of the */
/* specified collection current */
/* Returns 1 when a cached index for the collection was found and made */
/* current, 0 when search_collect found none. */
int load_text_database (char *collection) {
  int i = 0;

  init_dbcache ();

  /* search for the index */
  i = search_collect (collection);

  /* return if none were found */
  if (i < 0) return 0;

  /* make this index current */
  make_current (i);
  return 1;
}

/* NOTE(review): the text of close_all_databases stops in the middle of */
/* its loop header; the remainder is not part of the text under review. */
void close_all_databases (void) {
  int i = 0;

  init_dbcache ();

  /* unload all active databases */
  for (i=0; i