[3745] | 1 | /**************************************************************************
|
---|
| 2 | *
|
---|
| 3 | * backend.h -- Underlying routines and datastructures for mgquery
|
---|
| 4 | * Copyright (C) 1994 Neil Sharman
|
---|
| 5 | *
|
---|
| 6 | * This program is free software; you can redistribute it and/or modify
|
---|
| 7 | * it under the terms of the GNU General Public License as published by
|
---|
| 8 | * the Free Software Foundation; either version 2 of the License, or
|
---|
| 9 | * (at your option) any later version.
|
---|
| 10 | *
|
---|
| 11 | * This program is distributed in the hope that it will be useful,
|
---|
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 14 | * GNU General Public License for more details.
|
---|
| 15 | *
|
---|
| 16 | * You should have received a copy of the GNU General Public License
|
---|
| 17 | * along with this program; if not, write to the Free Software
|
---|
| 18 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 19 | *
|
---|
| 20 | * $Id: backend.h 23508 2010-12-17 01:04:10Z sjm84 $
|
---|
| 21 | *
|
---|
| 22 | **************************************************************************/
|
---|
| 23 |
|
---|
| 24 |
|
---|
| 25 | #ifndef BACKEND_H
|
---|
| 26 | #define BACKEND_H
|
---|
| 27 |
|
---|
| 28 | #include "sysfuncs.h"
|
---|
| 29 |
|
---|
| 30 | #include "timing.h"
|
---|
| 31 | #include "lists.h"
|
---|
| 32 | #include "term_lists.h"
|
---|
| 33 | #include "query_term_list.h" /* [RPAP - Feb 97: Term Frequency] */
|
---|
| 34 | #include "mg.h"
|
---|
| 35 | #include "invf.h"
|
---|
| 36 | #include "text.h"
|
---|
| 37 |
|
---|
| 38 |
|
---|
| 39 | typedef struct invf_data
|
---|
| 40 | {
|
---|
| 41 | File *InvfFile;
|
---|
[23508] | 42 | mg_u_long N;
|
---|
| 43 | mg_u_long Nstatic; /* N parameter for decoding inverted file entries */
|
---|
[3745] | 44 | struct invf_file_header ifh;
|
---|
| 45 | }
|
---|
| 46 | invf_data;
|
---|
| 47 |
|
---|
| 48 | typedef struct text_data
|
---|
| 49 | {
|
---|
| 50 | File *TextFile;
|
---|
| 51 | File *TextIdxFile;
|
---|
| 52 | File *TextIdxWgtFile;
|
---|
[23508] | 53 | mg_s_long current_pos;
|
---|
[3745] | 54 | struct
|
---|
| 55 | {
|
---|
[23508] | 56 | mg_u_long Start;
|
---|
[3745] | 57 | float Weight;
|
---|
| 58 | }
|
---|
| 59 | *idx_data;
|
---|
| 60 | compressed_text_header cth;
|
---|
| 61 | }
|
---|
| 62 | text_data;
|
---|
| 63 |
|
---|
| 64 |
|
---|
| 65 | typedef struct auxiliary_dict
|
---|
| 66 | {
|
---|
| 67 | aux_frags_header afh[2];
|
---|
| 68 | u_char *word_data[2];
|
---|
| 69 | u_char **words[2];
|
---|
| 70 | int blk_start[2][33], blk_end[2][33]; /* blk_start and blk_end are required
|
---|
| 71 | for the hybrid methods */
|
---|
| 72 | }
|
---|
| 73 | auxiliary_dict;
|
---|
| 74 |
|
---|
| 75 |
|
---|
| 76 | typedef struct compression_dict
|
---|
| 77 | {
|
---|
| 78 | compression_dict_header cdh;
|
---|
| 79 | comp_frags_header *cfh[2];
|
---|
[23508] | 80 | mg_u_long MemForCompDict;
|
---|
[3745] | 81 | u_char ***values[2];
|
---|
| 82 | u_char *escape[2];
|
---|
| 83 | huff_data *chars_huff[2];
|
---|
[23508] | 84 | mg_u_long **chars_vals[2];
|
---|
[3745] | 85 | huff_data *lens_huff[2];
|
---|
[23508] | 86 | mg_u_long **lens_vals[2];
|
---|
[3745] | 87 | auxiliary_dict *ad;
|
---|
| 88 | int fast_loaded;
|
---|
| 89 | }
|
---|
| 90 | compression_dict;
|
---|
| 91 |
|
---|
| 92 |
|
---|
| 93 | typedef struct stemmed_idx /* [RPAP - Jan 97: Stem Index Change] */
|
---|
| 94 | {
|
---|
| 95 | File *stem_idx_file;
|
---|
| 96 | struct stem_idx_header sih;
|
---|
| 97 | u_char **index;
|
---|
[23508] | 98 | mg_u_long *pos;
|
---|
[3745] | 99 | int active;
|
---|
| 100 | u_char *buffer;
|
---|
[23508] | 101 | mg_u_long MemForStemIdx;
|
---|
[3745] | 102 | }
|
---|
| 103 | stemmed_idx;
|
---|
| 104 |
|
---|
| 105 |
|
---|
| 106 | typedef struct stemmed_dict
|
---|
| 107 | {
|
---|
| 108 | File *stem_file;
|
---|
| 109 | struct stem_dict_header sdh;
|
---|
| 110 | u_char **index;
|
---|
[23508] | 111 | mg_u_long *pos;
|
---|
[3745] | 112 | int active;
|
---|
| 113 | u_char *buffer;
|
---|
[23508] | 114 | mg_u_long MemForStemDict;
|
---|
[3745] | 115 |
|
---|
| 116 | /* [RPAP - Jan 97: Stem Index Change] */
|
---|
| 117 | stemmed_idx *stem1;
|
---|
| 118 | stemmed_idx *stem2;
|
---|
| 119 | stemmed_idx *stem3;
|
---|
| 120 | }
|
---|
| 121 | stemmed_dict;
|
---|
| 122 |
|
---|
| 123 |
|
---|
| 124 | typedef struct approx_weights_data
|
---|
| 125 | {
|
---|
| 126 | double L;
|
---|
| 127 | double B;
|
---|
[23508] | 128 | mg_u_long *DocWeights;
|
---|
[3745] | 129 | char bits;
|
---|
| 130 | float *table;
|
---|
[23508] | 131 | mg_u_long mask;
|
---|
| 132 | mg_u_long MemForWeights;
|
---|
| 133 | mg_u_long num_of_docs;
|
---|
[3745] | 134 | }
|
---|
| 135 | approx_weights_data;
|
---|
| 136 |
|
---|
| 137 |
|
---|
| 138 | typedef struct RankedQueryInfo
|
---|
| 139 | {
|
---|
| 140 | int QueryFreqs;
|
---|
| 141 | int Exact; /* use exact weights for ranking or not */
|
---|
[23508] | 142 | mg_s_long MaxDocsToRetrieve; /* may be -1 for all */
|
---|
| 143 | mg_s_long MaxParasToRetrieve;
|
---|
[3745] | 144 | int Sort;
|
---|
| 145 | char AccumMethod; /* 'A' = array, 'S' = splay tree, 'H' = hash_table */
|
---|
[23508] | 146 | mg_s_long MaxAccums; /* may be -1 for all */
|
---|
| 147 | mg_s_long MaxTerms; /* may be -1 for all */
|
---|
[3745] | 148 | int StopAtMaxAccum; /* Stop at maximum accumulator or not */
|
---|
[23508] | 149 | mg_s_long HashTblSize;
|
---|
[3745] | 150 | char *skip_dump;
|
---|
| 151 | }
|
---|
| 152 | RankedQueryInfo;
|
---|
| 153 |
|
---|
| 154 |
|
---|
| 155 |
|
---|
| 156 | typedef struct BooleanQueryInfo
|
---|
| 157 | {
|
---|
[23508] | 158 | mg_s_long MaxDocsToRetrieve;
|
---|
[3745] | 159 | }
|
---|
| 160 | BooleanQueryInfo;
|
---|
| 161 |
|
---|
| 162 |
|
---|
/* [TS:24/Aug/94] - upper limit on the number of characters in a term string */
#define MAXTERMSTRLEN 1023
|
---|
| 165 |
|
---|
| 166 | typedef struct query_data
|
---|
| 167 | {
|
---|
| 168 | stemmed_dict *sd;
|
---|
| 169 | compression_dict *cd;
|
---|
| 170 | approx_weights_data *awd;
|
---|
| 171 | invf_data *id;
|
---|
| 172 | text_data *td;
|
---|
| 173 | #if defined(PARADOCNUM) || defined(NZDL)
|
---|
| 174 | int *paragraph;
|
---|
| 175 | #endif
|
---|
| 176 | char *pathname;
|
---|
| 177 | char *textpathname; /* [RJM 06/97: text filename] */
|
---|
| 178 | File *File_text;
|
---|
| 179 | File *File_comp_dict;
|
---|
| 180 | File *File_aux_dict;
|
---|
| 181 | File *File_fast_comp_dict;
|
---|
| 182 | File *File_text_idx_wgt;
|
---|
| 183 | File *File_text_idx;
|
---|
| 184 | File *File_stem;
|
---|
| 185 |
|
---|
| 186 | /* [RPAP - Jan 97: Stem Index Change] */
|
---|
| 187 | File *File_stem1;
|
---|
| 188 | File *File_stem2;
|
---|
| 189 | File *File_stem3;
|
---|
| 190 |
|
---|
| 191 | File *File_invf;
|
---|
| 192 | File *File_weight_approx;
|
---|
[23508] | 193 | mg_u_long mem_in_use, max_mem_in_use;
|
---|
| 194 | mg_u_long num_of_ptrs, tot_num_of_ptrs;
|
---|
| 195 | mg_u_long num_of_terms, tot_num_of_terms;
|
---|
| 196 | mg_u_long num_of_accum, tot_num_of_accum;
|
---|
| 197 | mg_u_long num_of_ans, tot_num_of_ans;
|
---|
| 198 | mg_u_long hops_taken, tot_hops_taken;
|
---|
| 199 | mg_u_long text_idx_lookups, tot_text_idx_lookups;
|
---|
| 200 | mg_u_long max_buffers;
|
---|
[3745] | 201 | unsigned doc_pos;
|
---|
| 202 | unsigned buf_in_use;
|
---|
| 203 | DocList *DL;
|
---|
| 204 | TermList *TL; /* [TS:Oct/94] - so term list for query can easily be accessed */
|
---|
| 205 | u_char *TextBuffer;
|
---|
| 206 | int TextBufferLen;
|
---|
| 207 | QueryTermList *QTL; /* [RPAP - Feb 97: Term Frequency] */
|
---|
| 208 | }
|
---|
| 209 | query_data;
|
---|
| 210 |
|
---|
| 211 |
|
---|
| 212 |
|
---|
| 213 | typedef struct InitQueryTimes
|
---|
| 214 | {
|
---|
| 215 | ProgTime Start;
|
---|
| 216 | ProgTime StemDict;
|
---|
| 217 | ProgTime ApproxWeights;
|
---|
| 218 | ProgTime CompDict;
|
---|
| 219 | ProgTime Invf;
|
---|
| 220 | ProgTime Text;
|
---|
| 221 | }
|
---|
| 222 | InitQueryTimes;
|
---|
| 223 |
|
---|
| 224 |
|
---|
| 225 | /* [RJM 06/97: text filename] */
|
---|
| 226 | query_data *InitQuerySystem (char *dir, char *name, char *textname, InitQueryTimes * iqt);
|
---|
| 227 |
|
---|
[23508] | 228 | void ChangeMemInUse (query_data * qd, mg_s_long delta);
|
---|
[3745] | 229 |
|
---|
| 230 | void FinishQuerySystem (query_data * qd);
|
---|
| 231 |
|
---|
| 232 | void ResetFileStats (query_data * qd);
|
---|
| 233 |
|
---|
| 234 | void TransFileStats (query_data * qd);
|
---|
| 235 |
|
---|
| 236 | void RankedQuery (query_data * qd, char *Query, RankedQueryInfo * rqi);
|
---|
| 237 |
|
---|
| 238 | void BooleanQuery (query_data * qd, char *Query, BooleanQueryInfo * bqi,
|
---|
| 239 | int stem_method);
|
---|
| 240 |
|
---|
| 241 | void DocnumsQuery (query_data * qd, char *QueryLine);
|
---|
| 242 |
|
---|
| 243 | void FreeTextBuffer (query_data * qd);
|
---|
| 244 |
|
---|
| 245 | void FreeQueryDocs (query_data * qd);
|
---|
| 246 |
|
---|
| 247 | int LoadCompressedText (query_data * qd, int max_mem);
|
---|
| 248 |
|
---|
| 249 | int GetDocNum (query_data * qd);
|
---|
| 250 |
|
---|
| 251 | float GetDocWeight (query_data * qd);
|
---|
| 252 |
|
---|
[23508] | 253 | mg_s_long GetDocCompLength (query_data * qd);
|
---|
[3745] | 254 |
|
---|
[23508] | 255 | u_char *GetDocText (query_data * qd, mg_u_long *len);
|
---|
[3745] | 256 |
|
---|
| 257 | DocEntry *GetDocChain (query_data * qd);
|
---|
| 258 |
|
---|
| 259 | int NextDoc (query_data * qd);
|
---|
| 260 |
|
---|
| 261 | #endif
|
---|