source: trunk/mgpp/text/invf.h@ 8242

Last change on this file since 8242 was 8242, checked in by kjdon, 20 years ago

added in Partial matching for query terms. Cna type comp* as a query, and it will find all words that begin with comp. This search can be case sensitive or insensitive. Changes made to invf.h/cpp, UCArray.h/cpp, Terms.cpp, GSDLQueryLex.h/cpp, GSDLQueryParser.cpp

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.8 KB
Line 
1/**************************************************************************
2 *
3 * invf.h -- Data structures for inverted files
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: invf.h 8242 2004-10-08 00:03:36Z kjdon $
21 *
22 **************************************************************************/
23
24
25#ifndef H_INVF
26#define H_INVF
27
28#include <stdio.h>
29
30#include "UCArray.h"
31
32// NOTE: This does not include the magic number
33// header info for .invf.dict file
34struct invf_dict_header {
35 unsigned long lookback;
36 unsigned long word_dict_start;
37 unsigned long word_dict_size;
38 unsigned long tag_dict_start;
39 unsigned long tag_dict_size;
40 unsigned long num_docs;
41 unsigned long num_frags;
42 unsigned long num_words;
43 unsigned long total_bytes;
44 unsigned long index_string_bytes;
45 unsigned long num_levels;
46
47 invf_dict_header ();
48 virtual ~invf_dict_header ();
49 virtual void Clear();
50
51 virtual bool Read (FILE *f);
52 virtual bool Write (FILE *f) const;
53};
54
55
56struct dict_el {
57 UCArray el; // word or tag
58 unsigned long frag_occur;
59 unsigned long freq;
60
61 virtual void Clear ();
62 dict_el () { Clear (); }
63 virtual ~dict_el () { }
64
65 // Read assumes that the last word is in el
66 bool Read (FILE *f);
67 bool Write (FILE *f, const UCArray *lastEl) const;
68};
69
70
71struct word_dict_el : public dict_el {
72 unsigned long *levelFreqs;
73
74 void Clear ();
75 word_dict_el () { levelFreqs = NULL; Clear (); }
76 ~word_dict_el ();
77
78 void SetNumLevels (unsigned long numLevels);
79
80 // SetNumLevels should be called before either
81 // reading or writing using Read and Write
82
83 // Read assumes that the last word is in el
84 bool Read (FILE *f, unsigned long numLevels);
85 bool Write (FILE *f, const UCArray *lastEl,
86 unsigned long numLevels) const;
87};
88
89
90// wblk = word block
91// tblk = tag block
92// this version of the blocked dictionary uses a fixed number
93// of entries per block, not a fixed block size
94// info for .invf.dict.blocked file
95// blocked dict has a heap of blocks, some for words, some for tags
96// and an index into each set of blocks. The index has pointers to
97// the first entry in each block. Can do a binary search on the index
98// to find out which block an elemnet is in
99struct block_dict_header : public invf_dict_header {
100 // note: word_dict_start and tag_dict_start are undefined
101 // for blocked dictionaries
102
103 unsigned long entries_per_wblk; // word blocks
104 unsigned long num_wblks;
105 unsigned long max_wblk_size;
106 unsigned long wblk_start;
107 unsigned long wblk_idx_start;
108
109 unsigned long entries_per_tblk; // tag blocks
110 unsigned long num_tblks;
111 unsigned long max_tblk_size;
112 unsigned long tblk_start;
113 unsigned long tblk_idx_start;
114
115 block_dict_header ();
116 void Clear ();
117
118 bool Read (FILE *f);
119 bool Write (FILE *f) const;
120};
121
122
123struct block_dict_el {
124 UCArray el; // word or tag
125 unsigned long frag_occur; // # entries in invf file - if have a
126 // word level index, this is the same as freq, otherwise, its the number
127 // of fragments containing this word
128 unsigned long freq; // # of times this word occurs
129 unsigned long invf_ptr; // pointer into inverted file
130
131 virtual void Clear ();
132 block_dict_el () { Clear (); }
133 virtual ~block_dict_el () { }
134
135 // Read assumes that the last word is in el
136 // set lastEl = NULL when no lookback is wanted (eg
137 // for the start of a block
138 bool Read (FILE *f);
139 bool Write (FILE *f, const UCArray *lastEl) const;
140};
141
142struct word_block_dict_el : public block_dict_el {
143 unsigned long *levelFreqs; // freq of the word at each level
144
145 void Clear ();
146 word_block_dict_el () { levelFreqs = NULL; Clear (); }
147 ~word_block_dict_el ();
148
149 void SetNumLevels (unsigned long numLevels);
150
151 // SetNumLevels should be called before either
152 // reading or writing using Read and Write
153
154 // Read assumes that the last word is in el
155 bool Read (FILE *f, unsigned long numLevels);
156 bool Write (FILE *f, const UCArray *lastEl,
157 unsigned long numLevels) const;
158};
159
160typedef vector<word_block_dict_el> word_block_dict_el_array;
161
162struct block_idx_info {
163 UCArray el;
164 unsigned long block_ptr;
165
166 block_idx_info ();
167 void Clear ();
168
169 bool Read (FILE *f);
170 bool Write (FILE *f) const;
171};
172
173// used for an index into the word and tag blocks
174typedef vector<block_idx_info> block_idx;
175
176bool ReadBlockIdx (FILE *f, block_idx &blockIdx);
177bool WriteBlockIdx (FILE *f, const block_idx &blockIdx);
178
179
180
181struct stem_idx_header {
182 unsigned long lookback;
183 unsigned long dict_size;
184
185 unsigned long entries_per_block;
186 unsigned long num_blocks;
187 unsigned long max_block_size;
188 unsigned long blocks_start;
189 unsigned long block_idx_start;
190
191 unsigned long stemmer_num;
192 unsigned long stem_method;
193
194 stem_idx_header ();
195 void Clear ();
196
197 bool Read (FILE *f);
198 bool Write (FILE *f) const;
199};
200
201struct stem_block_dict_el {
202 UCArray el; // word or tag
203 vector<unsigned long> equivWords;
204
205 stem_block_dict_el ();
206 void Clear ();
207
208 // Read assumes that the last word is in el
209 // set lastEl = NULL when no lookback is wanted (eg
210 // for the start of a block
211 bool Read (FILE *f);
212 bool Write (FILE *f, const UCArray *lastEl) const;
213};
214
215
216
217#define SKIP_MODE_NO_SKIPS 0
218
219// invf file - has a list of frags for each word, but the word is not
220// stored in the invf file - the dictionaries store the words, along
221// with num entries, and a pointer into invf file
222struct invf_file_header {
223 unsigned long no_of_words;
224 unsigned long no_of_tags;
225 unsigned long skip_mode;
226 unsigned long word_level_index; // 1 if word level index
227 unsigned long params[16];
228
229 invf_file_header ();
230 void Clear ();
231
232 bool Read (FILE *f);
233 bool Write (FILE *f) const;
234};
235
236
237
238
239
240
241// the search functions returns true if a block that could
242// satisfy the request is found. these functions assume that
243// the block index is sorted by DictCompare (or DictLTUCArray)
244bool SearchElNum (const block_idx &bIdx,
245 unsigned long entriesPerBlock,
246 unsigned long elNum,
247 unsigned long &blockIdxNum,
248 unsigned long &blockStartElNum);
249bool SearchEl (const block_idx &bIdx,
250 unsigned long entriesPerBlock,
251 const UCArray &el,
252 unsigned long &blockIdxNum,
253 unsigned long &blockStartElNum);
254
255
256// The next six functions use SearchElNum and SearchEl
257// for a particular type of dictionary (Block, WordBlock, or StemBlock)
258// and then look up the entry
259bool SearchBlockDictElNum (FILE *dictFile,
260 const block_idx &bIdx,
261 unsigned long entriesPerBlock,
262 unsigned long dictSize,
263 unsigned long elNum,
264 block_dict_el &dictEl);
265bool SearchBlockDictEl (FILE *dictFile,
266 const block_idx &bIdx,
267 unsigned long entriesPerBlock,
268 unsigned long dictSize,
269 const UCArray &el,
270 block_dict_el &dictEl,
271 unsigned long &elNum);
272
273// assumes the numLevels has been set for dictEl
274bool SearchWordBlockDictElNum (FILE *dictFile,
275 const block_idx &bIdx,
276 unsigned long entriesPerBlock,
277 unsigned long dictSize,
278 unsigned long numLevels,
279 unsigned long elNum,
280 word_block_dict_el &dictEl);
281bool SearchWordBlockDictEl (FILE *dictFile,
282 const block_idx &bIdx,
283 unsigned long entriesPerBlock,
284 unsigned long dictSize,
285 unsigned long numLevels,
286 const UCArray &el,
287 word_block_dict_el &dictEl,
288 unsigned long &elNum);
289
290bool SearchStemBlockDictElNum (FILE *dictFile,
291 const block_idx &bIdx,
292 unsigned long entriesPerBlock,
293 unsigned long dictSize,
294 unsigned long elNum,
295 stem_block_dict_el &dictEl);
296bool SearchStemBlockDictEl (FILE *dictFile,
297 const block_idx &bIdx,
298 unsigned long entriesPerBlock,
299 unsigned long dictSize,
300 const UCArray &el,
301 stem_block_dict_el &dictEl,
302 unsigned long &elNum);
303
304//-----------------------------------------------------
305// New functions for partial matching
306// returns a list of word numbers from the main block index whose
307// words start with el.
308bool PartialMatchSearchWordBlockDictEl (FILE *dictFile,
309 const block_idx &bIdx,
310 unsigned long entriesPerBlock,
311 unsigned long dictSize,
312 unsigned long numLevels,
313 const UCArray &el,
314 word_block_dict_el &dictEl,
315 vector<unsigned long> &elNumList,
316 bool casefold);
317//----------------------------------------------------------
318
319// new functions for full text browse
320
321bool NearestSearchWordBlockDictEl (FILE *dictFile,
322 const block_idx &bIdx,
323 unsigned long entriesPerBlock,
324 unsigned long dictSize,
325 unsigned long numLevels,
326 const UCArray &el,
327 word_block_dict_el &dictEl,
328 unsigned long &elNum);
329
330// returns a list of word_block_dict_el, with no levelfreqs
331bool SearchWordBlockDictElNumRange (FILE *dictFile,
332 const block_idx &bIdx,
333 unsigned long entriesPerBlock,
334 unsigned long dictSize,
335 unsigned long numLevels,
336 unsigned long elNum,
337 unsigned long numWords,
338 word_block_dict_el_array &terms);
339
340// just returns a list of terms
341bool SearchWordBlockDictElNumRange (FILE *dictFile,
342 const block_idx &bIdx,
343 unsigned long entriesPerBlock,
344 unsigned long dictSize,
345 unsigned long numLevels,
346 unsigned long elNum,
347 unsigned long numWords,
348 UCArrayVector &terms);
349
350
351
352#endif
Note: See TracBrowser for help on using the repository browser.