source: trunk/mgpp/text/invf.h@ 13652

Last change on this file since 13652 was 13652, checked in by kjdon, 17 years ago

removed id line from comment

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.7 KB
Line 
1/**************************************************************************
2 *
3 * invf.h -- Data structures for inverted files
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 *
21 **************************************************************************/
22
23
24#ifndef H_INVF
25#define H_INVF
26
27#include <stdio.h>
28
29#include "UCArray.h"
30
31// NOTE: This does not include the magic number
32// header info for .invf.dict file
33struct invf_dict_header {
34 unsigned long lookback;
35 unsigned long word_dict_start;
36 unsigned long word_dict_size;
37 unsigned long tag_dict_start;
38 unsigned long tag_dict_size;
39 unsigned long num_docs;
40 unsigned long num_frags;
41 unsigned long num_words;
42 unsigned long total_bytes;
43 unsigned long index_string_bytes;
44 unsigned long num_levels;
45
46 invf_dict_header ();
47 virtual ~invf_dict_header ();
48 virtual void Clear();
49
50 virtual bool Read (FILE *f);
51 virtual bool Write (FILE *f) const;
52};
53
54
55struct dict_el {
56 UCArray el; // word or tag
57 unsigned long frag_occur;
58 unsigned long freq;
59
60 virtual void Clear ();
61 dict_el () { Clear (); }
62 virtual ~dict_el () { }
63
64 // Read assumes that the last word is in el
65 bool Read (FILE *f);
66 bool Write (FILE *f, const UCArray *lastEl) const;
67};
68
69
70struct word_dict_el : public dict_el {
71 unsigned long *levelFreqs;
72
73 void Clear ();
74 word_dict_el () { levelFreqs = NULL; Clear (); }
75 ~word_dict_el ();
76
77 void SetNumLevels (unsigned long numLevels);
78
79 // SetNumLevels should be called before either
80 // reading or writing using Read and Write
81
82 // Read assumes that the last word is in el
83 bool Read (FILE *f, unsigned long numLevels);
84 bool Write (FILE *f, const UCArray *lastEl,
85 unsigned long numLevels) const;
86};
87
88
89// wblk = word block
90// tblk = tag block
91// this version of the blocked dictionary uses a fixed number
92// of entries per block, not a fixed block size
93// info for .invf.dict.blocked file
94// blocked dict has a heap of blocks, some for words, some for tags
95// and an index into each set of blocks. The index has pointers to
96// the first entry in each block. Can do a binary search on the index
97// to find out which block an elemnet is in
98struct block_dict_header : public invf_dict_header {
99 // note: word_dict_start and tag_dict_start are undefined
100 // for blocked dictionaries
101
102 unsigned long entries_per_wblk; // word blocks
103 unsigned long num_wblks;
104 unsigned long max_wblk_size;
105 unsigned long wblk_start;
106 unsigned long wblk_idx_start;
107
108 unsigned long entries_per_tblk; // tag blocks
109 unsigned long num_tblks;
110 unsigned long max_tblk_size;
111 unsigned long tblk_start;
112 unsigned long tblk_idx_start;
113
114 block_dict_header ();
115 void Clear ();
116
117 bool Read (FILE *f);
118 bool Write (FILE *f) const;
119};
120
121
122struct block_dict_el {
123 UCArray el; // word or tag
124 unsigned long frag_occur; // # entries in invf file - if have a
125 // word level index, this is the same as freq, otherwise, its the number
126 // of fragments containing this word
127 unsigned long freq; // # of times this word occurs
128 unsigned long invf_ptr; // pointer into inverted file
129
130 virtual void Clear ();
131 block_dict_el () { Clear (); }
132 virtual ~block_dict_el () { }
133
134 // Read assumes that the last word is in el
135 // set lastEl = NULL when no lookback is wanted (eg
136 // for the start of a block
137 bool Read (FILE *f);
138 bool Write (FILE *f, const UCArray *lastEl) const;
139};
140
141struct word_block_dict_el : public block_dict_el {
142 unsigned long *levelFreqs; // freq of the word at each level
143
144 void Clear ();
145 word_block_dict_el () { levelFreqs = NULL; Clear (); }
146 ~word_block_dict_el ();
147
148 void SetNumLevels (unsigned long numLevels);
149
150 // SetNumLevels should be called before either
151 // reading or writing using Read and Write
152
153 // Read assumes that the last word is in el
154 bool Read (FILE *f, unsigned long numLevels);
155 bool Write (FILE *f, const UCArray *lastEl,
156 unsigned long numLevels) const;
157};
158
159typedef vector<word_block_dict_el> word_block_dict_el_array;
160
161struct block_idx_info {
162 UCArray el;
163 unsigned long block_ptr;
164
165 block_idx_info ();
166 void Clear ();
167
168 bool Read (FILE *f);
169 bool Write (FILE *f) const;
170};
171
172// used for an index into the word and tag blocks
173typedef vector<block_idx_info> block_idx;
174
175bool ReadBlockIdx (FILE *f, block_idx &blockIdx);
176bool WriteBlockIdx (FILE *f, const block_idx &blockIdx);
177
178
179
180struct stem_idx_header {
181 unsigned long lookback;
182 unsigned long dict_size;
183
184 unsigned long entries_per_block;
185 unsigned long num_blocks;
186 unsigned long max_block_size;
187 unsigned long blocks_start;
188 unsigned long block_idx_start;
189
190 unsigned long stemmer_num;
191 unsigned long stem_method;
192
193 stem_idx_header ();
194 void Clear ();
195
196 bool Read (FILE *f);
197 bool Write (FILE *f) const;
198};
199
200struct stem_block_dict_el {
201 UCArray el; // word or tag
202 vector<unsigned long> equivWords;
203
204 stem_block_dict_el ();
205 void Clear ();
206
207 // Read assumes that the last word is in el
208 // set lastEl = NULL when no lookback is wanted (eg
209 // for the start of a block
210 bool Read (FILE *f);
211 bool Write (FILE *f, const UCArray *lastEl) const;
212};
213
214
215
216#define SKIP_MODE_NO_SKIPS 0
217
218// invf file - has a list of frags for each word, but the word is not
219// stored in the invf file - the dictionaries store the words, along
220// with num entries, and a pointer into invf file
221struct invf_file_header {
222 unsigned long no_of_words;
223 unsigned long no_of_tags;
224 unsigned long skip_mode;
225 unsigned long word_level_index; // 1 if word level index
226 unsigned long params[16];
227
228 invf_file_header ();
229 void Clear ();
230
231 bool Read (FILE *f);
232 bool Write (FILE *f) const;
233};
234
235
236
237
238
239
240// the search functions returns true if a block that could
241// satisfy the request is found. these functions assume that
242// the block index is sorted by DictCompare (or DictLTUCArray)
243bool SearchElNum (const block_idx &bIdx,
244 unsigned long entriesPerBlock,
245 unsigned long elNum,
246 unsigned long &blockIdxNum,
247 unsigned long &blockStartElNum);
248bool SearchEl (const block_idx &bIdx,
249 unsigned long entriesPerBlock,
250 const UCArray &el,
251 unsigned long &blockIdxNum,
252 unsigned long &blockStartElNum);
253
254
255// The next six functions use SearchElNum and SearchEl
256// for a particular type of dictionary (Block, WordBlock, or StemBlock)
257// and then look up the entry
258bool SearchBlockDictElNum (FILE *dictFile,
259 const block_idx &bIdx,
260 unsigned long entriesPerBlock,
261 unsigned long dictSize,
262 unsigned long elNum,
263 block_dict_el &dictEl);
264bool SearchBlockDictEl (FILE *dictFile,
265 const block_idx &bIdx,
266 unsigned long entriesPerBlock,
267 unsigned long dictSize,
268 const UCArray &el,
269 block_dict_el &dictEl,
270 unsigned long &elNum);
271
272// assumes the numLevels has been set for dictEl
273bool SearchWordBlockDictElNum (FILE *dictFile,
274 const block_idx &bIdx,
275 unsigned long entriesPerBlock,
276 unsigned long dictSize,
277 unsigned long numLevels,
278 unsigned long elNum,
279 word_block_dict_el &dictEl);
280bool SearchWordBlockDictEl (FILE *dictFile,
281 const block_idx &bIdx,
282 unsigned long entriesPerBlock,
283 unsigned long dictSize,
284 unsigned long numLevels,
285 const UCArray &el,
286 word_block_dict_el &dictEl,
287 unsigned long &elNum);
288
289bool SearchStemBlockDictElNum (FILE *dictFile,
290 const block_idx &bIdx,
291 unsigned long entriesPerBlock,
292 unsigned long dictSize,
293 unsigned long elNum,
294 stem_block_dict_el &dictEl);
295bool SearchStemBlockDictEl (FILE *dictFile,
296 const block_idx &bIdx,
297 unsigned long entriesPerBlock,
298 unsigned long dictSize,
299 const UCArray &el,
300 stem_block_dict_el &dictEl,
301 unsigned long &elNum);
302
303//-----------------------------------------------------
304// New functions for partial matching
305// returns a list of word numbers from the main block index whose
306// words start with el.
307bool PartialMatchSearchWordBlockDictEl (FILE *dictFile,
308 const block_idx &bIdx,
309 unsigned long entriesPerBlock,
310 unsigned long dictSize,
311 unsigned long numLevels,
312 const UCArray &el,
313 word_block_dict_el &dictEl,
314 vector<unsigned long> &elNumList,
315 bool casefold);
316//----------------------------------------------------------
317
318// new functions for full text browse
319
320bool NearestSearchWordBlockDictEl (FILE *dictFile,
321 const block_idx &bIdx,
322 unsigned long entriesPerBlock,
323 unsigned long dictSize,
324 unsigned long numLevels,
325 const UCArray &el,
326 word_block_dict_el &dictEl,
327 unsigned long &elNum);
328
329// returns a list of word_block_dict_el, with no levelfreqs
330bool SearchWordBlockDictElNumRange (FILE *dictFile,
331 const block_idx &bIdx,
332 unsigned long entriesPerBlock,
333 unsigned long dictSize,
334 unsigned long numLevels,
335 unsigned long elNum,
336 unsigned long numWords,
337 word_block_dict_el_array &terms);
338
339// just returns a list of terms
340bool SearchWordBlockDictElNumRange (FILE *dictFile,
341 const block_idx &bIdx,
342 unsigned long entriesPerBlock,
343 unsigned long dictSize,
344 unsigned long numLevels,
345 unsigned long elNum,
346 unsigned long numWords,
347 UCArrayVector &terms);
348
349
350
351#endif
Note: See TracBrowser for help on using the repository browser.