source: main/branches/64_bit_Greenstone/greenstone2/common-src/indexers/mgpp/text/invf.h@ 23508

Last change on this file since 23508 was 23508, checked in by sjm84, 13 years ago

Committing 64 bit changes into the branch

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.4 KB
Line 
1/**************************************************************************
2 *
3 * invf.h -- Data structures for inverted files
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 *
21 **************************************************************************/
22
23
24#ifndef H_INVF
25#define H_INVF
26
27#include "mglong.h"
28#include <stdio.h>
29
30#include "UCArray.h"
31
32// Header info for the .invf.dict file.
33// NOTE: the file's magic number is NOT part of this structure.
34struct invf_dict_header {
35 mg_u_long lookback;
36 mg_u_long word_dict_start; // undefined for blocked dictionaries (see block_dict_header)
37 mg_u_long word_dict_size;
38 mg_u_long tag_dict_start; // undefined for blocked dictionaries (see block_dict_header)
39 mg_u_long tag_dict_size;
40 mg_u_long num_docs;
41 mg_u_long num_frags;
42 mg_u_long num_words;
43 mg_u_long total_bytes;
44 mg_u_long index_string_bytes;
45 mg_u_long num_levels;
46 // Clear, Read and Write are virtual; block_dict_header redeclares all three.
47 invf_dict_header ();
48 virtual ~invf_dict_header ();
49 virtual void Clear();
50 // NOTE(review): the bool result presumably signals success - confirm in the implementation.
51 virtual bool Read (FILE *f);
52 virtual bool Write (FILE *f) const;
53};
54
55
56struct dict_el { // one dictionary entry; base class of word_dict_el
57 UCArray el; // word or tag
58 mg_u_long frag_occur; // NOTE(review): presumably same meaning as block_dict_el::frag_occur - confirm
59 mg_u_long freq; // NOTE(review): presumably same meaning as block_dict_el::freq - confirm
60
61 virtual void Clear ();
62 dict_el () { Clear (); } // a new element always starts in the Clear()ed state
63 virtual ~dict_el () { }
64
65 // Read assumes that, on entry, el still holds the last word
66 bool Read (FILE *f);
67 bool Write (FILE *f, const UCArray *lastEl) const; // lastEl: pointer to the last element; the object itself is not modified (const)
68};
69
70
71struct word_dict_el : public dict_el { // dict_el extended with levelFreqs
72 mg_u_long *levelFreqs; // raw pointer; NOTE(review): presumably sized by SetNumLevels - confirm
73
74 void Clear (); // overrides the virtual dict_el::Clear
75 word_dict_el () { levelFreqs = NULL; Clear (); } // levelFreqs is nulled before Clear() runs
76 ~word_dict_el ();
77 // NOTE(review): no copy constructor or assignment operator is declared here - verify that copying is safe.
78 void SetNumLevels (mg_u_long numLevels);
79
80 // SetNumLevels should be called before either
81 // reading or writing using Read and Write
82
83 // Read assumes that, on entry, el still holds the last word
84 bool Read (FILE *f, mg_u_long numLevels); // different signature: hides dict_el::Read (FILE *)
85 bool Write (FILE *f, const UCArray *lastEl,
86 mg_u_long numLevels) const; // likewise hides dict_el::Write
87};
88
89
90// Header info for the .invf.dict.blocked file.
91// Abbreviations: wblk = word block, tblk = tag block.
92// This version of the blocked dictionary uses a fixed number
93// of entries per block, not a fixed block size.
94// A blocked dictionary has a heap of blocks, some for words and some
95// for tags, plus an index into each set of blocks. Each index has
96// pointers to the first entry in each block, so a binary search on
97// the index finds out which block an element is in.
98// Clear, Read and Write override the virtual members of invf_dict_header.
99struct block_dict_header : public invf_dict_header {
100 // note: word_dict_start and tag_dict_start are undefined
101 // for blocked dictionaries
102
103 mg_u_long entries_per_wblk; // word blocks: number of entries per block
104 mg_u_long num_wblks;
105 mg_u_long max_wblk_size;
106 mg_u_long wblk_start;
107 mg_u_long wblk_idx_start;
108
109 mg_u_long entries_per_tblk; // tag blocks: number of entries per block
110 mg_u_long num_tblks;
111 mg_u_long max_tblk_size;
112 mg_u_long tblk_start;
113 mg_u_long tblk_idx_start;
114
115 block_dict_header ();
116 void Clear ();
117
118 bool Read (FILE *f);
119 bool Write (FILE *f) const;
120};
121
122
123struct block_dict_el { // one entry of a blocked dictionary; base class of word_block_dict_el
124 UCArray el; // word or tag
125 mg_u_long frag_occur; // # entries in invf file - if have a
126 // word level index, this is the same as freq, otherwise, it's the number
127 // of fragments containing this word
128 mg_u_long freq; // # of times this word occurs
129 mg_u_long invf_ptr; // pointer into inverted file
130
131 virtual void Clear ();
132 block_dict_el () { Clear (); } // a new element always starts in the Clear()ed state
133 virtual ~block_dict_el () { }
134
135 // Read assumes that, on entry, el still holds the last word.
136 // Write: set lastEl = NULL when no lookback is wanted (e.g.
137 // for the start of a block)
138 bool Read (FILE *f);
139 bool Write (FILE *f, const UCArray *lastEl) const;
140};
141
142struct word_block_dict_el : public block_dict_el { // block_dict_el extended with per-level frequencies
143 mg_u_long *levelFreqs; // freq of the word at each level
144
145 void Clear (); // overrides the virtual block_dict_el::Clear
146 word_block_dict_el () { levelFreqs = NULL; Clear (); } // levelFreqs is nulled before Clear() runs
147 ~word_block_dict_el ();
148 // NOTE(review): no copy constructor or assignment operator is declared, yet this type is stored in a vector (word_block_dict_el_array) - verify that copying is safe.
149 void SetNumLevels (mg_u_long numLevels);
150
151 // SetNumLevels should be called before either
152 // reading or writing using Read and Write
153
154 // Read assumes that, on entry, el still holds the last word
155 bool Read (FILE *f, mg_u_long numLevels); // different signature: hides block_dict_el::Read (FILE *)
156 bool Write (FILE *f, const UCArray *lastEl,
157 mg_u_long numLevels) const; // likewise hides block_dict_el::Write
158};
159
160typedef vector<word_block_dict_el> word_block_dict_el_array; // list of entries, used by SearchWordBlockDictElNumRange for its terms parameter
161
162struct block_idx_info { // one entry of a block index (see the block_idx typedef)
163 UCArray el; // NOTE(review): presumably the first entry of the block - confirm
164 mg_u_long block_ptr; // NOTE(review): presumably the location of the block in the dictionary file - confirm
165
166 block_idx_info ();
167 void Clear ();
168
169 bool Read (FILE *f);
170 bool Write (FILE *f) const;
171};
172
173// Used for an index into the word and tag blocks.
174typedef vector<block_idx_info> block_idx;
175// Read / write a block index from / to f.
176bool ReadBlockIdx (FILE *f, block_idx &blockIdx); // non-const reference: presumably receives the index read from f - confirm
177bool WriteBlockIdx (FILE *f, const block_idx &blockIdx); // const reference: blockIdx is not modified
178
179
180
181struct stem_idx_header { // NOTE(review): presumably the header of a blocked stem index file - confirm
182 mg_u_long lookback;
183 mg_u_long dict_size;
184 // Block layout (field names mirror the wblk / tblk fields of block_dict_header).
185 mg_u_long entries_per_block;
186 mg_u_long num_blocks;
187 mg_u_long max_block_size;
188 mg_u_long blocks_start;
189 mg_u_long block_idx_start;
190 // NOTE(review): presumably identify the stemmer and stemming method used to build the index - confirm.
191 mg_u_long stemmer_num;
192 mg_u_long stem_method;
193
194 stem_idx_header ();
195 void Clear ();
196
197 bool Read (FILE *f);
198 bool Write (FILE *f) const;
199};
200
201struct stem_block_dict_el { // one entry of a stem block dictionary
202 UCArray el; // word or tag
203 vector<mg_u_long> equivWords; // NOTE(review): presumably numbers of the words equivalent to el - confirm
204
205 stem_block_dict_el ();
206 void Clear ();
207
208 // Read assumes that, on entry, el still holds the last word.
209 // Write: set lastEl = NULL when no lookback is wanted (e.g.
210 // for the start of a block)
211 bool Read (FILE *f);
212 bool Write (FILE *f, const UCArray *lastEl) const;
213};
214
215
216
217#define SKIP_MODE_NO_SKIPS 0 // NOTE(review): presumably a value for invf_file_header::skip_mode - confirm
218
219// Header of the invf file. The invf file has a list of frags for each
220// word, but the word itself is not stored there: the dictionaries store
221// the words, along with num entries, and a pointer into the invf file.
222struct invf_file_header {
223 mg_u_long no_of_words;
224 mg_u_long no_of_tags;
225 mg_u_long skip_mode; // NOTE(review): SKIP_MODE_NO_SKIPS is presumably one of the values - confirm
226 mg_u_long word_level_index; // 1 if word level index
227 mg_u_long params[16]; // NOTE(review): meaning of the 16 slots is not visible in this header
228
229 invf_file_header ();
230 void Clear ();
231
232 bool Read (FILE *f);
233 bool Write (FILE *f) const;
234};
235
236
237
238
239
240
241// The search functions return true if a block that could
242// satisfy the request is found. These functions assume that
243// the block index is sorted by DictCompare (or DictLTUCArray).
244bool SearchElNum (const block_idx &bIdx, // search keyed by element number (elNum)
245 mg_u_long entriesPerBlock,
246 mg_u_long elNum,
247 mg_u_long &blockIdxNum, // non-const reference: presumably an output - confirm
248 mg_u_long &blockStartElNum); // non-const reference: presumably an output - confirm
249bool SearchEl (const block_idx &bIdx, // search keyed by element value (el)
250 mg_u_long entriesPerBlock,
251 const UCArray &el,
252 mg_u_long &blockIdxNum, // non-const reference: presumably an output - confirm
253 mg_u_long &blockStartElNum); // non-const reference: presumably an output - confirm
254
255
256// The next six functions use SearchElNum and SearchEl
257// for a particular type of dictionary (Block, WordBlock, or StemBlock)
258// and then look up the entry. These two are the Block variants.
259bool SearchBlockDictElNum (FILE *dictFile,
260 const block_idx &bIdx,
261 mg_u_long entriesPerBlock,
262 mg_u_long dictSize,
263 mg_u_long elNum,
264 block_dict_el &dictEl); // non-const reference: presumably receives the entry - confirm
265bool SearchBlockDictEl (FILE *dictFile,
266 const block_idx &bIdx,
267 mg_u_long entriesPerBlock,
268 mg_u_long dictSize,
269 const UCArray &el,
270 block_dict_el &dictEl, // non-const reference: presumably receives the entry - confirm
271 mg_u_long &elNum); // non-const reference: presumably receives the entry's number - confirm
272
273// WordBlock variants. Both assume the number of levels has already been set for dictEl.
274bool SearchWordBlockDictElNum (FILE *dictFile,
275 const block_idx &bIdx,
276 mg_u_long entriesPerBlock,
277 mg_u_long dictSize,
278 mg_u_long numLevels,
279 mg_u_long elNum,
280 word_block_dict_el &dictEl); // non-const reference: presumably receives the entry - confirm
281bool SearchWordBlockDictEl (FILE *dictFile,
282 const block_idx &bIdx,
283 mg_u_long entriesPerBlock,
284 mg_u_long dictSize,
285 mg_u_long numLevels,
286 const UCArray &el,
287 word_block_dict_el &dictEl, // non-const reference: presumably receives the entry - confirm
288 mg_u_long &elNum); // non-const reference: presumably receives the entry's number - confirm
289
290bool SearchStemBlockDictElNum (FILE *dictFile, // StemBlock variant keyed by element number (elNum)
291 const block_idx &bIdx,
292 mg_u_long entriesPerBlock,
293 mg_u_long dictSize,
294 mg_u_long elNum,
295 stem_block_dict_el &dictEl); // non-const reference: presumably receives the entry - confirm
296bool SearchStemBlockDictEl (FILE *dictFile, // StemBlock variant keyed by element value (el)
297 const block_idx &bIdx,
298 mg_u_long entriesPerBlock,
299 mg_u_long dictSize,
300 const UCArray &el,
301 stem_block_dict_el &dictEl, // non-const reference: presumably receives the entry - confirm
302 mg_u_long &elNum); // non-const reference: presumably receives the entry's number - confirm
303
304//-----------------------------------------------------
305// New function for partial matching.
306// Returns a list of word numbers from the main block index whose
307// words start with el.
308bool PartialMatchSearchWordBlockDictEl (FILE *dictFile,
309 const block_idx &bIdx,
310 mg_u_long entriesPerBlock,
311 mg_u_long dictSize,
312 mg_u_long numLevels,
313 const UCArray &el, // the prefix to match
314 word_block_dict_el &dictEl,
315 vector<mg_u_long> &elNumList, // the list of word numbers
316 bool casefold); // NOTE(review): presumably makes the match ignore case - confirm
317//----------------------------------------------------------
318
319// New functions for full text browse.
320// NOTE(review): presumably finds the entry nearest to el when there is no exact match - confirm.
321bool NearestSearchWordBlockDictEl (FILE *dictFile,
322 const block_idx &bIdx,
323 mg_u_long entriesPerBlock,
324 mg_u_long dictSize,
325 mg_u_long numLevels,
326 const UCArray &el,
327 word_block_dict_el &dictEl, // non-const reference: presumably receives the entry - confirm
328 mg_u_long &elNum); // non-const reference: presumably receives the entry's number - confirm
329
330// Returns a list of word_block_dict_el, with no levelfreqs.
331bool SearchWordBlockDictElNumRange (FILE *dictFile,
332 const block_idx &bIdx,
333 mg_u_long entriesPerBlock,
334 mg_u_long dictSize,
335 mg_u_long numLevels,
336 mg_u_long elNum, // NOTE(review): presumably the number of the first element of the range - confirm
337 mg_u_long numWords, // NOTE(review): presumably the number of elements wanted - confirm
338 word_block_dict_el_array &terms); // the list of entries
339
340// Overload that just returns a list of terms (UCArrayVector) instead of full entries.
341bool SearchWordBlockDictElNumRange (FILE *dictFile,
342 const block_idx &bIdx,
343 mg_u_long entriesPerBlock,
344 mg_u_long dictSize,
345 mg_u_long numLevels,
346 mg_u_long elNum, // NOTE(review): presumably the number of the first element of the range - confirm
347 mg_u_long numWords, // NOTE(review): presumably the number of elements wanted - confirm
348 UCArrayVector &terms); // the list of terms
349
350
351
352#endif
Note: See TracBrowser for help on using the repository browser.