root/branches/New_Config_Format-branch/gsdl/src/mgpp/text/invf.h @ 1279

Revision 1279, 8.2 KB (checked in by sjboddie, 20 years ago)

merged changes to trunk into New_Config_Format branch

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1/**************************************************************************
2 *
3 * invf.h -- Data structures for inverted files
4 * Copyright (C) 1999  Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id$
21 *
22 **************************************************************************/
23
24
25#ifndef H_INVF
26#define H_INVF
27
28#include <stdio.h>
29
30#include "UCArray.h"
31
32// NOTE: This does not include the magic number
33// header info for .invf.dict file
34struct invf_dict_header {
35  unsigned long lookback;
36  unsigned long word_dict_start;
37  unsigned long word_dict_size;
38  unsigned long tag_dict_start;
39  unsigned long tag_dict_size;
40  unsigned long num_docs;
41  unsigned long num_frags;
42  unsigned long num_words;
43  unsigned long total_bytes;
44  unsigned long index_string_bytes;
45  unsigned long num_levels;
46
47  invf_dict_header ();
48  virtual ~invf_dict_header ();
49  virtual void Clear();
50
51  virtual bool Read (FILE *f);
52  virtual bool Write (FILE *f) const;
53};
54
55
56struct dict_el {
57  UCArray el; // word or tag
58  unsigned long frag_occur;
59  unsigned long freq;
60
61  virtual void Clear ();
62  dict_el () { Clear (); }
63  virtual ~dict_el () { }
64
65  // Read assumes that the last word is in el
66  bool Read (FILE *f);
67  bool Write (FILE *f, const UCArray *lastEl) const;
68};
69
70
71struct word_dict_el : public dict_el {
72  unsigned long *levelFreqs;
73
74  void Clear ();
75  word_dict_el () { levelFreqs = NULL; Clear (); }
76  ~word_dict_el ();
77
78  void SetNumLevels (unsigned long numLevels);
79
80  // SetNumLevels should be called before either
81  // reading or writing using Read and Write
82 
83  // Read assumes that the last word is in el
84  bool Read (FILE *f, unsigned long numLevels);
85  bool Write (FILE *f, const UCArray *lastEl,
86          unsigned long numLevels) const;
87};
88
89
90// wblk = word block
91// tblk = tag block
92// this version of the blocked dictionary uses a fixed number
93// of entries per block, not a fixed block size
94// info for .invf.dict.blocked file
95// blocked dict has a heap of blocks, some for words, some for tags
96// and an index into each set of blocks. The index has pointers to
97// the first entry in each block. Can do a binary search on the index
98// to find out which block an elemnet is in
99struct block_dict_header : public invf_dict_header {
100  // note: word_dict_start and tag_dict_start are undefined
101  // for blocked dictionaries
102
103  unsigned long entries_per_wblk; // word blocks
104  unsigned long num_wblks;
105  unsigned long max_wblk_size;
106  unsigned long wblk_start;
107  unsigned long wblk_idx_start;
108 
109  unsigned long entries_per_tblk; // tag blocks
110  unsigned long num_tblks;
111  unsigned long max_tblk_size;
112  unsigned long tblk_start;
113  unsigned long tblk_idx_start;
114 
115  block_dict_header ();
116  void Clear ();
117 
118  bool Read (FILE *f);
119  bool Write (FILE *f) const;
120};
121
122
123struct block_dict_el {
124  UCArray el; // word or tag
125  unsigned long frag_occur; // # entries in invf file - if have a
126  // word level index, this is the same as freq, otherwise, its the number
127  // of fragments containing this word
128  unsigned long freq; // # of times this word occurs
129  unsigned long invf_ptr; // pointer into inverted file
130
131  virtual void Clear ();
132  block_dict_el () { Clear (); }
133  virtual ~block_dict_el () { }
134
135  // Read assumes that the last word is in el
136  // set lastEl = NULL when no lookback is wanted (eg
137  // for the start of a block
138  bool Read (FILE *f);
139  bool Write (FILE *f, const UCArray *lastEl) const;
140};
141
142struct word_block_dict_el : public block_dict_el {
143  unsigned long *levelFreqs; // freq of the word at each level
144
145  void Clear ();
146  word_block_dict_el () { levelFreqs = NULL; Clear (); }
147  ~word_block_dict_el ();
148
149  void SetNumLevels (unsigned long numLevels);
150
151  // SetNumLevels should be called before either
152  // reading or writing using Read and Write
153 
154  // Read assumes that the last word is in el
155  bool Read (FILE *f, unsigned long numLevels);
156  bool Write (FILE *f, const UCArray *lastEl,
157          unsigned long numLevels) const;
158};
159
160
161
162struct block_idx_info {
163  UCArray el;
164  unsigned long block_ptr;
165
166  block_idx_info ();
167  void Clear ();
168
169  bool Read (FILE *f);
170  bool Write (FILE *f) const;
171};
172
173// used for an index into the word and tag blocks
174typedef vector<block_idx_info> block_idx;
175
176bool ReadBlockIdx (FILE *f, block_idx &blockIdx);
177bool WriteBlockIdx (FILE *f, const block_idx &blockIdx);
178
179
180
181struct stem_idx_header {
182  unsigned long lookback;
183  unsigned long dict_size;
184
185  unsigned long entries_per_block;
186  unsigned long num_blocks;
187  unsigned long max_block_size;
188  unsigned long blocks_start;
189  unsigned long block_idx_start;
190 
191  unsigned long stemmer_num;
192  unsigned long stem_method;
193
194  stem_idx_header ();
195  void Clear ();
196
197  bool Read (FILE *f);
198  bool Write (FILE *f) const;
199};
200
201struct stem_block_dict_el {
202  UCArray el; // word or tag
203  vector<unsigned long> equivWords;
204 
205  stem_block_dict_el ();
206  void Clear ();
207
208  // Read assumes that the last word is in el
209  // set lastEl = NULL when no lookback is wanted (eg
210  // for the start of a block
211  bool Read (FILE *f);
212  bool Write (FILE *f, const UCArray *lastEl) const;
213};
214
215
216
217#define SKIP_MODE_NO_SKIPS 0
218
219// invf file - has a list of frags for each word, but the word is not
220//  stored in the invf file - the dictionaries store the words, along
221// with num entries, and a pointer into invf file
222struct invf_file_header {
223  unsigned long no_of_words;
224  unsigned long no_of_tags;
225  unsigned long skip_mode;
226  unsigned long word_level_index; // 1 if word level index
227  unsigned long params[16];
228
229  invf_file_header ();
230  void Clear ();
231 
232  bool Read (FILE *f);
233  bool Write (FILE *f) const;
234};
235
236
237
238
239
240
241// the search functions returns true if a block that could
242// satisfy the request is found. these functions assume that
243// the block index is sorted by DictCompare (or DictLTUCArray)
244bool SearchElNum (const block_idx &bIdx,
245          unsigned long entriesPerBlock,
246          unsigned long elNum,
247          unsigned long &blockIdxNum,
248          unsigned long &blockStartElNum);
249bool SearchEl (const block_idx &bIdx,
250           unsigned long entriesPerBlock,
251           const UCArray &el,
252           unsigned long &blockIdxNum,
253           unsigned long &blockStartElNum);
254
255
256// The next six functions use SearchElNum and SearchEl
257// for a particular type of dictionary (Block, WordBlock, or StemBlock)
258// and then look up the entry
259bool SearchBlockDictElNum (FILE *dictFile,
260               const block_idx &bIdx,
261               unsigned long entriesPerBlock,
262               unsigned long dictSize,
263               unsigned long elNum,
264               block_dict_el &dictEl);
265bool SearchBlockDictEl (FILE *dictFile,
266            const block_idx &bIdx,
267            unsigned long entriesPerBlock,
268            unsigned long dictSize,
269            const UCArray &el,
270            block_dict_el &dictEl,
271            unsigned long &elNum);
272
273// assumes the numLevels has been set for dictEl
274bool SearchWordBlockDictElNum (FILE *dictFile,
275                   const block_idx &bIdx,
276                   unsigned long entriesPerBlock,
277                   unsigned long dictSize,
278                   unsigned long numLevels,
279                   unsigned long elNum,
280                   word_block_dict_el &dictEl);
281bool SearchWordBlockDictEl (FILE *dictFile,
282                const block_idx &bIdx,
283                unsigned long entriesPerBlock,
284                unsigned long dictSize,
285                unsigned long numLevels,
286                const UCArray &el,
287                word_block_dict_el &dictEl,
288                unsigned long &elNum);
289
290bool SearchStemBlockDictElNum (FILE *dictFile,
291                   const block_idx &bIdx,
292                   unsigned long entriesPerBlock,
293                   unsigned long dictSize,
294                   unsigned long elNum,
295                   stem_block_dict_el &dictEl);
296bool SearchStemBlockDictEl (FILE *dictFile,
297                const block_idx &bIdx,
298                unsigned long entriesPerBlock,
299                unsigned long dictSize,
300                const UCArray &el,
301                stem_block_dict_el &dictEl,
302                unsigned long &elNum);
303
304
305
306
307#endif
Note: See TracBrowser for help on using the browser.