source: trunk/gsdl/src/mgpp/text/invf.h@ 879

Last change on this file since 879 was 856, checked in by sjboddie, 24 years ago

Rodgers new C++ mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.3 KB
Line 
1/**************************************************************************
2 *
3 * invf.h -- Data structures for inverted files
4 * Copyright (C) 1999 Rodger McNab
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: invf.h 856 2000-01-14 02:26:25Z sjboddie $
21 *
22 **************************************************************************/
23
24
25#ifndef H_INVF
26#define H_INVF
27
28#include <stdio.h>
29
30#include "UCArray.h"
31
// Header record for an inverted-file dictionary: eleven unsigned long
// fields plus Clear/Read/Write. The destructor, Clear, Read and Write are
// virtual; block_dict_header (later in this file) derives from this struct
// and redeclares Clear/Read/Write.
// NOTE(review): field meanings are inferred from their names only (the
// *_start / *_size pairs look like file offsets and sizes) -- confirm
// against the implementation file.
32// NOTE: This does not include the magic number
33struct invf_dict_header {
34 unsigned long lookback;
35 unsigned long word_dict_start;
36 unsigned long word_dict_size;
37 unsigned long tag_dict_start;
38 unsigned long tag_dict_size;
39 unsigned long num_docs;
40 unsigned long num_frags;
41 unsigned long num_words;
42 unsigned long total_bytes;
43 unsigned long index_string_bytes;
44 unsigned long num_levels;
45
46 invf_dict_header ();
47 virtual ~invf_dict_header ();
// NOTE(review): presumably resets every field to a default value --
// confirm in the implementation.
48 virtual void Clear();
49
// Read/Write operate on an already-open stdio stream.
// NOTE(review): the bool result presumably means "success" -- confirm.
50 virtual bool Read (FILE *f);
51 virtual bool Write (FILE *f) const;
52};
53
54
// One dictionary entry: the term bytes (el) plus two unsigned long counters.
// NOTE(review): the meaning of frag_occur / freq is not stated here;
// block_dict_el in this file annotates its own frag_occur as
// "# entries in invf file" -- confirm whether that applies here too.
55struct dict_el {
56 UCArray el; // word or tag
57 unsigned long frag_occur;
58 unsigned long freq;
59
60 virtual void Clear ();
// The constructor calls the virtual Clear(). While the dict_el sub-object
// is being constructed, C++ dispatches that call to dict_el::Clear, not to
// an override in a derived struct.
61 dict_el () { Clear (); }
62 virtual ~dict_el () { }
63
// Unlike Clear, Read and Write are not virtual.
64 // Read assumes that the last word is in el
// NOTE(review): i.e. el presumably still holds the previously read entry
// when Read is called -- confirm in the implementation.
65 bool Read (FILE *f);
66 bool Write (FILE *f, const UCArray *lastEl) const;
67};
68
69
// dict_el extended with a per-level frequency array (levelFreqs).
// NOTE(review): levelFreqs is a raw pointer and a destructor is declared,
// but no copy constructor or assignment operator is. If the destructor
// frees levelFreqs, copying a word_dict_el would lead to a double free --
// confirm ownership in the implementation. The struct does not store the
// number of levels itself; callers pass numLevels to each call.
70struct word_dict_el : public dict_el {
71 unsigned long *levelFreqs;
72
// Overrides the virtual dict_el::Clear (same signature).
73 void Clear ();
// levelFreqs is set to NULL before Clear() runs, so Clear never sees an
// uninitialised pointer.
74 word_dict_el () { levelFreqs = NULL; Clear (); }
75 ~word_dict_el ();
76
77 void SetNumLevels (unsigned long numLevels);
78
79 // SetNumLevels should be called before either
80 // reading or writing using Read and Write
81
// These overloads take an extra numLevels argument and hide
// dict_el::Read(FILE*) / dict_el::Write(FILE*, const UCArray*) for calls
// made through a word_dict_el.
82 // Read assumes that the last word is in el
83 bool Read (FILE *f, unsigned long numLevels);
84 bool Write (FILE *f, const UCArray *lastEl,
85 unsigned long numLevels) const;
86};
87
88
// Header for the blocked form of the dictionary. Extends invf_dict_header
// with two parallel groups of five fields: one for word blocks (wblk) and
// one for tag blocks (tblk).
89// wblk = word block
90// tblk = tag block
91// this version of the blocked dictionary uses a fixed number
92// of entries per block, not a fixed block size
93struct block_dict_header : public invf_dict_header {
94 // note: word_dict_start and tag_dict_start are undefined
95 // for blocked dictionaries
96
// NOTE(review): *_start / *_idx_start are presumably file offsets of the
// block data and of the block index respectively -- confirm.
97 unsigned long entries_per_wblk;
98 unsigned long num_wblks;
99 unsigned long max_wblk_size;
100 unsigned long wblk_start;
101 unsigned long wblk_idx_start;
102
103 unsigned long entries_per_tblk;
104 unsigned long num_tblks;
105 unsigned long max_tblk_size;
106 unsigned long tblk_start;
107 unsigned long tblk_idx_start;
108
109 block_dict_header ();
// Clear, Read and Write have the same signatures as the virtual members of
// invf_dict_header, so they override them.
110 void Clear ();
111
112 bool Read (FILE *f);
113 bool Write (FILE *f) const;
114};
115
116
// One entry of a blocked dictionary: the term bytes, two counters and
// invf_ptr.
// NOTE(review): invf_ptr is presumably the offset of this entry's data in
// the inverted file -- confirm.
117struct block_dict_el {
118 UCArray el; // word or tag
119 unsigned long frag_occur; // # entries in invf file
120 unsigned long freq;
121 unsigned long invf_ptr;
122
123 virtual void Clear ();
// The constructor calls the virtual Clear(); during construction of the
// block_dict_el sub-object this resolves to block_dict_el::Clear.
124 block_dict_el () { Clear (); }
125 virtual ~block_dict_el () { }
126
// Read and Write are not virtual. Only Write has a lastEl parameter; Read
// takes no such argument, so the lastEl remark below applies to Write.
127 // Read assumes that the last word is in el
128 // set lastEl = NULL when no lookback is wanted (eg
129 // for the start of a block)
130 bool Read (FILE *f);
131 bool Write (FILE *f, const UCArray *lastEl) const;
132};
133
// block_dict_el extended with a per-level frequency array (levelFreqs).
// NOTE(review): levelFreqs is a raw pointer and a destructor is declared,
// but no copy constructor or assignment operator is. If the destructor
// frees levelFreqs, copying this struct would lead to a double free --
// confirm ownership in the implementation.
134struct word_block_dict_el : public block_dict_el {
135 unsigned long *levelFreqs;
136
// Overrides the virtual block_dict_el::Clear (same signature).
137 void Clear ();
// levelFreqs is set to NULL before Clear() runs, so Clear never sees an
// uninitialised pointer.
138 word_block_dict_el () { levelFreqs = NULL; Clear (); }
139 ~word_block_dict_el ();
140
141 void SetNumLevels (unsigned long numLevels);
142
143 // SetNumLevels should be called before either
144 // reading or writing using Read and Write
145
// These overloads take an extra numLevels argument and hide
// block_dict_el::Read(FILE*) / block_dict_el::Write(FILE*, const UCArray*)
// for calls made through a word_block_dict_el.
146 // Read assumes that the last word is in el
147 bool Read (FILE *f, unsigned long numLevels);
148 bool Write (FILE *f, const UCArray *lastEl,
149 unsigned long numLevels) const;
150};
151
152
153
// One entry of a block index: a term (el) paired with block_ptr.
// block_idx, declared after this struct, is a vector of these.
// NOTE(review): presumably el is the first term of a block and block_ptr
// the file offset of that block -- confirm in the implementation.
154struct block_idx_info {
155 UCArray el;
156 unsigned long block_ptr;
157
158 block_idx_info ();
159 void Clear ();
160
// Read/Write operate on an already-open stdio stream.
// NOTE(review): the bool result presumably means "success" -- confirm.
161 bool Read (FILE *f);
162 bool Write (FILE *f) const;
163};
164
165// used for an index into the word and tag blocks
// NOTE(review): `vector` is used unqualified, and neither <vector> nor a
// using-declaration is visible in this header; presumably both are
// supplied through "UCArray.h" -- confirm.
166typedef vector<block_idx_info> block_idx;
167
// Stream a block_idx from / to an already-open stdio file.
// NOTE(review): judging by the names these handle the whole index in one
// call, and the bool result presumably means "success" -- confirm.
168bool ReadBlockIdx (FILE *f, block_idx &blockIdx);
169bool WriteBlockIdx (FILE *f, const block_idx &blockIdx);
170
171
172
// Header record for a stem index: nine unsigned long fields plus
// Clear/Read/Write. None of the members are virtual.
// NOTE(review): field meanings are inferred from their names only; the
// block-related fields resemble those of block_dict_header in this file --
// confirm against the implementation.
173struct stem_idx_header {
174 unsigned long lookback;
175 unsigned long dict_size;
176
177 unsigned long entries_per_block;
178 unsigned long num_blocks;
179 unsigned long max_block_size;
180 unsigned long blocks_start;
181 unsigned long block_idx_start;
182
// NOTE(review): presumably identify which stemmer and which stemming
// method were used to build the index -- confirm.
183 unsigned long stemmer_num;
184 unsigned long stem_method;
185
186 stem_idx_header ();
187 void Clear ();
188
189 bool Read (FILE *f);
190 bool Write (FILE *f) const;
191};
192
// One entry of a blocked stem dictionary: a term (el) and a list of
// unsigned long values (equivWords).
// NOTE(review): equivWords presumably holds dictionary element numbers of
// the words that share this stem -- confirm in the implementation.
// NOTE(review): `vector` is used unqualified with no <vector> include
// visible in this header; presumably it comes from "UCArray.h" -- confirm.
193struct stem_block_dict_el {
194 UCArray el; // word or tag
195 vector<unsigned long> equivWords;
196
197 stem_block_dict_el ();
198 void Clear ();
199
// Only Write has a lastEl parameter; Read takes no such argument, so the
// lastEl remark below applies to Write.
200 // Read assumes that the last word is in el
201 // set lastEl = NULL when no lookback is wanted (eg
202 // for the start of a block)
203 bool Read (FILE *f);
204 bool Write (FILE *f, const UCArray *lastEl) const;
205};
206
207
208
// NOTE(review): presumably the value stored in invf_file_header::skip_mode
// when the inverted file has no skip information -- confirm. It is the
// only skip-mode constant defined in this header.
209#define SKIP_MODE_NO_SKIPS 0
210
// Header record for the inverted file itself: four unsigned long fields,
// a fixed array of 16 unsigned long parameters, and Clear/Read/Write.
211struct invf_file_header {
212 unsigned long no_of_words;
213 unsigned long no_of_tags;
214 unsigned long skip_mode;
215 unsigned long word_level_index; // 1 if word level index
// NOTE(review): the meaning of the individual params slots is not
// documented here -- confirm in the code that builds the inverted file.
216 unsigned long params[16];
217
218 invf_file_header ();
219 void Clear ();
220
// Read/Write operate on an already-open stdio stream.
// NOTE(review): the bool result presumably means "success" -- confirm.
221 bool Read (FILE *f);
222 bool Write (FILE *f) const;
223};
224
225
226
227
228
229
230// the search functions return true if a block that could
231// satisfy the request is found. these functions assume that
232// the block index is sorted by DictCompare (or DictLTUCArray)
// SearchElNum looks up by element number (elNum), SearchEl by element value
// (el). Both work on the in-memory block index only (no FILE argument) and
// report through the non-const reference parameters blockIdxNum and
// blockStartElNum.
// NOTE(review): presumably blockIdxNum is the position of the chosen block
// within bIdx and blockStartElNum the element number of that block's first
// entry -- confirm in the implementation.
233bool SearchElNum (const block_idx &bIdx,
234 unsigned long entriesPerBlock,
235 unsigned long elNum,
236 unsigned long &blockIdxNum,
237 unsigned long &blockStartElNum);
238bool SearchEl (const block_idx &bIdx,
239 unsigned long entriesPerBlock,
240 const UCArray &el,
241 unsigned long &blockIdxNum,
242 unsigned long &blockStartElNum);
243
// Look up a block_dict_el in a blocked dictionary, by element number
// (SearchBlockDictElNum) or by element value (SearchBlockDictEl). Both take
// the open dictionary file, its block index, the entries-per-block count
// and the dictionary size, and fill the dictEl reference parameter.
// SearchBlockDictEl additionally reports through the elNum reference.
// NOTE(review): presumably they return true when the entry was found and
// elNum receives the number of the matching entry -- confirm in the
// implementation.
244bool SearchBlockDictElNum (FILE *dictFile,
245 const block_idx &bIdx,
246 unsigned long entriesPerBlock,
247 unsigned long dictSize,
248 unsigned long elNum,
249 block_dict_el &dictEl);
250bool SearchBlockDictEl (FILE *dictFile,
251 const block_idx &bIdx,
252 unsigned long entriesPerBlock,
253 unsigned long dictSize,
254 const UCArray &el,
255 block_dict_el &dictEl,
256 unsigned long &elNum);
257
// Word-dictionary variants of the blocked-dictionary lookup: same parameter
// list as SearchBlockDictElNum / SearchBlockDictEl plus numLevels, and the
// result is a word_block_dict_el.
// NOTE(review): presumably they return true when the entry was found --
// confirm in the implementation.
258// assumes the numLevels has been set for dictEl
259bool SearchWordBlockDictElNum (FILE *dictFile,
260 const block_idx &bIdx,
261 unsigned long entriesPerBlock,
262 unsigned long dictSize,
263 unsigned long numLevels,
264 unsigned long elNum,
265 word_block_dict_el &dictEl);
266bool SearchWordBlockDictEl (FILE *dictFile,
267 const block_idx &bIdx,
268 unsigned long entriesPerBlock,
269 unsigned long dictSize,
270 unsigned long numLevels,
271 const UCArray &el,
272 word_block_dict_el &dictEl,
273 unsigned long &elNum);
274
// Stem-dictionary variants of the blocked-dictionary lookup: same parameter
// list as SearchBlockDictElNum / SearchBlockDictEl, but the result is a
// stem_block_dict_el.
// NOTE(review): presumably they return true when the entry was found and
// elNum (in SearchStemBlockDictEl) receives the number of the matching
// entry -- confirm in the implementation.
275bool SearchStemBlockDictElNum (FILE *dictFile,
276 const block_idx &bIdx,
277 unsigned long entriesPerBlock,
278 unsigned long dictSize,
279 unsigned long elNum,
280 stem_block_dict_el &dictEl);
281bool SearchStemBlockDictEl (FILE *dictFile,
282 const block_idx &bIdx,
283 unsigned long entriesPerBlock,
284 unsigned long dictSize,
285 const UCArray &el,
286 stem_block_dict_el &dictEl,
287 unsigned long &elNum);
288
289
290
291
292#endif
Note: See TracBrowser for help on using the repository browser.