source: trunk/gsdl/src/mgpp/text/mg_invf_dump.cpp@ 1898

Last change on this file since 1898 was 1898, checked in by kjm18, 23 years ago

added more command line options, and better help message

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.2 KB
Line 
1/**************************************************************************
2 *
3 * mg_invf_dump.cpp -- Program to dump out an inverted file
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_invf_dump.cpp 1898 2001-02-02 01:12:29Z kjm18 $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "messages.h"
27#include "bitio_m_stdio.h"
28#include "bitio_gen.h"
29#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
30
31#include "mg_files.h"
32#include "locallib.h"
33#include "words.h"
34#include "invf.h"
35#include "WordData.h"
36
37/*
38 $Log$
39 Revision 1.2 2001/02/02 01:12:29 kjm18
40 added more command line options, and better help message
41
42 Revision 1.1 2000/01/14 02:26:17 sjboddie
43 Rodgers new C++ mg
44
45 Revision 1.1 1999/10/11 02:57:55 cs025
46 Base install of MG-PP
47
48 Revision 1.1 1999/08/10 21:18:09 sjboddie
49 renamed mg-1.3d directory mg
50
51 Revision 1.2 1998/11/25 07:55:46 rjmcnab
52
53 Modified mg so that you can specify the stemmer you want
54 to use via a command line option. You specify it to
55 mg_passes during the build process. The number of the
56 stemmer that you used is stored within the inverted
57 dictionary header and the stemmed dictionary header so
58 the correct stemmer is used in later stages of building
59 and querying.
60
61 Revision 1.1 1998/11/17 09:35:05 rjmcnab
62 *** empty log message ***
63
64 * Revision 1.3 1994/11/29 00:32:01 tes
65 * Committing the new merged files and changes.
66 *
67 * Revision 1.2 1994/09/20 04:41:50 tes
68 * For version 1.1
69 *
70 */
71
72
73static void PrintInvfWord (FILE *invfFile,
74 invf_dict_header &idh,
75 invf_file_header &ifh,
76 word_dict_el &wordEl,
77 unsigned long wordStart,
78 bool printFrags) {
79 cout << wordEl.frag_occur << " \"" << wordEl.el << "\"\n";
80
81 if (printFrags) {
82 // seek to the appropriate place in the inverted file
83 fseek (invfFile, wordStart, SEEK_SET);
84
85 stdio_bitio_buffer buffer(invfFile);
86
87 unsigned long B = BIO_Bblock_Init (idh.num_frags, wordEl.frag_occur);
88 unsigned long fragNum = 0;
89 unsigned long i;
90 for (i=0; i<wordEl.frag_occur; i++) {
91 unsigned long delta = buffer.bblock_decode (B, NULL);
92 fragNum += delta;
93 cout << " " << fragNum;
94
95 if (!ifh.word_level_index ) {
96 unsigned long count = buffer.gamma_decode (NULL);
97 cout << "(" << count << ")";
98 } else {
99 cout << "(1)";
100 }
101 }
102
103 cout << "\n";
104
105 buffer.done();
106 }
107}
108
109static void PrintInvfTag (FILE *invfFile,
110 invf_dict_header &idh,
111 invf_file_header &/*ifh*/,
112 dict_el &tagEl,
113 unsigned long tagStart,
114 bool printFrags) {
115 cout << tagEl.frag_occur << " \"<" << tagEl.el << ">\"\n";
116
117 if (printFrags) {
118 // seek to the appropriate place in the inverted file
119 fseek (invfFile, tagStart, SEEK_SET);
120
121 stdio_bitio_buffer buffer(invfFile);
122
123 unsigned long pTag = tagEl.frag_occur*2;
124 unsigned long B = BIO_Bblock_Init (idh.num_frags+pTag, pTag);
125 unsigned long fragNum = 0;
126 unsigned long i;
127 for (i=0; i<tagEl.frag_occur; i++) {
128 unsigned long delta = buffer.bblock_decode (B, NULL)-1;
129 fragNum += delta;
130 cout << " " << fragNum;
131 cout << "-";
132 delta = buffer.bblock_decode (B, NULL)-1;
133 fragNum += delta;
134 cout << fragNum;
135 }
136
137 cout << "\n";
138
139 buffer.done();
140 }
141}
142
143static void PrintHeaderInfo (invf_dict_header &idh,
144 invf_file_header &ifh) {
145 cerr << "Lookback: " << idh.lookback << "\n";
146 cerr << "Word Dict Size: " << idh.word_dict_size << "\n";
147 cerr << "Tag Dict Size: " << idh.tag_dict_size << "\n";
148 cerr << "Num Documents: " << idh.num_docs << "\n";
149 cerr << "Num Fragments: " << idh.num_frags << "\n";
150 cerr << "Num Words: " << idh.num_words << "\n";
151
152 cerr << "Skip Mode: " << ifh.skip_mode << "\n";
153 cerr << "Word Level Index: " << ifh.word_level_index << "\n";
154
155 cerr << "\n";
156}
157
158
159static void process_files (char *filename,
160 bool printHeader,
161 bool printWords,
162 bool printTags,
163 bool printFrags) {
164 // open the dictionary
165 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
166 MAGIC_STEM_BUILD, MG_ABORT);
167 invf_dict_header idh;
168 idh.Read (dictFile);
169
170 // open the inverted file
171 FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb",
172 MAGIC_INVF, MG_ABORT);
173
174 invf_file_header ifh;
175 ifh.Read (invfFile);
176
177 if (ifh.skip_mode != SKIP_MODE_NO_SKIPS)
178 FatalError (1, "The invf file contains skips. Unable to dump.");
179
180 // print out header information
181 if (printHeader) {
182 PrintHeaderInfo (idh, ifh);
183 }
184
185 // open the inverted index
186 FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb",
187 MAGIC_INVI, MG_ABORT);
188
189 // go to the start of the word dictionary
190 fseek (dictFile, idh.word_dict_start, SEEK_SET);
191
192 // process all the words
193 if (printWords) {
194 unsigned long wordNum;
195 unsigned long wordStart;
196 word_dict_el wordEl;
197 wordEl.SetNumLevels (idh.num_levels);
198 for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
199 wordEl.Read (dictFile, idh.num_levels);
200 ReadUL (invfIdxFile, wordStart);
201 PrintInvfWord (invfFile, idh, ifh, wordEl, wordStart, printFrags);
202 }
203 }
204
205 // process all the tags
206 if (printTags) {
207 unsigned long tagNum;
208 unsigned long tagStart;
209 dict_el tagEl;
210 for (tagNum=0; tagNum<idh.tag_dict_size; tagNum++) {
211 tagEl.Read (dictFile);
212 ReadUL (invfIdxFile, tagStart);
213 PrintInvfTag (invfFile, idh, ifh, tagEl, tagStart, printFrags);
214 }
215 }
216 // close the open files
217 fclose (invfIdxFile);
218 fclose (invfFile);
219 fclose (dictFile);
220}
221
222
223int main (int argc, char **argv) {
224 char *dir_name, *filename = "";
225 int ch;
226 msg_prefix = argv[0];
227 dir_name = getenv ("MGDATA");
228 opterr = 0;
229
230 bool printHeader = false;
231 bool printWords = false;
232 bool printTags = false;
233 bool printFrags = false;
234
235 msg_prefix = argv[0];
236 while ((ch = getopt (argc, argv, "hrwtnf:d:")) != -1) {
237 switch (ch) {
238 case 'f': // input file
239 filename = optarg;
240 break;
241 case 'd':
242 set_basepath(optarg);
243 break;
244 case 'r':
245 printHeader = true;
246 break;
247 case 'w':
248 printWords = true;
249 break;
250 case 'n':
251 printFrags = true;
252 break;
253 case 't':
254 printTags = true;
255 break;
256 case 'h':
257 case '?':
258 fprintf (stderr, "usage: %s [-h] [-r] [-w] [-t] [-n] [-f input_file]"
259 "[-d data directory]\n(-rwnt:print header, words, tags, fragnums)\n",
260 argv[0]);
261 exit (1);
262 }
263 }
264
265 process_files (filename, printHeader, printWords, printTags, printFrags);
266
267 return 0;
268}
Note: See TracBrowser for help on using the repository browser.