source: trunk/gsdl/src/mgpp/text/mg_invf_dump.cpp@ 2442

Last change on this file since 2442 was 2442, checked in by jrm21, 23 years ago

portability changes, use getopt from unistd.h (all POSIX systems)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.4 KB
Line 
1/**************************************************************************
2 *
3 * mg_invf_dump.cpp -- Program to dump out an inverted file
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_invf_dump.cpp 2442 2001-05-17 04:38:16Z jrm21 $
21 *
22 **************************************************************************/
23#define _XOPEN_SOURCE 1
24#define _XOPEN_SOURCE_EXTENDED 1
25#include <unistd.h>
26
27#include "sysfuncs.h"
28
29#include "messages.h"
30#include "bitio_m_stdio.h"
31#include "bitio_gen.h"
32#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
33
34#include "mg_files.h"
35#include "locallib.h"
36#include "words.h"
37#include "invf.h"
38#include "WordData.h"
39
40/*
41 $Log$
42 Revision 1.3 2001/05/17 04:38:16 jrm21
43 portability changes, use getopt from unistd.h (all POSIX systems)
44
45 Revision 1.2 2001/02/02 01:12:29 kjm18
46 added more command line options, and better help message
47
48 Revision 1.1 2000/01/14 02:26:17 sjboddie
49 Rodgers new C++ mg
50
51 Revision 1.1 1999/10/11 02:57:55 cs025
52 Base install of MG-PP
53
54 Revision 1.1 1999/08/10 21:18:09 sjboddie
55 renamed mg-1.3d directory mg
56
57 Revision 1.2 1998/11/25 07:55:46 rjmcnab
58
59 Modified mg to that you can specify the stemmer you want
60 to use via a command line option. You specify it to
61 mg_passes during the build process. The number of the
62 stemmer that you used is stored within the inverted
63 dictionary header and the stemmed dictionary header so
64 the correct stemmer is used in later stages of building
65 and querying.
66
67 Revision 1.1 1998/11/17 09:35:05 rjmcnab
68 *** empty log message ***
69
70 * Revision 1.3 1994/11/29 00:32:01 tes
71 * Committing the new merged files and changes.
72 *
73 * Revision 1.2 1994/09/20 04:41:50 tes
74 * For version 1.1
75 *
76 */
77
78
79static void PrintInvfWord (FILE *invfFile,
80 invf_dict_header &idh,
81 invf_file_header &ifh,
82 word_dict_el &wordEl,
83 unsigned long wordStart,
84 bool printFrags) {
85 cout << wordEl.frag_occur << " \"" << wordEl.el << "\"\n";
86
87 if (printFrags) {
88 // seek to the appropriate place in the inverted file
89 fseek (invfFile, wordStart, SEEK_SET);
90
91 stdio_bitio_buffer buffer(invfFile);
92
93 unsigned long B = BIO_Bblock_Init (idh.num_frags, wordEl.frag_occur);
94 unsigned long fragNum = 0;
95 unsigned long i;
96 for (i=0; i<wordEl.frag_occur; i++) {
97 unsigned long delta = buffer.bblock_decode (B, NULL);
98 fragNum += delta;
99 cout << " " << fragNum;
100
101 if (!ifh.word_level_index ) {
102 unsigned long count = buffer.gamma_decode (NULL);
103 cout << "(" << count << ")";
104 } else {
105 cout << "(1)";
106 }
107 }
108
109 cout << "\n";
110
111 buffer.done();
112 }
113}
114
115static void PrintInvfTag (FILE *invfFile,
116 invf_dict_header &idh,
117 invf_file_header &/*ifh*/,
118 dict_el &tagEl,
119 unsigned long tagStart,
120 bool printFrags) {
121 cout << tagEl.frag_occur << " \"<" << tagEl.el << ">\"\n";
122
123 if (printFrags) {
124 // seek to the appropriate place in the inverted file
125 fseek (invfFile, tagStart, SEEK_SET);
126
127 stdio_bitio_buffer buffer(invfFile);
128
129 unsigned long pTag = tagEl.frag_occur*2;
130 unsigned long B = BIO_Bblock_Init (idh.num_frags+pTag, pTag);
131 unsigned long fragNum = 0;
132 unsigned long i;
133 for (i=0; i<tagEl.frag_occur; i++) {
134 unsigned long delta = buffer.bblock_decode (B, NULL)-1;
135 fragNum += delta;
136 cout << " " << fragNum;
137 cout << "-";
138 delta = buffer.bblock_decode (B, NULL)-1;
139 fragNum += delta;
140 cout << fragNum;
141 }
142
143 cout << "\n";
144
145 buffer.done();
146 }
147}
148
149static void PrintHeaderInfo (invf_dict_header &idh,
150 invf_file_header &ifh) {
151 cerr << "Lookback: " << idh.lookback << "\n";
152 cerr << "Word Dict Size: " << idh.word_dict_size << "\n";
153 cerr << "Tag Dict Size: " << idh.tag_dict_size << "\n";
154 cerr << "Num Documents: " << idh.num_docs << "\n";
155 cerr << "Num Fragments: " << idh.num_frags << "\n";
156 cerr << "Num Words: " << idh.num_words << "\n";
157
158 cerr << "Skip Mode: " << ifh.skip_mode << "\n";
159 cerr << "Word Level Index: " << ifh.word_level_index << "\n";
160
161 cerr << "\n";
162}
163
164
165static void process_files (char *filename,
166 bool printHeader,
167 bool printWords,
168 bool printTags,
169 bool printFrags) {
170 // open the dictionary
171 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
172 MAGIC_STEM_BUILD, MG_ABORT);
173 invf_dict_header idh;
174 idh.Read (dictFile);
175
176 // open the inverted file
177 FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb",
178 MAGIC_INVF, MG_ABORT);
179
180 invf_file_header ifh;
181 ifh.Read (invfFile);
182
183 if (ifh.skip_mode != SKIP_MODE_NO_SKIPS)
184 FatalError (1, "The invf file contains skips. Unable to dump.");
185
186 // print out header information
187 if (printHeader) {
188 PrintHeaderInfo (idh, ifh);
189 }
190
191 // open the inverted index
192 FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb",
193 MAGIC_INVI, MG_ABORT);
194
195 // go to the start of the word dictionary
196 fseek (dictFile, idh.word_dict_start, SEEK_SET);
197
198 // process all the words
199 if (printWords) {
200 unsigned long wordNum;
201 unsigned long wordStart;
202 word_dict_el wordEl;
203 wordEl.SetNumLevels (idh.num_levels);
204 for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
205 wordEl.Read (dictFile, idh.num_levels);
206 ReadUL (invfIdxFile, wordStart);
207 PrintInvfWord (invfFile, idh, ifh, wordEl, wordStart, printFrags);
208 }
209 }
210
211 // process all the tags
212 if (printTags) {
213 unsigned long tagNum;
214 unsigned long tagStart;
215 dict_el tagEl;
216 for (tagNum=0; tagNum<idh.tag_dict_size; tagNum++) {
217 tagEl.Read (dictFile);
218 ReadUL (invfIdxFile, tagStart);
219 PrintInvfTag (invfFile, idh, ifh, tagEl, tagStart, printFrags);
220 }
221 }
222 // close the open files
223 fclose (invfIdxFile);
224 fclose (invfFile);
225 fclose (dictFile);
226}
227
228
229int main (int argc, char **argv) {
230 char *dir_name, *filename = "";
231 int ch;
232 msg_prefix = argv[0];
233 dir_name = getenv ("MGDATA");
234 opterr = 0;
235
236 bool printHeader = false;
237 bool printWords = false;
238 bool printTags = false;
239 bool printFrags = false;
240
241 msg_prefix = argv[0];
242 while ((ch = getopt (argc, argv, "hrwtnf:d:")) != -1) {
243 switch (ch) {
244 case 'f': // input file
245 filename = optarg;
246 break;
247 case 'd':
248 set_basepath(optarg);
249 break;
250 case 'r':
251 printHeader = true;
252 break;
253 case 'w':
254 printWords = true;
255 break;
256 case 'n':
257 printFrags = true;
258 break;
259 case 't':
260 printTags = true;
261 break;
262 case 'h':
263 case '?':
264 fprintf (stderr, "usage: %s [-h] [-r] [-w] [-t] [-n] [-f input_file]"
265 "[-d data directory]\n(-rwnt:print header, words, tags, fragnums)\n",
266 argv[0]);
267 exit (1);
268 }
269 }
270
271 process_files (filename, printHeader, printWords, printTags, printFrags);
272
273 return 0;
274}
Note: See TracBrowser for help on using the repository browser.