source: trunk/gsdl/src/mgpp/text/mg_invf_dump.cpp@ 2468

Last change on this file since 2468 was 2468, checked in by sjboddie, 23 years ago

Fiddled about with mgpp to get it compiling on Windows under VC++ 6.0. I
still can't get it to compile under VC++ 4.2 because of some weird
behaviour in STLport.

Also tidied up a little and removed some of the old log information
that was scattered about in some of the files.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.5 KB
Line 
1/**************************************************************************
2 *
3 * mg_invf_dump.cpp -- Program to dump out an inverted file
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#define _XOPEN_SOURCE 1
23#define _XOPEN_SOURCE_EXTENDED 1
24
25// need this to avoid bizarre compiler problems under VC++ 6.0
26#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
27# include <iostream>
28#endif
29
30#if defined (__WIN32__)
31# include "getopt.h"
32#else
33# include <unistd.h>
34#endif
35
36#include "sysfuncs.h"
37#include "messages.h"
38#include "bitio_m_stdio.h"
39#include "bitio_gen.h"
40#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
41
42#include "mg_files.h"
43#include "locallib.h"
44#include "words.h"
45#include "invf.h"
46#include "WordData.h"
47
48static void PrintInvfWord (FILE *invfFile,
49 invf_dict_header &idh,
50 invf_file_header &ifh,
51 word_dict_el &wordEl,
52 unsigned long wordStart,
53 bool printFrags) {
54 cout << wordEl.frag_occur << " \"" << wordEl.el << "\"\n";
55
56 if (printFrags) {
57 // seek to the appropriate place in the inverted file
58 fseek (invfFile, wordStart, SEEK_SET);
59
60 stdio_bitio_buffer buffer(invfFile);
61
62 unsigned long B = BIO_Bblock_Init (idh.num_frags, wordEl.frag_occur);
63 unsigned long fragNum = 0;
64 unsigned long i;
65 for (i=0; i<wordEl.frag_occur; i++) {
66 unsigned long delta = buffer.bblock_decode (B, NULL);
67 fragNum += delta;
68 cout << " " << fragNum;
69
70 if (!ifh.word_level_index ) {
71 unsigned long count = buffer.gamma_decode (NULL);
72 cout << "(" << count << ")";
73 } else {
74 cout << "(1)";
75 }
76 }
77
78 cout << "\n";
79
80 buffer.done();
81 }
82}
83
84static void PrintInvfTag (FILE *invfFile,
85 invf_dict_header &idh,
86 invf_file_header &/*ifh*/,
87 dict_el &tagEl,
88 unsigned long tagStart,
89 bool printFrags) {
90 cout << tagEl.frag_occur << " \"<" << tagEl.el << ">\"\n";
91
92 if (printFrags) {
93 // seek to the appropriate place in the inverted file
94 fseek (invfFile, tagStart, SEEK_SET);
95
96 stdio_bitio_buffer buffer(invfFile);
97
98 unsigned long pTag = tagEl.frag_occur*2;
99 unsigned long B = BIO_Bblock_Init (idh.num_frags+pTag, pTag);
100 unsigned long fragNum = 0;
101 unsigned long i;
102 for (i=0; i<tagEl.frag_occur; i++) {
103 unsigned long delta = buffer.bblock_decode (B, NULL)-1;
104 fragNum += delta;
105 cout << " " << fragNum;
106 cout << "-";
107 delta = buffer.bblock_decode (B, NULL)-1;
108 fragNum += delta;
109 cout << fragNum;
110 }
111
112 cout << "\n";
113
114 buffer.done();
115 }
116}
117
118static void PrintHeaderInfo (invf_dict_header &idh,
119 invf_file_header &ifh) {
120 cerr << "Lookback: " << idh.lookback << "\n";
121 cerr << "Word Dict Size: " << idh.word_dict_size << "\n";
122 cerr << "Tag Dict Size: " << idh.tag_dict_size << "\n";
123 cerr << "Num Documents: " << idh.num_docs << "\n";
124 cerr << "Num Fragments: " << idh.num_frags << "\n";
125 cerr << "Num Words: " << idh.num_words << "\n";
126
127 cerr << "Skip Mode: " << ifh.skip_mode << "\n";
128 cerr << "Word Level Index: " << ifh.word_level_index << "\n";
129
130 cerr << "\n";
131}
132
133
134static void process_files (char *filename,
135 bool printHeader,
136 bool printWords,
137 bool printTags,
138 bool printFrags) {
139 // open the dictionary
140 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
141 MAGIC_STEM_BUILD, MG_ABORT);
142 invf_dict_header idh;
143 idh.Read (dictFile);
144
145 // open the inverted file
146 FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb",
147 MAGIC_INVF, MG_ABORT);
148
149 invf_file_header ifh;
150 ifh.Read (invfFile);
151
152 if (ifh.skip_mode != SKIP_MODE_NO_SKIPS)
153 FatalError (1, "The invf file contains skips. Unable to dump.");
154
155 // print out header information
156 if (printHeader) {
157 PrintHeaderInfo (idh, ifh);
158 }
159
160 // open the inverted index
161 FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb",
162 MAGIC_INVI, MG_ABORT);
163
164 // go to the start of the word dictionary
165 fseek (dictFile, idh.word_dict_start, SEEK_SET);
166
167 // process all the words
168 if (printWords) {
169 unsigned long wordNum;
170 unsigned long wordStart;
171 word_dict_el wordEl;
172 wordEl.SetNumLevels (idh.num_levels);
173 for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
174 wordEl.Read (dictFile, idh.num_levels);
175 ReadUL (invfIdxFile, wordStart);
176 PrintInvfWord (invfFile, idh, ifh, wordEl, wordStart, printFrags);
177 }
178 }
179
180 // process all the tags
181 if (printTags) {
182 unsigned long tagNum;
183 unsigned long tagStart;
184 dict_el tagEl;
185 for (tagNum=0; tagNum<idh.tag_dict_size; tagNum++) {
186 tagEl.Read (dictFile);
187 ReadUL (invfIdxFile, tagStart);
188 PrintInvfTag (invfFile, idh, ifh, tagEl, tagStart, printFrags);
189 }
190 }
191 // close the open files
192 fclose (invfIdxFile);
193 fclose (invfFile);
194 fclose (dictFile);
195}
196
197
198int main (int argc, char **argv) {
199 char *dir_name, *filename = "";
200 int ch;
201 msg_prefix = argv[0];
202 dir_name = getenv ("MGDATA");
203 opterr = 0;
204
205 bool printHeader = false;
206 bool printWords = false;
207 bool printTags = false;
208 bool printFrags = false;
209
210 msg_prefix = argv[0];
211 while ((ch = getopt (argc, argv, "hrwtnf:d:")) != -1) {
212 switch (ch) {
213 case 'f': // input file
214 filename = optarg;
215 break;
216 case 'd':
217 set_basepath(optarg);
218 break;
219 case 'r':
220 printHeader = true;
221 break;
222 case 'w':
223 printWords = true;
224 break;
225 case 'n':
226 printFrags = true;
227 break;
228 case 't':
229 printTags = true;
230 break;
231 case 'h':
232 case '?':
233 fprintf (stderr, "usage: %s [-h] [-r] [-w] [-t] [-n] [-f input_file]"
234 "[-d data directory]\n(-rwnt:print header, words, tags, fragnums)\n",
235 argv[0]);
236 exit (1);
237 }
238 }
239
240 process_files (filename, printHeader, printWords, printTags, printFrags);
241
242 return 0;
243}
Note: See TracBrowser for help on using the repository browser.