source: main/tags/2.80/indexers/mgpp/text/mgpp_invf_dump.cpp@ 24540

Last change on this file since 24540 was 9613, checked in by kjdon, 19 years ago

added in x++ -> ++x changes submitted by Emanuel Dejanu

  • Property svn:keywords set to Author Date Id Revision
File size: 6.6 KB
Line 
1/**************************************************************************
2 *
3 * mgpp_invf_dump.cpp -- Program to dump out an inverted file
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 **************************************************************************/
21
22#define _XOPEN_SOURCE 1
23#define _XOPEN_SOURCE_EXTENDED 1
24
25// need this to avoid bizarre compiler problems under VC++ 6.0
26#if defined (__WIN32__) && !defined (GSDL_USE_IOS_H)
27# include <iostream>
28#endif
29
30/* getopt is in posix.2, so cygwin should have it in unistd, but doesn't */
31#if defined (__WIN32__) || defined (__CYGWIN__)
32# include "getopt_old.h"
33#else
34# include <unistd.h>
35#endif
36
37#include "sysfuncs.h"
38#include "messages.h"
39#include "bitio_m_stdio.h"
40#include "bitio_gen.h"
41#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
42
43#include "mg_files.h"
44#include "locallib.h"
45#include "words.h"
46#include "invf.h"
47#include "WordData.h"
48
49static void PrintInvfWord (FILE *invfFile,
50 invf_dict_header &idh,
51 invf_file_header &ifh,
52 word_dict_el &wordEl,
53 unsigned long wordStart,
54 bool printFrags) {
55 cout << wordEl.frag_occur << " \"" << wordEl.el << "\"\n";
56
57 if (printFrags) {
58 // seek to the appropriate place in the inverted file
59 fseek (invfFile, wordStart, SEEK_SET);
60
61 stdio_bitio_buffer buffer(invfFile);
62
63 unsigned long B = BIO_Bblock_Init (idh.num_frags, wordEl.frag_occur);
64 unsigned long fragNum = 0;
65 unsigned long i;
66 for (i=0; i<wordEl.frag_occur; ++i) {
67 unsigned long delta = buffer.bblock_decode (B, NULL);
68 fragNum += delta;
69 cout << " " << fragNum;
70
71 if (!ifh.word_level_index ) {
72 unsigned long count = buffer.gamma_decode (NULL);
73 cout << "(" << count << ")";
74 } else {
75 cout << "(1)";
76 }
77 }
78
79 cout << "\n";
80
81 buffer.done();
82 }
83}
84
85static void PrintInvfTag (FILE *invfFile,
86 invf_dict_header &idh,
87 invf_file_header &/*ifh*/,
88 dict_el &tagEl,
89 unsigned long tagStart,
90 bool printFrags) {
91 cout << tagEl.frag_occur << " \"<" << tagEl.el << ">\"\n";
92
93 if (printFrags) {
94 // seek to the appropriate place in the inverted file
95 fseek (invfFile, tagStart, SEEK_SET);
96
97 stdio_bitio_buffer buffer(invfFile);
98
99 unsigned long pTag = tagEl.frag_occur*2;
100 unsigned long B = BIO_Bblock_Init (idh.num_frags+pTag, pTag);
101 unsigned long fragNum = 0;
102 unsigned long i;
103 for (i=0; i<tagEl.frag_occur; ++i) {
104 unsigned long delta = buffer.bblock_decode (B, NULL)-1;
105 fragNum += delta;
106 cout << " " << fragNum;
107 cout << "-";
108 delta = buffer.bblock_decode (B, NULL)-1;
109 fragNum += delta;
110 cout << fragNum;
111 }
112
113 cout << "\n";
114
115 buffer.done();
116 }
117}
118
119static void PrintHeaderInfo (invf_dict_header &idh,
120 invf_file_header &ifh) {
121 cerr << "Lookback: " << idh.lookback << "\n";
122 cerr << "Word Dict Size: " << idh.word_dict_size << "\n";
123 cerr << "Tag Dict Size: " << idh.tag_dict_size << "\n";
124 cerr << "Num Documents: " << idh.num_docs << "\n";
125 cerr << "Num Fragments: " << idh.num_frags << "\n";
126 cerr << "Num Words: " << idh.num_words << "\n";
127
128 cerr << "Skip Mode: " << ifh.skip_mode << "\n";
129 cerr << "Word Level Index: " << ifh.word_level_index << "\n";
130
131 cerr << "\n";
132}
133
134
135static void process_files (char *filename,
136 bool printHeader,
137 bool printWords,
138 bool printTags,
139 bool printFrags) {
140 // open the dictionary
141 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
142 MAGIC_STEM_BUILD, MG_ABORT);
143 invf_dict_header idh;
144 idh.Read (dictFile);
145
146 // open the inverted file
147 FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb",
148 MAGIC_INVF, MG_ABORT);
149
150 invf_file_header ifh;
151 ifh.Read (invfFile);
152
153 if (ifh.skip_mode != SKIP_MODE_NO_SKIPS)
154 FatalError (1, "The invf file contains skips. Unable to dump.");
155
156 // print out header information
157 if (printHeader) {
158 PrintHeaderInfo (idh, ifh);
159 }
160
161 // open the inverted index
162 FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb",
163 MAGIC_INVI, MG_ABORT);
164
165 // go to the start of the word dictionary
166 fseek (dictFile, idh.word_dict_start, SEEK_SET);
167
168 // process all the words
169 if (printWords) {
170 unsigned long wordNum;
171 unsigned long wordStart;
172 word_dict_el wordEl;
173 wordEl.SetNumLevels (idh.num_levels);
174 for (wordNum=0; wordNum<idh.word_dict_size; ++wordNum) {
175 wordEl.Read (dictFile, idh.num_levels);
176 ReadUL (invfIdxFile, wordStart);
177 PrintInvfWord (invfFile, idh, ifh, wordEl, wordStart, printFrags);
178 }
179 }
180
181 // process all the tags
182 if (printTags) {
183 unsigned long tagNum;
184 unsigned long tagStart;
185 dict_el tagEl;
186 for (tagNum=0; tagNum<idh.tag_dict_size; ++tagNum) {
187 tagEl.Read (dictFile);
188 ReadUL (invfIdxFile, tagStart);
189 PrintInvfTag (invfFile, idh, ifh, tagEl, tagStart, printFrags);
190 }
191 }
192 // close the open files
193 fclose (invfIdxFile);
194 fclose (invfFile);
195 fclose (dictFile);
196}
197
198
199int main (int argc, char **argv) {
200 char *dir_name, *filename = "";
201 int ch;
202 msg_prefix = argv[0];
203 dir_name = getenv ("MGDATA");
204 opterr = 0;
205
206 bool printHeader = false;
207 bool printWords = false;
208 bool printTags = false;
209 bool printFrags = false;
210
211 msg_prefix = argv[0];
212 while ((ch = getopt (argc, argv, "hrwtnf:d:")) != -1) {
213 switch (ch) {
214 case 'f': // input file
215 filename = optarg;
216 break;
217 case 'd':
218 set_basepath(optarg);
219 break;
220 case 'r':
221 printHeader = true;
222 break;
223 case 'w':
224 printWords = true;
225 break;
226 case 'n':
227 printFrags = true;
228 break;
229 case 't':
230 printTags = true;
231 break;
232 case 'h':
233 case '?':
234 fprintf (stderr, "usage: %s [-h] [-r] [-w] [-t] [-n] [-f input_file]"
235 "[-d data directory]\n(-rwnt:print header, words, tags, fragnums)\n",
236 argv[0]);
237 exit (1);
238 }
239 }
240
241 process_files (filename, printHeader, printWords, printTags, printFrags);
242
243 return 0;
244}
Note: See TracBrowser for help on using the repository browser.