source: trunk/gsdl/src/mgpp/text/mg_invf_dump.cpp@ 856

Last change on this file since 856 was 856, checked in by sjboddie, 24 years ago

Rodgers new C++ mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.3 KB
Line 
1/**************************************************************************
2 *
3 * mg_invf_dump.cpp -- Program to dump out an inverted file
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_invf_dump.cpp 856 2000-01-14 02:26:25Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "messages.h"
27#include "bitio_m_stdio.h"
28#include "bitio_gen.h"
29#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
30
31#include "mg_files.h"
32#include "locallib.h"
33#include "words.h"
34#include "invf.h"
35#include "WordData.h"
36
37/*
38 $Log$
39 Revision 1.1 2000/01/14 02:26:17 sjboddie
40 Rodgers new C++ mg
41
42 Revision 1.1 1999/10/11 02:57:55 cs025
43 Base install of MG-PP
44
45 Revision 1.1 1999/08/10 21:18:09 sjboddie
46 renamed mg-1.3d directory mg
47
48 Revision 1.2 1998/11/25 07:55:46 rjmcnab
49
50 Modified mg to that you can specify the stemmer you want
51 to use via a command line option. You specify it to
52 mg_passes during the build process. The number of the
53 stemmer that you used is stored within the inverted
54 dictionary header and the stemmed dictionary header so
55 the correct stemmer is used in later stages of building
56 and querying.
57
58 Revision 1.1 1998/11/17 09:35:05 rjmcnab
59 *** empty log message ***
60
61 * Revision 1.3 1994/11/29 00:32:01 tes
62 * Committing the new merged files and changes.
63 *
64 * Revision 1.2 1994/09/20 04:41:50 tes
65 * For version 1.1
66 *
67 */
68
69
70static void PrintInvfWord (FILE *invfFile,
71 invf_dict_header &idh,
72 invf_file_header &ifh,
73 word_dict_el &wordEl,
74 unsigned long wordStart) {
75 cout << wordEl.frag_occur << " \"" << wordEl.el << "\"\n";
76
77 // seek to the appropriate place in the inverted file
78 fseek (invfFile, wordStart, SEEK_SET);
79
80 stdio_bitio_buffer buffer(invfFile);
81
82 unsigned long B = BIO_Bblock_Init (idh.num_frags, wordEl.frag_occur);
83 unsigned long fragNum = 0;
84 unsigned long i;
85 for (i=0; i<wordEl.frag_occur; i++) {
86 unsigned long delta = buffer.bblock_decode (B, NULL);
87 fragNum += delta;
88 cout << " " << fragNum;
89
90 if (!ifh.word_level_index ) {
91 unsigned long count = buffer.gamma_decode (NULL);
92 cout << "(" << count << ")";
93 } else {
94 cout << "(1)";
95 }
96 }
97
98 cout << "\n";
99
100 buffer.done();
101}
102
103static void PrintInvfTag (FILE *invfFile,
104 invf_dict_header &idh,
105 invf_file_header &/*ifh*/,
106 dict_el &tagEl,
107 unsigned long tagStart) {
108 cout << tagEl.frag_occur << " \"<" << tagEl.el << ">\"\n";
109
110 // seek to the appropriate place in the inverted file
111 fseek (invfFile, tagStart, SEEK_SET);
112
113 stdio_bitio_buffer buffer(invfFile);
114
115 unsigned long pTag = tagEl.frag_occur*2;
116 unsigned long B = BIO_Bblock_Init (idh.num_frags+pTag, pTag);
117 unsigned long fragNum = 0;
118 unsigned long i;
119 for (i=0; i<tagEl.frag_occur; i++) {
120 unsigned long delta = buffer.bblock_decode (B, NULL)-1;
121 fragNum += delta;
122 cout << " " << fragNum;
123 cout << "-";
124 delta = buffer.bblock_decode (B, NULL)-1;
125 fragNum += delta;
126 cout << fragNum;
127 }
128
129 cout << "\n";
130
131 buffer.done();
132}
133
134static void PrintHeaderInfo (invf_dict_header &idh,
135 invf_file_header &ifh) {
136 cerr << "Lookback: " << idh.lookback << "\n";
137 cerr << "Word Dict Size: " << idh.word_dict_size << "\n";
138 cerr << "Tag Dict Size: " << idh.tag_dict_size << "\n";
139 cerr << "Num Documents: " << idh.num_docs << "\n";
140 cerr << "Num Fragments: " << idh.num_frags << "\n";
141 cerr << "Num Words: " << idh.num_words << "\n";
142
143 cerr << "Skip Mode: " << ifh.skip_mode << "\n";
144 cerr << "Word Level Index: " << ifh.word_level_index << "\n";
145
146 cerr << "\n";
147}
148
149
150static void process_files (char *filename) {
151 // open the dictionary
152 FILE *dictFile = open_file (filename, INVF_DICT_SUFFIX, "rb",
153 MAGIC_STEM_BUILD, MG_ABORT);
154 invf_dict_header idh;
155 idh.Read (dictFile);
156
157 // open the inverted file
158 FILE *invfFile = open_file (filename, INVF_SUFFIX, "rb",
159 MAGIC_INVF, MG_ABORT);
160
161 invf_file_header ifh;
162 ifh.Read (invfFile);
163
164 if (ifh.skip_mode != SKIP_MODE_NO_SKIPS)
165 FatalError (1, "The invf file contains skips. Unable to dump.");
166
167 // print out header information
168 PrintHeaderInfo (idh, ifh);
169
170 // open the inverted index
171 FILE *invfIdxFile = open_file (filename, INVF_IDX_SUFFIX, "rb",
172 MAGIC_INVI, MG_ABORT);
173
174 // go to the start of the word dictionary
175 fseek (dictFile, idh.word_dict_start, SEEK_SET);
176
177 // process all the words
178 unsigned long wordNum;
179 unsigned long wordStart;
180 word_dict_el wordEl;
181 wordEl.SetNumLevels (idh.num_levels);
182 for (wordNum=0; wordNum<idh.word_dict_size; wordNum++) {
183 wordEl.Read (dictFile, idh.num_levels);
184 ReadUL (invfIdxFile, wordStart);
185 PrintInvfWord (invfFile, idh, ifh, wordEl, wordStart);
186 }
187
188 // process all the tags
189 unsigned long tagNum;
190 unsigned long tagStart;
191 dict_el tagEl;
192 for (tagNum=0; tagNum<idh.tag_dict_size; tagNum++) {
193 tagEl.Read (dictFile);
194 ReadUL (invfIdxFile, tagStart);
195 PrintInvfTag (invfFile, idh, ifh, tagEl, tagStart);
196 }
197
198 // close the open files
199 fclose (invfIdxFile);
200 fclose (invfFile);
201 fclose (dictFile);
202}
203
204
205int main (int argc, char **argv) {
206 char *dir_name, *filename = "";
207 int ch;
208 msg_prefix = argv[0];
209 dir_name = getenv ("MGDATA");
210 opterr = 0;
211
212 msg_prefix = argv[0];
213 while ((ch = getopt (argc, argv, "hbwtf:d:")) != -1) {
214 switch (ch) {
215 case 'f': // input file
216 filename = optarg;
217 break;
218 case 'd':
219 set_basepath(optarg);
220 break;
221 case 'h':
222 case '?':
223 fprintf (stderr, "usage: %s [-h] [-b] [-w] [-t] [-f input_file]"
224 "[-d data directory]\n", argv[0]);
225 exit (1);
226 }
227 }
228
229 process_files (filename);
230
231 return 0;
232}
Note: See TracBrowser for help on using the repository browser.