source: trunk/gsdl/packages/mg/src/text/mg_invf_dump.c@ 2526

Last change on this file since 2526 was 2526, checked in by kjm18, 23 years ago

changed the help message

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.8 KB
Line 
1/**************************************************************************
2 *
3 * mg_invf_dump.c -- Program to dump out an inverted file
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_invf_dump.c 2526 2001-06-12 01:41:39Z kjm18 $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "messages.h"
27#include "timing.h"
28#include "bitio_m.h"
29#include "bitio_m_stdio.h"
30#include "bitio_gen.h"
31#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
32
33#include "mg_files.h"
34#include "locallib.h"
35#include "words.h"
36#include "invf.h"
37
38extern unsigned long S_btg;
39
40/*
41 $Log$
42 Revision 1.2 2001/06/12 01:41:39 kjm18
43 changed the help message
44
45 Revision 1.1 1999/08/10 21:18:09 sjboddie
46 renamed mg-1.3d directory mg
47
48 Revision 1.2 1998/11/25 07:55:46 rjmcnab
49
50 Modified mg so that you can specify the stemmer you want
51 to use via a command line option. You specify it to
52 mg_passes during the build process. The number of the
53 stemmer that you used is stored within the inverted
54 dictionary header and the stemmed dictionary header so
55 the correct stemmer is used in later stages of building
56 and querying.
57
58 Revision 1.1 1998/11/17 09:35:05 rjmcnab
59 *** empty log message ***
60
61 * Revision 1.3 1994/11/29 00:32:01 tes
62 * Committing the new merged files and changes.
63 *
64 * Revision 1.2 1994/09/20 04:41:50 tes
65 * For version 1.1
66 *
67 */
68
69static char *RCSID = "$Id: mg_invf_dump.c 2526 2001-06-12 01:41:39Z kjm18 $";
70
71
72
73static void process_files (char *filename);
74
/* Output-mode switches; all start cleared and are raised by main()
   when the matching command-line option is seen. */
int binary = 0;			/* -b: write raw values with fwrite() instead of text */
int word_counts = 0;		/* -w: also emit the count decoded after each document number */
int term_dump = 0;		/* -t: in text mode, print each term beside its count */
78
79
80int main (int argc, char **argv)
81{
82 ProgTime start;
83 char *dir_name, *file_name = "";
84 int ch;
85 msg_prefix = argv[0];
86 dir_name = getenv ("MGDATA");
87 opterr = 0;
88 msg_prefix = argv[0];
89 while ((ch = getopt (argc, argv, "hbwtf:d:")) != -1)
90 switch (ch)
91 {
92 case 'f': /* input file */
93 file_name = optarg;
94 break;
95 case 'd':
96 set_basepath(optarg);
97 break;
98 case 'b':
99 binary = 1;
100 break;
101 case 'w':
102 word_counts = 1;
103 break;
104 case 't':
105 term_dump = 1;
106 break;
107 case 'h':
108 case '?':
109 fprintf (stderr, "usage: %s [-h] [-b] [-w] [-t] [-f input_file]"
110 "[-d data directory]\n"
111 "(b - binary mode, w - wordcount, t - term dump)\n", argv[0]);
112 exit (1);
113 }
114 GetTime (&start);
115 process_files (file_name);
116 Message ("%s\n", ElapsedTime (&start, NULL));
117 return 0;
118}
119
120static void
121process_files (char *name)
122{
123 unsigned long N, Nstatic, k;
124 FILE *dict;
125 FILE *invf;
126 struct invf_dict_header idh;
127 struct invf_file_header ifh;
128 int i; /* [RPAP - Jan 97: Endian Ordering] */
129
130 dict = open_file (name, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
131
132 fread ((char *) &idh, sizeof (idh), 1, dict);
133
134 /* [RPAP - Jan 97: Endian Ordering] */
135 NTOHUL(idh.lookback);
136 NTOHUL(idh.dict_size);
137 NTOHUL(idh.total_bytes);
138 NTOHUL(idh.index_string_bytes);
139 NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
140 NTOHUL(idh.num_of_docs);
141 NTOHUL(idh.static_num_of_docs);
142 NTOHUL(idh.num_of_words);
143 NTOHUL(idh.stemmer_num);
144 NTOHUL(idh.stem_method);
145
146 if (!(invf = open_file (name, INVF_SUFFIX ".ORG", "rb", MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
147 invf = open_file (name, INVF_SUFFIX, "rb", MAGIC_INVF, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
148
149 fread ((char *) &ifh, sizeof (ifh), 1, invf);
150
151 /* [RPAP - Jan 97: Endian Ordering] */
152 NTOHUL(ifh.no_of_words);
153 NTOHUL(ifh.no_of_ptrs);
154 NTOHUL(ifh.skip_mode);
155 for (i = 0; i < 16; i++)
156 NTOHUL(ifh.params[i]);
157 NTOHUL(ifh.InvfLevel);
158
159 if (ifh.skip_mode != 0)
160 FatalError (1, "The invf file contains skips. Unable to dump.");
161
162 DECODE_START (invf)
163 N = idh.num_of_docs;
164 Nstatic = idh.static_num_of_docs;
165 if (binary)
166 {
167 fwrite ((char *) &N, sizeof (N), 1, stdout);
168 fwrite ((char *) &ifh.no_of_words, sizeof (ifh.no_of_words), 1, stdout);
169 }
170 else
171 printf ("%ld %ld\n", N, ifh.no_of_words);
172 for (k = 0; k < ifh.no_of_words; k++)
173 {
174 int i, blk, doc;
175 register unsigned long suff, prefix;
176 unsigned long fcnt, wcnt;
177 char term[MAXSTEMLEN + 1];
178
179 prefix = fgetc (dict);
180 suff = fgetc (dict);
181 fread (&term[prefix], sizeof (char), suff, dict);
182 fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
183 fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
184 term[prefix + suff] = '\0';
185
186 /* [RPAP - Jan 97: Endian Ordering] */
187 NTOHUL(fcnt);
188 NTOHUL(wcnt);
189
190 if (binary)
191 fwrite ((char *) &fcnt, sizeof (fcnt), 1, stdout);
192 else
193 {
194 if (term_dump)
195 printf ("%ld \"%s\"\n", fcnt, term);
196 else
197 printf ("%ld\n", fcnt);
198 }
199 blk = BIO_Bblock_Init (Nstatic, fcnt);
200 for (doc = i = 0; i < fcnt; i++)
201 {
202 int num;
203 BBLOCK_DECODE (num, blk);
204 doc += num;
205 if (binary)
206 fwrite ((char *) &doc, sizeof (doc), 1, stdout);
207 else
208 printf (" %d", doc);
209 if (ifh.InvfLevel >= 2)
210 {
211 int count;
212 GAMMA_DECODE (count);
213 if (word_counts)
214 if (binary)
215 fwrite ((char *) &count, sizeof (count), 1, stdout);
216 else
217 printf (" %d", count);
218 }
219 if (!binary)
220 putchar ('\n');
221 }
222 while (__btg)
223 DECODE_BIT;
224 }
225 DECODE_DONE
226}
Note: See TracBrowser for help on using the repository browser.