source: trunk/gsdl/packages/mg/src/text/mg_invf_dump.c@ 439

Last change on this file since 439 was 439, checked in by sjboddie, 25 years ago

renamed mg-1.3d directory mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.7 KB
Line 
1/**************************************************************************
2 *
3 * mg_invf_dump.c -- Program to dump out an inverted file
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_invf_dump.c 439 1999-08-10 21:23:37Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "messages.h"
27#include "timing.h"
28#include "bitio_m.h"
29#include "bitio_m_stdio.h"
30#include "bitio_gen.h"
31#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
32
33#include "mg_files.h"
34#include "locallib.h"
35#include "words.h"
36#include "invf.h"
37
38extern unsigned long S_btg;
39
40/*
41 $Log$
42 Revision 1.1 1999/08/10 21:18:09 sjboddie
43 renamed mg-1.3d directory mg
44
45 Revision 1.2 1998/11/25 07:55:46 rjmcnab
46
47 Modified mg so that you can specify the stemmer you want
48 to use via a command line option. You specify it to
49 mg_passes during the build process. The number of the
50 stemmer that you used is stored within the inverted
51 dictionary header and the stemmed dictionary header so
52 the correct stemmer is used in later stages of building
53 and querying.
54
55 Revision 1.1 1998/11/17 09:35:05 rjmcnab
56 *** empty log message ***
57
58 * Revision 1.3 1994/11/29 00:32:01 tes
59 * Committing the new merged files and changes.
60 *
61 * Revision 1.2 1994/09/20 04:41:50 tes
62 * For version 1.1
63 *
64 */
65
66static char *RCSID = "$Id: mg_invf_dump.c 439 1999-08-10 21:23:37Z sjboddie $";
67
68
69
70static void process_files (char *filename);
71
/* Set to 1 by the -b option in main(); process_files() then writes every
   value with fwrite() instead of printing formatted text. */
72int binary = 0;
/* Set to 1 by the -w option in main(); process_files() then also outputs
   the count decoded after each document number (only present when the
   inverted file header reports InvfLevel >= 2). */
73int word_counts = 0;
/* Set to 1 by the -t option in main(); in text mode process_files() then
   prints each term in double quotes next to its entry count. */
74int term_dump = 0;
75
76
77int main (int argc, char **argv)
78{
79 ProgTime start;
80 char *dir_name, *file_name = "";
81 int ch;
82 msg_prefix = argv[0];
83 dir_name = getenv ("MGDATA");
84 opterr = 0;
85 msg_prefix = argv[0];
86 while ((ch = getopt (argc, argv, "hbwtf:d:")) != -1)
87 switch (ch)
88 {
89 case 'f': /* input file */
90 file_name = optarg;
91 break;
92 case 'd':
93 set_basepath(optarg);
94 break;
95 case 'b':
96 binary = 1;
97 break;
98 case 'w':
99 word_counts = 1;
100 break;
101 case 't':
102 term_dump = 1;
103 break;
104 case 'h':
105 case '?':
106 fprintf (stderr, "usage: %s [-h] [-b] [-w] [-t] [-f input_file]"
107 "[-d data directory]\n", argv[0]);
108 exit (1);
109 }
110 GetTime (&start);
111 process_files (file_name);
112 Message ("%s\n", ElapsedTime (&start, NULL));
113 return 0;
114}
115
116static void
117process_files (char *name)
118{
119 unsigned long N, Nstatic, k;
120 FILE *dict;
121 FILE *invf;
122 struct invf_dict_header idh;
123 struct invf_file_header ifh;
124 int i; /* [RPAP - Jan 97: Endian Ordering] */
125
126 dict = open_file (name, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
127
128 fread ((char *) &idh, sizeof (idh), 1, dict);
129
130 /* [RPAP - Jan 97: Endian Ordering] */
131 NTOHUL(idh.lookback);
132 NTOHUL(idh.dict_size);
133 NTOHUL(idh.total_bytes);
134 NTOHUL(idh.index_string_bytes);
135 NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
136 NTOHUL(idh.num_of_docs);
137 NTOHUL(idh.static_num_of_docs);
138 NTOHUL(idh.num_of_words);
139 NTOHUL(idh.stemmer_num);
140 NTOHUL(idh.stem_method);
141
142 if (!(invf = open_file (name, INVF_SUFFIX ".ORG", "rb", MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
143 invf = open_file (name, INVF_SUFFIX, "rb", MAGIC_INVF, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
144
145 fread ((char *) &ifh, sizeof (ifh), 1, invf);
146
147 /* [RPAP - Jan 97: Endian Ordering] */
148 NTOHUL(ifh.no_of_words);
149 NTOHUL(ifh.no_of_ptrs);
150 NTOHUL(ifh.skip_mode);
151 for (i = 0; i < 16; i++)
152 NTOHUL(ifh.params[i]);
153 NTOHUL(ifh.InvfLevel);
154
155 if (ifh.skip_mode != 0)
156 FatalError (1, "The invf file contains skips. Unable to dump.");
157
158 DECODE_START (invf)
159 N = idh.num_of_docs;
160 Nstatic = idh.static_num_of_docs;
161 if (binary)
162 {
163 fwrite ((char *) &N, sizeof (N), 1, stdout);
164 fwrite ((char *) &ifh.no_of_words, sizeof (ifh.no_of_words), 1, stdout);
165 }
166 else
167 printf ("%ld %ld\n", N, ifh.no_of_words);
168 for (k = 0; k < ifh.no_of_words; k++)
169 {
170 int i, blk, doc;
171 register unsigned long suff, prefix;
172 unsigned long fcnt, wcnt;
173 char term[MAXSTEMLEN + 1];
174
175 prefix = fgetc (dict);
176 suff = fgetc (dict);
177 fread (&term[prefix], sizeof (char), suff, dict);
178 fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
179 fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
180 term[prefix + suff] = '\0';
181
182 /* [RPAP - Jan 97: Endian Ordering] */
183 NTOHUL(fcnt);
184 NTOHUL(wcnt);
185
186 if (binary)
187 fwrite ((char *) &fcnt, sizeof (fcnt), 1, stdout);
188 else
189 {
190 if (term_dump)
191 printf ("%ld \"%s\"\n", fcnt, term);
192 else
193 printf ("%ld\n", fcnt);
194 }
195 blk = BIO_Bblock_Init (Nstatic, fcnt);
196 for (doc = i = 0; i < fcnt; i++)
197 {
198 int num;
199 BBLOCK_DECODE (num, blk);
200 doc += num;
201 if (binary)
202 fwrite ((char *) &doc, sizeof (doc), 1, stdout);
203 else
204 printf (" %d", doc);
205 if (ifh.InvfLevel >= 2)
206 {
207 int count;
208 GAMMA_DECODE (count);
209 if (word_counts)
210 if (binary)
211 fwrite ((char *) &count, sizeof (count), 1, stdout);
212 else
213 printf (" %d", count);
214 }
215 if (!binary)
216 putchar ('\n');
217 }
218 while (__btg)
219 DECODE_BIT;
220 }
221 DECODE_DONE
222}
Note: See TracBrowser for help on using the repository browser.