source: main/branches/64_bit_Greenstone/greenstone2/common-src/indexers/mg/src/text/mg_invf_dump.c@ 23508

Last change on this file since 23508 was 23508, checked in by sjm84, 13 years ago

Committing 64 bit changes into the branch

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.9 KB
Line 
1/**************************************************************************
2 *
3 * mg_invf_dump.c -- Program to dump out an inverted file
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_invf_dump.c 23508 2010-12-17 01:04:10Z sjm84 $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "messages.h"
27#include "timing.h"
28#include "bitio_m.h"
29#include "bitio_m_stdio.h"
30#include "bitio_gen.h"
31#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
32
33#include "mg_files.h"
34#include "locallib.h"
35#include "words.h"
36#include "invf.h"
37
38extern mg_u_long S_btg;
39
40/*
41 $Log$
42 Revision 1.1 2003/02/20 21:18:24 mdewsnip
43 Addition of MG package for search and retrieval
44
45 Revision 1.2 2001/06/12 01:41:39 kjm18
46 changed the help message
47
48 Revision 1.1 1999/08/10 21:18:09 sjboddie
49 renamed mg-1.3d directory mg
50
51 Revision 1.2 1998/11/25 07:55:46 rjmcnab
52
53 Modified mg to that you can specify the stemmer you want
54 to use via a command line option. You specify it to
55 mg_passes during the build process. The number of the
56 stemmer that you used is stored within the inverted
57 dictionary header and the stemmed dictionary header so
58 the correct stemmer is used in later stages of building
59 and querying.
60
61 Revision 1.1 1998/11/17 09:35:05 rjmcnab
62 *** empty log message ***
63
64 * Revision 1.3 1994/11/29 00:32:01 tes
65 * Committing the new merged files and changes.
66 *
67 * Revision 1.2 1994/09/20 04:41:50 tes
68 * For version 1.1
69 *
70 */
71
72static char *RCSID = "$Id: mg_invf_dump.c 23508 2010-12-17 01:04:10Z sjm84 $"; /* version-control $Id$ keyword string for this file; not referenced by the code visible here */
73
74
75
76static void process_files (char *filename);
77
78int binary = 0; /* set by -b: process_files emits raw fwrite() records instead of printf() text */
79int word_counts = 0; /* set by -w: also emit the per-document count decoded when InvfLevel >= 2 */
80int term_dump = 0; /* set by -t: in text mode, print each term in quotes after its entry count */
81
82
83int main (int argc, char **argv)
84{
85 ProgTime start;
86 char *dir_name, *file_name = "";
87 int ch;
88 msg_prefix = argv[0];
89 dir_name = getenv ("MGDATA");
90 opterr = 0;
91 msg_prefix = argv[0];
92 while ((ch = getopt (argc, argv, "hbwtf:d:")) != -1)
93 switch (ch)
94 {
95 case 'f': /* input file */
96 file_name = optarg;
97 break;
98 case 'd':
99 set_basepath(optarg);
100 break;
101 case 'b':
102 binary = 1;
103 break;
104 case 'w':
105 word_counts = 1;
106 break;
107 case 't':
108 term_dump = 1;
109 break;
110 case 'h':
111 case '?':
112 fprintf (stderr, "usage: %s [-h] [-b] [-w] [-t] [-f input_file]"
113 "[-d data directory]\n"
114 "(b - binary mode, w - wordcount, t - term dump)\n", argv[0]);
115 exit (1);
116 }
117 GetTime (&start);
118 process_files (file_name);
119 Message ("%s\n", ElapsedTime (&start, NULL));
120 return 0;
121}
122
123static void
124process_files (char *name)
125{
126 mg_u_long N, Nstatic, k;
127 FILE *dict;
128 FILE *invf;
129 struct invf_dict_header idh;
130 struct invf_file_header ifh;
131 int i; /* [RPAP - Jan 97: Endian Ordering] */
132
133 dict = open_file (name, INVF_DICT_SUFFIX, "rb", MAGIC_STEM_BUILD, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
134
135 fread ((char *) &idh, sizeof (idh), 1, dict);
136
137 /* [RPAP - Jan 97: Endian Ordering] */
138 NTOHUL(idh.lookback);
139 NTOHUL(idh.dict_size);
140 NTOHUL(idh.total_bytes);
141 NTOHUL(idh.index_string_bytes);
142 NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
143 NTOHUL(idh.num_of_docs);
144 NTOHUL(idh.static_num_of_docs);
145 NTOHUL(idh.num_of_words);
146 NTOHUL(idh.stemmer_num);
147 NTOHUL(idh.stem_method);
148
149 if (!(invf = open_file (name, INVF_SUFFIX ".ORG", "rb", MAGIC_INVF, MG_CONTINUE))) /* [RPAP - Feb 97: WIN32 Port] */
150 invf = open_file (name, INVF_SUFFIX, "rb", MAGIC_INVF, MG_ABORT); /* [RPAP - Feb 97: WIN32 Port] */
151
152 fread ((char *) &ifh, sizeof (ifh), 1, invf);
153
154 /* [RPAP - Jan 97: Endian Ordering] */
155 NTOHUL(ifh.no_of_words);
156 NTOHUL(ifh.no_of_ptrs);
157 NTOHUL(ifh.skip_mode);
158 for (i = 0; i < 16; i++)
159 NTOHUL(ifh.params[i]);
160 NTOHUL(ifh.InvfLevel);
161
162 if (ifh.skip_mode != 0)
163 FatalError (1, "The invf file contains skips. Unable to dump.");
164
165 DECODE_START (invf)
166 N = idh.num_of_docs;
167 Nstatic = idh.static_num_of_docs;
168 if (binary)
169 {
170 fwrite ((char *) &N, sizeof (N), 1, stdout);
171 fwrite ((char *) &ifh.no_of_words, sizeof (ifh.no_of_words), 1, stdout);
172 }
173 else
174 printf ("%d %d\n", N, ifh.no_of_words);
175 for (k = 0; k < ifh.no_of_words; k++)
176 {
177 int i, blk, doc;
178 register mg_u_long suff, prefix;
179 mg_u_long fcnt, wcnt;
180 char term[MAXSTEMLEN + 1];
181
182 prefix = fgetc (dict);
183 suff = fgetc (dict);
184 fread (&term[prefix], sizeof (char), suff, dict);
185 fread ((char *) &fcnt, sizeof (fcnt), 1, dict);
186 fread ((char *) &wcnt, sizeof (wcnt), 1, dict);
187 term[prefix + suff] = '\0';
188
189 /* [RPAP - Jan 97: Endian Ordering] */
190 NTOHUL(fcnt);
191 NTOHUL(wcnt);
192
193 if (binary)
194 fwrite ((char *) &fcnt, sizeof (fcnt), 1, stdout);
195 else
196 {
197 if (term_dump)
198 printf ("%d \"%s\"\n", fcnt, term);
199 else
200 printf ("%d\n", fcnt);
201 }
202 blk = BIO_Bblock_Init (Nstatic, fcnt);
203 for (doc = i = 0; i < fcnt; i++)
204 {
205 int num;
206 BBLOCK_DECODE (num, blk);
207 doc += num;
208 if (binary)
209 fwrite ((char *) &doc, sizeof (doc), 1, stdout);
210 else
211 printf (" %d", doc);
212 if (ifh.InvfLevel >= 2)
213 {
214 int count;
215 GAMMA_DECODE (count);
216 if (word_counts)
217 {
218 if (binary)
219 {
220 fwrite ((char *) &count, sizeof (count), 1, stdout);
221 }
222 else
223 {
224 printf (" %d", count);
225 }
226 }
227 }
228 if (!binary)
229 putchar ('\n');
230 }
231 while (__btg)
232 DECODE_BIT;
233 }
234 DECODE_DONE
235}
Note: See TracBrowser for help on using the repository browser.