source: trunk/gsdl/packages/mg-1.3d/src/text/mgdictlist.c@ 34

Last change on this file since 34 was 34, checked in by rjmcnab, 26 years ago

Modified mg so that you can specify the stemmer you want
to use via a command line option. You specify it to
mg_passes during the build process. The number of the
stemmer that you used is stored within the inverted
dictionary header and the stemmed dictionary header so
the correct stemmer is used in later stages of building
and querying.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.5 KB
Line 
1/**************************************************************************
2 *
3 * mgdictlist.c -- Program to list a dictionary
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mgdictlist.c 34 1998-11-25 07:55:52Z rjmcnab $
21 *
22 **************************************************************************/
23
#include "sysfuncs.h"

#include <stdlib.h>		/* exit */

#include "messages.h"
#include "memlib.h"
#include "local_strings.h"
#include "netorder.h"		/* [RPAP - Jan 97: Endian Ordering] */

#include "mg_files.h"
#include "text.h"
#include "invf.h"
#include "locallib.h"
#include "words.h"
36
37/*
38 $Log$
39 Revision 1.2 1998/11/25 07:55:49 rjmcnab
40
41 Modified mg so that you can specify the stemmer you want
42 to use via a command line option. You specify it to
43 mg_passes during the build process. The number of the
44 stemmer that you used is stored within the inverted
45 dictionary header and the stemmed dictionary header so
46 the correct stemmer is used in later stages of building
47 and querying.
48
49 Revision 1.1 1998/11/17 09:35:24 rjmcnab
50 *** empty log message ***
51
52 * Revision 1.4 1994/11/29 00:32:07 tes
53 * Committing the new merged files and changes.
54 *
55 * Revision 1.3 1994/10/20 03:57:01 tes
56 * I have rewritten the boolean query optimiser and abstracted out the
57 * components of the boolean query.
58 *
59 * Revision 1.2 1994/09/20 04:41:56 tes
60 * For version 1.1
61 *
62 */
63
64static char *RCSID = "$Id: mgdictlist.c 34 1998-11-25 07:55:52Z rjmcnab $"; /* RCS identification string; not referenced by the code in this file */
65
66
67int quick = 0; /* set to 1 by main() when the first argument is "-q"; the dump routines test it to choose terse output */
68int no_of_words[2]; /* NOTE(review): not referenced anywhere in this file -- confirm whether another unit needs it */
69u_long maxcodelen[2]; /* NOTE(review): not referenced anywhere in this file -- confirm whether another unit needs it */
70
71char *dictname = ""; /* name of the file being dumped; main() points it at the command-line argument */
72
73
74
75
76void
77DumpStemDict (FILE * f)
78{
79 struct invf_dict_header idh;
80 int i;
81 u_char prev[MAXSTEMLEN + 1];
82
83 fread (&idh, sizeof (idh), 1, f);
84
85 /* [RPAP - Jan 97: Endian Ordering] */
86 NTOHUL(idh.lookback);
87 NTOHUL(idh.dict_size);
88 NTOHUL(idh.total_bytes);
89 NTOHUL(idh.index_string_bytes);
90 NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
91 NTOHUL(idh.num_of_docs);
92 NTOHUL(idh.static_num_of_docs);
93 NTOHUL(idh.num_of_words);
94 NTOHUL(idh.stemmer_num);
95 NTOHUL(idh.stem_method);
96
97 if (quick)
98 printf ("%ld\n", idh.dict_size);
99 else
100 {
101 printf ("# lookback = %lu\n", idh.lookback);
102 printf ("# dict size = %lu\n", idh.dict_size);
103 printf ("# total bytes = %lu\n", idh.total_bytes);
104 printf ("# index string bytes = %lu\n", idh.index_string_bytes);
105 printf ("# input bytes = %.0f\n", idh.input_bytes); /* [RJM 07/97: 4G limit] */
106 printf ("# num of docs = %lu\n", idh.num_of_docs);
107 printf ("# static num of docs = %lu\n", idh.static_num_of_docs);
108 printf ("# num of words = %lu\n", idh.num_of_words);
109 printf ("#\n");
110 }
111
112 for (i = 0; i < idh.dict_size; i++)
113 {
114 register unsigned long copy, suff;
115 unsigned long wcnt, fcnt;
116
117 /* build a new word on top of prev */
118 copy = getc (f);
119 suff = getc (f);
120 *prev = copy + suff;
121 fread (prev + copy + 1, sizeof (u_char), suff, f);
122
123 /* read other data, but no need to store it */
124 fread (&fcnt, sizeof (fcnt), 1, f);
125 fread (&wcnt, sizeof (wcnt), 1, f);
126
127 /* [RPAP - Jan 97: Endian Ordering] */
128 NTOHUL(fcnt);
129 NTOHUL(wcnt);
130
131 if (!quick)
132 {
133 printf ("%d: %8ld ", i, wcnt);
134 printf ("/ %5ld ", fcnt);
135 printf ("%2d %2ld\t\"", *prev, copy);
136 }
137 printf ("%s", word2str (prev));
138 if (quick)
139 printf (" %ld %ld\n", wcnt, fcnt);
140 else
141 {
142 putchar ('"');
143 putchar ('\n');
144 }
145 }
146}
147
148
149
150
151void
152ReadInWords (FILE * f)
153{
154 comp_frags_header cfh;
155 u_long *codes;
156 u_char prev[MAXSTEMLEN + 1];
157 int i;
158
159 if (Read_cfh (f, &cfh, NULL, NULL) == -1)
160 FatalError (1, "Unable to read in the dictionary");
161
162 printf ("#\n");
163 printf ("# max code len = %u\n", cfh.hd.maxcodelen);
164 printf ("# total bytes = %lu\n", cfh.uncompressed_size);
165 printf ("#\n");
166
167 if (!(codes = Generate_Huffman_Codes (&cfh.hd, NULL)))
168 FatalError (1, "no memory for huffman codes\n");
169
170 for (i = 0; i < cfh.hd.num_codes; i++)
171 {
172 register int val, copy, j, k;
173 char code[33];
174 val = fgetc (f);
175 copy = (val >> 4) & 0xf;
176 val &= 0xf;
177
178 fread (prev + copy + 1, sizeof (u_char), val, f);
179 *prev = val + copy;
180
181 for (k = 0, j = cfh.hd.clens[i] - 1; j >= 0; j--, k++)
182 code[k] = '0' + ((codes[i] >> j) & 1);
183 code[k] = '\0';
184
185 printf ("%d: %2d : %*s : \"%s\"\n", i, cfh.hd.clens[i],
186 cfh.hd.maxcodelen, code, word2str (prev));
187 }
188 Xfree (codes);
189 Xfree (cfh.hd.clens);
190}
191
192
193void
194ReadCharHuffman (FILE * f, char *title)
195{
196 int i;
197 huff_data hd;
198 u_long *codes;
199
200 if (Read_Huffman_Data (f, &hd, NULL, NULL) == -1)
201 FatalError (1, "Unable to read huffman data");
202
203 if (!(codes = Generate_Huffman_Codes (&hd, NULL)))
204 FatalError (1, "no memory for huffman codes\n");
205
206 printf ("#\n# %s\n#\n", title);
207 for (i = 0; i < hd.num_codes; i++)
208 if (hd.clens[i])
209 {
210 int j, k;
211 char code[33];
212 for (k = 0, j = hd.clens[i] - 1; j >= 0; j--, k++)
213 code[k] = '0' + ((codes[i] >> j) & 1);
214 code[k] = '\0';
215 printf ("%2d : %*s : \"%s\"\n", hd.clens[i],
216 hd.maxcodelen, code, char2str (i));
217 }
218 Xfree (codes);
219 Xfree (hd.clens);
220}
221
222
223void
224ReadLenHuffman (FILE * f, char *title)
225{
226 int i;
227 huff_data hd;
228 u_long *codes;
229
230 if (Read_Huffman_Data (f, &hd, NULL, NULL) == -1)
231 FatalError (1, "Unable to read huffman data");
232
233 if (!(codes = Generate_Huffman_Codes (&hd, NULL)))
234 FatalError (1, "no memory for huffman codes\n");
235
236 printf ("#\n# %s\n#\n", title);
237 for (i = 0; i < hd.num_codes; i++)
238 if (hd.clens[i])
239 {
240 int j, k;
241 char code[33];
242 for (k = 0, j = hd.clens[i] - 1; j >= 0; j--, k++)
243 code[k] = '0' + ((codes[i] >> j) & 1);
244 code[k] = '\0';
245 printf ("%2d : %*s : %d\n", hd.clens[i],
246 hd.maxcodelen, code, i);
247 }
248 Xfree (codes);
249 Xfree (hd.clens);
250}
251
252
253
254
255
256void
257DumpTextDict (FILE * f)
258{
259 struct compression_dict_header cdh;
260 int which;
261
262 if (Read_cdh (f, &cdh, NULL, NULL) == -1)
263 FatalError (1, "Unable to read dictionary header");
264 switch (cdh.dict_type)
265 {
266 case MG_COMPLETE_DICTIONARY:
267 printf ("# COMPLETE DICTIONARY\n");
268 break;
269 case MG_PARTIAL_DICTIONARY:
270 printf ("# PARTIAL DICTIONARY\n");
271 break;
272 case MG_SEED_DICTIONARY:
273 printf ("# SEED DICTIONARY\n");
274 break;
275 }
276 printf ("# num words = %lu\n", cdh.num_words[1]);
277 printf ("# num word chars = %lu\n", cdh.num_word_chars[1]);
278 printf ("# num non-words = %lu\n", cdh.num_words[0]);
279 printf ("# num non-word chars = %lu\n", cdh.num_word_chars[0]);
280 printf ("# lookback = %lu\n", cdh.lookback);
281
282 for (which = 0; which < 2; which++)
283 switch (cdh.dict_type)
284 {
285 case MG_COMPLETE_DICTIONARY:
286 {
287 ReadInWords (f);
288 }
289 break;
290 case MG_PARTIAL_DICTIONARY:
291 {
292 if (cdh.num_words[which])
293 ReadInWords (f);
294
295 ReadCharHuffman (f, "Characters");
296 ReadLenHuffman (f, "Lengths");
297 }
298 break;
299 case MG_SEED_DICTIONARY:
300 {
301 if (cdh.num_words[which])
302 ReadInWords (f);
303
304 ReadCharHuffman (f, "Characters");
305 ReadLenHuffman (f, "Lengths");
306 }
307 break;
308 }
309}
310
311
312
313
314void
315DumpStatsDict (FILE * f)
316{
317 int i;
318 compression_stats_header csh;
319
320 fread (&csh, sizeof (csh), 1, f);
321
322 for (i = 0; i < 2; i++)
323 {
324 int j;
325 frags_stats_header fsh;
326
327 fread (&fsh, sizeof (fsh), 1, f);
328
329 /* [RPAP - Jan 97: Endian Ordering] */
330 NTOHUL(fsh.num_frags);
331 NTOHUL(fsh.mem_for_frags);
332
333 if (!quick)
334 printf ("#\n# num %9s = %lu\n#\n", i ? "words" : "non-words",
335 fsh.num_frags);
336
337 for (j = 0; j < fsh.num_frags; j++)
338 {
339 u_char Word[16];
340 u_long freq, occur_num;
341
342 fread (&freq, sizeof (freq), 1, f);
343 fread (&occur_num, sizeof (occur_num), 1, f);
344
345 /* [RPAP - Jan 97: Endian Ordering] */
346 NTOHUL(freq);
347 NTOHUL(occur_num);
348
349 Word[0] = fgetc (f);
350 fread (Word + 1, Word[0], 1, f);
351 printf ("%d: %7ld : %7ld : \"%s\"\n", j, freq,
352 occur_num, word2str (Word));
353 }
354 }
355}
356
357
358int main (int argc, char **argv)
359{
360 FILE *fp;
361 unsigned long magic = 0;
362
363 if (argc < 2)
364 FatalError (1, "A file name must be specified");
365 dictname = argv[1];
366 if (strcmp (dictname, "-q") == 0)
367 {
368 quick = 1;
369 if (argc < 3)
370 FatalError (1, "A file name must be specified");
371 dictname = argv[2];
372 }
373 if (!(fp = fopen (dictname, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
374 FatalError (1, "Unable to open \"%s\"", dictname);
375
376 fread (&magic, sizeof (magic), 1, fp);
377
378 NTOHUL(magic); /* [RPAP - Jan 97: Endian Ordering] */
379
380 switch (magic)
381 {
382 case MAGIC_STEM_BUILD:
383 if (!quick)
384 printf ("# Contents of STEM file \"%s\"\n#\n", dictname);
385 DumpStemDict (fp);
386 break;
387 case MAGIC_DICT:
388 if (!quick)
389 printf ("# Contents of DICT file \"%s\"\n#\n", dictname);
390 DumpTextDict (fp);
391 break;
392 case MAGIC_STATS_DICT:
393 if (!quick)
394 printf ("# Contents of STATS file \"%s\"\n#\n", dictname);
395 DumpStatsDict (fp);
396 break;
397 default:
398 FatalError (1, "Bad magic number. \"%s\" cannot be dumped", dictname);
399 }
400 fclose (fp);
401 return 0;
402}
Note: See TracBrowser for help on using the repository browser.