source: trunk/gsdl/packages/mg/src/text/mgdictlist.c@ 1014

Last change on this file since 1014 was 439, checked in by sjboddie, 25 years ago

renamed mg-1.3d directory mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.6 KB
Line 
1/**************************************************************************
2 *
3 * mgdictlist.c -- Program to list a dictionary
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mgdictlist.c 439 1999-08-10 21:23:37Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "messages.h"
27#include "memlib.h"
28#include "local_strings.h"
29#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
30
31#include "mg_files.h"
32#include "text.h"
33#include "invf.h"
34#include "locallib.h"
35#include "words.h"
36
37/*
38 $Log$
39 Revision 1.1 1999/08/10 21:18:17 sjboddie
40 renamed mg-1.3d directory mg
41
42 Revision 1.2 1998/11/25 07:55:49 rjmcnab
43
44 Modified mg so that you can specify the stemmer you want
45 to use via a command line option. You specify it to
46 mg_passes during the build process. The number of the
47 stemmer that you used is stored within the inverted
48 dictionary header and the stemmed dictionary header so
49 the correct stemmer is used in later stages of building
50 and querying.
51
52 Revision 1.1 1998/11/17 09:35:24 rjmcnab
53 *** empty log message ***
54
55 * Revision 1.4 1994/11/29 00:32:07 tes
56 * Committing the new merged files and changes.
57 *
58 * Revision 1.3 1994/10/20 03:57:01 tes
59 * I have rewritten the boolean query optimiser and abstracted out the
60 * components of the boolean query.
61 *
62 * Revision 1.2 1994/09/20 04:41:56 tes
63 * For version 1.1
64 *
65 */
66
67static char *RCSID = "$Id: mgdictlist.c 439 1999-08-10 21:23:37Z sjboddie $";
68
69
70int quick = 0;
71int no_of_words[2];
72u_long maxcodelen[2];
73
74char *dictname = "";
75
76
77
78
79void
80DumpStemDict (FILE * f)
81{
82 struct invf_dict_header idh;
83 int i;
84 u_char prev[MAXSTEMLEN + 1];
85
86 fread (&idh, sizeof (idh), 1, f);
87
88 /* [RPAP - Jan 97: Endian Ordering] */
89 NTOHUL(idh.lookback);
90 NTOHUL(idh.dict_size);
91 NTOHUL(idh.total_bytes);
92 NTOHUL(idh.index_string_bytes);
93 NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
94 NTOHUL(idh.num_of_docs);
95 NTOHUL(idh.static_num_of_docs);
96 NTOHUL(idh.num_of_words);
97 NTOHUL(idh.stemmer_num);
98 NTOHUL(idh.stem_method);
99
100 if (quick)
101 printf ("%ld\n", idh.dict_size);
102 else
103 {
104 printf ("# lookback = %lu\n", idh.lookback);
105 printf ("# dict size = %lu\n", idh.dict_size);
106 printf ("# total bytes = %lu\n", idh.total_bytes);
107 printf ("# index string bytes = %lu\n", idh.index_string_bytes);
108 printf ("# input bytes = %.0f\n", idh.input_bytes); /* [RJM 07/97: 4G limit] */
109 printf ("# num of docs = %lu\n", idh.num_of_docs);
110 printf ("# static num of docs = %lu\n", idh.static_num_of_docs);
111 printf ("# num of words = %lu\n", idh.num_of_words);
112 printf ("#\n");
113 }
114
115 for (i = 0; i < idh.dict_size; i++)
116 {
117 register unsigned long copy, suff;
118 unsigned long wcnt, fcnt;
119
120 /* build a new word on top of prev */
121 copy = getc (f);
122 suff = getc (f);
123 *prev = copy + suff;
124 fread (prev + copy + 1, sizeof (u_char), suff, f);
125
126 /* read other data, but no need to store it */
127 fread (&fcnt, sizeof (fcnt), 1, f);
128 fread (&wcnt, sizeof (wcnt), 1, f);
129
130 /* [RPAP - Jan 97: Endian Ordering] */
131 NTOHUL(fcnt);
132 NTOHUL(wcnt);
133
134 if (!quick)
135 {
136 printf ("%d: %8ld ", i, wcnt);
137 printf ("/ %5ld ", fcnt);
138 printf ("%2d %2ld\t\"", *prev, copy);
139 }
140 printf ("%s", word2str (prev));
141 if (quick)
142 printf (" %ld %ld\n", wcnt, fcnt);
143 else
144 {
145 putchar ('"');
146 putchar ('\n');
147 }
148 }
149}
150
151
152
153
154void
155ReadInWords (FILE * f)
156{
157 comp_frags_header cfh;
158 u_long *codes;
159 u_char prev[MAXSTEMLEN + 1];
160 int i;
161
162 if (Read_cfh (f, &cfh, NULL, NULL) == -1)
163 FatalError (1, "Unable to read in the dictionary");
164
165 printf ("#\n");
166 printf ("# max code len = %u\n", cfh.hd.maxcodelen);
167 printf ("# total bytes = %lu\n", cfh.uncompressed_size);
168 printf ("#\n");
169
170 if (!(codes = Generate_Huffman_Codes (&cfh.hd, NULL)))
171 FatalError (1, "no memory for huffman codes\n");
172
173 for (i = 0; i < cfh.hd.num_codes; i++)
174 {
175 register int val, copy, j, k;
176 char code[33];
177 val = fgetc (f);
178 copy = (val >> 4) & 0xf;
179 val &= 0xf;
180
181 fread (prev + copy + 1, sizeof (u_char), val, f);
182 *prev = val + copy;
183
184 for (k = 0, j = cfh.hd.clens[i] - 1; j >= 0; j--, k++)
185 code[k] = '0' + ((codes[i] >> j) & 1);
186 code[k] = '\0';
187
188 printf ("%d: %2d : %*s : \"%s\"\n", i, cfh.hd.clens[i],
189 cfh.hd.maxcodelen, code, word2str (prev));
190 }
191 Xfree (codes);
192 Xfree (cfh.hd.clens);
193}
194
195
196void
197ReadCharHuffman (FILE * f, char *title)
198{
199 int i;
200 huff_data hd;
201 u_long *codes;
202
203 if (Read_Huffman_Data (f, &hd, NULL, NULL) == -1)
204 FatalError (1, "Unable to read huffman data");
205
206 if (!(codes = Generate_Huffman_Codes (&hd, NULL)))
207 FatalError (1, "no memory for huffman codes\n");
208
209 printf ("#\n# %s\n#\n", title);
210 for (i = 0; i < hd.num_codes; i++)
211 if (hd.clens[i])
212 {
213 int j, k;
214 char code[33];
215 for (k = 0, j = hd.clens[i] - 1; j >= 0; j--, k++)
216 code[k] = '0' + ((codes[i] >> j) & 1);
217 code[k] = '\0';
218 printf ("%2d : %*s : \"%s\"\n", hd.clens[i],
219 hd.maxcodelen, code, char2str (i));
220 }
221 Xfree (codes);
222 Xfree (hd.clens);
223}
224
225
226void
227ReadLenHuffman (FILE * f, char *title)
228{
229 int i;
230 huff_data hd;
231 u_long *codes;
232
233 if (Read_Huffman_Data (f, &hd, NULL, NULL) == -1)
234 FatalError (1, "Unable to read huffman data");
235
236 if (!(codes = Generate_Huffman_Codes (&hd, NULL)))
237 FatalError (1, "no memory for huffman codes\n");
238
239 printf ("#\n# %s\n#\n", title);
240 for (i = 0; i < hd.num_codes; i++)
241 if (hd.clens[i])
242 {
243 int j, k;
244 char code[33];
245 for (k = 0, j = hd.clens[i] - 1; j >= 0; j--, k++)
246 code[k] = '0' + ((codes[i] >> j) & 1);
247 code[k] = '\0';
248 printf ("%2d : %*s : %d\n", hd.clens[i],
249 hd.maxcodelen, code, i);
250 }
251 Xfree (codes);
252 Xfree (hd.clens);
253}
254
255
256
257
258
259void
260DumpTextDict (FILE * f)
261{
262 struct compression_dict_header cdh;
263 int which;
264
265 if (Read_cdh (f, &cdh, NULL, NULL) == -1)
266 FatalError (1, "Unable to read dictionary header");
267 switch (cdh.dict_type)
268 {
269 case MG_COMPLETE_DICTIONARY:
270 printf ("# COMPLETE DICTIONARY\n");
271 break;
272 case MG_PARTIAL_DICTIONARY:
273 printf ("# PARTIAL DICTIONARY\n");
274 break;
275 case MG_SEED_DICTIONARY:
276 printf ("# SEED DICTIONARY\n");
277 break;
278 }
279 printf ("# num words = %lu\n", cdh.num_words[1]);
280 printf ("# num word chars = %lu\n", cdh.num_word_chars[1]);
281 printf ("# num non-words = %lu\n", cdh.num_words[0]);
282 printf ("# num non-word chars = %lu\n", cdh.num_word_chars[0]);
283 printf ("# lookback = %lu\n", cdh.lookback);
284
285 for (which = 0; which < 2; which++)
286 switch (cdh.dict_type)
287 {
288 case MG_COMPLETE_DICTIONARY:
289 {
290 ReadInWords (f);
291 }
292 break;
293 case MG_PARTIAL_DICTIONARY:
294 {
295 if (cdh.num_words[which])
296 ReadInWords (f);
297
298 ReadCharHuffman (f, "Characters");
299 ReadLenHuffman (f, "Lengths");
300 }
301 break;
302 case MG_SEED_DICTIONARY:
303 {
304 if (cdh.num_words[which])
305 ReadInWords (f);
306
307 ReadCharHuffman (f, "Characters");
308 ReadLenHuffman (f, "Lengths");
309 }
310 break;
311 }
312}
313
314
315
316
317void
318DumpStatsDict (FILE * f)
319{
320 int i;
321 compression_stats_header csh;
322
323 fread (&csh, sizeof (csh), 1, f);
324
325 for (i = 0; i < 2; i++)
326 {
327 int j;
328 frags_stats_header fsh;
329
330 fread (&fsh, sizeof (fsh), 1, f);
331
332 /* [RPAP - Jan 97: Endian Ordering] */
333 NTOHUL(fsh.num_frags);
334 NTOHUL(fsh.mem_for_frags);
335
336 if (!quick)
337 printf ("#\n# num %9s = %lu\n#\n", i ? "words" : "non-words",
338 fsh.num_frags);
339
340 for (j = 0; j < fsh.num_frags; j++)
341 {
342 u_char Word[16];
343 u_long freq, occur_num;
344
345 fread (&freq, sizeof (freq), 1, f);
346 fread (&occur_num, sizeof (occur_num), 1, f);
347
348 /* [RPAP - Jan 97: Endian Ordering] */
349 NTOHUL(freq);
350 NTOHUL(occur_num);
351
352 Word[0] = fgetc (f);
353 fread (Word + 1, Word[0], 1, f);
354 printf ("%d: %7ld : %7ld : \"%s\"\n", j, freq,
355 occur_num, word2str (Word));
356 }
357 }
358}
359
360
361int main (int argc, char **argv)
362{
363 FILE *fp;
364 unsigned long magic = 0;
365
366 if (argc < 2)
367 FatalError (1, "A file name must be specified");
368 dictname = argv[1];
369 if (strcmp (dictname, "-q") == 0)
370 {
371 quick = 1;
372 if (argc < 3)
373 FatalError (1, "A file name must be specified");
374 dictname = argv[2];
375 }
376 if (!(fp = fopen (dictname, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
377 FatalError (1, "Unable to open \"%s\"", dictname);
378
379 fread (&magic, sizeof (magic), 1, fp);
380
381 NTOHUL(magic); /* [RPAP - Jan 97: Endian Ordering] */
382
383 switch (magic)
384 {
385 case MAGIC_STEM_BUILD:
386 if (!quick)
387 printf ("# Contents of STEM file \"%s\"\n#\n", dictname);
388 DumpStemDict (fp);
389 break;
390 case MAGIC_DICT:
391 if (!quick)
392 printf ("# Contents of DICT file \"%s\"\n#\n", dictname);
393 DumpTextDict (fp);
394 break;
395 case MAGIC_STATS_DICT:
396 if (!quick)
397 printf ("# Contents of STATS file \"%s\"\n#\n", dictname);
398 DumpStatsDict (fp);
399 break;
400 default:
401 FatalError (1, "Bad magic number. \"%s\" cannot be dumped", dictname);
402 }
403 fclose (fp);
404 return 0;
405}
Note: See TracBrowser for help on using the repository browser.