source: trunk/gsdl/src/mgpp/text/mgdictlist.cpp@ 711

Last change on this file since 711 was 711, checked in by cs025, 25 years ago

Changes to eradicate Xmalloc

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.3 KB
Line 
1/**************************************************************************
2 *
3 * mgdictlist.c -- Program to list a dictionary
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mgdictlist.cpp 711 1999-10-17 23:43:31Z cs025 $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "messages.h"
27#include "memlib.h"
28#include "local_strings.h"
29#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
30
31#include "mg_files.h"
32#include "text.h"
33#include "invf.h"
34#include "locallib.h"
35#include "words.h"
36
37/*
38 $Log$
39 Revision 1.2 1999/10/17 23:43:27 cs025
40 Changes to eradicate Xmalloc
41
42 Revision 1.1 1999/10/11 02:58:08 cs025
43 Base install of MG-PP
44
45 Revision 1.1 1999/08/10 21:18:17 sjboddie
46 renamed mg-1.3d directory mg
47
48 Revision 1.2 1998/11/25 07:55:49 rjmcnab
49
50 Modified mg so that you can specify the stemmer you want
51 to use via a command line option. You specify it to
52 mg_passes during the build process. The number of the
53 stemmer that you used is stored within the inverted
54 dictionary header and the stemmed dictionary header so
55 the correct stemmer is used in later stages of building
56 and querying.
57
58 Revision 1.1 1998/11/17 09:35:24 rjmcnab
59 *** empty log message ***
60
61 * Revision 1.4 1994/11/29 00:32:07 tes
62 * Committing the new merged files and changes.
63 *
64 * Revision 1.3 1994/10/20 03:57:01 tes
65 * I have rewritten the boolean query optimiser and abstracted out the
66 * components of the boolean query.
67 *
68 * Revision 1.2 1994/09/20 04:41:56 tes
69 * For version 1.1
70 *
71 */
72
73static char *RCSID = "$Id: mgdictlist.cpp 711 1999-10-17 23:43:31Z cs025 $";
74
75
76int quick = 0;
77int no_of_words[2];
78u_long maxcodelen[2];
79
80char *dictname = "";
81
82
83
84
85void
86DumpStemDict (FILE * f)
87{
88 struct invf_dict_header idh;
89 int i;
90 u_char prev[MAXSTEMLEN + 1];
91
92 Invf_Header_read(&idh, f);
93 /*
94 fread (&idh, sizeof (idh), 1, f);
95
96 // [RPAP - Jan 97: Endian Ordering]
97 NTOHUL(idh.lookback);
98 NTOHUL(idh.dict_size);
99 NTOHUL(idh.total_bytes);
100 NTOHUL(idh.index_string_bytes);
101 NTOHD(idh.input_bytes); // [RJM 07/97: 4G limit]
102 NTOHUL(idh.num_of_docs);
103 NTOHUL(idh.static_num_of_docs);
104 NTOHUL(idh.num_of_words);
105 NTOHUL(idh.stemmer_num);
106 NTOHUL(idh.stem_method);
107 */
108
109 if (quick)
110 printf ("%ld\n", idh.dict_size);
111 else
112 {
113 printf ("# lookback = %lu\n", idh.lookback);
114 printf ("# dict size = %lu\n", idh.dict_size);
115 printf ("# total bytes = %lu\n", idh.total_bytes);
116 printf ("# index string bytes = %lu\n", idh.index_string_bytes);
117 printf ("# input bytes = %.0f\n", idh.input_bytes); /* [RJM 07/97: 4G limit] */
118 printf ("# num of docs = %lu\n", idh.num_of_docs);
119 printf ("# static num of docs = %lu\n", idh.static_num_of_docs);
120 printf ("# num of words = %lu\n", idh.num_of_words);
121 printf ("#\n");
122 }
123
124 for (i = 0; i < idh.dict_size; i++)
125 {
126 register unsigned long copy, suff;
127 unsigned long wcnt, fcnt;
128
129 /* build a new word on top of prev */
130 copy = getc (f);
131 suff = getc (f);
132 *prev = copy + suff;
133 fread (prev + copy + 1, sizeof (u_char), suff, f);
134
135 /* read other data, but no need to store it */
136 fread (&fcnt, sizeof (fcnt), 1, f);
137 fread (&wcnt, sizeof (wcnt), 1, f);
138
139 /* [RPAP - Jan 97: Endian Ordering] */
140 NTOHUL(fcnt);
141 NTOHUL(wcnt);
142
143 if (!quick)
144 {
145 printf ("%d: %8ld ", i, wcnt);
146 printf ("/ %5ld ", fcnt);
147 printf ("%2d %2ld\t\"", *prev, copy);
148 }
149 printf ("%s", word2str (prev));
150 if (quick)
151 printf (" %ld %ld\n", wcnt, fcnt);
152 else
153 {
154 putchar ('"');
155 putchar ('\n');
156 }
157 }
158}
159
160
161
162
163void
164ReadInWords (FILE * f)
165{
166 comp_frags_header cfh;
167 u_long *codes;
168 u_char prev[MAXSTEMLEN + 1];
169 int i;
170
171 if (Read_cfh (f, &cfh, NULL, NULL) == -1)
172 FatalError (1, "Unable to read in the dictionary");
173
174 printf ("#\n");
175 printf ("# max code len = %u\n", cfh.hd.maxcodelen);
176 printf ("# total bytes = %lu\n", cfh.uncompressed_size);
177 printf ("#\n");
178
179 if (!(codes = Generate_Huffman_Codes (&cfh.hd, NULL)))
180 FatalError (1, "no memory for huffman codes\n");
181
182 for (i = 0; i < cfh.hd.num_codes; i++)
183 {
184 register int val, copy, j, k;
185 char code[33];
186 val = fgetc (f);
187 copy = (val >> 4) & 0xf;
188 val &= 0xf;
189
190 fread (prev + copy + 1, sizeof (u_char), val, f);
191 *prev = val + copy;
192
193 for (k = 0, j = cfh.hd.clens[i] - 1; j >= 0; j--, k++)
194 code[k] = '0' + ((codes[i] >> j) & 1);
195 code[k] = '\0';
196
197 printf ("%d: %2d : %*s : \"%s\"\n", i, cfh.hd.clens[i],
198 cfh.hd.maxcodelen, code, word2str (prev));
199 }
200 delete codes;
201 delete cfh.hd.clens;
202}
203
204void ReadCharsHuffman(FILE *f, char *title, int mode)
205{
206 int i;
207 huff_data hd;
208 u_long *codes;
209
210 if (Read_Huffman_Data (f, &hd, NULL, NULL) == -1)
211 FatalError (1, "Unable to read huffman data");
212
213 if (!(codes = Generate_Huffman_Codes (&hd, NULL)))
214 FatalError (1, "no memory for huffman codes\n");
215
216 printf ("#\n# %s\n#\n", title);
217 for (i = 0; i < hd.num_codes; i++)
218 if (hd.clens[i])
219 {
220 int j, k;
221 char code[33];
222 for (k = 0, j = hd.clens[i] - 1; j >= 0; j--, k++)
223 code[k] = '0' + ((codes[i] >> j) & 1);
224 code[k] = '\0';
225
226 if (mode == 0)
227 {
228 printf ("%2d : %*s : \"%s\"\n", hd.clens[i],
229 hd.maxcodelen, code, char2str (i));
230 }
231 else
232 {
233 printf ("%2d : %*s : %d\n", hd.clens[i],
234 hd.maxcodelen, code, i);
235 }
236 }
237 delete codes;
238 delete hd.clens;
239}
240
241
242void
243ReadCharHuffman (FILE * f, char *title)
244{
245 ReadCharsHuffman(f, title, 0);
246}
247
248
249void
250ReadLenHuffman (FILE * f, char *title)
251{
252 ReadCharsHuffman(f, title, 1);
253}
254
255
256
257
258
259void
260DumpTextDict (FILE * f)
261{
262 struct compression_dict_header cdh;
263 int which;
264
265 if (Read_cdh (f, &cdh, NULL, NULL) == -1)
266 FatalError (1, "Unable to read dictionary header");
267 switch (cdh.dict_type)
268 {
269 case MG_COMPLETE_DICTIONARY:
270 printf ("# COMPLETE DICTIONARY\n");
271 break;
272 case MG_PARTIAL_DICTIONARY:
273 printf ("# PARTIAL DICTIONARY\n");
274 break;
275 case MG_SEED_DICTIONARY:
276 printf ("# SEED DICTIONARY\n");
277 break;
278 }
279 printf ("# num words = %lu\n", cdh.num_words[1]);
280 printf ("# num word chars = %lu\n", cdh.num_word_chars[1]);
281 printf ("# num non-words = %lu\n", cdh.num_words[0]);
282 printf ("# num non-word chars = %lu\n", cdh.num_word_chars[0]);
283 printf ("# lookback = %lu\n", cdh.lookback);
284
285 for (which = 0; which < 2; which++)
286 switch (cdh.dict_type)
287 {
288 case MG_COMPLETE_DICTIONARY:
289 {
290 ReadInWords (f);
291 }
292 break;
293 case MG_PARTIAL_DICTIONARY:
294 case MG_SEED_DICTIONARY:
295 {
296 if (cdh.num_words[which])
297 ReadInWords (f);
298
299 ReadCharHuffman (f, "Characters");
300 ReadLenHuffman (f, "Lengths");
301 }
302 break;
303 }
304}
305
306
307
308
309void
310DumpStatsDict (FILE * f)
311{
312 int i;
313 compression_stats_header csh;
314
315 fread (&csh, sizeof (csh), 1, f);
316
317 for (i = 0; i < 2; i++)
318 {
319 int j;
320 frags_stats_header fsh;
321
322 fread (&fsh, sizeof (fsh), 1, f);
323
324 /* [RPAP - Jan 97: Endian Ordering] */
325 NTOHUL(fsh.num_frags);
326 NTOHUL(fsh.mem_for_frags);
327
328 if (!quick)
329 printf ("#\n# num %9s = %lu\n#\n", i ? "words" : "non-words",
330 fsh.num_frags);
331
332 for (j = 0; j < fsh.num_frags; j++)
333 {
334 u_char Word[16];
335 u_long freq, occur_num;
336
337 fread (&freq, sizeof (freq), 1, f);
338 fread (&occur_num, sizeof (occur_num), 1, f);
339
340 /* [RPAP - Jan 97: Endian Ordering] */
341 NTOHUL(freq);
342 NTOHUL(occur_num);
343
344 Word[0] = fgetc (f);
345 fread (Word + 1, Word[0], 1, f);
346 printf ("%d: %7ld : %7ld : \"%s\"\n", j, freq,
347 occur_num, word2str (Word));
348 }
349 }
350}
351
352
353int main (int argc, char **argv)
354{
355 FILE *fp;
356 unsigned long magic = 0;
357
358 if (argc < 2)
359 FatalError (1, "A file name must be specified");
360 dictname = argv[1];
361 if (strcmp (dictname, "-q") == 0)
362 {
363 quick = 1;
364 if (argc < 3)
365 FatalError (1, "A file name must be specified");
366 dictname = argv[2];
367 }
368 if (!(fp = fopen (dictname, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
369 FatalError (1, "Unable to open \"%s\"", dictname);
370
371 fread (&magic, sizeof (magic), 1, fp);
372
373 NTOHUL(magic); /* [RPAP - Jan 97: Endian Ordering] */
374
375 switch (magic)
376 {
377 case MAGIC_STEM_BUILD:
378 if (!quick)
379 printf ("# Contents of STEM file \"%s\"\n#\n", dictname);
380 DumpStemDict (fp);
381 break;
382 case MAGIC_DICT:
383 if (!quick)
384 printf ("# Contents of DICT file \"%s\"\n#\n", dictname);
385 DumpTextDict (fp);
386 break;
387 case MAGIC_STATS_DICT:
388 if (!quick)
389 printf ("# Contents of STATS file \"%s\"\n#\n", dictname);
390 DumpStatsDict (fp);
391 break;
392 default:
393 FatalError (1, "Bad magic number. \"%s\" cannot be dumped", dictname);
394 }
395 fclose (fp);
396 return 0;
397}
Note: See TracBrowser for help on using the repository browser.