source: main/trunk/greenstone2/common-src/indexers/mg/src/text/mgdictlist.c@ 25147

Last change on this file since 25147 was 25147, checked in by kjdon, 12 years ago

merged 64_bit_Greenstone branch into trunk, rev 25139

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.7 KB
Line 
1/**************************************************************************
2 *
3 * mgdictlist.c -- Program to list a dictionary
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mgdictlist.c 25147 2012-02-28 00:59:00Z kjdon $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "messages.h"
27#include "memlib.h"
28#include "local_strings.h"
29#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
30
31#include "mg_files.h"
32#include "text.h"
33#include "invf.h"
34#include "locallib.h"
35#include "words.h"
36
37/*
38 $Log$
39 Revision 1.1 2003/02/20 21:18:24 mdewsnip
40 Addition of MG package for search and retrieval
41
42 Revision 1.1 1999/08/10 21:18:17 sjboddie
43 renamed mg-1.3d directory mg
44
45 Revision 1.2 1998/11/25 07:55:49 rjmcnab
46
47 Modified mg so that you can specify the stemmer you want
48 to use via a command line option. You specify it to
49 mg_passes during the build process. The number of the
50 stemmer that you used is stored within the inverted
51 dictionary header and the stemmed dictionary header so
52 the correct stemmer is used in later stages of building
53 and querying.
54
55 Revision 1.1 1998/11/17 09:35:24 rjmcnab
56 *** empty log message ***
57
58 * Revision 1.4 1994/11/29 00:32:07 tes
59 * Committing the new merged files and changes.
60 *
61 * Revision 1.3 1994/10/20 03:57:01 tes
62 * I have rewritten the boolean query optimiser and abstracted out the
63 * components of the boolean query.
64 *
65 * Revision 1.2 1994/09/20 04:41:56 tes
66 * For version 1.1
67 *
68 */
69
/* RCS identification string; not referenced by the code visible in this file. */
static char *RCSID = "$Id: mgdictlist.c 25147 2012-02-28 00:59:00Z kjdon $";


/* Set to 1 by main() when "-q" is the first argument.  The dump routines
   test it to choose between terse and annotated output. */
int quick = 0;
/* Two-element arrays; not referenced by the code visible in this file. */
int no_of_words[2];
mg_u_long maxcodelen[2];

/* Name of the dictionary file being dumped; assigned by main() from argv. */
char *dictname = "";
78
79
80
81
82void
83DumpStemDict (FILE * f)
84{
85 struct invf_dict_header idh;
86 int i;
87 u_char prev[MAXSTEMLEN + 1];
88
89 fread (&idh, sizeof (idh), 1, f);
90
91 /* [RPAP - Jan 97: Endian Ordering] */
92 NTOHUL(idh.lookback);
93 NTOHUL(idh.dict_size);
94 NTOHUL(idh.total_bytes);
95 NTOHUL(idh.index_string_bytes);
96 NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
97 NTOHUL(idh.num_of_docs);
98 NTOHUL(idh.static_num_of_docs);
99 NTOHUL(idh.num_of_words);
100 NTOHUL(idh.stemmer_num);
101 NTOHUL(idh.stem_method);
102
103 if (quick)
104 printf ("%d\n", idh.dict_size);
105 else
106 {
107 printf ("# lookback = %u\n", idh.lookback);
108 printf ("# dict size = %u\n", idh.dict_size);
109 printf ("# total bytes = %u\n", idh.total_bytes);
110 printf ("# index string bytes = %u\n", idh.index_string_bytes);
111 printf ("# input bytes = %.0f\n", idh.input_bytes); /* [RJM 07/97: 4G limit] */
112 printf ("# num of docs = %u\n", idh.num_of_docs);
113 printf ("# static num of docs = %u\n", idh.static_num_of_docs);
114 printf ("# num of words = %u\n", idh.num_of_words);
115 printf ("#\n");
116 }
117
118 for (i = 0; i < idh.dict_size; i++)
119 {
120 register mg_u_long copy, suff;
121 mg_u_long wcnt, fcnt;
122
123 /* build a new word on top of prev */
124 copy = getc (f);
125 suff = getc (f);
126 *prev = copy + suff;
127 fread (prev + copy + 1, sizeof (u_char), suff, f);
128
129 /* read other data, but no need to store it */
130 fread (&fcnt, sizeof (fcnt), 1, f);
131 fread (&wcnt, sizeof (wcnt), 1, f);
132
133 /* [RPAP - Jan 97: Endian Ordering] */
134 NTOHUL(fcnt);
135 NTOHUL(wcnt);
136
137 if (!quick)
138 {
139 printf ("%d: %8d ", i, wcnt);
140 printf ("/ %5d ", fcnt);
141 printf ("%2d %2d\t\"", *prev, copy);
142 }
143 printf ("%s", word2str (prev));
144 if (quick)
145 printf (" %d %d\n", wcnt, fcnt);
146 else
147 {
148 putchar ('"');
149 putchar ('\n');
150 }
151 }
152}
153
154
155
156
157void
158ReadInWords (FILE * f)
159{
160 comp_frags_header cfh;
161 mg_u_long *codes;
162 u_char prev[MAXSTEMLEN + 1];
163 int i;
164
165 if (Read_cfh (f, &cfh, NULL, NULL) == -1)
166 FatalError (1, "Unable to read in the dictionary");
167
168 printf ("#\n");
169 printf ("# max code len = %u\n", cfh.hd.maxcodelen);
170 printf ("# total bytes = %d\n", cfh.uncompressed_size);
171 printf ("#\n");
172
173 if (!(codes = Generate_Huffman_Codes (&cfh.hd, NULL)))
174 FatalError (1, "no memory for huffman codes\n");
175
176 for (i = 0; i < cfh.hd.num_codes; i++)
177 {
178 register int val, copy, j, k;
179 char code[33];
180 val = fgetc (f);
181 copy = (val >> 4) & 0xf;
182 val &= 0xf;
183
184 fread (prev + copy + 1, sizeof (u_char), val, f);
185 *prev = val + copy;
186
187 for (k = 0, j = cfh.hd.clens[i] - 1; j >= 0; j--, k++)
188 code[k] = '0' + ((codes[i] >> j) & 1);
189 code[k] = '\0';
190
191 printf ("%d: %2d : %*s : \"%s\"\n", i, cfh.hd.clens[i],
192 cfh.hd.maxcodelen, code, word2str (prev));
193 }
194 Xfree (codes);
195 Xfree (cfh.hd.clens);
196}
197
198
199void
200ReadCharHuffman (FILE * f, char *title)
201{
202 int i;
203 huff_data hd;
204 mg_u_long *codes;
205
206 if (Read_Huffman_Data (f, &hd, NULL, NULL) == -1)
207 FatalError (1, "Unable to read huffman data");
208
209 if (!(codes = Generate_Huffman_Codes (&hd, NULL)))
210 FatalError (1, "no memory for huffman codes\n");
211
212 printf ("#\n# %s\n#\n", title);
213 for (i = 0; i < hd.num_codes; i++)
214 if (hd.clens[i])
215 {
216 int j, k;
217 char code[33];
218 for (k = 0, j = hd.clens[i] - 1; j >= 0; j--, k++)
219 code[k] = '0' + ((codes[i] >> j) & 1);
220 code[k] = '\0';
221 printf ("%2d : %*s : \"%s\"\n", hd.clens[i],
222 hd.maxcodelen, code, char2str (i));
223 }
224 Xfree (codes);
225 Xfree (hd.clens);
226}
227
228
229void
230ReadLenHuffman (FILE * f, char *title)
231{
232 int i;
233 huff_data hd;
234 mg_u_long *codes;
235
236 if (Read_Huffman_Data (f, &hd, NULL, NULL) == -1)
237 FatalError (1, "Unable to read huffman data");
238
239 if (!(codes = Generate_Huffman_Codes (&hd, NULL)))
240 FatalError (1, "no memory for huffman codes\n");
241
242 printf ("#\n# %s\n#\n", title);
243 for (i = 0; i < hd.num_codes; i++)
244 if (hd.clens[i])
245 {
246 int j, k;
247 char code[33];
248 for (k = 0, j = hd.clens[i] - 1; j >= 0; j--, k++)
249 code[k] = '0' + ((codes[i] >> j) & 1);
250 code[k] = '\0';
251 printf ("%2d : %*s : %d\n", hd.clens[i],
252 hd.maxcodelen, code, i);
253 }
254 Xfree (codes);
255 Xfree (hd.clens);
256}
257
258
259
260
261
262void
263DumpTextDict (FILE * f)
264{
265 struct compression_dict_header cdh;
266 int which;
267
268 if (Read_cdh (f, &cdh, NULL, NULL) == -1)
269 FatalError (1, "Unable to read dictionary header");
270 switch (cdh.dict_type)
271 {
272 case MG_COMPLETE_DICTIONARY:
273 printf ("# COMPLETE DICTIONARY\n");
274 break;
275 case MG_PARTIAL_DICTIONARY:
276 printf ("# PARTIAL DICTIONARY\n");
277 break;
278 case MG_SEED_DICTIONARY:
279 printf ("# SEED DICTIONARY\n");
280 break;
281 }
282 printf ("# num words = %d\n", cdh.num_words[1]);
283 printf ("# num word chars = %d\n", cdh.num_word_chars[1]);
284 printf ("# num non-words = %d\n", cdh.num_words[0]);
285 printf ("# num non-word chars = %d\n", cdh.num_word_chars[0]);
286 printf ("# lookback = %d\n", cdh.lookback);
287
288 for (which = 0; which < 2; which++)
289 switch (cdh.dict_type)
290 {
291 case MG_COMPLETE_DICTIONARY:
292 {
293 ReadInWords (f);
294 }
295 break;
296 case MG_PARTIAL_DICTIONARY:
297 {
298 if (cdh.num_words[which])
299 ReadInWords (f);
300
301 ReadCharHuffman (f, "Characters");
302 ReadLenHuffman (f, "Lengths");
303 }
304 break;
305 case MG_SEED_DICTIONARY:
306 {
307 if (cdh.num_words[which])
308 ReadInWords (f);
309
310 ReadCharHuffman (f, "Characters");
311 ReadLenHuffman (f, "Lengths");
312 }
313 break;
314 }
315}
316
317
318
319
320void
321DumpStatsDict (FILE * f)
322{
323 int i;
324 compression_stats_header csh;
325
326 fread (&csh, sizeof (csh), 1, f);
327
328 for (i = 0; i < 2; i++)
329 {
330 int j;
331 frags_stats_header fsh;
332
333 fread (&fsh, sizeof (fsh), 1, f);
334
335 /* [RPAP - Jan 97: Endian Ordering] */
336 NTOHUL(fsh.num_frags);
337 NTOHUL(fsh.mem_for_frags);
338
339 if (!quick)
340 printf ("#\n# num %9s = %u\n#\n", i ? "words" : "non-words",
341 fsh.num_frags);
342
343 for (j = 0; j < fsh.num_frags; j++)
344 {
345 u_char Word[16];
346 mg_u_long freq, occur_num;
347
348 fread (&freq, sizeof (freq), 1, f);
349 fread (&occur_num, sizeof (occur_num), 1, f);
350
351 /* [RPAP - Jan 97: Endian Ordering] */
352 NTOHUL(freq);
353 NTOHUL(occur_num);
354
355 Word[0] = fgetc (f);
356 fread (Word + 1, Word[0], 1, f);
357 printf ("%d: %7d : %7d : \"%s\"\n", j, freq,
358 occur_num, word2str (Word));
359 }
360 }
361}
362
363
364int main (int argc, char **argv)
365{
366 FILE *fp;
367 mg_u_long magic = 0;
368
369 if (argc < 2)
370 FatalError (1, "A file name must be specified");
371 dictname = argv[1];
372 if (strcmp (dictname, "-q") == 0)
373 {
374 quick = 1;
375 if (argc < 3)
376 FatalError (1, "A file name must be specified");
377 dictname = argv[2];
378 }
379 if (!(fp = fopen (dictname, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
380 FatalError (1, "Unable to open \"%s\"", dictname);
381
382 fread (&magic, sizeof (magic), 1, fp);
383
384 NTOHUL(magic); /* [RPAP - Jan 97: Endian Ordering] */
385
386 switch (magic)
387 {
388 case MAGIC_STEM_BUILD:
389 if (!quick)
390 printf ("# Contents of STEM file \"%s\"\n#\n", dictname);
391 DumpStemDict (fp);
392 break;
393 case MAGIC_DICT:
394 if (!quick)
395 printf ("# Contents of DICT file \"%s\"\n#\n", dictname);
396 DumpTextDict (fp);
397 break;
398 case MAGIC_STATS_DICT:
399 if (!quick)
400 printf ("# Contents of STATS file \"%s\"\n#\n", dictname);
401 DumpStatsDict (fp);
402 break;
403 default:
404 FatalError (1, "Bad magic number. \"%s\" cannot be dumped", dictname);
405 }
406 fclose (fp);
407 return 0;
408}
Note: See TracBrowser for help on using the repository browser.