Context Navigation

mgdictlist.c@ 25147

Last change on this file since 25147 was 25147, checked in by kjdon, 12 years ago
merged 64_bit_Greenstone branch into trunk, rev 25139
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 9.7 KB

Line
1	/**************************************************************************
2	*
3	* mgdictlist.c -- Program to list a dictionary
4	* Copyright (C) 1994 Neil Sharman
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	* $Id: mgdictlist.c 25147 2012-02-28 00:59:00Z kjdon $
21	*
22	**************************************************************************/
23
24	#include "sysfuncs.h"
25
26	#include "messages.h"
27	#include "memlib.h"
28	#include "local_strings.h"
29	#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
30
31	#include "mg_files.h"
32	#include "text.h"
33	#include "invf.h"
34	#include "locallib.h"
35	#include "words.h"
36
37	/*
38	$Log$
39	Revision 1.1 2003/02/20 21:18:24 mdewsnip
40	Addition of MG package for search and retrieval
41
42	Revision 1.1 1999/08/10 21:18:17 sjboddie
43	renamed mg-1.3d directory mg
44
45	Revision 1.2 1998/11/25 07:55:49 rjmcnab
46
47	Modified mg so that you can specify the stemmer you want
48	to use via a command line option. You specify it to
49	mg_passes during the build process. The number of the
50	stemmer that you used is stored within the inverted
51	dictionary header and the stemmed dictionary header so
52	the correct stemmer is used in later stages of building
53	and querying.
54
55	Revision 1.1 1998/11/17 09:35:24 rjmcnab
56	*** empty log message ***
57
58	* Revision 1.4 1994/11/29 00:32:07 tes
59	* Committing the new merged files and changes.
60	*
61	* Revision 1.3 1994/10/20 03:57:01 tes
62	* I have rewritten the boolean query optimiser and abstracted out the
63	* components of the boolean query.
64	*
65	* Revision 1.2 1994/09/20 04:41:56 tes
66	* For version 1.1
67	*
68	*/
69
70	static char *RCSID = "$Id: mgdictlist.c 25147 2012-02-28 00:59:00Z kjdon $"; /* revision id string; not referenced elsewhere in the visible file */
71
72
73	int quick = 0; /* set to 1 by main() when the first argument is "-q"; the dump routines test it to choose terse output */
74	int no_of_words[2]; /* not referenced in the visible part of this file */
75	mg_u_long maxcodelen[2]; /* not referenced in the visible part of this file */
76
77	char *dictname = ""; /* name of the file being dumped; main() points it at the relevant argv entry */
78
79
80
81
82	void
83	DumpStemDict (FILE * f)
84	{
85	struct invf_dict_header idh;
86	int i;
87	u_char prev[MAXSTEMLEN + 1];
88
89	fread (&idh, sizeof (idh), 1, f);
90
91	/* [RPAP - Jan 97: Endian Ordering] */
92	NTOHUL(idh.lookback);
93	NTOHUL(idh.dict_size);
94	NTOHUL(idh.total_bytes);
95	NTOHUL(idh.index_string_bytes);
96	NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
97	NTOHUL(idh.num_of_docs);
98	NTOHUL(idh.static_num_of_docs);
99	NTOHUL(idh.num_of_words);
100	NTOHUL(idh.stemmer_num);
101	NTOHUL(idh.stem_method);
102
103	if (quick)
104	printf ("%d\n", idh.dict_size);
105	else
106	{
107	printf ("# lookback = %u\n", idh.lookback);
108	printf ("# dict size = %u\n", idh.dict_size);
109	printf ("# total bytes = %u\n", idh.total_bytes);
110	printf ("# index string bytes = %u\n", idh.index_string_bytes);
111	printf ("# input bytes = %.0f\n", idh.input_bytes); /* [RJM 07/97: 4G limit] */
112	printf ("# num of docs = %u\n", idh.num_of_docs);
113	printf ("# static num of docs = %u\n", idh.static_num_of_docs);
114	printf ("# num of words = %u\n", idh.num_of_words);
115	printf ("#\n");
116	}
117
118	for (i = 0; i < idh.dict_size; i++)
119	{
120	register mg_u_long copy, suff;
121	mg_u_long wcnt, fcnt;
122
123	/* build a new word on top of prev */
124	copy = getc (f);
125	suff = getc (f);
126	*prev = copy + suff;
127	fread (prev + copy + 1, sizeof (u_char), suff, f);
128
129	/* read other data, but no need to store it */
130	fread (&fcnt, sizeof (fcnt), 1, f);
131	fread (&wcnt, sizeof (wcnt), 1, f);
132
133	/* [RPAP - Jan 97: Endian Ordering] */
134	NTOHUL(fcnt);
135	NTOHUL(wcnt);
136
137	if (!quick)
138	{
139	printf ("%d: %8d ", i, wcnt);
140	printf ("/ %5d ", fcnt);
141	printf ("%2d %2d\t\"", *prev, copy);
142	}
143	printf ("%s", word2str (prev));
144	if (quick)
145	printf (" %d %d\n", wcnt, fcnt);
146	else
147	{
148	putchar ('"');
149	putchar ('\n');
150	}
151	}
152	}
153
154
155
156
157	void
158	ReadInWords (FILE * f)
159	{
160	comp_frags_header cfh;
161	mg_u_long *codes;
162	u_char prev[MAXSTEMLEN + 1];
163	int i;
164
165	if (Read_cfh (f, &cfh, NULL, NULL) == -1)
166	FatalError (1, "Unable to read in the dictionary");
167
168	printf ("#\n");
169	printf ("# max code len = %u\n", cfh.hd.maxcodelen);
170	printf ("# total bytes = %d\n", cfh.uncompressed_size);
171	printf ("#\n");
172
173	if (!(codes = Generate_Huffman_Codes (&cfh.hd, NULL)))
174	FatalError (1, "no memory for huffman codes\n");
175
176	for (i = 0; i < cfh.hd.num_codes; i++)
177	{
178	register int val, copy, j, k;
179	char code[33];
180	val = fgetc (f);
181	copy = (val >> 4) & 0xf;
182	val &= 0xf;
183
184	fread (prev + copy + 1, sizeof (u_char), val, f);
185	*prev = val + copy;
186
187	for (k = 0, j = cfh.hd.clens[i] - 1; j >= 0; j--, k++)
188	code[k] = '0' + ((codes[i] >> j) & 1);
189	code[k] = '\0';
190
191	printf ("%d: %2d : %*s : \"%s\"\n", i, cfh.hd.clens[i],
192	cfh.hd.maxcodelen, code, word2str (prev));
193	}
194	Xfree (codes);
195	Xfree (cfh.hd.clens);
196	}
197
198
199	void
200	ReadCharHuffman (FILE * f, char *title)
201	{
202	int i;
203	huff_data hd;
204	mg_u_long *codes;
205
206	if (Read_Huffman_Data (f, &hd, NULL, NULL) == -1)
207	FatalError (1, "Unable to read huffman data");
208
209	if (!(codes = Generate_Huffman_Codes (&hd, NULL)))
210	FatalError (1, "no memory for huffman codes\n");
211
212	printf ("#\n# %s\n#\n", title);
213	for (i = 0; i < hd.num_codes; i++)
214	if (hd.clens[i])
215	{
216	int j, k;
217	char code[33];
218	for (k = 0, j = hd.clens[i] - 1; j >= 0; j--, k++)
219	code[k] = '0' + ((codes[i] >> j) & 1);
220	code[k] = '\0';
221	printf ("%2d : %*s : \"%s\"\n", hd.clens[i],
222	hd.maxcodelen, code, char2str (i));
223	}
224	Xfree (codes);
225	Xfree (hd.clens);
226	}
227
228
229	void
230	ReadLenHuffman (FILE * f, char *title)
231	{
232	int i;
233	huff_data hd;
234	mg_u_long *codes;
235
236	if (Read_Huffman_Data (f, &hd, NULL, NULL) == -1)
237	FatalError (1, "Unable to read huffman data");
238
239	if (!(codes = Generate_Huffman_Codes (&hd, NULL)))
240	FatalError (1, "no memory for huffman codes\n");
241
242	printf ("#\n# %s\n#\n", title);
243	for (i = 0; i < hd.num_codes; i++)
244	if (hd.clens[i])
245	{
246	int j, k;
247	char code[33];
248	for (k = 0, j = hd.clens[i] - 1; j >= 0; j--, k++)
249	code[k] = '0' + ((codes[i] >> j) & 1);
250	code[k] = '\0';
251	printf ("%2d : %*s : %d\n", hd.clens[i],
252	hd.maxcodelen, code, i);
253	}
254	Xfree (codes);
255	Xfree (hd.clens);
256	}
257
258
259
260
261
262	void
263	DumpTextDict (FILE * f)
264	{
265	struct compression_dict_header cdh;
266	int which;
267
268	if (Read_cdh (f, &cdh, NULL, NULL) == -1)
269	FatalError (1, "Unable to read dictionary header");
270	switch (cdh.dict_type)
271	{
272	case MG_COMPLETE_DICTIONARY:
273	printf ("# COMPLETE DICTIONARY\n");
274	break;
275	case MG_PARTIAL_DICTIONARY:
276	printf ("# PARTIAL DICTIONARY\n");
277	break;
278	case MG_SEED_DICTIONARY:
279	printf ("# SEED DICTIONARY\n");
280	break;
281	}
282	printf ("# num words = %d\n", cdh.num_words[1]);
283	printf ("# num word chars = %d\n", cdh.num_word_chars[1]);
284	printf ("# num non-words = %d\n", cdh.num_words[0]);
285	printf ("# num non-word chars = %d\n", cdh.num_word_chars[0]);
286	printf ("# lookback = %d\n", cdh.lookback);
287
288	for (which = 0; which < 2; which++)
289	switch (cdh.dict_type)
290	{
291	case MG_COMPLETE_DICTIONARY:
292	{
293	ReadInWords (f);
294	}
295	break;
296	case MG_PARTIAL_DICTIONARY:
297	{
298	if (cdh.num_words[which])
299	ReadInWords (f);
300
301	ReadCharHuffman (f, "Characters");
302	ReadLenHuffman (f, "Lengths");
303	}
304	break;
305	case MG_SEED_DICTIONARY:
306	{
307	if (cdh.num_words[which])
308	ReadInWords (f);
309
310	ReadCharHuffman (f, "Characters");
311	ReadLenHuffman (f, "Lengths");
312	}
313	break;
314	}
315	}
316
317
318
319
320	void
321	DumpStatsDict (FILE * f)
322	{
323	int i;
324	compression_stats_header csh;
325
326	fread (&csh, sizeof (csh), 1, f);
327
328	for (i = 0; i < 2; i++)
329	{
330	int j;
331	frags_stats_header fsh;
332
333	fread (&fsh, sizeof (fsh), 1, f);
334
335	/* [RPAP - Jan 97: Endian Ordering] */
336	NTOHUL(fsh.num_frags);
337	NTOHUL(fsh.mem_for_frags);
338
339	if (!quick)
340	printf ("#\n# num %9s = %u\n#\n", i ? "words" : "non-words",
341	fsh.num_frags);
342
343	for (j = 0; j < fsh.num_frags; j++)
344	{
345	u_char Word[16];
346	mg_u_long freq, occur_num;
347
348	fread (&freq, sizeof (freq), 1, f);
349	fread (&occur_num, sizeof (occur_num), 1, f);
350
351	/* [RPAP - Jan 97: Endian Ordering] */
352	NTOHUL(freq);
353	NTOHUL(occur_num);
354
355	Word[0] = fgetc (f);
356	fread (Word + 1, Word[0], 1, f);
357	printf ("%d: %7d : %7d : \"%s\"\n", j, freq,
358	occur_num, word2str (Word));
359	}
360	}
361	}
362
363
364	int main (int argc, char **argv)
365	{
366	FILE *fp;
367	mg_u_long magic = 0;
368
369	if (argc < 2)
370	FatalError (1, "A file name must be specified");
371	dictname = argv[1];
372	if (strcmp (dictname, "-q") == 0)
373	{
374	quick = 1;
375	if (argc < 3)
376	FatalError (1, "A file name must be specified");
377	dictname = argv[2];
378	}
379	if (!(fp = fopen (dictname, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
380	FatalError (1, "Unable to open \"%s\"", dictname);
381
382	fread (&magic, sizeof (magic), 1, fp);
383
384	NTOHUL(magic); /* [RPAP - Jan 97: Endian Ordering] */
385
386	switch (magic)
387	{
388	case MAGIC_STEM_BUILD:
389	if (!quick)
390	printf ("# Contents of STEM file \"%s\"\n#\n", dictname);
391	DumpStemDict (fp);
392	break;
393	case MAGIC_DICT:
394	if (!quick)
395	printf ("# Contents of DICT file \"%s\"\n#\n", dictname);
396	DumpTextDict (fp);
397	break;
398	case MAGIC_STATS_DICT:
399	if (!quick)
400	printf ("# Contents of STATS file \"%s\"\n#\n", dictname);
401	DumpStatsDict (fp);
402	break;
403	default:
404	FatalError (1, "Bad magic number. \"%s\" cannot be dumped", dictname);
405	}
406	fclose (fp);
407	return 0;
408	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone2/common-src/indexers/mg/src/text/mgdictlist.c@ 25147

Download in other formats: