Context Navigation

source: trunk/gsdl/packages/mg-1.3d/src/text/mgdictlist.c@ 34

Last change on this file since 34 was 34, checked in by rjmcnab, 26 years ago
Modified mg so that you can specify the stemmer you want to use via a command-line option. You specify it to mg_passes during the build process. The number of the stemmer that you used is stored within the inverted dictionary header and the stemmed dictionary header, so the correct stemmer is used in later stages of building and querying.
Property svn:executable set to `*`; property svn:keywords set to `Author Date Id Revision`
File size: 9.5 KB

Line
1	/**************************************************************************
2	*
3	* mgdictlist.c -- Program to list a dictionary
4	* Copyright (C) 1994 Neil Sharman
5	*
6	* This program is free software; you can redistribute it and/or modify
7	* it under the terms of the GNU General Public License as published by
8	* the Free Software Foundation; either version 2 of the License, or
9	* (at your option) any later version.
10	*
11	* This program is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14	* GNU General Public License for more details.
15	*
16	* You should have received a copy of the GNU General Public License
17	* along with this program; if not, write to the Free Software
18	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19	*
20	* $Id: mgdictlist.c 34 1998-11-25 07:55:52Z rjmcnab $
21	*
22	**************************************************************************/
23
24	#include "sysfuncs.h"
25
26	#include "messages.h"
27	#include "memlib.h"
28	#include "local_strings.h"
29	#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
30
31	#include "mg_files.h"
32	#include "text.h"
33	#include "invf.h"
34	#include "locallib.h"
35	#include "words.h"
36
37	/*
38	$Log$
39	Revision 1.2 1998/11/25 07:55:49 rjmcnab
40
41	Modified mg to that you can specify the stemmer you want
42	to use via a command line option. You specify it to
43	mg_passes during the build process. The number of the
44	stemmer that you used is stored within the inverted
45	dictionary header and the stemmed dictionary header so
46	the correct stemmer is used in later stages of building
47	and querying.
48
49	Revision 1.1 1998/11/17 09:35:24 rjmcnab
50	* empty log message *
51
52	* Revision 1.4 1994/11/29 00:32:07 tes
53	* Committing the new merged files and changes.
54	*
55	* Revision 1.3 1994/10/20 03:57:01 tes
56	* I have rewritten the boolean query optimiser and abstracted out the
57	* components of the boolean query.
58	*
59	* Revision 1.2 1994/09/20 04:41:56 tes
60	* For version 1.1
61	*
62	*/
63
64	static char *RCSID = "$Id: mgdictlist.c 34 1998-11-25 07:55:52Z rjmcnab $";
65
66
67	int quick = 0;
68	int no_of_words[2];
69	u_long maxcodelen[2];
70
71	char *dictname = "";
72
73
74
75
76	void
77	DumpStemDict (FILE * f)
78	{
79	struct invf_dict_header idh;
80	int i;
81	u_char prev[MAXSTEMLEN + 1];
82
83	fread (&idh, sizeof (idh), 1, f);
84
85	/* [RPAP - Jan 97: Endian Ordering] */
86	NTOHUL(idh.lookback);
87	NTOHUL(idh.dict_size);
88	NTOHUL(idh.total_bytes);
89	NTOHUL(idh.index_string_bytes);
90	NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
91	NTOHUL(idh.num_of_docs);
92	NTOHUL(idh.static_num_of_docs);
93	NTOHUL(idh.num_of_words);
94	NTOHUL(idh.stemmer_num);
95	NTOHUL(idh.stem_method);
96
97	if (quick)
98	printf ("%ld\n", idh.dict_size);
99	else
100	{
101	printf ("# lookback = %lu\n", idh.lookback);
102	printf ("# dict size = %lu\n", idh.dict_size);
103	printf ("# total bytes = %lu\n", idh.total_bytes);
104	printf ("# index string bytes = %lu\n", idh.index_string_bytes);
105	printf ("# input bytes = %.0f\n", idh.input_bytes); /* [RJM 07/97: 4G limit] */
106	printf ("# num of docs = %lu\n", idh.num_of_docs);
107	printf ("# static num of docs = %lu\n", idh.static_num_of_docs);
108	printf ("# num of words = %lu\n", idh.num_of_words);
109	printf ("#\n");
110	}
111
112	for (i = 0; i < idh.dict_size; i++)
113	{
114	register unsigned long copy, suff;
115	unsigned long wcnt, fcnt;
116
117	/* build a new word on top of prev */
118	copy = getc (f);
119	suff = getc (f);
120	*prev = copy + suff;
121	fread (prev + copy + 1, sizeof (u_char), suff, f);
122
123	/* read other data, but no need to store it */
124	fread (&fcnt, sizeof (fcnt), 1, f);
125	fread (&wcnt, sizeof (wcnt), 1, f);
126
127	/* [RPAP - Jan 97: Endian Ordering] */
128	NTOHUL(fcnt);
129	NTOHUL(wcnt);
130
131	if (!quick)
132	{
133	printf ("%d: %8ld ", i, wcnt);
134	printf ("/ %5ld ", fcnt);
135	printf ("%2d %2ld\t\"", *prev, copy);
136	}
137	printf ("%s", word2str (prev));
138	if (quick)
139	printf (" %ld %ld\n", wcnt, fcnt);
140	else
141	{
142	putchar ('"');
143	putchar ('\n');
144	}
145	}
146	}
147
148
149
150
151	void
152	ReadInWords (FILE * f)
153	{
154	comp_frags_header cfh;
155	u_long *codes;
156	u_char prev[MAXSTEMLEN + 1];
157	int i;
158
159	if (Read_cfh (f, &cfh, NULL, NULL) == -1)
160	FatalError (1, "Unable to read in the dictionary");
161
162	printf ("#\n");
163	printf ("# max code len = %u\n", cfh.hd.maxcodelen);
164	printf ("# total bytes = %lu\n", cfh.uncompressed_size);
165	printf ("#\n");
166
167	if (!(codes = Generate_Huffman_Codes (&cfh.hd, NULL)))
168	FatalError (1, "no memory for huffman codes\n");
169
170	for (i = 0; i < cfh.hd.num_codes; i++)
171	{
172	register int val, copy, j, k;
173	char code[33];
174	val = fgetc (f);
175	copy = (val >> 4) & 0xf;
176	val &= 0xf;
177
178	fread (prev + copy + 1, sizeof (u_char), val, f);
179	*prev = val + copy;
180
181	for (k = 0, j = cfh.hd.clens[i] - 1; j >= 0; j--, k++)
182	code[k] = '0' + ((codes[i] >> j) & 1);
183	code[k] = '\0';
184
185	printf ("%d: %2d : %*s : \"%s\"\n", i, cfh.hd.clens[i],
186	cfh.hd.maxcodelen, code, word2str (prev));
187	}
188	Xfree (codes);
189	Xfree (cfh.hd.clens);
190	}
191
192
193	void
194	ReadCharHuffman (FILE * f, char *title)
195	{
196	int i;
197	huff_data hd;
198	u_long *codes;
199
200	if (Read_Huffman_Data (f, &hd, NULL, NULL) == -1)
201	FatalError (1, "Unable to read huffman data");
202
203	if (!(codes = Generate_Huffman_Codes (&hd, NULL)))
204	FatalError (1, "no memory for huffman codes\n");
205
206	printf ("#\n# %s\n#\n", title);
207	for (i = 0; i < hd.num_codes; i++)
208	if (hd.clens[i])
209	{
210	int j, k;
211	char code[33];
212	for (k = 0, j = hd.clens[i] - 1; j >= 0; j--, k++)
213	code[k] = '0' + ((codes[i] >> j) & 1);
214	code[k] = '\0';
215	printf ("%2d : %*s : \"%s\"\n", hd.clens[i],
216	hd.maxcodelen, code, char2str (i));
217	}
218	Xfree (codes);
219	Xfree (hd.clens);
220	}
221
222
223	void
224	ReadLenHuffman (FILE * f, char *title)
225	{
226	int i;
227	huff_data hd;
228	u_long *codes;
229
230	if (Read_Huffman_Data (f, &hd, NULL, NULL) == -1)
231	FatalError (1, "Unable to read huffman data");
232
233	if (!(codes = Generate_Huffman_Codes (&hd, NULL)))
234	FatalError (1, "no memory for huffman codes\n");
235
236	printf ("#\n# %s\n#\n", title);
237	for (i = 0; i < hd.num_codes; i++)
238	if (hd.clens[i])
239	{
240	int j, k;
241	char code[33];
242	for (k = 0, j = hd.clens[i] - 1; j >= 0; j--, k++)
243	code[k] = '0' + ((codes[i] >> j) & 1);
244	code[k] = '\0';
245	printf ("%2d : %*s : %d\n", hd.clens[i],
246	hd.maxcodelen, code, i);
247	}
248	Xfree (codes);
249	Xfree (hd.clens);
250	}
251
252
253
254
255
256	void
257	DumpTextDict (FILE * f)
258	{
259	struct compression_dict_header cdh;
260	int which;
261
262	if (Read_cdh (f, &cdh, NULL, NULL) == -1)
263	FatalError (1, "Unable to read dictionary header");
264	switch (cdh.dict_type)
265	{
266	case MG_COMPLETE_DICTIONARY:
267	printf ("# COMPLETE DICTIONARY\n");
268	break;
269	case MG_PARTIAL_DICTIONARY:
270	printf ("# PARTIAL DICTIONARY\n");
271	break;
272	case MG_SEED_DICTIONARY:
273	printf ("# SEED DICTIONARY\n");
274	break;
275	}
276	printf ("# num words = %lu\n", cdh.num_words[1]);
277	printf ("# num word chars = %lu\n", cdh.num_word_chars[1]);
278	printf ("# num non-words = %lu\n", cdh.num_words[0]);
279	printf ("# num non-word chars = %lu\n", cdh.num_word_chars[0]);
280	printf ("# lookback = %lu\n", cdh.lookback);
281
282	for (which = 0; which < 2; which++)
283	switch (cdh.dict_type)
284	{
285	case MG_COMPLETE_DICTIONARY:
286	{
287	ReadInWords (f);
288	}
289	break;
290	case MG_PARTIAL_DICTIONARY:
291	{
292	if (cdh.num_words[which])
293	ReadInWords (f);
294
295	ReadCharHuffman (f, "Characters");
296	ReadLenHuffman (f, "Lengths");
297	}
298	break;
299	case MG_SEED_DICTIONARY:
300	{
301	if (cdh.num_words[which])
302	ReadInWords (f);
303
304	ReadCharHuffman (f, "Characters");
305	ReadLenHuffman (f, "Lengths");
306	}
307	break;
308	}
309	}
310
311
312
313
314	void
315	DumpStatsDict (FILE * f)
316	{
317	int i;
318	compression_stats_header csh;
319
320	fread (&csh, sizeof (csh), 1, f);
321
322	for (i = 0; i < 2; i++)
323	{
324	int j;
325	frags_stats_header fsh;
326
327	fread (&fsh, sizeof (fsh), 1, f);
328
329	/* [RPAP - Jan 97: Endian Ordering] */
330	NTOHUL(fsh.num_frags);
331	NTOHUL(fsh.mem_for_frags);
332
333	if (!quick)
334	printf ("#\n# num %9s = %lu\n#\n", i ? "words" : "non-words",
335	fsh.num_frags);
336
337	for (j = 0; j < fsh.num_frags; j++)
338	{
339	u_char Word[16];
340	u_long freq, occur_num;
341
342	fread (&freq, sizeof (freq), 1, f);
343	fread (&occur_num, sizeof (occur_num), 1, f);
344
345	/* [RPAP - Jan 97: Endian Ordering] */
346	NTOHUL(freq);
347	NTOHUL(occur_num);
348
349	Word[0] = fgetc (f);
350	fread (Word + 1, Word[0], 1, f);
351	printf ("%d: %7ld : %7ld : \"%s\"\n", j, freq,
352	occur_num, word2str (Word));
353	}
354	}
355	}
356
357
358	int main (int argc, char **argv)
359	{
360	FILE *fp;
361	unsigned long magic = 0;
362
363	if (argc < 2)
364	FatalError (1, "A file name must be specified");
365	dictname = argv[1];
366	if (strcmp (dictname, "-q") == 0)
367	{
368	quick = 1;
369	if (argc < 3)
370	FatalError (1, "A file name must be specified");
371	dictname = argv[2];
372	}
373	if (!(fp = fopen (dictname, "rb"))) /* [RPAP - Feb 97: WIN32 Port] */
374	FatalError (1, "Unable to open \"%s\"", dictname);
375
376	fread (&magic, sizeof (magic), 1, fp);
377
378	NTOHUL(magic); /* [RPAP - Jan 97: Endian Ordering] */
379
380	switch (magic)
381	{
382	case MAGIC_STEM_BUILD:
383	if (!quick)
384	printf ("# Contents of STEM file \"%s\"\n#\n", dictname);
385	DumpStemDict (fp);
386	break;
387	case MAGIC_DICT:
388	if (!quick)
389	printf ("# Contents of DICT file \"%s\"\n#\n", dictname);
390	DumpTextDict (fp);
391	break;
392	case MAGIC_STATS_DICT:
393	if (!quick)
394	printf ("# Contents of STATS file \"%s\"\n#\n", dictname);
395	DumpStatsDict (fp);
396	break;
397	default:
398	FatalError (1, "Bad magic number. \"%s\" cannot be dumped", dictname);
399	}
400	fclose (fp);
401	return 0;
402	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: