/**************************************************************************
 *
 * mgdictlist.c -- Program to list a dictionary
 * Copyright (C) 1994  Neil Sharman
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id: mgdictlist.c 34 1998-11-25 07:55:52Z rjmcnab $
 *
 **************************************************************************/

#include "sysfuncs.h"

#include "messages.h"
#include "memlib.h"
#include "local_strings.h"
#include "netorder.h"  /* [RPAP - Jan 97: Endian Ordering] */

#include "mg_files.h"
#include "text.h"
#include "invf.h"
#include "locallib.h"
#include "words.h"

/*
   $Log$
   Revision 1.2  1998/11/25 07:55:49  rjmcnab
   Modified mg to that you can specify the stemmer you want
   to use via a command line option. You specify it to
   mg_passes during the build process. The number of the
   stemmer that you used is stored within the inverted
   dictionary header and the stemmed dictionary header so
   the correct stemmer is used in later stages of building
   and querying.

   Revision 1.1  1998/11/17 09:35:24  rjmcnab
   *** empty log message ***

   * Revision 1.4  1994/11/29  00:32:07  tes
   * Committing the new merged files and changes.
   *
   * Revision 1.3  1994/10/20  03:57:01  tes
   * I have rewritten the boolean query optimiser and abstracted out the
   * components of the boolean query.
   *
   * Revision 1.2  1994/09/20  04:41:56  tes
   * For version 1.1
   *
 */

static char *RCSID = "$Id: mgdictlist.c 34 1998-11-25 07:55:52Z rjmcnab $";

int quick = 0;			/* set by "-q": terse output */
int no_of_words[2];
u_long maxcodelen[2];
char *dictname = "";		/* name of the file being dumped */

/* Size of the buffer used to render a Huffman code as a string of
   '0'/'1' characters: room for 32 bits plus the terminating NUL. */
enum
{ CODE_BUF_LEN = 33 };

/* Size of a buffer able to hold any word whose length is stored in a
   single leading byte: the length byte itself plus up to 255 characters. */
enum
{ LEN_PREFIXED_BUF_LEN = 256 };


/*
 * Read exactly `count' items of `size' bytes from `f' into `buf'.
 * `what' names the thing being read and is only used in the error message.
 * A short read is reported through FatalError, which this file relies on
 * to terminate the program (as every other FatalError call here does).
 *
 * `size' must be non-zero: fread() returns 0 for a zero item size, which
 * would be indistinguishable from a failed read.  `count' may be zero.
 */
static void
ReadItems (void *buf, size_t size, size_t count, FILE * f, const char *what)
{
  if (fread (buf, size, count, f) != count)
    FatalError (1, "Unexpected end of file reading %s from \"%s\"",
		what, dictname);
}


/*
 * Read a single byte from `f' and return it (0..255).  Hitting end of
 * file or a read error is reported through FatalError; `what' names the
 * item being read for the error message.
 */
static int
ReadByte (FILE * f, const char *what)
{
  int c = getc (f);

  if (c == EOF)
    FatalError (1, "Unexpected end of file reading %s from \"%s\"",
		what, dictname);
  return c;
}


/*
 * Render the low `len' bits of `code' into `buf' as a NUL terminated
 * string of '0' and '1' characters, most significant bit first.
 * `buf' must have room for CODE_BUF_LEN characters.  A length that does
 * not fit the buffer is reported through FatalError instead of
 * overrunning the buffer (and shifting by an out-of-range amount).
 */
static void
FormatCode (char *buf, u_long code, int len)
{
  int bit, pos = 0;

  if (len < 0 || len >= CODE_BUF_LEN)
    FatalError (1, "Huffman code length %d is out of range in \"%s\"",
		len, dictname);

  for (bit = len - 1; bit >= 0; bit--)
    buf[pos++] = '0' + (int) ((code >> bit) & 1);
  buf[pos] = '\0';
}


/*
 * Dump a stemmed/inverted-file dictionary.  `f' must be positioned just
 * after the magic number.  The header is read and converted from network
 * byte order, then each of the `dict_size' front-coded entries is printed.
 * In quick mode only the entry count and "word wcnt fcnt" lines are
 * written; otherwise the header fields and a verbose line per entry.
 */
void
DumpStemDict (FILE * f)
{
  struct invf_dict_header idh;
  unsigned long i;
  /* Zero filled so a first entry that claims a shared prefix cannot
     expose uninitialised stack contents. */
  u_char prev[MAXSTEMLEN + 1] = { 0 };

  ReadItems (&idh, sizeof (idh), 1, f, "the stem dictionary header");

  /* [RPAP - Jan 97: Endian Ordering] */
  NTOHUL(idh.lookback);
  NTOHUL(idh.dict_size);
  NTOHUL(idh.total_bytes);
  NTOHUL(idh.index_string_bytes);
  NTOHD(idh.input_bytes); /* [RJM 07/97: 4G limit] */
  NTOHUL(idh.num_of_docs);
  NTOHUL(idh.static_num_of_docs);
  NTOHUL(idh.num_of_words);
  NTOHUL(idh.stemmer_num);
  NTOHUL(idh.stem_method);

  if (quick)
    printf ("%lu\n", idh.dict_size);
  else
    {
      printf ("# lookback           = %lu\n", idh.lookback);
      printf ("# dict size          = %lu\n", idh.dict_size);
      printf ("# total bytes        = %lu\n", idh.total_bytes);
      printf ("# index string bytes = %lu\n", idh.index_string_bytes);
      printf ("# input bytes        = %.0f\n", idh.input_bytes); /* [RJM 07/97: 4G limit] */
      printf ("# num of docs        = %lu\n", idh.num_of_docs);
      printf ("# static num of docs = %lu\n", idh.static_num_of_docs);
      printf ("# num of words       = %lu\n", idh.num_of_words);
      printf ("#\n");
    }

  for (i = 0; i < idh.dict_size; i++)
    {
      unsigned long copy, suff;
      unsigned long wcnt, fcnt;

      /* build a new word on top of prev: `copy' leading characters are
         kept from the previous word, `suff' new ones follow in the file */
      copy = (unsigned long) ReadByte (f, "a stem prefix length");
      suff = (unsigned long) ReadByte (f, "a stem suffix length");

      /* Both lengths come straight from the file; make sure the rebuilt
         word still fits in prev[] before writing into it. */
      if (copy + suff > MAXSTEMLEN)
	FatalError (1, "Corrupt dictionary \"%s\": entry %lu is longer "
		    "than %d characters", dictname, i, (int) MAXSTEMLEN);

      *prev = (u_char) (copy + suff);
      ReadItems (prev + copy + 1, sizeof (u_char), suff, f, "a stem");

      /* read other data, but no need to store it */
      /* NOTE(review): these reads consume sizeof (unsigned long) bytes
         each; presumably the on-disk fields match that size on the
         build platform -- confirm for LP64 targets. */
      ReadItems (&fcnt, sizeof (fcnt), 1, f, "a stem frequency count");
      ReadItems (&wcnt, sizeof (wcnt), 1, f, "a stem word count");

      /* [RPAP - Jan 97: Endian Ordering] */
      NTOHUL(fcnt);
      NTOHUL(wcnt);

      if (!quick)
	{
	  printf ("%lu: %8lu ", i, wcnt);
	  printf ("/ %5lu ", fcnt);
	  printf ("%2d %2lu\t\"", *prev, copy);
	}
      printf ("%s", word2str (prev));
      if (quick)
	printf (" %lu %lu\n", wcnt, fcnt);
      else
	{
	  putchar ('"');
	  putchar ('\n');
	}
    }
}


/*
 * Read a compressed-fragments header from `f' via Read_cfh, print its
 * summary, then print every front-coded word together with its Huffman
 * code length and the code itself rendered in binary.
 */
void
ReadInWords (FILE * f)
{
  comp_frags_header cfh;
  u_long *codes;
  /* Zero filled so a first word that claims a shared prefix cannot
     expose uninitialised stack contents. */
  u_char prev[MAXSTEMLEN + 1] = { 0 };
  int i;

  if (Read_cfh (f, &cfh, NULL, NULL) == -1)
    FatalError (1, "Unable to read in the dictionary");

  printf ("#\n");
  printf ("# max code len       = %u\n", cfh.hd.maxcodelen);
  printf ("# total bytes        = %lu\n", cfh.uncompressed_size);
  printf ("#\n");

  if (!(codes = Generate_Huffman_Codes (&cfh.hd, NULL)))
    FatalError (1, "no memory for huffman codes\n");

  for (i = 0; i < cfh.hd.num_codes; i++)
    {
      int val, copy;
      char code[CODE_BUF_LEN];

      /* One byte packs both lengths: the high nibble is the number of
         characters kept from the previous word, the low nibble the
         number of new characters that follow in the file. */
      val = ReadByte (f, "a word length byte");
      copy = (val >> 4) & 0xf;
      val &= 0xf;

      ReadItems (prev + copy + 1, sizeof (u_char), (size_t) val, f, "a word");
      *prev = val + copy;

      FormatCode (code, codes[i], cfh.hd.clens[i]);

      printf ("%d: %2d : %*s : \"%s\"\n", i, cfh.hd.clens[i],
	      cfh.hd.maxcodelen, code, word2str (prev));
    }

  Xfree (codes);
  Xfree (cfh.hd.clens);
}


/*
 * Shared body of ReadCharHuffman and ReadLenHuffman: read a block of
 * Huffman data from `f', print `title' as a section header, and list
 * every symbol that has a non-zero code length along with its code.
 * When `as_chars' is non-zero the symbol is shown through char2str,
 * otherwise as a plain decimal number.
 */
static void
DumpHuffman (FILE * f, char *title, int as_chars)
{
  int i;
  huff_data hd;
  u_long *codes;

  if (Read_Huffman_Data (f, &hd, NULL, NULL) == -1)
    FatalError (1, "Unable to read huffman data");

  if (!(codes = Generate_Huffman_Codes (&hd, NULL)))
    FatalError (1, "no memory for huffman codes\n");

  printf ("#\n# %s\n#\n", title);

  for (i = 0; i < hd.num_codes; i++)
    {
      char code[CODE_BUF_LEN];

      /* symbols with a zero code length are skipped */
      if (!hd.clens[i])
	continue;

      FormatCode (code, codes[i], hd.clens[i]);

      if (as_chars)
	printf ("%2d : %*s : \"%s\"\n", hd.clens[i], hd.maxcodelen,
		code, char2str (i));
      else
	printf ("%2d : %*s : %d\n", hd.clens[i], hd.maxcodelen, code, i);
    }

  Xfree (codes);
  Xfree (hd.clens);
}


/*
 * Read a Huffman table from `f' and list its codes, showing each symbol
 * as a character (via char2str) under the heading `title'.
 */
void
ReadCharHuffman (FILE * f, char *title)
{
  DumpHuffman (f, title, 1);
}


/*
 * Read a Huffman table from `f' and list its codes, showing each symbol
 * as a decimal number under the heading `title'.
 */
void
ReadLenHuffman (FILE * f, char *title)
{
  DumpHuffman (f, title, 0);
}


/*
 * Dump a text compression dictionary.  `f' must be positioned just after
 * the magic number.  The header is read with Read_cdh and summarised,
 * then two sections are dumped in file order (index 0 is labelled
 * "non-words" and index 1 "words" in the summary).  What each section
 * contains depends on the dictionary type.
 */
void
DumpTextDict (FILE * f)
{
  struct compression_dict_header cdh;
  int which;

  if (Read_cdh (f, &cdh, NULL, NULL) == -1)
    FatalError (1, "Unable to read dictionary header");

  switch (cdh.dict_type)
    {
    case MG_COMPLETE_DICTIONARY:
      printf ("# COMPLETE DICTIONARY\n");
      break;
    case MG_PARTIAL_DICTIONARY:
      printf ("# PARTIAL DICTIONARY\n");
      break;
    case MG_SEED_DICTIONARY:
      printf ("# SEED DICTIONARY\n");
      break;
    default:
      /* unknown type: no banner is printed */
      break;
    }
  printf ("# num words          = %lu\n", cdh.num_words[1]);
  printf ("# num word chars     = %lu\n", cdh.num_word_chars[1]);
  printf ("# num non-words      = %lu\n", cdh.num_words[0]);
  printf ("# num non-word chars = %lu\n", cdh.num_word_chars[0]);
  printf ("# lookback           = %lu\n", cdh.lookback);

  for (which = 0; which < 2; which++)
    switch (cdh.dict_type)
      {
      case MG_COMPLETE_DICTIONARY:
	ReadInWords (f);
	break;

      case MG_PARTIAL_DICTIONARY:
      case MG_SEED_DICTIONARY:
	/* Both types are dumped the same way: the word list is only
	   read when the header reports words for this section, and the
	   character and length Huffman tables always follow. */
	if (cdh.num_words[which])
	  ReadInWords (f);
	ReadCharHuffman (f, "Characters");
	ReadLenHuffman (f, "Lengths");
	break;

      default:
	/* unknown type: nothing further is read */
	break;
      }
}


/*
 * Dump a compression statistics dictionary.  `f' must be positioned just
 * after the magic number.  After the overall header come two sections
 * (index 0 labelled "non-words", index 1 "words"); each has its own
 * header followed by `num_frags' entries of frequency, occurrence number
 * and a length-prefixed word.
 */
void
DumpStatsDict (FILE * f)
{
  int i;
  compression_stats_header csh;

  /* The overall header is read only to advance past it; none of its
     fields are used below. */
  ReadItems (&csh, sizeof (csh), 1, f, "the statistics header");

  for (i = 0; i < 2; i++)
    {
      unsigned long j;
      frags_stats_header fsh;

      ReadItems (&fsh, sizeof (fsh), 1, f, "a fragment statistics header");

      /* [RPAP - Jan 97: Endian Ordering] */
      NTOHUL(fsh.num_frags);
      NTOHUL(fsh.mem_for_frags);

      if (!quick)
	printf ("#\n# num %9s = %lu\n#\n", i ? "words" : "non-words",
		fsh.num_frags);

      for (j = 0; j < fsh.num_frags; j++)
	{
	  /* The word length is a single byte taken from the file, so the
	     buffer has to be able to hold the largest length that byte
	     can express rather than a typical word length. */
	  u_char Word[LEN_PREFIXED_BUF_LEN];
	  u_long freq, occur_num;

	  ReadItems (&freq, sizeof (freq), 1, f, "a fragment frequency");
	  ReadItems (&occur_num, sizeof (occur_num), 1, f,
		     "a fragment occurrence number");

	  /* [RPAP - Jan 97: Endian Ordering] */
	  NTOHUL(freq);
	  NTOHUL(occur_num);

	  Word[0] = (u_char) ReadByte (f, "a fragment length");
	  /* Read as Word[0] one-byte items so an empty word (count 0) is
	     not mistaken for a failed read. */
	  ReadItems (Word + 1, sizeof (u_char), Word[0], f, "a fragment");

	  printf ("%lu: %7lu : %7lu : \"%s\"\n", j, freq, occur_num,
		  word2str (Word));
	}
    }
}


/*
 * Usage: mgdictlist [-q] <file>
 *
 * Opens the named file, reads its magic number and dispatches to the
 * matching dump routine.  "-q" as the first argument selects quick
 * (terse) output.  Any failure is reported through FatalError.
 */
int
main (int argc, char **argv)
{
  FILE *fp;
  unsigned long magic = 0;

  if (argc < 2)
    FatalError (1, "A file name must be specified");
  dictname = argv[1];
  if (strcmp (dictname, "-q") == 0)
    {
      quick = 1;
      if (argc < 3)
	FatalError (1, "A file name must be specified");
      dictname = argv[2];
    }
  if (!(fp = fopen (dictname, "rb")))  /* [RPAP - Feb 97: WIN32 Port] */
    FatalError (1, "Unable to open \"%s\"", dictname);

  /* NOTE(review): this consumes sizeof (unsigned long) bytes; presumably
     the on-disk magic number has that size on the build platform --
     confirm for LP64 targets. */
  ReadItems (&magic, sizeof (magic), 1, fp, "the magic number");

  NTOHUL(magic);  /* [RPAP - Jan 97: Endian Ordering] */

  switch (magic)
    {
    case MAGIC_STEM_BUILD:
      if (!quick)
	printf ("# Contents of STEM file \"%s\"\n#\n", dictname);
      DumpStemDict (fp);
      break;
    case MAGIC_DICT:
      if (!quick)
	printf ("# Contents of DICT file \"%s\"\n#\n", dictname);
      DumpTextDict (fp);
      break;
    case MAGIC_STATS_DICT:
      if (!quick)
	printf ("# Contents of STATS file \"%s\"\n#\n", dictname);
      DumpStatsDict (fp);
      break;
    default:
      FatalError (1, "Bad magic number. \"%s\" cannot be dumped", dictname);
    }
  fclose (fp);
  return 0;
}