source: trunk/gsdl/packages/mg-1.3d/src/text/mgstat.c@ 30

Last change on this file since 30 was 13, checked in by rjmcnab, 26 years ago

* empty log message *

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.8 KB
Line 
1/**************************************************************************
2 *
3 * mgstat.c -- Program to generate statistics on a text collection
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mgstat.c 13 1998-11-17 09:36:00Z rjmcnab $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
27
28#include "mg_files.h"
29#include "locallib.h"
30#include "mg.h"
31#include "words.h"
32#include "invf.h"
33#include "text.h"
34
35
36/*
37 $Log$
38 Revision 1.1 1998/11/17 09:35:31 rjmcnab
39 *** empty log message ***
40
41 * Revision 1.2 1994/09/20 04:41:59 tes
42 * For version 1.1
43 *
44 */
45
/* Revision identification string for this source file. */
static char *RCSID = "$Id: mgstat.c 13 1998-11-17 09:36:00Z rjmcnab $";

/* Forward declarations for the routines defined further down this file. */
char *search_for_collection (char *name);
int process_file (char *name, char *ext, int exact);
void ProcessDict (char *name);
void ProcessStem (char *name);
long ProcessStemBlk (char *name);	/* [RPAP - Jan 97: Stem Index Change] */

/* Size in bytes of the input text, filled in by ProcessDict() from the
   compressed text header.  Held as a double.  [RJM 07/97: 4G limit] */
static double inputbytes = 0.0;

/* Running sum of the sizes of the files reported by process_file(). */
static unsigned long total = 0;
56
57
58
59
60
61void
62main (int argc, char **argv)
63{
64 unsigned long sub_total;
65 int fast;
66 char *file_name = "";
67 int ch;
68 int exact = 0;
69 long indexed = 0; /* [RPAP - Jan 97: Stem Index Change] */
70 opterr = 0;
71 while ((ch = getopt (argc, argv, "Ehf:d:")) != -1)
72 switch (ch)
73 {
74 case 'E':
75 exact = 1;
76 break;
77 case 'f': /* input file */
78 file_name = optarg;
79 break;
80 case 'd':
81 set_basepath (optarg);
82 break;
83 case 'h':
84 case '?':
85 fprintf (stderr, "usage: %s [-f input_file]"
86 "[-d data directory] [-h]\n", argv[0]);
87 exit (1);
88 }
89
90 if (optind < argc)
91 file_name = search_for_collection (argv[optind]);
92
93 ProcessDict (file_name);
94 ProcessStem (file_name);
95 indexed = ProcessStemBlk (file_name); /* [RPAP - Jan 97: Stem Index Change] */
96
97 {
98 char Name[256];
99 char *s;
100 sprintf (Name, FILE_NAME_FORMAT, get_basepath (), file_name, "");
101 s = strrchr (Name, '/');
102 if (s)
103 *s = '\0';
104 printf ("\nThe collection is in \"%s\"\n", Name);
105 }
106
107
108 printf ("\n\t\tFiles required by mgquery\n");
109 process_file (file_name, TEXT_SUFFIX, exact);
110 process_file (file_name, INVF_SUFFIX, exact);
111 process_file (file_name, TEXT_IDX_WGT_SUFFIX, exact);
112 process_file (file_name, APPROX_WEIGHTS_SUFFIX, exact);
113 process_file (file_name, INVF_DICT_BLOCKED_SUFFIX, exact);
114
115 /* [RPAP - Jan 97: Stem Index Change] */
116 if (indexed)
117 {
118 process_file (file_name, INVF_DICT_BLOCKED_1_SUFFIX, exact);
119 process_file (file_name, INVF_DICT_BLOCKED_2_SUFFIX, exact);
120 process_file (file_name, INVF_DICT_BLOCKED_3_SUFFIX, exact);
121 }
122
123 fast = process_file (file_name, TEXT_DICT_FAST_SUFFIX, exact);
124 if (!fast)
125 {
126 process_file (file_name, TEXT_DICT_SUFFIX, exact);
127 process_file (file_name, TEXT_DICT_AUX_SUFFIX, exact);
128 }
129
130
131 process_file (NULL, "SUB TOTAL", exact);
132 sub_total = total;
133 total = 0;
134
135 printf ("\n\t\tFiles NOT required by mgquery\n");
136 if (fast)
137 {
138 process_file (file_name, TEXT_DICT_SUFFIX, exact);
139 process_file (file_name, TEXT_DICT_AUX_SUFFIX, exact);
140 }
141 process_file (file_name, INVF_DICT_SUFFIX, exact);
142 process_file (file_name, INVF_IDX_SUFFIX, exact);
143 process_file (file_name, TEXT_STATS_DICT_SUFFIX, exact);
144 process_file (file_name, TEXT_IDX_SUFFIX, exact);
145 process_file (file_name, WEIGHTS_SUFFIX, exact);
146 process_file (file_name, INVF_CHUNK_SUFFIX, exact);
147 process_file (file_name, INVF_CHUNK_TRANS_SUFFIX, exact);
148 process_file (file_name, INVF_DICT_HASH_SUFFIX, exact);
149 process_file (file_name, INVF_PARAGRAPH_SUFFIX, exact);
150 process_file (NULL, "SUB TOTAL", exact);
151 total += sub_total;
152 printf ("\n");
153 process_file (NULL, "TOTAL", exact);
154 exit (0);
155}
156
157
158
/*
 * Work out the collection file name to use for a name given on the
 * command line.  The candidates are tried in this order:
 *
 *   1. `name' is a directory (relative to the current directory):
 *      the base path is set to "." and "name/name" is returned.
 *   2. "name.text" is a regular file: the base path is set to "." and
 *      `name' is returned unchanged.
 *   3. "<basepath>/name" is a directory: "name/name" is returned and the
 *      base path is left alone.
 *
 * If none match, `name' is returned unchanged.
 *
 * The "name/name" result lives in a static buffer that is overwritten by
 * the next call.  Every path is formatted with a bounds check; a
 * candidate that does not fit in the buffer is treated as not matching.
 */
char *
search_for_collection (char *name)
{
  char *dir = get_basepath ();
  static char buffer[512];
  struct stat stat_buf;
  int len;

  /* Look in the current directory first */
  if (stat (name, &stat_buf) != -1 && S_ISDIR (stat_buf.st_mode))
    {
      /* The name is a directory */
      len = snprintf (buffer, sizeof (buffer), "%s/%s", name, name);
      if (len >= 0 && (size_t) len < sizeof (buffer))
	{
	  set_basepath (".");
	  return buffer;
	}
    }

  len = snprintf (buffer, sizeof (buffer), "%s.text", name);
  if (len >= 0 && (size_t) len < sizeof (buffer)
      && stat (buffer, &stat_buf) != -1 && S_ISREG (stat_buf.st_mode))
    {
      /* The name is the stem of a collection file in this directory */
      set_basepath (".");
      return name;
    }

  len = snprintf (buffer, sizeof (buffer), "%s/%s", dir, name);
  if (len >= 0 && (size_t) len < sizeof (buffer)
      && stat (buffer, &stat_buf) != -1 && S_ISDIR (stat_buf.st_mode))
    {
      /* The name is a directory under the base path */
      len = snprintf (buffer, sizeof (buffer), "%s/%s", name, name);
      if (len >= 0 && (size_t) len < sizeof (buffer))
	return buffer;
    }

  return name;
}
200
201
202
203
204
205
206void
207ProcessDict (char *name)
208{
209 FILE *f;
210 compression_dict_header cdh;
211 int have_cdh = 0;
212 compressed_text_header cth;
213 int have_cth = 0;
214
215 if ((f = open_file (name, TEXT_DICT_SUFFIX, "rb", MAGIC_DICT, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
216 {
217 Read_cdh (f, &cdh, NULL, NULL);
218 fclose (f);
219 have_cdh = 1;
220 }
221
222 if ((f = open_file (name, TEXT_SUFFIX, "rb", MAGIC_TEXT, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
223 {
224 fread ((char *) &cth, sizeof (cth), 1, f);
225
226 /* [RPAP - Jan 97: Endian Ordering] */
227 NTOHUL(cth.num_of_docs);
228 NTOHD(cth.num_of_bytes); /* [RJM 07/97: 4G limit] */
229 NTOHUL(cth.num_of_words);
230 NTOHUL(cth.length_of_longest_doc);
231 NTOHD(cth.ratio);
232
233 fclose (f);
234 have_cth = 1;
235 }
236
237 if (have_cth)
238 {
239 inputbytes = cth.num_of_bytes;
240 printf ("Input bytes : %10.0f, %8.2f Mbyte\n",
241 cth.num_of_bytes, cth.num_of_bytes / 1024.0 / 1024.0); /* [RJM 07/97: 4G limit] */
242 printf ("Documents : %10lu\n", cth.num_of_docs);
243 printf ("Words in collection [dict] : %10lu\n", cth.num_of_words);
244 printf ("Longest doc in collection [dict] : %10lu characters\n",
245 cth.length_of_longest_doc);
246 printf ("Maximum ratio : %10.2f\n", cth.ratio);
247 }
248
249 if (have_cdh)
250 {
251 printf ("Words in dict : %10lu\n", cdh.num_words[1]);
252 printf ("Non-words in dict : %10lu\n", cdh.num_words[0]);
253 printf ("Total chars of distinct words : %10lu\n", cdh.num_word_chars[1]);
254 printf ("Total chars of distinct non-words : %10lu\n", cdh.num_word_chars[0]);
255 }
256
257}
258
259
260
261
262
263
264
265
266void
267ProcessStem (char *name)
268{
269 FILE *f;
270 struct invf_dict_header idh;
271
272 if (!(f = open_file (name, INVF_DICT_SUFFIX, "rb",
273 MAGIC_STEM_BUILD, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
274 return;
275 fread ((char *) &idh, sizeof (idh), 1, f);
276 printf ("Words in collection [stem] : %10ld\n", NTOHUL(idh.num_of_words)); /* [RPAP - Jan 97: Endian Ordering] */
277 printf ("Words in stem : %10ld\n", NTOHUL(idh.dict_size)); /* [RPAP - Jan 97: Endian Ordering] */
278 printf ("Indexed fragments : %10ld\n", NTOHUL(idh.num_of_docs)); /* [RPAP - Jan 97: Endian Ordering] */
279 printf ("Total chars of stem words : %10ld\n", NTOHUL(idh.total_bytes)); /* [RPAP - Jan 97: Endian Ordering] */
280 fclose (f);
281}
282
283
284
285
286
287/* [RPAP - Jan 97: Stem Index Change] */
288long
289ProcessStemBlk (char *name)
290{
291 FILE *f;
292 struct stem_dict_header sdh;
293
294 if (!(f = open_file (name, INVF_DICT_BLOCKED_SUFFIX, "rb",
295 MAGIC_STEM, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
296 return 0;
297 fread ((char *) &sdh, sizeof (sdh), 1, f);
298 fclose (f);
299 return NTOHUL(sdh.indexed); /* [RPAP - Jan 97: Endian Ordering] */
300}
301
302
303
304
305int
306process_file (char *name, char *ext, int exact)
307{
308 static double scale = 0;
309 static char *units;
310 struct stat buf;
311 if (scale == 0)
312 {
313 /* So can get output in Mb or Kb, */
314 /* will divide by a scale of 1024(Kb) or 1024*1024(Mb) */
315 /* Note: if inputbytes==0, then use 1024 as default */
316 scale = inputbytes > 10 * 1000 * 1000 ? 1024 * 1024 : 1024;
317 units = scale == 1024 ? "Kb" : "Mb";
318 }
319 if (name)
320 {
321 char Name[256];
322 sprintf (Name, FILE_NAME_FORMAT, get_basepath (), name, ext);
323 if (!stat (Name, &buf))
324 {
325 char fname[256];
326 char *nam = strrchr (name, '/');
327 nam = nam ? nam + 1 : name;
328 sprintf (fname, "%s%s", nam, ext);
329
330 if (inputbytes == 0.0)
331 {
332 if (exact)
333 printf ("%s%*s: %10ld bytes %7.3f%%\n", fname,
334 35 - (int) strlen (fname), "",
335 buf.st_size,
336 100.0 * buf.st_size / inputbytes);
337 else
338 printf ("%s%*s: %8.2f %s %7.3f%%\n", fname,
339 35 - (int) strlen (fname), "",
340 buf.st_size / scale,
341 units,
342 100.0 * buf.st_size / inputbytes);
343 }
344 else
345 {
346 if (exact)
347 printf ("%s%*s: %10ld bytes\n", fname,
348 35 - (int) strlen (fname), "",
349 buf.st_size);
350 else
351 printf ("%s%*s: %8.2f %s\n", fname,
352 35 - (int) strlen (fname), "",
353 buf.st_size / scale,
354 units);
355 }
356 total += buf.st_size;
357 return 1;
358 }
359 else
360 return 0;
361 }
362 else
363 {
364 if (inputbytes == 0.0)
365 {
366 if (exact)
367 printf ("%-34s : %10ld bytes %7.3f%%\n", ext,
368 total,
369 100.0 * total / inputbytes);
370 else
371 printf ("%-34s : %8.2f %s %7.3f%%\n", ext,
372 total / scale, units,
373 100.0 * total / inputbytes);
374 }
375 else
376 {
377 if (exact)
378 printf ("%-34s : %10ld bytes\n", ext, total);
379 else
380 printf ("%-34s : %8.2f %s\n", ext,
381 total / scale, units);
382 }
383 }
384 return 1;
385
386}
Note: See TracBrowser for help on using the repository browser.