source: trunk/gsdl/packages/mg/src/text/mgstat.c@ 1014

Last change on this file since 1014 was 439, checked in by sjboddie, 25 years ago

renamed mg-1.3d directory mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.2 KB
Line 
1/**************************************************************************
2 *
3 * mgstat.c -- Program to generate statistics on a text collection
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mgstat.c 439 1999-08-10 21:23:37Z sjboddie $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
27
28#include "mg_files.h"
29#include "locallib.h"
30#include "mg.h"
31#include "words.h"
32#include "invf.h"
33#include "text.h"
34
35
36/*
37 $Log$
38 Revision 1.1 1999/08/10 21:18:19 sjboddie
39 renamed mg-1.3d directory mg
40
41 Revision 1.2 1998/11/25 07:55:50 rjmcnab
42
43 Modified mg so that you can specify the stemmer you want
44 to use via a command line option. You specify it to
45 mg_passes during the build process. The number of the
46 stemmer that you used is stored within the inverted
47 dictionary header and the stemmed dictionary header so
48 the correct stemmer is used in later stages of building
49 and querying.
50
51 Revision 1.1 1998/11/17 09:35:31 rjmcnab
52 *** empty log message ***
53
54 * Revision 1.2 1994/09/20 04:41:59 tes
55 * For version 1.1
56 *
57 */
58
/* RCS/SVN identification string for this source file. */
static char *RCSID = "$Id: mgstat.c 439 1999-08-10 21:23:37Z sjboddie $";

/* Forward declarations of the routines defined later in this file. */
char *search_for_collection (char *name);
int process_file (char *name, char *ext, int exact);
void ProcessDict (char *name);
void ProcessStem (char *name);
long ProcessStemBlk (char *name);	/* [RPAP - Jan 97: Stem Index Change] */

/* Number of input bytes in the collection, as read from the compressed
   text header by ProcessDict ().  Kept as a double rather than an
   integer type.  [RJM 07/97: 4G limit] */
static double inputbytes = 0.0;

/* Running sum of the sizes of the files reported by process_file (). */
static unsigned long total = 0;
69
70
71
72
73
74int main (int argc, char **argv)
75{
76 unsigned long sub_total;
77 int fast;
78 char *file_name = "";
79 int ch;
80 int exact = 0;
81 long indexed = 0; /* [RPAP - Jan 97: Stem Index Change] */
82 opterr = 0;
83 while ((ch = getopt (argc, argv, "Ehf:d:")) != -1)
84 switch (ch)
85 {
86 case 'E':
87 exact = 1;
88 break;
89 case 'f': /* input file */
90 file_name = optarg;
91 break;
92 case 'd':
93 set_basepath (optarg);
94 break;
95 case 'h':
96 case '?':
97 fprintf (stderr, "usage: %s [-f input_file]"
98 "[-d data directory] [-h]\n", argv[0]);
99 exit (1);
100 }
101
102 if (optind < argc)
103 file_name = search_for_collection (argv[optind]);
104
105 ProcessDict (file_name);
106 ProcessStem (file_name);
107 indexed = ProcessStemBlk (file_name); /* [RPAP - Jan 97: Stem Index Change] */
108
109 {
110 char Name[256];
111 char *s;
112 sprintf (Name, FILE_NAME_FORMAT, get_basepath (), file_name, "");
113 s = strrchr (Name, '/');
114 if (s)
115 *s = '\0';
116 printf ("\nThe collection is in \"%s\"\n", Name);
117 }
118
119
120 printf ("\n\t\tFiles required by mgquery\n");
121 process_file (file_name, TEXT_SUFFIX, exact);
122 process_file (file_name, INVF_SUFFIX, exact);
123 process_file (file_name, TEXT_IDX_WGT_SUFFIX, exact);
124 process_file (file_name, APPROX_WEIGHTS_SUFFIX, exact);
125 process_file (file_name, INVF_DICT_BLOCKED_SUFFIX, exact);
126
127 /* [RPAP - Jan 97: Stem Index Change] */
128 if (indexed)
129 {
130 process_file (file_name, INVF_DICT_BLOCKED_1_SUFFIX, exact);
131 process_file (file_name, INVF_DICT_BLOCKED_2_SUFFIX, exact);
132 process_file (file_name, INVF_DICT_BLOCKED_3_SUFFIX, exact);
133 }
134
135 fast = process_file (file_name, TEXT_DICT_FAST_SUFFIX, exact);
136 if (!fast)
137 {
138 process_file (file_name, TEXT_DICT_SUFFIX, exact);
139 process_file (file_name, TEXT_DICT_AUX_SUFFIX, exact);
140 }
141
142
143 process_file (NULL, "SUB TOTAL", exact);
144 sub_total = total;
145 total = 0;
146
147 printf ("\n\t\tFiles NOT required by mgquery\n");
148 if (fast)
149 {
150 process_file (file_name, TEXT_DICT_SUFFIX, exact);
151 process_file (file_name, TEXT_DICT_AUX_SUFFIX, exact);
152 }
153 process_file (file_name, INVF_DICT_SUFFIX, exact);
154 process_file (file_name, INVF_IDX_SUFFIX, exact);
155 process_file (file_name, TEXT_STATS_DICT_SUFFIX, exact);
156 process_file (file_name, TEXT_IDX_SUFFIX, exact);
157 process_file (file_name, WEIGHTS_SUFFIX, exact);
158 process_file (file_name, INVF_CHUNK_SUFFIX, exact);
159 process_file (file_name, INVF_CHUNK_TRANS_SUFFIX, exact);
160 process_file (file_name, INVF_DICT_HASH_SUFFIX, exact);
161 process_file (file_name, INVF_PARAGRAPH_SUFFIX, exact);
162 process_file (NULL, "SUB TOTAL", exact);
163 total += sub_total;
164 printf ("\n");
165 process_file (NULL, "TOTAL", exact);
166
167 return 0;
168}
169
170
171
/*
 * search_for_collection -- work out the collection name to use for a
 * name given on the command line.
 *
 * The checks are made in this order:
 *   1. NAME is a directory (relative to the current directory): the base
 *      path is set to "." and "NAME/NAME" is returned.
 *   2. "NAME.text" is a regular file: the base path is set to "." and
 *      NAME is returned.
 *   3. "<basepath>/NAME" is a directory: "NAME/NAME" is returned and the
 *      base path is left alone.
 *   4. Otherwise NAME is returned unchanged.
 *
 * The result is either NAME itself or a pointer to a static buffer that
 * is overwritten by the next call.  A path that does not fit in that
 * buffer is treated as not found instead of overflowing it.
 */
char *
search_for_collection (char *name)
{
  /* Fetched before any set_basepath () call below. */
  char *dir = get_basepath ();
  static char buffer[512];
  struct stat stat_buf;
  int len;

  /* Look in the current directory first */
  if (stat (name, &stat_buf) != -1 && S_ISDIR (stat_buf.st_mode))
    {
      /* The name is a directory */
      len = snprintf (buffer, sizeof (buffer), "%s/%s", name, name);
      if (len < 0 || (size_t) len >= sizeof (buffer))
	return name;
      set_basepath (".");
      return buffer;
    }

  len = snprintf (buffer, sizeof (buffer), "%s.text", name);
  if (len >= 0 && (size_t) len < sizeof (buffer)
      && stat (buffer, &stat_buf) != -1 && S_ISREG (stat_buf.st_mode))
    {
      /* The name is the prefix of a regular ".text" file */
      set_basepath (".");
      return name;
    }

  len = snprintf (buffer, sizeof (buffer), "%s/%s", dir, name);
  if (len >= 0 && (size_t) len < sizeof (buffer)
      && stat (buffer, &stat_buf) != -1 && S_ISDIR (stat_buf.st_mode))
    {
      /* The name is a directory below the base path */
      len = snprintf (buffer, sizeof (buffer), "%s/%s", name, name);
      if (len >= 0 && (size_t) len < sizeof (buffer))
	return buffer;
    }

  return name;
}
213
214
215
216
217
218
219void
220ProcessDict (char *name)
221{
222 FILE *f;
223 compression_dict_header cdh;
224 int have_cdh = 0;
225 compressed_text_header cth;
226 int have_cth = 0;
227
228 if ((f = open_file (name, TEXT_DICT_SUFFIX, "rb", MAGIC_DICT, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
229 {
230 Read_cdh (f, &cdh, NULL, NULL);
231 fclose (f);
232 have_cdh = 1;
233 }
234
235 if ((f = open_file (name, TEXT_SUFFIX, "rb", MAGIC_TEXT, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
236 {
237 fread ((char *) &cth, sizeof (cth), 1, f);
238
239 /* [RPAP - Jan 97: Endian Ordering] */
240 NTOHUL(cth.num_of_docs);
241 NTOHD(cth.num_of_bytes); /* [RJM 07/97: 4G limit] */
242 NTOHUL(cth.num_of_words);
243 NTOHUL(cth.length_of_longest_doc);
244 NTOHD(cth.ratio);
245
246 fclose (f);
247 have_cth = 1;
248 }
249
250 if (have_cth)
251 {
252 inputbytes = cth.num_of_bytes;
253 printf ("Input bytes : %10.0f, %8.2f Mbyte\n",
254 cth.num_of_bytes, cth.num_of_bytes / 1024.0 / 1024.0); /* [RJM 07/97: 4G limit] */
255 printf ("Documents : %10lu\n", cth.num_of_docs);
256 printf ("Words in collection [dict] : %10lu\n", cth.num_of_words);
257 printf ("Longest doc in collection [dict] : %10lu characters\n",
258 cth.length_of_longest_doc);
259 printf ("Maximum ratio : %10.2f\n", cth.ratio);
260 }
261
262 if (have_cdh)
263 {
264 printf ("Words in dict : %10lu\n", cdh.num_words[1]);
265 printf ("Non-words in dict : %10lu\n", cdh.num_words[0]);
266 printf ("Total chars of distinct words : %10lu\n", cdh.num_word_chars[1]);
267 printf ("Total chars of distinct non-words : %10lu\n", cdh.num_word_chars[0]);
268 }
269
270}
271
272
273
274
275
276
277
278
279void
280ProcessStem (char *name)
281{
282 FILE *f;
283 struct invf_dict_header idh;
284
285 if (!(f = open_file (name, INVF_DICT_SUFFIX, "rb",
286 MAGIC_STEM_BUILD, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
287 return;
288 fread ((char *) &idh, sizeof (idh), 1, f);
289 printf ("Words in collection [stem] : %10ld\n", NTOHUL(idh.num_of_words)); /* [RPAP - Jan 97: Endian Ordering] */
290 printf ("Words in stem : %10ld\n", NTOHUL(idh.dict_size)); /* [RPAP - Jan 97: Endian Ordering] */
291 printf ("Indexed fragments : %10ld\n", NTOHUL(idh.num_of_docs)); /* [RPAP - Jan 97: Endian Ordering] */
292 printf ("Total chars of stem words : %10ld\n", NTOHUL(idh.total_bytes)); /* [RPAP - Jan 97: Endian Ordering] */
293 fclose (f);
294}
295
296
297
298
299
300/* [RPAP - Jan 97: Stem Index Change] */
301long
302ProcessStemBlk (char *name)
303{
304 FILE *f;
305 struct stem_dict_header sdh;
306
307 if (!(f = open_file (name, INVF_DICT_BLOCKED_SUFFIX, "rb",
308 MAGIC_STEM, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
309 return 0;
310 fread ((char *) &sdh, sizeof (sdh), 1, f);
311 fclose (f);
312 return NTOHUL(sdh.indexed); /* [RPAP - Jan 97: Endian Ordering] */
313}
314
315
316
317
318int
319process_file (char *name, char *ext, int exact)
320{
321 static double scale = 0;
322 static char *units;
323 struct stat buf;
324 if (scale == 0)
325 {
326 /* So can get output in Mb or Kb, */
327 /* will divide by a scale of 1024(Kb) or 1024*1024(Mb) */
328 /* Note: if inputbytes==0, then use 1024 as default */
329 scale = inputbytes > 10 * 1000 * 1000 ? 1024 * 1024 : 1024;
330 units = scale == 1024 ? "Kb" : "Mb";
331 }
332 if (name)
333 {
334 char Name[256];
335 sprintf (Name, FILE_NAME_FORMAT, get_basepath (), name, ext);
336 if (!stat (Name, &buf))
337 {
338 char fname[256];
339 char *nam = strrchr (name, '/');
340 nam = nam ? nam + 1 : name;
341 sprintf (fname, "%s%s", nam, ext);
342
343 if (inputbytes == 0.0)
344 {
345 if (exact)
346 printf ("%s%*s: %10ld bytes %7.3f%%\n", fname,
347 35 - (int) strlen (fname), "",
348 buf.st_size,
349 100.0 * buf.st_size / inputbytes);
350 else
351 printf ("%s%*s: %8.2f %s %7.3f%%\n", fname,
352 35 - (int) strlen (fname), "",
353 buf.st_size / scale,
354 units,
355 100.0 * buf.st_size / inputbytes);
356 }
357 else
358 {
359 if (exact)
360 printf ("%s%*s: %10ld bytes\n", fname,
361 35 - (int) strlen (fname), "",
362 buf.st_size);
363 else
364 printf ("%s%*s: %8.2f %s\n", fname,
365 35 - (int) strlen (fname), "",
366 buf.st_size / scale,
367 units);
368 }
369 total += buf.st_size;
370 return 1;
371 }
372 else
373 return 0;
374 }
375 else
376 {
377 if (inputbytes == 0.0)
378 {
379 if (exact)
380 printf ("%-34s : %10ld bytes %7.3f%%\n", ext,
381 total,
382 100.0 * total / inputbytes);
383 else
384 printf ("%-34s : %8.2f %s %7.3f%%\n", ext,
385 total / scale, units,
386 100.0 * total / inputbytes);
387 }
388 else
389 {
390 if (exact)
391 printf ("%-34s : %10ld bytes\n", ext, total);
392 else
393 printf ("%-34s : %8.2f %s\n", ext,
394 total / scale, units);
395 }
396 }
397 return 1;
398
399}
Note: See TracBrowser for help on using the repository browser.