source: main/tags/2.80/indexers/mg/src/text/mgstat.c@ 24541

Last change on this file since 24541 was 3745, checked in by mdewsnip, 21 years ago

Addition of MG package for search and retrieval

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 10.5 KB
Line 
1/**************************************************************************
2 *
3 * mgstat.c -- Program to generate statistics on a text collection
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mgstat.c 3745 2003-02-20 21:20:24Z mdewsnip $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#include "netorder.h" /* [RPAP - Jan 97: Endian Ordering] */
27
28#include "mg_files.h"
29#include "locallib.h"
30#include "mg.h"
31#include "words.h"
32#include "invf.h"
33#include "text.h"
34#include "longlong.h"
35
36/*
37 $Log$
38 Revision 1.1 2003/02/20 21:18:24 mdewsnip
39 Addition of MG package for search and retrieval
40
41 Revision 1.2 2001/09/21 12:46:42 kjm18
42 updated mg to be in line with mg_1.3f. Now uses long long for some variables
43 to enable indexing of very large collections.
44
45 Revision 1.1 1999/08/10 21:18:19 sjboddie
46 renamed mg-1.3d directory mg
47
48 Revision 1.2 1998/11/25 07:55:50 rjmcnab
49
50 Modified mg so that you can specify the stemmer you want
51 to use via a command line option. You specify it to
52 mg_passes during the build process. The number of the
53 stemmer that you used is stored within the inverted
54 dictionary header and the stemmed dictionary header so
55 the correct stemmer is used in later stages of building
56 and querying.
57
58 Revision 1.1 1998/11/17 09:35:31 rjmcnab
59 *** empty log message ***
60
61 * Revision 1.2 1994/09/20 04:41:59 tes
62 * For version 1.1
63 *
64 */
65
66static char *RCSID = "$Id: mgstat.c 3745 2003-02-20 21:20:24Z mdewsnip $";
67
68char *search_for_collection (char *name);
69int process_file (char *, char *, int);
70void ProcessDict (char *);
71void ProcessStem (char *);
72long ProcessStemBlk (char *name); /* [RPAP - Jan 97: Stem Index Change] */
73
74static mg_ullong inputbytes = 0;
75static mg_ullong total = 0;
76
77
78
79
80
81int main (int argc, char **argv)
82{
83 mg_ullong sub_total;
84 int fast;
85 char *file_name = "";
86 int ch;
87 int exact = 0;
88 long indexed = 0; /* [RPAP - Jan 97: Stem Index Change] */
89 opterr = 0;
90 while ((ch = getopt (argc, argv, "Ehf:d:")) != -1)
91 switch (ch)
92 {
93 case 'E':
94 exact = 1;
95 break;
96 case 'f': /* input file */
97 file_name = optarg;
98 break;
99 case 'd':
100 set_basepath (optarg);
101 break;
102 case 'h':
103 case '?':
104 fprintf (stderr, "usage: %s [-f input_file]"
105 "[-d data directory] [-h]\n", argv[0]);
106 exit (1);
107 }
108
109 if (optind < argc)
110 file_name = search_for_collection (argv[optind]);
111
112 ProcessDict (file_name);
113 ProcessStem (file_name);
114 indexed = ProcessStemBlk (file_name); /* [RPAP - Jan 97: Stem Index Change] */
115
116 {
117 char Name[256];
118 char *s;
119 sprintf (Name, FILE_NAME_FORMAT, get_basepath (), file_name, "");
120 s = strrchr (Name, '/');
121 if (s)
122 *s = '\0';
123 printf ("\nThe collection is in \"%s\"\n", Name);
124 }
125
126
127 printf ("\n\t\tFiles required by mgquery\n");
128 process_file (file_name, TEXT_SUFFIX, exact);
129 process_file (file_name, INVF_SUFFIX, exact);
130 process_file (file_name, TEXT_IDX_WGT_SUFFIX, exact);
131 process_file (file_name, APPROX_WEIGHTS_SUFFIX, exact);
132 process_file (file_name, INVF_DICT_BLOCKED_SUFFIX, exact);
133
134 /* [RPAP - Jan 97: Stem Index Change] */
135 if (indexed)
136 {
137 process_file (file_name, INVF_DICT_BLOCKED_1_SUFFIX, exact);
138 process_file (file_name, INVF_DICT_BLOCKED_2_SUFFIX, exact);
139 process_file (file_name, INVF_DICT_BLOCKED_3_SUFFIX, exact);
140 }
141
142 fast = process_file (file_name, TEXT_DICT_FAST_SUFFIX, exact);
143 if (!fast)
144 {
145 process_file (file_name, TEXT_DICT_SUFFIX, exact);
146 process_file (file_name, TEXT_DICT_AUX_SUFFIX, exact);
147 }
148
149
150 process_file (NULL, "SUB TOTAL", exact);
151 sub_total = total;
152 total = 0;
153
154 printf ("\n\t\tFiles NOT required by mgquery\n");
155 if (fast)
156 {
157 process_file (file_name, TEXT_DICT_SUFFIX, exact);
158 process_file (file_name, TEXT_DICT_AUX_SUFFIX, exact);
159 }
160 process_file (file_name, INVF_DICT_SUFFIX, exact);
161 process_file (file_name, INVF_IDX_SUFFIX, exact);
162 process_file (file_name, TEXT_STATS_DICT_SUFFIX, exact);
163 process_file (file_name, TEXT_IDX_SUFFIX, exact);
164 process_file (file_name, WEIGHTS_SUFFIX, exact);
165 process_file (file_name, INVF_CHUNK_SUFFIX, exact);
166 process_file (file_name, INVF_CHUNK_TRANS_SUFFIX, exact);
167 process_file (file_name, INVF_DICT_HASH_SUFFIX, exact);
168 process_file (file_name, INVF_PARAGRAPH_SUFFIX, exact);
169 process_file (NULL, "SUB TOTAL", exact);
170 total += sub_total;
171 printf ("\n");
172 process_file (NULL, "TOTAL", exact);
173 return 0;
174
175}
176
177
178
/*
 * search_for_collection -- work out the collection file name to use for
 * a collection named on the command line.
 *
 * Candidates are tried in this order:
 *   1. NAME is a directory relative to the current directory: the base
 *      path is set to "." and "NAME/NAME" is returned.
 *   2. "NAME.text" is a regular file relative to the current directory:
 *      the base path is set to "." and NAME is returned unchanged.
 *   3. "<basepath>/NAME" is a directory: "NAME/NAME" is returned and the
 *      base path is left as it was.
 * If none of these match, NAME is returned unchanged.
 *
 * The result is either NAME itself or a pointer to a static buffer that
 * the next call overwrites.  A candidate whose path does not fit in
 * that buffer is skipped instead of being written past its end.
 */
char *
search_for_collection (char *name)
{
  static char buffer[512];
  char *dir = get_basepath ();	/* captured before any set_basepath (".") */
  struct stat stat_buf;
  int len;

  /* Look in the current directory first */
  if (stat (name, &stat_buf) != -1 && S_ISDIR (stat_buf.st_mode))
    {
      /* The name is a directory */
      len = snprintf (buffer, sizeof (buffer), "%s/%s", name, name);
      if (len >= 0 && (size_t) len < sizeof (buffer))
        {
          set_basepath (".");
          return buffer;
        }
    }

  len = snprintf (buffer, sizeof (buffer), "%s.text", name);
  if (len >= 0 && (size_t) len < sizeof (buffer)
      && stat (buffer, &stat_buf) != -1 && S_ISREG (stat_buf.st_mode))
    {
      /* The name is the stem of a text file in the current directory */
      set_basepath (".");
      return name;
    }

  len = snprintf (buffer, sizeof (buffer), "%s/%s", dir, name);
  if (len >= 0 && (size_t) len < sizeof (buffer)
      && stat (buffer, &stat_buf) != -1 && S_ISDIR (stat_buf.st_mode))
    {
      /* The name is a directory below the base path */
      len = snprintf (buffer, sizeof (buffer), "%s/%s", name, name);
      if (len >= 0 && (size_t) len < sizeof (buffer))
        return buffer;
    }

  return name;
}
220
221
222
223
224
225
226void
227ProcessDict (char *name)
228{
229 FILE *f;
230 compression_dict_header cdh;
231 int have_cdh = 0;
232 compressed_text_header cth;
233 int have_cth = 0;
234
235 if ((f = open_file (name, TEXT_DICT_SUFFIX, "rb", MAGIC_DICT, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
236 {
237 Read_cdh (f, &cdh, NULL, NULL);
238 fclose (f);
239 have_cdh = 1;
240 }
241
242 if ((f = open_file (name, TEXT_SUFFIX, "rb", MAGIC_TEXT, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
243 {
244 fread ((char *) &cth, sizeof (cth), 1, f);
245
246 /* [RPAP - Jan 97: Endian Ordering] */
247 NTOHUL(cth.num_of_docs);
248 NTOHD(cth.num_of_bytes); /* [RJM 07/97: 4G limit] */
249 NTOHUL(cth.num_of_words);
250 NTOHUL(cth.length_of_longest_doc);
251 NTOHD(cth.ratio);
252
253 fclose (f);
254 have_cth = 1;
255 }
256
257 if (have_cth)
258 {
259 inputbytes = cth.num_of_bytes;
260 printf ("Input bytes : %10" ULL_FS ", %8.2f Mbyte\n",
261 cth.num_of_bytes, (double) cth.num_of_bytes / 1024 / 1024);
262 printf ("Documents : %10lu\n", cth.num_of_docs);
263 printf ("Words in collection [dict] : %10lu\n", cth.num_of_words);
264 printf ("Longest doc in collection [dict] : %10lu characters\n",
265 cth.length_of_longest_doc);
266 printf ("Maximum ratio : %10.2f\n", cth.ratio);
267 }
268
269 if (have_cdh)
270 {
271 printf ("Words in dict : %10lu\n", cdh.num_words[1]);
272 printf ("Non-words in dict : %10lu\n", cdh.num_words[0]);
273 printf ("Total chars of distinct words : %10lu\n", cdh.num_word_chars[1]);
274 printf ("Total chars of distinct non-words : %10lu\n", cdh.num_word_chars[0]);
275 }
276
277}
278
279
280
281
282
283
284
285
286void
287ProcessStem (char *name)
288{
289 FILE *f;
290 struct invf_dict_header idh;
291
292 if (!(f = open_file (name, INVF_DICT_SUFFIX, "rb",
293 MAGIC_STEM_BUILD, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
294 return;
295 fread ((char *) &idh, sizeof (idh), 1, f);
296 printf ("Words in collection [stem] : %10ld\n", NTOHUL(idh.num_of_words)); /* [RPAP - Jan 97: Endian Ordering] */
297 printf ("Words in stem : %10ld\n", NTOHUL(idh.dict_size)); /* [RPAP - Jan 97: Endian Ordering] */
298 printf ("Indexed fragments : %10ld\n", NTOHUL(idh.num_of_docs)); /* [RPAP - Jan 97: Endian Ordering] */
299 printf ("Total chars of stem words : %10ld\n", NTOHUL(idh.total_bytes)); /* [RPAP - Jan 97: Endian Ordering] */
300 fclose (f);
301}
302
303
304
305
306
307/* [RPAP - Jan 97: Stem Index Change] */
308long
309ProcessStemBlk (char *name)
310{
311 FILE *f;
312 struct stem_dict_header sdh;
313
314 if (!(f = open_file (name, INVF_DICT_BLOCKED_SUFFIX, "rb",
315 MAGIC_STEM, MG_MESSAGE))) /* [RPAP - Feb 97: WIN32 Port] */
316 return 0;
317 fread ((char *) &sdh, sizeof (sdh), 1, f);
318 fclose (f);
319 return NTOHUL(sdh.indexed); /* [RPAP - Jan 97: Endian Ordering] */
320}
321
322
323
324
325int
326process_file (char *name, char *ext, int exact)
327{
328 static double scale = 0;
329 static char *units;
330 struct stat buf;
331 if (scale == 0)
332 {
333 /* So can get output in Mb or Kb, */
334 /* will divide by a scale of 1024(Kb) or 1024*1024(Mb) */
335 /* Note: if inputbytes==0, then use 1024 as default */
336 scale = inputbytes > 10 * 1000 * 1000 ? 1024 * 1024 : 1024;
337 units = scale == 1024 ? "Kb" : "Mb";
338 }
339 if (name)
340 {
341 char Name[256];
342 sprintf (Name, FILE_NAME_FORMAT, get_basepath (), name, ext);
343 if (!stat (Name, &buf))
344 {
345 char fname[256];
346 char *nam = strrchr (name, '/');
347 nam = nam ? nam + 1 : name;
348 sprintf (fname, "%s%s", nam, ext);
349
350 if (inputbytes == 0.0)
351 {
352 if (exact)
353 printf ("%s%*s: %10ld bytes %7.3f%%\n", fname,
354 35 - (int) strlen (fname), "",
355 buf.st_size,
356 100.0 * buf.st_size / inputbytes);
357 else
358 printf ("%s%*s: %8.2f %s %7.3f%%\n", fname,
359 35 - (int) strlen (fname), "",
360 buf.st_size / scale,
361 units,
362 100.0 * buf.st_size / inputbytes);
363 }
364 else
365 {
366 if (exact)
367 printf ("%s%*s: %10ld bytes\n", fname,
368 35 - (int) strlen (fname), "",
369 buf.st_size);
370 else
371 printf ("%s%*s: %8.2f %s\n", fname,
372 35 - (int) strlen (fname), "",
373 buf.st_size / scale,
374 units);
375 }
376 total += buf.st_size;
377 return 1;
378 }
379 else
380 return 0;
381 }
382 else
383 {
384 if (inputbytes == 0.0)
385 {
386 if (exact)
387 printf ("%-34s : %10" ULL_FS " bytes %7.3f%%\n", ext,
388 total,
389 100.0 * total / inputbytes);
390 else
391 printf ("%-34s : %8.2f %s %7.3f%%\n", ext,
392 total / scale, units,
393 100.0 * total / inputbytes);
394 }
395 else
396 {
397 if (exact)
398 printf ("%-34s : %10" ULL_FS " bytes\n", ext, total);
399 else
400 printf ("%-34s : %8.2f %s\n", ext,
401 total / scale, units);
402 }
403 }
404 return 1;
405
406}
Note: See TracBrowser for help on using the repository browser.