source: trunk/gsdl/packages/mg-1.3d/src/text/mg_passes.c@ 30

Last change on this file since 30 was 13, checked in by rjmcnab, 26 years ago

* empty log message *

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 14.8 KB
Line 
1/**************************************************************************
2 *
3 * mg_passes.c -- Driver for the various passes
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_passes.c 13 1998-11-17 09:36:00Z rjmcnab $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#ifdef HAVE_MALLINFO
27# include <malloc.h>
28#endif
29
30#include "memlib.h"
31#include "messages.h"
32#include "timing.h"
33
34#include "mg_files.h"
35#include "mg.h"
36#include "build.h"
37#include "text.h"
38#include "stemmer.h"
39
40
41/*
42 $Log$
43 Revision 1.1 1998/11/17 09:35:13 rjmcnab
44 *** empty log message ***
45
46 * Revision 1.3 1994/10/20 03:56:57 tes
47 * I have rewritten the boolean query optimiser and abstracted out the
48 * components of the boolean query.
49 *
50 * Revision 1.2 1994/09/20 04:41:52 tes
51 * For version 1.1
52 *
53 */
54
/* RCS identification string for this translation unit. */
static char *RCSID = "$Id: mg_passes.c 13 1998-11-17 09:36:00Z rjmcnab $";

/* Number of entries in the PassData table. */
#define MAX_PASSES 5

/*
 * Bit flags held in `Passes'.  driver() tests bit `i' to decide whether
 * entry `i' of PassData is run, so the bit positions must follow the
 * table order.
 */
#define SPECIAL     (1 << 0)
#define TEXT_PASS_1 (1 << 1)
#define TEXT_PASS_2 (1 << 2)
#define IVF_PASS_1  (1 << 3)
#define IVF_PASS_2  (1 << 4)

/* Smallest accepted buffer size; also the refill threshold in driver(). */
#define MIN_BUF 8192

/* Byte that ends each document in the input stream (^B). */
#define TERMRECORD '\002'
67
/*
 * Build options with external linkage.  All of them are filled in by the
 * option parser in main().
 */
unsigned long buf_size = 3UL * 1024 * 1024;		/* 3Mb; -b, input buffer bytes */
unsigned long invf_buffer_size = 5UL * 1024 * 1024;	/* 5Mb; -m */
unsigned long ChunkLimit = 0;				/* -c */
char InvfLevel = 2;					/* -1, -2 or -3 */
char SkipSGML = 0;					/* -G */
char MakeWeights = 0;					/* -W */
FILE *Comp_Stats = NULL;				/* opened by main() when -C is given */
int comp_stat_point = 0;				/* -C, scaled by 1024 */

/* Running totals kept as doubles.  [RJM 07/97: 4G limit] */
double bytes_processed = 0;
double bytes_received = 0;

int stem_method = 0;					/* -s, masked with STEMMER_MASK */

/* State private to this file. */
static char Passes = 0;			/* bit set of the passes selected */
static unsigned long trace = 0;		/* -t, bytes between trace lines */
static int Dump = 0;			/* -D, dump the failing document */
static char **files = NULL;		/* input files still to be opened */
static int num_files = 0;		/* number of entries left in `files' */
static char *trace_name = NULL;		/* -n, trace file name */
86
87
88typedef struct pass_data
89 {
90 char *name;
91 int (*init) (char *);
92 int (*process) (u_char *, int);
93 int (*done) (char *);
94#ifdef HAVE_TIMES
95 clock_t init_time;
96 clock_t process_time;
97 clock_t done_time;
98#else
99 struct timeval init_time;
100 struct timeval process_time;
101 struct timeval done_time;
102#endif
103 }
104pass_data;
105
106#ifdef HAVE_TIMES
107#define NULL_TIMES 0, 0, 0
108#else
109#define NULL_TIMES {0, 0}, {0, 0}, {0, 0}
110#endif
111
112static pass_data PassData[MAX_PASSES] =
113{
114 {"special", init_special, process_special, done_special, NULL_TIMES},
115 {"text.pass1", init_text_1, process_text_1, done_text_1, NULL_TIMES},
116 {"text.pass2", init_text_2, process_text_2, done_text_2, NULL_TIMES},
117 {"ivf.pass1", init_ivf_1, process_ivf_1, done_ivf_1, NULL_TIMES},
118 {"ivf.pass2", init_ivf_2, process_ivf_2, done_ivf_2, NULL_TIMES},
119};
120
121static char *usage_str = "\nUSAGE:\n"
122" %s [-h] [-G] [-D] [-1|-2|-3] [-T1] [-T2] [-I1] [-I2] [-N1]\n"
123" %*s [-N2] [-W] [-S] [-b buffer-size] [-d dictionary-directory]\n"
124" %*s [-t trace-point Mb] [-m invf-memory] [-c chunk-limit]\n"
125" %*s [-n trace-name] [-C comp-stat-size] [-s stem_method] -f doc-collection-name\n";
126
127
128
129
130
131
132
133
134
135static void
136usage (char *err)
137{
138 if (err)
139 Message (err);
140 fprintf (stderr, usage_str, msg_prefix, strlen (msg_prefix), "",
141 strlen (msg_prefix), "", strlen (msg_prefix), "");
142 exit (1);
143}
144
145
146
147
148#if 0
/*
 * str_comma -- format `u' in decimal with a comma between each group of
 * three digits, e.g. 1234567 -> "1,234,567".
 *
 * Returns a pointer to a static buffer that is overwritten by the next
 * call.  Both buffers are sized from sizeof (unsigned long): a byte
 * contributes fewer than three decimal digits, and there is at most one
 * comma per three digits, so no width of unsigned long can overflow them.
 */
static char *
str_comma (unsigned long u)
{
  static char buf[sizeof (unsigned long) * 4 + 1];
  char digits[sizeof (unsigned long) * 3 + 1];
  int ndigits = sprintf (digits, "%lu", u);
  char *out = buf;
  int i;

  for (i = 0; i < ndigits; i++)
    {
      /* A comma goes before every digit that starts a full group of 3. */
      if (i > 0 && (ndigits - i) % 3 == 0)
	*out++ = ',';
      *out++ = digits[i];
    }
  *out = '\0';
  return (buf);
}
172#endif
173
174
175
176
177int
178open_next_file (int in_fd)
179{
180 if (in_fd > 0)
181 close (in_fd);
182 if (num_files == 0)
183 return (-1);
184 if ((in_fd = open (files[0], O_RDONLY)) == -1)
185 FatalError (1, "Cannot open %s", files[0]);
186 files++;
187 num_files--;
188 return (in_fd);
189}
190
191
192static void
193driver (int in_fd, FILE * Trace, char *file_name)
194{
195 int pass, num = 1;
196
197
198 char *buffer = Xmalloc (buf_size);
199 unsigned long num_docs = 0;
200 unsigned long block_bytes = 0;
201 register int buf_left = buf_size;
202 register char *look_pos = buffer;
203 register char *end_pos = buffer;
204
205 ProgTime StartTime, InitTime, ProcTime, DoneTime;
206
207 GetTime (&StartTime);
208
209 for (pass = 0; pass < MAX_PASSES; pass++)
210 if (Passes & (1 << pass))
211 {
212 pass_data *pd = &PassData[pass];
213#ifdef HAVE_TIMES
214 struct tms tims;
215 times (&tims);
216 pd->init_time -= tims.tms_utime + tims.tms_stime;
217#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
218 struct rusage ru;
219
220 getrusage (RUSAGE_SELF, &ru);
221 pd->init_time.tv_sec -= ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
222 pd->init_time.tv_usec -= ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
223#endif
224 if (pd->init (file_name) == COMPERROR)
225 FatalError (1, "Error during init of \"%s\"", pd->name);
226
227#ifdef HAVE_TIMES
228 times (&tims);
229 pd->init_time += tims.tms_utime + tims.tms_stime;
230#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
231 getrusage (RUSAGE_SELF, &ru);
232 pd->init_time.tv_sec += ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
233 pd->init_time.tv_usec += ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
234 time_normalise (&pd->init_time);
235#endif
236 }
237
238 GetTime (&InitTime);
239 for (;;)
240 {
241 int len = 0;
242 char *base = look_pos;
243
244 while (look_pos != end_pos && *look_pos != TERMRECORD)
245 look_pos++;
246
247 while (look_pos == end_pos)
248 {
249 if (buf_left < MIN_BUF)
250 {
251 bcopy (base, buffer, end_pos - base);
252 look_pos = buffer + (end_pos - base);
253 buf_left = buf_size - (end_pos - base);
254 end_pos = look_pos;
255 base = buffer;
256 }
257 if (buf_left)
258 {
259 num = read (in_fd, end_pos, buf_left);
260 if (num < 0) num = 0; /* RJM - quick hack :-) */
261 if (num == 0)
262 if ((in_fd = open_next_file (in_fd)) != -1)
263 num = read (in_fd, end_pos, buf_left);
264 bytes_received += num;
265 buf_left -= num;
266 end_pos += num;
267 }
268 while (look_pos < end_pos && *look_pos != TERMRECORD)
269 look_pos++;
270 if (buf_left == 0 && base == buffer && look_pos == end_pos)
271 {
272 Message ("Unable to find document terminator (i.e ^B)"
273 " in the document");
274 FatalError (1, "The document is in excess of %d chars long",
275 look_pos - base);
276 }
277 if (!num)
278 break;
279 }
280 len = look_pos++ - base;
281
282 if (!num && base == end_pos)
283 break;
284
285 bytes_processed += len;
286
287#ifndef QUIET
288 if (!len)
289 Message ("Warning : Processing zero length document");
290#endif
291
292 for (pass = 0; pass < MAX_PASSES; pass++)
293 if (Passes & (1 << pass))
294 {
295 register pass_data *pd = &PassData[pass];
296
297#ifdef HAVE_TIMES
298 struct tms tims;
299 times (&tims);
300 pd->process_time -= tims.tms_utime + tims.tms_stime;
301#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
302 struct rusage ru;
303 register struct timeval *tv = &pd->process_time;
304
305 getrusage (RUSAGE_SELF, &ru);
306 tv->tv_sec -= ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
307 tv->tv_usec -= ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
308#endif
309
310 if (pd->process ((u_char *) base, len) == COMPERROR)
311 {
312 Message ("Error during processing of \"%s\"", pd->name);
313 if (Dump || Trace)
314 {
315 int i;
316 FILE *f = Trace ? Trace : stderr;
317 fprintf (f, "-=- * -=- * -=- * -=- * -=- * -=- * -=-\n");
318 for (i = 0; i < len; i++)
319 {
320 char ch = base[i];
321 if (ch == '\1' || ch == '\2')
322 ch = '\n';
323 putc (ch, f);
324 }
325 fprintf (f, "-=- * -=- * -=- * -=- * -=- * -=- * -=-\n");
326 }
327 if (Trace)
328 fprintf (Trace, "%10.0f bytes |%7lu docs | %s\n",
329 bytes_processed, num_docs,
330 ElapsedTime (&StartTime, NULL));
331 exit (1);
332 }
333
334#ifdef HAVE_TIMES
335 times (&tims);
336 pd->process_time += tims.tms_utime + tims.tms_stime;
337#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
338 getrusage (RUSAGE_SELF, &ru);
339 tv->tv_sec += ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
340 tv->tv_usec += ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
341#endif
342 }
343 num_docs++;
344 if (Trace)
345 {
346 block_bytes += (look_pos - base);
347 if (block_bytes >= trace)
348 {
349#ifdef HAVE_MALLINFO
350 struct mallinfo mi;
351 mi = mallinfo ();
352 block_bytes -= trace;
353 fprintf (Trace, "%10.0f bytes |%7lu docs |%7.3f Mb | %s\n",
354 bytes_processed, num_docs, mi.arena / 1024.0 / 1024.0,
355 ElapsedTime (&StartTime, NULL));
356#else
357 block_bytes -= trace;
358 fprintf (Trace, "%10.0f bytes |%7lu docs | %s\n",
359 bytes_processed, num_docs,
360 ElapsedTime (&StartTime, NULL));
361#endif
362 }
363 }
364 if (!num && look_pos - 1 == end_pos)
365 break;
366 }
367
368#ifndef HAVE_TIMES
369 for (pass = 0; pass < MAX_PASSES; pass++)
370 if (Passes & (1 << pass))
371 time_normalise (&PassData[pass].process_time);
372#endif
373
374 GetTime (&ProcTime);
375
376 for (pass = 0; pass < MAX_PASSES; pass++)
377 if (Passes & (1 << pass))
378 {
379 pass_data *pd = &PassData[pass];
380#ifdef HAVE_TIMES
381 struct tms tims;
382 times (&tims);
383 pd->done_time -= tims.tms_utime + tims.tms_stime;
384#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
385 struct rusage ru;
386
387 getrusage (RUSAGE_SELF, &ru);
388 pd->done_time.tv_sec -= ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
389 pd->done_time.tv_usec -= ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
390#endif
391 if (pd->done (file_name) == COMPERROR)
392 FatalError (1, "Error during done of \"%s\"", pd->name);
393
394#ifdef HAVE_TIMES
395 times (&tims);
396 pd->done_time += tims.tms_utime + tims.tms_stime;
397#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
398 getrusage (RUSAGE_SELF, &ru);
399 pd->done_time.tv_sec += ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
400 pd->done_time.tv_usec += ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
401 time_normalise (&pd->done_time);
402#endif
403 }
404 if (Trace)
405 {
406#ifdef HAVE_MALLINFO
407 struct mallinfo mi;
408 mi = mallinfo ();
409 fprintf (Trace, "%10.0f bytes |%7lu docs |%7.3f Mb | %s\n",
410 bytes_processed, num_docs, mi.arena / 1024.0 / 1024.0,
411 ElapsedTime (&StartTime, NULL));
412#else
413 fprintf (Trace, "%10.0f bytes |%7lu docs | %s\n",
414 bytes_processed, num_docs,
415 ElapsedTime (&StartTime, NULL));
416#endif
417 }
418
419 GetTime (&DoneTime);
420
421 Message ("");
422 Message ("%10s : init process done", "");
423 for (pass = 0; pass < MAX_PASSES; pass++)
424 if (Passes & (1 << pass))
425 {
426 pass_data *pd = &PassData[pass];
427 char it[15], pt[15], dt[15];
428#ifdef HAVE_TIMES
429 strcpy (it, cputime_string (pd->init_time));
430 strcpy (pt, cputime_string (pd->process_time));
431 strcpy (dt, cputime_string (pd->done_time));
432#else
433 strcpy (it, cputime_string (&pd->init_time));
434 strcpy (pt, cputime_string (&pd->process_time));
435 strcpy (dt, cputime_string (&pd->done_time));
436#endif
437 Message ("%-10s : %s %s %s", pd->name, it, pt, dt);
438 }
439 Message ("");
440 Message ("Init time : %s", ElapsedTime (&StartTime, &InitTime));
441 Message ("Process time : %s", ElapsedTime (&InitTime, &ProcTime));
442 Message ("Done time : %s", ElapsedTime (&ProcTime, &DoneTime));
443 Message ("Total time : %s", ElapsedTime (&StartTime, &DoneTime));
444 Message ("Documents : %u", num_docs);
445 Message ("Bytes received : %.0f", bytes_received);
446 Message ("Bytes processed : %.0f", bytes_processed);
447 Message ("Process Rate : %.1f kB per cpu second",
448 (double) bytes_processed / (ProcTime.CPUTime - InitTime.CPUTime) / 1024);
449 free (buffer);
450}
451
452
453
454void
455main (int argc, char **argv)
456{
457 int ch, in_fd;
458 char *filename = NULL;
459 FILE *Trace = NULL;
460
461 msg_prefix = argv[0];
462
463 opterr = 0;
464 while ((ch = getopt (argc, argv, "hC:WGSD123f:d:b:T:I:t:m:N:c:n:s:")) != -1)
465 {
466 switch (ch)
467 {
468 case 'G':
469 SkipSGML = 1;
470 break;
471 case 'S':
472 Passes |= SPECIAL;
473 break;
474 case '1':
475 InvfLevel = 1;
476 break;
477 case '2':
478 InvfLevel = 2;
479 break;
480 case '3':
481 InvfLevel = 3;
482 break;
483 case 'f':
484 filename = optarg;
485 break;
486 case 'n':
487 trace_name = optarg;
488 break;
489 case 'D':
490 Dump = 1;
491 break;
492 case 'W':
493 MakeWeights = 1;
494 break;
495 case 'd':
496 set_basepath (optarg);
497 break;
498 case 's':
499 stem_method = atoi (optarg) & STEMMER_MASK;
500 break;
501 case 'b':
502 buf_size = atoi (optarg) * 1024;
503 break;
504 case 'C':
505 comp_stat_point = atoi (optarg) * 1024;
506 break;
507 case 'c':
508 ChunkLimit = atoi (optarg);
509 break;
510 case 'm':
511 invf_buffer_size = (int) (atof (optarg) * 1024 * 1024);
512 break;
513 case 'I':
514 case 'N': /* N kept for compatability */
515 if (*optarg == '1')
516 Passes |= IVF_PASS_1;
517 else if (*optarg == '2')
518 Passes |= IVF_PASS_2;
519 else
520 usage ("Invalid pass number");
521 break;
522 case 'T':
523 if (*optarg == '1')
524 Passes |= TEXT_PASS_1;
525 else if (*optarg == '2')
526 Passes |= TEXT_PASS_2;
527 else
528 usage ("Invalid pass number");
529 break;
530 case 't':
531 trace = (unsigned long) (atof (optarg) * 1024 * 1024);
532 break;
533 case 'h':
534 case '?':
535 usage (NULL);
536 }
537 }
538
539 if (!filename || *filename == '\0')
540 FatalError (1, "A document collection name must be specified.");
541
542 if (buf_size < MIN_BUF)
543 FatalError (1, "The buffer size must exceed 1024 bytes.");
544
545 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2))
546 FatalError (1, "I1 and I2 cannot be done simultaneously.");
547
548 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2))
549 FatalError (1, "T1 and T2 cannot be done simultaneously.");
550
551 if (!Passes)
552 FatalError (1, "S, T1, T2, I1 or I2 must be specified.");
553
554 if (optind < argc)
555 {
556 if ((in_fd = open (argv[optind], O_RDONLY)) == -1)
557 FatalError (1, "Cannot open %s", argv[optind]);
558 files = &argv[optind + 1];
559 num_files = argc - (optind + 1);
560 }
561 else
562 in_fd = 0; /* stdin */
563
564
565 if (trace)
566 {
567 if (!trace_name)
568 trace_name = make_name (filename, TRACE_SUFFIX, NULL);
569 if (!(Trace = fopen (trace_name, "a")))
570 Message ("Unable to open \"%s\". No tracing will be done.", trace_name);
571 else
572 setbuf (Trace, NULL);
573 }
574 else
575 Trace = NULL;
576
577 if (comp_stat_point)
578 {
579 char *name = make_name (filename, COMPRESSION_STATS_SUFFIX, NULL);
580 if (!(Comp_Stats = fopen (name, "wb"))) /* [RPAP - Feb 97: WIN32 Port] */
581 Message ("Unable to open \"%s\". No comp. stats. will be generated.",
582 name);
583 }
584
585
586 if (Trace)
587 {
588 int i;
589 fprintf (Trace, "\n\n\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
590 for (i = 0; i < argc; i++)
591 fprintf (Trace, "%s ", argv[i]);
592 fprintf (Trace, "\n\n");
593 }
594
595 driver (in_fd, Trace, filename);
596
597 if (Trace)
598 fclose (Trace);
599
600 if (Comp_Stats)
601 fclose (Comp_Stats);
602
603 exit (0);
604}
Note: See TracBrowser for help on using the repository browser.