source: trunk/indexers/mg/src/text/gs3_mg_passes.c@ 7452

Last change on this file since 7452 was 7452, checked in by kjdon, 20 years ago

tidied up the setting filename and basepath path stuff

  • Property svn:keywords set to Author Date Id Revision
File size: 17.3 KB
Line 
1/**************************************************************************
2 *
3 * mg_passes.c -- Driver for the various passes
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: gs3_mg_passes.c 7452 2004-05-26 04:38:42Z kjdon $
21 *
22 **************************************************************************/
23/* this needs to come first */
24#include "sysfuncs.h"
25
26#ifdef HAVE_MALLINFO
27# include <malloc.h>
28#endif
29
30#include "memlib.h"
31#include "messages.h"
32#include "timing.h"
33
34#include "longlong.h"
35#include "stemmer.h"
36
37
38#include "mg_files.h"
39#include "mg.h"
40#include "build.h"
41#include "text.h"
42
43#include "words.h"
44
45/*
46 $Log$
47 Revision 1.3 2004/05/26 04:38:42 kjdon
48 tidied up the setting filename and basepath path stuff
49
50 Revision 1.2 2004/05/26 02:39:57 kjdon
51 some hacky changes - fix up under linux
52
53 Revision 1.1 2004/05/25 03:30:12 kjdon
54 new mg passes for gs3. I thought I had committed this already
55
56 Revision 1.2 2004/04/25 23:01:18 kjdon
57 added a new -M option to mg_passes, allowing maxnumeric to be altered - made this change to keep gsdl3 mg inline with gsdl2 mg.
58
59 Revision 1.1 2003/02/20 21:18:24 mdewsnip
60 Addition of MG package for search and retrieval
61
62 Revision 1.3 2001/09/21 12:46:42 kjm18
63 updated mg to be in line with mg_1.3f. Now uses long long for some variables
64 to enable indexing of very large collections.
65
66 Revision 1.2 2001/06/12 23:23:42 jrm21
67 fixed a bug where mg_passes segfaults when trying to print the usage message.
68
69 Revision 1.1 1999/08/10 21:18:12 sjboddie
70 renamed mg-1.3d directory mg
71
72 Revision 1.3 1998/12/17 09:12:53 rjmcnab
73
74 Altered mg to process utf-8 encoded Unicode. The main changes
75 are in the parsing of the input, the casefolding, and the stemming.
76
77 Revision 1.2 1998/11/25 07:55:47 rjmcnab
78
79 Modified mg so that you can specify the stemmer you want
80 to use via a command line option. You specify it to
81 mg_passes during the build process. The number of the
82 stemmer that you used is stored within the inverted
83 dictionary header and the stemmed dictionary header so
84 the correct stemmer is used in later stages of building
85 and querying.
86
87 Revision 1.1 1998/11/17 09:35:13 rjmcnab
88 *** empty log message ***
89
90 * Revision 1.3 1994/10/20 03:56:57 tes
91 * I have rewritten the boolean query optimiser and abstracted out the
92 * components of the boolean query.
93 *
94 * Revision 1.2 1994/09/20 04:41:52 tes
95 * For version 1.1
96 *
97
98*/
99
/* Revision-control identification string for this file. */
100static char *RCSID = "$Id: gs3_mg_passes.c 7452 2004-05-26 04:38:42Z kjdon $";
101
/* Number of entries in the PassData table. */
102#define MAX_PASSES 5
103
/* Bit flags OR'd into Passes.  The drivers test bit n with
   (1 << n), so each value must match the index of the corresponding
   entry in PassData. */
104#define SPECIAL 1
105#define TEXT_PASS_1 2
106#define TEXT_PASS_2 4
107#define IVF_PASS_1 8
108#define IVF_PASS_2 16
109
/* Smallest buf_size that main() accepts. */
110#define MIN_BUF 8192
/* Not referenced in the visible part of this file; presumably the
   record terminator used by the pass implementations -- TODO confirm. */
111#define TERMRECORD '\002'
112
/* Settings shared with the pass implementations (non-static). */
113unsigned long buf_size = 3 * 1024 * 1024; /* 3Mb; main() -b option, given in Kb */
114unsigned long invf_buffer_size = 5 * 1024 * 1024; /* 5Mb; main() -m option, given in Mb */
115unsigned long ChunkLimit = 0; /* main() -c option */
116char InvfLevel = 2; /* 1, 2 or 3; main() -1/-2/-3 or set_invf_level() */
117char SkipSGML = 0; /* main() -G option */
118char MakeWeights = 0; /* main() -W option */
119FILE *Comp_Stats = NULL; /* opened by main() when comp_stat_point is non-zero */
120int comp_stat_point = 0; /* main() -C option, multiplied by 1024 */
121mg_ullong bytes_processed = 0; /* sum of the lengths given to process_document() */
122mg_ullong bytes_received = 0; /* reported by finalise_driver(); not updated in this file */
123int stemmer_num = 0; /* default to the Lovins stemmer */
124int stem_method = 0; /* masked with STEMMER_MASK when set */
125FILE * Trace; /* trace log stream, or NULL when tracing is off */
126char * filename; /* document collection name passed to each pass */
127unsigned long num_docs = 0; /* documents seen by process_document() */
128unsigned long block_bytes = 0; /* bytes accumulated since the last trace line */
129
/* State private to this driver. */
130static char Passes = 0; /* bitmask of the pass flags defined above */
131static unsigned long trace = 0; /* byte interval between trace lines */
132static int Dump = 0; /* non-zero: dump a document that fails processing */
133static char **files = NULL; /* command-line arguments after the first input file */
134static int num_files = 0; /* number of entries in files */
135static char *trace_name = NULL; /* path of the trace file */
136
/* Description of one build pass: its entry points plus the CPU time
   accumulated in each phase.  The time fields are clock_t when
   HAVE_TIMES is defined and struct timeval otherwise. */
137typedef struct pass_data
138 {
139 char *name; /* label used in messages and the timing report */
140 int (*init) (char *); /* called with filename; COMPERROR signals failure */
141 int (*process) (u_char *, int); /* called with (document buffer, length) */
142 int (*done) (char *); /* called with filename; COMPERROR signals failure */
143#ifdef HAVE_TIMES
144 clock_t init_time;
145 clock_t process_time;
146 clock_t done_time;
147#else
148 struct timeval init_time;
149 struct timeval process_time;
150 struct timeval done_time;
151#endif
152 }
153pass_data;
154
/* Zero initialiser for the three time fields of pass_data, in the
   representation selected by HAVE_TIMES. */
155#ifdef HAVE_TIMES
156#define NULL_TIMES 0, 0, 0
157#else
158#define NULL_TIMES {0, 0}, {0, 0}, {0, 0}
159#endif
160
/* Table of passes.  Entry n is selected by bit (1 << n) of Passes, so
   the order here must match SPECIAL, TEXT_PASS_1, TEXT_PASS_2,
   IVF_PASS_1, IVF_PASS_2. */
161static pass_data PassData[MAX_PASSES] =
162{
163 {"special", init_special, process_special, done_special, NULL_TIMES},
164 {"text.pass1", init_text_1, process_text_1, done_text_1, NULL_TIMES},
165 {"text.pass2", init_text_2, process_text_2, done_text_2, NULL_TIMES},
166 {"ivf.pass1", init_ivf_1, process_ivf_1, done_ivf_1, NULL_TIMES},
167 {"ivf.pass2", init_ivf_2, process_ivf_2, done_ivf_2, NULL_TIMES},
168};
169
170static char *usage_str = "\nUSAGE:\n"
171" %s [-h] [-G] [-D] [-1|-2|-3] [-T1] [-T2] [-I1] [-I2] [-N1]\n"
172" %*s [-N2] [-W] [-S] [-b buffer-size] [-d dictionary-directory]\n"
173" %*s [-t trace-point Mb] [-m invf-memory] [-c chunk-limit]\n"
174" %*s [-n trace-name] [-C comp-stat-size] [-s stem_method]\n"
175" %*s [-a stemmer] [-M max-numeric] -f doc-collection-name\n";
176
177
178static void
179usage (char *err)
180{
181 if (err)
182 Message (err);
183 fprintf (stderr, usage_str, msg_prefix, strlen (msg_prefix), "",
184 strlen (msg_prefix), "",strlen (msg_prefix), "",
185 strlen (msg_prefix),"");
186 exit (1);
187}
188
189
190
191
/* Disabled helper (compiled out by "#if 0"): formats u as decimal with
   a comma between groups of three digits and returns a pointer to a
   static buffer, so each call overwrites the previous result.
   NOTE(review): the arguments are unsigned long but the formats use
   %u / %03u; this would need %lu / %03lu before being re-enabled. */
192#if 0
193static char *
194str_comma (unsigned long u)
195{
196 static char buf[20];
197 unsigned long a, b, c, d;
 /* Peel off the billions, millions and thousands groups in turn. */
198 a = u / 1000000000;
199 u -= a * 1000000000;
200 b = u / 1000000;
201 u -= b * 1000000;
202 c = u / 1000;
203 u -= c * 1000;
204 d = u;
205
 /* Start printing at the first non-zero group. */
206 if (a)
207 sprintf (buf, "%u,%03u,%03u,%03u", a, b, c, d);
208 else if (b)
209 sprintf (buf, "%u,%03u,%03u", b, c, d);
210 else if (c)
211 sprintf (buf, "%u,%03u", c, d);
212 else
213 sprintf (buf, "%u", d);
214 return (buf);
215}
216#endif
217
218
219
220/*
221 int
222 open_next_file (int in_fd)
223 {
224 if (in_fd > 0)
225 close (in_fd);
226 if (num_files == 0)
227 return (-1);
228 if ((in_fd = open (files[0], O_RDONLY)) == -1)
229 FatalError (1, "Cannot open %s", files[0]);
230 files++;
231 num_files--;
232 return (in_fd);
233 }
234*/
235
236void clear_variables() {
237
238 buf_size = 3 * 1024 * 1024; /* 3Mb */
239 invf_buffer_size = 5 * 1024 * 1024; /* 5Mb */
240 ChunkLimit = 0;
241 InvfLevel = 2;
242 SkipSGML = 0;
243 MakeWeights = 0;
244 Comp_Stats = NULL;
245 comp_stat_point = 0;
246 bytes_processed = 0;
247 bytes_received = 0;
248 stemmer_num = 0; /* default to the lovin stemmer */
249 stem_method = 0;
250 Trace = NULL;
251 filename = NULL;
252 num_docs = 0;
253 block_bytes = 0;
254
255 Passes = 0;
256 trace = 0;
257 Dump = 0;
258 files = NULL;
259 num_files = 0;
260 trace_name = NULL;
261
262
263}
264void set_invf_level(char level) {
265
266 switch (level) {
267 case '1':
268 InvfLevel = 1;
269 break;
270 case '2':
271 InvfLevel = 2;
272 break;
273 case '3':
274 InvfLevel = 3;
275 break;
276 }
277
278}
279void set_stem_options(char * stemmer, int method) {
280 stemmer_num = stemmernumber (stemmer);
281 printf("stemmer num set to %d\n", stemmer_num);
282 stem_method = method & STEMMER_MASK;
283
284}
285
286void set_filename(char * filen) {
287 int len = strlen(filen);
288 if (filename) {
289 Xfree (filename);
290 filename = NULL;
291 }
292 filename = Xstrdup (filen);
293 // put this here for now
294 SkipSGML=0;
295 Dump=1;
296 trace = 512;
297 if (!trace_name)
298 trace_name = make_name (filename, TRACE_SUFFIX, NULL);
299 if (!(Trace = fopen (trace_name, "a")))
300 Message ("Unable to open \"%s\". No tracing will be done.", trace_name);
301 else
302 setbuf (Trace, NULL);
303
304}
305
306
307void add_pass (char pass_type, char pass_num) {
308
309 switch(pass_type) {
310 case 'S':
311 Passes |= SPECIAL;
312 break;
313 case 'I':
314 case 'N':
315 if (pass_num == '1')
316 Passes |= IVF_PASS_1;
317 else if (pass_num == '2')
318 Passes |= IVF_PASS_2;
319 else
320 fprintf(stderr, "Invalid pass number %c for pass type %c\n", pass_num, pass_type);
321 break;
322 case 'T':
323 if (pass_num == '1')
324 Passes |= TEXT_PASS_1;
325 else if (pass_num == '2')
326 Passes |= TEXT_PASS_2;
327 else
328 fprintf(stderr, "Invalid pass number %c for pass type %c\n", pass_num, pass_type);
329 break;
330 }
331
332}
333ProgTime StartTime, InitTime, ProcTime, DoneTime;
334
335void
336init_driver ()
337{
338 int pass;
339
340 GetTime (&StartTime);
341
342 for (pass = 0; pass < MAX_PASSES; pass++) {
343 if (Passes & (1 << pass)) {
344 pass_data *pd = &PassData[pass];
345#ifdef HAVE_TIMES
346 struct tms tims;
347 times (&tims);
348 pd->init_time -= tims.tms_utime + tims.tms_stime;
349#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
350 struct rusage ru;
351
352 getrusage (RUSAGE_SELF, &ru);
353 pd->init_time.tv_sec -= ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
354 pd->init_time.tv_usec -= ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
355#endif
356 if (pd->init (filename) == COMPERROR)
357 FatalError (1, "Error during init of \"%s\"", pd->name);
358
359#ifdef HAVE_TIMES
360 times (&tims);
361 pd->init_time += tims.tms_utime + tims.tms_stime;
362#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
363 getrusage (RUSAGE_SELF, &ru);
364 pd->init_time.tv_sec += ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
365 pd->init_time.tv_usec += ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
366 time_normalise (&pd->init_time);
367#endif
368 }
369 }
370 GetTime (&InitTime);
371}
372
373
374void process_document(u_char *buffer, int len) {
375 int pass;
376 bytes_processed += len;
377
378 printf("process doc, len=%d\n",len);
379#ifndef QUIET
380 if (!len)
381 Message ("Warning : Processing zero length document");
382#endif
383
384 for (pass = 0; pass < MAX_PASSES; pass++) {
385 if (Passes & (1 << pass))
386 {
387 register pass_data *pd = &PassData[pass];
388
389#ifdef HAVE_TIMES
390 struct tms tims;
391 times (&tims);
392 pd->process_time -= tims.tms_utime + tims.tms_stime;
393#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
394 struct rusage ru;
395 register struct timeval *tv = &pd->process_time;
396
397 getrusage (RUSAGE_SELF, &ru);
398 tv->tv_sec -= ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
399 tv->tv_usec -= ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
400#endif
401 if (pd->process ((u_char *) buffer, len) == COMPERROR)
402 {
403 Message ("Error during processing of \"%s\"", pd->name);
404 if (Dump || Trace)
405 {
406 int i;
407 FILE *f = Trace ? Trace : stderr;
408 fprintf (f, "-=- * -=- * -=- * -=- * -=- * -=- * -=-\n");
409 for (i = 0; i < len; i++)
410 {
411 char ch = buffer[i];
412 if (ch == '\1' || ch == '\2')
413 ch = '\n';
414 putc (ch, f);
415 }
416 fprintf (f, "-=- * -=- * -=- * -=- * -=- * -=- * -=-\n");
417 }
418 if (Trace)
419 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs | %s\n",
420 bytes_processed, num_docs,
421 ElapsedTime (&StartTime, NULL));
422 exit (1);
423 }
424#ifdef HAVE_TIMES
425 times (&tims);
426 pd->process_time += tims.tms_utime + tims.tms_stime;
427#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
428 getrusage (RUSAGE_SELF, &ru);
429 tv->tv_sec += ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
430 tv->tv_usec += ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
431#endif
432 }
433 }
434 num_docs++;
435 if (Trace)
436 {
437 block_bytes += len;
438 if (block_bytes >= trace)
439 {
440#ifdef HAVE_MALLINFO
441 struct mallinfo mi;
442 mi = mallinfo ();
443 block_bytes -= trace;
444 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs |%7.3f Mb | %s\n",
445 bytes_processed, num_docs, mi.arena / 1024.0 / 1024.0,
446 ElapsedTime (&StartTime, NULL));
447#else
448 block_bytes -= trace;
449 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs | %s\n",
450 bytes_processed, num_docs,
451 ElapsedTime (&StartTime, NULL));
452#endif
453 }
454 }
455}
456
457void finalise_driver() {
458 int pass;
459#ifndef HAVE_TIMES
460 for (pass = 0; pass < MAX_PASSES; pass++)
461 if (Passes & (1 << pass))
462 time_normalise (&PassData[pass].process_time);
463#endif
464
465 GetTime (&ProcTime);
466
467 for (pass = 0; pass < MAX_PASSES; pass++)
468 if (Passes & (1 << pass))
469 {
470 pass_data *pd = &PassData[pass];
471#ifdef HAVE_TIMES
472 struct tms tims;
473 times (&tims);
474 pd->done_time -= tims.tms_utime + tims.tms_stime;
475#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
476 struct rusage ru;
477
478 getrusage (RUSAGE_SELF, &ru);
479 pd->done_time.tv_sec -= ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
480 pd->done_time.tv_usec -= ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
481#endif
482 if (pd->done (filename) == COMPERROR)
483 FatalError (1, "Error during done of \"%s\"", pd->name);
484
485#ifdef HAVE_TIMES
486 times (&tims);
487 pd->done_time += tims.tms_utime + tims.tms_stime;
488#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
489 getrusage (RUSAGE_SELF, &ru);
490 pd->done_time.tv_sec += ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
491 pd->done_time.tv_usec += ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
492 time_normalise (&pd->done_time);
493#endif
494 }
495 if (Trace)
496 {
497#ifdef HAVE_MALLINFO
498 struct mallinfo mi;
499 mi = mallinfo ();
500 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs |%7.3f Mb | %s\n",
501 bytes_processed, num_docs, mi.arena / 1024.0 / 1024.0,
502 ElapsedTime (&StartTime, NULL));
503#else
504 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs | %s\n",
505 bytes_processed, num_docs,
506 ElapsedTime (&StartTime, NULL));
507#endif
508 }
509
510 GetTime (&DoneTime);
511
512 Message ("");
513 Message ("%10s : init process done", "");
514 for (pass = 0; pass < MAX_PASSES; pass++)
515 if (Passes & (1 << pass))
516 {
517 pass_data *pd = &PassData[pass];
518 char it[15], pt[15], dt[15];
519#ifdef HAVE_TIMES
520 strcpy (it, cputime_string (pd->init_time));
521 strcpy (pt, cputime_string (pd->process_time));
522 strcpy (dt, cputime_string (pd->done_time));
523#else
524 strcpy (it, cputime_string (&pd->init_time));
525 strcpy (pt, cputime_string (&pd->process_time));
526 strcpy (dt, cputime_string (&pd->done_time));
527#endif
528 Message ("%-10s : %s %s %s", pd->name, it, pt, dt);
529 }
530 Message ("");
531 Message ("Init time : %s", ElapsedTime (&StartTime, &InitTime));
532 Message ("Process time : %s", ElapsedTime (&InitTime, &ProcTime));
533 Message ("Done time : %s", ElapsedTime (&ProcTime, &DoneTime));
534 Message ("Total time : %s", ElapsedTime (&StartTime, &DoneTime));
535 Message ("Documents : %u", num_docs);
536 Message ("Bytes received : %" ULL_FS, bytes_received);
537 Message ("Bytes processed : %" ULL_FS, bytes_processed);
538 Message ("Process Rate : %.1f kB per cpu second",
539 (double) bytes_processed / (ProcTime.CPUTime - InitTime.CPUTime) / 1024);
540 //free (buffer);
541}
542
543
544
545int main (int argc, char **argv)
546{
547 int ch, in_fd;
548
549 msg_prefix = argv[0];
550
551 opterr = 0;
552 while ((ch = getopt (argc, argv, "hC:WGSD123f:d:b:T:I:t:m:N:c:n:s:a:M:")) != -1)
553 {
554 switch (ch)
555 {
556 case 'G':
557 SkipSGML = 1;
558 break;
559 case 'S':
560 Passes |= SPECIAL;
561 break;
562 case '1':
563 InvfLevel = 1;
564 break;
565 case '2':
566 InvfLevel = 2;
567 break;
568 case '3':
569 InvfLevel = 3;
570 break;
571 case 'f':
572 filename = optarg;
573 break;
574 case 'n':
575 trace_name = optarg;
576 break;
577 case 'D':
578 Dump = 1;
579 break;
580 case 'W':
581 MakeWeights = 1;
582 break;
583 case 'd':
584 set_basepath (optarg);
585 break;
586 case 'a':
587 stemmer_num = stemmernumber (optarg);
588 break;
589 case 's':
590 stem_method = atoi (optarg) & STEMMER_MASK;
591 break;
592 case 'b':
593 buf_size = atoi (optarg) * 1024;
594 break;
595 case 'C':
596 comp_stat_point = atoi (optarg) * 1024;
597 break;
598 case 'c':
599 ChunkLimit = atoi (optarg);
600 break;
601 case 'm':
602 invf_buffer_size = (int) (atof (optarg) * 1024 * 1024);
603 break;
604 case 'I':
605 case 'N': /* N kept for compatability */
606 if (*optarg == '1')
607 Passes |= IVF_PASS_1;
608 else if (*optarg == '2')
609 Passes |= IVF_PASS_2;
610 else
611 usage ("Invalid pass number");
612 break;
613 case 'T':
614 if (*optarg == '1')
615 Passes |= TEXT_PASS_1;
616 else if (*optarg == '2')
617 Passes |= TEXT_PASS_2;
618 else
619 usage ("Invalid pass number");
620 break;
621 case 't':
622 trace = (unsigned long) (atof (optarg) * 1024 * 1024);
623 break;
624 case 'M':
625 SetEnv ("maxnumeric", optarg, NULL);
626 break;
627 case 'h':
628 case '?':
629 usage (NULL);
630 }
631 }
632
633 if (!filename || *filename == '\0')
634 FatalError (1, "A document collection name must be specified.");
635
636 if (buf_size < MIN_BUF)
637 FatalError (1, "The buffer size must exceed 1024 bytes.");
638
639 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2))
640 FatalError (1, "I1 and I2 cannot be done simultaneously.");
641
642 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2))
643 FatalError (1, "T1 and T2 cannot be done simultaneously.");
644
645 if (!Passes)
646 FatalError (1, "S, T1, T2, I1 or I2 must be specified.");
647
648 if (optind < argc)
649 {
650 if ((in_fd = open (argv[optind], O_RDONLY)) == -1)
651 FatalError (1, "Cannot open %s", argv[optind]);
652 files = &argv[optind + 1];
653 num_files = argc - (optind + 1);
654 }
655 else
656 in_fd = 0; /* stdin */
657
658
659 if (trace)
660 {
661 if (!trace_name)
662 trace_name = make_name (filename, TRACE_SUFFIX, NULL);
663 if (!(Trace = fopen (trace_name, "a")))
664 Message ("Unable to open \"%s\". No tracing will be done.", trace_name);
665 else
666 setbuf (Trace, NULL);
667 }
668 else
669 Trace = NULL;
670
671 if (comp_stat_point)
672 {
673 char *name = make_name (filename, COMPRESSION_STATS_SUFFIX, NULL);
674 if (!(Comp_Stats = fopen (name, "wb"))) /* [RPAP - Feb 97: WIN32 Port] */
675 Message ("Unable to open \"%s\". No comp. stats. will be generated.",
676 name);
677 }
678
679
680 if (Trace)
681 {
682 int i;
683 fprintf (Trace, "\n\n\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
684 for (i = 0; i < argc; i++)
685 fprintf (Trace, "%s ", argv[i]);
686 fprintf (Trace, "\n\n");
687 }
688
689 init_driver ();
690 /* here we have to do something to process docs from stdin */
691 finalise_driver();
692 if (Trace)
693 fclose (Trace);
694
695 if (Comp_Stats)
696 fclose (Comp_Stats);
697
698 return 0;
699}
Note: See TracBrowser for help on using the repository browser.