source: main/branches/64_bit_Greenstone/greenstone2/common-src/indexers/mg/src/text/mg_passes.c@ 23508

Last change on this file since 23508 was 23508, checked in by sjm84, 13 years ago

Committing 64 bit changes into the branch

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 16.3 KB
Line 
1/**************************************************************************
2 *
3 * mg_passes.c -- Driver for the various passes
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_passes.c 23508 2010-12-17 01:04:10Z sjm84 $
21 *
22 **************************************************************************/
23
24#include "sysfuncs.h"
25
26#ifdef HAVE_MALLINFO
27# include <malloc.h>
28#endif
29#include <stdlib.h>
30#include "memlib.h"
31#include "messages.h"
32#include "timing.h"
33
34#include "longlong.h"
35
36#include "mg_files.h"
37#include "mg.h"
38#include "build.h"
39#include "text.h"
40#include "stemmer.h"
41
42#include "words.h"
43
44/*
45 $Log$
46 Revision 1.3 2004/11/29 03:15:13 kjdon
47 added some changes made by Emanuel Dejanu (Simple Words)
48
49 Revision 1.2 2004/04/25 23:01:18 kjdon
50 added a new -M option to mg_passes, allowing maxnumeric to be altered - made this change to keep gsdl3 mg inline with gsdl2 mg.
51
52 Revision 1.1 2003/02/20 21:18:24 mdewsnip
53 Addition of MG package for search and retrieval
54
55 Revision 1.3 2001/09/21 12:46:42 kjm18
56 updated mg to be in line with mg_1.3f. Now uses long long for some variables
57 to enable indexing of very large collections.
58
59 Revision 1.2 2001/06/12 23:23:42 jrm21
60 fixed a bug where mg_passes segfaults when trying to print the usage message.
61
62 Revision 1.1 1999/08/10 21:18:12 sjboddie
63 renamed mg-1.3d directory mg
64
65 Revision 1.3 1998/12/17 09:12:53 rjmcnab
66
67 Altered mg to process utf-8 encoded Unicode. The main changes
68 are in the parsing of the input, the casefolding, and the stemming.
69
70 Revision 1.2 1998/11/25 07:55:47 rjmcnab
71
72 Modified mg to that you can specify the stemmer you want
73 to use via a command line option. You specify it to
74 mg_passes during the build process. The number of the
75 stemmer that you used is stored within the inverted
76 dictionary header and the stemmed dictionary header so
77 the correct stemmer is used in later stages of building
78 and querying.
79
80 Revision 1.1 1998/11/17 09:35:13 rjmcnab
81 *** empty log message ***
82
83 * Revision 1.3 1994/10/20 03:56:57 tes
84 * I have rewritten the boolean query optimiser and abstracted out the
85 * components of the boolean query.
86 *
87 * Revision 1.2 1994/09/20 04:41:52 tes
88 * For version 1.1
89 *
90 */
91
/* RCS identification string for this file. */
static char *RCSID = "$Id: mg_passes.c 23508 2010-12-17 01:04:10Z sjm84 $";

/* Number of entries in the PassData table. */
#define MAX_PASSES 5

/* Bit flags stored in `Passes'.  Bit N (value 1 << N) selects entry N of
   the PassData table, so the values must stay in the same order as the
   table. */
#define SPECIAL 1
#define TEXT_PASS_1 2
#define TEXT_PASS_2 4
#define IVF_PASS_1 8
#define IVF_PASS_2 16

/* Smallest read buffer accepted by main(); driver() also compacts its
   buffer whenever fewer than this many bytes of it are free. */
#define MIN_BUF 8192
/* Byte that terminates a document in the input stream (control-B). */
#define TERMRECORD '\002'
104
/* Settings and counters with external linkage.  Several are only assigned
   in this file (from the command line in main()); presumably the pass
   implementations read them -- confirm against their sources. */
mg_u_long buf_size = 3 * 1024 * 1024;   /* 3Mb; size in bytes of driver()'s read buffer (-b, given in units of 1024 bytes) */
mg_u_long invf_buffer_size = 5 * 1024 * 1024;   /* 5Mb; set by -m, given in megabytes */
mg_u_long ChunkLimit = 0;       /* set by -c */
char InvfLevel = 2;             /* set to 1, 2 or 3 by -1, -2 or -3 */
char SkipSGML = 0;              /* set to 1 by -G */
char MakeWeights = 0;           /* set to 1 by -W */
FILE *Comp_Stats = NULL;        /* stream opened by main() when comp_stat_point is non-zero */
int comp_stat_point = 0;        /* -C argument multiplied by 1024 */
mg_ullong bytes_processed = 0;  /* sum of the document lengths handed to the passes by driver() */
mg_ullong bytes_received = 0;   /* number of bytes driver() has read from its input */
int stemmer_num = 0;            /* default to the lovin stemmer */
int stem_method = 0;            /* -s argument masked with STEMMER_MASK */

/* State private to this file. */
static char Passes = 0;         /* bit mask of the passes to run (SPECIAL, TEXT_PASS_1, ...) */
static mg_u_long trace = 0;     /* bytes between trace lines (-t, given in megabytes); 0 disables tracing */
static int Dump = 0;            /* set by -D: dump the offending document when a pass fails */
static char **files = NULL;     /* input file names still to be opened by open_next_file() */
static int num_files = 0;       /* number of entries left in `files' */
static char *trace_name = NULL; /* trace file name (-n); main() derives one if it is not given */
124
125
/* Description of one processing pass: a label, the three callbacks that
   driver() invokes, and the CPU time driver() has accumulated in each of
   them.  A callback reports failure by returning COMPERROR. */
typedef struct pass_data
  {
    char *name;                         /* label used in error messages and the timing table */
    int (*init) (char *);               /* called once, with the collection name, before any document */
    int (*process) (u_char *, int);     /* called for every document with its first byte and its length */
    int (*done) (char *);               /* called once, with the collection name, after the last document */
#ifdef HAVE_TIMES
    /* Sums of tms_utime + tms_stime as reported by times(). */
    clock_t init_time;
    clock_t process_time;
    clock_t done_time;
#else
    /* Sums of user + system time as reported by getrusage(), when that is
       available. */
    struct timeval init_time;
    struct timeval process_time;
    struct timeval done_time;
#endif
  }
pass_data;
143
/* Zero initialiser for the three timing fields that end a pass_data; its
   shape follows the field type selected by HAVE_TIMES. */
#ifdef HAVE_TIMES
#define NULL_TIMES 0, 0, 0
#else
#define NULL_TIMES {0, 0}, {0, 0}, {0, 0}
#endif

/* The available passes.  Entry N is run when bit (1 << N) is set in
   `Passes', so the order here must match the SPECIAL, TEXT_PASS_1,
   TEXT_PASS_2, IVF_PASS_1 and IVF_PASS_2 flag values. */
static pass_data PassData[MAX_PASSES] =
{
  {"special", init_special, process_special, done_special, NULL_TIMES},
  {"text.pass1", init_text_1, process_text_1, done_text_1, NULL_TIMES},
  {"text.pass2", init_text_2, process_text_2, done_text_2, NULL_TIMES},
  {"ivf.pass1", init_ivf_1, process_ivf_1, done_ivf_1, NULL_TIMES},
  {"ivf.pass2", init_ivf_2, process_ivf_2, done_ivf_2, NULL_TIMES},
};
158
/* printf-style template for the usage message.  It consumes the program
   name (%s) followed by four (field width, string) argument pairs, one
   for each "%*s"; usage() passes the length of the program name and an
   empty string so that the continuation lines are padded to line up. */
static char *usage_str = "\nUSAGE:\n"
" %s [-h] [-G] [-D] [-1|-2|-3] [-T1] [-T2] [-I1] [-I2] [-N1]\n"
" %*s [-N2] [-W] [-S] [-b buffer-size] [-d dictionary-directory]\n"
" %*s [-t trace-point Mb] [-m invf-memory] [-c chunk-limit]\n"
" %*s [-n trace-name] [-C comp-stat-size] [-s stem_method]\n"
" %*s [-a stemmer] [-M max-numeric] -f doc-collection-name\n";
165
166
167static void
168usage (char *err)
169{
170 if (err)
171 Message (err);
172 fprintf (stderr, usage_str, msg_prefix, strlen (msg_prefix), "",
173 strlen (msg_prefix), "",strlen (msg_prefix), "",
174 strlen (msg_prefix),"");
175 exit (1);
176}
177
178
179
180
#if 0
/* NOTE: this helper is compiled out by the surrounding "#if 0" and is not
   referenced anywhere in this file.

   str_comma -- format `u' in decimal with a comma between each group of
   three digits, handling at most four groups.  The result lives in a
   static buffer, so every call overwrites the previous result.

   NOTE(review): the "%u" conversions assume mg_u_long is unsigned int --
   confirm before re-enabling this code. */
static char *
str_comma (mg_u_long u)
{
  static char buf[20];
  mg_u_long a, b, c, d;
  /* Peel off the billions, millions and thousands groups in turn. */
  a = u / 1000000000;
  u -= a * 1000000000;
  b = u / 1000000;
  u -= b * 1000000;
  c = u / 1000;
  u -= c * 1000;
  d = u;

  /* Only the leading non-zero group is printed without zero padding. */
  if (a)
    sprintf (buf, "%u,%03u,%03u,%03u", a, b, c, d);
  else if (b)
    sprintf (buf, "%u,%03u,%03u", b, c, d);
  else if (c)
    sprintf (buf, "%u,%03u", c, d);
  else
    sprintf (buf, "%u", d);
  return (buf);
}
#endif
206
207
208
209
210int
211open_next_file (int in_fd)
212{
213 if (in_fd > 0)
214 close (in_fd);
215 if (num_files == 0)
216 return (-1);
217 if ((in_fd = open (files[0], O_RDONLY)) == -1)
218 FatalError (1, "Cannot open %s", files[0]);
219 files++;
220 num_files--;
221 return (in_fd);
222}
223
224
225static void
226driver (int in_fd, FILE * Trace, char *file_name)
227{
228 int pass, num = 1;
229
230 char *buffer = Xmalloc (buf_size);
231 mg_u_long num_docs = 0;
232 mg_u_long block_bytes = 0;
233 register int buf_left = buf_size;
234 register char *look_pos = buffer;
235 register char *end_pos = buffer;
236
237 ProgTime StartTime, InitTime, ProcTime, DoneTime;
238
239 GetTime (&StartTime);
240
241 for (pass = 0; pass < MAX_PASSES; pass++)
242 if (Passes & (1 << pass))
243 {
244 pass_data *pd = &PassData[pass];
245#ifdef HAVE_TIMES
246 struct tms tims;
247 times (&tims);
248 pd->init_time -= tims.tms_utime + tims.tms_stime;
249#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
250 struct rusage ru;
251
252 getrusage (RUSAGE_SELF, &ru);
253 pd->init_time.tv_sec -= ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
254 pd->init_time.tv_usec -= ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
255#endif
256 if (pd->init (file_name) == COMPERROR)
257 FatalError (1, "Error during init of \"%s\"", pd->name);
258
259#ifdef HAVE_TIMES
260 times (&tims);
261 pd->init_time += tims.tms_utime + tims.tms_stime;
262#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
263 getrusage (RUSAGE_SELF, &ru);
264 pd->init_time.tv_sec += ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
265 pd->init_time.tv_usec += ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
266 time_normalise (&pd->init_time);
267#endif
268 }
269
270 GetTime (&InitTime);
271 for (;;)
272 {
273 int len = 0;
274 char *base = look_pos;
275
276 while (look_pos != end_pos && *look_pos != TERMRECORD)
277 look_pos++;
278
279 while (look_pos == end_pos)
280 {
281 if (buf_left < MIN_BUF)
282 {
283 bcopy (base, buffer, end_pos - base);
284 look_pos = buffer + (end_pos - base);
285 buf_left = buf_size - (end_pos - base);
286 end_pos = look_pos;
287 base = buffer;
288 }
289 if (buf_left)
290 {
291 num = read (in_fd, end_pos, buf_left);
292 if (num < 0) num = 0; /* RJM - quick hack :-) */
293 if (num == 0)
294 if ((in_fd = open_next_file (in_fd)) != -1)
295 num = read (in_fd, end_pos, buf_left);
296 bytes_received += num;
297 buf_left -= num;
298 end_pos += num;
299 }
300 while (look_pos < end_pos && *look_pos != TERMRECORD)
301 look_pos++;
302 if (buf_left == 0 && base == buffer && look_pos == end_pos)
303 {
304 Message ("Unable to find document terminator (i.e ^B)"
305 " in the document");
306 FatalError (1, "The document is in excess of %d chars long",
307 look_pos - base);
308 }
309 if (!num)
310 break;
311 }
312 len = look_pos++ - base;
313
314 if (!num && base == end_pos)
315 break;
316
317 bytes_processed += len;
318
319#ifndef QUIET
320 if (!len)
321 Message ("Warning : Processing zero length document");
322#endif
323
324 for (pass = 0; pass < MAX_PASSES; pass++)
325 if (Passes & (1 << pass))
326 {
327 register pass_data *pd = &PassData[pass];
328
329#ifdef HAVE_TIMES
330 struct tms tims;
331 times (&tims);
332 pd->process_time -= tims.tms_utime + tims.tms_stime;
333#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
334 struct rusage ru;
335 register struct timeval *tv = &pd->process_time;
336
337 getrusage (RUSAGE_SELF, &ru);
338 tv->tv_sec -= ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
339 tv->tv_usec -= ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
340#endif
341
342 if (pd->process ((u_char *) base, len) == COMPERROR)
343 {
344 Message ("Error during processing of \"%s\"", pd->name);
345 if (Dump || Trace)
346 {
347 int i;
348 FILE *f = Trace ? Trace : stderr;
349 fprintf (f, "-=- * -=- * -=- * -=- * -=- * -=- * -=-\n");
350 for (i = 0; i < len; i++)
351 {
352 char ch = base[i];
353 if (ch == '\1' || ch == '\2')
354 ch = '\n';
355 putc (ch, f);
356 }
357 fprintf (f, "-=- * -=- * -=- * -=- * -=- * -=- * -=-\n");
358 }
359 if (Trace)
360 fprintf (Trace, "%11" ULL_FS " bytes |%7u docs | %s\n",
361 bytes_processed, num_docs,
362 ElapsedTime (&StartTime, NULL));
363 exit (1);
364 }
365
366#ifdef HAVE_TIMES
367 times (&tims);
368 pd->process_time += tims.tms_utime + tims.tms_stime;
369#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
370 getrusage (RUSAGE_SELF, &ru);
371 tv->tv_sec += ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
372 tv->tv_usec += ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
373#endif
374 }
375 num_docs++;
376 if (Trace)
377 {
378 block_bytes += (look_pos - base);
379 if (block_bytes >= trace)
380 {
381#ifdef HAVE_MALLINFO
382 struct mallinfo mi;
383 mi = mallinfo ();
384 block_bytes -= trace;
385 fprintf (Trace, "%11" ULL_FS " bytes |%7u docs |%7.3f Mb | %s\n",
386 bytes_processed, num_docs, mi.arena / 1024.0 / 1024.0,
387 ElapsedTime (&StartTime, NULL));
388#else
389 block_bytes -= trace;
390 fprintf (Trace, "%11" ULL_FS " bytes |%7u docs | %s\n",
391 bytes_processed, num_docs,
392 ElapsedTime (&StartTime, NULL));
393#endif
394 }
395 }
396 if (!num && look_pos - 1 == end_pos)
397 break;
398 }
399
400#ifndef HAVE_TIMES
401 for (pass = 0; pass < MAX_PASSES; pass++)
402 if (Passes & (1 << pass))
403 time_normalise (&PassData[pass].process_time);
404#endif
405
406 GetTime (&ProcTime);
407
408 for (pass = 0; pass < MAX_PASSES; pass++)
409 if (Passes & (1 << pass))
410 {
411 pass_data *pd = &PassData[pass];
412#ifdef HAVE_TIMES
413 struct tms tims;
414 times (&tims);
415 pd->done_time -= tims.tms_utime + tims.tms_stime;
416#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
417 struct rusage ru;
418
419 getrusage (RUSAGE_SELF, &ru);
420 pd->done_time.tv_sec -= ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
421 pd->done_time.tv_usec -= ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
422#endif
423 if (pd->done (file_name) == COMPERROR)
424 FatalError (1, "Error during done of \"%s\"", pd->name);
425
426#ifdef HAVE_TIMES
427 times (&tims);
428 pd->done_time += tims.tms_utime + tims.tms_stime;
429#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
430 getrusage (RUSAGE_SELF, &ru);
431 pd->done_time.tv_sec += ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
432 pd->done_time.tv_usec += ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
433 time_normalise (&pd->done_time);
434#endif
435 }
436 if (Trace)
437 {
438#ifdef HAVE_MALLINFO
439 struct mallinfo mi;
440 mi = mallinfo ();
441 fprintf (Trace, "%11" ULL_FS " bytes |%7u docs |%7.3f Mb | %s\n",
442 bytes_processed, num_docs, mi.arena / 1024.0 / 1024.0,
443 ElapsedTime (&StartTime, NULL));
444#else
445 fprintf (Trace, "%11" ULL_FS " bytes |%7u docs | %s\n",
446 bytes_processed, num_docs,
447 ElapsedTime (&StartTime, NULL));
448#endif
449 }
450
451 GetTime (&DoneTime);
452
453 Message ("");
454 Message ("%10s : init process done", "");
455 for (pass = 0; pass < MAX_PASSES; pass++)
456 if (Passes & (1 << pass))
457 {
458 pass_data *pd = &PassData[pass];
459 char it[15], pt[15], dt[15];
460#ifdef HAVE_TIMES
461 strcpy (it, cputime_string (pd->init_time));
462 strcpy (pt, cputime_string (pd->process_time));
463 strcpy (dt, cputime_string (pd->done_time));
464#else
465 strcpy (it, cputime_string (&pd->init_time));
466 strcpy (pt, cputime_string (&pd->process_time));
467 strcpy (dt, cputime_string (&pd->done_time));
468#endif
469 Message ("%-10s : %s %s %s", pd->name, it, pt, dt);
470 }
471 Message ("");
472 Message ("Init time : %s", ElapsedTime (&StartTime, &InitTime));
473 Message ("Process time : %s", ElapsedTime (&InitTime, &ProcTime));
474 Message ("Done time : %s", ElapsedTime (&ProcTime, &DoneTime));
475 Message ("Total time : %s", ElapsedTime (&StartTime, &DoneTime));
476 Message ("Documents : %u", num_docs);
477 Message ("Bytes received : %" ULL_FS, bytes_received);
478 Message ("Bytes processed : %" ULL_FS, bytes_processed);
479 Message ("Process Rate : %.1f kB per cpu second",
480 (double) bytes_processed / (ProcTime.CPUTime - InitTime.CPUTime) / 1024);
481 free (buffer);
482}
483
484
485
486int main (int argc, char **argv)
487{
488 int ch, in_fd;
489 char *filename = NULL;
490 FILE *Trace = NULL;
491
492 msg_prefix = argv[0];
493
494 opterr = 0;
495 while ((ch = getopt (argc, argv, "hC:WGSD123f:d:b:T:I:t:m:N:c:n:s:a:M:")) != -1)
496 {
497 switch (ch)
498 {
499 case 'G':
500 SkipSGML = 1;
501 break;
502 case 'S':
503 Passes |= SPECIAL;
504 break;
505 case '1':
506 InvfLevel = 1;
507 break;
508 case '2':
509 InvfLevel = 2;
510 break;
511 case '3':
512 InvfLevel = 3;
513 break;
514 case 'f':
515 filename = optarg;
516 break;
517 case 'n':
518 trace_name = optarg;
519 break;
520 case 'D':
521 Dump = 1;
522 break;
523 case 'W':
524 MakeWeights = 1;
525 break;
526 case 'd':
527 set_basepath (optarg);
528 break;
529 case 'a':
530 stemmer_num = stemmernumber (optarg);
531 break;
532 case 's':
533 stem_method = atoi (optarg) & STEMMER_MASK;
534 break;
535 case 'b':
536 buf_size = atoi (optarg) * 1024;
537 break;
538 case 'C':
539 comp_stat_point = atoi (optarg) * 1024;
540 break;
541 case 'c':
542 ChunkLimit = atoi (optarg);
543 break;
544 case 'm':
545 invf_buffer_size = (int) (atof (optarg) * 1024 * 1024);
546 break;
547 case 'I':
548 case 'N': /* N kept for compatability */
549 if (*optarg == '1')
550 Passes |= IVF_PASS_1;
551 else if (*optarg == '2')
552 Passes |= IVF_PASS_2;
553 else
554 usage ("Invalid pass number");
555 break;
556 case 'T':
557 if (*optarg == '1')
558 Passes |= TEXT_PASS_1;
559 else if (*optarg == '2')
560 Passes |= TEXT_PASS_2;
561 else
562 usage ("Invalid pass number");
563 break;
564 case 't':
565 trace = (mg_u_long) (atof (optarg) * 1024 * 1024);
566 break;
567 case 'M':
568 SetEnv ("maxnumeric", optarg, NULL);
569 break;
570 case 'h':
571 case '?':
572 usage (NULL);
573 }
574 }
575
576 if (!filename || *filename == '\0')
577 FatalError (1, "A document collection name must be specified.");
578
579 if (buf_size < MIN_BUF)
580 FatalError (1, "The buffer size must exceed 1024 bytes.");
581
582 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2))
583 FatalError (1, "I1 and I2 cannot be done simultaneously.");
584
585 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2))
586 FatalError (1, "T1 and T2 cannot be done simultaneously.");
587
588 if (!Passes)
589 FatalError (1, "S, T1, T2, I1 or I2 must be specified.");
590
591 if (optind < argc)
592 {
593 if ((in_fd = open (argv[optind], O_RDONLY)) == -1)
594 FatalError (1, "Cannot open %s", argv[optind]);
595 files = &argv[optind + 1];
596 num_files = argc - (optind + 1);
597 }
598 else
599 in_fd = 0; /* stdin */
600
601
602 if (trace)
603 {
604 if (!trace_name)
605 trace_name = make_name (filename, TRACE_SUFFIX, NULL);
606 if (!(Trace = fopen (trace_name, "a")))
607 Message ("Unable to open \"%s\". No tracing will be done.", trace_name);
608 else
609 setbuf (Trace, NULL);
610 }
611 else
612 Trace = NULL;
613
614 if (comp_stat_point)
615 {
616 char *name = make_name (filename, COMPRESSION_STATS_SUFFIX, NULL);
617 if (!(Comp_Stats = fopen (name, "wb"))) /* [RPAP - Feb 97: WIN32 Port] */
618 Message ("Unable to open \"%s\". No comp. stats. will be generated.",
619 name);
620 }
621
622
623 if (Trace)
624 {
625 int i;
626 fprintf (Trace, "\n\n\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
627 for (i = 0; i < argc; i++)
628 fprintf (Trace, "%s ", argv[i]);
629 fprintf (Trace, "\n\n");
630 }
631
632 driver (in_fd, Trace, filename);
633
634 if (Trace)
635 fclose (Trace);
636
637 if (Comp_Stats)
638 fclose (Comp_Stats);
639
640 return 0;
641}
Note: See TracBrowser for help on using the repository browser.