source: trunk/gsdl3/packages/mg/src/text/mg_passes_4jni.c@ 7455

Last change on this file since 7455 was 7455, checked in by kjdon, 20 years ago

renamed gs3_mg_passes to mg_passes_4jni, and added a bit more stuff to it

  • Property svn:keywords set to Author Date Id Revision
File size: 15.7 KB
Line 
1/**************************************************************************
2 *
3 * mg_passes.c -- Driver for the various passes
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_passes_4jni.c 7455 2004-05-26 23:25:52Z kjdon $
21 *
22 **************************************************************************/
23/* this needs to come first */
24#include "sysfuncs.h"
25
26#ifdef HAVE_MALLINFO
27# include <malloc.h>
28#endif
29
30#include "memlib.h"
31#include "messages.h"
32#include "timing.h"
33
34#include "longlong.h"
35#include "stemmer.h"
36
37
38#include "mg_files.h"
39#include "mg.h"
40#include "build.h"
41#include "text.h"
42
43#include "words.h"
44
45static char *RCSID = "$Id: mg_passes_4jni.c 7455 2004-05-26 23:25:52Z kjdon $";
46
47#define MAX_PASSES 5
48
49#define SPECIAL 1
50#define TEXT_PASS_1 2
51#define TEXT_PASS_2 4
52#define IVF_PASS_1 8
53#define IVF_PASS_2 16
54
55#define MIN_BUF 8192
56#define TERMRECORD '\002'
57
58unsigned long buf_size = 3 * 1024 * 1024; /* 3Mb */
59unsigned long invf_buffer_size = 5 * 1024 * 1024; /* 5Mb */
60unsigned long ChunkLimit = 0;
61char InvfLevel = 2;
62char SkipSGML = 0;
63char MakeWeights = 0;
64FILE *Comp_Stats = NULL;
65int comp_stat_point = 0;
66mg_ullong bytes_processed = 0;
67mg_ullong bytes_received = 0;
68int stemmer_num = 0; /* default to the lovin stemmer */
69int stem_method = 0;
70FILE * Trace;
71char * filename;
72unsigned long num_docs = 0;
73unsigned long block_bytes = 0;
74
75static char Passes = 0;
76static unsigned long trace = 0;
77static int Dump = 0;
78static char **files = NULL;
79static int num_files = 0;
80static char *trace_name = NULL;
81
82typedef struct pass_data
83 {
84 char *name;
85 int (*init) (char *);
86 int (*process) (u_char *, int);
87 int (*done) (char *);
88#ifdef HAVE_TIMES
89 clock_t init_time;
90 clock_t process_time;
91 clock_t done_time;
92#else
93 struct timeval init_time;
94 struct timeval process_time;
95 struct timeval done_time;
96#endif
97 }
98pass_data;
99
100#ifdef HAVE_TIMES
101#define NULL_TIMES 0, 0, 0
102#else
103#define NULL_TIMES {0, 0}, {0, 0}, {0, 0}
104#endif
105
106static pass_data PassData[MAX_PASSES] =
107{
108 {"special", init_special, process_special, done_special, NULL_TIMES},
109 {"text.pass1", init_text_1, process_text_1, done_text_1, NULL_TIMES},
110 {"text.pass2", init_text_2, process_text_2, done_text_2, NULL_TIMES},
111 {"ivf.pass1", init_ivf_1, process_ivf_1, done_ivf_1, NULL_TIMES},
112 {"ivf.pass2", init_ivf_2, process_ivf_2, done_ivf_2, NULL_TIMES},
113};
114
115static char *usage_str = "\nUSAGE:\n"
116" %s [-h] [-G] [-D] [-1|-2|-3] [-T1] [-T2] [-I1] [-I2] [-N1]\n"
117" %*s [-N2] [-W] [-S] [-b buffer-size] [-d dictionary-directory]\n"
118" %*s [-t trace-point Mb] [-m invf-memory] [-c chunk-limit]\n"
119" %*s [-n trace-name] [-C comp-stat-size] [-s stem_method]\n"
120" %*s [-a stemmer] [-M max-numeric] -f doc-collection-name\n";
121
122
123static void
124usage (char *err)
125{
126 if (err)
127 Message (err);
128 fprintf (stderr, usage_str, msg_prefix, strlen (msg_prefix), "",
129 strlen (msg_prefix), "",strlen (msg_prefix), "",
130 strlen (msg_prefix),"");
131 exit (1);
132}
133
134
135
136
137#if 0
/*
 * str_comma -- format an unsigned value with a comma between each group of
 * three digits, e.g. 1234567 -> "1,234,567".
 *
 * Only the lowest three groups are split off; anything from 10^9 upwards is
 * printed as a single leading field.
 *
 * Returns a pointer to a static buffer that the next call overwrites.
 */
static char *
str_comma (unsigned long u)
{
  /* a 64-bit value needs at most 11 leading digits + 3 * ",ddd" + NUL */
  static char buf[32];
  unsigned long a, b, c, d;

  a = u / 1000000000;
  u -= a * 1000000000;
  b = u / 1000000;
  u -= b * 1000000;
  c = u / 1000;
  u -= c * 1000;
  d = u;

  /* the arguments are unsigned long, so the conversions must be %lu */
  if (a)
    snprintf (buf, sizeof buf, "%lu,%03lu,%03lu,%03lu", a, b, c, d);
  else if (b)
    snprintf (buf, sizeof buf, "%lu,%03lu,%03lu", b, c, d);
  else if (c)
    snprintf (buf, sizeof buf, "%lu,%03lu", c, d);
  else
    snprintf (buf, sizeof buf, "%lu", d);
  return (buf);
}
161#endif
162
163
164
165/*
166 int
167 open_next_file (int in_fd)
168 {
169 if (in_fd > 0)
170 close (in_fd);
171 if (num_files == 0)
172 return (-1);
173 if ((in_fd = open (files[0], O_RDONLY)) == -1)
174 FatalError (1, "Cannot open %s", files[0]);
175 files++;
176 num_files--;
177 return (in_fd);
178 }
179*/
180
181void clear_variables() {
182
183 buf_size = 3 * 1024 * 1024; /* 3Mb */
184 invf_buffer_size = 5 * 1024 * 1024; /* 5Mb */
185 ChunkLimit = 0;
186 InvfLevel = 2;
187 SkipSGML = 0;
188 MakeWeights = 0;
189 Comp_Stats = NULL;
190 comp_stat_point = 0;
191 bytes_processed = 0;
192 bytes_received = 0;
193 stemmer_num = 0; /* default to the lovin stemmer */
194 stem_method = 0;
195 Trace = NULL;
196 filename = NULL;
197 num_docs = 0;
198 block_bytes = 0;
199
200 Passes = 0;
201 trace = 0;
202 Dump = 0;
203 files = NULL;
204 num_files = 0;
205 trace_name = NULL;
206
207
208}
209void set_invf_level(char level) {
210
211 switch (level) {
212 case '1':
213 InvfLevel = 1;
214 break;
215 case '2':
216 InvfLevel = 2;
217 break;
218 case '3':
219 InvfLevel = 3;
220 break;
221 }
222
223}
224void set_inversion_limit(int limit) {
225 invf_buffer_size = limit * 1024 * 1024;
226}
227
228void ignore_sgml_tags(int ignore) {
229 if (ignore) {
230 SkipSGML = 1;
231 } else {
232 SkipSGML = 0;
233 }
234}
235
236void set_buffer_size(long size) {
237 buf_size = size * 1024;
238 if (buf_size < MIN_BUF) {
239 buf_size = MIN_BUF;
240 }
241}
242
243void set_stem_options(char * stemmer, int method) {
244 stemmer_num = stemmernumber (stemmer);
245 stem_method = method & STEMMER_MASK;
246
247}
248
249void set_filename(char * filen) {
250 int len = strlen(filen);
251 if (filename) {
252 Xfree (filename);
253 filename = NULL;
254 }
255 filename = Xstrdup (filen);
256 // put this here for now
257 Dump=1;
258 trace = 512;
259 if (!trace_name)
260 trace_name = make_name (filename, TRACE_SUFFIX, NULL);
261 if (!(Trace = fopen (trace_name, "a")))
262 Message ("Unable to open \"%s\". No tracing will be done.", trace_name);
263 else
264 setbuf (Trace, NULL);
265
266}
267
268
269void add_pass (char pass_type, char pass_num) {
270
271 switch(pass_type) {
272 case 'S':
273 Passes |= SPECIAL;
274 break;
275 case 'I':
276 case 'N':
277 if (pass_num == '1')
278 Passes |= IVF_PASS_1;
279 else if (pass_num == '2')
280 Passes |= IVF_PASS_2;
281 else
282 fprintf(stderr, "Invalid pass number %c for pass type %c\n", pass_num, pass_type);
283 break;
284 case 'T':
285 if (pass_num == '1')
286 Passes |= TEXT_PASS_1;
287 else if (pass_num == '2')
288 Passes |= TEXT_PASS_2;
289 else
290 fprintf(stderr, "Invalid pass number %c for pass type %c\n", pass_num, pass_type);
291 break;
292 }
293
294}
295ProgTime StartTime, InitTime, ProcTime, DoneTime;
296
297void
298init_driver ()
299{
300 int pass;
301
302 GetTime (&StartTime);
303
304 for (pass = 0; pass < MAX_PASSES; pass++) {
305 if (Passes & (1 << pass)) {
306 pass_data *pd = &PassData[pass];
307#ifdef HAVE_TIMES
308 struct tms tims;
309 times (&tims);
310 pd->init_time -= tims.tms_utime + tims.tms_stime;
311#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
312 struct rusage ru;
313
314 getrusage (RUSAGE_SELF, &ru);
315 pd->init_time.tv_sec -= ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
316 pd->init_time.tv_usec -= ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
317#endif
318 if (pd->init (filename) == COMPERROR)
319 FatalError (1, "Error during init of \"%s\"", pd->name);
320
321#ifdef HAVE_TIMES
322 times (&tims);
323 pd->init_time += tims.tms_utime + tims.tms_stime;
324#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
325 getrusage (RUSAGE_SELF, &ru);
326 pd->init_time.tv_sec += ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
327 pd->init_time.tv_usec += ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
328 time_normalise (&pd->init_time);
329#endif
330 }
331 }
332 GetTime (&InitTime);
333}
334
335
336void process_document(u_char *buffer, int len) {
337 int pass;
338 bytes_processed += len;
339
340 printf("process doc, len=%d\n",len);
341#ifndef QUIET
342 if (!len)
343 Message ("Warning : Processing zero length document");
344#endif
345
346 for (pass = 0; pass < MAX_PASSES; pass++) {
347 if (Passes & (1 << pass))
348 {
349 register pass_data *pd = &PassData[pass];
350
351#ifdef HAVE_TIMES
352 struct tms tims;
353 times (&tims);
354 pd->process_time -= tims.tms_utime + tims.tms_stime;
355#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
356 struct rusage ru;
357 register struct timeval *tv = &pd->process_time;
358
359 getrusage (RUSAGE_SELF, &ru);
360 tv->tv_sec -= ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
361 tv->tv_usec -= ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
362#endif
363 if (pd->process ((u_char *) buffer, len) == COMPERROR)
364 {
365 Message ("Error during processing of \"%s\"", pd->name);
366 if (Dump || Trace)
367 {
368 int i;
369 FILE *f = Trace ? Trace : stderr;
370 fprintf (f, "-=- * -=- * -=- * -=- * -=- * -=- * -=-\n");
371 for (i = 0; i < len; i++)
372 {
373 char ch = buffer[i];
374 if (ch == '\1' || ch == '\2')
375 ch = '\n';
376 putc (ch, f);
377 }
378 fprintf (f, "-=- * -=- * -=- * -=- * -=- * -=- * -=-\n");
379 }
380 if (Trace)
381 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs | %s\n",
382 bytes_processed, num_docs,
383 ElapsedTime (&StartTime, NULL));
384 exit (1);
385 }
386#ifdef HAVE_TIMES
387 times (&tims);
388 pd->process_time += tims.tms_utime + tims.tms_stime;
389#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
390 getrusage (RUSAGE_SELF, &ru);
391 tv->tv_sec += ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
392 tv->tv_usec += ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
393#endif
394 }
395 }
396 num_docs++;
397 if (Trace)
398 {
399 block_bytes += len;
400 if (block_bytes >= trace)
401 {
402#ifdef HAVE_MALLINFO
403 struct mallinfo mi;
404 mi = mallinfo ();
405 block_bytes -= trace;
406 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs |%7.3f Mb | %s\n",
407 bytes_processed, num_docs, mi.arena / 1024.0 / 1024.0,
408 ElapsedTime (&StartTime, NULL));
409#else
410 block_bytes -= trace;
411 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs | %s\n",
412 bytes_processed, num_docs,
413 ElapsedTime (&StartTime, NULL));
414#endif
415 }
416 }
417}
418
419void finalise_driver() {
420 int pass;
421#ifndef HAVE_TIMES
422 for (pass = 0; pass < MAX_PASSES; pass++)
423 if (Passes & (1 << pass))
424 time_normalise (&PassData[pass].process_time);
425#endif
426
427 GetTime (&ProcTime);
428
429 for (pass = 0; pass < MAX_PASSES; pass++)
430 if (Passes & (1 << pass))
431 {
432 pass_data *pd = &PassData[pass];
433#ifdef HAVE_TIMES
434 struct tms tims;
435 times (&tims);
436 pd->done_time -= tims.tms_utime + tims.tms_stime;
437#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
438 struct rusage ru;
439
440 getrusage (RUSAGE_SELF, &ru);
441 pd->done_time.tv_sec -= ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
442 pd->done_time.tv_usec -= ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
443#endif
444 if (pd->done (filename) == COMPERROR)
445 FatalError (1, "Error during done of \"%s\"", pd->name);
446
447#ifdef HAVE_TIMES
448 times (&tims);
449 pd->done_time += tims.tms_utime + tims.tms_stime;
450#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
451 getrusage (RUSAGE_SELF, &ru);
452 pd->done_time.tv_sec += ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
453 pd->done_time.tv_usec += ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
454 time_normalise (&pd->done_time);
455#endif
456 }
457 if (Trace)
458 {
459#ifdef HAVE_MALLINFO
460 struct mallinfo mi;
461 mi = mallinfo ();
462 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs |%7.3f Mb | %s\n",
463 bytes_processed, num_docs, mi.arena / 1024.0 / 1024.0,
464 ElapsedTime (&StartTime, NULL));
465#else
466 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs | %s\n",
467 bytes_processed, num_docs,
468 ElapsedTime (&StartTime, NULL));
469#endif
470 }
471
472 GetTime (&DoneTime);
473
474 Message ("");
475 Message ("%10s : init process done", "");
476 for (pass = 0; pass < MAX_PASSES; pass++)
477 if (Passes & (1 << pass))
478 {
479 pass_data *pd = &PassData[pass];
480 char it[15], pt[15], dt[15];
481#ifdef HAVE_TIMES
482 strcpy (it, cputime_string (pd->init_time));
483 strcpy (pt, cputime_string (pd->process_time));
484 strcpy (dt, cputime_string (pd->done_time));
485#else
486 strcpy (it, cputime_string (&pd->init_time));
487 strcpy (pt, cputime_string (&pd->process_time));
488 strcpy (dt, cputime_string (&pd->done_time));
489#endif
490 Message ("%-10s : %s %s %s", pd->name, it, pt, dt);
491 }
492 Message ("");
493 Message ("Init time : %s", ElapsedTime (&StartTime, &InitTime));
494 Message ("Process time : %s", ElapsedTime (&InitTime, &ProcTime));
495 Message ("Done time : %s", ElapsedTime (&ProcTime, &DoneTime));
496 Message ("Total time : %s", ElapsedTime (&StartTime, &DoneTime));
497 Message ("Documents : %u", num_docs);
498 Message ("Bytes received : %" ULL_FS, bytes_received);
499 Message ("Bytes processed : %" ULL_FS, bytes_processed);
500 Message ("Process Rate : %.1f kB per cpu second",
501 (double) bytes_processed / (ProcTime.CPUTime - InitTime.CPUTime) / 1024);
502 //free (buffer);
503}
504
505
506
507int main (int argc, char **argv)
508{
509 int ch, in_fd;
510
511 msg_prefix = argv[0];
512
513 opterr = 0;
514 while ((ch = getopt (argc, argv, "hC:WGSD123f:d:b:T:I:t:m:N:c:n:s:a:M:")) != -1)
515 {
516 switch (ch)
517 {
518 case 'G':
519 SkipSGML = 1;
520 break;
521 case 'S':
522 Passes |= SPECIAL;
523 break;
524 case '1':
525 InvfLevel = 1;
526 break;
527 case '2':
528 InvfLevel = 2;
529 break;
530 case '3':
531 InvfLevel = 3;
532 break;
533 case 'f':
534 filename = optarg;
535 break;
536 case 'n':
537 trace_name = optarg;
538 break;
539 case 'D':
540 Dump = 1;
541 break;
542 case 'W':
543 MakeWeights = 1;
544 break;
545 case 'd':
546 set_basepath (optarg);
547 break;
548 case 'a':
549 stemmer_num = stemmernumber (optarg);
550 break;
551 case 's':
552 stem_method = atoi (optarg) & STEMMER_MASK;
553 break;
554 case 'b':
555 buf_size = atoi (optarg) * 1024;
556 break;
557 case 'C':
558 comp_stat_point = atoi (optarg) * 1024;
559 break;
560 case 'c':
561 ChunkLimit = atoi (optarg);
562 break;
563 case 'm':
564 invf_buffer_size = (int) (atof (optarg) * 1024 * 1024);
565 break;
566 case 'I':
567 case 'N': /* N kept for compatability */
568 if (*optarg == '1')
569 Passes |= IVF_PASS_1;
570 else if (*optarg == '2')
571 Passes |= IVF_PASS_2;
572 else
573 usage ("Invalid pass number");
574 break;
575 case 'T':
576 if (*optarg == '1')
577 Passes |= TEXT_PASS_1;
578 else if (*optarg == '2')
579 Passes |= TEXT_PASS_2;
580 else
581 usage ("Invalid pass number");
582 break;
583 case 't':
584 trace = (unsigned long) (atof (optarg) * 1024 * 1024);
585 break;
586 case 'M':
587 SetEnv ("maxnumeric", optarg, NULL);
588 break;
589 case 'h':
590 case '?':
591 usage (NULL);
592 }
593 }
594
595 if (!filename || *filename == '\0')
596 FatalError (1, "A document collection name must be specified.");
597
598 if (buf_size < MIN_BUF)
599 FatalError (1, "The buffer size must exceed 1024 bytes.");
600
601 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2))
602 FatalError (1, "I1 and I2 cannot be done simultaneously.");
603
604 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2))
605 FatalError (1, "T1 and T2 cannot be done simultaneously.");
606
607 if (!Passes)
608 FatalError (1, "S, T1, T2, I1 or I2 must be specified.");
609
610 if (optind < argc)
611 {
612 if ((in_fd = open (argv[optind], O_RDONLY)) == -1)
613 FatalError (1, "Cannot open %s", argv[optind]);
614 files = &argv[optind + 1];
615 num_files = argc - (optind + 1);
616 }
617 else
618 in_fd = 0; /* stdin */
619
620
621 if (trace)
622 {
623 if (!trace_name)
624 trace_name = make_name (filename, TRACE_SUFFIX, NULL);
625 if (!(Trace = fopen (trace_name, "a")))
626 Message ("Unable to open \"%s\". No tracing will be done.", trace_name);
627 else
628 setbuf (Trace, NULL);
629 }
630 else
631 Trace = NULL;
632
633 if (comp_stat_point)
634 {
635 char *name = make_name (filename, COMPRESSION_STATS_SUFFIX, NULL);
636 if (!(Comp_Stats = fopen (name, "wb"))) /* [RPAP - Feb 97: WIN32 Port] */
637 Message ("Unable to open \"%s\". No comp. stats. will be generated.",
638 name);
639 }
640
641
642 if (Trace)
643 {
644 int i;
645 fprintf (Trace, "\n\n\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
646 for (i = 0; i < argc; i++)
647 fprintf (Trace, "%s ", argv[i]);
648 fprintf (Trace, "\n\n");
649 }
650
651 init_driver ();
652 /* here we have to do something to process docs from stdin */
653 finalise_driver();
654 if (Trace)
655 fclose (Trace);
656
657 if (Comp_Stats)
658 fclose (Comp_Stats);
659
660 return 0;
661}
Note: See TracBrowser for help on using the repository browser.