source: gsdl/trunk/trunk/mg/src/text/mg_passes_4jni.c@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change commited in r16582

  • Property svn:keywords set to Author Date Id Revision
File size: 13.9 KB
Line 
1/**************************************************************************
2 *
3 * mg_passes_4jni.c -- Driver for the various passes
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_passes_4jni.c 16583 2008-07-29 10:20:36Z davidb $
21 *
22 **************************************************************************/
23/* this needs to come first */
24#include "sysfuncs.h"
25
26#ifdef HAVE_MALLINFO
27# include <malloc.h>
28#endif
29
30#include "mg_passes_4jni.h"
31#include "memlib.h"
32#include "messages.h"
33#include "timing.h"
34
35#include "longlong.h"
36#include "stemmer.h"
37
38#include "mg_files.h"
39#include "mg.h"
40#include "build.h"
41#include "text.h"
42
43#include "words.h"
44#include "environment.h"
45
46static char *RCSID = "$Id: mg_passes_4jni.c 16583 2008-07-29 10:20:36Z davidb $";
47
48#define MAX_PASSES 5
49
50#define SPECIAL 1
51#define TEXT_PASS_1 2
52#define TEXT_PASS_2 4
53#define IVF_PASS_1 8
54#define IVF_PASS_2 16
55
56#define MIN_BUF 8192
57#define TERMRECORD '\002'
58
59unsigned long buf_size = 3 * 1024 * 1024; /* 3Mb */
60unsigned long invf_buffer_size = 5 * 1024 * 1024; /* 5Mb */
61unsigned long ChunkLimit = 0;
62char InvfLevel = 2;
63char SkipSGML = 0;
64char MakeWeights = 0;
65FILE *Comp_Stats = NULL;
66int comp_stat_point = 0;
67mg_ullong bytes_processed = 0;
68mg_ullong bytes_received = 0;
69int stemmer_num = 0; /* default to the lovin stemmer */
70int stem_method = 0;
71FILE * Trace;
72char * filename;
73unsigned long num_docs = 0;
74unsigned long block_bytes = 0;
75
76static char Passes = 0;
77static unsigned long trace = 0;
78static int Dump = 0;
79static char **files = NULL;
80static int num_files = 0;
81static char *trace_name = NULL;
82
83int mg_passes_exit_value = 0;
84
85typedef struct pass_data
86 {
87 char *name;
88 int (*init) (char *);
89 int (*process) (u_char *, int);
90 int (*done) (char *);
91#ifdef HAVE_TIMES
92 clock_t init_time;
93 clock_t process_time;
94 clock_t done_time;
95#else
96 struct timeval init_time;
97 struct timeval process_time;
98 struct timeval done_time;
99#endif
100 }
101pass_data;
102
103#ifdef HAVE_TIMES
104#define NULL_TIMES 0, 0, 0
105#else
106#define NULL_TIMES {0, 0}, {0, 0}, {0, 0}
107#endif
108
109static pass_data PassData[MAX_PASSES] =
110{
111 {"special", init_special, process_special, done_special, NULL_TIMES},
112 {"text.pass1", init_text_1, process_text_1, done_text_1, NULL_TIMES},
113 {"text.pass2", init_text_2, process_text_2, done_text_2, NULL_TIMES},
114 {"ivf.pass1", init_ivf_1, process_ivf_1, done_ivf_1, NULL_TIMES},
115 {"ivf.pass2", init_ivf_2, process_ivf_2, done_ivf_2, NULL_TIMES},
116};
117
118
119/* clear all the settings from one mg_passes run to the next */
120void clear_variables() {
121
122 buf_size = 3 * 1024 * 1024; /* 3Mb */
123 invf_buffer_size = 5 * 1024 * 1024; /* 5Mb */
124 ChunkLimit = 0;
125 InvfLevel = 2;
126 SkipSGML = 0;
127 MakeWeights = 0;
128 Comp_Stats = NULL;
129 comp_stat_point = 0;
130 bytes_processed = 0;
131 bytes_received = 0;
132 stemmer_num = 0; /* default to the lovin stemmer */
133 stem_method = 0;
134 Trace = NULL;
135 filename = NULL;
136 num_docs = 0;
137 block_bytes = 0;
138
139 Passes = 0;
140 trace = 0;
141 Dump = 0;
142 files = NULL;
143 num_files = 0;
144 trace_name = NULL;
145
146 mg_passes_exit_value = 0;
147}
148
149/* ################################################## */
150/* the following are methods to set all the variables that used to be
151 set by command line args */
152
153/* -S, -T1, -T2, -I1, -I2, args to mg_passes */
154void add_pass (char pass_type, char pass_num) {
155
156 switch(pass_type) {
157 case 'S':
158 Passes |= SPECIAL;
159 break;
160 case 'I':
161 case 'N':
162 if (pass_num == '1')
163 Passes |= IVF_PASS_1;
164 else if (pass_num == '2')
165 Passes |= IVF_PASS_2;
166 else
167 fprintf(stderr, "Invalid pass number %c for pass type %c\n", pass_num, pass_type);
168 break;
169 case 'T':
170 if (pass_num == '1')
171 Passes |= TEXT_PASS_1;
172 else if (pass_num == '2')
173 Passes |= TEXT_PASS_2;
174 else
175 fprintf(stderr, "Invalid pass number %c for pass type %c\n", pass_num, pass_type);
176 break;
177 }
178}
179
180/* -D arg to mg_passes */
181void dump_failed_document(int dump) {
182 Dump = dump;
183}
184
185/* -G arg to mg_passes */
186void ignore_sgml_tags(int ignore) {
187 if (ignore) {
188 SkipSGML = 1;
189 } else {
190 SkipSGML = 0;
191 }
192}
193
194/* -b arg to mg_passes */
195void set_buffer_size(long size) {
196 buf_size = size * 1024;
197 if (buf_size < MIN_BUF) {
198 buf_size = MIN_BUF;
199 }
200}
201
202/* -c arg to mg_passes */
203void set_chunk_limit(long chunk_limit) {
204 ChunkLimit = chunk_limit;
205}
206
207/* -C arg to mg_passes */
208void set_comp_stat_point(int stat_point) {
209 comp_stat_point = stat_point * 1024;
210}
211
212/* -f arg to mg_passes */
213void set_filename(const char * filen) {
214 if (filename) {
215 Xfree (filename);
216 filename = NULL;
217 }
218 filename = Xstrdup (filen);
219}
220
221/* -m arg to mg_passes */
222void set_inversion_limit(int limit) {
223 invf_buffer_size = limit * 1024 * 1024;
224}
225
226/* -1, -2, -3 args to mg_passes */
227void set_invf_level(char level) {
228 switch (level) {
229 case '1':
230 InvfLevel = 1;
231 break;
232 case '2':
233 InvfLevel = 2;
234 break;
235 case '3':
236 InvfLevel = 3;
237 break;
238 }
239}
240
241/* -W arg to mg_passes */
242void set_make_weights(int make_w) {
243 MakeWeights = make_w;
244}
245
/* -M arg to mg_passes.
   Formats max_numeric as decimal text and stores it under the key
   "maxnumeric" with SetEnv. */
void set_max_numeric(int max_numeric)
{
  char numeric_text[99];	/* far larger than any int needs in decimal */

  sprintf (numeric_text, "%d", max_numeric);
  SetEnv ("maxnumeric", numeric_text, NULL);
}
252
253/* -a, -s args to mg_passes */
254void set_stem_options(const char * stemmer, int method) {
255 stemmer_num = stemmernumber (stemmer);
256 stem_method = method & STEMMER_MASK;
257}
258
259/* -t arg to mg_passes */
260void set_trace_point(int tracepos) {
261 trace = (unsigned long) (tracepos * 1024 * 1024);
262}
263
264/* -n arg to mg_passes */
265void set_trace_file(const char * filen) {
266 if (trace_name) {
267 Xfree (trace_name);
268 trace_name = NULL;
269 }
270 trace_name = Xstrdup (filen);
271}
272
273/* ############################################### */
274/* The old driver method has been split into 3:
275init_driver, process_document (called numdocs times),
276finalise_driver.
277The above set vars methods should all be called before init_driver.
278*/
279
280
281ProgTime StartTime, InitTime, ProcTime, DoneTime;
282
283void
284init_driver ()
285{
286 int pass;
287 if (!filename || *filename == '\0') {
288 mg_passes_exit_value = 1;
289 FatalError (1, "A document collection name must be specified.");
290 }
291
292 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2)) {
293 mg_passes_exit_value = 1;
294 FatalError (1, "I1 and I2 cannot be done simultaneously.");
295
296 }
297 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2)) {
298 mg_passes_exit_value = 1;
299 FatalError (1, "T1 and T2 cannot be done simultaneously.");
300 }
301 if (!Passes) {
302 mg_passes_exit_value = 1;
303 FatalError (1, "S, T1, T2, I1 or I2 must be specified.");
304 }
305 if (trace)
306 {
307 if (!trace_name)
308 trace_name = make_name (filename, TRACE_SUFFIX, NULL);
309 if (!(Trace = fopen (trace_name, "a")))
310 Message ("Unable to open \"%s\". No tracing will be done.", trace_name);
311 else
312 setbuf (Trace, NULL);
313 }
314 else
315 Trace = NULL;
316
317 if (comp_stat_point)
318 {
319 char *name = make_name (filename, COMPRESSION_STATS_SUFFIX, NULL);
320 if (!(Comp_Stats = fopen (name, "wb"))) /* [RPAP - Feb 97: WIN32 Port] */
321 Message ("Unable to open \"%s\". No comp. stats. will be generated.",
322 name);
323 }
324
325 if (Trace)
326 {
327 fprintf (Trace, "\n\n\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
328 /* print out the args to mg_passes */
329 fprintf (Trace, "\n\n");
330 }
331
332
333 GetTime (&StartTime);
334
335 for (pass = 0; pass < MAX_PASSES; pass++) {
336 if (Passes & (1 << pass)) {
337 pass_data *pd = &PassData[pass];
338#ifdef HAVE_TIMES
339 struct tms tims;
340 times (&tims);
341 pd->init_time -= tims.tms_utime + tims.tms_stime;
342#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
343 struct rusage ru;
344
345 getrusage (RUSAGE_SELF, &ru);
346 pd->init_time.tv_sec -= ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
347 pd->init_time.tv_usec -= ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
348#endif
349 if (pd->init (filename) == COMPERROR) {
350 mg_passes_exit_value = 1;
351 FatalError (1, "Error during init of \"%s\"", pd->name);
352 }
353#ifdef HAVE_TIMES
354 times (&tims);
355 pd->init_time += tims.tms_utime + tims.tms_stime;
356#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
357 getrusage (RUSAGE_SELF, &ru);
358 pd->init_time.tv_sec += ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
359 pd->init_time.tv_usec += ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
360 time_normalise (&pd->init_time);
361#endif
362 }
363 }
364 GetTime (&InitTime);
365}
366
367
368void process_document(const u_char *buffer, int len) {
369 int pass;
370 bytes_processed += len;
371
372#ifndef QUIET
373 if (!len)
374 Message ("Warning : Processing zero length document");
375#endif
376
377 for (pass = 0; pass < MAX_PASSES; pass++) {
378 if (Passes & (1 << pass))
379 {
380 register pass_data *pd = &PassData[pass];
381
382#ifdef HAVE_TIMES
383 struct tms tims;
384 times (&tims);
385 pd->process_time -= tims.tms_utime + tims.tms_stime;
386#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
387 struct rusage ru;
388 register struct timeval *tv = &pd->process_time;
389
390 getrusage (RUSAGE_SELF, &ru);
391 tv->tv_sec -= ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
392 tv->tv_usec -= ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
393#endif
394 if (pd->process ((u_char *) buffer, len) == COMPERROR)
395 {
396 Message ("Error during processing of \"%s\"", pd->name);
397 if (Dump || Trace)
398 {
399 int i;
400 FILE *f = Trace ? Trace : stderr;
401 fprintf (f, "-=- * -=- * -=- * -=- * -=- * -=- * -=-\n");
402 for (i = 0; i < len; i++)
403 {
404 char ch = buffer[i];
405 if (ch == '\1' || ch == '\2')
406 ch = '\n';
407 putc (ch, f);
408 }
409 fprintf (f, "-=- * -=- * -=- * -=- * -=- * -=- * -=-\n");
410 }
411 if (Trace)
412 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs | %s\n",
413 bytes_processed, num_docs,
414 ElapsedTime (&StartTime, NULL));
415 exit (1);
416 }
417#ifdef HAVE_TIMES
418 times (&tims);
419 pd->process_time += tims.tms_utime + tims.tms_stime;
420#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
421 getrusage (RUSAGE_SELF, &ru);
422 tv->tv_sec += ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
423 tv->tv_usec += ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
424#endif
425 }
426 }
427 num_docs++;
428 if (Trace)
429 {
430 block_bytes += len;
431 if (block_bytes >= trace)
432 {
433#ifdef HAVE_MALLINFO
434 struct mallinfo mi;
435 mi = mallinfo ();
436 block_bytes -= trace;
437 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs |%7.3f Mb | %s\n",
438 bytes_processed, num_docs, mi.arena / 1024.0 / 1024.0,
439 ElapsedTime (&StartTime, NULL));
440#else
441 block_bytes -= trace;
442 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs | %s\n",
443 bytes_processed, num_docs,
444 ElapsedTime (&StartTime, NULL));
445#endif
446 }
447 }
448}
449
450void finalise_driver() {
451 int pass;
452#ifndef HAVE_TIMES
453 for (pass = 0; pass < MAX_PASSES; pass++)
454 if (Passes & (1 << pass))
455 time_normalise (&PassData[pass].process_time);
456#endif
457
458 GetTime (&ProcTime);
459
460 for (pass = 0; pass < MAX_PASSES; pass++)
461 if (Passes & (1 << pass))
462 {
463 pass_data *pd = &PassData[pass];
464#ifdef HAVE_TIMES
465 struct tms tims;
466 times (&tims);
467 pd->done_time -= tims.tms_utime + tims.tms_stime;
468#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
469 struct rusage ru;
470
471 getrusage (RUSAGE_SELF, &ru);
472 pd->done_time.tv_sec -= ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
473 pd->done_time.tv_usec -= ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
474#endif
475 if (pd->done (filename) == COMPERROR) {
476 mg_passes_exit_value = 1;
477 FatalError (1, "Error during done of \"%s\"", pd->name);
478 }
479#ifdef HAVE_TIMES
480 times (&tims);
481 pd->done_time += tims.tms_utime + tims.tms_stime;
482#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
483 getrusage (RUSAGE_SELF, &ru);
484 pd->done_time.tv_sec += ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
485 pd->done_time.tv_usec += ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
486 time_normalise (&pd->done_time);
487#endif
488 }
489 if (Trace)
490 {
491#ifdef HAVE_MALLINFO
492 struct mallinfo mi;
493 mi = mallinfo ();
494 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs |%7.3f Mb | %s\n",
495 bytes_processed, num_docs, mi.arena / 1024.0 / 1024.0,
496 ElapsedTime (&StartTime, NULL));
497#else
498 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs | %s\n",
499 bytes_processed, num_docs,
500 ElapsedTime (&StartTime, NULL));
501#endif
502 }
503
504 GetTime (&DoneTime);
505
506 Message ("");
507 Message ("%10s : init process done", "");
508 for (pass = 0; pass < MAX_PASSES; pass++)
509 if (Passes & (1 << pass))
510 {
511 pass_data *pd = &PassData[pass];
512 char it[15], pt[15], dt[15];
513#ifdef HAVE_TIMES
514 strcpy (it, cputime_string (pd->init_time));
515 strcpy (pt, cputime_string (pd->process_time));
516 strcpy (dt, cputime_string (pd->done_time));
517#else
518 strcpy (it, cputime_string (&pd->init_time));
519 strcpy (pt, cputime_string (&pd->process_time));
520 strcpy (dt, cputime_string (&pd->done_time));
521#endif
522 Message ("%-10s : %s %s %s", pd->name, it, pt, dt);
523 }
524 Message ("");
525 Message ("Init time : %s", ElapsedTime (&StartTime, &InitTime));
526 Message ("Process time : %s", ElapsedTime (&InitTime, &ProcTime));
527 Message ("Done time : %s", ElapsedTime (&ProcTime, &DoneTime));
528 Message ("Total time : %s", ElapsedTime (&StartTime, &DoneTime));
529 Message ("Documents : %u", num_docs);
530 Message ("Bytes received : %" ULL_FS, bytes_received);
531 Message ("Bytes processed : %" ULL_FS, bytes_processed);
532 Message ("Process Rate : %.1f kB per cpu second",
533 (double) bytes_processed / (ProcTime.CPUTime - InitTime.CPUTime) / 1024);
534 //free (buffer);
535
536 if (Trace)
537 fclose (Trace);
538
539 if (Comp_Stats)
540 fclose (Comp_Stats);
541
542}
543
544int get_exit_value() {
545 return mg_passes_exit_value;
546}
Note: See TracBrowser for help on using the repository browser.