source: trunk/gsdl3/src/packages/mg/src/text/mg_passes_4jni.c@ 7630

Last change on this file since 7630 was 7630, checked in by kjdon, 20 years ago

fixed up some compiler warnings, and made this header file to be used by the jni class - see comment in jni/MGWrapperImpl.c

  • Property svn:keywords set to Author Date Id Revision
File size: 13.6 KB
Line 
1/**************************************************************************
2 *
3 * mg_passes_4jni.c -- Driver for the various passes
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_passes_4jni.c 7630 2004-06-22 04:24:30Z kjdon $
21 *
22 **************************************************************************/
23/* this needs to come first */
24#include "sysfuncs.h"
25
26#ifdef HAVE_MALLINFO
27# include <malloc.h>
28#endif
29
30#include "mg_passes_4jni.h"
31#include "memlib.h"
32#include "messages.h"
33#include "timing.h"
34
35#include "longlong.h"
36#include "stemmer.h"
37
38#include "mg_files.h"
39#include "mg.h"
40#include "build.h"
41#include "text.h"
42
43#include "words.h"
44#include "environment.h"
45
/* Revision-control identification string for this source file. */
static char *RCSID = "$Id: mg_passes_4jni.c 7630 2004-06-22 04:24:30Z kjdon $";

/* Number of entries in the PassData table. */
#define MAX_PASSES 5

/* Bit flags OR-ed into Passes by add_pass().  The drivers select
   PassData[N] with the test (Passes & (1 << N)), so each flag's bit
   position is also the index of its entry in that table. */
#define SPECIAL 1
#define TEXT_PASS_1 2
#define TEXT_PASS_2 4
#define IVF_PASS_1 8
#define IVF_PASS_2 16

/* Lower bound that set_buffer_size() enforces on buf_size. */
#define MIN_BUF 8192
/* Not referenced anywhere in this file; presumably a record terminator
   byte used by other code -- TODO confirm. */
#define TERMRECORD '\002'
58
/* Settings and counters with external linkage.  clear_variables()
   restores every one of them to the initial value given here. */
unsigned long buf_size = 3 * 1024 * 1024;	/* 3Mb */
unsigned long invf_buffer_size = 5 * 1024 * 1024;	/* 5Mb */
unsigned long ChunkLimit = 0;	/* set by set_chunk_limit() */
char InvfLevel = 2;		/* 1, 2 or 3; set by set_invf_level() */
char SkipSGML = 0;		/* 0 or 1; set by ignore_sgml_tags() */
char MakeWeights = 0;		/* set by set_make_weights() */
FILE *Comp_Stats = NULL;	/* opened by init_driver() when comp_stat_point != 0 */
int comp_stat_point = 0;	/* set by set_comp_stat_point() (argument * 1024) */
mg_ullong bytes_processed = 0;	/* sum of the lengths given to process_document() */
mg_ullong bytes_received = 0;	/* only reset and reported in this file */
int stemmer_num = 0;		/* default to the lovin stemmer */
int stem_method = 0;		/* set by set_stem_options() */
FILE * Trace;			/* trace stream; opened by init_driver() when trace != 0 */
char * filename;		/* collection name; heap copy made by set_filename() */
unsigned long num_docs = 0;	/* incremented once per process_document() call */
unsigned long block_bytes = 0;	/* bytes accumulated towards the next trace line */

/* State private to this file. */
static char Passes = 0;		/* bit mask of the passes selected by add_pass() */
static unsigned long trace = 0;	/* trace interval in bytes; 0 disables tracing */
static int Dump = 0;		/* non-zero: dump a document that fails processing */
static char **files = NULL;	/* only reset in this file */
static int num_files = 0;	/* only reset in this file */
static char *trace_name = NULL;	/* trace file name; see set_trace_file() and init_driver() */
82
/* Description of one build pass: its display name, the three callbacks
   the driver invokes, and one CPU-time accumulator per callback.  The
   accumulators are clock_t values when HAVE_TIMES is defined and
   struct timeval values otherwise. */
typedef struct pass_data
  {
    char *name;			/* name used in messages and the timing report */
    int (*init) (char *);	/* called by init_driver() with the collection name */
    int (*process) (u_char *, int);	/* called by process_document() with buffer and length */
    int (*done) (char *);	/* called by finalise_driver() with the collection name */
#ifdef HAVE_TIMES
    clock_t init_time;
    clock_t process_time;
    clock_t done_time;
#else
    struct timeval init_time;
    struct timeval process_time;
    struct timeval done_time;
#endif
  }
pass_data;
100
/* Initialiser for the three timing members of a pass_data entry; its
   shape follows the member types chosen by HAVE_TIMES. */
#ifdef HAVE_TIMES
#define NULL_TIMES 0, 0, 0
#else
#define NULL_TIMES {0, 0}, {0, 0}, {0, 0}
#endif

/* Table of all passes.  Entry N is run when bit N of Passes is set, so
   the order here must match the SPECIAL .. IVF_PASS_2 flag values. */
static pass_data PassData[MAX_PASSES] =
{
  {"special", init_special, process_special, done_special, NULL_TIMES},
  {"text.pass1", init_text_1, process_text_1, done_text_1, NULL_TIMES},
  {"text.pass2", init_text_2, process_text_2, done_text_2, NULL_TIMES},
  {"ivf.pass1", init_ivf_1, process_ivf_1, done_ivf_1, NULL_TIMES},
  {"ivf.pass2", init_ivf_2, process_ivf_2, done_ivf_2, NULL_TIMES},
};
115
116
117/* clear all the settings from one mg_passes run to the next */
118void clear_variables() {
119
120 buf_size = 3 * 1024 * 1024; /* 3Mb */
121 invf_buffer_size = 5 * 1024 * 1024; /* 5Mb */
122 ChunkLimit = 0;
123 InvfLevel = 2;
124 SkipSGML = 0;
125 MakeWeights = 0;
126 Comp_Stats = NULL;
127 comp_stat_point = 0;
128 bytes_processed = 0;
129 bytes_received = 0;
130 stemmer_num = 0; /* default to the lovin stemmer */
131 stem_method = 0;
132 Trace = NULL;
133 filename = NULL;
134 num_docs = 0;
135 block_bytes = 0;
136
137 Passes = 0;
138 trace = 0;
139 Dump = 0;
140 files = NULL;
141 num_files = 0;
142 trace_name = NULL;
143
144
145}
146
147/* ################################################## */
148/* the following are methods to set all the variables that used to be
149 set by command line args */
150
151/* -S, -T1, -T2, -I1, -I2, args to mg_passes */
152void add_pass (char pass_type, char pass_num) {
153
154 switch(pass_type) {
155 case 'S':
156 Passes |= SPECIAL;
157 break;
158 case 'I':
159 case 'N':
160 if (pass_num == '1')
161 Passes |= IVF_PASS_1;
162 else if (pass_num == '2')
163 Passes |= IVF_PASS_2;
164 else
165 fprintf(stderr, "Invalid pass number %c for pass type %c\n", pass_num, pass_type);
166 break;
167 case 'T':
168 if (pass_num == '1')
169 Passes |= TEXT_PASS_1;
170 else if (pass_num == '2')
171 Passes |= TEXT_PASS_2;
172 else
173 fprintf(stderr, "Invalid pass number %c for pass type %c\n", pass_num, pass_type);
174 break;
175 }
176}
177
178/* -D arg to mg_passes */
179void dump_failed_document(int dump) {
180 Dump = dump;
181}
182
183/* -G arg to mg_passes */
184void ignore_sgml_tags(int ignore) {
185 if (ignore) {
186 SkipSGML = 1;
187 } else {
188 SkipSGML = 0;
189 }
190}
191
192/* -b arg to mg_passes */
193void set_buffer_size(long size) {
194 buf_size = size * 1024;
195 if (buf_size < MIN_BUF) {
196 buf_size = MIN_BUF;
197 }
198}
199
200/* -c arg to mg_passes */
201void set_chunk_limit(long chunk_limit) {
202 ChunkLimit = chunk_limit;
203}
204
205/* -C arg to mg_passes */
206void set_comp_stat_point(int stat_point) {
207 comp_stat_point = stat_point * 1024;
208}
209
210/* -f arg to mg_passes */
211void set_filename(const char * filen) {
212 if (filename) {
213 Xfree (filename);
214 filename = NULL;
215 }
216 filename = Xstrdup (filen);
217}
218
219/* -m arg to mg_passes */
220void set_inversion_limit(int limit) {
221 invf_buffer_size = limit * 1024 * 1024;
222}
223
224/* -1, -2, -3 args to mg_passes */
225void set_invf_level(char level) {
226 switch (level) {
227 case '1':
228 InvfLevel = 1;
229 break;
230 case '2':
231 InvfLevel = 2;
232 break;
233 case '3':
234 InvfLevel = 3;
235 break;
236 }
237}
238
239/* -W arg to mg_passes */
240void set_make_weights(int make_w) {
241 MakeWeights = make_w;
242}
243
/* -M arg to mg_passes.
   Formats max_numeric as a decimal string and passes it to SetEnv()
   under the name "maxnumeric". */
void
set_max_numeric (int max_numeric)
{
  char digits[99];

  sprintf (digits, "%d", max_numeric);
  SetEnv ("maxnumeric", digits, NULL);
}
250
251/* -a, -s args to mg_passes */
252void set_stem_options(const char * stemmer, int method) {
253 stemmer_num = stemmernumber (stemmer);
254 stem_method = method & STEMMER_MASK;
255}
256
257/* -t arg to mg_passes */
258void set_trace_point(int tracepos) {
259 trace = (unsigned long) (tracepos * 1024 * 1024);
260}
261
262/* -n arg to mg_passes */
263void set_trace_file(const char * filen) {
264 if (trace_name) {
265 Xfree (trace_name);
266 trace_name = NULL;
267 }
268 trace_name = Xstrdup (filen);
269}
270
271/* ############################################### */
272/* The old driver method has been split into 3:
273init_driver, process_document (called numdocs times),
274finalise_driver.
275The above set vars methods should all be called before init_driver.
276*/
277
278
/* Snapshots filled in by GetTime(): StartTime just before the passes are
   initialised and InitTime at the end of init_driver(); ProcTime at the
   start of finalise_driver() and DoneTime once every done callback has
   run.  finalise_driver() reports the intervals between them. */
ProgTime StartTime, InitTime, ProcTime, DoneTime;
280
281void
282init_driver ()
283{
284 int pass;
285 if (!filename || *filename == '\0')
286 FatalError (1, "A document collection name must be specified.");
287
288 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2))
289 FatalError (1, "I1 and I2 cannot be done simultaneously.");
290
291 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2))
292 FatalError (1, "T1 and T2 cannot be done simultaneously.");
293
294 if (!Passes)
295 FatalError (1, "S, T1, T2, I1 or I2 must be specified.");
296
297 if (trace)
298 {
299 if (!trace_name)
300 trace_name = make_name (filename, TRACE_SUFFIX, NULL);
301 if (!(Trace = fopen (trace_name, "a")))
302 Message ("Unable to open \"%s\". No tracing will be done.", trace_name);
303 else
304 setbuf (Trace, NULL);
305 }
306 else
307 Trace = NULL;
308
309 if (comp_stat_point)
310 {
311 char *name = make_name (filename, COMPRESSION_STATS_SUFFIX, NULL);
312 if (!(Comp_Stats = fopen (name, "wb"))) /* [RPAP - Feb 97: WIN32 Port] */
313 Message ("Unable to open \"%s\". No comp. stats. will be generated.",
314 name);
315 }
316
317 if (Trace)
318 {
319 fprintf (Trace, "\n\n\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
320 /* print out the args to mg_passes */
321 fprintf (Trace, "\n\n");
322 }
323
324
325 GetTime (&StartTime);
326
327 for (pass = 0; pass < MAX_PASSES; pass++) {
328 if (Passes & (1 << pass)) {
329 pass_data *pd = &PassData[pass];
330#ifdef HAVE_TIMES
331 struct tms tims;
332 times (&tims);
333 pd->init_time -= tims.tms_utime + tims.tms_stime;
334#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
335 struct rusage ru;
336
337 getrusage (RUSAGE_SELF, &ru);
338 pd->init_time.tv_sec -= ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
339 pd->init_time.tv_usec -= ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
340#endif
341 if (pd->init (filename) == COMPERROR)
342 FatalError (1, "Error during init of \"%s\"", pd->name);
343
344#ifdef HAVE_TIMES
345 times (&tims);
346 pd->init_time += tims.tms_utime + tims.tms_stime;
347#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
348 getrusage (RUSAGE_SELF, &ru);
349 pd->init_time.tv_sec += ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
350 pd->init_time.tv_usec += ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
351 time_normalise (&pd->init_time);
352#endif
353 }
354 }
355 GetTime (&InitTime);
356}
357
358
359void process_document(const u_char *buffer, int len) {
360 int pass;
361 bytes_processed += len;
362
363#ifndef QUIET
364 if (!len)
365 Message ("Warning : Processing zero length document");
366#endif
367
368 for (pass = 0; pass < MAX_PASSES; pass++) {
369 if (Passes & (1 << pass))
370 {
371 register pass_data *pd = &PassData[pass];
372
373#ifdef HAVE_TIMES
374 struct tms tims;
375 times (&tims);
376 pd->process_time -= tims.tms_utime + tims.tms_stime;
377#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
378 struct rusage ru;
379 register struct timeval *tv = &pd->process_time;
380
381 getrusage (RUSAGE_SELF, &ru);
382 tv->tv_sec -= ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
383 tv->tv_usec -= ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
384#endif
385 if (pd->process ((u_char *) buffer, len) == COMPERROR)
386 {
387 Message ("Error during processing of \"%s\"", pd->name);
388 if (Dump || Trace)
389 {
390 int i;
391 FILE *f = Trace ? Trace : stderr;
392 fprintf (f, "-=- * -=- * -=- * -=- * -=- * -=- * -=-\n");
393 for (i = 0; i < len; i++)
394 {
395 char ch = buffer[i];
396 if (ch == '\1' || ch == '\2')
397 ch = '\n';
398 putc (ch, f);
399 }
400 fprintf (f, "-=- * -=- * -=- * -=- * -=- * -=- * -=-\n");
401 }
402 if (Trace)
403 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs | %s\n",
404 bytes_processed, num_docs,
405 ElapsedTime (&StartTime, NULL));
406 exit (1);
407 }
408#ifdef HAVE_TIMES
409 times (&tims);
410 pd->process_time += tims.tms_utime + tims.tms_stime;
411#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
412 getrusage (RUSAGE_SELF, &ru);
413 tv->tv_sec += ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
414 tv->tv_usec += ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
415#endif
416 }
417 }
418 num_docs++;
419 if (Trace)
420 {
421 block_bytes += len;
422 if (block_bytes >= trace)
423 {
424#ifdef HAVE_MALLINFO
425 struct mallinfo mi;
426 mi = mallinfo ();
427 block_bytes -= trace;
428 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs |%7.3f Mb | %s\n",
429 bytes_processed, num_docs, mi.arena / 1024.0 / 1024.0,
430 ElapsedTime (&StartTime, NULL));
431#else
432 block_bytes -= trace;
433 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs | %s\n",
434 bytes_processed, num_docs,
435 ElapsedTime (&StartTime, NULL));
436#endif
437 }
438 }
439}
440
441void finalise_driver() {
442 int pass;
443#ifndef HAVE_TIMES
444 for (pass = 0; pass < MAX_PASSES; pass++)
445 if (Passes & (1 << pass))
446 time_normalise (&PassData[pass].process_time);
447#endif
448
449 GetTime (&ProcTime);
450
451 for (pass = 0; pass < MAX_PASSES; pass++)
452 if (Passes & (1 << pass))
453 {
454 pass_data *pd = &PassData[pass];
455#ifdef HAVE_TIMES
456 struct tms tims;
457 times (&tims);
458 pd->done_time -= tims.tms_utime + tims.tms_stime;
459#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
460 struct rusage ru;
461
462 getrusage (RUSAGE_SELF, &ru);
463 pd->done_time.tv_sec -= ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
464 pd->done_time.tv_usec -= ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
465#endif
466 if (pd->done (filename) == COMPERROR)
467 FatalError (1, "Error during done of \"%s\"", pd->name);
468
469#ifdef HAVE_TIMES
470 times (&tims);
471 pd->done_time += tims.tms_utime + tims.tms_stime;
472#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
473 getrusage (RUSAGE_SELF, &ru);
474 pd->done_time.tv_sec += ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
475 pd->done_time.tv_usec += ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
476 time_normalise (&pd->done_time);
477#endif
478 }
479 if (Trace)
480 {
481#ifdef HAVE_MALLINFO
482 struct mallinfo mi;
483 mi = mallinfo ();
484 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs |%7.3f Mb | %s\n",
485 bytes_processed, num_docs, mi.arena / 1024.0 / 1024.0,
486 ElapsedTime (&StartTime, NULL));
487#else
488 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs | %s\n",
489 bytes_processed, num_docs,
490 ElapsedTime (&StartTime, NULL));
491#endif
492 }
493
494 GetTime (&DoneTime);
495
496 Message ("");
497 Message ("%10s : init process done", "");
498 for (pass = 0; pass < MAX_PASSES; pass++)
499 if (Passes & (1 << pass))
500 {
501 pass_data *pd = &PassData[pass];
502 char it[15], pt[15], dt[15];
503#ifdef HAVE_TIMES
504 strcpy (it, cputime_string (pd->init_time));
505 strcpy (pt, cputime_string (pd->process_time));
506 strcpy (dt, cputime_string (pd->done_time));
507#else
508 strcpy (it, cputime_string (&pd->init_time));
509 strcpy (pt, cputime_string (&pd->process_time));
510 strcpy (dt, cputime_string (&pd->done_time));
511#endif
512 Message ("%-10s : %s %s %s", pd->name, it, pt, dt);
513 }
514 Message ("");
515 Message ("Init time : %s", ElapsedTime (&StartTime, &InitTime));
516 Message ("Process time : %s", ElapsedTime (&InitTime, &ProcTime));
517 Message ("Done time : %s", ElapsedTime (&ProcTime, &DoneTime));
518 Message ("Total time : %s", ElapsedTime (&StartTime, &DoneTime));
519 Message ("Documents : %u", num_docs);
520 Message ("Bytes received : %" ULL_FS, bytes_received);
521 Message ("Bytes processed : %" ULL_FS, bytes_processed);
522 Message ("Process Rate : %.1f kB per cpu second",
523 (double) bytes_processed / (ProcTime.CPUTime - InitTime.CPUTime) / 1024);
524 //free (buffer);
525
526 if (Trace)
527 fclose (Trace);
528
529 if (Comp_Stats)
530 fclose (Comp_Stats);
531
532}
533
Note: See TracBrowser for help on using the repository browser.