source: trunk/indexers/mg/src/text/mg_passes_4jni.c@ 7593

Last change on this file since 7593 was 7593, checked in by say1, 20 years ago

moved variable to the start of the block

  • Property svn:keywords set to Author Date Id Revision
File size: 13.6 KB
Line 
1/**************************************************************************
2 *
3 * mg_passes_4jni.c -- Driver for the various passes
4 * Copyright (C) 1994 Neil Sharman
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 *
20 * $Id: mg_passes_4jni.c 7593 2004-06-14 07:41:57Z say1 $
21 *
22 **************************************************************************/
23/* this needs to come first */
24#include "sysfuncs.h"
25
26#ifdef HAVE_MALLINFO
27# include <malloc.h>
28#endif
29
30#include "memlib.h"
31#include "messages.h"
32#include "timing.h"
33
34#include "longlong.h"
35#include "stemmer.h"
36
37#include "mg_files.h"
38#include "mg.h"
39#include "build.h"
40#include "text.h"
41
42#include "words.h"
43
/* RCS/SVN identification string embedded in the object file. */
static char *RCSID = "$Id: mg_passes_4jni.c 7593 2004-06-14 07:41:57Z say1 $";

/* Number of rows in the PassData table; also the loop bound used when
   testing each bit of Passes. */
#define MAX_PASSES 5

/* Bit flags OR-ed into Passes by add_pass().  Bit n selects PassData[n],
   because the drivers test Passes & (1 << n). */
#define SPECIAL 1
#define TEXT_PASS_1 2
#define TEXT_PASS_2 4
#define IVF_PASS_1 8
#define IVF_PASS_2 16

/* Smallest value set_buffer_size() will leave in buf_size. */
#define MIN_BUF 8192
/* NOTE(review): presumably the record terminator byte; it is not
   referenced by the driver code in this file. */
#define TERMRECORD '\002'

/* Settings below are written by the set_* functions and restored to these
   defaults by clear_variables(). */
unsigned long buf_size = 3 * 1024 * 1024; /* 3Mb */
unsigned long invf_buffer_size = 5 * 1024 * 1024; /* 5Mb */
unsigned long ChunkLimit = 0;
char InvfLevel = 2;	/* 1, 2 or 3; see set_invf_level() */
char SkipSGML = 0;	/* 0 or 1; see ignore_sgml_tags() */
char MakeWeights = 0;
FILE *Comp_Stats = NULL;	/* opened by init_driver() when comp_stat_point is non-zero */
int comp_stat_point = 0;
mg_ullong bytes_processed = 0;	/* sum of the lengths passed to process_document() */
mg_ullong bytes_received = 0;	/* only reset and reported in this file */
int stemmer_num = 0; /* default to the lovin stemmer */
int stem_method = 0;
FILE * Trace;	/* trace stream, or NULL when tracing is off */
char * filename;	/* collection name handed to every pass */
unsigned long num_docs = 0;	/* incremented once per process_document() call */
unsigned long block_bytes = 0;	/* bytes accumulated towards the next trace line */

/* Bit mask of the passes requested through add_pass(). */
static char Passes = 0;
/* Number of bytes between trace lines; 0 disables tracing (see init_driver). */
static unsigned long trace = 0;
/* Non-zero: dump the offending document when a pass reports COMPERROR. */
static int Dump = 0;
/* files / num_files are only cleared in this file. */
static char **files = NULL;
static int num_files = 0;
/* Trace file name; init_driver() derives one from filename when unset. */
static char *trace_name = NULL;
/* One row per indexing pass: its label, its three entry points and the
   CPU time accumulated in each of them.  The time fields are clock_t
   ticks when HAVE_TIMES is defined, otherwise struct timeval values. */
typedef struct pass_data
  {
    char *name;				/* label used in messages and the timing table */
    int (*init) (char *);		/* called from init_driver() with the collection name */
    int (*process) (u_char *, int);	/* called from process_document() with buffer and length */
    int (*done) (char *);		/* called from finalise_driver() with the collection name */
#ifdef HAVE_TIMES
    clock_t init_time;
    clock_t process_time;
    clock_t done_time;
#else
    struct timeval init_time;
    struct timeval process_time;
    struct timeval done_time;
#endif
  }
pass_data;
98
/* Initialiser for the three time fields of a pass_data row; its shape
   follows the representation selected by HAVE_TIMES. */
#ifdef HAVE_TIMES
#define NULL_TIMES 0, 0, 0
#else
#define NULL_TIMES {0, 0}, {0, 0}, {0, 0}
#endif

/* Pass table.  Row order is significant: row n is run when bit (1 << n)
   is set in Passes, so the rows must stay in the same order as the
   SPECIAL, TEXT_PASS_1, TEXT_PASS_2, IVF_PASS_1, IVF_PASS_2 flag values. */
static pass_data PassData[MAX_PASSES] =
{
  {"special", init_special, process_special, done_special, NULL_TIMES},
  {"text.pass1", init_text_1, process_text_1, done_text_1, NULL_TIMES},
  {"text.pass2", init_text_2, process_text_2, done_text_2, NULL_TIMES},
  {"ivf.pass1", init_ivf_1, process_ivf_1, done_ivf_1, NULL_TIMES},
  {"ivf.pass2", init_ivf_2, process_ivf_2, done_ivf_2, NULL_TIMES},
};
113
114
115/* clear all the settings from one mg_passes run to the next */
116void clear_variables() {
117
118 buf_size = 3 * 1024 * 1024; /* 3Mb */
119 invf_buffer_size = 5 * 1024 * 1024; /* 5Mb */
120 ChunkLimit = 0;
121 InvfLevel = 2;
122 SkipSGML = 0;
123 MakeWeights = 0;
124 Comp_Stats = NULL;
125 comp_stat_point = 0;
126 bytes_processed = 0;
127 bytes_received = 0;
128 stemmer_num = 0; /* default to the lovin stemmer */
129 stem_method = 0;
130 Trace = NULL;
131 filename = NULL;
132 num_docs = 0;
133 block_bytes = 0;
134
135 Passes = 0;
136 trace = 0;
137 Dump = 0;
138 files = NULL;
139 num_files = 0;
140 trace_name = NULL;
141
142
143}
144
145/* ################################################## */
146/* the following are methods to set all the variables that used to be
147 set by command line args */
148
149/* -S, -T1, -T2, -I1, -I2, args to mg_passes */
150void add_pass (char pass_type, char pass_num) {
151
152 switch(pass_type) {
153 case 'S':
154 Passes |= SPECIAL;
155 break;
156 case 'I':
157 case 'N':
158 if (pass_num == '1')
159 Passes |= IVF_PASS_1;
160 else if (pass_num == '2')
161 Passes |= IVF_PASS_2;
162 else
163 fprintf(stderr, "Invalid pass number %c for pass type %c\n", pass_num, pass_type);
164 break;
165 case 'T':
166 if (pass_num == '1')
167 Passes |= TEXT_PASS_1;
168 else if (pass_num == '2')
169 Passes |= TEXT_PASS_2;
170 else
171 fprintf(stderr, "Invalid pass number %c for pass type %c\n", pass_num, pass_type);
172 break;
173 }
174}
175
176/* -D arg to mg_passes */
177void dump_failed_document(int dump) {
178 Dump = dump;
179}
180
181/* -G arg to mg_passes */
182void ignore_sgml_tags(int ignore) {
183 if (ignore) {
184 SkipSGML = 1;
185 } else {
186 SkipSGML = 0;
187 }
188}
189
190/* -b arg to mg_passes */
191void set_buffer_size(long size) {
192 buf_size = size * 1024;
193 if (buf_size < MIN_BUF) {
194 buf_size = MIN_BUF;
195 }
196}
197
198/* -c arg to mg_passes */
199void set_chunk_limit(long chunk_limit) {
200 ChunkLimit = chunk_limit;
201}
202
203/* -C arg to mg_passes */
204void set_comp_stat_point(int stat_point) {
205 comp_stat_point = stat_point * 1024;
206}
207
208/* -f arg to mg_passes */
209void set_filename(char * filen) {
210 int len = strlen(filen);
211 if (filename) {
212 Xfree (filename);
213 filename = NULL;
214 }
215 filename = Xstrdup (filen);
216}
217
218/* -m arg to mg_passes */
219void set_inversion_limit(int limit) {
220 invf_buffer_size = limit * 1024 * 1024;
221}
222
223/* -1, -2, -3 args to mg_passes */
224void set_invf_level(char level) {
225 switch (level) {
226 case '1':
227 InvfLevel = 1;
228 break;
229 case '2':
230 InvfLevel = 2;
231 break;
232 case '3':
233 InvfLevel = 3;
234 break;
235 }
236}
237
238/* -W arg to mg_passes */
239void set_make_weights(int make_w) {
240 MakeWeights = make_w;
241}
242
/* -M arg to mg_passes
 *
 * Publishes the limit under the environment key "maxnumeric".
 *
 * The value is formatted as decimal text first.  The integer used to be
 * handed to SetEnv directly in the value slot, which the command-line
 * mg_passes fills with the option's string argument.
 * NOTE(review): SetEnv's prototype is not visible in this file; this
 * assumes its second parameter is a char * value -- confirm against the
 * mg environment header.
 */
void set_max_numeric(int max_numeric) {
  /* 32 bytes hold any int in decimal, sign and terminator included.
     Static so the text stays valid should SetEnv keep the pointer. */
  static char value[32];

  sprintf (value, "%d", max_numeric);
  SetEnv ("maxnumeric", value, NULL);
}
247
248/* -a, -s args to mg_passes */
249void set_stem_options(char * stemmer, int method) {
250 stemmer_num = stemmernumber (stemmer);
251 stem_method = method & STEMMER_MASK;
252}
253
254/* -t arg to mg_passes */
255void set_trace_point(int tracepos) {
256 trace = (unsigned long) (tracepos * 1024 * 1024);
257}
258
259/* -n arg to mg_passes */
260void set_trace_file(char * filen) {
261 int len = strlen(filen);
262 if (trace_name) {
263 Xfree (trace_name);
264 trace_name = NULL;
265 }
266 trace_name = Xstrdup (filen);
267}
268
269/* ############################################### */
270/* The old driver method has been split into 3:
271init_driver, process_document (called numdocs times),
272finalise_driver.
273The above set vars methods should all be called before init_driver.
274*/
275
276
277ProgTime StartTime, InitTime, ProcTime, DoneTime;
278
279void
280init_driver ()
281{
282 int pass;
283 if (!filename || *filename == '\0')
284 FatalError (1, "A document collection name must be specified.");
285
286 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2))
287 FatalError (1, "I1 and I2 cannot be done simultaneously.");
288
289 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2))
290 FatalError (1, "T1 and T2 cannot be done simultaneously.");
291
292 if (!Passes)
293 FatalError (1, "S, T1, T2, I1 or I2 must be specified.");
294
295 if (trace)
296 {
297 if (!trace_name)
298 trace_name = make_name (filename, TRACE_SUFFIX, NULL);
299 if (!(Trace = fopen (trace_name, "a")))
300 Message ("Unable to open \"%s\". No tracing will be done.", trace_name);
301 else
302 setbuf (Trace, NULL);
303 }
304 else
305 Trace = NULL;
306
307 if (comp_stat_point)
308 {
309 char *name = make_name (filename, COMPRESSION_STATS_SUFFIX, NULL);
310 if (!(Comp_Stats = fopen (name, "wb"))) /* [RPAP - Feb 97: WIN32 Port] */
311 Message ("Unable to open \"%s\". No comp. stats. will be generated.",
312 name);
313 }
314
315 if (Trace)
316 {
317 int i;
318 fprintf (Trace, "\n\n\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
319 /* print out the args to mg_passes */
320 fprintf (Trace, "\n\n");
321 }
322
323
324 GetTime (&StartTime);
325
326 for (pass = 0; pass < MAX_PASSES; pass++) {
327 if (Passes & (1 << pass)) {
328 pass_data *pd = &PassData[pass];
329#ifdef HAVE_TIMES
330 struct tms tims;
331 times (&tims);
332 pd->init_time -= tims.tms_utime + tims.tms_stime;
333#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
334 struct rusage ru;
335
336 getrusage (RUSAGE_SELF, &ru);
337 pd->init_time.tv_sec -= ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
338 pd->init_time.tv_usec -= ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
339#endif
340 if (pd->init (filename) == COMPERROR)
341 FatalError (1, "Error during init of \"%s\"", pd->name);
342
343#ifdef HAVE_TIMES
344 times (&tims);
345 pd->init_time += tims.tms_utime + tims.tms_stime;
346#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
347 getrusage (RUSAGE_SELF, &ru);
348 pd->init_time.tv_sec += ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
349 pd->init_time.tv_usec += ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
350 time_normalise (&pd->init_time);
351#endif
352 }
353 }
354 GetTime (&InitTime);
355}
356
357
358void process_document(u_char *buffer, int len) {
359 int pass;
360 bytes_processed += len;
361
362#ifndef QUIET
363 if (!len)
364 Message ("Warning : Processing zero length document");
365#endif
366
367 for (pass = 0; pass < MAX_PASSES; pass++) {
368 if (Passes & (1 << pass))
369 {
370 register pass_data *pd = &PassData[pass];
371
372#ifdef HAVE_TIMES
373 struct tms tims;
374 times (&tims);
375 pd->process_time -= tims.tms_utime + tims.tms_stime;
376#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
377 struct rusage ru;
378 register struct timeval *tv = &pd->process_time;
379
380 getrusage (RUSAGE_SELF, &ru);
381 tv->tv_sec -= ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
382 tv->tv_usec -= ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
383#endif
384 if (pd->process ((u_char *) buffer, len) == COMPERROR)
385 {
386 Message ("Error during processing of \"%s\"", pd->name);
387 if (Dump || Trace)
388 {
389 int i;
390 FILE *f = Trace ? Trace : stderr;
391 fprintf (f, "-=- * -=- * -=- * -=- * -=- * -=- * -=-\n");
392 for (i = 0; i < len; i++)
393 {
394 char ch = buffer[i];
395 if (ch == '\1' || ch == '\2')
396 ch = '\n';
397 putc (ch, f);
398 }
399 fprintf (f, "-=- * -=- * -=- * -=- * -=- * -=- * -=-\n");
400 }
401 if (Trace)
402 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs | %s\n",
403 bytes_processed, num_docs,
404 ElapsedTime (&StartTime, NULL));
405 exit (1);
406 }
407#ifdef HAVE_TIMES
408 times (&tims);
409 pd->process_time += tims.tms_utime + tims.tms_stime;
410#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
411 getrusage (RUSAGE_SELF, &ru);
412 tv->tv_sec += ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
413 tv->tv_usec += ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
414#endif
415 }
416 }
417 num_docs++;
418 if (Trace)
419 {
420 block_bytes += len;
421 if (block_bytes >= trace)
422 {
423#ifdef HAVE_MALLINFO
424 struct mallinfo mi;
425 mi = mallinfo ();
426 block_bytes -= trace;
427 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs |%7.3f Mb | %s\n",
428 bytes_processed, num_docs, mi.arena / 1024.0 / 1024.0,
429 ElapsedTime (&StartTime, NULL));
430#else
431 block_bytes -= trace;
432 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs | %s\n",
433 bytes_processed, num_docs,
434 ElapsedTime (&StartTime, NULL));
435#endif
436 }
437 }
438}
439
440void finalise_driver() {
441 int pass;
442#ifndef HAVE_TIMES
443 for (pass = 0; pass < MAX_PASSES; pass++)
444 if (Passes & (1 << pass))
445 time_normalise (&PassData[pass].process_time);
446#endif
447
448 GetTime (&ProcTime);
449
450 for (pass = 0; pass < MAX_PASSES; pass++)
451 if (Passes & (1 << pass))
452 {
453 pass_data *pd = &PassData[pass];
454#ifdef HAVE_TIMES
455 struct tms tims;
456 times (&tims);
457 pd->done_time -= tims.tms_utime + tims.tms_stime;
458#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
459 struct rusage ru;
460
461 getrusage (RUSAGE_SELF, &ru);
462 pd->done_time.tv_sec -= ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
463 pd->done_time.tv_usec -= ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
464#endif
465 if (pd->done (filename) == COMPERROR)
466 FatalError (1, "Error during done of \"%s\"", pd->name);
467
468#ifdef HAVE_TIMES
469 times (&tims);
470 pd->done_time += tims.tms_utime + tims.tms_stime;
471#elif defined(HAVE_GETRUSAGE) /* [RPAP - Feb 97: WIN32 Port] */
472 getrusage (RUSAGE_SELF, &ru);
473 pd->done_time.tv_sec += ru.ru_utime.tv_sec + ru.ru_stime.tv_sec;
474 pd->done_time.tv_usec += ru.ru_utime.tv_usec + ru.ru_stime.tv_usec;
475 time_normalise (&pd->done_time);
476#endif
477 }
478 if (Trace)
479 {
480#ifdef HAVE_MALLINFO
481 struct mallinfo mi;
482 mi = mallinfo ();
483 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs |%7.3f Mb | %s\n",
484 bytes_processed, num_docs, mi.arena / 1024.0 / 1024.0,
485 ElapsedTime (&StartTime, NULL));
486#else
487 fprintf (Trace, "%11" ULL_FS " bytes |%7lu docs | %s\n",
488 bytes_processed, num_docs,
489 ElapsedTime (&StartTime, NULL));
490#endif
491 }
492
493 GetTime (&DoneTime);
494
495 Message ("");
496 Message ("%10s : init process done", "");
497 for (pass = 0; pass < MAX_PASSES; pass++)
498 if (Passes & (1 << pass))
499 {
500 pass_data *pd = &PassData[pass];
501 char it[15], pt[15], dt[15];
502#ifdef HAVE_TIMES
503 strcpy (it, cputime_string (pd->init_time));
504 strcpy (pt, cputime_string (pd->process_time));
505 strcpy (dt, cputime_string (pd->done_time));
506#else
507 strcpy (it, cputime_string (&pd->init_time));
508 strcpy (pt, cputime_string (&pd->process_time));
509 strcpy (dt, cputime_string (&pd->done_time));
510#endif
511 Message ("%-10s : %s %s %s", pd->name, it, pt, dt);
512 }
513 Message ("");
514 Message ("Init time : %s", ElapsedTime (&StartTime, &InitTime));
515 Message ("Process time : %s", ElapsedTime (&InitTime, &ProcTime));
516 Message ("Done time : %s", ElapsedTime (&ProcTime, &DoneTime));
517 Message ("Total time : %s", ElapsedTime (&StartTime, &DoneTime));
518 Message ("Documents : %u", num_docs);
519 Message ("Bytes received : %" ULL_FS, bytes_received);
520 Message ("Bytes processed : %" ULL_FS, bytes_processed);
521 Message ("Process Rate : %.1f kB per cpu second",
522 (double) bytes_processed / (ProcTime.CPUTime - InitTime.CPUTime) / 1024);
523 //free (buffer);
524
525 if (Trace)
526 fclose (Trace);
527
528 if (Comp_Stats)
529 fclose (Comp_Stats);
530
531}
532
Note: See TracBrowser for help on using the repository browser.