source: trunk/indexers/mg/src/text/MGPASS.C@ 3745

Last change on this file since 3745 was 3745, checked in by mdewsnip, 19 years ago

Addition of MG package for search and retrieval

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 13.1 KB
Line 
1/**************************************************************************
2 *
3 * mgpass.cpp -- Driver for the various passes -
4
5 V1 - removed all the pipe processing and replaced with
6 code to directly explore a directory of files.
7 V2 - rebuilt to extract non text files from web browser
8 'catch' file. Also to display progress count
9 GH/WJR
10
11 * Copyright (C) 1994 Neil Sharman, ..
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26 *
27 * $Id: MGPASS.C 3745 2003-02-20 21:20:24Z mdewsnip $
28 *
29 **************************************************************************/
30
31#include "sysfuncs.h"
32
33#include <malloc.h>
34#include <stdlib.h>
35#include <stdio.h>
36#include <io.h>
37#include <fcntl.h>
38#include <string.h>
39
40#include "memlib.h"
41#include "messages.h"
42
43#include "longlong.h"
44
45#include "mg_files.h"
46#include "mg.h"
47#include "build.h"
48#include "text.h"
49#include "stemmer.h"
50
51/*
52 $Log$
53 Revision 1.1 2003/02/20 21:18:23 mdewsnip
54 Addition of MG package for search and retrieval
55
56 Revision 1.2 2001/09/20 02:32:00 cs025
57 a few fixes for compiling under windows with new long long stuff in mg
58
59 Revision 1.1 1999/08/10 21:17:39 sjboddie
60 renamed mg-1.3d directory mg
61
62 Revision 1.5 1998/12/02 01:32:40 sjboddie
63 fixed 1 missing semi-colon..
64
65 Revision 1.4 1998/12/02 00:57:10 rjmcnab
66
67 Fixed to compile with the changes to the stemming.
68
69 Revision 1.3 1998/11/24 03:39:23 rjmcnab
70
71 Fixed a small error in the windows compilation
72
73 Revision 1.2 1998/11/24 01:29:39 rjmcnab
74
75 Fixed a few problems with the windows build
76
77 Revision 1.1 1998/11/17 09:34:12 rjmcnab
78 *** empty log message ***
79
80 * Revision 1.3 1994/10/20 03:56:57 tes
81 * I have rewritten the boolean query optimiser and abstracted out the
82 * components of the boolean query.
83 *
84 * Revision 1.2 1994/09/20 04:41:52 tes
85 * For version 1.1
86 *
87 */
88
/* Revision-control identification string embedded in the object file. */
static char *RCSID = "$Id: MGPASS.C 3745 2003-02-20 21:20:24Z mdewsnip $";

/* Number of entries in the PassData table. */
#define MAX_PASSES 5

/*
 * Pass selection flags, OR-ed together into `Passes'.  Bit N selects
 * PassData[N], so the order here must match the order of that table.
 */
#define SPECIAL     (1 << 0)
#define TEXT_PASS_1 (1 << 1)
#define TEXT_PASS_2 (1 << 2)
#define IVF_PASS_1  (1 << 3)
#define IVF_PASS_2  (1 << 4)

/* Smallest usable document buffer, and the least free space refill() accepts. */
#define MIN_BUF 8192
/* Size in bytes of every fixed path / url buffer in this file. */
#define path_length 256
101
102unsigned long buf_size = 3 * 1024 * 1024; /* 3Mb */
103unsigned long invf_buffer_size = 5 * 1024 * 1024; /* 5Mb */
104unsigned long ChunkLimit = 0;
105char InvfLevel = 2;
106char SkipSGML = 0;
107char MakeWeights = 0;
108FILE *Comp_Stats = NULL;
109int comp_stat_point = 0;
110mg_ullong bytes_processed = 0;
111int num_docs = 0;
112mg_ullong bytes_received = 0;
113int stem_method = 0;
114int stemmer_num = 0; /* default to the Lovin stemmer */
115
116static char Passes = 0;
117static unsigned long trace = 0;
118static int Dump = 0;
119static char **files = NULL;
120static int num_files = 0;
121static char *trace_name = NULL;
122
123static char dirname[path_length], wildname[path_length];
124static int by_para = 0, recurse = 0, html_catch = 0;
125static char *buffer;
126char *line_start, *data_end, *base, *scan;
127
128typedef struct pass_data
129 {
130 char *name;
131 int (*init) (char *);
132 int (*process) (u_char *, int);
133 int (*done) (char *);
134 }
135pass_data;
136
137static pass_data PassData[MAX_PASSES] =
138{
139 {"special", init_special, process_special, done_special},
140 {"text.pass1", init_text_1, process_text_1, done_text_1},
141 {"text.pass2", init_text_2, process_text_2, done_text_2},
142 {"ivf.pass1", init_ivf_1, process_ivf_1, done_ivf_1},
143 {"ivf.pass2", init_ivf_2, process_ivf_2, done_ivf_2},
144};
145
146static char *usage_str = "\nUSAGE:\n"
147" %s [-h] [-G] [-D] [-1|-2|-3] [-T1] [-T2] [-I1] [-I2] [-N1]\n"
148" %*s [-N2] [-W] [-S] [-b buffer-size] [-d dictionary-directory]\n"
149" %*s [-t trace-point Mb] [-m invf-memory] [-c chunk-limit]\n"
150" %*s [-n trace-name] [-C comp-stat-size] [-s stem_method] -f doc-collection-name\n"
151" %*s [source directory\\] [source file]\n";
152
153static void usage (char *err)
154{
155 if (err) Message (err);
156 fprintf (stderr, usage_str, msg_prefix, strlen (msg_prefix), "",
157 strlen (msg_prefix), "", strlen (msg_prefix), "");
158 exit (1);
159}
160
161void do_process(char *buffer, int num)
162{
163 int pass;
164 if (num == 0) Message ("Warning : Processing zero length document");
165 num_docs++;
166 bytes_processed += num;
167 for (pass = 0; pass < MAX_PASSES; pass++)
168 if (Passes & (1 << pass))
169 if (PassData[pass].process((u_char *)buffer, num) == COMPERROR)
170 FatalError(1, "Error during file processing");
171}
172
173int refill(int in_fd)
174{
175 int num, bitleft;
176 bitleft = data_end - base;
177 memmove(buffer, base, bitleft);
178 if (buf_size - bitleft < MIN_BUF)
179 FatalError(1, "Paragraph too big for buffer");
180 num = read(in_fd, &buffer[bitleft], buf_size - bitleft);
181 line_start -= (base - buffer);
182 scan -= (base - buffer);
183 base = buffer;
184 data_end = buffer + bitleft + num;
185 if (num > 0) return 1; else return 0;
186}
187
188char *scanpara(int in_fd)
189{
190 int num, blank, in_blank, at_end;
191 num = read(in_fd,buffer,buf_size);
192 in_blank = 1; base = buffer; line_start = buffer; scan = buffer;
193 data_end = buffer + num; at_end = 0;
194 for (;;) {
195 blank = 1; line_start = scan; // Get a line
196 while (scan < data_end && *scan != '\n')
197 if (*scan++ > ' ') blank = 0;
198 if (scan >= data_end) {
199 if (refill(in_fd)) {
200 while (scan < data_end && *scan != '\n')
201 if (*scan++ > ' ') blank = 0;
202 if (scan < data_end) scan++;
203 }
204 else at_end = 1;
205 }
206 else scan++;
207 if (line_start < scan) { // If we have a line
208 if (in_blank) {
209 if (!blank) { in_blank = 0; base = line_start; }
210 }
211 else {
212 if (blank) {
213 do_process(base, line_start - base);
214 in_blank = 1; base = line_start;
215 }
216 }
217 }
218 if (at_end) break;
219 }
220 if (in_blank) base = scan;
221 if (scan + 2 <= buffer + buf_size) {
222 *scan++ = 26; *scan++ = 10;
223 }
224 if (base < scan) do_process(base, scan - base);
225 return NULL;
226}
227
228char *scanfile(int in_fd)
229{
230 int num = read(in_fd,buffer,buf_size); /*expect to read the whole file*/
231 if (num < 0) return "file locked";
232 if (num >= buf_size-1) return "file too long";
233 do_process(buffer, num);
234 return NULL;
235}
236
237void search(char *dname, char *fname)
238{
239 long dirtag; struct _finddata_t dirinfo; int in_fd; char *res;
240 char search_name[path_length], found_name[path_length];
241
242 sprintf(search_name, "%s%s", dname, fname); /*Scan files*/
243 dirtag = _findfirst(search_name, &dirinfo);
244 if (dirtag >= 0) {
245 do {
246 if ((dirinfo.attrib & (_A_SUBDIR | _A_HIDDEN | _A_SYSTEM)) == 0) {
247 sprintf(found_name,"%s%s",dname,dirinfo.name);
248 in_fd = open(found_name,O_RDONLY|O_BINARY);
249 if (in_fd >= 0) {
250 if (by_para)
251 res = scanpara(in_fd);
252 else
253 res = scanfile(in_fd);
254 if (res != NULL) {
255 Message("Error %s in processing file %s\n", res, found_name);
256 exit(1);
257 }
258 close(in_fd);
259 }
260 }
261 } while (_findnext(dirtag, &dirinfo) == 0);
262 _findclose(dirtag);
263 }
264
265 if (recurse == 0) return;
266
267 sprintf(search_name, "%s*.*", dname); /*Look for subdirs*/
268 dirtag = _findfirst(search_name, &dirinfo);
269 if (dirtag < 0) return;
270 do {
271 if ( ((dirinfo.attrib & (_A_HIDDEN | _A_SYSTEM)) == 0) &&
272 ((dirinfo.attrib & _A_SUBDIR) != 0) &&
273 strcmp(dirinfo.name,".") != 0 &&
274 strcmp(dirinfo.name,"..") != 0) {
275 sprintf(found_name,"%s%s",dname,dirinfo.name);
276 strcat(found_name,"\\");
277 search(found_name,fname);
278 }
279 } while (_findnext(dirtag, &dirinfo) == 0);
280 _findclose(dirtag);
281}
282
283static int toobig(int n)
284{
285 if (n > path_length) {
286 printf("Cannot handle urls > %d characters in length\n", path_length);
287 exit(1);
288 }
289 return 0;
290}
291
292void scan_catch(char *dname, char *fname)
293{
294 int in_fd, urllen, conlen, filesize; char catch_name[path_length];
295 int filecount = 0;
296 enum { filekind_redirected, filekind_text, filekind_other } filekind;
297 sprintf(catch_name, "%s%s", dname, fname);
298 in_fd = open(catch_name,O_RDONLY|O_BINARY);
299 if (in_fd < 0)
300 FatalError(1, "Couldn't open catch file \"%s\"", catch_name);
301 for (;;) {
302 filecount++; if (filecount%100 == 0) { printf("%d\r", filecount); fflush(stdout); }
303 if (read(in_fd, &urllen, sizeof(int)) != sizeof(int) || toobig(urllen) ||
304 read(in_fd, catch_name, urllen) != urllen ||
305 read(in_fd, &conlen, sizeof(int)) != sizeof(int) || toobig(conlen) ||
306 read(in_fd, catch_name, conlen) != conlen) break;
307 if (conlen >= 1 && *catch_name == '@')
308 filekind = filekind_redirected;
309 else {
310 if (conlen >= 4 && strncmp(catch_name, "text", 4) == 0)
311 filekind = filekind_text;
312 else
313 filekind = filekind_other;
314 if (read(in_fd, &filesize, sizeof(int)) != sizeof(int))
315 FatalError(1, "File read failed for size field");
316 if (filesize > buf_size)
317 FatalError(1, "File too large (%d > %d)", filesize, buf_size);
318 if (read(in_fd, buffer, filesize) != filesize)
319 FatalError(1, "Failed to read file data");
320 }
321 if (filekind == filekind_text) do_process(buffer, filesize);
322 }
323 close(in_fd);
324}
325
326static void driver (FILE * Trace, char *file_name)
327{
328 int pass;
329
330 buffer = (char *)Xmalloc (buf_size);
331
332 for (pass = 0; pass < MAX_PASSES; pass++)
333 if (Passes & (1 << pass))
334 {
335 if (PassData[pass].init (file_name) == COMPERROR)
336 FatalError (1, "Error during init of \"%s\"",PassData[pass].name);
337 }
338
339 if (html_catch == 0) search(dirname, wildname);
340 else scan_catch(dirname, wildname);
341
342 for (pass = 0; pass < MAX_PASSES; pass++)
343 if (Passes & (1 << pass))
344 {
345 if (PassData[pass].done (file_name) == COMPERROR)
346 FatalError (1, "Error during done of \"%s\"", PassData[pass].name);
347 }
348
349
350 free (buffer);
351}
352
353void main (int argc, char **argv)
354{
355 int ch;
356 char *filename = NULL;
357 FILE *Trace = NULL;
358
359 msg_prefix = argv[0];
360
361 opterr = 0;
362 while ((ch = getopt (argc, argv, "hC:WHGpSD123f:d:b:T:I:t:m:N:c:n:s:")) != -1)
363 {
364 switch (ch)
365 {
366 case 'H':
367 html_catch = 1;
368 break;
369 case 'G':
370 SkipSGML = 1;
371 break;
372 case 'p':
373 by_para = 1;
374 break;
375 case 'S':
376 Passes |= SPECIAL;
377 break;
378 case '1':
379 InvfLevel = 1;
380 break;
381 case '2':
382 InvfLevel = 2;
383 break;
384 case '3':
385 InvfLevel = 3;
386 break;
387 case 'f':
388 filename = optarg;
389 break;
390 case 'n':
391 trace_name = optarg;
392 break;
393 case 'D':
394 Dump = 1;
395 break;
396 case 'W':
397 MakeWeights = 1;
398 break;
399 case 'd':
400 set_basepath (optarg);
401 break;
402 case 's':
403 stem_method = atoi (optarg) & STEMMER_MASK;
404 break;
405 case 'b':
406 buf_size = atoi (optarg) * 1024;
407 break;
408 case 'C':
409 comp_stat_point = atoi (optarg) * 1024;
410 break;
411 case 'c':
412 ChunkLimit = atoi (optarg);
413 break;
414 case 'm':
415 invf_buffer_size = (int) (atof (optarg) * 1024 * 1024);
416 break;
417 case 'I':
418 case 'N': /* N kept for compatability */
419 if (*optarg == '1')
420 Passes |= IVF_PASS_1;
421 else if (*optarg == '2')
422 Passes |= IVF_PASS_2;
423 else
424 usage ("Invalid pass number");
425 break;
426 case 'T':
427 if (*optarg == '1')
428 Passes |= TEXT_PASS_1;
429 else if (*optarg == '2')
430 Passes |= TEXT_PASS_2;
431 else
432 usage ("Invalid pass number");
433 break;
434 case 't':
435 trace = (unsigned long) (atof (optarg) * 1024 * 1024);
436 break;
437 case 'h':
438 case '?':
439 usage (NULL);
440 }
441 }
442
443 if (!filename || *filename == '\0')
444 FatalError (1, "A document collection name must be specified.");
445
446 if (buf_size < MIN_BUF)
447 FatalError (1, "The buffer size must exceed 1024 bytes.");
448
449 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2))
450 FatalError (1, "I1 and I2 cannot be done simultaneously.");
451
452 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2))
453 FatalError (1, "T1 and T2 cannot be done simultaneously.");
454
455 if (!Passes)
456 FatalError (1, "S, T1, T2, I1 or I2 must be specified.");
457
458 if (argc - optind == 1) {
459 strcpy(dirname,"");
460 strcpy(wildname,argv[optind]);
461 }
462 else if (argc - optind == 2) {
463 strcpy(dirname,argv[optind]);
464 strcpy(wildname,argv[optind+1]);
465 }
466 else FatalError(1, "Finder code requires directory and filespec.");
467
468 if (strrchr(wildname,'*') != NULL || strrchr(wildname,'?') != NULL)
469 recurse = 1;
470
471 if (trace)
472 {
473 if (!trace_name)
474 trace_name = make_name (filename, TRACE_SUFFIX, NULL);
475 if (!(Trace = fopen (trace_name, "a")))
476 Message ("Unable to open \"%s\". No tracing will be done.", trace_name);
477 else
478 setbuf (Trace, NULL);
479 }
480 else
481 Trace = NULL;
482
483 if (comp_stat_point)
484 {
485 char *name = make_name (filename, COMPRESSION_STATS_SUFFIX, NULL);
486 if (!(Comp_Stats = fopen (name, "wb")))
487 Message ("Unable to open \"%s\". No comp. stats. will be generated.",
488 name);
489 }
490
491
492 if (Trace)
493 {
494 int i;
495 fprintf (Trace, "\n\n\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
496 for (i = 0; i < argc; i++)
497 fprintf (Trace, "%s ", argv[i]);
498 fprintf (Trace, "\n\n");
499 }
500
501 driver (Trace, filename);
502
503 if (Trace)
504 fclose (Trace);
505
506 if (Comp_Stats)
507 fclose (Comp_Stats);
508
509 exit (0);
510}
Note: See TracBrowser for help on using the repository browser.