1 | /**************************************************************************
|
---|
2 | *
|
---|
3 | * mgpass.cpp -- Driver for the various passes -
|
---|
4 |
|
---|
5 | V1 - removed all the pipe processing and replaced with
|
---|
6 | code to directly explore a directory of files.
|
---|
7 | V2 - rebuilt to extract non text files from web browser
|
---|
8 | 'catch' file. Also to display progress count
|
---|
9 | GH/WJR
|
---|
10 |
|
---|
11 | * Copyright (C) 1994 Neil Sharman, ..
|
---|
12 | *
|
---|
13 | * This program is free software; you can redistribute it and/or modify
|
---|
14 | * it under the terms of the GNU General Public License as published by
|
---|
15 | * the Free Software Foundation; either version 2 of the License, or
|
---|
16 | * (at your option) any later version.
|
---|
17 | *
|
---|
18 | * This program is distributed in the hope that it will be useful,
|
---|
19 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
20 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
21 | * GNU General Public License for more details.
|
---|
22 | *
|
---|
23 | * You should have received a copy of the GNU General Public License
|
---|
24 | * along with this program; if not, write to the Free Software
|
---|
25 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
26 | *
|
---|
27 | * $Id: MGPASS.C 28 1998-11-24 03:39:23Z rjmcnab $
|
---|
28 | *
|
---|
29 | **************************************************************************/
|
---|
30 |
|
---|
31 | #include "sysfuncs.h"
|
---|
32 |
|
---|
33 | #include <malloc.h>
|
---|
34 | #include <stdlib.h>
|
---|
35 | #include <stdio.h>
|
---|
36 | #include <io.h>
|
---|
37 | #include <fcntl.h>
|
---|
38 | #include <string.h>
|
---|
39 |
|
---|
40 | #include "memlib.h"
|
---|
41 | #include "messages.h"
|
---|
42 |
|
---|
43 | #include "mg_files.h"
|
---|
44 | #include "mg.h"
|
---|
45 | #include "build.h"
|
---|
46 | #include "text.h"
|
---|
47 | #include "stemmer.h"
|
---|
48 |
|
---|
49 | /*
|
---|
50 | $Log$
|
---|
51 | Revision 1.3 1998/11/24 03:39:23 rjmcnab
|
---|
52 |
|
---|
53 | Fixed a small error in the windows compilation
|
---|
54 |
|
---|
55 | Revision 1.2 1998/11/24 01:29:39 rjmcnab
|
---|
56 |
|
---|
57 | Fixed a few problems with the windows build
|
---|
58 |
|
---|
59 | Revision 1.1 1998/11/17 09:34:12 rjmcnab
|
---|
60 | *** empty log message ***
|
---|
61 |
|
---|
62 | * Revision 1.3 1994/10/20 03:56:57 tes
|
---|
63 | * I have rewritten the boolean query optimiser and abstracted out the
|
---|
64 | * components of the boolean query.
|
---|
65 | *
|
---|
66 | * Revision 1.2 1994/09/20 04:41:52 tes
|
---|
67 | * For version 1.1
|
---|
68 | *
|
---|
69 | */
|
---|
70 |
|
---|
/* RCS identification string for this file. */
static char *RCSID = "$Id: MGPASS.C 28 1998-11-24 03:39:23Z rjmcnab $";

/* Number of entries in the PassData table. */
#define MAX_PASSES 5

/*
 * Bit flags kept in Passes.  The drivers test (1 << i) to decide whether
 * PassData[i] is enabled, so each value below is the bit for the table
 * entry at the matching index.
 */
#define SPECIAL 1
#define TEXT_PASS_1 2
#define TEXT_PASS_2 4
#define IVF_PASS_1 8
#define IVF_PASS_2 16

/* Smallest usable document buffer, and smallest read refill() will attempt. */
#define MIN_BUF 8192
/* Size of every fixed path / name buffer in this file. */
#define path_length 256

/* Size in bytes of the document buffer allocated by driver() (-b, given in Kb). */
unsigned long buf_size = 3 * 1024 * 1024;	/* 3Mb */
/* Set from -m (given in Mb); not otherwise used in this file. */
unsigned long invf_buffer_size = 5 * 1024 * 1024;	/* 5Mb */
/* Set from -c. */
unsigned long ChunkLimit = 0;
/* Set to 1, 2 or 3 by the -1 / -2 / -3 options. */
char InvfLevel = 2;
/* Set by -G. */
char SkipSGML = 0;
/* Set by -W. */
char MakeWeights = 0;
/* Compression statistics stream; opened in main() when comp_stat_point != 0. */
FILE *Comp_Stats = NULL;
/* Set from -C (argument multiplied by 1024). */
int comp_stat_point = 0;
/* Running total of the byte counts handed to do_process(). */
double bytes_processed = 0;
/* Number of documents handed to do_process(). */
int num_docs = 0;
/* Not referenced in this file. */
double bytes_received = 0;
/* Set from -s, masked with STEMMER_MASK. */
int stem_method = 0;

/* Bit set of enabled passes (SPECIAL, TEXT_PASS_n, IVF_PASS_n). */
static char Passes = 0;
/* Trace point in bytes (-t, given in Mb); non-zero enables the trace file. */
static unsigned long trace = 0;
/* Set by -D. */
static int Dump = 0;
/* Not referenced in this file. */
static char **files = NULL;
/* Not referenced in this file. */
static int num_files = 0;
/* Trace file name (-n); derived from the collection name when left NULL. */
static char *trace_name = NULL;

/* Source directory prefix taken from the command line (may be empty). */
static char dirname[path_length];
/* Source file name or wildcard pattern taken from the command line. */
static char wildname[path_length];
/* Set by -p: split each file into blank-line separated paragraphs. */
static int by_para = 0;
/* Set when wildname contains '*' or '?': descend into sub-directories. */
static int recurse = 0;
/* Set by -H: the source is a single catch file rather than a directory. */
static int html_catch = 0;
/* Document buffer of buf_size bytes, allocated and freed by driver(). */
static char *buffer;
/* Cursors into buffer shared by scanpara() and refill(). */
char *line_start;
char *data_end;
char *base;
char *scan;
|
---|
108 |
|
---|
109 | typedef struct pass_data
|
---|
110 | {
|
---|
111 | char *name;
|
---|
112 | int (*init) (char *);
|
---|
113 | int (*process) (u_char *, int);
|
---|
114 | int (*done) (char *);
|
---|
115 | }
|
---|
116 | pass_data;
|
---|
117 |
|
---|
118 | static pass_data PassData[MAX_PASSES] =
|
---|
119 | {
|
---|
120 | {"special", init_special, process_special, done_special},
|
---|
121 | {"text.pass1", init_text_1, process_text_1, done_text_1},
|
---|
122 | {"text.pass2", init_text_2, process_text_2, done_text_2},
|
---|
123 | {"ivf.pass1", init_ivf_1, process_ivf_1, done_ivf_1},
|
---|
124 | {"ivf.pass2", init_ivf_2, process_ivf_2, done_ivf_2},
|
---|
125 | };
|
---|
126 |
|
---|
127 | static char *usage_str = "\nUSAGE:\n"
|
---|
128 | " %s [-h] [-G] [-D] [-1|-2|-3] [-T1] [-T2] [-I1] [-I2] [-N1]\n"
|
---|
129 | " %*s [-N2] [-W] [-S] [-b buffer-size] [-d dictionary-directory]\n"
|
---|
130 | " %*s [-t trace-point Mb] [-m invf-memory] [-c chunk-limit]\n"
|
---|
131 | " %*s [-n trace-name] [-C comp-stat-size] [-s stem_method] -f doc-collection-name\n"
|
---|
132 | " %*s [source directory\\] [source file]\n";
|
---|
133 |
|
---|
134 | static void usage (char *err)
|
---|
135 | {
|
---|
136 | if (err) Message (err);
|
---|
137 | fprintf (stderr, usage_str, msg_prefix, strlen (msg_prefix), "",
|
---|
138 | strlen (msg_prefix), "", strlen (msg_prefix), "");
|
---|
139 | exit (1);
|
---|
140 | }
|
---|
141 |
|
---|
142 | void do_process(char *buffer, int num)
|
---|
143 | {
|
---|
144 | int pass;
|
---|
145 | if (num == 0) Message ("Warning : Processing zero length document");
|
---|
146 | num_docs++;
|
---|
147 | bytes_processed += num;
|
---|
148 | for (pass = 0; pass < MAX_PASSES; pass++)
|
---|
149 | if (Passes & (1 << pass))
|
---|
150 | if (PassData[pass].process((u_char *)buffer, num) == COMPERROR)
|
---|
151 | FatalError(1, "Error during file processing");
|
---|
152 | }
|
---|
153 |
|
---|
154 | int refill(int in_fd)
|
---|
155 | {
|
---|
156 | int num, bitleft;
|
---|
157 | bitleft = data_end - base;
|
---|
158 | memmove(buffer, base, bitleft);
|
---|
159 | if (buf_size - bitleft < MIN_BUF)
|
---|
160 | FatalError(1, "Paragraph too big for buffer");
|
---|
161 | num = read(in_fd, &buffer[bitleft], buf_size - bitleft);
|
---|
162 | line_start -= (base - buffer);
|
---|
163 | scan -= (base - buffer);
|
---|
164 | base = buffer;
|
---|
165 | data_end = buffer + bitleft + num;
|
---|
166 | if (num > 0) return 1; else return 0;
|
---|
167 | }
|
---|
168 |
|
---|
169 | char *scanpara(int in_fd)
|
---|
170 | {
|
---|
171 | int num, blank, in_blank, at_end;
|
---|
172 | num = read(in_fd,buffer,buf_size);
|
---|
173 | in_blank = 1; base = buffer; line_start = buffer; scan = buffer;
|
---|
174 | data_end = buffer + num; at_end = 0;
|
---|
175 | for (;;) {
|
---|
176 | blank = 1; line_start = scan; // Get a line
|
---|
177 | while (scan < data_end && *scan != '\n')
|
---|
178 | if (*scan++ > ' ') blank = 0;
|
---|
179 | if (scan >= data_end) {
|
---|
180 | if (refill(in_fd)) {
|
---|
181 | while (scan < data_end && *scan != '\n')
|
---|
182 | if (*scan++ > ' ') blank = 0;
|
---|
183 | if (scan < data_end) scan++;
|
---|
184 | }
|
---|
185 | else at_end = 1;
|
---|
186 | }
|
---|
187 | else scan++;
|
---|
188 | if (line_start < scan) { // If we have a line
|
---|
189 | if (in_blank) {
|
---|
190 | if (!blank) { in_blank = 0; base = line_start; }
|
---|
191 | }
|
---|
192 | else {
|
---|
193 | if (blank) {
|
---|
194 | do_process(base, line_start - base);
|
---|
195 | in_blank = 1; base = line_start;
|
---|
196 | }
|
---|
197 | }
|
---|
198 | }
|
---|
199 | if (at_end) break;
|
---|
200 | }
|
---|
201 | if (in_blank) base = scan;
|
---|
202 | if (scan + 2 <= buffer + buf_size) {
|
---|
203 | *scan++ = 26; *scan++ = 10;
|
---|
204 | }
|
---|
205 | if (base < scan) do_process(base, scan - base);
|
---|
206 | return NULL;
|
---|
207 | }
|
---|
208 |
|
---|
209 | char *scanfile(int in_fd)
|
---|
210 | {
|
---|
211 | int num = read(in_fd,buffer,buf_size); /*expect to read the whole file*/
|
---|
212 | if (num < 0) return "file locked";
|
---|
213 | if (num >= buf_size-1) return "file too long";
|
---|
214 | do_process(buffer, num);
|
---|
215 | return NULL;
|
---|
216 | }
|
---|
217 |
|
---|
218 | void search(char *dname, char *fname)
|
---|
219 | {
|
---|
220 | long dirtag; struct _finddata_t dirinfo; int in_fd; char *res;
|
---|
221 | char search_name[path_length], found_name[path_length];
|
---|
222 |
|
---|
223 | sprintf(search_name, "%s%s", dname, fname); /*Scan files*/
|
---|
224 | dirtag = _findfirst(search_name, &dirinfo);
|
---|
225 | if (dirtag >= 0) {
|
---|
226 | do {
|
---|
227 | if ((dirinfo.attrib & (_A_SUBDIR | _A_HIDDEN | _A_SYSTEM)) == 0) {
|
---|
228 | sprintf(found_name,"%s%s",dname,dirinfo.name);
|
---|
229 | in_fd = open(found_name,O_RDONLY|O_BINARY);
|
---|
230 | if (in_fd >= 0) {
|
---|
231 | if (by_para)
|
---|
232 | res = scanpara(in_fd);
|
---|
233 | else
|
---|
234 | res = scanfile(in_fd);
|
---|
235 | if (res != NULL) {
|
---|
236 | Message("Error %s in processing file %s\n", res, found_name);
|
---|
237 | exit(1);
|
---|
238 | }
|
---|
239 | close(in_fd);
|
---|
240 | }
|
---|
241 | }
|
---|
242 | } while (_findnext(dirtag, &dirinfo) == 0);
|
---|
243 | _findclose(dirtag);
|
---|
244 | }
|
---|
245 |
|
---|
246 | if (recurse == 0) return;
|
---|
247 |
|
---|
248 | sprintf(search_name, "%s*.*", dname); /*Look for subdirs*/
|
---|
249 | dirtag = _findfirst(search_name, &dirinfo);
|
---|
250 | if (dirtag < 0) return;
|
---|
251 | do {
|
---|
252 | if ( ((dirinfo.attrib & (_A_HIDDEN | _A_SYSTEM)) == 0) &&
|
---|
253 | ((dirinfo.attrib & _A_SUBDIR) != 0) &&
|
---|
254 | strcmp(dirinfo.name,".") != 0 &&
|
---|
255 | strcmp(dirinfo.name,"..") != 0) {
|
---|
256 | sprintf(found_name,"%s%s",dname,dirinfo.name);
|
---|
257 | strcat(found_name,"\\");
|
---|
258 | search(found_name,fname);
|
---|
259 | }
|
---|
260 | } while (_findnext(dirtag, &dirinfo) == 0);
|
---|
261 | _findclose(dirtag);
|
---|
262 | }
|
---|
263 |
|
---|
264 | static int toobig(int n)
|
---|
265 | {
|
---|
266 | if (n > path_length) {
|
---|
267 | printf("Cannot handle urls > %d characters in length\n", path_length);
|
---|
268 | exit(1);
|
---|
269 | }
|
---|
270 | return 0;
|
---|
271 | }
|
---|
272 |
|
---|
273 | void scan_catch(char *dname, char *fname)
|
---|
274 | {
|
---|
275 | int in_fd, urllen, conlen, filesize; char catch_name[path_length];
|
---|
276 | int filecount = 0;
|
---|
277 | enum { filekind_redirected, filekind_text, filekind_other } filekind;
|
---|
278 | sprintf(catch_name, "%s%s", dname, fname);
|
---|
279 | in_fd = open(catch_name,O_RDONLY|O_BINARY);
|
---|
280 | if (in_fd < 0)
|
---|
281 | FatalError(1, "Couldn't open catch file \"%s\"", catch_name);
|
---|
282 | for (;;) {
|
---|
283 | filecount++; if (filecount%100 == 0) { printf("%d\r", filecount); fflush(stdout); }
|
---|
284 | if (read(in_fd, &urllen, sizeof(int)) != sizeof(int) || toobig(urllen) ||
|
---|
285 | read(in_fd, catch_name, urllen) != urllen ||
|
---|
286 | read(in_fd, &conlen, sizeof(int)) != sizeof(int) || toobig(conlen) ||
|
---|
287 | read(in_fd, catch_name, conlen) != conlen) break;
|
---|
288 | if (conlen >= 1 && *catch_name == '@')
|
---|
289 | filekind = filekind_redirected;
|
---|
290 | else {
|
---|
291 | if (conlen >= 4 && strncmp(catch_name, "text", 4) == 0)
|
---|
292 | filekind = filekind_text;
|
---|
293 | else
|
---|
294 | filekind = filekind_other;
|
---|
295 | if (read(in_fd, &filesize, sizeof(int)) != sizeof(int))
|
---|
296 | FatalError(1, "File read failed for size field");
|
---|
297 | if (filesize > buf_size)
|
---|
298 | FatalError(1, "File too large (%d > %d)", filesize, buf_size);
|
---|
299 | if (read(in_fd, buffer, filesize) != filesize)
|
---|
300 | FatalError(1, "Failed to read file data");
|
---|
301 | }
|
---|
302 | if (filekind == filekind_text) do_process(buffer, filesize);
|
---|
303 | }
|
---|
304 | close(in_fd);
|
---|
305 | }
|
---|
306 |
|
---|
307 | static void driver (FILE * Trace, char *file_name)
|
---|
308 | {
|
---|
309 | int pass;
|
---|
310 |
|
---|
311 | buffer = (char *)Xmalloc (buf_size);
|
---|
312 |
|
---|
313 | for (pass = 0; pass < MAX_PASSES; pass++)
|
---|
314 | if (Passes & (1 << pass))
|
---|
315 | {
|
---|
316 | if (PassData[pass].init (file_name) == COMPERROR)
|
---|
317 | FatalError (1, "Error during init of \"%s\"",PassData[pass].name);
|
---|
318 | }
|
---|
319 |
|
---|
320 | if (html_catch == 0) search(dirname, wildname);
|
---|
321 | else scan_catch(dirname, wildname);
|
---|
322 |
|
---|
323 | for (pass = 0; pass < MAX_PASSES; pass++)
|
---|
324 | if (Passes & (1 << pass))
|
---|
325 | {
|
---|
326 | if (PassData[pass].done (file_name) == COMPERROR)
|
---|
327 | FatalError (1, "Error during done of \"%s\"", PassData[pass].name);
|
---|
328 | }
|
---|
329 |
|
---|
330 |
|
---|
331 | free (buffer);
|
---|
332 | }
|
---|
333 |
|
---|
334 | void main (int argc, char **argv)
|
---|
335 | {
|
---|
336 | int ch;
|
---|
337 | char *filename = NULL;
|
---|
338 | FILE *Trace = NULL;
|
---|
339 |
|
---|
340 | msg_prefix = argv[0];
|
---|
341 |
|
---|
342 | opterr = 0;
|
---|
343 | while ((ch = getopt (argc, argv, "hC:WHGpSD123f:d:b:T:I:t:m:N:c:n:s:")) != -1)
|
---|
344 | {
|
---|
345 | switch (ch)
|
---|
346 | {
|
---|
347 | case 'H':
|
---|
348 | html_catch = 1;
|
---|
349 | break;
|
---|
350 | case 'G':
|
---|
351 | SkipSGML = 1;
|
---|
352 | break;
|
---|
353 | case 'p':
|
---|
354 | by_para = 1;
|
---|
355 | break;
|
---|
356 | case 'S':
|
---|
357 | Passes |= SPECIAL;
|
---|
358 | break;
|
---|
359 | case '1':
|
---|
360 | InvfLevel = 1;
|
---|
361 | break;
|
---|
362 | case '2':
|
---|
363 | InvfLevel = 2;
|
---|
364 | break;
|
---|
365 | case '3':
|
---|
366 | InvfLevel = 3;
|
---|
367 | break;
|
---|
368 | case 'f':
|
---|
369 | filename = optarg;
|
---|
370 | break;
|
---|
371 | case 'n':
|
---|
372 | trace_name = optarg;
|
---|
373 | break;
|
---|
374 | case 'D':
|
---|
375 | Dump = 1;
|
---|
376 | break;
|
---|
377 | case 'W':
|
---|
378 | MakeWeights = 1;
|
---|
379 | break;
|
---|
380 | case 'd':
|
---|
381 | set_basepath (optarg);
|
---|
382 | break;
|
---|
383 | case 's':
|
---|
384 | stem_method = atoi (optarg) & STEMMER_MASK;
|
---|
385 | break;
|
---|
386 | case 'b':
|
---|
387 | buf_size = atoi (optarg) * 1024;
|
---|
388 | break;
|
---|
389 | case 'C':
|
---|
390 | comp_stat_point = atoi (optarg) * 1024;
|
---|
391 | break;
|
---|
392 | case 'c':
|
---|
393 | ChunkLimit = atoi (optarg);
|
---|
394 | break;
|
---|
395 | case 'm':
|
---|
396 | invf_buffer_size = (int) (atof (optarg) * 1024 * 1024);
|
---|
397 | break;
|
---|
398 | case 'I':
|
---|
399 | case 'N': /* N kept for compatability */
|
---|
400 | if (*optarg == '1')
|
---|
401 | Passes |= IVF_PASS_1;
|
---|
402 | else if (*optarg == '2')
|
---|
403 | Passes |= IVF_PASS_2;
|
---|
404 | else
|
---|
405 | usage ("Invalid pass number");
|
---|
406 | break;
|
---|
407 | case 'T':
|
---|
408 | if (*optarg == '1')
|
---|
409 | Passes |= TEXT_PASS_1;
|
---|
410 | else if (*optarg == '2')
|
---|
411 | Passes |= TEXT_PASS_2;
|
---|
412 | else
|
---|
413 | usage ("Invalid pass number");
|
---|
414 | break;
|
---|
415 | case 't':
|
---|
416 | trace = (unsigned long) (atof (optarg) * 1024 * 1024);
|
---|
417 | break;
|
---|
418 | case 'h':
|
---|
419 | case '?':
|
---|
420 | usage (NULL);
|
---|
421 | }
|
---|
422 | }
|
---|
423 |
|
---|
424 | if (!filename || *filename == '\0')
|
---|
425 | FatalError (1, "A document collection name must be specified.");
|
---|
426 |
|
---|
427 | if (buf_size < MIN_BUF)
|
---|
428 | FatalError (1, "The buffer size must exceed 1024 bytes.");
|
---|
429 |
|
---|
430 | if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2))
|
---|
431 | FatalError (1, "I1 and I2 cannot be done simultaneously.");
|
---|
432 |
|
---|
433 | if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2))
|
---|
434 | FatalError (1, "T1 and T2 cannot be done simultaneously.");
|
---|
435 |
|
---|
436 | if (!Passes)
|
---|
437 | FatalError (1, "S, T1, T2, I1 or I2 must be specified.");
|
---|
438 |
|
---|
439 | if (argc - optind == 1) {
|
---|
440 | strcpy(dirname,"");
|
---|
441 | strcpy(wildname,argv[optind]);
|
---|
442 | }
|
---|
443 | else if (argc - optind == 2) {
|
---|
444 | strcpy(dirname,argv[optind]);
|
---|
445 | strcpy(wildname,argv[optind+1]);
|
---|
446 | }
|
---|
447 | else FatalError(1, "Finder code requires directory and filespec.");
|
---|
448 |
|
---|
449 | if (strrchr(wildname,'*') != NULL || strrchr(wildname,'?') != NULL)
|
---|
450 | recurse = 1;
|
---|
451 |
|
---|
452 | if (trace)
|
---|
453 | {
|
---|
454 | if (!trace_name)
|
---|
455 | trace_name = make_name (filename, TRACE_SUFFIX, NULL);
|
---|
456 | if (!(Trace = fopen (trace_name, "a")))
|
---|
457 | Message ("Unable to open \"%s\". No tracing will be done.", trace_name);
|
---|
458 | else
|
---|
459 | setbuf (Trace, NULL);
|
---|
460 | }
|
---|
461 | else
|
---|
462 | Trace = NULL;
|
---|
463 |
|
---|
464 | if (comp_stat_point)
|
---|
465 | {
|
---|
466 | char *name = make_name (filename, COMPRESSION_STATS_SUFFIX, NULL);
|
---|
467 | if (!(Comp_Stats = fopen (name, "wb")))
|
---|
468 | Message ("Unable to open \"%s\". No comp. stats. will be generated.",
|
---|
469 | name);
|
---|
470 | }
|
---|
471 |
|
---|
472 |
|
---|
473 | if (Trace)
|
---|
474 | {
|
---|
475 | int i;
|
---|
476 | fprintf (Trace, "\n\n\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
|
---|
477 | for (i = 0; i < argc; i++)
|
---|
478 | fprintf (Trace, "%s ", argv[i]);
|
---|
479 | fprintf (Trace, "\n\n");
|
---|
480 | }
|
---|
481 |
|
---|
482 | driver (Trace, filename);
|
---|
483 |
|
---|
484 | if (Trace)
|
---|
485 | fclose (Trace);
|
---|
486 |
|
---|
487 | if (Comp_Stats)
|
---|
488 | fclose (Comp_Stats);
|
---|
489 |
|
---|
490 | exit (0);
|
---|
491 | }
|
---|
492 |
|
---|