source: trunk/gsdl/packages/mg/src/text/MGPASS.C@ 1013

Last change on this file since 1013 was 439, checked in by sjboddie, 25 years ago

renamed mg-1.3d directory mg

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.8 KB
Line 
1/**************************************************************************
2 *
3 * mgpass.cpp -- Driver for the various passes -
4
5 V1 - removed all the pipe processing and replaced with
6 code to directly explore a directory of files.
7 V2 - rebuilt to extract non text files from web browser
8 'catch' file. Also to display progress count
9 GH/WJR
10
11 * Copyright (C) 1994 Neil Sharman, ..
12 *
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
17 *
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
22 *
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
26 *
27 * $Id: MGPASS.C 439 1999-08-10 21:23:37Z sjboddie $
28 *
29 **************************************************************************/
30
31#include "sysfuncs.h"
32
33#include <malloc.h>
34#include <stdlib.h>
35#include <stdio.h>
36#include <io.h>
37#include <fcntl.h>
38#include <string.h>
39
40#include "memlib.h"
41#include "messages.h"
42
43#include "mg_files.h"
44#include "mg.h"
45#include "build.h"
46#include "text.h"
47#include "stemmer.h"
48
49/*
50 $Log$
51 Revision 1.1 1999/08/10 21:17:39 sjboddie
52 renamed mg-1.3d directory mg
53
54 Revision 1.5 1998/12/02 01:32:40 sjboddie
55 fixed 1 missing semi-colon..
56
57 Revision 1.4 1998/12/02 00:57:10 rjmcnab
58
59 Fixed to compile with the changes to the stemming.
60
61 Revision 1.3 1998/11/24 03:39:23 rjmcnab
62
63 Fixed a small error in the windows compilation
64
65 Revision 1.2 1998/11/24 01:29:39 rjmcnab
66
67 Fixed a few problems with the windows build
68
69 Revision 1.1 1998/11/17 09:34:12 rjmcnab
70 *** empty log message ***
71
72 * Revision 1.3 1994/10/20 03:56:57 tes
73 * I have rewritten the boolean query optimiser and abstracted out the
74 * components of the boolean query.
75 *
76 * Revision 1.2 1994/09/20 04:41:52 tes
77 * For version 1.1
78 *
79 */
80
/* RCS identification string embedded in the object file. */
static char *RCSID = "$Id: MGPASS.C 439 1999-08-10 21:23:37Z sjboddie $";

/* Number of entries in the PassData dispatch table. */
#define MAX_PASSES 5

/*
 * Pass selection flags, OR-ed together into Passes.  Flag (1 << N) selects
 * entry N of PassData; do_process() and driver() test the bits that way.
 */
#define SPECIAL     (1 << 0)
#define TEXT_PASS_1 (1 << 1)
#define TEXT_PASS_2 (1 << 2)
#define IVF_PASS_1  (1 << 3)
#define IVF_PASS_2  (1 << 4)

/* Smallest input buffer accepted, and the minimum free space refill() needs. */
#define MIN_BUF 8192
/* Size of the fixed path / URL buffers used throughout this file. */
#define path_length 256
93
94unsigned long buf_size = 3 * 1024 * 1024; /* 3Mb */
95unsigned long invf_buffer_size = 5 * 1024 * 1024; /* 5Mb */
96unsigned long ChunkLimit = 0;
97char InvfLevel = 2;
98char SkipSGML = 0;
99char MakeWeights = 0;
100FILE *Comp_Stats = NULL;
101int comp_stat_point = 0;
102double bytes_processed = 0;
103int num_docs = 0;
104double bytes_received = 0;
105int stem_method = 0;
106int stemmer_num = 0; /* default to the Lovin stemmer */
107
108static char Passes = 0;
109static unsigned long trace = 0;
110static int Dump = 0;
111static char **files = NULL;
112static int num_files = 0;
113static char *trace_name = NULL;
114
115static char dirname[path_length], wildname[path_length];
116static int by_para = 0, recurse = 0, html_catch = 0;
117static char *buffer;
118char *line_start, *data_end, *base, *scan;
119
/*
 * Description of one processing pass: a name used in error messages and
 * the three callbacks the driver invokes for that pass.
 */
typedef struct pass_data
  {
    char *name;				/* reported by driver() when init or done fails */
    int (*init) (char *);		/* called once with the collection name before any input */
    int (*process) (u_char *, int);	/* called per document with its bytes and byte count */
    int (*done) (char *);		/* called once with the collection name after all input */
  }
pass_data;
128
/*
 * Pass dispatch table.  Entry N is run when bit (1 << N) is set in Passes,
 * so the order of the rows must stay in step with the SPECIAL,
 * TEXT_PASS_1, TEXT_PASS_2, IVF_PASS_1 and IVF_PASS_2 flag values.
 */
static pass_data PassData[MAX_PASSES] =
{
  {"special", init_special, process_special, done_special},
  {"text.pass1", init_text_1, process_text_1, done_text_1},
  {"text.pass2", init_text_2, process_text_2, done_text_2},
  {"ivf.pass1", init_ivf_1, process_ivf_1, done_ivf_1},
  {"ivf.pass2", init_ivf_2, process_ivf_2, done_ivf_2},
};
137
/*
 * printf-style usage text.  It consumes one "%s" (the program name)
 * followed by four "%*s" pairs (an int width and a string) that indent the
 * continuation lines; usage() must supply exactly that many arguments.
 * -H (read a catch file) and -p (split input by paragraph) are accepted by
 * main() and are therefore listed here as well.
 */
static char *usage_str = "\nUSAGE:\n"
" %s [-h] [-H] [-p] [-G] [-D] [-1|-2|-3] [-T1] [-T2] [-I1] [-I2] [-N1]\n"
" %*s [-N2] [-W] [-S] [-b buffer-size] [-d dictionary-directory]\n"
" %*s [-t trace-point Mb] [-m invf-memory] [-c chunk-limit]\n"
" %*s [-n trace-name] [-C comp-stat-size] [-s stem_method] -f doc-collection-name\n"
" %*s [source directory\\] [source file]\n";
144
145static void usage (char *err)
146{
147 if (err) Message (err);
148 fprintf (stderr, usage_str, msg_prefix, strlen (msg_prefix), "",
149 strlen (msg_prefix), "", strlen (msg_prefix), "");
150 exit (1);
151}
152
153void do_process(char *buffer, int num)
154{
155 int pass;
156 if (num == 0) Message ("Warning : Processing zero length document");
157 num_docs++;
158 bytes_processed += num;
159 for (pass = 0; pass < MAX_PASSES; pass++)
160 if (Passes & (1 << pass))
161 if (PassData[pass].process((u_char *)buffer, num) == COMPERROR)
162 FatalError(1, "Error during file processing");
163}
164
165int refill(int in_fd)
166{
167 int num, bitleft;
168 bitleft = data_end - base;
169 memmove(buffer, base, bitleft);
170 if (buf_size - bitleft < MIN_BUF)
171 FatalError(1, "Paragraph too big for buffer");
172 num = read(in_fd, &buffer[bitleft], buf_size - bitleft);
173 line_start -= (base - buffer);
174 scan -= (base - buffer);
175 base = buffer;
176 data_end = buffer + bitleft + num;
177 if (num > 0) return 1; else return 0;
178}
179
180char *scanpara(int in_fd)
181{
182 int num, blank, in_blank, at_end;
183 num = read(in_fd,buffer,buf_size);
184 in_blank = 1; base = buffer; line_start = buffer; scan = buffer;
185 data_end = buffer + num; at_end = 0;
186 for (;;) {
187 blank = 1; line_start = scan; // Get a line
188 while (scan < data_end && *scan != '\n')
189 if (*scan++ > ' ') blank = 0;
190 if (scan >= data_end) {
191 if (refill(in_fd)) {
192 while (scan < data_end && *scan != '\n')
193 if (*scan++ > ' ') blank = 0;
194 if (scan < data_end) scan++;
195 }
196 else at_end = 1;
197 }
198 else scan++;
199 if (line_start < scan) { // If we have a line
200 if (in_blank) {
201 if (!blank) { in_blank = 0; base = line_start; }
202 }
203 else {
204 if (blank) {
205 do_process(base, line_start - base);
206 in_blank = 1; base = line_start;
207 }
208 }
209 }
210 if (at_end) break;
211 }
212 if (in_blank) base = scan;
213 if (scan + 2 <= buffer + buf_size) {
214 *scan++ = 26; *scan++ = 10;
215 }
216 if (base < scan) do_process(base, scan - base);
217 return NULL;
218}
219
220char *scanfile(int in_fd)
221{
222 int num = read(in_fd,buffer,buf_size); /*expect to read the whole file*/
223 if (num < 0) return "file locked";
224 if (num >= buf_size-1) return "file too long";
225 do_process(buffer, num);
226 return NULL;
227}
228
229void search(char *dname, char *fname)
230{
231 long dirtag; struct _finddata_t dirinfo; int in_fd; char *res;
232 char search_name[path_length], found_name[path_length];
233
234 sprintf(search_name, "%s%s", dname, fname); /*Scan files*/
235 dirtag = _findfirst(search_name, &dirinfo);
236 if (dirtag >= 0) {
237 do {
238 if ((dirinfo.attrib & (_A_SUBDIR | _A_HIDDEN | _A_SYSTEM)) == 0) {
239 sprintf(found_name,"%s%s",dname,dirinfo.name);
240 in_fd = open(found_name,O_RDONLY|O_BINARY);
241 if (in_fd >= 0) {
242 if (by_para)
243 res = scanpara(in_fd);
244 else
245 res = scanfile(in_fd);
246 if (res != NULL) {
247 Message("Error %s in processing file %s\n", res, found_name);
248 exit(1);
249 }
250 close(in_fd);
251 }
252 }
253 } while (_findnext(dirtag, &dirinfo) == 0);
254 _findclose(dirtag);
255 }
256
257 if (recurse == 0) return;
258
259 sprintf(search_name, "%s*.*", dname); /*Look for subdirs*/
260 dirtag = _findfirst(search_name, &dirinfo);
261 if (dirtag < 0) return;
262 do {
263 if ( ((dirinfo.attrib & (_A_HIDDEN | _A_SYSTEM)) == 0) &&
264 ((dirinfo.attrib & _A_SUBDIR) != 0) &&
265 strcmp(dirinfo.name,".") != 0 &&
266 strcmp(dirinfo.name,"..") != 0) {
267 sprintf(found_name,"%s%s",dname,dirinfo.name);
268 strcat(found_name,"\\");
269 search(found_name,fname);
270 }
271 } while (_findnext(dirtag, &dirinfo) == 0);
272 _findclose(dirtag);
273}
274
275static int toobig(int n)
276{
277 if (n > path_length) {
278 printf("Cannot handle urls > %d characters in length\n", path_length);
279 exit(1);
280 }
281 return 0;
282}
283
284void scan_catch(char *dname, char *fname)
285{
286 int in_fd, urllen, conlen, filesize; char catch_name[path_length];
287 int filecount = 0;
288 enum { filekind_redirected, filekind_text, filekind_other } filekind;
289 sprintf(catch_name, "%s%s", dname, fname);
290 in_fd = open(catch_name,O_RDONLY|O_BINARY);
291 if (in_fd < 0)
292 FatalError(1, "Couldn't open catch file \"%s\"", catch_name);
293 for (;;) {
294 filecount++; if (filecount%100 == 0) { printf("%d\r", filecount); fflush(stdout); }
295 if (read(in_fd, &urllen, sizeof(int)) != sizeof(int) || toobig(urllen) ||
296 read(in_fd, catch_name, urllen) != urllen ||
297 read(in_fd, &conlen, sizeof(int)) != sizeof(int) || toobig(conlen) ||
298 read(in_fd, catch_name, conlen) != conlen) break;
299 if (conlen >= 1 && *catch_name == '@')
300 filekind = filekind_redirected;
301 else {
302 if (conlen >= 4 && strncmp(catch_name, "text", 4) == 0)
303 filekind = filekind_text;
304 else
305 filekind = filekind_other;
306 if (read(in_fd, &filesize, sizeof(int)) != sizeof(int))
307 FatalError(1, "File read failed for size field");
308 if (filesize > buf_size)
309 FatalError(1, "File too large (%d > %d)", filesize, buf_size);
310 if (read(in_fd, buffer, filesize) != filesize)
311 FatalError(1, "Failed to read file data");
312 }
313 if (filekind == filekind_text) do_process(buffer, filesize);
314 }
315 close(in_fd);
316}
317
318static void driver (FILE * Trace, char *file_name)
319{
320 int pass;
321
322 buffer = (char *)Xmalloc (buf_size);
323
324 for (pass = 0; pass < MAX_PASSES; pass++)
325 if (Passes & (1 << pass))
326 {
327 if (PassData[pass].init (file_name) == COMPERROR)
328 FatalError (1, "Error during init of \"%s\"",PassData[pass].name);
329 }
330
331 if (html_catch == 0) search(dirname, wildname);
332 else scan_catch(dirname, wildname);
333
334 for (pass = 0; pass < MAX_PASSES; pass++)
335 if (Passes & (1 << pass))
336 {
337 if (PassData[pass].done (file_name) == COMPERROR)
338 FatalError (1, "Error during done of \"%s\"", PassData[pass].name);
339 }
340
341
342 free (buffer);
343}
344
345void main (int argc, char **argv)
346{
347 int ch;
348 char *filename = NULL;
349 FILE *Trace = NULL;
350
351 msg_prefix = argv[0];
352
353 opterr = 0;
354 while ((ch = getopt (argc, argv, "hC:WHGpSD123f:d:b:T:I:t:m:N:c:n:s:")) != -1)
355 {
356 switch (ch)
357 {
358 case 'H':
359 html_catch = 1;
360 break;
361 case 'G':
362 SkipSGML = 1;
363 break;
364 case 'p':
365 by_para = 1;
366 break;
367 case 'S':
368 Passes |= SPECIAL;
369 break;
370 case '1':
371 InvfLevel = 1;
372 break;
373 case '2':
374 InvfLevel = 2;
375 break;
376 case '3':
377 InvfLevel = 3;
378 break;
379 case 'f':
380 filename = optarg;
381 break;
382 case 'n':
383 trace_name = optarg;
384 break;
385 case 'D':
386 Dump = 1;
387 break;
388 case 'W':
389 MakeWeights = 1;
390 break;
391 case 'd':
392 set_basepath (optarg);
393 break;
394 case 's':
395 stem_method = atoi (optarg) & STEMMER_MASK;
396 break;
397 case 'b':
398 buf_size = atoi (optarg) * 1024;
399 break;
400 case 'C':
401 comp_stat_point = atoi (optarg) * 1024;
402 break;
403 case 'c':
404 ChunkLimit = atoi (optarg);
405 break;
406 case 'm':
407 invf_buffer_size = (int) (atof (optarg) * 1024 * 1024);
408 break;
409 case 'I':
410 case 'N': /* N kept for compatability */
411 if (*optarg == '1')
412 Passes |= IVF_PASS_1;
413 else if (*optarg == '2')
414 Passes |= IVF_PASS_2;
415 else
416 usage ("Invalid pass number");
417 break;
418 case 'T':
419 if (*optarg == '1')
420 Passes |= TEXT_PASS_1;
421 else if (*optarg == '2')
422 Passes |= TEXT_PASS_2;
423 else
424 usage ("Invalid pass number");
425 break;
426 case 't':
427 trace = (unsigned long) (atof (optarg) * 1024 * 1024);
428 break;
429 case 'h':
430 case '?':
431 usage (NULL);
432 }
433 }
434
435 if (!filename || *filename == '\0')
436 FatalError (1, "A document collection name must be specified.");
437
438 if (buf_size < MIN_BUF)
439 FatalError (1, "The buffer size must exceed 1024 bytes.");
440
441 if ((Passes & (IVF_PASS_1 | IVF_PASS_2)) == (IVF_PASS_1 | IVF_PASS_2))
442 FatalError (1, "I1 and I2 cannot be done simultaneously.");
443
444 if ((Passes & (TEXT_PASS_1 | TEXT_PASS_2)) == (TEXT_PASS_1 | TEXT_PASS_2))
445 FatalError (1, "T1 and T2 cannot be done simultaneously.");
446
447 if (!Passes)
448 FatalError (1, "S, T1, T2, I1 or I2 must be specified.");
449
450 if (argc - optind == 1) {
451 strcpy(dirname,"");
452 strcpy(wildname,argv[optind]);
453 }
454 else if (argc - optind == 2) {
455 strcpy(dirname,argv[optind]);
456 strcpy(wildname,argv[optind+1]);
457 }
458 else FatalError(1, "Finder code requires directory and filespec.");
459
460 if (strrchr(wildname,'*') != NULL || strrchr(wildname,'?') != NULL)
461 recurse = 1;
462
463 if (trace)
464 {
465 if (!trace_name)
466 trace_name = make_name (filename, TRACE_SUFFIX, NULL);
467 if (!(Trace = fopen (trace_name, "a")))
468 Message ("Unable to open \"%s\". No tracing will be done.", trace_name);
469 else
470 setbuf (Trace, NULL);
471 }
472 else
473 Trace = NULL;
474
475 if (comp_stat_point)
476 {
477 char *name = make_name (filename, COMPRESSION_STATS_SUFFIX, NULL);
478 if (!(Comp_Stats = fopen (name, "wb")))
479 Message ("Unable to open \"%s\". No comp. stats. will be generated.",
480 name);
481 }
482
483
484 if (Trace)
485 {
486 int i;
487 fprintf (Trace, "\n\n\t\t-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-\n\n");
488 for (i = 0; i < argc; i++)
489 fprintf (Trace, "%s ", argv[i]);
490 fprintf (Trace, "\n\n");
491 }
492
493 driver (Trace, filename);
494
495 if (Trace)
496 fclose (Trace);
497
498 if (Comp_Stats)
499 fclose (Comp_Stats);
500
501 exit (0);
502}
Note: See TracBrowser for help on using the repository browser.